% SAMtags.tex -- forked from samtools/hts-specs
\documentclass[10pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{longtable}
\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref}
% \mailtourl{address}: typeset an e-mail address in typewriter type as a
% clickable mailto: hyperlink (\href comes from hyperref, loaded above).
\newcommand{\mailtourl}[1]{\href{mailto:#1}{\tt #1}}
% \tagvalue{text}: switch to typewriter type and typeset the placeholder
% naming a tag's value.  The font switch is not wrapped in a group by the
% macro itself, so it lasts until the enclosing group ends; in this file the
% macro is only used inside \item[...] labels.
\newcommand{\tagvalue}[1]{\tt #1}
% \tagregex{pattern}: same definition as \tagvalue, kept as a separate name
% for \item[...] labels whose value is given as a pattern rather than a
% single placeholder word.
\newcommand{\tagregex}[1]{\tt #1}
\begin{document}
\input{SAMtags.ver}
\title{Sequence Alignment/Map Optional Fields Specification}
\author{The SAM/BAM Format Specification Working Group}
\date{\headdate}
\maketitle
\begin{quote}\small
The master version of this document can be found at
\url{https://github.com/samtools/hts-specs}.\\
This printing is version~\commitdesc\ from that repository,
last modified on the date shown above.
\end{quote}
\vspace*{1em}
\noindent
This document is a companion to the {\sl Sequence Alignment/Map Format
Specification} that defines the SAM and~BAM formats, and to the {\sl CRAM
Format Specification} that defines the CRAM format.\footnote{See
\href{http://samtools.github.io/hts-specs/SAMv1.pdf}{\tt SAMv1.pdf} and
\href{http://samtools.github.io/hts-specs/CRAMv3.pdf}{\tt CRAMv3.pdf}
at \url{https://github.com/samtools/hts-specs}.}
Alignment records in each of these formats may contain a number of optional
fields, each labelled with a {\it tag\/} identifying that field's data.
This document describes each of the predefined standard tags, and discusses
conventions around creating new tags.
\section{Standard tags}
Predefined standard tags are listed in the following table and described
in greater detail in later subsections.
Optional fields are usually displayed as {\tt TAG:TYPE:VALUE}; the {\it type\/}
may be one of
{\tt A} (character),
{\tt B} (general array),
{\tt f} (real number),
{\tt H} (hexadecimal array),
{\tt i} (integer),
or
{\tt Z} (string).
\begin{center}\small
\begin{longtable}{ccp{12.5cm}}
\hline
{\bf Tag} & {\bf Type} & {\bf Description} \\
\hline
{\tt AM} & i & The smallest template-independent mapping quality of segments in the rest \\
{\tt AS} & i & Alignment score generated by aligner \\
{\tt BC} & Z & Barcode sequence \\
{\tt BQ} & Z & Offset to base alignment quality (BAQ) \\
{\tt CC} & Z & Reference name of the next hit \\
{\tt CM} & i & Edit distance between the color sequence and the color reference (see also {\tt NM})\\
{\tt CO} & Z & Free-text comments \\
{\tt CP} & i & Leftmost coordinate of the next hit \\
{\tt CQ} & Z & Color read base qualities \\
{\tt CS} & Z & Color read sequence \\
{\tt CT} & Z & Complete read annotation tag, used for consensus annotation dummy features.\\
{\tt E2} & Z & The 2nd most likely base calls \\
{\tt FI} & i & The index of segment in the template \\
{\tt FS} & Z & Segment suffix \\
{\tt FZ} & B,S & Flow signal intensities \\
{\tt GC} & ? & Reserved for backwards compatibility reasons \\
{\tt GQ} & ? & Reserved for backwards compatibility reasons \\
{\tt GS} & ? & Reserved for backwards compatibility reasons \\
{\tt H0} & i & Number of perfect hits \\
{\tt H1} & i & Number of 1-difference hits (see also {\tt NM}) \\
{\tt H2} & i & Number of 2-difference hits \\
{\tt HI} & i & Query hit index \\
{\tt IH} & i & Number of stored alignments in SAM that contain the query in the current record\\
{\tt LB} & Z & Library \\
{\tt MC} & Z & CIGAR string for mate/next segment\\
{\tt MD} & Z & String for mismatching positions \\
{\tt MF} & ? & Reserved for backwards compatibility reasons \\
{\tt MQ} & i & Mapping quality of the mate/next segment \\
{\tt NH} & i & Number of reported alignments that contain the query in the current record\\
{\tt NM} & i & Edit distance to the reference \\
{\tt OC} & Z & Original CIGAR \\
{\tt OP} & i & Original mapping position \\
{\tt OQ} & Z & Original base quality \\
{\tt PG} & Z & Program \\
{\tt PQ} & i & Phred likelihood of the template \\
{\tt PT} & Z & Read annotations for parts of the padded read sequence \\
{\tt PU} & Z & Platform unit \\
{\tt QT} & Z & Barcode ({\tt BC} or {\tt RT}) phred-scaled base qualities \\
{\tt Q2} & Z & Phred quality of the mate/next segment sequence in the {\tt R2} tag \\
{\tt R2} & Z & Sequence of the mate/next segment in the template \\
{\tt RG} & Z & Read group \\
{\tt RT} & Z & Barcode sequence (deprecated; use {\tt BC} instead) \\
{\tt SA} & Z & Other canonical alignments in a chimeric alignment \\
{\tt SM} & i & Template-independent mapping quality \\
{\tt SQ} & ? & Reserved for backwards compatibility reasons \\
{\tt S2} & ? & Reserved for backwards compatibility reasons \\
{\tt TC} & i & The number of segments in the template \\
{\tt U2} & Z & Phred probability of the 2nd call being wrong conditional on the best being wrong \\
{\tt UQ} & i & Phred likelihood of the segment, conditional on the mapping being correct \\
{\tt X?} & ? & Reserved for end users \\
{\tt Y?} & ? & Reserved for end users \\
{\tt Z?} & ? & Reserved for end users \\
\hline
\end{longtable}
\end{center}
\subsection{Additional Template and Mapping data}
\begin{description}
\item[AM:i:\tagvalue{int}]
The smallest template-independent mapping quality of segments in the rest.
\item[AS:i:\tagvalue{score}]
Alignment score generated by aligner.
\item[BQ:Z:\tagvalue{qualities}]
Offset to base alignment quality (BAQ), of the same length as the read sequence.
At the $i$-th read base, ${\rm BAQ}_i=Q_i-({\rm BQ}_i-64)$ where $Q_i$ is the $i$-th base quality.
\item[CC:Z:\tagvalue{rname}]
Reference name of the next hit; `{\tt =}' for the same chromosome.
\item[CP:i:\tagvalue{pos}]
Leftmost coordinate of the next hit.
\item[E2:Z:\tagvalue{bases}]
The 2nd most likely base calls. Same encoding and same length as {\sf SEQ}.
\item[FI:i:\tagvalue{int}]
The index of segment in the template.
\item[FS:Z:\tagvalue{str}]
Segment suffix.
\item[H0:i:\tagvalue{count}]
Number of perfect hits.
\item[H1:i:\tagvalue{count}]
Number of 1-difference hits (see also {\tt NM}).
\item[H2:i:\tagvalue{count}]
Number of 2-difference hits.
\item[HI:i:\emph{i}]
Query hit index, indicating the alignment record is the $i$-th one stored
in SAM.
\item[IH:i:\tagvalue{count}]
Number of stored alignments in SAM that contain the query in the current
record.
\item[MC:Z:\tagvalue{cigar}]
CIGAR string for mate/next segment.
\item[MD:Z:\tagregex{[0-9]+(([A-Z]|\char92\char94[A-Z]+)[0-9]+)*}]
String for mismatching positions.
The {\tt MD} field aims to achieve SNP/indel calling without
looking at the reference. For example, a string `{\tt 10A5\char94AC6}' means
from the leftmost reference base in the alignment, there are 10 matches
followed by an A on the reference which is different from the aligned read
base; the next 5 reference bases are matches followed by a 2bp deletion from
the reference; the deleted sequence is AC; the last 6~bases are matches.
The {\tt MD} field ought to match the {\sf CIGAR} string.
\item[MQ:i:\tagvalue{}]
Mapping quality of the mate/next segment.
\item[NH:i:\tagvalue{}]
Number of reported alignments that contain the query in the current record.
\item[NM:i:\tagvalue{}]
Edit distance to the reference, including ambiguous bases but excluding clipping.
\item[PQ:i:\tagvalue{}]
Phred likelihood of the template, conditional on both the mappings being correct.
\item[Q2:Z:\tagvalue{}]
Phred quality of the mate/next segment sequence in the {\tt R2} tag.
Same encoding as {\sf QUAL}.
\item[R2:Z:\tagvalue{}]
Sequence of the mate/next segment in the template.
\item[SA:Z:\tagregex{{\tt (}\emph{rname}{\tt ,}\emph{pos}{\tt ,}\emph{strand}{\tt ,}\emph{CIGAR}{\tt ,}\emph{mapQ}{\tt ,}\emph{NM}{\tt ;)}+}]
Other canonical alignments in a chimeric alignment, formatted as a semicolon-delimited list.
Each element in the list represents a part of the chimeric alignment. Conventionally, at a supplementary line, the first element points to the primary line.
\item[SM:i:\tagvalue{}]
Template-independent mapping quality.
\item[TC:i:\tagvalue{}]
The number of segments in the template.
\item[U2:Z:\tagvalue{}]
Phred probability of the 2nd call being wrong conditional on the best being wrong.
The same encoding as {\sf QUAL}.
\item[UQ:i:\tagvalue{}]
Phred likelihood of the segment, conditional on the mapping being correct.
\end{description}
\subsection{Metadata}
\begin{description}
\item[RG:Z:\tagvalue{readgroup}]
The read group to which the read belongs.
If {\tt @RG} headers are present, then \emph{readgroup} must match the
{\tt RG-ID} field of one of the headers.
\item[LB:Z:\tagvalue{library}]
The library from which the read has been sequenced.
If {\tt @RG} headers are present, then \emph{library} must match the
{\tt RG-LB} field of one of the headers.
\item[PG:Z:\tagvalue{}]
Program. Value matches the header {\tt PG-ID} tag if {\tt @PG} is present.
\item[PU:Z:\tagvalue{platformunit}]
The platform unit in which the read was sequenced.
If {\tt @RG} headers are present, then \emph{platformunit} must match the
{\tt RG-PU} field of one of the headers.
\item[CO:Z:\tagvalue{text}]
Free-text comments.
\end{description}
\subsection{Barcodes}
\begin{description}
\item[BC:Z:\tagvalue{sequence}]
Barcode sequence, with any quality scores stored in the {\tt QT} tag.
\item[QT:Z:\tagvalue{qualities}]
Phred quality of the barcode sequence in the {\tt BC} (or {\tt RT}) tag. Same encoding as {\sf QUAL}.
\item[RT:Z:\tagvalue{sequence}]
Deprecated alternative to {\tt BC} tag originally used at Sanger.
\end{description}
\subsection{Original data}
\begin{description}
\item[OC:Z:\tagvalue{cigar}]
Original CIGAR, usually before realignment.
\item[OP:i:\tagvalue{pos}]
Original mapping position, usually before realignment.
\item[OQ:Z:\tagvalue{qualities}]
Original base quality, usually before recalibration.
Same encoding as {\sf QUAL}.
\end{description}
\subsection{Annotation and Padding}
\begin{description}
\item[CT:Z:\tagregex{\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*}]
Complete read annotation tag, used for consensus annotation dummy features.
The {\tt CT} tag is intended primarily for annotation
dummy reads, and consists of a \emph{strand}, \emph{type} and zero or
more \emph{key}=\emph{value} pairs, each separated with semicolons.
The \emph{strand} field has four values as in GFF3, and supplements FLAG
bit 0x10 to allow unstranded (`{\tt .}'), and stranded but unknown strand
(`{\tt ?}') annotation. For these and annotation on the forward strand
(\emph{strand} set to `{\tt +}'), do not set FLAG bit 0x10. For
annotation on the reverse strand, set the \emph{strand} to `{\tt -}'
and set FLAG bit 0x10.
The \emph{type} and any \emph{keys} and their
optional \emph{values} are all percent encoded according to
RFC3986 to escape meta-characters `{\tt =}', `{\tt \%}', `{\tt ;}',
`{\tt |}' or non-printable characters not matched by the isprint()
macro (with the C locale). For example a percent sign becomes
`{\tt \%25}'.
%NOTE - This leaves open the possibility of allowing multiple such
%entries for a single CT tag to be combined with | as in the PT tag.
\item[PT:Z:\tagregex{\tt \emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*(\char92|\emph{start};\emph{end};\emph{strand};\emph{type}(;\emph{key}(=\emph{value}))*)*}]
Read annotations for parts of the padded read sequence.
The {\tt PT} tag value has the format of a series of
tags separated by `{\tt |}', each annotating a sub-region of the read.
Each tag consists of \emph{start}, \emph{end}, \emph{strand},
\emph{type} and zero or more \emph{key}{\tt =}\emph{value} pairs, each
separated with semicolons. \emph{Start} and \emph{end} are 1-based
positions between one and the sum of the {\tt M/I/D/P/S/=/X}
{\sf CIGAR} operators, i.e.\ {\sf SEQ} length plus any pads. Note
any editing of the CIGAR string may require updating the `{\tt PT}'
tag coordinates, or even invalidate them.
As in GFF3, \emph{strand} is one of `{\tt +}' for forward strand tags,
`{\tt -}' for reverse strand, `{\tt .}' for unstranded or `{\tt ?}'
for stranded but unknown strand.
The \emph{type} and any \emph{keys} and their optional \emph{values}
are all percent encoded as in the {\tt CT} tag.
\end{description}
\subsection{Technology-specific data}
\begin{description}
\item[FZ:B,S:\tagvalue{intensities}]
Flow signal intensities on the original strand of the read, stored as {\tt (uint16\_t) round(value * 100.0)}.
\end{description}
\subsubsection{Color space}
% TODO Describe color space and the encoding here.
\begin{description}
\item[CM:i:\tagvalue{distance}]
Edit distance between the color sequence and the color reference (see also {\tt NM}).
\item[CS:Z:\tagvalue{sequence}]
Color read sequence on the original strand of the read. The primer base must be included.
\item[CQ:Z:\tagvalue{qualities}]
Color read quality on the original strand of the read. Same encoding as {\sf QUAL}; same length as {\tt CS}.
\end{description}
\section{Locally-defined tags}
You can freely add new tags.
Note that tags starting with `{\tt X}', `{\tt Y}', or `{\tt Z}' and tags
containing lowercase letters in either position are reserved for local use
and will not be formally defined in any future version of this specification.
If a new tag may be of general interest, it may be useful to have it added
to this specification. Additions can be proposed by opening a new issue at
\url{https://github.com/samtools/hts-specs/issues} and/or by sending email
to \mailtourl{samtools-devel@lists.sourceforge.net}.
\end{document}