-
Notifications
You must be signed in to change notification settings - Fork 4
/
bam.cpp
134 lines (126 loc) · 3.55 KB
/
bam.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include "bam.hpp"
uint32_t cigar_len_mask = 0xFFFFFFF0;
uint32_t cigar_type_mask = 0xF;
string print_cigar_symbol(int type) {
if (type == BAM_CMATCH) {
return "M";
}
if (type == BAM_CINS) {
return "I";
}
if (type == BAM_CDEL) {
return "D";
}
if (type == BAM_CSOFT_CLIP) {
return "S";
}
if (type == BAM_CHARD_CLIP) {
return "H";
}
return "X";
}
vector<pair<int, int>> decode_cigar(bam1_t *read) {
// get CIGAR
vector<pair<int, int>> cigar_offsets;
uint32_t *cigar = bam_get_cigar(read);
for (size_t i = 0; i < read->core.n_cigar; i++) {
uint32_t type = cigar[i] & cigar_type_mask;
uint32_t length = cigar[i] >> 4;
cigar_offsets.push_back(make_pair(length, type));
}
return cigar_offsets;
}
uint8_t *encode_cigar(vector<pair<uint32_t, uint32_t>> cigar) {
uint32_t *cigar_bytes = (uint32_t *)malloc(sizeof(uint32_t) * cigar.size());
for (size_t i = 0; i < cigar.size(); i++) {
cigar_bytes[i] =
(cigar[i].first << 4) | (cigar[i].second & cigar_type_mask);
}
return (uint8_t *)cigar_bytes;
}
uint8_t *encode_bam_seq(char *seq) {
int n = (strlen(seq) + 1) >> 1;
int l_seq = strlen(seq);
uint8_t *seq_bytes = (uint8_t *)malloc(sizeof(uint8_t) * n);
int i = 0;
n = 0;
for (i = 0; i + 1 < l_seq; i += 2) {
seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) |
seq_nt16_table[(unsigned char)seq[i + 1]];
n += 1;
}
for (; i < l_seq; i++) {
seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4;
n += 1;
}
return seq_bytes;
}
char reverse_complement_base(char base) {
if (base == 'C' || base == 'c') {
return 'G';
}
if (base == 'A' || base == 'a') {
return 'T';
}
if (base == 'G' || base == 'g') {
return 'C';
}
if (base == 'T' || base == 't') {
return 'A';
} else {
return 'N';
}
}
void reverse_complement_read(char *seq) {
int l = strlen(seq);
int i = 0;
while (i < l / 2) {
auto t = reverse_complement_base(seq[l - i]);
seq[l - 1 - i] = reverse_complement_base(seq[i]);
seq[i] = t;
i += 1;
}
}
vector<pair<int, int>> get_aligned_pairs(bam1_t *alignment) {
vector<pair<int, int>> result;
uint ref_pos = alignment->core.pos;
uint read_pos = 0;
auto cigar_offsets = decode_cigar(alignment);
size_t m = 0;
while (true) {
if (m == cigar_offsets.size()) {
break;
}
if (cigar_offsets[m].second == BAM_CMATCH or
cigar_offsets[m].second == BAM_CEQUAL or
cigar_offsets[m].second == BAM_CDIFF) {
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) {
result.push_back(make_pair(read_pos, i));
read_pos++;
}
ref_pos += cigar_offsets[m].first;
} else if (cigar_offsets[m].second == BAM_CINS or
cigar_offsets[m].second == BAM_CSOFT_CLIP) {
for (int i = 0; i < cigar_offsets[m].first; ++i) {
result.push_back(make_pair(read_pos, -1));
read_pos++;
}
} else if (cigar_offsets[m].second == BAM_CDEL) {
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) {
result.push_back(make_pair(-1, i));
}
ref_pos += cigar_offsets[m].first;
} else if (cigar_offsets[m].second == BAM_CHARD_CLIP) {
// advances neither
} else if (cigar_offsets[m].second == BAM_CREF_SKIP) {
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) {
result.push_back(make_pair(-1, i));
}
ref_pos += cigar_offsets[m].first;
} else { // if (cigar_offsets[m].second == BAM_CPAD) {
// TODO
}
m++;
}
return result;
}