This repository has been archived by the owner on Feb 27, 2023. It is now read-only.
forked from hallamlab/FragGeneScanPlus
-
Notifications
You must be signed in to change notification settings - Fork 3
/
util_lib.c
306 lines (252 loc) · 9.2 KB
/
util_lib.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "util_lib.h"
/** The queue of empty buffers, waiting for input by the reader thread. */
QUEUE *q_empty_head;
QUEUE *q_empty_tail;
/** The queue of finished buffers, waiting for the writer thread to handle output. */
QUEUE *q_done_head;
QUEUE *q_done_tail;
// which corresponds to either 0 (done) or 1 (empty) buffer
void enqueue(ThreadData *td, unsigned int buffer, unsigned int which) {
QUEUE *item = (QUEUE *) malloc(sizeof(QUEUE));
item->td = td;
item->buffer = buffer;
item->next = 0;
QUEUE **head = 0;
QUEUE **tail = 0;
if (which) {
head = &q_empty_head;
tail = &q_empty_tail;
} else {
head = &q_done_head;
tail = &q_done_tail;
}
// printf("INSIDE ENQ, head %d tail %d\n", *head, *tail);
if (!*head) {
*head = item;
*tail = item;
} else {
(*tail)->next = item;
*tail = item;
}
}
void printq(unsigned int which) {
QUEUE *head;
if (which) head = q_empty_head;
else head = q_done_head;
// printf("QUEUE %d head %d\n", which, head);
while (head) {
printf("QUEUE %d has id %d buffer %d\n", which, head->td->id, head->buffer);
head = head->next;
}
}
// which corresponds to either 0 (done) or 1 (empty) buffer
QUEUE *deq(unsigned int which) {
QUEUE **head = 0;
if (which) head = &q_empty_head;
else head = &q_done_head;
if (!*head) return 0;
QUEUE *temp = *head;
*head = (*head)->next;
return temp;
}
// which corresponds to either 0 (done) or 1 (empty) buffer
void cutnpaste_q(QUEUE **dest, unsigned int which) {
QUEUE **head = 0;
if (which) {
head = &q_empty_head;
} else {
head = &q_done_head;
}
if (!*head) {
*dest = 0;
return;
}
*dest = *head;
*head = 0;
}
/* This is the viterbi score matrix for this sequence. */
double **dmatrix(int num_col) {
int i;
double **m = calloc(NUM_STATE, sizeof(double *));
if (!m) {
fprintf(stderr, "%s\n", "ERROR: Allocation failure for points to rows in dmatrix()");
exit(EXIT_FAILURE);
}
for (i=0; i < NUM_STATE; i++) {
m[i] = calloc(num_col, sizeof(double));
if (!m[i]) {
fprintf(stderr, "%s %d %s\n", "ERROR: Allocation failure for the row ", i, " in dmatrix()");
exit(EXIT_FAILURE);
}
}
return m;
}
/* NUM_STATE = NUM_STATE. This is the viterbi score matrix for this sequence. */
int **imatrix(int num_col) {
int i;
int **m = calloc(NUM_STATE, sizeof(int *));
if (!m) {
fprintf(stderr, "%s\n", "ERROR: Allocation failure for points to rows in imatrix()");
exit(EXIT_FAILURE);
}
for (i = 0; i < NUM_STATE; i++) {
m[i] = calloc(num_col, sizeof(int));
if (!m[i]) {
fprintf(stderr, "%s %d %s\n", "ERROR: Allocation failure for the row ", i ," in imatrix()");
exit(EXIT_FAILURE);
}
}
return m;
}
int *ivector(int nh) {
int *v = calloc(nh, sizeof(int));
if (!v) {
fprintf(stderr, "%s\n", "ERROR: Allocation failure in ivector()");
exit(EXIT_FAILURE);
}
return v;
}
void free_dmatrix(double **m) {
int i;
for (i=NUM_STATE-1; i>=0; i--) {
free(m[i]);
m[i] = NULL;
}
free(m);
m = NULL;
}
void free_imatrix(int **m) {
int i;
for (i=NUM_STATE-1; i>=0; i--) {
free(m[i]);
m[i] = NULL;
}
free(m);
m = NULL;
}
HMM_StateTransition hmm_state_transition_parse (const char *tr) {
if (strcmp(tr, "MM") == 0)
return TR_MM;
if (strcmp(tr, "MI") == 0)
return TR_MI;
if (strcmp(tr, "MD") == 0)
return TR_MD;
if (strcmp(tr, "II") == 0)
return TR_II;
if (strcmp(tr, "IM") == 0)
return TR_IM;
if (strcmp(tr, "DD") == 0)
return TR_DD;
if (strcmp(tr, "DM") == 0)
return TR_DM;
if (strcmp(tr, "GE") == 0)
return TR_GE;
if (strcmp(tr, "GG") == 0)
return TR_GG;
if (strcmp(tr, "ER") == 0)
return TR_ER;
if (strcmp(tr, "RS") == 0)
return TR_RS;
if (strcmp(tr, "RR") == 0)
return TR_RR;
if (strcmp(tr, "ES") == 0)
return TR_ES;
if (strcmp(tr, "ES1") == 0)
return TR_ES;
return -1;
}
Nucleotide nucleotide_parse (char nt) {
if (nt == 'A' || nt == 'a')
return NUCL_A;
if (nt == 'C' || nt == 'c')
return NUCL_C;
if (nt == 'G' || nt == 'g')
return NUCL_G;
if (nt == 'T' || nt == 't')
return NUCL_T;
return NUCL_INVALID;
}
Nucleotide nucleotide_complement (Nucleotide nt) {
if (nt == NUCL_INVALID)
return NUCL_INVALID;
return 3 - nt;
}
int trinucleotide (Nucleotide a, Nucleotide b, Nucleotide c) {
if (c == NUCL_INVALID)
return 0;
if (b == NUCL_INVALID)
return c;
if (a == NUCL_INVALID)
return (b << 2) + c;
return (a << 4) + (b << 2) + c;
}
static int trinucleotide_pep (Nucleotide a, Nucleotide b, Nucleotide c) {
if (a == NUCL_INVALID || b == NUCL_INVALID || c == NUCL_INVALID)
return 64;
return (a << 4) + (b << 2) + c;
}
// Calculates the reverse complement of the bases in `dna` and saves it in `reverse_complement`
void get_rc_dna(const Nucleotide dna[], int dna_len, char *reverse_complement) {
static char NUCL_CHARACTERS_RC[] = "TGCAN";
int i;
for (i = 0; i < dna_len; i++)
reverse_complement[dna_len-i-1] = NUCL_CHARACTERS_RC[dna[i]];
}
void get_protein(const Nucleotide dna[], int dna_len, char *protein, Strand strand, const char *translation_table, const char *translation_table_rc) {
int i;
/* If our DNA sequence is not a multiple of 3, throw away the last nucleotides */
dna_len -= dna_len % 3;
if (strand == FORWARD_STRAND) {
for (i = 0; i < dna_len; i += 3)
protein[i/3] = translation_table[trinucleotide_pep(dna[i], dna[i+1], dna[i+2])];
} else {
for (i = 0; i < dna_len; i += 3)
protein[(dna_len-i)/3-1] = translation_table_rc[trinucleotide_pep(dna[i], dna[i+1], dna[i+2])];
}
/* Don't forget the string terminator */
protein[(dna_len/3)] = '\0';
}
void print_usage() {
puts("USAGE: FGS++ -s [seq_file_name] -c [chunk_size] -o [output_file_name] -w [1 or 0] -t [train_file_name] -p [thread_num] -e [1 or 0] -d [1 or 0] ");
puts("EXAMPLE USAGE: FGS++ -s example/NC_000913-454.fna -o output -w 0 -t 454_5 -p 16 ");
puts("MINIMAL USAGE: FGS++ -s [seq_file_name] -o [output_file_name] -w [1 or 0] -t [train_file_name] ");
puts("INFO: FragGeneScan++ will only output the amino acid files by default. To obtain the meta information set -e 1 and for the DNA files set -d 1\n");
puts(" Mandatory parameters:");
puts(" -s [seq_file_name]: sequence file name including the full path");
puts(" for standard input specify as stdin");
puts(" -o [output_file_name]: output file name including the full path");
puts(" for standard output specify as stdout");
puts(" -w [1 or 0]: 1 if the sequence file has complete genomic sequences");
puts(" 0 if the sequence file has short sequence reads");
puts(" -t [train_file_name]: file name that contains model parameters; this file should be in the -r directory");
puts(" Note that four files containing model parameters already exist in the \"train\" directory");
puts(" [complete] for complete genomic sequences or short sequence reads without sequencing error");
puts(" [sanger_5] for Sanger sequencing reads with about 0.5% error rate");
puts(" [sanger_10] for Sanger sequencing reads with about 1% error rate");
puts(" [454_5] for 454 pyrosequencing reads with about 0.5% error rate");
puts(" [454_10] for 454 pyrosequencing reads with about 1% error rate");
puts(" [454_30] for 454 pyrosequencing reads with about 3% error rate");
puts(" [illumina_1] for Illumina sequencing reads with about 0.1% error rate");
puts(" [illumina_5] for Illumina sequencing reads with about 0.5% error rate");
puts(" [illumina_10] for Illumina sequencing reads with about 1% error rate\n");
puts(" Optional flags");
puts(" -r [train_file_dir]: Full path of the directory containing the training model files.");
puts(" -p [thread_num] The number of threads used by FragGeneScan++; default is 1 thread.");
puts(" -e [1 or 0] Output metadata for sequences.");
puts(" -d [1 or 0] Output DNA file.");
puts(" -c [chunk_size] Number of sequences in a chunk, scales speed and memory usage.");
puts(" -x [translation_table] Which translation table to use (default: 11)");
}
void stopMemset(char *ptr, int length) {
int i;
for (i=0; i<length; i++) {
if (ptr[i] == '\0') {
return;
}
ptr[i] = '\0';
}
}