-
Notifications
You must be signed in to change notification settings - Fork 2
/
FlowClus.h
284 lines (265 loc) · 10.5 KB
/
FlowClus.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/*
John M. Gaspar ([email protected])
June 2013 (updated 1/14, 3/14)
Header for FlowClus.c
*/
// constants
// General limits, delimiter strings, and sentinel values used by FlowClus.
#define VERSION "1.1" // version of FlowClus
#define MAX_SIZE 65528 // maximum length (characters) for each line of input
#define HEADER 20 // maximum header length
#define NUC 4 // number of nucleotides (ACGT)
#define MIDPRIM 100 // maximum mid tag - primer length
#define DELIM " \n" // delimiter characters for flow values (space, newline)
#define CSV ",\t" // delimiter characters for primer - mid tag file (comma, tab)
#define SEP "-" // separator for header information
#define PER "_" // separator for QIIME, Perseus output
#define COM "," // separator for chimera mapping file
// NOTE(review): TAB is literal whitespace, not an escaped '\t' -- confirm the
// width matches what the stdout formatting expects.
#define TAB " " // tab for stdout
#define END -1.0f // sentinel flow value: tag to indicate end of flowgram
// MIN is the largest flow value that is still NOT called as a base.
#define MIN 0.50f // maximum flow value not to call a base (rounding down)
// -- change to 0.49f to round up
// label names in sff.txt file
// The leading space in some labels is part of the string literal itself.
// NOTE(review): presumably these must match the sff.txt text exactly --
// confirm against the parser in FlowClus.c before editing any of them.
#define NUMRE " # of Reads" // label for the read count
#define NUMFL " # of Flows" // label for the flow count
#define CHARS " Flow Chars" // label for the flow characters
#define CQL " Clip Qual Left" // label for the left quality clip
#define CQR " Clip Qual Right" // label for the right quality clip
#define FLOWG "Flowgram" // label for the flowgram values
#define FLOWI "Flow Indexes" // label for the flow indexes
#define BASE "Bases" // label for the bases
#define QUAL "Quality Scores" // label for the quality scores
#define COL ":" // colon; presumably separates a label from its value -- TODO confirm
// label names in master file
// NOTE(review): presumably the keywords that tag each record of the master
// file (see MASTERFILE, "-m") -- confirm against the master-file reader.
#define PRIMER "primer" // label for a primer entry
#define MIDTAG "midtag" // label for a mid tag entry
#define REVERSE "reverse" // label for a reverse primer entry
// default values
// Fallback file names, file extensions, and limits.
// NOTE(review): presumably each applies when the matching command-line
// option is not supplied -- confirm in the argument parser.
#define DEFFLOWEXT ".flow" // default file extension for cleaned flowgrams
#define DEFFEXT ".den" // default file extension for denoised flowgrams
#define DEFMEXT ".map" // default file extension for mapping files
#define DEFCHEXT ".chfasta" // default file extension for output files for UCHIME
#define DEFCHMAP ".chmap" // default file extension for mapping files for UCHIME
#define DEFSDFILE "stddev.txt" // default input file containing standard deviations
#define DEFMAXFLOW 19.99f // default maximum flow value
#define DEFDENFILE "denoised.fasta" // default output file for denoised fasta
/***** command-line options *****/
// Each macro below is the literal flag string for one option.
// Things to keep in mind when adding or changing a flag:
//  - flags are case-sensitive ("-l" vs "-L", "-n" vs "-N", "-g" vs "-G");
//  - several flags are prefixes of others ("-a"/"-ab", "-c"/"-cv"/"-ch",
//    "-v"/"-vf"/"-vm", "-l"/"-lf", "-r"/"-rq").
// NOTE(review): presumably flags are compared as whole strings; prefix
// matching would be ambiguous -- confirm in the argument parser.
#define HELP "-h" // print usage
#define STATUSOPT "-st" // prints status updates while running
// analysis options
#define CLEANOPT "-a" // option to clean only (eliminate, truncate)
#define DENOPT "-b" // option to denoise only
#define BOTHOPT "-ab" // option to do both cleaning and denoising (default)
// input/output files
#define MASTERFILE "-m" // input master file, containing primer and mid tag sequences
#define SFFFILE "-i" // input sff.txt file -- required if filtering
#define OUTFILE "-e" // output fasta file after filtering
#define DENFASTA "-o" // output fasta file after denoising
#define NOMIDOPT "-x" // option to produce "QIIME-style" output fasta file(s)
// (no mid tag - primer sequences)
#define FLOWEXT "-f" // file extension for filtered flowgrams
#define DENPOPT "-v" // option to produce consensus flowgram and mapping files
#define DENFEXT "-vf" // file extension for denoised flowgrams
#define DENMEXT "-vm" // file extension for mapping files after denoising
#define ERRFILE "-c" // output file for filtering counts
#define FILFILE "-cv" // output file for detailed filtering information
#define MISSFILE "-d" // output file for misses
#define SDFILE "-sd" // input file containing standard deviations for each flow value
#define CHIMOPT "-ch" // option to produce output fasta files for de novo chimera-checking
#define UCHEXT "-cu" // file extension for UCHIME output fasta files
#define UMAPEXT "-cm" // file extension for chimera-checking mapping files
#define PEREXT "-cp" // file extension for Perseus output fasta files
// sequence analysis
#define MIDMIS "-em" // number of mismatches to mid tag sequence to allow
#define PRIMMIS "-ep" // number of mismatches to primer sequence to allow
#define MINSLEN "-l" // minimum sequence length
#define MAXSLEN "-L" // maximum sequence length
#define MAXTRLEN "-t" // maximum length at which to truncate a sequence
#define MAXAMBIG "-N" // maximum number of ambiguous bases
#define OKAMBIG "-n" // number of ambiguous bases to allow before truncating
#define MAXHOMO "-G" // maximum homopolymer length to allow
#define OKHOMO "-g" // maximum homopolymer length to allow before truncating
#define REVMOPT "-r" // option to remove opposite primer if it's found
#define REVQOPT "-rq" // option to require opposite primer in a read
#define REVMIS "-er" // number of mismatches to reverse primer sequence to allow
// quality-score analysis
#define AVGQUAL "-s" // average quality score
#define WINDOWLEN "-wl" // length for sliding window of quality scores
#define WINDOWAVG "-wq" // average quality score for sliding window
#define WINDOWOPT "-wx" // option to throw out a read if there is a bad window
// flowgram analysis
#define MAXFLOW "-u" // maximum flow value -- larger values will be changed to this
#define MINFLEN "-lf" // minimum flowgram length
#define MAXFLEN "-Lf" // maximum flowgram length
#define MININT "-p" // minimum flow value of interval
#define MAXINT "-q" // maximum flow value of interval
#define MAXNVAL "-z" // maximum flow value to truncate at
#define NOFLOW "-y" // truncate flowgram if this flow value is not
// reached for 4 consecutive flows
// denoising options
#define CINTER "-j" // constant value for denoising
#define ZINTER "-k" // number of std devs for denoising
#define TRIEOPT "-tr" // option to denoise using a trie
// error messages
// Every error has an integer code (ERRxxx) paired with a message string
// (MERRxxx). The codes run consecutively from 0 (ERRMEM) to 31 (ERRCARR);
// when adding an error, take the next free number and define both macros.
// NOTE(review): messages that begin with ':' look like suffixes meant to
// follow a context string (file name, option, value) -- confirm in the
// error-reporting routine of FlowClus.c.
#define ERRMEM 0
#define MERRMEM "Cannot alloc memory"
#define ERRCLOSE 1
#define MERRCLOSE "Cannot close file"
#define ERRPARAM 2
#define MERRPARAM ": cannot find value in sff.txt"
#define ERRFLOAT 3
#define MERRFLOAT ": cannot convert to float"
#define ERRINT 4
#define MERRINT ": cannot convert to int"
#define ERROPENR 5
#define MERROPENR ": cannot open file for reading"
#define ERROPENW 6
#define MERROPENW ": cannot open file for writing"
#define ERRINVAL 7
#define MERRINVAL ": invalid parameter or usage"
#define ERREXIST 8
#define MERREXIST ": file already exists"
#define ERRLOAD 9
#define MERRLOAD "Cannot load indexes or quality scores"
#define ERRNEED 10
#define MERRNEED ": missing required file"
#define ERRSLEN 11
#define MERRSLEN "Invalid min/max sequence length"
#define ERRWIND 12
#define MERRWIND "Invalid quality window values"
#define ERRINTER 13
#define MERRINTER "Invalid min/max interval flow values"
#define ERRFLEN 14
#define MERRFLEN "Invalid min/max flowgram length"
#define ERRMAXFL 15
#define MERRMAXFL "Invalid max noisy flow value"
#define ERRSDVAL 16
#define MERRSDVAL "Cannot load distance value for denoising"
#define ERRDEN 17
#define MERRDEN "Must specify either constant value or factor for denoising"
#define ERRDENVAL 18
#define MERRDENVAL "Invalid denoising value"
#define ERRPRIM 19
#define MERRPRIM "Invalid base in primer"
#define ERRPREP 20
#define MERRPREP ": cannot repeat primer name"
#define ERRMREP 21
#define MERRMREP ": cannot repeat midtag name within a primer"
#define ERRNFLOWS 22
#define MERRNFLOWS "Invalid header information in flowgram file"
#define ERRHEAD 23
#define MERRHEAD "Invalid read header"
#define ERRORDER 24
#define MERRORDER "Invalid flow order"
#define ERRMID 25
#define MERRMID ": cannot find mid tag"
#define ERRREV 26
#define MERRREV "Cannot specify both remove and require reverse primer"
#define ERRNOREV 27
#define MERRNOREV ": no reverse primer specified"
#define ERRMISM 28
#define MERRMISM "Invalid number of primer mismatches"
#define ERRMAXF 29
#define MERRMAXF "Invalid absolute maximum flow value"
#define ERRLEN 30
#define MERRLEN ": length exceeds maximum"
#define ERRCARR 31
#define MERRCARR "Remove carriage returns from master file"
// Message with no numeric code of its own.
// NOTE(review): presumably the fallback for an unrecognized code -- confirm.
#define UNKNOWN "Unknown error"
// elimination/truncation criteria
// Outcome designations: ELIM and TRUNC are the two values counted by ETCAT.
// NOTE(review): presumably they index per-outcome count arrays -- confirm.
#define ETCAT 2 // number of designations -- elim or trunc
#define ELIM 0
#define TRUNC 1
#define SELIM "Reads eliminated" // text label for ELIM
#define STRUNC "Reads truncated" // text label for TRUNC
#define COUNT "Reads analyzed"
// MATCH/PRINT: a second pair of 0/1 values with their text labels.
#define MATCH 0
#define PRINT 1
#define SMATCH "Mid-primer matches" // text label for MATCH
#define SPRINT "Reads printed" // text label for PRINT
// Filtering criteria: each Exxx code (1-15) has a Dxxx description string.
// ERRCAT (16) equals NOERR (0) plus the 15 criteria, so it must be raised
// whenever a criterion is added.
#define ERRCAT 16 // number of categories
#define NOERR 0
#define EMINSLEN 1
#define DMINSLEN "Min. sequence length"
#define EMAXSLEN 2
#define DMAXSLEN "Max. sequence length for elimination"
#define EMAXTRLEN 3
#define DMAXTRLEN "Max. sequence length for truncation"
#define EMAXAMBIG 4
#define DMAXAMBIG "Max. ambiguous bases allowed"
#define EOKAMBIG 5
#define DOKAMBIG "Max. ambiguous bases allowed before truncation"
#define EMAXHOMO 6
#define DMAXHOMO "Max. homopolymer length allowed"
#define EOKHOMO 7
#define DOKHOMO "Max. homopolymer length allowed before truncation"
#define EREVERSE 8
#define DREVERSE "Reverse primer removed"
#define EAVGQUAL 9
#define DAVGQUAL "Min. average quality score"
#define EWINDOW 10
#define DWINDOW "Min. window quality score"
#define EMINFLEN 11
#define DMINFLEN "Min. flowgram length"
#define EMAXFLEN 12
#define DMAXFLEN "Max. flowgram length"
#define EFLOWINT 13
#define DFLOWINT "Noisy flow interval"
#define EMAXNVAL 14
#define DMAXNVAL "Max. flow value"
#define ENOFLOW 15
#define DNOFLOW "Four consecutive flows below min."
// Report text.
// NOTE(review): FILHEAD is a tab-separated column header line, presumably
// for the detailed filtering file (FILFILE, "-cv") -- confirm.
#define NA "n/a"
#define TOTAL "Total\n"
#define FILHEAD "Read\tSample\tPrimer\tOutcome\tCriterion\tLength before\tLength after\n"
#define NEITHER "Passed" // outcome text: neither eliminated nor truncated
#define ELIML "Eliminated" // outcome text
#define TRUNCL "Truncated" // outcome text
// structs
typedef struct read Read;

// A single read. Each Read carries a `next` pointer to another Read, so
// reads can be chained one after another.
struct read {
  int length;          // presumably the number of values in `flow` -- TODO confirm
  int start;           // presumably the first flow position of interest -- TODO confirm
  float *flow;         // flow values of the read
  char *header;        // header text of the read
  struct midtag *mid;  // mid tag this read is associated with
  Read *next;          // following read in the chain
};
typedef struct cluster Cluster;

// A cluster of reads. Each Cluster carries a `next` pointer to another
// Cluster, so clusters can be chained one after another.
struct cluster {
  float *flows;   // flow values kept for the cluster
  int *weight;    // presumably one weight per entry of `flows` -- TODO confirm
  Read *first;    // first read belonging to the cluster
  Read *lon;      // longest read
  Cluster *next;  // following cluster in the chain
};
typedef struct midtag Midtag;

// A mid tag: name and sequence, a pointer to the primer it is tied to, and a
// `next` pointer so mid tags can be chained one after another.
struct midtag {
  char *name;           // mid tag name
  char *seq;            // mid tag sequence
  int num;              // presumably a per-mid-tag counter -- TODO confirm
  Midtag *next;         // following mid tag in the chain
  struct primer *prim;  // primer this mid tag is tied to
};
typedef struct node Node;

// A node that links to both a `next` node and a `child` node.
// NOTE(review): presumably a node of the trie used for denoising -- confirm
// in FlowClus.c.
struct node {
  float *flow;  // flow values held by this node
  int st;       // presumably the first flow position covered -- TODO confirm
  int end;      // presumably the last flow position covered -- TODO confirm
  int num;      // presumably a count kept for this node -- TODO confirm
  Read *first;  // first read attached to this node
  Node *next;   // `next` link to another node
  Node *child;  // `child` link to another node
};
typedef struct primer Primer;

// A primer: its name and sequences, its mid tags, its file streams, and the
// clusters, reads, and node tree tied to it. Each Primer carries a `next`
// pointer so primers can be chained one after another.
// NOTE(review): FILE comes from <stdio.h>, which this header does not
// include itself -- the including file must include it first.
struct primer {
  char *name;     // primer name
  char *seq;      // primer sequence
  char *rev;      // reverse primer sequence
  Midtag *first;  // first mid tag tied to this primer

  // File streams. NOTE(review): presumably filtered fasta (out), denoised
  // flowgrams (den), mapping (map), Perseus (per), and chimera mapping
  // (cmap) -- confirm where the files are opened.
  FILE *out;
  FILE *den;
  FILE *map;
  FILE *per;
  FILE *cmap;

  Primer *next;   // following primer in the chain
  Cluster *head;  // head of this primer's clusters
  Cluster *tail;  // tail of this primer's clusters
  Read *dummy;    // presumably a placeholder read heading a chain -- TODO confirm
  Read *prev;     // presumably the most recently handled read -- TODO confirm
  Node *root;     // root node for this primer
};