-
Notifications
You must be signed in to change notification settings - Fork 1
/
CPT_GFFParser.py
637 lines (584 loc) · 27 KB
/
CPT_GFFParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
# Copyright 2020-2021, Anthony Criscione
# Developed for the Center for Phage Technology, Texas A&M University
#
# Distributed under the BSD 3-Clause License, see included LICENSE file
from Bio import SeqIO, SeqFeature
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from Bio.Seq import Seq
from collections import OrderedDict
from Bio.SeqRecord import SeqRecord
from CPT_ParserHelpers import *
from gffSeqFeature import *
import sys
#Try/Except blocks used for limited python 2.7 compatibility. Python3 spec is within the try block
try:
import urllib.parse
except:
import urllib
def lineAnalysis(line, codingTypes = ["CDS"]):
IDName = ""
startLoc = -1
endLoc = -1
scoreIn = 0.0
if len(line.strip()) == 0 or line.strip() == "\n":
return None, None, None
if line[0] == "#":
if len(line) > 2 and line[1] == "#":
return None, line[2:-1], None
# else handle ## Pragmas
else:
return None, None, None
errorMessage = ""
fields = line.split("\t")
if len(fields) != 9:
errorMessage += "GFF3 is a 9-column tab-separated format, line has %d columns.\n" % (len(fields))
if len(fields) > 9:
errorMessage += "Possible unescaped tab in a qualifier field.\n"
return errorMessage, None, None
for x in range(0, len(fields)):
if fields[x] == "":
errorMessage += "Field #%d is empty. Please supply correct or default value.\n" % (x + 1)
if errorMessage != "":
return errorMessage, None, None
idEval = validateID(fields[0])
if len(idEval) != 0:
errorMessage += "Organism ID contains the following invalid characters: %s\n" % (idEval)
# fields[1]
# fields[2]
# fields[3]
if fields[3][0] in "<>":
uncert = 1
fields[3] = (fields[3][1:]).strip()
else:
uncert = 0
fields[3] = fields[3].strip()
if isNum(fields[3]):
startLoc = int(fields[3])
else:
errorMessage += "Feature location start contains non-numeric character.\n"
# fields[4]
if fields[4][0] in "<>":
uncert = 1
fields[4] = (fields[4][1:]).strip()
else:
uncert = 0
fields[4] = fields[4].strip()
if isNum(fields[4]):
endLoc = int(fields[4])
else:
errorMessage += "Feature location end contains non-numeric character.\n"
locScrutiny = False
if startLoc >= 0 and endLoc >= 0 and endLoc < startLoc:
locScrutiny = True # Check for circular when we do qualifiers
# errorMessage += "Feature Location end is less than start (GFF spec requires all features, regardless of strand, to have the lower number as the start). If your feature is circular, be sure to add Is_circular=true; to your qualifiers list.\n"
# fields[5]
if fields[5] != ".":
try:
scoreIn = float(fields[5])
except:
scoreIn = 0.0
errorMessage += "Score field could not be interpreted as a floating-point (real) number. Ensure notation is correct.\n"
# fields[6]
if fields[6] not in ['-', '+', '.', '?']:
errorMessage += "Feature strand must be '+', '-', '.', or '?', actual value is '%s'.\n" % (fields[6])
# fields[7]
if fields[7] not in ['.', '0', '1', '2']:
errorMessage += "Expected 0, 1, 2, or . for Phase field value, actual value is '%s'.\n" % (fields[7])
elif fields[7] =='.' and fields[1] in codingTypes:
errorMessage += "Expected 0, 1, or 2 in Phase field for %s-type feature, actual value is '%s'.\n" % (fields[1], fields[7])
if fields[7] == '.':
phaseIn = 0
else:
phaseIn = int(fields[7])
# fields[8]
keyName = ""
valNames = [""]
valInd = 0
parseMode = 0
qualDict = {}
contCounter = 0
for x in range(0, len(fields[8])):
currChar = fields[8][x]
if contCounter:
contCounter += -1
continue
if parseMode == 0:
if not (currChar in "=,;%\n"):
keyName += currChar
elif currChar == "=":
if len(keyName) == 0:
errorMessage += "No ID name supplied for a value in the qualifiers field, aborting.\n"
break
parseMode = 1
continue
elif currChar == "%" and (fields[8][x+1] in encoders) and (fields[8][x+2] in encoders):
try:
keyName += urllib.parse.unquote(fields[8][x:x+3])
except:
keyName += urllib.unquote(fields[8][x:x+3])
contCounter = 2
else: #Encode special char
encoded = str(hex(ord(currChar)))
keyName += "%" + encoded[2:].upper()
elif parseMode == 1:
if not (currChar in "=,;%\n"):
valNames[valInd] += currChar
elif currChar == ",":
valInd += 1
valNames.append("")
elif currChar == "=":
valNames[valInd] += "%3D"
elif currChar == "%" and (fields[8][x+1] in encoders) and (fields[8][x+2] in encoders):
try:
valNames[valInd] += urllib.parse.unquote(fields[8][x:x+3])
except:
valNames[valInd] += urllib.unquote(fields[8][x:x+3])
contCounter = 2
elif currChar == "\n":
parseMode = 2
else:
if x == len(fields[8]) - 2: # Assume if last char in fields[8] is a semicolon, then just the end of qualifier
parseMode = 2
elif encodeFromLookahead(fields[8][x+1:]):
valNames[valInd] += "%3B"
continue
else:
parseMode = 2
if parseMode == 2: # Do not elif, this is used as a wrapup for each qualifier and we want it checked if parsemode == 1 incremented itself
if keyName not in qualDict.keys():
qualDict[keyName] = valNames
else:
for x in valNames:
qualDict[keyName].append(x)
keyName = ""
valNames = [""]
valInd = 0
parseMode = 0
for x in qualDict.keys():
if x == "ID":
#if len(qualDict[x]) > 1:
#errorMessage += "More than one ID supplied for feature.\n"
IDName = qualDict[x][0]
if x == "Is_circular":
if locScrutiny and qualDict[x][0] == "true":
errorMessage += "Feature location end is less than start in this feature marked as circular. For circular features, add the length of your sequence region to your end value to designate the location it wraps back around to.\n"
locScrutiny = False
tmp = endLoc # Swap them so we can still construct the FeatureLocation and check the other features without crashing BioPython
endLoc = startLoc
startLoc = tmp
if locScrutiny:
errorMessage += "Feature location end is less than the start. GFF spec requires start to always be a lesser value than end, regardless of strand.\nIf your feature is circular, add the length of your sequence region to your end value to designate the location it wraps back around to.\n"
tmp = endLoc # Swap them so we can still construct the FeatureLocation and check the other features without crashing BioPython
endLoc = startLoc
startLoc = tmp
if startLoc == -1 or endLoc == -1 or (not(fields[6] in '-+.?')):
errorMessage += "Unable to construct feature location, aborting.\n"
elif fields[6] == '+':
featLoc = FeatureLocation(startLoc - 1, endLoc, strand = +1)
elif fields[6] == '-':
featLoc = FeatureLocation(startLoc - 1, endLoc, strand = -1)
else:
featLoc = FeatureLocation(startLoc - 1, endLoc, strand = 0)
if "Parent" in qualDict.keys():
for x in qualDict["Parent"]:
if x == IDName:
errorMessage += "Feature lists itself as a sub_feature, cycles/loops not permitted in GFF format.\n"
if errorMessage != "":
return errorMessage, None, None
return None, fields[0], gffSeqFeature(featLoc, fields[2], '', featLoc.strand, IDName, qualDict, None, None, None, phaseIn, scoreIn, fields[1])
def gffParse(gff3In, base_dict = {}, outStream = sys.stderr, codingTypes=["CDS"], metaTypes = ["remark"], suppressMeta = 2, pragmaPriority = True, pragmaOverridePriority = True):
# gff3In --- source file
# base_dict --- file with additional SeqRecord information. Keys are OrganismIDs and values are SeqRecords.
# For BCBio backwards compatibility.
# outStream --- output filestream or stringstream
# codingTypes --- list of feature types where a non-'.' phase value is expected, passed along to lineAnalysis
# metaTypes --- list of metadata feature types. Features of this type will be affected by the remaining arguments
# suppressMeta --- Suppress metadata fields. Int, where 0 == no suppression, all metadata from features and pragmas
# will be read and output to the SeqRecord as .annotation entries.
# 1 == As above, but metadata features will not be entered into the SeqRecord's feature list
# after their metadata is recorded and entered into the SeqRecord.annotation
# 2 == Total suppression, no metadata features will even be processed, and no pragmas except
# those related to sequence length (##FASTA and ##sequence-region) and ##gff-version (required
# by GFF spec) will be utilized.
# pragmaPriority --- In cases where pragmas and metadata features disagree/conflict, pragmas will take precedence for creating
# SeqRecord.annotation value if true, else the feature will.
# pragmaOverridePriority --- Similar to above, in the event of a conflict between metadata features and pragmas, the pragma's
# value will override the metadata gffSeqFeature.qualifier value with the pragma's own. This will force
# the metadata and pragmas to sync, and avoid future discrepancies. Should only be used with pragmaPrority
if hasattr(gff3In, 'readlines') and callable(getattr(gff3In, 'readlines')):
gff3In = gff3In.readlines()
fastaDirective = False # Once true, must assume remainder of file is a FASTA, per spec
errOut = ""
warnOut = ""
lineInd = 0
pragmaAnnotesDict = {} # Annotations dictionaries, one for ones derived via pragmas and another for ones derived from meta features
metaAnnotesDict = {} # Keys are annotation title, values are a list of lists, where the first entry in a list is the value, and
# the rest the orgIDs which correspond to the value
orgDict = {} # Dictionary of orgIDs, with a list of their features (Top-level or otherwise) as its value
finalOrg = {}
seekParentDict = {} # Dictionary of orgIDs, with a list of indexes in the equivalent orgDict list which are subfeatures
seqDict = {}
regionDict = {} # Dictionary of max length for records (For handling circular and bound checking)
# Values are (maxVal, status), where status is 0 if maxVal is derived from features, 1 if derived from sequence-region
# pragma (Enforce this boundary), or -1 if derived from a feature with is_circular (Unable to construct record)
currFastaKey = ""
if pragmaPriority: # In cases where pragmas and meta-features conflict, such as sequence-region
pragBit = 1 # pragmas will take priority
annoteBit = 0
else: # Else meta features will
pragBit = 0 # Split into ints to cleanly slot into regionDict format
annoteBit = 1
for line in gff3In:
lineInd += 1
err = None
prag = None
res = None
### FASTA Pragma Handling
if line[0] == ">": # For compatibility with Artemis-style GFF
fastaDirective = True
if not fastaDirective:
err, prag, res = lineAnalysis(line, codingTypes)
else:
if line[0] == ">":
currFastaKey = line[1:-1]
if currFastaKey not in seqDict.keys():
seqDict[currFastaKey] = ""
elif line[0] == "#":
continue
elif line:
seqDict[currFastaKey] += (line[:-1]).strip()
continue
### Error message construction
if err:
errOut += "Line %d: %s\n" % (lineInd, err)
### Pragma handling
if prag and not res:
prag = prag.split(" ")
if prag[0] == "FASTA":
fastaDirective = True
elif prag[0] == "sequence-region":
if prag[1] not in regionDict.keys():
regionDict[prag[1]] = (int(prag[2]) - 1, int(prag[3]), pragBit)
elif pragBit > regionDict[prag[1]][2]:
regionDict[prag[1]] = (int(prag[2]) - 1, int(prag[3]), pragBit)
elif prag[0] == "#":
orgDict, resolveErr = resolveParent(orgDict, seekParentDict)
if resolveErr:
errOut += resolveErr
finalOrg = rAddDict(finalOrg, orgDict)
seekParentDict = {}
orgDict = {}
elif suppressMeta < 2:
if prag[0] in pragmaAnnotesDict.keys():
dictVal = " ".join(prag[1:])
pragmaAnnotesDict[prag[0]].append([dictVal])
else:
dictVal = " ".join(prag[1:])
pragmaAnnotesDict[prag[0]] = [[dictVal]]
### Feature Handling
if res:
if suppressMeta == 2 and res.type in metaTypes:
continue
## First time encountering orgID
if prag not in orgDict.keys():
orgDict[prag] = [res]
seekParentDict[prag] = []
possSeq = base_dict.get(prag, None)
# Process base_dict
# .seq priority is: ##FASTA directives will always define sequence-region and seq if present (done further down)
# base_dict is next, and will also accept an empty seq, so take care with what's passed in this field
# Finally, parser will infer an None seq from either ##sequence-region pragma or the 'last' feature,
# depending on arguments passed to parser.
if isinstance(possSeq, SeqRecord):
if possSeq.seq == None:
seqDict[prag] = ""
else:
seqDict[prag] = str(possSeq.seq)
else:
seqDict[prag] = ""
for x in pragmaAnnotesDict.keys():
if prag in pragmaAnnotesDict[x][-1]:
continue
pragmaAnnotesDict[x][-1].append(prag)
if prag not in regionDict.keys() or (prag in regionDict.keys() and regionDict[prag][-1] != 1):
if res.qualifiers.get("sequence-region") and res.type in metaTypes:
fields = res.qualifiers["sequence-region"][0].split(" ")
regStr = writeMetaQuals({"sequence-region": res.qualifiers["sequence-region"][0]})
regStr = regStr[regStr.find(" ") + 1:-1]
fields = regStr.split(" ")
regionDict[prag] = (int(fields[1]) - 1, int(fields[2]), annoteBit)
elif res.qualifiers.get("is_circular") == ['true'] or res.qualifiers.get("is_circular") == ['True'] or res.qualifiers.get("is_circular") == ['TRUE']:
regionDict[prag] = (0, int(res.location.end), -1)
else:
regionDict[prag] = (0, int(res.location.end), 0)
if suppressMeta <= 1 and res.type in metaTypes:
qualsToAnnotes(metaAnnotesDict, res, prag)
if suppressMeta == 1:
orgDict[prag] = []
if "Parent" in res.qualifiers.keys():
seekParentDict[prag].append(0)
else: # Check if it's possible to resolve as a CompoundLocation feature
if suppressMeta <= 1 and res.type in metaTypes:
qualsToAnnotes(metaAnnotesDict, res, prag)
if suppressMeta == 1:
break
incInd = True
if regionDict[prag][2] < 1:
if res.qualifiers.get("is_circular") == ['True'] and int(res.location.end) > regionDict[prag][1]:
regionDict[prag] = (0, int(res.location.end), -1)
elif int(res.location.end) > regionDict[prag][1]: # Can't just max() or else we'll maybe overwrite -1 status
regionDict[prag] = (0, int(res.location.end), 0)
if res.id:
for x in range(0, len(orgDict[prag])):
if res.id == orgDict[prag][x].id:
if orgDict[prag][x].type != res.type:
errOut += ("Line %d: Duplicate IDs in file but differing types. Cannot assume CompoundFeature/ join location, please resolve type descrepancy or de-duplicate ID %d.\n" % (lineInd, res.id))
orgDict[prag][x].location = orgDict[prag][x].location + res.location
incInd = False
break
# If incInd is still true, then it's a unique feature, append to list
if incInd:
orgDict[prag].append(res)
if "Parent" in res.qualifiers.keys():
seekParentDict[prag].append(len(orgDict[prag])-1)
orgDict, resolveErr = resolveParent(orgDict, seekParentDict)
if resolveErr:
errOut += resolveErr
finalOrg = rAddDict(finalOrg, orgDict)
# All features and pragmas should be read in by now, resolve any outstanding
# annotation or sequence associations
for x in regionDict.keys():
if x in seqDict.keys() and seqDict[x] != "": ## If x not in SeqDict, then a sequence-region pragma was made for organism with no features
regionDict[x] = (0, len(seqDict[x]), 1) # Make FASTA the final arbiter of region if present
for x in regionDict.keys():
if regionDict[x][2] == -1:
errOut += "Organism %s: No sequence-region specified and organism contains a circular feature, unable to infer organism length from features alone.\nPlease add a ##sequence-region START_INTEGER END_INTEGER to this organism" % (x)
for x in finalOrg.keys():
for i in range(0, len(finalOrg[x])):
circ = False
badIDs = []
checkList = [finalOrg[x][i]]
potenBad = []
for j in checkList: # make is_circular retroactively applicable to all features in a tree
potenBad = []
for k in j.sub_features:
checkList.append(k)
if j.qualifiers.get("Is_circular") == ['True'] or j.qualifiers.get("Is_circular") == ['true'] or j.qualifiers.get("Is_circular") == ['TRUE']:
circ = True
if circ == False and (int(j.location.start) < regionDict[x][0] or int(j.location.end) > regionDict[x][1]):
potenBad.append(j.id + " [" + str(j.location.start) + ":" + str(j.location.end) + "]")
if circ == False and potenBad != []:
badIDs = badIDs + potenBad
elif circ == True:
finalOrg[x][i].qualifiers["Is_circular"] = ["true"]
if badIDs != []:
errOut += "Organism %s: The following features fall outside of the specified sequence region: %s.\n" % (x, str(badIDs)[1:-1])
# By this point, all features and pragmas should be processed and resolved
if errOut:
outStream.write(errOut + "\n")
raise Exception("Failed GFF Feature Parsing, error log output to stderr\n")
# Construct a SeqRecord from all processed OrgIDs
res = []
for x in finalOrg.keys():
finalOrgHeirarchy = []
annoteDict = {}
for pragma in pragmaAnnotesDict.keys():
for vals in pragmaAnnotesDict[pragma]:
if x in vals[1:]:
annoteDict[pragma]=vals[0]
break
for pragma in metaAnnotesDict.keys():
for vals in metaAnnotesDict[pragma]:
if x in vals[1:]:
if pragma in annoteDict.keys():
if pragmaOverridePriority == False and pragBit < annoteBit:
annoteDict[pragma]=vals[0]
break
else:
annoteDict[pragma]=vals[0]
for i in finalOrg[x]:
if "Parent" not in i.qualifiers.keys():
finalOrgHeirarchy.append(i)
if i.type in metaTypes:
if pragmaOverridePriority == False:
for key in annoteDict.keys():
if key not in finalOrgHeirarchy[-1].qualifiers.keys():
finalOrgHeirarchy[-1].qualifiers[key] = [annoteDict[key]]
else:
for key in annoteDict.keys():
finalOrgHeirarchy[-1].qualifiers[key] = [annoteDict[key]]
if seqDict[x]:
if x in regionDict.keys():
annoteDict["sequence-region"] = "%s %s %s" % (x, regionDict[x][0] + 1, regionDict[x][1])
if len(seqDict[x]) < regionDict[x][1] - regionDict[x][0]:
seqDict[x] += "?" * (regionDict[x][1] - regionDict[x][0] - len(seqDict[x]))
else:
seqDict[x] = seqDict[x][regionDict[x][0]:regionDict[x][1]]
else:
annoteDict["sequence-region"] = "%s 1 %s" % (x, int(len(seqDict[x])))
seqDict[x] = str(seqDict[x])
elif x in regionDict.keys():
annoteDict["sequence-region"] = "%s %s %s" % (x, regionDict[x][0] + 1, regionDict[x][1])
seqDict[x] = ""
else: # Should actually no longer be reachable
seqDict[x] = ""
if seqDict[x] != "":
res.append(SeqRecord(Seq(seqDict[x]), x, "<unknown name>", "<unknown description>", None, finalOrgHeirarchy, annoteDict, None))
else:
res.append(SeqRecord(Seq(None, length = regionDict[x][1] - regionDict[x][0]), x, "<unknown name>", "<unknown description>", None, finalOrgHeirarchy, annoteDict, None))
return res
def printFeatLine(inFeat, orgName, source = 'feature', score = None, phase = None, outStream = sys.stdout, parents = None, codingTypes = ["CDS"]):
for loc in inFeat.location.parts:
line = orgName + "\t"
if source:
line += source + "\t"
else:
line += ".\t"
line += inFeat.type + "\t"
startStr = str(min(loc.start, loc.end) + 1)
endStr = str(max(loc.start, loc.end))
if startStr[0] == "<":
startStr = startStr[1:]
if endStr[0] == ">":
endStr = endStr[1:]
line += startStr + "\t" + endStr + "\t"
if score:
line += str(score) + "\t"
else:
line += ".\t"
if inFeat.location.strand == 0:
line += ".\t"
elif inFeat.location.strand == 1:
line += "+\t"
elif inFeat.location.strand == -1:
line += "-\t"
else:
line += "?\t"
if inFeat.type in codingTypes:
if phase or phase == 0:
line += str(phase) + "\t"
else:
line += "0\t"
elif phase != 0:
line += str(phase) + "\t"
else:
line += ".\t"
if parents and "Parent" not in inFeat.qualifiers.keys():
if "ID" in parents.qualifiers.keys():
inFeat.qualifiers["Parent"] = parents.qualifiers["ID"]
else:
inFeat.qualifiers["Parent"] = [parents.id]
for qual in inFeat.qualifiers.keys():
for keyChar in str(qual):
if keyChar in "%,=;\n\t":
encoded = str(hex(ord(keyChar)))
line += "%" + encoded[2:].upper()
else:
line += keyChar
line += "="
if type(inFeat.qualifiers[qual]) != list:
inFeat.qualifiers[qual] = [inFeat.qualifiers[qual]]
for ind in range(0, len(inFeat.qualifiers[qual])):
for valChar in str(inFeat.qualifiers[qual][ind]):
if valChar in "%,=;\n\t":
encoded = str(hex(ord(valChar)))
line += "%" + encoded[2:].upper()
else:
line += valChar
if ind < len(inFeat.qualifiers[qual]) - 1:
line += ","
else:
line += ";"
outStream.write(line + "\n")
if type(inFeat) == gffSeqFeature and inFeat.sub_features:
for x in inFeat.sub_features:
printFeatLine(x, orgName, x.source, x.score, x.phase, outStream, inFeat)
def gffWrite(inRec, outStream = sys.stdout, suppressMeta = 1, suppressFasta=True, codingTypes = ["CDS"], metaTypes = ["remark"], validPragmas = None, recPriority = True, createMetaFeat=None):
writeFasta = {}
verOut = "3"
firstRec = True
maxInd = 0
if not inRec:
outStream.write("##gff-version 3\n")
return
if type(inRec) != list:
inRec = [inRec]
for rec in inRec:
for x in rec.features:
if "gffSeqFeature" not in str(type(x)):
rec = convertSeqRec(rec, defaultSource = "gffSeqFeature", deriveSeqRegion = False)[0]
break
writeFasta[rec.id] = False
try:
if len(rec.seq) > 0:
writeFasta[rec.id] = True
rec.seq[0] # Check for unknown seq
except:
writeFasta[rec.id] = False
seenList = []
outList = {}
if suppressMeta < 2:
outList = rec.annotations
#outStr = writeMetaQuals(rec.annotations)
#outStream.write(outStr)
metaFeats = []
for feat in rec.features:
if feat.type in metaTypes:
metaFeats.append(feat)
maxInd = max(maxInd, feat.location.end)
for feat in metaFeats:
for x in feat.qualifiers.keys():
if recPriority == False or x not in outList.keys():
outList[x] = " ".join(feat.qualifiers[x])
if "gff-version" in outList.keys() and outList["gff-version"] != verOut:
verOut = outList["gff-version"]
outStream.write("##gff-version %s\n" % verOut)
firstRec = False
elif firstRec:
outStream.write("##gff-version %s\n" % verOut)
firstRec = False
if "sequence-region" in outList.keys():
fields = outList["sequence-region"].split(" ")
if int(fields[2]) < maxInd:
fields[2] = str(int(maxInd))
outList["sequence-region"] = " ".join(fields)
if validPragmas == None:
outStr = writeMetaQuals(outList)
else:
whiteList = {}
for x in outList.keys():
if x in validPragmas:
whiteList[x] = outList[x]
outStr = writeMetaQuals(whiteList)
outList = whiteList
outStream.write(outStr)
elif firstRec:
outStream.write("##gff-version 3\n")
foundMeta = False
if createMetaFeat != None:
for key in outList.keys(): # Change to GFF format qualifier dict
outList[key] = [outList[key]]
for feat in rec.features:
if feat.type in metaTypes:
foundMeta = True
for key in outList.keys():
if recPriority or key not in feat.qualifiers.keys():
feat.qualifiers[key] = outList[key]
break
if not foundMeta:
tempSeq = gffSeqFeature(FeatureLocation(0, len(rec.seq), 0), createMetaFeat, '', 0, 0, outList, None, None, None, '.', '.', "CPT_GFFParse")
printFeatLine(tempSeq, rec.id, source = tempSeq.source, score = tempSeq.score, phase = tempSeq.phase, outStream = outStream)
for feat in rec.features:
if suppressMeta > 0 and feat.type in metaTypes:
continue
printFeatLine(feat, rec.id, source = feat.source, score = feat.score, phase = feat.phase, outStream = outStream)
firstRec = False
if not suppressFasta:
wroteHeader = False
for rec in inRec:
rec.description = ""
if writeFasta[rec.id]:
if not wroteHeader:
outStream.write("##FASTA\n")
wroteHeader = True
SeqIO.write(rec, outStream, "fasta")