-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_constitutive.py
executable file
·114 lines (106 loc) · 3.05 KB
/
read_constitutive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
import sys
class GenomicLocation:
def __init__(self, chr, start, end, strand):
#instance variables
self.chr = chr
self.start = start # start and end are 1-indexed (as per UCSC)
self.end = end
self.strand = strand
def __eq__( self, other):
return( (self.chr,self.start,self.end,self.strand) == (other.chr,other.start,other.end,other.strand) )
def __ne__( self, other):
return not self == other
def __hash__( self ):
s = str(self.chr)+str(self.start)+str(self.end)+str(self.strand)
hash = int(''.join(str(ord(c)) for c in s))
return hash
def __str__( self ):
return "%s %s:%s%s" % (self.chr, self.start, self.end, self.strand)
class Gene:
def __init__(self, id, start, end, transcripts):
self.id = id
self.start = start
self.end = end
self.transcripts = transcripts
def __str__( self ):
return "Gene %s %s %s %s" % (self.id, self.start, self.end, self.transcripts)
def get_chromosome( self ):
chrs = set(t.get_chromosome() for t in self.transcripts)
if len( chrs ) == 1:
return chrs.pop()
else:
print "Found mixed transcript chromosomes!"
return 0
def get_strand( self ):
strands = set( t.get_strand() for t in self.transcripts)
if len( strands ) == 1:
return strands.pop()
else:
print "Found mixed transcript strands!"
return 0
class Transcript:
def __init__(self, id, start, end, tss, principal_isoform, exons):
self.id = id
self.start = start
self.end = end
self.tss = tss
self.prin_iso = principal_isoform
self.exons = exons
def __str__( self ):
return "Transcript %s %s %s" % (self.id, self.start, self.end, self.tss, self.prin_iso, self.exons)
def get_chromosome( self ):
chrs = set(exon.location.chr for exon in self.exons)
if len( chrs ) == 1:
return chrs.pop()
else:
print "Found mixed exon chromosomes!"
return 0
def get_strand( self ):
strands = set( e.location.strand for e in self.exons )
if len( strands ) == 1:
return strands.pop()
else:
print "Found mixed exon strands!"
return 0
class Exon:
def __init__(self, location, constitutive):
self.location = location
self.constitutive = constitutive
def __str__( self ):
return "Exon %s %s" % (self.location, self.constitutive)
fname = sys.argv[1]
f = open( fname, mode="rU")
prev_geneid = ""
prev_txid = ""
for line in f:
if line[0] == "#":
continue
geneid, txid, genename, chr, tss, strand, start, end, constitutive, rank = line.rstrip().split(",")
chr = "chr"+chr
tss = int(tss)
start = int(start)
end = int(end)
if strand == "1":
strand = "+"
elif strand == "-1":
strand = "-"
if constitutive == 1:
constitutive = True
elif constitutive == 0:
constitutive = False
rank = int(rank)
e = Exon( GenomicLocation( chr, start, end, strand ), constitutive )
fake_start = 0
fake_end = 0
if prev_txid == txid:
t.exons.append( e )
for e in t.exons:
else:
t = Transcript( txid, fake_start, fake_end, tss, True, [e] )
if prev_geneid == geneid:
g.transcripts.append( t )
else:
g = Gene( geneid+"_"+genename, fake_start, fake_end, [t] )
prev_geneid = geneid
prev_txid = txid