-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbigwig_outlier_bed.py
148 lines (129 loc) · 5.79 KB
/
bigwig_outlier_bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Ross Lazarus June 2024 for VGP
Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions.
Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point.
0.99 and 0.01 work well in testing with a minimum span of 10 bp.
Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately
Combining multiple references works but is silly because only display will rely on one reference so others will not be shown...
Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
takes about 95 seconds for a 17MB test wiggle
JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option.
Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores
Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi).
"""
import argparse
import numpy as np
import pybigtools
import sys
from pathlib import Path
class findOut():
def __init__(self, args):
self.bwnames=args.bigwig
self.bwlabels=args.bigwiglabels
self.bedwin=args.minwin
self.qlo=args.qlo
self.qhi=args.qhi
self.bedouthilo=args.bedouthilo
self.bedouthi=args.bedouthi
self.bedoutlo=args.bedoutlo
self.tableout = args.tableout
self.bedwin = args.minwin
self.qhi = args.qhi
self.qlo = args.qlo
self.makeBed()
def processVals(self, bw, isTop):
# http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
if isTop:
bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s
else:
bwex = np.r_[False, bw <= self.bwbot, False]
bwexd = np.diff(bwex)
bwexdnz = bwexd.nonzero()[0]
bwregions = np.reshape(bwexdnz, (-1,2))
return bwregions
def writeBed(self, bed, bedfname):
"""
potentially multiple
"""
bed.sort()
beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed]
with open(bedfname, "w") as bedf:
bedf.write('\n'.join(beds))
bedf.write('\n')
print('Wrote %d bed regions to %s' % (len(bed), bedfname))
def makeBed(self):
bedhi = []
bedlo = []
bwlabels = self.bwlabels
bwnames = self.bwnames
print('bwnames=', bwnames, "bwlabs=", bwlabels)
for i, bwname in enumerate(bwnames):
bwlabel = bwlabels[i].replace(" ",'')
p = Path('in.bw')
p.symlink_to( bwname ) # required by pybigtools (!)
bwf = pybigtools.open('in.bw')
chrlist = bwf.chroms()
chrs = list(chrlist.keys())
chrs.sort()
restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
for chr in chrs:
bw = bwf.values(chr)
bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered
if self.qhi is not None:
self.bwtop = np.quantile(bw, self.qhi)
bwhi = self.processVals(bw, isTop=True)
for i, seg in enumerate(bwhi):
if seg[1] - seg[0] >= self.bedwin:
bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1))
if self.qlo is not None:
self.bwbot = np.quantile(bw, self.qlo)
bwlo = self.processVals(bw, isTop=False)
for i, seg in enumerate(bwlo):
if seg[1] - seg[0] >= self.bedwin:
bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1))
bwmean = np.mean(bw)
bwstd = np.std(bw)
bwmax = np.max(bw)
nrow = np.size(bw)
bwmin = np.min(bw)
restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot))
print('\n'.join(restab), '\n')
if self.tableout:
with open(self.tableout) as t:
t.write('\n'.join(restab))
t.write('\n')
if self.bedoutlo:
if self.qlo:
self.writeBed(bedlo, self.bedoutlo)
if self.bedouthi:
if self.qhi:
self.writeBed(bedhi, self.bedouthi)
if self.bedouthilo:
allbed = bedlo + bedhi
self.writeBed(allbed, self.bedouthilo)
return restab
if __name__ == "__main__":
parser = argparse.ArgumentParser()
a = parser.add_argument
a('-m', '--minwin',default=10, type=int)
a('-l', '--qlo',default=None, type=float)
a('-i', '--qhi',default=None, type=float)
a('-w', '--bigwig', nargs='+')
a('-n', '--bigwiglabels', nargs='+')
a('-o', '--bedouthilo', default=None, help="optional high and low combined bed")
a('-u', '--bedouthi', default=None, help="optional high only bed")
a('-b', '--bedoutlo', default=None, help="optional low only bed")
a('-t', '--tableout', default=None)
args = parser.parse_args()
print('args=', args)
if not (args.bedouthilo or args.bedouthi or args.bedoutlo):
sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined")
sys.exit(2)
if not (args.qlo or args.qhi):
sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo")
sys.exit(2)
restab = findOut(args)
if args.tableout:
with open(args.tableout, 'w') as tout:
tout.write('\n'.join(restab))
tout.write('\n')