forked from CGATOxford/Pipeline_iCLIP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlength_stats.py
107 lines (75 loc) · 2.59 KB
/
length_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
'''
length_stats.py - Count frequency of UMIs
====================================================
:Author: Ian Sudbery
:Release: $Id$
:Date: |today|
:Tags: Python
Purpose
-------
Take a bam file and calculate alignment length histogram.
Options
-------
-l --length
The read length. Any aligned length longer than this is
truncated to this length.
Usage
-----
python length_stats.py -I [BAMFILE]
Command line options
--------------------
'''
import os
import sys
import re
import optparse
import collections
import CGAT.Experiment as E
import pysam
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("-l","--length", dest = "length", type="int",
help="max read length", default = 100)
parser.add_option("-p", "--paired", dest="paired", type = "int",
help="Data is paired. Use fragment length where aligned"
"length is greater than or equal to this length", default=None)
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
out_hist = collections.defaultdict(int)
if options.stdin == sys.stdin:
in_bam = pysam.Samfile("-", "rb")
else:
fn = options.stdin.name
options.stdin.close()
in_bam = pysam.Samfile(fn, "rb")
nreads=0
nlonger=0
for read in in_bam.fetch():
if read.is_read2 or read.is_unmapped or read.mate_is_unmapped:
continue
nreads+=1
length = read.inferred_length - sum([l for o,l in read.cigar if o=="S"])
if read.inferred_length > length:
nlonger+=1
if not (options.paired is None) and length >= options.paired:
splice_1 = max(0, read.alen-length)
out_hist[abs(read.tlen) - splice_1] += 1
else:
out_hist[length] += 1
header = ["Length", "Count"]
keys = sorted(out_hist.keys())
outlines = ["\t".join(map(str, (key,out_hist[key]))) for key in keys]
outlines = "\t".join(header) + "\n" + "\n".join(outlines) + "\n"
options.stdout.write(outlines)
# write footer and output benchmark information.
E.info("%i reads processed. %i reads have inferred reads length longer than aligned length" % (nreads, nlonger))
E.Stop()
if __name__ == "__main__":
sys.exit(main(sys.argv))