forked from AstraZeneca-NGS/Seq2C
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseq2cov.pl
executable file
·195 lines (182 loc) · 9.27 KB
/
seq2cov.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env perl
# Parse a list of refseq and check CDS coverage
use warnings;
use Getopt::Std;
use strict;
our ($opt_h, $opt_b, $opt_s, $opt_c, $opt_S, $opt_E, $opt_n, $opt_e, $opt_g, $opt_x, $opt_z, $opt_N, $opt_a);
USAGE() unless( getopts( 'hzb:s:e:S:E:n:c:g:x:N:a:' ) );
USAGE() if ( $opt_h );
my $BAM = $opt_b; # the bam file
my $sample = $1 if ( $BAM =~ /([^\/.]+)[\/]*.bam/ );
if ( $opt_n ) {
$sample = $1 if ( $BAM =~ /$opt_n/ ); # set sample name by regex from BAM file name if -n option set
}
$sample = $opt_N if ( $opt_N ); #sample name to set
my $c_col = $opt_c ? $opt_c - 1 : 2; #chromosome
my $S_col = $opt_S ? $opt_S - 1 : 6; #region start (gene)
my $E_col = $opt_E ? $opt_E - 1 : 7; #region end (gene)
my $s_col = $opt_s ? $opt_s - 1 : 9; #segment start (exon)
my $e_col = $opt_e ? $opt_e - 1 : 10; #segment end (exon)
my $g_col = $opt_g ? $opt_g - 1 : 12; #gene
my $SPLICE = $opt_x ? $opt_x : 0; #number of bases to extend on each segment (exon)
my %regions; #hash of gene on (chr, CDS)
while( <> ) {
next if ( /^track/i );
next if ( /^browser/i );
next if ( /^#/ );
next if ( /^Sample/ );
chomp;
my @A = split(/\t/, $_, -1); #split bed file by columns
# Amplicon/exon case: if option -a set, or if there are 8 columns in BED and start of segment more than start of
# region and end of segment less then end of region.
if ( $opt_a || (@A == 8 && $A[6] =~ /^\d+$/ && $A[7] =~ /^\d+$/ && $A[6] > $A[1] && $A[7] < $A[2]) ) {
$opt_a = "10:0.95" unless($opt_a); #set number of base pairs to 10 and overlap fraction to 0.95 if were not set
my ($chr, $start, $end, $gene, $score, $strand, $istart, $iend) = @A; #fill values from BED
$start++ && $istart++; # if ( $opt_z ); #increment starts of region and segment
push(@{ $regions{ $gene }->{ CDS } }, [$start, $end, $istart, $iend]); #push data to gene about coding region to the CDS
$regions{ $gene }->{ chr } = $chr; #push data to gene about chromosome
} else {
# Non-amplicon case: of there are 4 columns in BED file and start of region less then end of region.
# Segment start and end will be the same as start and end of region.
($c_col, $S_col, $E_col, $g_col, $s_col, $e_col) = (0, 1, 2, 3, 1, 2) if ( (! $opt_c) && @A == 4 && $A[1] =~ /^\d+$/ && $A[2] =~ /^\d+$/ && $A[1] <= $A[2] );
my ($chr, $cdss, $cdse, $gene) = @A[$c_col,$S_col,$E_col,$g_col]; #fill values from BED
my @starts = split(/,/, $A[$s_col]); #if there are few starts or ends in one line, split them and add data about
# coding region to array of CDS
my @ends = split(/,/, $A[$e_col]);
my @CDS = ();
for(my $i = 0; $i < @starts; $i++) {
my ($s, $e) = ($starts[$i], $ends[$i]);
next if ( $cdss > $e ); # not a coding exon
# TODO: check this, how we will have comma if region and segment are the same.
last if ( $cdse < $s ); # No more coding exon
$s = $cdss if ( $s < $cdss ); # adjust start and end of coding region
$e = $cdse if ( $e > $cdse );
$s -= $SPLICE unless ( $s == $cdss ); #extend segment with splice length if splitted start/end and segment start/end differ
$e += $SPLICE unless ( $e == $cdse );
$s += 1 if ( $opt_z ); #increment start of segment for zero based numering
push(@CDS, [$s, $e]); #push data about coding region to the CDS array
}
push( @{ $regions{ $gene }->{ CDS } }, @CDS); #push data to gene about array of coding regions to the CDS
$regions{ $gene }->{ chr } = $chr; #push data to gene about chromosome
}
}
print join("\t", "Sample", "Gene", "Chr", "Start", "End", "Tag", "Length", "MeanDepth"), "\n";
my $bamhdr = `samtools view -H $BAM`; #bam header
my $genome = $bamhdr =~ /SN:chr\d+\t/ ? "hg" : "grch"; #determine reference by chromosome name
while( my ($gene, $r) = each %regions ) {
my $exoncov; #coverage of exon
my $CDS = $r->{ CDS };
my $chr = $r->{ chr };
my $tchr = $chr;
if ( $genome eq "hg" ) { #fix the cromosome name
$tchr = "chr$tchr" unless( $tchr =~ /^chr/ );
} else {
$tchr =~ s/^chr// if( $tchr =~ /^chr/ );
}
my $total = 0;
my $gene_length = 0;
my ($gene_start, $gene_end) = (500000000, 0);
for(my $i = 0; $i < @{ $CDS }; $i++) {
my ($START, $END, $ISTART, $IEND) = @{$CDS->[$i]}; #get info about current coding region.
# For non amplicon/exon case ISTART and IEND will be unitialized
$gene_length += $END-$START+1;
$gene_start = $START if ( $START < $gene_start ); # adjust start and end of gene
$gene_end = $END if ( $END > $gene_end );
open(SAM, "samtools view $BAM $tchr:$START-$END |");
$exoncov = 0;
#parse SAM file
while( <SAM> ) {
my @a = split(/\t/);
next if ( $a[1] & 0x800 && /\tSA:Z:/ ); # ignore supplementary alignments
my $dir = $a[1] & 0x10 ? "-" : "+"; #determine direction (reverse -, forward +)
my $start = $a[3]; #set start to alignment start
my @segs = $a[5] =~ /(\d+)[MD]/g; #get matched and deletion lengths from cigar
my $end = $start-1; $end += $_ foreach(@segs); #calculate end position
# Amplicon case
if ( $opt_a ) {
my ($dis, $ovlp) = split(/:/, $opt_a); #split for distance and overlap fraction
($dis, $ovlp) = (10, 0.95) unless($dis && $ovlp);
my ($segstart, $segend) = ($start, $end); #set segment start and end to start and end of crrent read (matched+deleted part)
if ( $a[5] =~ /^(\d+)S/ && $dir eq "+" ) { #forward softclips
my $ts1 = $segstart-$1 > $START ? $segstart-$1 : $START;
my $te1 = $segend < $END ? $segend : $END;
next unless( abs($ts1-$te1)/($segend-$segstart+$1) > $ovlp); #skip if current overlap is more then overlap fraction
} elsif ($a[5] =~ /(\d+)S$/ && $dir eq "-" ) { #reverse softclips
my $ts1 = $segstart > $START ? $segstart : $START;
my $te1 = $segend+$1 < $END ? $segend+$1 : $END;
next unless( abs($te1-$ts1)/($segend+$1-$segstart) > $ovlp); #skip if current overlap is more then overlap fraction
} else {
if ($a[6] eq "=" && $a[8]) { #if read and mate have the same chromosome name and inferred insert size exists
($segstart, $segend) = $a[8] > 0 ? ($segstart, $segstart+$a[8]-1) : ($a[7], $a[7]-$a[8]-1);
}
my $ts1 = $segstart > $START ? $segstart : $START;
my $te1 = $segend < $END ? $segend : $END;
next unless( (abs($segstart - $START) <= $dis && abs($segend - $END) <= $dis ) && abs(($ts1-$te1)/($segend-$segstart)) > $ovlp);
}
$exoncov++;
$total++;
} else { #Non-amplicon case: alignment length calculates as a smallest length from (region, read length)
my $alignlen = ($END > $end ? $end : $END) - ($START > $start ? $START : $start)+1;
$exoncov += $alignlen;
$total += $alignlen;
}
}
close( SAM );
# Print result for each coding region of gene
if ( $opt_a ) {
print join("\t", $sample, $gene, $chr, $START, $END, "Amplicon", $END-$START+1, $exoncov), "\n";
} else {
print join("\t", $sample, $gene, $chr, $START, $END, "Amplicon", $END-$START+1, sprintf("%.2f", $exoncov/($END-$START+1))), "\n";
}
}
# Print result for each gene
if ( $opt_a ) {
print join("\t", $sample, $gene, $chr, $gene_start, $gene_end, "Whole-Gene", $gene_length, $total), "\n";
} else {
print join("\t", $sample, $gene, $chr, $gene_start, $gene_end, "Whole-Gene", $gene_length, sprintf("%.2f", $total/$gene_length)), "\n";
}
}
sub USAGE {
print STDERR <<USAGE;
$0 [-hz] [-n name_reg] [-b bam] [-c chr] [-S start] [-E end] [-s seg_starts] [-e seg_ends] [-x #_nu] [-g gene] [-o ori] [-d depth] region_info
The program will calculate candidate variance for a given region(s) in an indexed BAM file. The default
input is IGV's one or more entries in refGene.txt, but can be regions
-h Print this help
-a int:float
Indicate that it's PCR amplicon based calling. Each line in region_info represents a PCR amplicon (including primers).
Two numbers in option are parameter to decide whether a particular read or pairs belongs to the amplicon. First is the
number of base pairs. The second is the fraction of overlapped portion to the length of read or pairs. If the edges of
reads (paired for Illumina) are within defined base pairs to the edges of amplicons and overlapped portion greater then
the fraction, it's considered belonging to the amplicon. Suggested values are: 10:0.95. When given a 6 column amplicon
format BED files, it'll be set to 10:0.95 automatically, but can still be overwritten by -a option.
-n name_reg
The regular expression to extract sample name from bam filename
-N name
Mutual exclusive to -n. Set the sample name to name
-b bam
The indexed BAM file
-c chr
The column for chr
-S start
The column for region start, e.g. gene start
-E end
The column for region end, e.g. gene end
-s seg_starts
The column for segment starts in the region, e.g. exon starts
-e seg_ends
The column for segment ends in the region, e.g. exon ends
-g gene
The column for gene name
-x num
The number of nucleotide to extend for each segment, default: 0
-z
Indicate whether it's zero based numbering, default is 1-based
AUTHOR
Written by Zhongwu Lai, AstraZeneca, Boston, USA
REPORTING BUGS
Report bugs to zhongwu\@yahoo.com
COPYRIGHT
This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.
USAGE
exit(0);
}