-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathfpfilter.pl
596 lines (488 loc) · 21.3 KB
/
fpfilter.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use IO::File;
use File::Temp qw( tempdir );
use File::Spec;
use Cwd qw( abs_path cwd);
## Define filtering parameters ##
my $min_read_pos = 0.10;
my $max_read_pos = 1 - $min_read_pos;
my $min_var_freq = 0.05;
my $min_var_count = 4;
my $min_strandedness = 0.01;
my $max_strandedness = 1 - $min_strandedness;
my $max_mm_qualsum_diff = 50;
my $max_mapqual_diff = 30;
my $max_readlen_diff = 25;
my $min_var_dist_3 = 0.20;
my $max_var_mm_qualsum = 100;
## Parse arguments ##
my $output_basename;
my $vcf_file;
my $output_file;
my $bam_file;
my $bam_index;
my $sample;
my $bam_readcount_path = 'bam-readcount';
my $samtools_path = 'samtools';
my $ref_fasta;
my $help;
my $opt_result;
my @params = @ARGV;
$opt_result = GetOptions(
'vcf-file=s' => \$vcf_file,
'bam-file=s' => \$bam_file,
'bam-index=s' => \$bam_index,
'sample=s' => \$sample,
'bam-readcount=s' => \$bam_readcount_path,
'samtools=s' => \$samtools_path,
'reference=s' => \$ref_fasta,
'output=s' => \$output_file,
'min-read-pos=f' => \$min_read_pos,
'min-var-freq=f' => \$min_var_freq,
'min-var-count=f' => \$min_var_count,
'min-strandedness=f' => \$min_strandedness,
'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
'max-mapqual-diff=f' => \$max_mapqual_diff,
'max-readlen-diff=f' => \$max_readlen_diff,
'min-var-dist-3=f' => \$min_var_dist_3,
'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
'help' => \$help,
);
unless($opt_result) {
die help_text();
}
if($help) {
print STDOUT help_text();
exit 0;
}
unless($vcf_file) {
warn "You must provide a file to be filtered\n";
die help_text();
}
unless(-s $vcf_file) {
die "Can not find VCF file: $vcf_file\n";
}
unless($ref_fasta) {
warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
die help_text();
}
unless(-s $ref_fasta) {
die "Can not find valid reference fasta: $ref_fasta\n";
}
unless($bam_file) {
die "You must provide a BAM file for generating readcounts\n";
die help_text();
}
unless(-s $bam_file) {
die "Can not find valid BAM file: $bam_file\n";
}
if($bam_index && !-s $bam_index) {
die "Can not find valid BAM index file: $bam_index\n";
}
unless($output_file) {
warn "You must provide an output file name: $output_file\n";
die help_text();
}
else {
$output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
}
unless($sample) {
warn "You must provide a sample name\n";
die help_text();
}
my %filters;
$filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
$filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
$filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
$filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
$filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
$filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
$filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
$filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
$filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
$filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
$filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
my $vcf_header;
my @vcf_lines;
my %rc_for_snp; # store info on snp positions and VCF line
my %rc_for_indel; #store info on indel positions and VCF line
my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
my $starting_dir = cwd();
my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
$vcf_header = parse_vcf_header($input);
add_filters_to_vcf_header($vcf_header, values %filters);
add_process_log_to_header($vcf_header, $vcf_file, @params);
my $header_line = $vcf_header->[-1];
chomp $header_line;
my @header_fields = split "\t", $header_line;
while(my $entry = $input->getline) {
push @vcf_lines, $entry;
chomp $entry;
my %fields;
@fields{@header_fields} = split("\t", $entry);
my $filter_sample = $fields{$sample};
unless($filter_sample) {
die "Unable to find field for $sample\n";
}
my @sample_fields = split /:/, $filter_sample;
unless(@sample_fields) {
die "Unable to parse field for $sample\n";
}
my $index = 0;
my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
#these are in order ACGT
my @alleles = ($fields{REF}, split /,/, $fields{ALT});
my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
my @used_alleles;
for my $allele_index (keys %gt_alleles) {
push @used_alleles, $alleles[$allele_index];
}
my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
$var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
$var = uc($var);
my $ref = uc($fields{REF});
my $chrom = $fields{'#CHROM'};
my $pos = $fields{POS};
if(length($ref) > 1 || length($var) > 1) {
#it's an indel or mnp
if(length($ref) == length($var)) {
die "MNPs unsupported\n";
}
elsif(length($ref) > length($var)) {
#it's a deletion
$pos += 1;
($ref, $var) = ($var, $ref);
$ref = substr($var, 1, 1);
$var = "-" . substr($var, 1);
}
else {
#it's an insertion
substr($var, 0, 1, "+");
}
$rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
}
else {
#it's a SNP
$rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
}
}
if(%rc_for_snp) {
filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
}
else {
print STDERR "No SNP sites identified\n";
}
if(%rc_for_indel) {
filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
}
else {
print STDERR "No Indel sites identified\n";
}
## Open the output files ##
my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
print $filtered_vcf @$vcf_header;
print $filtered_vcf @vcf_lines;
chdir $starting_dir or die "Unable to go back to starting dir\n";
exit(0);
################################################################################
=head3 read_counts_by_allele
Retrieve relevant read counts for a certain allele
=cut
sub read_counts_by_allele {
(my $line, my $allele) = @_;
my @lineContents = split(/\t/, $line);
my $numContents = @lineContents;
for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
my $this_allele = $lineContents[$colCounter];
my @alleleContents = split(/\:/, $this_allele);
if($alleleContents[0] eq $allele) {
my $numAlleleContents = @alleleContents;
return("") if($numAlleleContents < 8);
my $return_string = "";
my $return_sum = 0;
for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
$return_sum += $alleleContents[$printCounter];
$return_string .= "\t" if($return_string);
$return_string .= $alleleContents[$printCounter];
}
return($return_string);
}
}
return("");
}
sub help_text {
return <<HELP;
fpfilter - Filtering for Illumina Sequencing
SYNOPSIS
fpfilter [options] [file ...]
OPTIONS
--vcf-file the input VCF file. Must have a GT field.
--bam-file the BAM file of the sample you are filtering on. Typically the tumor.
--sample the sample name of the sample you want to filter on in the VCF file.
--reference a fasta containing the reference sequence the BAM file was aligned to.
--output the filename of the output VCF file
--min-read-pos minimum average relative distance from start/end of read
--min-var-freq minimum variant allele frequency
--min-var-count minimum number of variant-supporting reads
--min-strandedness minimum representation of variant allele on each strand
--max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
--max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads
--max-mapqual-diff maximum difference of mapping quality between variant and reference reads
--max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter)
--min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
--help this message
DESCRIPTION
This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
AUTHORS
Dan Koboldt Original code
Dave Larson Modifications for VCF and exportation.
HELP
}
### methods copied from elsewhere begin here...
sub parse_vcf_header {
my $input_fh = shift;
my @header;
my $header_end = 0;
while (!$header_end) {
my $line = $input_fh->getline;
if ($line =~ m/^##/) {
push @header, $line;
} elsif ($line =~ m/^#/) {
push @header, $line;
$header_end = 1;
} else {
die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
}
}
return \@header;
}
sub generate_region_list {
my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
print STDERR "Printing variants to temporary region_list file...\n";
for my $chr (keys %$hash) {
for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
print $region_fh "$chr\t$pos\t$pos\n";
}
}
}
sub _simplify_indel_allele {
my ($ref, $var) = @_;
#these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
#they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
#they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
#
#I think the algorithm should be trim end (no updating of coords required)
#trim beginning and return number of bases trimmed
my @ref_array = map { uc } split //, $ref;
my @var_array = map { uc } split //, $var;
while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
pop @ref_array;
pop @var_array;
}
my $right_shift = 0;
while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
shift @ref_array;
shift @var_array;
$right_shift++;
}
return (join("",@ref_array), join("",@var_array), $right_shift);
}
sub filter_site {
my ($ref_result, $var_result) = @_;
#this will return a list of filter names
my @filter_names;
if($ref_result && $var_result) {
## Parse out the bam-readcounts details for each allele. The fields should be: ##
#num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
my $ref_strandedness = my $var_strandedness = 0.50;
$ref_dist_3 = 0.5 if(!$ref_dist_3);
## Use conservative defaults if we can't get mismatch quality sums ##
$ref_mmqs = 50 if(!$ref_mmqs);
$var_mmqs = 0 if(!$var_mmqs);
my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
## Determine map qual diff ##
my $mapqual_diff = $ref_map_qual - $var_map_qual;
## Determine difference in average supporting read length ##
my $readlen_diff = $ref_avg_rl - $var_avg_rl;
## Determine ref strandedness ##
if(($ref_plus + $ref_minus) > 0) {
$ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
$ref_strandedness = sprintf("%.2f", $ref_strandedness);
}
## Determine var strandedness ##
if(($var_plus + $var_minus) > 0) {
$var_strandedness = $var_plus / ($var_plus + $var_minus);
$var_strandedness = sprintf("%.2f", $var_strandedness);
}
if($var_count && ($var_plus + $var_minus)) {
## We must obtain variant read counts to proceed ##
my $var_freq = $var_count / ($ref_count + $var_count);
## FAILURE 1: READ POSITION ##
if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
#$stats{'num_fail_pos'}++;
push @filter_names, $filters{'position'}->[0];
}
## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
#$stats{'num_fail_strand'}++;
push @filter_names, $filters{'strand_bias'}->[0];
}
## FAILURE : Variant allele count does not meet minimum ##
if($var_count < $min_var_count) {
#$stats{'num_fail_varcount'}++;
push @filter_names, $filters{'min_var_count'}->[0];
}
## FAILURE : Variant allele frequency does not meet minimum ##
if($var_freq < $min_var_freq) {
#$stats{'num_fail_varfreq'}++;
push @filter_names, $filters{'min_var_freq'}->[0];
}
## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
#$stats{'num_fail_mmqs'}++;
push @filter_names, $filters{'mmqs_diff'}->[0];
}
## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
if($mapqual_diff > $max_mapqual_diff) {
#$stats{'num_fail_mapqual'}++;
push @filter_names, $filters{'mq_diff'}->[0];
}
## FAILURE 5: Read length difference exceeds allowable maximum ##
if($readlen_diff > $max_readlen_diff) {
#$stats{'num_fail_readlen'}++;
push @filter_names, $filters{'read_length'}->[0];
}
## FAILURE 5: Read length difference exceeds allowable maximum ##
if($var_dist_3 < $min_var_dist_3) {
#$stats{'num_fail_dist3'}++;
push @filter_names, $filters{'dist3'}->[0];
}
if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
#$stats{'num_fail_var_mmqs'}++;
push @filter_names, $filters{'var_mmqs'}->[0];
}
## SUCCESS: Pass Filter ##
if(@filter_names == 0) {
#$stats{'num_pass_filter'}++;
## Print output, and append strandedness information ##
@filter_names = ('PASS');
}
}
else {
push @filter_names, $filters{'no_var_readcount'}->[0];
}
}
else {
#$stats{'num_no_readcounts'}++;
#print $fail_fh "$line\tno_readcounts\n";
push @filter_names, $filters{'incomplete_readcount'}->[0];
}
return @filter_names;
}
sub add_filters_to_vcf_header {
my ($parsed_header, @filter_refs) = @_;
my $column_header = pop @$parsed_header;
for my $filter_ref (@filter_refs) {
my ($filter_name, $filter_description) = @$filter_ref;
my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
push @$parsed_header, $filter_line;
}
push @$parsed_header, $column_header;
}
sub add_process_log_to_header {
my ($parsed_header, $input, @params) = @_;
my $column_header = pop @$parsed_header;
my $param_string = join(" ", @params);
push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
}
sub filter_sites_in_hash {
my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
#done parsing vcf
$optional_param ||= '';
my $list_name = File::Spec->catfile($working_dir, "regions.txt");
my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
generate_region_list($hash, $list_fh);
$list_fh->close();
## run bam-readcount
my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
while(my $rc_line = $rc_results->getline) {
chomp $rc_line;
my ($chrom, $position) = split(/\t/, $rc_line);
if($hash->{$chrom}{$position}) {
for my $ref (keys %{$hash->{$chrom}{$position}}) {
for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
my $ref_result = read_counts_by_allele($rc_line, $ref);
my $var_result = read_counts_by_allele($rc_line, $var);
my @filters = filter_site($ref_result, $var_result);
my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
my @fields = split "\t", $$vcf_line_ref;
if($fields[6] eq '.' || $fields[6] eq 'PASS') {
$fields[6] = join(";", @filters);
}
else {
$fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
}
$$vcf_line_ref = join("\t", @fields);
}
}
}
else {
die "Unknown site for rc\n";
}
}
unless($rc_results->close) {
die "Error running bam-readcount\n";
}
}
sub setup_workdir {
my ($reference, $bam_file, $bam_index) = @_;
$reference = abs_path($reference);
$bam_file = abs_path($bam_file);
$bam_index = abs_path($bam_index) if $bam_index;
my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
die "Unable to create working directory\n";
#symlink in necessary files to run
my $working_reference = File::Spec->catfile($dir, "reference.fa");
symlink $reference, $working_reference;
my $fa_index = $reference . ".fai";
unless(-e $fa_index) {
index_fasta($working_reference);
}
else {
symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
}
my $working_bam = File::Spec->catfile($dir, "tumor.bam");
my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
symlink $bam_file, $working_bam;
if($bam_index) {
symlink $bam_index, $working_bam_index;
}
elsif(-e $bam_file . ".bai") {
symlink $bam_file . ".bai", $working_bam_index;
}
else {
index_bam($working_bam);
}
return ($dir, $working_reference, $working_bam);
}
sub index_fasta {
my ($fasta) = @_;
print STDERR "Indexing fasta...\n";
my @args = ($samtools_path, "faidx", $fasta);
system(@args) == 0
or die "Unable to index $fasta: $?\n";
}
sub index_bam {
my ($bam) = @_;
print STDERR "Indexing BAM...\n";
my @args = ($samtools_path, "index", $bam);
system(@args) == 0
or die "Unable to index $bam: $?\n";
}