forked from AstraZeneca-NGS/Seq2C
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseq2c2fm.pl
executable file
·128 lines (105 loc) · 4.26 KB
/
seq2c2fm.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env perl
use Getopt::Std;
use strict;
our ($opt_n, $opt_d, $opt_a, $opt_e, $opt_A, $opt_D, $opt_N, $opt_p, $opt_g, $opt_P, $opt_H, $opt_G);
getopts('Hgn:d:a:e:D:A:N:p:G:') || USAGE();
$opt_H && USAGE();
my %purity;
if ( $opt_p ) {
setPurity($opt_p);
}
my $MINDEL = $opt_D ? $opt_D : -2.0;
my $MINAMP = $opt_A ? $opt_A : 1.45; # 6 copies for pure sample
my $MINEXONDEL = $opt_d ? $opt_d : -2.0;
my $MINEXONAMP = $opt_a ? $opt_a : 1.75;
my $MINEXON = $opt_e ? $opt_e : 1;
my $N = $opt_N ? $opt_N : 5; # If a breakpoint is called more than $N samples, then it's deemed a false positive and filtered
my @GAIN = $opt_G ? split(/:/, $opt_G) : ("MYC");
my %GAIN = map { ($_, 1); } @GAIN;
my %count;
my %samples;
while( <> ) {
my @a = split(/\t/);
my $gene = $a[1];
my $sample = $a[0];
if ( $opt_n ) {
if ( $a[0] =~ /$opt_n/ ) {
$sample = $1;
} else {
next;
}
}
$samples{ $sample } = 1;
my ($SMINDEL, $SMINAMP, $SMINEXONDEL, $SMINEXONAMP) = ($MINDEL, $MINAMP, $MINEXONDEL, $MINEXONAMP);
if ( $purity{ $sample } ) {
} elsif ( $opt_P ) {
}
my $lr = $a[6];
#if ( $a[10] && $a[10] < $MINEXON ) {
#$lr = $a[12];
#next unless( $lr > $MINAMP || $lr < $MINDEL );
#}
my $desc = ($a[8] && $a[8] eq "BP") ? "$a[10] of $a[11]" : "$a[11] of $a[11]";
if ( $opt_g || $GAIN{ $gene } ) { # Only do whole gene for copy gains
if ( $lr >= 0.75 && $lr < $SMINAMP ) {
print join("\t", $sample, "", "copy-number-alteration", $a[1], "NA", "-", "-", "$a[2]:$a[3]", "-", "-", sprintf("%.1f", 2**$lr*2), $desc, $lr, "gain", "-", "-", "-", "-", "-", "-", "-", "Gain"), "\n";
next;
}
}
if ( $a[10] && $a[8] eq "BP" && ($a[10] >= $MINEXON || $a[11] - $a[10] >= $MINEXON) ) {
$lr = $a[12];
next unless( $lr >= $SMINEXONAMP || $lr <= $SMINEXONDEL );
}
next unless( $lr >= $SMINAMP || $lr <= $SMINDEL );
my $type = $lr < $SMINDEL ? "Deletion" : "Amplification";
next if ( $a[15] && $a[15] >= $N );
print join("\t", $sample, "", "copy-number-alteration", $a[1], "NA", "-", "-", "$a[2]:$a[3]", "-", "-", sprintf("%.1f", 2**$lr*2), $desc, $lr, $type eq "Deletion" ? "loss" : "amplification", "-", "-", "-", "-", "-", "-", "-", $type), "\n";
}
# Set the tumor purity for each sample
sub setPurity {
my $file = shift;
open(PUR, $file);
while( <PUR> ) {
chomp;
my ($sample, $purity) = split(/\t/);
$purity /= 100 if ( $purity >= 5 );
$purity{ $sample } = $purity;
}
close(PUR);
}
sub USAGE {
print STDERR<<USAGE;
$0 [-g] [-e exons] [-n reg] [-N num] [-A amp] [-a amp] [-D del] [-d del] [-p purity_file] [-P purity] lr2gene_output
The program will parse seq2c output and make calls for each gene and output in the format compatible with OncoPrint.
It has the option to provide the purity so that log2ratio thresholds will be adjusted accordingly. By default, it calls
genes that are homozygously deleted or amplified with >= 6 copies.
Options:
-g Whether to output copy gains [4-5] copies. Default: no.
-p file
A file contains the tumor purity for all samples. Two columns, first is sample name, second is the purity in % or fraction [0-1].
-P double
The purity. Default: 1 or 100%, as is for cell lines. If set, all samples will assume to have the same purity.
-n regex
The regular expression to extract sample names. Default: none.
-N num
If an breakpoint has been called in >= num of samples, it's deemed false positive. Default: 5
-e exons
Minimum number of exons/amplicon. Default: 1
For whole gene:
-D log2ratio
The log2ratio to determine that a gene is homozygously deleted. Default: -2.0
-A log2ratio
The log2ratio to determine that a gene is amplified. Default: 1.45.
For < 3 exons:
-d log2ratio
The minimum log2ratio to determine that 1-2 exons are deleted. Should be lower than [-d] to reduce false positives. Default: -2.5
-a log2ratio
The minimum log2ratio to determine that 1-2 exons are amplified. Should be larger than [-a] to reduce false positives. Default: 1.75
For gains:
-G Genes
List of genes, seperated by ":", for which gain will be captured. Default: MYC
AUTHOR
Written by Zhongwu Lai
USAGE
exit(0);
}