forked from mtokuyama/ERVmap
-
Notifications
You must be signed in to change notification settings - Fork 0
/
erv_genome.pl
253 lines (183 loc) · 6.41 KB
/
erv_genome.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#!/usr/bin/env perl
#
# erv: map to genome
#
use warnings;
use strict;
use File::Basename;
use Cwd;
use File::Type;
use Getopt::Long;
use Pod::Usage;
my $help = 0;
my $man = 0;
my ($btrim, $aligner, $bwa, $samtools, $filter, $bedtools,);
my ($genome, $genome_Bowtie2, $bed, $genomefile, $gtf, $transcriptome, $adaptor, );
my $cell = "sample"; # sample name
my $workdir = "."; # working dir "$cell"
my $outdir = "tmp"; # dir under $cell
my $stage = 0;
my $stage2 = 0;
my $fastq; # sequence file
my $length = 40;
my $score_offset = 33;
my $score = 25;
my $cat = '';
my $test = 0;
print map("\t$_", @ARGV), "\n";
GetOptions (
'help|h|?' => \$help,
'man' => \$man,
"test|t" => \$test,
"cell=s" => \$cell,
"workdir=s" => \$workdir,
"outdir=s" => \$outdir,
"stage=i" => \$stage,
"stage2=i" => \$stage2,
"length=i" => \$length,
"score_offset=i" => \$score_offset,
"btrim=s" => \$btrim,
"adaptor=s" => \$adaptor,
"bwa=s" => \$bwa,
"samtools=s" => \$samtools,
"filter=s" => \$filter,
"tophat=s" => \$aligner,
"bedtools=s" => \$bedtools,
"bed=s" => \$bed,
"genomefile=s" => \$genomefile,
"genome=s" => \$genome,
"genome_Bowtie2=s" => \$genome_Bowtie2,
"gtf=s" => \$gtf,
"transcriptome=s" => \$transcriptome,
"cat" => \$cat,
"score=i" => \$score,
"fastq=s" => \$fastq, ### absolute path
) or pod2usage( "Try '$0 --help' for more information." );
pod2usage(1) if $help;
pod2usage(2) if $man;
pod2usage(1) unless ($btrim && $aligner && $bwa && $samtools && $filter && $bedtools);
pod2usage(1) unless (-x $btrim && -x $aligner && -x $bwa && -x $samtools && -x $filter && -x $bedtools);
pod2usage(1) unless (-e $genome && -e $genome_Bowtie2 && -e $bed && -e $genomefile && -e $gtf && -e $transcriptome && -e $adaptor);
pod2usage(1) unless ($fastq && $stage > 0 && $stage2 > 0);
### check if fastq file is zipped
my $zipped = 0;
my $ftype = File::Type->new->checktype_filename($fastq);
print STDERR "$fastq: $ftype\n";
if ($ftype =~ /zip/) {
$zipped = 1;
}
### btrim
my $btrimout = "btrim_g_se.out";
my $btrim_cmd;
if ($score_offset == 33) {
if ($zipped) {
$btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'";
} else {
$btrim_cmd = "/bin/bash -c '$btrim -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'";
}
} else {
if ($zipped) {
$btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(gunzip -c $fastq) -C > btrim.log 2> btrim.log'";
} else {
$btrim_cmd = "/bin/bash -c '$btrim -i -l $length -w 10 -a 25 -p $adaptor -3 -P -o $btrimout -t <(cat $fastq) -C > btrim.log 2> btrim.log'";
}
}
### bwa
my $bwabam = "bwa.bam";
#my $bwa_cmd = "/bin/bash -c '$bwa mem -t 8 -p $genome ${btrimout} | $samtools view -Sh -F4 - | tee >($samtools view -Shb - | $samtools sort - -o bwa_unfiltered.bam) | $filter | $samtools view -bSh - > $bwabam'";
my $bwa_cmd = "/bin/bash -c '$bwa mem -t 8 -p $genome ${btrimout} | $samtools view -Sh -F4 - | $filter | $samtools view -bSh - > $bwabam'";
### sort
my $bwabam_sorted = "bwa_sorted.bam";
my $sort_cmd = "$samtools sort $bwabam -o $bwabam_sorted";
# index
my $bamindex_cmd = "$samtools index ${bwabam_sorted}";
### count
my $cntfile = "herv_coverage_GRCh38_genome.txt";
my $count_cmd = "$bedtools coverage -b ${bwabam_sorted} -a $bed -counts -sorted -g $genomefile > $cntfile";
### tophat2
my $files = "btrim_g_se.out";
my $aln_log = "align.log";
my $aln_err = "align.err";
my $thread = "--num-threads 2";
my $out = "-o GRCh38";
my $ann = "-G $gtf";
my $aln_cmd = "$aligner --b2-very-fast --no-novel-junc --transcriptome-index=$transcriptome --no-coverage-search $thread $ann $out $genome_Bowtie2 $files > $aln_log 2> $aln_err";
# index
my $bam = "accepted_hits.bam";
my $bamindex_cmd2 = "$samtools index $bam";
# count
my $count = "htseq.cnt";
my $count_log = "htseq.log";
my $count_cmd2 = "$samtools view $bam | python2.7 -m HTSeq.scripts.count -f sam -s no - $gtf > $count 2> $count_log";
chdir($workdir);
my $wdir = "$cell";
my $cmd = "mkdir -p $wdir";
&cmd($cmd);
chdir($wdir);
# btrim
if ($stage <= 1 && $stage2 >= 1) {
&cmd($btrim_cmd);
}
# bwa
if ($stage <= 2 && $stage2 >= 2) {
&cmd($bwa_cmd);
# unlink($btrimout) if (-s $bwabam > 10);
}
# sort
if ($stage <= 3 && $stage2 >= 3) {
&cmd($sort_cmd);
&cmd($bamindex_cmd);
unlink($bwabam);
}
# count
if ($stage <= 4 && $stage2 >= 4) {
&cmd($count_cmd);
}
# tophat
if ($stage <= 5 && $stage2 >= 5) {
&cmd($aln_cmd);
chdir("GRCh38");
&cmd($bamindex_cmd2);
}
# count2
if ($stage <= 6 && $stage2 >= 6) {
&cmd($count_cmd2);
}
sub cmd {
my ($c, ) = @_;
print "In ", cwd(), ":\n";
print "$c\n";
system($c) unless ($test);
}
__END__
=head1 NAME
erv_se_genome_v2.pl - Script to run ervmap
=head1 SYNOPSIS
erv_se_genome_v2.pl [options]
Options:
-h, -? --help brief help message
-t, --test print out commands without run
--btrim path to btrim (http://graphics.med.yale.edu/trim/)
--tophat path to tophat
--bwa path to bwa
--samtools path to samtools
--filter path to filter (parse_bam.pl)
--bedtools path to bedtools
--genome path to bwa human genome index
--genome_Bowtie2 path to Bowtie2 human genome index
--bed path to bed file of ERVs
--genomefile path to genome size file
--gtf path to gtf file of human gene annotation
--transcriptome path to known transcriptome, used by tophat2
--adaptor path to adaptor files, used by btrim (http://graphics.med.yale.edu/trim/)
--fastq fastq file
--stage start stage (see below)
--stage2 end stage (see below)
Stages:
1 trim adaptors and low quality regions
2 map reads using bwa
3 sort bam file
4 count reads mapped to ERVs
5 map reads using tophat2
6 counts reads mapped to genes
=cut