-
Notifications
You must be signed in to change notification settings - Fork 0
/
hisat2_fc_pipeline.py
63 lines (50 loc) · 3.45 KB
/
hisat2_fc_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
import os, sys
from datetime import datetime
hisat2_exe = '~/software/HISAT2/hisat2-2.2.1/hisat2/hisat2'
fastq_dir = '/home/shared/projects/dec2_illumina/fastq'
bam_dir = '/home/shared/projects/dec2_illumina/bam'
genome_index = '/home/shared/db/Dmel/dmel_r6.39/dmel-all-chromosome-r6.39.fasta_hisat2_index'
gtf_file = '/home/shared/db/Dmel/dmel_r6.39/dmel-all-r6.39.gtf'
splice_site_file = '/home/shared/db/Dmel/dmel_r6.39/dmel-all-r6.39.gtf.ss'
threads = 32
start_time = datetime.now()
libs = ['WT','DecWT','P384R']
#libs = ['WT']
for lib in libs:
for rep in [1, 2, 3]:
#for rep in [1]:
# hisat2 alignment of each output file
# define input and output files
fq1_input = f"{fastq_dir}/{lib}_{rep}_1.cleaned.fq.gz"
fq2_input = f"{fastq_dir}/{lib}_{rep}_2.cleaned.fq.gz"
bam_file = f"{bam_dir}/{lib}_{rep}.bam"
print(f"hisat2 -p {threads} --dta --rna-strandness RF --known-splicesite-infile {splice_site_file} -x {genome_index} -1 {fq1_input} -2 {fq2_input} | samtools view -@ {threads} -Shu - | samtools sort -n -@ {threads} -o {bam_file} -")
#os.system(f"hisat2 -p {threads} --dta --rna-strandness RF --known-splicesite-infile {splice_site_file} -x {genome_index} -1 {fq1_input} -2 {fq2_input} | samtools view -@ {threads} -Shu - | samtools sort -n -@ {threads} -o {bam_file} -")
# sort and index the bam file - not sure this is relevant for only differential expression analysis
print(f"samtools sort -@ {threads} -o {bam_file}.sorted.bam {bam_file}")
os.system(f"samtools sort -@ {threads} -o {bam_file}.sorted.bam {bam_file}")
# Index the sorted bam
print(f"samtools index -@ {threads} {bam_file}.sorted.bam")
os.system(f"samtools index -@ {threads} {bam_file}.sorted.bam")
# original bam and sorted bam give same count results
bam_file = f"{bam_dir}/{lib}_{rep}.bam.sorted.bam"
# run featureCounts
# featureCounts -R CORE -T 32 -t exon -g gene_id -L --primary -a gtf_in -o featureCounts.counts.txt bam_in
# -R CORE gives detailed info on the read mapping
# -L long read counting mode (only when sequencing nanopore)
# --primary - If specified, only primary alignments will be counted.
# all alignments - primary and all give same count results
#print(f"/home/shared/software/subread/bin/featureCounts -p -R CORE -T 32 -a {gtf_file} -o {bam_file}.featureCounts.all.counts {bam_file}")
#os.system(f"/home/shared/software/subread/bin/featureCounts -p -R CORE -T 32 -a {gtf_file} -o {bam_file}.featureCounts.all.counts {bam_file}")
#os.system(f"mv {bam_file}.featureCounts {bam_file}.featureCounts.all")
# only primary alignments
print(f"/home/shared/software/subread/bin/featureCounts -p -R CORE -T 32 --primary -a {gtf_file} -o {bam_file}.featureCounts.primary.counts {bam_file}")
os.system(f"/home/shared/software/subread/bin/featureCounts -p -R CORE -T 32 --primary -a {gtf_file} -o {bam_file}.featureCounts.primary.counts {bam_file}")
os.system(f"mv {bam_file}.featureCounts {bam_file}.featureCounts.primary")
# compress the main output file
print(f"pigz {bam_file}.featureCounts.primary")
os.system(f"pigz {bam_file}.featureCounts.primary")
# calculate and print the total run time
total_time = datetime.now() - start_time
print(f"total script excecution time = {total_time}")