-
Notifications
You must be signed in to change notification settings - Fork 0
/
circ_rna_snakemake.snk
138 lines (110 loc) · 4.66 KB
/
circ_rna_snakemake.snk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
SAMPLES = []
for line in shell("ls", iterable=True):
SAMPLES.append(line)
print("Inigo's message:Hunting circRNA")
print("Inigo's message:Checking if STAR tmp file or logs exist on the working directory, they will crash the pipeline")
if os.path.exists("_STARtmp") == True:
print("Inigo's message: _STARtmp directory was present in the working directory. I have delete it to avoid a crash")
os.system("rm -rf _STARtmp")
else:
print("Inigo's message: _STARtmp not present")
if os.path.exists("Log.out") == True:
print("Inigo's message: Log.out file was present in the working directory. I have delete it to avoid a crash")
os.system("rm -rf Log.out")
else:
print("Inigo's message: Log.out was not present")
rule all:
input:
expand('{sample}/fastqc/{sample}_1_fastqc.html', sample=SAMPLES),
expand('{sample}/fastqc/{sample}_2_fastqc.html', sample=SAMPLES),
expand('{sample}/{sample}_known_circrna.bed',sample=SAMPLES)
rule create_index:
input:
ref_genome='.reference/genome.fa',
ref_genome_dir='.reference'
output:
genome_parameters='.reference/genomeParameters.txt',
log_out='.logs/Log.out'
threads: 16
log:
".logs/star_index.log"
shell:
#MODIFY THIS LINE
'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; STAR --runThreadN {threads} --runMode genomeGenerate --outStd Log {log} --genomeDir {input.ref_genome_dir} --genomeFastaFiles {input.ref_genome} > {log} 2>&1 ; mv Log.out .logs/'
rule fastqc:
input:
r1='{sample}/{sample}_1.fastq.gz',
r2='{sample}/{sample}_2.fastq.gz'
output:
html1='{sample}/{sample}_1_fastqc.html',
zip1='{sample}/{sample}_1_fastqc.zip',
html2='{sample}/{sample}_2_fastqc.html',
zip2='{sample}/{sample}_2_fastqc.zip'
log:
"{sample}/.logs/{sample}_fastqc.log"
shell:
#MODIFY THIS LINE
'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; fastqc {input.r1} {input.r2} 2> {log}'
rule fastqc_to_dir:
input:
html1='{sample}/{sample}_1_fastqc.html',
zip1='{sample}/{sample}_1_fastqc.zip',
html2='{sample}/{sample}_2_fastqc.html',
zip2='{sample}/{sample}_2_fastqc.zip',
params:
outdir='{sample}/fastqc/'
output:
html1='{sample}/fastqc/{sample}_1_fastqc.html',
zip1='{sample}/fastqc/{sample}_1_fastqc.zip',
html2='{sample}/fastqc/{sample}_2_fastqc.html',
zip2='{sample}/fastqc/{sample}_2_fastqc.zip'
shell:
'mv {input.html1} {params.outdir} ; mv {input.html2} {params.outdir} ; mv {input.zip1} {params.outdir} ; mv {input.zip2} {params.outdir}'
rule align:
input:
r1='{sample}/{sample}_1.fastq.gz',
r2='{sample}/{sample}_2.fastq.gz',
genome_parameters='.reference/genomeParameters.txt'
output:
bam='{sample}/{sample}_Aligned.out.bam',
junctions='{sample}/{sample}_Chimeric.out.junction'
params:
substr='{sample}/{sample}_',
bam_output='BAM Unsorted',
#MODIFY THIS LINE
reference='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/'
log:
"{sample}/.logs/star_alignment.log"
shell:
#MODIFY THIS LINE
'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; STAR --chimSegmentMin 10 --runThreadN {threads} --outSAMtype {params.bam_output} --genomeDir {params.reference} --readFilesCommand zcat --outFileNamePrefix {params.substr} --outStd Log {log} --readFilesIn {input.r1} {input.r2}'
rule circexplorer_parse:
input:
junctions='{sample}/{sample}_Chimeric.out.junction'
output:
parsed_junctions='{sample}/{sample}_back_spliced_junction.bed'
params:
aligner='STAR'
log:
"{sample}/.logs/circexplorer2_parse.log"
shell:
'CIRCexplorer2 parse -t {params.aligner} -b {output.parsed_junctions} {input.junctions} > {log}'
rule circexplorer_annotate:
input:
#MODIFY THE FOLLOWING TWO LINES
ref_gene='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/refGene.txt',
hg38='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/genome.fa',
parsed_junctions='{sample}/{sample}_back_spliced_junction.bed'
output:
circrna='{sample}/{sample}_known_circrna.bed'
log:
"{sample}/.logs/circexplorer2_annotate.log"
params:
shell:
'CIRCexplorer2 annotate -r {input.ref_gene} -g {input.hg38} -b {input.parsed_junctions} -o {output.circrna} > {log}'
rule circ_explorer_dir:
output:
circ_dir='{sample}/circexplorer_outputs'
shell:
'mkdir {output.star_dir}'