circ_rna_snakemake.snk

import os

SAMPLES = []
for line in shell("ls", iterable=True):
    SAMPLES.append(line)

print("Inigo's message:Hunting circRNA")

print("Inigo's message:Checking if STAR tmp file or logs exist on the working directory, they will crash the pipeline")

if os.path.exists("_STARtmp") == True:
    print("Inigo's message: _STARtmp directory was present in the working directory. I have delete it to avoid a crash")
    os.system("rm -rf _STARtmp")
else:
    print("Inigo's message: _STARtmp not present")

if os.path.exists("Log.out") == True:
    print("Inigo's message: Log.out file was present in the working directory. I have delete it to avoid a crash")
    os.system("rm -rf Log.out")
else:
    print("Inigo's message: Log.out was not present")


rule all:
    input:
        expand('{sample}/fastqc/{sample}_1_fastqc.html', sample=SAMPLES),
        expand('{sample}/fastqc/{sample}_2_fastqc.html', sample=SAMPLES),
        expand('{sample}/{sample}_known_circrna.bed',sample=SAMPLES)


rule create_index:
    input:
        ref_genome='.reference/genome.fa',
        ref_genome_dir='.reference'
    output:
        genome_parameters='.reference/genomeParameters.txt',
        log_out='.logs/Log.out'

    threads: 16
    log:
        ".logs/star_index.log"
    shell:
    #MODIFY THIS LINE
        'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; STAR --runThreadN {threads} --runMode genomeGenerate --outStd Log {log} --genomeDir {input.ref_genome_dir} --genomeFastaFiles {input.ref_genome} > {log} 2>&1 ; mv Log.out .logs/'

rule fastqc:
    input:
        r1='{sample}/{sample}_1.fastq.gz',
        r2='{sample}/{sample}_2.fastq.gz'

    output:
        html1='{sample}/{sample}_1_fastqc.html',
        zip1='{sample}/{sample}_1_fastqc.zip',
        html2='{sample}/{sample}_2_fastqc.html',
        zip2='{sample}/{sample}_2_fastqc.zip'
    log:
        "{sample}/.logs/{sample}_fastqc.log"
    shell:
        #MODIFY THIS LINE
        'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; fastqc {input.r1} {input.r2} 2> {log}'


rule fastqc_to_dir:
    input:
        html1='{sample}/{sample}_1_fastqc.html',
        zip1='{sample}/{sample}_1_fastqc.zip',
        html2='{sample}/{sample}_2_fastqc.html',
        zip2='{sample}/{sample}_2_fastqc.zip',
    params:
        outdir='{sample}/fastqc/'
    output:
        html1='{sample}/fastqc/{sample}_1_fastqc.html',
        zip1='{sample}/fastqc/{sample}_1_fastqc.zip',
        html2='{sample}/fastqc/{sample}_2_fastqc.html',
        zip2='{sample}/fastqc/{sample}_2_fastqc.zip'
    shell:
        'mv {input.html1} {params.outdir} ; mv {input.html2} {params.outdir} ; mv {input.zip1} {params.outdir} ; mv {input.zip2} {params.outdir}'

       
rule align:
    input:
        r1='{sample}/{sample}_1.fastq.gz',
        r2='{sample}/{sample}_2.fastq.gz',
        genome_parameters='.reference/genomeParameters.txt'        

    output:
        bam='{sample}/{sample}_Aligned.out.bam',
        junctions='{sample}/{sample}_Chimeric.out.junction'
    params:
        substr='{sample}/{sample}_',
        bam_output='BAM Unsorted',
        #MODIFY THIS LINE
        reference='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/'

    log:
        "{sample}/.logs/star_alignment.log"
    shell:
    #MODIFY THIS LINE
         'export PATH="/home/iprada/bin/anaconda3/bin:$PATH" ; STAR --chimSegmentMin 10 --runThreadN {threads} --outSAMtype {params.bam_output} --genomeDir {params.reference} --readFilesCommand zcat --outFileNamePrefix {params.substr} --outStd Log {log} --readFilesIn {input.r1} {input.r2}'


rule circexplorer_parse:
    input:
        junctions='{sample}/{sample}_Chimeric.out.junction'
    output:
        parsed_junctions='{sample}/{sample}_back_spliced_junction.bed'
    params:
        aligner='STAR'
    log:
        "{sample}/.logs/circexplorer2_parse.log"
    shell:
        'CIRCexplorer2 parse -t {params.aligner} -b {output.parsed_junctions} {input.junctions} > {log}'

rule circexplorer_annotate:
    input:
    #MODIFY THE FOLLOWING TWO LINES
        ref_gene='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/refGene.txt',
        hg38='/home/iprada/faststorage/projects/circrna_snakemake/working_directory/.reference/genome.fa',
        parsed_junctions='{sample}/{sample}_back_spliced_junction.bed'        
    output:
        circrna='{sample}/{sample}_known_circrna.bed'
    log:
        "{sample}/.logs/circexplorer2_annotate.log"

    params:
    shell:
        'CIRCexplorer2 annotate -r {input.ref_gene} -g {input.hg38} -b {input.parsed_junctions} -o {output.circrna} > {log}'

rule circ_explorer_dir:
    output:
        circ_dir='{sample}/circexplorer_outputs'
    shell:
        'mkdir {output.star_dir}'