Skip to content

Commit

Permalink
a lot of small fixes - but now it runs for my small test subset
Browse files Browse the repository at this point in the history
still needs to be benchmarked to see if performance is still ok
  • Loading branch information
Las02 committed Jan 5, 2025
1 parent 34cc23e commit 31fc1c2
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 74 deletions.
11 changes: 5 additions & 6 deletions workflow_avamb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ In short it will:

```
1. Filter contigs for 2000bp and rename them to conform with the multi-split workflow
2. Index resulting contig-file with minimap2 and create a sequence dictionary
3. Map reads with minimap2 to the combined contig set
4. Sort bam-files
5. Run Vamb and Aamb to bin the contigs, generating 3 sets of bins: vae_bins, aae_z_bins and aae_y_bins
6. Determine completeness and contamination of the bins using CheckM2
7. Dereplicate near complete (or whichever completeness and contamination thresholds were set), assuring contigs are present in one bin only.
2. Map reads with strobealign to the combined contig set
3. Sort bam-files
4. Run Vamb and Aamb to bin the contigs, generating 3 sets of bins: vae_bins, aae_z_bins and aae_y_bins
5. Determine completeness and contamination of the bins using CheckM2
6. Dereplicate near complete (or whichever completeness and contamination thresholds were set), assuring contigs are present in one bin only.
```

The nice thing about using snakemake for this is that it will keep track of which jobs have finished, and it allows the workflow to be run on different hardware such as a laptop, a Linux workstation and an HPC facility (currently with qsub). Keep in mind that there are three different paths (called directed acyclic graphs in snakemake) that can be executed by snakemake depending on the outputs generated during the workflow, which complicates the interpretation of the snakemake file a bit. That's why we added some comments for each rule briefly explaining its purpose. Feel free to reach out to us if you encounter any problems.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def get_config(name, default, regex):
raise ValueError(
f"Config option \"{name}\" is \"{res}\", but must conform to regex \"{regex}\"")
return res



# set configurations
Expand Down Expand Up @@ -106,31 +107,29 @@ def get_config(name, default, regex):
"avamb"
shell: "python {params.path} {output} {input} -m {MIN_CONTIG_SIZE}"

# Create abundance tables by aligning reads to the catalogue
# For each sample, map its reads against the filtered contig catalogue with
# `strobealign --aemb`, producing a per-contig estimated-abundance TSV that is
# later pasted into one abundance table.
# Fix: the original rule declared the `conda:` directive twice (once before
# `log:` and once after); the duplicate has been removed.
rule strobealign:
    input:
        contigs = os.path.join(OUTDIR,"contigs.flt.fna.gz"),
        fq = lambda wildcards: sample2path[wildcards.sample],
    output: temp(os.path.join(OUTDIR, "mapped", "{sample}.aemb.tsv"))
    params:
        walltime="864000",
        nodes="1",
        ppn=SA_PPN
    resources:
        mem=SA_MEM
    threads:
        int(SA_PPN)
    conda:
        "envs/strobealign.yaml"
    log:
        out_sa = os.path.join(OUTDIR,"log/map/{sample}.strobealign.log"),
        o = os.path.join(OUTDIR,"log/map/{sample}.strobealign.o"),
        e = os.path.join(OUTDIR,"log/map/{sample}.strobealign.e")
    shell:
        "strobealign -t {threads} --aemb {input.contigs} {input.fq}"
        " > {output} 2> {log.out_sa}"
# # Create abundance tables by aligning reads to the catalogue
# rule strobealign:
# input:
# contigs = os.path.join(OUTDIR,"contigs.flt.fna.gz"),
# fq = lambda wildcards: sample2path[wildcards.sample],
# output: temp(os.path.join(OUTDIR, "mapped", "{sample}.aemb.tsv"))
# params:
# walltime="864000",
# nodes="1",
# ppn=SA_PPN
# resources:
# mem=SA_MEM
# threads:
# int(SA_PPN)
# conda:
# "envs/strobealign.yaml"
# log:
# out_sa = os.path.join(OUTDIR,"log/map/{sample}.strobealign.log"),
# o = os.path.join(OUTDIR,"log/map/{sample}.strobealign.o"),
# e = os.path.join(OUTDIR,"log/map/{sample}.strobealign.e")
# shell:
# "strobealign -t {threads} --aemb {input.contigs} {input.fq}"
# " > {output} 2> {log.out_sa}"

rule create_abundance_tsv:
input:
Expand All @@ -154,9 +153,26 @@ def get_config(name, default, regex):
e = os.path.join(OUTDIR,"log/contigs/abundance.e")
conda:
"avamb"
shell: "cat <(echo -e 'contigname\tdepht') {input.files} > {output}"
shell:
"""
# TODO go back to the original
# Paste the abundance files together so that the first column is the contigname
# and the remaining columns are the depths.
# The sed command keeps every second column,
# i.e. it extracts the depth column from each abundance file.
# Might want to refactor this as a python script.
paste {input.files} \
| cut -f1 \
| paste - {input.files} \
| sed -r 's/\\t[^\\t]*(\\t|$)/\\1/g' \
| cat <(echo "contigname {input.files}" | sed 's= =\\t=g') - > {output}
# Getting every second column:
# https://www.unix.com/shell-programming-and-scripting/253617-printing-every-alternate-columns-text-file.html
"""

# Generate the 3 sets of clusters and bins
# Generate the 2 sets of clusters and bins from running avamb
rule run_avamb:
input:
contigs=os.path.join(OUTDIR,"contigs.flt.fna.gz"),
Expand All @@ -165,7 +181,7 @@ def get_config(name, default, regex):
outdir_avamb=directory(os.path.join(OUTDIR,"avamb")),
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv"),
# clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv"),
composition=os.path.join(OUTDIR,"avamb/composition.npz"),
params:
walltime="86400",
Expand Down Expand Up @@ -197,10 +213,84 @@ def get_config(name, default, regex):
touch {log.vamb_out}
"""

# Generate the clusters from running vamb
# Run the default `vamb bin default` (VAE) binner on the filtered contig
# catalogue and the pasted abundance table, writing clusters and bins into
# OUTDIR/default_vamb. Downstream, rule format_vamb_output merges this
# output with the avamb (AAE) bins.
rule run_vamb:
    input:
        contigs=os.path.join(OUTDIR,"contigs.flt.fna.gz"),
        abundance=os.path.join(OUTDIR,"abundance.tsv")
    output:
        # Whole output directory plus the split clusters file consumed by
        # rule format_vamb_output.
        outdir_vamb=directory(os.path.join(OUTDIR,"default_vamb")),
        clusters_vamb=os.path.join(OUTDIR,"default_vamb/vae_clusters_split.tsv"),
    params:
        walltime="86400",
        nodes="1",
        ppn=AVAMB_PPN,
        # Pass --cuda through to vamb only when GPU use was configured.
        cuda="--cuda" if CUDA else ""
    resources:
        mem=AVAMB_MEM
    threads:
        int(avamb_threads)
    conda:
        "avamb"
    log:
        # NOTE(review): vamb_out is declared but the shell command below never
        # writes or touches it (rule run_avamb does `touch {log.vamb_out}`) --
        # confirm whether a touch is missing here or the log entry is stale.
        vamb_out=os.path.join(OUTDIR,"tmp/vamb_finished.log"),
        o=os.path.join(OUTDIR,'log','run_vamb.out'),
        e=os.path.join(OUTDIR,'log','run_vamb.err')
    shell:
        """
        rm -rf {output.outdir_vamb} # Snakemake creates an empty dir, this removes it before running vamb, as vamb will check whether the dir exist and dont run if it does.
        vamb bin default --outdir {output.outdir_vamb} --fasta {input.contigs} -p {threads} \
        --abundance_tsv {input.abundance} -m {MIN_CONTIG_SIZE} --minfasta {MIN_BIN_SIZE} {params.cuda} {AVAMB_PARAMS}
        """

# Massage the default-vamb output so it can be merged with the avamb (AAE)
# output: bins are renamed (an "X" suffix) to avoid clashing with AAE bin
# names, moved into the shared avamb/bins directory, and finally grouped into
# one sub-directory per sample -- the layout CheckM2 expects downstream.
rule format_vamb_output:
    input:
        clusters_vamb=os.path.join(OUTDIR,"default_vamb/vae_clusters_split.tsv"),
        clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
        clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
    output:
        # Sentinel file only; the real effect is the reorganised bins
        # directory and the rewritten clusters_vamb file.
        os.path.join(OUTDIR,"tmp/format_vamb_done.txt"),
    params:
        walltime="300",
        nodes="1",
        ppn="1"
    resources:
        mem="1GB"
    log:
        o=os.path.join(OUTDIR,'log','samples_with_bins.out'),
        e=os.path.join(OUTDIR,'log','samples_with_bins.err')
    threads:
        1
    # NOTE(review): the script below is not idempotent -- rerunning after a
    # partial run will fail (see the TODO inside). It also rewrites an *input*
    # file (clusters_vamb) in place, which can confuse Snakemake's staleness
    # tracking. Finally, `find -name $sample'*'` matches by prefix, so a
    # sample whose name is a prefix of another (e.g. S1 vs S11) would capture
    # the other sample's bins -- confirm sample names rule this out.
    shell:
        """
        # rename the default vamb clusternames to not overlap with the aae clusternames, adding X at the end of each
        ls {OUTDIR}/default_vamb/bins | sed 's/.fna//' \
        | xargs -I __ mv {OUTDIR}/default_vamb/bins/__.fna {OUTDIR}/default_vamb/bins/__X.fna
        mv {OUTDIR}/default_vamb/bins/* {OUTDIR}/avamb/bins
        # Also rename the clusters in vae_clusters_split so the cluster named are equivalient to above
        cat {input.clusters_vamb} | awk '{{printf "%sX\\t%s\\n", $1, $2}}' \
        | tail -n +2 | cat <(echo -e "clustername\\tcontigname") - > {input.clusters_vamb}_x
        mv {input.clusters_vamb}_x {input.clusters_vamb}
        # Go through each sample and make a dir for each containing the bins
        # then move the bins there. This is the format Checkm2 uses so we need to convert to it
        dir={OUTDIR}/avamb/bins
        for sample in $(ls $dir | cut -f1 -d 'C' | sort | uniq)
        do
        mkdir -p $dir/$sample
        # TODO should not crash if allready have been run
        find $dir -type f -name $sample'*' | xargs -I X mv X $dir/$sample
        done
        touch {output}
        """

# Evaluate in which samples bins were reconstructed
checkpoint samples_with_bins:
input:
os.path.join(OUTDIR,"tmp/avamb_finished.log")
# os.path.join(OUTDIR,"tmp/avamb_finished.log"),
os.path.join(OUTDIR,"tmp/format_vamb_done.txt"),
output:
os.path.join(OUTDIR,"tmp/samples_with_bins.txt")
params:
Expand All @@ -216,7 +306,10 @@ def get_config(name, default, regex):
threads:
1
shell:
"find {OUTDIR}/avamb/bins/*/ -type d ! -empty |sed 's=.*bins/==g' |sed 's=/==g' > {output}"
"""
# Get the names of each sample
find {OUTDIR}/avamb/bins/* -type d ! -empty |sed 's=.*bins/==g' |sed 's=/==g' > {output}
"""


def samples_with_bins_f(wildcards):
Expand All @@ -242,8 +335,8 @@ def samples_with_bins_f(wildcards):
o=os.path.join(OUTDIR,'log','checkm2_{sample}.out'),
e=os.path.join(OUTDIR,'log','checkm2_{sample}.err')

conda:
"checkm2"
conda:
"envs/checkm2.yaml"
shell:
"""
checkm2 predict --threads {threads} --input {OUTDIR}/avamb/bins/{wildcards.sample}/*.fna --output-directory {OUTDIR}/tmp/checkm2_all/{wildcards.sample}
Expand Down Expand Up @@ -276,7 +369,7 @@ def samples_with_bins_f(wildcards):
# - {bin : bin_path}
rule create_cluster_scores_bin_path_dictionaries:
input:
checkm2_finished_log_file = os.path.join(OUTDIR,"tmp/checkm2_finished.txt")
checkm2_finished_log_file = os.path.join(OUTDIR,"tmp/checkm2_finished.txt"),
output:
cluster_score_dict_path_avamb = os.path.join(OUTDIR,"tmp/cs_d_avamb.json"),
bin_path_dict_path_avamb = os.path.join(OUTDIR,"tmp/bp_d_avamb.json"),
Expand Down Expand Up @@ -305,7 +398,7 @@ def samples_with_bins_f(wildcards):
composition=os.path.join(OUTDIR,"avamb/composition.npz"),
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv")
clusters_vamb=os.path.join(OUTDIR,"default_vamb/vae_clusters_split.tsv")

output:
clusters_avamb_manual_drep=os.path.join(OUTDIR,"tmp/avamb_manual_drep_clusters.tsv")
Expand Down Expand Up @@ -437,6 +530,7 @@ def ripped_bins_avamb_check_output_f(wildcards):
--output-directory {OUTDIR}/tmp/ripped_bins/checkm2_out
touch {log.log_fin}
"""

# As explained in rule create_ripped_bins_avamb, contigs present in three bins were removed from the larger bin.
# This rule updates the quality scores after removing such contig(s).
rule update_cs_d_avamb:
Expand Down Expand Up @@ -551,23 +645,23 @@ def ripped_bins_avamb_check_output_f(wildcards):
e=os.path.join(OUTDIR,'log','workflow_finished.err')
shell:
"""
rm -r {OUTDIR}/tmp/checkm2_all/*/protein_files
rm -r {OUTDIR}/avamb/bins
# TODO uncomment these again
# rm -r {OUTDIR}/tmp/checkm2_all/*/protein_files
mkdir {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/tmp/*log {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/tmp/*json {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/tmp/*tsv {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/tmp/*txt {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/tmp/ripped_bins {OUTDIR}/tmp/snakemake_tmp/
# rm -r {OUTDIR}/avamb/bins
mv {OUTDIR}/tmp/checkm2_all {OUTDIR}/tmp/snakemake_tmp/
mv {OUTDIR}/avamb/avamb_manual_drep_disjoint_clusters.tsv {OUTDIR}/Final_clusters.tsv
mv {OUTDIR}/abundance.npz {OUTDIR}/tmp/
mv {OUTDIR}/mapped {OUTDIR}/tmp/
mv {OUTDIR}/contigs.flt.fna.gz {OUTDIR}/tmp/
# mkdir {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/tmp/*log {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/tmp/*json {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/tmp/*tsv {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/tmp/*txt {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/tmp/ripped_bins {OUTDIR}/tmp/snakemake_tmp/
#
# mv {OUTDIR}/tmp/checkm2_all {OUTDIR}/tmp/snakemake_tmp/
# mv {OUTDIR}/avamb/avamb_manual_drep_disjoint_clusters.tsv {OUTDIR}/Final_clusters.tsv
# mv {OUTDIR}/abundance.npz {OUTDIR}/tmp/
# mv {OUTDIR}/mapped {OUTDIR}/tmp/
# mv {OUTDIR}/contigs.flt.fna.gz {OUTDIR}/tmp/
touch {output}
Expand Down
6 changes: 6 additions & 0 deletions workflow_avamb/envs/checkm2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Conda environment for CheckM2, used by the run_checkm2 rule
# (referenced as envs/checkm2.yaml from the Snakefile).
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- checkm2
19 changes: 0 additions & 19 deletions workflow_avamb/envs/checkm2.yml

This file was deleted.

0 comments on commit 31fc1c2

Please sign in to comment.