Skip to content

Commit

Permalink
Added binning subworkflow and refinement from mag
Browse files Browse the repository at this point in the history
  • Loading branch information
uel3 committed Apr 12, 2024
1 parent f7d0830 commit c31a4e1
Show file tree
Hide file tree
Showing 12 changed files with 391 additions and 11 deletions.
102 changes: 102 additions & 0 deletions bin/split_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python

## Originally written by Daniel Straub and Sabrina Krakau and released
## under the MIT license.
## See git repository (https://github.com/nf-core/mag) for full license text.

# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> <min_length_unbinned_contigs> <max_unbinned_contigs> <min_contig_size>

import pandas as pd
import gzip
from sys import argv
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import os
import re

# Input
input_file = argv[1]
length_threshold = int(argv[2])
max_sequences = int(argv[3])
min_length_to_retain_contig = int(argv[4])

# Base name for file output
if input_file.endswith(".gz"):
rm_ext = input_file.replace(".gz", "")
out_base = out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", rm_ext)
else:
out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", input_file)

# Data structures to separate and store sequences
df_above_threshold = pd.DataFrame(columns=["id", "seq", "length"])
pooled = []
remaining = []

if input_file.endswith(".gz"):
with gzip.open(input_file, "rt") as f:
fasta_sequences = SeqIO.parse(f, "fasta")

for fasta in fasta_sequences:
name, sequence = fasta.id, str(fasta.seq)
length = len(sequence)

# store each sequence above threshold together with its length into df
if length >= length_threshold:
df_above_threshold = df_above_threshold.append(
{"id": name, "seq": sequence, "length": length}, ignore_index=True
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
pooled.append(
SeqRecord(Seq(sequence, generic_dna), id=name, description="")
)
# remaining sequences
else:
remaining.append(
SeqRecord(Seq(sequence, generic_dna), id=name, description="")
)
else:
with open(input_file) as f:
fasta_sequences = SeqIO.parse(f, "fasta")

for fasta in fasta_sequences:
name, sequence = fasta.id, str(fasta.seq)
length = len(sequence)

# store each sequence above threshold together with its length into df
if length >= length_threshold:
df_above_threshold = df_above_threshold.append(
{"id": name, "seq": sequence, "length": length}, ignore_index=True
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
pooled.append(
SeqRecord(Seq(sequence, generic_dna), id=name, description="")
)
# remaining sequences
else:
remaining.append(
SeqRecord(Seq(sequence, generic_dna), id=name, description="")
)

# Sort sequences above threshold by length
df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
df_above_threshold.reset_index(drop=True, inplace=True)

# Write `max_sequences` longest sequences (above threshold) into separate files, add remainder to pooled
for index, row in df_above_threshold.iterrows():
if index + 1 <= max_sequences:
print("write " + out_base + "." + str(index + 1) + ".fa")
out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
else:
pooled.append(
SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
)

print("write " + out_base + ".pooled.fa")
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
print("write " + out_base + ".remaining.fa")
SeqIO.write(remaining, out_base + ".remaining.fa", "fasta")
19 changes: 19 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,23 @@ process {
],
]
}
withName: SPLIT_FASTA {
publishDir = [
[
path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned" },
mode: params.publish_dir_mode,
pattern: '*.*[0-9].fa.gz'
],
[
path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" },
mode: params.publish_dir_mode,
pattern: '*.pooled.fa.gz'
],
[
path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" },
mode: params.publish_dir_mode,
pattern: '*.remaining.fa.gz'
]
]
}
}
44 changes: 43 additions & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"installed_by": [
"modules"
]
},
},
"megahit": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
Expand All @@ -46,6 +46,48 @@
"installed_by": [
"modules"
]
},
"dastool/dastool": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": [
"modules"
]
},
"dastool/fastatocontig2bin": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": [
"modules"
]
},
"gunzip": {
"branch": "master",
"git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
"installed_by": [
"modules"
]
},
"maxbin2": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": [
"modules"
]
},
"metabat2/jgisummarizebamcontigdepths": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": [
"modules"
]
},
"metabat2/metabat2": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": [
"modules"
]
}
}
}
Expand Down
34 changes: 34 additions & 0 deletions modules/local/split_fasta.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
process SPLIT_FASTA {
tag "${meta.assembler}-${meta.binner}-${meta.id}"
label 'process_low'

// Using container from metabat2 process, since this will be anyway already downloaded and contains biopython and pandas
conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' :
'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }"

input:
tuple val(meta), path(unbinned)

output:
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.[1-9]*.fa.gz") , optional:true, emit: unbinned
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.pooled.fa.gz") , optional:true, emit: pooled
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.remaining.fa.gz"), optional:true, emit: remaining
path "versions.yml" , emit: versions

script:
"""
# save unbinned contigs above thresholds into individual files, dump others in one file
split_fasta.py $unbinned ${params.min_length_unbinned_contigs} ${params.max_unbinned_contigs} ${params.min_contig_size}
gzip *.fa
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version 2>&1 | sed 's/Python //g')
biopython: 1.7.4
pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
END_VERSIONS
"""
}
7 changes: 7 additions & 0 deletions modules/nf-core/gunzip/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 48 additions & 0 deletions modules/nf-core/gunzip/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions modules/nf-core/gunzip/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions modules/nf-core/gunzip/tests/main.nf.test

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions modules/nf-core/gunzip/tests/main.nf.test.snap

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions modules/nf-core/gunzip/tests/tags.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c31a4e1

Please sign in to comment.