-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/usr/bin/env python | ||
|
||
## Originally written by Daniel Straub and Sabrina Krakau and released | ||
## under the MIT license. | ||
## See git repository (https://github.com/nf-core/mag) for full license text. | ||
|
||
# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> <min_length_unbinned_contigs> <max_unbinned_contigs> <min_contig_size> | ||
|
||
import pandas as pd | ||
import gzip | ||
from sys import argv | ||
from Bio import SeqIO | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
from Bio.Alphabet import generic_dna | ||
import os | ||
import re | ||
|
||
# Input | ||
input_file = argv[1] | ||
length_threshold = int(argv[2]) | ||
max_sequences = int(argv[3]) | ||
min_length_to_retain_contig = int(argv[4]) | ||
|
||
# Base name for file output | ||
if input_file.endswith(".gz"): | ||
rm_ext = input_file.replace(".gz", "") | ||
out_base = out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", rm_ext) | ||
else: | ||
out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", input_file) | ||
|
||
# Data structures to separate and store sequences | ||
df_above_threshold = pd.DataFrame(columns=["id", "seq", "length"]) | ||
pooled = [] | ||
remaining = [] | ||
|
||
if input_file.endswith(".gz"): | ||
with gzip.open(input_file, "rt") as f: | ||
fasta_sequences = SeqIO.parse(f, "fasta") | ||
|
||
for fasta in fasta_sequences: | ||
name, sequence = fasta.id, str(fasta.seq) | ||
length = len(sequence) | ||
|
||
# store each sequence above threshold together with its length into df | ||
if length >= length_threshold: | ||
df_above_threshold = df_above_threshold.append( | ||
{"id": name, "seq": sequence, "length": length}, ignore_index=True | ||
) | ||
# contigs to retain and pool | ||
elif length >= min_length_to_retain_contig: | ||
pooled.append( | ||
SeqRecord(Seq(sequence, generic_dna), id=name, description="") | ||
) | ||
# remaining sequences | ||
else: | ||
remaining.append( | ||
SeqRecord(Seq(sequence, generic_dna), id=name, description="") | ||
) | ||
else: | ||
with open(input_file) as f: | ||
fasta_sequences = SeqIO.parse(f, "fasta") | ||
|
||
for fasta in fasta_sequences: | ||
name, sequence = fasta.id, str(fasta.seq) | ||
length = len(sequence) | ||
|
||
# store each sequence above threshold together with its length into df | ||
if length >= length_threshold: | ||
df_above_threshold = df_above_threshold.append( | ||
{"id": name, "seq": sequence, "length": length}, ignore_index=True | ||
) | ||
# contigs to retain and pool | ||
elif length >= min_length_to_retain_contig: | ||
pooled.append( | ||
SeqRecord(Seq(sequence, generic_dna), id=name, description="") | ||
) | ||
# remaining sequences | ||
else: | ||
remaining.append( | ||
SeqRecord(Seq(sequence, generic_dna), id=name, description="") | ||
) | ||
|
||
# Sort sequences above threshold by length | ||
df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True) | ||
df_above_threshold.reset_index(drop=True, inplace=True) | ||
|
||
# Write `max_sequences` longest sequences (above threshold) into separate files, add remainder to pooled | ||
for index, row in df_above_threshold.iterrows(): | ||
if index + 1 <= max_sequences: | ||
print("write " + out_base + "." + str(index + 1) + ".fa") | ||
out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="") | ||
SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta") | ||
else: | ||
pooled.append( | ||
SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="") | ||
) | ||
|
||
print("write " + out_base + ".pooled.fa") | ||
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta") | ||
print("write " + out_base + ".remaining.fa") | ||
SeqIO.write(remaining, out_base + ".remaining.fa", "fasta") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
process SPLIT_FASTA { | ||
tag "${meta.assembler}-${meta.binner}-${meta.id}" | ||
label 'process_low' | ||
|
||
// Using container from metabat2 process, since this will be anyway already downloaded and contains biopython and pandas | ||
conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5" | ||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' : | ||
'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }" | ||
|
||
input: | ||
tuple val(meta), path(unbinned) | ||
|
||
output: | ||
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.[1-9]*.fa.gz") , optional:true, emit: unbinned | ||
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.pooled.fa.gz") , optional:true, emit: pooled | ||
tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.remaining.fa.gz"), optional:true, emit: remaining | ||
path "versions.yml" , emit: versions | ||
|
||
script: | ||
""" | ||
# save unbinned contigs above thresholds into individual files, dump others in one file | ||
split_fasta.py $unbinned ${params.min_length_unbinned_contigs} ${params.max_unbinned_contigs} ${params.min_contig_size} | ||
gzip *.fa | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
python: \$(python --version 2>&1 | sed 's/Python //g') | ||
biopython: 1.7.4 | ||
pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") | ||
END_VERSIONS | ||
""" | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.