Added binning subworkflow and refinement from mag

uel3 · Apr 12, 2024 · c31a4e1 · c31a4e1
1 parent f7d0830
commit c31a4e1
Show file tree

Hide file tree

Showing 12 changed files with 391 additions and 11 deletions.
diff --git a/bin/split_fasta.py b/bin/split_fasta.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+## Originally written by Daniel Straub and Sabrina Krakau and released
+## under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> <min_length_unbinned_contigs> <max_unbinned_contigs> <min_contig_size>
+
+import pandas as pd
+import gzip
+from sys import argv
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import generic_dna
+import os
+import re
+
+# Input
+input_file = argv[1]
+length_threshold = int(argv[2])
+max_sequences = int(argv[3])
+min_length_to_retain_contig = int(argv[4])
+
+# Base name for file output
+if input_file.endswith(".gz"):
+    rm_ext = input_file.replace(".gz", "")
+    out_base = out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", rm_ext)
+else:
+    out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", input_file)
+
+# Data structures to separate and store sequences
+df_above_threshold = pd.DataFrame(columns=["id", "seq", "length"])
+pooled = []
+remaining = []
+
+if input_file.endswith(".gz"):
+    with gzip.open(input_file, "rt") as f:
+        fasta_sequences = SeqIO.parse(f, "fasta")
+
+        for fasta in fasta_sequences:
+            name, sequence = fasta.id, str(fasta.seq)
+            length = len(sequence)
+
+            # store each sequence above threshold together with its length into df
+            if length >= length_threshold:
+                df_above_threshold = df_above_threshold.append(
+                    {"id": name, "seq": sequence, "length": length}, ignore_index=True
+                )
+            # contigs to retain and pool
+            elif length >= min_length_to_retain_contig:
+                pooled.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
+            # remaining sequences
+            else:
+                remaining.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
+else:
+    with open(input_file) as f:
+        fasta_sequences = SeqIO.parse(f, "fasta")
+
+        for fasta in fasta_sequences:
+            name, sequence = fasta.id, str(fasta.seq)
+            length = len(sequence)
+
+            # store each sequence above threshold together with its length into df
+            if length >= length_threshold:
+                df_above_threshold = df_above_threshold.append(
+                    {"id": name, "seq": sequence, "length": length}, ignore_index=True
+                )
+            # contigs to retain and pool
+            elif length >= min_length_to_retain_contig:
+                pooled.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
+            # remaining sequences
+            else:
+                remaining.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
+
+# Sort sequences above threshold by length
+df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
+df_above_threshold.reset_index(drop=True, inplace=True)
+
+# Write `max_sequences` longest sequences (above threshold) into separate files, add remainder to pooled
+for index, row in df_above_threshold.iterrows():
+    if index + 1 <= max_sequences:
+        print("write " + out_base + "." + str(index + 1) + ".fa")
+        out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
+        SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
+    else:
+        pooled.append(
+            SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
+        )
+
+print("write " + out_base + ".pooled.fa")
+SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
+print("write " + out_base + ".remaining.fa")
+SeqIO.write(remaining, out_base + ".remaining.fa", "fasta")
diff --git a/conf/modules.config b/conf/modules.config
@@ -180,4 +180,23 @@ process {
             ],
         ]
     }
+    withName: SPLIT_FASTA {
+        publishDir = [
+            [
+                path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned" },
+                mode: params.publish_dir_mode,
+                pattern: '*.*[0-9].fa.gz'
+            ],
+            [
+                path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" },
+                mode: params.publish_dir_mode,
+                pattern: '*.pooled.fa.gz'
+            ],
+            [
+                path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" },
+                mode: params.publish_dir_mode,
+                pattern: '*.remaining.fa.gz'
+            ]
+        ]
+    }
 }
diff --git a/modules.json b/modules.json
@@ -32,7 +32,7 @@
                         "installed_by": [
                             "modules"
                         ]
-                      },
+                    },
                     "megahit": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
@@ -46,6 +46,48 @@
                         "installed_by": [
                             "modules"
                         ]
+                    },
+                    "dastool/dastool": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                      "dastool/fastatocontig2bin": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                      "gunzip": {
+                        "branch": "master",
+                        "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                      "maxbin2": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                      "metabat2/jgisummarizebamcontigdepths": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                      "metabat2/metabat2": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": [
+                            "modules"
+                        ]
                     }
                 }
             }

diff --git a/modules/local/split_fasta.nf b/modules/local/split_fasta.nf
@@ -0,0 +1,34 @@
+process SPLIT_FASTA {
+    tag "${meta.assembler}-${meta.binner}-${meta.id}"
+    label 'process_low'
+
+    // Using container from metabat2 process, since this will be anyway already downloaded and contains biopython and pandas
+    conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' :
+        'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }"
+
+    input:
+    tuple val(meta), path(unbinned)
+
+    output:
+    tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.[1-9]*.fa.gz")   , optional:true, emit: unbinned
+    tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.pooled.fa.gz")   , optional:true, emit: pooled
+    tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.remaining.fa.gz"), optional:true, emit: remaining
+    path "versions.yml"                                                                   , emit: versions
+
+    script:
+    """
+    # save unbinned contigs above thresholds into individual files, dump others in one file
+    split_fasta.py $unbinned ${params.min_length_unbinned_contigs} ${params.max_unbinned_contigs} ${params.min_contig_size}
+
+    gzip *.fa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version 2>&1 | sed 's/Python //g')
+        biopython: 1.7.4
+        pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml
diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf
diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml
diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test
diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap
diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml