Add the concoct cut_up_fasta tool (#4436)

* Add the concoct cut_up_fasta tool * Attempt to fix flake8 problems * Improve help for all tools * Misc cleanup * Code cleanup and more tests * Use f strings * Use string templates * More code cleanup * Flake8 * Update the help pic * Slight code fix * Add the CONCOCT coverage_table tool * Code cleanup * Use a slice identifier when cutting contigs * Add the merge_cut_up_clustering tool * Flake8 fix
galaxyproject · Mar 13, 2022 · 40a09cb · 40a09cb
1 parent 7b6b07d
commit 40a09cb
Show file tree

Hide file tree

Showing 16 changed files with 1,886 additions and 4 deletions.
diff --git a/tools/concoct/concoct.xml b/tools/concoct/concoct.xml
@@ -109,8 +109,9 @@ The tool accepts 2 inputs; a tabular file where each row corresponds to a contig
 sample (the values are the average coverage for this contig in that sample) and a file containing sequences in
 fasta format.
 
-Three are produced; clustering of the > 1000 kmer count, the PCA transformed matrix and the PCA components.
+Three outputs are produced; clustering of the > 1000 kmer count, the PCA transformed matrix and the PCA components.
 
+@HELP_OVERVIEW@
     ]]></help>
     <expand macro="citations"/>
 </tool>
diff --git a/tools/concoct/coverage_table.py b/tools/concoct/coverage_table.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+import argparse
+import gzip
+from functools import partial
+
+from Bio import SeqIO
+
+
+def generate_coverage_table(input_fasta, input_tabular, gzipped, output):
+    # Read input file into a dict and return everything
+    # in the table format required by CONCOCT.
+    gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped)
+    assert(len(gc_and_len_dict) > 0)
+    bed_coverage_dict = get_bed_coverage_dict(input_tabular)
+
+    with open(output, 'w') as fh:
+        # Output the header.
+        fh.write("contig\tlength")
+        t = tuple(range(len(bed_coverage_dict)))
+        fh.write("\tcov_mean_sample_%d\n" % len(t))
+        # Output the content.
+        for acc in gc_and_len_dict:
+            # Fasta stats.
+            fh.write("%s\t%s" % (acc, gc_and_len_dict[acc]['length']))
+            # Mean
+            try:
+                # Coverage mean
+                fh.write("\t%f" % (bed_coverage_dict[acc]["cov_mean"]))
+            except KeyError:
+                # No reads mapped to this contig
+                fh.write("\t0")
+            fh.write("\n")
+
+
+def get_bed_coverage_dict(input_tabular):
+    # Ddetermine mean coverage and percentage covered
+    # for each contig, returning a dict with fasta id
+    # as key and percentage covered and cov_mean as keys
+    # for the inner dict.
+    out_dict = {}
+
+    with open(input_tabular, 'r') as fh:
+        for line in fh:
+            line = line.rstrip('\r\n')
+            cols = line.split('\t')
+            try:
+                d = out_dict[cols[0]]
+            except KeyError:
+                d = {}
+                out_dict[cols[0]] = d
+            if int(cols[1]) == 0:
+                d["percentage_covered"] = 100 - float(cols[4]) * 100.0
+            else:
+                d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
+    return out_dict
+
+
+def get_gc_and_len_dict(input_fasta, gzipped):
+    # Creates a dictionary with the fasta id as key
+    # and GC and length as keys for the inner dictionary.
+    if gzipped:
+        _open = partial(gzip.open, mode='rt')
+    else:
+        _open = open
+
+    out_dict = {}
+    with _open(input_fasta) as input_fh:
+        for rec in SeqIO.parse(input_fh, "fasta"):
+            out_dict[rec.id] = {}
+            out_dict[rec.id]["length"] = len(rec.seq)
+    return out_dict
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument('--input_tabular', action='store', dest='input_tabular', help='bedtools genomeCoverageBed bed file')
+parser.add_argument('--input_fasta', action='store', dest='input_fasta', help='Contigs fasta file')
+parser.add_argument("--gzipped", action="store_true", dest="gzipped", default=False, help="input_fasta is gzipped")
+parser.add_argument('--output', action='store', dest='output', help='Output file')
+
+args = parser.parse_args()
+
+generate_coverage_table(args.input_fasta, args.input_tabular, args.gzipped, args.output)
diff --git a/tools/concoct/coverage_table.xml b/tools/concoct/coverage_table.xml
@@ -0,0 +1,40 @@
+<tool id="concoct_coverage_table" name="CONCOCT coverage table" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/coverage_table.py'
+--input_fasta '$input_fasta'
+#if $input_fasta.is_of_type('fasta.gz'):
+    --gzipped
+#end if
+--input_tabular '$input_tabular'
+--output '$output'
+    ]]></command>
+    <inputs>
+        <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Contigs fasta file"/>
+        <param name="input_tabular" type="data" format="tabular" label="Tabular bedtools Genome Coverage histogram file" help="Set the bedtools Genome Coverage Output type to be Data suitable for Histogram"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_fasta" value="input_coverage_table.fasta.gz" ftype="fasta.gz"/>
+            <param name="input_tabular" value="input_coverage_table.tabular" ftype="tabular"/>
+            <output name="output" file="output_coverage_table.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Accepts an assembled (and possibly cut by the Cut fasta contigs tool) fasta contigs file and a tabular coverage histogram
+file (produced by the bedtools Genomve Coverage tool) and outputs a tabular coverage file for use as the input to the
+CONCOCT metagenome binning tool.
+
+@HELP_OVERVIEW@
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/concoct/cut_up_fasta.py b/tools/concoct/cut_up_fasta.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+import argparse
+import gzip
+from functools import partial
+
+from Bio import SeqIO
+
+
+def cut_up_fasta(input_fasta, chunk_size, overlap, merge_last, output_fasta, output_bed, gzipped):
+    if gzipped:
+        _open = partial(gzip.open, mode='rt')
+    else:
+        _open = open
+
+    fasta_fh = open(output_fasta, 'w')
+
+    if output_bed is not None:
+        bed_fh = open(output_bed, 'w')
+
+    with _open(input_fasta) as input_fh:
+        for record in SeqIO.parse(input_fh, "fasta"):
+            if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size):
+                for index, split_seq in enumerate(chunks(record.seq, chunk_size, overlap, merge_last)):
+                    fasta_fh.write(f">{record.id}.concoct_part_{index}\n{split_seq}\n")
+                    if output_bed is not None:
+                        bed_fh.write(f"{record.id}\t{chunk_size * index}\t{chunk_size * index + len(split_seq)}\t{record.id}.concoct_part_{index}\n")
+            else:
+                fasta_fh.write(f">{record.id}.concoct_part_0\n{record.seq}\n")
+                if output_bed is not None:
+                    bed_fh.write(f"{record.id}\t0\t{len(record.seq)}\t{record.id}.concoct_part_0\n")
+    if output_bed is not None:
+        bed_fh.close()
+
+
+def chunks(seq, chunk_size, overlap_size, merge_last):
+    # Yield successive chunk_size-sized chunks from seq
+    # with given overlap overlap_size between the chunks.
+    assert chunk_size > overlap_size
+    if merge_last:
+        for i in range(0, len(seq) - chunk_size + 1, chunk_size - overlap_size):
+            yield seq[i:i + chunk_size] if i + chunk_size + chunk_size - overlap_size <= len(seq) else seq[i:]
+    else:
+        for i in range(0, len(seq), chunk_size - overlap_size):
+            yield seq[i:i + chunk_size]
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input_fasta", action="store", dest="input_fasta", help="Fasta files with contigs")
+parser.add_argument("--gzipped", action="store_true", dest="gzipped", help="Input file is gzipped")
+parser.add_argument("--chunk_size", action="store", dest="chunk_size", type=int, help="Chunk size\n")
+parser.add_argument("--overlap_size", action="store", dest="overlap_size", type=int, help="Overlap size\n")
+parser.add_argument("--merge_last", default=False, action="store_true", dest="merge_last", help="Concatenate final part to last contig\n")
+parser.add_argument("--output_bed", action="store", dest="output_bed", default=None, help="BED file to be created with exact regions of the original contigs corresponding to the newly created contigs")
+parser.add_argument("--output_fasta", action="store", dest="output_fasta", help="Output fasta file with cut contigs")
+
+args = parser.parse_args()
+cut_up_fasta(args.input_fasta, args.chunk_size, args.overlap_size, args.merge_last, args.output_fasta, args.output_bed, args.gzipped)
diff --git a/tools/concoct/cut_up_fasta.xml b/tools/concoct/cut_up_fasta.xml
@@ -0,0 +1,94 @@
+<tool id="concoct_cut_up_fasta" name="CONCOCT: cut fasta contigs" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>into equal length non-overlapping or overlapping parts</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/cut_up_fasta.py'
+--input_fasta '$input_fasta'
+#if $input_fasta.is_of_type('fasta.gz'):
+    --gzipped
+#end if
+--chunk_size $chunk_size
+--overlap_size $overlap_size
+$merge_last
+#if str($output_bed_param) == 'yes':
+    --output_bed '$output_bed'
+#end if
+--output_fasta '$output_fasta'
+    ]]></command>
+    <inputs>
+        <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Fasta contigs file"/>
+        <param argument="--chunk_size" type="integer" value="1999" label="Chunk size"/>
+        <param argument="--overlap_size" type="integer" value="1900" label="Overlap size" help="Zero value produces non-overlapping parts"/>
+        <param argument="--merge_last" type="boolean" truevalue="--merge_last" falsevalue="" checked="false" label="Concatenate final part to last contig?"/>
+        <param name="output_bed_param" type="select" label="Output bed file with exact regions of the original contigs corresponding to the newly created contigs?" help="Can be used as input to the SAMTools bedcov tool">
+            <option value="no" selected="true">No</option>
+            <option value="yes">Yes</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_bed" format="bed" label="${tool.name} on ${on_string} (bed)">
+            <filter>output_bed_param == 'yes'</filter>
+        </data>
+        <data name="output_fasta" format="fasta" label="${tool.name} on ${on_string} (contigs)"/>
+    </outputs>
+    <tests>
+        <!-- default settings -->
+        <test expect_num_outputs="1">
+            <param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
+            <output name="output_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_size value="2366"/>
+                    <has_text text="116"/>
+                    <has_n_lines n="100"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- merge_last and  output bed file -->
+        <test expect_num_outputs="2">
+            <param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
+            <param name="merge_last" value="--merge_last"/>
+            <param name="output_bed_param" value="yes"/>
+            <output name="output_bed" ftype="bed">
+                <assert_contents>
+                    <has_size value="1332"/>
+                    <has_text text="116"/>
+                    <has_n_lines n="50"/>
+                </assert_contents>
+            </output>
+            <output name="output_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_size value="2366"/>
+                    <has_text text="116"/>
+                    <has_n_lines n="100"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Change chunk size and overlap size -->
+        <test expect_num_outputs="1">
+            <param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
+            <param name="chunk_size" value="500"/>
+            <param name="overlap_size" value="499"/>
+            <output name="output_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_size value="2366"/>
+                    <has_text text="116"/>
+                    <has_n_lines n="100"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Accepts a fasta file containing contigs, cuts them into non-overlapping or overlapping parts of equal length, and produces
+a fasta file containing the cut contigs.  An optional output BED file can be produced, where the cut contigs are specified
+in terms of the original contigs.  Using this file as input to a BED coverage tool (e.g., bedtools Compute both the length
+and depth of coverage) will produce a file that can be used as input to the CONCOCT Create coverage table tool.
+
+@HELP_OVERVIEW@
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/concoct/extract_fasta_bins.xml b/tools/concoct/extract_fasta_bins.xml
@@ -1,5 +1,5 @@
-<tool id="concoct_extract_fasta_bins" name="Extract a fasta file" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
-    <description>for each cluster in a CONCOCT clustering file</description>
+<tool id="concoct_extract_fasta_bins" name="CONCOCT: extract a fasta file" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>for each cluster</description>
     <macros>
         <import>macros.xml</import>
     </macros>
@@ -53,6 +53,8 @@ CONCOCT clustering file.
 
 The tool accepts two inputs; the fasta contigs file and the CONCOCT clustering file that was produced using
 the same fasta contigs input.  A collection of fasta files is produced.
+
+@HELP_OVERVIEW@
     ]]></help>
     <expand macro="citations"/>
 </tool>
diff --git a/tools/concoct/macros.xml b/tools/concoct/macros.xml
@@ -4,11 +4,20 @@
     <token name="@PROFILE@">21.01</token>
     <xml name="requirements">
         <requirements>
+            <requirement type="package" version="1.79">biopython</requirement>
             <requirement type="package" version="@TOOL_VERSION@">concoct</requirement>
             <requirement type="package" version="0.19.2">pandas</requirement>
-            <requirement type="package" version="1.79">biopython</requirement>
         </requirements>
     </xml>
+    <token name="@HELP_OVERVIEW@">
+
+The intended use of the CONCOCT tools is shown in the following image.
+
+.. image:: pipeline.png
+
+More information may be found on the CONCOCT homepage:: https://github.com/BinPro/CONCOCT
+
+    </token>
     <xml name="citations">
         <citations>
             <citation type="doi">10.1038/nmeth.3103</citation>

diff --git a/tools/concoct/merge_cut_up_clustering.py b/tools/concoct/merge_cut_up_clustering.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import argparse
+import re
+import sys
+from collections import Counter
+from collections import defaultdict
+
+
+CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')
+
+
+def original_contig_name_special(contig_id):
+    try:
+        original_id, part_index = CONTIG_PART_EXPR.match(contig_id).group(1, 2)
+        return original_id, part_index
+    except AttributeError:
+        # No matches for concoct_part regex.
+        return contig_id, 0
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", action="store", dest="input", help="Tabular file with cut up clusters")
+parser.add_argument("--output", action="store", dest="output", help="Output file with merged clusters")
+
+args = parser.parse_args()
+
+# Get cut up clusters
+all_seqs = {}
+all_originals = defaultdict(dict)
+with open(args.input, 'r') as ifh:
+    for i, line in enumerate(ifh):
+        if i == 0:
+            if 'contig_id' not in line:
+                sys.stderr.write("ERROR nvalid clustering file, 'contig_id' is not found in the header.")
+                sys.exit(-1)
+            # Skip header.
+            continue
+        line = line.rstrip('\r\n')
+        contig_id, cluster_id = line.split('\t')
+        original_contig_name, part_id = original_contig_name_special(contig_id)
+        all_originals[original_contig_name][part_id] = cluster_id
+
+# Merge cut up clusters.
+with open(args.output, 'w') as ofh:
+    ofh.write("contig_id\tcluster_id\n")
+    for original_contig_id, part_ids_d in all_originals.items():
+        if len(part_ids_d) > 1:
+            c = Counter(part_ids_d.values())
+            cluster_id = c.most_common(1)[0][0]
+            c_string = [(a, b) for a, b in c.items()]
+            # Here if len(c.values()) > 1,
+            # then no cluster for contig.
+        else:
+            cluster_id = list(part_ids_d.values())[0]
+        ofh.write(f"{original_contig_id}\t{cluster_id}\n")