Skip to content

Commit

Permalink
Add the concoct cut_up_fasta tool (#4436)
Browse files Browse the repository at this point in the history
* Add the concoct cut_up_fasta tool

* Attempt to fix flake8 problems

* Improve help for all tools

* Misc cleanup

* Code cleanup and more tests

* Use f strings

* Use string templates

* More code cleanup

* Flake8

* Update the help pic

* Slight code fix

* Add the CONCOCT coverage_table tool

* Code cleanup

* Use a slice identifier when cutting contigs

* Add the merge_cut_up_clustering tool

* Flake8 fix
  • Loading branch information
gregvonkuster authored Mar 13, 2022
1 parent 7b6b07d commit 40a09cb
Show file tree
Hide file tree
Showing 16 changed files with 1,886 additions and 4 deletions.
3 changes: 2 additions & 1 deletion tools/concoct/concoct.xml
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@ The tool accepts 2 inputs; a tabular file where each row corresponds to a contig
sample (the values are the average coverage for this contig in that sample) and a file containing sequences in
fasta format.
Three are produced; clustering of the > 1000 kmer count, the PCA transformed matrix and the PCA components.
Three outputs are produced; clustering of the > 1000 kmer count, the PCA transformed matrix and the PCA components.
@HELP_OVERVIEW@
]]></help>
<expand macro="citations"/>
</tool>
83 changes: 83 additions & 0 deletions tools/concoct/coverage_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python

import argparse
import gzip
from functools import partial

from Bio import SeqIO


def generate_coverage_table(input_fasta, input_tabular, gzipped, output):
# Read input file into a dict and return everything
# in the table format required by CONCOCT.
gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped)
assert(len(gc_and_len_dict) > 0)
bed_coverage_dict = get_bed_coverage_dict(input_tabular)

with open(output, 'w') as fh:
# Output the header.
fh.write("contig\tlength")
t = tuple(range(len(bed_coverage_dict)))
fh.write("\tcov_mean_sample_%d\n" % len(t))
# Output the content.
for acc in gc_and_len_dict:
# Fasta stats.
fh.write("%s\t%s" % (acc, gc_and_len_dict[acc]['length']))
# Mean
try:
# Coverage mean
fh.write("\t%f" % (bed_coverage_dict[acc]["cov_mean"]))
except KeyError:
# No reads mapped to this contig
fh.write("\t0")
fh.write("\n")


def get_bed_coverage_dict(input_tabular):
# Ddetermine mean coverage and percentage covered
# for each contig, returning a dict with fasta id
# as key and percentage covered and cov_mean as keys
# for the inner dict.
out_dict = {}

with open(input_tabular, 'r') as fh:
for line in fh:
line = line.rstrip('\r\n')
cols = line.split('\t')
try:
d = out_dict[cols[0]]
except KeyError:
d = {}
out_dict[cols[0]] = d
if int(cols[1]) == 0:
d["percentage_covered"] = 100 - float(cols[4]) * 100.0
else:
d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
return out_dict


def get_gc_and_len_dict(input_fasta, gzipped):
# Creates a dictionary with the fasta id as key
# and GC and length as keys for the inner dictionary.
if gzipped:
_open = partial(gzip.open, mode='rt')
else:
_open = open

out_dict = {}
with _open(input_fasta) as input_fh:
for rec in SeqIO.parse(input_fh, "fasta"):
out_dict[rec.id] = {}
out_dict[rec.id]["length"] = len(rec.seq)
return out_dict


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--input_tabular', action='store', dest='input_tabular', help='bedtools genomeCoverageBed bed file')
parser.add_argument('--input_fasta', action='store', dest='input_fasta', help='Contigs fasta file')
parser.add_argument("--gzipped", action="store_true", dest="gzipped", default=False, help="input_fasta is gzipped")
parser.add_argument('--output', action='store', dest='output', help='Output file')

args = parser.parse_args()

generate_coverage_table(args.input_fasta, args.input_tabular, args.gzipped, args.output)
40 changes: 40 additions & 0 deletions tools/concoct/coverage_table.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<tool id="concoct_coverage_table" name="CONCOCT coverage table" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description></description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/coverage_table.py'
--input_fasta '$input_fasta'
#if $input_fasta.is_of_type('fasta.gz'):
--gzipped
#end if
--input_tabular '$input_tabular'
--output '$output'
]]></command>
<inputs>
<param name="input_fasta" type="data" format="fasta,fasta.gz" label="Contigs fasta file"/>
<param name="input_tabular" type="data" format="tabular" label="Tabular bedtools Genome Coverage histogram file" help="Set the bedtools Genome Coverage Output type to be Data suitable for Histogram"/>
</inputs>
<outputs>
<data name="output" format="tabular"/>
</outputs>
<tests>
<test expect_num_outputs="1">
<param name="input_fasta" value="input_coverage_table.fasta.gz" ftype="fasta.gz"/>
<param name="input_tabular" value="input_coverage_table.tabular" ftype="tabular"/>
<output name="output" file="output_coverage_table.tabular" ftype="tabular"/>
</test>
</tests>
<help><![CDATA[
**What it does**
Accepts an assembled (and possibly cut by the Cut fasta contigs tool) fasta contigs file and a tabular coverage histogram
file (produced by the bedtools Genomve Coverage tool) and outputs a tabular coverage file for use as the input to the
CONCOCT metagenome binning tool.
@HELP_OVERVIEW@
]]></help>
<expand macro="citations"/>
</tool>
58 changes: 58 additions & 0 deletions tools/concoct/cut_up_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

import argparse
import gzip
from functools import partial

from Bio import SeqIO


def cut_up_fasta(input_fasta, chunk_size, overlap, merge_last, output_fasta, output_bed, gzipped):
if gzipped:
_open = partial(gzip.open, mode='rt')
else:
_open = open

fasta_fh = open(output_fasta, 'w')

if output_bed is not None:
bed_fh = open(output_bed, 'w')

with _open(input_fasta) as input_fh:
for record in SeqIO.parse(input_fh, "fasta"):
if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size):
for index, split_seq in enumerate(chunks(record.seq, chunk_size, overlap, merge_last)):
fasta_fh.write(f">{record.id}.concoct_part_{index}\n{split_seq}\n")
if output_bed is not None:
bed_fh.write(f"{record.id}\t{chunk_size * index}\t{chunk_size * index + len(split_seq)}\t{record.id}.concoct_part_{index}\n")
else:
fasta_fh.write(f">{record.id}.concoct_part_0\n{record.seq}\n")
if output_bed is not None:
bed_fh.write(f"{record.id}\t0\t{len(record.seq)}\t{record.id}.concoct_part_0\n")
if output_bed is not None:
bed_fh.close()


def chunks(seq, chunk_size, overlap_size, merge_last):
# Yield successive chunk_size-sized chunks from seq
# with given overlap overlap_size between the chunks.
assert chunk_size > overlap_size
if merge_last:
for i in range(0, len(seq) - chunk_size + 1, chunk_size - overlap_size):
yield seq[i:i + chunk_size] if i + chunk_size + chunk_size - overlap_size <= len(seq) else seq[i:]
else:
for i in range(0, len(seq), chunk_size - overlap_size):
yield seq[i:i + chunk_size]


parser = argparse.ArgumentParser()
parser.add_argument("--input_fasta", action="store", dest="input_fasta", help="Fasta files with contigs")
parser.add_argument("--gzipped", action="store_true", dest="gzipped", help="Input file is gzipped")
parser.add_argument("--chunk_size", action="store", dest="chunk_size", type=int, help="Chunk size\n")
parser.add_argument("--overlap_size", action="store", dest="overlap_size", type=int, help="Overlap size\n")
parser.add_argument("--merge_last", default=False, action="store_true", dest="merge_last", help="Concatenate final part to last contig\n")
parser.add_argument("--output_bed", action="store", dest="output_bed", default=None, help="BED file to be created with exact regions of the original contigs corresponding to the newly created contigs")
parser.add_argument("--output_fasta", action="store", dest="output_fasta", help="Output fasta file with cut contigs")

args = parser.parse_args()
cut_up_fasta(args.input_fasta, args.chunk_size, args.overlap_size, args.merge_last, args.output_fasta, args.output_bed, args.gzipped)
94 changes: 94 additions & 0 deletions tools/concoct/cut_up_fasta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
<tool id="concoct_cut_up_fasta" name="CONCOCT: cut fasta contigs" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>into equal length non-overlapping or overlapping parts</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/cut_up_fasta.py'
--input_fasta '$input_fasta'
#if $input_fasta.is_of_type('fasta.gz'):
--gzipped
#end if
--chunk_size $chunk_size
--overlap_size $overlap_size
$merge_last
#if str($output_bed_param) == 'yes':
--output_bed '$output_bed'
#end if
--output_fasta '$output_fasta'
]]></command>
<inputs>
<param name="input_fasta" type="data" format="fasta,fasta.gz" label="Fasta contigs file"/>
<param argument="--chunk_size" type="integer" value="1999" label="Chunk size"/>
<param argument="--overlap_size" type="integer" value="1900" label="Overlap size" help="Zero value produces non-overlapping parts"/>
<param argument="--merge_last" type="boolean" truevalue="--merge_last" falsevalue="" checked="false" label="Concatenate final part to last contig?"/>
<param name="output_bed_param" type="select" label="Output bed file with exact regions of the original contigs corresponding to the newly created contigs?" help="Can be used as input to the SAMTools bedcov tool">
<option value="no" selected="true">No</option>
<option value="yes">Yes</option>
</param>
</inputs>
<outputs>
<data name="output_bed" format="bed" label="${tool.name} on ${on_string} (bed)">
<filter>output_bed_param == 'yes'</filter>
</data>
<data name="output_fasta" format="fasta" label="${tool.name} on ${on_string} (contigs)"/>
</outputs>
<tests>
<!-- default settings -->
<test expect_num_outputs="1">
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
<output name="output_fasta" ftype="fasta">
<assert_contents>
<has_size value="2366"/>
<has_text text="116"/>
<has_n_lines n="100"/>
</assert_contents>
</output>
</test>
<!-- merge_last and output bed file -->
<test expect_num_outputs="2">
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
<param name="merge_last" value="--merge_last"/>
<param name="output_bed_param" value="yes"/>
<output name="output_bed" ftype="bed">
<assert_contents>
<has_size value="1332"/>
<has_text text="116"/>
<has_n_lines n="50"/>
</assert_contents>
</output>
<output name="output_fasta" ftype="fasta">
<assert_contents>
<has_size value="2366"/>
<has_text text="116"/>
<has_n_lines n="100"/>
</assert_contents>
</output>
</test>
<!-- Change chunk size and overlap size -->
<test expect_num_outputs="1">
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/>
<param name="chunk_size" value="500"/>
<param name="overlap_size" value="499"/>
<output name="output_fasta" ftype="fasta">
<assert_contents>
<has_size value="2366"/>
<has_text text="116"/>
<has_n_lines n="100"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
**What it does**
Accepts a fasta file containing contigs, cuts them into non-overlapping or overlapping parts of equal length, and produces
a fasta file containing the cut contigs. An optional output BED file can be produced, where the cut contigs are specified
in terms of the original contigs. Using this file as input to a BED coverage tool (e.g., bedtools Compute both the length
and depth of coverage) will produce a file that can be used as input to the CONCOCT Create coverage table tool.
@HELP_OVERVIEW@
]]></help>
<expand macro="citations"/>
</tool>
6 changes: 4 additions & 2 deletions tools/concoct/extract_fasta_bins.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<tool id="concoct_extract_fasta_bins" name="Extract a fasta file" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>for each cluster in a CONCOCT clustering file</description>
<tool id="concoct_extract_fasta_bins" name="CONCOCT: extract a fasta file" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>for each cluster</description>
<macros>
<import>macros.xml</import>
</macros>
Expand Down Expand Up @@ -53,6 +53,8 @@ CONCOCT clustering file.
The tool accepts two inputs; the fasta contigs file and the CONCOCT clustering file that was produced using
the same fasta contigs input. A collection of fasta files is produced.
@HELP_OVERVIEW@
]]></help>
<expand macro="citations"/>
</tool>
11 changes: 10 additions & 1 deletion tools/concoct/macros.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,20 @@
<token name="@PROFILE@">21.01</token>
<xml name="requirements">
<requirements>
<requirement type="package" version="1.79">biopython</requirement>
<requirement type="package" version="@TOOL_VERSION@">concoct</requirement>
<requirement type="package" version="0.19.2">pandas</requirement>
<requirement type="package" version="1.79">biopython</requirement>
</requirements>
</xml>
<token name="@HELP_OVERVIEW@">

The intended use of the CONCOCT tools is shown in the following image.

.. image:: pipeline.png

More information may be found on the CONCOCT homepage:: https://github.com/BinPro/CONCOCT

</token>
<xml name="citations">
<citations>
<citation type="doi">10.1038/nmeth.3103</citation>
Expand Down
56 changes: 56 additions & 0 deletions tools/concoct/merge_cut_up_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python

import argparse
import re
import sys
from collections import Counter
from collections import defaultdict


CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')


def original_contig_name_special(contig_id):
try:
original_id, part_index = CONTIG_PART_EXPR.match(contig_id).group(1, 2)
return original_id, part_index
except AttributeError:
# No matches for concoct_part regex.
return contig_id, 0


parser = argparse.ArgumentParser()
parser.add_argument("--input", action="store", dest="input", help="Tabular file with cut up clusters")
parser.add_argument("--output", action="store", dest="output", help="Output file with merged clusters")

args = parser.parse_args()

# Get cut up clusters
all_seqs = {}
all_originals = defaultdict(dict)
with open(args.input, 'r') as ifh:
for i, line in enumerate(ifh):
if i == 0:
if 'contig_id' not in line:
sys.stderr.write("ERROR nvalid clustering file, 'contig_id' is not found in the header.")
sys.exit(-1)
# Skip header.
continue
line = line.rstrip('\r\n')
contig_id, cluster_id = line.split('\t')
original_contig_name, part_id = original_contig_name_special(contig_id)
all_originals[original_contig_name][part_id] = cluster_id

# Merge cut up clusters.
with open(args.output, 'w') as ofh:
ofh.write("contig_id\tcluster_id\n")
for original_contig_id, part_ids_d in all_originals.items():
if len(part_ids_d) > 1:
c = Counter(part_ids_d.values())
cluster_id = c.most_common(1)[0][0]
c_string = [(a, b) for a, b in c.items()]
# Here if len(c.values()) > 1,
# then no cluster for contig.
else:
cluster_id = list(part_ids_d.values())[0]
ofh.write(f"{original_contig_id}\t{cluster_id}\n")
Loading

0 comments on commit 40a09cb

Please sign in to comment.