-
Notifications
You must be signed in to change notification settings - Fork 441
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add the concoct cut_up_fasta tool (#4436)
* Add the concoct cut_up_fasta tool * Attempt to fix flake8 problems * Improve help for all tools * Misc cleanup * Code cleanup and more tests * Use f strings * Use string templates * More code cleanup * Flake8 * Update the help pic * Slight code fix * Add the CONCOCT coverage_table tool * Code cleanup * Use a slice identifier when cutting contigs * Add the merge_cut_up_clustering tool * Flake8 fix
- Loading branch information
1 parent
7b6b07d
commit 40a09cb
Showing
16 changed files
with
1,886 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import gzip | ||
from functools import partial | ||
|
||
from Bio import SeqIO | ||
|
||
|
||
def generate_coverage_table(input_fasta, input_tabular, gzipped, output):
    """Write the coverage table in the tabular layout expected by CONCOCT.

    One row is written per record of ``input_fasta``: the record id, its
    sequence length and the mean coverage accumulated from
    ``input_tabular``.  Records that have no ``cov_mean`` entry in the
    coverage data are written with a coverage of ``0``.

    Args:
        input_fasta: Path to the contigs fasta file.
        input_tabular: Path to the tab-separated coverage file.
        gzipped: True when ``input_fasta`` is gzip-compressed.
        output: Path of the tabular file to create.

    Raises:
        ValueError: If no sequences were read from ``input_fasta``.
    """
    gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped)
    if not gc_and_len_dict:
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        raise ValueError(f"No sequences found in fasta file: {input_fasta}")
    bed_coverage_dict = get_bed_coverage_dict(input_tabular)

    with open(output, 'w') as fh:
        # Header.  The sample column is suffixed with the number of distinct
        # ids found in the coverage data.
        fh.write("contig\tlength\tcov_mean_sample_%d\n" % len(bed_coverage_dict))
        # Content: one line per fasta record.
        for acc, stats in gc_and_len_dict.items():
            try:
                cov_mean = "%f" % bed_coverage_dict[acc]["cov_mean"]
            except KeyError:
                # No reads mapped to this contig.
                cov_mean = "0"
            fh.write("%s\t%s\t%s\n" % (acc, stats['length'], cov_mean))
|
||
|
||
def get_bed_coverage_dict(input_tabular):
    """Determine mean coverage and percentage covered for each contig.

    Every line of ``input_tabular`` is split on tabs.  The first column is
    used as the contig id, the second is parsed as an integer and the fifth
    as a float (presumably depth and fraction-at-depth of a bedtools genome
    coverage histogram; confirm against the producing tool).

    * When the second column is 0, ``percentage_covered`` is set to
      ``100 - fifth_column * 100``.
    * Otherwise ``second_column * fifth_column`` is added to ``cov_mean``.

    Returns:
        A dict keyed by contig id whose values are dicts that may contain
        the keys ``percentage_covered`` and ``cov_mean``.
    """
    coverage = {}

    with open(input_tabular, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.rstrip('\r\n').split('\t')
            # Fetch the per-contig entry, creating it on first sight.
            stats = coverage.setdefault(fields[0], {})
            depth = int(fields[1])
            fraction = float(fields[4])
            if depth == 0:
                stats["percentage_covered"] = 100 - fraction * 100.0
            else:
                stats["cov_mean"] = stats.get("cov_mean", 0) + depth * fraction
    return coverage
|
||
|
||
def get_gc_and_len_dict(input_fasta, gzipped):
    """Map every fasta record id to a dict holding its sequence length.

    Despite the name, only the ``length`` key is populated for each record.

    Args:
        input_fasta: Path to the fasta file.
        gzipped: True to read ``input_fasta`` through ``gzip`` in text mode.

    Returns:
        ``{record_id: {"length": sequence_length}}``.
    """
    opener = partial(gzip.open, mode='rt') if gzipped else open

    with opener(input_fasta) as handle:
        return {
            record.id: {"length": len(record.seq)}
            for record in SeqIO.parse(handle, "fasta")
        }
|
||
|
||
# Command-line entry point: parse the options and build the coverage table.
# ``store`` is argparse's default action and ``dest`` defaults to the option
# name, so neither needs to be spelled out.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--input_tabular', help='bedtools genomeCoverageBed bed file')
parser.add_argument('--input_fasta', help='Contigs fasta file')
parser.add_argument("--gzipped", action="store_true", help="input_fasta is gzipped")
parser.add_argument('--output', help='Output file')

args = parser.parse_args()

generate_coverage_table(
    input_fasta=args.input_fasta,
    input_tabular=args.input_tabular,
    gzipped=args.gzipped,
    output=args.output,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
<tool id="concoct_coverage_table" name="CONCOCT coverage table" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | ||
<description></description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="requirements"/> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python '$__tool_directory__/coverage_table.py' | ||
--input_fasta '$input_fasta' | ||
#if $input_fasta.is_of_type('fasta.gz'): | ||
--gzipped | ||
#end if | ||
--input_tabular '$input_tabular' | ||
--output '$output' | ||
]]></command> | ||
<inputs> | ||
<param name="input_fasta" type="data" format="fasta,fasta.gz" label="Contigs fasta file"/> | ||
<param name="input_tabular" type="data" format="tabular" label="Tabular bedtools Genome Coverage histogram file" help="Set the bedtools Genome Coverage Output type to be Data suitable for Histogram"/> | ||
</inputs> | ||
<outputs> | ||
<data name="output" format="tabular"/> | ||
</outputs> | ||
<tests> | ||
<test expect_num_outputs="1"> | ||
<param name="input_fasta" value="input_coverage_table.fasta.gz" ftype="fasta.gz"/> | ||
<param name="input_tabular" value="input_coverage_table.tabular" ftype="tabular"/> | ||
<output name="output" file="output_coverage_table.tabular" ftype="tabular"/> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
**What it does** | ||
Accepts an assembled (and possibly cut by the Cut fasta contigs tool) fasta contigs file and a tabular coverage histogram | ||
file (produced by the bedtools Genome Coverage tool) and outputs a tabular coverage file for use as the input to the | ||
CONCOCT metagenome binning tool. | ||
@HELP_OVERVIEW@ | ||
]]></help> | ||
<expand macro="citations"/> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import gzip | ||
from functools import partial | ||
|
||
from Bio import SeqIO | ||
|
||
|
||
def cut_up_fasta(input_fasta, chunk_size, overlap, merge_last, output_fasta, output_bed, gzipped):
    """Cut the contigs of a fasta file into parts and write them out.

    A contig is cut when it is longer than ``chunk_size`` (or, with
    ``merge_last``, at least twice ``chunk_size``); otherwise it is written
    unchanged as a single part.  Every part is named
    ``<record id>.concoct_part_<index>``.

    Args:
        input_fasta: Path to the contigs fasta file.
        chunk_size: Length of each part.
        overlap: Number of bases shared by consecutive parts.
        merge_last: When true, the final remainder is appended to the last
            part instead of becoming a part of its own.
        output_fasta: Path of the fasta file to create.
        output_bed: Path of the BED file to create, or None to skip it.
        gzipped: True when ``input_fasta`` is gzip-compressed.
    """
    _open = partial(gzip.open, mode='rt') if gzipped else open
    # Consecutive parts start this many bases apart (see ``chunks``).
    step = chunk_size - overlap

    # ``with`` / ``finally`` guarantee both outputs are flushed and closed,
    # also when parsing fails part-way through.
    with open(output_fasta, 'w') as fasta_fh:
        bed_fh = open(output_bed, 'w') if output_bed is not None else None
        try:
            with _open(input_fasta) as input_fh:
                for record in SeqIO.parse(input_fh, "fasta"):
                    seq_len = len(record.seq)
                    if merge_last:
                        needs_cut = seq_len >= 2 * chunk_size
                    else:
                        needs_cut = seq_len > chunk_size

                    if needs_cut:
                        for index, split_seq in enumerate(chunks(record.seq, chunk_size, overlap, merge_last)):
                            part_id = f"{record.id}.concoct_part_{index}"
                            fasta_fh.write(f">{part_id}\n{split_seq}\n")
                            if bed_fh is not None:
                                # Parts advance by ``step``, not by
                                # ``chunk_size``, so the start must use the
                                # step to stay exact when overlap > 0.
                                start = step * index
                                bed_fh.write(f"{record.id}\t{start}\t{start + len(split_seq)}\t{part_id}\n")
                    else:
                        fasta_fh.write(f">{record.id}.concoct_part_0\n{record.seq}\n")
                        if bed_fh is not None:
                            bed_fh.write(f"{record.id}\t0\t{seq_len}\t{record.id}.concoct_part_0\n")
        finally:
            if bed_fh is not None:
                bed_fh.close()
|
||
|
||
def chunks(seq, chunk_size, overlap_size, merge_last):
    """Yield successive ``chunk_size``-sized slices of ``seq``.

    Consecutive slices start ``chunk_size - overlap_size`` positions apart.
    Without ``merge_last`` the final slice may be shorter than
    ``chunk_size``.  With ``merge_last`` only full-sized slices are started,
    and the last one is extended to the end of ``seq``.

    ``chunk_size`` must be greater than ``overlap_size``.
    """
    assert chunk_size > overlap_size
    step = chunk_size - overlap_size
    total = len(seq)

    if not merge_last:
        for start in range(0, total, step):
            yield seq[start:start + chunk_size]
        return

    for start in range(0, total - chunk_size + 1, step):
        end = start + chunk_size
        if end + step <= total:
            # Another full-sized slice still fits after this one.
            yield seq[start:end]
        else:
            # Last slice: swallow whatever remains.
            yield seq[start:]
|
||
|
||
# Command-line entry point: parse the options and cut up the contigs.
# ``store`` is argparse's default action, ``dest`` defaults to the option
# name and ``store_true`` / plain options already default to False / None.
parser = argparse.ArgumentParser()
parser.add_argument("--input_fasta", help="Fasta files with contigs")
parser.add_argument("--gzipped", action="store_true", help="Input file is gzipped")
parser.add_argument("--chunk_size", type=int, help="Chunk size\n")
parser.add_argument("--overlap_size", type=int, help="Overlap size\n")
parser.add_argument("--merge_last", action="store_true", help="Concatenate final part to last contig\n")
parser.add_argument("--output_bed", help="BED file to be created with exact regions of the original contigs corresponding to the newly created contigs")
parser.add_argument("--output_fasta", help="Output fasta file with cut contigs")

args = parser.parse_args()
cut_up_fasta(
    input_fasta=args.input_fasta,
    chunk_size=args.chunk_size,
    overlap=args.overlap_size,
    merge_last=args.merge_last,
    output_fasta=args.output_fasta,
    output_bed=args.output_bed,
    gzipped=args.gzipped,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
<tool id="concoct_cut_up_fasta" name="CONCOCT: cut fasta contigs" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | ||
<description>into equal length non-overlapping or overlapping parts</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="requirements"/> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python '$__tool_directory__/cut_up_fasta.py' | ||
--input_fasta '$input_fasta' | ||
#if $input_fasta.is_of_type('fasta.gz'): | ||
--gzipped | ||
#end if | ||
--chunk_size $chunk_size | ||
--overlap_size $overlap_size | ||
$merge_last | ||
#if str($output_bed_param) == 'yes': | ||
--output_bed '$output_bed' | ||
#end if | ||
--output_fasta '$output_fasta' | ||
]]></command> | ||
<inputs> | ||
<param name="input_fasta" type="data" format="fasta,fasta.gz" label="Fasta contigs file"/> | ||
<param argument="--chunk_size" type="integer" value="1999" label="Chunk size"/> | ||
<param argument="--overlap_size" type="integer" value="1900" label="Overlap size" help="Zero value produces non-overlapping parts"/> | ||
<param argument="--merge_last" type="boolean" truevalue="--merge_last" falsevalue="" checked="false" label="Concatenate final part to last contig?"/> | ||
<param name="output_bed_param" type="select" label="Output bed file with exact regions of the original contigs corresponding to the newly created contigs?" help="Can be used as input to the SAMTools bedcov tool"> | ||
<option value="no" selected="true">No</option> | ||
<option value="yes">Yes</option> | ||
</param> | ||
</inputs> | ||
<outputs> | ||
<data name="output_bed" format="bed" label="${tool.name} on ${on_string} (bed)"> | ||
<filter>output_bed_param == 'yes'</filter> | ||
</data> | ||
<data name="output_fasta" format="fasta" label="${tool.name} on ${on_string} (contigs)"/> | ||
</outputs> | ||
<tests> | ||
<!-- default settings --> | ||
<test expect_num_outputs="1"> | ||
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/> | ||
<output name="output_fasta" ftype="fasta"> | ||
<assert_contents> | ||
<has_size value="2366"/> | ||
<has_text text="116"/> | ||
<has_n_lines n="100"/> | ||
</assert_contents> | ||
</output> | ||
</test> | ||
<!-- merge_last and output bed file --> | ||
<test expect_num_outputs="2"> | ||
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/> | ||
<param name="merge_last" value="--merge_last"/> | ||
<param name="output_bed_param" value="yes"/> | ||
<output name="output_bed" ftype="bed"> | ||
<assert_contents> | ||
<has_size value="1332"/> | ||
<has_text text="116"/> | ||
<has_n_lines n="50"/> | ||
</assert_contents> | ||
</output> | ||
<output name="output_fasta" ftype="fasta"> | ||
<assert_contents> | ||
<has_size value="2366"/> | ||
<has_text text="116"/> | ||
<has_n_lines n="100"/> | ||
</assert_contents> | ||
</output> | ||
</test> | ||
<!-- Change chunk size and overlap size --> | ||
<test expect_num_outputs="1"> | ||
<param name="input_fasta" value="input.fasta.gz" ftype="fasta.gz"/> | ||
<param name="chunk_size" value="500"/> | ||
<param name="overlap_size" value="499"/> | ||
<output name="output_fasta" ftype="fasta"> | ||
<assert_contents> | ||
<has_size value="2366"/> | ||
<has_text text="116"/> | ||
<has_n_lines n="100"/> | ||
</assert_contents> | ||
</output> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
**What it does** | ||
Accepts a fasta file containing contigs, cuts them into non-overlapping or overlapping parts of equal length, and produces | ||
a fasta file containing the cut contigs. An optional output BED file can be produced, where the cut contigs are specified | ||
in terms of the original contigs. Using this file as input to a BED coverage tool (e.g., bedtools Compute both the length | ||
and depth of coverage) will produce a file that can be used as input to the CONCOCT Create coverage table tool. | ||
@HELP_OVERVIEW@ | ||
]]></help> | ||
<expand macro="citations"/> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import re | ||
import sys | ||
from collections import Counter | ||
from collections import defaultdict | ||
|
||
|
||
# Group 1 captures the original contig id, group 2 the digits (possibly
# none) that follow ".concoct_part_".
CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')


def original_contig_name_special(contig_id):
    """Split a cut-up contig id into its original id and part index.

    Returns:
        ``(original_id, part_index)`` where ``part_index`` is the matched
        digit string, or ``(contig_id, 0)`` (integer zero) when
        ``contig_id`` carries no ``.concoct_part_`` marker.
    """
    found = CONTIG_PART_EXPR.match(contig_id)
    if found is None:
        # No matches for concoct_part regex.
        return contig_id, 0
    return found.group(1), found.group(2)
|
||
|
||
# Command-line entry point: read a clustering of cut-up contigs and write one
# cluster assignment per original contig.
parser = argparse.ArgumentParser()
parser.add_argument("--input", action="store", dest="input", help="Tabular file with cut up clusters")
parser.add_argument("--output", action="store", dest="output", help="Output file with merged clusters")

args = parser.parse_args()

# Get cut up clusters: original contig id -> {part id -> cluster id}.
all_originals = defaultdict(dict)
with open(args.input, 'r') as ifh:
    for i, line in enumerate(ifh):
        if i == 0:
            if 'contig_id' not in line:
                sys.stderr.write("ERROR: invalid clustering file, 'contig_id' is not found in the header.\n")
                sys.exit(-1)
            # Skip header.
            continue
        contig_id, cluster_id = line.rstrip('\r\n').split('\t')
        original_contig_name, part_id = original_contig_name_special(contig_id)
        all_originals[original_contig_name][part_id] = cluster_id

# Merge cut up clusters.
with open(args.output, 'w') as ofh:
    ofh.write("contig_id\tcluster_id\n")
    for original_contig_id, part_ids_d in all_originals.items():
        # Majority vote over the parts.  With a single part this is simply
        # that part's cluster; ties go to the cluster encountered first.
        cluster_id = Counter(part_ids_d.values()).most_common(1)[0][0]
        ofh.write(f"{original_contig_id}\t{cluster_id}\n")
Oops, something went wrong.