Skip to content

Commit

Permalink
Add NCBI taxonomy db, missing parameters, biom output, bio.tools
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut committed Jun 7, 2024
1 parent bcd8fec commit 5a5f7fb
Show file tree
Hide file tree
Showing 8 changed files with 307 additions and 54 deletions.
215 changes: 161 additions & 54 deletions tools/taxpasta/taxpasta.xml
Original file line number Diff line number Diff line change
@@ -1,34 +1,66 @@
<tool id="taxpasta" name="taxpasta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.01">
<tool id="taxpasta" name="taxpasta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>standardise taxonomic profiles</description>
<macros>
<token name="@TOOL_VERSION@">0.5.0</token>
<token name="@TOOL_VERSION@">0.6.1</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">22.01</token>
</macros>
<xrefs>
<xref type="bio.tools">taxpasta</xref>
</xrefs>
<requirements>
<requirement type="package" version="@TOOL_VERSION@">taxpasta</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
## Getting the NCBI taxonomy database
## In the future could find the one that exists in Galaxy and give the user the option to instead use that cached one
curl -O -s ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz &&
mkdir ./taxdump &&
tar -C ./taxdump -xzf taxdump.tar.gz &&
taxpasta $action -p $profiler -o ./output.tsv --taxonomy taxdump --add-name
#for $file in $infile:
'$file'
#end for
&&
mv ./output.tsv $output
## Outputs
taxpasta
$action.action
--profiler $profiler
--taxonomy '$taxonomy.fields.path'
#if $action.action == 'merge'
--output-format '$action.format.output_format'
#if $action.format.output_format == 'TSV'
--output '$tabular_output'
$action.format.wide
#else if $action.format.output_format == 'BIOM'
--output '$biom_output'
#end if
#else
--output-format 'TSV'
--output '$tabular_output'
#end if
$add_name
$add_rank
$add_lineage
$add_id_lineage
$add_rank_lineage
#for $file in $infile
'$file'
#end for
]]></command>
<inputs>
<param name="infile" type="data" format="tabular" multiple="true" label="Choose taxonomic report(s) to standardise" help="Select one or many reports, but only from the same profiling tool" />
<param name="profiler" type="select" label="What profiling tool created this report?" help="Select one profiler only">
<!-- Selects the taxpasta sub-command; the chosen value is emitted as $action.action on the command line -->
<conditional name="action">
    <param name="action" type="select" label="Taxpasta action">
        <option value="standardise">Standardise input(s)</option>
        <option value="merge">Standardise and combine multiple taxonomic profiles from the same profiler</option>
    </param>
    <!-- standardise exposes no extra options; the command template writes TSV for it -->
    <when value="standardise"/>
    <when value="merge">
        <!-- Only merge lets the user choose the output format -->
        <conditional name="format">
            <param argument="--output-format" type="select" label="Desired output format">
                <option value="TSV">Tabular</option>
                <option value="BIOM">BIOM</option>
            </param>
            <when value="TSV">
                <!-- Checked emits the wide flag, unchecked emits the long flag -->
                <param argument="--wide" type="boolean" truevalue="--wide" falsevalue="--long" checked="true" label="Output merged abundance data in wide format" help="If disabled, the data is written in (tidy) long format instead"/>
            </when>
            <when value="BIOM"/>
        </conditional>
    </when>
</conditional>
<param argument="--profiler" type="select" label="What profiling tool created this report?" help="Select one profiler only">
<option value="bracken">Bracken</option>
<option value="Centrifuge">Centrifuge</option>
<option value="diamond">diamond</option>
<option value="diamond">DIAMOND</option>
<option value="ganon">ganon</option>
<option value="kaiju">Kaiju</option>
<option value="kraken2">Kraken2</option>
Expand All @@ -37,63 +69,139 @@
<option value="metaphlan">MetaPhlAn</option>
<option value="motus">mOTUs</option>
</param>
<param name="action" type="select" label="What should taxpasta do?" help="Only run merge if inputs are from the same profiler">
<option value="standardise">Standardise input(s)</option>
<option value="merge">Standardise and merge inputs</option>
</param>
<param name="infile" type="data" format="tabular" multiple="true" label="Taxonomic report(s) to standardise" help="The reports should be from the same profiling tool" />
<param argument="--taxonomy" type="select" label="NCBI taxonomy" help="To have actual human-readable taxon names in the standardised output">
<options from_data_table="ncbi_taxonomy">
<validator message="No NCBI database is available" type="no_options"/>
</options>
</param>
<param argument="--add-name" type="boolean" truevalue="--add-name" falsevalue="" checked="true" label="Add the taxon name to the output"/>
<param argument="--add-rank" type="boolean" truevalue="--add-rank" falsevalue="" checked="false" label="Add the taxon rank to the output"/>
<param argument="--add-lineage" type="boolean" truevalue="--add-lineage" falsevalue="" checked="false" label="Add the taxon's entire lineage to the output"
help="The taxon names are separated by semi-colons"/>
<!-- Boolean toggle emitted on the command line as $add_id_lineage; this lineage is made of taxon identifiers, not names -->
<param argument="--add-id-lineage" type="boolean" truevalue="--add-id-lineage" falsevalue="" checked="false" label="Add the taxon's entire identifier lineage to the output"
help="The taxon identifiers are separated by semi-colons"/>
<param argument="--add-rank-lineage" type="boolean" truevalue="--add-rank-lineage" falsevalue="" checked="false" label="Add the taxon's entire rank lineage to the output"
help="These are taxon ranks separated by semi-colons"/>
</inputs>
<outputs>
<data format="tabular" name="output"/>
<!-- Tabular result: produced for standardise, and for merge when the TSV output format is selected -->
<data name="tabular_output" format="tabular" label="${tool.name} on ${on_string}: Tabular">
<filter>(action['action'] == 'merge' and action['format']['output_format'] == 'TSV') or action['action'] == 'standardise'</filter>
</data>
<!-- BIOM result: produced only for merge when the BIOM output format is selected -->
<data name="biom_output" format="biom1" label="${tool.name} on ${on_string}: BIOM">
<filter>(action['action'] == 'merge' and action['format']['output_format'] == 'BIOM')</filter>
</data>
</outputs>
<tests>
<!-- test data from taxpasta https://github.com/taxprofiler/taxpasta/tree/dev/tests/data-->
<test expect_num_outputs="1">
<param name="infile" value="ERR7569997.txt"/>
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="profiler" value="metaphlan"/>
<param name="action" value="standardise"/>
<output name="output">
<param name="infile" value="ERR7569997.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="Pseudomonadota"/>
<has_text text="Proteobacteria"/>
<has_n_columns n="3"/>
<has_n_lines n="241"/>
</assert_contents>
</output>
</test>

<!-- test multiple inputs and the merge command-->
<test expect_num_outputs="1">
<param name="infile" value="ERR7569997.txt,ERR7569998.txt"/>
<conditional name="action">
<param name="action" value="merge"/>
<conditional name="format">
<param name="output_format" value="TSV"/>
<param name="wide" value="true"/>
</conditional>
</conditional>
<param name="profiler" value="metaphlan"/>
<param name="action" value="merge"/>
<output name="output">
<param name="infile" value="ERR7569997.txt,ERR7569998.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="Pseudomonadota"/>
<has_text text="Gammaproteobacteria"/>
<has_n_columns n="4"/>
<has_n_lines n="252"/>
</assert_contents>
</output>
</test>

<!-- testing diamond input-->
<test expect_num_outputs="1">
<param name="infile" value="diamond_valid_1.tsv"/>
<param name="profiler" value="diamond"/>
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="action" value="standardise"/>
<output name="output">
<param name="profiler" value="diamond"/>
<param name="infile" value="diamond_valid_1.tsv"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="Acinetobacter"/>
<has_text text="1310613"/>
<has_n_columns n="3"/>
<has_n_lines n="4"/>
</assert_contents>
</output>
</test>

<!-- testing kraken input-->
<!-- testing kraken input and all boolean-->
<test expect_num_outputs="1">
<param name="infile" value="2612_pe-ERR5766176-db1.kraken2.report.txt"/>
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="profiler" value="kraken2"/>
<param name="action" value="standardise"/>
<output name="output">
<param name="infile" value="2612_pe-ERR5766176-db1.kraken2.report.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="true" />
<param name="add_lineage" value="true"/>
<param name="add_id_lineage" value="true"/>
<param name="add_rank_lineage" value="true"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="root"/>
<has_n_columns n="3"/>
<has_text text="rank_lineage"/>
<has_n_columns n="7"/>
<has_n_lines n="45"/>
</assert_contents>
</output>
</test>
<!-- testing BIOM output-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="merge"/>
<conditional name="format">
<param name="output_format" value="BIOM"/>
</conditional>
</conditional>
<param name="profiler" value="kraken2"/>
<param name="infile" value="2612_pe-ERR5766176-db1.kraken2.report.txt,2611_se-ERR5766174-db1.kraken2.report.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="biom_output" ftype="biom1">
<assert_contents>
<has_text text="314146"/>
</assert_contents>
</output>
</test>
Expand All @@ -104,23 +212,22 @@
**What it does**
* Taxpasta standardises the taxonomic profiles produced from other tools.
* It reformats these outputs into a table of NCBI taxonomy identifiers and their integer counts.
* Then, it converts these identifiers to taxon names.
* It can also merge outputs across samples, from the same profiling tool.
The main purpose of taxpasta is to standardise taxonomic profiles created by a range of bioinformatics tools.
We call those tools taxonomic profilers. They each come with their own particular tabular output format.
Across the profilers, relative abundances can be reported in read counts, fractions, or percentages, as well as
any number of additional columns with extra information. We therefore decided to take the lessons learnt to heart
and provide our own solution to deal with this pasticcio. With taxpasta you can ingest all of those formats and, at a
minimum, output taxonomy identifiers and their integer counts. Taxpasta can not only standardise profiles but also merge
them across samples for the same profiler into a single table.
**Input(s)**
* One or many outputs from a particular profiling tool, such as kraken2 or diamond.
* Check that this report is in the correct format for taxpasta: for details, see https://taxpasta.readthedocs.io/en/latest/supported_profilers/.
* For example, for kraken2, taxpasta expects the kraken-report output file with 6 or 8 columns.
* For example, for diamond, taxpasta expects a tabular file with 3 columns.
**Output**
* A reformatted report, either for the single input, or for multiple inputs, as long as they are from the same profiling tool.
* The report is in tabular format.
* A reformatted report, either for the single input, or for multiple inputs, as long as they are from the same profiling tool. The report is in tabular format by default; when merging, BIOM output can be selected instead.
For more information see: https://taxpasta.readthedocs.io/en/latest/
Expand Down
44 changes: 44 additions & 0 deletions tools/taxpasta/test-data/2611_se-ERR5766174-db1.kraken2.report.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
99.98 787758 787758 U 0 unclassified
0.02 119 0 R 1 root
0.02 119 0 R1 131567 cellular organisms
0.02 119 0 D 2759 Eukaryota
0.02 119 0 D1 33154 Opisthokonta
0.01 96 0 K 4751 Fungi
0.01 96 0 K1 451864 Dikarya
0.01 96 0 P 4890 Ascomycota
0.01 96 0 P1 716545 saccharomyceta
0.01 96 0 P2 147537 Saccharomycotina
0.01 96 0 C 4891 Saccharomycetes
0.01 96 0 O 4892 Saccharomycetales
0.01 96 0 F 4893 Saccharomycetaceae
0.01 96 0 G 4930 Saccharomyces
0.01 96 0 S 4932 Saccharomyces cerevisiae
0.01 96 96 S1 559292 Saccharomyces cerevisiae S288C
0.00 23 0 K 33208 Metazoa
0.00 23 0 K1 6072 Eumetazoa
0.00 23 0 K2 33213 Bilateria
0.00 23 0 K3 33511 Deuterostomia
0.00 23 0 P 7711 Chordata
0.00 23 0 P1 89593 Craniata
0.00 23 0 P2 7742 Vertebrata
0.00 23 0 P3 7776 Gnathostomata
0.00 23 0 P4 117570 Teleostomi
0.00 23 0 P5 117571 Euteleostomi
0.00 23 0 P6 8287 Sarcopterygii
0.00 23 0 P7 1338369 Dipnotetrapodomorpha
0.00 23 0 P8 32523 Tetrapoda
0.00 23 0 P9 32524 Amniota
0.00 23 0 C 40674 Mammalia
0.00 23 0 C1 32525 Theria
0.00 23 0 C2 9347 Eutheria
0.00 23 0 C3 1437010 Boreoeutheria
0.00 23 0 C4 314146 Euarchontoglires
0.00 23 0 O 9443 Primates
0.00 23 0 O1 376913 Haplorrhini
0.00 23 0 O2 314293 Simiiformes
0.00 23 0 O3 9526 Catarrhini
0.00 23 0 O4 314295 Hominoidea
0.00 23 0 F 9604 Hominidae
0.00 23 0 F1 207598 Homininae
0.00 23 0 G 9605 Homo
0.00 23 23 S 9606 Homo sapiens
1 change: 1 addition & 0 deletions tools/taxpasta/test-data/ncbi_taxonomy.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test-db-tox Test Database ${__HERE__}/test-db
Loading

0 comments on commit 5a5f7fb

Please sign in to comment.