Skip to content

Commit

Permalink
Merge pull request #5465 from AnnaSyme/taxpasta
Browse files Browse the repository at this point in the history
add initial taxpasta wrapper
  • Loading branch information
pvanheus authored Jun 7, 2024
2 parents a8fe4cf + 5a5f7fb commit 5a6a3c7
Show file tree
Hide file tree
Showing 13 changed files with 2,619 additions and 0 deletions.
9 changes: 9 additions & 0 deletions tools/taxpasta/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: taxpasta
owner: iuc
description: standardise taxonomic profiles
homepage_url: https://taxpasta.readthedocs.io/en/latest/
long_description: |
Standardise and merge taxonomic profiles into one tabular report.
remote_repository_url: https://github.com/taxprofiler/taxpasta
categories:
- Sequence Analysis
238 changes: 238 additions & 0 deletions tools/taxpasta/taxpasta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
<tool id="taxpasta" name="taxpasta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>standardise taxonomic profiles</description>
<macros>
<token name="@TOOL_VERSION@">0.6.1</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">22.01</token>
</macros>
<xrefs>
<xref type="bio.tools">taxpasta</xref>
</xrefs>
<requirements>
<requirement type="package" version="@TOOL_VERSION@">taxpasta</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
taxpasta
$action.action
--profiler $profiler
--taxonomy '$taxonomy.fields.path'
#if $action.action == 'merge'
--output-format '$action.format.output_format'
#if $action.format.output_format == 'TSV'
--output '$tabular_output'
$action.format.wide
#else if $action.format.output_format == 'BIOM'
--output '$biom_output'
#end if
#else
--output-format 'TSV'
--output '$tabular_output'
#end if
$add_name
$add_rank
$add_lineage
$add_id_lineage
$add_rank_lineage
#for $file in $infile
'$file'
#end for
]]></command>
<inputs>
<conditional name="action">
<param name="action" type="select" label="Taxpasta action">
<option value="standardise">Standardise input(s)</option>
<option value="merge">Standardise and combine multiple taxonomic profiles from the same profiler</option>
</param>
<when value="standardise"/>
<when value="merge">
<conditional name="format">
<param argument="--output-format" type="select" label="Desired output format">
<option value="TSV">Tabular</option>
<option value="BIOM">BIOM</option>
</param>
<when value="TSV">
<param argument="--wide" type="boolean" truevalue="--wide" falsevalue="--long" checked="true" label="Output merged abundance data in either wide or (tidy) long format"/>
</when>
<when value="BIOM"/>
</conditional>
</when>
</conditional>
<param argument="--profiler" type="select" label="What profiling tool created this report?" help="Select one profiler only">
<option value="bracken">Bracken</option>
<option value="Centrifuge">Centrifuge</option>
<option value="diamond">DIAMOND</option>
<option value="ganon">ganon</option>
<option value="kaiju">Kaiju</option>
<option value="kraken2">Kraken2</option>
<option value="krakenuniq">KrakenUniq</option>
<option value="megan6">MEGAN6/MALT</option>
<option value="metaphlan">MetaPhlAn</option>
<option value="motus">mOTUs</option>
</param>
<param name="infile" type="data" format="tabular" multiple="true" label="Taxonomic report(s) to standardise" help="The reports should be from the same profiling tool" />
<param argument="--taxonomy" type="select" label="NCBI taxonomy" help="To have actual human-readable taxon names in the standardised output">
<options from_data_table="ncbi_taxonomy">
<validator message="No NCBI database is available" type="no_options"/>
</options>
</param>
<param argument="--add-name" type="boolean" truevalue="--add-name" falsevalue="" checked="true" label="Add the taxon name to the output"/>
<param argument="--add-rank" type="boolean" truevalue="--add-rank" falsevalue="" checked="false" label="Add the taxon rank to the output"/>
<param argument="--add-lineage" type="boolean" truevalue="--add-lineage" falsevalue="" checked="false" label="Add the taxon's entire lineage to the output"
help="The taxon names are separated by semi-colons"/>
<param argument="--add-id-lineage" type="boolean" truevalue="--add-id-lineage" falsevalue="" checked="false" label="Add the taxon's entire lineage to the output"
help="The taxon identifiers are separated by semi-colons"/>
<param argument="--add-rank-lineage" type="boolean" truevalue="--add-rank-lineage" falsevalue="" checked="false" label="Add the taxon's entire rank lineage to the output"
help="These are taxon ranks separated by semi-colons"/>
</inputs>
<outputs>
<data name="tabular_output" format="tabular" label="${tool.name} on ${on_string}: Tabular">
<filter>(action['action'] == 'merge' and action['format']['output_format'] == 'TSV') or action['action'] == 'standardise'</filter>
</data>
<data name="biom_output" format="biom1" label="${tool.name} on ${on_string}: BIOM">
<filter>(action['action'] == 'merge' and action['format']['output_format'] == 'BIOM')</filter>
</data>
</outputs>
<tests>
<!-- test data from taxpasta https://github.com/taxprofiler/taxpasta/tree/dev/tests/data-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="profiler" value="metaphlan"/>
<param name="infile" value="ERR7569997.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="Proteobacteria"/>
<has_n_columns n="3"/>
<has_n_lines n="241"/>
</assert_contents>
</output>
</test>
<!-- test multiple inputs and the merge command-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="merge"/>
<conditional name="format">
<param name="output_format" value="TSV"/>
<param name="wide" value="true"/>
</conditional>
</conditional>
<param name="profiler" value="metaphlan"/>
<param name="infile" value="ERR7569997.txt,ERR7569998.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="Gammaproteobacteria"/>
<has_n_columns n="4"/>
<has_n_lines n="252"/>
</assert_contents>
</output>
</test>
<!-- testing diamond input-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="action" value="standardise"/>
<param name="profiler" value="diamond"/>
<param name="infile" value="diamond_valid_1.tsv"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="1310613"/>
<has_n_columns n="3"/>
<has_n_lines n="4"/>
</assert_contents>
</output>
</test>
<!-- testing kraken input and all boolean-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="standardise"/>
</conditional>
<param name="profiler" value="kraken2"/>
<param name="infile" value="2612_pe-ERR5766176-db1.kraken2.report.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="true" />
<param name="add_lineage" value="true"/>
<param name="add_id_lineage" value="true"/>
<param name="add_rank_lineage" value="true"/>
<output name="tabular_output" ftype="tabular">
<assert_contents>
<has_text text="root"/>
<has_text text="rank_lineage"/>
<has_n_columns n="7"/>
<has_n_lines n="45"/>
</assert_contents>
</output>
</test>
<!-- testing BIOM output-->
<test expect_num_outputs="1">
<conditional name="action">
<param name="action" value="merge"/>
<conditional name="format">
<param name="output_format" value="BIOM"/>
</conditional>
</conditional>
<param name="profiler" value="kraken2"/>
<param name="infile" value="2612_pe-ERR5766176-db1.kraken2.report.txt,2611_se-ERR5766174-db1.kraken2.report.txt"/>
<param name="taxonomy" value="test-db-tox"/>
<param name="add_name" value="true"/>
<param name="add_rank" value="false" />
<param name="add_lineage" value="false"/>
<param name="add_id_lineage" value="false"/>
<param name="add_rank_lineage" value="false"/>
<output name="biom_output" ftype="biom1">
<assert_contents>
<has_text text="314146"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
.. class:: infomark
**What it does**
The main purpose of taxpasta is to standardise taxonomic profiles created by a range of bioinformatics tools.
We call those tools taxonomic profilers. They each come with their own particular tabular output format.
Across the profilers, relative abundances can be reported in read counts, fractions, or percentages, as well as
any number of additional columns with extra information. We therefore decided to take the lessons learnt to heart
and provide our own solution to deal with this pasticcio. With taxpasta you can ingest all of those formats and, at a
minimum, output taxonomy identifiers and their integer counts. Taxpasta can not only standardise profiles but also merge
them across samples for the same profiler into a single table.
**Input(s)**
* One or many outputs from a particular profiling tool, such as kraken2 or diamond.
**Output**
* A reformatted report in tabular format, either for the single input, or for multiple inputs, as long as they are from the same profiling tool.
For more information see: https://taxpasta.readthedocs.io/en/latest/
]]></help>
<citations>
<citation type="doi">10.21105/joss.05627</citation>
</citations>
</tool>
44 changes: 44 additions & 0 deletions tools/taxpasta/test-data/2611_se-ERR5766174-db1.kraken2.report.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
99.98 787758 787758 U 0 unclassified
0.02 119 0 R 1 root
0.02 119 0 R1 131567 cellular organisms
0.02 119 0 D 2759 Eukaryota
0.02 119 0 D1 33154 Opisthokonta
0.01 96 0 K 4751 Fungi
0.01 96 0 K1 451864 Dikarya
0.01 96 0 P 4890 Ascomycota
0.01 96 0 P1 716545 saccharomyceta
0.01 96 0 P2 147537 Saccharomycotina
0.01 96 0 C 4891 Saccharomycetes
0.01 96 0 O 4892 Saccharomycetales
0.01 96 0 F 4893 Saccharomycetaceae
0.01 96 0 G 4930 Saccharomyces
0.01 96 0 S 4932 Saccharomyces cerevisiae
0.01 96 96 S1 559292 Saccharomyces cerevisiae S288C
0.00 23 0 K 33208 Metazoa
0.00 23 0 K1 6072 Eumetazoa
0.00 23 0 K2 33213 Bilateria
0.00 23 0 K3 33511 Deuterostomia
0.00 23 0 P 7711 Chordata
0.00 23 0 P1 89593 Craniata
0.00 23 0 P2 7742 Vertebrata
0.00 23 0 P3 7776 Gnathostomata
0.00 23 0 P4 117570 Teleostomi
0.00 23 0 P5 117571 Euteleostomi
0.00 23 0 P6 8287 Sarcopterygii
0.00 23 0 P7 1338369 Dipnotetrapodomorpha
0.00 23 0 P8 32523 Tetrapoda
0.00 23 0 P9 32524 Amniota
0.00 23 0 C 40674 Mammalia
0.00 23 0 C1 32525 Theria
0.00 23 0 C2 9347 Eutheria
0.00 23 0 C3 1437010 Boreoeutheria
0.00 23 0 C4 314146 Euarchontoglires
0.00 23 0 O 9443 Primates
0.00 23 0 O1 376913 Haplorrhini
0.00 23 0 O2 314293 Simiiformes
0.00 23 0 O3 9526 Catarrhini
0.00 23 0 O4 314295 Hominoidea
0.00 23 0 F 9604 Hominidae
0.00 23 0 F1 207598 Homininae
0.00 23 0 G 9605 Homo
0.00 23 23 S 9606 Homo sapiens
44 changes: 44 additions & 0 deletions tools/taxpasta/test-data/2612_pe-ERR5766176-db1.kraken2.report.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
99.97 627680 627680 U 0 unclassified
0.03 168 0 R 1 root
0.03 168 0 R1 131567 cellular organisms
0.03 168 0 D 2759 Eukaryota
0.03 168 0 D1 33154 Opisthokonta
0.02 152 0 K 33208 Metazoa
0.02 152 0 K1 6072 Eumetazoa
0.02 152 0 K2 33213 Bilateria
0.02 152 0 K3 33511 Deuterostomia
0.02 152 0 P 7711 Chordata
0.02 152 0 P1 89593 Craniata
0.02 152 0 P2 7742 Vertebrata
0.02 152 0 P3 7776 Gnathostomata
0.02 152 0 P4 117570 Teleostomi
0.02 152 0 P5 117571 Euteleostomi
0.02 152 0 P6 8287 Sarcopterygii
0.02 152 0 P7 1338369 Dipnotetrapodomorpha
0.02 152 0 P8 32523 Tetrapoda
0.02 152 0 P9 32524 Amniota
0.02 152 0 C 40674 Mammalia
0.02 152 0 C1 32525 Theria
0.02 152 0 C2 9347 Eutheria
0.02 152 0 C3 1437010 Boreoeutheria
0.02 152 0 C4 314146 Euarchontoglires
0.02 152 0 O 9443 Primates
0.02 152 0 O1 376913 Haplorrhini
0.02 152 0 O2 314293 Simiiformes
0.02 152 0 O3 9526 Catarrhini
0.02 152 0 O4 314295 Hominoidea
0.02 152 0 F 9604 Hominidae
0.02 152 0 F1 207598 Homininae
0.02 152 0 G 9605 Homo
0.02 152 152 S 9606 Homo sapiens
0.00 16 0 K 4751 Fungi
0.00 16 0 K1 451864 Dikarya
0.00 16 0 P 4890 Ascomycota
0.00 16 0 P1 716545 saccharomyceta
0.00 16 0 P2 147537 Saccharomycotina
0.00 16 0 C 4891 Saccharomycetes
0.00 16 0 O 4892 Saccharomycetales
0.00 16 0 F 4893 Saccharomycetaceae
0.00 16 0 G 4930 Saccharomyces
0.00 16 0 S 4932 Saccharomyces cerevisiae
0.00 16 16 S1 559292 Saccharomyces cerevisiae S288C
Loading

0 comments on commit 5a6a3c7

Please sign in to comment.