diff --git a/tools/taxpasta/taxpasta.xml b/tools/taxpasta/taxpasta.xml index 0712ca48530..b39d65644eb 100644 --- a/tools/taxpasta/taxpasta.xml +++ b/tools/taxpasta/taxpasta.xml @@ -1,34 +1,66 @@ - + standardise taxonomic profiles - 0.5.0 + 0.6.1 0 + 22.01 + + taxpasta + taxpasta - - + + + + + + + + + + + + + + + + + + + + - + @@ -37,63 +69,139 @@ - - - - + + + + + + + + + + + - + + (action['action'] == 'merge' and action['format']['output_format'] == 'TSV') or action['action'] == 'standardise' + + + (action['action'] == 'merge' and action['format']['output_format'] == 'BIOM') + - + + + - - + + + + + + + + - + + - - + + + + + + + - - + + + + + + + + - + + - - - + + + - + + + + + + + + + - + + - - + - + + + - - + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + @@ -104,23 +212,22 @@ **What it does** -* Taxpasta standardises the taxonomic profiles produced from other tools. -* It reformats these outputs into a table of NCBI taxonomy identifiers and their integer counts. -* Then, it converts these identifiers to taxon names. -* It can also merge outputs across samples, from the same profiling tool. +The main purpose of taxpasta is to standardise taxonomic profiles created by a range of bioinformatics tools. +We call those tools taxonomic profilers. They each come with their own particular tabular output format. +Across the profilers, relative abundances can be reported in read counts, fractions, or percentages, as well as +any number of additional columns with extra information. We therefore decided to take the lessons learnt to heart +and provide our own solution to deal with this pasticcio. With taxpasta you can ingest all of those formats and, at a +minimum, output taxonomy identifiers and their integer counts. Taxpasta can not only standardise profiles but also merge +them across samples for the same profiler into a single table. **Input(s)** * One or many outputs from a particular profiling tool, such as kraken2 or diamond. -* Check that this report is in the correct format for taxpasta: for detail, see https://taxpasta.readthedocs.io/en/latest/supported_profilers/. -* For example, for kraken2, taxpasta expects the kraken-report output file with 6 or 8 columns. -* For example, for diamond, taxpasta expects a tabular file with 3 columns. - **Output** -* A reformatted report, either for the single input, or for multiple inputs, as long as they are from the same profiling tool. -* The report is in tabular format. +* A reformatted report in tabular format, either for the single input, or for multiple inputs, as long as they are from the same profiling tool. + For more information see: https://taxpasta.readthedocs.io/en/latest/ diff --git a/tools/taxpasta/test-data/2611_se-ERR5766174-db1.kraken2.report.txt b/tools/taxpasta/test-data/2611_se-ERR5766174-db1.kraken2.report.txt new file mode 100644 index 00000000000..078b360fea3 --- /dev/null +++ b/tools/taxpasta/test-data/2611_se-ERR5766174-db1.kraken2.report.txt @@ -0,0 +1,44 @@ + 99.98 787758 787758 U 0 unclassified + 0.02 119 0 R 1 root + 0.02 119 0 R1 131567 cellular organisms + 0.02 119 0 D 2759 Eukaryota + 0.02 119 0 D1 33154 Opisthokonta + 0.01 96 0 K 4751 Fungi + 0.01 96 0 K1 451864 Dikarya + 0.01 96 0 P 4890 Ascomycota + 0.01 96 0 P1 716545 saccharomyceta + 0.01 96 0 P2 147537 Saccharomycotina + 0.01 96 0 C 4891 Saccharomycetes + 0.01 96 0 O 4892 Saccharomycetales + 0.01 96 0 F 4893 Saccharomycetaceae + 0.01 96 0 G 4930 Saccharomyces + 0.01 96 0 S 4932 Saccharomyces cerevisiae + 0.01 96 96 S1 559292 Saccharomyces cerevisiae S288C + 0.00 23 0 K 33208 Metazoa + 0.00 23 0 K1 6072 Eumetazoa + 0.00 23 0 K2 33213 Bilateria + 0.00 23 0 K3 33511 Deuterostomia + 0.00 23 0 P 7711 Chordata + 0.00 23 0 P1 89593 Craniata + 0.00 23 0 P2 7742 Vertebrata + 0.00 23 0 P3 7776 Gnathostomata + 0.00 23 0 P4 117570 Teleostomi + 0.00 23 0 P5 117571 Euteleostomi + 0.00 23 0 P6 8287 Sarcopterygii + 0.00 23 0 P7 1338369 Dipnotetrapodomorpha + 0.00 23 0 P8 32523 Tetrapoda + 0.00 23 0 P9 32524 Amniota + 0.00 23 0 C 40674 Mammalia + 0.00 23 0 C1 32525 Theria + 0.00 23 0 C2 9347 Eutheria + 0.00 23 0 C3 1437010 Boreoeutheria + 0.00 23 0 C4 314146 Euarchontoglires + 0.00 23 0 O 9443 Primates + 0.00 23 0 O1 376913 Haplorrhini + 0.00 23 0 O2 314293 Simiiformes + 0.00 23 0 O3 9526 Catarrhini + 0.00 23 0 O4 314295 Hominoidea + 0.00 23 0 F 9604 Hominidae + 0.00 23 0 F1 207598 Homininae + 0.00 23 0 G 9605 Homo + 0.00 23 23 S 9606 Homo sapiens diff --git a/tools/taxpasta/test-data/ncbi_taxonomy.loc b/tools/taxpasta/test-data/ncbi_taxonomy.loc new file mode 100644 index 00000000000..96b1a97157b --- /dev/null +++ b/tools/taxpasta/test-data/ncbi_taxonomy.loc @@ -0,0 +1 @@ +test-db-tox Test Database ${__HERE__}/test-db \ No newline at end of file diff --git a/tools/taxpasta/test-data/test-db/names.dmp b/tools/taxpasta/test-data/test-db/names.dmp new file mode 100644 index 00000000000..fa5e7ef33d4 --- /dev/null +++ b/tools/taxpasta/test-data/test-db/names.dmp @@ -0,0 +1,74 @@ +83333 | Escherichia coli K-12 | | scientific name | +83333 | Escherichia coli K12 | | equivalent name | +562 | "Bacillus coli" Migula 1895 | | authority | +562 | "Bacterium coli commune" Escherich 1885 | | authority | +562 | "Bacterium coli" (Migula 1895) Lehmann and Neumann 1896 | | authority | +562 | ATCC 11775 | | type material | +562 | Bacillus coli | | synonym | +562 | Bacterium coli | | synonym | +562 | Bacterium coli commune | | synonym | +562 | CCUG 24 | | type material | +562 | CCUG 29300 | | type material | +562 | CIP 54.8 | | type material | +562 | DSM 30083 | | type material | +562 | Enterococcus coli | | synonym | +562 | Escherchia coli | | misspelling | +562 | Escherichia coli | | scientific name | +562 | Escherichia coli (Migula 1895) Castellani and Chalmers 1919 | | authority | +562 | Escherichia sp. MAR | | includes | +562 | Escherichia/Shigella coli | | equivalent name | +562 | Eschericia coli | | misspelling | +562 | JCM 1649 | | type material | +562 | LMG 2092 | | type material | +562 | NBRC 102203 | | type material | +562 | NCCB 54008 | | type material | +562 | NCTC 9001 | | type material | +562 | bacterium 10a | | includes | +562 | bacterium E3 | | includes | +561 | Escherchia | | misspelling | +561 | Escherichia | | scientific name | +561 | Escherichia Castellani and Chalmers 1919 | | authority | +543 | Enterobacteraceae | | synonym | +543 | Enterobacteraceae (ex Lapage 1979) Lapage 1982, fam. nov., nom. rev. | | synonym | +543 | Enterobacteriaceae | | scientific name | +543 | Enterobacteriaceae (ex Rahn 1937) Ewing et al. 1980, fam. nov., nom. rev. | | synonym | +543 | Enterobacteriaceae Rahn 1937 | | synonym | +543 | gamma-3 proteobacteria | gamma-3 proteobacteria <#1> | in-part | +91347 | 'Enterobacteriales' | | synonym | +91347 | Enterobacteriaceae and related endosymbionts | | synonym | +91347 | Enterobacteriaceae group | | synonym | +91347 | Enterobacteriales | | scientific name | +91347 | enterobacteria | enterobacteria | blast name | +91347 | gamma-3 proteobacteria | gamma-3 proteobacteria <#5> | in-part | +1236 | Gammaproteobacteria | | scientific name | +1236 | Gammaproteobacteria Garrity et al. 2005 | | synonym | +1236 | Proteobacteria gamma subdivision | | synonym | +1236 | Purple bacteria, gamma subdivision | | synonym | +1236 | g-proteobacteria | gamma proteos | blast name | +1236 | gamma proteobacteria | | synonym | +1236 | gamma subdivision | | synonym | +1236 | gamma subgroup | | synonym | +1224 | Proteobacteria | | scientific name | +1224 | Proteobacteria Garrity et al. 2005 | | authority | +1224 | Proteobacteria [class] Stackebrandt et al. 1988 | | authority | +1224 | not Proteobacteria Cavalier-Smith 2002 | | authority | +1224 | proteobacteria | proteobacteria | blast name | +1224 | purple bacteria | | common name | +1224 | purple bacteria and relatives | | common name | +1224 | purple non-sulfur bacteria | | common name | +1224 | purple photosynthetic bacteria | | common name | +1224 | purple photosynthetic bacteria and relatives | | common name | +2 | Bacteria | Bacteria | scientific name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | bacteria | bacteria | blast name | +2 | eubacteria | | genbank common name | +2 | not Bacteria Haeckel 1894 | | synonym | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | +1 | all | | synonym | +1 | root | | scientific name | +131567 | biota | | synonym | +131567 | cellular organisms | | scientific name | diff --git a/tools/taxpasta/test-data/test-db/nodes.dmp b/tools/taxpasta/test-data/test-db/nodes.dmp new file mode 100644 index 00000000000..4e292c27101 --- /dev/null +++ b/tools/taxpasta/test-data/test-db/nodes.dmp @@ -0,0 +1,10 @@ +83333 | 562 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +562 | 561 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | diff --git a/tools/taxpasta/tool-data/ncbi_taxonomy.loc.sample b/tools/taxpasta/tool-data/ncbi_taxonomy.loc.sample new file mode 100644 index 00000000000..d8aa67d3958 --- /dev/null +++ b/tools/taxpasta/tool-data/ncbi_taxonomy.loc.sample @@ -0,0 +1 @@ +#value name path \ No newline at end of file diff --git a/tools/taxpasta/tool_data_table_conf.xml.sample b/tools/taxpasta/tool_data_table_conf.xml.sample new file mode 100644 index 00000000000..a17ce28eece --- /dev/null +++ b/tools/taxpasta/tool_data_table_conf.xml.sample @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+
diff --git a/tools/taxpasta/tool_data_table_conf.xml.test b/tools/taxpasta/tool_data_table_conf.xml.test new file mode 100644 index 00000000000..a17ce28eece --- /dev/null +++ b/tools/taxpasta/tool_data_table_conf.xml.test @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+