galaxyproject · bebatut · Oct 17, 2024 · May 14, 2024 · May 16, 2024 · May 16, 2024
diff --git a/data_managers/data_manager_groot_database_downloader/.shed.yml b/data_managers/data_manager_groot_database_downloader/.shed.yml
@@ -0,0 +1,8 @@
+categories:
+- Data Managers
+description: Download pre-clustered ARG database that are ready to be indexed
+homepage_url: https://github.com/will-rowe/groot
+long_description: GROOT is a tool to type Antibiotic Resistance Genes (ARGs) in metagenomic samples (a.k.a. Resistome Profiling). It combines variation graph representation of gene sets with an LSH indexing scheme to allow for fast classification of metagenomic reads.
+name: data_manager_groot_database_downloader
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_groot_database_downloader
+type: unrestricted
diff --git a/..._manager_groot_database_downloader/data_manager/data_manager_groot_database_downloader.py b/..._manager_groot_database_downloader/data_manager/data_manager_groot_database_downloader.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+#
+# Data manager for reference data (pre-clustered ARG databases) for the GROOT Galaxy tools
+import argparse
+import json
+import subprocess
+from datetime import date
+from pathlib import Path
+
+
+# Utility functions for interacting with Galaxy JSON
+def read_input_json(json_fp):
+    """Read the JSON supplied from the data manager tool
+
+    Returns a tuple (param_dict,extra_files_path)
+
+    'param_dict' is an arbitrary dictionary of parameters
+    input into the tool; 'extra_files_path' is the path
+    to a directory where output files must be put for the
+    receiving data manager to pick them up.
+
+    NB the directory pointed to by 'extra_files_path'
+    doesn't exist initially, it is the job of the script
+    to create it if necessary.
+
+    """
+    with open(json_fp) as fh:
+        params = json.load(fh)
+    return (params['param_dict'],
+            Path(params['output_data'][0]['extra_files_path']))
+
+
+# Utility functions for creating data table dictionaries
+#
+# Example usage:
+# >>> d = create_data_tables_dict()
+# >>> add_data_table(d,'my_data')
+# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
+# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
+# >>> print(json.dumps(d))
+def create_data_tables_dict():
+    """Return a dictionary for storing data table information
+
+    Returns a dictionary that can be used with 'add_data_table'
+    and 'add_data_table_entry' to store information about a
+    data table. It can be converted to JSON to be sent back to
+    the data manager.
+
+    """
+    d = {
+        'data_tables': {}
+    }
+    return d
+
+
+def add_data_table(d, table):
+    """Add a data table to the data tables dictionary
+
+    Creates a placeholder for a data table called 'table'.
+
+    """
+    d['data_tables'][table] = []
+
+
+def add_data_table_entry(d, table, entry):
+    """Add an entry to a data table
+
+    Appends an entry to the data table 'table'. 'entry'
+    should be a dictionary where the keys are the names of
+    columns in the data table.
+
+    Raises an exception if the named data table doesn't
+    exist.
+
+    """
+    try:
+        d['data_tables'][table].append(entry)
+    except KeyError:
+        raise Exception("add_data_table_entry: no table '%s'" % table)
+
+
+def download_groot_db(data_tables, name, table_name, target_dp, identity, groot_version):
+    """Download GROOT database
+
+    Creates references to the specified file(s) on the Galaxy
+    server in the appropriate data table (determined from the
+    file extension).
+
+    The 'data_tables' dictionary should have been created using
+    the 'create_data_tables_dict' and 'add_data_table' functions.
+
+    Arguments:
+      data_tables: a dictionary containing the data table info
+      name: name of the database to download
+      table_name: name of the table
+      target_dp: directory to put copy or link to the data file
+      identity: identity threshold for GROOT
+      groot_version: version of GROOT to use
+
+    """
+    # Define the target directory path
+    db_dp = target_dp / Path(name)
+
+    # Build the command string
+    cmd = "groot get -d %s -o %s --identity %s" % (name, db_dp, identity)
+
+    # Execute the command
+    subprocess.check_call(cmd, shell=True)
+
+    # Add the data table entry
+    add_data_table_entry(
+        data_tables,
+        table_name,
+        dict(
+            value='%s.%s-v%s' % (name, identity, groot_version),
+            name='%s (%s percent identity)' % (name, identity),
+            dbkey='%s-v%s' % (date.today().strftime("%d%m%Y"), groot_version),
+            path=str(db_dp),
+            db_version=groot_version
+        )
+    )
+
+
+if __name__ == "__main__":
+    print("Starting...")
+
+    # Read command line
+    parser = argparse.ArgumentParser(description='Download and build Groot database')
+    parser.add_argument('--database', help="Name of the database")
+    parser.add_argument('--percentidentity', help="The identity threshold at which the database was clustered")
+    parser.add_argument('--grootversion', help="Version of the Database")
+    parser.add_argument('--json', help="Path to JSON file")
+    args = parser.parse_args()
+    print("args   : %s" % args)
+
+    # Read the input JSON
+    json_fp = Path(args.json)
+    params, target_dp = read_input_json(json_fp)
+
+    # Make the target directory
+    print("Making %s" % target_dp)
+    target_dp.mkdir(parents=True, exist_ok=True)
+
+    # Set up data tables dictionary
+
+    data_tables = create_data_tables_dict()
+    add_data_table(data_tables, "groot_database_downloader")
+
+    # Fetch data from specified data sources
+    print("Download and build database")
+    download_groot_db(
+        data_tables,
+        args.database,
+        "groot_database_downloader",
+        target_dp,
+        args.percentidentity,
+        args.grootversion)
+
+    # Write output JSON
+    print("Outputting JSON")
+    with open(json_fp, 'w') as fh:
+        json.dump(data_tables, fh, sort_keys=True)
+    print("Done.")
diff --git a/...manager_groot_database_downloader/data_manager/data_manager_groot_database_downloader.xml b/...manager_groot_database_downloader/data_manager/data_manager_groot_database_downloader.xml
@@ -0,0 +1,43 @@
+<tool id="groot_database_downloader" name="Groot get" tool_type="manage_data" version="0.0.5" profile="23.0">
+    <description> Download pre-clustered ARG database that are ready to be indexed</description>
+    <requirements>
+        <requirement type="package" version="3.12">python</requirement>
+        <requirement type="package" version="1.1.2">groot</requirement>
+    </requirements>
+    <command><![CDATA[
+        python '$__tool_directory__/data_manager_groot_database_downloader.py' 
+        --database '$database'
+        --grootversion 1.1.2
+        --percentidentity 90
+        --json '$out_file'
+    ]]></command>
+    <inputs>
+        <param name="database" type="select" label="Database name">
+            <option value="arg-annot" selected="true">ARG-annot 90% identity (default)</option>
+            <option value="resfinder">Resfinder 90% identity</option>
+            <option value="card">CARD 90% identity</option>
+            <option value="groot-db">Groot-db 90% identity, combine all sequences in ARG-annot, Resfinder and CARD</option>
+            <option value="groot-core-db">Groot-core-db 90% identity, same as groot-db but one copy of each sequence is kept and then this collection is clustered at 90% identity</option>
+         </param>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="database" value="arg-annot"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="arg-annot.90-v1.1.2"/>
+                    <has_text text="arg-annot (90 percent identity)"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+        The groot get subcommand is used to download a pre-clustered ARG database that is ready to be indexed.
+    </help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.1217889</citation>
+    </citations>
+</tool>
diff --git a/data_managers/data_manager_groot_database_downloader/data_manager_conf.xml b/data_managers/data_manager_groot_database_downloader/data_manager_conf.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_groot_database_downloader.xml" id="groot_database_downloader">
+        <data_table name="groot_database">
+            <output>
+                <column name="value"/>  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="name"/>  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="dbkey"/> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="path" output_ref="out_file">
+                    <move type="directory">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">groot_database/data/${dbkey}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/groot_database/data/${dbkey}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="db_version"/> <!-- columns that are going to be specified by the Data Manager Tool -->
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff --git a/data_managers/data_manager_groot_database_downloader/test-data/groot_database.loc b/data_managers/data_manager_groot_database_downloader/test-data/groot_database.loc
@@ -0,0 +1,9 @@
+# Tab separated with 5 columns:
+# value : unique value built from the database name, the % identity and the groot version
+# name : name of the database, generated from the database name and the % identity
+# dbkey : key generated from the download date (DDMMYYYY) and the version of the groot database
+# path : final path to the groot database on galaxy
+# db_version : version of the groot database
+#value, name, dbkey, path, db_version
+# eg.
+#resfinder.90-v1.1.2	resfinder (90 percent identity)	30052024-v1.1.2	${__HERE__}/resfinder.90	1.1.2
diff --git a/data_managers/data_manager_groot_database_downloader/tool-data/groot_database.loc.sample b/data_managers/data_manager_groot_database_downloader/tool-data/groot_database.loc.sample
@@ -0,0 +1,3 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use the GROOT databases. The groot_database.loc
+#file has this format (white space characters are TAB characters):
+#value	name	dbkey	path	db_version
+#resfinder.90-v1.1.2	resfinder (90 percent identity)	30052024-v1.1.2	${__HERE__}/resfinder.90	1.1.2
diff --git a/data_managers/data_manager_groot_database_downloader/tool_data_table_conf.xml.sample b/data_managers/data_manager_groot_database_downloader/tool_data_table_conf.xml.sample
@@ -0,0 +1,6 @@
+<tables>
+    <table name="groot_database" comment_char="#">
+        <columns>value, name, dbkey, path, db_version</columns>
+        <file path="tool-data/groot_database.loc"/>
+    </table>
+</tables>
diff --git a/data_managers/data_manager_groot_database_downloader/tool_data_table_conf.xml.test b/data_managers/data_manager_groot_database_downloader/tool_data_table_conf.xml.test
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+<tables>
+    <table name="groot_database" comment_char="#">
+        <columns>value, name, dbkey, path, db_version</columns>
+        <file path="${__HERE__}/test-data/groot_database.loc" />
+    </table>
+</tables>
diff --git a/tools/groot/.shed.yml b/tools/groot/.shed.yml
@@ -0,0 +1,18 @@
+name: groot
+owner: iuc
+description: GROOT is a tool to type Antibiotic Resistance Genes (ARGs) in metagenomic samples
+long_description: |
+  GROOT is a tool to type Antibiotic Resistance Genes (ARGs) in metagenomic samples (a.k.a. Resistome Profiling). 
+  It combines variation graph representation of gene sets with an LSH indexing scheme to allow for fast classification of metagenomic reads. 
+  Subsequent hierarchical local alignment of classified reads against graph traversals facilitates accurate reconstruction of full-length gene sequences using a simple scoring scheme.
+categories:
+- Metagenomics
+homepage_url: https://github.com/will-rowe/groot
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/groot
+type: unrestricted
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper for the GROOT tool suite: {{ tool_name }}"
+suite:
+  name: "suite_groot"
+  description: "GROOT is a tool to type Antibiotic Resistance Genes (ARGs) in metagenomic samples"
diff --git a/tools/groot/groot_align.xml b/tools/groot/groot_align.xml
@@ -0,0 +1,90 @@
+<tool id="groot_align" name="Groot align" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>reads to references and weight variation graphs</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <!--
+        The command runs two steps in the job working directory:
+        1. "groot index" builds an index ('grootIndex') from the selected database,
+        2. "groot align" aligns the reads against that index and writes the BAM to the output.
+        Parameters declared inside a <section> are referenced with the section name as prefix
+        ($index.* and $align.*).
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+ln -s '$fastq' 'input.fastq' &&
+
+groot index
+    --msaDir '$index.groot_db_select.fields.path'
+    --indexDir 'grootIndex'
+    --windowSize $index.windowSize
+    --kmerSize $index.kmerSize
+    --maxK $index.maxK
+    --maxSketchSpan $index.maxSketchSpan
+    --numPart $index.numPart
+    --sketchSize $index.sketchSize
+&&
+
+groot align
+    --fastq 'input.fastq'
+    --indexDir 'grootIndex'
+    --contThresh $align.contThresh
+    --minKmerCov $align.minKmerCov
+    ## expands to '--noAlign' only when exact alignment is switched off
+    $align.noAlign
+    > '$output'
+]]></command>
+    <inputs>
+        <param argument="--fastq" type="data" format="fastq,fastqsanger" label="FASTQ file(s) to align"/>
+        <section name="index" title="Index">
+            <param name="groot_db_select" type="select" label="Groot database">
+                <options from_data_table="groot_database">
+                    <validator message="No groot database is available" type="no_options"/>
+                </options>
+            </param>
+            <param argument="--windowSize" type="integer" min="0" value="100" label="Size of window to sketch graph traversals" />
+            <param argument="--kmerSize" type="integer" min="0" value="31" label="Size of k-mer" />
+            <param argument="--maxK" type="integer" min="0" value="30" label="maxK in the LSH Ensemble" />
+            <param argument="--maxSketchSpan" type="integer" min="0" value="4" label="Maximum number of identical neighbouring sketches permitted in any graph traversal" />
+            <param argument="--numPart" type="integer" min="0" value="8" label="Number of partitions in the LSH Ensemble" />
+            <param argument="--sketchSize" type="integer" min="0" value="21" label="Size of MinHash sketch" />
+        </section>
+        <section name="align" title="Align">
+            <param argument="--contThresh" type="float" min="0" max="1.0" value="0.99" label="Containment threshold for the LSH ensemble" />
+            <param argument="--minKmerCov" type="integer" min="0" value="1" label="Minimum number of k-mers covering each base of a graph segment" />
+            <!-- Checked (default) = exact alignment is performed and no flag is added; unchecked adds '--noAlign' -->
+            <param argument="--noAlign" type="boolean" truevalue="" falsevalue="--noAlign" checked="true" label="Perform exact alignment?"
+                help="If not, graphs will still be weighted using approximate read mappings"/>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="output" format="bam" label="${tool.name} on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="fastq" value="bla-b7-150bp-5x.fq"/>
+            <section name="index">
+                <param name="groot_db_select" value="resfinder.90" />
+                <param name="windowSize" value="100"/>
+                <param name="kmerSize" value="31"/>
+                <param name="maxK" value="30"/>
+                <param name="maxSketchSpan" value="4"/>
+                <param name="numPart" value="8"/>
+                <param name="sketchSize" value="21"/>
+            </section>
+            <section name="align">
+                <param name="contThresh" value="0.97"/>
+                <param name="minKmerCov" value="1" />
+                <param name="noAlign" value="true"/>
+            </section>
+            <output name="output" ftype="bam">
+                <assert_contents>
+                    <has_size value="41106" delta="1000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+@HELP@
+
+The align subcommand is used to align reads against the indexed variation graphs.
+Before aligning, the selected database is indexed (index subcommand) using the parameters of the Index section.
+
+**Output**
+
+The output alignment is essentially the ARG classified reads (which may be useful) and can then be used to report full-length ARGs (using the report subcommand)
+    ]]></help>
+    <expand macro="citations"/>
+</tool>