Merge pull request galaxyproject#6387 from nekrut/snp_eff_update

updated snpeff version, fixed gb parsing py script
SaimMomin12 · Oct 21, 2024 · 4cc10cf · 4cc10cf
2 parents 1550e33 + e1e860d
commit 4cc10cf
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 151 deletions.
diff --git a/tool_collections/snpeff/gbk2fa.py b/tool_collections/snpeff/gbk2fa.py
@@ -6,6 +6,9 @@
 
 
 def get_opener(gbk_filename):
+    """Determines the appropriate opener for a given file, supporting
+    bzip2, gzip, or standard open.
+    """
     try:
         bz2.open(gbk_filename).read(1)
         return bz2.open
@@ -18,30 +21,38 @@ def get_opener(gbk_filename):
         return open
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "genbank_file",
-    help="GenBank input file. Can be compressed with gzip or bzip2"
-)
-parser.add_argument(
-    "fasta_file", help="FASTA output datset"
-)
-parser.add_argument(
-    "--remove_version", action="store_true",
-    help="Remove version number from NCBI form formatted accession numbers. "
-         "For example, this would convert 'B000657.2' to 'B000657'"
-)
-args = parser.parse_args()
-
-
-gbk_open = get_opener(args.genbank_file)
-with gbk_open(args.genbank_file, 'rt') as input_handle, \
-     open(args.fasta_file, 'w') as output_handle:
-    for seq_record in SeqIO.parse(input_handle, 'genbank'):
-        if args.remove_version:
-            seq_id = seq_record.id.split('.')[0]
-        else:
-            seq_id = seq_record.id
-        print('Writing FASTA record: {}'.format(seq_id))
-        print('>' + seq_id, file=output_handle)
-        print(seq_record.seq, file=output_handle)
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert GenBank files to FASTA format. "
+                    "Supports gzip and bzip2 compressed files."
+    )
+    parser.add_argument(
+        "genbank_file",
+        help="GenBank input file. Can be compressed with gzip or bzip2"
+    )
+    parser.add_argument(
+        "fasta_file",
+        help="FASTA output dataset"
+    )
+    parser.add_argument(
+        "--remove_version", action="store_true",
+        help="Remove version number from NCBI formatted accession numbers. "
+             "For example, this converts 'B000657.2' to 'B000657'."
+    )
+    args = parser.parse_args()
+
+    gbk_open = get_opener(args.genbank_file)
+    with gbk_open(args.genbank_file, 'rt') as input_handle, \
+            open(args.fasta_file, 'w') as output_handle:
+        for seq_record in SeqIO.parse(input_handle, 'genbank'):
+            if args.remove_version:
+                seq_id = seq_record.id.split('.')[0]
+            else:
+                seq_id = seq_record.id
+            print(f'Writing FASTA record: {seq_id}')
+            output_handle.write(f'>{seq_id}\n')
+            output_handle.write(f'{seq_record.seq}\n')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tool_collections/snpeff/snpEff.xml b/tool_collections/snpeff/snpEff.xml
@@ -1,4 +1,4 @@
-<tool id="snpEff" name="SnpEff eff:" version="@WRAPPER_VERSION@.galaxy2">
+<tool id="snpEff" name="SnpEff eff:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
     <description> annotate variants</description>
     <macros>
         <import>snpEff_macros.xml</import>
@@ -13,7 +13,7 @@
             ln -s '${intervals}' intervals.bed &&
         #end if 
         snpEff @JAVA_OPTIONS@ eff
-        -i $inputFormat -o ${outputConditional.outputFormat} -upDownStreamLen $udLength
+        -i $inputFormat -o ${outputFormat} -upDownStreamLen $udLength
         #if $spliceSiteSize and str($spliceSiteSize) != '':
           -spliceSiteSize "$spliceSiteSize"
         #end if
@@ -53,9 +53,6 @@
         #if $csvStats:
             -csvStats '$csvFile'
         #end if
-        #if str($offset) != 'default':
-          ${offset}
-        #end if
         #if str($chr).strip() != '':
           -chr '$chr'
         #end if
@@ -103,35 +100,21 @@
             mkdir '$statsFile.files_path' &&
             mv '$genes_file' '#echo os.path.join($statsFile.files_path, $genes_file_name)#'
         #end if
-        #if $outputConditional.outputFormat == 'gatk' and $outputConditional.gatk_v1
-          &&
-          ## Replace real SnpEff version with 2.0.5 to prevent this GATK 1.x error: "The version of SnpEff used to generate the SnpEff input file (x.x) is not currently supported by the GATK. Supported versions are: [2.0.5]"
-          sed -i.bak -e 's/^\#\#SnpEffVersion="\(\S*\s\)/\#\#SnpEffVersion="2.0.5 - real is \1/' '$snpeff_output'
-        #end if
     ]]></command>
     <inputs>
         <param name="input" type="data" format="vcf,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>
-
-        <param name="inputFormat" type="select" label="Input format">
+        <param argument="-i" name="inputFormat" type="select" label="Input format" help="Specify the format of input dataset(s)">
             <option value="vcf" selected="true">VCF</option>
-            <option value="bed">BED (Deprecated)</option>
+            <option value="bed">BED</option>
         </param>
-
-        <conditional name="outputConditional">
-            <param name="outputFormat" type="select" label="Output format">
-                <option value="vcf" selected="true">VCF (only if input is VCF)</option>
-                <option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
-                <option value="bed">BED</option>
-                <option value="bedAnn">BED annotations</option>
-            </param>
-            <when value="vcf" />
-            <when value="gatk">
-                <param name="gatk_v1" type="boolean" checked="true" label="Compatible with GATK 1.x" />
-            </when>
-            <when value="bed" />
-            <when value="bedAnn" />
-        </conditional>
-        <param name="csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report, useful for downstream analysis (-csvStats)" />
+        <param argument="-o" name="outputFormat" type="select" label="Output format" help="Specify output format">
+            <option value="vcf" selected="true">VCF (only if input is VCF)</option>
+            <option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
+            <option value="bed">BED</option>
+            <option value="bedAnn">BED annotations</option>
+        </param>
+        <param argument="-csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report?" help="Useful for downstream analyses and report generation" />
+        <param argument="-noStats" name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats?" help="Generates an HTML summary of results"/>
         <conditional name="snpDb">
             <param name="genomeSrc" type="select" label="Genome source">
                 <!-- These options are referenced in the help section of SnpEff download tool. If you change them, change help of SnpEff download as well -->
@@ -171,8 +154,7 @@
                 </section>
             </when>
             <when value="named">
-                <param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)">
-                    <help>@SNPEFF_DATABASE_URL@</help>
+                <param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)" help="A list of databases can be obtained with the 'SnpEff databases' tool">
                     <validator type="empty_field" message="A genome version name is required" />
                 </param>
             </when>
@@ -209,18 +191,16 @@
             </param>
             </when>
         </conditional>
-
         <param name="udLength" argument="-ud" type="select" label="Upstream / Downstream length">
-            <option value="0">No upstream / downstream intervals (0 bases)</option>
+            <option value="0" selected="true">No upstream / downstream intervals (0 bases)</option>
             <option value="200">200 bases</option>
             <option value="500">500 bases</option>
             <option value="1000">1000 bases</option>
             <option value="2000">2000 bases</option>
-            <option value="5000" selected="true">5000 bases</option>
+            <option value="5000">5000 bases</option>
             <option value="10000">10000 bases</option>
             <option value="20000">20000 bases</option>
         </param>
-
         <param name="spliceSiteSize" argument="-ss" type="select" optional="true" label="Set size for splice sites (donor and acceptor) in bases">
             <option value="1">1 base</option>
             <option value="2" selected="true">2 bases</option>
@@ -232,7 +212,6 @@
             <option value="8">8 bases</option>
             <option value="9">9 bases</option>
         </param>
-
         <conditional name="spliceRegion">
             <param name="setSpliceRegions" type="select" label="spliceRegion Settings">
                 <option value="no">Use Defaults</option>
@@ -245,7 +224,6 @@
                 <param argument="-spliceRegionIntronMax" type="integer" value="" min="1" max="10" optional="true" label="Set maximum number of bases for splice site region within intron. Default: 8 bases" />
             </when>
         </conditional>
-
         <param name="annotations" type="select" display="checkboxes" multiple="true" label="Annotation options">
             <option value="-formatEff">Use 'EFF' field compatible with older versions (instead of 'ANN')</option>
             <option value="-classic">Use Classic Effect names and amino acid variant annotations (NON_SYNONYMOUS_CODING vs missense_variant and G180R vs p.Gly180Arg/c.538G>C)</option>
@@ -334,20 +312,13 @@
                 </param>
             </when>
         </conditional>
-
-        <param name="offset" type="select" display="radio" label="Chromosomal position">
-            <option value="default" selected="true">Use default (based on input type)</option>
-            <option value="-0">Force zero-based positions (both input and output)</option>
-            <option value="-1">Force one-based positions (both input and output)</option>
-        </param>
         <param argument="-chr" type="text" label="Text to prepend to chromosome name">
             <help>
                By default SnpEff simplifies all chromosome names. For instance 'chr1' is just '1'.
                You can prepend any string you want to the chromosome name
             </help>
             <validator type="regex" message="No whitespace allowed">^\S*$</validator>
         </param>
-        <param name="generate_stats" argument="-noStats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats" />
         <param argument="-noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server" />
     </inputs>
     <outputs>
@@ -375,8 +346,8 @@
             <param name="generate_stats" value="true"/>
             <output name="snpeff_output">
                 <assert_contents>
-                    <has_text_matching expression="KJ660346\t572\t.*missense_variant" />
-                    <has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
+                    <has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
+                    <has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
                 </assert_contents>
             </output>
             <output name="statsFile">
@@ -398,13 +369,13 @@
             <param name="csvStats" value="true"/>
             <output name="snpeff_output">
                 <assert_contents>
-                    <has_text_matching expression="KJ660346\t572\t.*missense_variant" />
-                    <has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
+                    <has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
+                    <has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
                 </assert_contents>
             </output>
             <output name="csvFile">
                 <assert_contents>
-                    <has_n_lines n="185"/>
+                    <has_n_lines n="134"/>
                     <has_n_columns n="1" sep=","/>
                 </assert_contents>
             </output>

diff --git a/tool_collections/snpeff/snpEff_create_db.xml b/tool_collections/snpeff/snpEff_create_db.xml
@@ -1,11 +1,11 @@
-<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy6" profile="22.01">
+<tool id="snpEff_build_gb" name="SnpEff build:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
     <description> database from Genbank or GFF record</description>
     <macros>
         <import>snpEff_macros.xml</import>
     </macros>
     <requirements>
         <expand macro="requirement" />
-        <requirement type="package" version="1.79">biopython</requirement>
+        <requirement type="package" version="1.84">biopython</requirement>
     </requirements>
     <expand macro="stdio" />
     <expand macro="version_command" />
@@ -36,7 +36,7 @@
             ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.${input_type.input_type_selector}' &&
         #end if
 
-        snpEff @JAVA_OPTIONS@ build -v
+        snpEff @JAVA_OPTIONS@ build -noCheckCds -noCheckProtein -v
         -configOption '${genome_version}'.genome='${genome_version}'
         -configOption '${genome_version}'.codonTable='${codon_table}'
         #if str($input_type.input_type_selector) == "gb":
@@ -186,7 +186,7 @@
     <help><![CDATA[
 **What it does**
 
-This tool uses `"snpEff build -genbank"` or `"snpEff build -gff3"` commands to create a snpEff database.
+This tool uses `snpEff build` to create a snpEff database.
 
 ------
 
@@ -201,7 +201,7 @@ Using Genbank data for creating databases has several advantages:
 
  .. class:: warningmark
 
- SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use GFF route described below.
+ SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use the GFF route described below.
 
 -------
 

diff --git a/tool_collections/snpeff/snpEff_databases.xml b/tool_collections/snpeff/snpEff_databases.xml
@@ -1,4 +1,4 @@
-<tool id="snpEff_databases" name="SnpEff databases:" version="@WRAPPER_VERSION@.galaxy2">
+<tool id="snpEff_databases" name="SnpEff databases:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
     <description> list available databases</description>
     <macros>
         <import>snpEff_macros.xml</import>
@@ -19,6 +19,10 @@
              | grep -v '${exclude_pattern}'
         #end if
 
+        #if str($include_download_path) == "no":
+            | cut -f 1,2,3,4
+        #end if
+
         > '${snpeff_dbs}'
     ]]></command>
     <inputs>
@@ -38,7 +42,10 @@
                 </valid>
             </sanitizer>
         </param>
-
+        <param name="include_download_path" type="select" display="radio" label="Include download paths?" help="When snpEff dumps the list of available databases, it includes their download paths. These are not needed in the Galaxy context.">
+            <option value="yes">Yes</option>
+            <option value="no" selected="true">No</option>
+        </param>
     </inputs>
     <outputs>
         <data name="snpeff_dbs" format="tabular" label="${tool.name} @SNPEFF_VERSION@ available databases" />
@@ -63,12 +70,16 @@
     <help><![CDATA[
 **What it does**
 
-This tool downloads the master list of snpEff databases from @SNPEFF_DATABASE_URL@. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse* the it will produce a tabular dataset with the following content::
+This tool downloads the master list of snpEff databases from a remote SnpEff repository. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse*, it will produce a tabular dataset with the following content::
+
+    mm10  Mouse
+    mm39  Mouse
+    mm9   Mouse
 
-    mm10  Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm10.zip
-    mm9   Mouse  http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm9.zip
+This means that there are three available snpEff databases for the mouse genome. If you want to use mm39 in your analysis:
 
-This means that there two available snpEff databases for mouse genome versions mm9 and mm10. In order to download these databases you should use identifier from the first column (e.g., mm9 or mm10 in this case).
+ - set **Genome source** option of **SnpEff eff** Galaxy tool to *Download on demand*
+ - enter 'mm39' into **Snpff Genome Version Name** text box
 
 -------
 
@@ -83,6 +94,7 @@ There are two ways to use names of databases obtained with this tool in Galaxy's
 
 @SNPEFF_IN_GALAXY_INFO@
 @EXTERNAL_DOCUMENTATION@
+
     ]]></help>
     <expand macro="citations" />
 </tool>

diff --git a/tool_collections/snpeff/snpEff_download.xml b/tool_collections/snpeff/snpEff_download.xml
@@ -1,4 +1,4 @@
-<tool id="snpEff_download" name="SnpEff download:" version="@WRAPPER_VERSION@.galaxy2">
+<tool id="snpEff_download" name="SnpEff download:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
     <description> download a pre-built database</description>
     <macros>
         <import>snpEff_macros.xml</import>
@@ -42,7 +42,7 @@ mv temp/'$genome_version' '$snpeff_db.files_path'
     <help><![CDATA[
 **What it does**
 
-This tool downloads a specified database from @SNPEFF_DATABASE_URL@. It deposits it into the history.
+This tool downloads a specified database from a remote SnpEff repository. It deposits it into the history.
 
 -------
 

diff --git a/tool_collections/snpeff/snpEff_macros.xml b/tool_collections/snpeff/snpEff_macros.xml
@@ -1,6 +1,6 @@
 <macros>
     <xml name="requirement">
-        <requirement type="package" version="4.3.1t">snpeff</requirement>
+        <requirement type="package" version="5.2">snpeff</requirement>
         <yield/>
     </xml>
   <xml name="stdio">
@@ -14,9 +14,8 @@
 snpEff -version
     ]]></version_command>
   </xml>
-  <token name="@WRAPPER_VERSION@">4.3+T</token>
-  <token name="@SNPEFF_VERSION@">SnpEff4.3</token>
-  <token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
+  <token name="@WRAPPER_VERSION@">0</token>
+  <token name="@SNPEFF_VERSION@">5.2</token>
   <token name="@JAVA_OPTIONS@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
   <xml name="ref_select">
     <conditional name="reference_source">
@@ -59,7 +58,7 @@ In you *do not see them* keep reading...
 
 **Download pre-built databases**
 
-SnpEff project generates large numbers of pre-build databases. These are available at @SNPEFF_DATABASE_URL@ and can downloaded. Follow these steps:
+The SnpEff project generates a large number of pre-built databases. To obtain and use them, follow these steps:
 
   #. Use **SnpEff databases** tool to generate a list of existing databases. Note the name of the database you need.
   #. Use **SnpEff download** tool to download the database.

diff --git a/tool_collections/snpeff/snpeff_get_chr_names.xml b/tool_collections/snpeff/snpeff_get_chr_names.xml
@@ -1,4 +1,4 @@
-<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@WRAPPER_VERSION@.galaxy2">
+<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
     <description>list chromosome names/lengths</description>
     <macros>
         <import>snpEff_macros.xml</import>