Skip to content

Commit

Permalink
Merge pull request galaxyproject#6387 from nekrut/snp_eff_update
Browse files Browse the repository at this point in the history
updated snpeff version, fixed gb parsing py script
  • Loading branch information
nekrut authored Oct 21, 2024
2 parents 1550e33 + e1e860d commit 4cc10cf
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 151 deletions.
65 changes: 38 additions & 27 deletions tool_collections/snpeff/gbk2fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@


def get_opener(gbk_filename):
"""Determines the appropriate opener for a given file, supporting
bzip2, gzip, or standard open.
"""
try:
bz2.open(gbk_filename).read(1)
return bz2.open
Expand All @@ -18,30 +21,38 @@ def get_opener(gbk_filename):
return open


# Module-level script body: parse the command line, then convert every
# GenBank record in the input file into a FASTA entry in the output file.
parser = argparse.ArgumentParser()
parser.add_argument(
"genbank_file",
help="GenBank input file. Can be compressed with gzip or bzip2"
)
parser.add_argument(
"fasta_file", help="FASTA output datset"
)
parser.add_argument(
"--remove_version", action="store_true",
help="Remove version number from NCBI form formatted accession numbers. "
"For example, this would convert 'B000657.2' to 'B000657'"
)
args = parser.parse_args()


# get_opener returns the open function to use for this input file; the input
# is then read in text mode ('rt') while the FASTA output is a plain text file.
gbk_open = get_opener(args.genbank_file)
with gbk_open(args.genbank_file, 'rt') as input_handle, \
open(args.fasta_file, 'w') as output_handle:
for seq_record in SeqIO.parse(input_handle, 'genbank'):
# With --remove_version, keep only the part of the id before the first '.'.
if args.remove_version:
seq_id = seq_record.id.split('.')[0]
else:
seq_id = seq_record.id
# Progress message goes to stdout; the FASTA header and sequence lines
# go to the output file.
print('Writing FASTA record: {}'.format(seq_id))
print('>' + seq_id, file=output_handle)
print(seq_record.seq, file=output_handle)
def main():
    """Convert a GenBank file to FASTA from the command line.

    Reads the positional ``genbank_file`` argument through the opener chosen
    by ``get_opener`` and writes one FASTA entry (a ``>`` header line followed
    by the sequence line) per parsed record to ``fasta_file``. When
    ``--remove_version`` is given, the record id is cut at its first '.'
    before being written. A progress line is printed to stdout per record.
    """
    cli = argparse.ArgumentParser(
        description="Convert GenBank files to FASTA format. "
                    "Supports gzip and bzip2 compressed files."
    )
    cli.add_argument(
        "genbank_file",
        help="GenBank input file. Can be compressed with gzip or bzip2"
    )
    cli.add_argument("fasta_file", help="FASTA output dataset")
    cli.add_argument(
        "--remove_version",
        action="store_true",
        help="Remove version number from NCBI formatted accession numbers. "
             "For example, this converts 'B000657.2' to 'B000657'."
    )
    options = cli.parse_args()

    # The input is opened first, so a failure there never creates the output.
    opener = get_opener(options.genbank_file)
    with opener(options.genbank_file, 'rt') as source:
        with open(options.fasta_file, 'w') as sink:
            for record in SeqIO.parse(source, 'genbank'):
                # Keep only the text before the first '.' when stripping
                # the version suffix was requested.
                name = (
                    record.id.split('.')[0]
                    if options.remove_version
                    else record.id
                )
                print(f'Writing FASTA record: {name}')
                sink.write(f'>{name}\n')
                sink.write(f'{record.seq}\n')


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
69 changes: 20 additions & 49 deletions tool_collections/snpeff/snpEff.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff" name="SnpEff eff:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff" name="SnpEff eff:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> annotate variants</description>
<macros>
<import>snpEff_macros.xml</import>
Expand All @@ -13,7 +13,7 @@
ln -s '${intervals}' intervals.bed &&
#end if
snpEff @JAVA_OPTIONS@ eff
-i $inputFormat -o ${outputConditional.outputFormat} -upDownStreamLen $udLength
-i $inputFormat -o ${outputFormat} -upDownStreamLen $udLength
#if $spliceSiteSize and str($spliceSiteSize) != '':
-spliceSiteSize "$spliceSiteSize"
#end if
Expand Down Expand Up @@ -53,9 +53,6 @@
#if $csvStats:
-csvStats '$csvFile'
#end if
#if str($offset) != 'default':
${offset}
#end if
#if str($chr).strip() != '':
-chr '$chr'
#end if
Expand Down Expand Up @@ -103,35 +100,21 @@
mkdir '$statsFile.files_path' &&
mv '$genes_file' '#echo os.path.join($statsFile.files_path, $genes_file_name)#'
#end if
#if $outputConditional.outputFormat == 'gatk' and $outputConditional.gatk_v1
&&
## Replace real SnpEff version with 2.0.5 to prevent this GATK 1.x error: "The version of SnpEff used to generate the SnpEff input file (x.x) is not currently supported by the GATK. Supported versions are: [2.0.5]"
sed -i.bak -e 's/^\#\#SnpEffVersion="\(\S*\s\)/\#\#SnpEffVersion="2.0.5 - real is \1/' '$snpeff_output'
#end if
]]></command>
<inputs>
<param name="input" type="data" format="vcf,bed" label="Sequence changes (SNPs, MNPs, InDels)"/>

<param name="inputFormat" type="select" label="Input format">
<param argument="-i" name="inputFormat" type="select" label="Input format" help="Specify the format of input dataset(s)">
<option value="vcf" selected="true">VCF</option>
<option value="bed">BED (Deprecated)</option>
<option value="bed">BED</option>
</param>

<conditional name="outputConditional">
<param name="outputFormat" type="select" label="Output format">
<option value="vcf" selected="true">VCF (only if input is VCF)</option>
<option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
<option value="bed">BED</option>
<option value="bedAnn">BED annotations</option>
</param>
<when value="vcf" />
<when value="gatk">
<param name="gatk_v1" type="boolean" checked="true" label="Compatible with GATK 1.x" />
</when>
<when value="bed" />
<when value="bedAnn" />
</conditional>
<param name="csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report, useful for downstream analysis (-csvStats)" />
<param argument="-o" name="outputFormat" type="select" label="Output format" help="Specify output format">
<option value="vcf" selected="true">VCF (only if input is VCF)</option>
<option value="gatk">GATK-compatible VCF (only if input is VCF)</option>
<option value="bed">BED</option>
<option value="bedAnn">BED annotations</option>
</param>
<param argument="-csvStats" type="boolean" truevalue="-csvStats" falsevalue="" checked="false" label="Create CSV report?" help="Useful for downstream analyses and report generation" />
<param argument="-noStats" name="generate_stats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats?" help="Generates an HTML summary of results"/>
<conditional name="snpDb">
<param name="genomeSrc" type="select" label="Genome source">
<!-- These options are referenced in the help section of SnpEff download tool. If you change them, change help of SnpEff download as well -->
Expand Down Expand Up @@ -171,8 +154,7 @@
</section>
</when>
<when value="named">
<param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)">
<help>@SNPEFF_DATABASE_URL@</help>
<param name="genome_version" type="text" value="" label="Snpff Genome Version Name (e.g. GRCh38.86)" help="A list of databases can be obtained with 'snpEff download' tool">
<validator type="empty_field" message="A genome version name is required" />
</param>
</when>
Expand Down Expand Up @@ -209,18 +191,16 @@
</param>
</when>
</conditional>

<param name="udLength" argument="-ud" type="select" label="Upstream / Downstream length">
<option value="0">No upstream / downstream intervals (0 bases)</option>
<option value="0" selected="true">No upstream / downstream intervals (0 bases)</option>
<option value="200">200 bases</option>
<option value="500">500 bases</option>
<option value="1000">1000 bases</option>
<option value="2000">2000 bases</option>
<option value="5000" selected="true">5000 bases</option>
<option value="5000">5000 bases</option>
<option value="10000">10000 bases</option>
<option value="20000">20000 bases</option>
</param>

<param name="spliceSiteSize" argument="-ss" type="select" optional="true" label="Set size for splice sites (donor and acceptor) in bases">
<option value="1">1 base</option>
<option value="2" selected="true">2 bases</option>
Expand All @@ -232,7 +212,6 @@
<option value="8">8 bases</option>
<option value="9">9 bases</option>
</param>

<conditional name="spliceRegion">
<param name="setSpliceRegions" type="select" label="spliceRegion Settings">
<option value="no">Use Defaults</option>
Expand All @@ -245,7 +224,6 @@
<param argument="-spliceRegionIntronMax" type="integer" value="" min="1" max="10" optional="true" label="Set maximum number of bases for splice site region within intron. Default: 8 bases" />
</when>
</conditional>

<param name="annotations" type="select" display="checkboxes" multiple="true" label="Annotation options">
<option value="-formatEff">Use 'EFF' field compatible with older versions (instead of 'ANN')</option>
<option value="-classic">Use Classic Effect names and amino acid variant annotations (NON_SYNONYMOUS_CODING vs missense_variant and G180R vs p.Gly180Arg/c.538G>C)</option>
Expand Down Expand Up @@ -334,20 +312,13 @@
</param>
</when>
</conditional>

<param name="offset" type="select" display="radio" label="Chromosomal position">
<option value="default" selected="true">Use default (based on input type)</option>
<option value="-0">Force zero-based positions (both input and output)</option>
<option value="-1">Force one-based positions (both input and output)</option>
</param>
<param argument="-chr" type="text" label="Text to prepend to chromosome name">
<help>
By default SnpEff simplifies all chromosome names. For instance 'chr1' is just '1'.
You can prepend any string you want to the chromosome name
</help>
<validator type="regex" message="No whitespace allowed">^\S*$</validator>
</param>
<param name="generate_stats" argument="-noStats" type="boolean" truevalue="" falsevalue="-noStats" checked="true" label="Produce Summary Stats" />
<param argument="-noLog" type="boolean" truevalue="-noLog" falsevalue="" checked="true" label="Suppress reporting usage statistics to server" />
</inputs>
<outputs>
Expand Down Expand Up @@ -375,8 +346,8 @@
<param name="generate_stats" value="true"/>
<output name="snpeff_output">
<assert_contents>
<has_text_matching expression="KJ660346\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
<has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
</assert_contents>
</output>
<output name="statsFile">
Expand All @@ -398,13 +369,13 @@
<param name="csvStats" value="true"/>
<output name="snpeff_output">
<assert_contents>
<has_text_matching expression="KJ660346\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346\t1024\t.*synonymous_variant" />
<has_text_matching expression="KJ660346.1\t572\t.*missense_variant" />
<has_text_matching expression="KJ660346.1\t1024\t.*synonymous_variant" />
</assert_contents>
</output>
<output name="csvFile">
<assert_contents>
<has_n_lines n="185"/>
<has_n_lines n="134"/>
<has_n_columns n="1" sep=","/>
</assert_contents>
</output>
Expand Down
10 changes: 5 additions & 5 deletions tool_collections/snpeff/snpEff_create_db.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
<tool id="snpEff_build_gb" name="SnpEff build:" version="@WRAPPER_VERSION@.galaxy6" profile="22.01">
<tool id="snpEff_build_gb" name="SnpEff build:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> database from Genbank or GFF record</description>
<macros>
<import>snpEff_macros.xml</import>
</macros>
<requirements>
<expand macro="requirement" />
<requirement type="package" version="1.79">biopython</requirement>
<requirement type="package" version="1.84">biopython</requirement>
</requirements>
<expand macro="stdio" />
<expand macro="version_command" />
Expand Down Expand Up @@ -36,7 +36,7 @@
ln -s '${input_type.input}' 'snpeff_output/${genome_version}/genes.${input_type.input_type_selector}' &&
#end if
snpEff @JAVA_OPTIONS@ build -v
snpEff @JAVA_OPTIONS@ build -noCheckCds -noCheckProtein -v
-configOption '${genome_version}'.genome='${genome_version}'
-configOption '${genome_version}'.codonTable='${codon_table}'
#if str($input_type.input_type_selector) == "gb":
Expand Down Expand Up @@ -186,7 +186,7 @@
<help><![CDATA[
**What it does**
This tool uses `"snpEff build -genbank"` or `"snpEff build -gff3"` commands to create a snpEff database.
This tool uses `snpEff build` to create a snpEff database.
------
Expand All @@ -201,7 +201,7 @@ Using Genbank data for creating databases has several advantages:
.. class:: warningmark
SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use GFF route described below.
SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds causing SnpEff to crash. If this is happening use the GFF route described below.
-------
Expand Down
24 changes: 18 additions & 6 deletions tool_collections/snpeff/snpEff_databases.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_databases" name="SnpEff databases:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_databases" name="SnpEff databases:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> list available databases</description>
<macros>
<import>snpEff_macros.xml</import>
Expand All @@ -19,6 +19,10 @@
| grep -v '${exclude_pattern}'
#end if
#if str($include_download_path) == "no":
| cut -f 1,2,3,4
#end if
> '${snpeff_dbs}'
]]></command>
<inputs>
Expand All @@ -38,7 +42,10 @@
</valid>
</sanitizer>
</param>

<param name="include_download_path" type="select" display="radio" label="Include download paths?" help="When snpEff dumps the list of available databases, it includes their download paths. These are not needed in the Galaxy context.">
<option value="yes">Yes</option>
<option value="no" selected="true">No</option>
</param>
</inputs>
<outputs>
<data name="snpeff_dbs" format="tabular" label="${tool.name} @SNPEFF_VERSION@ available databases" />
Expand All @@ -63,12 +70,16 @@
<help><![CDATA[
**What it does**
This tool downloads the master list of snpEff databases from @SNPEFF_DATABASE_URL@. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse* the it will produce a tabular dataset with the following content::
This tool downloads the master list of snpEff databases from a remote SnpEff repository. You can then look at this list and decide which database to use for your analysis. For example, if **List entries matching the following expression** parameter of this tool is set to *Mouse*, it will produce a tabular dataset with the following content::
mm10 Mouse
mm39 Mouse
mm9 Mouse
mm10 Mouse http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm10.zip
mm9 Mouse http://downloads.sourceforge.net/project/snpeff/databases/v4_3/snpEff_v4_3_mm9.zip
This means that there are three available snpEff databases for the mouse genome. If you want to use mm39 in your analysis:
This means that there two available snpEff databases for mouse genome versions mm9 and mm10. In order to download these databases you should use identifier from the first column (e.g., mm9 or mm10 in this case).
- set **Genome source** option of **SnpEff eff** Galaxy tool to *Download on demand*
- enter 'mm39' into **Snpff Genome Version Name** text box
-------
Expand All @@ -83,6 +94,7 @@ There are two ways to use names of databases obtained with this tool in Galaxy's
@SNPEFF_IN_GALAXY_INFO@
@EXTERNAL_DOCUMENTATION@
]]></help>
<expand macro="citations" />
</tool>
Expand Down
4 changes: 2 additions & 2 deletions tool_collections/snpeff/snpEff_download.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_download" name="SnpEff download:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_download" name="SnpEff download:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description> download a pre-built database</description>
<macros>
<import>snpEff_macros.xml</import>
Expand Down Expand Up @@ -42,7 +42,7 @@ mv temp/'$genome_version' '$snpeff_db.files_path'
<help><![CDATA[
**What it does**
This tool downloads a specified database from @SNPEFF_DATABASE_URL@. It deposits it into the history.
This tool downloads a specified database from a remote SnpEff repository. It deposits it into the history.
-------
Expand Down
9 changes: 4 additions & 5 deletions tool_collections/snpeff/snpEff_macros.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<macros>
<xml name="requirement">
<requirement type="package" version="4.3.1t">snpeff</requirement>
<requirement type="package" version="5.2">snpeff</requirement>
<yield/>
</xml>
<xml name="stdio">
Expand All @@ -14,9 +14,8 @@
snpEff -version
]]></version_command>
</xml>
<token name="@WRAPPER_VERSION@">4.3+T</token>
<token name="@SNPEFF_VERSION@">SnpEff4.3</token>
<token name="@SNPEFF_DATABASE_URL@">https://sourceforge.net/projects/snpeff/files/databases/v4_3/</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@SNPEFF_VERSION@">5.2</token>
<token name="@JAVA_OPTIONS@">-Xmx\${GALAXY_MEMORY_MB:-8192}m</token>
<xml name="ref_select">
<conditional name="reference_source">
Expand Down Expand Up @@ -59,7 +58,7 @@ In you *do not see them* keep reading...

**Download pre-built databases**

SnpEff project generates large numbers of pre-build databases. These are available at @SNPEFF_DATABASE_URL@ and can downloaded. Follow these steps:
The SnpEff project generates a large number of pre-built databases. To obtain and use them, follow these steps:

#. Use **SnpEff databases** tool to generate a list of existing databases. Note the name of the database you need.
#. Use **SnpEff download** tool to download the database.
Expand Down
2 changes: 1 addition & 1 deletion tool_collections/snpeff/snpeff_get_chr_names.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@WRAPPER_VERSION@.galaxy2">
<tool id="snpEff_get_chr_names" name="SnpEff chromosome-info:" version="@SNPEFF_VERSION@+galaxy@WRAPPER_VERSION@" profile="23.0">
<description>list chromosome names/lengths</description>
<macros>
<import>snpEff_macros.xml</import>
Expand Down
Loading

0 comments on commit 4cc10cf

Please sign in to comment.