Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EMBOSS] Add Needleall tool (v6) and bump version for needle to v6 #6643

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 55 additions & 13 deletions tools/emboss_5/emboss_needle.xml
Original file line number Diff line number Diff line change
@@ -1,18 +1,47 @@
<tool id="EMBOSS: needle56" name="needle" version="@[email protected]">
<tool id="EMBOSS: needle56" name="needle" version="@VERSION6@">
<description>Needleman-Wunsch global alignment</description>
shiltemann marked this conversation as resolved.
Show resolved Hide resolved
<expand macro="bio_tools" />
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements" />
<expand macro="bio_tools" />
<expand macro="requirements6" />
<code file="emboss_format_corrector.py" />
<command>needle -asequence '$input1' -bsequence '$input2' -outfile '$out_file1' -gapopen $gapopen -gapextend $gapextend -brief $brief -aformat3 $out_format1 -auto</command>
<version_command>needle -version</version_command>
<!--
    Command template for EMBOSS needle: aligns the dataset selected as
    asequence against the one selected as bsequence and writes the result to
    out_file1. The scoring matrix is passed only when one was selected, and the
    end-gap options only when the endgap conditional is set to 'yes'.
    A non-zero exit code is treated as a tool error.
-->
<command detect_errors="exit_code"><![CDATA[
    needle
        -asequence '$asequence'
        -bsequence '$bsequence'
        -outfile '$out_file1'
        -gapopen $gapopen
        -gapextend $gapextend
        -brief $brief
        -aformat3 $out_format1
        -auto
        #if $datafile
            -datafile $datafile
        #end if
        #if $endgap.endweight == 'yes'
            ## -endopen/-endextend are only applied when -endweight is switched on
            ## (see the EMBOSS needle documentation), so the flag must be passed too.
            ## NOTE(review): expected test outputs produced without -endweight may
            ## need to be regenerated.
            -endweight
            -endopen $endgap.endopen
            -endextend $endgap.endextend
        #end if
]]></command>
<inputs>
<param name="input1" type="data" format="fasta" label="Sequence 1" />
<param name="input2" type="data" format="fasta" label="Sequence 2" />
<param name="gapopen" type="float" value="10.0" label="Gap open penalty" />
<param name="gapextend" type="float" value="0.5" label="Gap extension penalty" />
<param name="brief" type="select" label="Brief identity and similarity">
<param argument="-asequence" type="data" format="fasta" label="Sequence 1" />
<param argument="-bsequence" type="data" format="fasta" label="Sequence 2" />
<expand macro="scoring_matrix"/>
<param argument="-gapopen" type="float" value="10.0" label="Gap open penalty" />
<param argument="-gapextend" type="float" value="0.5" label="Gap extension penalty" />
<!--
    End-gap penalty options. The endopen/endextend values are only offered
    when the user chooses to apply end gap penalties.
-->
<conditional name="endgap">
    <param argument="-endweight" type="select" label="Apply end gap penalties?" help="">
        <option value="yes">yes</option>
        <option value="no" selected="true">no</option>
    </param>
    <when value="yes">
        <param argument="-endopen" type="float" value="10.0" min="0.0" max="100.0" label="Penalty for creation of an end gap." help="The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences."/>
        <param argument="-endextend" type="float" value="0.5" min="0.0" max="10.0" label="The end gap extension penalty" help="This penalty is added to the end gap penalty for each base or residue in the end gap."/>
    </when>
    <when value="no"/>
</conditional>
<param argument="-brief" type="select" label="Brief identity and similarity">
<option value="yes">Yes</option>
<option value="no">No</option>
</param>
Expand All @@ -36,16 +65,29 @@
</outputs>
<tests>
<test>
<param name="input1" value="2.fasta"/>
<param name="input2" value="1.fasta"/>
<param name="asequence" value="2.fasta"/>
<param name="bsequence" value="1.fasta"/>
<param name="gapopen" value="10"/>
<param name="gapextend" value="0.5"/>
<param name="brief" value="yes"/>
<param name="out_format1" value="score"/>
<output name="out_file1" file="emboss_needle_out.score"/>
</test>
<test><!-- test with fasta output, custom matrix, and endgap penalties -->
    <param name="asequence" value="2.fasta"/>
    <param name="bsequence" value="1.fasta"/>
    <param name="gapopen" value="10"/>
    <param name="gapextend" value="0.5"/>
    <param name="datafile" value="EPAM30"/>
    <!-- endweight/endopen/endextend live inside the "endgap" conditional of the inputs -->
    <param name="endweight" value="yes"/>
    <param name="endopen" value="13.37"/>
    <param name="endextend" value="2.5"/>
    <param name="brief" value="yes"/>
    <param name="out_format1" value="fasta"/>
    <output name="out_file1" file="emboss_needle_out.fasta"/>
</test>
</tests>
<help>
<help><![CDATA[
.. class:: warningmark

needle reads any two sequences of the same type (DNA or protein).
Expand Down Expand Up @@ -114,6 +156,6 @@ You can view the original documentation here_.

#---------------------------------------
#---------------------------------------
</help>
]]></help>
<expand macro="citations" />
</tool>
123 changes: 123 additions & 0 deletions tools/emboss_5/emboss_needleall.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
<tool id="emboss_needleall" name="needleall" version="@VERSION6@">
    <!--
        Galaxy wrapper for EMBOSS needleall: aligns every sequence of the first
        input against every sequence of the second input.
        NOTE(review): the version attribute was garbled in the reviewed source;
        @VERSION6@ matches the needle wrapper and the requirements6 macro used
        below. Confirm whether a wrapper-revision suffix is wanted.
    -->
    <description>Many-to-many Needleman-Wunsch global alignment</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools" />
    <expand macro="requirements6" />
    <code file="emboss_format_corrector.py" />
    <version_command>needleall -version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        needleall
            -asequence '$asequence'
            -bsequence '$bsequence'
            -outfile '$out_file1'
            -gapopen $gapopen
            -gapextend $gapextend
            -brief $brief
            -aformat3 $out_format1
            -auto
            #if $datafile
                -datafile $datafile
            #end if
            #if $endgap.endweight == 'yes'
                ## -endopen/-endextend are only applied when -endweight is switched on
                ## (see the EMBOSS needleall documentation), so the flag must be passed too.
                ## NOTE(review): expected test outputs produced without -endweight may
                ## need to be regenerated.
                -endweight
                -endopen $endgap.endopen
                -endextend $endgap.endextend
            #end if
            -minscore $minscore
    ]]></command>
    <inputs>
        <param argument="-asequence" type="data" format="fasta" label="Sequence set 1" />
        <param argument="-bsequence" type="data" format="fasta" label="Sequence set 2" />
        <expand macro="scoring_matrix"/>
        <param argument="-gapopen" type="float" value="10.0" label="Gap open penalty" />
        <param argument="-gapextend" type="float" value="0.5" label="Gap extension penalty" />
        <!-- End-gap penalties are only offered when the user opts in. -->
        <conditional name="endgap">
            <param argument="-endweight" type="select" label="Apply end gap penalties?" help="">
                <option value="yes">yes</option>
                <option value="no" selected="true">no</option>
            </param>
            <when value="yes">
                <param argument="-endopen" type="float" value="10.0" min="0.0" max="100.0" label="Penalty for creation of an end gap." help="The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences."/>
                <param argument="-endextend" type="float" value="0.5" min="0.0" max="10.0" label="The end gap extension penalty" help="This penalty is added to the end gap penalty for each base or residue in the end gap."/>
            </when>
            <when value="no"/>
        </conditional>
        <param argument="-minscore" type="float" value="1.0" min="-10.0" max="100.0" label="Minimum alignment score to report an alignment." help=""/>
        <param argument="-brief" type="select" label="Brief identity and similarity">
            <option value="yes">Yes</option>
            <option value="no">No</option>
        </param>
        <param name="out_format1" type="select" label="Output alignment file format">
            <option value="srspair">SRS pair (p)</option>
            <option value="simple">Simple (m)</option>
            <option value="fasta">FASTA (m)</option>
            <option value="msf">MSF (m)</option>
            <option value="srs">SRS (m)</option>
            <option value="pair">Pair (p)</option>
            <option value="markx0">Markx0 (p)</option>
            <option value="markx1">Markx1 (p)</option>
            <option value="markx2">Markx2 (p)</option>
            <option value="markx3">Markx3 (p)</option>
            <option value="markx10">Markx10 (p)</option>
            <option value="score">Score (p)</option>
        </param>
    </inputs>
    <outputs>
        <data name="out_file1" format="needle" />
    </outputs>
    <tests>
        <test>
            <param name="asequence" value="emboss_needleall_input1.fa"/>
            <param name="bsequence" value="emboss_needleall_input2.fa"/>
            <param name="gapopen" value="10"/>
            <param name="gapextend" value="0.5"/>
            <param name="brief" value="yes"/>
            <param name="out_format1" value="score"/>
            <output name="out_file1" file="emboss_needleall_out.score"/>
        </test>
        <test><!-- test fasta output -->
            <param name="asequence" value="emboss_needleall_input1.fa"/>
            <param name="bsequence" value="emboss_needleall_input2.fa"/>
            <param name="gapopen" value="10"/>
            <param name="gapextend" value="0.5"/>
            <param name="brief" value="yes"/>
            <param name="out_format1" value="fasta"/>
            <output name="out_file1" file="emboss_needleall_out.fasta" ftype="fasta"/>
        </test>
        <test><!-- test with pair output, endgap penalties and custom scoring matrix -->
            <param name="asequence" value="emboss_needleall_input1.fa"/>
            <param name="bsequence" value="emboss_needleall_input2.fa"/>
            <param name="gapopen" value="10"/>
            <param name="gapextend" value="0.5"/>
            <param name="endweight" value="yes"/>
            <param name="endopen" value="13.37"/>
            <param name="endextend" value="2.5"/>
            <param name="brief" value="yes"/>
            <param name="datafile" value="EPAM30"/>
            <param name="out_format1" value="pair"/>
            <output name="out_file1" file="emboss_needleall_out.pair" lines_diff="10"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: warningmark

needleall reads in two nucleotide or protein sequence inputs. Both can contain one or more sequences. All sequences in the first input are aligned to all sequences in the second input.

-----

**Syntax**

This tool uses the Needleman-Wunsch global alignment algorithm to find the optimum alignment (including gaps) of two sequences when considering their entire length.

- **Optimal alignment:** Dynamic programming methods ensure the optimal global alignment by exploring all possible alignments and choosing the best.

- **The Needleman-Wunsch algorithm** is a member of the class of algorithms that can calculate the best score and alignment in the order of mn steps, (where 'n' and 'm' are the lengths of the two sequences).

- **Gap open penalty:** [10.0 for any sequence] The gap open penalty is the score taken away when a gap is created. The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences. (Floating point number from 1.0 to 100.0)

- **Gap extension penalty:** [0.5 for any sequence] The gap extension penalty is added to the standard gap penalty for each base or residue in the gap. This is how long gaps are penalized. Usually you will expect a few long gaps rather than many short gaps, so the gap extension penalty should be lower than the gap penalty. An exception is where one or both sequences are single reads with possible sequencing errors in which case you would expect many single base gaps. You can get this result by setting the gap open penalty to zero (or very low) and using the gap extension penalty to control gap scoring. (Floating point number from 0.0 to 10.0)

    ]]></help>
    <expand macro="citations" />
</tool>
86 changes: 86 additions & 0 deletions tools/emboss_5/macros.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
<macros>
<token name="@VERSION@">5.0.0</token>
<token name="@VERSION6@">6.6.0</token>
<!-- Macro "requirements": declares the emboss package at the version given by the @VERSION@ token, plus perl 5.26. -->
<xml name="requirements">
<requirements>
<requirement type="package" version="@VERSION@">emboss</requirement>
<requirement type="package" version="5.26">perl</requirement>
</requirements>
</xml>
<!-- Macro "requirements6": same as "requirements" but pins emboss to the @VERSION6@ token. -->
<xml name="requirements6">
    <requirements>
        <requirement type="package" version="@VERSION6@">emboss</requirement>
        <requirement type="package" version="5.26">perl</requirement>
    </requirements>
</xml>

<xml name="stdio">
<stdio>
<regex level="fatal_oom" match="insufficient memory available" source="both" />
Expand Down Expand Up @@ -47,4 +55,82 @@
<validator type="empty_field" />
<validator type="regex" message="Pattern must not end with backslash.">.*[^\\]$</validator>
</xml>

<!--
    Macro "scoring_matrix": optional select for the -datafile qualifier.
    Every option value is passed verbatim as the matrix name, so only
    scoring-matrix names belong here (no alignment output formats).
-->
<xml name="scoring_matrix">
    <param argument="-datafile" type="select" optional="true" label="Scoring matrix" help="This is the scoring matrix used when comparing sequences. By default it is EBLOSUM62 (for proteins) or EDNAFULL (for nucleic sequences)">
        <option value="EBLOSUM30">EBLOSUM30</option>
        <option value="EBLOSUM35">EBLOSUM35</option>
        <option value="EBLOSUM40">EBLOSUM40</option>
        <option value="EBLOSUM45">EBLOSUM45</option>
        <option value="EBLOSUM50">EBLOSUM50</option>
        <option value="EBLOSUM55">EBLOSUM55</option>
        <option value="EBLOSUM60">EBLOSUM60</option>
        <option value="EBLOSUM62">EBLOSUM62</option>
        <option value="EBLOSUM62-12">EBLOSUM62-12</option>
        <option value="EBLOSUM65">EBLOSUM65</option>
        <option value="EBLOSUM70">EBLOSUM70</option>
        <option value="EBLOSUM75">EBLOSUM75</option>
        <option value="EBLOSUM80">EBLOSUM80</option>
        <option value="EBLOSUM85">EBLOSUM85</option>
        <option value="EBLOSUM90">EBLOSUM90</option>
        <option value="EBLOSUMN">EBLOSUMN</option>
        <option value="EDNAFULL">EDNAFULL</option>
        <option value="EDNAMAT">EDNAMAT</option>
        <option value="EDNASIMPLE">EDNASIMPLE</option>
        <option value="ENUC.4.2">ENUC.4.2</option>
        <option value="ENUC.4.4">ENUC.4.4</option>
        <option value="EPAM10">EPAM10</option>
        <option value="EPAM100">EPAM100</option>
        <option value="EPAM110">EPAM110</option>
        <option value="EPAM120">EPAM120</option>
        <option value="EPAM130">EPAM130</option>
        <option value="EPAM140">EPAM140</option>
        <option value="EPAM150">EPAM150</option>
        <option value="EPAM160">EPAM160</option>
        <option value="EPAM170">EPAM170</option>
        <option value="EPAM180">EPAM180</option>
        <option value="EPAM190">EPAM190</option>
        <option value="EPAM20">EPAM20</option>
        <option value="EPAM200">EPAM200</option>
        <option value="EPAM210">EPAM210</option>
        <option value="EPAM220">EPAM220</option>
        <option value="EPAM230">EPAM230</option>
        <option value="EPAM240">EPAM240</option>
        <option value="EPAM250">EPAM250</option>
        <option value="EPAM260">EPAM260</option>
        <option value="EPAM270">EPAM270</option>
        <option value="EPAM280">EPAM280</option>
        <option value="EPAM290">EPAM290</option>
        <option value="EPAM30">EPAM30</option>
        <option value="EPAM300">EPAM300</option>
        <option value="EPAM310">EPAM310</option>
        <option value="EPAM320">EPAM320</option>
        <option value="EPAM330">EPAM330</option>
        <option value="EPAM340">EPAM340</option>
        <option value="EPAM350">EPAM350</option>
        <option value="EPAM360">EPAM360</option>
        <option value="EPAM370">EPAM370</option>
        <option value="EPAM380">EPAM380</option>
        <option value="EPAM390">EPAM390</option>
        <option value="EPAM40">EPAM40</option>
        <option value="EPAM400">EPAM400</option>
        <option value="EPAM410">EPAM410</option>
        <option value="EPAM420">EPAM420</option>
        <option value="EPAM430">EPAM430</option>
        <option value="EPAM440">EPAM440</option>
        <option value="EPAM450">EPAM450</option>
        <option value="EPAM460">EPAM460</option>
        <option value="EPAM470">EPAM470</option>
        <option value="EPAM480">EPAM480</option>
        <option value="EPAM490">EPAM490</option>
        <option value="EPAM50">EPAM50</option>
        <option value="EPAM500">EPAM500</option>
        <option value="EPAM60">EPAM60</option>
        <option value="EPAM70">EPAM70</option>
        <option value="EPAM80">EPAM80</option>
        <option value="EPAM90">EPAM90</option>
        <option value="SSSUB">SSSUB</option>
    </param>
</xml>
</macros>
Loading
Loading