Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update proteinortho=6.3.4 #6611

Merged
merged 3 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 60 additions & 19 deletions tools/proteinortho/proteinortho.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@
2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
#if $more_options.selfblast:
&&
mv result.blast-graph_clean result.blast-graph;
mv result.blast-graph_clean result.blast-graph
#end if
#if $synteny.synteny_options == "specified":
&&
mv result.poff-graph result.proteinortho-graph &&
mv result.poff.tsv result.proteinortho.tsv &&
mv result.poff.html result.proteinortho.html ;
mv result.poff.html result.proteinortho.html
#end if
]]></command>
<inputs>
Expand All @@ -115,6 +115,8 @@
<option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
<option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
<option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
<option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
<option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
<option value="lastp">Last (aminoacid sequences)</option>
<option value="lastn">Last (nucleotide sequences)</option>
<option value="blatp">BLAT (aminoacid sequences)</option>
Expand All @@ -126,7 +128,7 @@
<param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/>
<param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
<param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/>
<param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
<param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the inital connected component." help="Overrules the -conn threshold."/>
<param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
Expand All @@ -137,7 +139,7 @@
</param>
</section>
<conditional name="synteny">
<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
<option value="no" selected="true">no</option>
<option value="specified">yes</option>
</param>
Expand Down Expand Up @@ -177,7 +179,7 @@
</data>
</outputs>
<tests>
<test expect_num_outputs="3"> <!-- test normal -->
<test expect_num_outputs="3"> <!-- test normal / default params -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="diamond"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
Expand All @@ -187,6 +189,16 @@
<has_text text="--p=diamond"/>
</assert_command>
</test>
<test expect_num_outputs="3"> <!-- test normal mmseqs -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="mmseqsp"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
<expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
<expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
<assert_command>
<has_text text="--p=mmseqsp"/>
</assert_command>
</test>
<test expect_num_outputs="3"> <!-- various parameter -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="diamond"/>
Expand Down Expand Up @@ -251,12 +263,12 @@
</test>
<test expect_num_outputs="3"> <!-- blat -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="blastp"/>
<param name="p" value="blatp"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
<expand macro="test_output_blastgraph" nlines="156" nlines_delta="50"/>
<expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/>
<expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
<expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
<assert_command>
<has_text text="--p=blastp"/>
<has_text text="--p=blatp"/>
</assert_command>
</test>
</tests>
Expand Down Expand Up @@ -285,8 +297,8 @@ Proteinortho is a tool to detect orthologous proteins/genes within different spe

* **(ii) Cluster the RBH**

| Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
| The resulting connected components are outputted in orthology-groups / -pairs
| A spectral clustering algorithm is used to remove weak connections, reducing false positives.
| The connected components from this process are output as orthology groups or pairs.

----

Expand Down Expand Up @@ -322,41 +334,70 @@ Proteinortho is a tool to detect orthologous proteins/genes within different spe

| The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
| Every line corresponds to an orthology group.
| The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general.
| The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity, the more edges there are and the better connected the group is.
| Then a column for each species follows containing the proteins of these species.
| If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
| The '*' represents that this species does not contribute to the group.

.. csv-table::

Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
4,6,0.115,*,C_12,E_315,L_313,M_313
4,5,0.167,*,C_63,E_19,L_19,M_19
4,4,0.816,*,C_64,E_18,L_18,M_18

----

| The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
| The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group: the higher the value, the more edges connect these 5 proteins (at most there can be 10, and at least 4 are needed to keep the group connected).
| The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).

.. csv-table::

seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
# ecoli.faa,human.faa
# 1.91e-112,357.5,1.825e-113,360
L_10,C_10;test,4.32e-151,447,4.30e-151,446
L_11,C_11,1.17e-68,209,3.00e-69,210
L_14,C_14,3.64e-139,422,1.19e-142,431
L_15,C_15,3.51e-100,303,2.12e-102,308
L_16,C_16,3.75e-49,157,7.06e-50,159
L_17,C_17,2.96e-195,578,5.50e-196,579

----

* **orthology-pairs**

| The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
| Similar to orthology groups, but each edge is printed individually.
| The output is formatted the same as the RBH graph.
| For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:

.. csv-table::

seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba

seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
M_313,C_12,1.18e-115,407,6.12e-116,407
C_12,E_315,4.50e-127,445,4.09e-127,445
L_313,M_313,0.00e+00,1368,0.00e+00,1368
L_313,C_12,3.76e-114,402,1.94e-114,402

----

| In particular, L_313 and M_313 are very similar, probably identical.
| The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 and L_313-E_315. This means that E_315 is connected to the other 3 proteins only via C_12 and thus could be considered a weak link in the group.

**Proteinortho-Tools for downstream analysis**

* `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
* `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.

More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho

**Citations:**

]]>
</help>
<expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. -->
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>
6 changes: 5 additions & 1 deletion tools/proteinortho/proteinortho_grab_proteins.xml
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,9 @@ proteinortho_grab_proteins : find gene(s)/protein(s) in a given fasta file and r
More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
]]>
</help>
<expand macro="citations"/>
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>
14 changes: 4 additions & 10 deletions tools/proteinortho/proteinortho_macros.xml
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
<?xml version="1.0"?>
<macros>
<token name="@TOOL_VERSION@">6.3.1</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="citations">
<citations>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
</citations>
</xml>
<token name="@TOOL_VERSION@">6.3.4</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="biotools">
<xrefs>
<xref type="bio.tools">proteinortho</xref>
Expand All @@ -22,6 +15,7 @@
<requirement type="package" version="2.15.0">blast</requirement>
<requirement type="package" version="445">ucsc-blat</requirement>
<requirement type="package" version="1519">last</requirement>
<requirement type="package" version="16.747c6">mmseqs2</requirement>
</requirements>
</xml>
<xml name="version_command">
Expand Down
6 changes: 5 additions & 1 deletion tools/proteinortho/proteinortho_summary.xml
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,9 @@ Or given 2 orthology-pairs from the same set of fasta files with different param
More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
]]>
</help>
<expand macro="citations"/>
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>
Loading