diff --git a/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/.dockstore.yml b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/.dockstore.yml new file mode 100644 index 000000000..29fc67356 --- /dev/null +++ b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/.dockstore.yml @@ -0,0 +1,15 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /Functional_annotation_of_protein_sequences.ga + testParameterFiles: + - /Functional_annotation_of_protein_sequences-tests.yml + authors: + - name: Romane Libouban + email: romane.libouban@irisa.fr + orcid: 0009-0001-4920-9951 + - name: Anthony Bretaudeau + email: anthony.bretaudeau@irisa.fr + orcid: 0000-0003-0914-2470 diff --git a/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/CHANGELOG.md b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/CHANGELOG.md new file mode 100644 index 000000000..f6ab12ac1 --- /dev/null +++ b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## [0.1] + +Initial version of the Functional annotation of protein sequences Workflow. 
\ No newline at end of file diff --git a/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences-tests.yml b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences-tests.yml new file mode 100644 index 000000000..e41298afc --- /dev/null +++ b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences-tests.yml @@ -0,0 +1,23 @@ +- doc: Test outline for Functional_annotation_of_protein_sequences.ga + job: + input: + class: File + location: https://zenodo.org/record/8414802/files/protein_sequences.fasta?download=1 + filetype: fasta + outputs: + eggNOG Mapper seed_orthologs: + location: https://zenodo.org/records/13951790/files/eggNOG_Mapper_seed_orthologs.tabular?download=1&preview=1 + compare: sim_size + delta: 50000 + eggNOG Mapper annotations: + location: https://zenodo.org/records/13951790/files/eggNOG_Mapper_annot.tabular?download=1&preview=1 + compare: sim_size + delta: 100000 + interproscan xml: + location: https://zenodo.org/records/13951790/files/interProScan.xml?download=1&preview=1 + compare: sim_size + delta: 7000000 + interproscan tabular: + location: https://zenodo.org/records/13951790/files/interProScan.tabular?download=1&preview=1 + compare: sim_size + delta: 2000000 \ No newline at end of file diff --git a/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences.ga b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences.ga new file mode 100644 index 000000000..4ad59001d --- /dev/null +++ b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/Functional_annotation_of_protein_sequences.ga @@ -0,0 +1,184 @@ +{ + "a_galaxy_workflow": "true", + 
"annotation": "This workflow uses eggNOG mapper and InterProScan for functional annotation of protein sequences.", + "comments": [ + { + "child_steps": [ + 1, + 2 + ], + "color": "green", + "data": { + "title": "Functional annotation" + }, + "id": 0, + "position": [ + 300, + 0 + ], + "size": [ + 240, + 496 + ], + "type": "frame" + } + ], + "creator": [ + { + "class": "Person", + "email": "mailto:romane.libouban@irisa.fr", + "identifier": "https://orcid.org/0009-0001-4920-9951", + "name": "Romane Libouban" + }, + { + "class": "Person", + "email": "mailto:anthony.bretaudeau@irisa.fr", + "identifier": "https://orcid.org/0000-0003-0914-2470", + "name": "Anthony Bretaudeau" + } + ], + "format-version": "0.1", + "license": "MIT", + "release": "0.1", + "name": "Functional annotation of protein sequences", + "steps": { + "0": { + "annotation": "This workflow uses eggNOG mapper and Interproscan for functional annotation of protein sequences.", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "This workflow uses eggNOG mapper and Interproscan for functional annotation of protein sequences.", + "name": "input" + } + ], + "label": "input", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0, + "top": 0 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "fb78bb38-ab6a-4676-98c5-5d3be83e7474", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "InterProScan is a tool that analyses each protein sequence from our annotation to determine if they contain one or several of the signatures from InterPro.", + "content_id": "toolshed.g2.bx.psu.edu/repos/bgruening/interproscan/interproscan/5.59-91.0+galaxy3", + "errors": null, + "id": 1, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": "InterProScan", + "name": "InterProScan", + "outputs": 
[ + { + "name": "outfile_tsv", + "type": "tabular" + }, + { + "name": "outfile_xml", + "type": "xml" + } + ], + "position": { + "left": 162.5, + "top": 279.5 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/interproscan/interproscan/5.59-91.0+galaxy3", + "tool_shed_repository": { + "changeset_revision": "74810db257cc", + "name": "interproscan", + "owner": "bgruening", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"applications\": [\"TIGRFAM\", \"FunFam\", \"SFLD\", \"SUPERFAMILY\", \"PANTHER\", \"Gene3D\", \"Hamap\", \"PrositeProfiles\", \"Coils\", \"SMART\", \"CDD\", \"PRINTS\", \"PIRSR\", \"PrositePatterns\", \"AntiFam\", \"Pfam\", \"MobiDBLite\", \"PIRSF\"], \"chromInfo\": \"/shared/ifbstor1/galaxy/mutable-config/tool-data/shared/ucsc/chrom/?.len\", \"database\": \"5.59-91.0\", \"goterms\": true, \"input\": {\"__class__\": \"ConnectedValue\"}, \"iprlookup\": false, \"licensed\": {\"use\": \"false\", \"__current_case__\": 1, \"applications_licensed\": [\"Phobius\", \"SignalP_EUK\", \"TMHMM\"]}, \"oformat\": [\"TSV\", \"XML\"], \"pathways\": true, \"seqtype\": \"p\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "5.59-91.0+galaxy3", + "type": "tool", + "uuid": "36d72511-ef8c-42ab-8944-b7aef340a9bc", + "when": null, + "workflow_outputs": [ + { + "label": "interproscan xml", + "output_name": "outfile_xml", + "uuid": "bef32b2c-0065-4854-9e5a-898e689d559c" + }, + { + "label": "interproscan tabular", + "output_name": "outfile_tsv", + "uuid": "d58da4b2-4c05-491f-8509-609184241715" + } + ] + }, + "2": { + "annotation": "EggNOG Mapper compares each protein sequence of the annotation to a huge set of ortholog groups from the EggNOG database.", + "content_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/eggnog_mapper/eggnog_mapper/2.1.8+galaxy4", + "errors": null, + "id": 2, + "input_connections": { + "ortho_method|input": { + "id": 0, + "output_name": "output" + } + }, + 
"inputs": [], + "label": "eggNOG Mapper", + "name": "eggNOG Mapper", + "outputs": [ + { + "name": "seed_orthologs", + "type": "tabular" + }, + { + "name": "annotations", + "type": "tabular" + } + ], + "position": { + "left": 340, + "top": 52 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/eggnog_mapper/eggnog_mapper/2.1.8+galaxy4", + "tool_shed_repository": { + "changeset_revision": "d9c3016f7283", + "name": "eggnog_mapper", + "owner": "galaxyp", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"annotation_options\": {\"no_annot\": \"\", \"__current_case__\": 0, \"seed_ortholog_evalue\": \"0.001\", \"seed_ortholog_score\": null, \"tax_scope\": null, \"target_orthologs\": \"all\", \"go_evidence\": \"non-electronic\"}, \"chromInfo\": \"/shared/ifbstor1/galaxy/mutable-config/tool-data/shared/ucsc/chrom/?.len\", \"eggnog_data\": \"5.0.2\", \"ortho_method\": {\"m\": \"diamond\", \"__current_case__\": 0, \"input\": {\"__class__\": \"ConnectedValue\"}, \"input_trans\": {\"itype\": \"proteins\", \"__current_case__\": 0}, \"matrix_gapcosts\": {\"matrix\": \"BLOSUM62\", \"__current_case__\": 2, \"gap_costs\": \"--gapopen 11 --gapextend 1\"}, \"sensmode\": \"sensitive\", \"dmnd_iterate\": false, \"dmnd_ignore_warnings\": false, \"query_cover\": null, \"subject_cover\": null, \"pident\": null, \"evalue\": null, \"score\": \"0.001\"}, \"output_options\": {\"no_file_comments\": false, \"report_orthologs\": false, \"md5\": false}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.1.8+galaxy4", + "type": "tool", + "uuid": "76b27114-c49f-41b8-9333-91953729dee3", + "when": null, + "workflow_outputs": [ + { + "label": "eggNOG Mapper annotations", + "output_name": "annotations", + "uuid": "25a1e387-baa5-48b6-b142-198bec95463e" + }, + { + "label": "eggNOG Mapper seed_orthologs", + "output_name": "seed_orthologs", + "uuid": "47ac8226-b800-479f-a6f0-f882bafc33d7" + } + ] + } + }, + 
"tags": [], + "uuid": "4cbba315-c9bc-4895-aeeb-57dadef3542a", + "version": 2 +} diff --git a/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/README.md b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/README.md new file mode 100644 index 000000000..64ab025be --- /dev/null +++ b/workflows/genome_annotation/functional-annotation/functional-annotation-protein-sequences/README.md @@ -0,0 +1,28 @@ +# Functional annotation of protein sequences Workflow + +This workflow uses eggNOG mapper and InterProScan for functional annotation of protein sequences. +It can be used on proteins from any organism. + +EggNOG Mapper compares each protein sequence of the annotation to a huge set of ortholog groups from the EggNOG database. In this database, each ortholog group is associated with functional annotation like Gene Ontology (GO) terms or KEGG pathways. When the protein sequence of a new gene is found to be very similar to one of these ortholog groups, the corresponding functional annotation is transferred to this new gene. + +InterProScan is a tool that analyses each protein sequence from our annotation to determine if they contain one or several of the signatures from InterPro. When a protein contains a known signature, the corresponding functional annotation will be assigned to it by InterProScan. + +## Input dataset +This workflow requires only one input file: a protein sequences file in FASTA format. + + +## Outputs for eggNOG Mapper +The output of this tool is a tabular file, where each line represents a gene from our annotation, with the functional annotation that was found by EggNOG-mapper. It includes a predicted protein name, GO terms, EC numbers, KEGG identifiers, etc. + +## Outputs for InterProScan +The output of this tool is both a tabular file and an XML file. 
Both contain the same information, but the tabular one is more readable for a human: each line represents a gene from our annotation, with the different domains and motifs that were found by InterProScan. + +Each line corresponds to a motif found in one of the annotated proteins. The most interesting columns are: +- Column 1: the protein identifier +- Column 5: the identifier of the signature that was found in the protein sequence +- Column 4: the databank where this signature comes from (InterProScan combines several motif databanks) +- Column 6: the human-readable description of the motif +- Columns 7 and 8: the position where the motif was found +- Column 9: a score for the match (if available) +- Columns 12 and 13: identifier of the signature integrated in InterPro (if available). Have a look at an example webpage for IPR036859 on InterPro. +- The following columns contain various identifiers that were assigned to the protein based on the match with the signature (Gene Ontology terms, Reactome, …)