From 1ce1f00375ad16a85f563522d92af784b6a25531 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Wed, 11 Dec 2024 10:30:53 +0100 Subject: [PATCH 1/7] add workflow: annotation a genome with maker --- .../annotation_maker/.dockstore.yml | 13 + .../annotation_maker/CHANGELOG.md | 5 + .../Genome_annotation_with_maker_short.ga | 817 ++++++++++++++++++ ...nome_annotation_with_maker_short_tests.yml | 104 +++ .../annotation_maker/README.md | 85 ++ 5 files changed, 1024 insertions(+) create mode 100644 workflows/genome_annotation/annotation_maker/.dockstore.yml create mode 100644 workflows/genome_annotation/annotation_maker/CHANGELOG.md create mode 100644 workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga create mode 100644 workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml create mode 100644 workflows/genome_annotation/annotation_maker/README.md diff --git a/workflows/genome_annotation/annotation_maker/.dockstore.yml b/workflows/genome_annotation/annotation_maker/.dockstore.yml new file mode 100644 index 0000000000..22f9838e5e --- /dev/null +++ b/workflows/genome_annotation/annotation_maker/.dockstore.yml @@ -0,0 +1,13 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /Genome_annotation_with_maker_short.ga + testParameterFiles: + - /Genome_annotation_with_maker_short_tests.yml + authors: + - name: Romane Libouban + email: romane.libouban@irisa.fr + orcid: 0009-0001-4920-9951 + diff --git a/workflows/genome_annotation/annotation_maker/CHANGELOG.md b/workflows/genome_annotation/annotation_maker/CHANGELOG.md new file mode 100644 index 0000000000..3594f992f1 --- /dev/null +++ b/workflows/genome_annotation/annotation_maker/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## [0.1] + +Initial version of the genome annotation workflow with maker. 
\ No newline at end of file diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga new file mode 100644 index 0000000000..380e26fec1 --- /dev/null +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga @@ -0,0 +1,817 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "This workflow uses Maker to annotate a genome.", + "comments": [ + { + "child_steps": [ + 10 + ], + "color": "black", + "data": { + "title": "Improving gene naming" + }, + "id": 6, + "position": [ + 1180, + 1100 + ], + "size": [ + 240, + 224.4 + ], + "type": "frame" + }, + { + "child_steps": [ + 0, + 1, + 2, + 3, + 4 + ], + "color": "blue", + "data": { + "title": "Inputs" + }, + "id": 0, + "position": [ + 0, + 430 + ], + "size": [ + 240, + 682.6 + ], + "type": "frame" + }, + { + "child_steps": [ + 5, + 6 + ], + "color": "green", + "data": { + "title": "Genome quality evaluation" + }, + "id": 1, + "position": [ + 430, + 0 + ], + "size": [ + 470, + 400 + ], + "type": "frame" + }, + { + "child_steps": [ + 7 + ], + "color": "yellow", + "data": { + "title": "Annotation with Maker" + }, + "id": 3, + "position": [ + 400, + 930 + ], + "size": [ + 240, + 742 + ], + "type": "frame" + }, + { + "child_steps": [ + 9 + ], + "color": "green", + "data": { + "title": "Annotation statistics" + }, + "id": 4, + "position": [ + 750, + 1040 + ], + "size": [ + 240, + 260 + ], + "type": "frame" + }, + { + "child_steps": [ + 8, + 11 + ], + "color": "turquoise", + "data": { + "title": "Evaluation - Predicted protein from annotation" + }, + "id": 2, + "position": [ + 770, + 470 + ], + "size": [ + 480, + 478.8 + ], + "type": "frame" + }, + { + "child_steps": [ + 12 + ], + "color": "pink", + "data": { + "title": "Visualization" + }, + "id": 5, + "position": [ + 1400, + 640 + ], + "size": [ + 240, + 244.8 + ], + "type": "frame" + } + ], + "creator": [ + { + "class": 
"Person", + "identifier": "0009-0001-4920-9951", + "name": "Romane Libouban" + } + ], + "format-version": "0.1", + "license": "MIT", + "release": "0.1", + "name": "Genome annotation with Maker (short)", + "report": { + "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" + }, + "steps": { + "0": { + "annotation": "Genome sequence", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "Genome sequence", + "name": "Genome sequence" + } + ], + "label": "Genome sequence", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 20, + "top": 470 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "d4a25db2-d25c-4fce-bd4e-d48cdcf73e84", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Genome assembly", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Genome assembly", + "name": "Genome assembly" + } + ], + "label": "Genome assembly", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 20, + "top": 600 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "68b09574-db05-4b22-bd53-1bca87898d91", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "Protein sequences", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Protein sequences", + "name": "Protein sequences" + } + ], + "label": "Protein sequences", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 20, + "top": 730 + }, + "tool_id": null, + "tool_state": 
"{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "51e28342-f0d8-4be6-915c-336c13e342c7", + "when": null, + "workflow_outputs": [] + }, + "3": { + "annotation": "Augustus training", + "content_id": null, + "errors": null, + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "Augustus training", + "name": "Augustus training" + } + ], + "label": "Augustus training", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 20, + "top": 850 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "062a77a0-a4c2-4ac9-a73e-489f64a82ebb", + "when": null, + "workflow_outputs": [] + }, + "4": { + "annotation": "SNAP training", + "content_id": null, + "errors": null, + "id": 4, + "input_connections": {}, + "inputs": [ + { + "description": "SNAP training", + "name": "SNAP training" + } + ], + "label": "SNAP training", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 20, + "top": 990 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "507ffef8-8c0d-4d81-a946-c79956cde331", + "when": null, + "workflow_outputs": [] + }, + "5": { + "annotation": "Fasta Statistics on the genome", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/fasta_stats/fasta-stats/2.0", + "errors": null, + "id": 5, + "input_connections": { + "fasta": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": " Fasta Statistics ", + "name": "Fasta Statistics", + "outputs": [ + { + "name": "stats_output", + "type": "tabular" + } + ], + "position": { + "left": 450, + "top": 100 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/fasta_stats/fasta-stats/2.0", + "tool_shed_repository": { + "changeset_revision": "0dbb995c7d35", + "name": "fasta_stats", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" 
+ }, + "tool_state": "{\"__input_ext\": \"input\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"fasta\": {\"__class__\": \"ConnectedValue\"}, \"gaps_option\": false, \"genome_size\": null, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.0", + "type": "tool", + "uuid": "0808d952-600b-46f0-9575-8fb8b8966c48", + "when": null, + "workflow_outputs": [ + { + "label": "fasta stats genome", + "output_name": "stats_output", + "uuid": "57ba9a76-95e9-4f6a-9ba2-bfeff0f05be8" + } + ] + }, + "6": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.7.1+galaxy0", + "errors": null, + "id": 6, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "Busco", + "outputs": [ + { + "name": "busco_sum", + "type": "txt" + }, + { + "name": "busco_table", + "type": "tabular" + }, + { + "name": "busco_missing", + "type": "tabular" + }, + { + "name": "summary_image", + "type": "png" + }, + { + "name": "busco_gff", + "type": "gff3" + } + ], + "position": { + "left": 680, + "top": 40 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.7.1+galaxy0", + "tool_shed_repository": { + "changeset_revision": "2babe6d5c561", + "name": "busco", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"adv\": {\"evalue\": \"0.001\", \"limit\": \"3\", \"contig_break\": \"10\"}, \"busco_mode\": {\"mode\": \"geno\", \"__current_case__\": 0, \"use_augustus\": {\"use_augustus_selector\": \"augustus\", \"__current_case__\": 1}}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"input\": {\"__class__\": \"ConnectedValue\"}, \"lineage\": {\"lineage_mode\": \"select_lineage\", \"__current_case__\": 1, \"lineage_dataset\": \"fungi_odb10\"}, \"lineage_conditional\": {\"selector\": \"cached\", \"__current_case__\": 0, \"cached_db\": 
\"all+2024-03-21-114020\"}, \"outputs\": [\"short_summary\", \"image\", \"gff\", \"missing\"], \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "5.7.1+galaxy0", + "type": "tool", + "uuid": "670eb200-8cae-4de7-a3e7-6c6e95b9e92f", + "when": null, + "workflow_outputs": [ + { + "label": "busco missing genome", + "output_name": "busco_missing", + "uuid": "84b2b45d-2095-4d24-898e-dba1f491f385" + }, + { + "label": "busco table genome", + "output_name": "busco_table", + "uuid": "18c8704b-f281-472c-a81e-d7b05fc98d69" + }, + { + "label": "busco sum genome", + "output_name": "busco_sum", + "uuid": "9632ecea-72ce-4cb8-8368-13a89125c141" + }, + { + "label": "busco gff genome", + "output_name": "busco_gff", + "uuid": "d67de38f-78f0-4be9-b699-945c9ccf2a87" + }, + { + "label": "busco image genome", + "output_name": "summary_image", + "uuid": "a866af88-80c4-4e61-939c-953f9a8d81fc" + } + ] + }, + "7": { + "annotation": "Annotation with Maker", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/maker/maker/2.31.11+galaxy2", + "errors": null, + "id": 7, + "input_connections": { + "abinitio_gene_prediction|aug_prediction|augustus_model": { + "id": 3, + "output_name": "output" + }, + "abinitio_gene_prediction|snaphmm": { + "id": 4, + "output_name": "output" + }, + "est_evidences|est": { + "id": 1, + "output_name": "output" + }, + "genome": { + "id": 0, + "output_name": "output" + }, + "protein_evidences|protein": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Maker", + "name": "abinitio_gene_prediction" + }, + { + "description": "runtime parameter for tool Maker", + "name": "est_evidences" + }, + { + "description": "runtime parameter for tool Maker", + "name": "protein_evidences" + } + ], + "label": "Maker", + "name": "Maker", + "outputs": [ + { + "name": "output_gff", + "type": "gff3" + }, + { + "name": "output_evidences", + "type": "gff3" + }, + { + "name": "output_full", + "type": "gff3" + } + ], 
+ "position": { + "left": 420, + "top": 970 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/maker/maker/2.31.11+galaxy2", + "tool_shed_repository": { + "changeset_revision": "370c210d9541", + "name": "maker", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"abinitio_gene_prediction\": {\"snaphmm\": {\"__class__\": \"ConnectedValue\"}, \"aug_prediction\": {\"augustus_mode\": \"history\", \"__current_case__\": 1, \"augustus_model\": {\"__class__\": \"ConnectedValue\"}}, \"unmask\": false}, \"advanced\": {\"fix_nucleotides\": false, \"other_gff\": null, \"alt_peptide\": \"C\", \"max_dna_len\": \"100000\", \"min_contig\": \"1\", \"pred_flank\": \"200\", \"pred_stats\": false, \"AED_threshold\": \"1.0\", \"min_protein\": \"0\", \"alt_splice\": false, \"always_complete\": false, \"map_forward\": false, \"keep_preds\": \"0.0\", \"split_hit\": \"10000\", \"correct_est_fusion\": false, \"single_exon\": {\"single_exon\": \"0\", \"__current_case__\": 0}}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"est_evidences\": {\"est2genome\": false, \"est\": {\"__class__\": \"ConnectedValue\"}, \"altest\": null, \"est_gff\": null, \"altest_gff\": null}, \"gene_prediction\": {\"pred_gff\": null, \"model_gff\": null, \"trna\": false, \"snoscan_rrna\": null}, \"genome\": {\"__class__\": \"ConnectedValue\"}, \"license_agreement\": true, \"organism_type\": \"eukaryotic\", \"protein_evidences\": {\"protein2genome\": false, \"protein\": {\"__class__\": \"ConnectedValue\"}, \"protein_gff\": null}, \"reannotation\": {\"reannotate\": \"no\", \"__current_case__\": 0}, \"repeat_masking\": {\"repeat_source\": {\"source_type\": \"no\", \"__current_case__\": 3}}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.31.11+galaxy2", + "type": "tool", + "uuid": "88a77e0a-3060-46e2-a885-0644ee1d2d8e", + "when": null, + "workflow_outputs": [ + { + "label": "maker 
evidences", + "output_name": "output_evidences", + "uuid": "9051492b-9f9a-47cf-8308-759833f99979" + }, + { + "label": "maker full", + "output_name": "output_full", + "uuid": "81a7537f-174d-4db3-a518-c3bf45ba4857" + }, + { + "label": "maker gff", + "output_name": "output_gff", + "uuid": "00c3d37b-dfaa-474d-9cd1-c878eedb032e" + } + ] + }, + "8": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/gffread/gffread/2.2.1.4+galaxy0", + "errors": null, + "id": 8, + "input_connections": { + "input": { + "id": 7, + "output_name": "output_gff" + }, + "reference_genome|genome_fasta": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool gffread", + "name": "reference_genome" + } + ], + "label": "GFFRead", + "name": "gffread", + "outputs": [ + { + "name": "output_exons", + "type": "fasta" + } + ], + "position": { + "left": 790, + "top": 550 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/gffread/gffread/2.2.1.4+galaxy0", + "tool_shed_repository": { + "changeset_revision": "3e436657dcd0", + "name": "gffread", + "owner": "devteam", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"chr_replace\": null, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"decode_url\": true, \"expose\": true, \"filtering\": null, \"full_gff_attribute_preservation\": true, \"gffs\": {\"gff_fmt\": \"none\", \"__current_case__\": 0}, \"input\": {\"__class__\": \"ConnectedValue\"}, \"maxintron\": null, \"merging\": {\"merge_sel\": \"none\", \"__current_case__\": 0}, \"reference_genome\": {\"source\": \"history\", \"__current_case__\": 2, \"genome_fasta\": {\"__class__\": \"ConnectedValue\"}, \"ref_filtering\": null, \"fa_outputs\": [\"-w exons.fa\"]}, \"region\": {\"region_filter\": \"none\", \"__current_case__\": 0}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.2.1.4+galaxy0", + "type": "tool", + 
"uuid": "b2ddf33f-d870-41cf-88e1-2ecafe34a2d3", + "when": null, + "workflow_outputs": [ + { + "label": "gffread exons", + "output_name": "output_exons", + "uuid": "03ea976d-027f-49a2-8495-b95e1750ba2d" + } + ] + }, + "9": { + "annotation": "Genome annotation statistics on the maker's annotation", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/jcvi_gff_stats/jcvi_gff_stats/0.8.4", + "errors": null, + "id": 9, + "input_connections": { + "gff": { + "id": 7, + "output_name": "output_gff" + }, + "ref_genome|genome": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Genome annotation statistics", + "name": "ref_genome" + } + ], + "label": "Genome annotation statistics", + "name": "Genome annotation statistics", + "outputs": [ + { + "name": "summary", + "type": "txt" + }, + { + "name": "graphs", + "type": "pdf" + } + ], + "position": { + "left": 770, + "top": 1080 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/jcvi_gff_stats/jcvi_gff_stats/0.8.4", + "tool_shed_repository": { + "changeset_revision": "8cffbd184762", + "name": "jcvi_gff_stats", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"gff\": {\"__class__\": \"ConnectedValue\"}, \"ref_genome\": {\"genome_type_select\": \"history\", \"__current_case__\": 1, \"genome\": {\"__class__\": \"ConnectedValue\"}}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.8.4", + "type": "tool", + "uuid": "13b8f842-4061-4fce-8beb-610000044cdf", + "when": null, + "workflow_outputs": [ + { + "label": "graphs genome", + "output_name": "graphs", + "uuid": "3992f3ad-3cf7-45ad-b31d-e324c35468f6" + }, + { + "label": "summary genome", + "output_name": "summary", + "uuid": "8ae8f720-48aa-484b-a9f2-42420db1caee" + } + ] + }, + "10": { + "annotation": "", + "content_id": 
"toolshed.g2.bx.psu.edu/repos/iuc/maker_map_ids/maker_map_ids/2.31.11", + "errors": null, + "id": 10, + "input_connections": { + "maker_gff": { + "id": 7, + "output_name": "output_gff" + } + }, + "inputs": [], + "label": "Map annotation ids", + "name": "Map annotation ids", + "outputs": [ + { + "name": "renamed", + "type": "gff" + }, + { + "name": "id_map", + "type": "tabular" + } + ], + "position": { + "left": 1200, + "top": 1140 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/maker_map_ids/maker_map_ids/2.31.11", + "tool_shed_repository": { + "changeset_revision": "e906fa778440", + "name": "maker_map_ids", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"justify\": \"6\", \"maker_gff\": {\"__class__\": \"ConnectedValue\"}, \"prefix\": \"TEST_\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.31.11", + "type": "tool", + "uuid": "2e2af2b8-1bbb-4b0e-9f3d-9f9fe41213cd", + "when": null, + "workflow_outputs": [ + { + "label": "renamed gff3", + "output_name": "renamed", + "uuid": "d20f31b9-834c-4bff-b48d-e1892e740897" + }, + { + "label": "id map", + "output_name": "id_map", + "uuid": "ff088733-2509-4811-af89-3017731e341f" + } + ] + }, + "11": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.7.1+galaxy0", + "errors": null, + "id": 11, + "input_connections": { + "input": { + "id": 8, + "output_name": "output_exons" + } + }, + "inputs": [], + "label": null, + "name": "Busco", + "outputs": [ + { + "name": "busco_sum", + "type": "txt" + }, + { + "name": "busco_table", + "type": "tabular" + }, + { + "name": "busco_missing", + "type": "tabular" + }, + { + "name": "summary_image", + "type": "png" + }, + { + "name": "busco_gff", + "type": "gff3" + } + ], + "position": { + "left": 1030, + "top": 510 + }, + "post_job_actions": {}, + "tool_id": 
"toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.7.1+galaxy0", + "tool_shed_repository": { + "changeset_revision": "2babe6d5c561", + "name": "busco", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"adv\": {\"evalue\": \"0.001\", \"limit\": \"3\", \"contig_break\": \"10\"}, \"busco_mode\": {\"mode\": \"tran\", \"__current_case__\": 1}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"input\": {\"__class__\": \"ConnectedValue\"}, \"lineage\": {\"lineage_mode\": \"select_lineage\", \"__current_case__\": 1, \"lineage_dataset\": \"fungi_odb10\"}, \"lineage_conditional\": {\"selector\": \"cached\", \"__current_case__\": 0, \"cached_db\": \"all+2024-03-21-114020\"}, \"outputs\": [\"short_summary\", \"missing\", \"image\", \"gff\"], \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "5.7.1+galaxy0", + "type": "tool", + "uuid": "ba1f8c50-d478-45a2-bcf9-3bba389ba763", + "when": null, + "workflow_outputs": [ + { + "label": "busco table predicted proteins", + "output_name": "busco_table", + "uuid": "65394361-dd3c-483e-adbd-82e3777a24ce" + }, + { + "label": "busco missing predicted proteins", + "output_name": "busco_missing", + "uuid": "9a72bf77-47b8-457e-9147-349a75cd00cf" + }, + { + "label": "busco image predicted proteins", + "output_name": "summary_image", + "uuid": "45dccf85-b254-4da5-bf65-ca33897043e1" + }, + { + "label": "busco gff predicted proteins", + "output_name": "busco_gff", + "uuid": "55ca4792-9cc7-4e11-8e98-cde89a980f9b" + }, + { + "label": "busco sum predicted proteins", + "output_name": "busco_sum", + "uuid": "bd03a2eb-813e-4bc3-a80d-2c32ffe1ff5b" + } + ] + }, + "12": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.11+galaxy1", + "errors": null, + "id": 12, + "input_connections": { + "reference_genome|genome": { + "id": 0, + "output_name": "output" + }, + "track_groups_0|data_tracks_0|data_format|annotation": { + 
"id": 10, + "output_name": "renamed" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool JBrowse", + "name": "reference_genome" + } + ], + "label": null, + "name": "JBrowse", + "outputs": [ + { + "name": "output", + "type": "html" + } + ], + "position": { + "left": 1420, + "top": 680 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.11+galaxy1", + "tool_shed_repository": { + "changeset_revision": "a6e57ff585c0", + "name": "jbrowse", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"fasta\", \"action\": {\"action_select\": \"create\", \"__current_case__\": 0}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"gencode\": \"1\", \"jbgen\": {\"defaultLocation\": \"\", \"trackPadding\": \"20\", \"shareLink\": true, \"aboutDescription\": \"\", \"show_tracklist\": true, \"show_nav\": true, \"show_overview\": true, \"show_menu\": true, \"hideGenomeOptions\": false}, \"plugins\": {\"BlastView\": true, \"ComboTrackSelector\": false, \"GCContent\": false}, \"reference_genome\": {\"genome_type_select\": \"history\", \"__current_case__\": 1, \"genome\": {\"__class__\": \"ConnectedValue\"}}, \"standalone\": \"minimal\", \"track_groups\": [{\"__index__\": 0, \"category\": \"Maker annotation\", \"data_tracks\": [{\"__index__\": 0, \"data_format\": {\"data_format_select\": \"gene_calls\", \"__current_case__\": 2, \"annotation\": {\"__class__\": \"ConnectedValue\"}, \"match_part\": {\"match_part_select\": false, \"__current_case__\": 1}, \"index\": false, \"track_config\": {\"track_class\": \"NeatHTMLFeatures/View/Track/NeatFeatures\", \"__current_case__\": 3, \"html_options\": {\"topLevelFeatures\": null}}, \"jbstyle\": {\"style_classname\": \"feature\", \"style_label\": \"product,name,id\", \"style_description\": \"note,description\", \"style_height\": \"10px\", \"max_height\": \"600\"}, \"jbcolor_scale\": {\"color_score\": 
{\"color_score_select\": \"none\", \"__current_case__\": 0, \"color\": {\"color_select\": \"automatic\", \"__current_case__\": 0}}}, \"jb_custom_config\": {\"option\": []}, \"jbmenu\": {\"track_menu\": []}, \"track_visibility\": \"default_off\", \"override_apollo_plugins\": \"False\", \"override_apollo_drag\": \"False\"}}]}], \"uglyTestingHack\": \"\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.16.11+galaxy1", + "type": "tool", + "uuid": "6972ade5-efc7-4987-86ea-0f38b92351e1", + "when": null, + "workflow_outputs": [] + } + }, + "tags": [], + "uuid": "fa7c5279-e07d-4393-9ec1-fc66241acbcd", + "version": 6 +} \ No newline at end of file diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml new file mode 100644 index 0000000000..a3fc48e6f6 --- /dev/null +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -0,0 +1,104 @@ +- doc: Test outline for Genome_annotation_with_maker_short.ga + job: + Genome sequence: + class: File + location: https://zenodo.org/records/13987091/files/S_pombe_chrIII_genome.fasta + filetype: fasta + Genome assembly: + class: File + location: https://zenodo.org/records/13987091/files/S_pombe_trinity_assembly.fasta + filetype: fasta + Protein sequences: + class: File + location: https://zenodo.org/records/13987091/files/Swissprot_no_S_pombe.fasta + filetype: fasta + Augustus training: + class: File + location: https://zenodo.org/records/13987091/files/augustus_training.tar.gz.augustus + filetype: augustus + SNAP training: + class: File + location: https://zenodo.org/records/13987091/files/snap_training.snaphmm + filetype: snaphmm + + + outputs: + fasta stats genome: + location: https://zenodo.org/records/14276084/files/Fasta_Statistics_summary_stats.tabular?download=1 + compare: sim_size + delta: 30000 + + busco sum genome: + 
location: https://zenodo.org/records/14276084/files/Busco_genome_short_summary.txt?download=1 + compare: sim_size + delta: 30000 + busco table genome: + location: https://zenodo.org/records/14276084/files/Busco_genome_full_table.tabular?download=1 + compare: sim_size + delta: + busco missing genome: + location: https://zenodo.org/records/14276084/files/Busco_genome_missing_buscos.tabular?download=1 + compare: sim_size + delta: 30000 + + + gffread exons: + location: https://zenodo.org/records/14276084/files/gffread_exons.fasta?download=1 + compare: sim_size + delta: 30000 + + maker gff: + location: https://zenodo.org/records/14276084/files/Maker_final_annotation.gff3?download=1 + compare: sim_size + delta: 30000 + maker evidences: + location: https://zenodo.org/records/14276084/files/Maker_evidences.gff3?download=1 + compare: sim_size + delta: 30000 + maker full: + location: https://zenodo.org/records/14276084/files/Maker_full_gff_evidences_final_annotation.gff3?download=1 + compare: sim_size + delta: 30000 + + summary genome: + location: https://zenodo.org/records/14276084/files/Genome_annotation_statistics_summary.txt?download=1 + compare: sim_size + delta: 30000 + graphs genome: + location: https://zenodo.org/records/14276084/files/Genome_annotation_statistics_graphs.pdf?download=1 + compare: sim_size + delta: 30000 + + renamed gff3: + location: https://zenodo.org/records/14276084/files/Map_annotation_ids_renamed.gff?download=1 + compare: sim_size + delta: 30000 + id map: + location: https://zenodo.org/records/14276084/files/Map_annotation_ids_ID_map.tabular?download=1 + compare: sim_size + delta: 30000 + + busco missing predicted proteins: + location: https://zenodo.org/records/14276084/files/Busco_proteins_missing_buscos.tabular?download=1 + compare: sim_size + delta: 30000 + busco sum predicted proteins: + location: https://zenodo.org/records/14276084/files/Busco_proteins_short_summary.txt?download=1 + compare: sim_size + delta: 30000 + busco table predicted 
proteins: + location: https://zenodo.org/records/14276084/files/Busco_proteins_full_table.tabular?download=1 + compare: sim_size + delta: 30000 + busco image predicted proteins: + location: + compare: sim_size + delta: 30000 + busco gff predicted proteins: + location: https://zenodo.org/records/14276084/files/Busco_proteins_GFF.gff3?download=1 + compare: sim_size + delta: 30000 + + + + diff --git a/workflows/genome_annotation/annotation_maker/README.md b/workflows/genome_annotation/annotation_maker/README.md new file mode 100644 index 0000000000..1c3c48c113 --- /dev/null +++ b/workflows/genome_annotation/annotation_maker/README.md @@ -0,0 +1,85 @@ +# Genome annotation workflow with Maker + +This workflow allows you to annotate a genome with Maker and to evaluate the quality of the annotation using BUSCO and genome annotation statistics. + +**Maker** is a genome model prediction software that uses ab initio predictors (SNAP and Augustus) to improve its predictions. Maker is capable of annotating both prokaryotes and eukaryotes. It works by aligning as much evidence as possible along the genome sequence, then reconciling all these signals to determine likely genetic structures. + + +To assess the quality of the annotation, different tools will be used: +- **Fasta Statistics** is used to assess the quality of the genome. +- **BUSCO (Benchmarking Universal Single-Copy Orthologs)**: is a tool for assessing the quality of a genome assembly or genome annotation. By comparing the genomes of various more or less related species, the authors have determined sets of orthologous genes that are present in a single copy in (almost) all the species in a clade (Bacteria, Fungi, Plants, Insects, Mammals, etc.). Most of these genes are essential to the life of the organism and should be found in any newly sequenced and annotated genome of the corresponding clade. 
Using this data, BUSCO is able to assess the proportion of these essential genes (also known as BUSCOs) found in a set of (predicted) transcripts or protein sequences. This is a good assessment of the 'completeness' of the annotation. +- **Genome Annotation Statistics**: is a program designed to analyse and provide statistics on genome annotations. This software analyses a GFF3 file. + +To improve gene denomination, different tools will be used: +- **gffread**: to extract the predicted protein sequences from the annotation (i.e. the Maker annotation). +- **Map annotation ids**: to automatically assign more readable names. + + +The final step is to visualise the generated annotation using a genome browser such as **JBrowse**. This browser allows navigation along the chromosomes of the genome and visualisation of the structure of each predicted gene. + +## Input dataset for Fasta Statistics +An input file: the genome sequence in fasta format. +## Output dataset for Fasta Statistics +The output file is a tabular file with several statistics: +- num_seq: the number of contigs (or scaffolds or chromosomes), compare it to expected chromosome numbers +- len_min, len_max, len_N50, len_mean, len_median: the distribution of contig sizes +- num_bp_not_N: the number of bases that are not N, it should be as close as possible to the total number of bases (num_bp) + +## Input dataset for BUSCO on the genome +An input file: the genome sequence in fasta format. 
+## Output dataset for BUSCO on the genome +Three outputs are generated: +- A short summary: summarizes the results of BUSCO (see below) +- A full table: lists all the BUSCOs that were searched for, with the corresponding status +- A table of missing BUSCOs: this is the list of all genes that were not found in the genome + +## Input dataset for Maker +Four inputs are required: +- Genome sequence in fasta format +- Protein sequences aligned with the genome in fasta format +- Ab-initio gene prediction +- EST evidences +## Output dataset for Maker +Three outputs are generated: +- The final annotation: the final consensus gene models produced by Maker +- The evidences: the alignments of all the data Maker used to construct the final annotation (ESTs and proteins that we used) +- A GFF3 file containing both the final annotation and the evidences + +## Input dataset for Genome annotation statistics +Two input files are required: +- The final annotation in gff format (i.e. annotation of the genome using Maker) +- The reference genome sequence in fasta format +## Output dataset for Genome annotation statistics +Two output files are generated: +- a file containing graphs in pdf format +- a summary in txt format + +## Input dataset for GFFread +In this workflow, GFFRead requires two inputs: +- an annotation file in GFF3 format (i.e. annotation of the genome using Maker) +- the genome sequence in fasta format +## Output dataset for GFFread +In this workflow, a unique output will be generated. This file, in fasta format, contains the protein sequences predicted from the annotation. 
+ +## Input dataset for BUSCO on the proteome +An input file: the file in fasta format with the exon sequences from the GFFRead tool +## Output dataset for BUSCO on the proteome +Three outputs are generated: +- A short summary: summarizes the results of BUSCO (see below) +- A full table: lists all the BUSCOs that were searched for, with the corresponding status +- A table of missing BUSCOs: this is the list of all genes that were not found in the genome + +## Input dataset for Map annotation ids +A file is required: the final annotation in gff format (i.e. annotation of the genome using Maker) +## Output dataset for Map annotation ids +Two output files are generated: +- a GFF file +- a tabular file + +## Input dataset for JBrowse +JBrowse requires two inputs: +- the genome sequence in fasta format +- the annotation file in gff3 format, generated by Map annotation ids + +## Output dataset for JBrowse +An html file is generated for browsing the genome. From 356d164fbbae7867f980cf8dc370ab5c7e53f6c3 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Tue, 17 Dec 2024 15:10:04 +0100 Subject: [PATCH 2/7] some modification --- .../Genome_annotation_with_maker_short.ga | 135 +++++++++--------- ...nome_annotation_with_maker_short_tests.yml | 117 +++++++++------ .../annotation_maker/README.md | 112 +++++---------- 3 files changed, 172 insertions(+), 192 deletions(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga index 380e26fec1..dd8247d560 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short.ga @@ -4,20 +4,20 @@ "comments": [ { "child_steps": [ - 10 + 12 ], - "color": "black", + "color": "pink", "data": { - "title": "Improving gene naming" + "title": "Visualization" }, - "id": 6, + "id": 5, "position": [ - 1180, - 1100 + 
1400, + 640 ], "size": [ 240, - 224.4 + 244.8 ], "type": "frame" }, @@ -46,8 +46,8 @@ }, { "child_steps": [ - 5, - 6 + 6, + 5 ], "color": "green", "data": { @@ -64,6 +64,26 @@ ], "type": "frame" }, + { + "child_steps": [ + 8, + 11 + ], + "color": "turquoise", + "data": { + "title": "Evaluation - Predicted protein from annotation" + }, + "id": 2, + "position": [ + 770, + 470 + ], + "size": [ + 480, + 478.8 + ], + "type": "frame" + }, { "child_steps": [ 7 @@ -104,40 +124,20 @@ }, { "child_steps": [ - 8, - 11 - ], - "color": "turquoise", - "data": { - "title": "Evaluation - Predicted protein from annotation" - }, - "id": 2, - "position": [ - 770, - 470 - ], - "size": [ - 480, - 478.8 - ], - "type": "frame" - }, - { - "child_steps": [ - 12 + 10 ], - "color": "pink", + "color": "black", "data": { - "title": "Visualization" + "title": "Improving gene naming" }, - "id": 5, + "id": 6, "position": [ - 1400, - 640 + 1180, + 1100 ], "size": [ 240, - 244.8 + 224.4 ], "type": "frame" } @@ -348,7 +348,12 @@ "output_name": "output" } }, - "inputs": [], + "inputs": [ + { + "description": "runtime parameter for tool Busco", + "name": "input" + } + ], "label": null, "name": "Busco", "outputs": [ @@ -385,22 +390,12 @@ "owner": "iuc", "tool_shed": "toolshed.g2.bx.psu.edu" }, - "tool_state": "{\"__input_ext\": \"input\", \"adv\": {\"evalue\": \"0.001\", \"limit\": \"3\", \"contig_break\": \"10\"}, \"busco_mode\": {\"mode\": \"geno\", \"__current_case__\": 0, \"use_augustus\": {\"use_augustus_selector\": \"augustus\", \"__current_case__\": 1}}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"input\": {\"__class__\": \"ConnectedValue\"}, \"lineage\": {\"lineage_mode\": \"select_lineage\", \"__current_case__\": 1, \"lineage_dataset\": \"fungi_odb10\"}, \"lineage_conditional\": {\"selector\": \"cached\", \"__current_case__\": 0, \"cached_db\": \"all+2024-03-21-114020\"}, \"outputs\": [\"short_summary\", \"image\", \"gff\", \"missing\"], \"__page__\": null, 
\"__rerun_remap_job_id__\": null}", + "tool_state": "{\"adv\": {\"evalue\": \"0.001\", \"limit\": \"3\", \"contig_break\": \"10\"}, \"busco_mode\": {\"mode\": \"tran\", \"__current_case__\": 1}, \"input\": {\"__class__\": \"RuntimeValue\"}, \"lineage\": {\"lineage_mode\": \"select_lineage\", \"__current_case__\": 1, \"lineage_dataset\": \"fungi_odb10\"}, \"lineage_conditional\": {\"selector\": \"cached\", \"__current_case__\": 0, \"cached_db\": \"all+2024-03-21-114020\"}, \"outputs\": [\"short_summary\", \"image\", \"gff\", \"missing\"], \"__page__\": null, \"__rerun_remap_job_id__\": null}", "tool_version": "5.7.1+galaxy0", "type": "tool", "uuid": "670eb200-8cae-4de7-a3e7-6c6e95b9e92f", "when": null, "workflow_outputs": [ - { - "label": "busco missing genome", - "output_name": "busco_missing", - "uuid": "84b2b45d-2095-4d24-898e-dba1f491f385" - }, - { - "label": "busco table genome", - "output_name": "busco_table", - "uuid": "18c8704b-f281-472c-a81e-d7b05fc98d69" - }, { "label": "busco sum genome", "output_name": "busco_sum", @@ -415,6 +410,16 @@ "label": "busco image genome", "output_name": "summary_image", "uuid": "a866af88-80c4-4e61-939c-953f9a8d81fc" + }, + { + "label": "busco table genome", + "output_name": "busco_table", + "uuid": "18c8704b-f281-472c-a81e-d7b05fc98d69" + }, + { + "label": "busco missing genome", + "output_name": "busco_missing", + "uuid": "84b2b45d-2095-4d24-898e-dba1f491f385" } ] }, @@ -493,6 +498,11 @@ "uuid": "88a77e0a-3060-46e2-a885-0644ee1d2d8e", "when": null, "workflow_outputs": [ + { + "label": "maker gff", + "output_name": "output_gff", + "uuid": "00c3d37b-dfaa-474d-9cd1-c878eedb032e" + }, { "label": "maker evidences", "output_name": "output_evidences", @@ -502,11 +512,6 @@ "label": "maker full", "output_name": "output_full", "uuid": "81a7537f-174d-4db3-a518-c3bf45ba4857" - }, - { - "label": "maker gff", - "output_name": "output_gff", - "uuid": "00c3d37b-dfaa-474d-9cd1-c878eedb032e" } ] }, @@ -735,16 +740,6 @@ "uuid": 
"ba1f8c50-d478-45a2-bcf9-3bba389ba763", "when": null, "workflow_outputs": [ - { - "label": "busco table predicted proteins", - "output_name": "busco_table", - "uuid": "65394361-dd3c-483e-adbd-82e3777a24ce" - }, - { - "label": "busco missing predicted proteins", - "output_name": "busco_missing", - "uuid": "9a72bf77-47b8-457e-9147-349a75cd00cf" - }, { "label": "busco image predicted proteins", "output_name": "summary_image", @@ -759,6 +754,16 @@ "label": "busco sum predicted proteins", "output_name": "busco_sum", "uuid": "bd03a2eb-813e-4bc3-a80d-2c32ffe1ff5b" + }, + { + "label": "busco table predicted proteins", + "output_name": "busco_table", + "uuid": "65394361-dd3c-483e-adbd-82e3777a24ce" + }, + { + "label": "busco missing predicted proteins", + "output_name": "busco_missing", + "uuid": "9a72bf77-47b8-457e-9147-349a75cd00cf" } ] }, @@ -812,6 +817,6 @@ } }, "tags": [], - "uuid": "fa7c5279-e07d-4393-9ec1-fc66241acbcd", - "version": 6 + "uuid": "8ef0e1cf-c22b-4452-9581-d364e32635e9", + "version": 1 } \ No newline at end of file diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index a3fc48e6f6..5de2382a6e 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -24,80 +24,103 @@ outputs: fasta stats genome: - location: https://zenodo.org/records/14276084/files/Fasta_Statistics_summary_stats.tabular?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 30 busco sum genome: - location: https://zenodo.org/records/14276084/files/Busco_genome_short_summary.txt?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 2O + - has_text: + text: "BUSCO version is: 5.7.1" + text: "The lineage dataset is: fungi_odb10" busco table genome: - 
location: https://zenodo.org/records/14276084/files/Busco_genome_full_table.tabular?download=1 - compare: sim_size - delta: + asserts: + - has_text: + text: "BUSCO version is: 5.7.1" + text: "The lineage dataset is: fungi_odb10" busco missing genome: - location: https://zenodo.org/records/14276084/files/Busco_genome_missing_buscos.tabular?download=1 - compare: sim_size - delta: 30000 - + asserts: + - has_text: + text: "BUSCO version is: 5.7.1" + text: "The lineage dataset is: fungi_odb10" gffread exons: - location: https://zenodo.org/records/14276084/files/gffread_exons.fasta?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_text: + text: ">snap-NC_003421.2-processed-gene-0.2-mRNA-1 CDS=1-441 Name=snap-NC_003421.2-processed-gene-0.2-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|0|0|0|1|1|3|0|146" + text: "ATGATAGGAAGAGCCGACATCGAAGAATCAAAAAGCAACGTCGCTATGAACGCTTGGCTGCCACAAGCCA" maker gff: - location: https://zenodo.org/records/14276084/files/Maker_final_annotation.gff3?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 6253 + - has_text: + text: "##gff-version 3" + text: "ID=NC_003421.2;Name=NC_003421.2" maker evidences: - location: https://zenodo.org/records/14276084/files/Maker_evidences.gff3?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 74613 + - has_text: + text: "##gff-version 3" + text: " ID=NC_003421.2;Name=NC_003421.2" maker full: - location: https://zenodo.org/records/14276084/files/Maker_full_gff_evidences_final_annotation.gff3?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_text: + text: "##gff-version 3" + text: "ID=maker-NC_003421.2-augustus-gene-0.47;Name=maker-NC_003421.2-augustus-gene-0.47" summary genome: - location: https://zenodo.org/records/14276084/files/Genome_annotation_statistics_summary.txt?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 27 graphs genome: location: 
https://zenodo.org/records/14276084/files/Genome_annotation_statistics_graphs.pdf?download=1 compare: sim_size delta: 30000 renamed gff3: - location: https://zenodo.org/records/14276084/files/Map_annotation_ids_renamed.gff?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 6253 + - has_text: + text: "##gff-version 3" + text: "ID=TEST_000012-RA:exon:0;Parent=TEST_000012-RA" id map: - location: https://zenodo.org/records/14276084/files/Map_annotation_ids_ID_map.tabular?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 1728 + - has_text: + text: "snap-NC_003421.2-processed-gene-0.0" + text: "TEST_000008" busco missing predicted proteins: - location: https://zenodo.org/records/14276084/files/Busco_proteins_missing_buscos.tabular?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_text: + text: "BUSCO version is: 5.7.1" + text: "The lineage dataset is: fungi_odb10" + text: "106281at4751" busco sum predicted proteins: - location: https://zenodo.org/records/14276084/files/Busco_proteins_short_summary.txt?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_n_lines: + n: 20 + - has_text: + text: "BUSCO version is: 5.7.1" busco table predicted proteins: - location: https://zenodo.org/records/14276084/files/Busco_proteins_full_table.tabular?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_text: + text: "BUSCO version is: 5.7.1" + text: "The lineage dataset is: fungi_odb10" busco image predicted proteins: - location: compare: sim_size delta: 30000 busco gff predicted proteins: - location: https://zenodo.org/records/14276084/files/Busco_proteins_GFF.gff3?download=1 - compare: sim_size - delta: 30000 + asserts: + - has_text: + text: "gff-version 3" + text: "snap-NC_003421.2-processed-gene-12.60-mRNA-1" diff --git a/workflows/genome_annotation/annotation_maker/README.md b/workflows/genome_annotation/annotation_maker/README.md index 1c3c48c113..aeedd1f56f 100644 --- 
a/workflows/genome_annotation/annotation_maker/README.md +++ b/workflows/genome_annotation/annotation_maker/README.md @@ -1,85 +1,37 @@ # Genome annotation workflow with Maker -This workflow allows to annotate a genome with Maker and to evaluate the quality of the annotation using BUSCO and genome annotation statistics. +This workflow allows for genome annotation using Maker and evaluates the quality of the annotation with BUSCO and genome annotation statistics. The annotation can then be improved, standardized, and visualized with additional tools. **Maker** is a genome model prediction software that uses ab initio predictors (SANP and Augustus) to improve its predictions. Maker is capable of annotating both prokaryotes and eukaryotes. It works by aligning as much evidence as possible along the genome sequence, then reconciling all these signals to determine likely genetic structures. - -To assess the quality of the annotation, different tools will be used: -- **Fasta Statistics** is used to assess the quality of the genome. -- **BUSCO (Benchmarking Universal Single-Copy Orthologs)**: is a tool for assessing the quality of a genome assembly or genome annotation. By comparing the genomes of various more or less related species, the authors have determined sets of orthologous genes that are present in a single copy in (almost) all the species in a clade (Bacteria, Fungi, Plants, Insects, Mammals, etc.). Most of these genes are essential to the life of the organism and should be found in any newly sequenced and annotated genome of the corresponding clade. Using this data, BUSCO is able to assess the proportion of these essential genes (also known as BUSCO) found in a set of (predicted) transcripts or protein sequences. This is a good assessment of the 'completeness' of the annotation. -- **Genome Annotation Statistics**: is a program designed to analyse and provide statistics on genome annotations. This software analyses a GFF3 file. 
- -To improve gene denomination, different tools will be used: -- **gffread**: to extract the predicted protein sequences from the annotation (i.e. the annotation Maker). -- **Map annotation ids**: to automatically assign more readable names. - - -The final step is to visualise the generated annotation using a genome browser such as **JBrowse**. This browser allows navigation along the chromosomes of the genome and visualisation of the structure of each predicted gene. - -## Input dataset for Fasta Statistics -An input file: the genome sequence in fasta format. -## Output dataset forFasta Statistics -The output file is a tabular file with several statistics: -- num_seq: the number of contigs (or scaffold or chromosomes), compare it to expected chromosome numbers -- len_min, len_max, len_N50, len_mean, len_median: the distribution of contig sizes -num_bp_not_N: the number of bases that are not N, it should be as close as possible to the total number of bases (num_bp) - -## Input dataset for BUSCO on the genome -An input file: the genome sequence in fasta format. 
-## Output dataset for BUSCO on the genome -Three outputs are generated: -- A short summary: summarizes the results of BUSCO (see below) -- A full table: lists all the BUSCOs that were searched for, with the corresponding status -- A table of missing BUSCOs: this is the list of all genes that were not found in the genome - -## Input dataset for Maker -Four inputs are required: -- Genome sequence in fasta format -- Protein sequences aligned with the genome in fasta format -- Ab-initio gene prediction -- EST evidences -## Output dataset for Maker -Three outputs are generated: -- The final annotation: the final consensus gene models produced by Maker -- The evidences: the alignments of all the data Maker used to construct the final annotation (ESTs and proteins that we used) -- A GFF3 file containing both the final annotation and the evidences - -## Input dataset for Genome annotation statistics -Two input files are required: -- The final annotation in gff format (i.e. annotation of the genome using Maker) -- The reference genome sequence in fasta format -## Output dataset for Genome annotation statistics -Two output files are generated: -- a file containing graphs in pdf format -- a summary in txt format - -## Input dataset for GFFread -In this workflow, GFFRead requires two inputs: -- an annotation file in GFF3 format (i.e. annotation of the genome using Maker) -- the genome sequence in fasta format -## Output dataset for GFFread -In this workflow, a unique output will be generated. This file, in fasta format, contains the protein sequences predicted from the annotation. 
- -## Input dataset for BUSCO on the proteome -An input file: the file in fasta format with the exon sequences from the GFFRead tool -## Output dataset for BUSCO on the proteome -Three outputs are generated: -- A short summary: summarizes the results of BUSCO (see below) -- A full table: lists all the BUSCOs that were searched for, with the corresponding status -- A table of missing BUSCOs: this is the list of all genes that were not found in the genome - -## Input dataset for Map annotation ids -A file is required: the final annotation in gff format (i.e. annotation of the genome using Maker) -## Output dataset for Map annotation ids -Two output files are generated: -- a GFF file -- a tabular file - -## Input dataset for JBrowse -JBrowse requires two inputs: -- the genome sequence in fasta format -- the annotation file in gff3 format, generated by Map annotation ids - -## Output dataset for JBrowse -An html file is generated for browsing the genome. +## Workflow Steps + +- Annotation with Maker: Maker uses the genome sequence, protein evidence, ab-initio predictions, and ESTs to produce the annotation. +- Quality Evaluation: + - Run Fasta Statistics to assess genome assembly quality. + - Use BUSCO to evaluate annotation completeness. +- Annotation Statistics: Analyze the annotation using Genome Annotation Statistics, producing graphical and textual summaries. +- Sequence Extraction: Extract predicted protein sequences using GFFRead for downstream analysis. +- Improve Gene Names: Standardize gene names using Map annotation ids for better readability. +- Visualization: Load the genome sequence and annotation into JBrowse for interactive browsing. + +## Input data +The following input files are required for the workflow: +- Genome sequence (FASTA format): The genome to be annotated. Used by Maker, Fasta Statistics, and BUSCO. +- Protein sequences (FASTA format): Evidence to assist annotation in Maker. 
+- EST evidences (FASTA format): Alignments used as evidence by Maker. +- Ab-initio gene predictions: Supplementary data for Maker to refine annotations. + + +## Output Data +The workflow generates the following outputs: +- Annotation file (GFF3): Contains the final consensus gene models produced by Maker. +- Genome statistics: A tabular file summarizing contig sizes and base content, produced by Fasta Statistics. +- BUSCO results: Assess the completeness of the annotation and include: + - A summary of results. + - A table of all searched BUSCO genes with their status. + - A table of missing BUSCO genes. +- Annotation statistics: Summary and graphical analyses of the annotation, produced by Genome Annotation Statistics. +- Protein sequences (FASTA): Predicted from the annotation using GFFRead. +- Renamed GFF annotation file: Contains standardized gene names, produced by Map annotation ids. +- Genome browser visualization (HTML): An interactive genome view produced by JBrowse. \ No newline at end of file From 364eb9ffde4173b3bae16443f3b368cb56f134a5 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Tue, 17 Dec 2024 17:54:51 +0100 Subject: [PATCH 3/7] error typo --- .../Genome_annotation_with_maker_short_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index 5de2382a6e..21aa1e9d51 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -31,7 +31,7 @@ busco sum genome: asserts: - has_n_lines: - n: 2O + n: 20 - has_text: text: "BUSCO version is: 5.7.1" text: "The lineage dataset is: fungi_odb10" From 89c003582f8b46238ed770a0d7ff67cbf9558314 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Wed, 18 Dec 2024 10:31:38 +0100 
Subject: [PATCH 4/7] correct error --- .../Genome_annotation_with_maker_short_tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index 21aa1e9d51..235354e9b5 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -56,6 +56,7 @@ asserts: - has_n_lines: n: 6253 + delta: 10 - has_text: text: "##gff-version 3" text: "ID=NC_003421.2;Name=NC_003421.2" @@ -63,6 +64,7 @@ asserts: - has_n_lines: n: 74613 + delta: 10 - has_text: text: "##gff-version 3" text: " ID=NC_003421.2;Name=NC_003421.2" @@ -70,7 +72,7 @@ asserts: - has_text: text: "##gff-version 3" - text: "ID=maker-NC_003421.2-augustus-gene-0.47;Name=maker-NC_003421.2-augustus-gene-0.47" + text: "NC_003421.2" summary genome: asserts: @@ -85,6 +87,7 @@ asserts: - has_n_lines: n: 6253 + delta: 10 - has_text: text: "##gff-version 3" text: "ID=TEST_000012-RA:exon:0;Parent=TEST_000012-RA" @@ -120,7 +123,7 @@ asserts: - has_text: text: "gff-version 3" - text: "snap-NC_003421.2-processed-gene-12.60-mRNA-1" + text: "augustus-NC_003421.2-processed-gene-12.45-mRNA-1" From 38d88ae51d48f0bf4e456f2d6ade0471dba83062 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Wed, 18 Dec 2024 11:14:43 +0100 Subject: [PATCH 5/7] correct error --- .../Genome_annotation_with_maker_short_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index 235354e9b5..93524aae84 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ 
b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -59,7 +59,7 @@ delta: 10 - has_text: text: "##gff-version 3" - text: "ID=NC_003421.2;Name=NC_003421.2" + text: "NC_003421.2" maker evidences: asserts: - has_n_lines: @@ -67,7 +67,7 @@ delta: 10 - has_text: text: "##gff-version 3" - text: " ID=NC_003421.2;Name=NC_003421.2" + text: " NC_003421.2" maker full: asserts: - has_text: @@ -123,7 +123,7 @@ asserts: - has_text: text: "gff-version 3" - text: "augustus-NC_003421.2-processed-gene-12.45-mRNA-1" + text: "snap-NC_003421.2-processed-gene-12.94-mRNA-1" From e617648062b02de3da95212574baaacc1d45cd25 Mon Sep 17 00:00:00 2001 From: rlibouban Date: Wed, 18 Dec 2024 12:03:11 +0100 Subject: [PATCH 6/7] correct error --- .../Genome_annotation_with_maker_short_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index 93524aae84..97df6a8043 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -67,7 +67,7 @@ delta: 10 - has_text: text: "##gff-version 3" - text: " NC_003421.2" + text: "NC_003421.2" maker full: asserts: - has_text: From bc44bb99064f09259dabc0350800e6aab7850256 Mon Sep 17 00:00:00 2001 From: Rlibouban <127295521+rlibouba@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:53:28 +0100 Subject: [PATCH 7/7] Update Genome_annotation_with_maker_short_tests.yml --- .../Genome_annotation_with_maker_short_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml index 
97df6a8043..180538ccaa 100644 --- a/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml +++ b/workflows/genome_annotation/annotation_maker/Genome_annotation_with_maker_short_tests.yml @@ -123,7 +123,7 @@ asserts: - has_text: text: "gff-version 3" - text: "snap-NC_003421.2-processed-gene-12.94-mRNA-1" + text: "MetaEuk"