diff --git a/workflows/proteomics/clinicalmp/clinicalmp-database-generation/.dockstore.yml b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/.dockstore.yml new file mode 100644 index 000000000..59310b082 --- /dev/null +++ b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/.dockstore.yml @@ -0,0 +1,11 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /iwc-clinicalmp-database-generation.ga + testParameterFiles: + - /iwc-clinicalmp-database-generation-tests.yml + authors: + - name: Subina Mehta + orcid: 0000-0001-9818-0537 diff --git a/workflows/proteomics/clinicalmp/clinicalmp-database-generation/CHANGELOG.md b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/CHANGELOG.md new file mode 100644 index 000000000..8ec28ce1e --- /dev/null +++ b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/CHANGELOG.md @@ -0,0 +1,4 @@ +# Changelog + +## [0.1] 2024-11-18 +First release. diff --git a/workflows/proteomics/clinicalmp/clinicalmp-database-generation/README.md b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/README.md new file mode 100644 index 000000000..470a9d89f --- /dev/null +++ b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/README.md @@ -0,0 +1,29 @@ +# Clinical Metaproteomics 1: Database Generation +Metaproteomics involves the large-scale identification and analysis of all proteins expressed by microbiota. However, analyzing clinical samples using metaproteomics is complicated by the presence of abundant human (host) proteins, which can obscure the detection of less abundant microbial proteins. + +To overcome this challenge, we developed a metaproteomics workflow using tandem mass spectrometry (MS/MS) and bioinformatics tools on the Galaxy platform. This workflow enables the characterization of metaproteomes in clinical samples. + +The first step in this workflow is the Database Generation process. The Galaxy-P team has created a workflow that compiles a large database by downloading protein sequences of known disease-causing microorganisms. From this extensive database, a compact, relevant database is then created using the Metanovo tool. +A GTN has been developed for this workflow. [https://training.galaxyproject.org/training-material/topics/proteomics/tutorials/clinical-mp-1-database-generation/tutorial.html](https://training.galaxyproject.org/training-material/topics/proteomics/tutorials/clinical-mp-1-database-generation/tutorial.html) + +## Inputs dataset + +### Search Databases (FASTA) from [Zenodo](https://zenodo.org/records/14181725) +- `HUMAN SwissProt Protein_Database.fasta` +- `Species UniProt Protein Database FASTA.fasta` +- `Contaminants (cRAP) Protein Database.fasta` + +### MSMS files download from [Zenodo](https://zenodo.org/records/14181725) +- `PTRC_Skubitz_Plex2_F10_9Aug19_Rage_Rep-19-06-08.mgf` +- `PTRC_Skubitz_Plex2_F11_9Aug19_Rage_Rep-19-06-08.mgf` +- `PTRC_Skubitz_Plex2_F13_9Aug19_Rage_Rep-19-06-08.mgf` +- `PTRC_Skubitz_Plex2_F15_9Aug19_Rage_Rep-19-06-08.mgf` + +## Input Values +For Metanovo +- Peptide Length +- Variable modifications +- Labeled element + +## Processing +- Merge all the resultant FASTA files diff --git a/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation-tests.yml b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation-tests.yml new file mode 100644 index 000000000..937b61191 --- /dev/null +++ b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation-tests.yml @@ -0,0 +1,47 @@ +- doc: Test outline for iwc-clinicalmp-database-generation + job: + Human SwissProt Protein Database: + class: File + location: https://zenodo.org/records/14181725/files/HUMAN-SwissProt-Protein-Database.fasta?download=1 + filetype: fasta + Species UniProt Protein Database: + class: File + location: https://zenodo.org/records/14181725/files/Species_UniProt_FASTA.fasta?download=1 + filetype: fasta + Contaminants cRAP Protein Database: + class: File + location: https://zenodo.org/records/14181725/files/Contaminants(cRAP)-Protein-Database.fasta?download=1 + filetype: fasta + Tandem Mass Spectrometry (MS/MS) datasets: + class: Collection + collection_type: list + elements: + - class: File + identifier: PTRC_Skubitz_Plex2_F15_9Aug19_Rage_Rep-19-06-08.mgf + location: https://zenodo.org/records/14181725/files/PTRC_Skubitz_Plex2_F15_9Aug19_Rage_Rep-19-06-08.mgf?download=1 + - class: File + identifier: PTRC_Skubitz_Plex2_F13_9Aug19_Rage_Rep-19-06-08.mgf + location: https://zenodo.org/records/14181725/files/PTRC_Skubitz_Plex2_F13_9Aug19_Rage_Rep-19-06-08.mgf?download=1 + - class: File + identifier: PTRC_Skubitz_Plex2_F11_9Aug19_Rage_Rep-19-06-08.mgf + location: https://zenodo.org/records/14181725/files/PTRC_Skubitz_Plex2_F11_9Aug19_Rage_Rep-19-06-08.mgf?download=1 + - class: File + identifier: PTRC_Skubitz_Plex2_F10_9Aug19_Rage_Rep-19-06-08.mgf + location: https://zenodo.org/records/14181725/files/PTRC_Skubitz_Plex2_F10_9Aug19_Rage_Rep-19-06-08.mgf?download=1 + outputs: + Human UniProt Microbial Proteins cRAP for MetaNovo: + asserts: + - that: has_text + text: ">sp|" + Metanovo Compact database: + asserts: + - that: has_text + text: ">sp|" + Metanovo Compact CSV database: + asserts: + - that: has_text + text: "index" + Human UniProt Microbial Proteins from MetaNovo cRAP: + asserts: + - that: has_text + text: ">sp|" diff --git a/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation.ga b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation.ga new file mode 100644 index 000000000..038e046ef --- /dev/null +++ b/workflows/proteomics/clinicalmp/clinicalmp-database-generation/iwc-clinicalmp-database-generation.ga @@ -0,0 +1,334 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "The workflow begins with the Database Generation process. The Galaxy-P team has developed a workflow that collects protein sequences from known disease-causing microorganisms to build a comprehensive database. This extensive database is then refined into a smaller, more relevant dataset using the Metanovo tool.", + "comments": [], + "creator": [ + { + "class": "Organization", + "identifier": "0000-0001-9818-0537", + "name": "Subina Mehta" + } + ], + "format-version": "0.1", + "license": "CC-BY-4.0", + "name": "Generate a Clinical Metaproteomics Database", + "release": "0.1", + "report": { + "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" + }, + "steps": { + "0": { + "annotation": "Human SwissProt Protein Database", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "Human SwissProt Protein Database", + "name": "Human SwissProt Protein Database" + } + ], + "label": "Human SwissProt Protein Database", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0, + "top": 207.8446261703461 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "bb037df4-ef56-42ad-8b3e-93425bc7bdfa", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Tandem Mass Spectrometry (MS/MS) datasets", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Tandem Mass Spectrometry (MS/MS) datasets", + "name": "Tandem Mass Spectrometry (MS/MS) datasets" + } + ], + "label": "Tandem Mass Spectrometry (MS/MS) datasets", + "name": "Input dataset collection", + "outputs": [], + "position": { + "left": 260.96875, + "top": 0 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null, \"collection_type\": \"list\"}", + "tool_version": null, + "type": "data_collection_input", + "uuid": "d6bb6e3e-ebf6-4c65-bcf7-ea9527cf8653", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "Species UniProt Protein Database", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Species UniProt Protein Database", + "name": "Species UniProt Protein Database" + } + ], + "label": "Species UniProt Protein Database", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 2.4375, + "top": 373.22882080078125 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "b4b90dcd-9228-4aad-b51d-2f08b2d4cc0d", + "when": null, + "workflow_outputs": [] + }, + "3": { + "annotation": "Contaminants cRAP Protein Database", + "content_id": null, + "errors": null, + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "Contaminants cRAP Protein Database", + "name": "Contaminants cRAP Protein Database" + } + ], + "label": "Contaminants cRAP Protein Database", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 14.40625, + "top": 471.45147705078125 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "ca7de659-6e60-4bd5-aa4c-a88f2860e8c9", + "when": null, + "workflow_outputs": [] + }, + "4": { + "annotation": "Merging FASTA files", + "content_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/fasta_merge_files_and_filter_unique_sequences/fasta_merge_files_and_filter_unique_sequences/1.2.0", + "errors": null, + "id": 4, + "input_connections": { + "batchmode|input_fastas_0|input_fasta": { + "id": 0, + "output_name": "output" + }, + "batchmode|input_fastas_1|input_fasta": { + "id": 2, + "output_name": "output" + }, + "batchmode|input_fastas_2|input_fasta": { + "id": 3, + "output_name": "output" + } + }, + "inputs": [], + "label": "Human UniProt Microbial Proteins cRAP for MetaNovo", + "name": "FASTA Merge Files and Filter Unique Sequences", + "outputs": [ + { + "name": "output", + "type": "fasta" + } + ], + "position": { + "left": 369.32421875, + "top": 137.68359736327898 + }, + "post_job_actions": { + "RenameDatasetActionoutput": { + "action_arguments": { + "newname": "Human UniProt Microbial Proteins cRAP for MetaNovo" + }, + "action_type": "RenameDatasetAction", + "output_name": "output" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/fasta_merge_files_and_filter_unique_sequences/fasta_merge_files_and_filter_unique_sequences/1.2.0", + "tool_shed_repository": { + "changeset_revision": "f546e7278f04", + "name": "fasta_merge_files_and_filter_unique_sequences", + "owner": "galaxyp", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"accession_parser\": \"^>([^ ]+).*$\", \"batchmode\": {\"processmode\": \"individual\", \"__current_case__\": 0, \"input_fastas\": [{\"__index__\": 0, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}, {\"__index__\": 1, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}, {\"__index__\": 2, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}]}, \"uniqueness_criterion\": \"sequence\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.2.0", + "type": "tool", + "uuid": "4c1b21ac-9ceb-49be-9c48-771a3fda8db4", + "when": null, + "workflow_outputs": [ + { + "label": "Human UniProt Microbial Proteins cRAP for MetaNovo", + "output_name": "output", + "uuid": "fbc778a5-f931-4b0c-b641-7782abfb4351" + } + ] + }, + "5": { + "annotation": "Generating Customized Database", + "content_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/metanovo/metanovo/1.9.4+galaxy4", + "errors": null, + "id": 5, + "input_connections": { + "input_fasta": { + "id": 4, + "output_name": "output" + }, + "input_type|input_mgf_collection": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MetaNovo", + "name": "input_type" + } + ], + "label": "Metanovo", + "name": "MetaNovo", + "outputs": [ + { + "name": "output_fasta", + "type": "fasta" + }, + { + "name": "output_csv", + "type": "csv" + } + ], + "position": { + "left": 651.57421875, + "top": 90.94140986327898 + }, + "post_job_actions": { + "RenameDatasetActionoutput_csv": { + "action_arguments": { + "newname": "Metanovo Compact CSV database" + }, + "action_type": "RenameDatasetAction", + "output_name": "output_csv" + }, + "RenameDatasetActionoutput_fasta": { + "action_arguments": { + "newname": "Metanovo Compact database" + }, + "action_type": "RenameDatasetAction", + "output_name": "output_fasta" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/metanovo/metanovo/1.9.4+galaxy4", + "tool_shed_repository": { + "changeset_revision": "d6dcd3173bdf", + "name": "metanovo", + "owner": "galaxyp", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"directag\": {\"directag_tic_cutoff\": \"85\", \"directag_max_peak_count\": \"400\", \"directag_intensity_classes\": \"3\", \"directag_adjust_precursor\": false, \"directag_min_adjustment\": \"-2.5\", \"directag_max_adjustment\": \"2.5\", \"directag_adjustment_step\": \"0.1\", \"directag_charge_states\": \"3\", \"directag_ms_charge_state\": false, \"directag_duplicate_spectra\": true, \"directag_deisotoping\": \"0\", \"directag_isotope_tolerance\": \"0.25\", \"directag_complement_tolerance\": \"0.5\", \"directag_tag_length\": \"4\", \"directag_max_var_mods\": \"2\", \"directag_max_tag_count\": \"5\", \"directag_intensity_weight\": \"1.0\", \"directag_fidelity_weight\": \"1.0\", \"directag_complement_weight\": \"1.0\"}, \"fraction_analysis\": {\"protein_fraction_mw_confidence\": \"95.0\"}, \"gene_annotation\": {\"useGeneMapping\": true, \"updateGeneMapping\": true}, \"import_filters\": {\"import_peptide_length_min\": \"8\", \"import_peptide_length_max\": \"50\", \"import_precursor_mz_ppm\": \"0\", \"exclude_unknown_ptms\": true}, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}, \"input_type\": {\"type\": \"collection\", \"__current_case__\": 1, \"input_mgf_collection\": {\"__class__\": \"ConnectedValue\"}}, \"metanovo_parameters\": {\"mn_specificity\": \"specific\", \"mn_enzymes\": \"Trypsin, no P rule\", \"mn_max_missed_cleavages\": \"2\"}, \"processing_control\": {\"CHUNKSIZE\": \"100000\"}, \"protein_inference\": {\"simplify_groups\": true, \"simplify_score\": true, \"simplify_enzymaticity\": true, \"simplify_evidence\": true, \"simplify_uncharacterized\": true}, \"ptm_localization\": {\"ptm_score\": \"1\", \"score_neutral_losses\": false, \"ptm_sequence_matching_type\": \"1\", \"ptm_alignment\": true}, \"sequence_matching\": {\"sequence_index_type\": \"0\", \"sequence_matching_type\": \"2\", \"sequence_matching_x\": \"0.25\"}, \"spectrum_annotation\": {\"annotation_level\": \"0.75\", \"annotation_high_resolution\": true}, \"spectrum_matching_parameters\": {\"prec_tol\": \"10.0\", \"prec_ppm\": \"1\", \"frag_tol\": \"0.01\", \"frag_ppm\": \"0\", \"digestion\": \"0\", \"enzyme\": [\"Trypsin (no P rule)\"], \"specificity\": \"0\", \"mc\": \"2\", \"fixed_mods\": [\"Carbamidomethylation of C\", \"TMT 10-plex of K\", \"TMT 10-plex of peptide N-term\"], \"variable_mods\": [\"Oxidation of M\"], \"min_charge\": \"2\", \"max_charge\": \"5\", \"fi\": \"b\", \"ri\": \"y\", \"min_isotope\": \"0\", \"max_isotope\": \"1\"}, \"validation_levels\": {\"psm_fdr\": \"1\", \"peptide_fdr\": \"1\", \"protein_fdr\": \"1\", \"group_psms\": true, \"group_peptides\": true, \"merge_subgroups\": true}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.9.4+galaxy4", + "type": "tool", + "uuid": "07d51088-eb68-4ffd-b505-7228a21de93c", + "when": null, + "workflow_outputs": [ + { + "label": "Metanovo Compact database", + "output_name": "output_fasta", + "uuid": "02940034-f0fe-4151-acd0-c6982e77b717" + }, + { + "label": "Metanovo Compact CSV database", + "output_name": "output_csv", + "uuid": "d7208cb6-a907-4a8b-b8b7-cf44c103d14a" + } + ] + }, + "6": { + "annotation": "Merge all FASTA", + "content_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/fasta_merge_files_and_filter_unique_sequences/fasta_merge_files_and_filter_unique_sequences/1.2.0", + "errors": null, + "id": 6, + "input_connections": { + "batchmode|input_fastas_0|input_fasta": { + "id": 0, + "output_name": "output" + }, + "batchmode|input_fastas_1|input_fasta": { + "id": 5, + "output_name": "output_fasta" + }, + "batchmode|input_fastas_2|input_fasta": { + "id": 3, + "output_name": "output" + } + }, + "inputs": [], + "label": "Merge all FASTA", + "name": "FASTA Merge Files and Filter Unique Sequences", + "outputs": [ + { + "name": "output", + "type": "fasta" + } + ], + "position": { + "left": 924.9069441945012, + "top": 322.62533209947645 + }, + "post_job_actions": { + "RenameDatasetActionoutput": { + "action_arguments": { + "newname": "Human UniProt Microbial Proteins from MetaNovo cRAP" + }, + "action_type": "RenameDatasetAction", + "output_name": "output" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/galaxyp/fasta_merge_files_and_filter_unique_sequences/fasta_merge_files_and_filter_unique_sequences/1.2.0", + "tool_shed_repository": { + "changeset_revision": "f546e7278f04", + "name": "fasta_merge_files_and_filter_unique_sequences", + "owner": "galaxyp", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"fasta\", \"accession_parser\": \"^>([^ ]+).*$\", \"batchmode\": {\"processmode\": \"individual\", \"__current_case__\": 0, \"input_fastas\": [{\"__index__\": 0, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}, {\"__index__\": 1, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}, {\"__index__\": 2, \"input_fasta\": {\"__class__\": \"ConnectedValue\"}}]}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"uniqueness_criterion\": \"sequence\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.2.0", + "type": "tool", + "uuid": "3880c712-4f3c-4edc-bda0-ee348ab1a2c4", + "when": null, + "workflow_outputs": [ + { + "label": "Human UniProt Microbial Proteins from MetaNovo cRAP", + "output_name": "output", + "uuid": "e3f490ef-1307-4892-bab1-6085cf8194ae" + } + ] + } + }, + "tags": [ + "name:clinicalMP" + ], + "uuid": "758f86d1-b81e-438d-9666-bc7dc31b4627", + "version": 10 +}