-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #430 from Proteobench/MSAID-compatibility
MSAID output compatibility
- Loading branch information
Showing
34 changed files
with
9,425 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# MSAID output preparation" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
    "This Jupyter Notebook adds protein information from proteingroups.tsv to precursors.tsv, preparing the file for ProteoBench upload." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Change paths to your local paths\n", | ||
"input_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\"\n", | ||
"output_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Change the path to your proteingroups.tsv and precursors.tsv files\n", | ||
"protein_file = pd.read_csv(input_path + 'proteingroups.tsv', sep='\\t')\n", | ||
"precursor_file = pd.read_csv(input_path + 'precursors.tsv', sep='\\t')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Map the proteins to the precursors using the \"PROTEIN_IDS\" column in the precursor file\n", | ||
"def add_fasta_headers(prec_df, protein_df):\n", | ||
" # Create a dictionary from the second DataFrame for fast look-up\n", | ||
" protein_to_header = dict(zip(protein_df['PROTEIN_IDS'], protein_df['FASTA_HEADERS']))\n", | ||
"\n", | ||
" # Function to find and join headers for each PROTEIN_IDS entry\n", | ||
" def get_fasta_headers(protein_ids):\n", | ||
" ids = protein_ids.split(';') # Split the IDs by the separator\n", | ||
" headers = [protein_to_header.get(protein_id.strip(), '') for protein_id in ids]\n", | ||
" headers = [header for header in headers if header] # Remove empty headers\n", | ||
" return '; '.join(headers) if headers else None\n", | ||
"\n", | ||
" # Apply the function to the PROTEIN_IDS column and create a new FASTA_HEADERS column\n", | ||
" prec_df['FASTA_HEADERS'] = prec_df['PROTEIN_IDS'].apply(get_fasta_headers)\n", | ||
"\n", | ||
" return prec_df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"prec_df_with_headers = add_fasta_headers(precursor_file, protein_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Change the path to the output file\n", | ||
"prec_df_with_headers.to_csv(output_path + 'precursors_with_headers.tsv', sep='\\t', index=False)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "proteobench", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import pandas as pd | ||
import pathlib | ||
|
||
from proteobench.io.params import ProteoBenchParameters | ||
|
||
|
||
def extract_params(fname: str) -> ProteoBenchParameters: | ||
"""Parse MSAID params file and extract relevant parameters.""" | ||
# Some default and flag settings | ||
parameters = { | ||
"software_name": "MSAID", | ||
"search_engine": "Chimerys", | ||
"search_engine_version": "4.1.1", | ||
"quantification_method": "MS2 Area", | ||
"ident_fdr_psm": "0.01", | ||
"ident_fdr_peptide": "0.01", | ||
"ident_fdr_protein": "0.01", | ||
"enable_match_between_runs": False, | ||
} | ||
|
||
# Read the params file | ||
file = pd.read_csv(fname) | ||
# Convert the file to a dictionary | ||
params_dict = dict(file.itertuples(False, None)) | ||
|
||
parameters["search_engine"] = params_dict["Algorithm"].split(" ")[0] | ||
parameters["search_engine_version"] = params_dict["Algorithm"].split(" ")[1] | ||
parameters["fragment_mass_tolerance"] = params_dict["Fragment Mass Tolerance"] | ||
parameters["enzyme"] = params_dict["Enzyme"] | ||
parameters["allowed_miscleavages"] = params_dict["Max. Missed Cleavage Sites"] | ||
parameters["min_peptide_length"] = params_dict["Min. Peptide Length"] | ||
parameters["max_peptide_length"] = params_dict["Max. Peptide Length"] | ||
parameters["fixed_mods"] = params_dict["Static Modifications"] | ||
parameters["variable_mods"] = params_dict["Variable Modifications"] | ||
parameters["max_mods"] = params_dict["Maximum Number of Modifications"] | ||
parameters["min_precursor_charge"] = params_dict["Min. Peptide Charge"] | ||
parameters["max_precursor_charge"] = params_dict["Max. Peptide Charge"] | ||
parameters["quantification_method"] = params_dict["Quantification Type"] | ||
if "Quan in all file" in parameters["quantification_method"]: | ||
parameters["enable_match_between_runs"] = True | ||
else: | ||
parameters["enable_match_between_runs"] = False | ||
|
||
return ProteoBenchParameters(**parameters) | ||
|
||
|
||
if __name__ == "__main__":
    # Smoke-run the parser on the checked-in example params file and write
    # the extracted parameters next to it as a TSV.
    example_files = (
        "../../../test/params/MSAID_default_params.csv",
    )
    for raw_path in example_files:
        params_file = pathlib.Path(raw_path)
        extracted = extract_params(params_file)
        pd.Series(extracted.__dict__).to_csv(params_file.with_suffix(".tsv"), sep="\t")
39 changes: 39 additions & 0 deletions
39
proteobench/io/parsing/io_parse_settings/parse_settings_msaid.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
[mapper] | ||
"MODIFIED_SEQUENCE" = "Sequence" | ||
"RAW_FILE_NAME" = "Raw file" | ||
"PRECURSOR_CHARGE" = "Charge" | ||
"QUANTIFICATION" = "Intensity" | ||
"FASTA_HEADERS" = "Proteins" | ||
|
||
[condition_mapper] | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "A" | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "A" | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "A" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "B" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "B" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "B" | ||
|
||
[run_mapper] | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "Condition_A_Sample_Alpha_01" | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "Condition_A_Sample_Alpha_02" | ||
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "Condition_A_Sample_Alpha_03" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "Condition_B_Sample_Alpha_01" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "Condition_B_Sample_Alpha_02" | ||
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "Condition_B_Sample_Alpha_03" | ||
|
||
[species_mapper] | ||
"_YEAST" = "YEAST" | ||
"_ECOLI" = "ECOLI" | ||
"_HUMAN" = "HUMAN" | ||
|
||
[modifications_parser] | ||
"parse_column" = "Sequence" | ||
"before_aa" = false | ||
"isalpha" = true | ||
"isupper" = true | ||
"pattern" = "\\[(.*?)\\]" | ||
"modification_dict" = {"[unimod:4]" = "Carbamidomethyl", "[unimod:1]" = "Acetyl", "[unimod:35]" = "Oxidation"} | ||
|
||
[general] | ||
"contaminant_flag" = "Cont_" | ||
"decoy_flag" = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.