Skip to content

Commit

Permalink
Merge pull request #430 from Proteobench/MSAID-compatibility
Browse files Browse the repository at this point in the history
MSAID output compatibility
  • Loading branch information
RobbinBouwmeester authored Nov 7, 2024
2 parents d89b24e + 20bff9d commit 213ac84
Show file tree
Hide file tree
Showing 34 changed files with 9,425 additions and 25 deletions.
113 changes: 113 additions & 0 deletions jupyter_notebooks/MSAID_input_conversion.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MSAID output preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This Jupyter Notebook serves to add protein information from proteingroups.tsv to precursors.tsv, for Proteobench upload"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Change paths to your local paths\n",
"input_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\"\n",
"output_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Change the path to your proteingroups.tsv and precursors.tsv files\n",
"protein_file = pd.read_csv(input_path + 'proteingroups.tsv', sep='\\t')\n",
"precursor_file = pd.read_csv(input_path + 'precursors.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Map the proteins to the precursors using the \"PROTEIN_IDS\" column in the precursor file\n",
"def add_fasta_headers(prec_df, protein_df):\n",
" # Create a dictionary from the second DataFrame for fast look-up\n",
" protein_to_header = dict(zip(protein_df['PROTEIN_IDS'], protein_df['FASTA_HEADERS']))\n",
"\n",
" # Function to find and join headers for each PROTEIN_IDS entry\n",
" def get_fasta_headers(protein_ids):\n",
" ids = protein_ids.split(';') # Split the IDs by the separator\n",
" headers = [protein_to_header.get(protein_id.strip(), '') for protein_id in ids]\n",
" headers = [header for header in headers if header] # Remove empty headers\n",
" return '; '.join(headers) if headers else None\n",
"\n",
" # Apply the function to the PROTEIN_IDS column and create a new FASTA_HEADERS column\n",
" prec_df['FASTA_HEADERS'] = prec_df['PROTEIN_IDS'].apply(get_fasta_headers)\n",
"\n",
" return prec_df"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"prec_df_with_headers = add_fasta_headers(precursor_file, protein_file)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Change the path to the output file\n",
"prec_df_with_headers.to_csv(output_path + 'precursors_with_headers.tsv', sep='\\t', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "proteobench",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions proteobench/datapoint/quant_datapoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class Datapoint:
intermediate_hash: str = ""
results: dict = None
median_abs_epsilon: int = 0
# TODO: nr_prec doesn't get updated in the final df; it is always 0
nr_prec: int = 0
comments: str = ""
proteobench_version: str = ""
Expand Down
2 changes: 1 addition & 1 deletion proteobench/io/params/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class ProteoBenchParameters:
min_precursor_charge: Optional[int] = None # precursor_charge
max_precursor_charge: Optional[int] = None
scan_window: Optional[int] = None # DIA-specific
quantification_method_DIANN: Optional[str] = None # DIANN-specific
quantification_method: Optional[str] = None
second_pass: Optional[bool] = None # DIANN specific
protein_inference: Optional[str] = None
predictors_library: Optional[dict] = None
1 change: 0 additions & 1 deletion proteobench/io/params/alphadia.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ def extract_params(fname: str) -> ProteoBenchParameters:
"variable_mods": parsed_settings["library_prediction"]["variable_modifications"].strip(),
"max_mods": int(parsed_settings["library_prediction"]["max_var_mod_num"]),
"scan_window": int(parsed_settings["selection_config"]["max_size_rt"]),
"quantification_method_DIANN": None,
"second_pass": None,
"protein_inference": parsed_settings["fdr"]["inference_strategy"].strip(),
"predictors_library": "Built-in",
Expand Down
4 changes: 2 additions & 2 deletions proteobench/io/params/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:
"software_name": "DIA-NN",
"search_engine": "DIA-NN",
"enable_match_between_runs": False,
"quantification_method_DIANN": "QuantUMS high-precision",
"quantification_method": "QuantUMS high-precision",
"protein_inference": "Heuristic protein inference",
}

Expand All @@ -274,7 +274,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:
cmdline_dict = parse_cmdline_string(cmdline_string)

parameters["second_pass"] = "double-search" in cmdline_dict.keys() or "double-pass" in cmdline_dict.keys()
parameters["quantification_method_DIANN"] = parse_quantification_strategy(cmdline_dict)
parameters["quantification_method"] = parse_quantification_strategy(cmdline_dict)
parameters["protein_inference"] = parse_protein_inference_method(cmdline_dict)
parameters["predictors_library"] = parse_predictors_library(cmdline_dict)

Expand Down
2 changes: 1 addition & 1 deletion proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
params.enable_match_between_runs = True
else:
params.enable_match_between_runs = False
params.quantification_method_DIANN = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]
params.quantification_method = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]
if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
params.protein_inference = "ProteinProphet: {}".format(fragpipe_params.loc["protein-prophet.cmd-opts"])

Expand Down
55 changes: 55 additions & 0 deletions proteobench/io/params/msaid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pandas as pd
import pathlib

from proteobench.io.params import ProteoBenchParameters


def extract_params(fname: str) -> ProteoBenchParameters:
    """Parse an MSAID params file and extract the relevant parameters.

    The file is read with ``pandas.read_csv`` using its default settings, so
    the first line is consumed as the header row. Every remaining row is
    turned into one ``name -> value`` entry, which requires the rows to have
    exactly two columns.

    Parameters
    ----------
    fname : str
        Path to the MSAID params CSV file (anything ``pandas.read_csv``
        accepts).

    Returns
    -------
    ProteoBenchParameters
        The values read from the file, layered on top of a few MSAID defaults.

    Raises
    ------
    KeyError
        If one of the expected parameter names is missing from the file.
    ValueError
        If the rows of the file do not have exactly two columns.
    """
    # Defaults and flags; most of them are overwritten below by file values.
    parameters = {
        "software_name": "MSAID",
        "search_engine": "Chimerys",
        "search_engine_version": "4.1.1",
        "quantification_method": "MS2 Area",
        "ident_fdr_psm": "0.01",
        "ident_fdr_peptide": "0.01",
        "ident_fdr_protein": "0.01",
        "enable_match_between_runs": False,
    }

    # Read the params file and convert its (name, value) rows to a dictionary.
    settings = pd.read_csv(fname)
    params_dict = dict(settings.itertuples(index=False, name=None))

    # "Algorithm" is expected to look like "<engine> <version>". Keep the
    # default for whichever part is missing instead of failing with an
    # IndexError (no version token) or AttributeError (empty, non-string cell).
    algorithm = params_dict["Algorithm"]
    algorithm_parts = algorithm.split() if isinstance(algorithm, str) else []
    if algorithm_parts:
        parameters["search_engine"] = algorithm_parts[0]
    if len(algorithm_parts) > 1:
        parameters["search_engine_version"] = algorithm_parts[1]

    # ProteoBench field -> parameter name used in the MSAID params file.
    field_to_key = {
        "fragment_mass_tolerance": "Fragment Mass Tolerance",
        "enzyme": "Enzyme",
        "allowed_miscleavages": "Max. Missed Cleavage Sites",
        "min_peptide_length": "Min. Peptide Length",
        "max_peptide_length": "Max. Peptide Length",
        "fixed_mods": "Static Modifications",
        "variable_mods": "Variable Modifications",
        "max_mods": "Maximum Number of Modifications",
        "min_precursor_charge": "Min. Peptide Charge",
        "max_precursor_charge": "Max. Peptide Charge",
        "quantification_method": "Quantification Type",
    }
    for field, key in field_to_key.items():
        parameters[field] = params_dict[key]

    # Match-between-runs is derived from the quantification type text. An
    # empty cell is read by pandas as a float NaN, which must not reach `in`.
    quantification = parameters["quantification_method"]
    parameters["enable_match_between_runs"] = (
        isinstance(quantification, str) and "Quan in all file" in quantification
    )

    return ProteoBenchParameters(**parameters)


if __name__ == "__main__":
    # For each listed params file, extract the parameters and write them as a
    # tab-separated file with the same name but a ".tsv" suffix.
    example_files = ("../../../test/params/MSAID_default_params.csv",)
    for example in example_files:
        csv_path = pathlib.Path(example)
        extracted = extract_params(csv_path)
        pd.Series(vars(extracted)).to_csv(csv_path.with_suffix(".tsv"), sep="\t")
39 changes: 39 additions & 0 deletions proteobench/io/parsing/io_parse_settings/parse_settings_msaid.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Parse settings for MSAID precursor-level output.
# NOTE(review): the section descriptions below are read off the keys and
# values in this file; confirm details against the code that consumes it.

# Input column name -> column name used after parsing.
# NOTE(review): "FASTA_HEADERS" is presumably the column that the
# MSAID_input_conversion notebook adds to precursors.tsv — confirm.
[mapper]
"MODIFIED_SEQUENCE" = "Sequence"
"RAW_FILE_NAME" = "Raw file"
"PRECURSOR_CHARGE" = "Charge"
"QUANTIFICATION" = "Intensity"
"FASTA_HEADERS" = "Proteins"

# Raw file name -> condition label ("A" or "B").
[condition_mapper]
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "A"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "B"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "B"

# Raw file name -> run name (file name without the "LFQ_Orbitrap_AIF_" prefix
# and the ".raw" extension).
[run_mapper]
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "Condition_A_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "Condition_A_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "Condition_A_Sample_Alpha_03"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "Condition_B_Sample_Alpha_01"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "Condition_B_Sample_Alpha_02"
"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "Condition_B_Sample_Alpha_03"

# Species flag -> species label.
# NOTE(review): presumably the flag is searched for in the protein
# identifiers of the "Proteins" column — confirm.
[species_mapper]
"_YEAST" = "YEAST"
"_ECOLI" = "ECOLI"
"_HUMAN" = "HUMAN"

# How modifications are read from the "Sequence" column: "pattern" captures
# the text between square brackets (non-greedy), and "modification_dict"
# gives a readable name for each bracketed unimod tag.
[modifications_parser]
"parse_column" = "Sequence"
"before_aa" = false
"isalpha" = true
"isupper" = true
"pattern" = "\\[(.*?)\\]"
"modification_dict" = {"[unimod:4]" = "Carbamidomethyl", "[unimod:1]" = "Acetyl", "[unimod:35]" = "Oxidation"}

# Contaminant and decoy markers.
# NOTE(review): decoy_flag = false presumably means no decoy filtering is
# applied for this format — confirm.
[general]
"contaminant_flag" = "Cont_"
"decoy_flag" = false
9 changes: 8 additions & 1 deletion proteobench/io/parsing/parse_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,16 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
elif input_format == "AlphaDIA":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
mapper_df = pd.read_csv(mapper_path).set_index("gene_name")
mapper = mapper_df["description"].to_dict()
input_data_frame["Proteins"] = input_data_frame["genes"].map(
lambda x: ";".join([mapper[protein] if protein in mapper.keys() else protein for protein in x.split(";")])
)
input_data_frame["proforma"] = input_data_frame.apply(
lambda x: aggregate_modification_sites_column(x.sequence, x.mods, x.mod_sites),
axis=1,
)
input_data_frame["Proteins"] = input_data_frame["genes"] + "/" + input_data_frame["pg_master"]
elif input_format == "FragPipe (DIA-NN quant)":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
Expand All @@ -76,6 +81,8 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
lambda x: [mapper[protein] if protein in mapper.keys() else protein for protein in x]
)
input_data_frame["Proteins"] = input_data_frame["Proteins"].str.join(";")
elif input_format == "MSAID":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")

return input_data_frame

Expand Down
1 change: 1 addition & 0 deletions proteobench/io/parsing/parse_settings_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(self, parse_settings_dir=None, acquisition_method="dda"):
"FragPipe": os.path.join(parse_settings_dir, "parse_settings_fragpipe_DIA.toml"),
"Spectronaut": os.path.join(parse_settings_dir, "parse_settings_spectronaut.toml"),
"AlphaDIA": os.path.join(parse_settings_dir, "parse_settings_alphadia.toml"),
"MSAID": os.path.join(parse_settings_dir, "parse_settings_msaid.toml"),
"Custom": os.path.join(parse_settings_dir, "parse_settings_custom_DIA_quant_ion.toml"),
}
else:
Expand Down
2 changes: 2 additions & 0 deletions proteobench/modules/quant_base/quant_base_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from proteobench.io.params.maxquant import extract_params as extract_params_maxquant
from proteobench.io.params.proline import extract_params as extract_params_proline
from proteobench.io.params.sage import extract_params as extract_params_sage
from proteobench.io.params.msaid import extract_params as extract_params_msaid
from proteobench.io.parsing.parse_ion import load_input_file
from proteobench.io.parsing.parse_settings_ion import ParseSettingsBuilder
from proteobench.score.quant.quantscores import QuantScores
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(self, token: str = None, proteobench_repo_name: str = "", proteobot
"DIA-NN": extract_params_diann,
"AlphaDIA": extract_params_alphadia,
"FragPipe (DIA-NN quant)": extract_params_fragger,
"MSAID": extract_params_msaid,
# "Spectronaut": extract_params_spectronaut
}

Expand Down
1 change: 1 addition & 0 deletions proteobench/plotting/plot_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def plot_metric(
"Custom": "#7f7f7f",
"Spectronaut": "#bcbd22",
"FragPipe (DIA-NN quant)": "#ff7f00",
"MSAID": "#afff57",
##ffff33 /yellow so not ideal
},
mapping={"old": 10, "new": 20},
Expand Down
Loading

0 comments on commit 213ac84

Please sign in to comment.