Merge pull request #430 from Proteobench/MSAID-compatibility

MSAID output compatibility
Proteobench · Nov 7, 2024 · 213ac84 · 213ac84
2 parents d89b24e + 20bff9d
commit 213ac84
Show file tree

Hide file tree

Showing 34 changed files with 9,425 additions and 25 deletions.
diff --git a/jupyter_notebooks/MSAID_input_conversion.ipynb b/jupyter_notebooks/MSAID_input_conversion.ipynb
@@ -0,0 +1,113 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MSAID output preparation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This Jupyter notebook adds protein information (FASTA headers) from proteingroups.tsv to precursors.tsv and saves the result as precursors_with_headers.tsv for upload to ProteoBench."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change paths to your local paths\n",
+    "input_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\"\n",
+    "output_path = \"/mnt/d/Proteobench_manuscript_data/run_output/MSAID_default/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change the path to your proteingroups.tsv and precursors.tsv files\n",
+    "protein_file = pd.read_csv(input_path + 'proteingroups.tsv', sep='\\t')\n",
+    "precursor_file = pd.read_csv(input_path + 'precursors.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map the proteins to the precursors using the \"PROTEIN_IDS\" column in the precursor file\n",
+    "def add_fasta_headers(prec_df, protein_df):\n",
+    "    \"\"\"Add a FASTA_HEADERS column to prec_df (modified in place) and return it.\n",
+    "\n",
+    "    Each ';'-separated ID in prec_df['PROTEIN_IDS'] is looked up in protein_df\n",
+    "    (PROTEIN_IDS -> FASTA_HEADERS); the headers found are joined with '; '.\n",
+    "    Rows with a missing PROTEIN_IDS value or without any matching header get None.\n",
+    "    \"\"\"\n",
+    "    # Create a dictionary from the second DataFrame for fast look-up\n",
+    "    protein_to_header = dict(zip(protein_df['PROTEIN_IDS'], protein_df['FASTA_HEADERS']))\n",
+    "\n",
+    "    # Function to find and join headers for each PROTEIN_IDS entry\n",
+    "    def get_fasta_headers(protein_ids):\n",
+    "        # Missing values are read by pandas as float NaN, which has no .split()\n",
+    "        if not isinstance(protein_ids, str):\n",
+    "            return None\n",
+    "        ids = protein_ids.split(';')  # Split the IDs by the separator\n",
+    "        headers = [protein_to_header.get(protein_id.strip()) for protein_id in ids]\n",
+    "        # Keep only real, non-empty headers (drops unknown IDs and NaN headers)\n",
+    "        headers = [header for header in headers if isinstance(header, str) and header]\n",
+    "        return '; '.join(headers) if headers else None\n",
+    "\n",
+    "    # Apply the function to the PROTEIN_IDS column and create a new FASTA_HEADERS column\n",
+    "    prec_df['FASTA_HEADERS'] = prec_df['PROTEIN_IDS'].apply(get_fasta_headers)\n",
+    "\n",
+    "    return prec_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prec_df_with_headers = add_fasta_headers(precursor_file, protein_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change the path to the output file\n",
+    "prec_df_with_headers.to_csv(output_path + 'precursors_with_headers.tsv', sep='\\t', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "proteobench",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/proteobench/datapoint/quant_datapoint.py b/proteobench/datapoint/quant_datapoint.py
@@ -53,6 +53,7 @@ class Datapoint:
     intermediate_hash: str = ""
     results: dict = None
     median_abs_epsilon: int = 0
+    # TODO: nr_prec doesn't get updated in the final df; it is always 0
     nr_prec: int = 0
     comments: str = ""
     proteobench_version: str = ""

diff --git a/proteobench/io/params/__init__.py b/proteobench/io/params/__init__.py
@@ -89,7 +89,7 @@ class ProteoBenchParameters:
     min_precursor_charge: Optional[int] = None  # precursor_charge
     max_precursor_charge: Optional[int] = None
     scan_window: Optional[int] = None  # DIA-specific
-    quantification_method_DIANN: Optional[str] = None  # DIANN-specific
+    quantification_method: Optional[str] = None
     second_pass: Optional[bool] = None  # DIANN specific
     protein_inference: Optional[str] = None
     predictors_library: Optional[dict] = None
diff --git a/proteobench/io/params/alphadia.py b/proteobench/io/params/alphadia.py
@@ -292,7 +292,6 @@ def extract_params(fname: str) -> ProteoBenchParameters:
         "variable_mods": parsed_settings["library_prediction"]["variable_modifications"].strip(),
         "max_mods": int(parsed_settings["library_prediction"]["max_var_mod_num"]),
         "scan_window": int(parsed_settings["selection_config"]["max_size_rt"]),
-        "quantification_method_DIANN": None,
         "second_pass": None,
         "protein_inference": parsed_settings["fdr"]["inference_strategy"].strip(),
         "predictors_library": "Built-in",

diff --git a/proteobench/io/params/diann.py b/proteobench/io/params/diann.py
@@ -253,7 +253,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:
         "software_name": "DIA-NN",
         "search_engine": "DIA-NN",
         "enable_match_between_runs": False,
-        "quantification_method_DIANN": "QuantUMS high-precision",
+        "quantification_method": "QuantUMS high-precision",
         "protein_inference": "Heuristic protein inference",
     }
 
@@ -274,7 +274,7 @@ def extract_params(fname: str) -> ProteoBenchParameters:
     cmdline_dict = parse_cmdline_string(cmdline_string)
 
     parameters["second_pass"] = "double-search" in cmdline_dict.keys() or "double-pass" in cmdline_dict.keys()
-    parameters["quantification_method_DIANN"] = parse_quantification_strategy(cmdline_dict)
+    parameters["quantification_method"] = parse_quantification_strategy(cmdline_dict)
     parameters["protein_inference"] = parse_protein_inference_method(cmdline_dict)
     parameters["predictors_library"] = parse_predictors_library(cmdline_dict)
 

diff --git a/proteobench/io/params/fragger.py b/proteobench/io/params/fragger.py
@@ -150,7 +150,7 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
             params.enable_match_between_runs = True
         else:
             params.enable_match_between_runs = False
-        params.quantification_method_DIANN = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]
+        params.quantification_method = diann_quant_dict[int(fragpipe_params.loc["diann.quantification-strategy"])]
     if fragpipe_params.loc["protein-prophet.run-protein-prophet"] == "true":
         params.protein_inference = "ProteinProphet: {}".format(fragpipe_params.loc["protein-prophet.cmd-opts"])
 

diff --git a/proteobench/io/params/msaid.py b/proteobench/io/params/msaid.py
@@ -0,0 +1,55 @@
+import pandas as pd
+import pathlib
+
+from proteobench.io.params import ProteoBenchParameters
+
+
+def extract_params(fname: str) -> ProteoBenchParameters:
+    """
+    Build a ProteoBenchParameters object from an MSAID parameter CSV.
+
+    The file is loaded with ``pandas.read_csv`` and every data row is turned
+    into one ``name -> value`` entry of a lookup dictionary.
+
+    Parameters
+    ----------
+    fname : str
+        Location of the MSAID parameter file; handed to ``pandas.read_csv``.
+
+    Returns
+    -------
+    ProteoBenchParameters
+        Parameters holding the defaults below plus the values read from the file.
+
+    Raises
+    ------
+    KeyError
+        If one of the expected setting names is not present in the file.
+    IndexError
+        If the "Algorithm" value has no second space-separated token.
+    """
+    # ProteoBench field <- MSAID setting name, copied verbatim in this order.
+    copied_settings = (
+        ("fragment_mass_tolerance", "Fragment Mass Tolerance"),
+        ("enzyme", "Enzyme"),
+        ("allowed_miscleavages", "Max. Missed Cleavage Sites"),
+        ("min_peptide_length", "Min. Peptide Length"),
+        ("max_peptide_length", "Max. Peptide Length"),
+        ("fixed_mods", "Static Modifications"),
+        ("variable_mods", "Variable Modifications"),
+        ("max_mods", "Maximum Number of Modifications"),
+        ("min_precursor_charge", "Min. Peptide Charge"),
+        ("max_precursor_charge", "Max. Peptide Charge"),
+        ("quantification_method", "Quantification Type"),
+    )
+
+    # Starting values; several of them are replaced by file contents further down.
+    parameters = {
+        "software_name": "MSAID",
+        "search_engine": "Chimerys",
+        "search_engine_version": "4.1.1",
+        "quantification_method": "MS2 Area",
+        "ident_fdr_psm": "0.01",
+        "ident_fdr_peptide": "0.01",
+        "ident_fdr_protein": "0.01",
+        "enable_match_between_runs": False,
+    }
+
+    # Load the file; each row becomes one (setting name, setting value) pair.
+    settings_table = pd.read_csv(fname)
+    params_dict = dict(settings_table.itertuples(index=False, name=None))
+
+    # First token of "Algorithm" is the engine name, second token its version.
+    algorithm_tokens = params_dict["Algorithm"].split(" ")
+    parameters["search_engine"] = algorithm_tokens[0]
+    parameters["search_engine_version"] = algorithm_tokens[1]
+
+    for field_name, msaid_name in copied_settings:
+        parameters[field_name] = params_dict[msaid_name]
+
+    # Match-between-runs is flagged when the quantification type mentions "Quan in all file".
+    parameters["enable_match_between_runs"] = "Quan in all file" in parameters["quantification_method"]
+
+    return ProteoBenchParameters(**parameters)
+
+
+if __name__ == "__main__":
+    # Script mode: parse each example parameter file and store the extracted
+    # parameters as a tab-separated file with the same name but a .tsv suffix.
+    example_files = [
+        "../../../test/params/MSAID_default_params.csv",
+    ]
+    for example_path in map(pathlib.Path, example_files):
+        extracted = extract_params(example_path)
+        pd.Series(extracted.__dict__).to_csv(example_path.with_suffix(".tsv"), sep="\t")
diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_msaid.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_msaid.toml
@@ -0,0 +1,39 @@
+[mapper]
+"MODIFIED_SEQUENCE" = "Sequence"
+"RAW_FILE_NAME" = "Raw file"
+"PRECURSOR_CHARGE" = "Charge"
+"QUANTIFICATION" = "Intensity"
+"FASTA_HEADERS" = "Proteins"
+
+[condition_mapper]
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "A"
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "A"
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "A"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "B"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "B"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "B"
+
+[run_mapper]
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw" = "Condition_A_Sample_Alpha_01"
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw" = "Condition_A_Sample_Alpha_02"
+"LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw" = "Condition_A_Sample_Alpha_03"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw" = "Condition_B_Sample_Alpha_01"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw" = "Condition_B_Sample_Alpha_02"
+"LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw" = "Condition_B_Sample_Alpha_03"
+
+[species_mapper]
+"_YEAST" = "YEAST"
+"_ECOLI" = "ECOLI"
+"_HUMAN" = "HUMAN"
+
+[modifications_parser]
+"parse_column" = "Sequence"
+"before_aa" = false
+"isalpha" = true
+"isupper" = true
+"pattern" = "\\[(.*?)\\]"
+"modification_dict" = {"[unimod:4]" = "Carbamidomethyl", "[unimod:1]" = "Acetyl", "[unimod:35]" = "Oxidation"}
+
+[general]
+"contaminant_flag" = "Cont_"
+"decoy_flag" = false
diff --git a/proteobench/io/parsing/parse_ion.py b/proteobench/io/parsing/parse_ion.py
@@ -48,11 +48,16 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
     elif input_format == "AlphaDIA":
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
+        mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
+        mapper_df = pd.read_csv(mapper_path).set_index("gene_name")
+        mapper = mapper_df["description"].to_dict()
+        input_data_frame["Proteins"] = input_data_frame["genes"].map(
+            lambda x: ";".join([mapper[protein] if protein in mapper.keys() else protein for protein in x.split(";")])
+        )
         input_data_frame["proforma"] = input_data_frame.apply(
             lambda x: aggregate_modification_sites_column(x.sequence, x.mods, x.mod_sites),
             axis=1,
         )
-        input_data_frame["Proteins"] = input_data_frame["genes"] + "/" + input_data_frame["pg_master"]
     elif input_format == "FragPipe (DIA-NN quant)":
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
         mapper_path = os.path.join(os.path.dirname(__file__), "io_parse_settings/mapper.csv")
@@ -76,6 +81,8 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
             lambda x: [mapper[protein] if protein in mapper.keys() else protein for protein in x]
         )
         input_data_frame["Proteins"] = input_data_frame["Proteins"].str.join(";")
+    elif input_format == "MSAID":
+        input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
 
     return input_data_frame
 

diff --git a/proteobench/io/parsing/parse_settings_ion.py b/proteobench/io/parsing/parse_settings_ion.py
@@ -36,6 +36,7 @@ def __init__(self, parse_settings_dir=None, acquisition_method="dda"):
                 "FragPipe": os.path.join(parse_settings_dir, "parse_settings_fragpipe_DIA.toml"),
                 "Spectronaut": os.path.join(parse_settings_dir, "parse_settings_spectronaut.toml"),
                 "AlphaDIA": os.path.join(parse_settings_dir, "parse_settings_alphadia.toml"),
+                "MSAID": os.path.join(parse_settings_dir, "parse_settings_msaid.toml"),
                 "Custom": os.path.join(parse_settings_dir, "parse_settings_custom_DIA_quant_ion.toml"),
             }
         else:

diff --git a/proteobench/modules/quant_base/quant_base_module.py b/proteobench/modules/quant_base/quant_base_module.py
@@ -26,6 +26,7 @@
 from proteobench.io.params.maxquant import extract_params as extract_params_maxquant
 from proteobench.io.params.proline import extract_params as extract_params_proline
 from proteobench.io.params.sage import extract_params as extract_params_sage
+from proteobench.io.params.msaid import extract_params as extract_params_msaid
 from proteobench.io.parsing.parse_ion import load_input_file
 from proteobench.io.parsing.parse_settings_ion import ParseSettingsBuilder
 from proteobench.score.quant.quantscores import QuantScores
@@ -72,6 +73,7 @@ def __init__(self, token: str = None, proteobench_repo_name: str = "", proteobot
         "DIA-NN": extract_params_diann,
         "AlphaDIA": extract_params_alphadia,
         "FragPipe (DIA-NN quant)": extract_params_fragger,
+        "MSAID": extract_params_msaid,
         # "Spectronaut": extract_params_spectronaut
     }
 

diff --git a/proteobench/plotting/plot_quant.py b/proteobench/plotting/plot_quant.py
@@ -80,6 +80,7 @@ def plot_metric(
             "Custom": "#7f7f7f",
             "Spectronaut": "#bcbd22",
             "FragPipe (DIA-NN quant)": "#ff7f00",
+            "MSAID": "#afff57",
             ##ffff33 /yellow so not ideal
         },
         mapping={"old": 10, "new": 20},