Merge branch 'main' into mean_median_plot

Proteobench · Dec 12, 2024 · 0fb6c04 · 0fb6c04
2 parents a6914bf + 4fdb82b
commit 0fb6c04
Show file tree

Hide file tree

Showing 52 changed files with 957 additions and 233 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -21,4 +21,4 @@
     "flake8.args": [
         "--max-line-length=120",
     ],
-}
+}
diff --git a/docs/available-modules/2-quant-lfq-ion-dda.md b/docs/available-modules/2-quant-lfq-ion-dda.md
@@ -18,7 +18,16 @@ Other modules will be more suited to explore further post-pocessing steps.
 A subset of the Q Exactive HF-X Orbitrap (Thermo Fisher) data dependent acquisition (DDA) data described by [Van Puyvelde et al., 2022](https://www.nature.com/articles/s41597-022-01216-6) was used as a benchmark dataset. Here, only the first biological replicate series (named “alpha”) was used, encompassing three technical replicates of two different conditions (referred to as “A” and “B”). The samples are a mixture of commercial peptide digest standards of the following species: Escherichia coli (P/N:186003196, Waters Corporation), Yeast (P/N: V7461, Promega) and Human (P/N: V6951, Promega), with logarithmic fold changes (log2FCs) of 0, −1 and 2 for respectively Human, Yeast and E.coli. 
 Please refer to the original publication for the full description of sample preparation and data acquisition parameters ([Van Puyvelde et al., 2022](https://www.nature.com/articles/s41597-022-01216-6)). 
 
-The files can be downloaded from the proteomeXchange repository [PXD028735](https://www.ebi.ac.uk/pride/archive/projects/PXD028735) or you can download them from the ProteoBench server here: [proteobench.cubimed.rub.de/datasets/raw_files/DDA/](https://proteobench.cubimed.rub.de/datasets/raw_files/DDA/)
+The files can be downloaded from the proteomeXchange repository [PXD028735](https://www.ebi.ac.uk/pride/archive/projects/PXD028735), make sure that you download the following raw files:
+
+- [LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_01.raw)
+- [LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_02.raw)
+- [LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_A_Sample_Alpha_03.raw)
+- [LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_01.raw)
+- [LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_02.raw)
+- [LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_DDA_Condition_B_Sample_Alpha_03.raw)
+
+Alternatively, you can download them from the ProteoBench server here: [proteobench.cubimed.rub.de/datasets/raw_files/DDA/](https://proteobench.cubimed.rub.de/datasets/raw_files/DDA/)
 
 **It is imperative not to rename the files once downloaded!**
 
@@ -87,10 +96,11 @@ Table 2 provides an overview of the required input files for public submission.
 
 In FragPipe output files, the protein identifiers matching a given ion are in two separate columns: "Proteins" and "Mapped Proteins". So we concatenate these two fields to have the protein groups.
 
-
 ### i2MassChroQ
-A ProteoBench-compatible format is available in i2MassChroQ through the button "ProteoBench export". It generates a tab-delimited file containing one row per quantified ion for metric calculation ("proteobench_export.tsv"; column headers are: "rawfile", "sequence", "ProForma", "charge", "proteins" and "area"); and a parameter file for public submission ("Project parameters.tsv"). Like with the other tools, the protein identifiers should be in the format "sp|P49327|FAS_HUMAN". 
+A ProteoBench-compatible format is available in i2MassChroQ through the button `ProteoBench export`. It generates a tab-delimited file containing one row per quantified ion for metric calculation ("proteobench_export.tsv"; column headers are: "rawfile", "sequence", "ProForma", "charge", "proteins" and "area"); and a parameter file for public submission ("Project parameters.tsv"). Like with the other tools, the protein identifiers should be in the format "sp|P49327|FAS_HUMAN". 
 Link to the i2MassChroQ documentation [here](http://pappso.inrae.fr/bioinfo/i2masschroq/documentation/html/).
+#### Specific information for searches with X!Tandem
+Among the default parameters of X!Tandem, "quick acetyl" and "quick pyrolidone" search for the variable modifications N-terminal acetylation and pyrolidone. Please turn these off if you don't want to include such modifications in your search. 
 
 ### MaxQuant
 By default, MaxQuant uses a contaminants-only fasta file that is located in the software folder (“contaminant.txt”). However, the fasta file provided for this module already contains a set of curated contaminant sequences. Therefore, in the MaxQuant settings (Global parameters > Sequences), **UNTICK the “Include contaminants” box**. 
@@ -104,7 +114,7 @@ The field "Proteins" in **the "evidence.txt" table should report proteins in the
 In the recent versions of MaxQuant, the default settings work perfectly (`Identifier rule = >([^\s]*)`; `Description rule = >(.*)`).
 Some older versions of MaxQuant do not provide the option to change fasta header parsing. These are not compatible with ProteoBench.
 
-### Proline Studio (work in progress..)
+### Proline Studio 
 Make sure that the peaklists are named with the same prefix as raw files. To do so in ProlineStudio, use peaklist names as sample names (manually or with automatic renaming option).
 
 ![ProlineStudio Naming](../../img/module_docs/quant_lfq_ion_DDA/ProlineStudio_naming.png)
@@ -115,6 +125,10 @@ The `Quantified peptide ions` tab reports the precursor ion quantities (retrieve
 
 For public submission, you can upload the same excel export, just make sure to have the tabs `Search settings and infos`, `Import and filters`, `Quant config`. For local usage and public submission, we strongly recommend to use the following [template.json](../../files_provided_to_users/quant_lfq_ion_DDA/ProlineStudio/template.json) to make sure that all the tabs and columns needed are exported to be correctly parsed. Make sure that no personal information is stored in the excel file before making it public. The version of ProlineStudio is only exported in the parameters from version 2.3. 
 
+### MSAngel (work in progress..)
+MSAngel allows building pipelines for bottom-up MS analysis with a choice of search engines, validation strategies and the Proline quantification. 
+More information can be found [here](https://www.profiproteomics.fr/ms-angel/).
+
 ### Sage
 
 1. Convert .raw files into .mzML using MSConvert or ThermoRawFileParser **(do not change the file names)**

diff --git a/docs/available-modules/4-quant-lfq-ion-dia-aif.md b/docs/available-modules/4-quant-lfq-ion-dia-aif.md
@@ -1,14 +1,13 @@
 # DIA quantification - precursor ions - AIF data
 
-This module compares the sensitivity and quantification accuracy for data-independent acquisition (DIA) data, namely All-Ion Fragmentation, on a Q Exactive HF-X Orbitrap (Thermo Fisher).
+This module compares the sensitivity and quantification accuracy for data-independent acquisition (DIA) data, namely All-Ion Fragmentation (AIF), on a Q Exactive HF-X Orbitrap (Thermo Fisher).
 Users can load their data and inspect the results privately. They can also make their outputs public by providing the associated parameter file and submitting the benchmark run to ProteoBench. By doing so, their workflow output will be stored alongside all other benchmark runs in ProteoBench and will be accessible to the entire community.
 
 **This module is not designed to compare later-stages post-processing of quantitative data such as missing value replacement, and we advise users to publically upload data without replacement of missing values and without manual filtering.**  
 
 We think that this module is more suited to evaluate the impact of (non exhaustive list):
 - search engine identification
 - peak picking
-- match between run
 - low-level ion signal normalisation
 
 Other modules will be more suited to explore further post-pocessing steps. 
@@ -18,7 +17,16 @@ Other modules will be more suited to explore further post-pocessing steps.
 A subset of the Q Exactive HF-X Orbitrap (Thermo Fisher) data independent acquisition (DIA) data described by [Van Puyvelde et al., 2022](https://www.nature.com/articles/s41597-022-01216-6) was used as a benchmark dataset (in the manuscript referred to as All-Ion Fragmentation (AIF)). Here, only the first biological replicate series (named “alpha”) was used, encompassing three technical replicates of two different conditions (referred to as “A” and “B”). The samples are a mixture of commercial peptide digest standards of the following species: Escherichia coli (P/N:186003196, Waters Corporation), Yeast (P/N: V7461, Promega) and Human (P/N: V6951, Promega), with logarithmic fold changes (log2FCs) of 0, −1 and 2 for respectively Human, Yeast and E.coli. 
 Please refer to the original publication for the full description of sample preparation and data acquisition parameters ([Van Puyvelde et al., 2022](https://www.nature.com/articles/s41597-022-01216-6)). 
 
-The files can be downloaded from the proteomeXchange repository PXD028735 (https://www.ebi.ac.uk/pride/archive/projects/PXD028735) or you can download them from the ProteoBench server here: https://proteobench.cubimed.rub.de/datasets/raw_files/DIA/
+The files can be downloaded from the proteomeXchange repository [PXD028735](https://www.ebi.ac.uk/pride/archive/projects/PXD028735), make sure that you download the following raw files:
+
+- [LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01.raw)
+- [LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_02.raw)
+- [LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03.raw)
+- [LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_01.raw)
+- [LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_02.raw)
+- [LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw](https://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD028735/LFQ_Orbitrap_AIF_Condition_B_Sample_Alpha_03.raw)
+
+Alternatively, you can download them from the ProteoBench server here: [proteobench.cubimed.rub.de/datasets/raw_files/DIA/](https://proteobench.cubimed.rub.de/datasets/raw_files/DIA/)
 
 **It is imperative not to rename the files once downloaded!**
 

diff --git a/proteobench/io/params/MSAngel.py b/proteobench/io/params/MSAngel.py
@@ -0,0 +1,122 @@
+"""MSAngel creates modular pipelines that allows several search engines to identify 
+peptides, which are then quantified with Proline.
+The parameters are provided in a .json file.
+MSAngel allows for multiple search engines to be used in the same pipeline. So it 
+requires a list of search engines and their respective parameters, which are then 
+concatenated.
+
+Relevant information in file:
+
+"""
+
+import json
+import pathlib
+from typing import Union
+
+import pandas as pd
+
+from proteobench.io.params import ProteoBenchParameters
+
+
+def extract_search_engine(search_params: dict) -> list:
+    """
+    Extract the search engine entries from the MSAngel JSON data.
+
+    The parameter format depends on the search engine used, so this function needs to be
+    updated for each search engine. Currently, it is set up for:
+    . Mascot
+
+    Returns one item per operation that has a "searchEnginesWithForms" key: its [0][0] element.
+    """
+    return [
+        operation["searchEnginesWithForms"][0][0]
+        for operation in search_params["operations"]
+        if "searchEnginesWithForms" in operation
+    ]
+
+
+def extract_params(fname: Union[str, pathlib.Path]) -> ProteoBenchParameters:
+    """
+    Parse MSAngel quantification tool JSON parameter file and extract relevant parameters.
+
+    Args:
+        fname (str or pathlib.Path): Path to the MSAngel JSON parameter file, or a buffer exposing `getvalue()`.
+
+    Returns:
+        ProteoBenchParameters: The extracted parameters as a `ProteoBenchParameters` object.
+    """
+    params = ProteoBenchParameters()
+
+    if hasattr(fname, "getvalue"):
+        # In-memory buffer: json.loads accepts both the str of a StringIO and the bytes of a BytesIO
+        file_contents = fname.getvalue()
+        data = json.loads(file_contents)
+    else:
+        # Otherwise, treat it as a file path
+        with open(fname, "r", encoding="utf-8") as file_contents:
+            data = json.load(file_contents)
+
+    # Extract parameters from the JSON data
+    params.software_name = "MSAngel"
+    params.software_version = data["msAngelVersion"]
+
+    ## Extract the search engine(s) parameters before concatenating them:
+    all_search_engines = extract_search_engine(data)
+    params.search_engines = ",".join(all_search_engines)  # str.join: lists have no .join()
+    all_search_engines = []
+    all_enzyme = []
+    all_allowed_miscleavages = []
+    all_fixed_mods = []
+    all_variable_mods = []
+
+    # TODO needs to have actual values
+    all_search_params = {}
+
+    for value in all_search_params.values():
+        all_search_engines.append(value["format"])
+        all_enzyme.append(value["enzyme"]["cleave_at"])
+        all_allowed_miscleavages.append(value["enzyme"]["missed_cleavages"])
+        all_fixed_mods.append(value["static_mods"])
+        all_variable_mods.append(value["variable_mods"])
+    # NOTE(review): confirm the top-level keys read below exist in MSAngel JSON exports
+    # TODO need to have an actual value
+    params.search_engine = ""
+    params.search_engine_version = data["version"]
+    params.enzyme = data["database"]["enzyme"]["cleave_at"]
+    params.allowed_miscleavages = data["database"]["enzyme"]["missed_cleavages"]
+    params.fixed_mods = data["database"]["static_mods"]
+    params.variable_mods = data["database"]["variable_mods"]
+
+    try:
+        params.precursor_mass_tolerance = data["precursor_tol"]["ppm"]
+    except KeyError:
+        params.precursor_mass_tolerance = data["precursor_tol"]["Da"]
+
+    params.fragment_mass_tolerance = data["fragment_tol"]["ppm"]
+    params.min_peptide_length = data["database"]["enzyme"]["min_len"]
+    params.max_peptide_length = data["database"]["enzyme"]["max_len"]
+    params.max_mods = data["database"]["max_variable_mods"]
+    params.min_precursor_charge = data["precursor_charge"][0]
+    params.max_precursor_charge = data["precursor_charge"][1]
+    params.enable_match_between_runs = True
+
+    return params
+
+
+if __name__ == "__main__":
+    """
+    Extract parameters from MSAngel JSON files and save them as CSV.
+    """
+    from pathlib import Path
+
+    # Input path is relative to the current working directory
+    json_path = Path("../../../test/params/msangel_results.json")
+    csv_path = json_path.with_suffix(".csv")
+
+    # Parse the parameter file into a ProteoBenchParameters object
+    parsed = extract_params(json_path)
+
+    # The attribute dictionary becomes a pandas Series, which is written
+    # to the same path as the input but with a .csv suffix
+    parsed_series = pd.Series(vars(parsed))
+    parsed_series.to_csv(csv_path)
diff --git a/proteobench/io/params/__init__.py b/proteobench/io/params/__init__.py
@@ -31,11 +31,11 @@ class ProteoBenchParameters:
     enable_match_between_runs : Optional[bool]
         Match between run (also named cross assignment) is enabled.
     precursor_mass_tolerance : Optional[str]
-       Precursor mass tolerance used for the search,
-       associated with the unit: "20 ppm" = +/- 20 ppm; if several, separate with "|".
+       Precursor mass tolerance used for the search.
+       Given as an interval of upper and lower tolerance, e.g. [-20 ppm, 20 ppm].
     fragment_mass_tolerance : Optional[str]
         Precursor mass tolerance used for the search:
-        "20 ppm" = +/- 20 ppm; if several, separate with "|"
+        Given as an interval of upper and lower tolerance, e.g. [-0.02 Da, 0.02 Da].
     enzyme : Optional[str]
         Enzyme used as parameter for the search. If several, use "|".
     allowed_miscleavages : Optional[int]
@@ -73,16 +73,15 @@ class ProteoBenchParameters:
     search_engine: Optional[str] = None
     search_engine_version: Optional[str] = None
     ident_fdr_psm: Optional[str] = None  # fdr_psm
-    ident_fdr_peptide: Optional[str] = None  # fdr_peptide
-    ident_fdr_protein: Optional[str] = None  # fdr_protein
+    ident_fdr_peptide: Optional[float] = None  # fdr_peptide
+    ident_fdr_protein: Optional[float] = None  # fdr_protein
     enable_match_between_runs: Optional[bool] = None  # MBR
-    # TODO: either add the units for the tolerance here or remove them from the webpage/plot/etc.
     precursor_mass_tolerance: Optional[str] = None  # precursor_tol, precursor_tol_unit
     fragment_mass_tolerance: Optional[str] = None  # fragment_tol, fragment_tol_unit
     enzyme: Optional[str] = None  # enzyme_name
     allowed_miscleavages: Optional[int] = None  # missed_cleavages
-    min_peptide_length: Optional[str] = None  # min_pep_length
-    max_peptide_length: Optional[str] = None  # max_pep_length
+    min_peptide_length: Optional[int] = None  # min_pep_length
+    max_peptide_length: Optional[int] = None  # max_pep_length
     fixed_mods: Optional[str] = None  # fixed_modifications
     variable_mods: Optional[str] = None  # variable_modifications
     max_mods: Optional[int] = None  # max_num_modifications

diff --git a/proteobench/io/params/alphapept.py b/proteobench/io/params/alphapept.py
@@ -39,6 +39,8 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     # Extract FASTA related details
     fasta = record["fasta"]
     params.enzyme = fasta["protease"]
+    if params.enzyme == "trypsin":
+        params.enzyme = "Trypsin"
     params.allowed_miscleavages = fasta["n_missed_cleavages"]
     params.fixed_mods = ",".join(fasta["mods_fixed"])
     params.variable_mods = ",".join(fasta["mods_variable"])
@@ -51,8 +53,12 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
     _tolerance_unit = "Da"  # Default unit is Da
     if search["ppm"]:
         _tolerance_unit = "ppm"
-    params.precursor_mass_tolerance = f'{search["prec_tol"]} {_tolerance_unit}'
-    params.fragment_mass_tolerance = f'{search["frag_tol"]} {_tolerance_unit}'
+    params.precursor_mass_tolerance = (
+        f'[-{search["prec_tol"]} {_tolerance_unit}, {search["prec_tol"]} {_tolerance_unit}]'
+    )
+    params.fragment_mass_tolerance = (
+        f'[-{search["frag_tol"]} {_tolerance_unit}, {search["frag_tol"]} {_tolerance_unit}]'
+    )
     params.ident_fdr_protein = search["protein_fdr"]
     params.ident_fdr_peptide = search["peptide_fdr"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,4 +21,4 @@ @@
         "flake8.args": [
             "--max-line-length=120",
         ],
-    }
+    }