Skip to content

Commit

Permalink
Merge pull request #492 from Proteobench/380-homogenization-of-parame…
Browse files Browse the repository at this point in the history
…ters

380 homogenization of parameters
  • Loading branch information
RobbinBouwmeester authored Dec 11, 2024
2 parents a181476 + 1a206c1 commit cbebe0a
Show file tree
Hide file tree
Showing 35 changed files with 403 additions and 187 deletions.
15 changes: 7 additions & 8 deletions proteobench/io/params/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ class ProteoBenchParameters:
enable_match_between_runs : Optional[bool]
Match between runs (also named cross assignment) is enabled.
precursor_mass_tolerance : Optional[str]
Precursor mass tolerance used for the search,
associated with the unit: "20 ppm" = +/- 20 ppm; if several, separate with "|".
Precursor mass tolerance used for the search.
Given as an interval of upper and lower tolerance, e.g. [-20 ppm, 20 ppm].
fragment_mass_tolerance : Optional[str]
Fragment mass tolerance used for the search:
"20 ppm" = +/- 20 ppm; if several, separate with "|"
Given as an interval of upper and lower tolerance, e.g. [-0.02 Da, 0.02 Da].
enzyme : Optional[str]
Enzyme used as parameter for the search. If several, use "|".
allowed_miscleavages : Optional[int]
Expand Down Expand Up @@ -73,16 +73,15 @@ class ProteoBenchParameters:
search_engine: Optional[str] = None
search_engine_version: Optional[str] = None
ident_fdr_psm: Optional[str] = None # fdr_psm
ident_fdr_peptide: Optional[str] = None # fdr_peptide
ident_fdr_protein: Optional[str] = None # fdr_protein
ident_fdr_peptide: Optional[float] = None # fdr_peptide
ident_fdr_protein: Optional[float] = None # fdr_protein
enable_match_between_runs: Optional[bool] = None # MBR
# TODO: either add the units for the tolerance here or remove them from the webpage/plot/etc.
precursor_mass_tolerance: Optional[str] = None # precursor_tol, precursor_tol_unit
fragment_mass_tolerance: Optional[str] = None # fragment_tol, fragment_tol_unit
enzyme: Optional[str] = None # enzyme_name
allowed_miscleavages: Optional[int] = None # missed_cleavages
min_peptide_length: Optional[str] = None # min_pep_length
max_peptide_length: Optional[str] = None # max_pep_length
min_peptide_length: Optional[int] = None # min_pep_length
max_peptide_length: Optional[int] = None # max_pep_length
fixed_mods: Optional[str] = None # fixed_modifications
variable_mods: Optional[str] = None # variable_modifications
max_mods: Optional[int] = None # max_num_modifications
Expand Down
10 changes: 8 additions & 2 deletions proteobench/io/params/alphapept.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
# Extract FASTA related details
fasta = record["fasta"]
params.enzyme = fasta["protease"]
if params.enzyme == "trypsin":
params.enzyme = "Trypsin"
params.allowed_miscleavages = fasta["n_missed_cleavages"]
params.fixed_mods = ",".join(fasta["mods_fixed"])
params.variable_mods = ",".join(fasta["mods_variable"])
Expand All @@ -51,8 +53,12 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
_tolerance_unit = "Da" # Default unit is Da
if search["ppm"]:
_tolerance_unit = "ppm"
params.precursor_mass_tolerance = f'{search["prec_tol"]} {_tolerance_unit}'
params.fragment_mass_tolerance = f'{search["frag_tol"]} {_tolerance_unit}'
params.precursor_mass_tolerance = (
f'[-{search["prec_tol"]} {_tolerance_unit}, {search["prec_tol"]} {_tolerance_unit}]'
)
params.fragment_mass_tolerance = (
f'[-{search["frag_tol"]} {_tolerance_unit}, {search["frag_tol"]} {_tolerance_unit}]'
)
params.ident_fdr_protein = search["protein_fdr"]
params.ident_fdr_peptide = search["peptide_fdr"]

Expand Down
29 changes: 25 additions & 4 deletions proteobench/io/params/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,14 +286,34 @@ def extract_params(fname: str) -> ProteoBenchParameters:
else:
parameters[proteobench_setting] = parse_setting(proteobench_setting, cmdline_dict[cmd_setting])

# Parse cut parameter to standard enzyme name
if parameters["enzyme"] == "K*,R*":
parameters["enzyme"] = "Trypsin/P"
elif parameters["enzyme"] == "K*,R*,!*P":
parameters["enzyme"] = "Trypsin"

# If mass-acc flag is not present in cmdline string, extract it from the log file
if "precursor_mass_tolerance" not in parameters.keys():
mass_tol = extract_with_regex(lines, mass_tolerance_regex)
parameters["precursor_mass_tolerance"] = mass_tol + " ppm"
parameters["fragment_mass_tolerance"] = mass_tol + " ppm"
parameters["precursor_mass_tolerance"] = "[-" + mass_tol + " ppm" + ", " + mass_tol + " ppm]"
parameters["fragment_mass_tolerance"] = "[-" + mass_tol + " ppm" + ", " + mass_tol + " ppm]"
else:
parameters["precursor_mass_tolerance"] = str(parameters["precursor_mass_tolerance"]) + " ppm"
parameters["fragment_mass_tolerance"] = str(parameters["fragment_mass_tolerance"]) + " ppm"
parameters["precursor_mass_tolerance"] = (
"[-"
+ str(parameters["precursor_mass_tolerance"])
+ " ppm"
+ ", "
+ str(parameters["precursor_mass_tolerance"])
+ " ppm]"
)
parameters["fragment_mass_tolerance"] = (
"[-"
+ str(parameters["fragment_mass_tolerance"])
+ " ppm"
+ ", "
+ str(parameters["fragment_mass_tolerance"])
+ " ppm]"
)

# If the scan window is not set explicitly, extract it from the log file
parameters["scan_window"] = int(extract_with_regex(lines, scan_window_regex))
Expand All @@ -311,4 +331,5 @@ def extract_params(fname: str) -> ProteoBenchParameters:
params = extract_params(file)
data_dict = params.__dict__
series = pd.Series(data_dict)
print(series)
series.to_csv(file.with_suffix(".csv"))
19 changes: 11 additions & 8 deletions proteobench/io/params/fragger.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import logging
import pathlib
import pprint
import re
from collections import namedtuple
from io import BytesIO
Expand Down Expand Up @@ -124,29 +123,33 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
enzyme = fragpipe_params.loc["msfragger.search_enzyme_name_1"]
if fragpipe_params.loc["msfragger.search_enzyme_name_2"] != "null":
enzyme += f"|{fragpipe_params.loc['msfragger.search_enzyme_name_2']}"
if enzyme == "stricttrypsin":
enzyme = "Trypsin/P" # strict trypsin: always cut after K and R
elif enzyme == "trypsin":
enzyme = "Trypsin" # trypsin: do not cut before P
params.enzyme = enzyme
params.allowed_miscleavages = fragpipe_params.loc["msfragger.allowed_missed_cleavage_1"]
params.allowed_miscleavages = int(fragpipe_params.loc["msfragger.allowed_missed_cleavage_1"])

# Modifications
params.fixed_mods = fragpipe_params.loc["msfragger.table.fix-mods"]
params.variable_mods = fragpipe_params.loc["msfragger.table.var-mods"]
params.max_mods = fragpipe_params.loc["msfragger.max_variable_mods_per_peptide"]
params.max_mods = int(fragpipe_params.loc["msfragger.max_variable_mods_per_peptide"])

# Peptide length
params.min_peptide_length = fragpipe_params.loc["msfragger.digest_min_length"]
params.max_peptide_length = fragpipe_params.loc["msfragger.digest_max_length"]
params.min_peptide_length = int(fragpipe_params.loc["msfragger.digest_min_length"])
params.max_peptide_length = int(fragpipe_params.loc["msfragger.digest_max_length"])

# Precursor mass tolerance
precursor_mass_units = "Da"
if int(fragpipe_params.loc["msfragger.precursor_mass_units"]):
precursor_mass_units = "ppm"
params.precursor_mass_tolerance = f'{fragpipe_params.loc["msfragger.precursor_mass_lower"]} {precursor_mass_units}|{fragpipe_params.loc["msfragger.precursor_mass_upper"]} {precursor_mass_units}'
params.precursor_mass_tolerance = f'[{fragpipe_params.loc["msfragger.precursor_mass_lower"]} {precursor_mass_units}, {fragpipe_params.loc["msfragger.precursor_mass_upper"]} {precursor_mass_units}]'

# Fragment mass tolerance
fragment_mass_units = "Da"
if int(fragpipe_params.loc["msfragger.fragment_mass_units"]):
fragment_mass_units = "ppm"
params.fragment_mass_tolerance = f'{fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}'
params.fragment_mass_tolerance = f'[-{fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}, {fragpipe_params.loc["msfragger.fragment_mass_tolerance"]} {fragment_mass_units}]'

# Quantification settings
if fragpipe_params.loc["quantitation.run-label-free-quant"] == "true":
Expand Down Expand Up @@ -205,6 +208,6 @@ def extract_params(file: BytesIO) -> ProteoBenchParameters:
df.to_csv(file.with_suffix(".csv"))
with open(file, "rb") as f:
params = extract_params(f)
pprint(params.__dict__)
series = pd.Series(params.__dict__)
print(series)
series.to_csv(file.parent / f"{file.stem}_extracted_params.csv")
48 changes: 28 additions & 20 deletions proteobench/io/params/i2masschroq.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,33 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
params = pd.read_csv(fname, sep="\t", header=None, index_col=0).squeeze()

# Construct tolerance strings for fragment and parent mass errors
_tol_frag = "{} ({})".format(
_tol_frag = "{} {}".format(
params.loc["spectrum, fragment monoisotopic mass error"],
params.loc["spectrum, fragment monoisotopic mass error units"],
params.loc["spectrum, fragment monoisotopic mass error units"].replace("Daltons", "Da"),
)

# Assert the symmetry of parent mass error tolerances
assert (
params.loc["spectrum, parent monoisotopic mass error minus"]
== params.loc["spectrum, parent monoisotopic mass error plus"]
), "not symmetric tolerance"

# Construct tolerance strings for parent mass error
_tol_prec = "{} ({})".format(
_tol_prec_lower = "{} {}".format(
params.loc["spectrum, parent monoisotopic mass error minus"],
params.loc["spectrum, parent monoisotopic mass error units"],
params.loc["spectrum, parent monoisotopic mass error units"].replace("Daltons", "Da"),
)

_tol_prec_upper = "{} {}".format(
params.loc["spectrum, parent monoisotopic mass error plus"],
params.loc["spectrum, parent monoisotopic mass error units"].replace("Daltons", "Da"),
)

# Max missed cleavage sites, either from scoring or refinement
max_cleavage = params.loc["scoring, maximum missed cleavage sites"]
if params.loc["refine"] == "yes":
max_cleavage = params.loc["refine, maximum missed cleavage sites"]
max_cleavage = int(params.loc["refine, maximum missed cleavage sites"])

_enzyme = str(params.loc["protein, cleavage site"])
# Replace the enzyme pattern with the enzyme name used in ProteoBench
if _enzyme == "[RK]|{P}":
_enzyme = "Trypsin"
elif _enzyme == "[RK]":
_enzyme = "Trypsin/P"

fixed_mods_list = list(params.loc[params.index.str.contains("residue, modification mass")].dropna())
var_mods_list = list(params.loc[params.index.str.contains("residue, potential modification mass")].dropna())
Expand All @@ -57,22 +63,23 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
software_name="i2MassChroQ",
software_version=params.loc["i2MassChroQ_VERSION"],
search_engine=params.loc["AnalysisSoftware_name"],
search_engine_version=params.loc["AnalysisSoftware_version"],
ident_fdr_psm=params.loc["psm_fdr"],
ident_fdr_peptide=params.loc["peptide_fdr"],
ident_fdr_protein=params.loc["protein_fdr"],
enable_match_between_runs=params.loc["mcq_mbr"],
precursor_mass_tolerance=_tol_prec,
fragment_mass_tolerance=_tol_frag,
enzyme=params.loc["protein, cleavage site"],
search_engine_version=str(params.loc["AnalysisSoftware_version"] or ""),
ident_fdr_psm=float(params.loc["psm_fdr"]),
ident_fdr_peptide=float(params.loc["peptide_fdr"]),
ident_fdr_protein=float(params.loc["protein_fdr"]),
# set match between runs to True if it is enabled
enable_match_between_runs=True if params.loc["mcq_mbr"] == "T" else False,
precursor_mass_tolerance="[-" + _tol_prec_lower + ", " + _tol_prec_upper + "]",
fragment_mass_tolerance="[-" + _tol_frag + ", " + _tol_frag + "]",
enzyme=_enzyme,
allowed_miscleavages=max_cleavage,
min_peptide_length=None, # "spectrum, minimum fragment mz"
max_peptide_length=None, # Not mentioned, up to 38 AA in peptides
fixed_mods=";".join(fixed_mods_list),
variable_mods=";".join(var_mods_list),
max_mods=None,
min_precursor_charge=1, # Fixed in software
max_precursor_charge=params.loc["spectrum, maximum parent charge"],
max_precursor_charge=int(params.loc["spectrum, maximum parent charge"]),
)

return params
Expand All @@ -95,6 +102,7 @@ def extract_params(fname: pathlib.Path) -> ProteoBenchParameters:
# Convert the parameters to a dictionary and then to a pandas Series
data_dict = params.__dict__
series = pd.Series(data_dict)
print(series)

# Write the Series to a CSV file
series.to_csv(file.parent / (file.stem + "_sel.csv"))
6 changes: 4 additions & 2 deletions proteobench/io/params/maxdia.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ def extract_params(fname: str) -> ProteoBenchParameters:
series = build_Series_from_records(read_file(fname)).reset_index()

# Extract and set peptide length parameters
parameters.min_peptide_length = series.loc[series["level_0"] == "minPeptideLength", 0].values[0]
parameters.max_peptide_length = series.loc[series["level_0"] == "maxPeptideLengthForUnspecificSearch", 0].values[0]
parameters.min_peptide_length = int(series.loc[series["level_0"] == "minPeptideLength", 0].values[0])
parameters.max_peptide_length = int(
series.loc[series["level_0"] == "maxPeptideLengthForUnspecificSearch", 0].values[0]
)

# Set search engine version from software version
parameters.search_engine_version = parameters.__dict__["software_version"]
Expand Down
30 changes: 16 additions & 14 deletions proteobench/io/params/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,28 +137,30 @@ def extract_params(fname, ms2frac="FTMS") -> ProteoBenchParameters:
params.search_engine = "Andromeda"
params.software_version = record.loc["maxQuantVersion"].squeeze()
params.ident_fdr_psm = None
params.ident_fdr_peptide = record.loc["peptideFdr"].squeeze()
params.ident_fdr_protein = record.loc["proteinFdr"].squeeze()
params.enable_match_between_runs = record.loc["matchBetweenRuns"].squeeze()
precursor_mass_tolerance = record.loc[
params.ident_fdr_peptide = float(record.loc["peptideFdr"].squeeze())
params.ident_fdr_protein = float(record.loc["proteinFdr"].squeeze())
params.enable_match_between_runs = record.loc["matchBetweenRuns"].squeeze() == "True"
_precursor_mass_tolerance = record.loc[
pd.IndexSlice["parameterGroups", "parameterGroup", "mainSearchTol", :]
].squeeze()
params.precursor_mass_tolerance = f"{precursor_mass_tolerance} ppm"
_precursor_mass_tolerance = f"{_precursor_mass_tolerance} ppm"
params.precursor_mass_tolerance = "[-" + _precursor_mass_tolerance + ", " + _precursor_mass_tolerance + "]"
# ! differences between version >1.6 and <=1.5
fragment_mass_tolerance = record.loc[pd.IndexSlice["msmsParamsArray", "msmsParams", "MatchTolerance", :]].squeeze()
in_ppm = bool(record.loc[pd.IndexSlice["msmsParamsArray", "msmsParams", "MatchToleranceInPpm", :]].squeeze())
if in_ppm:
fragment_mass_tolerance = f"{fragment_mass_tolerance} ppm"
fragment_mass_tolerance = f"[-{fragment_mass_tolerance}, {fragment_mass_tolerance}]"
params.fragment_mass_tolerance = fragment_mass_tolerance
params.enzyme = record.loc[("parameterGroups", "parameterGroup", "enzymes", "string")].squeeze()
params.allowed_miscleavages = record.loc[
pd.IndexSlice["parameterGroups", "parameterGroup", "maxMissedCleavages", :]
].squeeze()
params.allowed_miscleavages = int(
record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "maxMissedCleavages", :]].squeeze()
)
try:
params.min_peptide_length = record.loc["minPepLen"].squeeze()
params.min_peptide_length = int(record.loc["minPepLen"].squeeze())
except KeyError:
# Version 2.6 and above
params.minPeptideLength = record.loc["minPeptideLength"].squeeze()
params.min_peptide_length = int(record.loc["minPeptideLength"].squeeze())
# minPeptideLengthForUnspecificSearch (what is it?)
params.max_peptide_length = None
# fixed mods
Expand All @@ -180,11 +182,11 @@ def extract_params(fname, ms2frac="FTMS") -> ProteoBenchParameters:
params.variable_mods = variable_mods
else:
params.variable_mods = ",".join(variable_mods)
params.max_mods = record.loc[("parameterGroups", "parameterGroup", "maxNmods")].squeeze()
params.max_mods = int(record.loc[("parameterGroups", "parameterGroup", "maxNmods")].squeeze())
params.min_precursor_charge = None
params.max_precursor_charge = record.loc[
pd.IndexSlice["parameterGroups", "parameterGroup", "maxCharge", :]
].squeeze()
params.max_precursor_charge = int(
record.loc[pd.IndexSlice["parameterGroups", "parameterGroup", "maxCharge", :]].squeeze()
)
return params


Expand Down
22 changes: 12 additions & 10 deletions proteobench/io/params/msaid.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ def extract_params(fname: str) -> ProteoBenchParameters:
"search_engine": "Chimerys",
"search_engine_version": "4.1.1",
"quantification_method": "MS2 Area",
"ident_fdr_psm": "0.01",
"ident_fdr_peptide": "0.01",
"ident_fdr_protein": "0.01",
"ident_fdr_psm": 0.01,
"ident_fdr_peptide": 0.01,
"ident_fdr_protein": 0.01,
"enable_match_between_runs": False,
}

Expand All @@ -37,16 +37,18 @@ def extract_params(fname: str) -> ProteoBenchParameters:
# Extract relevant parameters from the file's dictionary
parameters["search_engine"] = params_dict["Algorithm"].split(" ")[0]
parameters["search_engine_version"] = params_dict["Algorithm"].split(" ")[1]
parameters["fragment_mass_tolerance"] = params_dict["Fragment Mass Tolerance"]
parameters["fragment_mass_tolerance"] = (
"[-" + params_dict["Fragment Mass Tolerance"] + ", " + params_dict["Fragment Mass Tolerance"] + "]"
)
parameters["enzyme"] = params_dict["Enzyme"]
parameters["allowed_miscleavages"] = params_dict["Max. Missed Cleavage Sites"]
parameters["min_peptide_length"] = params_dict["Min. Peptide Length"]
parameters["max_peptide_length"] = params_dict["Max. Peptide Length"]
parameters["allowed_miscleavages"] = int(params_dict["Max. Missed Cleavage Sites"])
parameters["min_peptide_length"] = int(params_dict["Min. Peptide Length"])
parameters["max_peptide_length"] = int(params_dict["Max. Peptide Length"])
parameters["fixed_mods"] = params_dict["Static Modifications"]
parameters["variable_mods"] = params_dict["Variable Modifications"]
parameters["max_mods"] = params_dict["Maximum Number of Modifications"]
parameters["min_precursor_charge"] = params_dict["Min. Peptide Charge"]
parameters["max_precursor_charge"] = params_dict["Max. Peptide Charge"]
parameters["max_mods"] = int(params_dict["Maximum Number of Modifications"])
parameters["min_precursor_charge"] = int(params_dict["Min. Peptide Charge"])
parameters["max_precursor_charge"] = int(params_dict["Max. Peptide Charge"])
parameters["quantification_method"] = params_dict["Quantification Type"]

# Set flag for enabling match between runs based on quantification method
Expand Down
Loading

0 comments on commit cbebe0a

Please sign in to comment.