From cfd39e80b4898077f92cacc6491a5c891c5a9454 Mon Sep 17 00:00:00 2001
From: VarunAnanth2003
Date: Fri, 23 Aug 2024 14:12:50 -0700
Subject: [PATCH] all comments addressed

---
 casanovo/config.yaml            |  7 +++-
 casanovo/data/db_utils.py       | 75 +++++++++++++++++++++++++++++-----
 casanovo/denovo/model_runner.py |  1 +
 tests/conftest.py               |  4 ++
 tests/unit_tests/test_unit.py   | 56 +++++++++++++++++++++++++++
 5 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
index 860cfabb..87795db8 100644
--- a/casanovo/config.yaml
+++ b/casanovo/config.yaml
@@ -46,7 +46,7 @@ devices:
 # See pyteomics.parser.expasy_rules for valid enzymes
 enzyme: "trypsin"
 # Digestion type for candidate peptide generation.
-# Full: standard digestion. Semi: Include products of semi-specific cleavage
+# full: standard digestion. semi: Include products of semi-specific cleavage
 digestion: "full"
 # Number of allowed missed cleavages when digesting protein
 missed_cleavages: 0
@@ -55,6 +55,11 @@ missed_cleavages: 0
 max_mods:
 # Maximum peptide length to consider
 max_peptide_len: 50
+# Toggle allowed modifications on/off
+# Permanent fixed mod (don't include): C+57.021
+# Allowed variable mods: M+15.995, N+0.984, Q+0.984
+# Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027
+allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027"
 
 
 ###
diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py
index d249e0c7..2bdf3828 100644
--- a/casanovo/data/db_utils.py
+++ b/casanovo/data/db_utils.py
@@ -15,16 +15,6 @@
 PROTON = 1.00727646677
 ISOTOPE_SPACING = 1.003355
 
-var_mods = {
-    "d": ["N", "Q"],
-    "ox": ["M"],
-    "ace-": True,
-    "carb-": True,
-    "nh3x-": True,
-    "carbnh3x-": True,
-}
-fixed_mods = {"carbm": ["C"]}
-
 
 class ProteinDatabase:
     """
@@ -51,6 +41,8 @@ class ProteinDatabase:
         The precursor mass tolerance in ppm.
     isotope_error : List[int]
         Isotopes to consider when comparing predicted and observed precursor m/z's.
+    allowed_mods : str
+        A comma-separated list of allowed modifications to consider.
     """
 
     def __init__(
@@ -64,7 +56,11 @@ def __init__(
         max_mods: int,
         precursor_tolerance: float,
         isotope_error: List[int],
+        allowed_mods: str,
     ):
+        self.fixed_mods, self.var_mods = self._construct_mods_dict(
+            allowed_mods
+        )
         self.digest = self._digest_fasta(
             fasta_path,
             enzyme,
@@ -197,8 +193,8 @@ def _digest_fasta(
         for pep, prot in peptide_list:
             peptide_isoforms = parser.isoforms(
                 pep,
-                variable_mods=var_mods,
-                fixed_mods=fixed_mods,
+                variable_mods=self.var_mods,
+                fixed_mods=self.fixed_mods,
                 max_mods=max_mods,
             )
             peptide_isoforms = list(
@@ -218,6 +214,61 @@ def _digest_fasta(
         logger.info("Digestion complete. %d peptides generated.", len(pdb_df))
         return pdb_df
 
+    def _construct_mods_dict(self, allowed_mods):
+        """
+        Constructs dictionaries of fixed and variable modifications.
+
+        Parameters
+        ----------
+        allowed_mods : str
+            A comma-separated list of allowed modifications. ``None`` or
+            an empty string enables no variable modifications.
+
+        Returns
+        -------
+        fixed_mods : dict
+            A dictionary of fixed modifications.
+        var_mods : dict
+            A dictionary of variable modifications.
+
+        Raises
+        ------
+        ValueError
+            If an entry of ``allowed_mods`` is not a recognized modification.
+        """
+        fixed_mods = {"carbm": ["C"]}
+        var_mods = {}
+
+        # Truthiness covers both None and "" (an unset or empty config value).
+        if not allowed_mods:
+            return fixed_mods, var_mods
+        for mod in allowed_mods.split(","):
+            if mod == "M+15.995":
+                if "ox" not in var_mods:
+                    var_mods["ox"] = []
+                var_mods["ox"].append("M")
+            elif mod == "N+0.984":
+                if "d" not in var_mods:
+                    var_mods["d"] = []
+                var_mods["d"].append("N")
+            elif mod == "Q+0.984":
+                if "d" not in var_mods:
+                    var_mods["d"] = []
+                var_mods["d"].append("Q")
+            elif mod == "+42.011":
+                var_mods["ace-"] = True
+            elif mod == "+43.006":
+                var_mods["carb-"] = True
+            elif mod == "-17.027":
+                var_mods["nh3x-"] = True
+            elif mod == "+43.006-17.027":
+                var_mods["carbnh3x-"] = True
+            else:
+                logger.error("Modification %s not recognized.", mod)
+                raise ValueError(f"Modification {mod} not recognized.")
+
+        return fixed_mods, var_mods
+
 @jit
 def _to_mz(precursor_mass, charge):
     """
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index b90f06b0..789c960b 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -124,6 +124,7 @@ def db_search(
             self.config.max_mods,
             self.config.precursor_mass_tol,
             self.config.isotope_error_range,
+            self.config.allowed_mods,
         )
         self.loaders.setup(stage="test", annotated=False)
         self.trainer.predict(self.model, self.loaders.db_dataloader())
diff --git a/tests/conftest.py b/tests/conftest.py
index f20d7879..452316c8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -299,6 +299,10 @@ def tiny_config(tmp_path):
             "-17.027": -17.026549,
             "+43.006-17.027": 25.980265,
         },
+        "allowed_mods": (
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     }
 
     cfg_file = tmp_path / "config.yml"
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index 2473a168..a31e2024 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -287,6 +287,10 @@ def test_digest_fasta_cleave(tiny_fasta_file):
             max_mods=0,
             precursor_tolerance=20,
             isotope_error=[0],
+            allowed_mods=(
+                "M+15.995,N+0.984,Q+0.984,"
+                "+42.011,+43.006,-17.027,+43.006-17.027"
+            ),
         )
         peptide_list = list(pdb.digest["peptide"])
         assert peptide_list == expected
@@ -356,6 +360,10 @@ def test_digest_fasta_mods(tiny_fasta_file):
         max_mods=1,
         precursor_tolerance=20,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     peptide_list = list(pdb.digest["peptide"])
     peptide_list = [
@@ -389,6 +397,10 @@ def test_length_restrictions(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=20,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     peptide_list = list(pdb.digest["peptide"])
     assert peptide_list == expected_long
@@ -403,6 +415,10 @@ def test_length_restrictions(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=20,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     peptide_list = list(pdb.digest["peptide"])
     assert peptide_list == expected_short
@@ -433,6 +449,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=20,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     peptide_list = list(pdb.digest["peptide"])
     assert peptide_list == expected_argc
@@ -447,6 +467,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=20,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     peptide_list = list(pdb.digest["peptide"])
     assert peptide_list == expected_aspn
@@ -472,6 +496,10 @@ def test_get_candidates(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
     assert expected_smallwindow == candidates
@@ -486,6 +514,10 @@ def test_get_candidates(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=150000,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
     assert expected_midwindow == candidates
@@ -500,6 +532,10 @@ def test_get_candidates(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=600000,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
     assert expected_widewindow == candidates
@@ -563,6 +599,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[0],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     pdb.digest = peptide_list
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
@@ -578,6 +618,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[1],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     pdb.digest = peptide_list
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
@@ -593,6 +637,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[2],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     pdb.digest = peptide_list
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
@@ -608,6 +656,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[3],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     pdb.digest = peptide_list
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
@@ -623,6 +675,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file):
         max_mods=0,
         precursor_tolerance=10000,
         isotope_error=[0, 1, 2, 3],
+        allowed_mods=(
+            "M+15.995,N+0.984,Q+0.984,"
+            "+42.011,+43.006,-17.027,+43.006-17.027"
+        ),
     )
     pdb.digest = peptide_list
     candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)