From 1825f084ce93c165e02f8d819d8942e5ad6a0209 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 26 Aug 2024 14:46:18 -0500 Subject: [PATCH 1/5] started mod methods --- mzLib/Omics/IBioPolymerWithSetMods.cs | 79 +++++++++++++++++++++++++++ mzLib/Test/TestPeptideWithSetMods.cs | 49 +++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index 1c3ade66a..cf29e6fa1 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -73,5 +73,84 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence) } return sb.ToString(); } + + /// + /// Returns a list of modifications and their OneBased index from a full sequence + /// + /// Full sequence + /// All known modifications + /// + public static Dictionary GetModificationDictionaryFromFullSequence(string fullSequence, + Dictionary allModsKnown) + { + var allModsOneIsNterminus = new Dictionary(); + var baseSequence = GetBaseSequenceFromFullSequence(fullSequence); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < fullSequence.Length; r++) + { + char c = fullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") + string modString = fullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message); + } + if (!allModsKnown.TryGetValue(modId, out var mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + fullSequence); + } + if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1) + { + currentModificationLocation = baseSequence.Length + 2; + } + allModsOneIsNterminus.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + + return allModsOneIsNterminus; + } + + /// + /// Returns a list of modifications from a full sequence + /// + /// Full sequence + /// All known modifications + /// + public static List GetModificationsFromFullSequence(string fullSequence, + Dictionary allModsKnown) => [.. GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown).Values]; } } diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index 355d9d27c..e481b8d6d 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -1181,5 +1181,54 @@ public static void TestPeptideWithSetModsNoParentProtein() Assert.AreEqual('-', last.NextAminoAcid); Assert.AreEqual('-', last.NextResidue); } + + [Test] + public static void TestIBioPolymerWithSetModsModificationFromFullSequence() + { + //Just making sure there are no snafus when creating decoy peptides from an xml,which will have mods in various places, etc. + //sequence variants, modifications + Dictionary un = new Dictionary(); + var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); + Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), formalChargesDictionary).ToList(); + List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), + true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); + var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); + + foreach (Protein p in proteins) + { + List targetPeptides = p.Digest(new DigestionParams(), [], [], null, null).ToList(); + foreach (PeptideWithSetModifications targetPeptide in targetPeptides) + { + // Pull our expected modifications based upon parent protein object + var expectedModCount = p.OneBasedPossibleLocalizedModifications.Count(mod => + mod.Key >= targetPeptide.OneBasedStartResidue && + mod.Key < targetPeptide.OneBasedEndResidue); + var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => + mod.Key >= targetPeptide.OneBasedStartResidue && + mod.Key < targetPeptide.OneBasedEndResidue).SelectMany(mod => mod.Value).ToList(); + + // Parse modifications from PWSM and two IBioPolymerWithSetMods methods + var pwsmModDict = targetPeptide.AllModsOneIsNterminus; + var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + + // Ensure all methods are in agreement by modification count + Assert.AreEqual(pwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModList.Count, expectedModCount); + + // Ensure all methods are in agreement by modification identify + List pwsmModList = pwsmModDict.Values.ToList(); + foreach (var expectedMod in expectedModifications) + { + Assert.Contains(expectedMod, pwsmModDict); + Assert.Contains(expectedMod, pwsmModList); + Assert.Contains(expectedMod, bpwsmModDict); + Assert.Contains(expectedMod, bpwsmModList); + } + } + } + } } } \ No newline at end of file From 2dee878750487df223a6a70c193394a05b96b390 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 26 Aug 2024 16:18:37 -0500 Subject: [PATCH 2/5] made the test pass --- mzLib/Test/TestPeptideWithSetMods.cs | 76 +++++++++++++++++++++------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index e481b8d6d..30c94eae1 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -8,6 +8,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using MzLibUtil; using Omics; using Omics.Digestion; using Omics.Fragmentation; @@ -1194,20 +1195,63 @@ public static void TestIBioPolymerWithSetModsModificationFromFullSequence() List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); + var digestionParameters = new DigestionParams(maxModsForPeptides: 3); foreach (Protein p in proteins) { - List targetPeptides = p.Digest(new DigestionParams(), [], [], null, null).ToList(); - foreach (PeptideWithSetModifications targetPeptide in targetPeptides) - { - // Pull our expected modifications based upon parent protein object - var expectedModCount = p.OneBasedPossibleLocalizedModifications.Count(mod => - mod.Key >= targetPeptide.OneBasedStartResidue && - mod.Key < targetPeptide.OneBasedEndResidue); + List digestedPeptides = p.Digest(digestionParameters, [], [], null, null).ToList(); + // take the most modified peptide by base sequence and ensure all methods function properly + foreach (var targetPeptide in digestedPeptides + .Where(pep => pep.FullSequence.Contains('[')) + .GroupBy(pep => pep.BaseSequence) + .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count))) + { + var startResidue = targetPeptide.OneBasedStartResidue; + var endResidue = targetPeptide.OneBasedEndResidue; + + // Pull our expected modifications based upon parent protein object with a maximum value of DigestionParameters.MaxMods + // A bunch of logic to count the number of expected modifications based upon the xml database entries + int expectedModCount = 0; + foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications + .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue)) + { + if (modDictEntry.Value.Count > 1) + { + var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList(); + + if (locRestrictions.AllSame()) + { + if (locRestrictions.First() == "Anywhere.") + expectedModCount++; + else if (locRestrictions.First() == "N-terminal." && modDictEntry.Key == startResidue) + expectedModCount++; + } + else if (modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("Anywhere.") + && modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("N-terminal.")) + { + expectedModCount++; + if (modDictEntry.Key == startResidue) + expectedModCount++; + } + } + else + { + switch (modDictEntry.Value.First().LocationRestriction) + { + case "Anywhere.": + case "N-terminal." when modDictEntry.Key == startResidue: + expectedModCount++; + break; + } + } + } + expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods); + var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => - mod.Key >= targetPeptide.OneBasedStartResidue && - mod.Key < targetPeptide.OneBasedEndResidue).SelectMany(mod => mod.Value).ToList(); + mod.Key >= startResidue && + mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList(); + // Parse modifications from PWSM and two IBioPolymerWithSetMods methods var pwsmModDict = targetPeptide.AllModsOneIsNterminus; var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); @@ -1219,14 +1263,12 @@ public static void TestIBioPolymerWithSetModsModificationFromFullSequence() Assert.AreEqual(bpwsmModList.Count, expectedModCount); // Ensure all methods are in agreement by modification identify - List pwsmModList = pwsmModDict.Values.ToList(); - foreach (var expectedMod in expectedModifications) - { - Assert.Contains(expectedMod, pwsmModDict); - Assert.Contains(expectedMod, pwsmModList); - Assert.Contains(expectedMod, bpwsmModDict); - Assert.Contains(expectedMod, bpwsmModList); - } + foreach (var pwsmModification in pwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModList) + Assert.Contains(pwsmModification, expectedModifications); } } } From 359843bf7b9aada7359aa55ddbdd031335762f63 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 26 Aug 2024 18:10:05 -0500 Subject: [PATCH 3/5] Removed GetMods after deserialization method in favor of IBiopolymerWithSetMods backing method --- .../PeptideWithSetModifications.cs | 64 +------------------ mzLib/Test/TestPeptideWithSetMods.cs | 21 +++--- 2 files changed, 13 insertions(+), 72 deletions(-) diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 1b7d32d61..d26945f5a 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -69,7 +69,7 @@ public PeptideWithSetModifications(string sequence, Dictionary public void SetNonSerializedPeptideInfo(Dictionary idToMod, Dictionary accessionToProtein, DigestionParams dp) { - GetModsAfterDeserialization(idToMod); + _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod); GetProteinAfterDeserialization(accessionToProtein); _digestionParams = dp; } @@ -919,66 +919,6 @@ public void SetNonSerializedPeptideInfo(Dictionary idToMod Dictionary accessionToProtein, IDigestionParams dp) => SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp); - private void GetModsAfterDeserialization(Dictionary idToMod) - { - _allModsOneIsNterminus = new Dictionary(); - int currentModStart = 0; - int currentModificationLocation = 1; - bool currentlyReadingMod = false; - int bracketCount = 0; - - for (int r = 0; r < FullSequence.Length; r++) - { - char c = FullSequence[r]; - if (c == '[') - { - currentlyReadingMod = true; - if (bracketCount == 0) - { - currentModStart = r + 1; - } - bracketCount++; - } - else if (c == ']') - { - string modId = null; - bracketCount--; - if (bracketCount == 0) - { - try - { - //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") - string modString = FullSequence.Substring(currentModStart, r - currentModStart); - int splitIndex = modString.IndexOf(':'); - string modType = modString.Substring(0, splitIndex); - modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); - } - catch (Exception e) - { - throw new MzLibUtil.MzLibException( - "Error while trying to parse string into peptide: " + e.Message); - } - if (!idToMod.TryGetValue(modId, out Modification mod)) - { - throw new MzLibUtil.MzLibException( - "Could not find modification while reading string: " + FullSequence); - } - if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) - { - currentModificationLocation = BaseSequence.Length + 2; - } - _allModsOneIsNterminus.Add(currentModificationLocation, mod); - currentlyReadingMod = false; - } - } - else if (!currentlyReadingMod) - { - currentModificationLocation++; - } - //else do nothing - } - } - private void GetProteinAfterDeserialization(Dictionary idToProtein) { Protein protein = null; diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index 30c94eae1..64ce5dbc7 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -1186,26 +1186,26 @@ public static void TestPeptideWithSetModsNoParentProtein() [Test] public static void TestIBioPolymerWithSetModsModificationFromFullSequence() { - //Just making sure there are no snafus when creating decoy peptides from an xml,which will have mods in various places, etc. - //sequence variants, modifications Dictionary un = new Dictionary(); var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); - List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), formalChargesDictionary).ToList(); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), + formalChargesDictionary).ToList(); List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); var digestionParameters = new DigestionParams(maxModsForPeptides: 3); - foreach (Protein p in proteins) - { - List digestedPeptides = p.Digest(digestionParameters, [], [], null, null).ToList(); + foreach (Protein p in proteins) + { + List digestedPeptides = + p.Digest(digestionParameters, [], [], null, null).ToList(); // take the most modified peptide by base sequence and ensure all methods function properly foreach (var targetPeptide in digestedPeptides .Where(pep => pep.FullSequence.Contains('[')) .GroupBy(pep => pep.BaseSequence) .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count))) - { + { var startResidue = targetPeptide.OneBasedStartResidue; var endResidue = targetPeptide.OneBasedEndResidue; @@ -1227,7 +1227,8 @@ public static void TestIBioPolymerWithSetModsModificationFromFullSequence() expectedModCount++; } else if (modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("Anywhere.") - && modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("N-terminal.")) + && modDictEntry.Value.Select(mod => mod.LocationRestriction) + .Contains("N-terminal.")) { expectedModCount++; if (modDictEntry.Key == startResidue) @@ -1245,13 +1246,13 @@ public static void TestIBioPolymerWithSetModsModificationFromFullSequence() } } } + expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods); var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => mod.Key >= startResidue && mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList(); - // Parse modifications from PWSM and two IBioPolymerWithSetMods methods var pwsmModDict = targetPeptide.AllModsOneIsNterminus; var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); @@ -1270,7 +1271,7 @@ public static void TestIBioPolymerWithSetModsModificationFromFullSequence() foreach (var pwsmModification in bpwsmModList) Assert.Contains(pwsmModification, expectedModifications); } - } + } } } } \ No newline at end of file From e6ef57ef7dc3294999cdd3df7d6656e821b0fb52 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 27 Aug 2024 15:56:46 -0500 Subject: [PATCH 4/5] added throws to summary comment and extended mzlibexception to have an inner exception. --- mzLib/MzLibUtil/MzLibException.cs | 11 +++-------- mzLib/Omics/IBioPolymerWithSetMods.cs | 4 +++- mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs | 4 ++-- mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs index cf86074d8..067b8e3ec 100644 --- a/mzLib/MzLibUtil/MzLibException.cs +++ b/mzLib/MzLibUtil/MzLibException.cs @@ -1,13 +1,8 @@ -using System; +#nullable enable +using System; namespace MzLibUtil { [Serializable] - public class MzLibException : Exception - { - public MzLibException(string message) - : base(message) - { - } - } + public class MzLibException(string message, Exception? innerException = null) : Exception(message, innerException); } \ No newline at end of file diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index cf29e6fa1..35136bb8d 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -80,6 +80,7 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence) /// Full sequence /// All known modifications /// + /// When a full sequence is not in the correct format or a mod is not found in the allModsKnown dictionary public static Dictionary GetModificationDictionaryFromFullSequence(string fullSequence, Dictionary allModsKnown) { @@ -119,7 +120,8 @@ public static Dictionary GetModificationDictionaryFromFullSeq catch (Exception e) { throw new MzLibUtil.MzLibException( - "Error while trying to parse string into peptide: " + e.Message); + "Error while trying to parse string into peptide: " + e.Message, e); + } if (!allModsKnown.TryGetValue(modId, out var mod)) { diff --git a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs index 1d7f1b231..1abb40e99 100644 --- a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs +++ b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs @@ -1103,7 +1103,7 @@ private void ParseSequence(string sequence) { modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString)); } - catch (MzLibException) + catch (MzLibException e) { if (double.TryParse(modString, out double mass)) { @@ -1111,7 +1111,7 @@ private void ParseSequence(string sequence) } else { - throw new MzLibException("Unable to correctly parse the following modification: " + modString); + throw new MzLibException("Unable to correctly parse the following modification: " + modString, e); } } diff --git a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs index 62a720c63..709b391ba 100644 --- a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs +++ b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs @@ -28,7 +28,7 @@ public static List ReadTsv(string filePath, out List Date: Wed, 23 Oct 2024 12:42:16 -0500 Subject: [PATCH 5/5] Made inner exception nullable in MzLibException --- mzLib/MzLibUtil/MzLibException.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs index 1609b59b4..885081433 100644 --- a/mzLib/MzLibUtil/MzLibException.cs +++ b/mzLib/MzLibUtil/MzLibException.cs @@ -4,6 +4,6 @@ namespace MzLibUtil { [Serializable] - public class MzLibException(string message, Exception innerException = null) + public class MzLibException(string message, Exception? innerException = null) : Exception(message, innerException); } \ No newline at end of file