Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get Modifications from Full Sequence #796

Merged
merged 10 commits into from
Oct 30, 2024
79 changes: 79 additions & 0 deletions mzLib/Omics/IBioPolymerWithSetMods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,84 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence)
}
return sb.ToString();
}

/// <summary>
/// Parses the bracketed modification annotations of a full sequence and returns each
/// modification keyed by its one-based position.
/// </summary>
/// <remarks>
/// The position counter starts at 1 and advances once for every character that lies outside of a
/// bracketed annotation, so an annotation placed before the first character is stored at 1.
/// An annotation that ends the string and whose modification has a location restriction containing
/// "C-terminal." is stored at (base sequence length + 2) instead.
/// Each annotation has the form "[type:id]"; only the text after the first ':' is used as the
/// lookup key. Brackets nested inside an annotation are treated as part of the annotation text.
/// </remarks>
/// <param name="fullSequence">Full sequence containing zero or more bracketed annotations</param>
/// <param name="allModsKnown">Known modifications, keyed by the id text that follows the first ':' of an annotation</param>
/// <returns>Dictionary mapping one-based position to the modification found at that position</returns>
/// <exception cref="MzLibUtil.MzLibException">
/// Thrown when an annotation has no ':' separator, when brackets are unbalanced,
/// or when an annotation's id is not present in <paramref name="allModsKnown"/>.
/// </exception>
public static Dictionary<int, Modification> GetModificationDictionaryFromFullSequence(string fullSequence,
    Dictionary<string, Modification> allModsKnown)
{
    var allModsOneIsNterminus = new Dictionary<int, Modification>();
    var baseSequence = GetBaseSequenceFromFullSequence(fullSequence);
    int currentModStart = 0;
    int currentModificationLocation = 1;
    int bracketCount = 0; // nesting depth; greater than zero means we are inside an annotation

    for (int r = 0; r < fullSequence.Length; r++)
    {
        char c = fullSequence[r];
        if (c == '[')
        {
            // only the outermost '[' marks the start of the annotation text
            if (bracketCount == 0)
            {
                currentModStart = r + 1;
            }
            bracketCount++;
        }
        else if (c == ']')
        {
            // a ']' with nothing open would drive the depth negative and silently corrupt
            // every position that follows, so reject it explicitly
            if (bracketCount == 0)
            {
                throw new MzLibUtil.MzLibException(
                    "Error while trying to parse string into peptide: unmatched ']' at index " + r +
                    " in " + fullSequence);
            }

            bracketCount--;
            if (bracketCount > 0)
            {
                continue; // closed a nested bracket; the annotation itself is still open
            }

            // drop the leading section (e.g. "Fixed", "Variable", "Uniprot") and keep the id
            string modString = fullSequence.Substring(currentModStart, r - currentModStart);
            int splitIndex = modString.IndexOf(':');
            if (splitIndex < 0)
            {
                throw new MzLibUtil.MzLibException(
                    "Error while trying to parse string into peptide: modification [" + modString +
                    "] is missing the ':' that separates its type from its id");
            }
            string modId = modString.Substring(splitIndex + 1);

            if (!allModsKnown.TryGetValue(modId, out var mod))
            {
                throw new MzLibUtil.MzLibException(
                    "Could not find modification while reading string: " + fullSequence);
            }

            // a C-terminal modification that closes the string gets the dedicated terminal slot
            if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1)
            {
                currentModificationLocation = baseSequence.Length + 2;
            }
            allModsOneIsNterminus.Add(currentModificationLocation, mod);
        }
        else if (bracketCount == 0)
        {
            // characters outside of annotations advance the position counter
            currentModificationLocation++;
        }
        // characters inside an annotation are consumed by the Substring above
    }

    // an annotation that never closes would otherwise be dropped without notice
    if (bracketCount != 0)
    {
        throw new MzLibUtil.MzLibException(
            "Error while trying to parse string into peptide: unterminated '[' in " + fullSequence);
    }

    return allModsOneIsNterminus;
}

/// <summary>
/// Collects every modification annotated in a full sequence, without positional information
/// </summary>
/// <param name="fullSequence">Full sequence to parse</param>
/// <param name="allModsKnown">Known modifications, passed through to <see cref="GetModificationDictionaryFromFullSequence"/></param>
/// <returns>A new list holding the values of the dictionary built by <see cref="GetModificationDictionaryFromFullSequence"/></returns>
public static List<Modification> GetModificationsFromFullSequence(string fullSequence,
    Dictionary<string, Modification> allModsKnown)
{
    Dictionary<int, Modification> modsByPosition =
        GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown);
    return new List<Modification>(modsByPosition.Values);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public PeptideWithSetModifications(string sequence, Dictionary<string, Modificat

FullSequence = sequence;
_baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence);
GetModsAfterDeserialization(allKnownMods);
_allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(sequence, allKnownMods);
NumFixedMods = numFixedMods;
_digestionParams = digestionParams as DigestionParams;
PairedTargetDecoySequence = pairedTargetDecoySequence;
Expand Down Expand Up @@ -910,7 +910,7 @@ public override int GetHashCode()
/// </summary>
public void SetNonSerializedPeptideInfo(Dictionary<string, Modification> idToMod, Dictionary<string, Protein> accessionToProtein, DigestionParams dp)
{
    // Build the position -> modification dictionary exactly once, through the shared parser.
    // A preceding GetModsAfterDeserialization(idToMod) call would fill the same field and then
    // be overwritten by this assignment, so it is not made here.
    _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod);
    GetProteinAfterDeserialization(accessionToProtein);
    _digestionParams = dp;
}
Expand All @@ -919,66 +919,6 @@ public void SetNonSerializedPeptideInfo(Dictionary<string, Modification> idToMod
Dictionary<string, Protein> accessionToProtein, IDigestionParams dp) =>
SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp);

/// <summary>
/// Rebuilds <c>_allModsOneIsNterminus</c> from <c>FullSequence</c>, resolving each bracketed
/// annotation through <paramref name="idToMod"/>.
/// </summary>
/// <remarks>
/// Delegates to <see cref="IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence"/>
/// rather than keeping a second copy of the same bracket-walking loop.
/// NOTE(review): the shared parser derives the base sequence from the full sequence itself;
/// this assumes that matches the <c>BaseSequence</c> property — confirm.
/// </remarks>
/// <param name="idToMod">Known modifications keyed by the id text used in the annotations</param>
private void GetModsAfterDeserialization(Dictionary<string, Modification> idToMod)
{
    _allModsOneIsNterminus =
        IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod);
}

private void GetProteinAfterDeserialization(Dictionary<string, Protein> idToProtein)
{
Protein protein = null;
Expand Down
92 changes: 92 additions & 0 deletions mzLib/Test/TestPeptideWithSetMods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using MzLibUtil;
using Omics;
using Omics.Digestion;
using Omics.Fragmentation;
Expand Down Expand Up @@ -1181,5 +1182,96 @@ public static void TestPeptideWithSetModsNoParentProtein()
Assert.AreEqual('-', last.NextAminoAcid);
Assert.AreEqual('-', last.NextResidue);
}

/// <summary>
/// Digests every protein loaded from cRAP_databaseGPTMD.xml (at most 3 modifications per peptide)
/// and, for the most-modified form of each modified base sequence, checks that
/// AllModsOneIsNterminus and the two static IBioPolymerWithSetMods parsers report the expected
/// number of modifications, all of which come from the parent protein's localized modifications
/// within the peptide's residue range.
/// </summary>
[Test]
public static void TestIBioPolymerWithSetModsModificationFromFullSequence()
{
    Dictionary<string, Modification> un = new Dictionary<string, Modification>();
    var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml"));
    Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);
    List<Modification> UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"),
        formalChargesDictionary).ToList();
    List<Protein> proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"),
        true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un);
    var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p);
    var digestionParameters = new DigestionParams(maxModsForPeptides: 3);

    foreach (Protein p in proteins)
    {
        List<PeptideWithSetModifications> digestedPeptides =
            p.Digest(digestionParameters, [], [], null, null).ToList();
        // take the most modified peptide by base sequence and ensure all methods function properly
        foreach (var targetPeptide in digestedPeptides
                     .Where(pep => pep.FullSequence.Contains('['))
                     .GroupBy(pep => pep.BaseSequence)
                     .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count)))
        {
            var startResidue = targetPeptide.OneBasedStartResidue;
            var endResidue = targetPeptide.OneBasedEndResidue;

            // Pull our expected modifications based upon parent protein object with a maximum value of DigestionParameters.MaxMods
            // A bunch of logic to count the number of expected modifications based upon the xml database entries
            int expectedModCount = 0;
            foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications
                         .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue))
            {
                if (modDictEntry.Value.Count > 1)
                {
                    var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList();

                    if (locRestrictions.AllSame())
                    {
                        if (locRestrictions.First() == "Anywhere.")
                        {
                            expectedModCount++;
                        }
                        else if (locRestrictions.First() == "N-terminal." && modDictEntry.Key == startResidue)
                        {
                            expectedModCount++;
                        }
                    }
                    else if (locRestrictions.Contains("Anywhere.") && locRestrictions.Contains("N-terminal."))
                    {
                        // one residue-level mod, plus a terminal mod when this residue starts the peptide
                        expectedModCount++;
                        if (modDictEntry.Key == startResidue)
                        {
                            expectedModCount++;
                        }
                    }
                }
                else
                {
                    switch (modDictEntry.Value.First().LocationRestriction)
                    {
                        case "Anywhere.":
                        case "N-terminal." when modDictEntry.Key == startResidue:
                            expectedModCount++;
                            break;
                    }
                }
            }

            expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods);

            var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod =>
                mod.Key >= startResidue &&
                mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList();

            // Parse modifications from PWSM and two IBioPolymerWithSetMods methods
            var pwsmModDict = targetPeptide.AllModsOneIsNterminus;
            var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict);
            var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict);

            // Ensure all methods are in agreement by modification count
            // (expected value first, so a failure message labels the two numbers correctly)
            Assert.AreEqual(expectedModCount, pwsmModDict.Count);
            Assert.AreEqual(expectedModCount, bpwsmModDict.Count);
            Assert.AreEqual(expectedModCount, bpwsmModList.Count);

            // Ensure all methods are in agreement by modification identity
            foreach (var pwsmModification in pwsmModDict.Values)
            {
                Assert.Contains(pwsmModification, expectedModifications);
            }
            foreach (var parsedDictModification in bpwsmModDict.Values)
            {
                Assert.Contains(parsedDictModification, expectedModifications);
            }
            foreach (var parsedListModification in bpwsmModList)
            {
                Assert.Contains(parsedListModification, expectedModifications);
            }
        }
    }
}
}
}
Loading