Skip to content

Commit

Permalink
Fixed ProteinDB Writer method to be deterministic
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Sol committed Jan 6, 2025
1 parent 60d3f0b commit fa8745b
Showing 1 changed file with 23 additions and 12 deletions.
35 changes: 23 additions & 12 deletions mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ private static Dictionary<string, int> WriteNucleicAcidXmlDatabase(
return newModResEntries;
}

/// <summary>
/// <summary>
/// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list.
/// </summary>
/// <param name="additionalModsToAddToProteins"></param>
Expand Down Expand Up @@ -324,8 +324,17 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
}

HashSet<Modification> allRelevantModifications = new HashSet<Modification>(
nonVariantProteins.SelectMany(p => p.SequenceVariations.SelectMany(sv => sv.OneBasedModifications).Concat(p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value))
.Concat(additionalModsToAddToProteins.Where(kv => nonVariantProteins.SelectMany(p => p.SequenceVariations.Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })).Contains(kv.Key)).SelectMany(kv => kv.Value.Select(v => v.Item2))));
nonVariantProteins
.SelectMany(p => p.SequenceVariations
.SelectMany(sv => sv.OneBasedModifications)
.Concat(p.OneBasedPossibleLocalizedModifications)
.SelectMany(kv => kv.Value))
.Concat(additionalModsToAddToProteins
.Where(kv => nonVariantProteins
.SelectMany(p => p.SequenceVariations
.Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession }))
.Contains(kv.Key))
.SelectMany(kv => kv.Value.Select(v => v.Item2))));

foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif))
{
Expand Down Expand Up @@ -384,7 +393,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteStartElement("dbReference");
writer.WriteAttributeString("type", dbRef.Type);
writer.WriteAttributeString("id", dbRef.Id);
foreach (Tuple<string, string> property in dbRef.Properties)
foreach (Tuple<string, string> property in dbRef.Properties.OrderBy(t => t.Item1).ThenBy(t => t.Item2))
{
writer.WriteStartElement("property");
writer.WriteAttributeString("type", property.Item1);
Expand All @@ -397,7 +406,8 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
//For now, we do not write the top-down truncations that are generated for a top-down truncation search.
//Someday we could write them if they are observed.
//The truncation designation is contained in the "type" field of ProteolysisProduct.
List<ProteolysisProduct> proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList();
List<ProteolysisProduct> proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation"))
.OrderBy(p => p).ToList();
foreach (var proteolysisProduct in proteolysisProducts)
{
writer.WriteStartElement("feature");
Expand All @@ -413,23 +423,23 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement();
}

foreach (var hm in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
{
foreach (var modId in hm.Value)
foreach (var modId in positionModKvp.Value.OrderBy(mod => mod))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "modified residue");
writer.WriteAttributeString("description", modId);
writer.WriteStartElement("location");
writer.WriteStartElement("position");
writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture));
writer.WriteAttributeString("position", positionModKvp.Key.ToString(CultureInfo.InvariantCulture));
writer.WriteEndElement();
writer.WriteEndElement();
writer.WriteEndElement();
}
}

foreach (var hm in protein.SequenceVariations)
foreach (var hm in protein.SequenceVariations.OrderBy(sv => sv))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "sequence variant");
Expand Down Expand Up @@ -458,7 +468,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
}
foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
{
foreach (var modId in hmm.Value)
foreach (var modId in hmm.Value.OrderBy(mod => mod))
{
writer.WriteStartElement("subfeature");
writer.WriteAttributeString("type", "modified residue");
Expand All @@ -475,7 +485,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement(); // feature
}

foreach (var hm in protein.DisulfideBonds)
foreach (var hm in protein.DisulfideBonds.OrderBy(bond => bond.OneBasedBeginPosition))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "disulfide bond");
Expand All @@ -500,7 +510,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
writer.WriteEndElement(); // feature
}

foreach (var hm in protein.SpliceSites)
foreach (var hm in protein.SpliceSites.OrderBy(site => site.OneBasedBeginPosition))
{
writer.WriteStartElement("feature");
writer.WriteAttributeString("type", "splice site");
Expand Down Expand Up @@ -538,6 +548,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
return newModResEntries;
}


public static void WriteFastaDatabase(List<Protein> proteinList, string outputFileName, string delimeter)
{
using (StreamWriter writer = new StreamWriter(outputFileName))
Expand Down

0 comments on commit fa8745b

Please sign in to comment.