From 1d402e20ab95af3664f7626ad8261712af80fb6a Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Fri, 16 Aug 2024 16:46:26 -0500 Subject: [PATCH 1/7] Result Text and Individual File Result Cleanup (#2397) * Updated to MzLib 1.0.548 and fixed custom ions in search tasks * reverted calibration task change * merged in master bbbyy * Spectral Library from Command Line (#2386) * Updated to MzLib 1.0.548 and fixed custom ions in search tasks * reverted calibration task change * merged in master bbbyy * Enabled Library Loading from command line * replaced "Peptides" with GlobalVariables.AnalyteType * built lazy loading search result structure for post search analysis task * Finished test case structure, added test for result files. Added top-down test case * changed access modifier * Refactored test case setup * Added more test cases to hopefully up coverage --- .../SearchTask/PostSearchAnalysisTask.cs | 61 ++--- .../Test/EverythingRunnerEngineTestCase.cs | 192 ++++++++++++++ MetaMorpheus/Test/GlobalVariablesTest.cs | 11 - MetaMorpheus/Test/MyTaskTest.cs | 4 +- .../Test/PostSearchAnalysisTaskTests.cs | 234 +++++++++++++----- MetaMorpheus/Test/SearchTaskTest.cs | 2 +- MetaMorpheus/Test/SetUpTests.cs | 11 +- MetaMorpheus/Test/SilacTest.cs | 2 +- MetaMorpheus/Test/Test.csproj | 3 + .../Test/TestData/Task2-SearchTaskconfig.toml | 2 +- MetaMorpheus/Test/TestData/smalldb.fasta | 7 + MetaMorpheus/Test/TestPsm.cs | 4 +- .../TopDownTestData/TopDownSearchToml.toml | 97 ++++++++ 13 files changed, 513 insertions(+), 117 deletions(-) create mode 100644 MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs create mode 100644 MetaMorpheus/Test/TopDownTestData/TopDownSearchToml.toml diff --git a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index d1791df8c..41cb7692a 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -80,12
+80,17 @@ public MyTaskResults Run() HistogramAnalysis(); WritePsmResults(); WritePeptideResults(); - if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) + if (Parameters.CurrentRawFileList.Count > 1 && (Parameters.SearchParameters.WriteIndividualFiles + || Parameters.SearchParameters.WriteMzId || + Parameters.SearchParameters.WritePepXml)) { // create individual files subdirectory Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); - WriteIndividualPsmResults(); - WriteIndividualPeptideResults(); + if (Parameters.SearchParameters.WriteIndividualFiles) + { + WriteIndividualPsmResults(); + WriteIndividualPeptideResults(); + } } WriteProteinResults(); AddResultsTotalsToAllResultsTsv(); @@ -615,7 +620,7 @@ private void WritePsmResults() "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } - string psmResultsText = "All target PSMs with " + psmsForPsmResults.FilterType + " = " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + + string psmResultsText = "All target PSMs with " + psmsForPsmResults.FilterType + " <= " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + psmsForPsmResults.TargetPsmsAboveThreshold; ResultsDictionary[("All", "PSMs")] = psmResultsText; } @@ -632,7 +637,7 @@ private void WritePeptideResults() filterAtPeptideLevel: true); // write PSMs - string writtenFile = Path.Combine(Parameters.OutputFolder, "AllPeptides.psmtsv"); + string writtenFile = Path.Combine(Parameters.OutputFolder, $"All{GlobalVariables.AnalyteType}s.psmtsv"); WritePsmsToTsv(peptidesForPeptideResults.OrderByDescending(p => p).ToList(), writtenFile, writePeptideLevelResults: true); FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); @@ -642,9 +647,9 @@ private void WritePeptideResults() Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( "PEP could not be calculated due to an insufficient number of 
PSMs. Results were filtered by q-value." + Environment.NewLine); } - string peptideResultsText = "All target peptides with " + peptidesForPeptideResults.FilterType + " = " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + + string peptideResultsText = $"All target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesForPeptideResults.FilterType + " <= " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + peptidesForPeptideResults.TargetPsmsAboveThreshold; - ResultsDictionary[("All", "Peptides")] = peptideResultsText; + ResultsDictionary[("All", GlobalVariables.AnalyteType)] = peptideResultsText; } private void WriteIndividualPsmResults() @@ -679,7 +684,7 @@ private void WriteIndividualPsmResults() FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string psmResultsText = strippedFileName + " - All target PSMs with " + psmsToWrite.FilterType + " = " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + + string psmResultsText = strippedFileName + " - Target PSMs with " + psmsToWrite.FilterType + " <= " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + psmsToWrite.TargetPsmsAboveThreshold; ResultsDictionary[(strippedFileName, "PSMs")] = psmResultsText; } @@ -710,16 +715,16 @@ private void WriteIndividualPeptideResults() filterAtPeptideLevel: true); // write PSMs - string writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_Peptides.psmtsv"); + string writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + $"_{GlobalVariables.AnalyteType}s.psmtsv"); WritePsmsToTsv(peptidesToWrite, writtenFile, writePeptideLevelResults: true); FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string peptideResultsText = strippedFileName + " - All target peptides with " + 
peptidesToWrite.FilterType + " = " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + + string peptideResultsText = strippedFileName + $" - Target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesToWrite.FilterType + " <= " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + peptidesToWrite.TargetPsmsAboveThreshold; - ResultsDictionary[(strippedFileName, "Peptides")] = peptideResultsText; + ResultsDictionary[(strippedFileName, GlobalVariables.AnalyteType)] = peptideResultsText; } - + } private void UpdateSpectralLibrary() { @@ -831,7 +836,7 @@ private void WriteProteinResults() } else { - string proteinResultsText = "All target protein groups with q-value = 0.01 (1% FDR): " + ProteinGroups.Count(b => b.QValue <= 0.01 && !b.IsDecoy); + string proteinResultsText = "All target protein groups with q-value <= 0.01 (1% FDR): " + ProteinGroups.Count(b => b.QValue <= 0.01 && !b.IsDecoy); ResultsDictionary[("All", "Proteins")] = proteinResultsText; } @@ -847,13 +852,6 @@ private void WriteProteinResults() string writtenFile = Path.Combine(Parameters.OutputFolder, fileName); WriteProteinGroupsToTsv(ProteinGroups, writtenFile, new List { Parameters.SearchTaskId }); - if (Parameters.CurrentRawFileList.Count > 1 && (Parameters.SearchParameters.WriteIndividualFiles - || Parameters.SearchParameters.WriteMzId || - Parameters.SearchParameters.WritePepXml)) - { - Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); - } - var psmsGroupedByFile = FilteredPsms.Filter(Parameters.AllPsms, CommonParameters, includeDecoys: true, @@ -910,14 +908,15 @@ private void WriteProteinResults() if (Parameters.SearchParameters.WriteIndividualFiles && Parameters.CurrentRawFileList.Count > 1) { + // write summary text + string proteinResultsText = strippedFileName + " - Target protein groups within 1 % FDR: " + subsetProteinGroupsForThisFile.Count(b => b.QValue <= 0.01 && !b.IsDecoy); + ResultsDictionary[(strippedFileName, "Proteins")] = proteinResultsText; + + // 
write result files writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_ProteinGroups.tsv"); WriteProteinGroupsToTsv(subsetProteinGroupsForThisFile, writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); } - // write summary text - string proteinResultsText = strippedFileName + " - Target protein groups within 1 % FDR: " + subsetProteinGroupsForThisFile.Count(b => b.QValue <= 0.01 && !b.IsDecoy); - ResultsDictionary[(strippedFileName, "Proteins")] = proteinResultsText; - psmsForThisFile = FilteredPsms.Filter(psmsForThisFile, CommonParameters, includeDecoys: Parameters.SearchParameters.WriteDecoys, @@ -1851,18 +1850,20 @@ private void WriteProteinGroupsToTsv(List proteinGroup FinishedWritingFile(filePath, nestedIds); } } + /// /// This is a handy dictionary to keep track of the PSM, peptide and protein count results at the /// "All" level and at the individual raw file level. /// The keys are a tuple such as ("All", "PSMs") or ("RawFileName", "Peptides") - // The values are the results as a string + /// The values are the results as a string /// private void ConstructResultsDictionary() { - ResultsDictionary = new(); - - ResultsDictionary.Add(("All", "PSMs"),""); - ResultsDictionary.Add(("All", "Peptides"), ""); + ResultsDictionary = new() + { + { ("All", "PSMs"), "" }, + { ("All", GlobalVariables.AnalyteType), "" } + }; if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) { @@ -1870,7 +1871,7 @@ private void ConstructResultsDictionary() { string fileNameWithoutExtension = Path.GetFileNameWithoutExtension(rawFile); ResultsDictionary.Add((fileNameWithoutExtension, "PSMs"), ""); - ResultsDictionary.Add((fileNameWithoutExtension, "Peptides"), ""); + ResultsDictionary.Add((fileNameWithoutExtension, GlobalVariables.AnalyteType), ""); } } diff --git a/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs 
b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs new file mode 100644 index 000000000..699b4fe67 --- /dev/null +++ b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs @@ -0,0 +1,192 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using Nett; +using NUnit.Framework; +using TaskLayer; + +namespace Test +{ + public enum EverythingRunnerEngineTestCases + { + BottomUpQValue, + BottomUpQValueNoIndividualFilesWriteMzId, + BottomUpQValueNoIndividualFilesWritePepXml, + BottomUpQValueSingle, + BottomUpPepQValue, + TopDownQValue, + TopDownQValueSingle + } + + /// <summary> + /// Test cases for the post search analysis task. These test cases are used to verify that the post search analysis task is functioning correctly. + /// This structure ensures that the database search for each test case is run only once, and only when that test case is first requested. + /// These directories are cleaned up in the Global Cleanup found in SetUpTests.GlobalTearDown + /// </summary> + [ExcludeFromCodeCoverage] + internal class EverythingRunnerEngineTestCase : IDisposable + { + internal EverythingRunnerEngineTestCases TestCase { get; init; } + internal List<(string, MetaMorpheusTask)> TaskList { get; init; } + internal List DatabaseList { get; init; } + internal List DataFileList { get; init; } + internal string OutputDirectory => Path.Combine(ResultDirectory, TestCase.ToString()); + internal bool IsTopDown { get; init; } + internal bool HasRun { get; private set; } + internal bool WriteIndividualResults { get; init; } + internal bool WritePepXml { get; init; } + internal bool WriteMzId { get; init; } + + internal EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases testCase, + List<(string, MetaMorpheusTask)> taskList, List dataFileList, + List databaseList, bool isTopDown) + { + TestCase = testCase; + TaskList = taskList; + DatabaseList = databaseList; + DataFileList = dataFileList; + IsTopDown = isTopDown; + HasRun = false; + + var firstSearchTask = 
taskList.Select(p => p.Item2).FirstOrDefault(p => p.TaskType == MyTask.Search); + if (firstSearchTask is null) return; + + var searchTask = (SearchTask)firstSearchTask; + WriteIndividualResults = searchTask.SearchParameters.WriteIndividualFiles; + WritePepXml = searchTask.SearchParameters.WritePepXml; + WriteMzId = searchTask.SearchParameters.WriteMzId; + } + + internal void Run() + { + if (Directory.Exists(OutputDirectory)) + Directory.Delete(OutputDirectory, true); + + var runner = new EverythingRunnerEngine(TaskList, DataFileList, DatabaseList, OutputDirectory); + runner.Run(); + HasRun = true; + } + + public void Dispose() + { + if (Directory.Exists(OutputDirectory)) + Directory.Delete(OutputDirectory, true); + } + + #region Case Setup + + internal static string ResultDirectory => + Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\PostSearchAnalysisTaskTest"); + + private static Dictionary _cases; + + internal static bool TryGetTestCase(EverythingRunnerEngineTestCases testCase, + out EverythingRunnerEngineTestCase outCase) + { + if (!_cases.TryGetValue(testCase, out outCase)) return false; + + if (!outCase.HasRun) + outCase.Run(); + return true; + } + + internal static EverythingRunnerEngineTestCase GetTestCase(EverythingRunnerEngineTestCases testCase) + { + if (!TryGetTestCase(testCase, out var outCase)) + throw new KeyNotFoundException($"Test case {testCase} not found"); + return outCase; + } + + internal static void DisposeAll() + { + foreach (var testCase in _cases.Values) + testCase.Dispose(); + } + + static EverythingRunnerEngineTestCase() + { + // Directory GlobalSetup + if (Directory.Exists(ResultDirectory)) + Directory.Delete(ResultDirectory, true); + + if (!Directory.Exists(ResultDirectory)) + Directory.CreateDirectory(ResultDirectory); + + // Test Case Instantiation + _cases = new(); + + string myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TestData\Task1-SearchTaskconfig.toml"); + SearchTask searchTaskLoaded 
= Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); + string myFile1 = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TestData\TaGe_SA_A549_3_snip.mzML"); + string myFile2 = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TestData\TaGe_SA_A549_3_snip_2.mzML"); + string myDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TestData\TaGe_SA_A549_3_snip.fasta"); + _cases.Add(EverythingRunnerEngineTestCases.BottomUpQValue, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpQValue, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, + false)); + _cases.Add(EverythingRunnerEngineTestCases.BottomUpQValueSingle, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpQValueSingle, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile2 }, + new List { new DbForTask(myDatabase, false) }, false)); + + searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); + searchTaskLoaded.SearchParameters.WriteIndividualFiles = false; + searchTaskLoaded.SearchParameters.WriteMzId = true; + searchTaskLoaded.SearchParameters.WritePepXml = false; + _cases.Add(EverythingRunnerEngineTestCases.BottomUpQValueNoIndividualFilesWriteMzId, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpQValueNoIndividualFilesWriteMzId, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, + false)); + + searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); + searchTaskLoaded.SearchParameters.WriteIndividualFiles = false; + searchTaskLoaded.SearchParameters.WriteMzId = false; + searchTaskLoaded.SearchParameters.WritePepXml = true; + 
_cases.Add(EverythingRunnerEngineTestCases.BottomUpQValueNoIndividualFilesWritePepXml, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpQValueNoIndividualFilesWritePepXml, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, + false)); + + myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TestData\Task2-SearchTaskconfig.toml"); + searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); + // TODO: Uncomment this line and change values for PR 2394 + //searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; + _cases.Add(EverythingRunnerEngineTestCases.BottomUpPepQValue, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpPepQValue, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, + false)); + + myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"TopDownTestData\TopDownSearchToml.toml"); + searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); + myFile1 = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\SmallCalibratible_Yeast.mzML"); + myFile2 = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TopDownTestData\slicedTDYeast.mzML"); + myDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\smalldb.fasta"); + _cases.Add(EverythingRunnerEngineTestCases.TopDownQValue, + new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.TopDownQValue, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, + true)); + _cases.Add(EverythingRunnerEngineTestCases.TopDownQValueSingle, + new 
EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.TopDownQValueSingle, + new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, + new List { myFile2 }, new List { new DbForTask(myDatabase, false) }, true)); + } + + #endregion + } +} + + diff --git a/MetaMorpheus/Test/GlobalVariablesTest.cs b/MetaMorpheus/Test/GlobalVariablesTest.cs index 749c0212e..eb9482da4 100644 --- a/MetaMorpheus/Test/GlobalVariablesTest.cs +++ b/MetaMorpheus/Test/GlobalVariablesTest.cs @@ -8,17 +8,6 @@ namespace Test { - [SetUpFixture] - public static class SetUpGlobalVariables - { - [OneTimeSetUp] - public static void LoadGlobalVariables() - { - // this loads the global variables once for all unit tests - GlobalVariables.SetUpGlobalVariables(); - } - } - [TestFixture] public static class GlobalVariablesTest { diff --git a/MetaMorpheus/Test/MyTaskTest.cs b/MetaMorpheus/Test/MyTaskTest.cs index 7e11c4778..109af1c4e 100644 --- a/MetaMorpheus/Test/MyTaskTest.cs +++ b/MetaMorpheus/Test/MyTaskTest.cs @@ -285,7 +285,7 @@ public static void MakeSureFdrDoesntSkip() //There is one PSM with close peptide mass (0 ppm difference) and one PSM with large mass difference (>1000 ppm difference) //Since this is an open search, both PSMs should be reported because they share the exact same MS2 scan - Assert.IsTrue(theStringResult.Contains("All target PSMs with q-value = 0.01: 1")); + Assert.IsTrue(theStringResult.Contains("All target PSMs with q-value <= 0.01: 1")); Directory.Delete(outputFolder, true); File.Delete(xmlName); File.Delete(mzmlName); @@ -434,7 +434,7 @@ public static void TestPeptideCount() { while ((line = file.ReadLine()) != null) { - if (line.Contains("All target peptides with q-value = 0.01: 4")) + if (line.Contains("All target peptides with q-value <= 0.01: 4")) { foundD = true; } diff --git a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs index ea33d0753..5426e1bbf 100644 --- 
a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs +++ b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs @@ -1,64 +1,56 @@ -using System.Collections.Generic; +using System; +using System.Diagnostics.CodeAnalysis; using System.IO; -using Nett; +using System.Linq; using NUnit.Framework; using Assert = NUnit.Framework.Legacy.ClassicAssert; -using TaskLayer; namespace Test { + /// + /// Uses test cases found in EverythingRunnerEngineTestCase.cs + /// [TestFixture] + [ExcludeFromCodeCoverage] public static class PostSearchAnalysisTaskTests { + public static Array GetTestCases() => Enum.GetValues(typeof(EverythingRunnerEngineTestCases)); + [Test] - public static void AllResultsAndResultsTxtTests() + public static void AllResultsAndResultsTxtContainsCorrectValues_QValue_BottomUp() { //First test that AllResults and Results display correct numbers of peptides and psms with q-value filter on - string myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\Task1-SearchTaskconfig.toml"); - SearchTask searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); - string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\PostSearchAnalysisTaskTest"); - string myFile1 = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip.mzML"); - string myFile2 = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip_2.mzML"); - string myDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip.fasta"); - - EverythingRunnerEngine engineToml = new(new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, outputFolder); - engineToml.Run(); - + EverythingRunnerEngineTestCase.TryGetTestCase(EverythingRunnerEngineTestCases.BottomUpQValue, out var testCase); + string outputFolder = testCase.OutputDirectory; string 
allResultsFile = Path.Combine(outputFolder, "allResults.txt"); string[] allResults = File.ReadAllLines(allResultsFile); // The new PEP calculation method improves things, so all these numbers are increasing as of (7/17/24) // There is a discrepancy between the number of All target peptides and individual file target peptides, // presumably due to the way protein inference is performed. - Assert.AreEqual("All target PSMs with q-value = 0.01: 428", allResults[10]); - Assert.AreEqual("All target peptides with q-value = 0.01: 174", allResults[11]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", allResults[12]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with q-value = 0.01: 214", allResults[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with q-value = 0.01: 174", allResults[15]); + Assert.AreEqual("All target PSMs with q-value <= 0.01: 428", allResults[10]); + Assert.AreEqual("All target peptides with q-value <= 0.01: 174", allResults[11]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 165", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with q-value <= 0.01: 214", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with q-value <= 0.01: 174", allResults[15]); Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 165", allResults[16]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with q-value = 0.01: 214", allResults[18]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with q-value = 0.01: 174", allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with q-value <= 0.01: 214", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with q-value <= 0.01: 174", allResults[19]); Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 165", allResults[20]); + string resultsFile = Path.Combine(outputFolder, 
@"postSearchAnalysisTaskTestOutput\results.txt"); string[] results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with q-value = 0.01: 428", results[5]); - Assert.AreEqual("All target peptides with q-value = 0.01: 174", results[6]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with q-value = 0.01: 214", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with q-value = 0.01: 174", results[10]); + + Assert.AreEqual("All target PSMs with q-value <= 0.01: 428", results[5]); + Assert.AreEqual("All target peptides with q-value <= 0.01: 174", results[6]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with q-value <= 0.01: 214", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with q-value <= 0.01: 174", results[10]); Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 165", results[11]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with q-value = 0.01: 214", results[13]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with q-value = 0.01: 174", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with q-value <= 0.01: 214", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with q-value <= 0.01: 174", results[14]); Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 165", results[15]); - - Directory.Delete(outputFolder, true); - + // Search TaGe_SA_A549_3_snip_2 by itself. The results from this should be identical to the file specific results above - engineToml = new(new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, - new List { myFile2 }, - new List { new DbForTask(myDatabase, false) }, - outputFolder); - engineToml.Run(); - // TaGe_SA_A549_3_snip_2 is searched twice. 
First with two files being searched simultaneously, then with TaGe_SA_A549_3_snip_2 by itself // This allows us to compare the file specific results produced by in the two file search to the output // produced by searching the file by itself. The number of PSMs and Peptides observed should be the same @@ -67,44 +59,152 @@ public static void AllResultsAndResultsTxtTests() // identified across all files. int TaGe_SA_A549_3_snip_2ExpectedPsms = 214; int TaGe_SA_A549_3_snip_2ExpectedPeptides = 174; + + EverythingRunnerEngineTestCase.TryGetTestCase(EverythingRunnerEngineTestCases.BottomUpQValueSingle, out var testCaseSingle); + outputFolder = testCaseSingle.OutputDirectory; + + resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); string[] singleFileResults = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, singleFileResults[5]); - Assert.AreEqual("All target peptides with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPeptides, singleFileResults[6]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", singleFileResults[7]); - - //Second test that AllResults and Results display correct numbers of peptides and psms with PEP q-value filter on - myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\Task2-SearchTaskconfig.toml"); - searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); - engineToml = new EverythingRunnerEngine(new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, new List { myFile1, myFile2 }, new List { new DbForTask(myDatabase, false) }, outputFolder); - engineToml.Run(); - - allResultsFile = Path.Combine(outputFolder, "allResults.txt"); - allResults = File.ReadAllLines(allResultsFile); - Assert.AreEqual("All target PSMs with pep q-value = 0.01: 420", allResults[10]); - Assert.AreEqual("All target peptides with pep q-value = 
0.01: 172", allResults[11]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 155", allResults[12]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with pep q-value = 0.01: 210", allResults[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with pep q-value = 0.01: 172", allResults[15]); + Assert.AreEqual("All target PSMs with q-value <= 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, singleFileResults[5]); + Assert.AreEqual("All target peptides with q-value <= 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPeptides, singleFileResults[6]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 165", singleFileResults[7]); + } + + [Test] + public static void AllResultsAndResultsTxtContainsCorrectValues_PepQValue_BottomUp() + { + //First test that AllResults and Results display correct numbers of peptides and psms with pep q-value filter on + EverythingRunnerEngineTestCase.TryGetTestCase(EverythingRunnerEngineTestCases.BottomUpPepQValue, out var testCase); + string outputFolder = testCase.OutputDirectory; + var allResultsFile = Path.Combine(outputFolder, "allResults.txt"); + var allResults = File.ReadAllLines(allResultsFile); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", allResults[10]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 172", allResults[11]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", allResults[15]); Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", allResults[16]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with pep q-value = 0.01: 210", allResults[18]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with pep q-value = 0.01: 172", 
allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 172", allResults[19]); Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", allResults[20]); - - - resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); - results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with pep q-value = 0.01: 420", results[5]); - Assert.AreEqual("All target peptides with pep q-value = 0.01: 172", results[6]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 155", results[7]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with pep q-value = 0.01: 210", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with pep q-value = 0.01: 172", results[10]); + var resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); + var results = File.ReadAllLines(resultsFile); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", results[5]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 172", results[6]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", results[7]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", results[10]); Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", results[11]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with pep q-value = 0.01: 210", results[13]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with pep q-value = 0.01: 172", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target 
peptides with pep q-value <= 0.01: 172", results[14]); Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", results[15]); + } + + /// + /// Ensures that there is the proper ratio of summary and individual lines in the result.txt file and that peptides and proteoforms are distinct + /// + [Test] + [TestCaseSource(nameof(GetTestCases))] + public static void AllResultTxtContainsCorrectNumberOfResultLines(EverythingRunnerEngineTestCases testCaseIdentifier) + { + var testCase = EverythingRunnerEngineTestCase.GetTestCase(testCaseIdentifier); + + int expectedIndividualFileLines = testCase.DataFileList.Count == 1 || !testCase.WriteIndividualResults + ? 0 : testCase.DataFileList.Count; + int expectedSummaryLines = 1; + var allResultTxtLines = File.ReadAllLines(Path.Combine(testCase.OutputDirectory, @"allResults.txt")); + + var summaryPsmLines = allResultTxtLines.Where(p => p.Contains("All target PSMs")).ToArray(); + var individualPsmLines = allResultTxtLines.Where(p => p.Contains("Target PSMs") + && !p.Contains("All")).ToArray(); + Assert.AreEqual(expectedSummaryLines, summaryPsmLines.Length); + Assert.AreEqual(expectedIndividualFileLines, individualPsmLines.Length); + + if (testCase.IsTopDown) + { + var summaryProteoformLines = + allResultTxtLines.Where(p => p.Contains("All target proteoforms")).ToArray(); + var individualProteoformLines = allResultTxtLines.Where(p => p.Contains("Target proteoforms") + && !p.Contains("All")).ToArray(); + Assert.AreEqual(expectedSummaryLines, summaryProteoformLines.Length); + Assert.AreEqual(expectedIndividualFileLines, individualProteoformLines.Length); + } + else + { + var summaryPeptideLines = allResultTxtLines.Where(p => p.Contains("All target peptides")).ToArray(); + var individualPeptideLines = allResultTxtLines.Where(p => p.Contains("Target peptides") + && !p.Contains("All")).ToArray(); + Assert.AreEqual(expectedSummaryLines, summaryPeptideLines.Length); + Assert.AreEqual(expectedIndividualFileLines, 
individualPeptideLines.Length); + } + + var summaryProteinLines = allResultTxtLines.Where(p => p.Contains("All target protein groups")).ToArray(); + var individualProteinLines = allResultTxtLines.Where(p => p.Contains("Target protein groups") + && !p.Contains("All")).ToArray(); + Assert.AreEqual(expectedSummaryLines, summaryProteinLines.Length); + Assert.AreEqual(expectedIndividualFileLines, individualProteinLines.Length); + } + + /// + /// Ensures that the files written out with each search are correct according to the search parameters and data type + /// + [Test] + [TestCaseSource(nameof(GetTestCases))] + public static void CorrectFilesAreWrittenWithCorrectName(EverythingRunnerEngineTestCases testCaseIdentifier) + { + var testCase = EverythingRunnerEngineTestCase.GetTestCase(testCaseIdentifier); + var psmFiles = Directory.GetFiles(testCase.OutputDirectory, "*PSMs.psmtsv", SearchOption.AllDirectories); + var pepXmlFiles = Directory.GetFiles(testCase.OutputDirectory, "*.pep.xml", SearchOption.AllDirectories); + var percolatorFiles = Directory.GetFiles(testCase.OutputDirectory, "*Percolator.tab", SearchOption.AllDirectories); + var proteinGroupFiles = Directory.GetFiles(testCase.OutputDirectory, "*ProteinGroups.tsv", SearchOption.AllDirectories); + var peptideFiles = Directory.GetFiles(testCase.OutputDirectory, "*Peptides.psmtsv", SearchOption.AllDirectories); + var proteoformFiles = Directory.GetFiles(testCase.OutputDirectory, "*Proteoforms.psmtsv", SearchOption.AllDirectories); + var mzidFiles = Directory.GetFiles(testCase.OutputDirectory, "*.mzid", SearchOption.AllDirectories); + + int spectraFileCount = testCase.DataFileList.Count; + var expectedResultFileCount = testCase.WriteIndividualResults && testCase.DataFileList.Count > 1 + ? 
testCase.DataFileList.Count + 1 : 1; + + Assert.AreEqual(expectedResultFileCount, psmFiles.Length); + Assert.AreEqual(expectedResultFileCount, proteinGroupFiles.Length); + if (testCase.IsTopDown) + { + Assert.AreEqual(expectedResultFileCount, proteoformFiles.Length); + Assert.AreEqual(0, peptideFiles.Length); + } + else + { + Assert.AreEqual(expectedResultFileCount, peptideFiles.Length); + Assert.AreEqual(0, proteoformFiles.Length); + } + + if (testCase.WritePepXml) + { + Assert.AreEqual(spectraFileCount, pepXmlFiles.Length); + } + else + { + Assert.AreEqual(0, pepXmlFiles.Length); + } + + if (testCase.WriteIndividualResults) + { + Assert.AreEqual(expectedResultFileCount, percolatorFiles.Length); + } + else + { + Assert.AreEqual(1, percolatorFiles.Length); + } - Directory.Delete(outputFolder, true); + if (testCase.WriteMzId) + { + Assert.AreEqual(spectraFileCount, mzidFiles.Length); + } + else + { + Assert.AreEqual(0, mzidFiles.Length); + } } } } \ No newline at end of file diff --git a/MetaMorpheus/Test/SearchTaskTest.cs b/MetaMorpheus/Test/SearchTaskTest.cs index 51943247c..221f0fb3c 100644 --- a/MetaMorpheus/Test/SearchTaskTest.cs +++ b/MetaMorpheus/Test/SearchTaskTest.cs @@ -614,7 +614,7 @@ public static void TestPepFilteringFewerThan100Psms() string resultsFile = Path.Combine(pepTaskFolder, "results.txt"); string[] results = File.ReadAllLines(resultsFile); Assert.AreEqual("PEP could not be calculated due to an insufficient number of PSMs. 
Results were filtered by q-value.", results[6]); - Assert.AreEqual("All target PSMs with q-value = 1: 84", results[7]); + Assert.AreEqual("All target PSMs with q-value <= 1: 84", results[7]); // clean up Directory.Delete(folderPath, true); diff --git a/MetaMorpheus/Test/SetUpTests.cs b/MetaMorpheus/Test/SetUpTests.cs index 3115c3dc7..71ca1c610 100644 --- a/MetaMorpheus/Test/SetUpTests.cs +++ b/MetaMorpheus/Test/SetUpTests.cs @@ -16,17 +16,24 @@ public class MySetUpClass private const string elementsLocation = @"elements.dat"; [OneTimeSetUp] - public static void Setup() + public static void GlobalSetup() { Environment.CurrentDirectory = TestContext.CurrentContext.TestDirectory; Loaders.LoadElements(); - + GlobalVariables.SetUpGlobalVariables(); + MetaMorpheusEngine.WarnHandler += WarnStatusHandler; MetaMorpheusTask.WarnHandler += WarnStatusHandler; EverythingRunnerEngine.FinishedAllTasksEngineHandler += SuccessfullyFinishedAllTasks; } + [OneTimeTearDown] + public static void GlobalTearDown() + { + EverythingRunnerEngineTestCase.DisposeAll(); + } + private static void SuccessfullyFinishedAllTasks(object sender, StringEventArgs rootOutputFolderPath) { outputFolder = rootOutputFolderPath.S; diff --git a/MetaMorpheus/Test/SilacTest.cs b/MetaMorpheus/Test/SilacTest.cs index b4a315c77..bdfc70c27 100644 --- a/MetaMorpheus/Test/SilacTest.cs +++ b/MetaMorpheus/Test/SilacTest.cs @@ -213,7 +213,7 @@ public static void TestSilacQuantification() Assert.IsTrue(File.Exists(mzIDPath1)); Assert.IsTrue(File.Exists(mzIDPath2)); - Assert.IsTrue(theStringResult.Contains("All target PSMs with q-value = 0.01: 2")); //it's not a psm, it's a MBR feature. 2 because there are two files, but not 4 because MBR != psm + Assert.IsTrue(theStringResult.Contains("All target PSMs with q-value <= 0.01: 2")); //it's not a psm, it's a MBR feature. 
2 because there are two files, but not 4 because MBR != psm ///Normal Peptide //test proteins diff --git a/MetaMorpheus/Test/Test.csproj b/MetaMorpheus/Test/Test.csproj index 154a19918..10ecccb53 100644 --- a/MetaMorpheus/Test/Test.csproj +++ b/MetaMorpheus/Test/Test.csproj @@ -582,6 +582,9 @@ Always + + Always + Always diff --git a/MetaMorpheus/Test/TestData/Task2-SearchTaskconfig.toml b/MetaMorpheus/Test/TestData/Task2-SearchTaskconfig.toml index 59939e394..4045885e8 100644 --- a/MetaMorpheus/Test/TestData/Task2-SearchTaskconfig.toml +++ b/MetaMorpheus/Test/TestData/Task2-SearchTaskconfig.toml @@ -22,7 +22,7 @@ MaxFragmentSize = 30000.0 HistogramBinTolInDaltons = 0.003 MaximumMassThatFragmentIonScoreIsDoubled = 0.0 WriteMzId = false -WritePepXml = false +WritePepXml = true WriteDecoys = true WriteContaminants = true diff --git a/MetaMorpheus/Test/TestData/smalldb.fasta b/MetaMorpheus/Test/TestData/smalldb.fasta index 9b710608c..50b6f5ef9 100644 --- a/MetaMorpheus/Test/TestData/smalldb.fasta +++ b/MetaMorpheus/Test/TestData/smalldb.fasta @@ -48,6 +48,13 @@ NGGEATFGGIDESKFKGDITWLPVRRKAYWEVKFEGIGLGDEYAELESHGAAIDTGTSLI TLPSGLAEMINAEIGAKKGWTGQYTLDCNTRDNLPDLIFNFNGYNFTIGPYDYTLEVSGS CISAITPMDFPEPVGPLAIVGDAFLRKYYSIYDLGNNAVGLAKAI +>sp|P0726721|FakeTestProtein Saccharopepsin OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=PEP4 PE=1 SV=1 +MPKVYSYQEVAEHNGPENFWIIIDDKVYDVSQFKDEHPGGDEIIMDLGGQDATESFVDIG +HSDEALRLLKGLYIGDVDKTSERVSVEKVSTSENQSKGSGTLVVILAILMLGVAYYLLNE + +>sp|P0726721|FakeTestProtein2 Saccharopepsin OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=PEP4 PE=1 SV=1 +QIVHDSGR + >sp|P29547|EF1G1_YEAST Elongation factor 1-gamma 1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=CAM1 PE=1 SV=2 MSQGTLYANFRIRTWVPRGLVKALKLDVKVVTPDAAAEQFARDFPLKKVPAFVGPKGYKL TEAMAINYYLVKLSQDDKMKTQLLGADDDLNAQAQIIRWQSLANSDLCIQIANTIVPLKG diff --git a/MetaMorpheus/Test/TestPsm.cs b/MetaMorpheus/Test/TestPsm.cs index 5500fd414..f3921c997 100644 --- 
a/MetaMorpheus/Test/TestPsm.cs +++ b/MetaMorpheus/Test/TestPsm.cs @@ -436,8 +436,8 @@ public static void TestPsmCount2() List results = File.ReadAllLines(Path.Combine(outputFolder, @"results.txt")).ToList(); - string peptideCountFromResultsString = results.Where(r => r.Contains("All target peptides with q-value = 0.01: ")).FirstOrDefault(); - double peptideCountFromResults = Convert.ToDouble(peptideCountFromResultsString.Split(':')[1].ToString()); + string peptideCountFromResultsString = results.FirstOrDefault(r => r.Contains("All target peptides with q-value <= 0.01: ")); + double peptideCountFromResults = Convert.ToDouble(peptideCountFromResultsString?.Split(':')[1].ToString()); Assert.AreEqual(allPeptidesQvalueBelowCutoff, peptideCountFromResults); Directory.Delete(outputFolder, true); Directory.Delete(Path.Combine(TestContext.CurrentContext.TestDirectory, @"Task Settings"), true); diff --git a/MetaMorpheus/Test/TopDownTestData/TopDownSearchToml.toml b/MetaMorpheus/Test/TopDownTestData/TopDownSearchToml.toml new file mode 100644 index 000000000..38be3a05a --- /dev/null +++ b/MetaMorpheus/Test/TopDownTestData/TopDownSearchToml.toml @@ -0,0 +1,97 @@ +TaskType = "Search" + +[SearchParameters] +DisposeOfFileWhenDone = true +DoParsimony = true +ModPeptidesAreDifferent = false +NoOneHitWonders = false +MatchBetweenRuns = false +Normalize = false +QuantifyPpmTol = 5.0 +DoHistogramAnalysis = false +SearchTarget = true +DecoyType = "Reverse" +MassDiffAcceptorType = "ThreeMM" +WritePrunedDatabase = false +KeepAllUniprotMods = true +DoLocalizationAnalysis = true +DoLabelFreeQuantification = false +UseSharedPeptidesForLFQ = false +DoMultiplexQuantification = false +MultiplexModId = "TMT10" +DoSpectralRecovery = false +SearchType = "Classic" +LocalFdrCategories = ["FullySpecific"] +MaxFragmentSize = 30000.0 +MinAllowedInternalFragmentLength = 0 +HistogramBinTolInDaltons = 0.003 +MaximumMassThatFragmentIonScoreIsDoubled = 0.0 +WriteMzId = true +WritePepXml = false 
+WriteHighQValuePsms = true +WriteDecoys = true +WriteContaminants = true +WriteIndividualFiles = true +WriteSpectralLibrary = false +UpdateSpectralLibrary = false +CompressIndividualFiles = false +TCAmbiguity = "RemoveContaminant" +IncludeModMotifInMzid = false + +[SearchParameters.ModsToWriteSelection] +'N-linked glycosylation' = 3 +'O-linked glycosylation' = 3 +'Other glycosylation' = 3 +'Common Biological' = 3 +'Less Common' = 3 +Metal = 3 +'2+ nucleotide substitution' = 3 +'1 nucleotide substitution' = 3 +UniProt = 2 + +[CommonParameters] +TaskDescriptor = "SearchTask" +MaxThreadsToUsePerFile = 23 +ListOfModsFixed = "Common Fixed\tCarbamidomethyl on C\t\tCommon Fixed\tCarbamidomethyl on U" +ListOfModsVariable = "" +DoPrecursorDeconvolution = true +UseProvidedPrecursorInfo = false +DeconvolutionMaxAssumedChargeState = 60 +TotalPartitions = 1 +ProductMassTolerance = "±20.0000 PPM" +PrecursorMassTolerance = "±20.0000 PPM" +AddCompIons = false +QValueThreshold = 0.01 +PepQValueThreshold = 1.0 +ScoreCutoff = 2.0 +ReportAllAmbiguity = true +NumberOfPeaksToKeepPerWindow = 200 +MinimumAllowedIntensityRatioToBasePeak = 0.01 +NormalizePeaksAccrossAllWindows = false +TrimMs1Peaks = false +TrimMsMsPeaks = false +CustomIons = [] +AssumeOrphanPeaksAreZ1Fragments = false +MaxHeterozygousVariants = 4 +MinVariantDepth = 1 +AddTruncations = false +DissociationType = "Autodetect" +SeparationType = "HPLC" +MS2ChildScanDissociationType = "Unknown" +MS3ChildScanDissociationType = "Unknown" +UseMostAbundantPrecursorIntensity = true + +[CommonParameters.DigestionParams] +InitiatorMethionineBehavior = "Variable" +MaxMissedCleavages = 2 +MaxModificationIsoforms = 1024 +SearchModeType = "Full" +FragmentationTerminus = "Both" +SpecificProtease = "top-down" +GeneratehUnlabeledProteinsForSilac = true +KeepNGlycopeptide = false +KeepOGlycopeptide = false +Protease = "top-down" +MinPeptideLength = 7 +MaxPeptideLength = 2147483647 +MaxModsForPeptide = 2 From 
22c8fe0d38ac672850ca92e28f6cc0e356c7d86f Mon Sep 17 00:00:00 2001 From: RayMSMS <150720362+RayMSMS@users.noreply.github.com> Date: Fri, 23 Aug 2024 12:23:13 -0500 Subject: [PATCH 2/7] Add the comment on glyco-searching and the search summary on the result file (#2367) * The lie's comment about the glyco-searching * Try to add the search summary information (PSMs, protein group, glycoPsms, Level1-glycoPsms) into the "AllResult.txt" file for glycoSearch (1) adding text function in PostGlycoSearchAnalysisTask class (2) adding tester in TestOGlyco class (make sure we parse the certain value) (3) revise the "readCsv", enable to read the allPSMs file smoothly. * add the contaminant tester * Delet the unused constructor of Node class to cheat the coverage check * Fix the Fdr filter (initial: < 0.1, now <= 0.1) * Try to pass the coverage test, add the docoy filtering tester We also allow to get the same PSMs in duplicated files. * update 7/2/2024 (1) Rewrite the Summary writing function (2)Add the comment in the fuction header * In order to pass the converage, add the new model in the tester "N-glycan fragment" * Update 7/4/2024 1. add new tester model for "OGlycanCompositionFragments" * update 7/5/2024 1. add the tester for writing function, in different search type 2. glycoBox tester for decoy glycanBox * update 7/11/2024 1. 
delete the bin and retry to pass the tester * update 8/6/2024 deleted the duplicate tester * store the code --------- Co-authored-by: Nic Bollis --- MetaMorpheus/Test/TestOGlyco.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MetaMorpheus/Test/TestOGlyco.cs b/MetaMorpheus/Test/TestOGlyco.cs index 471c6d61b..886ab208a 100644 --- a/MetaMorpheus/Test/TestOGlyco.cs +++ b/MetaMorpheus/Test/TestOGlyco.cs @@ -658,7 +658,7 @@ public static void OGlycoTest_Run5() Directory.Delete(outputFolder, true); } - [Test] + [Test] public static void OGlycoTest_Run5_WriteContaminants() { string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TESTGlycoData"); From c0bb0a066fa418e1f38234c6e56e5ea394cb3d9a Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Fri, 23 Aug 2024 13:11:44 -0500 Subject: [PATCH 3/7] Github Actions 1: Cross Platform Build (#2398) * Create build.yml started github actions * Update build.yml * Update build.yml * Update build.yml * enabled windows targeting to gui and guifunctions * Update build.yml * windows targeting * added new configuration * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * added test.yml from vs * reverterd commit * updated some csproj files * touched up GUI.csproj * Update build.yml * fixed the test? 
* Changes at alex's suggestion * added back in guifunctions for tests * removed windows targeting from GUI as it is no longer being built on windows and mac * cleaned up alot * did the thing --------- Co-authored-by: trishorts --- .github/workflows/build.yml | 37 +++++++++++++++++++ MetaMorpheus/GUI/GUI.csproj | 1 + MetaMorpheus/GuiFunctions/GuiFunctions.csproj | 1 + MetaMorpheus/MetaMorpheus.sln | 14 +++++++ MetaMorpheus/Test/SpectralRecoveryTest.cs | 4 +- MetaMorpheus/Test/Test.csproj | 1 + 6 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..f6325c098 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,37 @@ +# This workflow will build a .NET project +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-net + +name: Build + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + name: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + include: + - os: ubuntu-latest + configuration: UbuntuMac + - os: macos-latest + configuration: UbuntuMac + - os: windows-latest + configuration: Release + + steps: + - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: 8.0.204 + - name: Restore dependencies + run: dotnet restore ./MetaMorpheus/MetaMorpheus.sln + - name: Build + run: dotnet build --no-restore ./MetaMorpheus/MetaMorpheus.sln --configuration ${{ matrix.configuration }} diff --git a/MetaMorpheus/GUI/GUI.csproj b/MetaMorpheus/GUI/GUI.csproj index 29027f365..d57835e75 100644 --- a/MetaMorpheus/GUI/GUI.csproj +++ b/MetaMorpheus/GUI/GUI.csproj @@ -16,6 +16,7 @@ full true Icons\MMnice.ico + true diff --git a/MetaMorpheus/GuiFunctions/GuiFunctions.csproj 
b/MetaMorpheus/GuiFunctions/GuiFunctions.csproj index 27f6145ae..a45054b18 100644 --- a/MetaMorpheus/GuiFunctions/GuiFunctions.csproj +++ b/MetaMorpheus/GuiFunctions/GuiFunctions.csproj @@ -6,6 +6,7 @@ full true true + true diff --git a/MetaMorpheus/MetaMorpheus.sln b/MetaMorpheus/MetaMorpheus.sln index d03b322a1..f6a4a2a48 100644 --- a/MetaMorpheus/MetaMorpheus.sln +++ b/MetaMorpheus/MetaMorpheus.sln @@ -26,40 +26,54 @@ Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU + UbuntuMac|Any CPU = UbuntuMac|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.Debug|Any CPU.Build.0 = Debug|Any CPU {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.Release|Any CPU.ActiveCfg = Release|Any CPU {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.Release|Any CPU.Build.0 = Release|Any CPU + {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU + {FD20EBBA-F4C5-40D6-AD61-48A7EB255DAE}.UbuntuMac|Any CPU.Build.0 = Release|Any CPU {C654FC97-FBD1-43D7-9F61-35FDD1A4E0AD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {C654FC97-FBD1-43D7-9F61-35FDD1A4E0AD}.Debug|Any CPU.Build.0 = Debug|Any CPU {C654FC97-FBD1-43D7-9F61-35FDD1A4E0AD}.Release|Any CPU.ActiveCfg = Release|Any CPU {C654FC97-FBD1-43D7-9F61-35FDD1A4E0AD}.Release|Any CPU.Build.0 = Release|Any CPU + {C654FC97-FBD1-43D7-9F61-35FDD1A4E0AD}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU {FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.Debug|Any CPU.Build.0 = Debug|Any CPU {FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.Release|Any CPU.ActiveCfg = Release|Any CPU {FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.Release|Any CPU.Build.0 = Release|Any CPU + {FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU + 
{FFAE3A5E-B5AE-4CD0-ABF9-703C91F1C7D6}.UbuntuMac|Any CPU.Build.0 = Release|Any CPU {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.Debug|Any CPU.Build.0 = Debug|Any CPU {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.Release|Any CPU.ActiveCfg = Release|Any CPU {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.Release|Any CPU.Build.0 = Release|Any CPU + {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU + {9CD3D75C-4E3C-40AC-A3D8-C32DD528DB7F}.UbuntuMac|Any CPU.Build.0 = Release|Any CPU {BE72541A-66A4-4958-9D19-56ACC7D3876B}.Debug|Any CPU.ActiveCfg = Debug|x64 {BE72541A-66A4-4958-9D19-56ACC7D3876B}.Debug|Any CPU.Build.0 = Debug|x64 {BE72541A-66A4-4958-9D19-56ACC7D3876B}.Release|Any CPU.ActiveCfg = Release|x64 {BE72541A-66A4-4958-9D19-56ACC7D3876B}.Release|Any CPU.Build.0 = Release|x64 + {BE72541A-66A4-4958-9D19-56ACC7D3876B}.UbuntuMac|Any CPU.ActiveCfg = Release|x64 {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.Debug|Any CPU.Build.0 = Debug|Any CPU {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.Release|Any CPU.ActiveCfg = Release|Any CPU {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.Release|Any CPU.Build.0 = Release|Any CPU + {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU + {653015B3-CC5A-4D1C-AAD3-1CE2A0C4D197}.UbuntuMac|Any CPU.Build.0 = Release|Any CPU {7EE028A9-75A2-450F-A9A7-76559ED15419}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {7EE028A9-75A2-450F-A9A7-76559ED15419}.Debug|Any CPU.Build.0 = Debug|Any CPU {7EE028A9-75A2-450F-A9A7-76559ED15419}.Release|Any CPU.ActiveCfg = Release|Any CPU {7EE028A9-75A2-450F-A9A7-76559ED15419}.Release|Any CPU.Build.0 = Release|Any CPU + {7EE028A9-75A2-450F-A9A7-76559ED15419}.UbuntuMac|Any CPU.ActiveCfg = Release|Any CPU + {7EE028A9-75A2-450F-A9A7-76559ED15419}.UbuntuMac|Any CPU.Build.0 = Release|Any CPU {E0EA5AC4-24A9-43DC-8FBC-CCEB3B9935B6}.Debug|Any 
CPU.ActiveCfg = Debug|x64 {E0EA5AC4-24A9-43DC-8FBC-CCEB3B9935B6}.Debug|Any CPU.Build.0 = Debug|x64 {E0EA5AC4-24A9-43DC-8FBC-CCEB3B9935B6}.Release|Any CPU.ActiveCfg = Release|x64 {E0EA5AC4-24A9-43DC-8FBC-CCEB3B9935B6}.Release|Any CPU.Build.0 = Release|x64 + {E0EA5AC4-24A9-43DC-8FBC-CCEB3B9935B6}.UbuntuMac|Any CPU.ActiveCfg = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/MetaMorpheus/Test/SpectralRecoveryTest.cs b/MetaMorpheus/Test/SpectralRecoveryTest.cs index 40c519ec4..3257361c6 100644 --- a/MetaMorpheus/Test/SpectralRecoveryTest.cs +++ b/MetaMorpheus/Test/SpectralRecoveryTest.cs @@ -285,6 +285,8 @@ public static void MiniClassicSearchEngineTest() [Test] public static void SpectralWriterTest() { + foreach (var specLibPath in Directory.GetFiles(outputFolder, "*.msp", SearchOption.AllDirectories)) + File.Delete(specLibPath); PostSearchAnalysisTask postSearchTask = new PostSearchAnalysisTask() { @@ -377,7 +379,7 @@ public static void SpectralWriterTest() postSearchTask.Run(); var libraryList = Directory.GetFiles(path, "*.*", SearchOption.AllDirectories); - string updateLibraryPath = libraryList.First(p => p.Contains("SpectralLibrary") && !p.Contains(matchingvalue)).ToString(); + string updateLibraryPath = libraryList.First(p => p.Contains("updateSpectralLibrary") && !p.Contains(matchingvalue)).ToString(); var updatedLibraryWithoutDecoy = new SpectralLibrary(new List { Path.Combine(path, updateLibraryPath) }); Assert.That(updatedLibraryWithoutDecoy.TryGetSpectrum("EESGKPGAHVTVK", 2, out spectrum)); diff --git a/MetaMorpheus/Test/Test.csproj b/MetaMorpheus/Test/Test.csproj index 10ecccb53..12286e77f 100644 --- a/MetaMorpheus/Test/Test.csproj +++ b/MetaMorpheus/Test/Test.csproj @@ -5,6 +5,7 @@ false Debug;Release full + true From 30e73f0b80a82f340fd38b5cf507c9d5e1218176 Mon Sep 17 00:00:00 2001 From: Anthony Date: Mon, 26 Aug 2024 09:38:55 -0700 Subject: [PATCH 4/7] polishing prose (#2403) --- 
.../PrecursorSearchModes/DotMassDiffAcceptor.cs | 4 ++-- MetaMorpheus/TaskLayer/MetaMorpheusTask.cs | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/MetaMorpheus/EngineLayer/PrecursorSearchModes/DotMassDiffAcceptor.cs b/MetaMorpheus/EngineLayer/PrecursorSearchModes/DotMassDiffAcceptor.cs index 657154052..13ea0fd8c 100644 --- a/MetaMorpheus/EngineLayer/PrecursorSearchModes/DotMassDiffAcceptor.cs +++ b/MetaMorpheus/EngineLayer/PrecursorSearchModes/DotMassDiffAcceptor.cs @@ -55,7 +55,7 @@ public override string ToString() public override string ToProseString() { - return (Tolerance.ToString() + " around " + String.Join(",", AcceptableSortedMassShifts) + " Da"); + return (Tolerance.ToString() + " around " + String.Join(", ", AcceptableSortedMassShifts) + " Da"); } } -} \ No newline at end of file +} diff --git a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs index cbc145fcf..0a0f89e35 100644 --- a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs +++ b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs @@ -572,12 +572,13 @@ public MyTaskResults RunTask(string output_folder, List currentProtei using (StreamWriter file = new StreamWriter(proseFilePath)) { file.WriteLine("The data analysis was performed using MetaMorpheus version " + GlobalVariables.MetaMorpheusVersion + ", available at " + "https://github.com/smith-chem-wisc/MetaMorpheus."); + file.WriteLine(); file.Write(ProseCreatedWhileRunning.ToString()); file.WriteLine(SystemInfo.SystemProse().Replace(Environment.NewLine, "") + " "); + file.WriteLine(); file.WriteLine("The total time to perform the " + TaskType + " task on " + currentRawDataFilepathList.Count + " spectra file(s) was " + String.Format("{0:0.00}", MyTaskResults.Time.TotalMinutes) + " minutes."); file.WriteLine(); - file.WriteLine("Published works using MetaMorpheus software are encouraged to cite: Solntsev, S. K.; Shortreed, M. R.; Frey, B. L.; Smith, L. M. 
Enhanced Global Post-translational Modification Discovery with MetaMorpheus. Journal of Proteome Research. 2018, 17 (5), 1844-1851."); - + file.WriteLine("Published works using MetaMorpheus software are encouraged to cite the appropriate publications listed in the reference guide, found here: https://github.com/smith-chem-wisc/MetaMorpheus/blob/master/README.md."); file.WriteLine(); file.WriteLine("Spectra files: "); file.WriteLine(string.Join(Environment.NewLine, currentRawDataFilepathList.Select(b => '\t' + b))); @@ -1186,4 +1187,4 @@ protected static void SanitizeProteinDatabase(List proteins, TargetCont } } } -} \ No newline at end of file +} From dd3dc20084e6f979bffefac56d805a3c7f35bbc4 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 27 Aug 2024 11:29:00 -0500 Subject: [PATCH 5/7] update mzlib nuget package to 551 (#2404) --- MetaMorpheus/CMD/CMD.csproj | 2 +- MetaMorpheus/EngineLayer/EngineLayer.csproj | 2 +- MetaMorpheus/GUI/GUI.csproj | 2 +- MetaMorpheus/GuiFunctions/GuiFunctions.csproj | 2 +- MetaMorpheus/TaskLayer/TaskLayer.csproj | 2 +- MetaMorpheus/Test/Test.csproj | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/MetaMorpheus/CMD/CMD.csproj b/MetaMorpheus/CMD/CMD.csproj index f278a13ce..44eb33bbe 100644 --- a/MetaMorpheus/CMD/CMD.csproj +++ b/MetaMorpheus/CMD/CMD.csproj @@ -24,7 +24,7 @@ - + diff --git a/MetaMorpheus/EngineLayer/EngineLayer.csproj b/MetaMorpheus/EngineLayer/EngineLayer.csproj index a8621408a..9c82f4ec2 100644 --- a/MetaMorpheus/EngineLayer/EngineLayer.csproj +++ b/MetaMorpheus/EngineLayer/EngineLayer.csproj @@ -21,7 +21,7 @@ - + diff --git a/MetaMorpheus/GUI/GUI.csproj b/MetaMorpheus/GUI/GUI.csproj index d57835e75..df8cfd5df 100644 --- a/MetaMorpheus/GUI/GUI.csproj +++ b/MetaMorpheus/GUI/GUI.csproj @@ -55,7 +55,7 @@ - + diff --git a/MetaMorpheus/GuiFunctions/GuiFunctions.csproj b/MetaMorpheus/GuiFunctions/GuiFunctions.csproj index a45054b18..e2989fbcb 100644 --- a/MetaMorpheus/GuiFunctions/GuiFunctions.csproj +++ 
b/MetaMorpheus/GuiFunctions/GuiFunctions.csproj @@ -16,7 +16,7 @@ - + diff --git a/MetaMorpheus/TaskLayer/TaskLayer.csproj b/MetaMorpheus/TaskLayer/TaskLayer.csproj index ded75a26e..35f063006 100644 --- a/MetaMorpheus/TaskLayer/TaskLayer.csproj +++ b/MetaMorpheus/TaskLayer/TaskLayer.csproj @@ -21,7 +21,7 @@ - + diff --git a/MetaMorpheus/Test/Test.csproj b/MetaMorpheus/Test/Test.csproj index 12286e77f..a4d8b9f5c 100644 --- a/MetaMorpheus/Test/Test.csproj +++ b/MetaMorpheus/Test/Test.csproj @@ -24,7 +24,7 @@ - + From 5c019b4f04a72c19351e71cfd696cdb4a64a666c Mon Sep 17 00:00:00 2001 From: Alexander-Sol <41119316+Alexander-Sol@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:58:19 -0500 Subject: [PATCH 6/7] Reworking PEP to prevent cross-contamination (#2394) --- MetaMorpheus/EngineLayer/CommonParameters.cs | 2 +- .../FdrAnalysis/FdrAnalysisEngine.cs | 26 +- ...nalysisGeneric.cs => PEPAnalysisEngine.cs} | 937 +++++++++--------- .../FdrAnalysis/PeptideMatchGroup.cs | 70 ++ MetaMorpheus/EngineLayer/SpectralMatch.cs | 2 +- MetaMorpheus/TaskLayer/FilteredPsms.cs | 23 +- .../MbrAnalysis/SpectralRecoveryRunner.cs | 67 +- MetaMorpheus/TaskLayer/MetaMorpheusTask.cs | 40 + .../SearchTask/PostSearchAnalysisTask.cs | 9 +- .../TaskLayer/SearchTask/SearchTask.cs | 2 +- .../Test/EverythingRunnerEngineTestCase.cs | 3 +- MetaMorpheus/Test/FdrTest.cs | 110 +- MetaMorpheus/Test/PeptideSpectralMatchTest.cs | 9 +- .../Test/PostSearchAnalysisTaskTests.cs | 36 +- MetaMorpheus/Test/SearchEngineTests.cs | 1 + MetaMorpheus/Test/SpectralRecoveryTest.cs | 27 +- MetaMorpheus/Test/XLTest.cs | 84 +- 17 files changed, 813 insertions(+), 635 deletions(-) rename MetaMorpheus/EngineLayer/FdrAnalysis/{PEPValueAnalysisGeneric.cs => PEPAnalysisEngine.cs} (66%) create mode 100644 MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs diff --git a/MetaMorpheus/EngineLayer/CommonParameters.cs b/MetaMorpheus/EngineLayer/CommonParameters.cs index 335749707..f95d6cb12 100644 --- 
a/MetaMorpheus/EngineLayer/CommonParameters.cs +++ b/MetaMorpheus/EngineLayer/CommonParameters.cs @@ -163,7 +163,7 @@ public int DeconvolutionMaxAssumedChargeState /// This parameter determines which PSMs/Peptides will be used as postive training examples /// when training the GBDT model for PEP. /// - public double QValueCutoffForPepCalculation { get; private set; } + public double QValueCutoffForPepCalculation { get; set; } public DigestionParams DigestionParams { get; private set; } public bool ReportAllAmbiguity { get; private set; } public int? NumberOfPeaksToKeepPerWindow { get; private set; } diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index 454bcebbc..e75e91b19 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -1,4 +1,5 @@ -using System; +using EngineLayer.CrosslinkSearch; +using System; using System.Collections.Generic; using System.IO; using System.Linq; @@ -275,18 +276,25 @@ public static void PepQValueInverted(List psms, bool peptideLevel public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults, List psms) { - if (psms[0].DigestionParams.Protease.Name == "top-down") + string searchType; + // Currently, searches of mixed data (bottom-up + top-down) are not supported + // PEP will be calculated based on the search type of the first file/PSM in the list, which isn't ideal + // This will be addressed in a future release + switch(psms[0].DigestionParams.Protease.Name) { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "top-down", this.FileSpecificParameters, this.OutputFolder); + case "top-down": + searchType = "top-down"; + break; + default: + searchType = "standard"; + break; } - else if (psms[0].DigestionParams.Protease.Name == "crosslink") + if (psms[0] is CrosslinkSpectralMatch) { - 
myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "crosslink", this.FileSpecificParameters, this.OutputFolder); - } - else - { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "standard", this.FileSpecificParameters, this.OutputFolder); + searchType = "crosslink"; } + myAnalysisResults.BinarySearchTreeMetrics = new PepAnalysisEngine(psms, searchType, FileSpecificParameters, OutputFolder).ComputePEPValuesForAllPSMs(); + } /// diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs similarity index 66% rename from MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs rename to MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs index 2fa22248f..f68cd3c0a 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs @@ -16,23 +16,41 @@ using System.Threading.Tasks; using Omics.Modifications; using Omics; +using Easy.Common.Extensions; namespace EngineLayer { - public static class PEP_Analysis_Cross_Validation + public class PepAnalysisEngine { private static readonly double AbsoluteProbabilityThatDistinguishesPeptides = 0.05; - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = new Dictionary>>(); - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = new Dictionary>>(); - private static Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = new Dictionary>>(); - + + //These two dictionaries contain the average and standard deviations of hydrophobicitys measured in 1 minute increments accross each raw + //file separately. 
An individully measured hydrobophicty calculated for a specific PSM sequence is compared to these values by computing + //the z-score. That z-score is used as a feature for machine learning. + //Separate dictionaries are created for peptides with modifications because SSRcalc doesn't really do a good job predicting hyrophobicity + + //The first string in the dictionary is the filename + //The value of the dictionary is another dictionary that profiles the hydrophobicity behavior. + //Each key is a retention time rounded to the nearest minute. + //The value Tuple is the average and standard deviation, respectively, of the predicted hydrophobicities of the observed peptides eluting at that rounded retention time. + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified { get; private set; } + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified { get; private set; } + public Dictionary>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE { get; private set; } + /// /// A dictionary which stores the chimeric ID string in the key and the number of chimeric identifications as the vale /// - private static Dictionary chimeraCountDictionary = new Dictionary(); - public static bool UsePeptideLevelQValueForTraining = true; - public static double QValueCutoff = 0.005; - + private Dictionary chimeraCountDictionary = new Dictionary(); + public Dictionary FileSpecificMedianFragmentMassErrors { get; private set; } + public Dictionary FileSpecificParametersDictionary { get; private set; } + public int ChargeStateMode { get; private set; } + + public double QValueCutoff { get; } + public bool UsePeptideLevelQValueForTraining = true; + public string[] TrainingVariables { get; } + public string OutputFolder { get; } + public List AllPsms { get; } + public string SearchType { get; } /// /// This method is used to compute the PEP values for all PSMs in a dataset. 
@@ -42,158 +60,254 @@ public static class PEP_Analysis_Cross_Validation /// /// /// - public static string ComputePEPValuesForAllPSMsGeneric(List psms, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, string outputFolder) + public void SetFileSpecificParameters(List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters) + { + FileSpecificParametersDictionary = fileSpecificParameters.ToDictionary(p => Path.GetFileName(p.fileName), p => p.fileSpecificParameters); + } + + public PepAnalysisEngine(List psms, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, string outputFolder) + { + // This creates a new list of PSMs, but does not clone the Psms themselves. + // This allows the PSMs to be modified and the order to be preserved + AllPsms = psms.OrderByDescending(p => p).ToList(); + TrainingVariables = PsmData.trainingInfos[searchType]; + OutputFolder = outputFolder; + SearchType = searchType; + SetFileSpecificParameters(fileSpecificParameters); + BuildFileSpecificDictionaries(psms, TrainingVariables); + QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005); + + // If we have more than 100 peptides, we will train on the peptide level. Otherwise, we will train on the PSM level + UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100; + } + + public string ComputePEPValuesForAllPSMs() { - string[] trainingVariables = PsmData.trainingInfos[searchType]; - - //ensure that the order is always stable. 
- psms = psms.OrderByDescending(p => p).ToList(); - List allPeptideIndices = new List(); - List peptides = psms - .GroupBy(b => b.FullSequence) - .Select(b => b.FirstOrDefault()).ToList(); - List countOfPeptidesInEachFile = peptides.GroupBy(b => b.FullFilePath).Select(b => b.Count()).ToList(); - bool allFilesContainPeptides = (countOfPeptidesInEachFile.Count == fileSpecificParameters.Count); //rare condition where each file has psms but some files don't have peptides. probably only happens in unit tests. - UsePeptideLevelQValueForTraining = true; - QValueCutoff = fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(); - - int chargeStateMode = 0; - Dictionary fileSpecificMedianFragmentMassErrors = new Dictionary(); - if (peptides.Count() > 100 && allFilesContainPeptides) + List peptideGroups = UsePeptideLevelQValueForTraining + ? PeptideMatchGroup.GroupByBaseSequence(AllPsms) + : PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + + if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4)) { - foreach (var peptide in peptides) + peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + } + + int numGroups = 4; + List[] peptideGroupIndices = GetPeptideGroupIndices(peptideGroups, numGroups); + IEnumerable[] PSMDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + PSMDataGroups[i] = CreatePsmData(SearchType, peptideGroups, peptideGroupIndices[i]); + + if(!PSMDataGroups[i].Any(p => p.Label) || !PSMDataGroups[i].Any(p => !p.Label)) { - allPeptideIndices.Add(psms.IndexOf(peptide)); + return "Posterior error probability analysis failed. 
This can occur for small data sets when some sample groups are missing positive or negative training examples."; } - chargeStateMode = GetChargeStateMode(peptides); - fileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(peptides); } - else + + MLContext mlContext = new MLContext(); + TransformerChain>>[] trainedModels = new TransformerChain>>[numGroups]; + + var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); + var pipeline = mlContext.Transforms.Concatenate("Features", TrainingVariables) + .Append(trainer); + + List allMetrics = new List(); + int sumOfAllAmbiguousPeptidesResolved = 0; + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) { - //there are too few psms to do any meaningful training if we used only peptides. So, we will train using psms instead. - UsePeptideLevelQValueForTraining = false; - allPeptideIndices = Enumerable.Range(0, psms.Count).ToList(); - chargeStateMode = GetChargeStateMode(psms); - fileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(psms); + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. 
+ IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]].Concat(PSMDataGroups[allGroupIndexes[1]].Concat(PSMDataGroups[allGroupIndexes[2]]))); + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(PSMDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + //model is trained on peptides but here we can use that to compute PEP for all PSMs + int ambiguousPeptidesResolved = Compute_PSM_PEP(peptideGroups, peptideGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], SearchType, OutputFolder); + + allMetrics.Add(metrics); + sumOfAllAmbiguousPeptidesResolved += ambiguousPeptidesResolved; } - - //These two dictionaries contain the average and standard deviations of hydrophobicitys measured in 1 minute increments accross each raw - //file separately. An individully measured hydrobophicty calculated for a specific PSM sequence is compared to these values by computing - //the z-score. That z-score is used as a feature for machine learning. 
- //Separate dictionaries are created for peptides with modifications because SSRcalc doesn't really do a good job predicting hyrophobicity + return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved); + } - //The first string in the dictionary is the filename - //The value of the dictionary is another dictionary that profiles the hydrophobicity behavior. - //Each key is a retention time rounded to the nearest minute. - //The value Tuple is the average and standard deviation, respectively, of the predicted hydrophobicities of the observed peptides eluting at that rounded retention time. + /// + /// Sets the following instance properties: ChargeStateMode, FileSpecificMedianFragmentMassErrors, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, and FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE + /// + /// The PSMs that will be used for training + /// An array of training variables from PsmData.trainingInfos dictionary + public void BuildFileSpecificDictionaries(List trainingData, string[] trainingVariables) + { + FileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(trainingData); + ChargeStateMode = GetChargeStateMode(trainingData); if (trainingVariables.Contains("HydrophobicityZScore")) { - if (peptides.Count() > 100 && allFilesContainPeptides) + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(trainingData, false); + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(trainingData, true); + FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(trainingData); + } + } + + public static List[] GetPeptideGroupIndices(List peptides, int numGroups) + { + List[] groupsOfIndices = new List[numGroups]; + + List targetIndices = new List(); + List decoyIndices = new List(); + for (int i = 0; i <
peptides.Count; i++) + { + if (peptides[i].BestMatch.IsDecoy) { - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(peptides, fileSpecificParameters, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(peptides, fileSpecificParameters, true); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(peptides, fileSpecificParameters); + decoyIndices.Add(i); } else { - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(psms, fileSpecificParameters, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(psms, fileSpecificParameters, true); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(psms, fileSpecificParameters); + targetIndices.Add(i); } } - if (trainingVariables.Contains("ChimeraCount")) - chimeraCountDictionary = psms.GroupBy(p => p.ChimeraIdString) - .ToDictionary(p => p.Key, p => p.Count()); - - MLContext mlContext = new MLContext(); + var targetIndexGroups = DivideListIntoGroups(targetIndices, numGroups); + var decoyIndexGroups = DivideListIntoGroups(decoyIndices, numGroups); - //the number of groups used for cross-validation is hard-coded at four. Do not change this number without changes other areas of effected code. - int numGroups = 4; - if (psms.Count < 1000 || allPeptideIndices.Count < 500) + for (int i = 0; i < numGroups; i++) { - numGroups = 2; + groupsOfIndices[i] = targetIndexGroups[i].Concat(decoyIndexGroups[i]).ToList(); } - List[] psmGroupIndices = Get_PSM_Group_Indices(psms, numGroups); - //the psms will be randomly divided. but then we want to make another array that just contains the subset of peptides that are in those psms. that way we don't compute pep using any peptides that were used in training. 
- List[] peptideGroupIndices = Get_Peptide_Group_Indices(psmGroupIndices, allPeptideIndices); - IEnumerable[] PSMDataGroups = new IEnumerable[numGroups]; - + return groupsOfIndices; + } + + /// + /// This takes in a list of ints, and partitions them into numGroups partitions, + /// e.g., partition 1 = [0, 4, 8...], partition 2 = [1, 5, 9...], etc. + /// + /// A list containing numGroups partitions (lists of ints) + static List> DivideListIntoGroups(List list, int numGroups) + { + var groups = new List>(); for (int i = 0; i < numGroups; i++) { - PSMDataGroups[i] = CreatePsmData(searchType, fileSpecificParameters, psms, peptideGroupIndices[i], fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode); + groups.Add(new List()); } - TransformerChain>>[] trainedModels = new TransformerChain>>[numGroups]; - - var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); - var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables) - .Append(trainer); - - List allMetrics = new List(); - int sumOfAllAmbiguousPeptidesResolved = 0; - - bool allSetsContainPositiveAndNegativeTrainingExamples = true; - int groupNumber = 0; - while (allSetsContainPositiveAndNegativeTrainingExamples == true && groupNumber < numGroups) + int mainIndex = 0; + while (mainIndex < list.Count) { - if (PSMDataGroups[groupNumber].Where(p => p.Label == true).Count() == 0 || PSMDataGroups[groupNumber].Where(p => p.Label == false).Count() == 0) + int subIndex = 0; + while (subIndex < numGroups && mainIndex < list.Count) { - allSetsContainPositiveAndNegativeTrainingExamples = false; + groups[subIndex].Add(list[mainIndex]); + + subIndex++; + mainIndex++; } - groupNumber++; } - if (allSetsContainPositiveAndNegativeTrainingExamples) - { - for (int groupIndexNumber = 0; 
groupIndexNumber < numGroups; groupIndexNumber++) - { - List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); - allGroupIndexes.RemoveAt(groupIndexNumber); + return groups; + } - //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. - IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]]); - if (numGroups > 2) - { - dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[allGroupIndexes[0]].Concat(PSMDataGroups[allGroupIndexes[1]].Concat(PSMDataGroups[allGroupIndexes[2]]))); - } - trainedModels[groupIndexNumber] = pipeline.Fit(dataView); - var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(PSMDataGroups[groupIndexNumber])); - CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + public IEnumerable CreatePsmData(string searchType, + List peptideGroups, List peptideGroupIndices) + { + object psmDataListLock = new object(); + List psmDataList = new List(); + List psmOrder = new List(); + int maxThreads = FileSpecificParametersDictionary.Values.FirstOrDefault().MaxThreadsToUsePerFile; + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); - //Parallel operation of the following code requires the method to be stored and then read, once for each thread - //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation - if (outputFolder != null) + Parallel.ForEach(Partitioner.Create(0, peptideGroupIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localPsmDataList = new List(); + List localPsmOrder = new List(); + for (int i = range.Item1; i < range.Item2; i++) { - mlContext.Model.Save(trainedModels[groupIndexNumber], 
dataView.Schema, Path.Combine(outputFolder, "model.zip")); - } + // Stop loop if canceled + if (GlobalVariables.StopLoops) { return; } - //model is trained on peptides but here we can use that to compute PEP for all PSMs - int ambiguousPeptidesResolved = Compute_PSM_PEP(psms, psmGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], searchType, fileSpecificParameters, fileSpecificMedianFragmentMassErrors, chargeStateMode, outputFolder); + int modCount = 0; + foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null)) + { + PsmData newPsmData = new PsmData(); + if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null) + { + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - allMetrics.Add(metrics); - sumOfAllAmbiguousPeptidesResolved += ambiguousPeptidesResolved; - } + bool label; + if (csm.IsDecoy || csm.BetaPeptide.IsDecoy) + { + label = false; + newPsmData = CreateOnePsmDataEntry(searchType, csm, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); + } + else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && csm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) + { + label = true; + newPsmData = CreateOnePsmDataEntry(searchType, csm, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); + } + else + { + continue; + } + localPsmDataList.Add(newPsmData); + } + else + { + double bmp = 0; + foreach (var (notch, peptideWithSetMods) in psm.BestMatchingBioPolymersWithSetMods) + { + bool label; + double bmpc = psm.BestMatchingBioPolymersWithSetMods.Count(); + if (peptideWithSetMods.Parent.IsDecoy) + { + label = false; + newPsmData = CreateOnePsmDataEntry(searchType, psm, + peptideWithSetMods, notch, label); + } + else if (!peptideWithSetMods.Parent.IsDecoy + && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) + { + label = true; + newPsmData = CreateOnePsmDataEntry(searchType, psm, + peptideWithSetMods, 
notch, label); + } + else + { + continue; + } + localPsmDataList.Add(newPsmData); + localPsmOrder.Add(i + (bmp / bmpc / 2.0)); + bmp += 1.0; + } + } + modCount++; + } + } + lock (psmDataListLock) + { + psmDataList.AddRange(localPsmDataList); + psmOrder.AddRange(localPsmOrder); + } + }); + PsmData[] pda = psmDataList.ToArray(); + double[] order = psmOrder.ToArray(); - return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved); - } - else - { - return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; - } - } + Array.Sort(order, pda);//this sorts both arrays thru sorting the array in position one. The order array, keeps track of the positon in the original psms list and returns the PsmData array in that same order. - private static List[] Get_Peptide_Group_Indices(List[] psmGroupIndices, List allPeptideIndices) - { - List[] peptideGroupIndices = new List[psmGroupIndices.Length]; - for (int i = 0; i < psmGroupIndices.Length; i++) - { - peptideGroupIndices[i] = psmGroupIndices[i].Intersect(allPeptideIndices).ToList(); - } - return peptideGroupIndices; + return pda.AsEnumerable(); } public static string AggregateMetricsForOutput(List allMetrics, int sumOfAllAmbiguousPeptidesResolved) @@ -250,9 +364,11 @@ public static string AggregateMetricsForOutput(List psms, List psmIndices, MLContext mLContext, TransformerChain>> trainedModel, string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode, string outputFolder) + public int Compute_PSM_PEP(List peptideGroups, + List peptideGroupIndices, + MLContext mLContext, TransformerChain>> trainedModel, string searchType, string outputFolder) { - int maxThreads = fileSpecificParameters.FirstOrDefault().fileSpecificParameters.MaxThreadsToUsePerFile; + int maxThreads = 
FileSpecificParametersDictionary.Values.FirstOrDefault().MaxThreadsToUsePerFile; object lockObject = new object(); int ambiguousPeptidesResolved = 0; @@ -263,7 +379,7 @@ public static int Compute_PSM_PEP(List psms, List psmIndices maxThreads = 1; } - Parallel.ForEach(Partitioner.Create(0, psmIndices.Count), + Parallel.ForEach(Partitioner.Create(0, peptideGroupIndices.Count), new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, (range, loopState) => { @@ -287,32 +403,38 @@ public static int Compute_PSM_PEP(List psms, List psmIndices for (int i = range.Item1; i < range.Item2; i++) { - SpectralMatch psm = psms[psmIndices[i]]; - - if (psm != null) + foreach (SpectralMatch psm in peptideGroups[peptideGroupIndices[i]]) { - List indiciesOfPeptidesToRemove = new List(); - List pepValuePredictions = new List(); + // I'm not sure what's going on here vis-a-vis disambiguations, but I'm not going to touch it for now + if (psm != null) + { + List indiciesOfPeptidesToRemove = new List(); + List pepValuePredictions = new List(); - //Here we compute the pepvalue predection for each ambiguous peptide in a PSM. Ambiguous peptides with lower pepvalue predictions are removed from the PSM. + //Here we compute the pepvalue prediction for each ambiguous peptide in a PSM. Ambiguous peptides with lower pepvalue predictions are removed from the PSM.
- List allBmpNotches = new List(); - List allBmpPeptides = new List(); + List allBmpNotches = new List(); + List allBmpPeptides = new List(); - foreach (var (Notch, Peptide) in psm.BestMatchingBioPolymersWithSetMods) - { - allBmpNotches.Add(Notch); - allBmpPeptides.Add(Peptide); - PsmData pd = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, Peptide, Notch, !Peptide.Parent.IsDecoy); - var pepValuePrediction = threadPredictionEngine.Predict(pd); - pepValuePredictions.Add(pepValuePrediction.Probability); - //A score is available using the variable pepvaluePrediction.Score - } + foreach (var (Notch, Peptide) in psm.BestMatchingBioPolymersWithSetMods) + { + allBmpNotches.Add(Notch); + allBmpPeptides.Add(Peptide); + PsmData pd = CreateOnePsmDataEntry(searchType, psm, Peptide, Notch, !Peptide.Parent.IsDecoy); + var pepValuePrediction = threadPredictionEngine.Predict(pd); + pepValuePredictions.Add(pepValuePrediction.Probability); + //A score is available using the variable pepvaluePrediction.Score + } - GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); - int peptidesRemoved = 0; - RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, allBmpNotches, allBmpPeptides, pepValuePredictions, ref peptidesRemoved); - ambigousPeptidesRemovedinThread += peptidesRemoved; + GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); + int peptidesRemoved = 0; + RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, allBmpNotches, allBmpPeptides, pepValuePredictions, ref peptidesRemoved); + ambigousPeptidesRemovedinThread += peptidesRemoved; + + psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max(); + psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max(); + } + } } @@ -324,57 +446,192 @@ public static 
int Compute_PSM_PEP(List psms, List psmIndices return ambiguousPeptidesResolved; } - public static List[] Get_PSM_Group_Indices(List psms, int numGroups) + public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide, int notchToUse, bool label) { - List[] groupsOfIndicies = new List[numGroups]; - var targetIndexes = psms.Select((item, index) => new { Item = item, Index = index }) - .Where(x => !x.Item.IsDecoy) - .Select(x => x.Index) - .ToList(); - RandomizeListInPlace(targetIndexes); - var decoyIndexes = psms.Select((item, index) => new { Item = item, Index = index }) - .Where(x => x.Item.IsDecoy) - .Select(x => x.Index) - .ToList(); - RandomizeListInPlace(decoyIndexes); - - var targetGroups = DivideListIntoGroups(targetIndexes, numGroups); - var decoyGroups = DivideListIntoGroups(decoyIndexes, numGroups); + double normalizationFactor = selectedPeptide.BaseSequence.Length; + float totalMatchingFragmentCount = 0; + float internalMatchingFragmentCount = 0; + float intensity = 0; + float chargeDifference = 0; + float deltaScore = 0; + int notch = 0; + float ambiguity = 0; + float modCount = 0; + float absoluteFragmentMassError = 0; + float spectralAngle = 0; + float hasSpectralAngle = 0; + float chimeraCount = 0; + float peaksInPrecursorEnvelope = 0; + float mostAbundantPrecursorPeakIntensity = 0; + float fractionalIntensity = 0; - for (int i = 0; i < numGroups; i++) + float missedCleavages = 0; + float longestSeq = 0; + float complementaryIonCount = 0; + float hydrophobicityZscore = float.NaN; + bool isVariantPeptide = false; + + //crosslink specific features + float alphaIntensity = 0; + float betaIntensity = 0; + float longestFragmentIonSeries_Alpha = 0; + float longestFragmentIonSeries_Beta = 0; + float isDeadEnd = 0; + float isLoop = 0; + float isInter = 0; + float isIntra = 0; + + double multiplier = 10; + if (searchType != "crosslink") { - groupsOfIndicies[i] = 
targetGroups[i].Concat(decoyGroups[i]).ToList(); - } + if (searchType == "top-down") + { + normalizationFactor = 1.0; + } + // count only terminal fragment ions + totalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType == null) / normalizationFactor * multiplier, 0)); + internalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType != null) / normalizationFactor * multiplier, 0)); + intensity = (float)Math.Min(50, Math.Round((psm.Score - (int)psm.Score) / normalizationFactor * Math.Pow(multiplier, 2), 0)); + chargeDifference = -Math.Abs(ChargeStateMode - psm.ScanPrecursorCharge); + deltaScore = (float)Math.Round(psm.DeltaScore / normalizationFactor * multiplier, 0); + notch = notchToUse; + modCount = Math.Min((float)selectedPeptide.AllModsOneIsNterminus.Keys.Count(), 10); + if (psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]?.Count() > 0) + { + absoluteFragmentMassError = (float)Math.Min(100.0, Math.Round(10.0 * Math.Abs(GetAverageFragmentMassError(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]) - FileSpecificMedianFragmentMassErrors[Path.GetFileName(psm.FullFilePath)]))); + } - return groupsOfIndicies; - } + ambiguity = Math.Min((float)(psm.BioPolymersWithSetModsToMatchingFragments.Keys.Count - 1), 10); + //ambiguity = 10; // I'm pretty sure that you shouldn't train on ambiguity and its skewing the results + longestSeq = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); + complementaryIonCount = (float)Math.Round(SpectralMatch.GetCountComplementaryIons(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); + isVariantPeptide = 
PeptideIsVariant(selectedPeptide); + spectralAngle = (float)psm.SpectralAngle; + if (chimeraCountDictionary.TryGetValue(psm.ChimeraIdString, out int val)) + chimeraCount = val; + peaksInPrecursorEnvelope = psm.PrecursorScanEnvelopePeakCount; + mostAbundantPrecursorPeakIntensity = (float)Math.Round((float)psm.PrecursorScanIntensity / normalizationFactor * multiplier, 0); + fractionalIntensity = (float)psm.PrecursorFractionalIntensity; - static void RandomizeListInPlace(List list) - { - Random rng = new Random(42); - int n = list.Count; - while (n > 1) + if (PsmHasSpectralAngle(psm)) + { + hasSpectralAngle = 1; + } + + if (psm.DigestionParams.Protease.Name != "top-down") + { + missedCleavages = selectedPeptide.MissedCleavages; + bool fileIsCzeSeparationType = FileSpecificParametersDictionary.ContainsKey(Path.GetFileName(psm.FullFilePath)) && FileSpecificParametersDictionary[Path.GetFileName(psm.FullFilePath)].SeparationType == "CZE"; + + if (!fileIsCzeSeparationType) + { + if (selectedPeptide.BaseSequence.Equals(selectedPeptide.FullSequence)) + { + hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified) * 10.0, 0); + } + else + { + hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified) * 10.0, 0); + } + } + else + { + hydrophobicityZscore = (float)Math.Round(GetMobilityZScore(psm, selectedPeptide) * 10.0, 0); + } + } + //this is not for actual crosslinks but for the byproducts of crosslink loop links, deadends, etc. 
+ if (psm is CrosslinkSpectralMatch) + { + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; + isDeadEnd = Convert.ToSingle((csm.CrossType == PsmCrossType.DeadEnd) || (csm.CrossType == PsmCrossType.DeadEndH2O) || (csm.CrossType == PsmCrossType.DeadEndNH2) || (csm.CrossType == PsmCrossType.DeadEndTris)); + isLoop = Convert.ToSingle(csm.CrossType == PsmCrossType.Loop); + } + } + else { - n--; - int k = rng.Next(n + 1); - T value = list[k]; - list[k] = list[n]; - list[n] = value; + CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; + PeptideWithSetModifications selectedAlphaPeptide = csm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); + PeptideWithSetModifications selectedBetaPeptide = csm.BetaPeptide?.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); + + float alphaNormalizationFactor = selectedAlphaPeptide.BaseSequence.Length; + float betaNormalizationFactor = selectedBetaPeptide == null ? 
(float)0 : selectedBetaPeptide.BaseSequence.Length; + float totalNormalizationFactor = alphaNormalizationFactor + betaNormalizationFactor; + + totalMatchingFragmentCount = (float)Math.Round(csm.XLTotalScore / totalNormalizationFactor * 10, 0); + + //Compute fragment mass error + int alphaCount = 0; + float alphaError = 0; + if (csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide]?.Count > 0) + { + alphaCount = csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide].Count; + alphaError = Math.Abs(GetAverageFragmentMassError(csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide])); + } + int betaCount = 0; + float betaError = 0; + if (selectedBetaPeptide != null && csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide]?.Count > 0) + { + betaCount = csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide].Count; + betaError = Math.Abs(GetAverageFragmentMassError(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide])); + } + + float averageError = 0; + if ((alphaCount + betaCount) > 0) + { + averageError = (alphaCount * alphaError + betaCount * betaError) / (alphaCount + betaCount); + } + + absoluteFragmentMassError = (float)Math.Min(100, Math.Round(averageError - FileSpecificMedianFragmentMassErrors[Path.GetFileName(csm.FullFilePath)] * 10.0, 0)); + //End compute fragment mass error + + deltaScore = (float)Math.Round(csm.DeltaScore / totalNormalizationFactor * 10.0, 0); + chargeDifference = -Math.Abs(ChargeStateMode - psm.ScanPrecursorCharge); + alphaIntensity = (float)Math.Min(100, Math.Round((csm.Score - (int)csm.Score) / alphaNormalizationFactor * 100.0, 0)); + betaIntensity = csm.BetaPeptide == null ? 
(float)0 : (float)Math.Min(100.0, Math.Round((csm.BetaPeptide.Score - (int)csm.BetaPeptide.Score) / betaNormalizationFactor * 100.0, 0)); + longestFragmentIonSeries_Alpha = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(csm.BioPolymersWithSetModsToMatchingFragments, selectedAlphaPeptide) / alphaNormalizationFactor * 10.0, 0); + longestFragmentIonSeries_Beta = selectedBetaPeptide == null ? (float)0 : SpectralMatch.GetLongestIonSeriesBidirectional(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments, selectedBetaPeptide) / betaNormalizationFactor; + longestFragmentIonSeries_Beta = (float)Math.Round(longestFragmentIonSeries_Beta * 10.0, 0); + isInter = Convert.ToSingle(csm.CrossType == PsmCrossType.Inter); + isIntra = Convert.ToSingle(csm.CrossType == PsmCrossType.Intra); } - } + psm.PsmData_forPEPandPercolator = new PsmData + { + TotalMatchingFragmentCount = totalMatchingFragmentCount, + Intensity = intensity, + PrecursorChargeDiffToMode = chargeDifference, + DeltaScore = deltaScore, + Notch = notch, + ModsCount = modCount, + AbsoluteAverageFragmentMassErrorFromMedian = absoluteFragmentMassError, + MissedCleavagesCount = missedCleavages, + Ambiguity = ambiguity, + LongestFragmentIonSeries = longestSeq, + ComplementaryIonCount = complementaryIonCount, + HydrophobicityZScore = hydrophobicityZscore, + IsVariantPeptide = Convert.ToSingle(isVariantPeptide), + + AlphaIntensity = alphaIntensity, + BetaIntensity = betaIntensity, + LongestFragmentIonSeries_Alpha = longestFragmentIonSeries_Alpha, + LongestFragmentIonSeries_Beta = longestFragmentIonSeries_Beta, + IsDeadEnd = isDeadEnd, + IsLoop = isLoop, + IsInter = isInter, + IsIntra = isIntra, - static List> DivideListIntoGroups(List list, int n) - { - var groups = new List>(); - int groupSize = (int)Math.Ceiling(list.Count / (double)n); + Label = label, - for (int i = 0; i < n; i++) - { - groups.Add(list.Skip(i * groupSize).Take(groupSize).ToList()); - } + SpectralAngle = spectralAngle, + 
HasSpectralAngle = hasSpectralAngle, + PeaksInPrecursorEnvelope = peaksInPrecursorEnvelope, + ChimeraCount = chimeraCount, + MostAbundantPrecursorPeakIntensity = mostAbundantPrecursorPeakIntensity, + PrecursorFractionalIntensity = fractionalIntensity, + InternalIonCount = internalMatchingFragmentCount, + }; - return groups; + return psm.PsmData_forPEPandPercolator; } public static void RemoveBestMatchingPeptidesWithLowPEP(SpectralMatch psm, List indiciesOfPeptidesToRemove, List notches, List pwsmList, List pepValuePredictions, ref int ambiguousPeptidesRemovedCount) @@ -384,8 +641,6 @@ public static void RemoveBestMatchingPeptidesWithLowPEP(SpectralMatch psm, List< psm.RemoveThisAmbiguousPeptide(notches[i], pwsmList[i]); ambiguousPeptidesRemovedCount++; } - psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max(); - psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max(); } /// @@ -409,22 +664,24 @@ public static void GetIndiciesOfPeptidesToRemove(List indiciesOfPeptidesToR } } + #region Dictionary Builder Functions and Utilities + /// /// Here we're getting the most common charge state for precursors that are Targets with q<=0.01. 
- public static int GetChargeStateMode(List psms) + public int GetChargeStateMode(List psms) { - return psms.Where(p => p.IsDecoy != true && p.FdrInfo.QValue <= 0.01).Select(p => p.ScanPrecursorCharge).GroupBy(n => n).OrderByDescending(g => g.Count()).Select(g => g.Key).FirstOrDefault(); + return psms.Where(p => p.IsDecoy != true && p.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= 0.01).Select(p => p.ScanPrecursorCharge).GroupBy(n => n).OrderByDescending(g => g.Count()).Select(g => g.Key).FirstOrDefault(); } - public static Dictionary>> ComputeHydrophobicityValues(List psms, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, bool computeHydrophobicitiesforModifiedPeptides) + public Dictionary>> ComputeHydrophobicityValues(List psms, bool computeHydrophobicitiesforModifiedPeptides) { SSRCalc3 calc = new SSRCalc3("SSRCalc 3.0 (300A)", SSRCalc3.Column.A300); //TODO change the tuple so the values have names Dictionary>> rtHydrophobicityAvgDev = new Dictionary>>(); - List filenames = fileSpecificParameters.Where(s => s.fileSpecificParameters.SeparationType == "HPLC").Select(s => Path.GetFileName(s.fileName)).ToList(); + List filenames = FileSpecificParametersDictionary.Select(kvp => Path.GetFileName(kvp.Key)).ToList(); filenames = filenames.Distinct().ToList(); @@ -522,11 +779,11 @@ public static Dictionary>> Compute return rtHydrophobicityAvgDev; } - public static Dictionary>> ComputeMobilityValues(List psms, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters) + public Dictionary>> ComputeMobilityValues(List psms) { Dictionary>> rtMobilityAvgDev = new Dictionary>>(); - List filenames = fileSpecificParameters.Where(s => s.fileSpecificParameters.SeparationType == "CZE").Select(s => Path.GetFileName(s.fileName)).ToList(); + List filenames = FileSpecificParametersDictionary.Select(kvp => Path.GetFileName(kvp.Key)).ToList(); filenames = filenames.Distinct().ToList(); @@ -659,18 +916,18 @@ 
private static float GetSSRCalcHydrophobicityZScore(SpectralMatch psm, IBioPolym return (float)hydrophobicityZscore; } - private static float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide) + private float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide) { double mobilityZScore = double.NaN; - if (fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE.ContainsKey(Path.GetFileName(psm.FullFilePath))) + if (FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE.ContainsKey(Path.GetFileName(psm.FullFilePath))) { int time = (int)(2 * Math.Round(psm.ScanRetentionTime / 2d, 0)); - if (fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)].Keys.Contains(time)) + if (FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)].Keys.Contains(time)) { double predictedMobility = 100.0 * GetCifuentesMobility(selectedPeptide); - mobilityZScore = Math.Abs(fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item1 - predictedMobility) / fileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item2; + mobilityZScore = Math.Abs(FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item1 - predictedMobility) / FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE[Path.GetFileName(psm.FullFilePath)][time].Item2; } } @@ -683,287 +940,11 @@ private static float GetMobilityZScore(SpectralMatch psm, IBioPolymerWithSetMods return (float)mobilityZScore; } - public static IEnumerable CreatePsmData(string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, - List psms, List psmIndicies, - Dictionary>> timeDependantHydrophobicityAverageAndDeviation_unmodified, - Dictionary>> timeDependantHydrophobicityAverageAndDeviation_modified, - 
Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode) - { - object psmDataListLock = new object(); - List psmDataList = new List(); - List psmOrder = new List(); - int maxThreads = fileSpecificParameters.FirstOrDefault().fileSpecificParameters.MaxThreadsToUsePerFile; - int[] threads = Enumerable.Range(0, maxThreads).ToArray(); - - Parallel.ForEach(Partitioner.Create(0, psmIndicies.Count), - new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, - (range, loopState) => - { - List localPsmDataList = new List(); - List localPsmOrder = new List(); - for (int i = range.Item1; i < range.Item2; i++) - { - SpectralMatch psm = psms[psmIndicies[i]]; - - // Stop loop if canceled - if (GlobalVariables.StopLoops) { return; } - - PsmData newPsmData = new PsmData(); - if (searchType == "crosslink") - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psms[i]; - - bool label; - if (csm.IsDecoy || csm.BetaPeptide.IsDecoy) - { - label = false; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); - } - else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) - { - label = true; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); - } - else - { - continue; - } - localPsmDataList.Add(newPsmData); - localPsmOrder.Add(i); - } - else - { - double bmp = 0; - foreach (var (notch, peptideWithSetMods) in psm.BestMatchingBioPolymersWithSetMods) - { - bool label; - double bmpc = 
psm.BestMatchingBioPolymersWithSetMods.Count(); - if (peptideWithSetMods.Parent.IsDecoy) - { - label = false; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); - } - else if (!peptideWithSetMods.Parent.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) - { - label = true; - newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); - } - else - { - continue; - } - localPsmDataList.Add(newPsmData); - localPsmOrder.Add(i + (bmp / bmpc / 2.0)); - bmp += 1.0; - } - } - } - lock (psmDataListLock) - { - psmDataList.AddRange(localPsmDataList); - psmOrder.AddRange(localPsmOrder); - } - }); - PsmData[] pda = psmDataList.ToArray(); - double[] order = psmOrder.ToArray(); - - Array.Sort(order, pda);//this sorts both arrays thru sorting the array in position one. The order array, keeps track of the positon in the original psms list and returns the PsmData array in that same order. 
- - return pda.AsEnumerable(); - } - - public static PsmData CreateOnePsmDataEntry(string searchType, List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, SpectralMatch psm, Dictionary>> timeDependantHydrophobicityAverageAndDeviation_unmodified, Dictionary>> timeDependantHydrophobicityAverageAndDeviation_modified, Dictionary fileSpecificMedianFragmentMassErrors, int chargeStateMode, IBioPolymerWithSetMods selectedPeptide, int notchToUse, bool label) - { - double normalizationFactor = selectedPeptide.BaseSequence.Length; - float totalMatchingFragmentCount = 0; - float internalMatchingFragmentCount = 0; - float intensity = 0; - float chargeDifference = 0; - float deltaScore = 0; - int notch = 0; - float ambiguity = 0; - float modCount = 0; - float absoluteFragmentMassError = 0; - float spectralAngle = 0; - float hasSpectralAngle = 0; - float chimeraCount = 0; - float peaksInPrecursorEnvelope = 0; - float mostAbundantPrecursorPeakIntensity = 0; - float fractionalIntensity = 0; - - float missedCleavages = 0; - float longestSeq = 0; - float complementaryIonCount = 0; - float hydrophobicityZscore = float.NaN; - bool isVariantPeptide = false; - - //crosslink specific features - float alphaIntensity = 0; - float betaIntensity = 0; - float longestFragmentIonSeries_Alpha = 0; - float longestFragmentIonSeries_Beta = 0; - float isDeadEnd = 0; - float isLoop = 0; - float isInter = 0; - float isIntra = 0; - - double multiplier = 10; - if (searchType != "crosslink") - { - if (searchType == "top-down") - { - normalizationFactor = 1.0; - } - // count only terminal fragment ions - totalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType == null) / normalizationFactor * multiplier, 0)); - internalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => 
p.NeutralTheoreticalProduct.SecondaryProductType != null) / normalizationFactor * multiplier, 0)); - intensity = (float)Math.Min(50, Math.Round((psm.Score - (int)psm.Score) / normalizationFactor * Math.Pow(multiplier, 2), 0)); - chargeDifference = -Math.Abs(chargeStateMode - psm.ScanPrecursorCharge); - deltaScore = (float)Math.Round(psm.DeltaScore / normalizationFactor * multiplier, 0); - notch = notchToUse; - modCount = Math.Min((float)selectedPeptide.AllModsOneIsNterminus.Keys.Count(), 10); - if (psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]?.Count() > 0) - { - absoluteFragmentMassError = (float)Math.Min(100.0, Math.Round(10.0 * Math.Abs(GetAverageFragmentMassError(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]) - fileSpecificMedianFragmentMassErrors[Path.GetFileName(psm.FullFilePath)]))); - } - - ambiguity = Math.Min((float)(psm.BioPolymersWithSetModsToMatchingFragments.Keys.Count - 1), 10); - longestSeq = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); - complementaryIonCount = (float)Math.Round(SpectralMatch.GetCountComplementaryIons(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0); - isVariantPeptide = PeptideIsVariant(selectedPeptide); - spectralAngle = (float)psm.SpectralAngle; - if (chimeraCountDictionary.TryGetValue(psm.ChimeraIdString, out int val)) - chimeraCount = val; - peaksInPrecursorEnvelope = psm.PrecursorScanEnvelopePeakCount; - mostAbundantPrecursorPeakIntensity = (float)Math.Round((float)psm.PrecursorScanIntensity / normalizationFactor * multiplier, 0); - fractionalIntensity = (float)psm.PrecursorFractionalIntensity; - - if (PsmHasSpectralAngle(psm)) - { - hasSpectralAngle = 1; - } - - if (psm.DigestionParams.Protease.Name != "top-down") - { - missedCleavages = selectedPeptide.MissedCleavages; - bool fileIsCzeSeparationType = 
fileSpecificParameters.Any(p => Path.GetFileName(p.fileName) == Path.GetFileName(psm.FullFilePath) && p.fileSpecificParameters.SeparationType == "CZE"); - - if (!fileIsCzeSeparationType) - { - if (selectedPeptide.BaseSequence.Equals(selectedPeptide.FullSequence)) - { - hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, timeDependantHydrophobicityAverageAndDeviation_unmodified) * 10.0, 0); - } - else - { - hydrophobicityZscore = (float)Math.Round(GetSSRCalcHydrophobicityZScore(psm, selectedPeptide, timeDependantHydrophobicityAverageAndDeviation_modified) * 10.0, 0); - } - } - else - { - hydrophobicityZscore = (float)Math.Round(GetMobilityZScore(psm, selectedPeptide) * 10.0, 0); - } - } - //this is not for actual crosslinks but for the byproducts of crosslink loop links, deadends, etc. - if (psm is CrosslinkSpectralMatch) - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - isDeadEnd = Convert.ToSingle((csm.CrossType == PsmCrossType.DeadEnd) || (csm.CrossType == PsmCrossType.DeadEndH2O) || (csm.CrossType == PsmCrossType.DeadEndNH2) || (csm.CrossType == PsmCrossType.DeadEndTris)); - isLoop = Convert.ToSingle(csm.CrossType == PsmCrossType.Loop); - } - } - else - { - CrosslinkSpectralMatch csm = (CrosslinkSpectralMatch)psm; - PeptideWithSetModifications selectedAlphaPeptide = csm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); - PeptideWithSetModifications selectedBetaPeptide = csm.BetaPeptide?.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide as PeptideWithSetModifications).First(); - - float alphaNormalizationFactor = selectedAlphaPeptide.BaseSequence.Length; - float betaNormalizationFactor = selectedBetaPeptide == null ? 
(float)0 : selectedBetaPeptide.BaseSequence.Length; - float totalNormalizationFactor = alphaNormalizationFactor + betaNormalizationFactor; - - totalMatchingFragmentCount = (float)Math.Round(csm.XLTotalScore / totalNormalizationFactor * 10, 0); - - //Compute fragment mass error - int alphaCount = 0; - float alphaError = 0; - if (csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide]?.Count > 0) - { - alphaCount = csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide].Count; - alphaError = Math.Abs(GetAverageFragmentMassError(csm.BioPolymersWithSetModsToMatchingFragments[selectedAlphaPeptide])); - } - int betaCount = 0; - float betaError = 0; - if (csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide]?.Count > 0) - { - betaCount = csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide].Count; - betaError = Math.Abs(GetAverageFragmentMassError(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments[selectedBetaPeptide])); - } - - float averageError = 0; - if ((alphaCount + betaCount) > 0) - { - averageError = (alphaCount * alphaError + betaCount * betaError) / (alphaCount + betaCount); - } - - absoluteFragmentMassError = (float)Math.Min(100, Math.Round(averageError - fileSpecificMedianFragmentMassErrors[Path.GetFileName(csm.FullFilePath)] * 10.0, 0)); - //End compute fragment mass error - - deltaScore = (float)Math.Round(csm.DeltaScore / totalNormalizationFactor * 10.0, 0); - chargeDifference = -Math.Abs(chargeStateMode - psm.ScanPrecursorCharge); - alphaIntensity = (float)Math.Min(100, Math.Round((csm.Score - (int)csm.Score) / alphaNormalizationFactor * 100.0, 0)); - betaIntensity = csm.BetaPeptide == null ? 
(float)0 : (float)Math.Min(100.0, Math.Round((csm.BetaPeptide.Score - (int)csm.BetaPeptide.Score) / betaNormalizationFactor * 100.0, 0)); - longestFragmentIonSeries_Alpha = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(csm.BioPolymersWithSetModsToMatchingFragments, selectedAlphaPeptide) / alphaNormalizationFactor * 10.0, 0); - longestFragmentIonSeries_Beta = selectedBetaPeptide == null ? (float)0 : SpectralMatch.GetLongestIonSeriesBidirectional(csm.BetaPeptide.BioPolymersWithSetModsToMatchingFragments, selectedBetaPeptide) / betaNormalizationFactor; - longestFragmentIonSeries_Beta = (float)Math.Round(longestFragmentIonSeries_Beta * 10.0, 0); - isInter = Convert.ToSingle(csm.CrossType == PsmCrossType.Inter); - isIntra = Convert.ToSingle(csm.CrossType == PsmCrossType.Intra); - } - - psm.PsmData_forPEPandPercolator = new PsmData - { - TotalMatchingFragmentCount = totalMatchingFragmentCount, - Intensity = intensity, - PrecursorChargeDiffToMode = chargeDifference, - DeltaScore = deltaScore, - Notch = notch, - ModsCount = modCount, - AbsoluteAverageFragmentMassErrorFromMedian = absoluteFragmentMassError, - MissedCleavagesCount = missedCleavages, - Ambiguity = ambiguity, - LongestFragmentIonSeries = longestSeq, - ComplementaryIonCount = complementaryIonCount, - HydrophobicityZScore = hydrophobicityZscore, - IsVariantPeptide = Convert.ToSingle(isVariantPeptide), - - AlphaIntensity = alphaIntensity, - BetaIntensity = betaIntensity, - LongestFragmentIonSeries_Alpha = longestFragmentIonSeries_Alpha, - LongestFragmentIonSeries_Beta = longestFragmentIonSeries_Beta, - IsDeadEnd = isDeadEnd, - IsLoop = isLoop, - IsInter = isInter, - IsIntra = isIntra, - - Label = label, - - SpectralAngle = spectralAngle, - HasSpectralAngle = hasSpectralAngle, - PeaksInPrecursorEnvelope = peaksInPrecursorEnvelope, - ChimeraCount = chimeraCount, - MostAbundantPrecursorPeakIntensity = mostAbundantPrecursorPeakIntensity, - PrecursorFractionalIntensity = fractionalIntensity, - 
InternalIonCount = internalMatchingFragmentCount, - }; - - return psm.PsmData_forPEPandPercolator; - } - private static bool PeptideIsVariant(IBioPolymerWithSetMods bpwsm) { - if (bpwsm is not PeptideWithSetModifications pwsm) + if (bpwsm is not PeptideWithSetModifications pwsm) return false; - + bool identifiedVariant = false; if (pwsm.Protein.AppliedSequenceVariations.Count() > 0) { @@ -984,7 +965,7 @@ private static bool PsmHasSpectralAngle(SpectralMatch psm) return psm.SpectralAngle >= 0; } - public static bool ContainsModificationsThatShiftMobility(IEnumerable modifications) + public static bool ContainsModificationsThatShiftMobility(IEnumerable modifications) { List shiftingModifications = new List { "Acetylation", "Ammonia loss", "Carbamyl", "Deamidation", "Formylation", "N2-acetylarginine", "N6-acetyllysine", "N-acetylalanine", "N-acetylaspartate", "N-acetylcysteine", "N-acetylglutamate", "N-acetylglycine", @@ -1053,5 +1034,7 @@ public static float GetAverageFragmentMassError(IEnumerable return massErrors.Average(); } + + #endregion } } \ No newline at end of file diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs new file mode 100644 index 000000000..b88faa9d1 --- /dev/null +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs @@ -0,0 +1,70 @@ +using Omics; +using Proteomics.ProteolyticDigestion; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace EngineLayer +{ + public class PeptideMatchGroup : IEnumerable + { + public string PeptideFullSequence { get; } + public List SpectralMatches { get; } + + /// + /// This class groups all spectral matches associated with a given peptide together, + /// to facilitate the calculation of PEP values. 
+ /// + /// The full sequence to be used for grouping + /// Every spectral match that matches the full sequence + public PeptideMatchGroup(string fullPeptideSeq, List spectralMatches) + { + PeptideFullSequence = fullPeptideSeq; + SpectralMatches = spectralMatches; + } + + public static List GroupByBaseSequence(List spectralMatches) + { + // This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training. + + // TODO: Determine if it's better to group PSMs by base sequence or by full sequence. + return spectralMatches.GroupBy(p => p.BaseSequence) + .Select(group => new PeptideMatchGroup(group.Key, group.ToList())) + .OrderByDescending(matchGroup => matchGroup.Count()) + .ThenByDescending(matchGroup => matchGroup.BestMatch.Score) + .ToList(); + } + + public IEnumerable GetBestMatchByMod() + { + return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p)); + } + + /// + /// This function is called if there aren't enough peptides to train at the peptide level + /// + /// + /// + public static List GroupByIndividualPsm(List spectralMatches) + { + return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List { psm })) + .ToList(); + } + + public SpectralMatch BestMatch => SpectralMatches.MaxBy(match => match); + + public IEnumerator GetEnumerator() + { + return SpectralMatches.GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + } +} \ No newline at end of file diff --git a/MetaMorpheus/EngineLayer/SpectralMatch.cs b/MetaMorpheus/EngineLayer/SpectralMatch.cs index 96ef0f644..299f13b00 100644 --- a/MetaMorpheus/EngineLayer/SpectralMatch.cs +++ b/MetaMorpheus/EngineLayer/SpectralMatch.cs @@ -197,7 +197,7 @@ public void ResolveAllAmbiguities() ModsChemicalFormula = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Pwsm.AllModsOneIsNterminus.Select(c => (c.Value)))).ResolvedValue; 
Notch = PsmTsvWriter.Resolve(_BestMatchingBioPolymersWithSetMods.Select(b => b.Notch)).ResolvedValue; - // if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy + //if the PSM matches a target and a decoy and they are the SAME SEQUENCE, remove the decoy if (IsDecoy) { bool removedPeptides = false; diff --git a/MetaMorpheus/TaskLayer/FilteredPsms.cs b/MetaMorpheus/TaskLayer/FilteredPsms.cs index 869a477a5..81e80c0fa 100644 --- a/MetaMorpheus/TaskLayer/FilteredPsms.cs +++ b/MetaMorpheus/TaskLayer/FilteredPsms.cs @@ -8,6 +8,12 @@ namespace TaskLayer { + public enum FilterType + { + QValue, + PepQValue + } + /// /// Contains a filtered list of PSMs. /// All properties within this class are read-only, and should only be set on object construction @@ -18,11 +24,11 @@ public class FilteredPsms : IEnumerable /// /// Filter type can have only two values: "q-value" or "pep q-value" /// - public string FilterType { get; init; } + public FilterType FilterType { get; init; } public double FilterThreshold { get; init; } public bool FilteringNotPerformed { get; init; } public bool PeptideLevelFiltering { get; init; } - public FilteredPsms(List filteredPsms, string filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering) + public FilteredPsms(List filteredPsms, FilterType filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering) { FilteredPsmsList = filteredPsms; FilterType = filterType; @@ -37,13 +43,18 @@ private bool AboveThreshold(SpectralMatch psm) switch (FilterType) { - case "pep q-value": + case FilterType.PepQValue: return psm.GetFdrInfo(PeptideLevelFiltering).PEP_QValue <= FilterThreshold; default: return psm.GetFdrInfo(PeptideLevelFiltering).QValue <= FilterThreshold && psm.GetFdrInfo(PeptideLevelFiltering).QValueNotch <= FilterThreshold; } } + public string GetFilterTypeString() + { + return FilterType == FilterType.PepQValue ? 
"pep q-value" : "q-value"; + } + /// /// This method should only be called when filtered PSMs are modified for the purpose of SILAC analysis /// @@ -87,7 +98,7 @@ public static FilteredPsms Filter(IEnumerable psms, List filteredPsms = new List(); // set the filter type - string filterType = "q-value"; + FilterType filterType = FilterType.QValue; if (pepQValueThreshold < qValueThreshold) { if (psms.Count() < 100) @@ -97,13 +108,13 @@ public static FilteredPsms Filter(IEnumerable psms, } else { - filterType = "pep q-value"; + filterType = FilterType.PepQValue; } } if (!includeHighQValuePsms) { - filteredPsms = filterType.Equals("q-value") + filteredPsms = filterType.Equals(FilterType.QValue) ? psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null && p.GetFdrInfo(filterAtPeptideLevel).QValue <= filterThreshold && p.GetFdrInfo(filterAtPeptideLevel).QValueNotch <= filterThreshold).ToList() diff --git a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs index a963eefbf..eae34b3b8 100644 --- a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs +++ b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs @@ -119,9 +119,8 @@ public static SpectralRecoveryResults RunSpectralRecoveryAlgorithm( List allPsms = parameters.AllPsms. OrderByDescending(p => p).ToList(); - AssignEstimatedPsmQvalue(bestMbrMatches, allPsms); FDRAnalysisOfMbrPsms(bestMbrMatches, allPsms, parameters, fileSpecificParameters); - AssignEstimatedPsmPepQValue(bestMbrMatches, allPsms); + foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) match.FindOriginalPsm(allPsms); } @@ -208,70 +207,10 @@ private static void FDRAnalysisOfMbrPsms(ConcurrentDictionary p.Value.spectralLibraryMatch). Where(v => v != null). 
ToList(); - List[] psmGroupIndices = PEP_Analysis_Cross_Validation.Get_PSM_Group_Indices(psms, 1); - MLContext mlContext = new MLContext(); - IEnumerable[] PSMDataGroups = new IEnumerable[1]; - - string searchType = "standard"; - if (psms[0].DigestionParams.Protease.Name == "top-down") - { - searchType = "top-down"; - } - - int chargeStateMode = PEP_Analysis_Cross_Validation.GetChargeStateMode(allPsms); - - Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, false); - Dictionary>> fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(allPsms, fileSpecificParameters, true); - PEP_Analysis_Cross_Validation.ComputeMobilityValues(allPsms, fileSpecificParameters); - - Dictionary fileSpecificMedianFragmentMassErrors = PEP_Analysis_Cross_Validation.GetFileSpecificMedianFragmentMassError(allPsms); - - PSMDataGroups[0] = PEP_Analysis_Cross_Validation.CreatePsmData(searchType, fileSpecificParameters, psms, psmGroupIndices[0], fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode); - string[] trainingVariables = PsmData.trainingInfos[searchType]; - - TransformerChain>>[] trainedModels = new TransformerChain>>[1]; - - var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfTrees: 400); - var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables).Append(trainer); - - IDataView dataView = mlContext.Data.LoadFromEnumerable(PSMDataGroups[0]); - - string outputFolder = parameters.OutputFolder; - - trainedModels[0] = pipeline.Fit(dataView); - - PEP_Analysis_Cross_Validation.Compute_PSM_PEP(psms, psmGroupIndices[0], mlContext, trainedModels[0], searchType, 
fileSpecificParameters, fileSpecificMedianFragmentMassErrors, chargeStateMode, outputFolder); - } + new FdrAnalysisEngine(psms, parameters.NumNotches, fileSpecificParameters.First().Item2, fileSpecificParameters, + new List { parameters.SearchTaskId }, analysisType: "PSM", doPEP: true, outputFolder: parameters.OutputFolder).Run(); - private static void AssignEstimatedPsmPepQValue(ConcurrentDictionary bestMbrMatches, List allPsms) - { - List pepValues = bestMbrMatches. - Select(p => p.Value.spectralLibraryMatch). - Where(p => p != null). - OrderBy(p => p.FdrInfo.PEP). - Select(p => p.FdrInfo.PEP). - ToList(); - - foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) - { - if (match.spectralLibraryMatch == null) continue; - - int myIndex = 0; - while (myIndex < (pepValues.Count - 1) && pepValues[myIndex] <= match.spectralLibraryMatch.FdrInfo.PEP) - { - myIndex++; - } - if (myIndex == pepValues.Count - 1) - { - match.spectralLibraryMatch.FdrInfo.PEP_QValue = pepValues.Last(); - } - else - { - double estimatedQ = (pepValues[myIndex - 1] + pepValues[myIndex]) / 2; - match.spectralLibraryMatch.FdrInfo.PEP_QValue = estimatedQ; - } - } } private static void WriteSpectralRecoveryPsmResults(ConcurrentDictionary bestMbrMatches, PostSearchAnalysisParameters parameters) diff --git a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs index 0a0f89e35..7def49d61 100644 --- a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs +++ b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs @@ -622,6 +622,46 @@ protected List LoadProteins(string taskId, List dbFilenameLi { Warn("Warning: " + emptyProteinEntries + " empty protein entries ignored"); } + + + + if (!proteinList.Any(p => p.IsDecoy)) + { + Status("Done loading proteins", new List { taskId }); + return proteinList; + } + + // Sanitize the decoys + // TODO: Fix this so that it accounts for multi-protease searches. 
Currently, we only consider the first protease + // when looking for target/decoy collisions + + HashSet targetPeptideSequences = new(); + foreach(var protein in proteinList.Where(p => !p.IsDecoy)) + { + // When thinking about decoy collisions, we can ignore modifications + foreach(var peptide in protein.Digest(commonParameters.DigestionParams, new List(), new List())) + { + targetPeptideSequences.Add(peptide.BaseSequence); + } + } + // Now, we iterate through the decoys and scramble the sequences that correspond to target peptides + for(int i = 0; i < proteinList.Count; i++) + { + if(proteinList[i].IsDecoy) + { + var peptidesToReplace = proteinList[i] + .Digest(commonParameters.DigestionParams, new List(), new List()) + .Select(p => p.BaseSequence) + .Where(targetPeptideSequences.Contains) + .ToList(); + if(peptidesToReplace.Any()) + { + proteinList[i] = Protein.ScrambleDecoyProteinSequence(proteinList[i], commonParameters.DigestionParams, forbiddenSequences: targetPeptideSequences, peptidesToReplace); + } + } + } + + Status("Done loading proteins", new List { taskId }); return proteinList; } diff --git a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index 41cb7692a..69e73a402 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -620,7 +620,7 @@ private void WritePsmResults() "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." 
+ Environment.NewLine); } - string psmResultsText = "All target PSMs with " + psmsForPsmResults.FilterType + " <= " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + + string psmResultsText = "All target PSMs with " + psmsForPsmResults.GetFilterTypeString() + " <= " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + psmsForPsmResults.TargetPsmsAboveThreshold; ResultsDictionary[("All", "PSMs")] = psmResultsText; } @@ -647,7 +647,7 @@ private void WritePeptideResults() Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } - string peptideResultsText = $"All target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesForPeptideResults.FilterType + " <= " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + + string peptideResultsText = $"All target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesForPeptideResults.GetFilterTypeString() + " <= " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + peptidesForPeptideResults.TargetPsmsAboveThreshold; ResultsDictionary[("All", GlobalVariables.AnalyteType)] = peptideResultsText; } @@ -684,7 +684,7 @@ private void WriteIndividualPsmResults() FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string psmResultsText = strippedFileName + " - Target PSMs with " + psmsToWrite.FilterType + " <= " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + + string psmResultsText = strippedFileName + " - Target PSMs with " + psmsToWrite.GetFilterTypeString() + " <= " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + psmsToWrite.TargetPsmsAboveThreshold; ResultsDictionary[(strippedFileName, "PSMs")] = psmResultsText; } @@ -720,7 +720,7 @@ private void WriteIndividualPeptideResults() FinishedWritingFile(writtenFile, new List { 
Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); // write summary text - string peptideResultsText = strippedFileName + $" - Target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesToWrite.FilterType + " <= " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + + string peptideResultsText = strippedFileName + $" - Target {GlobalVariables.AnalyteType.ToLower()}s with " + peptidesToWrite.GetFilterTypeString() + " <= " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + peptidesToWrite.TargetPsmsAboveThreshold; ResultsDictionary[(strippedFileName, GlobalVariables.AnalyteType)] = peptideResultsText; } @@ -746,7 +746,6 @@ private void UpdateSpectralLibrary() // Value is the highest scoring psm in the group elementSelector: g => g.MaxBy(p => p.Score)); - //load the original library var originalLibrarySpectra = Parameters.SpectralLibrary.GetAllLibrarySpectra(); List updatedLibrarySpectra = new(); diff --git a/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs b/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs index 5fb800513..6329d27a3 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/SearchTask.cs @@ -199,7 +199,7 @@ protected override MyTaskResults RunSpecific(string OutputFolder, List { taskId } ); Status("Searching files...", new List { taskId, "Individual Spectra Files" }); Dictionary numMs2SpectraPerFile = new Dictionary(); diff --git a/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs index 699b4fe67..a907b3f5d 100644 --- a/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs +++ b/MetaMorpheus/Test/EverythingRunnerEngineTestCase.cs @@ -160,8 +160,7 @@ static EverythingRunnerEngineTestCase() myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\Task2-SearchTaskconfig.toml"); searchTaskLoaded = Toml.ReadFile(myTomlPath, MetaMorpheusTask.tomlConfig); - // TODO: Uncomment this line and change values 
for PR 2394 - //searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; + searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; _cases.Add(EverythingRunnerEngineTestCases.BottomUpPepQValue, new EverythingRunnerEngineTestCase(EverythingRunnerEngineTestCases.BottomUpPepQValue, new List<(string, MetaMorpheusTask)> { ("postSearchAnalysisTaskTestOutput", searchTaskLoaded) }, diff --git a/MetaMorpheus/Test/FdrTest.cs b/MetaMorpheus/Test/FdrTest.cs index 645d7dd60..73bb1af82 100644 --- a/MetaMorpheus/Test/FdrTest.cs +++ b/MetaMorpheus/Test/FdrTest.cs @@ -19,6 +19,11 @@ using TaskLayer; using UsefulProteomicsDatabases; using Omics; +using Org.BouncyCastle.Utilities.Collections; +using OxyPlot; +using static iText.Svg.SvgConstants; +using System.Reflection; +using UsefulProteomicsDatabases.Generated; namespace Test { @@ -32,18 +37,18 @@ public static void TestSeeModsThatShiftMobility() Modification am = new Modification(_originalId: "Ammonia loss"); List real = new List { ac, am }; - Assert.IsTrue(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(real)); - Assert.AreEqual(2, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(real)); + Assert.IsTrue(PepAnalysisEngine.ContainsModificationsThatShiftMobility(real)); + Assert.AreEqual(2, PepAnalysisEngine.CountModificationsThatShiftMobility(real)); Modification fac = new Modification(_originalId: "fake Acetylation"); Modification fam = new Modification(_originalId: "fake Ammonia loss"); List fake = new List { fac, fam }; - Assert.IsFalse(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(fake)); - Assert.AreEqual(0, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(fake)); + Assert.IsFalse(PepAnalysisEngine.ContainsModificationsThatShiftMobility(fake)); + Assert.AreEqual(0, PepAnalysisEngine.CountModificationsThatShiftMobility(fake)); - 
Assert.IsTrue(PEP_Analysis_Cross_Validation.ContainsModificationsThatShiftMobility(real.Concat(fake))); - Assert.AreEqual(2, PEP_Analysis_Cross_Validation.CountModificationsThatShiftMobility(real.Concat(fake))); + Assert.IsTrue(PepAnalysisEngine.ContainsModificationsThatShiftMobility(real.Concat(fake))); + Assert.AreEqual(2, PepAnalysisEngine.CountModificationsThatShiftMobility(real.Concat(fake))); } [Test] @@ -178,6 +183,7 @@ public static void TestComputePEPValue() Dictionary sequenceToPsmCount = new Dictionary(); + List sequences = new List(); foreach (SpectralMatch psm in nonNullPsms) { @@ -212,7 +218,31 @@ public static void TestComputePEPValue() { Path.GetFileName(maxScorePsm.FullFilePath), 0 } }; - var maxPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, maxScorePsm, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, pwsm, notch, !pwsm.Parent.IsDecoy); + // Set values within PEP_Analysis through reflection + PepAnalysisEngine pepEngine = new PepAnalysisEngine(nonNullPsms, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch(p.Name) + { + case "FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified": + p.SetValue(pepEngine, fileSpecificRetTimeHI_behavior); + break; + case "FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified": + p.SetValue(pepEngine, fileSpecificRetTimeHI_behavior); + break; + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + var maxPsmData = pepEngine.CreateOnePsmDataEntry("standard", maxScorePsm, pwsm, notch, !pwsm.Parent.IsDecoy); Assert.That(maxScorePsm.BioPolymersWithSetModsToMatchingFragments.Count - 1, 
Is.EqualTo(maxPsmData.Ambiguity)); double normalizationFactor = (double)pwsm.BaseSequence.Length; float maxPsmDeltaScore = (float)Math.Round(maxScorePsm.DeltaScore / normalizationFactor * 10.0, 0); @@ -230,7 +260,7 @@ public static void TestComputePEPValue() List psmCopyForPEPFailure = nonNullPsms.ToList(); List psmCopyForNoOutputFolder = nonNullPsms.ToList(); - PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(nonNullPsms, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + pepEngine.ComputePEPValuesForAllPSMs(); int trueCount = 0; @@ -253,7 +283,9 @@ public static void TestComputePEPValue() } } - string metrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMs, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + + pepEngine = new PepAnalysisEngine(moreNonNullPSMs, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + string metrics = pepEngine.ComputePEPValuesForAllPSMs(); Assert.GreaterOrEqual(32, trueCount); //Test Variant Peptide as Input is identified as such as part of PEP calculation input much of the next several lines simply necessry to create a psm. 
@@ -286,18 +318,33 @@ public static void TestComputePEPValue() var (vnotch, vpwsm) = variantPSM.BestMatchingBioPolymersWithSetMods.First(); massError.Add(Path.GetFileName(variantPSM.FullFilePath), 0); - PsmData variantPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, variantPSM, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, vpwsm, vnotch, !maxScorePsm.IsDecoy); + + // edit the FileSpecificMedianFragmentMassErrors property of PEP_Analysis_Cross_Validation to include the mass error for the variant peptide file + pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + + PsmData variantPsmData = pepEngine.CreateOnePsmDataEntry("standard", variantPSM, vpwsm, vnotch, !maxScorePsm.IsDecoy); Assert.AreEqual((float)1, variantPsmData.IsVariantPeptide); //TEST CZE - fsp = new List<(string fileName, CommonParameters fileSpecificParameters)>(); var cp = new CommonParameters(separationType: "CZE"); fsp.Add((origDataFile, cp)); - PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForCZETest, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + trueCount = 0; foreach (var item in psmCopyForCZETest.Where(p => p != null)) @@ -318,18 +365,22 @@ public static void TestComputePEPValue() moreNonNullPSMsCZE.Add(psm); } } - metrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMsCZE, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + + pepEngine = new PepAnalysisEngine(moreNonNullPSMsCZE, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + metrics = pepEngine.ComputePEPValuesForAllPSMs(); Assert.GreaterOrEqual(32, trueCount); //TEST PEP 
calculation failure psmCopyForPEPFailure.RemoveAll(x => x.IsDecoy); - string result = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForPEPFailure, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + pepEngine = new PepAnalysisEngine(psmCopyForPEPFailure, "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + string result = pepEngine.ComputePEPValuesForAllPSMs(); Assert.AreEqual("Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples.", result); //Run PEP with no output folder; //There is no assertion here. We simply want to show that PEP calculation does not fail with null folder. string outputFolder = null; - string nullOutputFolderResults = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psmCopyForNoOutputFolder, "standard", fsp, outputFolder); + pepEngine = new PepAnalysisEngine(psmCopyForNoOutputFolder, "standard", fsp, outputFolder); + string nullOutputFolderResults = pepEngine.ComputePEPValuesForAllPSMs(); } [Test] @@ -404,7 +455,26 @@ public static void TestComputePEPValueTopDown() { { Path.GetFileName(maxScorePsm.FullFilePath), 0 } }; - var maxPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("top-down", fsp, maxScorePsm, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, massError, chargeStateMode, pwsm, notch, !pwsm.Parent.IsDecoy); + + // Set values within PEP_Analysis through reflection + PepAnalysisEngine pepEngine = new PepAnalysisEngine(nonNullPsms, "top-down", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + 
p.SetValue(pepEngine, massError); + break; + default: + break; + } + } + + var maxPsmData = pepEngine.CreateOnePsmDataEntry("top-down", maxScorePsm, pwsm, notch, !pwsm.Parent.IsDecoy); Assert.That(maxScorePsm.BioPolymersWithSetModsToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); double normalizationFactor = 1; float maxPsmDeltaScore = (float)Math.Round(maxScorePsm.DeltaScore / normalizationFactor * 10.0, 0); @@ -442,7 +512,7 @@ public static void TestPEP_peptideRemoval() List<(int notch, PeptideWithSetModifications pwsm)> bestMatchingPeptidesToRemove = new List<(int notch, PeptideWithSetModifications pwsm)>(); List pepValuePredictions = new List { 1.0d, 0.99d, 0.9d }; - PEP_Analysis_Cross_Validation.GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); + PepAnalysisEngine.GetIndiciesOfPeptidesToRemove(indiciesOfPeptidesToRemove, pepValuePredictions); Assert.AreEqual(1, indiciesOfPeptidesToRemove.Count); Assert.AreEqual(2, indiciesOfPeptidesToRemove.FirstOrDefault()); Assert.AreEqual(2, pepValuePredictions.Count); @@ -455,7 +525,7 @@ public static void TestPEP_peptideRemoval() peptides.Add(bmp.Peptide); } - PEP_Analysis_Cross_Validation.RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, notches, peptides, pepValuePredictions, ref ambiguousPeptidesRemovedCount); + PepAnalysisEngine.RemoveBestMatchingPeptidesWithLowPEP(psm, indiciesOfPeptidesToRemove, notches, peptides, pepValuePredictions, ref ambiguousPeptidesRemovedCount); Assert.AreEqual(1, ambiguousPeptidesRemovedCount); Assert.AreEqual(2, psm.BestMatchingBioPolymersWithSetMods.Select(b => b.Notch).ToList().Count); } @@ -472,13 +542,13 @@ public static void TestPEP_standardDeviationsToChange() averagesCommaStandardDeviations.Add(2, new Tuple(1.0d, 1.1d));//will NOT get removed becuase its perfectly fine averagesCommaStandardDeviations.Add(3, new Tuple(1.0d, 10.0d));//will get removed becuase its too big - 
PEP_Analysis_Cross_Validation.GetStDevsToChange(stDevsToChange, averagesCommaStandardDeviations, globalStDev); + PepAnalysisEngine.GetStDevsToChange(stDevsToChange, averagesCommaStandardDeviations, globalStDev); Assert.That(stDevsToChange.ContainsKey(0)); Assert.That(stDevsToChange.ContainsKey(1)); Assert.That(stDevsToChange.ContainsKey(3)); Assert.AreEqual(3, stDevsToChange.Keys.Count); - PEP_Analysis_Cross_Validation.UpdateOutOfRangeStDevsWithGlobalAverage(stDevsToChange, averagesCommaStandardDeviations); + PepAnalysisEngine.UpdateOutOfRangeStDevsWithGlobalAverage(stDevsToChange, averagesCommaStandardDeviations); Assert.AreEqual(1.0d, averagesCommaStandardDeviations[0].Item2); Assert.AreEqual(1.0d, averagesCommaStandardDeviations[1].Item2); diff --git a/MetaMorpheus/Test/PeptideSpectralMatchTest.cs b/MetaMorpheus/Test/PeptideSpectralMatchTest.cs index 9e6c9dcdf..a5f097ed5 100644 --- a/MetaMorpheus/Test/PeptideSpectralMatchTest.cs +++ b/MetaMorpheus/Test/PeptideSpectralMatchTest.cs @@ -33,11 +33,11 @@ public static void GetAminoAcidCoverageTest() int missedCleavages = 0; CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full; string peptideDescription = null; - string? 
pairedTargetDecoyHash = null; + string pairedTargetDecoySequence = null; PeptideWithSetModifications pwsmNoBaseSequence = new(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, - peptideDescription, pairedTargetDecoyHash); + peptideDescription, pairedTargetDecoySequence); PeptideSpectralMatch psmNoBaseSequenceNoMFI = new(pwsmNoBaseSequence, 0, 10, 0, ms2ScanOneMzTen, commonParams, new List()); psmNoBaseSequenceNoMFI.ResolveAllAmbiguities(); @@ -52,9 +52,10 @@ public static void GetAminoAcidCoverageTest() sequence = "PEPTIDE"; oneBasedEndResidueInProtein = Math.Max(sequence.Length, 0); myProtein = new Protein(sequence, "ACCESSION"); - PeptideWithSetModifications pwsmBaseSequence = new(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, + var test = new PeptideWithSetModifications(sequence, allKnownMods); + PeptideWithSetModifications pwsmBaseSequence = new PeptideWithSetModifications(sequence, allKnownMods, numFixedMods, digestionParams, myProtein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, - peptideDescription, pairedTargetDecoyHash); + peptideDescription, pairedTargetDecoySequence); PeptideSpectralMatch psmBaseSequenceNoMFI = new(pwsmBaseSequence, 0, 10, 0, ms2ScanOneMzTen, commonParams, new List()); diff --git a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs index 5426e1bbf..f01117297 100644 --- a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs +++ b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs @@ -78,27 +78,27 @@ public static void AllResultsAndResultsTxtContainsCorrectValues_PepQValue_Bottom string outputFolder = testCase.OutputDirectory; var allResultsFile = Path.Combine(outputFolder, "allResults.txt"); var allResults = File.ReadAllLines(allResultsFile); - Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", 
allResults[10]); - Assert.AreEqual("All target peptides with pep q-value <= 0.01: 172", allResults[11]); - Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", allResults[12]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", allResults[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", allResults[15]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", allResults[16]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", allResults[18]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 172", allResults[19]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", allResults[20]); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 382", allResults[10]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 153", allResults[11]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 140", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 190", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 153", allResults[15]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 140", allResults[16]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 190", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 153", allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 140", allResults[20]); var resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); var results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 420", results[5]); - Assert.AreEqual("All target peptides with pep 
q-value <= 0.01: 172", results[6]); - Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 155", results[7]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 210", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 172", results[10]); - Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", results[11]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 210", results[13]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 172", results[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", results[15]); + Assert.AreEqual("All target PSMs with pep q-value <= 0.01: 382", results[5]); + Assert.AreEqual("All target peptides with pep q-value <= 0.01: 153", results[6]); + Assert.AreEqual("All target protein groups with q-value <= 0.01 (1% FDR): 140", results[7]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target PSMs with pep q-value <= 0.01: 190", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target peptides with pep q-value <= 0.01: 153", results[10]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 140", results[11]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target PSMs with pep q-value <= 0.01: 190", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target peptides with pep q-value <= 0.01: 153", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 140", results[15]); } /// diff --git a/MetaMorpheus/Test/SearchEngineTests.cs b/MetaMorpheus/Test/SearchEngineTests.cs index 9053a1bb1..fc4d3c62f 100644 --- a/MetaMorpheus/Test/SearchEngineTests.cs +++ b/MetaMorpheus/Test/SearchEngineTests.cs @@ -74,6 +74,7 @@ public static void TestSearchEngineResultsPsmFromTsv() string myFile = Path.Combine(TestContext.CurrentContext.TestDirectory, 
@"TestData\TaGe_SA_A549_3_snip.mzML"); string myDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_A549_3_snip.fasta"); + searchTaskLoaded.CommonParameters.QValueCutoffForPepCalculation = 0.01; var engineToml = new EverythingRunnerEngine(new List<(string, MetaMorpheusTask)> { ("SearchTOML", searchTaskLoaded) }, new List { myFile }, new List { new DbForTask(myDatabase, false) }, outputFolder); engineToml.Run(); diff --git a/MetaMorpheus/Test/SpectralRecoveryTest.cs b/MetaMorpheus/Test/SpectralRecoveryTest.cs index 3257361c6..c69e03aa2 100644 --- a/MetaMorpheus/Test/SpectralRecoveryTest.cs +++ b/MetaMorpheus/Test/SpectralRecoveryTest.cs @@ -1,7 +1,8 @@ using EngineLayer; using EngineLayer.ClassicSearch; using MassSpectrometry; -using NUnit.Framework; using Assert = NUnit.Framework.Legacy.ClassicAssert; +using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using Proteomics.ProteolyticDigestion; using System; @@ -17,6 +18,8 @@ using UsefulProteomicsDatabases; using Nett; using System.DirectoryServices; +using System.Threading.Tasks; +using System.Threading; namespace Test { @@ -314,7 +317,7 @@ public static void SpectralWriterTest() QuantifyPpmTol = 25 } }, - CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect), + CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect, qValueCutoffForPepCalculation: 0.01), FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { (rawSlices[0], new CommonParameters()), (rawSlices[1], new CommonParameters()) @@ -333,13 +336,16 @@ public static void SpectralWriterTest() testLibraryWithoutDecoy.CloseConnections(); + // Get rid of this file so it doesn't interfere with the next test + File.Delete(Path.Combine(path, matchingvalue)); + // new task with less than 100 psms. 
postSearchTask = new PostSearchAnalysisTask() { Parameters = new PostSearchAnalysisParameters() { ProteinList = proteinList, - AllPsms = psms.GetRange(0, 50), + AllPsms = psms.GetRange(0, 80), CurrentRawFileList = rawSlices, DatabaseFilenameList = databaseList, OutputFolder = outputFolder, @@ -360,32 +366,35 @@ public static void SpectralWriterTest() QuantifyPpmTol = 25 } }, - CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect), + CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect, qValueCutoffForPepCalculation: 0.01), FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { (rawSlices[0], new CommonParameters()), (rawSlices[1], new CommonParameters()) } }; - postSearchTask.Run(); + // Find and open the new spectral library + list = Directory.GetFiles(path, "*.*", SearchOption.AllDirectories); + matchingvalue = list.Where(p => p.Contains("SpectralLibrary")).First().ToString(); testLibraryWithoutDecoy = new SpectralLibrary(new List { Path.Combine(path, matchingvalue) }); - Assert.That(testLibraryWithoutDecoy.TryGetSpectrum("EESGKPGAHVTVK", 2, out spectrum)); + // When writing a new spectral library, we don't want it to have the exact same name as the old one. 
+ // So, we make sure at least one second has passed + Thread.Sleep(new TimeSpan(0, 0, 1)); // Wait for the library to close + // Test spectral library update postSearchTask.Parameters.SearchParameters.UpdateSpectralLibrary = true; postSearchTask.Parameters.SpectralLibrary = testLibraryWithoutDecoy; postSearchTask.Run(); - var libraryList = Directory.GetFiles(path, "*.*", SearchOption.AllDirectories); string updateLibraryPath = libraryList.First(p => p.Contains("updateSpectralLibrary") && !p.Contains(matchingvalue)).ToString(); var updatedLibraryWithoutDecoy = new SpectralLibrary(new List { Path.Combine(path, updateLibraryPath) }); Assert.That(updatedLibraryWithoutDecoy.TryGetSpectrum("EESGKPGAHVTVK", 2, out spectrum)); - testLibraryWithoutDecoy.CloseConnections(); + testLibraryWithoutDecoy.CloseConnections(); updatedLibraryWithoutDecoy.CloseConnections(); - } [Test] diff --git a/MetaMorpheus/Test/XLTest.cs b/MetaMorpheus/Test/XLTest.cs index 229189c3b..5de9a1e65 100644 --- a/MetaMorpheus/Test/XLTest.cs +++ b/MetaMorpheus/Test/XLTest.cs @@ -417,14 +417,14 @@ public static void XlTest_MoreComprehensive() } MyFileManager myFileManager = new MyFileManager(true); - CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams(), maxThreadsToUsePerFile: 1); + CommonParameters commonParameters2 = new CommonParameters(digestionParams: new DigestionParams(), maxThreadsToUsePerFile: 1); var fsp = new List<(string fileName, CommonParameters fileSpecificParameters)>(); - fsp.Add((Path.GetFileName(newFileName), CommonParameters)); + fsp.Add((Path.GetFileName(newFileName), commonParameters2)); - var myMsDataFile = myFileManager.LoadFile(newFileName, CommonParameters); + var myMsDataFile = myFileManager.LoadFile(newFileName, commonParameters2); - Ms2ScanWithSpecificMass[] listOfSortedms2Scans = MetaMorpheusTask.GetMs2ScansWrapByScanNum(myMsDataFile, newFileName, CommonParameters, out List> precursorss).ToArray(); + Ms2ScanWithSpecificMass[] 
listOfSortedms2Scans = MetaMorpheusTask.GetMs2ScansWrapByScanNum(myMsDataFile, newFileName, commonParameters2, out List> precursorss).ToArray(); //Generate crosslinker, which is DSS here. Crosslinker crosslinker = GlobalVariables.Crosslinkers.Where(p => p.CrosslinkerName == "DSS").First(); @@ -529,9 +529,45 @@ public static void XlTest_MoreComprehensive() Assert.AreEqual(0, deadendTris); Assert.AreEqual(0, unnasignedCrossType); - var fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra).ToList(), 1, CommonParameters, fsp, new List(), "crosslink").Run(); + // We have pretty high peptide-level q values for crosslinks, so we need to up the cut-off is we want PEP to run + commonParameters2.QValueCutoffForPepCalculation = 0.05; + var fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra).ToList(), 1, commonParameters2, fsp, new List(), "crosslink").Run(); - fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType != PsmCrossType.Inter && c.CrossType != PsmCrossType.Intra).ToList(), 1, CommonParameters, fsp, new List(), "standard").Run(); + unnasignedCrossType = 0; + inter = 0; + intra = 0; + single = 0; + loop = 0; + deadend = 0; + deadendH2O = 0; + deadendNH2 = 0; + deadendTris = 0; + + foreach (CrosslinkSpectralMatch csm in firstCsmsFromListsOfCsms.Where(c => (c.CrossType == PsmCrossType.Inter || c.CrossType == PsmCrossType.Intra) && c.FdrInfo.PEP_QValue <= 0.02).ToList()) + { + switch (csm.CrossType) + { + case PsmCrossType.Inter: + inter++; + break; + + case PsmCrossType.Intra: + intra++; + break; + + default: + unnasignedCrossType++; + break; + } + } + + Assert.AreEqual(47, inter); + Assert.AreEqual(73, intra); + Assert.AreEqual(0, unnasignedCrossType); + + + // We have pretty high peptide-level q values for crosslinks, so we need to up the cut-off is 
we want PEP to run + fdrResultsXLink = new FdrAnalysisEngine(firstCsmsFromListsOfCsms.Where(c => c.CrossType != PsmCrossType.Inter && c.CrossType != PsmCrossType.Intra).ToList(), 1, commonParameters2, fsp, new List(), "standard").Run(); unnasignedCrossType = 0; inter = 0; @@ -634,7 +670,26 @@ public static void XlTest_MoreComprehensive() { Path.GetFileName(intraCsm.FullFilePath), 0 } }; - var intraPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("crosslink", fsp, intraCsm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, chargeStateMode, intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, intraCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); + // Set values within PEP_Analysis through reflection + + PepAnalysisEngine pepEngine = new PepAnalysisEngine(new List(firstCsmsFromListsOfCsms), "standard", fsp, Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\")); + var pepEngineProperties = pepEngine.GetType().GetProperties(); + foreach (var p in pepEngineProperties) + { + switch (p.Name) + { + case "ChargeStateMode": + p.SetValue(pepEngine, chargeStateMode); + break; + case "FileSpecificMedianFragmentMassErrors": + p.SetValue(pepEngine, medianFragmentMassError); + break; + default: + break; + } + } + + var intraPsmData = pepEngine.CreateOnePsmDataEntry("crosslink", intraCsm, intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, intraCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !intraCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(intraPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(1.0).Within(0.1)); Assert.That(intraPsmData.AlphaIntensity, Is.EqualTo(1).Within(0.1)); Assert.AreEqual(intraPsmData.Ambiguity, 0); @@ -659,16 +714,9 @@ public static void 
XlTest_MoreComprehensive() CrosslinkSpectralMatch singleCsm = firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Single).OrderBy(c => -c.Score).First(); - List psms = new List(); - psms.AddRange(firstCsmsFromListsOfCsms); - - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(psms, fsp, false); - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = PEP_Analysis_Cross_Validation.ComputeHydrophobicityValues(psms, fsp, true); - - var singleCsmPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, singleCsm, - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, - fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, - chargeStateMode, singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide, + var singleCsmPsmData = pepEngine.CreateOnePsmDataEntry("standard", + singleCsm, + singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide, singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Notch, !singleCsm.BestMatchingBioPolymersWithSetMods.FirstOrDefault().Peptide.Parent.IsDecoy); Assert.That(singleCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(8).Within(0.1)); @@ -695,7 +743,7 @@ public static void XlTest_MoreComprehensive() Assert.That(singleCsmPsmData.TotalMatchingFragmentCount, Is.EqualTo(8).Within(0.1)); CrosslinkSpectralMatch loopCsm = firstCsmsFromListsOfCsms.Where(c => c.CrossType == PsmCrossType.Loop).OrderBy(c => -c.Score).First(); - var loopCsmPsmData = PEP_Analysis_Cross_Validation.CreateOnePsmDataEntry("standard", fsp, loopCsm, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified, fileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified, medianFragmentMassError, chargeStateMode, loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, 
loopCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(loopCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(6).Within(0.1)); + var loopCsmPsmData = pepEngine.CreateOnePsmDataEntry("standard", loopCsm, loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide, loopCsm.BestMatchingBioPolymersWithSetMods.First().Notch, !loopCsm.BestMatchingBioPolymersWithSetMods.First().Peptide.Parent.IsDecoy); Assert.That(loopCsmPsmData.AbsoluteAverageFragmentMassErrorFromMedian, Is.EqualTo(6).Within(0.1)); Assert.AreEqual(loopCsmPsmData.AlphaIntensity, 0); Assert.AreEqual(loopCsmPsmData.Ambiguity, 0); Assert.AreEqual(loopCsmPsmData.BetaIntensity, 0); From dd5e3b28c19fe5a600142bbc43bee2952c0b5e65 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 30 Aug 2024 13:30:29 -0500 Subject: [PATCH 7/7] FdrCategory Comment (#2402) --- .../ProteinScoringAndFdr/FdrCategory.cs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs b/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs index 47ad22412..cabfde29f 100644 --- a/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs +++ b/MetaMorpheus/EngineLayer/ProteinScoringAndFdr/FdrCategory.cs @@ -6,6 +6,25 @@ namespace EngineLayer { + /// + /// This enum is used to categorize the FDR of a peptide based on its cleavage specificity. + /// FullySpecific: The peptide is cleaved only at protease-specified cleavage sites. + /// SemiSpecific: The peptide is cleaved on one terminus at protease-specified cleavage sites and at non-specific site on the other terminus. + /// NonSpecific: The peptide is cleaved at non-specific sites on both termini. + /// + /// In the Speedy Non-Specific Search use case, all three categories are used with modern search. For each spectrum, the lowest q-value peptide is chosen + /// rather than the highest scoring peptide. 
+ /// + /// In a classic NonSpecific search, I believe that only the NonSpecific category is used. Further, I believe that it includes peptides that are cleaved + /// at one or more protease-specified cleavage sites, but also at non-specific sites. + /// + /// The Single-N or Single-C protease is a special case. The modern search table is populated only with peptide fragments including the specified terminus. + /// Fragments from the other terminus are not included. + /// + /// This is not the same as Semi-Trypsin, which is a classic search where the protein is digested into peptides and then the database is further updated with + /// the full set of peptides that could be generated by terminal degradation. + /// + /// public enum FdrCategory { //Cleavage Specificity