Merge pull request #2 from mlocardpaulet/TopPICv1.3.3

compatibility TopPICv1.3.3
mlocardpaulet · Jul 3, 2020 · 15d1c64 · 15d1c64
2 parents de34856 + 085a54a
commit 15d1c64
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,5 +6,6 @@ Figures/
 *.Rproj
 files/test/TopPic3
 test/
+files/test/TopPicv133
 
 
diff --git a/README.md b/README.md
@@ -14,6 +14,11 @@ The fact that you are presently reading this means that you have had knowledge o
 
 ## Version history:
 
-*V2.0: version corresponding to the paper.
-*V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer.
-        In this new version, the intensity of the precursor for the MSMS that gave the protein ID is not provided on hovering because this information is missing from the new outputs of Proteome Discoverer
+
+* V2.0: version corresponding to the paper.
+
+* V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer.
+        In this new version, the intensity of the precursor of the MS/MS spectrum that gave the protein ID is no longer shown on hovering, because this information is missing from the new outputs of Proteome Discoverer.
+
+* V2.2: July 2020 - adapt to new TopPIC input format for msalign files.
+
diff --git a/app.R b/app.R
@@ -85,58 +85,88 @@ ThresholdCleaning <- function(l, threshold) {
 }
 
 TopPicMS1Parsing <- function(fname) {
   # Parse a TopPIC MS1 msalign-style text file into a data.frame of peaks.
   # Returned columns, in order: RT (minutes), Mass, intensity, then two
   # columns that are blanked to NA before returning (see the end of the body).
   # Progress messages are written with cat() at the start and at the end.
+  cat("== Start parsing TopPIC data MS1 ==\n")
   # Return a table in the style of RoWinPro tables for use in VisioProt.
   # fname is the path to the file to parse.
   allData <- readLines(fname)
   # Header removal: instead of dropping a fixed number of lines, drop everything
   # up to and including the SECOND line matching "^[^P]+Parameters "
   # (presumably the closing banner of the TopPIC parameter header - confirm).
   # NOTE(review): with fewer than two matching lines, `[2]` is NA and
   # `1:numline` fails; there is no guard for header-less files.
-  allData <- allData[-(1:11)]
+  numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2]
+  allData <- allData[-(1:numline)]
   # Indices of the lines that open each spectrum block.
   rep_ions_entries = which(allData=="BEGIN IONS")
-  IDs <- gsub("ID=", "", allData[rep_ions_entries+1])
-  SCANs <- gsub("SCANS=", "", allData[rep_ions_entries+2])
-  RT <- gsub("RETENTION_TIME=", "", allData[rep_ions_entries+3])
-  removeEntries <- c(rep_ions_entries,rep_ions_entries+1,rep_ions_entries+2,rep_ions_entries+3,rep_ions_entries[2:length(rep_ions_entries)]-1,rep_ions_entries[2:length(rep_ions_entries)]-2, length(allData)-1, length(allData))
-  ions_per_scan <- diff(rep_ions_entries) - 6
-  ions_per_scan <- c(ions_per_scan, (length(allData) - rep_ions_entries[length(rep_ions_entries)] - 5))
   # Indices of the lines that close each spectrum block.
+  rep_ions_end = which(allData=="END IONS")
   # Per-spectrum metadata is now located by key prefix anywhere in the file
   # rather than by a fixed offset from "BEGIN IONS".
   # NOTE(review): "^ID" matches any line that merely starts with "ID".
+  IDs <- gsub("ID=", "", allData[grepl("^ID", allData)])
+  SCANs <- gsub("SCANS=", "", allData[grepl("^SCANS", allData)])
+  RT <- gsub("RETENTION_TIME=", "", allData[grepl("^RETENTION_TIME", allData)])
+  # removeEntries <- c(rep_ions_entries,rep_ions_entries+1,rep_ions_entries+2,rep_ions_entries+3,rep_ions_entries[2:length(rep_ions_entries)]-1,rep_ions_entries[2:length(rep_ions_entries)]-2, length(allData)-1, length(allData))
+  removeEntries <- which(!(substr(allData, 1, 1) %in% c(0:9))) # Remove all lines not starting with a number
+  # Count the number of lines with comment per spectrum:
   # numComments is measured on the FIRST spectrum only (BEGIN..END inclusive)
   # and reused for every spectrum below.
   # NOTE(review): assumes all spectra carry the same number of non-peak lines.
+  numComments <- sum(!(substr(allData[rep_ions_entries[1]:rep_ions_end[1]], 1, 1) %in% c(0:9)))
   # Peaks per spectrum = lines in the block (inclusive) minus non-peak lines.
+  ions_per_scan <- sapply(seq_along(rep_ions_entries), function(x) {
+    rep_ions_end[x] - rep_ions_entries[x] - numComments + 1
+  })
   # Parse the remaining (digit-initial) lines as one tab-separated table.
   # fread is not defined in this view (presumably data.table::fread).
   dat <- fread(paste(allData[-removeEntries], collapse = "\n"), sep = "\t")
   class(dat) <- "data.frame"
   names(dat) <- c("Mass", "intensity", "charge")
   # Expand the per-spectrum metadata to one value per peak row.
   dat$ID <- rep(IDs, ions_per_scan)
   dat$SCANs <- rep(SCANs, ions_per_scan)
   dat$RT <- rep(RT, ions_per_scan)
   # Reorder to RT, Mass, intensity, charge, SCANs; the ID column is dropped.
   dat <- dat[,c(6,1,2,3,5)] 
   # NOTE(review): the c("...") statements below only build a character vector
   # and discard it; nothing is printed. cat() was probably intended - confirm.
-  # Keep only the ions >= 5+
+  c("Keep only the ions >= 5+\n")
   # Column 4 is charge at this point: keep rows with charge >= 5.
   dat <- dat[dat[,4]>=5,]
-  # Change from seconds to minutes:
+  c("Change from seconds to minutes\n")
   # Column 1 is RT, still character from gsub(); convert to numeric and divide by 60.
   dat[,1] <- as.numeric(dat[,1])/60
-  # Keep only the 100% highest intensities:
+  c("Keep only the 100% highest intensities\n")
   # Sort by intensity (column 3), descending, and drop rows with NA intensity.
   dat <- dat[order(dat[,3], decreasing = T),]
   dat <- dat[!is.na(dat[,3]),]
   # The factor is 1, so thresh equals nrow(dat) and every row is kept.
   # NOTE(review): if dat has zero rows, 1:thresh is c(1, 0), not an empty index.
   thresh <- floor(1 * nrow(dat))
   dat <- dat[c(1:thresh),]
   # For the functions to come (thresholding, renaming):
   # Columns 4 and 5 (charge and SCANs) are overwritten with NA.
   dat[,4] <- rep(NA, nrow(dat))
   dat[,5] <- rep(NA, nrow(dat))
+  cat("== End parsing TopPIC MS1 ==\n\n")
   return(dat)
+
 }
 
 TopPicMS2Parsing <- function(fname) {
   # Parse a TopPIC MS2 msalign-style text file into a data.frame with one row
   # per spectrum and columns RT (numeric, minutes), Mass, intensity and Scan.
   # Mass, intensity and Scan are left as the character strings from gsub().
+  cat("== Start parsing TopPIC data MS2 ==\n")
   # Return a table in the style of RoWinPro tables for use in VisioProt.
   # fname is the path to the file to parse.
   allData <- readLines(fname)
   # Drop everything up to and including the second line matching
   # "^[^P]+Parameters " (presumably the end of the parameter header - confirm).
   # NOTE(review): with fewer than two matches, `[2]` is NA and `1:numline` fails.
   numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2]
   allData <- allData[-(1:numline)]
   # NOTE(review): after this change rep_ions_entries and rep_ions_end are
   # computed but no longer read anywhere in this function.
   rep_ions_entries = which(allData=="BEGIN IONS")
-  IDs <- gsub("ID=", "", allData[rep_ions_entries+1])
-  SCANs <- gsub("SCANS=", "", allData[rep_ions_entries+2])
-  RT <- gsub("RETENTION_TIME=", "", allData[rep_ions_entries+3])
-  Mass <- gsub("PRECURSOR_MASS=", "", allData[rep_ions_entries+9])
-  intensity <- gsub("PRECURSOR_INTENSITY=", "", allData[rep_ions_entries+10])
-  charge <- gsub("PRECURSOR_CHARGE=", "", allData[rep_ions_entries+8])
+  rep_ions_end = which(allData=="END IONS")
   # Fields are now located by key prefix anywhere in the file instead of by a
   # fixed offset from "BEGIN IONS".
   # NOTE(review): "^ID" matches any line that merely starts with "ID".
+  IDs <- gsub("ID=", "", allData[grepl("^ID", allData)])
+  SCANs <- gsub("SCANS=", "", allData[grepl("^SCANS", allData)])
+  RT <- gsub("RETENTION_TIME=", "", allData[grepl("^RETENTION_TIME", allData)])
+  Mass <- gsub("PRECURSOR_MASS=", "", allData[grepl("^PRECURSOR_MASS", allData)])
+  intensity <- gsub("PRECURSOR_INTENSITY=", "", allData[grepl("^PRECURSOR_INTENSITY", allData)])
+  charge <- gsub("PRECURSOR_CHARGE=", "", allData[grepl("^PRECURSOR_CHARGE", allData)])
 
   # IDs and charge are extracted above but are not part of the returned table.
   # Assumes each key occurs exactly once per spectrum, so that the four vectors
   # have equal length - TODO confirm against the file format.
   dat <- data.frame("RT"=RT, "Mass"=Mass, "intensity"=intensity, "Scan"=SCANs, stringsAsFactors = F)
   # NOTE(review): the c("...") statement below builds a character vector and
   # discards it; nothing is printed. cat() was probably intended - confirm.
-  # Change from seconds to minutes:
+  c("Change from seconds to minutes\n")
   # Column 1 is RT: convert from character to numeric and divide by 60.
   dat[,1] <- as.numeric(dat[,1])/60
+  cat("== End parsing TopPIC MS2 ==\n\n")
   return(dat)
+
 }
+
+TopPicIDParsing <- function(fname) {
   # Parse a TopPIC identification output table into a data.frame.
   # This helper replaces the inline parsing previously done at the call site,
   # which removed a fixed 23-line header.
+
+  cat("== Start parsing TopPIC data ID ==\n")
+  # Return a table in the style of RoWinPro tables for use in VisioProt.
+  # fname is the path to the file to parse.
+  allData <- readLines(fname)
   # Drop everything up to and including the second line matching
   # "^[^P]+Parameters " (presumably the end of the parameter header - confirm).
   # NOTE(review): with fewer than two matches, `[2]` is NA and `1:numline` fails.
+  numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2]
+  allData <- allData[-(1:numline)]
   # The first remaining line is the column-header row: remove every "#" from it.
+  allData[1] <- gsub("#", "", allData[1])
   # Parse the text as a tab-separated table with a header row.
   # fread is not defined in this view (presumably data.table::fread).
+  dat <- fread(paste(allData, collapse = "\n"), sep = "\t", header = T, stringsAsFactors = F)
   # Overwrite the class attribute so that the result is a plain data.frame.
+  class(dat) <- "data.frame"
+  cat("== End parsing TopPIC ID ==\n\n")
+  return(dat)
+
+}
+
 ############################################################################
 
 # App:
@@ -335,7 +365,9 @@ ui <- fluidPage(
   # Footer
   tabsetPanel(
     tabPanel(
-      HTML('<footer><font size="0.8">copyright 2017 - CNRS - All rights reserved - VisioProt-MS V2.1</font></footer>')
+
+      HTML('<footer><font size="0.8">copyright 2017 - CNRS - All rights reserved - VisioProt-MS V2.2</font></footer>')
+
     )
   )
 )
@@ -838,11 +870,7 @@ server <- function(input, output, clientData, session) {
       validate(
         need(grepl("_ms2.OUTPUT_TABLE", InputFilesMS2TP()$IDfile$name, fixed = T) | grepl("_ms2_toppic", InputFilesMS2TP()$IDfile$name, fixed = T), "Error in file format for plotting ID data.\nYou have to upload the \"_ms2.OUTPUT_TABLE\", or \"_ms2_toppic\" output file from TopPic associated with the deconvoluted MS2 weights uploaded as \"input file for MS2\".")
       )
-      allData <- readLines(InputFilesMS2TP()$IDfile$datapath)
-      allData <- allData[-(1:23)]
-      allData[1] <- gsub("#", "", allData[1])
-      IDTP <- fread(paste(allData, collapse = "\n"), sep = "\t", header = T, stringsAsFactors = F)
-      class(IDTP) <- "data.frame"
+      IDTP <- TopPicIDParsing(InputFilesMS2TP()$IDfile$datapath)
       MS2TP <- TopPicMS2Parsing(InputFilesMS2TP()$MS2file$datapath)
       names(IDTP)[names(IDTP) == "Spectrum ID"] <- "Scan"
       dat <- merge(MS2TP, IDTP, by = "Scan", all = T)
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,5 +6,6 @@ Figures/ @@
     *.Rproj
     files/test/TopPic3
     test/
+    files/test/TopPicv133