From 9eac4f034778b40100f0a7e83330ef01079915b7 Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet Date: Tue, 30 Jun 2020 14:24:59 +0200 Subject: [PATCH 1/5] V2.1 new PD outputs --- README.md | 3 ++- app.R | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 2c7fd38..eb65631 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,5 @@ The fact that you are presently reading this means that you have had knowledge o ## Version history: *V2.0: version corresponding to the paper. -*V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer. \ No newline at end of file +*V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer. + In this new version, the intensity of the precursor for the MSMS that gave the protein ID is not provided on hovering because this information is missing from the new outputs of Proteome Discoverer \ No newline at end of file diff --git a/app.R b/app.R index 50263ad..ac4f506 100644 --- a/app.R +++ b/app.R @@ -23,7 +23,7 @@ library(shinyBS) library(data.table) -options(shiny.maxRequestSize=20*1024^2) # Set max input size to 20M +options(shiny.maxRequestSize=40*1024^2) # Set max input size to 40M ############################################################################ # Functions: @@ -335,7 +335,7 @@ ui <- fluidPage( # Footer tabsetPanel( tabPanel( - HTML('') + HTML('') ) ) ) @@ -765,15 +765,23 @@ server <- function(input, output, clientData, session) { } else { if (testfileinput() == 3) { infileMS2 <- list.files("files/MS2/", pattern = "MSMS", full.names = T)[[1]] - infilePSM <- list.files("files/MS2/", pattern = "PSM", full.names = T)[[1]] - PSM <- read.table(infilePSM, sep = "\t", header = T) + infilePSM <- list.files("files/MS2/", pattern = "SMs.txt", full.names = T)[[1]] + PSM <- read.table(infilePSM, sep = "\t", header = T, comment.char = "#") MS2 <- read.table(infileMS2, sep = "\t", header = T) } else { PSM <- read.table(InputFilesMS2()$PSMfile$datapath, sep = "\t", header = T) MS2 <- read.table(InputFilesMS2()$MS2file$datapath, sep = "\t", header = T, comment.char = "#") validate( - need(sum(grepl("Master.Protein.Descriptions", names(PSM))) == 1 & sum(grepl("RT.in.min", names(MS2))) == 1, "Error in file format for plotting MS2 data.\nYou have to upload the following files:\n- A MSMSSpectrumInfo.txt file from BioPharma Finder (in the \"MS/MS File\" field).\n- The corresponding PSMs.txt file (in the \"PSM File\" field).") + need((sum(grepl("Master.Protein.Descriptions", names(PSM))) == 1 & sum(grepl("RT.in.min", names(MS2))) == 1) | (sum(grepl("Protein.Accessions", names(PSM))) == 1 & sum(grepl("RT..min.", names(MS2))) == 1), + "Error in file format for plotting MS2 data.\nYou have to upload the following files:\n- A MSMSSpectrumInfo.txt file from BioPharma Finder (in the \"MS/MS File\" field).\n- The corresponding PSMs.txt or PrSMs.txt file (in the \"PSM File\" field).") ) + # Change field names for compatibility between PSMs and PrSMs tables: + if (sum(grepl("Master.Protein.Description", names(PSM))) == 0) { + names(PSM)[names(PSM) == "Protein.Accessions"] <- "Master.Protein.Descriptions" + } + names(PSM)[names(PSM) == "RT..min."] <- "RT.in.min" + names(MS2)[names(MS2) == "RT..min."] <- "RT.in.min" + names(MS2)[names(MS2) == "Precursor.MH...Da."] <- "Precursor.MHplus.in.Da" } } return(list("MS2file" = MS2, "PSMfile" = PSM)) @@ -1106,20 +1114,29 @@ server <- function(input, output, clientData, session) { if (input$PDPFModeCheck == "PD" | testfileinput() == 3) { PSM <- filedataMS2()$PSM MS2 <- filedataMS2()$MS2 - PSM$ID <- paste0(PSM$Spectrum.File, "|", PSM$First.Scan) - MS2$ID <- paste0(MS2$Spectrum.File, "|", MS2$First.Scan) + if (sum(grepl("First.Scan", names(PSM))) == 1 & sum(grepl("First.Scan", names(MS2))) == 1) { + PSM$ID <- paste0(PSM$Spectrum.File, "|", PSM$First.Scan) + MS2$ID <- paste0(MS2$Spectrum.File, "|", MS2$First.Scan) + } else { + PSM$ID <- paste0(PSM$Spectrum.File, "|", PSM$m.z..Da.) + MS2$ID <- paste0(MS2$Spectrum.File, "|", MS2$Precursor.m.z..Da.) + MS2$Master.Protein.Descriptions <- PSM$Master.Protein.Descriptions[match(MS2$ID, PSM$ID)] + } # Retrieve protein IDs in the MS2 table: MS2$Master.Protein.Descriptions <- PSM$Master.Protein.Descriptions[match(MS2$ID, PSM$ID)] # Plot: - gtabMS2 <- MS2[,c("RT.in.min", "Precursor.MHplus.in.Da", "Precursor.Intensity", "Master.Protein.Descriptions")] + # gtabMS2 <- MS2[,c("RT.in.min", "Precursor.MHplus.in.Da", "Precursor.Intensity", "Master.Protein.Descriptions")] + # print(head(MS2)) + gtabMS2 <- MS2[,c("RT.in.min", "Precursor.MHplus.in.Da", "Master.Protein.Descriptions")] gtabMS2$Identification <- ifelse(!is.na(gtabMS2$Master.Protein.Descriptions), "IDed", "Not IDed") + # print(head(gtabMS2)) # Action button: if (input$HideMSMS == TRUE) { gtabMS2 <- gtabMS2[gtabMS2$Identification == "IDed",] } - names(gtabMS2)[3] <- "intensity" + # names(gtabMS2)[3] <- "intensity" gtabMS2 <- gtabMS2[order(gtabMS2$Identification, decreasing = T),] if (!is.null(input$SelectProt) & input$PDPFModeCheck == "PD") { @@ -1131,7 +1148,7 @@ server <- function(input, output, clientData, session) { if (is.null(filedata0()) | input$MSTrace == FALSE) { # No MS trace g <- ggplot() + - geom_point(data = gtabMS2, aes(x = RT.in.min, y = Precursor.MHplus.in.Da, shape = Identification, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\nSignal:", intensity, "\n", Protein.Descriptions)), alpha = 0.8, size = input$pch, col = "grey30", show.legend = FALSE) + + geom_point(data = gtabMS2, aes(x = RT.in.min, y = Precursor.MHplus.in.Da, shape = Identification, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "\n", Protein.Descriptions)), alpha = 0.8, size = input$pch, col = "grey30", show.legend = FALSE) + coord_cartesian(xlim = ranges$x, ylim = ranges$y, expand = TRUE) + theme_bw() + scale_shape_manual(values = c(16, 1)) + @@ -1139,19 +1156,19 @@ server <- function(input, output, clientData, session) { xlab("Retention time (min)") if (!is.null(input$SelectProt)) { g <- g + - geom_point(data = gtabMS2[gtabMS2$Protein.Descriptions %in% input$SelectProt[!is.na(input$SelectProt)],], aes(x = RT.in.min, y = Precursor.MHplus.in.Da, fill = Protein.Descriptions, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\nSignal:", intensity, "\n", Protein.Descriptions)), shape = 21, size = input$pch+1, alpha = 0.8, stroke = 0, col = alpha("black", 1)) + + geom_point(data = gtabMS2[gtabMS2$Protein.Descriptions %in% input$SelectProt[!is.na(input$SelectProt)],], aes(x = RT.in.min, y = Precursor.MHplus.in.Da, fill = Protein.Descriptions, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\n", Protein.Descriptions)), shape = 21, size = input$pch+1, alpha = 0.8, stroke = 0, col = alpha("black", 1)) + scale_fill_manual(values = getPalette(length(vec))) } } else if (input$MSTrace == TRUE) { # Overlay on MS trace g <- g + - geom_point(data = gtabMS2, aes(x = RT.in.min, y = Precursor.MHplus.in.Da, shape = Identification, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\nSignal:", intensity, "\n", Protein.Descriptions)), alpha = 0.8, size = input$pch, col = "grey30", show.legend = FALSE) + + geom_point(data = gtabMS2, aes(x = RT.in.min, y = Precursor.MHplus.in.Da, shape = Identification, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\n", Protein.Descriptions)), alpha = 0.8, size = input$pch, col = "grey30", show.legend = FALSE) + theme_bw() + scale_shape_manual(values = c(16, 1)) + ylab("Molecular Weight (Da)") + xlab("Retention time (min)") if (!is.null(input$SelectProt)) { g <- g + - geom_point(data = gtabMS2[gtabMS2$Protein.Descriptions %in% input$SelectProt[!is.na(input$SelectProt)],], aes(x = RT.in.min, y = Precursor.MHplus.in.Da, fill = Protein.Descriptions, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\nSignal:", intensity, "\n", Protein.Descriptions)), shape = 21, size = input$pch+1, alpha = 0.8, stroke = 0, col = alpha("black", 1)) + + geom_point(data = gtabMS2[gtabMS2$Protein.Descriptions %in% input$SelectProt[!is.na(input$SelectProt)],], aes(x = RT.in.min, y = Precursor.MHplus.in.Da, fill = Protein.Descriptions, text = paste(RT.in.min, "min\n", Precursor.MHplus.in.Da, "Da\n", Protein.Descriptions)), shape = 21, size = input$pch+1, alpha = 0.8, stroke = 0, col = alpha("black", 1)) + scale_fill_manual(values = getPalette(length(vec))) } } From 3e0372ee088d7fb4036fd69e4fb1b6814d4267b4 Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet Date: Tue, 30 Jun 2020 14:37:24 +0200 Subject: [PATCH 2/5] ignore test files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 865f950..1d933f2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ Figures/ *.Rproj files/test/TopPic3 test/ +files/test/TopPicv133 From 54ef89d101c34481b4460f29668fe6c9afa1e3e3 Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet Date: Tue, 30 Jun 2020 17:28:57 +0200 Subject: [PATCH 3/5] new parsing compatible with TopPICv133 --- app.R | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/app.R b/app.R index ac4f506..d1b09ac 100644 --- a/app.R +++ b/app.R @@ -85,17 +85,24 @@ ThresholdCleaning <- function(l, threshold) { } TopPicMS1Parsing <- function(fname) { + cat("== Start parsing TopPIC data MS1 ==\n") # Return a table in the style of RoWinPro tables for use in VisioProt. # fname is the path to the file to parse. allData <- readLines(fname) - allData <- allData[-(1:11)] + numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2] + allData <- allData[-(1:numline)] rep_ions_entries = which(allData=="BEGIN IONS") - IDs <- gsub("ID=", "", allData[rep_ions_entries+1]) - SCANs <- gsub("SCANS=", "", allData[rep_ions_entries+2]) - RT <- gsub("RETENTION_TIME=", "", allData[rep_ions_entries+3]) - removeEntries <- c(rep_ions_entries,rep_ions_entries+1,rep_ions_entries+2,rep_ions_entries+3,rep_ions_entries[2:length(rep_ions_entries)]-1,rep_ions_entries[2:length(rep_ions_entries)]-2, length(allData)-1, length(allData)) - ions_per_scan <- diff(rep_ions_entries) - 6 - ions_per_scan <- c(ions_per_scan, (length(allData) - rep_ions_entries[length(rep_ions_entries)] - 5)) + rep_ions_end = which(allData=="END IONS") + IDs <- gsub("ID=", "", allData[grepl("^ID", allData)]) + SCANs <- gsub("SCANS=", "", allData[grepl("^SCANS", allData)]) + RT <- gsub("RETENTION_TIME=", "", allData[grepl("^RETENTION_TIME", allData)]) + # removeEntries <- c(rep_ions_entries,rep_ions_entries+1,rep_ions_entries+2,rep_ions_entries+3,rep_ions_entries[2:length(rep_ions_entries)]-1,rep_ions_entries[2:length(rep_ions_entries)]-2, length(allData)-1, length(allData)) + removeEntries <- which(!(substr(allData, 1, 1) %in% c(0:9))) # Remove all lines not starting with a number + # Count the number of lines with comment per spectrum: + numComments <- sum(!(substr(allData[rep_ions_entries[1]:rep_ions_end[1]], 1, 1) %in% c(0:9))) + ions_per_scan <- sapply(seq_along(rep_ions_entries), function(x) { + rep_ions_end[x] - rep_ions_entries[x] - numComments + 1 + }) dat <- fread(paste(allData[-removeEntries], collapse = "\n"), sep = "\t") class(dat) <- "data.frame" names(dat) <- c("Mass", "intensity", "charge") @@ -103,11 +110,11 @@ TopPicMS1Parsing <- function(fname) { dat$SCANs <- rep(SCANs, ions_per_scan) dat$RT <- rep(RT, ions_per_scan) dat <- dat[,c(6,1,2,3,5)] - # Keep only the ions >= 5+ + c("Keep only the ions >= 5+\n") dat <- dat[dat[,4]>=5,] - # Change from seconds to minutes: + c("Change from seconds to minutes\n") dat[,1] <- as.numeric(dat[,1])/60 - # Keep only the 100% highest intensities: + c("Keep only the 100% highest intensities\n") dat <- dat[order(dat[,3], decreasing = T),] dat <- dat[!is.na(dat[,3]),] thresh <- floor(1 * nrow(dat)) @@ -116,26 +123,30 @@ TopPicMS1Parsing <- function(fname) { dat[,4] <- rep(NA, nrow(dat)) dat[,5] <- rep(NA, nrow(dat)) return(dat) + cat("== End parsing TopPIC MS1 ==\n\n") } TopPicMS2Parsing <- function(fname) { + cat("== Start parsing TopPIC data MS2 ==\n") # Return a table in the style of RoWinPro tables for use in VisioProt. # fname is the path to the file to parse. allData <- readLines(fname) numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2] allData <- allData[-(1:numline)] rep_ions_entries = which(allData=="BEGIN IONS") - IDs <- gsub("ID=", "", allData[rep_ions_entries+1]) - SCANs <- gsub("SCANS=", "", allData[rep_ions_entries+2]) - RT <- gsub("RETENTION_TIME=", "", allData[rep_ions_entries+3]) - Mass <- gsub("PRECURSOR_MASS=", "", allData[rep_ions_entries+9]) - intensity <- gsub("PRECURSOR_INTENSITY=", "", allData[rep_ions_entries+10]) - charge <- gsub("PRECURSOR_CHARGE=", "", allData[rep_ions_entries+8]) + rep_ions_end = which(allData=="END IONS") + IDs <- gsub("ID=", "", allData[grepl("^ID", allData)]) + SCANs <- gsub("SCANS=", "", allData[grepl("^SCANS", allData)]) + RT <- gsub("RETENTION_TIME=", "", allData[grepl("^RETENTION_TIME", allData)]) + Mass <- gsub("PRECURSOR_MASS=", "", allData[grepl("^PRECURSOR_MASS", allData)]) + intensity <- gsub("PRECURSOR_INTENSITY=", "", allData[grepl("^PRECURSOR_INTENSITY", allData)]) + charge <- gsub("PRECURSOR_CHARGE=", "", allData[grepl("^PRECURSOR_CHARGE", allData)]) dat <- data.frame("RT"=RT, "Mass"=Mass, "intensity"=intensity, "Scan"=SCANs, stringsAsFactors = F) - # Change from seconds to minutes: + c("Change from seconds to minutes\n") dat[,1] <- as.numeric(dat[,1])/60 return(dat) + cat("== End parsing TopPIC MS2 ==\n\n") } ############################################################################ From bc6bd0dd3629b50ec9d20da6223bc280ae84134e Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet Date: Tue, 30 Jun 2020 17:34:37 +0200 Subject: [PATCH 4/5] amend version and readme --- README.md | 9 ++++++--- app.R | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index eb65631..45aefeb 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,9 @@ The fact that you are presently reading this means that you have had knowledge o ## Version history: -*V2.0: version corresponding to the paper. -*V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer. - In this new version, the intensity of the precursor for the MSMS that gave the protein ID is not provided on hovering because this information is missing from the new outputs of Proteome Discoverer \ No newline at end of file +* V2.0: version corresponding to the paper. + +* V2.1: November 2019 - Make the MS/MS visualisation compatible with the new file formats of TopPic and Proteome Discoverer. + In this new version, the intensity of the precursor for the MSMS that gave the protein ID is not provided on hovering because this information is missing from the new outputs of Proteome Discoverer + +* V2.2: July 2020 - adapt to new TopPIC input format for msalign files. \ No newline at end of file diff --git a/app.R b/app.R index d1b09ac..30901bf 100644 --- a/app.R +++ b/app.R @@ -346,7 +346,7 @@ ui <- fluidPage( # Footer tabsetPanel( tabPanel( - HTML('
copyright 2017 - CNRS - All rights reserved - VisioProt-MS V2.1
') + HTML('
copyright 2017 - CNRS - All rights reserved - VisioProt-MS V2.2
') ) ) ) From 0c5e9a92158b7e0d1f7b7091b3ef40ff2c83db17 Mon Sep 17 00:00:00 2001 From: Marie Locard-Paulet Date: Fri, 3 Jul 2020 11:55:48 +0200 Subject: [PATCH 5/5] update output table import TopPIC --- app.R | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/app.R b/app.R index 30901bf..c936020 100644 --- a/app.R +++ b/app.R @@ -122,8 +122,9 @@ TopPicMS1Parsing <- function(fname) { # For the functions to come (thresholding, renaming): dat[,4] <- rep(NA, nrow(dat)) dat[,5] <- rep(NA, nrow(dat)) - return(dat) cat("== End parsing TopPIC MS1 ==\n\n") + return(dat) + } TopPicMS2Parsing <- function(fname) { @@ -145,9 +146,27 @@ TopPicMS2Parsing <- function(fname) { dat <- data.frame("RT"=RT, "Mass"=Mass, "intensity"=intensity, "Scan"=SCANs, stringsAsFactors = F) c("Change from seconds to minutes\n") dat[,1] <- as.numeric(dat[,1])/60 - return(dat) cat("== End parsing TopPIC MS2 ==\n\n") + return(dat) + +} + +TopPicIDParsing <- function(fname) { + + cat("== Start parsing TopPIC data ID ==\n") + # Return a table in the style of RoWinPro tables for use in VisioProt. + # fname is the path to the file to parse. + allData <- readLines(fname) + numline <- which(grepl("^[^P]+Parameters ", allData, perl = T))[2] + allData <- allData[-(1:numline)] + allData[1] <- gsub("#", "", allData[1]) + dat <- fread(paste(allData, collapse = "\n"), sep = "\t", header = T, stringsAsFactors = F) + class(dat) <- "data.frame" + cat("== End parsing TopPIC ID ==\n\n") + return(dat) + } + ############################################################################ # App: @@ -849,11 +868,7 @@ server <- function(input, output, clientData, session) { validate( need(grepl("_ms2.OUTPUT_TABLE", InputFilesMS2TP()$IDfile$name, fixed = T) | grepl("_ms2_toppic", InputFilesMS2TP()$IDfile$name, fixed = T), "Error in file format for plotting ID data.\nYou have to upload the \"_ms2.OUTPUT_TABLE\", or \"_ms2_toppic\" output file from TopPic associated with the deconvoluted MS2 weights uploaded as \"input file for MS2\".") ) - allData <- readLines(InputFilesMS2TP()$IDfile$datapath) - allData <- allData[-(1:23)] - allData[1] <- gsub("#", "", allData[1]) - IDTP <- fread(paste(allData, collapse = "\n"), sep = "\t", header = T, stringsAsFactors = F) - class(IDTP) <- "data.frame" + IDTP <- TopPicIDParsing(InputFilesMS2TP()$IDfile$datapath) MS2TP <- TopPicMS2Parsing(InputFilesMS2TP()$MS2file$datapath) names(IDTP)[names(IDTP) == "Spectrum ID"] <- "Scan" dat <- merge(MS2TP, IDTP, by = "Scan", all = T)