diff --git a/R/Grp2Analysis.R b/R/Grp2Analysis.R index 003bc04..2f00349 100644 --- a/R/Grp2Analysis.R +++ b/R/Grp2Analysis.R @@ -295,7 +295,7 @@ Grp2Analysis <- setRefClass("Grp2Analysis", colnames(col_clustering) = c("TopProteinName", "ClusterID") # Merging ResultTable with ClusterID data.frame - results <- dplyr::inner_join(tmpdata, col_clustering, by = "TopProteinName") + results <- dplyr::full_join(tmpdata, col_clustering, by = "TopProteinName") return(results) } ) diff --git a/inst/RunScripts/Run_Generic_QuantTwoGroupAnalysis.R b/inst/RunScripts/Run_Generic_QuantTwoGroupAnalysis.R index 6d44967..400e45a 100644 --- a/inst/RunScripts/Run_Generic_QuantTwoGroupAnalysis.R +++ b/inst/RunScripts/Run_Generic_QuantTwoGroupAnalysis.R @@ -99,4 +99,4 @@ genericQuantMatrixGRP2 <- grp2 ## REMOVE TO RENDER #rmarkdown::render("vignettes/Grp2Analysis.Rmd", params = list(grp = genericQuantMatrixGRP2), envir = new.env()) - +rmarkdown::render("vignettes/Grp2AnalysisHeatmap3.Rmd",bookdown::pdf_document2(), params=list(grp = grp2)) diff --git a/inst/RunScripts/Run_MQ_QuantTwoGroupAnalysis.R b/inst/RunScripts/Run_MQ_QuantTwoGroupAnalysis.R index d044db1..15baa19 100644 --- a/inst/RunScripts/Run_MQ_QuantTwoGroupAnalysis.R +++ b/inst/RunScripts/Run_MQ_QuantTwoGroupAnalysis.R @@ -52,7 +52,7 @@ write.table(annotation, file=file.path(resultdir, "annotationused.txt")) ####### END of user configuration ## -source("R/Grp2Analysis.R") +# source("R/Grp2Analysis.R") grp2 <- Grp2Analysis(annotation, "Experimentname", maxNA=nrNas , nrPeptides=nrPeptides, reference=reference) grp2$setMQProteinGroups(protein) @@ -64,6 +64,6 @@ usethis::use_data(mqQuantMatrixGRP2, overwrite = TRUE) #readr::write_tsv(grp2$getResultTable(), path=file.path(resultdir,"pValues.csv")) ## REMOVE TO RENDER -rmarkdown::render("vignettes/Grp2AnalysisHeatmap3.Rmd",bookdown::pdf_document2(), params=list(grp = grp2)) -rmarkdown::render("vignettes/Grp2Analysis.Rmd",bookdown::pdf_document2(), params=list(grp = grp2)) +# rmarkdown::render("vignettes/Grp2AnalysisHeatmap3.Rmd",bookdown::pdf_document2(), params=list(grp = grp2)) +# rmarkdown::render("vignettes/Grp2Analysis.Rmd",bookdown::pdf_document2(), params=list(grp = grp2)) diff --git a/vignettes/Grp2AnalysisHeatmap3.Rmd b/vignettes/Grp2AnalysisHeatmap3.Rmd new file mode 100644 index 0000000..6c9cd12 --- /dev/null +++ b/vignettes/Grp2AnalysisHeatmap3.Rmd @@ -0,0 +1,437 @@ +--- +title: "FGCZ Two-Group Analysis \n Statistics for a Quantitative Protein Matrix" +author: "Functional Genomics Center Zurich" +date: "`r format(Sys.time(), '%d %B, %Y')`" +output: + pdf_document: + toc: yes + html_document: + toc: yes +header-includes: + \usepackage{fancyhdr} + \pagestyle{fancy} + \fancyhead[CO,CE]{Group Comparison} + \fancyfoot[CO,CE]{\textbf{FGCZ} - www.fgcz.ch - 2018} + \fancyfoot[LE,RO]{\thepage} +params: + grp: !r quote(SRMService::mqQuantMatrixGRP2) +vignette: > + %\VignetteIndexEntry{FGCZ Two-Group Analysis} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +bibliography: bibliography.bib +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} + +knitr::opts_chunk$set( + echo = FALSE, + message = FALSE, + warning = FALSE +) + +if(!exists("progress")){ + progress <- function(howmuch, detail){ + invisible(NULL) + } +} + +grp2 <- eval(params$grp) + +library(tidyverse) +library(knitr) +library(limma) +library(dplyr) +library(reshape2) +library(quantable) +library(ggplot2) +library(dplyr) +library(rlang) + +``` + +# Introduction + +The following analysis compares protein signal intensities recorded in two groups of samples (Tables \@ref(tab:samples) and \@ref(tab:annotation)) by computing the fold change $log2(condition/reference)$ (difference between the means in both groups - also called effect size), and testing if it is different from zero. Table \@ref(tab:groupingvars) shows the group used as the reference. + + +The protein identification and quantification were performed using the _MaxQuant_ software and _Andromeda_ search engine [@Cox2008, Cox2011]. Based on the `proteinGroups.txt` file we generated by MaxQuant; we run a set of functions implemented in the R package SRMService [@SRMService2018] to generate visualizations and to compute +a _moderated t-test_ [@Smyth2004] for all proteins quantified with at least `r grp2$nrPeptides` peptides, employing the R package limma [@Ritchie2015a]. + + +## Experiment summary + + +This report was is stored in the LIMS system _bfabric_ [@Trker2010] in the project `r grp2$projectID`, +with the workunit name : `r grp2$projectName`. + +This report was created from data stored in _bfabric_ and can downloaded using: + +- workunit Id : `r grp2$workunitID` +- project Id: `r grp2$projectID`. + + +The protein matrix is filtered with the following thresholds: + +- Minimum number of peptides / protein: `r grp2$nrPeptides` +- Maximum of missing values per protein : `r grp2$maxNA` + + +- The total number of proteins in this experiment is: `r nrow(grp2$proteinIntensity)` +- Total number without decoys sequences is `r nrow(grp2$proteinIntensity) - sum(grepl("REV__",grp2$proteinAnnotation$ProteinName))` + + +- Percentage of contaminants : `r round(mean(grepl("CON__",grp2$proteinAnnotation$ProteinName)) * 100, digits=1)` % +- Percentage of false positives : `r round(mean(grepl("REV__",grp2$proteinAnnotation$ProteinName)) * 100, digits=1)` % + + + +```{r samples} +tab <- data.frame(table(grp2$getAnnotation()$Condition)) +colnames(tab) <- c("Condition","# samples") +knitr::kable(tab, caption="Nr of samples in each condition.") + +``` + +Table \@ref{tab:samples} shows the number of samples in each condition while Table \@ref(tab:annotation) shows the raw files assigned to the conditions. Finally, Table \@ref{tab:groupingvars} specifies which condition is used as reference (denominator in the log2 FC). + + +```{r annotation} +tab <- grp2$getAnnotation()[,c("Condition","Raw.file")] +rownames(tab) <- NULL +knitr::kable(tab, caption="Condition sample mapping.") + +``` + +```{r groupingvars} +x <- data.frame(name = unlist(grp2$getConditions())) +knitr::kable(x, caption="The reference group is the denominator of the foldchanges.") +``` + + + + +\newpage + +# Quality Control + + +## Missing Data and Intensity Distribution + +Figure \@ref(fig:missingValuesPerProtein) A shows the number of proteins (y) axis with $0-N$ missing values (x - axis), while +the histogram on the left (Panel B) shows the distribution of intensities of proteins with $0-N$ missing values. + +```{r histmissing } +missing <- grp2$getNrNAs() +int <- apply(grp2$proteinIntensity,1,sum, na.rm=TRUE) +grp2$proteinIntensity <- grp2$proteinIntensity[order(missing, -int,decreasing = T),] +``` + + +```{r missingValuesPerProtein, fig.cap="Panel A - # of proteins with n missing values, Panel B - intensity distribution of proteins with 1 to N missing values.", fig.height=3,fig.width=6} +missing <- data.frame(nrNAs = as.factor(grp2$getNrNAs())) +missing<-missing %>% group_by(nrNAs) %>% summarise(nrProteins = n()) +p1 <- ggplot(missing, aes(x = nrNAs, y=nrProteins)) + geom_bar(stat = "identity") + labs(tag="A") +missingDF <- data.frame(nrNAs = as.factor(grp2$getNrNAs()), + meanArea = apply(grp2$proteinIntensity,1,sum, na.rm=TRUE)) +p2<-ggplot(missingDF, aes(x = meanArea, fill = nrNAs, colour = nrNAs)) + + geom_histogram(alpha = 0.2, position = "identity") + scale_x_log10() + labs(tag="B") + +gridExtra::grid.arrange(p1,p2, nrow=1) + +progress(0.1, "Summary") +``` + +Shown in Figure \@ref(fig:distributionRaw) are the distributions of raw log2 transformed protein intensity values for each sample. Ideally the violins should look very similar (have the same shape and span the same range). + +```{r distributionRaw, fig.width=8, fig.height=5, fig.cap="Violin plots for quantifyable proteins (log2 transformed)"} +longm <- melt(log2(grp2$proteinIntensity)) + +p <- qplot( variable , value , data=longm , geom="violin" , xlab="" , ylab="log2(I)") +p + stat_summary(fun.y=median,geom='point') +theme(axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip() + +``` + +In Figure \@ref(fig:scaling) the log2 fold change of the average sample intensity versus the mean average intensity of all samples is shown. It is getting critical if a samples average deviates more than 5 times (linear scale) from the average of all samples. + +```{r scaling,dpi=300, fig.height=6, fig.cap="Average intensity in sample vs average intensity in all samples. red line - critical fold change."} +bb <- grp2$getNormalized()$medians + +par(mar=c(15,6,3,6)) +barplot(sort(abs(bb)) - mean(bb) ,horiz=F,las=2, main="median", cex.names = 0.6, ylab="log2(sample average) - log2(total average)", ylim=c(-log2(8),log2(8))) +abline(h=c(-log2(5),log2(5)),col=2) +x<-seq(-3,3,by=1) +axis(4,x,round(2^abs(x),digits=1)) +mtext("linear scale", side=4, line=2) +``` + +```{r progress2} +progress(0.2, "Normalization") +``` + +## Normalization + +Figure \@ref(fig:normalized) shows the normalized values. Normalization is applied to remove systematic differences in protein abundance due to different sample concentrations, or different amount of sample loaded on column. Normalization is important, so that true differentially expressed proteins can be detected. To do this the z-score of the log2 transformed intensities is computed, which is updated by the average of the standard deviation of the log2 transformed intensities in all samples. After normalization all samples have a similar distribution. + + +```{r normalized, fig.width=8,fig.height=5,dpi=300, fig.cap="Violin plots of normalized protein intensity values (z-score)"} +longm <- melt(grp2$getNormalized()$data) +p <- qplot( variable , value , data=longm , geom="violin" , xlab="" , ylab="z-score") +p + stat_summary(fun.y=median, geom='point') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip() +``` + +\pagebreak + + +The left panel of Figure \@ref(fig:SDViolin), show the coefficient of variations for all proteins in each condition and overall computed on not normalized data. To observe differences between conditions the variation within a condition should ideally be smaller than within all conditions. + + +```{r CViolin} +cond1 <- grp2$getConditionData(grp2$conditions[1]) +cond2 <- grp2$getConditionData(grp2$conditions[2]) + +cond1 <- quantable::CV(cond1,top= round(nrow(grp2$proteinIntensity)/20)) +cond2 <- quantable::CV(cond2,top= round(nrow(grp2$proteinIntensity)/20)) + +all <- quantable::CV(grp2$proteinIntensity,top= round(nrow(grp2$proteinIntensity)/20)) + +arethereCVS <-(length(cond1) > 0 & length(cond2) > 0) & length(all) >0 +if( arethereCVS) +{ + CVs <- rbind(data.frame(condition=grp2$conditions[1], cv=cond1), + data.frame(condition=grp2$conditions[2],cv=cond2 ), + data.frame(condition="all", cv=all)) + p <- qplot( condition , cv , data=CVs , geom="violin" , xlab="" , ylab="Coefficient of Variation (%)") + p1 <- p + stat_summary(fun.y=median,geom='point') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +} + +``` + + + +The right panel of Figure \@ref(fig:SDViolin) shows the distributions of standard deviations for all proteins within the conditions and overall after transforming and scaling the data. To observe differences between conditions the standard deviation within a condition ideally should be smaller than within all conditions. + +```{r SDViolin, fig.height=3, fig.width=7, fig.cap="Left panel - Distribution of protein CV within condition and overall, Right panel - Distribution of protein standard deviation (after sample normalization and scaling) within conditions and overall"} + +cond1 <- grp2$getNormalizedConditionData( grp2$conditions[1] ) +cond2 <- grp2$getNormalizedConditionData( grp2$conditions[2] ) + +cond1 <- apply(cond1, 1, sd, na.rm=TRUE) +cond2 <- apply(cond2, 1, sd, na.rm=TRUE) + +all <- apply( grp2$getNormalized()$data, 1 , sd, na.rm=TRUE ) +SDs<-rbind(data.frame( condition=grp2$conditions[1], sd=cond1), data.frame(condition=grp2$conditions[2],sd=cond2 ), data.frame(condition="all", sd=all)) + +p2 <- qplot( condition , sd , data=SDs , geom="violin" , xlab="" , ylab="sd of z-score") +p2 <- p2 + stat_summary(fun.y=median,geom='point') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + +gridExtra::grid.arrange(p1,p2, nrow=1) +``` + +```{r} +sdSummary <-aggregate(sd ~ condition , data=SDs , median, na.rm=TRUE) + +if(arethereCVS){ + cvSummary <- aggregate(cv ~ condition , data=CVs , median, na.rm=TRUE) + knitr::kable(merge(cvSummary, sdSummary), caption = 'median of cv and sd') +}else{ + knitr::kable(sdSummary,caption = 'median of cv') +} + +progress(0.1, "CVs") + +``` + +\pagebreak + +## Clustering for Samples and Proteins + +```{r correlation, dpi=300, fig.height=5, fig.width=7, fig.cap="Heatmap of correlations (spearman) between samples."} +col <- c("red","blue") +tmp <- simpleheatmap3(cor(grp2$getNormalized()$data, + use="pairwise.complete.obs", + method="spearman")^2, + palette = getGreensScale(21), + RowSideColors = col[as.factor(grp2$getAnnotation()$Condition)], + margins = c(1,13), labRow =colnames(grp2$getNormalized()$data), labCol="", showColDendro=FALSE, suppressColSideCols = TRUE ) +``` + +In Figure \@ref(fig:correlation) and Figure \@ref(fig:heatmapData) we show how samples are clustering depending on their correlation as well as the protein expression profiles. Side colors on the left side of the heatmaps indicate the groupings. + + +```{r heatmapData, fig.width=8, fig.height=5, dpi=300, fig.cap="Heatmap of normalized data."} +tmp <- grp2$getNormalized()$data +stmMm <- grp2$getNormalized()$data[grp2$getNrNAs() < ncol(grp2$getNormalized()$data)/2,] +tmp2 <- simpleheatmap3((scale(t(stmMm),scale = F)), + RowSideColors = col[as.factor(grp2$getAnnotation()$Condition)], + margins=c(1,10), + breaks=seq(-2.5,2.5,length=26), + palette = getBlueWhiteRed(25), + labCol="", + labRow=colnames(stmMm), + showColDendro = FALSE ) +progress(0.2, "Heatmaps") +``` + +\pagebreak + + +# Two Group Analysis + +In the following analysis, it is assumed that most of the proteins are not regulated (most log2 fold change should be around zero). P-values and Q-values are a measure of how likely it is to observe the data given the assumption that they are not differentially regulated. Small p-values tell us that $H_0$ (no regulation) is very unlikely. Figure \@ref(fig:densityOFFoldChanges), left panel, shows the distribution of fold changes. Most of the fold changes should be close to zero and also the median of all fold changes (red dashed line) should be close to zero (green line). + + + +```{r} +fcname <- paste("log2(", grp2$getConditions()$condition, "/", grp2$getConditions()$reference, ")", sep="") +res.eb <-grp2$getModPValuesCI() + +``` + + +```{r densityOFFoldChanges, fig.width=8, fig.height=4, dpi=300, fig.cap = "Left panel : Distribution of log2(FC). red dashed line - median fold change. Right panel - Histogram of moderated p values."} +par(mfrow=c(1,2)) +plot(density(na.omit(res.eb$log2FC)), main="") +abline(v=0,col="green") +abline(v=median(res.eb$effectSize),col=2,lty=2) +hist(res.eb$P.Value, breaks=20, xlab="moderated p-Value", main="") +abline(h=length(res.eb$P.Value)/20,col="blue") + +``` + +\newpage + +If samples in both groups differ on protein level we expect more small p-values than by chance (Figure \@ref(fig:densityOFFoldChanges) right panel blue horizontal line). If there are only as many or less small p-values as by chance than no significant false discovery rate controlled calls (q-values) will be made in Figure \@ref(fig:volcanoplot). Significant calls are made with q-value smaller than `r grp2$qvalue` (see Figure \@ref(fig:volcanoplot)). Table \@ref(tab:nrsignificant) summarizes the number of significant calls while \@ref(tab:top20table) lists the 20 proteins with the smallest moderated q-values. + + +```{r volcanoplot, fig.width=8, fig.height=5, dpi=300, fig.cap="VolcanoPlot : x axis log2 fold change of normalized data, y axis : panel A -log10(p-value), panel B -log10(q-value)"} + +p1 <- quantable::volcano2GB(res.eb, + pvalue="P.Value", + ylab="-log10(P Value)", + labels = "proteinID", + log2FCThresh=grp2$qfoldchange, + pthresh=grp2$qvalue) +p1<-p1 + labs(tag="A") + theme(legend.position="bottom") + + +p2 <- quantable::volcano2GB(res.eb, + pvalue="adj.P.Val", + ylab="-log10(adj. P Value)", + labels = "proteinID", + log2FCThresh=grp2$qfoldchange, + pthresh=grp2$qvalue) + +p2 <-p2 + labs(tag="B") + theme(legend.position="bottom") + +if(length(grp2$special)>0) +{ + p1<-quantable::addSpecialProteins(p1, res.eb, grp2$special, pvalue="P.Value") + p2<-quantable::addSpecialProteins(p2, res.eb, grp2$special, pvalue= "adj.P.Val") +} + +gridExtra::grid.arrange(p1, p2, nrow=1) +``` + + +\clearpage + + +```{r nrsignificant, results="markup"} +tmp <- grp2$getResultTable() +x <- data.frame(table(abs(tmp$log2FC) > grp2$qfoldchange & tmp$adj.P.Val < grp2$qvalue)) +if(length(x$Var1) == 2){ + x$Var1 <- c("Not Significant" , "Significant") +} +knitr::kable(x, caption = "Number of not significant and significant proteins (by adj.P.Val).") +``` + + +```{r top20table} +tmp2 <- grp2$getResultTableWithPseudoAndClustering(nrOfClustersRow = 2, nrOfClustersCol = 5) +top20 <- tmp2 %>% dplyr::select( proteinID,log2FC,CI.L,CI.R, P.Value, adj.P.Val, ClusterID ) %>% arrange(P.Value) %>% head(20) +knitr::kable(top20, caption = "Top 20 proteins sorted by smallest Q Value (adj.P.Val). The effectSize column is the log2 FC of condition vs reference.") +``` + + +```{r CI, fig.cap="Confidence interfals for proteins with 20 smallest p-values (ordered by p-value)", fig.height=4, fig.width=7} +tmp2 %>% arrange(P.Value) %>% head(20) -> top20CI +top20CI$proteinID <- with(top20CI,reorder(proteinID,P.Value, function(x){-x})) +ggplot(top20CI, aes(x = proteinID, y = log2FC, + ymin = CI.L, ymax = CI.R)) + + geom_hline( yintercept = 0, color = 'red' ) + + geom_linerange() + geom_point() + coord_flip() + theme_minimal() +``` + + + +```{r presentInOnlyOneCondition} +results <- grp2$getResultTable() +NAinfo <- c(sum(is.na(results[, grp2$getConditions()$reference])) , + sum(is.na(results[, grp2$getConditions()$condition])) ) +NAinfo<-data.frame(name = unlist(grp2$getConditions()), nrProteins= NAinfo) +``` + +```{r referencingChildDocument} +child_docs <- "Grp2Analysis_MissingInOneCondition.Rmd_t" +if(!sum(NAinfo$nrProteins > 0) > 0){ + child_docs <- "Grp2Analysis_Empty.Rmd_t" +} + +``` + + +```{r includeMissingInOne, child = child_docs} + +``` + +# Data Interpretation + +For interpreting the results in the _`r paste0("MaxQuant_report", grp2$projectName, ".csv")`_ file, the protein IDs can be either sorted by `log2FC` or `P.Value` (See Table \@ref(tab:columnlist)). Large positive or negative fold changes typically result in small p-values therefore we suggest sorting by foldchange. + +The IDs sorted by fold change can then be subjected to gene set enrichment analysis (GSEA). Alternatively, a subset filtered by q-value can be analysed using over-representation analysis (ORA). The web application WebGestalt (WEB-based GEne SeT AnaLysis Toolkit) implements both of these methods (and even more) for the most popular organisms [@Wang2017]. + +Overrepresentation analysis is performed on biological functional categories (e.g., biological processes of gene ontology annotations) or on biological pathways (e.g., KEGG or Wikipathways). Using such methods allows identifying functions or pathways for proteins in the submitted list. For the correct usage and interpretation of the results from such an analysis, it is essential to specify the background proteome. The background proteome is the list of all proteins identified in your experiment. + + +A further resource to analyze the results is the STRING database [@Szklarczyk2017]. It reports known and predicted interactions for proteins in the submitted list. + + +The FGCZ can support you, with the interpretation of your quantitative proteomics data or with a more customized analysis. Further visualization of the data, targeted to your audience, e.g., receiver operator curves (ROC) or MA-plots, can be generated. You can reach the proteome-bioinformatics team at . + + +\newpage + +```{r columnlist} +knitr::kable(data.frame(columns=colnames(grp2$getResultTable())), caption="List of column names in result .csv table.") +``` + + +# Disclaimer + + +The obtained results should be validated orthogonal as well (e.g. with Western blots). The Functional Genomics Center Zurich does not provide any kind of guarantee for the validity of the results. + + +For questions and improvement suggestions, with respect to this report, please do contact . + + +# Session Inforamtion + + +```{r} +pander::pander(sessionInfo()) +``` + + +# References + + diff --git a/vignettes/Grp2PullDownExampleSimpleheatmap3.Rmd b/vignettes/Grp2PullDownExampleSimpleheatmap3.Rmd new file mode 100644 index 0000000..39a17bb --- /dev/null +++ b/vignettes/Grp2PullDownExampleSimpleheatmap3.Rmd @@ -0,0 +1,29 @@ +--- +title: "FGCZ Two-Group Analysis \n Statistics for a Quantitative Protein Matrix" +author: "Functional Genomics Center Zurich" +date: "`r format(Sys.time(), '%d %B, %Y')`" +output: + pdf_document: + toc: yes + html_document: + toc: yes +header-includes: + \usepackage{fancyhdr} + \pagestyle{fancy} + \fancyhead[CO,CE]{Group Comparison} + \fancyfoot[CO,CE]{\textbf{FGCZ} - www.fgcz.ch - 2018} + \fancyfoot[LE,RO]{\thepage} +params: + grp: !r quote(SRMService::grp2PullDownExample) +vignette: > + %\VignetteIndexEntry{FGCZ Two-Group Analysis} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +bibliography: bibliography.bib +editor_options: + chunk_output_type: console +--- + + +```{r child="Grp2AnalysisHeatmap3.Rmd"} +```