add TCGA batch analysis module (#279)

openbiox · Nov 18, 2023 · 5bd9191 · 5bd9191
1 parent bcddced
commit 5bd9191
Show file tree

Hide file tree

Showing 54 changed files with 2,126 additions and 1,131 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -42,6 +42,7 @@ inst/extdata/transcript_identifier.rda
 inst/extdata/toil_sig_score.rda
 inst/extdata/tcga_PW.rda
 inst/extdata/tcga_PW_meta.rda
+inst/extdata/pancan_identifier_help
 
 
 ^doc$

diff --git a/R/batch_func.R b/R/batch_func.R
@@ -0,0 +1,94 @@
+#' Download data for shiny batch analysis
+#'
+#' Retrieves the values of a single identifier (`L3`) from the data source
+#' selected by `L1`/`L2` and returns them in one common long format.
+#'
+#' @param L1 level 1 main datatype: `"Molecular_profile"`, `"Tumor_index"`,
+#'   `"Immune_Infiltration"`, `"Pathway_activity"` or `"Custom_metadata"`.
+#' @param L2 level 2 sub datatype. For `"Molecular_profile"` and
+#'   `"Tumor_index"` it must be one of the labels mapped in the function body;
+#'   for `"Immune_Infiltration"` and `"Pathway_activity"` it is combined with
+#'   `L3` to build the name of the column to extract.
+#' @param L3 level 3 identifier
+#' @param tumor_index_list Tumor index data. See shiny App.R file
+#' @param tcga_TIL        Tumor immune infiltration  data. See shiny App.R file
+#' @param tcga_PW         Pathway activity data. See shiny App.R file
+#' @param opt_pancan      molecular datasets parameters
+#' @param custom_metadata user customized metadata. It must contain a `Sample`
+#'   column and a column named `L3`. When `NULL`, a demo table holding five
+#'   random scores (`TF1` to `TF5`) is generated for the samples returned by
+#'   `query_tcga_group()`.
+#'
+#' @return A data frame with the columns `id`, `sample`, `value` and `level2`.
+#'   Rows with a missing `value` are dropped, except for `"Molecular_profile"`.
+#'
+batch_download <- function(L1, L2, L3,
+                           tumor_index_list, tcga_TIL, tcga_PW,
+                           opt_pancan, custom_metadata = NULL) {
+  # Translate the L2 label into its internal key. A bare switch() returns NULL
+  # for an unlisted label, so reject unknown labels explicitly instead.
+  resolve_l2 <- function(choices) {
+    if (!isTRUE(L2 %in% names(choices))) {
+      stop("Unsupported L2 '", L2, "' for L1 '", L1, "'. Expected one of: ",
+           paste(names(choices), collapse = ", "), call. = FALSE)
+    }
+    choices[[L2]]
+  }
+  # Shared tail of every non-molecular branch: prepend the identifier, append
+  # the sub datatype and drop samples without a value.
+  label_and_clean <- function(df) {
+    df %>%
+      dplyr::mutate(id = L3, .before = 1) %>%
+      dplyr::mutate(level2 = L2) %>%
+      dplyr::filter(!is.na(.data$value))
+  }
+
+  if (L1 == "Molecular_profile") {
+    # L2 = "mRNA Expression"
+    # L3 = "TP53"
+    x_genomic_profile <- resolve_l2(c(
+      "mRNA Expression" = "mRNA",
+      "Transcript Expression" = "transcript",
+      "DNA Methylation" = "methylation",
+      "Protein Expression" = "protein",
+      "miRNA Expression" = "miRNA",
+      "Mutation status" = "mutation",
+      "Copy Number Variation" = "cnv"
+    ))
+    x_data <- query_pancan_value(L3,
+                                 data_type = x_genomic_profile,
+                                 opt_pancan = opt_pancan
+    )
+    # The query may hand back a list; only its first element is used.
+    if (is.list(x_data)) x_data <- x_data[[1]]
+    x_data <- data.frame(id = L3,
+                         sample = names(x_data), value = as.numeric(x_data),
+                         level2 = L2)
+  } else if (L1 == "Tumor_index") {
+    # L2 = "Tumor Purity"
+    # L3 = L3_candi$id_tumor_index$tcga_purity$Level3[1]
+    x_tumor_index <- resolve_l2(c(
+      "Tumor Purity" = "tcga_purity",
+      "Tumor Stemness" = "tcga_stemness",
+      "Tumor Mutation Burden" = "tcga_tmb",
+      "Microsatellite Instability" = "tcga_msi",
+      "Genome Instability" = "tcga_genome_instability"
+    ))
+    x_data <- tumor_index_list[[x_tumor_index]][, c("sample", L3)]
+    colnames(x_data)[2] <- "value"
+    x_data <- label_and_clean(x_data)
+  } else if (L1 == "Immune_Infiltration") {
+    # L2 = "CIBERSORT"
+    # L3 = L3_candi$id_TIL$CIBERSORT$Level3[1]
+    # The `cell_type` column is the one that gets reported as `sample`.
+    x_data <- tcga_TIL[, c("cell_type", paste0(L3, "_", L2))]
+    colnames(x_data) <- c("sample", "value")
+    x_data <- label_and_clean(x_data)
+  } else if (L1 == "Pathway_activity") {
+    # L2 = "HALLMARK"
+    # L3 = L3_candi$id_PW$HALLMARK$Level3[1]
+    x_data <- tcga_PW[, paste0(L2, "_", L3), drop = FALSE]
+    colnames(x_data) <- "value"
+    # Sample ids are taken from the row names of `tcga_PW`.
+    x_data <- x_data %>%
+      as.data.frame() %>%
+      tibble::rownames_to_column("sample") %>%
+      label_and_clean()
+  } else if (L1 == "Custom_metadata") {
+    if (is.null(custom_metadata)) {
+      # Fixed seed keeps the demo scores reproducible; note that this also
+      # resets the session-wide RNG state.
+      set.seed(42)
+      n_scores <- 5
+      # drop = FALSE keeps a one-column table, so nrow() is defined even when
+      # the sample table is a plain data.frame.
+      sp_info <- query_tcga_group()$data[, "Sample", drop = FALSE]
+      scores <- matrix(stats::rnorm(nrow(sp_info) * n_scores, mean = 1, sd = 1),
+                       ncol = n_scores) %>% as.data.frame()
+      colnames(scores) <- paste0("TF", seq_len(n_scores))
+      custom_metadata <- cbind(sp_info, scores)
+    }
+    x_data <- custom_metadata[, c("Sample", L3)]
+    colnames(x_data) <- c("sample", "value")
+    x_data <- x_data %>%
+      as.data.frame() %>%
+      label_and_clean()
+  } else {
+    # Previously an unknown L1 fell through to "object 'x_data' not found".
+    stop("Unsupported L1 '", L1, "'.", call. = FALSE)
+  }
+  x_data
+  # id                  sample value          level2
+  # 1 TP53 GTEX-S4Q7-0003-SM-3NM8M 4.785 mRNA Expression
+  # 2 TP53         TCGA-19-1787-01 5.887 mRNA Expression
+  # 3 TP53         TCGA-S9-A7J2-01 5.517 mRNA Expression
+  # 4 TP53 GTEX-QV31-1626-SM-2S1QC 4.431 mRNA Expression
+  # 5 TP53         TCGA-G3-A3CH-11 2.382 mRNA Expression
+  # 6 TP53         TCGA-B5-A5OE-01 5.765 mRNA Expression
+}
diff --git a/R/get_pancan_value.R b/R/get_pancan_value.R
@@ -101,10 +101,10 @@ try_query_value <- function(host, dataset,
         xe = UCSCXenaTools::XenaQueryProbeMap(UCSCXenaTools::XenaGenerate(subset = XenaDatasets == dataset))
         xd = UCSCXenaTools::XenaPrepare(UCSCXenaTools::XenaDownload(xe), col_names = FALSE)[, c(1, 2)]
         xd = tidyr::separate_rows(xd, "X2", sep = ",")
-        xd = dplyr::filter(xd, X2 %in% identifiers)
+        xd = dplyr::filter(xd, .data$X2 %in% identifiers)
 
         if (!is.null(rule_out)) {
-          xd = dplyr::filter(xd, !X1 %in% rule_out)  # X2 → X1
+          xd = dplyr::filter(xd, !.data$X1 %in% rule_out)  # X2 → X1
         }
         ids = xd$X1
 

diff --git a/R/query_tcga_group.R b/R/query_tcga_group.R
@@ -149,44 +149,44 @@ query_tcga_group = function(cancer=NULL,
       # 6种过滤方式
       if (filter_by_L3=="+"){         #保留
         meta_data_sub %>% 
-          dplyr::filter(.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull(Sample)
+          dplyr::filter(.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull('Sample')
       } else if (filter_by_L3=="-"){  #剔除
         meta_data_sub %>% 
-          dplyr::filter(!.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull(Sample)
+          dplyr::filter(!.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull('Sample')
       } else if (filter_by_L3==">"){  #大于 绝对值
         filter_by_L2 = as.numeric(filter_by_L2)
         meta_data_sub %>% 
-          dplyr::filter(.data[[filter_by_L1]] > filter_by_L2) %>% dplyr::pull(Sample)
+          dplyr::filter(.data[[filter_by_L1]] > filter_by_L2) %>% dplyr::pull('Sample')
       } else if (filter_by_L3=="%>"){ #大于 分位数
         filter_by_L2 = as.numeric(filter_by_L2)
         meta_data_sub %>% 
-          dplyr::group_by(Cancer) %>% 
+          # Group by the Cancer column itself so the quantile is per cancer;
+          # a quoted "Cancer" would create a single constant group.
+          dplyr::group_by(.data$Cancer) %>% 
           dplyr::filter(.data[[filter_by_L1]] > 
-                          quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull(Sample)
+                          quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull('Sample')
       } else if (filter_by_L3=="<"){ #小于 绝对值
         filter_by_L2 = as.numeric(filter_by_L2)
         meta_data_sub %>% 
-          dplyr::filter(.data[[filter_by_L1]] < filter_by_L2) %>% dplyr::pull(Sample)  
+          dplyr::filter(.data[[filter_by_L1]] < filter_by_L2) %>% dplyr::pull('Sample')  
       } else if (filter_by_L3=="%<"){#小于 分位数
         filter_by_L2 = as.numeric(filter_by_L2)
         meta_data_sub %>% 
-          dplyr::group_by(Cancer) %>% 
+          # Group by the Cancer column itself so the quantile is per cancer;
+          # a quoted "Cancer" would create a single constant group.
+          dplyr::group_by(.data$Cancer) %>% 
           dplyr::filter(.data[[filter_by_L1]] < 
-                          quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull(Sample)  
+                          quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull('Sample')  
       }
     }) %>% unlist()
 
     # 统计频数，保留符合全部条件的样本
     Samples_freq = table(Samples_retain)
     meta_data_sub = meta_data_sub %>% 
-      dplyr::filter(Sample %in% names(Samples_freq)[Samples_freq==length(filter_by)])
+      dplyr::filter(.data$Sample %in% names(Samples_freq)[Samples_freq==length(filter_by)])
   }
 
 
   # step3-2: filter by sample id
   if(!is.null(filter_id)){
     meta_data_sub = meta_data_sub %>% 
-      dplyr::filter(Sample %in% filter_id)
+      dplyr::filter(.data$Sample %in% filter_id)
   }
 
 

diff --git a/inst/shinyapp/App.R b/inst/shinyapp/App.R
@@ -168,8 +168,6 @@ tumor_index_list$tcga_msi = tcga_gtex %>%
   dplyr::select(Barcode, sample) %>%
   dplyr::inner_join(tumor_index_list$tcga_msi, by = "Barcode")
 
-# Help → ID reference
-id_merge = load_data("pancan_identifier_help")
 
 
 
@@ -202,6 +200,55 @@ PW_meta <- PW_meta %>%
 
 
 
+# Display labels for the sample code types, keyed by their abbreviation.
+code_types <- list(
+  "NT" = "NT (normal tissue)",
+  "TP" = "TP (primary tumor)",
+  "TR" = "TR (recurrent tumor)",
+  "TB" = "TB (blood derived tumor)",
+  "TAP" = "TAP (additional primary)",
+  "TM" = "TM (metastatic tumor)",
+  "TAM" = "TAM (additional metastatic)"
+)
+
+# Help → ID reference
+id_merge <- load_data("pancan_identifier_help")
+
+# For every level-2 data type: the candidate identifiers (`all`) and the
+# `default` one. Built inside local() so the helpers stay private.
+id_list <- local({
+  entry <- function(all, default) list(all = all, default = default)
+  # Sorted cell types available for one immune infiltration method.
+  til_entry <- function(method, default) {
+    entry(sort(TIL_meta$celltype[TIL_meta$method == method]), default)
+  }
+  # Sorted pathway names available for one pathway collection.
+  pw_entry <- function(type, default) {
+    entry(sort(PW_meta$Name[PW_meta$Type == type]), default)
+  }
+  gene_ids <- pancan_identifiers$gene
+
+  list(
+    # Molecular profiles
+    `mRNA Expression` = entry(gene_ids, "TP53"),
+    `Transcript Expression` = entry(load_data("transcript_identifier"), "ENST00000000233"),
+    `DNA Methylation` = entry(gene_ids, "TP53"),
+    `Protein Expression` = entry(pancan_identifiers$protein, "P53"),
+    `miRNA Expression` = entry(pancan_identifiers$miRNA, "hsa-miR-769-3p"),
+    `Mutation status` = entry(gene_ids, "TP53"),
+    `Copy Number Variation` = entry(gene_ids, "TP53"),
+
+    # Tumor indexes: identifiers are taken from fixed column positions
+    `Tumor Purity` = entry(colnames(tumor_index_list$tcga_purity)[3:7], "ESTIMATE"),
+    `Tumor Stemness` = entry(colnames(tumor_index_list$tcga_stemness)[2:6], "RNAss"),
+    `Tumor Mutation Burden` = entry(colnames(tumor_index_list$tcga_tmb)[4:5], "Non_silent_per_Mb"),
+    `Microsatellite Instability` = entry(colnames(tumor_index_list$tcga_msi)[3:21], "Total_nb_MSI_events"),
+    `Genome Instability` = entry(colnames(tumor_index_list$tcga_genome_instability)[2:6], "ploidy"),
+
+    # Immune infiltration methods
+    `CIBERSORT` = til_entry("CIBERSORT", "Monocyte"),
+    `CIBERSORT-ABS` = til_entry("CIBERSORT-ABS", "Monocyte"),
+    `EPIC` = til_entry("EPIC", "Macrophage"),
+    `MCPCOUNTER` = til_entry("MCPCOUNTER", "Monocyte"),
+    `QUANTISEQ` = til_entry("QUANTISEQ", "Monocyte"),
+    `TIMER` = til_entry("TIMER", "Macrophage"),
+    `XCELL` = til_entry("XCELL", "Monocyte"),
+
+    # Pathway collections
+    `HALLMARK` = pw_entry("HALLMARK", "APOPTOSIS"),
+    `KEGG` = pw_entry("KEGG", "CELL_CYCLE"),
+    `IOBR` = pw_entry("IOBR", "Biotin_Metabolism")
+  )
+})
+
+# Level-2 data types grouped under their level-1 category.
+id_category <- list(
+  `Molecular_profile` = as.list(c(
+    "mRNA Expression", "Transcript Expression", "DNA Methylation",
+    "Protein Expression", "miRNA Expression", "Mutation status",
+    "Copy Number Variation"
+  )),
+  `Tumor_index` = as.list(c(
+    "Tumor Purity", "Tumor Stemness", "Tumor Mutation Burden",
+    "Microsatellite Instability", "Genome Instability"
+  )),
+  `Immune_Infiltration` = as.list(c(
+    "CIBERSORT", "CIBERSORT-ABS", "EPIC", "MCPCOUNTER",
+    "QUANTISEQ", "TIMER", "XCELL"
+  )),
+  `Pathway_activity` = as.list(c("HALLMARK", "KEGG", "IOBR")),
+  `Custom_metadata` = as.list("Custom_metadata")
+)
+
+
 # CCLE tissues for drug analysis
 # "ALL" means all tissues
 ccle_drug_related_tissues <- c(
@@ -241,13 +288,13 @@ mycolor <- c(RColorBrewer::brewer.pal(12, "Paired"))
 
 # Put modules here --------------------------------------------------------
 modules_path <- system.file("shinyapp", "modules", package = "UCSCXenaShiny", mustWork = TRUE)
-modules_file <- dir(modules_path, pattern = "\\.R$", full.names = TRUE)
+modules_file <- dir(modules_path, pattern = "\\.R$", full.names = TRUE, recursive = TRUE)
 sapply(modules_file, function(x, y) source(x, local = y), y = environment())
 
 
 # Put page UIs here -----------------------------------------------------
 pages_path <- system.file("shinyapp", "ui", package = "UCSCXenaShiny", mustWork = TRUE)
-pages_file <- dir(pages_path, pattern = "\\.R$", full.names = TRUE)
+pages_file <- dir(pages_path, pattern = "\\.R$", full.names = TRUE, recursive = TRUE)
 sapply(pages_file, function(x, y) source(x, local = y), y = environment())
 
 
@@ -343,8 +390,11 @@ ui <- tagList(
     ui.page_home(),
     ui.page_repository(),
     ui.page_general_analysis(),
-    ui.page_pancan(),
-    ui.page_pancan2(),
+    ui.page_pancan_tcga(),
+    ui.page_pancan_pcawg(),
+    ui.page_pancan_ccle(),
+    ui.page_pancan_quick(),
+    ui.page_download(),
     #ui.page_global(),
     ui.page_help(),
     ui.page_developers(),

diff --git a/inst/shinyapp/helper/batch_ids.md b/inst/shinyapp/helper/batch_ids.md
@@ -0,0 +1,29 @@
+**Here, you can provide multiple IDs in 3 ways:**
+
+**(1) Selection**
+
+- You can select ids one by one or choose all ids under the sub data type;
+
+<br>
+
+**(2) All**
+
+- You can directly select all ids  under the sub data type;
+
+<br>
+
+**(3) File**
+
+- You can also upload a file (.txt) of valid ids for quick selection;
+- By default, some random ids are preselected in this mode.
+
+<br>
+
+<br>
+
+> Note: For molecular profile, up to 100 ids are allowed to be selected or uploaded.
+
+
+
+
+
diff --git a/inst/shinyapp/helper/choose_samples.md b/inst/shinyapp/helper/choose_samples.md
@@ -1,6 +1,16 @@
-#### 1. filter by multi-condition
+**Here, you can select a subset of samples in 2 ways.**
 
-You can add  one or more filters from any data type or phenotype that we provide. 
+
+
+**(1) Quick filter**
+
+You can directly filter by code type, e.g. keep only TP or NT samples.
+
+
+
+**(2) Exact filter**
+
+You can apply a detailed filter using multiple conditions on different phenotypes.
 
 - For **character** class, you can set "+" or "-" for retain or discard;
 
@@ -10,12 +20,6 @@ You can add  one or more filters from any data type or phenotype that we provide
 
 
 
-#### 2. filter by code type
-
-You can also directly filter by code type.
-
-It is important because we sometimes only want to analysis for tumor samples.
-
 
 
-> By default, we will retain all samples under the cancer types and then consider the intersection space for above 2 operations.
+> By default, it will retain all samples under the cancer type(s) and then take the intersection of the results of the above 2 operations.
diff --git a/inst/shinyapp/helper/data_origin.md b/inst/shinyapp/helper/data_origin.md
@@ -2,6 +2,8 @@ Here, you can change the default data origin for **molecular profile** data type
 
 Up to now, we support DNA methylation and CNV data adjustment:
 
+<br>
+
 #### 3. DNA methylation
 
 (1) You can choose the 450K or 27K array;
@@ -10,16 +12,14 @@ Up to now, we support DNA methylation and CNV data adjustment:
 
 (3) You can limit the CpG sites of one gene.
 
-
+<br>
 
 #### 7. Copy Number Variation
 
 (1) You can set whether to use thresholded (integer) data.
 
 
 
-> Note: One you click the button, it would restore the initial settings.
-
 
 
 

diff --git a/inst/shinyapp/helper/set_groups.md b/inst/shinyapp/helper/set_groups.md
@@ -1,15 +1,15 @@
 Firstly, Select a phenotype to use as the basis for grouping in above step.
 
+<br>
 
-
-#### For character class:
+#### For **character** class:
 
 - You should assign one or multiple categories for either group;
 - The two groups should have independent categories.
 
+<br>
 
-
-#### For numeric class:
+#### For **numeric** class:
 
 - You should set the range of either group;
 - The range is left close and right open [min, max);