Skip to content

Commit

Permalink
add TCGA batch analysis module (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
lishensuo authored Nov 18, 2023
1 parent bcddced commit 5bd9191
Show file tree
Hide file tree
Showing 54 changed files with 2,126 additions and 1,131 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ inst/extdata/transcript_identifier.rda
inst/extdata/toil_sig_score.rda
inst/extdata/tcga_PW.rda
inst/extdata/tcga_PW_meta.rda
inst/extdata/pancan_identifier_help


^doc$
Expand Down
94 changes: 94 additions & 0 deletions R/batch_func.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#' Download data for shiny batch analysis
#'
#' Collects the values of one identifier (`L3`) from the data source selected
#' by `L1`/`L2` and returns them in a common long format.
#'
#' @param L1 level 1 main datatype. One of `"Molecular_profile"`,
#'   `"Tumor_index"`, `"Immune_Infiltration"`, `"Pathway_activity"` or
#'   `"Custom_metadata"`; any other value raises an error.
#' @param L2 level 2 sub datatype
#' @param L3 level 3 identifier
#' @param tumor_index_list Tumor index data. See shiny App.R file
#' @param tcga_TIL Tumor immune infiltration data. See shiny App.R file
#' @param tcga_PW Pathway activity data. See shiny App.R file
#' @param opt_pancan molecular datasets parameters
#' @param custom_metadata user customized metadata. When `NULL` and
#'   `L1 == "Custom_metadata"`, a demo table with random columns `TF1`..`TF5`
#'   is generated.
#'
#' @return A data.frame with the columns `id`, `sample`, `value` and `level2`.
#'
batch_download <- function(L1, L2, L3,
                           tumor_index_list, tcga_TIL, tcga_PW,
                           opt_pancan, custom_metadata = NULL) {
  if (L1 == "Molecular_profile") {
    # L2 = "mRNA Expression"
    # L3 = "TP53"
    # Translate the display name into the data_type passed to
    # query_pancan_value(); fail loudly instead of passing NULL downstream.
    x_genomic_profile <- switch(L2,
      `mRNA Expression` = "mRNA",
      `Transcript Expression` = "transcript",
      `DNA Methylation` = "methylation",
      `Protein Expression` = "protein",
      `miRNA Expression` = "miRNA",
      `Mutation status` = "mutation",
      `Copy Number Variation` = "cnv",
      stop("Unsupported L2 for Molecular_profile: ", L2, call. = FALSE)
    )
    x_data <- query_pancan_value(L3,
      data_type = x_genomic_profile,
      opt_pancan = opt_pancan
    )
    # Some queries return a list; the values are taken from its first element.
    if (is.list(x_data)) x_data <- x_data[[1]]
    x_data <- data.frame(
      id = L3,
      sample = names(x_data), value = as.numeric(x_data),
      level2 = L2
    )
  } else if (L1 == "Tumor_index") {
    # L2 = "Tumor Purity"
    # L3 = L3_candi$id_tumor_index$tcga_purity$Level3[1]
    # Translate the display name into the element name of tumor_index_list.
    x_tumor_index <- switch(L2,
      `Tumor Purity` = "tcga_purity",
      `Tumor Stemness` = "tcga_stemness",
      `Tumor Mutation Burden` = "tcga_tmb",
      `Microsatellite Instability` = "tcga_msi",
      `Genome Instability` = "tcga_genome_instability",
      stop("Unsupported L2 for Tumor_index: ", L2, call. = FALSE)
    )
    x_data <- tumor_index_list[[x_tumor_index]][, c("sample", L3)]
    colnames(x_data)[2] <- "value"
    x_data <- x_data %>%
      dplyr::mutate(id = L3, .before = 1) %>%
      dplyr::mutate(level2 = L2) %>%
      dplyr::filter(!is.na(.data$value))
  } else if (L1 == "Immune_Infiltration") {
    # L2 = "CIBERSORT"
    # L3 = L3_candi$id_TIL$CIBERSORT$Level3[1]
    # Value columns of tcga_TIL are named "<L3>_<L2>"; the "cell_type" column
    # is relabelled as the sample column below.
    x_data <- tcga_TIL[, c("cell_type", paste0(L3, "_", L2))]
    colnames(x_data) <- c("sample", "value")
    x_data <- x_data %>%
      dplyr::mutate(id = L3, .before = 1) %>%
      dplyr::mutate(level2 = L2) %>%
      dplyr::filter(!is.na(.data$value))
  } else if (L1 == "Pathway_activity") {
    # L2 = "HALLMARK"
    # L3 = L3_candi$id_PW$HALLMARK$Level3[1]
    # Columns of tcga_PW are named "<L2>_<L3>"; samples are the row names.
    x_data <- tcga_PW[, paste0(L2, "_", L3), drop = FALSE]
    colnames(x_data) <- "value"
    x_data <- x_data %>%
      as.data.frame() %>%
      tibble::rownames_to_column("sample") %>%
      dplyr::mutate(id = L3, .before = 1) %>%
      dplyr::mutate(level2 = L2) %>%
      dplyr::filter(!is.na(.data$value))
  } else if (L1 == "Custom_metadata") {
    if (is.null(custom_metadata)) {
      # No user table supplied: build a reproducible demo table with five
      # random score columns (TF1..TF5).
      # NOTE(review): set.seed() also resets the caller's RNG state.
      set.seed(42)
      sp_info <- query_tcga_group()$data[, "Sample"]
      scores <- matrix(
        stats::rnorm(nrow(sp_info) * 5, mean = 1, sd = 1),
        ncol = 5
      ) %>% as.data.frame()
      colnames(scores) <- paste0("TF", 1:5)
      custom_metadata <- cbind(sp_info, scores)
    }
    x_data <- custom_metadata[, c("Sample", L3)]
    colnames(x_data) <- c("sample", "value")
    x_data <- x_data %>%
      as.data.frame() %>%
      dplyr::mutate(id = L3, .before = 1) %>%
      dplyr::mutate(level2 = L2) %>%
      dplyr::filter(!is.na(.data$value))
  } else {
    # Previously an unknown L1 fell through and failed with
    # "object 'x_data' not found".
    stop("Unsupported L1: ", L1, call. = FALSE)
  }
  x_data
  #     id                  sample value          level2
  # 1 TP53 GTEX-S4Q7-0003-SM-3NM8M 4.785 mRNA Expression
  # 2 TP53         TCGA-19-1787-01 5.887 mRNA Expression
  # 3 TP53         TCGA-S9-A7J2-01 5.517 mRNA Expression
  # 4 TP53 GTEX-QV31-1626-SM-2S1QC 4.431 mRNA Expression
  # 5 TP53         TCGA-G3-A3CH-11 2.382 mRNA Expression
  # 6 TP53         TCGA-B5-A5OE-01 5.765 mRNA Expression
}
4 changes: 2 additions & 2 deletions R/get_pancan_value.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ try_query_value <- function(host, dataset,
xe = UCSCXenaTools::XenaQueryProbeMap(UCSCXenaTools::XenaGenerate(subset = XenaDatasets == dataset))
xd = UCSCXenaTools::XenaPrepare(UCSCXenaTools::XenaDownload(xe), col_names = FALSE)[, c(1, 2)]
xd = tidyr::separate_rows(xd, "X2", sep = ",")
xd = dplyr::filter(xd, X2 %in% identifiers)
xd = dplyr::filter(xd, .data$X2 %in% identifiers)

if (!is.null(rule_out)) {
xd = dplyr::filter(xd, !X1 %in% rule_out) # X2 → X1
xd = dplyr::filter(xd, !.data$X1 %in% rule_out) # X2 → X1
}
ids = xd$X1

Expand Down
20 changes: 10 additions & 10 deletions R/query_tcga_group.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,44 +149,44 @@ query_tcga_group = function(cancer=NULL,
# 6种过滤方式
if (filter_by_L3=="+"){ #保留
meta_data_sub %>%
dplyr::filter(.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull(Sample)
dplyr::filter(.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull('Sample')
} else if (filter_by_L3=="-"){ #剔除
meta_data_sub %>%
dplyr::filter(!.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull(Sample)
dplyr::filter(!.data[[filter_by_L1]] %in% filter_by_L2) %>% dplyr::pull('Sample')
} else if (filter_by_L3==">"){ #大于 绝对值
filter_by_L2 = as.numeric(filter_by_L2)
meta_data_sub %>%
dplyr::filter(.data[[filter_by_L1]] > filter_by_L2) %>% dplyr::pull(Sample)
dplyr::filter(.data[[filter_by_L1]] > filter_by_L2) %>% dplyr::pull('Sample')
} else if (filter_by_L3=="%>"){ #大于 分位数
filter_by_L2 = as.numeric(filter_by_L2)
meta_data_sub %>%
dplyr::group_by(Cancer) %>%
dplyr::group_by("Cancer") %>%
dplyr::filter(.data[[filter_by_L1]] >
quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull(Sample)
quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull('Sample')
} else if (filter_by_L3=="<"){ #小于 绝对值
filter_by_L2 = as.numeric(filter_by_L2)
meta_data_sub %>%
dplyr::filter(.data[[filter_by_L1]] < filter_by_L2) %>% dplyr::pull(Sample)
dplyr::filter(.data[[filter_by_L1]] < filter_by_L2) %>% dplyr::pull('Sample')
} else if (filter_by_L3=="%<"){#小于 分位数
filter_by_L2 = as.numeric(filter_by_L2)
meta_data_sub %>%
dplyr::group_by(Cancer) %>%
dplyr::group_by("Cancer") %>%
dplyr::filter(.data[[filter_by_L1]] <
quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull(Sample)
quantile(.data[[filter_by_L1]],filter_by_L2,na.rm=T)) %>% dplyr::pull('Sample')
}
}) %>% unlist()

# 统计频数,保留符合全部条件的样本
Samples_freq = table(Samples_retain)
meta_data_sub = meta_data_sub %>%
dplyr::filter(Sample %in% names(Samples_freq)[Samples_freq==length(filter_by)])
dplyr::filter(.data$Sample %in% names(Samples_freq)[Samples_freq==length(filter_by)])
}


# step3-2: filter by sample id
if(!is.null(filter_id)){
meta_data_sub = meta_data_sub %>%
dplyr::filter(Sample %in% filter_id)
dplyr::filter(.data$Sample %in% filter_id)
}


Expand Down
62 changes: 56 additions & 6 deletions inst/shinyapp/App.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,6 @@ tumor_index_list$tcga_msi = tcga_gtex %>%
dplyr::select(Barcode, sample) %>%
dplyr::inner_join(tumor_index_list$tcga_msi, by = "Barcode")

# Help → ID reference
id_merge = load_data("pancan_identifier_help")



Expand Down Expand Up @@ -202,6 +200,55 @@ PW_meta <- PW_meta %>%



# Sample type codes mapped to the labels shown for them
# (code followed by a description in parentheses).
code_types = list("NT"= "NT (normal tissue)",
"TP"= "TP (primary tumor)",
"TR"= "TR (recurrent tumor)",
"TB"= "TB (blood derived tumor)",
"TAP"="TAP (additional primary)",
"TM"= "TM (metastatic tumor)",
"TAM"="TAM (additional metastatic)")

# Help → ID reference
id_merge = load_data("pancan_identifier_help")

# Identifier catalogue keyed by level-2 (sub data type) name.
# Each entry holds `all` (every selectable identifier) and `default`
# (the identifier used as the initial selection).
# The names match the L2 values handled by batch_download().
id_list = list(
# Molecular profiles: identifiers come from pancan_identifiers / package data.
`mRNA Expression` = list(all = pancan_identifiers$gene, default = "TP53"),
`Transcript Expression` = list(all = load_data("transcript_identifier"), default = "ENST00000000233"),
`DNA Methylation` = list(all = pancan_identifiers$gene, default = "TP53"),
`Protein Expression` = list(all = pancan_identifiers$protein, default = "P53"),
`miRNA Expression` = list(all = pancan_identifiers$miRNA, default = "hsa-miR-769-3p"),
`Mutation status` = list(all = pancan_identifiers$gene, default = "TP53"),
`Copy Number Variation` = list(all = pancan_identifiers$gene, default = "TP53"),

# Tumor indexes: identifiers are column names picked by position.
# NOTE(review): the index ranges depend on the column order of each
# tumor_index_list table — confirm if those tables change.
`Tumor Purity` = list(all = colnames(tumor_index_list$tcga_purity)[3:7], default = "ESTIMATE"),
`Tumor Stemness` = list(all = colnames(tumor_index_list$tcga_stemness)[2:6], default = "RNAss"),
`Tumor Mutation Burden` = list(all = colnames(tumor_index_list$tcga_tmb)[4:5], default = "Non_silent_per_Mb"),
`Microsatellite Instability` = list(all = colnames(tumor_index_list$tcga_msi)[3:21], default = "Total_nb_MSI_events"),
`Genome Instability` = list(all = colnames(tumor_index_list$tcga_genome_instability)[2:6], default = "ploidy"),

# Immune infiltration: sorted cell types available for each method in TIL_meta.
`CIBERSORT` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="CIBERSORT"]), default = "Monocyte"),
`CIBERSORT-ABS` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="CIBERSORT-ABS"]), default = "Monocyte"),
`EPIC` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="EPIC"]), default = "Macrophage"),
`MCPCOUNTER` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="MCPCOUNTER"]), default = "Monocyte"),
`QUANTISEQ` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="QUANTISEQ"]), default = "Monocyte"),
`TIMER` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="TIMER"]), default = "Macrophage"),
`XCELL` = list(all = sort(TIL_meta$celltype[TIL_meta$method=="XCELL"]), default = "Monocyte"),

# Pathway activity: sorted pathway names available for each type in PW_meta.
`HALLMARK` = list(all = sort(PW_meta$Name[PW_meta$Type=="HALLMARK"]), default = "APOPTOSIS"),
`KEGG` = list(all = sort(PW_meta$Name[PW_meta$Type=="KEGG"]), default = "CELL_CYCLE"),
`IOBR` = list(all = sort(PW_meta$Name[PW_meta$Type=="IOBR"]), default = "Biotin_Metabolism")
)
# Level-1 category -> the level-2 names that belong to it.
# The category names match the L1 values handled by batch_download().
id_category = list(
`Molecular_profile` = list("mRNA Expression", "Transcript Expression", "DNA Methylation",
"Protein Expression", "miRNA Expression", "Mutation status","Copy Number Variation"),
`Tumor_index` = list("Tumor Purity","Tumor Stemness","Tumor Mutation Burden",
"Microsatellite Instability","Genome Instability"),
`Immune_Infiltration`=list("CIBERSORT", "CIBERSORT-ABS", "EPIC", "MCPCOUNTER",
"QUANTISEQ", "TIMER", "XCELL"),
`Pathway_activity` = list("HALLMARK","KEGG","IOBR"),
`Custom_metadata` = list("Custom_metadata")
)

# CCLE tissues for drug analysis
# "ALL" means all tissues
ccle_drug_related_tissues <- c(
Expand Down Expand Up @@ -241,13 +288,13 @@ mycolor <- c(RColorBrewer::brewer.pal(12, "Paired"))

# Put modules here --------------------------------------------------------
modules_path <- system.file("shinyapp", "modules", package = "UCSCXenaShiny", mustWork = TRUE)
modules_file <- dir(modules_path, pattern = "\\.R$", full.names = TRUE)
modules_file <- dir(modules_path, pattern = "\\.R$", full.names = TRUE, recursive = TRUE)
sapply(modules_file, function(x, y) source(x, local = y), y = environment())


# Put page UIs here -----------------------------------------------------
pages_path <- system.file("shinyapp", "ui", package = "UCSCXenaShiny", mustWork = TRUE)
pages_file <- dir(pages_path, pattern = "\\.R$", full.names = TRUE)
pages_file <- dir(pages_path, pattern = "\\.R$", full.names = TRUE, recursive = TRUE)
sapply(pages_file, function(x, y) source(x, local = y), y = environment())


Expand Down Expand Up @@ -343,8 +390,11 @@ ui <- tagList(
ui.page_home(),
ui.page_repository(),
ui.page_general_analysis(),
ui.page_pancan(),
ui.page_pancan2(),
ui.page_pancan_tcga(),
ui.page_pancan_pcawg(),
ui.page_pancan_ccle(),
ui.page_pancan_quick(),
ui.page_download(),
#ui.page_global(),
ui.page_help(),
ui.page_developers(),
Expand Down
29 changes: 29 additions & 0 deletions inst/shinyapp/helper/batch_ids.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
**Here, you can provide multiple IDs in 3 ways:**

**(1) Selection**

- You can select ids one by one or choose all ids under the sub data type;

<br>

**(2) All**

- You can directly select all ids under the sub data type;

<br>

**(3) File**

- You can also upload a file of valid ids (.txt) for quick selection;
- By default, some random ids are pre-selected in this mode.

<br>

<br>

> Note: For molecular profile, up to 100 ids are allowed to be selected or uploaded.




22 changes: 13 additions & 9 deletions inst/shinyapp/helper/choose_samples.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
#### 1. filter by multi-condition
**Here, you can select a subset of samples in 2 ways.**

You can add one or more filters from any data type or phenotype that we provide.


**(1)quick filter**

You can also directly filter by code type, such as TP, NT samples.



**(2)exact filter**

You can apply a detailed filter using multiple conditions on different phenotypes.

- For **character** class, you can set "+" or "-" to retain or discard;

Expand All @@ -10,12 +20,6 @@ You can add one or more filters from any data type or phenotype that we provide



#### 2. filter by code type

You can also directly filter by code type.

It is important because we sometimes only want to analysis for tumor samples.



> By default, we will retain all samples under the cancer types and then consider the intersection space for above 2 operations.
> By default, it will retain all samples under the cancer type(s) and then take the intersection of the above 2 operations.
6 changes: 3 additions & 3 deletions inst/shinyapp/helper/data_origin.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ Here, you can change the default data origin for **molecular profile** data type

Up to now, we support DNA methylation and CNV data adjustment:

<br>

#### 3. DNA methylation

(1) You can choose the 450K or 27K array;
Expand All @@ -10,16 +12,14 @@ Up to now, we support DNA methylation and CNV data adjustment:

(3) You can limit the CpG sites of one gene.


<br>

#### 7. Copy Number Variation

(1) You can set whether to use thresholded (integer) data.



> Note: Once you click the button, it will restore the initial settings.



Expand Down
8 changes: 4 additions & 4 deletions inst/shinyapp/helper/set_groups.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
First, select a phenotype to use as the basis for grouping in the step above.

<br>


#### For character class:
#### For **character** class:

- You should assign one or multiple categories for either group;
- The two groups should have independent categories.

<br>


#### For numeric class:
#### For **numeric** class:

- You should set the range of either group;
- The range is left-closed and right-open [min, max);
Expand Down
Loading

0 comments on commit 5bd9191

Please sign in to comment.