diff --git a/NAMESPACE b/NAMESPACE index 25c69b4..66f4ede 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ export(editMiaTaxaNames) export(filterTaxa) export(get_direction_cols) export(get_meth_class) -export(lefser2) export(norm_clr) export(norm_tss) export(plot_enrichment) @@ -19,17 +18,5 @@ export(run_DA) export(set_DA_methods_list) export(set_norm_list) export(taxize_classification_to_taxonomy_table) -import(SummarizedExperiment) -importFrom(MASS,lda) -importFrom(coin,pvalue) -importFrom(coin,statistic) -importFrom(coin,wilcox_test) importFrom(dplyr,bind_rows) importFrom(magrittr,"%>%") -importFrom(methods,as) -importFrom(methods,is) -importFrom(stats,kruskal.test) -importFrom(stats,reorder) -importFrom(stats,rnorm) -importFrom(stats,setNames) -importFrom(utils,tail) diff --git a/R/lefser.R b/R/lefser.R deleted file mode 100644 index 1723fb1..0000000 --- a/R/lefser.R +++ /dev/null @@ -1,175 +0,0 @@ - -## This code is borrowed from the lefser package -## I need it here because there are some non-exported functions, -## necessary for lefser2 to work. - - -fillPmatZmat <- function(group, - block, - expr_sub, - p.threshold) -{ - # creates a list of boolean vectors, each vector indicates - # existance (TRUE) or absence (FALSE) of a class/sub-class combination - combos <- apply( - expand.grid(levels(group), levels(block)), 1L, paste0, collapse = "") - combined <- paste0(as.character(group), as.character(block)) - logilist <- lapply(setNames(nm = sort(combos)), `==`, combined) - - ## uses Wilcoxon rank-sum test to test for significant differential abundances between - ## subclasses of one class against subclasses of all othe classes; results are saved in - ## "pval_mat" and "z_mat" matrices - whichlist <- lapply(logilist, which) - sblock <- seq_along(levels(block)) - texp_sub <- t(expr_sub) - iters <- expand.grid(sblock, sblock + length(sblock)) - group_formats <- apply(iters, 1L, function(x) { - ind <- unlist(whichlist[x]) - apply(texp_sub, 2L, function(g) { - wx <- suppressWarnings(coin::wilcox_test(g ~ group, subset = ind)) - cbind.data.frame( - p.value = coin::pvalue(wx), statistic = coin::statistic(wx) - ) - }) - }) - - res <- lapply(group_formats, function(x) do.call(rbind, x)) - pval_mat <- do.call(cbind, lapply(res, `[[`, "p.value")) - z_mat <- do.call(cbind, lapply(res, `[[`, "statistic")) - - rownames(pval_mat) <- rownames(expr_sub) - rownames(z_mat) <- rownames(expr_sub) - - ## converts "pval_mat" into boolean matrix "logical_pval_mat" where - ## p-values <= wilcoxon.threshold - logical_pval_mat <- pval_mat <= p.threshold * 2.0 - logical_pval_mat[is.na(logical_pval_mat)] <- FALSE - - ## determines which rows (features) have all p-values<=0.05 - ## and selects such rows from the matrix of z-statistics - sub <- apply(logical_pval_mat, 1L, all) - z_mat_sub <- z_mat[sub, , drop = FALSE] - # confirms that z-statistics of a row all have the same sign - sub <- abs(rowSums(z_mat_sub)) == rowSums(abs(z_mat_sub)) - expr_sub[names(sub[sub]), , drop = FALSE] -} - -## ensures that more than half of the values in each for each feature are unique -## if that is not the case then a count value is altered by adding it to a small value -## generated via normal distribution with mean=0 and sd=5% of the count value -createUniqueValues <- function(df, group){ - orderedrows <- rownames(df) - splitdf <- split(df, group) - maxim <- vapply(table(group), function(x) max(x * 0.5, 4), numeric(1L)) - for (i in seq_along(splitdf)) { - sdat <- splitdf[[i]] - splitdf[[i]][] <- lapply(sdat, function(cols) { - if (length(unique(cols)) > maxim[i]) - cols - else - abs(cols + rnorm( - length(cols), mean = 0, sd = max(cols * 0.05, 0.01)) - ) - }) - } - df <- do.call(rbind, unname(splitdf)) - df[match(orderedrows, rownames(df)),, drop = FALSE] -} - -contastWithinClassesOrFewPerClass <- - function(expr_sub_t_df, rand_s, min_cl, ncl, groups) { - cols <- expr_sub_t_df[rand_s, , drop = FALSE] - cls <- expr_sub_t_df$class[rand_s] - # if the number of classes is less than the actual number (typically two) - # of classes in the dataframe then return TRUE - if (length(unique(cls)) < ncl) { - return (TRUE) - } - # detect if for each class there are not fewer than the minimum (min_cl) number of samples - if (any(table(cls) < min_cl)) { - return (TRUE) - } - # separate the randomly selected samples (cols) into a list of the two classes - drops <- c("class") - by_class <- - lapply(seq_along(groups), function(x) { - cols[cols[, "class"] == groups[x],!(names(cols) %in% drops)] - }) - - # makes sure that within each class all features have at least min_cl unique count values - for (i in seq_along(groups)) { - unique_counts_per_microb = apply(by_class[[i]], 2, function(x) { - length(unique(x)) - }) - if ((any(unique_counts_per_microb <= min_cl) & - min_cl > 1) | - (min_cl == 1 & any(unique_counts_per_microb <= 1))) { - return (TRUE) - } - } - return (FALSE) - - } - -ldaFunction <- function (data, lfk, rfk, min_cl, ncl, groups) { - # test 1000 samples for contrast within classes per feature - # and that there is at least a minimum number of samples per class - for (j in 1:1000) { - rand_s <- sample(seq_len(lfk), rfk, replace = TRUE) - if (!contastWithinClassesOrFewPerClass(data, rand_s, min_cl, ncl, groups)) { - break - } - } - # lda with rfk number of samples - lda.fit <- MASS::lda(class ~ ., data = data, subset = rand_s) - # coefficients that transform observations to discriminants - w <- lda.fit$scaling[, 1] - # scaling of lda coefficients - w.unit <- w / sqrt(sum(w ^ 2)) - sub_d <- data[rand_s,] - ss <- sub_d[,-match("class", colnames(sub_d))] - xy.matrix <- as.matrix(ss) - # the original matrix is transformed - LD <- xy.matrix %*% w.unit - # effect size is calculated as difference between averaged disciminants - # of two classes - effect_size <- - abs(mean(LD[sub_d[, "class"] == 0]) - mean(LD[sub_d[, "class"] == 1])) - # scaling lda coefficients by the efect size - scal <- w.unit * effect_size - # mean count values per fclass per feature - rres <- lda.fit$means - rowns <- rownames(rres) - lenc <- length(colnames(rres)) - - coeff <- vector("numeric", length(scal)) - for (v in seq_along(scal)) { - if (!is.na(scal[v])) { - coeff[v] <- abs(scal[v]) - } else{ - coeff[v] <- 0 - } - - } - # count value differences between means of two classes for each feature - lda.means.diff <- (lda.fit$means[2,] - lda.fit$means[1,]) - # difference between a feature's class means and effect size adjusted lda coefficient - # are averaged for each feature - (lda.means.diff + coeff) / 2 -} - -.numeric01 <- function(x) { - x <- as.factor(x) - uvals <- levels(x) - ifelse(x == uvals[1L], 0L, 1L) -} - -.trunc <- function(scores_df, trim.names){ - Names <- gsub("`", "", scores_df[["Names"]]) - if (trim.names) { - listNames <- strsplit(Names, "\\||\\.") - Names <- vapply(listNames, tail, character(1L), 1L) - } - scores_df[["Names"]] <- Names - return(scores_df) -} diff --git a/R/lefser2.R b/R/lefser2.R deleted file mode 100644 index 683be35..0000000 --- a/R/lefser2.R +++ /dev/null @@ -1,154 +0,0 @@ - -## This is an adaptation of a couple of functions from the lefser package. -## This was done in order to include raw p-values from the kruskal test. - -filterKruskal2 <- function(expr, group, p.value) { - - kw.res.pvalues <- apply(expr, 1L, function(x) { - stats::kruskal.test(x ~ group)[["p.value"]] - }) - - kw.res.pvalues <- kw.res.pvalues[kw.res.pvalues <= p.value] - kw.res.pvalues <- kw.res.pvalues[!is.na(kw.res.pvalues)] - kw.res.pvalues <- as.data.frame(kw.res.pvalues) - colnames(kw.res.pvalues) <- "kw.pval" - - taxa <- rownames(kw.res.pvalues) - filtered_matrix <- expr[taxa, ] - - list(pvalues = kw.res.pvalues, submatrix = filtered_matrix) - -} - -#' R implementation of the LEfSe method 2 -#' -#' \code{lefser2} is an adaptation from the -#' \code{\link[lefser]{lefser}} function. This adaptation allows the inclusion -#' of the raw p-values of the KW test on the output. -#' -#' @param expr A \code{\linkS4class{SummarizedExperiment}} with expression data. -#' @param kruskal.threshold numeric(1) The p-value for the Kruskal-Wallis Rank -#' Sum Test (default 0.05). -#' @param wilcox.threshold numeric(1) The p-value for the Wilcoxon Rank-Sum Test -#' when 'blockCol' is present (default 0.05). -#' @param lda.threshold numeric(1) The effect size threshold (default 2.0). -#' @param groupCol character(1) Column name in `colData(expr)` indicating -#' groups, usually a factor with two levels (e.g., `c("cases", "controls")`; -#' default "GROUP"). -#' @param blockCol character(1) Optional column name in `colData(expr)` -#' indicating the blocks, usually a factor with two levels (e.g., -#' `c("adult", "senior")`; default NULL). -#' @param assay The i-th assay matrix in the `SummarizedExperiment` ('expr'; -#' default 1). -#' @param trim.names If `TRUE` extracts the most specific taxonomic rank of -#' organism. -#' @param log If TRUE, matrix is exponentiated (exp). Default is FALSE. -#' @return -#' The function returns a data frame with three columns, which are -#' names of microorganisms, their LDA scores, and kruskal wallis p-values. -#' -#' @importFrom stats kruskal.test reorder rnorm -#' @importFrom coin pvalue statistic wilcox_test -#' @importFrom MASS lda -#' @importFrom methods as is -#' @importFrom stats setNames -#' @importFrom utils tail -#' @import SummarizedExperiment -#' -#' @export -#' -lefser2 <- - function(expr, - kruskal.threshold = 0.05, - wilcox.threshold = 0.05, - lda.threshold = 2.0, - groupCol = "GROUP", - blockCol = NULL, - assay = 1L, - trim.names = FALSE, log = FALSE - ) - { - groupf <- colData(expr)[[groupCol]] - if (is.null(groupf)) - stop("A valid group assignment 'groupCol' must be provided") - groupf <- as.factor(groupf) - groupsf <- levels(groupf) - if (length(groupsf) != 2L) - stop( - "Group classification is not dichotomous:\n", - "Found (", paste(groupsf, collapse = ", "), ")" - ) - group <- .numeric01(groupf) - groups <- 0:1 - expr_data <- assay(expr, i = assay) - # expr_sub <- filterKruskal(expr_data, group, kruskal.threshold) - - ## Adding these lines - kw_res <- filterKruskal2(expr_data, group, kruskal.threshold) - kw_pval <- data.frame( - Names = rownames(kw_res[[1]]), kw_pvalues = kw_res[[1]][[1]] - ) - kw_pval <- .trunc(kw_pval, trim.names) - expr_sub <- kw_res[[2]] - ## Lines above were added - - if (!is.null(blockCol)) { - block <- as.factor(colData(expr)[[blockCol]]) - expr_sub <- fillPmatZmat(groupf, block, expr_sub, wilcox.threshold) - } - - # transposes matrix and add a "class" (i.e., group) column - # matrix converted to dataframe - - if (log) { - expr_sub_t <- t(exp(expr_sub)) ## added this line - } else { - expr_sub_t <- t(expr_sub) - } - expr_sub_t_df <- as.data.frame(expr_sub_t) - expr_sub_t_df <- createUniqueValues(expr_sub_t_df, groupf) - expr_sub_t_df <- cbind(expr_sub_t_df, class = group) - - # number of samples (i.e., subjects) in the dataframe - lfk <- nrow(expr_sub_t_df) - # rfk is the number of subject that will be used in linear discriminant analysis - rfk <- floor(lfk * 2 / 3) - # number of classes (two) - ncl <- length(groups) - # count samples in each class of the dataframe, select the number from the class with a smaller - # count of samples and multiply that number by 2/*2/3*0.5 - min_cl <- - as.integer(min(table(expr_sub_t_df$class)) * 2 / 3 * 2 / 3 * - 0.5) - # if min_cl is less than 1, then make it equal to 1 - min_cl <- max(min_cl, 1) - - # lda_fn repeated 30 times, producing a matrix of 30 scores per feature - eff_size_mat <- - replicate(30, suppressWarnings(ldaFunction( - expr_sub_t_df, lfk, rfk, min_cl, ncl, groups - )), simplify = TRUE) - - # mean of 30 scores per feature - raw_lda_scores <- rowMeans(eff_size_mat) - - # processing of score - processed_scores <- - sign(raw_lda_scores) * log((1 + abs(raw_lda_scores)), 10) - - # sorting of scores - processed_sorted_scores <- sort(processed_scores) - scores_df <- data.frame(Names = names(processed_sorted_scores), - scores = as.vector(processed_sorted_scores), - stringsAsFactors = FALSE) - - scores_df <- .trunc(scores_df, trim.names) - - threshold_scores <- abs(scores_df$scores) >= lda.threshold - - ## Adding a these lines - final_scores <- scores_df[threshold_scores, ] - which_names <- which(kw_pval[["Names"]] %in% final_scores[["Names"]]) - kw_pval_subset <- kw_pval[which_names, ] - merge(scores_df, kw_pval_subset, by = "Names") - } \ No newline at end of file diff --git a/man/lefser2.Rd b/man/lefser2.Rd deleted file mode 100644 index dbf48fc..0000000 --- a/man/lefser2.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lefser2.R -\name{lefser2} -\alias{lefser2} -\title{R implementation of the LEfSe method 2} -\usage{ -lefser2( - expr, - kruskal.threshold = 0.05, - wilcox.threshold = 0.05, - lda.threshold = 2, - groupCol = "GROUP", - blockCol = NULL, - assay = 1L, - trim.names = FALSE, - log = FALSE -) -} -\arguments{ -\item{expr}{A \code{\linkS4class{SummarizedExperiment}} with expression data.} - -\item{kruskal.threshold}{numeric(1) The p-value for the Kruskal-Wallis Rank -Sum Test (default 0.05).} - -\item{wilcox.threshold}{numeric(1) The p-value for the Wilcoxon Rank-Sum Test -when 'blockCol' is present (default 0.05).} - -\item{lda.threshold}{numeric(1) The effect size threshold (default 2.0).} - -\item{groupCol}{character(1) Column name in \code{colData(expr)} indicating -groups, usually a factor with two levels (e.g., \code{c("cases", "controls")}; -default "GROUP").} - -\item{blockCol}{character(1) Optional column name in \code{colData(expr)} -indicating the blocks, usually a factor with two levels (e.g., -\code{c("adult", "senior")}; default NULL).} - -\item{assay}{The i-th assay matrix in the \code{SummarizedExperiment} ('expr'; -default 1).} - -\item{trim.names}{If \code{TRUE} extracts the most specific taxonomic rank of -organism.} - -\item{log}{If TRUE, matrix is exponentiated (exp). Default is FALSE.} -} -\value{ -The function returns a data frame with three columns, which are -names of microorganisms, their LDA scores, and kruskal wallis p-values. -} -\description{ -\code{lefser2} is an adaptation from the -\code{\link[lefser]{lefser}} function. This adaptation allows the inclusion -of the raw p-values of the KW test on the output. -} diff --git a/vignettes/articles/Figure1.pdf b/vignettes/articles/Figure1.pdf index 70ceab7..d6d8562 100644 Binary files a/vignettes/articles/Figure1.pdf and b/vignettes/articles/Figure1.pdf differ