From 003b0ab685be00bbcec3516864445326de099d64 Mon Sep 17 00:00:00 2001 From: Lucille Delisle Date: Wed, 10 Jul 2019 10:25:00 +0200 Subject: [PATCH] added the overlap functions --- DESCRIPTION | 5 +- NAMESPACE | 8 + R/myGRoverlapFunctions.R | 244 ++++++++++++++++++++++++ README.md | 5 +- man/filterByScore.Rd | 49 +++++ man/findAllOverlap.Rd | 53 +++++ man/overlap2GR.Rd | 30 +++ tests/testthat/testGROverlapFunctions.R | 73 +++++++ 8 files changed, 463 insertions(+), 4 deletions(-) create mode 100644 R/myGRoverlapFunctions.R create mode 100644 man/filterByScore.Rd create mode 100644 man/findAllOverlap.Rd create mode 100644 man/overlap2GR.Rd create mode 100644 tests/testthat/testGROverlapFunctions.R diff --git a/DESCRIPTION b/DESCRIPTION index 85f9401..c4bf43b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,13 @@ Package: usefulLDfunctionsGR Title: Functions That I Use Regularly Which depends on GenomicRanges -Version: 0.0.1 +Version: 0.0.2 Authors@R: person(given = "Lucille", family = "Lopez-Delisle", role = c("aut", "cre"), email = "lucille.delisle@epfl.ch", comment = c(ORCID = "0000-0002-1964-4960")) -Description: Useful functions to manipulate GRanges +Description: Useful functions to manipulate GRanges and do overlaps. License: CC BY 4.0 Encoding: UTF-8 LazyData: true @@ -16,6 +16,7 @@ Imports: usefulLDfunctions, GenomicRanges, rtracklayer, + combinat, stats, grDevices, utils diff --git a/NAMESPACE b/NAMESPACE index abc48d6..e632afc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,18 @@ # Generated by roxygen2: do not edit by hand export(bedGraphFromGR) +export(filterByScore) +export(findAllOverlap) export(getTSSinUCSCFormatFromEnsemblGTF) export(grFromBedFile) export(grSortedSimplifiedFromNarrowPeak) +export(overlap2GR) import(usefulLDfunctions) +importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,makeGRangesFromDataFrame) +importFrom(GenomicRanges,mcols) importFrom(GenomicRanges,resize) +importFrom(combinat,combn) +importFrom(combinat,permn) importFrom(rtracklayer,readGFF) +importFrom(usefulLDfunctions,subsetByNamesOrIndices) diff --git a/R/myGRoverlapFunctions.R b/R/myGRoverlapFunctions.R new file mode 100644 index 0000000..680b90f --- /dev/null +++ b/R/myGRoverlapFunctions.R @@ -0,0 +1,244 @@ +#' Get a dataframe from findOverlaps from a GRangesList with 2 items where colnames match the names of the items +#' +#' @param my2GRsToOverlap a GRangesList with 2 items to overlap +#' @return a dataframe with 2 columns with colnames identicals to names of the input with the indices of the overlaps +#' @importFrom GenomicRanges findOverlaps +#' @export +#' @examples +#'gr1 <- GenomicRanges::GRanges( +#'seqnames = "chr1", +#'ranges = IRanges::IRanges(start = c(1, 11, 199), +#' end = c(10, 12, 200)), +#'score = c(20, 30, 100)) +#'gr2 <- GenomicRanges::GRanges( +#'seqnames = "chr1", +#'ranges = IRanges::IRanges(start = c(5, 101), +#' end = c(15, 102)), +#'score = c(1, 2)) +#'overlap2GR(list(first = gr1, second = gr2)) +overlap2GR <- function(my2GRsToOverlap){ + if (length(my2GRsToOverlap) != 2){ + stop("This is not a list with 2 items.") + } + mAll <- as.data.frame( + as.matrix( + GenomicRanges::findOverlaps(my2GRsToOverlap[[1]], + my2GRsToOverlap[[2]]))) + colnames(mAll) <- names(my2GRsToOverlap) + return(mAll) +} + +#' Get a dataframe from all possible pairwise overlaps +#' from a GRangesList where multiple overlaps are on the same line when possible. +#' Or a list of dataframes of any combination using any items in the list +#' +#' @param myGRsToOverlap a GRangesList with items to overlap +#' @param allMergedCalculated a List with previously calculated overlaps. Default is NULL. +#' @param verbose logical Default is F. +#' @param returnAllMergedCalculated logical to decide if only the final dataframe with all the merges are given or if the list with all the overlaps should be returned. Default is F. +#' @return Either a dataframe with one column per item in the list with colnames identicals to names of the input with the indices of the overlaps +#' Or a list with this this dataframe + all other which have been calculated in the process. +#' @details This function is really slow because it will make all pair, triple... comparison possible. +#' Because when you merge 3 dataframes with 2 columns with in total 3 column names, you have 3 ways to do the merge: +#' merge 1 with the merge of 2 and 3 or merge 2 with the merge of 1 and 3 or merge 3 with the merge of 1 and 2. +#' The algorithm will do all and report all. +#' This function is recursive. +#' @importFrom combinat combn permn +#' @importFrom usefulLDfunctions subsetByNamesOrIndices +#' @export +#' @examples +#'gr1 <- GenomicRanges::GRanges(seqnames = "chr1", +#'ranges = IRanges::IRanges(start = c(1, 11, 199), +#' end = c(10, 12, 200)), +#'score = c(20, 30, 100)) +#'gr2 <- GenomicRanges::GRanges(seqnames = "chr1", +#' ranges = IRanges::IRanges(start = c(5, 101), +#' end = c(15, 102)), +#' score = c(1, 2)) +#'gr3 <- GenomicRanges::GRanges(seqnames = "chr1", +#' ranges = IRanges::IRanges(start = c(1, 100), +#' end = c(4, 120)), +#' score = c(10, 20)) +#'grL <- list(first = gr1, second = gr2, third = gr3) +#'findAllOverlap(grL) +#'findAllOverlap(grL, returnAllMergedCalculated = TRUE) +findAllOverlap <- function(myGRsToOverlap, + allMergedCalculated = NULL, + verbose = F, + returnAllMergedCalculated = F){ + # Uses overlap2GR from the same package + if (verbose){ + cat("Evaluating the overlap of ") + cat(names(myGRsToOverlap)) + cat("\n") + } + n <- length(myGRsToOverlap) + if (is.null(allMergedCalculated)){ + #This is to avoid to calculate twice the same merge + allMergedCalculated <- list() + } + if (n == 2){ + if (verbose){ + cat("Do a merge\n") + } + mAll <- usefulLDfunctionsGR::overlap2GR(myGRsToOverlap) + if (returnAllMergedCalculated){ + nameOfN <- paste(sort(names(myGRsToOverlap)), collapse = "____") + allMergedCalculated[[nameOfN]] <- mAll[, names(myGRsToOverlap)] + return(allMergedCalculated) + } else { + return(mAll[, names(myGRsToOverlap)]) + } + } + # Will contain all the merge of all except one the (n-1)s merges + mNm1 <- list() + allNm1 <- combinat::combn(names(myGRsToOverlap), n - 1) + for (i in 1:ncol(allNm1)){ + nameOfNm1 <- paste(sort(allNm1[, i]), collapse = "____") + if (!nameOfNm1 %in% names(allMergedCalculated)){ + allMergedCalculated <- findAllOverlap( + subsetByNamesOrIndices(myGRsToOverlap, + sort(allNm1[, i])), + allMergedCalculated, + verbose = verbose, + returnAllMergedCalculated = T) + if (verbose){ + cat("back to ") + cat(names(myGRsToOverlap)) + cat("\n") + } + } + mNm1[[nameOfNm1]] <- allMergedCalculated[[nameOfNm1]] + } + mAll <- NULL + # The permutations will be use to decide the order of the merge. + allPerm <- matrix(unlist(combinat::permn(names(mNm1))), + byrow = T, ncol = n) + allPerm <- allPerm[allPerm[, 1] < allPerm[, 2], ] + for (i in 1:nrow(allPerm)){ + if (verbose){ + cat("perm", i, ",") + } + v <- allPerm[i, ] + temp.m <- mNm1[[v[1]]] + for (j in 2:length(v)){ + temp.m <- merge(temp.m, mNm1[[v[j]]], all = T) + } + mAll <- unique(rbind(mAll, temp.m)) + } + if (verbose){ + cat("\n") + } + if (returnAllMergedCalculated){ + nameOfN <- paste(sort(names(myGRsToOverlap)), collapse = "____") + allMergedCalculated[[nameOfN]] <- mAll[, names(myGRsToOverlap)] + return(allMergedCalculated) + } else { + return(mAll[, names(myGRsToOverlap)]) + } +} + +#' Find the overlaps of GRangesList items to have only once each indice of each GRanges +#' maximizing the sum of scores or normed scores. +#' +#' @param myGRsToOverlap a GRangesList with items to overlap +#' @param mAll a dataframe obtained with findAllOverlap with all possible overlaps between items of GRangeList (default is NULL). +#' @param colWithScore the name of the column in the meta columns of the GRanges of the GRList +#' to use to choose the best overlap (default is `"score"``). +#' @param useNormScore a boolean to specify if the score of each GRanges should be normalized +#' so the sum of all scores is 1 before summing between the overlaps to maximize. +#' @param verbose logical (default is `FALSE`). +#' @return a dataframe with column names which correspond to the names of the GRanges items of the GRList. +#' Each indice of each GRanges will be present. When indices from different columns are on the same row, +#' this means that the specified GRanges overlap (at least with another one). +#' @importFrom GenomicRanges mcols +#' @export +#' @examples +#'gr1 <- GenomicRanges::GRanges(seqnames = "chr1", +#'ranges = IRanges::IRanges(start = c(1, 11, 199), +#' end = c(10, 12, 200)), +#'score = c(20, 30, 100)) +#'gr2 <- GenomicRanges::GRanges(seqnames = "chr1", +#' ranges = IRanges::IRanges(start = c(5, 101), +#' end = c(15, 102)), +#' score = c(60, 90)) +#'gr3 <- GenomicRanges::GRanges(seqnames = "chr1", +#' ranges = IRanges::IRanges(start = c(1, 100), +#' end = c(4, 120)), +#' score = c(1, 2)) +#'grL <- list(first = gr1, second = gr2, third = gr3) +#'filterByScore(grL, useNormScore = TRUE) +#'filterByScore(grL, useNormScore = FALSE) +filterByScore <- function(myGRsToOverlap, + mAll = NULL, + colWithScore = "score", + useNormScore = T, + verbose = F){ + if (is.null(mAll)){ + mAll <- usefulLDfunctionsGR::findAllOverlap(myGRsToOverlap, + verbose = verbose) + } + n <- ncol(mAll) + mAll$score <- 0 + for (i in 1:n){ + nameI <- colnames(mAll)[i] + # Getting the indices where the value is not NA + nonNAI <- !is.na(mAll[, nameI]) + # Getting the corresponding scores + scoresToAdd <- mcols(myGRsToOverlap[[nameI]])[mAll[nonNAI, nameI], + colWithScore] + # If useNormScore is TRUE, the scores are divided by + # the sum of all scores of the GRange + if (useNormScore){ + scoresToAdd <- scoresToAdd / + sum(mcols(myGRsToOverlap[[nameI]])[, colWithScore]) + } + mAll$score[nonNAI] <- mAll$score[nonNAI] + scoresToAdd + } + # All overlaps are sorted by decreasing scores + mAllS <- mAll[order(mAll$score, decreasing = T), ] + # All duplicated values for each of the GRanges are removed. + # This step is biased as the order may have a high influence. + mAllSF <- mAllS + for (i in 1:n){ + nameI <- colnames(mAll)[i] + mAllSF <- mAllSF[is.na(mAllSF[, nameI]) | !duplicated(mAllSF[, nameI]), ] + } + # To compensate this bias, we will try to "rescue" the overlaps which have been deleted but exists. + mAllSRescue <- mAllS + # We put in mAllSRescue all overlaps which are not using indices already present in the mAllSF + for (i in 1:n){ + nameI<-colnames(mAll)[i] + mAllSRescue<-mAllSRescue[is.na(mAllSRescue[, nameI]) | !mAllSRescue[, nameI] %in% mAllSF[, nameI], ] + } + # We remove the duplicated indices in the mAllSRescue + for (i in 1:n){ + nameI <- colnames(mAll)[i] + mAllSRescue <- mAllSRescue[is.na(mAllSRescue[, nameI]) | !duplicated(mAllSRescue[, nameI]), ] + } + # While there are still overlaps which could be added. + # We redo the process. + while(nrow(mAllSRescue) > 0){ + mAllSF <- rbind(mAllSF, mAllSRescue) + mAllSRescue <- mAllS + # We put in mAllSRescue all overlaps which are not using indices already present in the mAllSF + for (i in 1:n){ + nameI<-colnames(mAll)[i] + mAllSRescue<-mAllSRescue[is.na(mAllSRescue[, nameI]) | !mAllSRescue[, nameI] %in% mAllSF[, nameI], ] + } + # We remove the duplicated indices in the mAllSRescue + for (i in 1:n){ + nameI <- colnames(mAll)[i] + mAllSRescue <- mAllSRescue[is.na(mAllSRescue[, nameI]) | !duplicated(mAllSRescue[, nameI]), ] + } + } + # We now need to add the indices that are not present. + mAllF <- mAllSF[, 1:n] + for (i in 1:n){ + nameI <- colnames(mAll)[i] + temp.df <- data.frame(1:length(myGRsToOverlap[[nameI]])) + colnames(temp.df) <- nameI + mAllF <- merge(mAllF, temp.df, all = T) + } + return(mAllF[, setdiff(colnames(mAll), "score")]) +} diff --git a/README.md b/README.md index 1a48cde..b01d79d 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ Functions That I Use Regularly Which depends on GenomicRanges ## Installation -This package depends on GenomicRanges, rtracklayer, usefulLDfunctions, stats, grDevices, utils. -If usefulLDfunctions, and/or GenomicRanges and/or rtracklayer are not installed on your R see Installation of dependencies section. +This package depends on GenomicRanges, rtracklayer, usefulLDfunctions, stats, grDevices, utils, combinat. +If usefulLDfunctions, and/or GenomicRanges and/or rtracklayer and/or combinat are not installed on your R see Installation of dependencies section. The easiest way to install usefulLDfunctions is using devtools::install_github() from R: ``` @@ -26,6 +26,7 @@ As the installation of Bioconductor package depends on the R version you have, I library(usefulLDfunctions) safelyLoadAPackageInCRANorBioconductor("GenomicRanges") safelyLoadAPackageInCRANorBioconductor("rtracklayer") +safelyLoadAPackageInCRANorBioconductor("combinat") ``` ## Issues diff --git a/man/filterByScore.Rd b/man/filterByScore.Rd new file mode 100644 index 0000000..5772876 --- /dev/null +++ b/man/filterByScore.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/myGRoverlapFunctions.R +\name{filterByScore} +\alias{filterByScore} +\title{Find the overlaps of GRangesList items to have only once each indice of each GRanges +maximizing the sum of scores or normed scores.} +\usage{ +filterByScore(myGRsToOverlap, mAll = NULL, colWithScore = "score", + useNormScore = T, verbose = F) +} +\arguments{ +\item{myGRsToOverlap}{a GRangesList with items to overlap} + +\item{mAll}{a dataframe obtained with findAllOverlap with all possible overlaps between items of GRangeList (default is NULL).} + +\item{colWithScore}{the name of the column in the meta columns of the GRanges of the GRList +to use to choose the best overlap (default is `"score"``).} + +\item{useNormScore}{a boolean to specify if the score of each GRanges should be normalized +so the sum of all scores is 1 before summing between the overlaps to maximize.} + +\item{verbose}{logical (default is `FALSE`).} +} +\value{ +a dataframe with column names which correspond to the names of the GRanges items of the GRList. +Each indice of each GRanges will be present. When indices from different columns are on the same row, +this means that the specified GRanges overlap (at least with another one). +} +\description{ +Find the overlaps of GRangesList items to have only once each indice of each GRanges +maximizing the sum of scores or normed scores. +} +\examples{ +gr1 <- GenomicRanges::GRanges(seqnames = "chr1", +ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), +score = c(20, 30, 100)) +gr2 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), + score = c(60, 90)) +gr3 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 100), + end = c(4, 120)), + score = c(1, 2)) +grL <- list(first = gr1, second = gr2, third = gr3) +filterByScore(grL, useNormScore = TRUE) +filterByScore(grL, useNormScore = FALSE) +} diff --git a/man/findAllOverlap.Rd b/man/findAllOverlap.Rd new file mode 100644 index 0000000..a7c8292 --- /dev/null +++ b/man/findAllOverlap.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/myGRoverlapFunctions.R +\name{findAllOverlap} +\alias{findAllOverlap} +\title{Get a dataframe from all possible pairwise overlaps +from a GRangesList where multiple overlaps are on the same line when possible. +Or a list of dataframes of any combination using any items in the list} +\usage{ +findAllOverlap(myGRsToOverlap, allMergedCalculated = NULL, verbose = F, + returnAllMergedCalculated = F) +} +\arguments{ +\item{myGRsToOverlap}{a GRangesList with items to overlap} + +\item{allMergedCalculated}{a List with previously calculated overlaps. Default is NULL.} + +\item{verbose}{logical Default is F.} + +\item{returnAllMergedCalculated}{logical to decide if only the final dataframe with all the merges are given or if the list with all the overlaps should be returned. Default is F.} +} +\value{ +Either a dataframe with one column per item in the list with colnames identicals to names of the input with the indices of the overlaps +Or a list with this this dataframe + all other which have been calculated in the process. +} +\description{ +Get a dataframe from all possible pairwise overlaps +from a GRangesList where multiple overlaps are on the same line when possible. +Or a list of dataframes of any combination using any items in the list +} +\details{ +This function is really slow because it will make all pair, triple... comparison possible. +Because when you merge 3 dataframes with 2 columns with in total 3 column names, you have 3 ways to do the merge: +merge 1 with the merge of 2 and 3 or merge 2 with the merge of 1 and 3 or merge 3 with the merge of 1 and 2. +The algorithm will do all and report all. +This function is recursive. +} +\examples{ +gr1 <- GenomicRanges::GRanges(seqnames = "chr1", +ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), +score = c(20, 30, 100)) +gr2 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), + score = c(1, 2)) +gr3 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 100), + end = c(4, 120)), + score = c(10, 20)) +grL <- list(first = gr1, second = gr2, third = gr3) +findAllOverlap(grL) +findAllOverlap(grL, returnAllMergedCalculated = TRUE) +} diff --git a/man/overlap2GR.Rd b/man/overlap2GR.Rd new file mode 100644 index 0000000..a67292f --- /dev/null +++ b/man/overlap2GR.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/myGRoverlapFunctions.R +\name{overlap2GR} +\alias{overlap2GR} +\title{Get a dataframe from findOverlaps from a GRangesList with 2 items where colnames match the names of the items} +\usage{ +overlap2GR(my2GRsToOverlap) +} +\arguments{ +\item{my2GRsToOverlap}{a GRangesList with 2 items to overlap} +} +\value{ +a dataframe with 2 columns with colnames identicals to names of the input with the indices of the overlaps +} +\description{ +Get a dataframe from findOverlaps from a GRangesList with 2 items where colnames match the names of the items +} +\examples{ +gr1 <- GenomicRanges::GRanges( +seqnames = "chr1", +ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), +score = c(20, 30, 100)) +gr2 <- GenomicRanges::GRanges( +seqnames = "chr1", +ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), +score = c(1, 2)) +overlap2GR(list(first = gr1, second = gr2)) +} diff --git a/tests/testthat/testGROverlapFunctions.R b/tests/testthat/testGROverlapFunctions.R new file mode 100644 index 0000000..954defa --- /dev/null +++ b/tests/testthat/testGROverlapFunctions.R @@ -0,0 +1,73 @@ +context("GR overlap functions") +library(usefulLDfunctionsGR) +test_that("overlap2GR gives what is expected", { + gr1 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), + score = c(20, 30, 100)) + gr2 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), + score = c(1, 2)) + expect_equal(overlap2GR(list(first = gr1, second = gr2)), + data.frame(first = c(1, 2), + second = rep(1, 2))) +}) + +test_that("findAllOverlap gives what is expected", { + gr1 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), + score = c(20, 30, 100)) + gr2 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), + score = c(1, 2)) + gr3 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 100), + end = c(4, 120)), + score = c(10, 20)) + grL <- list(first = gr1, second = gr2, third = gr3) + my.dataframe <- data.frame(first = c(1, 2, NA, 1, 1), + second = c(1, 1, 2, NA, 1), + third = c(1, NA, 2, 1, NA)) + expect_equal(findAllOverlap(grL), my.dataframe) + all.dataframes <- list(first____second = data.frame(first = 1:2, second = c(1, 1)), + first____third = data.frame(first = 1, third = 1), + second____third = data.frame(second = 2, third = 2), + first____second____third = my.dataframe) + expect_equal(findAllOverlap(grL, returnAllMergedCalculated = T), all.dataframes) +}) + +test_that("filterByScore gives what is expected", { + gr1 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 11, 199), + end = c(10, 12, 200)), + score = c(20, 30, 100)) + gr2 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(5, 101), + end = c(15, 102)), + score = c(60, 90)) + gr3 <- GenomicRanges::GRanges(seqnames = "chr1", + ranges = IRanges::IRanges(start = c(1, 100), + end = c(4, 120)), + score = c(1, 2)) + grL <- list(first = gr1, second = gr2, third = gr3) + my.dataframe <- data.frame(first = c(1, NA, 2, 3), + second = c(1:2, NA, NA), + third = c(1:2, NA, NA)) + expect_equal(filterByScore(grL, useNormScore = T), my.dataframe) + all.merges.my.dataframe <- data.frame(first = c(1, 2, NA, 1, 1), + second = c(1, 1, 2, NA, 1), + third = c(1, NA, 2, 1, NA)) + expect_equal(filterByScore(grL, mAll = all.merges.my.dataframe, useNormScore = T), + my.dataframe) + + my.dataframe2 <- data.frame(first = c(1, NA, 2:3), + second = c(NA, 2:1, NA), + third = c(1:2, NA, NA)) + + expect_equal(filterByScore(grL, useNormScore = F), + my.dataframe2) + +})