Skip to content

Commit

Permalink
added the overlap functions
Browse files Browse the repository at this point in the history
  • Loading branch information
lldelisle committed Jul 10, 2019
1 parent b724adc commit 003b0ab
Show file tree
Hide file tree
Showing 8 changed files with 463 additions and 4 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
Package: usefulLDfunctionsGR
Title: Functions That I Use Regularly Which depends on GenomicRanges
Version: 0.0.1
Version: 0.0.2
Authors@R:
person(given = "Lucille",
family = "Lopez-Delisle",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-1964-4960"))
Description: Useful functions to manipulate GRanges
Description: Useful functions to manipulate GRanges and do overlaps.
License: CC BY 4.0
Encoding: UTF-8
LazyData: true
Expand All @@ -16,6 +16,7 @@ Imports:
usefulLDfunctions,
GenomicRanges,
rtracklayer,
combinat,
stats,
grDevices,
utils
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# Generated by roxygen2: do not edit by hand

export(bedGraphFromGR)
export(filterByScore)
export(findAllOverlap)
export(getTSSinUCSCFormatFromEnsemblGTF)
export(grFromBedFile)
export(grSortedSimplifiedFromNarrowPeak)
export(overlap2GR)
import(usefulLDfunctions)
importFrom(GenomicRanges,findOverlaps)
importFrom(GenomicRanges,makeGRangesFromDataFrame)
importFrom(GenomicRanges,mcols)
importFrom(GenomicRanges,resize)
importFrom(combinat,combn)
importFrom(combinat,permn)
importFrom(rtracklayer,readGFF)
importFrom(usefulLDfunctions,subsetByNamesOrIndices)
244 changes: 244 additions & 0 deletions R/myGRoverlapFunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
#' Get a dataframe from findOverlaps from a GRangesList with 2 items where colnames match the names of the items
#'
#' @param my2GRsToOverlap a GRangesList with 2 items to overlap
#' @return a dataframe with 2 columns whose colnames are identical to the names of the input, containing the indices of the overlaps
#' @importFrom GenomicRanges findOverlaps
#' @export
#' @examples
#'gr1 <- GenomicRanges::GRanges(
#'seqnames = "chr1",
#'ranges = IRanges::IRanges(start = c(1, 11, 199),
#' end = c(10, 12, 200)),
#'score = c(20, 30, 100))
#'gr2 <- GenomicRanges::GRanges(
#'seqnames = "chr1",
#'ranges = IRanges::IRanges(start = c(5, 101),
#' end = c(15, 102)),
#'score = c(1, 2))
#'overlap2GR(list(first = gr1, second = gr2))
overlap2GR <- function(my2GRsToOverlap) {
  # Exactly two sets of ranges are expected: the first is used as the query
  # and the second as the subject of the overlap search.
  if (length(my2GRsToOverlap) != 2) {
    stop("This is not a list with 2 items.")
  }
  hits <- GenomicRanges::findOverlaps(
    my2GRsToOverlap[[1]],
    my2GRsToOverlap[[2]]
  )
  # One row per overlapping pair: column 1 holds the index in the first item,
  # column 2 the index in the second item.
  overlapPairs <- as.data.frame(as.matrix(hits))
  # Label the columns after the input items so callers can merge by name.
  colnames(overlapPairs) <- names(my2GRsToOverlap)
  overlapPairs
}

#' Get a dataframe from all possible pairwise overlaps
#' from a GRangesList where multiple overlaps are on the same line when possible.
#' Or a list of dataframes of any combination using any items in the list
#'
#' @param myGRsToOverlap a GRangesList with items to overlap
#' @param allMergedCalculated a List with previously calculated overlaps. Default is NULL.
#' @param verbose logical Default is F.
#' @param returnAllMergedCalculated logical to decide if only the final dataframe with all the merges are given or if the list with all the overlaps should be returned. Default is F.
#' @return Either a dataframe with one column per item in the list, with colnames identical to the names of the input, containing the indices of the overlaps,
#' or a list with this dataframe + all the others which have been calculated in the process.
#' @details This function is really slow because it will make all possible pair, triple... comparisons.
#' Because when you merge 3 dataframes with 2 columns with in total 3 column names, you have 3 ways to do the merge:
#' merge 1 with the merge of 2 and 3 or merge 2 with the merge of 1 and 3 or merge 3 with the merge of 1 and 2.
#' The algorithm will do all and report all.
#' This function is recursive.
#' @importFrom combinat combn permn
#' @importFrom usefulLDfunctions subsetByNamesOrIndices
#' @export
#' @examples
#'gr1 <- GenomicRanges::GRanges(seqnames = "chr1",
#'ranges = IRanges::IRanges(start = c(1, 11, 199),
#' end = c(10, 12, 200)),
#'score = c(20, 30, 100))
#'gr2 <- GenomicRanges::GRanges(seqnames = "chr1",
#' ranges = IRanges::IRanges(start = c(5, 101),
#' end = c(15, 102)),
#' score = c(1, 2))
#'gr3 <- GenomicRanges::GRanges(seqnames = "chr1",
#' ranges = IRanges::IRanges(start = c(1, 100),
#' end = c(4, 120)),
#' score = c(10, 20))
#'grL <- list(first = gr1, second = gr2, third = gr3)
#'findAllOverlap(grL)
#'findAllOverlap(grL, returnAllMergedCalculated = TRUE)
findAllOverlap <- function(myGRsToOverlap,
                           allMergedCalculated = NULL,
                           verbose = FALSE,
                           returnAllMergedCalculated = FALSE) {
  # Uses overlap2GR from the same package.
  # Strategy: the overlap table of n items is built by merging the overlap
  # tables of every (n - 1)-subset, which are themselves computed recursively
  # and cached in allMergedCalculated under a key made of the sorted names.
  n <- length(myGRsToOverlap)
  grNames <- names(myGRsToOverlap)
  # Fail early with a clear message: the whole algorithm (cache keys, column
  # selection, merges) relies on at least 2 uniquely named items.
  if (n < 2) {
    stop("At least 2 items are required to compute overlaps.")
  }
  if (is.null(grNames) || anyNA(grNames) || any(grNames == "") ||
      anyDuplicated(grNames) > 0) {
    stop("All items must have unique, non-empty names.")
  }
  if (verbose) {
    cat("Evaluating the overlap of ")
    cat(grNames)
    cat("\n")
  }
  if (is.null(allMergedCalculated)) {
    # This cache avoids calculating the same merge twice.
    allMergedCalculated <- list()
  }
  # Key under which the result for the current set of items is cached.
  nameOfN <- paste(sort(grNames), collapse = "____")
  if (n == 2) {
    # Base case of the recursion: a plain pairwise overlap.
    if (verbose) {
      cat("Do a merge\n")
    }
    mAll <- usefulLDfunctionsGR::overlap2GR(myGRsToOverlap)
    result <- mAll[, grNames, drop = FALSE]
    if (returnAllMergedCalculated) {
      allMergedCalculated[[nameOfN]] <- result
      return(allMergedCalculated)
    }
    return(result)
  }
  # mNm1 will contain the overlap table of each subset of (n - 1) items.
  mNm1 <- list()
  allNm1 <- combinat::combn(grNames, n - 1)
  for (i in seq_len(ncol(allNm1))) {
    subsetNames <- sort(allNm1[, i])
    nameOfNm1 <- paste(subsetNames, collapse = "____")
    if (!nameOfNm1 %in% names(allMergedCalculated)) {
      # Not cached yet: recurse, asking for the whole cache back so that
      # intermediate results are shared between sibling subsets.
      allMergedCalculated <- findAllOverlap(
        usefulLDfunctions::subsetByNamesOrIndices(myGRsToOverlap,
                                                  subsetNames),
        allMergedCalculated,
        verbose = verbose,
        returnAllMergedCalculated = TRUE
      )
      if (verbose) {
        cat("back to ")
        cat(grNames)
        cat("\n")
      }
    }
    mNm1[[nameOfNm1]] <- allMergedCalculated[[nameOfNm1]]
  }
  mAll <- NULL
  # The permutations are used to decide the order of the merges.
  allPerm <- matrix(unlist(combinat::permn(names(mNm1))),
                    byrow = TRUE, ncol = n)
  # merge(a, b) and merge(b, a) give the same rows, so only the permutations
  # whose first two entries are in increasing order are kept.
  allPerm <- allPerm[allPerm[, 1] < allPerm[, 2], , drop = FALSE]
  for (i in seq_len(nrow(allPerm))) {
    if (verbose) {
      cat("perm", i, ",")
    }
    v <- allPerm[i, ]
    temp.m <- mNm1[[v[1]]]
    for (j in 2:length(v)) {
      # Full outer merge on the shared columns: partial overlaps are kept
      # with NA in the columns of the items which do not overlap.
      temp.m <- merge(temp.m, mNm1[[v[j]]], all = TRUE)
    }
    mAll <- unique(rbind(mAll, temp.m))
  }
  if (verbose) {
    cat("\n")
  }
  # Columns are reported in the order of the input items.
  result <- mAll[, grNames, drop = FALSE]
  if (returnAllMergedCalculated) {
    allMergedCalculated[[nameOfN]] <- result
    return(allMergedCalculated)
  }
  result
}

#' Find the overlaps of GRangesList items so that each index of each GRanges is used only once,
#' maximizing the sum of scores or normed scores.
#'
#' @param myGRsToOverlap a GRangesList with items to overlap
#' @param mAll a dataframe obtained with findAllOverlap with all possible overlaps between items of GRangeList (default is NULL).
#' @param colWithScore the name of the column in the meta columns of the GRanges of the GRList
#' to use to choose the best overlap (default is `"score"`).
#' @param useNormScore a boolean to specify if the score of each GRanges should be normalized
#' so the sum of all scores is 1 before summing between the overlaps to maximize.
#' @param verbose logical (default is `FALSE`).
#' @return a dataframe with column names which correspond to the names of the GRanges items of the GRList.
#' Each index of each GRanges will be present. When indices from different columns are on the same row,
#' this means that the specified GRanges overlap (at least with another one).
#' @importFrom GenomicRanges mcols
#' @export
#' @examples
#'gr1 <- GenomicRanges::GRanges(seqnames = "chr1",
#'ranges = IRanges::IRanges(start = c(1, 11, 199),
#' end = c(10, 12, 200)),
#'score = c(20, 30, 100))
#'gr2 <- GenomicRanges::GRanges(seqnames = "chr1",
#' ranges = IRanges::IRanges(start = c(5, 101),
#' end = c(15, 102)),
#' score = c(60, 90))
#'gr3 <- GenomicRanges::GRanges(seqnames = "chr1",
#' ranges = IRanges::IRanges(start = c(1, 100),
#' end = c(4, 120)),
#' score = c(1, 2))
#'grL <- list(first = gr1, second = gr2, third = gr3)
#'filterByScore(grL, useNormScore = TRUE)
#'filterByScore(grL, useNormScore = FALSE)
filterByScore <- function(myGRsToOverlap,
                          mAll = NULL,
                          colWithScore = "score",
                          useNormScore = TRUE,
                          verbose = FALSE) {
  # Private helper: going through the columns one after the other, keep only
  # the first row using each non-NA index (rows with NA are always kept).
  keepFirstUseOfEachIndex <- function(df, cols) {
    for (nameI in cols) {
      df <- df[is.na(df[, nameI]) | !duplicated(df[, nameI]), , drop = FALSE]
    }
    df
  }
  # Private helper: among all the sorted overlaps, get the ones which do not
  # use any index already present in `kept`, without duplicated indices.
  findRescuable <- function(sorted, kept, cols) {
    for (nameI in cols) {
      sorted <- sorted[is.na(sorted[, nameI]) |
                         !sorted[, nameI] %in% kept[, nameI], , drop = FALSE]
    }
    keepFirstUseOfEachIndex(sorted, cols)
  }

  if (is.null(mAll)) {
    mAll <- usefulLDfunctionsGR::findAllOverlap(myGRsToOverlap,
                                                verbose = verbose)
  }
  indexCols <- colnames(mAll)
  unknownCols <- setdiff(indexCols, names(myGRsToOverlap))
  if (length(unknownCols) > 0) {
    stop("Columns of mAll not found in names of myGRsToOverlap: ",
         paste(unknownCols, collapse = ", "))
  }
  # The score of each overlap is kept in a separate vector (not in a column
  # of mAll) so that it works when mAll has no row and cannot clash with an
  # item of the list which would be named "score".
  overlapScore <- numeric(nrow(mAll))
  for (nameI in indexCols) {
    # Getting the rows where the index is not NA
    nonNAI <- !is.na(mAll[, nameI])
    # Getting the corresponding scores
    allScoresI <- GenomicRanges::mcols(myGRsToOverlap[[nameI]])[, colWithScore]
    scoresToAdd <- allScoresI[mAll[nonNAI, nameI]]
    # If useNormScore is TRUE, the scores are divided by
    # the sum of all scores of the GRanges
    if (useNormScore) {
      scoresToAdd <- scoresToAdd / sum(allScoresI)
    }
    overlapScore[nonNAI] <- overlapScore[nonNAI] + scoresToAdd
  }
  # All overlaps are sorted by decreasing scores
  mAllS <- mAll[order(overlapScore, decreasing = TRUE), , drop = FALSE]
  # All duplicated values for each of the GRanges are removed.
  # This step is biased as the order of the columns may have a high influence.
  mAllSF <- keepFirstUseOfEachIndex(mAllS, indexCols)
  # To compensate this bias, the overlaps which have been deleted but which
  # only use indices not yet selected are "rescued". The process is repeated
  # while there are still overlaps which can be added: each round adds new
  # indices to mAllSF, so the loop ends.
  repeat {
    mAllSRescue <- findRescuable(mAllS, mAllSF, indexCols)
    if (nrow(mAllSRescue) == 0) {
      break
    }
    mAllSF <- rbind(mAllSF, mAllSRescue)
  }
  # We now need to add the indices that are not present in any overlap.
  mAllF <- mAllSF
  for (nameI in indexCols) {
    # seq_len gives no index for an empty GRanges (1:0 would give 1 and 0).
    temp.df <- data.frame(seq_len(length(myGRsToOverlap[[nameI]])))
    colnames(temp.df) <- nameI
    mAllF <- merge(mAllF, temp.df, all = TRUE)
  }
  # merge puts the key column first: restore the order of the columns.
  mAllF[, indexCols, drop = FALSE]
}
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Functions That I Use Regularly Which depends on GenomicRanges

## Installation
This package depends on GenomicRanges, rtracklayer, usefulLDfunctions, stats, grDevices, utils.
If usefulLDfunctions, and/or GenomicRanges and/or rtracklayer are not installed on your R see Installation of dependencies section.
This package depends on GenomicRanges, rtracklayer, usefulLDfunctions, stats, grDevices, utils, combinat.
If usefulLDfunctions, and/or GenomicRanges and/or rtracklayer and/or combinat are not installed on your R see Installation of dependencies section.

The easiest way to install usefulLDfunctions is using devtools::install_github() from R:
```
Expand All @@ -26,6 +26,7 @@ As the installation of Bioconductor package depends on the R version you have, I
library(usefulLDfunctions)
safelyLoadAPackageInCRANorBioconductor("GenomicRanges")
safelyLoadAPackageInCRANorBioconductor("rtracklayer")
safelyLoadAPackageInCRANorBioconductor("combinat")
```

## Issues
Expand Down
49 changes: 49 additions & 0 deletions man/filterByScore.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 003b0ab

Please sign in to comment.