Skip to content

Commit

Permalink
Merge pull request #106 from rformassspectrometry/phili
Browse files Browse the repository at this point in the history
feat: add .group_standards_iteration function
  • Loading branch information
jorainer authored Oct 7, 2023
2 parents 869fd43 + 50e803a commit c640aca
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 3 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: MetaboAnnotation
Title: Utilities for Annotation of Metabolomics Data
Version: 1.5.6
Version: 1.5.7
Description:
High level functions to assist in annotation of (metabolomics) data sets.
These include functions to perform simple tentative annotations based on
Expand Down Expand Up @@ -74,6 +74,7 @@ Collate:
'CompDbSource.R'
'Matched.R'
'MatchedSpectra.R'
'group_standards.R'
'hidden-aliases.R'
'matchFormula.R'
'matchSpectra.R'
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# MetaboAnnotation 1.5

## Changes in 1.5.7

- Add functions `.group_standards_iteration` and `.randomize_grouping` to iterate through a matrix
of standards and group them if they are dissimilar enough.

## Changes in 1.5.6

- Fix issue in the vignette. Thanks @RemyDeB for the fix.
Expand Down
124 changes: 124 additions & 0 deletions R/group_standards.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#' @title Iterate through a table of standard compounds to group them.
#'
#' @description
#' The `.group_standards_iteration` function groups rows of a matrix of standard
#' compounds based on their similarity and a user-defined maximum number of
#' standards per group. The `.randomize_grouping` function uses
#' `.group_standards_iteration` but repeats the grouping on randomly shuffled
#' row orders of `x` until every group contains at least `min_nstd` standards
#' (or fails with an error once its attempt budget is used up).
#' This is useful if the user is not satisfied with the results of the
#' `.group_standards_iteration` function.
#'
#' @param x `numeric` matrix with row names representing the compounds and
#' columns representing different adducts.
#'
#' @param max_nstd `numeric` maximum number of standards per group.
#'
#' @param min_nstd `numeric` minimum number of standards per group.
#'
#' @param min_diff `numeric` Minimum difference for considering two values as
#' distinct. Default is 2.
#'
#' @return `list` where each element is a vector of row names representing a
#' group of standards.
#'
#' @details
#' Users should be aware that because the function iterates through `x` from
#' top to bottom, compounds towards the bottom of `x` are harder to group, and
#' some compounds may end up not being grouped with others. We advise trying
#' the `.randomize_grouping` function if that happens.
#'
#' @examples
#'
#' ## `.group_standards_iteration` only
#' x <- data.frame(
#' row.names = c("Malic Acid", "Pyridoxic Acid", "Thiamine", "Uric acid",
#' "dUTP", "N-Formyl-L-methionine"),
#' "adduct_1" = c(135.0288, 184.0604, 265.1118, 169.0356, 468.9809, 178.0532),
#' "adduct_2" = c(157.0107, 206.0424, 287.0937, 191.0176, 490.9628, 200.0352)
#' )
#'
#' x <- as.matrix(x)
#' ## Group standards with a maximum of 3 per group and a minimum difference
#' ## of 2.
#' result <- .group_standards_iteration(x, max_nstd = 3, min_diff = 2)
#' result
#'
#'
#'
#' ## Comparing results with using `.randomize_grouping`.
#' set.seed(123)
#' ## Create a matrix with compound names and ion masses
#' x <- matrix(c(349.0544, 371.0363, 325.0431, 347.0251, 581.0416, 603.0235,
#' 167.0564, 189.0383, 150.0583, 172.0403, 171.0053, 192.9872,
#' 130.0863, 152.0682, 768.1225, 790.1044),
#' ncol = 2, byrow = TRUE)
#'
#' rownames(x) <- c("IMP", "UMP", "UDP-glucuronate", "1-Methylxanthine",
#' "Methionine", "Dihydroxyacetone phosphate", "Pipecolic acid",
#' "CoA")
#' colnames(x) <- c("[M+H]+", "[M+Na]+")
#'
#' ## run using `.group_standards_iteration`
#' standard_groups <- .group_standards_iteration(x, max_nstd = 4, min_diff = 2)
#' standard_groups
#'
#' ## get incomplete groups, rescue this using the `.randomize_grouping`.
#' standard_groups_r <- .randomize_grouping(x, max_nstd = 4,
#' min_nstd = 3,
#' min_diff = 2)
#' standard_groups_r
#'
#'
#' @author Philippine Louail
#'
#' @noRd
#'

.group_standards_iteration <- function(x, max_nstd, min_diff = 2) {
    ## Greedily partition the rows of `x` into groups of at most `max_nstd`
    ## standards such that, within a group, every value of one row differs
    ## from every value of each other row by more than `min_diff`.
    ## Returns a list of character vectors of row names, one per group.

    ## Groups are reported by row name; without row names the result would
    ## silently be an empty list, so fail early instead.
    if (is.null(rownames(x))) {
        stop("'x' must have row names identifying the standards")
    }

    output <- list()

    while (nrow(x) > 1) {
        n <- nrow(x)
        ## Rows are tracked by index (not by name) so that duplicated row
        ## names cannot select or remove the wrong rows.
        members <- 1L
        candidate <- 1L

        ## Scan the remaining rows top to bottom; `&&` because both sides are
        ## scalar conditions.
        while (length(members) < max_nstd && candidate < n) {
            candidate <- candidate + 1L
            ## All pairwise absolute differences between the values already in
            ## the group and the values of the candidate row.
            diff_table <- abs(outer(as.vector(x[members, ]),
                                    as.vector(x[candidate, ]),
                                    "-"))

            ## Comparisons involving NA are ignored (`na.rm = TRUE`).
            if (all(diff_table > min_diff, na.rm = TRUE)) {
                members <- c(members, candidate)
            }
        }
        output[[length(output) + 1L]] <- rownames(x)[members]
        x <- x[-members, , drop = FALSE]
    }

    ## A single leftover row forms its own group.
    if (nrow(x) > 0) {
        output[[length(output) + 1L]] <- rownames(x)
    }

    output
}


## Repeatedly shuffle the rows of `x` and regroup them with
## `.group_standards_iteration` until every returned group contains at least
## `min_nstd` standards. At most `nrow(x)^2` random row orders are tried; if
## none of them satisfies the requirement an error is raised.
.randomize_grouping <- function(x,
                                max_nstd,
                                min_nstd,
                                min_diff = 2) {
    n <- nrow(x)

    for (attempt in seq_len(n * n)) {
        ## `drop = FALSE` keeps `x` a matrix even for single-column (or
        ## single-row) input; otherwise it collapses to a vector and the
        ## grouping function fails on `nrow(x)`.
        x <- x[sample(n), , drop = FALSE]
        standard_groups <- .group_standards_iteration(x,
                                                      max_nstd = max_nstd,
                                                      min_diff = min_diff)
        ## Accept the first non-empty grouping in which no group is too small.
        if (length(standard_groups) > 0 &&
            all(lengths(standard_groups) >= min_nstd)) {
            return(standard_groups)
        }
    }

    stop("all combination were tested, no possibility to fit your input requirement")
}
4 changes: 2 additions & 2 deletions man/matchValues.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions tests/testthat/test_group_standards.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#' Fixture for the test: one row per standard, one column per adduct.
x <- data.frame(
    row.names = c("Malic Acid", "Pyridoxic Acid", "Thiamine", "Uric acid",
                  "dUTP", "N-Formyl-L-methionine"),
    "adduct_1" = c(135.0288, 184.0604, 265.1118, 169.0356, 468.9809, 178.0532),
    "adduct_2" = c(157.0107, 206.0424, 287.0937, 191.0176, 490.9628, 200.0352)
)
x <- as.matrix(x)

#' Expected grouping for a maximum of 3 standards per group.
results <- list(c("Malic Acid", "Pyridoxic Acid", "Thiamine"),
                c("Uric acid", "dUTP", "N-Formyl-L-methionine"))

test_that(".group_standards_iteration groups standards as expected", {
    res <- .group_standards_iteration(x, max_nstd = 3)
    #' `expect_type` replaces the deprecated `expect_is`.
    expect_type(res, "list")
    expect_equal(res, results)
})


#' Test for randomization: matrix of ion masses with one row per compound and
#' one column per adduct.
x <- matrix(c(349.0544, 371.0363, 325.0431, 347.0251, 581.0416, 603.0235,
              167.0564, 189.0383, 150.0583, 172.0403, 171.0053, 192.9872,
              130.0863, 152.0682, 768.1225, 790.1044),
            ncol = 2, byrow = TRUE)

rownames(x) <- c("IMP", "UMP", "UDP-glucuronate", "1-Methylxanthine",
                 "Methionine", "Dihydroxyacetone phosphate", "Pipecolic acid",
                 "CoA")
colnames(x) <- c("[M+H]+", "[M+Na]+")

test_that("randomization improves results", {
    #' Seed set inside the test so the random row shuffling is reproducible.
    set.seed(123)
    min_nstd <- 3

    #' Run using `.group_standards_iteration`.
    standard_groups <- .group_standards_iteration(x, max_nstd = 4,
                                                  min_diff = 2)

    #' Run using `.randomize_grouping`.
    standard_groups_r <- .randomize_grouping(x, max_nstd = 4,
                                             min_nstd = min_nstd,
                                             min_diff = 2)

    expect_gt(length(standard_groups), length(standard_groups_r))
    #' Every group must respect the requested minimum size.
    expect_true(all(lengths(standard_groups_r) >= min_nstd))
    expect_false(length(standard_groups_r) == 0)
})

0 comments on commit c640aca

Please sign in to comment.