Merge pull request #106 from rformassspectrometry/phili

feat: add .group_standards_iteration function
rformassspectrometry · Oct 7, 2023 · c640aca · c640aca
2 parents 869fd43 + 50e803a
commit c640aca
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 3 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MetaboAnnotation
 Title: Utilities for Annotation of Metabolomics Data
-Version: 1.5.6
+Version: 1.5.7
 Description:
     High level functions to assist in annotation of (metabolomics) data sets.
     These include functions to perform simple tentative annotations based on
@@ -74,6 +74,7 @@ Collate:
     'CompDbSource.R'
     'Matched.R'
     'MatchedSpectra.R'
+    'group_standards.R'
     'hidden-aliases.R'
     'matchFormula.R'
     'matchSpectra.R'

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # MetaboAnnotation 1.5
 
+## Changes in 1.5.7
+
+- Add functions `.group_standards_iteration` and `.randomize_grouping` to
+  iterate through a matrix of standards and group them if they are dissimilar
+  enough.
+
 ## Changes in 1.5.6
 
 - Fix issue in the vignette. Thanks @RemyDeB for the fix.

diff --git a/R/group_standards.R b/R/group_standards.R
@@ -0,0 +1,124 @@
+#' @title Iterate through a table of standard compounds to group them.
+#' 
+#' @description
+#' The `.group_standards_iteration` function groups rows of a matrix of standard 
+#' compounds based on their similarity and a user-defined maximum number of 
+#' standards per group. The `.randomize_grouping` function uses 
+#' `.group_standards_iteration` but repeatedly retries it on randomly 
+#' reordered rows of `x` until every group contains at least `min_nstd` 
+#' standards. This is useful if the user is not satisfied by the results of 
+#' the `.group_standards_iteration` function.
+#'
+#' @param x `numeric` matrix with row names representing the compounds and 
+#' columns representing different adducts.
+#' 
+#' @param max_nstd `numeric` maximum number of standards per group.
+#' 
+#' @param min_nstd `numeric` minimum number of standards per group (only used 
+#' by `.randomize_grouping`).
+#' 
+#' @param min_diff `numeric` Minimum difference for considering two values as 
+#' distinct. Default is 2.
+#'
+#' @return `list` where each element is a vector of row names representing a 
+#' group of standards.
+#' 
+#' @details
+#' Users should be aware that because the function iterates through `x`, the 
+#' compound at the bottom of the list is more complicated to group, and there's 
+#' a possibility that some compounds will not be grouped with others. We advise 
+#' to test the `.randomize_grouping`  function if that happens.  
+#' 
+#' @examples
+#' 
+#' ## `.group_standards_iteration` only
+#' x <- data.frame(
+#'   row.names = c("Malic Acid", "Pyridoxic Acid", "Thiamine", "Uric acid",
+#'                 "dUTP", "N-Formyl-L-methionine"),
+#'   "adduct_1" = c(135.0288, 184.0604, 265.1118, 169.0356, 468.9809, 178.0532),
+#'   "adduct_2" = c(157.0107, 206.0424, 287.0937, 191.0176, 490.9628, 200.0352)
+#' )
+#' 
+#' x <- as.matrix(x)
+#' ## Group standards with a maximum of 3 per group and a minimum difference
+#' ## of 2.
+#' result <- .group_standards_iteration(x, max_nstd = 3, min_diff = 2)
+#' result
+#'
+#'
+#'
+#' ## Comparing results with using `.randomize_grouping`.
+#' set.seed(123)
+#' ## Create a matrix with compound names and ion masses
+#' x <- matrix(c(349.0544, 371.0363, 325.0431, 347.0251, 581.0416, 603.0235,
+#'               167.0564, 189.0383, 150.0583, 172.0403, 171.0053, 192.9872,
+#'               130.0863, 152.0682, 768.1225, 790.1044),
+#'             ncol = 2, byrow = TRUE)
+#'
+#' rownames(x) <- c("IMP", "UMP", "UDP-glucuronate", "1-Methylxanthine", 
+#'                  "Methionine", "Dihydroxyacetone phosphate", "Pipecolic acid", 
+#'                  "CoA")
+#' colnames(x) <- c("[M+H]+", "[M+Na]+")
+#' 
+#' ## run using `.group_standards_iteration` 
+#' standard_groups <- .group_standards_iteration(x, max_nstd = 4, min_diff = 2)
+#' standard_groups
+#' 
+#' ## get incomplete groups, rescue this using the `.randomize_grouping`. 
+#' standard_groups_r <- .randomize_grouping(x, max_nstd = 4, 
+#'                                          min_nstd = 3,
+#'                                          min_diff = 2)
+#' standard_groups_r 
+#' 
+#' 
+#' @author Philippine Louail
+#' 
+#' @noRd
+#'
+
+.group_standards_iteration <- function(x, max_nstd, min_diff = 2) {
+    ## Greedily build groups of rows of `x`: the first remaining row seeds a
+    ## group and each following row is added if all of its values differ by
+    ## more than `min_diff` from every value already in the group, until the
+    ## group holds `max_nstd` rows. Returns a `list` of `character` vectors
+    ## (row names), one element per group.
+
+    ## Nothing to group for an empty input.
+    if (!nrow(x))
+        return(list())
+    ## Rows are tracked and removed by name, so names have to exist and be
+    ## unique; otherwise compounds would silently be dropped from the result.
+    rn <- rownames(x)
+    if (is.null(rn) || anyDuplicated(rn))
+        stop("'x' must have unique row names identifying the standards")
+
+    output <- list()
+    g <- 0
+    while (nrow(x) > 1) {
+        g <- g + 1
+        i <- 1
+        group <- rownames(x)[i]
+        ## Scalar condition: use short-circuiting `&&`, not elementwise `&`.
+        while (length(group) < max_nstd && i < nrow(x)) {
+            i <- i + 1
+            ## All pairwise absolute differences between the values of the
+            ## current group members and the values of candidate row `i`.
+            diff_table <- abs(outer(as.vector(x[group, ]),
+                                    as.vector(x[i, ]), "-"))
+            ## Missing values are ignored when checking the differences.
+            if (all(diff_table > min_diff, na.rm = TRUE))
+                group <- c(group, rownames(x)[i])
+        }
+        ## Remove the grouped rows; `drop = FALSE` keeps `x` a matrix even if
+        ## a single row or column is left.
+        x <- x[!(rownames(x) %in% group), , drop = FALSE]
+        output[[g]] <- group
+    }
+    ## A single left-over row forms its own group.
+    if (nrow(x) > 0)
+        output[[g + 1]] <- rownames(x)
+
+    output
+}
+
+
+.randomize_grouping <- function(x, 
+                                max_nstd, 
+                                min_nstd, 
+                                min_diff = 2) { 
+    ## Repeatedly shuffle the rows of `x` and re-run
+    ## `.group_standards_iteration` until every returned group contains at
+    ## least `min_nstd` standards. Returns that grouping (a `list` of row name
+    ## vectors) or throws an error if none was found within `nrow(x)^2` random
+    ## row orders.
+    n <- nrow(x)
+    ## `n^2` is computed in double precision and can thus not overflow.
+    for (attempt in seq_len(n^2)) {
+        ## `drop = FALSE` keeps `x` a matrix also for a single row/column.
+        x <- x[sample(n), , drop = FALSE]
+        standard_groups <- .group_standards_iteration(x, 
+                                                      max_nstd = max_nstd,
+                                                      min_diff = min_diff)
+        if (length(standard_groups) > 0 &&
+            all(lengths(standard_groups) >= min_nstd))
+            return(standard_groups)
+    }
+    ## NOTE: row orders are sampled at random, so this does not guarantee that
+    ## every possible order was evaluated.
+    stop("all combination were tested, no possibility to fit your input requirement")
+}
diff --git a/man/matchValues.Rd b/man/matchValues.Rd
diff --git a/tests/testthat/test_group_standards.R b/tests/testthat/test_group_standards.R
@@ -0,0 +1,47 @@
+## Define the input matrix used by the test below: compounds as rows, adduct
+## m/z values as columns.
+x <- data.frame(
+  row.names = c("Malic Acid", "Pyridoxic Acid", "Thiamine", "Uric acid",
+                "dUTP", "N-Formyl-L-methionine"),
+  "adduct_1" = c(135.0288, 184.0604, 265.1118, 169.0356, 468.9809, 178.0532),
+  "adduct_2" = c(157.0107, 206.0424, 287.0937, 191.0176, 490.9628, 200.0352)
+)
+x <- as.matrix(x)
+
+## Expected grouping for a maximum of 3 standards per group.
+results <- list(c("Malic Acid", "Pyridoxic Acid", "Thiamine"), 
+                c("Uric acid", "dUTP", "N-Formyl-L-methionine"))
+
+test_that("essentially that .group_standards_iteration works", {
+    res <- .group_standards_iteration(x, max_nstd = 3)
+    ## `expect_is()` is deprecated in testthat 3e; check the type directly.
+    expect_type(res, "list")
+    expect_equal(res, results)
+})
+
+
+## Test for randomization; the seed makes the sampled row orders reproducible.
+set.seed(123) 
+## Create a matrix with compound names and ion masses.
+x <- matrix(c(349.0544, 371.0363, 325.0431, 347.0251, 581.0416, 603.0235,
+              167.0564, 189.0383, 150.0583, 172.0403, 171.0053, 192.9872,
+              130.0863, 152.0682, 768.1225, 790.1044),
+            ncol = 2, byrow = TRUE)
+
+rownames(x) <- c("IMP", "UMP", "UDP-glucuronate", "1-Methylxanthine",
+                 "Methionine", "Dihydroxyacetone phosphate", "Pipecolic acid",
+                 "CoA")
+colnames(x) <- c("[M+H]+", "[M+Na]+")
+
+## Run using `.group_standards_iteration`.
+standard_groups <- .group_standards_iteration(x, max_nstd = 4, min_diff = 2)
+
+## Run using `.randomize_grouping`.
+min_nstd <- 3
+standard_groups_r <- .randomize_grouping(x, max_nstd = 4,
+                                         min_nstd = min_nstd,
+                                         min_diff = 2)
+test_that("randomization improves results", {
+    expect_true(length(standard_groups) > length(standard_groups_r))
+    ## Every group has to contain at least `min_nstd` standards.
+    expect_true(all(lengths(standard_groups_r) >= min_nstd))
+    expect_false(length(standard_groups_r) == 0)
+    ## No compound may get lost or duplicated by the regrouping.
+    expect_equal(sort(unlist(standard_groups_r)), sort(rownames(x)))
+})