## mzR-functions.R


## Calculate theoretical fragment ion m/z values for a peptide sequence.
##
## Arguments:
##   sequence       character string of one-letter residue codes; at least
##                  two characters are required.
##   type           ion series to compute: any of "a", "b", "c" (built from
##                  the first residue onwards) and "x", "y", "z" (built from
##                  the last residue backwards).
##   z              numeric vector of charge states.
##   modifications  named numeric vector of mass shifts. A name is looked up
##                  in the residue mass table and the shift is added to that
##                  entry; the names "Nterm" and "Cterm" are applied
##                  afterwards by .terminalModifications().
##   neutralLoss    list with the elements `water` and `ammonia`, forwarded
##                  to .neutralLoss().
##   verbose        if TRUE, a message lists the modifications used.
##
## Returns a data.frame with the columns mz, ion, type, pos, z and seq.
## Before neutral losses are appended there is one row per requested ion
## series, fragment position and charge. `pos` counts residues from the
## terminus the series starts at.
calculateFragments <- function(sequence, type = c("b", "y"), z = 1,
                               modifications = c(C = 57.02146),
                               neutralLoss = defaultNeutralLoss(),
                               verbose = isMSnbaseVerbose()) {
  if (nchar(sequence) <= 1L) {
    stop("'sequence' has to have two or more residues.")
  }

  type <- sort(match.arg(type, choices=c("a", "b", "c", "x", "y", "z"), several.ok=TRUE))

  ## Mass offsets per ion series, according to Table 1 of:
  ## Johnson, R. S., Martin, S. A., Biemann, K., Stults, J. T., and
  ## Watson, J. T. (1987).
  ## Novel fragmentation process of peptides by collision-induced
  ## decomposition in a tandem mass spectrometer: differentiation of leucine
  ## and isoleucine.
  ## Analytical Chemistry, 59(21), 2621-2625.
  ## https://doi.org/10.1021/ac00148a019
  ##
  ## The proton (H+) is not part of these offsets; it is added once the
  ## masses have been divided by the charge.
  atom <- get.atomic.mass()
  shift <- c(a=-(atom["C"]+atom["O"]),            # + H - CO
             b=0,                                 # + H
             c=atom["N"]+3*atom["H"],             # + H + NH3
             x=atom["C"]+2*atom["O"],             # + CO + OH
             y=2*atom["H"]+atom["O"],             # + H2 + OH
             z=-(atom["N"]+atom["H"])+atom["O"])  # + NH + OH

  ## residue masses keyed by one-letter code, with modifications added on top
  aaTable <- .get.amino.acids()
  residueMass <- setNames(aaTable$ResidueMass, aaTable$AA)
  if (length(modifications)) {
    modified <- names(modifications)
    residueMass[modified] <- residueMass[modified] + modifications
  }

  if (verbose) {
    mods <- if (length(modifications)) {
      paste0(names(modifications), "=", modifications, collapse=", ")
    } else {
      "None"
    }
    message("Modifications used: ", mods)
  }

  residues <- strsplit(sequence, "")[[1]]
  nRes <- length(residues)

  ## running residue mass from the amino-terminus (a, b, c) and from the
  ## carboxyl-terminus (x, y, z); the full-length peptide is never a fragment,
  ## so each side drops the residue at the opposite end
  nCum <- cumsum(residueMass[residues[-nRes]])
  cCum <- cumsum(residueMass[rev(residues)[-nRes]])
  nFrag <- length(nCum)
  nZ <- length(z)

  wantN <- c("a", "b", "c") %in% type
  wantC <- c("x", "y", "z") %in% type

  ## Expand cumulative masses to every requested series and charge state:
  ## series vary slowest, then position, then charge.
  toMz <- function(cum, shifts) {
    neutral <- rep(cum, length(shifts)) + rep(shifts, each=nFrag)
    rep(neutral, each=nZ) / z + atom["p"]
  }
  nMz <- toMz(nCum, shift[1:3][wantN])
  cMz <- toMz(cCum, shift[4:6][wantC])

  ## fragment sequences: prefixes for a/b/c, suffixes (shortest first) for x/y/z
  prefixes <- substring(sequence, 1L, seq_len(nRes - 1L))
  suffixes <- rev(substring(sequence, 2L:nRes, nRes))
  nSeq <- rep(rep(prefixes, each=nZ), sum(wantN))
  cSeq <- rep(rep(suffixes, each=nZ), sum(wantC))

  ## series letter and ion label (letter + position) for every row
  pos <- rep(seq_len(nFrag), each=nZ)
  nType <- rep(c("a", "b", "c")[wantN], each=nFrag * nZ)
  cType <- rep(c("x", "y", "z")[wantC], each=nFrag * nZ)
  ionLabel <- function(series) {
    if (length(series)) {
      paste0(series, pos)
    } else {
      character()
    }
  }

  ## pos and z are shorter than the other columns and are recycled by
  ## data.frame() to the full number of rows
  df <- data.frame(mz=c(nMz, cMz),
                   ion=c(ionLabel(nType), ionLabel(cType)),
                   type=c(nType, cType),
                   pos=pos,
                   z=z,
                   seq=c(nSeq, cSeq),
                   stringsAsFactors=FALSE)
  df <- .neutralLoss(df, water=neutralLoss$water, ammonia=neutralLoss$ammonia)
  df <- .terminalModifications(df, modifications=modifications)
  rownames(df) <- NULL
  df
}

#' Append neutral-loss fragments to a fragment table
#'
#' For every row selected by the water/ammonia rules a copy is appended whose
#' `ion` and `type` get a suffix (`"_"` for water, `"*"` for ammonia) and
#' whose `mz` is reduced by the mass of the lost molecule.
#'
#' @param df data.frame as built by calculateFragments; the columns `mz`,
#'   `ion`, `type` and `seq` are used
#' @param water character, enabled water-loss rules: `"Cterm"` selects all
#'   x/y/z rows, `"D"`/`"E"` select fragments starting with that residue,
#'   `"S"`/`"T"` select fragments with that residue in an internal position
#' @param ammonia character, enabled ammonia-loss rules: `"K"`/`"N"`/`"Q"`
#'   select fragments containing that residue anywhere but the last position,
#'   `"R"` selects fragments with an internal R
#' @return `df` with the water-loss rows, then the ammonia-loss rows,
#'   appended below the original rows
#' @noRd
.neutralLoss <- function(df,
                         water=c("Cterm", "D", "E", "S", "T"),
                         ammonia=c("K", "N", "Q", "R")) {
  ## see "Low energy peptide fragmentation pathways" by Hugh-G. Patterton, Ph.D.
  ## http://cbio.ufs.ac.za/fgap/download/fragmentation_review.pdf
  ## see also discussion #47: https://github.com/lgatto/MSnbase/issues/47

  atom <- get.atomic.mass()

  ## Row indices of `df` whose sequence matches any enabled pattern;
  ## `patterns` is a named vector of regular expressions keyed by rule name.
  seqRows <- function(patterns, enabled) {
    active <- patterns[intersect(names(patterns), enabled)]
    if (length(active)) {
      grep(paste0(active, collapse="|"), df$seq)
    } else {
      double()
    }
  }

  ## Copy the selected rows, mark them with `suffix`, subtract `delta` from
  ## their m/z and append them to `frags`.
  appendLoss <- function(frags, rows, delta, suffix) {
    if (!length(rows)) {
      return(frags)
    }
    lost <- frags[rows, ]
    ## both columns are suffixed in one call; the pasted vector is split
    ## column-wise on assignment (first half -> ion, second half -> type)
    lost[, c("ion", "type")] <- paste0(c(lost$ion, lost$type), suffix)
    lost$mz <- lost$mz - delta
    rbind(frags, lost)
  }

  waterRows <- double()
  if (length(water)) {
    ## N-term D/E, internal S/T
    waterRows <- seqRows(c(D="^D.", E="^E.", S=".S.", T=".T."), water)

    ## C-term COOH (all x, y, z fragments)
    if ("Cterm" %in% water) {
      waterRows <- unique(c(waterRows, grep("[xyz]", df$type)))
    }
  }

  ammoniaRows <- double()
  if (length(ammonia)) {
    ## N-term/internal K/N/Q, internal R
    ammoniaRows <- seqRows(c(K="^.*K.", N="^.*N.", Q="^.*Q.", R=".R."),
                           ammonia)
  }

  ## Both index sets refer to the original rows. Appending only adds rows at
  ## the bottom, so the ammonia indices stay valid after the water step.
  df <- appendLoss(df, waterRows, 2*atom["H"]+atom["O"], "_")
  appendLoss(df, ammoniaRows, atom["N"]+3*atom["H"], "*")
}

#' Apply N-/C-terminal modifications to a fragment table
#'
#' Should be used after .neutralLoss so that the neutral-loss rows are
#' shifted as well (their `type` still contains the series letter).
#'
#' @param df data.frame as built by calculateFragments; the columns `mz`,
#'   `type` and `z` are used
#' @param modifications named numeric vector; the entry named `"Nterm"` is
#'   added (divided by the row's charge) to all rows whose type contains
#'   a, b or c, the entry named `"Cterm"` to all rows whose type contains
#'   x, y or z. Other entries are ignored here.
#' @return `df` with updated `mz` values
#' @noRd
.terminalModifications <- function(df, modifications) {
  ## modification name -> pattern selecting the ion series it applies to
  seriesPattern <- c(Nterm="[abc]", Cterm="[xyz]")

  for (terminus in names(seriesPattern)) {
    if (!(terminus %in% names(modifications))) {
      next
    }
    rows <- grep(seriesPattern[[terminus]], df$type)

    if (length(rows)) {
      ## the shift is a mass, so it contributes mass/charge to the m/z
      df$mz[rows] <- df$mz[rows] + modifications[terminus] / df$z[rows]
    }
  }

  df
}

#' Default `neutralLoss` argument for calculateFragments
#'
#' @param disableWaterLoss character, water-loss rules that should not be
#'   calculated (any of "Cterm", "D", "E", "S", "T")
#' @param disableAmmoniaLoss character, ammonia-loss rules that should not be
#'   calculated (any of "K", "N", "Q", "R")
#' @return list with the character vectors `water` and `ammonia` holding the
#'   rules that remain enabled
#' @noRd
defaultNeutralLoss <- function(disableWaterLoss=NULL, disableAmmoniaLoss=NULL) {
  waterRules <- c("Cterm", "D", "E", "S", "T")
  ammoniaRules <- c("K", "N", "Q", "R")

  list(water=setdiff(waterRules, disableWaterLoss),
       ammonia=setdiff(ammoniaRules, disableAmmoniaLoss))
}

##' get.atomic.mass()
##'
##' Public accessor for the atomic mass table; delegates to
##' .get.atomic.mass().
##' @return the value bound to "atomic.mass" in the package environment
get.atomic.mass <- function() {
  .get.atomic.mass()
}

## Look up the "atomic.mass" binding in the package-private environment.
.get.atomic.mass <- function() {
  get(x = "atomic.mass", envir = .MSnbaseEnv)
}

## Package-private environment holding shared constants (class versions,
## amino acid table, atomic masses). Its parent is the empty environment, so
## lookups in it never fall through to other environments.
.MSnbaseEnv <- new.env(parent = emptyenv(), hash = TRUE)

## See issue #163 for details: the random errors we see (issue #138) seem
## to come, at least partially, from using new() in the class prototype.
## As a result, class versions are set (and tested in validity methods)
## outside of the prototype. The vector below stores the respective class
## versions. When a class doesn't have one, the version should be defined
## as NA_character_.

ClassVersions <- c(
  Spectrum = "0.4.0",
  Spectrum1 = "0.2.0",
  Spectrum2 = "0.2.0")

## keep a copy inside the package environment as well
assign("ClassVersions", ClassVersions, envir = .MSnbaseEnv)

## Amino acid property table, stored as "amino.acids" in the package
## environment and read via .get.amino.acids().
## Every column lists its values in the same order as `AA`; the first entry
## ("peg", polyethylene glycol) is not an amino acid and has NA in every
## column except ResidueMass and Name.
## calculateFragments() uses the columns AA and ResidueMass.
## NOTE(review): the Arg and Asn entries of pK1, pK2 and pI are identical
## (2.18 / 9.09 / 10.76) -- looks like a possible copy slip; verify against
## the cited sources.
assign("amino.acids",
       data.frame(AA = c("peg","A","R","N","D","C","E",
                         "Q","G","H","I","L", "K","M","F",
                         "P","S","T","W","Y","V"),
                  ResidueMass = c(44.00000,
                                  71.03711,  156.10111, 114.04293, 115.02694,
                                  103.00919, 129.04259, 128.05858, 57.02146,
                                  137.05891, 113.08406, 113.08406, 128.09496,
                                  131.04049, 147.06841, 97.05276,  87.03203,
                                  101.04768, 186.07931, 163.06333, 99.06841),
                  Abbrev3 = c(NA,
                              "Ala", "Arg", "Asn", "Asp", "Cys",
                              "Glu", "Gln", "Gly", "His", "Ile",
                              "Leu", "Lys", "Met", "Phe", "Pro",
                              "Ser", "Thr", "Trp", "Tyr", "Val"),
                  ImmoniumIonMass = c(NA,
                                      44.05003,  129.11400, 87.05584,  88.03986,  76.02210,
                                      102.05550, 101.07150, 30.03438,  110.07180, 86.09698,
                                      86.09698,  101.10790, 104.05340, 120.08130, 70.06568,
                                      60.04494,  74.06059,  159.09220, 136.07620, 72.08133),
                  Name = c("Polyethylene glycol",
                           "Alanine",    "Arginine",      "Asparagine", "Aspartic acid",
                           "Cysteine",   "Glutamic acid", "Glutamine",  "Glycine",
                           "Histidine",  "Isoleucine",    "Leucine",    "Lysine",
                           "Methionine", "Phenylalanine", "Proline",    "Serine",
                           "Threonine",  "Tryptophan",    "Tyrosine",   "Valine"),
                  ## The hydrophobicity values are from JACS, 1962, 84: 4240-4246. (C. Tanford)
                  Hydrophobicity = c(NA, 0.62, -2.53, -0.78, -0.9, 0.29,
                                     -0.74, -0.85, 0.48, -0.4, 1.38, 1.06, -1.5, 0.64, 1.19,
                                     0.12, -0.18, -0.05, 0.81, 0.26, 1.08),
                  ## The hydrophilicity values are from PNAS, 1981, 78:3824-3828 (T.P.Hopp & K.R.Woods)
                  Hydrophilicity = c(NA, -0.5, 3, 0.2, 3, -1, 3, 0.2, 0,
                                     -0.5, -1.8, -1.8, 3, -1.3, -2.5, 0, 0.3, -0.4, -3.4,
                                     -2.3, -1.5),
                  SideChainMass = c(NA, 15, 101, 58, 59, 47, 73, 72,
                                    1, 82, 57, 57, 73, 75, 91, 42, 31, 45, 130, 107, 43),
                  ## CRC Handbook of Chemistry and Physics, 66th ed., CRC Press, Boca Raton, Florida (1985).
                  ## R.M.C. Dawson, D.C. Elliott, W.H. Elliott, K.M. Jones, Data for Biochemical Research 3rd ed., Clarendon Press Oxford
                  pK1 = c(NA, 2.35, 2.18, 2.18, 1.88, 1.71, 2.19,
                          2.17, 2.34, 1.78, 2.32, 2.36, 2.2, 2.28, 2.58,
                          1.99, 2.21, 2.15, 2.38, 2.2, 2.29),
                  pK2 = c(NA, 9.87, 9.09, 9.09, 9.6, 10.78, 9.67, 9.13,
                          9.6, 8.97, 9.76, 9.6, 8.9, 9.21, 9.24, 10.6, 9.15,
                          9.12, 9.39, 9.11, 9.74),
                  pI = c(NA, 6.11, 10.76, 10.76, 2.98, 5.02, 3.08,
                         5.65, 6.06, 7.64, 6.04, 6.04, 9.47, 5.74, 5.91,
                         6.3, 5.68, 5.6, 5.88, 5.63, 6.02)),
       envir = .MSnbaseEnv)

## Atomic mass table, stored as "atomic.mass" in the package environment and
## read via .get.atomic.mass(). Entries are keyed by element symbol; "p" is
## the value calculateFragments() adds as the proton (H+).
assign("atomic.mass",
       ## taken from:
       ## http://www.chem.ualberta.ca/~massspec/atomic_mass_abund.pdf
       c(H=1.007825,
         C=12,
         N=14.003074,
         O=15.994915,
         p=1.007276),
       envir = .MSnbaseEnv)

## Lock the environment and all its bindings. This must stay after every
## assign() above: once locked, no binding can be added or changed.
lockEnvironment(.MSnbaseEnv,bindings=TRUE)

## Public accessor for the amino acid table; delegates to .get.amino.acids().
get.amino.acids <- function() {
  .get.amino.acids()
}

## Look up the "amino.acids" binding in the package-private environment.
.get.amino.acids <- function() {
  get(x = "amino.acids", envir = .MSnbaseEnv)
}

## Verbosity switch used as the default for `verbose` arguments.
## In this file it is hard-wired to FALSE.
isMSnbaseVerbose <- function() {
  FALSE
}