From f9deb29cdbd2d2a5f2f4fbd470b1078431a36ae0 Mon Sep 17 00:00:00 2001 From: Polina Polunina <55543056+PlushZ@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:44:57 +0200 Subject: [PATCH] add hgvsparser (#6057) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add hgvsparser * fix shed.xml and citation * change test data file * Update tools/hgvsparser/hgvsparser.xml Co-authored-by: Björn Grüning * include required files --------- Co-authored-by: Björn Grüning --- tools/hgvsparser/.shed.yml | 9 + tools/hgvsparser/buildHGVS.R | 512 ++++++++++++++ tools/hgvsparser/hgvsparser.xml | 152 ++++ tools/hgvsparser/parseHGVS.R | 734 ++++++++++++++++++++ tools/hgvsparser/test-data/inputBuilder.csv | 6 + tools/hgvsparser/test-data/inputParser.csv | 6 + 6 files changed, 1419 insertions(+) create mode 100644 tools/hgvsparser/.shed.yml create mode 100644 tools/hgvsparser/buildHGVS.R create mode 100644 tools/hgvsparser/hgvsparser.xml create mode 100644 tools/hgvsparser/parseHGVS.R create mode 100644 tools/hgvsparser/test-data/inputBuilder.csv create mode 100644 tools/hgvsparser/test-data/inputParser.csv diff --git a/tools/hgvsparser/.shed.yml b/tools/hgvsparser/.shed.yml new file mode 100644 index 00000000000..0be98864519 --- /dev/null +++ b/tools/hgvsparser/.shed.yml @@ -0,0 +1,9 @@ +name: hgvsparser +owner: iuc +description: Parsing and building variant descriptor strings compliant with the HGVS standard +homepage_url: https://github.com/VariantEffect/hgvsParseR/tree/master +long_description: | + hgvsParseR is an R package for parsing the HGVS strings that are commonly used to describe variation. +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/hgvsparser/ +categories: + - Variant Analysis diff --git a/tools/hgvsparser/buildHGVS.R b/tools/hgvsparser/buildHGVS.R new file mode 100644 index 00000000000..55d4c55e06c --- /dev/null +++ b/tools/hgvsparser/buildHGVS.R @@ -0,0 +1,512 @@ +# Copyright (C) 2018 Jochen Weile, Roth Lab +# +# This file is part of hgvsParseR. +# +# hgvsParseR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# hgvsParseR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with hgvsParseR. If not, see . + +#' Genomic HGVS Builder +#' +#' A constructor for a genomic-level HGVS builder object. The object contains a collection of functions +#' for building genomic HGVS strings. +#' +#' The resulting object encapsulates the following functions: +#' \itemize{ +#' \item{substitution(pos,ancestral,variant)} Genomic substitution variants. +#' pos = position (integer); ancestral = ancestral nucleotide [ACGT]; +#' variant = variant nucleotide [ACGT] +#' \item{deletion(start,stop)} Genomic deletion. start = start position (integer); +#' stop = stop position (integer) +#' \item{inversion(start,stop)} Genomic inversion. start = start position (integer); +#' stop = stop position (integer) +#' \item{duplication(start,stop)} Genomic duplication. start = start position (integer); +#' stop = stop position (integer) +#' \item{insertion(start,variant)} Genomic insertion. start = position immediately preceeding +#' the insertion (integer); seq = inserted nucleotide sequence [ACGT]+ +#' \item{delins(start,stop,variant)} Genomic deletion and insertion. start = start position (integer); +#' stop = stop position relative to the reference (integer); seq = inserted nucleotide sequence [ACGT]+ +#' \item{cis(...)} Multi-variant phased in cis. Parameters are genomic HGVS strings for the +#' corresponding single mutants +#' \item{trans(...)} Multi-variant phased in trans. Parameters are genomic HGVS strings for the +#' corresponding single mutants +#' \item{nophase(...)} Multi-variant with unknown phasing. Parameters are genomic HGVS strings for the +#' corresponding single mutants +#' } +#' +#' @return A \code{hgvs.builder.g} object with functions for building genomic HGVS strings. +#' The individual functions return single-element character vectors containing these strings. +#' @keywords HGVS builder +#' @export +#' @examples +#' builder <- new.hgvs.builder.g() +#' string1 <- builder$substitution(123,"A","G") +#' string2 <- builder$delins(123,129,"ATTG") +#' string3 <- with(builder,cis(substitution(123,"A","C"),substitution(231,"G","A"))) + +new.hgvs.builder.g <- function() { + + substitution <- function(pos,ancestral,variant) { + if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer") + if (!is.character(ancestral) || !(ancestral %in% c("A","C","G","T"))) stop("ancestral must be single nucleotide") + if (!is.character(variant) || !(variant %in% c("A","C","G","T"))) stop("variant must be single nucleotide") + paste0("g.",pos,ancestral,">",variant) + } + + deletion <- function(start,stop) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (start > stop) stop("start must be upstream of stop") + paste0("g.",start,"_",stop,"del") + } + + inversion <- function(start,stop) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (start > stop) stop("start must be upstream of stop") + paste0("g.",start,"_",stop,"inv") + } + + duplication <- function(start,stop) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (start > stop) stop("start must be upstream of stop") + paste0("g.",start,"_",stop,"dup") + } + + insertion <- function(start,seq) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence") + paste0("g.",start,"_",start+1,"ins",seq) + } + + delins <- function(start,stop,seq) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (start > stop) stop("start must be upstream of stop") + if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence") + paste0("g.",start,"_",stop,"delins",seq) + } + + cis <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("g.[",paste(bodies,collapse=";"),"]") + } + + trans <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("g.[",paste(bodies,collapse="];["),"]") + } + + nophase <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("g.[",paste(bodies,collapse="(;)"),"]") + } + + return(structure(list( + substitution=substitution, + deletion=deletion, + inversion=inversion, + duplication=duplication, + insertion=insertion, + delins=delins, + cis=cis, + trans=trans, + nophase=nophase + ),class="hgvs.builder.g")) +} + +print.hgvs.builder.g <- function() { + cat("Genomic HGVS string builder. Use $ operator to access functions.") +} + + + +#' Coding Sequence HGVS Builder +#' +#' A constructor for a CDS (=coding sequence) HGVS builder object. The object contains a collection of functions +#' for building CDS HGVS strings. +#' The resulting object encapsulates the following functions: +#' \itemize{ +#' \item{substitution(pos,ancestral,variant,posOffset=0)} CDS substitution variants. +#' pos = position (integer); ancestral = ancestral nucleotide [ACGT]; +#' variant = variant nucleotide [ACGT]; posOffset = offset from the position when +#' crossing exon-intron borders (integer, defaults to 0) +#' \item{deletion(start,stop,startOffset=0,stopOffset=0)} CDS deletion. start = start position (integer); +#' stop = stop position (integer); startOffset = offset from the start position when +#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the +#' stop position when crossing exon-intron borders (integer, defaults to 0) +#' \item{inversion(start,stop,startOffset=0,stopOffset=0)} CDS inversion. start = start position (integer); +#' stop = stop position (integer); startOffset = offset from the start position when +#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the +#' stop position when crossing exon-intron borders (integer, defaults to 0) +#' \item{duplication(start,stop,startOffset=0,stopOffset=0)} CDS duplication. start = start position (integer); +#' stop = stop position (integer); startOffset = offset from the start position when +#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the +#' stop position when crossing exon-intron borders (integer, defaults to 0) +#' \item{insertion(start,variant,startOffset=0)} CDS insertion. start = position immediately preceeding +#' the insertion (integer); seq = inserted nucleotide sequence [ACGT]+ ; +#' startOffset = offset from the start position when crossing exon-intron borders +#' (integer, defaults to 0) +#' \item{delins(start,stop,variant,startOffset=0,stopOffset=0)} CDS deletion and insertion. start = start position (integer); +#' stop = stop position relative to the reference (integer); +#' seq = inserted nucleotide sequence [ACGT]+ ; startOffset = offset from the start position when +#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the +#' stop position when crossing exon-intron borders (integer, defaults to 0) +#' \item{cis(...)} Multi-variant phased in cis. Parameters are coding HGVS strings for the +#' corresponding single mutants +#' \item{trans(...)} Multi-variant phased in trans. Parameters are coding HGVS strings for the +#' corresponding single mutants +#' \item{nophase(...)} Multi-variant with unknown phasing. Parameters are coding HGVS strings for the +#' corresponding single mutants +#' } +#' +#' @return A \code{hgvs.builder.c} object with functions for building coding HGVS strings. +#' The individual functions return single-element character vectors containing these strings. +#' @keywords HGVS builder +#' @export +#' @examples +#' builder <- new.hgvs.builder.c() +#' string1 <- builder$substitution(123,"A","G",posOffset=2) +#' string2 <- builder$delins(123,129,"ATTG") +#' string3 <- with(builder,cis(substitution(123,"A","C"),substitution(231,"G","A"))) + +new.hgvs.builder.c <- function() { + + offsetStr <- function(offset) { + if (offset==0) { + "" + } else if (offset > 0) { + paste0("+",offset) + } else if (offset < 0) { + as.character(offset) + } + } + + substitution <- function(pos,ancestral,variant,posOffset=0) { + if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer") + if (!is.numeric(posOffset)) stop("offset must be an integer") + if (!is.character(ancestral) || !(ancestral %in% c("A","C","G","T"))) stop("ancestral must be single nucleotide") + if (!is.character(variant) || !(variant %in% c("A","C","G","T"))) stop("variant must be single nucleotide") + paste0("c.",pos,offsetStr(posOffset),ancestral,">",variant) + } + + deletion <- function(start,stop,startOffset=0,stopOffset=0) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (!is.numeric(startOffset)) stop("offset must be an integer") + if (!is.numeric(stopOffset)) stop("offset must be an integer") + if (start+startOffset > stop+stopOffset) stop("start must be before stop") + paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"del") + } + + inversion <- function(start,stop,startOffset=0,stopOffset=0) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (!is.numeric(startOffset)) stop("offset must be an integer") + if (!is.numeric(stopOffset)) stop("offset must be an integer") + if (start+startOffset >= stop+stopOffset) stop("start must be before stop") + paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"inv") + } + + duplication <- function(start,stop,startOffset=0,stopOffset=0) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (!is.numeric(startOffset)) stop("offset must be an integer") + if (!is.numeric(stopOffset)) stop("offset must be an integer") + if (start+startOffset > stop+stopOffset) stop("start must be before stop") + paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"dup") + } + + insertion <- function(start,seq,startOffset=0) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) { + stop("variant must be nucleotide sequence") + } + if (!is.numeric(startOffset)) stop("offset must be an integer") + stop <- if (startOffset != 0) start else start+1 + stopOffset <- if (startOffset != 0) startOffset+1 else startOffset + paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"ins",seq) + } + + delins <- function(start,stop,seq,startOffset=0,stopOffset=0) { + if (!is.numeric(start)) stop("start must be an integer") + if (!is.numeric(stop)) stop("stop must be an integer") + if (!is.numeric(startOffset)) stop("offset must be an integer") + if (!is.numeric(stopOffset)) stop("offset must be an integer") + if (start+startOffset > stop+stopOffset) stop("start must be before stop") + if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence") + paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"delins",seq) + } + + cis <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("c.[",paste(bodies,collapse=";"),"]") + } + + trans <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("c.[",paste(bodies,collapse="];["),"]") + } + + nophase <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("c.[",paste(bodies,collapse="(;)"),"]") + } + + return(structure(list( + substitution=substitution, + deletion=deletion, + inversion=inversion, + duplication=duplication, + insertion=insertion, + delins=delins, + cis=cis, + trans=trans, + nophase=nophase + ),class="hgvs.builder.c")) +} + +print.hgvs.builder.c <- function() { + cat("Coding-sequence HGVS string builder. Use $ operator to access functions.") +} + + + +#' Protein HGVS Builder +#' +#' A constructor for a protein-level HGVS builder object. The object contains a collection of functions +#' for building protein HGVS strings. +#' +#' The resulting object encapsulates the following functions: +#' \itemize{ +#' \item{synonymous()} A synonymous variant. No parameters required. +#' \item{synonymous(pos,ancestral)} Unofficial (yet frequently used) version of synonymous variant syntax. +#' pos = position (integer); ancestral = ancestral amino acid in one-letter or three-letter code. +#' \item{substitution(pos,ancestral,variant)} AA substitution variants. +#' pos = position (integer); ancestral = ancestral amino acid in one-letter or three-letter code; +#' variant = variant amino acid in one-letter or three-letter code +#' \item{deletion(startPos,startAA,endPos,endAA)} AA deletion. startPos = start position (integer); +#' startAA = start amino acid in one-letter or three-letter code; +#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code +#' \item{duplication(startPos,startAA,endPos,endAA)} AA duplication. startPos = start position (integer); +#' startAA = start amino acid in one-letter or three-letter code; +#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code +#' \item{insertion(leftPos,leftAA,rightAA,seq)} AA insertion. leftPos = position immediately preceeding +#' the insertion (integer); leftAA = corresponding amino acid in one-letter or three-letter code; +#' rightAA = amino acid to the right of the insertion, in one-letter or three-letter code; +#' seq = inserted amino acid sequence, given as a character vector containing the individual +#' one-letter or three-letter amino acid codes. +#' \item{delins(startPos,startAA,endPos,endAA,seq)} AA deletion and insertion. +#' startPos = start position (integer); +#' startAA = start amino acid in one-letter or three-letter code; +#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code; +#' seq = inserted amino acid sequence, given as a character vector containing the individual +#' one-letter or three-letter amino acid codes. +#' \item{frameshift(startPos,startAA,variantAA=NA,newStop=NA)} Frameshift variant. +#' startPos = start position (integer); +#' startAA = start amino acid in one-letter or three-letter code; +#' variantAA = amino acid replacing the start position in the frameshift sequence, +#' given in one-letter or three-letter code, or \code{NA} to omit (default); +#' newStop = the position of the nearest coding resulting from the frameshift, +#' or \code{NA} to omit (default). +#' \item{cis(...)} Multi-variant phased in cis. Parameters are coding HGVS strings for the +#' corresponding single mutants. As phasing in trans would be nonsensical in a protein context, +#' the \code{trans()} and \code{nophase()} methods are not provided here. +#' } +#' +#' @return A \code{hgvs.builder.g} object with functions for building genomic HGVS strings. +#' The individual functions return single-element character vectors containing these strings. +#' @keywords HGVS builder +#' @export +#' @examples +#' builder <- new.hgvs.builder.g() +#' string1 <- builder$substitution(123,"R","K") +#' string2 <- builder$delins(123,"Arg",152,"Leu",c("Lys","Trp","Ser")) +#' string3 <- with(builder,cis(substitution(123,"R","K"),deletion(125,"S",152,"L"))) + + +new.hgvs.builder.p <- function(aacode=c(1,3)) { + + aacode <- aacode[[1]] + if (!is.numeric(aacode) && !(aacode %in% c(1,3))) { + stop("Invalid aacode parameter, only 1 or 3 allowed!") + } + + one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His", + I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg", + S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter") + three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E", + Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P", + Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*") + + enforceCode <- function(aa) { + if (aa %in% one2three) { + if (aacode == 1) { + three2one[[aa]] + } else { + aa + } + } else if (aa %in% three2one) { + if (aacode == 1) { + aa + } else { + one2three[[aa]] + } + } else { + stop("Invalid AA code") + } + } + + synonymous <- function(pos=NULL,ancestral=NULL) { + if (is.null(pos) || is.null(ancestral)) { + return("p.=") + } + if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer") + if (!is.character(ancestral) || !(ancestral %in% c(one2three,three2one))) stop("ancestral must be single amimo acid") + ancestral <- enforceCode(ancestral) + paste0("p.",ancestral,pos,"=") + } + + substitution <- function(pos,ancestral,variant) { + if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer") + if (!is.character(ancestral) || !(ancestral %in% c(one2three,three2one))) stop("ancestral must be single amimo acid") + if (!is.character(variant) || !(variant %in% c(one2three,three2one))) stop("variant must be single amino acid") + ancestral <- enforceCode(ancestral) + variant <- enforceCode(variant) + paste0("p.",ancestral,pos,variant) + } + + deletion <- function(startPos,startAA,endPos,endAA) { + if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer") + if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer") + if (startPos > endPos) stop("start must be upstream of stop") + if (!is.character(startAA) || !(startAA %in% c(one2three,three2one))) stop("startAA must be single amimo acid") + if (!is.character(endAA) || !(endAA %in% c(one2three,three2one))) stop("endAA must be single amimo acid") + startAA <- enforceCode(startAA) + endAA <- enforceCode(endAA) + if (startPos==endPos) { + paste0("p.",startAA,startPos,"del") + } else { + paste0("p.",startAA,startPos,"_",endAA,endPos,"del") + } + } + + duplication <- function(startPos,startAA,endPos,endAA) { + if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer") + if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer") + if (startPos >= endPos) stop("start must be upstream of stop") + if (!is.character(startAA) || !(startAA %in% c(one2three,three2one))) + stop("startAA must be single amimo acid") + if (!is.character(endAA) || !(endAA %in% c(one2three,three2one))) + stop("endAA must be single amimo acid") + startAA <- enforceCode(startAA) + endAA <- enforceCode(endAA) + paste0("p.",startAA,startPos,"_",endAA,endPos,"dup") + } + + insertion <- function(leftPos,leftAA,rightAA,seq) { + if (!is.numeric(leftPos) || leftPos < 1) stop("position must be a positive integer") + if (!is.character(leftAA) || !(leftAA %in% c(one2three,three2one))) + stop("leftAA must be single amimo acid") + if (!is.character(rightAA) || !(rightAA %in% c(one2three,three2one))) + stop("rightAA must be single amimo acid") + if (!is.character(seq) || !all(sapply(seq,function(x) x %in% c(one2three,three2one)))) + stop("seq must be a vector of amino acids") + rightPos <- leftPos+1 + leftAA <- enforceCode(leftAA) + rightAA <- enforceCode(rightAA) + seq <- paste(sapply(seq,enforceCode),collapse="") + paste0("p.",leftAA,leftPos,"_",rightAA,rightPos,"ins",seq) + } + + delins <- function(startPos,startAA,endPos,endAA,seq) { + if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer") + if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer") + if (startPos > endPos) stop("start must be upstream of stop") + if (!is.character(startAA) || !(startAA %in% c(one2three,three2one))) + stop("startAA must be single amimo acid") + if (!is.character(endAA) || !(endAA %in% c(one2three,three2one))) + stop("endAA must be single amimo acid") + if (!is.character(seq) || !all(sapply(seq,function(x) x %in% c(one2three,three2one)))) + stop("seq must be a vector of amino acids") + startAA <- enforceCode(startAA) + endAA <- enforceCode(endAA) + seq <- paste(sapply(seq,enforceCode),collapse="") + paste0("p.",startAA,startPos,"_",endAA,endPos,"delins",seq) + } + + frameshift <- function(startPos,startAA,variantAA=NA,newStop=NA) { + if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer") + if (!is.na(newStop) && (!is.numeric(newStop) || newStop < 1)) stop("position must be a positive integer") + if (!is.character(startAA) || !(startAA %in% c(one2three,three2one))) + stop("startAA must be single amimo acid or NA") + if (!is.na(variantAA) && (!is.character(startAA) || !(startAA %in% c(one2three,three2one)))) + stop("variantAA must be single amimo acid or NA") + startAA <- enforceCode(startAA) + if (is.na(variantAA)) { + variantAA <- "" + } else { + variantAA <- enforceCode(variantAA) + } + if (is.na(newStop)) { + newStop <- "" + } else { + newStop <- paste0("*",newStop) + } + paste0("p.",startAA,startPos,variantAA,"fs",newStop) + } + + cis <- function(...) { + strings <- list(...) + if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings") + strings <- unlist(strings) + if (!all(substr(strings,1,2)=="p.")) stop("all arguments must be protein HGVS strings") + bodies <- substr(strings,3,nchar(strings)) + paste0("p.[",paste(bodies,collapse=";"),"]") + } + + return(structure(list( + synonymous=synonymous, + substitution=substitution, + deletion=deletion, + duplication=duplication, + insertion=insertion, + delins=delins, + frameshift=frameshift, + cis=cis + ),class="hgvs.builder.p")) +} \ No newline at end of file diff --git a/tools/hgvsparser/hgvsparser.xml b/tools/hgvsparser/hgvsparser.xml new file mode 100644 index 00000000000..38ed5998d48 --- /dev/null +++ b/tools/hgvsparser/hgvsparser.xml @@ -0,0 +1,152 @@ + + parsing and building variant descriptor strings compliant with the HGVS standard + + 0.1.0 + 0 + + + r-base + + + + + + tmp_output.csv + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + action_choice['do'] == 'parse' + + + action_choice['do'] == 'build' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1186/s13059-019-1845-6 + + \ No newline at end of file diff --git a/tools/hgvsparser/parseHGVS.R b/tools/hgvsparser/parseHGVS.R new file mode 100644 index 00000000000..da8552b7cb8 --- /dev/null +++ b/tools/hgvsparser/parseHGVS.R @@ -0,0 +1,734 @@ +# Copyright (C) 2018 Jochen Weile, Roth Lab +# +# This file is part of hgvsParseR. +# +# hgvsParseR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# hgvsParseR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with hgvsParseR. If not, see . + +#' HGVS Parser +#' +#' Parses HGVS strings +#' @param strings A character vector containing the HGVS strings +#' @param aacode allowed values: 1, 3, or NA. Determines whether 1-letter codes or 3-letter codes should be forced. NA uses input format. +#' @return A \code{data.frame} with the following columns: +#' @keywords HGVS parsing +#' @export +#' @examples +#' result <- parseHGVS(c("g.1318G>T","c.123_125inv","p.R123_L152del")) + +parseHGVS <- function(strings,aacode=c(NA,1,3)) { + + #Check that parameters are valid + if (!is.character(strings)) { + stop("Input for 'parse' function must be a character vector! Found '",class(strings),"' instead.") + } + + aacode <- aacode[[1]] + if (!is.na(aacode) && !(aacode %in% c(1,3))) { + warning("Invalid aacode parameter, defaulting to NA!") + aacode <- NA + } + + #Helper function: turns a list of lists (lol) in to a dataframe + to.df <- function(lol) { + colnames <- unique(do.call(c,lapply(lol,names))) + columns <- lapply(colnames,function(cn) sapply(lol,function(row) { + if (cn %in% names(row)) row[[cn]] else NA + })) + names(columns) <- colnames + empty <- which(sapply(columns,function(xs)all(is.na(xs)))) + columns[empty] <- NULL + do.call(data.frame,columns) + } + + # ### + # # Binds matrices of same size together to a 3D matrix, analogously + # # to cbind and rbind. + # # + # zbind <- function(...) { + # x <- list(...) + # y <- array(0,dim=c(nrow(x[[1]]),ncol(x[[1]]),length(x)),dimnames=dimnames(x[[1]])) + # for (i in 1:length(x)) y[,,i] <- x[[i]] + # y + # } + + + ### + # Function to *locally* excise regex groups from string vectors. + # I.e. only extract the first occurrence of each group within each string. + # x = string vector + # re = regular expression with groups + # + extract.groups <- function(x, re) { + matches <- regexpr(re,x,perl=TRUE) + start <- attr(matches,"capture.start") + end <- start + attr(matches,"capture.length") - 1 + do.call(cbind,lapply(1:ncol(start), function(i) { + sapply(1:nrow(start),function(j){ + if (start[j,i] > -1) substr(x[[j]],start[j,i],end[j,i]) else NA + }) + })) + } + + # ### + # # Function to *globally* excise regex groups from string vectors. + # # x = string vector + # # re = regular expression with groups + # # + # global.extract.groups <- function(x,re) { + # all.matches <- gregexpr(re,x,perl=TRUE) + # mapply(function(matches,x) { + # start <- attr(matches,"capture.start") + # end <- start + attr(matches,"capture.length") - 1 + # apply(zbind(start,end),c(1,2),function(pos) substr(x,pos[[1]],pos[[2]]) ) + # },matches=all.matches,x=x,SIMPLIFY=FALSE) + # } + + ### + # Helper function to split multi-mutant bodies into their individual + # elements. Returns a vector of strings containing these elements. + # An attribute "multi" is attached to the vector, detailing the type + # of multi-mutant + # + splitMulti <- function(body) { + if (regexpr("\\[.+\\];\\[.+\\]",body) > 0) { + out <- strsplit(substr(body,2,nchar(body)-1),"\\];\\[")[[1]] + attr(out,"multi") <- "trans" + } else if (regexpr("\\[.+\\(;\\).+\\]",body) > 0) { + out <- strsplit(substr(body,2,nchar(body)-1),"\\(;\\)")[[1]] + attr(out,"multi") <- "unknown" + } else if (regexpr("\\[.+;.+\\]",body) > 0) { + out <- strsplit(substr(body,2,nchar(body)-1),";")[[1]] + attr(out,"multi") <- "cis" + } else { + out <- body + attr(out,"multi") <- "single" + } + return(out) + } + + ### + # Helper function: + # Given an HGVS body and a list of regexes corresponding to types, + # find the (first) matching type. + findType <- function(body,types) { + i <- 0 + found <- done <- FALSE + while (!found && !done) { + found <- regexpr(types[[i <- i+1]],body) > 0 + done <- i >= length(types) + } + if (found) { + return(names(types)[[i]]) + } else { + return("invalid") + } + } + + out <- lapply(strings,function(s) { + + if (regexpr("^[gcnmrp]\\.",s) < 1) { + return(list(list(hgvs=s,subject="invalid",type="invalid"))) + } + + body <- substr(s,3,nchar(s)) + + subbodies <- splitMulti(body) + + subjects <- c( + g="genomic",c="coding",n="noncoding", + m="mitochondrial",r="rna",p="protein" + ) + subject <- subjects[[substr(s,1,1)]] + + if (subject=="genomic") { + + types <- c( + substitution="\\d+[ACGT]>[ACGT]", singledeletion="^\\d+del$", + deletion="\\d+_\\d+del$",inversion="\\d+_\\d+inv", + duplication="\\d+_\\d+dup",insertion="\\d+_\\d+ins[ATCG]+", + conversion="\\d+_\\d+con\\d+_\\d+",delins="\\d+_\\d+delins[ATCG]+", + amplification="\\d+_\\d+\\[\\d+\\]" + ) + + phasing <- attr(subbodies,"multi") + isMulti <- length(subbodies) > 1 + + lapply(1:length(subbodies), function(i.multi) { + body <- subbodies[[i.multi]] + + type <- findType(body,types) + + if (type == "substitution") { + groups <- extract.groups(body,"(\\d+)([ACGT])>([ACGT])")[1,] + position <- as.integer(groups[[1]]) + ancestral <- groups[[2]] + variant <- groups[[3]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=position,ancestral=ancestral,variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=position, + ancestral=ancestral,variant=variant)) + } + + } else if (type == "singledeletion") { + groups <- extract.groups(body,"(\\d+)del")[1,] + position <- as.integer(groups[[1]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=position)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=position)) + } + + } else if (type == "deletion") { + groups <- extract.groups(body,"(\\d+)_(\\d+)del") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) + } + + } else if (type == "inversion") { + groups <- extract.groups(body,"(\\d+)_(\\d+)inv") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) + } + + } else if (type == "duplication") { + groups <- extract.groups(body,"(\\d+)_(\\d+)dup") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end)) + } + + } else if (type == "insertion") { + groups <- extract.groups(body,"(\\d+)_(\\d+)ins([ATCG]+)") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + if (abs(end-start)!=1) { + warning("Invalid insertion definition: + Start and end positions must be adjacent!") + } + variant <- groups[[3]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end,variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end, + variant=variant)) + } + + } else if (type == "conversion") { + groups <- extract.groups(body,"(\\d+)_(\\d+)con(\\d+)_(\\d+)") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + tStart <- as.integer(groups[[3]]) + tEnd <- as.integer(groups[[4]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end, + templateStart=tStart,templateEnd=tEnd)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end, + templateStart=tStart,templateEnd=tEnd)) + } + + } else if (type == "delins") { + groups <- extract.groups(body,"(\\d+)_(\\d+)delins([ATCG]+)") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + variant <- groups[[3]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end,variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start,end=end, + variant=variant)) + } + + } else if (type == "amplification") { + groups <- extract.groups(body,"(\\d+)_(\\d+)\\[(\\d+)\\]") + start <- as.integer(groups[[1]]) + end <- as.integer(groups[[2]]) + copies <- as.integer(groups[[3]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,end=end,copies=copies)) + } else { + return(list(hgvs=s,subject=subject,type=type, + start=start,end=end,copies=copies)) + } + + } else { + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type="invalid")) + } else { + return(list(hgvs=s,subject=subject,type="invalid")) + } + } + + }) + + } else if (subject=="coding") { + #coding needs to be handled separately from genomic, as the syntax may differ + #e.g. it allows for offset descriptions relative to exon-intron borders + + types <- c( + substitution="\\d+([+-]\\d+)?[ACGT]>[ACGT]", + singledeletion="^\\d+([+-]\\d+)?del$", + deletion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?del$", + inversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?inv", + duplication="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?dup", + insertion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?ins[ATCG]+", + conversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?con\\d+([+-]\\d+)?_\\d+([+-]\\d+)?", + delins="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?delins[ATCG]+", + amplification="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?\\[\\d+\\]" + ) + + phasing <- attr(subbodies,"multi") + isMulti <- length(subbodies) > 1 + + lapply(1:length(subbodies), function(i.multi) { + body <- subbodies[[i.multi]] + + type <- findType(body,types) + + if (type == "substitution") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?([ACGT])>([ACGT])")[1,] + position <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + ancestral <- groups[[3]] + variant <- groups[[4]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=position,startIntron=intronOffset,ancestral=ancestral, + variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=position, + startIntron=intronOffset,ancestral=ancestral,variant=variant)) + } + + } else if (type == "singledeletion") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?del")[1,] + position <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=position,startIntron=intronOffset)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=position, + startIntron=intronOffset)) + } + } else if (type == "deletion") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?del") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2)) + } + } else if (type == "inversion") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?inv") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2)) + } + } else if (type == "duplication") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?dup") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2)) + } + } else if (type == "insertion") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?ins([ATCG]+)") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + variant <- groups[[5]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2,variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) + } + } else if (type == "conversion") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?con(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + tStart <- as.integer(groups[[5]]) + intronOffset3 <- as.integer(groups[[6]]) + tEnd <- as.integer(groups[[7]]) + intronOffset4 <- as.integer(groups[[8]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2,templateStart=tStart, + templateStartIntron=intronOffset3,templateEnd=tEnd, + templateEndIntron=intronOffset4)) + } else { + return(list(hgvs=s,subject=subject,type=type, + start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2,templateStart=tStart, + templateStartIntron=intronOffset3,templateEnd=tEnd, + templateEndIntron=intronOffset4)) + } + } else if (type == "delins") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?delins([ATCG]+)") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + variant <- groups[[5]] + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2,variant=variant)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant)) + } + } else if (type == "amplification") { + groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?\\[(\\d+)\\]") + start <- as.integer(groups[[1]]) + intronOffset <- as.integer(groups[[2]]) + end <- as.integer(groups[[3]]) + intronOffset2 <- as.integer(groups[[4]]) + copies <- as.integer(groups[[3]]) + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=start,startIntron=intronOffset, + end=end,endIntron=intronOffset2,copies=copies)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=start, + startIntron=intronOffset,end=end,endIntron=intronOffset2,copies=copies)) + } + } else { + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="invalid")) + } else { + return(list(hgvs=s,subject=subject,type="invalid")) + } + } + + }) + + } else if (subject=="protein") { + + one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His", + I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg", + S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter") + three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E", + Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P", + Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*") + codes <- paste(c(one2three,three2one[-21],"\\*"),collapse="|") + + types <- list( + synonymous1="^=$", + synonymous2=paste0("^(",codes,")(\\d+)=$"), + substitution=paste0("^(",codes,")(\\d+)(",codes,")$"), + singledeletion=paste0("^(",codes,")(\\d+)del$"), + deletion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)del$"), + duplication=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)dup$"), + insertion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)ins((",codes,")+)$"), + delins=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)delins((",codes,")+)$"), + frameshift1=paste0("^(",codes,")(\\d+)fs$"), + frameshift2=paste0("^(",codes,")(\\d+)(",codes,")fs(Ter|\\*)(\\d+)$") + ) + + phasing <- attr(subbodies,"multi") + isMulti <- length(subbodies) > 1 + + lapply(1:length(subbodies), function(i.multi) { + body <- subbodies[[i.multi]] + + type <- findType(body,types) + + if (type == "synonymous1") { + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type=type)) + } else { + return(list(hgvs=s,subject=subject,type="synonymous")) + } + } else if (type == "synonymous2") { + groups <- extract.groups(body,types$synonymous2) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + if (aa1 %in% c(one2three,three2one)) { + if (is.na(aacode)) { + #do nothing + } else if (aacode == 1) { + if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] + } else if (aacode ==3) { + if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] + } else { + #this should never happen, as it's supposed to be detected at start of function + stop("Invalid aacode. If you see this, report this as a bug!") + } + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="synonymous",start=pos,ancestral=aa1)) + } else { + return(list(hgvs=s,subject=subject,type="synonymous",start=pos, + ancestral=aa1)) + } + } else {#not valid amino acid + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="invalid")) + } else { + return(list(hgvs=s,subject=subject,type="invalid")) + } + } + } else if (type == "substitution") { + groups <- extract.groups(body,types$substitution) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + if (aa1 %in% c(one2three,three2one) && aa2 %in% c(one2three,three2one)) { + if (is.na(aacode)) { + #do nothing + } else if (aacode == 1) { + if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] + if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] + } else if (aacode ==3) { + if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] + if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] + } else { + #this should never happen, as it's supposed to be detected at start of function + stop("Invalid aacode. If you see this, report this as a bug!") + } + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1,variant=aa2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos, + ancestral=aa1,variant=aa2)) + } + } else {#not valid amino acid + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="invalid")) + } else { + return(list(hgvs=s,subject=subject,type="invalid")) + } + } + + } else if (type == "singledeletion") { + groups <- extract.groups(body,types$singledeletion) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + if (is.na(aacode)) { + #do nothing + } else if (aacode == 1) { + if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] + } else if (aacode == 3) { + if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] + } else { + #this should never happen, as it's supposed to be detected at start of function + stop("Invalid aacode. If you see this, report this as a bug!") + } + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos,ancestral=aa1)) + } + } else if (type == "deletion") { + groups <- extract.groups(body,types$deletion) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + pos2 <- as.integer(groups[[4]]) + if (is.na(aacode)) { + #do nothing + } else if (aacode == 1) { + if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] + if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] + } else if (aacode == 3) { + if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] + if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] + } else { + #this should never happen, as it's supposed to be detected at start of function + stop("Invalid aacode. If you see this, report this as a bug!") + } + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos, + ancestral=aa1,end=pos2,ancestral2=aa2)) + } + + } else if (type == "duplication") { + groups <- extract.groups(body,types$duplication) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + pos2 <- as.integer(groups[[4]]) + if (is.na(aacode)) { + #do nothing + } else if (aacode == 1) { + if (nchar(aa1) == 3) aa1 <- three2one[[aa1]] + if (nchar(aa2) == 3) aa2 <- three2one[[aa2]] + } else if (aacode == 3) { + if (nchar(aa1) == 1) aa1 <- one2three[[aa1]] + if (nchar(aa2) == 1) aa2 <- one2three[[aa2]] + } else { + #this should never happen, as it's supposed to be detected at start of function + stop("Invalid aacode. If you see this, report this as a bug!") + } + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos, + ancestral=aa1,end=pos2,ancestral2=aa2)) + } + + } else if (type == "insertion") { + groups <- extract.groups(body,types$insertion) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + pos2 <- as.integer(groups[[4]]) + insert <- groups[[5]] + #TODO: Implement code conversion + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos, + ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) + } + + } else if (type == "delins") { + groups <- extract.groups(body,types$delins) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + pos2 <- as.integer(groups[[4]]) + insert <- groups[[5]] + #TODO: Implement code conversion + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) + } else { + return(list(hgvs=s,subject=subject,type=type,start=pos, + ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert)) + } + + } else if (type == "frameshift1") { + groups <- extract.groups(body,types$frameshift1) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + #TODO: Implement code conversion + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="frameshift",start=pos,ancestral=aa1)) + } else { + return(list(hgvs=s,subject=subject,type="frameshift",start=pos, + ancestral=aa1)) + } + + } else if (type == "frameshift2") { + groups <- extract.groups(body,types$frameshift2) + aa1 <- groups[[1]] + pos <- as.integer(groups[[2]]) + aa2 <- groups[[3]] + term <- as.integer(groups[[5]]) + #TODO: Implement code conversion + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="frameshift",start=pos,ancestral=aa1,variant=aa2,end=term)) + } else { + return(list(hgvs=s,subject=subject,type="frameshift",start=pos, + ancestral=aa1,variant=aa2,end=term)) + } + + } else {#unmatched type + if (isMulti) { + return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi, + type="invalid")) + } else { + return(list(hgvs=s,subject=subject,type="invalid")) + } + } + + }) + + } else if (subject=="noncoding") { + #FIXME: These need to be list of lists to match postprocessing + return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) + } else if (subject=="mitochondrial") { + return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) + } else if (subject=="rna") { + return(list(list(hgvs=s,subject="not_implemented",type="not_implemented"))) + } else {#unmatched subject, shouldn't happen + stop("Unmatched subject! If you see this, report it as a bug!") + } + }) + + #demote multimutants to enforce simple list structure + multiLengths <- sapply(out,length) + ids <- do.call(c,lapply(1:length(multiLengths), + function(i) if (multiLengths[[i]]==1) as.character(i) else paste0(i,".",1:multiLengths[[i]]) + )) + out2 <- do.call(c,out) + names(out2) <- ids + + + + to.df(out2) +} \ No newline at end of file diff --git a/tools/hgvsparser/test-data/inputBuilder.csv b/tools/hgvsparser/test-data/inputBuilder.csv new file mode 100644 index 00000000000..de5ccbe9c5b --- /dev/null +++ b/tools/hgvsparser/test-data/inputBuilder.csv @@ -0,0 +1,6 @@ +"hgvs","subject","type","start","ancestral","variant" +"p.Ala105*","protein","substitution",105,"Ala","*" +"p.Ala105*","protein","substitution",105,"Ala","*" +"p.Ala105Arg","protein","substitution",105,"Ala","Arg" +"p.Ala105Arg","protein","substitution",105,"Ala","Arg" +"p.Ala105Asn","protein","substitution",105,"Ala","Asn" diff --git a/tools/hgvsparser/test-data/inputParser.csv b/tools/hgvsparser/test-data/inputParser.csv new file mode 100644 index 00000000000..79c2f2cf69c --- /dev/null +++ b/tools/hgvsparser/test-data/inputParser.csv @@ -0,0 +1,6 @@ +accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA,p_AA +urn:mavedb:00000044-a-1#4199,NA,NA,p.Ala105*,NA,NA,1,A435,p.A435 +urn:mavedb:00000044-a-1#4200,NA,NA,p.Ala105*,NA,NA,2,A435,p.A435 +urn:mavedb:00000044-a-1#4187,NA,NA,p.Ala105Arg,-2.71,-2.76,1,A435R,p.A435R +urn:mavedb:00000044-a-1#4188,NA,NA,p.Ala105Arg,-2.81,-2.76,2,A435R,p.A435R +urn:mavedb:00000044-a-1#4181,NA,NA,p.Ala105Asn,-1.1,-1.13,1,A435N,p.A435N