diff --git a/tools/hgvsparser/.shed.yml b/tools/hgvsparser/.shed.yml
new file mode 100644
index 00000000000..0be98864519
--- /dev/null
+++ b/tools/hgvsparser/.shed.yml
@@ -0,0 +1,9 @@
+name: hgvsparser
+owner: iuc
+description: Parsing and building variant descriptor strings compliant with the HGVS standard
+homepage_url: https://github.com/VariantEffect/hgvsParseR/tree/master
+long_description: |
+ hgvsParseR is an R package for parsing the HGVS strings that are commonly used to describe variation.
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/hgvsparser/
+categories:
+ - Variant Analysis
diff --git a/tools/hgvsparser/buildHGVS.R b/tools/hgvsparser/buildHGVS.R
new file mode 100644
index 00000000000..55d4c55e06c
--- /dev/null
+++ b/tools/hgvsparser/buildHGVS.R
@@ -0,0 +1,512 @@
+# Copyright (C) 2018 Jochen Weile, Roth Lab
+#
+# This file is part of hgvsParseR.
+#
+# hgvsParseR is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# hgvsParseR is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with hgvsParseR. If not, see .
+
+#' Genomic HGVS Builder
+#'
+#' A constructor for a genomic-level HGVS builder object. The object contains a collection of functions
+#' for building genomic HGVS strings.
+#'
+#' The resulting object encapsulates the following functions:
+#' \itemize{
+#' \item{substitution(pos,ancestral,variant)} Genomic substitution variants.
+#' pos = position (integer); ancestral = ancestral nucleotide [ACGT];
+#' variant = variant nucleotide [ACGT]
+#' \item{deletion(start,stop)} Genomic deletion. start = start position (integer);
+#' stop = stop position (integer)
+#' \item{inversion(start,stop)} Genomic inversion. start = start position (integer);
+#' stop = stop position (integer)
+#' \item{duplication(start,stop)} Genomic duplication. start = start position (integer);
+#' stop = stop position (integer)
+#' \item{insertion(start,variant)} Genomic insertion. start = position immediately preceeding
+#' the insertion (integer); seq = inserted nucleotide sequence [ACGT]+
+#' \item{delins(start,stop,variant)} Genomic deletion and insertion. start = start position (integer);
+#' stop = stop position relative to the reference (integer); seq = inserted nucleotide sequence [ACGT]+
+#' \item{cis(...)} Multi-variant phased in cis. Parameters are genomic HGVS strings for the
+#' corresponding single mutants
+#' \item{trans(...)} Multi-variant phased in trans. Parameters are genomic HGVS strings for the
+#' corresponding single mutants
+#' \item{nophase(...)} Multi-variant with unknown phasing. Parameters are genomic HGVS strings for the
+#' corresponding single mutants
+#' }
+#'
+#' @return A \code{hgvs.builder.g} object with functions for building genomic HGVS strings.
+#' The individual functions return single-element character vectors containing these strings.
+#' @keywords HGVS builder
+#' @export
+#' @examples
+#' builder <- new.hgvs.builder.g()
+#' string1 <- builder$substitution(123,"A","G")
+#' string2 <- builder$delins(123,129,"ATTG")
+#' string3 <- with(builder,cis(substitution(123,"A","C"),substitution(231,"G","A")))
+
+new.hgvs.builder.g <- function() {
+
+ substitution <- function(pos,ancestral,variant) {
+ if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer")
+ if (!is.character(ancestral) || !(ancestral %in% c("A","C","G","T"))) stop("ancestral must be single nucleotide")
+ if (!is.character(variant) || !(variant %in% c("A","C","G","T"))) stop("variant must be single nucleotide")
+ paste0("g.",pos,ancestral,">",variant)
+ }
+
+ deletion <- function(start,stop) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (start > stop) stop("start must be upstream of stop")
+ paste0("g.",start,"_",stop,"del")
+ }
+
+ inversion <- function(start,stop) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (start > stop) stop("start must be upstream of stop")
+ paste0("g.",start,"_",stop,"inv")
+ }
+
+ duplication <- function(start,stop) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (start > stop) stop("start must be upstream of stop")
+ paste0("g.",start,"_",stop,"dup")
+ }
+
+ insertion <- function(start,seq) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence")
+ paste0("g.",start,"_",start+1,"ins",seq)
+ }
+
+ delins <- function(start,stop,seq) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (start > stop) stop("start must be upstream of stop")
+ if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence")
+ paste0("g.",start,"_",stop,"delins",seq)
+ }
+
+ cis <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("g.[",paste(bodies,collapse=";"),"]")
+ }
+
+ trans <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("g.[",paste(bodies,collapse="];["),"]")
+ }
+
+ nophase <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="g.")) stop("all arguments must be genomic HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("g.[",paste(bodies,collapse="(;)"),"]")
+ }
+
+ return(structure(list(
+ substitution=substitution,
+ deletion=deletion,
+ inversion=inversion,
+ duplication=duplication,
+ insertion=insertion,
+ delins=delins,
+ cis=cis,
+ trans=trans,
+ nophase=nophase
+ ),class="hgvs.builder.g"))
+}
+
+print.hgvs.builder.g <- function() {
+ cat("Genomic HGVS string builder. Use $ operator to access functions.")
+}
+
+
+
+#' Coding Sequence HGVS Builder
+#'
+#' A constructor for a CDS (=coding sequence) HGVS builder object. The object contains a collection of functions
+#' for building CDS HGVS strings.
+#' The resulting object encapsulates the following functions:
+#' \itemize{
+#' \item{substitution(pos,ancestral,variant,posOffset=0)} CDS substitution variants.
+#' pos = position (integer); ancestral = ancestral nucleotide [ACGT];
+#' variant = variant nucleotide [ACGT]; posOffset = offset from the position when
+#' crossing exon-intron borders (integer, defaults to 0)
+#' \item{deletion(start,stop,startOffset=0,stopOffset=0)} CDS deletion. start = start position (integer);
+#' stop = stop position (integer); startOffset = offset from the start position when
+#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the
+#' stop position when crossing exon-intron borders (integer, defaults to 0)
+#' \item{inversion(start,stop,startOffset=0,stopOffset=0)} CDS inversion. start = start position (integer);
+#' stop = stop position (integer); startOffset = offset from the start position when
+#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the
+#' stop position when crossing exon-intron borders (integer, defaults to 0)
+#' \item{duplication(start,stop,startOffset=0,stopOffset=0)} CDS duplication. start = start position (integer);
+#' stop = stop position (integer); startOffset = offset from the start position when
+#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the
+#' stop position when crossing exon-intron borders (integer, defaults to 0)
+#' \item{insertion(start,variant,startOffset=0)} CDS insertion. start = position immediately preceeding
+#' the insertion (integer); seq = inserted nucleotide sequence [ACGT]+ ;
+#' startOffset = offset from the start position when crossing exon-intron borders
+#' (integer, defaults to 0)
+#' \item{delins(start,stop,variant,startOffset=0,stopOffset=0)} CDS deletion and insertion. start = start position (integer);
+#' stop = stop position relative to the reference (integer);
+#' seq = inserted nucleotide sequence [ACGT]+ ; startOffset = offset from the start position when
+#' crossing exon-intron borders (integer, defaults to 0); stopOffset = offset from the
+#' stop position when crossing exon-intron borders (integer, defaults to 0)
+#' \item{cis(...)} Multi-variant phased in cis. Parameters are coding HGVS strings for the
+#' corresponding single mutants
+#' \item{trans(...)} Multi-variant phased in trans. Parameters are coding HGVS strings for the
+#' corresponding single mutants
+#' \item{nophase(...)} Multi-variant with unknown phasing. Parameters are coding HGVS strings for the
+#' corresponding single mutants
+#' }
+#'
+#' @return A \code{hgvs.builder.c} object with functions for building coding HGVS strings.
+#' The individual functions return single-element character vectors containing these strings.
+#' @keywords HGVS builder
+#' @export
+#' @examples
+#' builder <- new.hgvs.builder.c()
+#' string1 <- builder$substitution(123,"A","G",posOffset=2)
+#' string2 <- builder$delins(123,129,"ATTG")
+#' string3 <- with(builder,cis(substitution(123,"A","C"),substitution(231,"G","A")))
+
+new.hgvs.builder.c <- function() {
+
+ offsetStr <- function(offset) {
+ if (offset==0) {
+ ""
+ } else if (offset > 0) {
+ paste0("+",offset)
+ } else if (offset < 0) {
+ as.character(offset)
+ }
+ }
+
+ substitution <- function(pos,ancestral,variant,posOffset=0) {
+ if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer")
+ if (!is.numeric(posOffset)) stop("offset must be an integer")
+ if (!is.character(ancestral) || !(ancestral %in% c("A","C","G","T"))) stop("ancestral must be single nucleotide")
+ if (!is.character(variant) || !(variant %in% c("A","C","G","T"))) stop("variant must be single nucleotide")
+ paste0("c.",pos,offsetStr(posOffset),ancestral,">",variant)
+ }
+
+ deletion <- function(start,stop,startOffset=0,stopOffset=0) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (!is.numeric(startOffset)) stop("offset must be an integer")
+ if (!is.numeric(stopOffset)) stop("offset must be an integer")
+ if (start+startOffset > stop+stopOffset) stop("start must be before stop")
+ paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"del")
+ }
+
+ inversion <- function(start,stop,startOffset=0,stopOffset=0) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (!is.numeric(startOffset)) stop("offset must be an integer")
+ if (!is.numeric(stopOffset)) stop("offset must be an integer")
+ if (start+startOffset >= stop+stopOffset) stop("start must be before stop")
+ paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"inv")
+ }
+
+ duplication <- function(start,stop,startOffset=0,stopOffset=0) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (!is.numeric(startOffset)) stop("offset must be an integer")
+ if (!is.numeric(stopOffset)) stop("offset must be an integer")
+ if (start+startOffset > stop+stopOffset) stop("start must be before stop")
+ paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"dup")
+ }
+
+ insertion <- function(start,seq,startOffset=0) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) {
+ stop("variant must be nucleotide sequence")
+ }
+ if (!is.numeric(startOffset)) stop("offset must be an integer")
+ stop <- if (startOffset != 0) start else start+1
+ stopOffset <- if (startOffset != 0) startOffset+1 else startOffset
+ paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"ins",seq)
+ }
+
+ delins <- function(start,stop,seq,startOffset=0,stopOffset=0) {
+ if (!is.numeric(start)) stop("start must be an integer")
+ if (!is.numeric(stop)) stop("stop must be an integer")
+ if (!is.numeric(startOffset)) stop("offset must be an integer")
+ if (!is.numeric(stopOffset)) stop("offset must be an integer")
+ if (start+startOffset > stop+stopOffset) stop("start must be before stop")
+ if (!is.character(seq) || regexpr("^[ACGT]+$",seq) < 1) stop("variant must be nucleotide sequence")
+ paste0("c.",start,offsetStr(startOffset),"_",stop,offsetStr(stopOffset),"delins",seq)
+ }
+
+ cis <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("c.[",paste(bodies,collapse=";"),"]")
+ }
+
+ trans <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("c.[",paste(bodies,collapse="];["),"]")
+ }
+
+ nophase <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="c.")) stop("all arguments must be coding HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("c.[",paste(bodies,collapse="(;)"),"]")
+ }
+
+ return(structure(list(
+ substitution=substitution,
+ deletion=deletion,
+ inversion=inversion,
+ duplication=duplication,
+ insertion=insertion,
+ delins=delins,
+ cis=cis,
+ trans=trans,
+ nophase=nophase
+ ),class="hgvs.builder.c"))
+}
+
+print.hgvs.builder.c <- function() {
+ cat("Coding-sequence HGVS string builder. Use $ operator to access functions.")
+}
+
+
+
+#' Protein HGVS Builder
+#'
+#' A constructor for a protein-level HGVS builder object. The object contains a collection of functions
+#' for building protein HGVS strings.
+#'
+#' The resulting object encapsulates the following functions:
+#' \itemize{
+#' \item{synonymous()} A synonymous variant. No parameters required.
+#' \item{synonymous(pos,ancestral)} Unofficial (yet frequently used) version of synonymous variant syntax.
+#' pos = position (integer); ancestral = ancestral amino acid in one-letter or three-letter code.
+#' \item{substitution(pos,ancestral,variant)} AA substitution variants.
+#' pos = position (integer); ancestral = ancestral amino acid in one-letter or three-letter code;
+#' variant = variant amino acid in one-letter or three-letter code
+#' \item{deletion(startPos,startAA,endPos,endAA)} AA deletion. startPos = start position (integer);
+#' startAA = start amino acid in one-letter or three-letter code;
+#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code
+#' \item{duplication(startPos,startAA,endPos,endAA)} AA duplication. startPos = start position (integer);
+#' startAA = start amino acid in one-letter or three-letter code;
+#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code
+#' \item{insertion(leftPos,leftAA,rightAA,seq)} AA insertion. leftPos = position immediately preceeding
+#' the insertion (integer); leftAA = corresponding amino acid in one-letter or three-letter code;
+#' rightAA = amino acid to the right of the insertion, in one-letter or three-letter code;
+#' seq = inserted amino acid sequence, given as a character vector containing the individual
+#' one-letter or three-letter amino acid codes.
+#' \item{delins(startPos,startAA,endPos,endAA,seq)} AA deletion and insertion.
+#' startPos = start position (integer);
+#' startAA = start amino acid in one-letter or three-letter code;
+#' endPos = stop position (integer); endAA = start amino acid in one-letter or three-letter code;
+#' seq = inserted amino acid sequence, given as a character vector containing the individual
+#' one-letter or three-letter amino acid codes.
+#' \item{frameshift(startPos,startAA,variantAA=NA,newStop=NA)} Frameshift variant.
+#' startPos = start position (integer);
+#' startAA = start amino acid in one-letter or three-letter code;
+#' variantAA = amino acid replacing the start position in the frameshift sequence,
+#' given in one-letter or three-letter code, or \code{NA} to omit (default);
+#' newStop = the position of the nearest coding resulting from the frameshift,
+#' or \code{NA} to omit (default).
+#' \item{cis(...)} Multi-variant phased in cis. Parameters are coding HGVS strings for the
+#' corresponding single mutants. As phasing in trans would be nonsensical in a protein context,
+#' the \code{trans()} and \code{nophase()} methods are not provided here.
+#' }
+#'
+#' @return A \code{hgvs.builder.g} object with functions for building genomic HGVS strings.
+#' The individual functions return single-element character vectors containing these strings.
+#' @keywords HGVS builder
+#' @export
+#' @examples
+#' builder <- new.hgvs.builder.g()
+#' string1 <- builder$substitution(123,"R","K")
+#' string2 <- builder$delins(123,"Arg",152,"Leu",c("Lys","Trp","Ser"))
+#' string3 <- with(builder,cis(substitution(123,"R","K"),deletion(125,"S",152,"L")))
+
+
+new.hgvs.builder.p <- function(aacode=c(1,3)) {
+
+ aacode <- aacode[[1]]
+ if (!is.numeric(aacode) && !(aacode %in% c(1,3))) {
+ stop("Invalid aacode parameter, only 1 or 3 allowed!")
+ }
+
+ one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His",
+ I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg",
+ S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter")
+ three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E",
+ Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P",
+ Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*")
+
+ enforceCode <- function(aa) {
+ if (aa %in% one2three) {
+ if (aacode == 1) {
+ three2one[[aa]]
+ } else {
+ aa
+ }
+ } else if (aa %in% three2one) {
+ if (aacode == 1) {
+ aa
+ } else {
+ one2three[[aa]]
+ }
+ } else {
+ stop("Invalid AA code")
+ }
+ }
+
+ synonymous <- function(pos=NULL,ancestral=NULL) {
+ if (is.null(pos) || is.null(ancestral)) {
+ return("p.=")
+ }
+ if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer")
+ if (!is.character(ancestral) || !(ancestral %in% c(one2three,three2one))) stop("ancestral must be single amimo acid")
+ ancestral <- enforceCode(ancestral)
+ paste0("p.",ancestral,pos,"=")
+ }
+
+ substitution <- function(pos,ancestral,variant) {
+ if (!is.numeric(pos) || pos < 1) stop("position must be a positive integer")
+ if (!is.character(ancestral) || !(ancestral %in% c(one2three,three2one))) stop("ancestral must be single amimo acid")
+ if (!is.character(variant) || !(variant %in% c(one2three,three2one))) stop("variant must be single amino acid")
+ ancestral <- enforceCode(ancestral)
+ variant <- enforceCode(variant)
+ paste0("p.",ancestral,pos,variant)
+ }
+
+ deletion <- function(startPos,startAA,endPos,endAA) {
+ if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer")
+ if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer")
+ if (startPos > endPos) stop("start must be upstream of stop")
+ if (!is.character(startAA) || !(startAA %in% c(one2three,three2one))) stop("startAA must be single amimo acid")
+ if (!is.character(endAA) || !(endAA %in% c(one2three,three2one))) stop("endAA must be single amimo acid")
+ startAA <- enforceCode(startAA)
+ endAA <- enforceCode(endAA)
+ if (startPos==endPos) {
+ paste0("p.",startAA,startPos,"del")
+ } else {
+ paste0("p.",startAA,startPos,"_",endAA,endPos,"del")
+ }
+ }
+
+ duplication <- function(startPos,startAA,endPos,endAA) {
+ if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer")
+ if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer")
+ if (startPos >= endPos) stop("start must be upstream of stop")
+ if (!is.character(startAA) || !(startAA %in% c(one2three,three2one)))
+ stop("startAA must be single amimo acid")
+ if (!is.character(endAA) || !(endAA %in% c(one2three,three2one)))
+ stop("endAA must be single amimo acid")
+ startAA <- enforceCode(startAA)
+ endAA <- enforceCode(endAA)
+ paste0("p.",startAA,startPos,"_",endAA,endPos,"dup")
+ }
+
+ insertion <- function(leftPos,leftAA,rightAA,seq) {
+ if (!is.numeric(leftPos) || leftPos < 1) stop("position must be a positive integer")
+ if (!is.character(leftAA) || !(leftAA %in% c(one2three,three2one)))
+ stop("leftAA must be single amimo acid")
+ if (!is.character(rightAA) || !(rightAA %in% c(one2three,three2one)))
+ stop("rightAA must be single amimo acid")
+ if (!is.character(seq) || !all(sapply(seq,function(x) x %in% c(one2three,three2one))))
+ stop("seq must be a vector of amino acids")
+ rightPos <- leftPos+1
+ leftAA <- enforceCode(leftAA)
+ rightAA <- enforceCode(rightAA)
+ seq <- paste(sapply(seq,enforceCode),collapse="")
+ paste0("p.",leftAA,leftPos,"_",rightAA,rightPos,"ins",seq)
+ }
+
+ delins <- function(startPos,startAA,endPos,endAA,seq) {
+ if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer")
+ if (!is.numeric(endPos) || endPos < 1) stop("position must be a positive integer")
+ if (startPos > endPos) stop("start must be upstream of stop")
+ if (!is.character(startAA) || !(startAA %in% c(one2three,three2one)))
+ stop("startAA must be single amimo acid")
+ if (!is.character(endAA) || !(endAA %in% c(one2three,three2one)))
+ stop("endAA must be single amimo acid")
+ if (!is.character(seq) || !all(sapply(seq,function(x) x %in% c(one2three,three2one))))
+ stop("seq must be a vector of amino acids")
+ startAA <- enforceCode(startAA)
+ endAA <- enforceCode(endAA)
+ seq <- paste(sapply(seq,enforceCode),collapse="")
+ paste0("p.",startAA,startPos,"_",endAA,endPos,"delins",seq)
+ }
+
+ frameshift <- function(startPos,startAA,variantAA=NA,newStop=NA) {
+ if (!is.numeric(startPos) || startPos < 1) stop("position must be a positive integer")
+ if (!is.na(newStop) && (!is.numeric(newStop) || newStop < 1)) stop("position must be a positive integer")
+ if (!is.character(startAA) || !(startAA %in% c(one2three,three2one)))
+ stop("startAA must be single amimo acid or NA")
+ if (!is.na(variantAA) && (!is.character(startAA) || !(startAA %in% c(one2three,three2one))))
+ stop("variantAA must be single amimo acid or NA")
+ startAA <- enforceCode(startAA)
+ if (is.na(variantAA)) {
+ variantAA <- ""
+ } else {
+ variantAA <- enforceCode(variantAA)
+ }
+ if (is.na(newStop)) {
+ newStop <- ""
+ } else {
+ newStop <- paste0("*",newStop)
+ }
+ paste0("p.",startAA,startPos,variantAA,"fs",newStop)
+ }
+
+ cis <- function(...) {
+ strings <- list(...)
+ if (!all(sapply(strings,is.character))) stop("all arguments must be HGVS strings")
+ strings <- unlist(strings)
+ if (!all(substr(strings,1,2)=="p.")) stop("all arguments must be protein HGVS strings")
+ bodies <- substr(strings,3,nchar(strings))
+ paste0("p.[",paste(bodies,collapse=";"),"]")
+ }
+
+ return(structure(list(
+ synonymous=synonymous,
+ substitution=substitution,
+ deletion=deletion,
+ duplication=duplication,
+ insertion=insertion,
+ delins=delins,
+ frameshift=frameshift,
+ cis=cis
+ ),class="hgvs.builder.p"))
+}
\ No newline at end of file
diff --git a/tools/hgvsparser/hgvsparser.xml b/tools/hgvsparser/hgvsparser.xml
new file mode 100644
index 00000000000..38ed5998d48
--- /dev/null
+++ b/tools/hgvsparser/hgvsparser.xml
@@ -0,0 +1,152 @@
+
+ parsing and building variant descriptor strings compliant with the HGVS standard
+
+ 0.1.0
+ 0
+
+
+ r-base
+
+
+
+
+
+ tmp_output.csv
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ action_choice['do'] == 'parse'
+
+
+ action_choice['do'] == 'build'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 10.1186/s13059-019-1845-6
+
+
\ No newline at end of file
diff --git a/tools/hgvsparser/parseHGVS.R b/tools/hgvsparser/parseHGVS.R
new file mode 100644
index 00000000000..da8552b7cb8
--- /dev/null
+++ b/tools/hgvsparser/parseHGVS.R
@@ -0,0 +1,734 @@
+# Copyright (C) 2018 Jochen Weile, Roth Lab
+#
+# This file is part of hgvsParseR.
+#
+# hgvsParseR is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# hgvsParseR is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with hgvsParseR. If not, see .
+
+#' HGVS Parser
+#'
+#' Parses HGVS strings
+#' @param strings A character vector containing the HGVS strings
+#' @param aacode allowed values: 1, 3, or NA. Determines whether 1-letter codes or 3-letter codes should be forced. NA uses input format.
+#' @return A \code{data.frame} with the following columns:
+#' @keywords HGVS parsing
+#' @export
+#' @examples
+#' result <- parseHGVS(c("g.1318G>T","c.123_125inv","p.R123_L152del"))
+
+parseHGVS <- function(strings,aacode=c(NA,1,3)) {
+
+ #Check that parameters are valid
+ if (!is.character(strings)) {
+ stop("Input for 'parse' function must be a character vector! Found '",class(strings),"' instead.")
+ }
+
+ aacode <- aacode[[1]]
+ if (!is.na(aacode) && !(aacode %in% c(1,3))) {
+ warning("Invalid aacode parameter, defaulting to NA!")
+ aacode <- NA
+ }
+
+ #Helper function: turns a list of lists (lol) in to a dataframe
+ to.df <- function(lol) {
+ colnames <- unique(do.call(c,lapply(lol,names)))
+ columns <- lapply(colnames,function(cn) sapply(lol,function(row) {
+ if (cn %in% names(row)) row[[cn]] else NA
+ }))
+ names(columns) <- colnames
+ empty <- which(sapply(columns,function(xs)all(is.na(xs))))
+ columns[empty] <- NULL
+ do.call(data.frame,columns)
+ }
+
+ # ###
+ # # Binds matrices of same size together to a 3D matrix, analogously
+ # # to cbind and rbind.
+ # #
+ # zbind <- function(...) {
+ # x <- list(...)
+ # y <- array(0,dim=c(nrow(x[[1]]),ncol(x[[1]]),length(x)),dimnames=dimnames(x[[1]]))
+ # for (i in 1:length(x)) y[,,i] <- x[[i]]
+ # y
+ # }
+
+
+ ###
+ # Function to *locally* excise regex groups from string vectors.
+ # I.e. only extract the first occurrence of each group within each string.
+ # x = string vector
+ # re = regular expression with groups
+ #
+ extract.groups <- function(x, re) {
+ matches <- regexpr(re,x,perl=TRUE)
+ start <- attr(matches,"capture.start")
+ end <- start + attr(matches,"capture.length") - 1
+ do.call(cbind,lapply(1:ncol(start), function(i) {
+ sapply(1:nrow(start),function(j){
+ if (start[j,i] > -1) substr(x[[j]],start[j,i],end[j,i]) else NA
+ })
+ }))
+ }
+
+ # ###
+ # # Function to *globally* excise regex groups from string vectors.
+ # # x = string vector
+ # # re = regular expression with groups
+ # #
+ # global.extract.groups <- function(x,re) {
+ # all.matches <- gregexpr(re,x,perl=TRUE)
+ # mapply(function(matches,x) {
+ # start <- attr(matches,"capture.start")
+ # end <- start + attr(matches,"capture.length") - 1
+ # apply(zbind(start,end),c(1,2),function(pos) substr(x,pos[[1]],pos[[2]]) )
+ # },matches=all.matches,x=x,SIMPLIFY=FALSE)
+ # }
+
+ ###
+ # Helper function to split multi-mutant bodies into their individual
+ # elements. Returns a vector of strings containing these elements.
+ # An attribute "multi" is attached to the vector, detailing the type
+ # of multi-mutant
+ #
+ splitMulti <- function(body) {
+ if (regexpr("\\[.+\\];\\[.+\\]",body) > 0) {
+ out <- strsplit(substr(body,2,nchar(body)-1),"\\];\\[")[[1]]
+ attr(out,"multi") <- "trans"
+ } else if (regexpr("\\[.+\\(;\\).+\\]",body) > 0) {
+ out <- strsplit(substr(body,2,nchar(body)-1),"\\(;\\)")[[1]]
+ attr(out,"multi") <- "unknown"
+ } else if (regexpr("\\[.+;.+\\]",body) > 0) {
+ out <- strsplit(substr(body,2,nchar(body)-1),";")[[1]]
+ attr(out,"multi") <- "cis"
+ } else {
+ out <- body
+ attr(out,"multi") <- "single"
+ }
+ return(out)
+ }
+
+ ###
+ # Helper function:
+ # Given an HGVS body and a list of regexes corresponding to types,
+ # find the (first) matching type.
+ findType <- function(body,types) {
+ i <- 0
+ found <- done <- FALSE
+ while (!found && !done) {
+ found <- regexpr(types[[i <- i+1]],body) > 0
+ done <- i >= length(types)
+ }
+ if (found) {
+ return(names(types)[[i]])
+ } else {
+ return("invalid")
+ }
+ }
+
+ out <- lapply(strings,function(s) {
+
+ if (regexpr("^[gcnmrp]\\.",s) < 1) {
+ return(list(list(hgvs=s,subject="invalid",type="invalid")))
+ }
+
+ body <- substr(s,3,nchar(s))
+
+ subbodies <- splitMulti(body)
+
+ subjects <- c(
+ g="genomic",c="coding",n="noncoding",
+ m="mitochondrial",r="rna",p="protein"
+ )
+ subject <- subjects[[substr(s,1,1)]]
+
+ if (subject=="genomic") {
+
+ types <- c(
+ substitution="\\d+[ACGT]>[ACGT]", singledeletion="^\\d+del$",
+ deletion="\\d+_\\d+del$",inversion="\\d+_\\d+inv",
+ duplication="\\d+_\\d+dup",insertion="\\d+_\\d+ins[ATCG]+",
+ conversion="\\d+_\\d+con\\d+_\\d+",delins="\\d+_\\d+delins[ATCG]+",
+ amplification="\\d+_\\d+\\[\\d+\\]"
+ )
+
+ phasing <- attr(subbodies,"multi")
+ isMulti <- length(subbodies) > 1
+
+ lapply(1:length(subbodies), function(i.multi) {
+ body <- subbodies[[i.multi]]
+
+ type <- findType(body,types)
+
+ if (type == "substitution") {
+ groups <- extract.groups(body,"(\\d+)([ACGT])>([ACGT])")[1,]
+ position <- as.integer(groups[[1]])
+ ancestral <- groups[[2]]
+ variant <- groups[[3]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=position,ancestral=ancestral,variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=position,
+ ancestral=ancestral,variant=variant))
+ }
+
+ } else if (type == "singledeletion") {
+ groups <- extract.groups(body,"(\\d+)del")[1,]
+ position <- as.integer(groups[[1]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=position))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=position))
+ }
+
+ } else if (type == "deletion") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)del")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
+ }
+
+ } else if (type == "inversion") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)inv")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
+ }
+
+ } else if (type == "duplication") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)dup")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end))
+ }
+
+ } else if (type == "insertion") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)ins([ATCG]+)")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ if (abs(end-start)!=1) {
+ warning("Invalid insertion definition:
+ Start and end positions must be adjacent!")
+ }
+ variant <- groups[[3]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end,variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
+ variant=variant))
+ }
+
+ } else if (type == "conversion") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)con(\\d+)_(\\d+)")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ tStart <- as.integer(groups[[3]])
+ tEnd <- as.integer(groups[[4]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end,
+ templateStart=tStart,templateEnd=tEnd))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
+ templateStart=tStart,templateEnd=tEnd))
+ }
+
+ } else if (type == "delins") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)delins([ATCG]+)")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ variant <- groups[[3]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end,variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,end=end,
+ variant=variant))
+ }
+
+ } else if (type == "amplification") {
+ groups <- extract.groups(body,"(\\d+)_(\\d+)\\[(\\d+)\\]")
+ start <- as.integer(groups[[1]])
+ end <- as.integer(groups[[2]])
+ copies <- as.integer(groups[[3]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,end=end,copies=copies))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,
+ start=start,end=end,copies=copies))
+ }
+
+ } else {
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type="invalid"))
+ } else {
+ return(list(hgvs=s,subject=subject,type="invalid"))
+ }
+ }
+
+ })
+
+ } else if (subject=="coding") {
+ #coding needs to be handled separately from genomic, as the syntax may differ
+ #e.g. it allows for offset descriptions relative to exon-intron borders
+
+ types <- c(
+ substitution="\\d+([+-]\\d+)?[ACGT]>[ACGT]",
+ singledeletion="^\\d+([+-]\\d+)?del$",
+ deletion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?del$",
+ inversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?inv",
+ duplication="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?dup",
+ insertion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?ins[ATCG]+",
+ conversion="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?con\\d+([+-]\\d+)?_\\d+([+-]\\d+)?",
+ delins="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?delins[ATCG]+",
+ amplification="\\d+([+-]\\d+)?_\\d+([+-]\\d+)?\\[\\d+\\]"
+ )
+
+ phasing <- attr(subbodies,"multi")
+ isMulti <- length(subbodies) > 1
+
+ lapply(1:length(subbodies), function(i.multi) {
+ body <- subbodies[[i.multi]]
+
+ type <- findType(body,types)
+
+ if (type == "substitution") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?([ACGT])>([ACGT])")[1,]
+ position <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ ancestral <- groups[[3]]
+ variant <- groups[[4]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=position,startIntron=intronOffset,ancestral=ancestral,
+ variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=position,
+ startIntron=intronOffset,ancestral=ancestral,variant=variant))
+ }
+
+ } else if (type == "singledeletion") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?del")[1,]
+ position <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=position,startIntron=intronOffset))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=position,
+ startIntron=intronOffset))
+ }
+ } else if (type == "deletion") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?del")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2))
+ }
+ } else if (type == "inversion") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?inv")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2))
+ }
+ } else if (type == "duplication") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?dup")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2))
+ }
+ } else if (type == "insertion") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?ins([ATCG]+)")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ variant <- groups[[5]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2,variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant))
+ }
+ } else if (type == "conversion") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?con(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ tStart <- as.integer(groups[[5]])
+ intronOffset3 <- as.integer(groups[[6]])
+ tEnd <- as.integer(groups[[7]])
+ intronOffset4 <- as.integer(groups[[8]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2,templateStart=tStart,
+ templateStartIntron=intronOffset3,templateEnd=tEnd,
+ templateEndIntron=intronOffset4))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,
+ start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2,templateStart=tStart,
+ templateStartIntron=intronOffset3,templateEnd=tEnd,
+ templateEndIntron=intronOffset4))
+ }
+ } else if (type == "delins") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?delins([ATCG]+)")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ variant <- groups[[5]]
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2,variant=variant))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2,variant=variant))
+ }
+ } else if (type == "amplification") {
+ groups <- extract.groups(body,"(\\d+)([+-]\\d+)?_(\\d+)([+-]\\d+)?\\[(\\d+)\\]")
+ start <- as.integer(groups[[1]])
+ intronOffset <- as.integer(groups[[2]])
+ end <- as.integer(groups[[3]])
+ intronOffset2 <- as.integer(groups[[4]])
+ copies <- as.integer(groups[[3]])
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=start,startIntron=intronOffset,
+ end=end,endIntron=intronOffset2,copies=copies))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=start,
+ startIntron=intronOffset,end=end,endIntron=intronOffset2,copies=copies))
+ }
+ } else {
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="invalid"))
+ } else {
+ return(list(hgvs=s,subject=subject,type="invalid"))
+ }
+ }
+
+ })
+
+ } else if (subject=="protein") {
+
+ one2three <- c(A="Ala",C="Cys",D="Asp",E="Glu",F="Phe",G="Gly",H="His",
+ I="Ile",K="Lys",L="Leu",M="Met",N="Asn",P="Pro",Q="Gln",R="Arg",
+ S="Ser",T="Thr",V="Val",W="Trp",Y="Tyr",`*`="Ter")
+ three2one <- c(Ala="A",Arg="R",Asn="N",Asp="D",Cys="C",Gln="Q",Glu="E",
+ Gly="G",His="H",Ile="I",Leu="L",Lys="K",Met="M",Phe="F",Pro="P",
+ Ser="S",Thr="T",Trp="W",Tyr="Y",Val="V",Ter="*")
+ codes <- paste(c(one2three,three2one[-21],"\\*"),collapse="|")
+
+ types <- list(
+ synonymous1="^=$",
+ synonymous2=paste0("^(",codes,")(\\d+)=$"),
+ substitution=paste0("^(",codes,")(\\d+)(",codes,")$"),
+ singledeletion=paste0("^(",codes,")(\\d+)del$"),
+ deletion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)del$"),
+ duplication=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)dup$"),
+ insertion=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)ins((",codes,")+)$"),
+ delins=paste0("^(",codes,")(\\d+)_(",codes,")(\\d+)delins((",codes,")+)$"),
+ frameshift1=paste0("^(",codes,")(\\d+)fs$"),
+ frameshift2=paste0("^(",codes,")(\\d+)(",codes,")fs(Ter|\\*)(\\d+)$")
+ )
+
+ phasing <- attr(subbodies,"multi")
+ isMulti <- length(subbodies) > 1
+
+ lapply(1:length(subbodies), function(i.multi) {
+ body <- subbodies[[i.multi]]
+
+ type <- findType(body,types)
+
+ if (type == "synonymous1") {
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,type=type))
+ } else {
+ return(list(hgvs=s,subject=subject,type="synonymous"))
+ }
+ } else if (type == "synonymous2") {
+ groups <- extract.groups(body,types$synonymous2)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ if (aa1 %in% c(one2three,three2one)) {
+ if (is.na(aacode)) {
+ #do nothing
+ } else if (aacode == 1) {
+ if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
+ } else if (aacode ==3) {
+ if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
+ } else {
+ #this should never happen, as it's supposed to be detected at start of function
+ stop("Invalid aacode. If you see this, report this as a bug!")
+ }
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="synonymous",start=pos,ancestral=aa1))
+ } else {
+ return(list(hgvs=s,subject=subject,type="synonymous",start=pos,
+ ancestral=aa1))
+ }
+ } else {#not valid amino acid
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="invalid"))
+ } else {
+ return(list(hgvs=s,subject=subject,type="invalid"))
+ }
+ }
+ } else if (type == "substitution") {
+ groups <- extract.groups(body,types$substitution)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ if (aa1 %in% c(one2three,three2one) && aa2 %in% c(one2three,three2one)) {
+ if (is.na(aacode)) {
+ #do nothing
+ } else if (aacode == 1) {
+ if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
+ if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
+ } else if (aacode ==3) {
+ if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
+ if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
+ } else {
+ #this should never happen, as it's supposed to be detected at start of function
+ stop("Invalid aacode. If you see this, report this as a bug!")
+ }
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1,variant=aa2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,
+ ancestral=aa1,variant=aa2))
+ }
+ } else {#not valid amino acid
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="invalid"))
+ } else {
+ return(list(hgvs=s,subject=subject,type="invalid"))
+ }
+ }
+
+ } else if (type == "singledeletion") {
+ groups <- extract.groups(body,types$singledeletion)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ if (is.na(aacode)) {
+ #do nothing
+ } else if (aacode == 1) {
+ if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
+ } else if (aacode == 3) {
+ if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
+ } else {
+ #this should never happen, as it's supposed to be detected at start of function
+ stop("Invalid aacode. If you see this, report this as a bug!")
+ }
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,ancestral=aa1))
+ }
+ } else if (type == "deletion") {
+ groups <- extract.groups(body,types$deletion)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ pos2 <- as.integer(groups[[4]])
+ if (is.na(aacode)) {
+ #do nothing
+ } else if (aacode == 1) {
+ if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
+ if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
+ } else if (aacode == 3) {
+ if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
+ if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
+ } else {
+ #this should never happen, as it's supposed to be detected at start of function
+ stop("Invalid aacode. If you see this, report this as a bug!")
+ }
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,
+ ancestral=aa1,end=pos2,ancestral2=aa2))
+ }
+
+ } else if (type == "duplication") {
+ groups <- extract.groups(body,types$duplication)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ pos2 <- as.integer(groups[[4]])
+ if (is.na(aacode)) {
+ #do nothing
+ } else if (aacode == 1) {
+ if (nchar(aa1) == 3) aa1 <- three2one[[aa1]]
+ if (nchar(aa2) == 3) aa2 <- three2one[[aa2]]
+ } else if (aacode == 3) {
+ if (nchar(aa1) == 1) aa1 <- one2three[[aa1]]
+ if (nchar(aa2) == 1) aa2 <- one2three[[aa2]]
+ } else {
+ #this should never happen, as it's supposed to be detected at start of function
+ stop("Invalid aacode. If you see this, report this as a bug!")
+ }
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,
+ ancestral=aa1,end=pos2,ancestral2=aa2))
+ }
+
+ } else if (type == "insertion") {
+ groups <- extract.groups(body,types$insertion)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ pos2 <- as.integer(groups[[4]])
+ insert <- groups[[5]]
+ #TODO: Implement code conversion
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,
+ ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
+ }
+
+ } else if (type == "delins") {
+ groups <- extract.groups(body,types$delins)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ pos2 <- as.integer(groups[[4]])
+ insert <- groups[[5]]
+ #TODO: Implement code conversion
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type=type,start=pos,ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
+ } else {
+ return(list(hgvs=s,subject=subject,type=type,start=pos,
+ ancestral=aa1,end=pos2,ancestral2=aa2,variant=insert))
+ }
+
+ } else if (type == "frameshift1") {
+ groups <- extract.groups(body,types$frameshift1)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ #TODO: Implement code conversion
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="frameshift",start=pos,ancestral=aa1))
+ } else {
+ return(list(hgvs=s,subject=subject,type="frameshift",start=pos,
+ ancestral=aa1))
+ }
+
+ } else if (type == "frameshift2") {
+ groups <- extract.groups(body,types$frameshift2)
+ aa1 <- groups[[1]]
+ pos <- as.integer(groups[[2]])
+ aa2 <- groups[[3]]
+ term <- as.integer(groups[[5]])
+ #TODO: Implement code conversion
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="frameshift",start=pos,ancestral=aa1,variant=aa2,end=term))
+ } else {
+ return(list(hgvs=s,subject=subject,type="frameshift",start=pos,
+ ancestral=aa1,variant=aa2,end=term))
+ }
+
+ } else {#unmatched type
+ if (isMulti) {
+ return(list(hgvs=s,subject=subject,phasing=phasing,multiPart=i.multi,
+ type="invalid"))
+ } else {
+ return(list(hgvs=s,subject=subject,type="invalid"))
+ }
+ }
+
+ })
+
+ } else if (subject=="noncoding") {
+ #FIXME: These need to be list of lists to match postprocessing
+ return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
+ } else if (subject=="mitochondrial") {
+ return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
+ } else if (subject=="rna") {
+ return(list(list(hgvs=s,subject="not_implemented",type="not_implemented")))
+ } else {#unmatched subject, shouldn't happen
+ stop("Unmatched subject! If you see this, report it as a bug!")
+ }
+ })
+
+ #demote multimutants to enforce simple list structure
+ multiLengths <- sapply(out,length)
+ ids <- do.call(c,lapply(1:length(multiLengths),
+ function(i) if (multiLengths[[i]]==1) as.character(i) else paste0(i,".",1:multiLengths[[i]])
+ ))
+ out2 <- do.call(c,out)
+ names(out2) <- ids
+
+
+
+ to.df(out2)
+}
\ No newline at end of file
diff --git a/tools/hgvsparser/test-data/inputBuilder.csv b/tools/hgvsparser/test-data/inputBuilder.csv
new file mode 100644
index 00000000000..de5ccbe9c5b
--- /dev/null
+++ b/tools/hgvsparser/test-data/inputBuilder.csv
@@ -0,0 +1,6 @@
+"hgvs","subject","type","start","ancestral","variant"
+"p.Ala105*","protein","substitution",105,"Ala","*"
+"p.Ala105*","protein","substitution",105,"Ala","*"
+"p.Ala105Arg","protein","substitution",105,"Ala","Arg"
+"p.Ala105Arg","protein","substitution",105,"Ala","Arg"
+"p.Ala105Asn","protein","substitution",105,"Ala","Asn"
diff --git a/tools/hgvsparser/test-data/inputParser.csv b/tools/hgvsparser/test-data/inputParser.csv
new file mode 100644
index 00000000000..79c2f2cf69c
--- /dev/null
+++ b/tools/hgvsparser/test-data/inputParser.csv
@@ -0,0 +1,6 @@
+accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA,p_AA
+urn:mavedb:00000044-a-1#4199,NA,NA,p.Ala105*,NA,NA,1,A435,p.A435
+urn:mavedb:00000044-a-1#4200,NA,NA,p.Ala105*,NA,NA,2,A435,p.A435
+urn:mavedb:00000044-a-1#4187,NA,NA,p.Ala105Arg,-2.71,-2.76,1,A435R,p.A435R
+urn:mavedb:00000044-a-1#4188,NA,NA,p.Ala105Arg,-2.81,-2.76,2,A435R,p.A435R
+urn:mavedb:00000044-a-1#4181,NA,NA,p.Ala105Asn,-1.1,-1.13,1,A435N,p.A435N