Skip to content

Commit

Permalink
Merge pull request #178 from BackofenLab/dev
Browse files Browse the repository at this point in the history
v3.1.4
  • Loading branch information
martin-raden authored Jan 31, 2020
2 parents 3a31d0b + 86dd7be commit eff24d0
Show file tree
Hide file tree
Showing 26 changed files with 757 additions and 292 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ script:
- cd $TRAVIS_BUILD_DIR
# generate autotools's files
- bash autotools-init.sh
# run configure
# run configure (without boost checks)
- ./configure --prefix=$HOME/IntaRNA --with-vrna=$HOME/miniconda/envs/build-IntaRNA --with-boost=no --without-zlib
# compile documentation
# - make doxygen-doc
# compile, test and install IntaRNA
- make -j 2 && make tests -j 2 && make install
##### check IntaRNA build #####
# run IntaRNA with help output
# run installed IntaRNA with help output
- $HOME/IntaRNA/bin/IntaRNA -h
50 changes: 50 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,59 @@
# changes in development version since last release
################################################################################


################################################################################
################################################################################


################################################################################
### version 3.1.4
################################################################################

# IntaRNA
- bugfix generation and tracing of seeds with bulges and no GU ends
- bugfix seed-extension prediction for seeds with bulges
- noLP for seeds with bulges enabled

# R
- `IntaRNA_CSV_p-value.R` script to estimate p-values based on energy values
- `IntaRNA_plotRegions.R` = renaming of former `plotRegions.R`

################################################################################

200130 Martin Raden
* IntaRNA/SeedHandlerMfe :
* bugfix generation and tracing of seeds with bulges and no GU ends
* IntaRNA/PredictorMfe*SeedExtension* :
* bugfix enumeration of seeds with bulges
* bin/CommandLineParsing :
* error msgs rephrased
+ noLP for seeds with bulges enabled
+ setup noLP for seed constraints via outNoLP
* IntaRNA/SeedConstraint :
+ isLpAllowed : whether or not lps are allowed in seeds
* IntaRNA/SeedHandlerMfe :
+ support for noLP constraint
* test/SeedHandlerMfe :
+ test with lp
+ test no lp (boundary)
+ test no lp (internal)
* test/*
* adaptation to SeedConstraint constructor changes

200121 Martin Raden
+ R/IntaRNA_CSV_p-value.R : former addPvalues2csv.R
+ R/IntaRNA_plotRegions.R : former plotRegions.R
- R/addPvalues2csv.R : renamed
- R/plotRegions.R : renamed
* README.md : adapted to renamings
+ R/Makefile.am : install R scripts

191115 Martin Raden
+ R/addPvalues2csv.R
* R/README.md :
+ docu of addPvalues2csv.R

################################################################################
### version 3.1.3
################################################################################
Expand Down
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
export GCC_COLORS ?= "error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01"

# sub directories to check for Makefiles
SUBDIRS = src python perl tests doc .
SUBDIRS = src python perl R tests doc .

# list of all personalities to be installed
#PERSONALITIES = `grep -P "^\\s+case\\s+IntaRNA\\S+\\s*:\\s*return" $(abs_top_srcdir)/src/bin/CommandLineParsing.h | sed "s/^\\s*case\\s\\+\\(IntaRNA\\S*\\)\\s\\+:\\s\\+return.*/\\1/g"`
Expand Down
150 changes: 150 additions & 0 deletions R/IntaRNA_CSV_p-value.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env Rscript

####################################################################
# Computes p-values and false discovery rates (fdr following Benjamini-Hochberg)
# by fitting a GEV on the energy values computed by IntaRNA.
# Note, such p-value estimates are only useful for genome-wide predictions.
#
# arguments: <IntaRNA-output-CSV> [<out-CSV> = <intarna-csv-output>] [<col-name-E> = E]
#
# 1 <IntaRNA-output-CSV> = ";"-separated CSV output of IntaRNA
# 2 <out-CSV> = file name to write the extended CSV output to (2 new columns)
# 3 <col-name-E> = the column name that holds the energy values to be fitted
#
# example call:
#
# Rscript --vanilla IntaRNA_CSV_p-value.R predictions.csv
#
# This script is part of the IntaRNA source code package. See
# respective licence and documentation for further information.
#
# https://github.com/BackofenLab/IntaRNA
#
####################################################################


####################################################################
# get command line arguments
####################################################################

# positional arguments given after the script name
args <- commandArgs(trailingOnly = TRUE)

# the input file is mandatory
if (length(args) < 1) {
  stop("call with <intarna-csv-output> [<out-file-with-p-values> = <intarna-csv-output>] [<col-name-E> = E]", call. = FALSE)
}

# argument 1 : IntaRNA CSV output to be read
intarnaOutputFile <- args[1]
if (!file.exists(intarnaOutputFile)) {
  stop("intarna-csv-output file '", intarnaOutputFile, "' does not exist!", call. = FALSE)
}

# argument 2 (optional) : file to write to; defaults to the input file,
# i.e. the input is overwritten with the extended table
outFile <- if (length(args) >= 2) args[2] else intarnaOutputFile

# argument 3 (optional) : name of the column that holds the energies
colNameE <- if (length(args) >= 3) args[3] else "E"

# column delimiter used in CSV input / output
csvColSep <- ";"

# number of digits p-values and fdr values are rounded to
pValPrec <- 7

####################################################################
# fits a generalized extreme value (GEV) distribution to the given energy
# data by minimizing the negative log-likelihood with optim()
# adopted from 'gev' function of 'evir' library
# Note: the fit is done on the NEGATED energies, i.e. low (good) energies
# form the upper tail of the fitted distribution.
# @param energy the IntaRNA energy values to fit (a numeric vector that
#        holds at least two distinct values, all of them finite)
# @return list of the fitting parameters xi, sigma, and mu
gevFitting <- function (energy)
####################################################################
{
  energy <- as.numeric(-energy)

  # fail early with a clear message: NA/Inf values or constant data would
  # otherwise only surface as an obscure error inside the optimization
  if (!all(is.finite(energy)) || length(unique(energy)) < 2) {
    stop("gev fit requires at least two distinct, finite energy values", call. = FALSE)
  }

  # start values: moment estimates of a Gumbel distribution for sigma and mu
  # and a small positive xi
  sigma0 <- sqrt(6 * var(energy)) / pi
  mu0 <- mean(energy) - 0.57722 * sigma0
  xi0 <- 0.1
  theta <- c(xi0, sigma0, mu0)

  # negative log-likelihood of the GEV with theta = c(xi, sigma, mu)
  negloglik <- function(theta, tmp) {
    y <- 1 + (theta[1] * (tmp - theta[3])) / theta[2]
    # outside of the valid parameter space (sigma <= 0 or data outside of
    # the support) the likelihood is undefined: return a large penalty
    if ((theta[2] <= 0) || any(y <= 0)) {
      out <- 1e+06
    } else {
      term1 <- length(tmp) * logb(theta[2])
      term2 <- sum((1 + 1 / theta[1]) * logb(y))
      term3 <- sum(y^(-1 / theta[1]))
      out <- term1 + term2 + term3
    }
    out
  }

  # compute fit (the Hessian is not needed since only the parameters are used)
  fit <- optim(theta, negloglik, tmp = energy)
  # optim reports a non-zero convergence code if it did not converge
  if (fit$convergence) {
    warning("gev fit optimization may not have succeeded")
  }

  return( list( xi=fit$par[1], sigma=fit$par[2], mu=fit$par[3] ) )
}


####################################################################
# computes p-values for the given energy values and GEV distribution,
# i.e. 1 - F(-energy) where F is the GEV distribution function
# adopted from 'pgev' function of 'evir' library
# Negated energies outside of the support of the distribution get the
# limit value of F (p-value 1 below the support, 0 above) instead of NaN
# or a meaningless number.
# @param energy IntaRNA energy values (numeric vector)
# @param gev GEV parameters (list with elements xi, mu, sigma)
# @return p-values for each energy value
gevPvalue <- function (energy, gev=list( xi = 1, mu = 0, sigma = 1) )
####################################################################
{
  # standardized negated energies
  z <- ((-energy) - gev$mu) / gev$sigma

  # xi == 0 : Gumbel limit of the GEV (the general formula divides by xi)
  if (abs(gev$xi) < 1e-8) {
    return ( 1 - exp( - exp( - z ) ) )
  }

  # clamp at 0 : a negative base marks a value outside of the support
  y <- pmax( 0, 1 + gev$xi * z )
  return ( 1 - exp( - y^(-1 / gev$xi) ) )
}



####################################################################
# parse IntaRNA CSV
####################################################################

# check.names = FALSE keeps the column names exactly as given in the file:
# otherwise read.csv makes duplicated names unique (which disables the
# uniqueness check below) and rewrites names like "p-value" to "p.value"
d <- read.csv( intarnaOutputFile, sep=csvColSep, check.names=FALSE )

# check if energy column present
if (!is.element(colNameE, colnames(d))) {
  stop("'",colNameE,"' is not among the column names of '",intarnaOutputFile,"'", call.=FALSE);
}
# check if unique
if (sum(colnames(d) == colNameE)>1) {
  stop("column name '",colNameE,"' occurs more than once in '",intarnaOutputFile,"'", call.=FALSE);
}


####################################################################
# fit p-values
####################################################################

# get energies to fit
E <- d[,colnames(d) == colNameE]

# the fit needs numbers: fail with a clear message otherwise
if (!is.numeric(E)) {
  stop("column '",colNameE,"' of '",intarnaOutputFile,"' does not hold numeric values", call.=FALSE);
}

# fit GEV (gevFitting negates the energies internally)
gevfit <- gevFitting(E)

# get rounded pValue
pVal <- round( gevPvalue( E, gevfit ), digits=pValPrec )
# get rounded fdr (Benjamini-Hochberg correction of the rounded p-values)
fdr <- round( p.adjust(pVal, method="BH"), digits=pValPrec )

####################################################################
# write output
####################################################################

# append the two new columns to the input table
o <- cbind( d, pVal, fdr )
colnames(o)[ncol(o)-1] <- "p-value"
colnames(o)[ncol(o)] <- "fdr"

write.table( o, outFile, sep=csvColSep, row.names=FALSE, col.names = TRUE, quote=FALSE )


#################################################################EOF
Loading

0 comments on commit eff24d0

Please sign in to comment.