-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #178 from BackofenLab/dev
v3.1.4
- Loading branch information
Showing
26 changed files
with
757 additions
and
292 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
#################################################################### | ||
# Computes p-values and false discovery rates (fdr following Benjamin+Hochberg) | ||
# by fitting a GEV on the energy values computed by IntaRNA. | ||
# Note, such p-value estimates are only useful for genome-wide predictions. | ||
# | ||
# arguments: <IntaRNA-output-CSV> [<out-CSV> = <intarna-csv-output>] [<col-name-E> = E] | ||
# | ||
# 1 <IntaRNA-output-CSV> = ";"-separated CSV output of IntaRNA | ||
# 2 <out-CSV> = file name to write the extended CSV output to (2 new columns) | ||
# 3 <col-name-E> = the column name that holds the energy values to be fitted | ||
# | ||
# example call: | ||
# | ||
# Rscript --vanilla IntaRNA_CSV_p-value.R predictions.csv | ||
# | ||
# This script is part of the IntaRNA source code package. See | ||
# respective licence and documentation for further information. | ||
# | ||
# https://github.com/BackofenLab/IntaRNA | ||
# | ||
#################################################################### | ||
|
||
|
||
#################################################################### | ||
# get command line arguments | ||
#################################################################### | ||
|
||
args = commandArgs(trailingOnly=TRUE) | ||
# check and parse | ||
if (length(args)<1) { stop("call with <intarna-csv-output> [<out-file-with-p-values> = <intarna-csv-output>] [<col-name-E> = E]", call.=FALSE) } | ||
|
||
# get input file = IntaRNA csv output | ||
intarnaOutputFile = args[1]; | ||
if (!file.exists(intarnaOutputFile )) { stop("intarna-csv-output file '", intarnaOutputFile, "' does not exist!", call.=FALSE) } | ||
|
||
# get output file | ||
outFile = intarnaOutputFile; | ||
if (length(args)>=2) { | ||
outFile = args[2] | ||
} | ||
|
||
# set column to get energies from | ||
colNameE = "E" | ||
# get column name from argument if present | ||
if (length(args)>=3) { | ||
colNameE = args[3] | ||
} | ||
|
||
# column delimiter used in CSV input / output | ||
csvColSep = ";" | ||
|
||
# number of digits of p-values | ||
pValPrec = 7 | ||
|
||
#################################################################### | ||
# fits a generalized extreme value distribution to the given energy data | ||
# adopted from 'gev' function of 'evir' library | ||
# @param energy the IntaRNA energy values to fit (a vector) | ||
# @return the fitting parameters xi, mu, and sigma | ||
gevFitting <- function (energy) | ||
#################################################################### | ||
{ | ||
n.all <- NA | ||
energy <- as.numeric(-energy) | ||
n <- length(energy) | ||
sigma0 <- sqrt(6 * var(energy))/pi | ||
mu0 <- mean(energy) - 0.57722 * sigma0 | ||
xi0 <- 0.1 | ||
theta <- c(xi0, sigma0, mu0) | ||
negloglik <- function(theta, tmp) { | ||
y <- 1 + (theta[1] * (tmp - theta[3]))/theta[2] | ||
if ((theta[2] < 0) || (min(y) < 0)) | ||
out <- 1e+06 | ||
else { | ||
term1 <- length(tmp) * logb(theta[2]) | ||
term2 <- sum((1 + 1/theta[1]) * logb(y)) | ||
term3 <- sum(y^(-1/theta[1])) | ||
out <- term1 + term2 + term3 | ||
} | ||
out | ||
} | ||
# compute fit | ||
fit <- optim(theta, negloglik, hessian = TRUE, tmp = energy) | ||
if (fit$convergence) | ||
warning("gev fit optimization may not have succeeded") | ||
|
||
return( list( xi=fit$par[1], sigma=fit$par[2], mu=fit$par[3] ) ) | ||
} | ||
|
||
|
||
#################################################################### | ||
# computes p-values for the given energy values and GEV distribution | ||
# adopted from 'pgev' function of 'evir' library | ||
# @param energy IntaRNA energy values | ||
# @param gev GEV parameters | ||
# @return p-values for each energy value | ||
gevPvalue <- function (energy, gev=list( xi = 1, mu = 0, sigma = 1) ) | ||
#################################################################### | ||
{ | ||
return ( 1 - exp( - (1 + (gev$xi * ((-energy) - gev$mu))/gev$sigma)^(-1 /gev$xi))) | ||
} | ||
|
||
|
||
|
||
#################################################################### | ||
# parse IntaRNA CSV | ||
#################################################################### | ||
|
||
d = read.csv( intarnaOutputFile, sep=csvColSep ) | ||
|
||
# check if energy column present | ||
if (!is.element(colNameE, colnames(d))) { | ||
stop("'",colNameE,"' is not among the column names of '",intarnaOutputFile,"'", call.=FALSE); | ||
} | ||
# check if unique | ||
if (sum(colnames(d) == colNameE)>1) { | ||
stop("column name '",colNameE,"' occurs more than once in '",intarnaOutputFile,"'", call.=FALSE); | ||
} | ||
|
||
|
||
#################################################################### | ||
# fit p-values | ||
#################################################################### | ||
|
||
# get energies to fit | ||
E = d[,colnames(d) == colNameE] | ||
|
||
|
||
# fit negated energies | ||
gevfit <- gevFitting(E) # fitten | ||
|
||
# get rounded pValue | ||
pVal <- round( gevPvalue( E, gevfit ), digits=pValPrec ) | ||
# get rounded fdr | ||
fdr <- round( p.adjust(pVal, method="BH"), digits=pValPrec ) | ||
|
||
#################################################################### | ||
# write output | ||
#################################################################### | ||
|
||
o = cbind( d, pVal, fdr ) | ||
colnames(o)[ncol(o)-1] = "p-value" | ||
colnames(o)[ncol(o)] = "fdr" | ||
|
||
write.table( o, outFile, sep=csvColSep, row.names=FALSE, col.names = TRUE, quote=FALSE ) | ||
|
||
|
||
#################################################################EOF |
Oops, something went wrong.