From b9e5c8e097010373d84519b33f99c1b5227a2b2b Mon Sep 17 00:00:00 2001 From: martin-mann Date: Fri, 7 Sep 2018 17:22:19 +0200 Subject: [PATCH 1/9] + 'hybridDP|Bfull' output for full sequence lengths --- ChangeLog | 11 ++++++ README.md | 8 +++-- src/IntaRNA/Interaction.cpp | 57 ++++++++++++++++++++++++-------- src/IntaRNA/Interaction.h | 9 +++-- src/IntaRNA/OutputHandlerCsv.cpp | 8 +++++ src/IntaRNA/OutputHandlerCsv.h | 4 +++ 6 files changed, 78 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 91f38c3b..7e848be2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,15 @@ +180907 Martin Raden : + * IntaRNA/OutputHandlerCsv : + + 'hybridDP|Bfull' output for full sequence lengths + * IntaRNA/Interaction : + * dotBar() : + * dotBracket() : + + "fullLength" argument to trigger full-length or interaction-site-only + output + * README.md : + + 'hybridD(P|B)full' documentation + version 2.3.0 180906 Martin Raden : diff --git a/README.md b/README.md index bfb79ea1..93b6c14a 100644 --- a/README.md +++ b/README.md @@ -609,7 +609,7 @@ This information can be incorporated into IntaRNA predictions by providing *explicit seed information*. To this end, the `--seedTQ` parameter can be used. It takes a comma-separated list of seed string encodings in the format `startTbpsT&startQbpsQ`, which is in the same format as the IntaRNA `hybridDB` -output (see below), i.e. e.g. `--seedTQ='4|||.|&7||.||'` +output (see [below](#outModeCsv)), i.e. e.g. `--seedTQ='4|||.|&7||.||'` (ensure you quote the seed encoding to avoid a shell interpretation of the pipe symbol '|') to encode a seed interaction like the following @@ -772,8 +772,10 @@ are - `end1` : end index of hybrid in seq1 - `start2` : start index of hybrid in seq2 - `end2` : end index of hybrid in seq2 -- `hybridDP` : hybrid in VRNA dot-bracket notation -- `hybridDB` : hybrid in dot-bar notation +- `hybridDP` : hybrid in VRNA dot-bracket notation (interaction sites only) +- `hybridDPfull` : hybrid in VRNA dot-bracket notation (full sequence length) +- `hybridDB` : hybrid in dot-bar notation (interactin sites only) +- `hybridDBfull` : hybrid in dot-bar notation (full sequence length) - `E` : overall hybridization energy - `ED1` : ED value of seq1 - `ED2` : ED value of seq2 diff --git a/src/IntaRNA/Interaction.cpp b/src/IntaRNA/Interaction.cpp index 123749d2..95fb4946 100644 --- a/src/IntaRNA/Interaction.cpp +++ b/src/IntaRNA/Interaction.cpp @@ -131,36 +131,65 @@ operator= ( const InteractionRange & range ) std::string Interaction:: -dotBar( const Interaction & i ) +dotBar( const Interaction & i, const bool fullLength ) { #if INTARNA_IN_DEBUG_MODE if (!i.isValid()) throw std::runtime_error("Interaction::dotBar("+toString(i)+") not valid!"); #endif - // compile overall dot-bracket representation - return toString(i.basePairs.begin()->first +1) - + dotSomething(i.basePairs.begin(), i.basePairs.end(), true, '|') - +"&" - +toString(i.basePairs.rbegin()->second +1) - + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, '|') - ; + + // check whether to do full length output + if (fullLength) { + // compile dot-bar representation for full sequences + return "1" + + std::string(i.basePairs.begin()->first, '.' ) // leading unpaired s1 + + dotSomething(i.basePairs.begin(), i.basePairs.end(), true, '|') // s1 structure + + std::string(i.s1->size() - i.basePairs.rbegin()->first -1, '.' ) // trailing unpaired s1 + + "&" + + "1" + + std::string(i.basePairs.rbegin()->second, '.' ) // trailing unpaired s2 + + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, '|') // s2 structure + + std::string( i.s2->size() - i.basePairs.begin()->second -1, '.' ) // leading unpaired s2 + ; + } else { + // compile dot-bar representation for interacting subsequences only + return toString(i.basePairs.begin()->first +1) + + dotSomething(i.basePairs.begin(), i.basePairs.end(), true, '|') + +"&" + +toString(i.basePairs.rbegin()->second +1) + + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, '|') + ; + } } //////////////////////////////////////////////////////////////////////////// std::string Interaction:: -dotBracket( const Interaction & i, const char symOpen, const char symClose ) +dotBracket( const Interaction & i, const char symOpen, const char symClose, const bool fullLength ) { #if INTARNA_IN_DEBUG_MODE if (!i.isValid()) throw std::runtime_error("Interaction::dotBracket("+toString(i)+") not valid!"); #endif - // compile overall dot-bracket representation - return dotSomething(i.basePairs.begin(), i.basePairs.end(), true, symOpen) - +"&" - + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, symClose) - ; + + if (fullLength) { + // compile dot-bracket representation for full sequence lengths + return std::string(i.basePairs.begin()->first, '.' ) // leading unpaired s1 + + dotSomething(i.basePairs.begin(), i.basePairs.end(), true, symOpen) // s1 structure + + std::string(i.s1->size() - i.basePairs.rbegin()->first -1, '.' ) // trailing unpaired s1 + +"&" + + std::string(i.s2->size() - i.basePairs.rbegin()->second -1, '.' ) // leading unpaired s2 + + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, symClose) // s2 structure + + std::string(i.basePairs.rbegin()->second, '.' ) // trailing unpaired s2 + ; + } else { + // compile dot-bracket representation for interacting subsequences only + return dotSomething(i.basePairs.begin(), i.basePairs.end(), true, symOpen) + +"&" + + dotSomething(i.basePairs.rbegin(), i.basePairs.rend(), false, symClose) + ; + } } //////////////////////////////////////////////////////////////////////////// diff --git a/src/IntaRNA/Interaction.h b/src/IntaRNA/Interaction.h index 5fb876b1..73334b36 100644 --- a/src/IntaRNA/Interaction.h +++ b/src/IntaRNA/Interaction.h @@ -200,11 +200,14 @@ class Interaction { * needed yielding still a valid encoding (for the swapped sequences). * * @param i interaction to encode + * @param fullLength whether or not full length base pair annotation is to + * be printed + * * @return the dot-bracket encoding */ static std::string - dotBar( const Interaction & i ); + dotBar( const Interaction & i, const bool fullLength = false ); /** * Produces a dot-bracket encoding of the interaction in VRNA style in the @@ -221,12 +224,14 @@ class Interaction { * @param i interaction to encode * @param symOpen the symbol to be used for base pairs in dot-bracket-1 * @param symClose the symbol to be used for base pairs in dot-bracket-2 + * @param fullLength whether or not full length base pair annotation is to + * be printed * * @return the VRNA-styled dot-bracket encoding */ static std::string - dotBracket( const Interaction & i, const char symOpen = '(', const char symClose = ')'); + dotBracket( const Interaction & i, const char symOpen = '(', const char symClose = ')', const bool fullLength = false); protected: diff --git a/src/IntaRNA/OutputHandlerCsv.cpp b/src/IntaRNA/OutputHandlerCsv.cpp index 50bf7d44..e5e20462 100644 --- a/src/IntaRNA/OutputHandlerCsv.cpp +++ b/src/IntaRNA/OutputHandlerCsv.cpp @@ -158,6 +158,14 @@ add( const Interaction & i ) outTmp < Date: Thu, 8 Nov 2018 13:28:35 +0100 Subject: [PATCH 2/9] docu corrected --- src/IntaRNA/PredictionTrackerPairMinE.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/IntaRNA/PredictionTrackerPairMinE.h b/src/IntaRNA/PredictionTrackerPairMinE.h index e141a7a8..78085317 100644 --- a/src/IntaRNA/PredictionTrackerPairMinE.h +++ b/src/IntaRNA/PredictionTrackerPairMinE.h @@ -25,8 +25,8 @@ class PredictionTrackerPairMinE: public PredictionTracker public: /** - * Constructs a PredictionTracker that collected for each positions of a - * sequence the minimal energy of any interaction covering this position. + * Constructs a PredictionTracker that collected for index pair of two + * sequences the minimal energy of any interaction covering this position. * * Note, if a filename is provided, its stream is closed on destruction of * this object! @@ -45,8 +45,8 @@ class PredictionTrackerPairMinE: public PredictionTracker ); /** - * Constructs an PredictionTracker that collected for each positions of a - * sequence the minimal energy of any interaction covering this position. + * Constructs a PredictionTracker that collected for index pair of two + * sequences the minimal energy of any interaction covering this position. * * Note, the stream is NOT closed nor deleted on destruction of this object! * @@ -63,13 +63,13 @@ class PredictionTrackerPairMinE: public PredictionTracker ); /** - * destruction: write the profile(s) to the according streams. + * destruction: write the pair information to the according stream. */ virtual ~PredictionTrackerPairMinE(); /** - * Updates the profile information for each Predictor.updateOptima() call. + * Updates the minE information for each Predictor.updateOptima() call. * * @param i1 the index of the first sequence interacting with i2 * @param j1 the index of the first sequence interacting with j2 @@ -106,7 +106,7 @@ class PredictionTrackerPairMinE: public PredictionTracker E2dMatrix pairMinE; /** - * Writes profile data to stream. + * Writes minE data to stream. * * @param out the output stream to write to * @param pairMinE the minE data to write From 29e8082e0ed0f44479ec5b2857b48018b090384c Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 8 Nov 2018 16:45:46 +0100 Subject: [PATCH 3/9] exhaustive spot probability computation --- ChangeLog | 8 + README.md | 75 ++++++--- src/IntaRNA/Makefile.am | 2 + src/IntaRNA/PredictionTrackerSpotProbAll.cpp | 156 +++++++++++++++++++ src/IntaRNA/PredictionTrackerSpotProbAll.h | 141 +++++++++++++++++ src/bin/CommandLineParsing.cpp | 29 +++- src/bin/CommandLineParsing.h | 7 +- 7 files changed, 389 insertions(+), 29 deletions(-) create mode 100644 src/IntaRNA/PredictionTrackerSpotProbAll.cpp create mode 100644 src/IntaRNA/PredictionTrackerSpotProbAll.h diff --git a/ChangeLog b/ChangeLog index 91f38c3b..46e80126 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,12 @@ +181108 Martin Raden : + + IntaRNA/PredictionTrackerSpotProbAll : exhaustively computes and prints all + spot probabilities (ie for all intermolecular index pairs) + * bin/CommandLineParsing : + + registration and documentation of PredictionTrackerSpotProbAll usage + * README.md : + * exhaustive spot prob calculation documented + version 2.3.0 180906 Martin Raden : diff --git a/README.md b/README.md index bfb79ea1..68c27ce5 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ The following topics are covered by this documentation: - [Energy parameters and temperature](#energy) - [Additional output files](#outFiles) - [Minimal energy profiles](#profileMinE) - - [Spot probability profiles](#profileSpotProb) using partition functions - [Minimal energy for all intermolecular index pairs](#pairMinE) + - [Spot probability profiles](#profileSpotProb) using partition functions - [Interaction probabilities for interaction spots of interest](#spotProb) - [Accessibility and unpaired probabilities](#accessibility) - [Local versus global unpaired probabilities](#accLocalGlobal) @@ -974,23 +974,6 @@ mfe interaction close to the 5'-end of the molecule. -
- - -### Spot probability profiles - -Similarly to (minimal energy profiles)[#profileMinE], it is also possible to -compute position-wise probabilities how likely a position is covered by an -interaction, i.e. its *spot probability*. To the end, we compute for each -position $i$ the partition function $Zi$ of all interactions covering $i$. -Given the overall partition function *Z* including all possible interactions, -the position-speficit spot probability for *i* is given by *Zi/Z*. - -Such profiles can be generated using `--out=qSpotProb:MYPROFILEFILE.csv` or -`--out=tSpotProb:...` for the query/target sequence respectively and independently. - - -
@@ -1025,6 +1008,26 @@ mfe-site in the second sequence and are thus less likely to occure. +
+
+ +### Spot probability profiles + +Similarly to [minimal energy profiles](#profileMinE), it is also possible to +compute position-wise probabilities how likely a position is covered by an +interaction, i.e. its *spot probability*. To the end, we compute for each +position *i* the partition function *Zi* of all interactions covering *i*. +Given the overall partition function *Z* including all possible interactions, +the position-speficit spot probability for *i* is given by *Zi/Z*. + +Such profiles can be generated using `--out=qSpotProb:MYPROFILEFILE.csv` or +`--out=tSpotProb:...` for the query/target sequence respectively and independently. + +Note, instead of a file you can also write the profile to stream using `STDOUT` +or `STDERR` instead of a file name. + + +
@@ -1068,6 +1071,42 @@ Nevertheless, since the Boltzmann probabilities are dominated by the low(est) energy interactions, we consider the probability estimates as meaningful! +#### Spot probabilities for all intermolecular index pairs + +If interested in all intermolecular index pair combinations (all spots), you +can exclude the list from the call and only specify the output file/stream by +`--out="spotProb:MYSPOTPROBFILE.csv"`. The resulting semicolon-separated table provides the spot +probability for each index pair combination, as shown below. + +```[bash] + --> IntaRNA --out="spotProb:STDOUT" -t AAACACCCCCGGUGGUUUGG -q AAACACCCCCGGUGGUUUGG --energy=B -m E --noSeed --out=/dev/null +spotProb;A_1;A_2;A_3;C_4;A_5;C_6;C_7;C_8;C_9;C_10;G_11;G_12;U_13;G_14;G_15;U_16;U_17;U_18;G_19;G_20 +A_1;3.29634e-07;1.04259e-06;1.88581e-06;3.92703e-06;6.01889e-06;1.09014e-05;1.98091e-05;3.26691e-05;4.97285e-05;6.40822e-05;0.00114721;0.00220701;0.00540899;0.00237187;0.00360255;0.00701572;0.00700194;0.00417576;0;0 +A_2;1.04259e-06;3.29756e-06;6.59127e-06;1.23155e-05;2.15023e-05;3.65021e-05;6.65094e-05;0.000109528;0.000164207;0.000210854;0.00272534;0.00524301;0.00906912;0.00467934;0.00760322;0.0122582;0.0114086;0.00602621;0;0 +A_3;1.88581e-06;6.59127e-06;1.46527e-05;2.91728e-05;5.38079e-05;9.40555e-05;0.000179154;0.000297828;0.000439376;0.000558681;0.005502;0.0104376;0.0143365;0.00670991;0.0118697;0.0159918;0.0123526;0.00593264;0;0 +C_4;3.92703e-06;1.23155e-05;2.91728e-05;7.35161e-05;0.000143682;0.000297768;0.000618853;0.00101766;0.00145303;0.0017689;0.0102492;0.0153404;0.015348;0.0123563;0.0147842;0.0126304;0.00887796;0.00667471;0.00973898;0.0075106 +A_5;6.01889e-06;2.15023e-05;5.38079e-05;0.000143682;0.00027948;0.00055287;0.00115804;0.00206338;0.00313617;0.00403553;0.0127348;0.0189567;0.022699;0.0126698;0.0146316;0.0153814;0.0102196;0.00793081;0.00690787;0.00474379 +C_6;1.09014e-05;3.65021e-05;9.40555e-05;0.000297768;0.00055287;0.00127294;0.00278079;0.00507586;0.00801924;0.0102464;0.0234918;0.0277373;0.0219681;0.0173597;0.0181085;0.0126097;0.00712746;0.00444397;0.0150453;0.0139076 +C_7;1.98091e-05;6.65094e-05;0.000179154;0.000618853;0.00115804;0.00278079;0.00611847;0.0114976;0.0187521;0.0245813;0.0392287;0.0365652;0.0245582;0.0257423;0.0194922;0.00880615;0.00691222;0.00878555;0.0299224;0.0270425 +C_8;3.26691e-05;0.000109528;0.000297828;0.00101766;0.00206338;0.00507586;0.0114976;0.0224763;0.0380841;0.051392;0.0716197;0.0587535;0.0346146;0.0429812;0.0266964;0.00717352;0.00914256;0.0183812;0.0585854;0.0521578 +C_9;4.97285e-05;0.000164207;0.000439376;0.00145303;0.00313617;0.00801924;0.0187521;0.0380841;0.0671115;0.0930764;0.115935;0.0830931;0.045061;0.0572892;0.0333122;0.0068722;0.0139344;0.0357869;0.0859975;0.0736521 +C_10;6.40822e-05;0.000210854;0.000558681;0.0017689;0.00403553;0.0102464;0.0245813;0.051392;0.0930764;0.12986;0.14737;0.0865541;0.0493791;0.0589481;0.030814;0.00654186;0.0174543;0.0481296;0.085893;0.0665433 +G_11;0.00114721;0.00272534;0.005502;0.0102492;0.0127348;0.0234918;0.0392287;0.0716197;0.115935;0.14737;0.136251;0.0740332;0.0512369;0.0438958;0.0218223;0.0117975;0.0219606;0.0500723;0.0522355;0.033636 +G_12;0.00220701;0.00524301;0.0104376;0.0153404;0.0189567;0.0277373;0.0365652;0.0587535;0.0830931;0.0865541;0.0740332;0.0495326;0.0356528;0.0244903;0.0175123;0.0140905;0.0164346;0.0207754;0.0228082;0.0156495 +U_13;0.00540899;0.00906912;0.0143365;0.015348;0.022699;0.0219681;0.0245582;0.0346146;0.045061;0.0493791;0.0512369;0.0356528;0.0251168;0.0207634;0.017577;0.0111258;0.010028;0.012619;0.0218789;0.0161306 +G_14;0.00237187;0.00467934;0.00670991;0.0123563;0.0126698;0.0173597;0.0257423;0.0429812;0.0572892;0.0589481;0.0438958;0.0244903;0.0207634;0.0122356;0.00753315;0.0078345;0.00965907;0.0128517;0.0109001;0.0059406 +G_15;0.00360255;0.00760322;0.0118697;0.0147842;0.0146316;0.0181085;0.0194922;0.0266964;0.0333122;0.030814;0.0218223;0.0175123;0.017577;0.00753315;0.00840878;0.00965183;0.00775696;0.0062456;0.00442768;0.00259566 +U_16;0.00701572;0.0122582;0.0159918;0.0126304;0.0153814;0.0126097;0.00880615;0.00717352;0.0068722;0.00654186;0.0117975;0.0140905;0.0111258;0.0078345;0.00965183;0.00685222;0.00365991;0.00195865;0.00765631;0.00707759 +U_17;0.00700194;0.0114086;0.0123526;0.00887796;0.0102196;0.00712746;0.00691222;0.00914256;0.0139344;0.0174543;0.0219606;0.0164346;0.010028;0.00965907;0.00775696;0.00365991;0.00362107;0.00625881;0.0137988;0.0117224 +U_18;0.00417576;0.00602621;0.00593264;0.00667471;0.00793081;0.00444397;0.00878555;0.0183812;0.0357869;0.0481296;0.0500723;0.0207754;0.012619;0.0128517;0.0062456;0.00195865;0.00625881;0.0161844;0.0212005;0.0133051 +G_19;0;0;0;0.00973898;0.00690787;0.0150453;0.0299224;0.0585854;0.0859975;0.085893;0.0522355;0.0228082;0.0218789;0.0109001;0.00442768;0.00765631;0.0137988;0.0212005;0.0158313;0.00807077 +G_20;0;0;0;0.0075106;0.00474379;0.0139076;0.0270425;0.0521578;0.0736521;0.0665433;0.033636;0.0156495;0.0161306;0.0059406;0.00259566;0.00707759;0.0117224;0.0133051;0.00807077;0.00396313 +``` + +This data can be visualized in heatmaps as discussed for the [minimal energy heatmap](#pairMinE). + + +
diff --git a/src/IntaRNA/Makefile.am b/src/IntaRNA/Makefile.am index 04dda2a5..97273467 100644 --- a/src/IntaRNA/Makefile.am +++ b/src/IntaRNA/Makefile.am @@ -51,6 +51,7 @@ libIntaRNA_a_HEADERS = \ PredictionTrackerPairMinE.h \ PredictionTrackerProfileMinE.h \ PredictionTrackerSpotProb.h \ + PredictionTrackerSpotProbAll.h \ PredictionTrackerProfileSpotProb.h \ Predictor.h \ PredictorMaxProb.h \ @@ -99,6 +100,7 @@ libIntaRNA_a_SOURCES = \ PredictionTrackerPairMinE.cpp \ PredictionTrackerProfileMinE.cpp \ PredictionTrackerSpotProb.cpp \ + PredictionTrackerSpotProbAll.cpp \ PredictionTrackerProfileSpotProb.cpp \ PredictorMaxProb.cpp \ PredictorMfe.cpp \ diff --git a/src/IntaRNA/PredictionTrackerSpotProbAll.cpp b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp new file mode 100644 index 00000000..b1dd0646 --- /dev/null +++ b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp @@ -0,0 +1,156 @@ + +#include "IntaRNA/PredictionTrackerSpotProbAll.h" + +namespace IntaRNA { + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , const std::string & streamName + , const std::string E_INF_string + ) + : PredictionTracker() + , energy(energy) + , deleteStreamsOnDestruction(true) + , outStream(NULL) + , E_INF_string(E_INF_string) + , overallZ( 0.0 ) + , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 +{ +#if INTARNA_IN_DEBUG_MODE + if (streamName.empty()) { + throw std::runtime_error("PredictionTrackerSpotProbAll() : streamName empty"); + } +#endif + // open streams + outStream = newOutputStream( streamName ); + if (outStream == NULL) { + throw std::runtime_error("PredictionTrackerSpotProbAll() : could not open output stream '"+streamName+"' for writing"); + } +} + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , std::ostream * outStream + , const std::string E_INF_string + ) + : PredictionTracker() + , energy(energy) + , deleteStreamsOnDestruction(false) + , outStream(outStream) + , E_INF_string(E_INF_string) + , overallZ( 0.0 ) + , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 +{ +} + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +~PredictionTrackerSpotProbAll() +{ + writeData( *outStream + , pairZ + , overallZ + , energy + , E_INF_string ); + + // clean up if file pointers were created in constructor + if (deleteStreamsOnDestruction) { + deleteOutputStream( outStream ); + } +} + +////////////////////////////////////////////////////////////////////// + +void +PredictionTrackerSpotProbAll:: +updateOptimumCalled( const size_t i1_, const size_t j1_ + , const size_t i2_, const size_t j2_ + , const E_type curE + ) +{ + // get valid indices + const size_t i1 = (i1_ == RnaSequence::lastPos ? energy.size1()-1 : i1_); + const size_t j1 = (j1_ == RnaSequence::lastPos ? energy.size1()-1 : j1_); + const size_t i2 = (i2_ == RnaSequence::lastPos ? energy.size2()-1 : i2_); + const size_t j2 = (j2_ == RnaSequence::lastPos ? energy.size2()-1 : j2_); + +#if INTARNA_IN_DEBUG_MODE + // sanity checks + if (i1>=energy.size1() || j1>=energy.size1()) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : index-1 range ["+toString(i1)+","+toString(j1)+"] exceeds sequence-1 length "+toString(energy.size1())); + if (i1>j1) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : i1 "+toString(i1)+" > j1 "+toString(j1)); + if (i2>=energy.size2() || j2>=energy.size2()) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : index-2 range ["+toString(i2)+","+toString(j2)+"] exceeds sequence-2 length "+toString(energy.size2())); + if (i2>j2) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : i2 "+toString(i2)+" > j2 "+toString(j2)); +#endif + + // get Boltzmann weight contribution of this interaction + Z_type curWeight = energy.getBoltzmannWeight( curE ); + + // update overall partition function + overallZ += curWeight; + + // update pair data + for (size_t k1=i1; k1<=j1; k1++) { + for (size_t k2=i2; k2<=j2; k2++) { + // update partition function + pairZ(k1,k2) += curWeight; + } + } +} + + +////////////////////////////////////////////////////////////////////// + +void +PredictionTrackerSpotProbAll:: +writeData( std::ostream &out + , const Z2dMatrix & pairZ + , const Z_type & overallZ + , const InteractionEnergy & energy + , const std::string & E_INF_string ) +{ + // direct access to sequence string information + const std::string & rna1 = energy.getAccessibility1().getSequence().asString(); + const std::string & rna2 = energy.getAccessibility2().getSequence().asString(); + + // write in CSV-like format + // header = reversed seq2 + // col[0] = seq1 + // cell[0,0] = "minE" + + // print header : spotProb; "nt_index" ... starting with index 1 + out <<"spotProb"; + for (size_t j=rna2.size(); j-- > 0; ) { + out <<';' < 0; ) { + // out separator + out <<';'; + // out infinity replacement if needed + if ( E_isINF( pairZ(i,j) ) ) { + out< + +#include +#include + + +namespace IntaRNA { + +/** + * Collects for each intermolecular index pair the probability that this pair + * is covered by an interaction. + * + * The pair data is written to stream on destruction. + */ +class PredictionTrackerSpotProbAll: public PredictionTracker +{ + +public: + + /** + * Constructs a PredictionTracker that collected for each index pair of two + * sequences the probability to be covered by an interaction. + * + * Note, if a filename is provided, its stream is closed on destruction of + * this object! + * + * @param energy the energy function used for energy calculation + * @param streamName the stream name where the data + * is to be written to. use STDOUT/STDERR for the respective stream. + * Otherwise, an according file is created (has to be non-empty) + * @param E_INF_string the output string representation of E_INF values in + * the profile output + */ + PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , const std::string & streamName + , const std::string E_INF_string = "NA" + ); + + /** + * Constructs a PredictionTracker that collected for each index pair of two + * sequences the probability to be covered by an interaction. + * + * Note, the stream is NOT closed nor deleted on destruction of this object! + * + * @param energy the energy function used + * @param outStream the stream where the data + * is to be written to (has to be non-null) + * @param E_INF_string the output string representation of E_INF values in + * the profile output + */ + PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , std::ostream * outStream + , const std::string E_INF_string = "NA" + ); + + /** + * destruction: write the profile(s) to the according streams. + */ + virtual ~PredictionTrackerSpotProbAll(); + + + /** + * Updates the partition function information for each + * Predictor.updateOptima() call. + * + * @param i1 the index of the first sequence interacting with i2 + * @param j1 the index of the first sequence interacting with j2 + * @param i2 the index of the second sequence interacting with i1 + * @param j2 the index of the second sequence interacting with j1 + * @param energy the overall energy of the interaction site + */ + virtual + void + updateOptimumCalled( const size_t i1, const size_t j1 + , const size_t i2, const size_t j2 + , const E_type energy + ); + + +protected: + + //! energy handler used for predictions + const InteractionEnergy & energy; + + //! whether or not the streams are to be deleted on destruction + const bool deleteStreamsOnDestruction; + + //! the stream to write the minE-data to + std::ostream * outStream; + + //! the output string representation of E_INF values in the profile output + const std::string E_INF_string; + + //! type of partition function values + typedef double Z_type; + + //! overall partition function + Z_type overallZ; + + //! matrix type to hold the partition function for each index pair + typedef boost::numeric::ublas::matrix Z2dMatrix; + + //! the index-pair-wise minimal energy values + Z2dMatrix pairZ; + + + + /** + * Writes profile data to stream. + * + * @param out the output stream to write to + * @param pairZ the partition function data to write + * @param overallZ the overall partition function for pairZ + * @param energy the energy function used + * @param E_INF_string the string to be used for E_INF entries + */ + static + void + writeData( std::ostream &out + , const Z2dMatrix & pairZ + , const Z_type & overallZ + , const InteractionEnergy & energy + , const std::string & E_INF_string ); + + +}; + +////////////////////////////////////////////////////////////////////// + +} // namespace + +#endif /* INTARNA_PREDICTIONTRACKERSPOTPROBALL_H_ */ diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index e3ee168b..f077303b 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -48,6 +48,7 @@ extern "C" { #include "IntaRNA/PredictionTrackerPairMinE.h" #include "IntaRNA/PredictionTrackerProfileMinE.h" #include "IntaRNA/PredictionTrackerSpotProb.h" +#include "IntaRNA/PredictionTrackerSpotProbAll.h" #include "IntaRNA/PredictionTrackerProfileSpotProb.h" #include "IntaRNA/SeedHandlerMfe.h" @@ -554,8 +555,8 @@ CommandLineParsing::CommandLineParsing() "\n 'tSpotProb:' (target) for each position the probability that is is covered by an interaction covering (CSV format)" "\n 'tAcc:' (target) ED accessibility values ('tPu'-like format)." "\n 'tPu:' (target) unpaired probabilities values (RNAplfold format)." - "\n 'pMinE:' (query+target) for each index pair the minimal energy of any interaction covering the pair (CSV format)" - "\n 'spotProb:' (query+target) tracks for a given set of interaction spots their probability to be covered by an interaction. Spots are encoded by comma-separated 'idx1&idx2' pairs. For each spot a probability is provided in concert with the probability that none of the spots (encoded by '0&0') is covered (CSV format). The spot encoding is followed colon-separated by the output stream/file name, eg. '--out=\"spotProb:3&76,59&2:STDERR\"'. NOTE: value has to be quoted due to '&' symbol!" + "\n 'pMinE:' (target+query) for each index pair the minimal energy of any interaction covering the pair (CSV format)" + "\n 'spotProb:' (target+query) tracks for a given set of interaction spots their probability to be covered by an interaction. If no spots are provided, probabilities for all index combinations are computed. Spots are encoded by comma-separated 'idxT&idxQ' pairs (target-query). For each spot a probability is provided in concert with the probability that none of the spots (encoded by '0&0') is covered (CSV format). The spot encoding is followed colon-separated by the output stream/file name, eg. '--out=\"spotProb:3&76,59&2:STDERR\"'. NOTE: value has to be quoted due to '&' symbol!" "\nFor each, provide a file name or STDOUT/STDERR to write to the respective output stream." ).c_str()) ("outMode" @@ -1764,12 +1765,24 @@ getPredictor( const InteractionEnergy & energy, OutputHandler & output ) const // check if spotProbs are to be tracked if (!outPrefix2streamName.at(OutPrefixCode::OP_spotProb).empty()) { - predTracker->addPredictionTracker( - new PredictionTrackerSpotProb( energy - // get encoding - , outSpotProbSpots - , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) - ); + if (outSpotProbSpots.empty()) { + // track all spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProbAll( energy + // add sequence-specific prefix for output file + , getFullFilename( outPrefix2streamName.at(OutPrefixCode::OP_spotProb) + , &(energy.getAccessibility1().getSequence()) + , &(energy.getAccessibility2().getAccessibilityOrigin().getSequence())) + , "0") ); + } else { + // track only specific spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProb( energy + // get encoding + , outSpotProbSpots + , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) + ); + } } // check if any tracker registered diff --git a/src/bin/CommandLineParsing.h b/src/bin/CommandLineParsing.h index 42c6017e..c8ebd9cc 100644 --- a/src/bin/CommandLineParsing.h +++ b/src/bin/CommandLineParsing.h @@ -1704,9 +1704,10 @@ void CommandLineParsing::validate_out(const std::vector & list) { } // store prefix to identify another existence std::string streamName = v->substr(v->find(':')+1); - // handle SpotProb setup - if (curPrefCode == OP_spotProb) { - // get spotProb spots + + // handle SpotProb setup if spots are defined + if (curPrefCode == OP_spotProb && streamName.find(':') != std::string::npos) { + // get spots outSpotProbSpots = streamName.substr(0,streamName.find(':')); // check if valid spot encoding if (!boost::regex_match(outSpotProbSpots, PredictionTrackerSpotProb::regexSpotString, boost::match_perl)){ From 73d3f8c36659479d8750c2b0ee2e69ba603aa821 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 8 Nov 2018 13:28:35 +0100 Subject: [PATCH 4/9] docu corrected --- src/IntaRNA/PredictionTrackerPairMinE.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/IntaRNA/PredictionTrackerPairMinE.h b/src/IntaRNA/PredictionTrackerPairMinE.h index e141a7a8..78085317 100644 --- a/src/IntaRNA/PredictionTrackerPairMinE.h +++ b/src/IntaRNA/PredictionTrackerPairMinE.h @@ -25,8 +25,8 @@ class PredictionTrackerPairMinE: public PredictionTracker public: /** - * Constructs a PredictionTracker that collected for each positions of a - * sequence the minimal energy of any interaction covering this position. + * Constructs a PredictionTracker that collected for index pair of two + * sequences the minimal energy of any interaction covering this position. * * Note, if a filename is provided, its stream is closed on destruction of * this object! @@ -45,8 +45,8 @@ class PredictionTrackerPairMinE: public PredictionTracker ); /** - * Constructs an PredictionTracker that collected for each positions of a - * sequence the minimal energy of any interaction covering this position. + * Constructs a PredictionTracker that collected for index pair of two + * sequences the minimal energy of any interaction covering this position. * * Note, the stream is NOT closed nor deleted on destruction of this object! * @@ -63,13 +63,13 @@ class PredictionTrackerPairMinE: public PredictionTracker ); /** - * destruction: write the profile(s) to the according streams. + * destruction: write the pair information to the according stream. */ virtual ~PredictionTrackerPairMinE(); /** - * Updates the profile information for each Predictor.updateOptima() call. + * Updates the minE information for each Predictor.updateOptima() call. * * @param i1 the index of the first sequence interacting with i2 * @param j1 the index of the first sequence interacting with j2 @@ -106,7 +106,7 @@ class PredictionTrackerPairMinE: public PredictionTracker E2dMatrix pairMinE; /** - * Writes profile data to stream. + * Writes minE data to stream. * * @param out the output stream to write to * @param pairMinE the minE data to write From 650248530c8eeb6c0f53e575ce640cbfcca9d7f7 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 8 Nov 2018 16:56:21 +0100 Subject: [PATCH 5/9] merged --- ChangeLog | 7 + README.md | 75 ++++++--- src/IntaRNA/Makefile.am | 2 + src/IntaRNA/PredictionTrackerSpotProbAll.cpp | 156 +++++++++++++++++++ src/IntaRNA/PredictionTrackerSpotProbAll.h | 141 +++++++++++++++++ src/bin/CommandLineParsing.cpp | 29 +++- src/bin/CommandLineParsing.h | 7 +- 7 files changed, 388 insertions(+), 29 deletions(-) create mode 100644 src/IntaRNA/PredictionTrackerSpotProbAll.cpp create mode 100644 src/IntaRNA/PredictionTrackerSpotProbAll.h diff --git a/ChangeLog b/ChangeLog index 7e848be2..058ad4d3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +181108 Martin Raden : + + IntaRNA/PredictionTrackerSpotProbAll : exhaustively computes and prints all + spot probabilities (ie for all intermolecular index pairs) + * bin/CommandLineParsing : + + registration and documentation of PredictionTrackerSpotProbAll usage + * README.md : + * exhaustive spot prob calculation documented 180907 Martin Raden : * IntaRNA/OutputHandlerCsv : diff --git a/README.md b/README.md index 93b6c14a..09434697 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ The following topics are covered by this documentation: - [Energy parameters and temperature](#energy) - [Additional output files](#outFiles) - [Minimal energy profiles](#profileMinE) - - [Spot probability profiles](#profileSpotProb) using partition functions - [Minimal energy for all intermolecular index pairs](#pairMinE) + - [Spot probability profiles](#profileSpotProb) using partition functions - [Interaction probabilities for interaction spots of interest](#spotProb) - [Accessibility and unpaired probabilities](#accessibility) - [Local versus global unpaired probabilities](#accLocalGlobal) @@ -976,23 +976,6 @@ mfe interaction close to the 5'-end of the molecule. -
-
- -### Spot probability profiles - -Similarly to (minimal energy profiles)[#profileMinE], it is also possible to -compute position-wise probabilities how likely a position is covered by an -interaction, i.e. its *spot probability*. To the end, we compute for each -position $i$ the partition function $Zi$ of all interactions covering $i$. -Given the overall partition function *Z* including all possible interactions, -the position-speficit spot probability for *i* is given by *Zi/Z*. - -Such profiles can be generated using `--out=qSpotProb:MYPROFILEFILE.csv` or -`--out=tSpotProb:...` for the query/target sequence respectively and independently. - - -
@@ -1027,6 +1010,26 @@ mfe-site in the second sequence and are thus less likely to occure. +
+
+ +### Spot probability profiles + +Similarly to [minimal energy profiles](#profileMinE), it is also possible to +compute position-wise probabilities how likely a position is covered by an +interaction, i.e. its *spot probability*. To the end, we compute for each +position *i* the partition function *Zi* of all interactions covering *i*. +Given the overall partition function *Z* including all possible interactions, +the position-speficit spot probability for *i* is given by *Zi/Z*. + +Such profiles can be generated using `--out=qSpotProb:MYPROFILEFILE.csv` or +`--out=tSpotProb:...` for the query/target sequence respectively and independently. + +Note, instead of a file you can also write the profile to stream using `STDOUT` +or `STDERR` instead of a file name. + + +
@@ -1070,6 +1073,42 @@ Nevertheless, since the Boltzmann probabilities are dominated by the low(est) energy interactions, we consider the probability estimates as meaningful! +#### Spot probabilities for all intermolecular index pairs + +If interested in all intermolecular index pair combinations (all spots), you +can exclude the list from the call and only specify the output file/stream by +`--out="spotProb:MYSPOTPROBFILE.csv"`. The resulting semicolon-separated table provides the spot +probability for each index pair combination, as shown below. + +```[bash] + --> IntaRNA --out="spotProb:STDOUT" -t AAACACCCCCGGUGGUUUGG -q AAACACCCCCGGUGGUUUGG --energy=B -m E --noSeed --out=/dev/null +spotProb;A_1;A_2;A_3;C_4;A_5;C_6;C_7;C_8;C_9;C_10;G_11;G_12;U_13;G_14;G_15;U_16;U_17;U_18;G_19;G_20 +A_1;3.29634e-07;1.04259e-06;1.88581e-06;3.92703e-06;6.01889e-06;1.09014e-05;1.98091e-05;3.26691e-05;4.97285e-05;6.40822e-05;0.00114721;0.00220701;0.00540899;0.00237187;0.00360255;0.00701572;0.00700194;0.00417576;0;0 +A_2;1.04259e-06;3.29756e-06;6.59127e-06;1.23155e-05;2.15023e-05;3.65021e-05;6.65094e-05;0.000109528;0.000164207;0.000210854;0.00272534;0.00524301;0.00906912;0.00467934;0.00760322;0.0122582;0.0114086;0.00602621;0;0 +A_3;1.88581e-06;6.59127e-06;1.46527e-05;2.91728e-05;5.38079e-05;9.40555e-05;0.000179154;0.000297828;0.000439376;0.000558681;0.005502;0.0104376;0.0143365;0.00670991;0.0118697;0.0159918;0.0123526;0.00593264;0;0 +C_4;3.92703e-06;1.23155e-05;2.91728e-05;7.35161e-05;0.000143682;0.000297768;0.000618853;0.00101766;0.00145303;0.0017689;0.0102492;0.0153404;0.015348;0.0123563;0.0147842;0.0126304;0.00887796;0.00667471;0.00973898;0.0075106 +A_5;6.01889e-06;2.15023e-05;5.38079e-05;0.000143682;0.00027948;0.00055287;0.00115804;0.00206338;0.00313617;0.00403553;0.0127348;0.0189567;0.022699;0.0126698;0.0146316;0.0153814;0.0102196;0.00793081;0.00690787;0.00474379 +C_6;1.09014e-05;3.65021e-05;9.40555e-05;0.000297768;0.00055287;0.00127294;0.00278079;0.00507586;0.00801924;0.0102464;0.0234918;0.0277373;0.0219681;0.0173597;0.0181085;0.0126097;0.00712746;0.00444397;0.0150453;0.0139076 +C_7;1.98091e-05;6.65094e-05;0.000179154;0.000618853;0.00115804;0.00278079;0.00611847;0.0114976;0.0187521;0.0245813;0.0392287;0.0365652;0.0245582;0.0257423;0.0194922;0.00880615;0.00691222;0.00878555;0.0299224;0.0270425 +C_8;3.26691e-05;0.000109528;0.000297828;0.00101766;0.00206338;0.00507586;0.0114976;0.0224763;0.0380841;0.051392;0.0716197;0.0587535;0.0346146;0.0429812;0.0266964;0.00717352;0.00914256;0.0183812;0.0585854;0.0521578 +C_9;4.97285e-05;0.000164207;0.000439376;0.00145303;0.00313617;0.00801924;0.0187521;0.0380841;0.0671115;0.0930764;0.115935;0.0830931;0.045061;0.0572892;0.0333122;0.0068722;0.0139344;0.0357869;0.0859975;0.0736521 +C_10;6.40822e-05;0.000210854;0.000558681;0.0017689;0.00403553;0.0102464;0.0245813;0.051392;0.0930764;0.12986;0.14737;0.0865541;0.0493791;0.0589481;0.030814;0.00654186;0.0174543;0.0481296;0.085893;0.0665433 +G_11;0.00114721;0.00272534;0.005502;0.0102492;0.0127348;0.0234918;0.0392287;0.0716197;0.115935;0.14737;0.136251;0.0740332;0.0512369;0.0438958;0.0218223;0.0117975;0.0219606;0.0500723;0.0522355;0.033636 +G_12;0.00220701;0.00524301;0.0104376;0.0153404;0.0189567;0.0277373;0.0365652;0.0587535;0.0830931;0.0865541;0.0740332;0.0495326;0.0356528;0.0244903;0.0175123;0.0140905;0.0164346;0.0207754;0.0228082;0.0156495 +U_13;0.00540899;0.00906912;0.0143365;0.015348;0.022699;0.0219681;0.0245582;0.0346146;0.045061;0.0493791;0.0512369;0.0356528;0.0251168;0.0207634;0.017577;0.0111258;0.010028;0.012619;0.0218789;0.0161306 +G_14;0.00237187;0.00467934;0.00670991;0.0123563;0.0126698;0.0173597;0.0257423;0.0429812;0.0572892;0.0589481;0.0438958;0.0244903;0.0207634;0.0122356;0.00753315;0.0078345;0.00965907;0.0128517;0.0109001;0.0059406 +G_15;0.00360255;0.00760322;0.0118697;0.0147842;0.0146316;0.0181085;0.0194922;0.0266964;0.0333122;0.030814;0.0218223;0.0175123;0.017577;0.00753315;0.00840878;0.00965183;0.00775696;0.0062456;0.00442768;0.00259566 +U_16;0.00701572;0.0122582;0.0159918;0.0126304;0.0153814;0.0126097;0.00880615;0.00717352;0.0068722;0.00654186;0.0117975;0.0140905;0.0111258;0.0078345;0.00965183;0.00685222;0.00365991;0.00195865;0.00765631;0.00707759 +U_17;0.00700194;0.0114086;0.0123526;0.00887796;0.0102196;0.00712746;0.00691222;0.00914256;0.0139344;0.0174543;0.0219606;0.0164346;0.010028;0.00965907;0.00775696;0.00365991;0.00362107;0.00625881;0.0137988;0.0117224 +U_18;0.00417576;0.00602621;0.00593264;0.00667471;0.00793081;0.00444397;0.00878555;0.0183812;0.0357869;0.0481296;0.0500723;0.0207754;0.012619;0.0128517;0.0062456;0.00195865;0.00625881;0.0161844;0.0212005;0.0133051 +G_19;0;0;0;0.00973898;0.00690787;0.0150453;0.0299224;0.0585854;0.0859975;0.085893;0.0522355;0.0228082;0.0218789;0.0109001;0.00442768;0.00765631;0.0137988;0.0212005;0.0158313;0.00807077 +G_20;0;0;0;0.0075106;0.00474379;0.0139076;0.0270425;0.0521578;0.0736521;0.0665433;0.033636;0.0156495;0.0161306;0.0059406;0.00259566;0.00707759;0.0117224;0.0133051;0.00807077;0.00396313 +``` + +This data can be visualized in heatmaps as discussed for the [minimal energy heatmap](#pairMinE). + + +
diff --git a/src/IntaRNA/Makefile.am b/src/IntaRNA/Makefile.am index 04dda2a5..97273467 100644 --- a/src/IntaRNA/Makefile.am +++ b/src/IntaRNA/Makefile.am @@ -51,6 +51,7 @@ libIntaRNA_a_HEADERS = \ PredictionTrackerPairMinE.h \ PredictionTrackerProfileMinE.h \ PredictionTrackerSpotProb.h \ + PredictionTrackerSpotProbAll.h \ PredictionTrackerProfileSpotProb.h \ Predictor.h \ PredictorMaxProb.h \ @@ -99,6 +100,7 @@ libIntaRNA_a_SOURCES = \ PredictionTrackerPairMinE.cpp \ PredictionTrackerProfileMinE.cpp \ PredictionTrackerSpotProb.cpp \ + PredictionTrackerSpotProbAll.cpp \ PredictionTrackerProfileSpotProb.cpp \ PredictorMaxProb.cpp \ PredictorMfe.cpp \ diff --git a/src/IntaRNA/PredictionTrackerSpotProbAll.cpp b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp new file mode 100644 index 00000000..b1dd0646 --- /dev/null +++ b/src/IntaRNA/PredictionTrackerSpotProbAll.cpp @@ -0,0 +1,156 @@ + +#include "IntaRNA/PredictionTrackerSpotProbAll.h" + +namespace IntaRNA { + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , const std::string & streamName + , const std::string E_INF_string + ) + : PredictionTracker() + , energy(energy) + , deleteStreamsOnDestruction(true) + , outStream(NULL) + , E_INF_string(E_INF_string) + , overallZ( 0.0 ) + , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 +{ +#if INTARNA_IN_DEBUG_MODE + if (streamName.empty()) { + throw std::runtime_error("PredictionTrackerSpotProbAll() : streamName empty"); + } +#endif + // open streams + outStream = newOutputStream( streamName ); + if (outStream == NULL) { + throw std::runtime_error("PredictionTrackerSpotProbAll() : could not open output stream '"+streamName+"' for writing"); + } +} + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , std::ostream * outStream + , const std::string E_INF_string + ) + : PredictionTracker() + , energy(energy) + , deleteStreamsOnDestruction(false) + , outStream(outStream) + , E_INF_string(E_INF_string) + , overallZ( 0.0 ) + , pairZ( energy.size1(), energy.size2(), (Z_type)0.0 ) // init 0 +{ +} + +////////////////////////////////////////////////////////////////////// + +PredictionTrackerSpotProbAll:: +~PredictionTrackerSpotProbAll() +{ + writeData( *outStream + , pairZ + , overallZ + , energy + , E_INF_string ); + + // clean up if file pointers were created in constructor + if (deleteStreamsOnDestruction) { + deleteOutputStream( outStream ); + } +} + +////////////////////////////////////////////////////////////////////// + +void +PredictionTrackerSpotProbAll:: +updateOptimumCalled( const size_t i1_, const size_t j1_ + , const size_t i2_, const size_t j2_ + , const E_type curE + ) +{ + // get valid indices + const size_t i1 = (i1_ == RnaSequence::lastPos ? energy.size1()-1 : i1_); + const size_t j1 = (j1_ == RnaSequence::lastPos ? energy.size1()-1 : j1_); + const size_t i2 = (i2_ == RnaSequence::lastPos ? energy.size2()-1 : i2_); + const size_t j2 = (j2_ == RnaSequence::lastPos ? energy.size2()-1 : j2_); + +#if INTARNA_IN_DEBUG_MODE + // sanity checks + if (i1>=energy.size1() || j1>=energy.size1()) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : index-1 range ["+toString(i1)+","+toString(j1)+"] exceeds sequence-1 length "+toString(energy.size1())); + if (i1>j1) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : i1 "+toString(i1)+" > j1 "+toString(j1)); + if (i2>=energy.size2() || j2>=energy.size2()) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : index-2 range ["+toString(i2)+","+toString(j2)+"] exceeds sequence-2 length "+toString(energy.size2())); + if (i2>j2) throw std::runtime_error("PredictionTrackerSpotProbAll::updateProfile() : i2 "+toString(i2)+" > j2 "+toString(j2)); +#endif + + // get Boltzmann weight contribution of this interaction + Z_type curWeight = energy.getBoltzmannWeight( curE ); + + // update overall partition function + overallZ += curWeight; + + // update pair data + for (size_t k1=i1; k1<=j1; k1++) { + for (size_t k2=i2; k2<=j2; k2++) { + // update partition function + pairZ(k1,k2) += curWeight; + } + } +} + + +////////////////////////////////////////////////////////////////////// + +void +PredictionTrackerSpotProbAll:: +writeData( std::ostream &out + , const Z2dMatrix & pairZ + , const Z_type & overallZ + , const InteractionEnergy & energy + , const std::string & E_INF_string ) +{ + // direct access to sequence string information + const std::string & rna1 = energy.getAccessibility1().getSequence().asString(); + const std::string & rna2 = energy.getAccessibility2().getSequence().asString(); + + // write in CSV-like format + // header = reversed seq2 + // col[0] = seq1 + // cell[0,0] = "minE" + + // print header : spotProb; "nt_index" ... starting with index 1 + out <<"spotProb"; + for (size_t j=rna2.size(); j-- > 0; ) { + out <<';' < 0; ) { + // out separator + out <<';'; + // out infinity replacement if needed + if ( E_isINF( pairZ(i,j) ) ) { + out< + +#include +#include + + +namespace IntaRNA { + +/** + * Collects for each intermolecular index pair the probability that this pair + * is covered by an interaction. + * + * The pair data is written to stream on destruction. + */ +class PredictionTrackerSpotProbAll: public PredictionTracker +{ + +public: + + /** + * Constructs a PredictionTracker that collected for each index pair of two + * sequences the probability to be covered by an interaction. + * + * Note, if a filename is provided, its stream is closed on destruction of + * this object! + * + * @param energy the energy function used for energy calculation + * @param streamName the stream name where the data + * is to be written to. use STDOUT/STDERR for the respective stream. + * Otherwise, an according file is created (has to be non-empty) + * @param E_INF_string the output string representation of E_INF values in + * the profile output + */ + PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , const std::string & streamName + , const std::string E_INF_string = "NA" + ); + + /** + * Constructs a PredictionTracker that collected for each index pair of two + * sequences the probability to be covered by an interaction. + * + * Note, the stream is NOT closed nor deleted on destruction of this object! + * + * @param energy the energy function used + * @param outStream the stream where the data + * is to be written to (has to be non-null) + * @param E_INF_string the output string representation of E_INF values in + * the profile output + */ + PredictionTrackerSpotProbAll( + const InteractionEnergy & energy + , std::ostream * outStream + , const std::string E_INF_string = "NA" + ); + + /** + * destruction: write the profile(s) to the according streams. + */ + virtual ~PredictionTrackerSpotProbAll(); + + + /** + * Updates the partition function information for each + * Predictor.updateOptima() call. + * + * @param i1 the index of the first sequence interacting with i2 + * @param j1 the index of the first sequence interacting with j2 + * @param i2 the index of the second sequence interacting with i1 + * @param j2 the index of the second sequence interacting with j1 + * @param energy the overall energy of the interaction site + */ + virtual + void + updateOptimumCalled( const size_t i1, const size_t j1 + , const size_t i2, const size_t j2 + , const E_type energy + ); + + +protected: + + //! energy handler used for predictions + const InteractionEnergy & energy; + + //! whether or not the streams are to be deleted on destruction + const bool deleteStreamsOnDestruction; + + //! the stream to write the minE-data to + std::ostream * outStream; + + //! the output string representation of E_INF values in the profile output + const std::string E_INF_string; + + //! type of partition function values + typedef double Z_type; + + //! overall partition function + Z_type overallZ; + + //! matrix type to hold the partition function for each index pair + typedef boost::numeric::ublas::matrix Z2dMatrix; + + //! the index-pair-wise minimal energy values + Z2dMatrix pairZ; + + + + /** + * Writes profile data to stream. + * + * @param out the output stream to write to + * @param pairZ the partition function data to write + * @param overallZ the overall partition function for pairZ + * @param energy the energy function used + * @param E_INF_string the string to be used for E_INF entries + */ + static + void + writeData( std::ostream &out + , const Z2dMatrix & pairZ + , const Z_type & overallZ + , const InteractionEnergy & energy + , const std::string & E_INF_string ); + + +}; + +////////////////////////////////////////////////////////////////////// + +} // namespace + +#endif /* INTARNA_PREDICTIONTRACKERSPOTPROBALL_H_ */ diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index e3ee168b..f077303b 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -48,6 +48,7 @@ extern "C" { #include "IntaRNA/PredictionTrackerPairMinE.h" #include "IntaRNA/PredictionTrackerProfileMinE.h" #include "IntaRNA/PredictionTrackerSpotProb.h" +#include "IntaRNA/PredictionTrackerSpotProbAll.h" #include "IntaRNA/PredictionTrackerProfileSpotProb.h" #include "IntaRNA/SeedHandlerMfe.h" @@ -554,8 +555,8 @@ CommandLineParsing::CommandLineParsing() "\n 'tSpotProb:' (target) for each position the probability that is is covered by an interaction covering (CSV format)" "\n 'tAcc:' (target) ED accessibility values ('tPu'-like format)." "\n 'tPu:' (target) unpaired probabilities values (RNAplfold format)." - "\n 'pMinE:' (query+target) for each index pair the minimal energy of any interaction covering the pair (CSV format)" - "\n 'spotProb:' (query+target) tracks for a given set of interaction spots their probability to be covered by an interaction. Spots are encoded by comma-separated 'idx1&idx2' pairs. For each spot a probability is provided in concert with the probability that none of the spots (encoded by '0&0') is covered (CSV format). The spot encoding is followed colon-separated by the output stream/file name, eg. '--out=\"spotProb:3&76,59&2:STDERR\"'. NOTE: value has to be quoted due to '&' symbol!" + "\n 'pMinE:' (target+query) for each index pair the minimal energy of any interaction covering the pair (CSV format)" + "\n 'spotProb:' (target+query) tracks for a given set of interaction spots their probability to be covered by an interaction. If no spots are provided, probabilities for all index combinations are computed. Spots are encoded by comma-separated 'idxT&idxQ' pairs (target-query). For each spot a probability is provided in concert with the probability that none of the spots (encoded by '0&0') is covered (CSV format). The spot encoding is followed colon-separated by the output stream/file name, eg. '--out=\"spotProb:3&76,59&2:STDERR\"'. NOTE: value has to be quoted due to '&' symbol!" "\nFor each, provide a file name or STDOUT/STDERR to write to the respective output stream." ).c_str()) ("outMode" @@ -1764,12 +1765,24 @@ getPredictor( const InteractionEnergy & energy, OutputHandler & output ) const // check if spotProbs are to be tracked if (!outPrefix2streamName.at(OutPrefixCode::OP_spotProb).empty()) { - predTracker->addPredictionTracker( - new PredictionTrackerSpotProb( energy - // get encoding - , outSpotProbSpots - , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) - ); + if (outSpotProbSpots.empty()) { + // track all spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProbAll( energy + // add sequence-specific prefix for output file + , getFullFilename( outPrefix2streamName.at(OutPrefixCode::OP_spotProb) + , &(energy.getAccessibility1().getSequence()) + , &(energy.getAccessibility2().getAccessibilityOrigin().getSequence())) + , "0") ); + } else { + // track only specific spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProb( energy + // get encoding + , outSpotProbSpots + , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) + ); + } } // check if any tracker registered diff --git a/src/bin/CommandLineParsing.h b/src/bin/CommandLineParsing.h index 42c6017e..c8ebd9cc 100644 --- a/src/bin/CommandLineParsing.h +++ b/src/bin/CommandLineParsing.h @@ -1704,9 +1704,10 @@ void CommandLineParsing::validate_out(const std::vector & list) { } // store prefix to identify another existence std::string streamName = v->substr(v->find(':')+1); - // handle SpotProb setup - if (curPrefCode == OP_spotProb) { - // get spotProb spots + + // handle SpotProb setup if spots are defined + if (curPrefCode == OP_spotProb && streamName.find(':') != std::string::npos) { + // get spots outSpotProbSpots = streamName.substr(0,streamName.find(':')); // check if valid spot encoding if (!boost::regex_match(outSpotProbSpots, PredictionTrackerSpotProb::regexSpotString, boost::match_perl)){ From a096b5b0f79a59caa25181e06be1374cf23e8c6b Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 8 Nov 2018 16:58:23 +0100 Subject: [PATCH 6/9] merge merge --- ChangeLog | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index a10cbc69..058ad4d3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -17,14 +17,6 @@ * README.md : + 'hybridD(P|B)full' documentation -181108 Martin Raden : - + IntaRNA/PredictionTrackerSpotProbAll : exhaustively computes and prints all - spot probabilities (ie for all intermolecular index pairs) - * bin/CommandLineParsing : - + registration and documentation of PredictionTrackerSpotProbAll usage - * README.md : - * exhaustive spot prob calculation documented - version 2.3.0 180906 Martin Raden : From b49c9c36c9661975e25ee6cc69e4597f7a072b4a Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 8 Nov 2018 17:44:03 +0100 Subject: [PATCH 7/9] + parallel specific and exhaustive spotprob out --- src/bin/CommandLineParsing.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/bin/CommandLineParsing.h b/src/bin/CommandLineParsing.h index c8ebd9cc..707f1c84 100644 --- a/src/bin/CommandLineParsing.h +++ b/src/bin/CommandLineParsing.h @@ -254,6 +254,7 @@ class CommandLineParsing { OP_qPu, OP_tPu, OP_spotProb, + OP_spotProbAll, OP_UNKNOWN }; @@ -1705,19 +1706,25 @@ void CommandLineParsing::validate_out(const std::vector & list) { // store prefix to identify another existence std::string streamName = v->substr(v->find(':')+1); - // handle SpotProb setup if spots are defined - if (curPrefCode == OP_spotProb && streamName.find(':') != std::string::npos) { - // get spots - outSpotProbSpots = streamName.substr(0,streamName.find(':')); - // check if valid spot encoding - if (!boost::regex_match(outSpotProbSpots, PredictionTrackerSpotProb::regexSpotString, boost::match_perl)){ - // check sanity of spot encodings (indexing starts with 1) - LOG(ERROR) <<"--out : spot encoding '"< Date: Thu, 8 Nov 2018 17:51:23 +0100 Subject: [PATCH 8/9] fixing last commit --- src/bin/CommandLineParsing.cpp | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index f077303b..b2d30f44 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -1763,26 +1763,27 @@ getPredictor( const InteractionEnergy & energy, OutputHandler & output ) const , "NA") ); } - // check if spotProbs are to be tracked + // check if specific spotProbs are to be tracked if (!outPrefix2streamName.at(OutPrefixCode::OP_spotProb).empty()) { - if (outSpotProbSpots.empty()) { - // track all spots - predTracker->addPredictionTracker( - new PredictionTrackerSpotProbAll( energy - // add sequence-specific prefix for output file - , getFullFilename( outPrefix2streamName.at(OutPrefixCode::OP_spotProb) - , &(energy.getAccessibility1().getSequence()) - , &(energy.getAccessibility2().getAccessibilityOrigin().getSequence())) - , "0") ); - } else { - // track only specific spots - predTracker->addPredictionTracker( - new PredictionTrackerSpotProb( energy - // get encoding - , outSpotProbSpots - , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) - ); - } + // track only specific spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProb( energy + // get encoding + , outSpotProbSpots + , outPrefix2streamName.at(OutPrefixCode::OP_spotProb) ) + ); + } + + // check if all spotProbs are to be tracked + if (!outPrefix2streamName.at(OutPrefixCode::OP_spotProbAll).empty()) { + // track all spots + predTracker->addPredictionTracker( + new PredictionTrackerSpotProbAll( energy + // add sequence-specific prefix for output file + , getFullFilename( outPrefix2streamName.at(OutPrefixCode::OP_spotProbAll) + , &(energy.getAccessibility1().getSequence()) + , &(energy.getAccessibility2().getAccessibilityOrigin().getSequence())) + , "0") ); } // check if any tracker registered From f7b21f7a395cd77e48854ac44edc46565475bcf1 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Fri, 23 Nov 2018 11:25:43 +0100 Subject: [PATCH 9/9] v2.3.1 --- ChangeLog | 3 +++ configure.ac | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 058ad4d3..217ce339 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,6 @@ + +version 2.3.1 + 181108 Martin Raden : + IntaRNA/PredictionTrackerSpotProbAll : exhaustively computes and prints all spot probabilities (ie for all intermolecular index pairs) diff --git a/configure.ac b/configure.ac index ea26f92e..d597d599 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ AC_PREREQ([2.65]) # 5 argument version only available with aclocal >= 2.64 -AC_INIT([IntaRNA], [2.3.0], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) +AC_INIT([IntaRNA], [2.3.1], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) # minimal required version of the boost library BOOST_REQUIRED_VERSION=1.50.0