forked from EddyRivasLab/easel
-
Notifications
You must be signed in to change notification settings - Fork 0
/
esl_histogram.h
126 lines (107 loc) · 5.25 KB
/
esl_histogram.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Collection and display of score histograms.
*
* SRE, Fri Jul 1 13:22:45 2005 [St. Louis]
*/
#ifndef eslHISTOGRAM_INCLUDED
#define eslHISTOGRAM_INCLUDED
#include "esl_config.h"
#include <math.h> /* floor() is in one of the macros */
/* Structure: ESL_HISTOGRAM
*
* Keeps a score histogram, in which scores are counted into bins of
* size (width) w.
* histogram starts at bmin < floor(xmin/w) * w
* histogram ends at bmax >= ceil(xmax/w)*w
* nb = (bmax-bmin)/w
* each score x is counted into bin b = nb - (int) (bmax-x)/w
* each bin b contains scores bw+bmin < x <= (b+1)w + bmin
*
* Anything having to do with the counts themselves (obs, n, etc)
* is a uint64_t, with range 0..2^64-1 (up to 2e19).
*/
typedef struct {
/* The histogram is kept as counts in fixed-width bins.
*/
uint64_t *obs; /* observed counts in bin b, 0..nb-1 (dynamic) */
int nb; /* number of bins */
double w; /* fixed width of each bin */
double bmin, bmax; /* histogram bounds: all x satisfy bmin < x <= bmax */
int imin, imax; /* smallest, largest bin that contain obs[i] > 0 */
/* Optionally, in a "full" h, we can also keep all the raw samples in x.
*/
double xmin, xmax; /* smallest, largest sample value x observed */
uint64_t n; /* total number of raw data samples */
double *x; /* optional: raw sample values x[0..n-1] */
uint64_t nalloc; /* current allocated size of x */
/* The binned data might be censored (either truly, or virtually).
* This information has to be made available to a binned/censored
* parameter fitting function, and to goodness-of-fit tests.
*/
double phi; /* censoring value; all x_i > phi */
int cmin; /* smallest bin index that contains uncensored data */
uint64_t z; /* # of censored values <= phi */
uint64_t Nc; /* # samples in complete data (including unobs) */
uint64_t No; /* # of samples in observed data */
/* Expected binned counts are set by SetExpect() or SetExpectedTail().
*/
double *expect; /* expected counts in bin b, 0..nb-1 (not resized) */
int emin; /* smallest bin index that contains expected counts */
double tailbase; /* for tail fits: fitted x > tailbase */
double tailmass; /* for tail fits: fractional prob in the tail */
/* Some status flags
*/
int is_full; /* TRUE when we're keeping raw data in x */
int is_done; /* TRUE if we prevent more Add()'s */
int is_sorted; /* TRUE if x is sorted smallest-to-largest */
int is_tailfit; /* TRUE if expected dist only describes tail */
int is_rounded; /* TRUE if values aren't more accurate than bins */
enum { COMPLETE, VIRTUAL_CENSORED, TRUE_CENSORED } dataset_is;
} ESL_HISTOGRAM;
#define esl_histogram_Bin2LBound(h,b) ((h)->w*(b) + (h)->bmin)
#define esl_histogram_Bin2UBound(h,b) ((h)->w*((b)+1) + (h)->bmin)
/* Creating/destroying histograms and collecting data:
*/
extern ESL_HISTOGRAM *esl_histogram_Create (double bmin, double bmax, double w);
extern ESL_HISTOGRAM *esl_histogram_CreateFull(double bmin, double bmax, double w);
extern void esl_histogram_Destroy (ESL_HISTOGRAM *h);
extern int esl_histogram_Score2Bin(ESL_HISTOGRAM *h, double x, int *ret_b);
extern int esl_histogram_Add (ESL_HISTOGRAM *h, double x);
/* Declarations about the binned data before parameter fitting:
*/
extern int esl_histogram_DeclareCensoring(ESL_HISTOGRAM *h, int z, double phi);
extern int esl_histogram_DeclareRounding (ESL_HISTOGRAM *h);
extern int esl_histogram_SetTail (ESL_HISTOGRAM *h, double phi,
double *ret_newmass);
extern int esl_histogram_SetTailByMass (ESL_HISTOGRAM *h, double pmass,
double *ret_newmass);
/* Accessing data samples in a full histogram:
*/
extern int esl_histogram_GetRank(ESL_HISTOGRAM *h, int rank, double *ret_x);
extern int esl_histogram_GetData(ESL_HISTOGRAM *h, double **ret_x, int *ret_n);
extern int esl_histogram_GetTail(ESL_HISTOGRAM *h, double phi, double **ret_x,
int *ret_n, int *ret_z);
extern int esl_histogram_GetTailByMass(ESL_HISTOGRAM *h, double pmass,
double **ret_x, int *ret_n, int *ret_z);
/* Setting expected binned counts:
*/
extern int esl_histogram_SetExpect(ESL_HISTOGRAM *h,
double (*cdf)(double x, void *params),
void *params);
extern int esl_histogram_SetExpectedTail(ESL_HISTOGRAM *h, double base_val,
double pmass,
double (*cdf)(double x, void *params),
void *params);
/* Output/display of binned data:
*/
extern int esl_histogram_Write (FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_Plot (FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_PlotSurvival(FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_PlotQQ (FILE *fp, ESL_HISTOGRAM *h,
double (*invcdf)(double, void *), void *params);
/* Goodness of fit testing
*/
extern int esl_histogram_Goodness(ESL_HISTOGRAM *h, int nfitted,
int *ret_nbins,
double *ret_G, double *ret_Gp,
double *ret_X2, double *ret_X2p);
#endif /*eslHISTOGRAM_INCLUDED*/