-
Notifications
You must be signed in to change notification settings - Fork 24
/
tdigest.go
452 lines (397 loc) · 12.1 KB
/
tdigest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
// Package tdigest provides a highly accurate mergeable data-structure
// for quantile estimation.
//
// Typical T-Digest use cases involve accumulating metrics on several
// distinct nodes of a cluster and then merging them together to get
// a system-wide quantile overview. Things such as: sensory data from
// IoT devices, quantiles over enormous document datasets (think
// ElasticSearch), performance metrics for distributed systems, etc.
//
// After you create (and configure, if desired) the digest:
//
//	digest, err := tdigest.New(tdigest.Compression(100))
//
// You can then use it for registering measurements:
//
//	digest.Add(number)
//
// Estimating quantiles:
//
//	digest.Quantile(0.99)
//
// And merging with another digest:
//
//	digest.Merge(otherDigest)
package tdigest
import (
"fmt"
"math"
)
// TDigest is a quantile approximation data structure.
//
// Instances are meant to be created with New, which fills in every
// field below.
type TDigest struct {
// summary holds the centroids (means and their counts) that
// approximate the distribution.
summary *summary
// compression limits how much weight a centroid may absorb (see
// chooseMergeCandidate) and decides when AddWeighted compresses the
// digest automatically; New defaults it to 100.
compression float64
// count is the total weight of all samples registered so far.
count uint64
// rng supplies randomness for shuffling centroids and for picking a
// merge candidate; it defaults to newLocalRNG(1).
rng RNG
}
// New builds a ready-to-use digest, applying the given options in
// order.
//
// With no options the digest uses a compression of 100 and a local
// random number generator. An error reported by any option aborts
// construction and is returned unchanged.
func New(options ...tdigestOption) (*TDigest, error) {
	td, err := newWithoutSummary(options...)
	if err != nil {
		return nil, err
	}

	// The summary is sized from the final compression, i.e. after all
	// options had a chance to change it.
	capacity := estimateCapacity(td.compression)
	td.summary = newSummary(capacity)
	return td, nil
}
// newWithoutSummary builds a TDigest with the default compression of
// 100, applies every option in order and leaves the summary field
// unset.
//
// The first option that fails stops construction and its error is
// returned. When no option installed a random number generator,
// newLocalRNG(1) is used.
func newWithoutSummary(options ...tdigestOption) (*TDigest, error) {
	td := &TDigest{compression: 100}

	for i := range options {
		if err := options[i](td); err != nil {
			return nil, err
		}
	}

	if td.rng == nil {
		td.rng = newLocalRNG(1)
	}
	return td, nil
}
// _quantile linearly interpolates between previousMean (located at
// previousIndex) and nextMean (located at nextIndex) to estimate the
// value at index.
//
// The two indexes must differ: the distance between them is used as a
// divisor.
func _quantile(index float64, previousIndex float64, nextIndex float64, previousMean float64, nextMean float64) float64 {
	span := nextIndex - previousIndex
	// The closer index is to one end, the more that end's mean counts.
	weightOfPrevious := (nextIndex - index) / span
	weightOfNext := (index - previousIndex) / span
	return previousMean*weightOfPrevious + nextMean*weightOfNext
}
// Compression reports the compression value this digest is configured
// with.
func (t *TDigest) Compression() float64 {
	configured := t.compression
	return configured
}
// Quantile returns the estimated value at quantile q.
//
// Values of q must be between 0 and 1 (inclusive), will panic otherwise.
// An empty digest yields NaN and a digest holding a single centroid
// yields that centroid's mean.
func (t *TDigest) Quantile(q float64) float64 {
if q < 0 || q > 1 {
panic("q must be between 0 and 1 (inclusive)")
}
if t.summary.Len() == 0 {
return math.NaN()
} else if t.summary.Len() == 1 {
return t.summary.Mean(0)
}
// Zero-based position of the requested quantile among t.count samples.
index := q * float64(t.count-1)
previousMean := math.NaN()
previousIndex := float64(0)
// NOTE(review): FloorSum presumably returns the centroid to start
// scanning from plus the summed count of every centroid before it -
// confirm against summary.FloorSum.
next, total := t.summary.FloorSum(index)
if next > 0 {
// Remember the centroid right before next as the left-hand
// interpolation point (its center position, given the assumption
// about total above).
previousMean = t.summary.Mean(next - 1)
previousIndex = total - float64(t.summary.Count(next-1)+1)/2
}
for {
// Center position of the centroid currently being considered.
nextIndex := total + float64(t.summary.Count(next)-1)/2
if nextIndex >= index {
if math.IsNaN(previousMean) {
// the index is before the 1st centroid
if nextIndex == previousIndex {
return t.summary.Mean(next)
}
// assume linear growth: extend the line through the centers of
// this centroid and the following one back to position zero and
// use that value as the left-hand interpolation point
nextIndex2 := total + float64(t.summary.Count(next)) + float64(t.summary.Count(next+1)-1)/2
previousMean = (nextIndex2*t.summary.Mean(next) - nextIndex*t.summary.Mean(next+1)) / (nextIndex2 - nextIndex)
}
// common case: two centroids found, the result is in between
return _quantile(index, previousIndex, nextIndex, previousMean, t.summary.Mean(next))
} else if next+1 == t.summary.Len() {
// the index is after the last centroid: extend the line through
// the previous and the last center up to the final position
// (t.count - 1) and interpolate on that extension
nextIndex2 := float64(t.count - 1)
nextMean2 := (t.summary.Mean(next)*(nextIndex2-previousIndex) - previousMean*(nextIndex2-nextIndex)) / (nextIndex - previousIndex)
return _quantile(index, nextIndex, nextIndex2, t.summary.Mean(next), nextMean2)
}
// Not there yet: the current centroid becomes the left-hand point
// and the scan moves on to the following one.
total += float64(t.summary.Count(next))
previousMean = t.summary.Mean(next)
previousIndex = nextIndex
next++
}
// unreachable
}
// boundedWeightedAverage returns the average of x1 and x2 weighted by
// w1 and w2 respectively, clamped so that the result always lies
// between the two values (inclusive) even when floating point
// rounding would push it slightly outside.
//
// Refer to https://github.com/caio/go-tdigest/pull/19 for more details
func boundedWeightedAverage(x1 float64, w1 float64, x2 float64, w2 float64) float64 {
	lo, loWeight := x1, w1
	hi, hiWeight := x2, w2
	if lo > hi {
		// Order the operands so the clamp below has a proper [lo, hi] range.
		lo, hi = hi, lo
		loWeight, hiWeight = hiWeight, loWeight
	}

	avg := (lo*loWeight + hi*hiWeight) / (loWeight + hiWeight)
	return math.Max(lo, math.Min(avg, hi))
}
// AddWeighted registers a new sample in the digest.
//
// It's the main entry point for the digest and very likely the only
// method to be used for collecting samples. The count parameter is for
// when you are registering a sample that occurred multiple times - the
// most common value for this is 1.
//
// The sample is either merged into a nearby centroid that can still
// absorb it or stored as a new centroid. When the number of centroids
// grows past 20 times the compression, the digest compresses itself
// before returning and any error from that compression is returned.
//
// An error is returned, and the digest is left untouched, if value is
// NaN or if count is zero.
func (t *TDigest) AddWeighted(value float64, count uint64) (err error) {
	// Reject bad input up front: a NaN merged into an existing centroid
	// would silently turn that centroid's mean into NaN.
	if math.IsNaN(value) || count == 0 {
		return fmt.Errorf("illegal datapoint <value: %.4f, count: %d>", value, count)
	}

	if t.summary.Len() == 0 {
		if err = t.summary.Add(value, count); err != nil {
			return err
		}
		// Only account for the sample once it has actually been stored.
		t.count = count
		return nil
	}

	start := t.summary.Floor(value)
	if start == -1 {
		start = 0
	}
	begin, end := t.findNeighbors(start, value)
	closest := t.chooseMergeCandidate(begin, end, count)

	if closest == t.summary.Len() {
		// No centroid can absorb the sample: store it on its own.
		if err = t.summary.Add(value, count); err != nil {
			return err
		}
	} else {
		// Keep the stored count as an integer so that very large
		// weights are not rounded by a float64 round trip.
		existing := t.summary.Count(closest)
		newMean := boundedWeightedAverage(t.summary.Mean(closest), float64(existing), value, float64(count))
		t.summary.setAt(closest, newMean, existing+count)
	}
	t.count += count

	if float64(t.summary.Len()) > 20*t.compression {
		return t.Compress()
	}
	return nil
}
// Count returns the total weight of the samples this digest
// represents.
//
// Every sample registered through Add contributes 1, samples
// registered through AddWeighted contribute their count, and merging
// another digest contributes the weight of each of its centroids.
// This is useful mainly for two scenarios:
//
//   - Knowing if there is enough data so you can trust the quantiles
//
//   - Knowing if you've registered too many samples already and
//     deciding what to do about it.
//
// For the second case one approach would be to create a side empty
// digest and start registering samples on it as well as on the old
// (big) one and then discard the bigger one after a certain criterion
// is reached (say, minimum number of samples or a small relative
// error between new and old digests).
func (t TDigest) Count() uint64 {
	total := t.count
	return total
}
// Add registers a single occurrence of value. It is shorthand for
// AddWeighted(value, 1); read the documentation for AddWeighted for
// more details.
func (t *TDigest) Add(value float64) error {
	const once uint64 = 1
	return t.AddWeighted(value, once)
}
// Compress tries to reduce the number of individual centroids stored
// in the digest.
//
// It does so by swapping in a fresh summary, resetting the sample
// count and re-adding every centroid of the old summary (shuffled
// with the digest's rng) through AddWeighted. A digest with at most
// one centroid is left as it is. The first error reported by
// AddWeighted stops the re-adding and is returned.
//
// Compression trades off accuracy for performance and happens
// automatically once the digest holds too many centroids, so calling
// it is optional. If you are minimizing network traffic it might be a
// good idea to compress before serializing.
func (t *TDigest) Compress() (err error) {
	if t.summary.Len() <= 1 {
		return nil
	}

	// reinsert feeds one old centroid back into the digest and asks the
	// iteration to stop as soon as an insertion fails.
	reinsert := func(mean float64, count uint64) bool {
		if err = t.AddWeighted(mean, count); err != nil {
			return false
		}
		return true
	}

	previous := t.summary
	t.summary = newSummary(estimateCapacity(t.compression))
	t.count = 0

	previous.shuffle(t.rng)
	previous.ForEach(reinsert)
	return err
}
// Merge adds every centroid of other to this digest.
//
// Merging is useful when you have multiple TDigest instances running
// in separate threads and you want to compute quantiles over all the
// samples. This is particularly important on a scatter-gather/map-reduce
// scenario.
//
// The centroids of other are visited through summary.Perm, driven by
// this digest's rng, and each one is registered with AddWeighted. The
// first error stops the merge and is returned; an empty other is a
// no-op.
func (t *TDigest) Merge(other *TDigest) (err error) {
	if other.summary.Len() == 0 {
		return nil
	}

	// absorb registers one centroid and halts the iteration on failure.
	absorb := func(mean float64, count uint64) bool {
		if err = t.AddWeighted(mean, count); err != nil {
			return false
		}
		return true
	}
	other.summary.Perm(t.rng, absorb)
	return err
}
// MergeDestructive adds every centroid of other to this digest,
// rendering the other digest invalid.
//
// This works like Merge but it's faster: instead of visiting a
// permutation of the centroids it shuffles other's summary in place
// and then iterates over it. Using this method requires caution as it
// makes 'other' useless - you must make sure you discard it without
// making further uses of it.
//
// The first error reported by AddWeighted stops the merge and is
// returned; an empty other is a no-op.
func (t *TDigest) MergeDestructive(other *TDigest) (err error) {
	if other.summary.Len() == 0 {
		return nil
	}

	// absorb registers one centroid and halts the iteration on failure.
	absorb := func(mean float64, count uint64) bool {
		if err = t.AddWeighted(mean, count); err != nil {
			return false
		}
		return true
	}
	other.summary.shuffle(t.rng)
	other.summary.ForEach(absorb)
	return err
}
// CDF estimates the fraction of samples that are less than or equal
// to the given value.
//
// An empty digest yields NaN. With a single centroid the result is a
// step: 0 below the centroid's mean and 1 otherwise. In the general
// case every centroid but the last is assumed to spread its samples
// evenly between the midpoints to its neighbors, and any value beyond
// the range covered that way yields 1. The result is never negative.
func (t *TDigest) CDF(value float64) float64 {
	switch t.summary.Len() {
	case 0:
		return math.NaN()
	case 1:
		if value < t.summary.Mean(0) {
			return 0
		}
		return 1
	}

	// fraction turns an accumulated (possibly negative, when value lies
	// left of a centroid's assumed range) weight into a share of the
	// whole digest, never going below zero.
	fraction := func(weight float64) float64 {
		if v := weight / float64(t.Count()); v > 0 {
			return v
		}
		return 0
	}

	// We have at least 2 centroids. The first one has no left neighbor,
	// so its left half-width mirrors the right one.
	left := (t.summary.Mean(1) - t.summary.Mean(0)) / 2
	right := left
	tot := 0.0

	for i := 1; i < t.summary.Len()-1; i++ {
		prevMean := t.summary.Mean(i - 1)
		if value < prevMean+right {
			return fraction(tot + float64(t.summary.Count(i-1))*interpolate(value, prevMean-left, prevMean+right))
		}

		tot += float64(t.summary.Count(i - 1))
		left = right
		right = (t.summary.Mean(i+1) - t.summary.Mean(i)) / 2
	}

	// Next to last centroid, the summary length is at least two. This
	// branch is also the only one taken when there are exactly two
	// centroids, so it needs the same lower clamp as the loop above:
	// without it a value far below the first mean produced a negative
	// result.
	aIdx := t.summary.Len() - 2
	aMean := t.summary.Mean(aIdx)
	if value < aMean+right {
		aCount := float64(t.summary.Count(aIdx))
		return fraction(tot + aCount*interpolate(value, aMean-left, aMean+right))
	}
	return 1
}
// Clone returns a new TDigest carrying a clone of the receiver's
// summary together with the same compression and count.
//
// The rng value is copied as-is rather than cloned, so depending on
// the RNG implementation the clone may share random number generator
// state with the original.
func (t *TDigest) Clone() *TDigest {
	dup := new(TDigest)
	dup.summary = t.summary.Clone()
	dup.compression = t.compression
	dup.count = t.count
	dup.rng = t.rng
	return dup
}
// interpolate returns the relative position of x within [x0, x1]:
// 0 at x0, 1 at x1, and values outside [0, 1] when x lies outside the
// range. x0 and x1 must differ since their distance is the divisor.
func interpolate(x, x0, x1 float64) float64 {
	offset := x - x0
	width := x1 - x0
	return offset / width
}
// ForEachCentroid hands every centroid (its mean and count) to f by
// delegating to the summary's ForEach.
//
// Iteration stops when the supplied function returns false, or when all
// centroids have been iterated.
func (t *TDigest) ForEachCentroid(f func(mean float64, count uint64) bool) {
	centroids := t.summary
	centroids.ForEach(f)
}
// findNeighbors scans the centroids from index start onwards and
// returns the half-open index range [first, end) to be considered as
// merge candidates for value.
//
// first is the earliest centroid at the smallest distance from value
// seen during the scan; end is the first centroid after it that is
// strictly farther away, or the number of centroids when the scan runs
// off the end. Centroids at exactly the smallest distance stay inside
// the range.
func (t TDigest) findNeighbors(start int, value float64) (int, int) {
	size := t.summary.Len()
	first := start
	best := math.MaxFloat64

	for i := start; i < size; i++ {
		dist := math.Abs(t.summary.Mean(i) - value)
		switch {
		case dist < best:
			first, best = i, dist
		case dist > best:
			// Distances started growing again: the range ends here.
			return first, i
		}
	}
	return first, size
}
// chooseMergeCandidate picks, among the centroids in [begin, end), one
// that can absorb count more samples and returns its index. When no
// centroid qualifies it returns the number of centroids.
//
// A centroid qualifies when its count plus the incoming count stays
// within 4 * t.count * q * (1 - q) / compression, q being the
// centroid's estimated quantile. The n-th qualifying centroid replaces
// the current pick when t.rng.Float32() < 1/n, so the rng is consulted
// exactly once per qualifying centroid.
func (t TDigest) chooseMergeCandidate(begin, end int, count uint64) int {
	chosen := t.summary.Len()
	seenBefore := t.summary.HeadSum(begin)
	incoming := float64(count)
	var eligible float32

	for i := begin; i != end; i++ {
		weight := float64(t.summary.Count(i))

		// With a single sample there is no span to divide by, so the
		// only centroid sits at the median.
		q := 0.5
		if t.count != 1 {
			q = (seenBefore + (weight-1)/2) / float64(t.count-1)
		}

		limit := 4 * float64(t.count) * q * (1 - q) / t.compression
		if weight+incoming <= limit {
			eligible++
			if t.rng.Float32() < 1/eligible {
				chosen = i
			}
		}
		seenBefore += weight
	}
	return chosen
}
// TrimmedMean returns the mean of the distribution between the two
// percentiles p1 and p2.
//
// Values of p1 and p2 must be between 0 and 1 (inclusive) and p1
// must be less than p2. Will panic otherwise.
//
// Centroids that straddle either boundary contribute only the part of
// their count that falls inside the range. When nothing falls inside
// the range (for instance on an empty digest) the result is 0.
func (t *TDigest) TrimmedMean(p1, p2 float64) float64 {
	switch {
	case p1 < 0 || p1 > 1:
		panic("p1 must be between 0 and 1 (inclusive)")
	case p2 < 0 || p2 > 1:
		panic("p2 must be between 0 and 1 (inclusive)")
	case p1 >= p2:
		panic("p1 must be lower than p2")
	}

	// Translate the percentiles into positions in the cumulative count.
	total := float64(t.count)
	lower, upper := p1*total, p2*total

	var weightedSum, weight, seen float64
	for i, mean := range t.summary.means {
		share := float64(t.summary.counts[i])
		reached := seen + share

		// Entirely below the lower boundary: nothing to contribute.
		if reached <= lower {
			seen = reached
			continue
		}

		if seen < lower {
			// Straddles the lower boundary: keep only the part above it.
			share = reached - lower
		}
		if reached > upper {
			// Straddles the upper boundary: drop the part above it.
			share -= reached - upper
		}

		weightedSum += share * mean
		weight += share

		if reached >= upper {
			break
		}
		seen = reached
	}

	if weight == 0 {
		return 0
	}
	return weightedSum / weight
}
// estimateCapacity returns the capacity to allocate a summary with:
// ten times the compression, after truncating the compression to an
// integer.
func estimateCapacity(compression float64) int {
	const slotsPerCompressionUnit = 10
	whole := int(compression)
	return whole * slotsPerCompressionUnit
}