-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add reservoir timers #171
base: master
Are you sure you want to change the base?
add reservoir timers #171
Changes from 13 commits
72bc838
f67aca4
5cc34d9
f7fc55e
4a0662a
f35471d
57ef42b
4610f55
cc908b5
b1a2def
8eb942d
5dd8757
0d3fb45
ea5ae6a
e81d603
74a26a1
6d2687c
d067744
7e5a451
a54db1a
18c0e57
bf0ef63
8dad5ed
9762152
2641924
858a3fd
4e9611d
70cc61c
b352a4f
ef8cf0b
8cceead
4cae9ac
7fe2893
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ type Sink interface { | |
FlushCounter(name string, value uint64) | ||
FlushGauge(name string, value uint64) | ||
FlushTimer(name string, value float64) | ||
FlushAggregatedTimer(name string, value, sampleRate float64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because of this we'll need a new major release. The library usage should be backwards compatible, and additionally there is a feature flag to control the new behaviour, however if anything is implementing this interface it'll break. |
||
} | ||
|
||
// FlushableSink is an extension of Sink that provides a Flush() function that | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -214,7 +214,10 @@ type StatGenerator interface { | |
// NewStore returns an Empty store that flushes to Sink passed as an argument. | ||
// Note: the export argument is unused. | ||
func NewStore(sink Sink, _ bool) Store { | ||
return &statStore{sink: sink} | ||
return &statStore{ | ||
sink: sink, | ||
conf: GetSettings(), // todo: right now the environment is being loaded in multiple places and can be made more efficient | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The sync.OnceValue function can help eliminate these multiple loads. |
||
} | ||
} | ||
|
||
// NewDefaultStore returns a Store with a TCP statsd sink, and a running flush timer. | ||
|
@@ -298,30 +301,95 @@ func (c *gauge) Value() uint64 { | |
return atomic.LoadUint64(&c.value) | ||
} | ||
|
||
type timer struct { | ||
type timer interface { | ||
time(time.Duration) | ||
AddDuration(time.Duration) | ||
AddValue(float64) | ||
AllocateSpan() Timespan | ||
CollectedValue() []float64 | ||
SampleRate() float64 | ||
} | ||
|
||
type standardTimer struct { | ||
base time.Duration | ||
name string | ||
sink Sink | ||
} | ||
|
||
func (t *timer) time(dur time.Duration) { | ||
func (t *standardTimer) time(dur time.Duration) { | ||
t.AddDuration(dur) | ||
} | ||
|
||
func (t *timer) AddDuration(dur time.Duration) { | ||
func (t *standardTimer) AddDuration(dur time.Duration) { | ||
t.AddValue(float64(dur / t.base)) | ||
} | ||
|
||
func (t *timer) AddValue(value float64) { | ||
func (t *standardTimer) AddValue(value float64) { | ||
t.sink.FlushTimer(t.name, value) | ||
} | ||
|
||
func (t *timer) AllocateSpan() Timespan { | ||
func (t *standardTimer) AllocateSpan() Timespan { | ||
return &timespan{timer: t, start: time.Now()} | ||
} | ||
|
||
func (t *standardTimer) CollectedValue() []float64 { | ||
return nil // since we flush right away nothing will be collected | ||
} | ||
|
||
func (t *standardTimer) SampleRate() float64 { | ||
return 1.0 // metrics which are not sampled have an implicit sample rate 1.0 | ||
} | ||
|
||
type reservoirTimer struct { | ||
base time.Duration | ||
name string | ||
capacity int | ||
values []float64 | ||
count int | ||
mu sync.Mutex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. super nit, make the Mutex the first field (it's advantageous performance wise to make the most accessed field first) |
||
} | ||
|
||
func (t *reservoirTimer) time(dur time.Duration) { | ||
t.AddDuration(dur) | ||
} | ||
|
||
func (t *reservoirTimer) AddDuration(dur time.Duration) { | ||
t.AddValue(float64(dur / t.base)) | ||
} | ||
|
||
func (t *reservoirTimer) AddValue(value float64) { | ||
t.mu.Lock() | ||
defer t.mu.Unlock() | ||
|
||
if t.count < t.capacity { | ||
t.values = append(t.values, value) | ||
} else { | ||
t.values = append(t.values[1:], value) // discard the oldest value when the reservoir is full, this can probably be smarter | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Treating this more as a ring buffer when at capacity would be better since it would provide a more accurate representation of the most recent timings (whereas here we only update the first timing): diff --git a/stats.go b/stats.go
index 70a582b..99a0a0a 100644
--- a/stats.go
+++ b/stats.go
@@ -346,6 +346,7 @@ type reservoirTimer struct {
capacity int
values []float64
count int
+ off int
mu sync.Mutex
}
@@ -364,7 +365,8 @@ func (t *reservoirTimer) AddValue(value float64) {
if t.count < t.capacity {
t.values = append(t.values, value)
} else {
- t.values = append(t.values[1:], value) // discard the oldest value when the reservoir is full, this can probably be smarter
+ t.values[t.off%len(t.values)] = value
+ t.off++
}
t.count++ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, making the capacity a power of two would let the ring-buffer index be computed with a cheap bitmask instead of a modulo, e.g.: func nextPowerOfTwo(capacity int) uint {
return 1 << bits.Len(uint(capacity))
} |
||
} | ||
|
||
t.count++ | ||
} | ||
|
||
func (t *reservoirTimer) AllocateSpan() Timespan { | ||
return &timespan{timer: t, start: time.Now()} | ||
} | ||
|
||
func (t *reservoirTimer) CollectedValue() []float64 { | ||
t.mu.Lock() | ||
defer t.mu.Unlock() | ||
|
||
// return a copy of the values slice to avoid data races | ||
values := make([]float64, len(t.values)) | ||
copy(values, t.values) | ||
return values | ||
} | ||
|
||
func (t *reservoirTimer) SampleRate() float64 { | ||
return float64(len(t.values)) / float64(t.count) | ||
} | ||
|
||
type timespan struct { | ||
timer *timer | ||
timer timer | ||
start time.Time | ||
} | ||
|
||
|
@@ -336,6 +404,7 @@ func (ts *timespan) CompleteWithDuration(value time.Duration) { | |
} | ||
|
||
type statStore struct { | ||
// todo: no idea how memory is managed here, when are the map entries ever deleted? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The assumption is that stat names are stable over the lifetime of a process, but this is a known issue for processes that incorrectly use this library and cause a cardinality explosion of stat names. See:
|
||
counters sync.Map | ||
gauges sync.Map | ||
timers sync.Map | ||
|
@@ -344,6 +413,8 @@ type statStore struct { | |
statGenerators []StatGenerator | ||
|
||
sink Sink | ||
|
||
conf Settings | ||
} | ||
|
||
var ReservedTagWords = map[string]bool{"asg": true, "az": true, "backend": true, "canary": true, "host": true, "period": true, "region": true, "shard": true, "window": true, "source": true, "project": true, "facet": true, "envoyservice": true} | ||
|
@@ -380,6 +451,8 @@ func (s *statStore) Flush() { | |
} | ||
s.mu.RUnlock() | ||
|
||
// todo: if we're not deleting the data we flush from these maps, won't we just keep resending them? | ||
|
||
s.counters.Range(func(key, v interface{}) bool { | ||
// do not flush counters that are set to zero | ||
if value := v.(*counter).latch(); value != 0 { | ||
|
@@ -393,6 +466,18 @@ func (s *statStore) Flush() { | |
return true | ||
}) | ||
|
||
s.timers.Range(func(key, v interface{}) bool { | ||
if timer, ok := v.(*reservoirTimer); ok { | ||
sampleRate := timer.SampleRate() | ||
for _, value := range timer.CollectedValue() { | ||
s.sink.FlushAggregatedTimer(key.(string), value, sampleRate) | ||
} | ||
s.timers.Delete(key) // delete it from the map so it's not flushed again | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deletion is expensive and since we assume the names of stats are stable will just lead to it being re-added to the map at a later time. For counters we only flush non-zero values and I think we could do something similar with the |
||
} | ||
|
||
return true | ||
}) | ||
|
||
flushableSink, ok := s.sink.(FlushableSink) | ||
if ok { | ||
flushableSink.Flush() | ||
|
@@ -490,14 +575,32 @@ func (s *statStore) NewPerInstanceGauge(name string, tags map[string]string) Gau | |
return s.newGaugeWithTagSet(name, tagspkg.TagSet(nil).MergePerInstanceTags(tags)) | ||
} | ||
|
||
func (s *statStore) newTimer(serializedName string, base time.Duration) *timer { | ||
func (s *statStore) newTimer(serializedName string, base time.Duration) timer { | ||
if v, ok := s.timers.Load(serializedName); ok { | ||
return v.(*timer) | ||
return v.(timer) | ||
} | ||
t := &timer{name: serializedName, sink: s.sink, base: base} | ||
|
||
var t timer | ||
if s.conf.isTimerReservoirEnabled() { | ||
t = &reservoirTimer{ | ||
name: serializedName, | ||
base: base, | ||
capacity: s.conf.TimerReservoirSize, | ||
values: make([]float64, 0, s.conf.TimerReservoirSize), | ||
count: 0, | ||
} | ||
} else { | ||
t = &standardTimer{ | ||
name: serializedName, | ||
sink: s.sink, | ||
base: base, | ||
} | ||
} | ||
|
||
if v, loaded := s.timers.LoadOrStore(serializedName, t); loaded { | ||
return v.(*timer) | ||
return v.(timer) | ||
} | ||
|
||
return t | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For performance reasons it would be great to avoid
fmt.Sprintf
here if possible.