agent: Decrease LFC metrics fetch frequency from 1/15hz -> 1/60hz #1187

Open · wants to merge 2 commits into main

6 changes: 4 additions & 2 deletions autoscaler-agent/config_map.yaml
@@ -48,12 +48,14 @@ data:
"system": {
"port": 9100,
"requestTimeoutSeconds": 2,
"secondsBetweenRequests": 5
"secondsBetweenRequests": 5,
"firstRequestWithinSeconds": 5
},
"lfc": {
"port": 9499,
"requestTimeoutSeconds": 5,
"secondsBetweenRequests": 15
"secondsBetweenRequests": 60,
"firstRequestWithinSeconds": 15
}
},
"scheduler": {
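
For a sense of scale, here is a tiny, self-contained sketch (illustrative only, not part of this PR's changes) of what the new values mean: raising secondsBetweenRequests from 15 to 60 drops the LFC scrape rate from 1/15 Hz to 1/60 Hz, i.e. from 240 to 60 requests per VM per hour, while firstRequestWithinSeconds: 15 keeps the first LFC fetch inside the old 15-second window.

package main

import "fmt"

func main() {
	// Old vs. new LFC polling period, taken from the config_map.yaml change above.
	oldPeriod, newPeriod := 15.0, 60.0 // secondsBetweenRequests

	fmt.Printf("old rate: %.4f Hz\n", 1/oldPeriod) // ~0.0667 Hz (1/15)
	fmt.Printf("new rate: %.4f Hz\n", 1/newPeriod) // ~0.0167 Hz (1/60)
	fmt.Printf("requests per VM per hour: %.0f -> %.0f\n", 3600/oldPeriod, 3600/newPeriod) // 240 -> 60
}
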
6 changes: 6 additions & 0 deletions pkg/agent/config.go
@@ -98,6 +98,11 @@ type MetricsSourceConfig struct {
 	RequestTimeoutSeconds uint `json:"requestTimeoutSeconds"`
 	// SecondsBetweenRequests sets the number of seconds to wait between metrics requests
 	SecondsBetweenRequests uint `json:"secondsBetweenRequests"`
+	// FirstRequestWithinSeconds sets the maximum number of seconds to wait for the first request.
+	//
+	// This is useful for cases where we don't want to fetch metrics very frequently, but we don't
+	// want to wait too long to get them, because they're required for decision-making.
+	FirstRequestWithinSeconds uint `json:"firstRequestWithinSeconds"`
 }

 // SchedulerConfig defines a few parameters for scheduler requests
@@ -200,6 +205,7 @@ func (c *Config) validate() error {
 		erc.Whenf(ec, cfg.Port == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.port", key))
 		erc.Whenf(ec, cfg.RequestTimeoutSeconds == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.requestTimeoutSeconds", key))
 		erc.Whenf(ec, cfg.SecondsBetweenRequests == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.secondsBetweenRequests", key))
+		erc.Whenf(ec, cfg.FirstRequestWithinSeconds == 0, zeroTmpl, fmt.Sprintf(".metrics.%s.firstRequestWithinSeconds", key))
 	}
 	validateMetricsConfig(c.Metrics.System, "system")
 	validateMetricsConfig(c.Metrics.LFC, "lfc")
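
As a rough illustration of how the new field is consumed, here is a minimal, self-contained sketch. It is not the agent's actual wiring: the real validate() accumulates errors through the erc.Whenf helpers shown above, and the simplified check below stands in for that.

package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors MetricsSourceConfig from pkg/agent/config.go, trimmed to the fields shown in this diff.
type MetricsSourceConfig struct {
	Port                      uint `json:"port"`
	RequestTimeoutSeconds     uint `json:"requestTimeoutSeconds"`
	SecondsBetweenRequests    uint `json:"secondsBetweenRequests"`
	FirstRequestWithinSeconds uint `json:"firstRequestWithinSeconds"`
}

// validate is a simplified stand-in for the zero-value checks in Config.validate():
// every field, including the new firstRequestWithinSeconds, must be non-zero.
func validate(cfg MetricsSourceConfig, key string) error {
	if cfg.FirstRequestWithinSeconds == 0 {
		return fmt.Errorf(".metrics.%s.firstRequestWithinSeconds must not be zero", key)
	}
	return nil
}

func main() {
	// The "lfc" block from the updated ConfigMap above.
	raw := `{"port": 9499, "requestTimeoutSeconds": 5, "secondsBetweenRequests": 60, "firstRequestWithinSeconds": 15}`

	var cfg MetricsSourceConfig
	if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
		panic(err)
	}
	fmt.Println(validate(cfg, "lfc")) // <nil>: the new field is set, so the check passes
}

Without firstRequestWithinSeconds in the ConfigMap, the field would decode to zero and validation would reject the config, which is why both the system and lfc blocks gain the new key.
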
45 changes: 31 additions & 14 deletions pkg/agent/runner.go
@@ -414,27 +414,43 @@ func getMetricsLoop[M core.FromPrometheus](
 ) {
 	waitBetweenDuration := time.Second * time.Duration(config.SecondsBetweenRequests)

-	randomStartWait := util.NewTimeRange(time.Second, 0, int(config.SecondsBetweenRequests)).Random()
-
-	lastActive := mgr.isActive()
-
-	// Don't log anything if we're not making this type of metrics request currently.
+	// Pick a random offset for our long-running periodic fetches. This is to load-balance against
+	// cases where many VMs start at once, or immediately after the autoscaler-agent starts.
 	//
-	// The idea is that isActive() can/should be used for gradual rollout of new metrics, and we
-	// don't want to log every time we *don't* do the new thing.
-	if lastActive {
-		logger.Info(
-			fmt.Sprintf("Sleeping for random delay before making first %s metrics request", mgr.kind),
-			zap.Duration("delay", randomStartWait),
-		)
+	// However, the FIRST request must be done within config.FirstRequestWithinSeconds, so we ALSO
+	// pick a random offset within that value, and use whichever offset comes first for the initial request.
+	firstWait := util.NewTimeRange(time.Second, 0, int(config.FirstRequestWithinSeconds)).Random()
+	secondWait := util.NewTimeRange(time.Second, 0, int(config.SecondsBetweenRequests)).Random()
+
+	if firstWait > secondWait {
+		firstWait = secondWait
+		secondWait += waitBetweenDuration // wait a full SecondsBetweenRequests; firstWait is now close enough.
 	}

+	// Long-running timer to help manage the complication around "first request quickly, all others
+	// slower, on a different period".
+	//
+	// Every time we receive from this timer, we make a metrics request. But before that, we
+	// IMMEDIATELY reset it to the next time at which we should fetch metrics. There are basically two
+	// reasons to do that:
+	//
+	// 1. It makes the initial handling simpler
+	// 2. It means that temporary network delays won't unbalance the random distribution by
+	//    causing their timings to align.
+	//
+	// However, it's worth noting that (2) means there's no graceful degradation under load -- we
+	// keep making requests at the same intervals, even if they start taking longer.
+	timer := time.NewTimer(firstWait)
+	defer timer.Stop()
+
 	select {
 	case <-ctx.Done():
 		return
-	case <-time.After(randomStartWait):
+	case <-timer.C:
+		timer.Reset(secondWait - firstWait)
 	}

+	lastActive := mgr.isActive()
 	for {
 		if !mgr.isActive() {
 			if lastActive {
@@ -463,7 +479,8 @@ func getMetricsLoop[M core.FromPrometheus](
 		select {
 		case <-ctx.Done():
 			return
-		case <-time.After(waitBetweenDuration):
+		case <-timer.C:
+			timer.Reset(waitBetweenDuration)
 		}
 	}
 }
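
To make the timing logic above easier to follow outside the diff, here is a small, self-contained sketch of the same scheme. It is illustrative only: fetchLoop, the fetch callback, and the scaled-down durations are made up for this example, and math/rand stands in for the agent's util.NewTimeRange(...).Random() helper. The idea is that the first fetch lands within firstRequestWithinSeconds, later fetches happen once per secondsBetweenRequests, and both get a random per-VM phase.

package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"
)

// fetchLoop mimics the timing in getMetricsLoop: the first fetch happens within firstWithin,
// every later fetch happens once per period, and both get a random offset so that many VMs
// starting at once don't all fetch at the same instant.
func fetchLoop(ctx context.Context, firstWithin, period time.Duration, fetch func()) {
	firstWait := time.Duration(rand.Int63n(int64(firstWithin)))
	secondWait := time.Duration(rand.Int63n(int64(period)))

	if firstWait > secondWait {
		// The steady-state offset came sooner than the first-request deadline requires;
		// make the first fetch at that earlier point and push the second out by a full period.
		firstWait = secondWait
		secondWait += period
	}

	timer := time.NewTimer(firstWait)
	defer timer.Stop()

	// First fetch: within firstWithin. Reset the timer immediately so that delays in fetch()
	// don't shift the schedule.
	select {
	case <-ctx.Done():
		return
	case <-timer.C:
		timer.Reset(secondWait - firstWait)
	}
	fetch()

	// Steady state: one fetch per period.
	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			timer.Reset(period)
		}
		fetch()
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	start := time.Now()
	// Durations scaled down 100x (the agent's seconds become milliseconds) just to see the pattern.
	fetchLoop(ctx, 150*time.Millisecond, 600*time.Millisecond, func() {
		fmt.Println("fetch at", time.Since(start).Round(time.Millisecond))
	})
}

With the values from the ConfigMap change (firstRequestWithinSeconds: 15, secondsBetweenRequests: 60), a new VM's first LFC sample still arrives within about 15 seconds, while steady-state sampling drops to once per minute with a random per-VM offset.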