Skip to content

Commit

Permalink
Add metric to measure rule group load time (#5609)
Browse files Browse the repository at this point in the history
Signed-off-by: Anand Rajagopal <[email protected]>
  • Loading branch information
rajagopalanand authored Oct 20, 2023
1 parent d321d46 commit fc61482
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## master / unreleased
* [CHANGE] Ruler: Add `cortex_ruler_rule_group_load_duration_seconds` and `cortex_ruler_rule_group_sync_duration_seconds` metrics. #5609
* [CHANGE] Ruler: Add contextual info and query statistics to log
* [FEATURE] Ruler: Add support for disabling rule groups. #5521
* [FEATURE] Added the flag `-alertmanager.alerts-gc-interval` to configure alert manager alerts Garbage collection interval. #5550
Expand Down
45 changes: 39 additions & 6 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,10 @@ type Ruler struct {
// Pool of clients used to connect to other ruler replicas.
clientsPool ClientsPool

ringCheckErrors prometheus.Counter
rulerSync *prometheus.CounterVec
ringCheckErrors prometheus.Counter
rulerSync *prometheus.CounterVec
ruleGroupStoreLoadDuration prometheus.Gauge
ruleGroupSyncDuration prometheus.Gauge

allowedTenants *util.AllowedTenants

Expand Down Expand Up @@ -288,6 +290,16 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer,
Name: "cortex_ruler_sync_rules_total",
Help: "Total number of times the ruler sync operation triggered.",
}, []string{"reason"}),

ruleGroupStoreLoadDuration: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "cortex_ruler_rule_group_load_duration_seconds",
Help: "Time taken to load rule groups from storage",
}),

ruleGroupSyncDuration: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "cortex_ruler_rule_group_sync_duration_seconds",
Help: "The duration in seconds required to sync and load rule groups from storage.",
}),
}

if len(cfg.EnabledTenants) > 0 {
Expand Down Expand Up @@ -512,20 +524,41 @@ func (r *Ruler) run(ctx context.Context) error {
func (r *Ruler) syncRules(ctx context.Context, reason string) {
level.Debug(r.logger).Log("msg", "syncing rules", "reason", reason)
r.rulerSync.WithLabelValues(reason).Inc()
timer := prometheus.NewTimer(nil)

defer func() {
ruleGroupSyncDuration := timer.ObserveDuration().Seconds()
r.ruleGroupSyncDuration.Set(ruleGroupSyncDuration)
}()

loadedConfigs, err := r.loadRuleGroups(ctx)
if err != nil {
return
}

// This will also delete local group files for users that are no longer in 'configs' map.
r.manager.SyncRuleGroups(ctx, loadedConfigs)
}

func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
timer := prometheus.NewTimer(nil)

defer func() {
storeLoadSeconds := timer.ObserveDuration().Seconds()
r.ruleGroupStoreLoadDuration.Set(storeLoadSeconds)
}()

configs, err := r.listRules(ctx)
if err != nil {
level.Error(r.logger).Log("msg", "unable to list rules", "err", err)
return
return nil, err
}

loadedConfigs, err := r.store.LoadRuleGroups(ctx, configs)
if err != nil {
level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(configs)-len(loadedConfigs), "err", err)
}

// This will also delete local group files for users that are no longer in 'configs' map.
r.manager.SyncRuleGroups(ctx, loadedConfigs)
return loadedConfigs, nil
}

func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGroupList, err error) {
Expand Down

0 comments on commit fc61482

Please sign in to comment.