From 35c7c74a9a6408d74a3397d94092f1c2004c0548 Mon Sep 17 00:00:00 2001 From: Hengzhe Date: Thu, 24 Oct 2024 09:12:16 +0900 Subject: [PATCH] [YUNIKORN-2908] metrics not removed when a queue is removed --- pkg/metrics/init.go | 9 +++++++++ pkg/metrics/queue.go | 15 +++++++++++++++ pkg/metrics/queue_test.go | 15 +++++++++++++++ pkg/scheduler/objects/queue.go | 5 +++++ 4 files changed, 44 insertions(+) diff --git a/pkg/metrics/init.go b/pkg/metrics/init.go index ce4780145..ec370abf3 100644 --- a/pkg/metrics/init.go +++ b/pkg/metrics/init.go @@ -84,6 +84,15 @@ func GetQueueMetrics(name string) *QueueMetrics { return queueMetrics } +func RemoveQueueMetrics(name string) { + m.lock.Lock() + defer m.lock.Unlock() + if metrics, ok := m.queues[name]; ok { + metrics.UnregisterMetrics() + delete(m.queues, name) + } +} + func GetEventMetrics() *EventMetrics { return m.event } diff --git a/pkg/metrics/queue.go b/pkg/metrics/queue.go index fc1181646..d8917b850 100644 --- a/pkg/metrics/queue.go +++ b/pkg/metrics/queue.go @@ -132,6 +132,21 @@ func InitQueueMetrics(name string) *QueueMetrics { return q } +func (m *QueueMetrics) UnregisterMetrics() { + var queueMetricsList = []prometheus.Collector{ + m.appMetricsLabel, + m.appMetricsSubsystem, + m.containerMetrics, + m.resourceMetricsLabel, + m.resourceMetricsSubsystem, + } + + // Unregister the metrics + for _, metric := range queueMetricsList { + prometheus.Unregister(metric) + } +} + func (m *QueueMetrics) incQueueApplications(state string) { m.appMetricsLabel.WithLabelValues(state).Inc() m.appMetricsSubsystem.WithLabelValues(state).Inc() diff --git a/pkg/metrics/queue_test.go b/pkg/metrics/queue_test.go index 9a9678645..282f346bc 100644 --- a/pkg/metrics/queue_test.go +++ b/pkg/metrics/queue_test.go @@ -246,6 +246,21 @@ func TestQueuePreemptingResourceMetrics(t *testing.T) { verifyResourceMetrics(t, "preempting", "cpu") } +func TestRemoveQueueMetrics(t *testing.T) { + qm = getQueueMetrics() + defer unregisterQueueMetrics() + + qm.SetQueueAllocatedResourceMetrics("cpu", 1) + verifyResourceMetrics(t, "allocated", "cpu") + + qm.UnregisterMetrics() + metrics, _ := prometheus.DefaultGatherer.Gather() + for _, metric := range metrics { + assert.Assert(t, metric.GetName() != "yunikorn_queue_resource", + "Metric is not cleaned up") + } +} + func getQueueMetrics() *QueueMetrics { return InitQueueMetrics("root.test") } diff --git a/pkg/scheduler/objects/queue.go b/pkg/scheduler/objects/queue.go index 6ac9dc16e..5b62ca5c1 100644 --- a/pkg/scheduler/objects/queue.go +++ b/pkg/scheduler/objects/queue.go @@ -986,6 +986,7 @@ func (sq *Queue) RemoveQueue() bool { return false } log.Log(log.SchedQueue).Info("removing queue", zap.String("queue", sq.QueuePath)) + sq.removeMetrics() // root is always managed and is the only queue with a nil parent: no need to guard sq.parent.removeChildQueue(sq.Name) sq.queueEvents.SendRemoveQueueEvent(sq.QueuePath, sq.isManaged) @@ -1696,6 +1697,10 @@ func (sq *Queue) updatePreemptingResourceMetrics() { } } +func (sq *Queue) removeMetrics() { + metrics.RemoveQueueMetrics(sq.QueuePath) +} + func (sq *Queue) String() string { sq.RLock() defer sq.RUnlock()