From 4ef733826ed831d2b5f5d3b92f2ee25d24760665 Mon Sep 17 00:00:00 2001
From: Ian Anderson <ianderson@yugabyte.com>
Date: Thu, 29 Aug 2024 13:41:54 -0400
Subject: [PATCH] promdump: improve logging to make collection failures more
 obvious

Add a dumpSuccessful boolean to the promExport struct to track whether the dump of each metric succeeded or failed. The boolean is set to false before we attempt to collect the metric, and set to true only once all of its batches have been dumped successfully. Note that a dump failure for a custom metric is always fatal; we may wish to change this behaviour, but that is a topic for another commit.

Once we have finished processing all the metrics, we count up the number of skipped, successful, and failed dumps and write a summary line.

If no dumps were successful, this is fatal.

If any dumps failed, a warning is reported twice: once right after the summary line, so that it is included in the log, and once at the very end of the output for user visibility.
---
 promdump/promdump.go | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/promdump/promdump.go b/promdump/promdump.go
index 50f50e0..0ee9f88 100644
--- a/promdump/promdump.go
+++ b/promdump/promdump.go
@@ -49,6 +49,7 @@ type promExport struct {
 	collect            bool
 	changedFromDefault bool
 	requiresNodePrefix bool
+	dumpSuccessful     bool
 	fileCount          uint
 }
 
@@ -1188,6 +1189,8 @@ func main() {
 	// Loop through yb metrics list and export each metric according to its configuration
 	for _, v := range collectMetrics {
 		if v.collect {
+			// Mark the metric as not successfully dumped yet
+			v.dumpSuccessful = false
 			metricName, err := getMetricName(v)
 			if err != nil {
 				logger.Fatalf("main: %v", err)
@@ -1225,6 +1228,8 @@ func main() {
 				logger.Printf("exportMetric: export of metric %v failed with error %v; moving to next metric", metricName, err)
 				continue
 			}
+			// If we reach this point, the dump of all batches for this metric was successful
+			v.dumpSuccessful = true
 		}
 	}
 	if *metric != "" {
@@ -1235,6 +1240,28 @@ func main() {
 	}
 	logger.Println("main: Finished with Prometheus connection")
 
+	// TODO: Put this in a func?
+	skippedMetrics := 0
+	successfulMetrics := 0
+	failedMetrics := 0
+	for _, v := range collectMetrics {
+		if v.collect {
+			if v.dumpSuccessful {
+				successfulMetrics += 1
+			} else {
+				failedMetrics += 1
+			}
+		} else {
+			skippedMetrics += 1
+		}
+	}
+	logger.Printf("main: summary: %v metrics processed (skipped: %v dumped: %v failed: %v)", len(collectMetrics), skippedMetrics, successfulMetrics, failedMetrics)
+	if successfulMetrics < 1 {
+		logger.Fatalf("main: no metrics were dumped successfully; aborting")
+	} else if failedMetrics > 0 {
+		logger.Println("Warning: one or more metric exports failed; dump is incomplete")
+	}
+
 	if *enableTar {
 		tarFileOut, err := os.Create(*tarFilename)
 		if err != nil {
@@ -1274,4 +1301,10 @@ func main() {
 
 		logger.Printf("main: finished creating metrics bundle '%s'", *tarFilename)
 	}
+	if failedMetrics > 0 {
+		// Yes, this is logged twice but it's important! It's logged the first time so it shows up unambiguously
+		// in the log that's included in the tarball and the second time so it's the last line of the output where
+		// customers and support are likely to see it.
+		logger.Println("Warning: one or more metric exports failed; dump is incomplete")
+	}
 }