diff --git a/charts/aws-ebs-csi-driver/values.yaml b/charts/aws-ebs-csi-driver/values.yaml index 45579b8fa..feef610c0 100644 --- a/charts/aws-ebs-csi-driver/values.yaml +++ b/charts/aws-ebs-csi-driver/values.yaml @@ -344,6 +344,7 @@ controller: node: env: [] envFrom: [] + enableMetrics: false kubeletPath: /var/lib/kubelet loggingFormat: text logLevel: 2 diff --git a/cmd/main.go b/cmd/main.go index 9659415af..dde8adc73 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -173,7 +173,7 @@ func main() { region = md.GetRegion() } - cloud, err := cloud.NewCloud(region, options.AwsSdkDebugLog, options.UserAgentExtra, options.Batching) + cloud, err := cloud.NewCloud(region, options.AwsSdkDebugLog, options.UserAgentExtra, options.Batching, options.DeprecatedMetrics) if err != nil { klog.ErrorS(err, "failed to create cloud service") klog.FlushAndExit(klog.ExitFlushTimeout, 1) diff --git a/docs/metrics.md b/docs/metrics.md index f6d8126d7..d59b58c7e 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -8,7 +8,7 @@ $ helm repo add prometheus-community https://prometheus-community.github.io/helm $ helm repo update $ helm install prometheus prometheus-community/kube-prometheus-stack ``` -2. Enable metrics by setting `enableMetrics: true` in [values.yaml](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/charts/aws-ebs-csi-driver/values.yaml). +2. Enable metrics by configuring `controller.enableMetrics` and `node.enableMetrics`. 3. Deploy EBS CSI Driver: ```sh @@ -21,26 +21,24 @@ Installing the Prometheus Operator and enabling metrics will deploy a [Service]( ## AWS API Metrics -The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `enableMetrics: true` has been configured in the Helm chart. +The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `controller.enableMetrics: true` has been configured in the Helm chart. 
The metrics will appear in the following format: ```sh -# HELP cloudprovider_aws_api_request_duration_seconds [ALPHA] Latency of AWS API calls -# TYPE cloudprovider_aws_api_request_duration_seconds histogram -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1 -cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1 -cloudprovider_aws_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574 -cloudprovider_aws_api_request_duration_seconds_count{request="AttachVolume"} 1 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1 +aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1 +aws_ebs_csi_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574 +aws_ebs_csi_api_request_duration_seconds_count{request="AttachVolume"} 1 ... ``` diff --git a/pkg/cloud/cloud.go b/pkg/cloud/cloud.go index a5b86308a..fca0bab15 100644 --- a/pkg/cloud/cloud.go +++ b/pkg/cloud/cloud.go @@ -334,12 +334,12 @@ var _ Cloud = &cloud{} // NewCloud returns a new instance of AWS cloud // It panics if session is invalid. 
-func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batching bool) (Cloud, error) { - c := newEC2Cloud(region, awsSdkDebugLog, userAgentExtra, batching) +func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batching bool, deprecatedMetrics bool) (Cloud, error) { + c := newEC2Cloud(region, awsSdkDebugLog, userAgentExtra, batching, deprecatedMetrics) return c, nil } -func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchingEnabled bool) Cloud { +func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchingEnabled bool, deprecatedMetrics bool) Cloud { cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(region)) if err != nil { panic(err) @@ -358,7 +358,7 @@ func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batc svc := ec2.NewFromConfig(cfg, func(o *ec2.Options) { o.APIOptions = append(o.APIOptions, - RecordRequestsMiddleware(), + RecordRequestsMiddleware(deprecatedMetrics), LogServerErrorsMiddleware(), // This middlware should always be last so it sees an unmangled error ) diff --git a/pkg/cloud/cloud_test.go b/pkg/cloud/cloud_test.go index 3b09e107f..02e111420 100644 --- a/pkg/cloud/cloud_test.go +++ b/pkg/cloud/cloud_test.go @@ -82,11 +82,12 @@ func extractVolumeIdentifiers(volumes []types.Volume) (volumeIDs []string, volum func TestNewCloud(t *testing.T) { testCases := []struct { - name string - region string - awsSdkDebugLog bool - userAgentExtra string - batchingEnabled bool + name string + region string + awsSdkDebugLog bool + userAgentExtra string + batchingEnabled bool + deprecatedMetrics bool }{ { name: "success: with awsSdkDebugLog, userAgentExtra, and batchingEnabled", @@ -107,7 +108,7 @@ func TestNewCloud(t *testing.T) { }, } for _, tc := range testCases { - ec2Cloud, err := NewCloud(tc.region, tc.awsSdkDebugLog, tc.userAgentExtra, tc.batchingEnabled) + ec2Cloud, err := NewCloud(tc.region, tc.awsSdkDebugLog, tc.userAgentExtra, tc.batchingEnabled, tc.deprecatedMetrics) if err != nil { t.Fatalf("error %v", err) } diff --git a/pkg/cloud/handlers.go b/pkg/cloud/handlers.go index 2f725943d..aa853728d 100644 --- a/pkg/cloud/handlers.go +++ b/pkg/cloud/handlers.go @@ -30,7 +30,7 @@ import ( ) // RecordRequestsHandler is added to the Complete chain; called after any request. 
-func RecordRequestsMiddleware() func(*middleware.Stack) error { +func RecordRequestsMiddleware(deprecatedMetrics bool) func(*middleware.Stack) error { return func(stack *middleware.Stack) error { return stack.Finalize.Add(middleware.FinalizeMiddlewareFunc("RecordRequestsMiddleware", func(ctx context.Context, input middleware.FinalizeInput, next middleware.FinalizeHandler) (output middleware.FinalizeOutput, metadata middleware.Metadata, err error) { start := time.Now() @@ -44,14 +44,23 @@ func RecordRequestsMiddleware() func(*middleware.Stack) error { labels = map[string]string{ "operation_name": operationName, } - metrics.Recorder().IncreaseCount("cloudprovider_aws_api_throttled_requests_total", labels) + metrics.Recorder().IncreaseCount("aws_ebs_csi_api_request_throttles_total", labels) + if deprecatedMetrics { + metrics.Recorder().IncreaseCount("cloudprovider_aws_api_throttled_requests_total", labels) + } } else { - metrics.Recorder().IncreaseCount("cloudprovider_aws_api_request_errors", labels) + metrics.Recorder().IncreaseCount("aws_ebs_csi_api_request_errors_total", labels) + if deprecatedMetrics { + metrics.Recorder().IncreaseCount("cloudprovider_aws_api_request_errors", labels) + } } } } else { duration := time.Since(start).Seconds() - metrics.Recorder().ObserveHistogram("cloudprovider_aws_api_request_duration_seconds", duration, labels, nil) + metrics.Recorder().ObserveHistogram("aws_ebs_csi_api_request_duration_seconds", duration, labels, nil) + if deprecatedMetrics { + metrics.Recorder().ObserveHistogram("cloudprovider_aws_api_request_duration_seconds", duration, labels, nil) + } } return output, metadata, err }), middleware.After) diff --git a/pkg/driver/options.go b/pkg/driver/options.go index 9e1ea4d08..9838bd286 100644 --- a/pkg/driver/options.go +++ b/pkg/driver/options.go @@ -67,6 +67,8 @@ type Options struct { // flag to set the timeout for volume modification requests to be coalesced into a single // volume modification call to AWS. ModifyVolumeRequestHandlerTimeout time.Duration + // flag to enable deprecated metrics + DeprecatedMetrics bool // #### Node options ##### @@ -113,6 +115,7 @@ func (o *Options) AddFlags(f *flag.FlagSet) { f.StringVar(&o.UserAgentExtra, "user-agent-extra", "", "Extra string appended to user agent.") f.BoolVar(&o.Batching, "batching", false, "To enable batching of API calls. This is especially helpful for improving performance in workloads that are sensitive to EC2 rate limits.") f.DurationVar(&o.ModifyVolumeRequestHandlerTimeout, "modify-volume-request-handler-timeout", DefaultModifyVolumeRequestHandlerTimeout, "Timeout for the window in which volume modification calls must be received in order for them to coalesce into a single volume modification call to AWS. This must be lower than the csi-resizer and volumemodifier timeouts") + f.BoolVar(&o.DeprecatedMetrics, "deprecated-metrics", true, "DEPRECATED: To enable deprecated metrics. 
This parameter is only for backward compatibility and may be removed in a future release.") } // Node options if o.Mode == AllMode || o.Mode == NodeMode { diff --git a/pkg/driver/options_test.go b/pkg/driver/options_test.go index 0bea2edb8..b16565c71 100644 --- a/pkg/driver/options_test.go +++ b/pkg/driver/options_test.go @@ -52,6 +52,9 @@ func TestAddFlags(t *testing.T) { if err := f.Set("aws-sdk-debug-log", "true"); err != nil { t.Errorf("error setting aws-sdk-debug-log: %v", err) } + if err := f.Set("deprecated-metrics", "true"); err != nil { + t.Errorf("error setting deprecated-metrics: %v", err) + } if err := f.Set("warn-on-invalid-tag", "true"); err != nil { t.Errorf("error setting warn-on-invalid-tag: %v", err) } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 64b8fd708..54dc46468 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -24,6 +24,10 @@ import ( "k8s.io/klog/v2" ) +const ( + namespace = "aws_ebs_csi_" +) + var ( r *metricRecorder // singleton instance of metricRecorder once sync.Once diff --git a/pkg/metrics/nvme.go b/pkg/metrics/nvme.go index 33c992f23..0011690bc 100644 --- a/pkg/metrics/nvme.go +++ b/pkg/metrics/nvme.go @@ -36,6 +36,33 @@ import ( "k8s.io/klog/v2" ) +const ( + // Counter metrics. + metricReadOps = namespace + "read_ops_total" + metricWriteOps = namespace + "write_ops_total" + metricReadBytes = namespace + "read_bytes_total" + metricWriteBytes = namespace + "write_bytes_total" + metricReadOpsSeconds = namespace + "read_seconds_total" + metricWriteOpsSeconds = namespace + "write_seconds_total" + metricExceededIOPS = namespace + "exceeded_iops_seconds_total" + metricExceededTP = namespace + "exceeded_tp_seconds_total" + metricEC2ExceededIOPS = namespace + "ec2_exceeded_iops_seconds_total" + metricEC2ExceededTP = namespace + "ec2_exceeded_tp_seconds_total" + metricCollectorScrapes = namespace + "nvme_collector_scrapes_total" + metricCollectorErrors = namespace + "nvme_collector_errors_total" + + // Gauge metrics. + metricVolumeQueueLength = namespace + "volume_queue_length" + + // Histogram metrics. + metricReadLatency = namespace + "read_io_latency_seconds" + metricWriteLatency = namespace + "write_io_latency_seconds" + metricCollectorDuration = namespace + "nvme_collector_duration_seconds" + + // Conversion factor. + microsecondsInSeconds = 1e6 +) + // EBSMetrics represents the parsed metrics from the NVMe log page. 
type EBSMetrics struct { EBSMagic uint64 @@ -109,38 +136,38 @@ func NewNVMECollector(path, instanceID string) *NVMECollector { return &NVMECollector{ metrics: map[string]*prometheus.Desc{ - "total_read_ops": prometheus.NewDesc("total_read_ops", "Total number of read operations", variableLabels, constLabels), - "total_write_ops": prometheus.NewDesc("total_write_ops", "Total number of write operations", variableLabels, constLabels), - "total_read_bytes": prometheus.NewDesc("total_read_bytes", "Total number of bytes read", variableLabels, constLabels), - "total_write_bytes": prometheus.NewDesc("total_write_bytes", "Total number of bytes written", variableLabels, constLabels), - "total_read_time": prometheus.NewDesc("total_read_time", "Total time spent on read operations (in microseconds)", variableLabels, constLabels), - "total_write_time": prometheus.NewDesc("total_write_time", "Total time spent on write operations (in microseconds)", variableLabels, constLabels), - "ebs_volume_performance_exceeded_iops": prometheus.NewDesc("ebs_volume_performance_exceeded_iops", "Time EBS volume IOPS limit was exceeded (in microseconds)", variableLabels, constLabels), - "ebs_volume_performance_exceeded_tp": prometheus.NewDesc("ebs_volume_performance_exceeded_tp", "Time EBS volume throughput limit was exceeded (in microseconds)", variableLabels, constLabels), - "ec2_instance_ebs_performance_exceeded_iops": prometheus.NewDesc("ec2_instance_ebs_performance_exceeded_iops", "Time EC2 instance EBS IOPS limit was exceeded (in microseconds)", variableLabels, constLabels), - "ec2_instance_ebs_performance_exceeded_tp": prometheus.NewDesc("ec2_instance_ebs_performance_exceeded_tp", "Time EC2 instance EBS throughput limit was exceeded (in microseconds)", variableLabels, constLabels), - "volume_queue_length": prometheus.NewDesc("volume_queue_length", "Current volume queue length", variableLabels, constLabels), - "read_io_latency_histogram": prometheus.NewDesc("read_io_latency_histogram", "Histogram of read I/O latencies (in microseconds)", variableLabels, constLabels), - "write_io_latency_histogram": prometheus.NewDesc("write_io_latency_histogram", "Histogram of write I/O latencies (in microseconds)", variableLabels, constLabels), + metricReadOps: prometheus.NewDesc(metricReadOps, "The total number of completed read operations.", variableLabels, constLabels), + metricWriteOps: prometheus.NewDesc(metricWriteOps, "The total number of completed write operations.", variableLabels, constLabels), + metricReadBytes: prometheus.NewDesc(metricReadBytes, "The total number of read bytes transferred.", variableLabels, constLabels), + metricWriteBytes: prometheus.NewDesc(metricWriteBytes, "The total number of write bytes transferred.", variableLabels, constLabels), + metricReadOpsSeconds: prometheus.NewDesc(metricReadOpsSeconds, "The total time spent, in seconds, by all completed read operations.", variableLabels, constLabels), + metricWriteOpsSeconds: prometheus.NewDesc(metricWriteOpsSeconds, "The total time spent, in seconds, by all completed write operations.", variableLabels, constLabels), + metricExceededIOPS: prometheus.NewDesc(metricExceededIOPS, "The total time, in seconds, that IOPS demand exceeded the volume's provisioned IOPS performance.", variableLabels, constLabels), + metricExceededTP: prometheus.NewDesc(metricExceededTP, "The total time, in seconds, that throughput demand exceeded the volume's provisioned throughput performance.", variableLabels, constLabels), + metricEC2ExceededIOPS: 
prometheus.NewDesc(metricEC2ExceededIOPS, "The total time, in seconds, that the EBS volume exceeded the attached Amazon EC2 instance's maximum IOPS performance.", variableLabels, constLabels), + metricEC2ExceededTP: prometheus.NewDesc(metricEC2ExceededTP, "The total time, in seconds, that the EBS volume exceeded the attached Amazon EC2 instance's maximum throughput performance.", variableLabels, constLabels), + metricVolumeQueueLength: prometheus.NewDesc(metricVolumeQueueLength, "The number of read and write operations waiting to be completed.", variableLabels, constLabels), + metricReadLatency: prometheus.NewDesc(metricReadLatency, "The number of read operations completed within each latency bin, in seconds.", variableLabels, constLabels), + metricWriteLatency: prometheus.NewDesc(metricWriteLatency, "The number of write operations completed within each latency bin, in seconds.", variableLabels, constLabels), }, // Clean CSI mount point path to normalize path // Add trailing slash back that Clean prunes csiMountPointPath: filepath.Clean(path) + "/", instanceID: instanceID, collectionDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "nvme_collector_duration_seconds", - Help: "Histogram of NVMe collector duration in seconds", + Name: metricCollectorDuration, + Help: "Histogram of NVMe collector scrape duration in seconds.", Buckets: []float64{0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, ConstLabels: constLabels, }), scrapesTotal: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "nvme_collector_scrapes_total", - Help: "Total number of NVMe collector scrapes", + Name: metricCollectorScrapes, + Help: "Total number of NVMe collector scrapes.", ConstLabels: constLabels, }), scrapeErrorsTotal: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "nvme_collector_scrape_errors_total", - Help: "Total number of NVMe collector scrape errors", + Name: metricCollectorErrors, + Help: "Total number of NVMe collector scrape errors.", ConstLabels: constLabels, }), } @@ -207,22 +234,22 @@ func (c *NVMECollector) Collect(ch chan<- prometheus.Metric) { } // Send all collected metrics to Prometheus - ch <- prometheus.MustNewConstMetric(c.metrics["total_read_ops"], prometheus.CounterValue, float64(metrics.ReadOps), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["total_write_ops"], prometheus.CounterValue, float64(metrics.WriteOps), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["total_read_bytes"], prometheus.CounterValue, float64(metrics.ReadBytes), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["total_write_bytes"], prometheus.CounterValue, float64(metrics.WriteBytes), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["total_read_time"], prometheus.CounterValue, float64(metrics.TotalReadTime), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["total_write_time"], prometheus.CounterValue, float64(metrics.TotalWriteTime), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["ebs_volume_performance_exceeded_iops"], prometheus.CounterValue, float64(metrics.EBSIOPSExceeded), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["ebs_volume_performance_exceeded_tp"], prometheus.CounterValue, float64(metrics.EBSThroughputExceeded), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["ec2_instance_ebs_performance_exceeded_iops"], prometheus.CounterValue, float64(metrics.EC2IOPSExceeded), volumeID) - ch <- prometheus.MustNewConstMetric(c.metrics["ec2_instance_ebs_performance_exceeded_tp"], 
prometheus.CounterValue, float64(metrics.EC2ThroughputExceeded), volumeID)
-	ch <- prometheus.MustNewConstMetric(c.metrics["volume_queue_length"], prometheus.GaugeValue, float64(metrics.QueueLength), volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricReadOps], prometheus.CounterValue, float64(metrics.ReadOps), volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteOps], prometheus.CounterValue, float64(metrics.WriteOps), volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricReadBytes], prometheus.CounterValue, float64(metrics.ReadBytes), volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteBytes], prometheus.CounterValue, float64(metrics.WriteBytes), volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricReadOpsSeconds], prometheus.CounterValue, float64(metrics.TotalReadTime)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteOpsSeconds], prometheus.CounterValue, float64(metrics.TotalWriteTime)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededIOPS], prometheus.CounterValue, float64(metrics.EBSIOPSExceeded)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededTP], prometheus.CounterValue, float64(metrics.EBSThroughputExceeded)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededIOPS], prometheus.CounterValue, float64(metrics.EC2IOPSExceeded)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededTP], prometheus.CounterValue, float64(metrics.EC2ThroughputExceeded)/microsecondsInSeconds, volumeID)
+	ch <- prometheus.MustNewConstMetric(c.metrics[metricVolumeQueueLength], prometheus.GaugeValue, float64(metrics.QueueLength), volumeID)
 	// Read Latency Histogram
 	readCount, readBuckets := convertHistogram(metrics.ReadLatency)
 	ch <- prometheus.MustNewConstHistogram(
-		c.metrics["read_io_latency_histogram"],
+		c.metrics[metricReadLatency],
 		readCount,
 		0,
 		readBuckets,
@@ -232,7 +259,7 @@ func (c *NVMECollector) Collect(ch chan<- prometheus.Metric) {
 	// Write Latency Histogram
 	writeCount, writeBuckets := convertHistogram(metrics.WriteLatency)
 	ch <- prometheus.MustNewConstHistogram(
-		c.metrics["write_io_latency_histogram"],
+		c.metrics[metricWriteLatency],
 		writeCount,
 		0,
 		writeBuckets,
@@ -248,7 +275,7 @@ func convertHistogram(hist Histogram) (uint64, map[float64]uint64) {
 	for i := uint64(0); i < hist.BinCount && i < 64; i++ {
 		count += hist.Bins[i].Count
-		buckets[float64(hist.Bins[i].Upper)] = count
+		buckets[float64(hist.Bins[i].Upper)/microsecondsInSeconds] = count
 	}
 	return count, buckets
diff --git a/pkg/metrics/nvme_test.go b/pkg/metrics/nvme_test.go
index a32a175a7..2e2c9ae01 100644
--- a/pkg/metrics/nvme_test.go
+++ b/pkg/metrics/nvme_test.go
@@ -46,19 +46,19 @@ func TestNewNVMECollector(t *testing.T) {
 	}
 	expectedMetrics := []string{
-		"total_read_ops",
-		"total_write_ops",
-		"total_read_bytes",
-		"total_write_bytes",
-		"total_read_time",
-		"total_write_time",
-		"ebs_volume_performance_exceeded_iops",
-		"ebs_volume_performance_exceeded_tp",
-		"ec2_instance_ebs_performance_exceeded_iops",
-		"ec2_instance_ebs_performance_exceeded_tp",
-		"volume_queue_length",
-		"read_io_latency_histogram",
-		"write_io_latency_histogram",
+		metricReadOps,
+		metricWriteOps,
+		metricReadBytes,
+		metricWriteBytes,
+		metricReadOpsSeconds,
+		metricWriteOpsSeconds,
+		metricExceededIOPS,
+		metricExceededTP,
+		metricEC2ExceededIOPS,
+		metricEC2ExceededTP,
+ metricVolumeQueueLength, + metricReadLatency, + metricWriteLatency, } for _, metricName := range expectedMetrics { @@ -87,9 +87,9 @@ func TestConvertHistogram(t *testing.T) { }, wantCount: 10, wantBuckets: map[float64]uint64{ - 100: 5, - 200: 8, - 300: 10, + 100 / 1e6: 5, + 200 / 1e6: 8, + 300 / 1e6: 10, }, }, } diff --git a/tests/e2e/dynamic_provisioning.go b/tests/e2e/dynamic_provisioning.go index bf17eb25d..398a83a7c 100644 --- a/tests/e2e/dynamic_provisioning.go +++ b/tests/e2e/dynamic_provisioning.go @@ -621,7 +621,7 @@ var _ = Describe("[ebs-csi-e2e] [single-az] Dynamic Provisioning", func() { availabilityZones := strings.Split(os.Getenv(awsAvailabilityZonesEnv), ",") availabilityZone := availabilityZones[rand.Intn(len(availabilityZones))] region := availabilityZone[0 : len(availabilityZone)-1] - cloud, err := awscloud.NewCloud(region, false, "", true) + cloud, err := awscloud.NewCloud(region, false, "", true, false) if err != nil { Fail(fmt.Sprintf("could not get NewCloud: %v", err)) } diff --git a/tests/e2e/nvme_metrics.go b/tests/e2e/nvme_metrics.go index 397c13d89..0de341ee8 100644 --- a/tests/e2e/nvme_metrics.go +++ b/tests/e2e/nvme_metrics.go @@ -114,19 +114,22 @@ var _ = Describe("[ebs-csi-e2e] [single-az] NVMe Metrics", func() { By("Verifying NVMe metrics") expectedMetrics := []string{ - "total_read_ops", - "total_write_ops", - "total_read_bytes", - "total_write_bytes", - "total_read_time", - "total_write_time", - "ebs_volume_performance_exceeded_iops", - "ebs_volume_performance_exceeded_tp", - "ec2_instance_ebs_performance_exceeded_iops", - "ec2_instance_ebs_performance_exceeded_tp", - "volume_queue_length", - "read_io_latency_histogram", - "write_io_latency_histogram", + "aws_ebs_csi_read_ops_total", + "aws_ebs_csi_write_ops_total", + "aws_ebs_csi_read_bytes_total", + "aws_ebs_csi_write_bytes_total", + "aws_ebs_csi_read_seconds_total", + "aws_ebs_csi_write_seconds_total", + "aws_ebs_csi_exceeded_iops_seconds_total", + "aws_ebs_csi_exceeded_tp_seconds_total", + "aws_ebs_csi_ec2_exceeded_iops_seconds_total", + "aws_ebs_csi_ec2_exceeded_tp_seconds_total", + "aws_ebs_csi_nvme_collector_scrapes_total", + "aws_ebs_csi_nvme_collector_errors_total", + "aws_ebs_csi_volume_queue_length", + "aws_ebs_csi_read_io_latency_seconds", + "aws_ebs_csi_write_io_latency_seconds", + "aws_ebs_csi_nvme_collector_duration_seconds", } for _, metric := range expectedMetrics { diff --git a/tests/e2e/pre_provsioning.go b/tests/e2e/pre_provsioning.go index a95e97456..e102fad3d 100644 --- a/tests/e2e/pre_provsioning.go +++ b/tests/e2e/pre_provsioning.go @@ -87,7 +87,7 @@ var _ = Describe("[ebs-csi-e2e] [single-az] Pre-Provisioned", func() { Tags: map[string]string{awscloud.VolumeNameTagKey: dummyVolumeName, awscloud.AwsEbsDriverTagKey: "true"}, } var err error - cloud, err = awscloud.NewCloud(region, false, "", true) + cloud, err = awscloud.NewCloud(region, false, "", true, false) if err != nil { Fail(fmt.Sprintf("could not get NewCloud: %v", err)) } @@ -260,7 +260,7 @@ var _ = Describe("[ebs-csi-e2e] [single-az] Pre-Provisioned with Multi-Attach", Tags: map[string]string{awscloud.VolumeNameTagKey: dummyVolumeName, awscloud.AwsEbsDriverTagKey: "true"}, } var err error - cloud, err = awscloud.NewCloud(region, false, "", true) + cloud, err = awscloud.NewCloud(region, false, "", true, false) if err != nil { Fail(fmt.Sprintf("could not get NewCloud: %v", err)) }
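
The trickiest part of this change is the unit conversion: the NVMe log page reports latency histogram bin upper bounds in microseconds, and `convertHistogram` now divides each bound by `microsecondsInSeconds` (1e6) so the renamed `aws_ebs_csi_*_io_latency_seconds` metrics expose second-based buckets while the per-bin counts stay raw operation totals. Below is a minimal, hypothetical sketch (not part of this patch) of how such converted buckets feed `prometheus.MustNewConstHistogram`; the `bin` struct, volume ID, and sample values are made up for illustration, while the `client_golang` calls are the same ones the collector uses.

```go
// Hypothetical, self-contained sketch; not part of the driver code.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// Same conversion factor the collector uses: microseconds per second.
const microsecondsInSeconds = 1e6

// bin loosely mirrors one NVMe latency histogram bin (made-up shape for illustration).
type bin struct {
	Upper uint64 // upper bound in microseconds, as reported by the device
	Count uint64 // operations that completed within this bin
}

func main() {
	// Made-up sample data: 5 ops under 100us, 3 more under 200us, 2 more under 300us.
	bins := []bin{{Upper: 100, Count: 5}, {Upper: 200, Count: 3}, {Upper: 300, Count: 2}}

	// Build cumulative counts keyed by second-based upper bounds,
	// the map shape prometheus.MustNewConstHistogram expects.
	buckets := make(map[float64]uint64)
	var total uint64
	for _, b := range bins {
		total += b.Count
		buckets[float64(b.Upper)/microsecondsInSeconds] = total
	}

	desc := prometheus.NewDesc(
		"aws_ebs_csi_read_io_latency_seconds",
		"The number of read operations completed within each latency bin, in seconds.",
		[]string{"volume_id"}, nil,
	)

	// Sum is 0 because the NVMe log page does not expose a latency sum;
	// the collector passes 0 for the same reason.
	h := prometheus.MustNewConstHistogram(desc, total, 0, buckets, "vol-0123456789abcdef0")

	// Print the resulting buckets: le=0.0001, 0.0002, 0.0003 seconds.
	var m dto.Metric
	if err := h.Write(&m); err != nil {
		panic(err)
	}
	for _, b := range m.GetHistogram().GetBucket() {
		fmt.Printf("le=%g cumulative=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
	}
}
```

The key point is that only the bucket boundaries change units; dividing the counts as well would understate the histogram, which is why the patch divides `Bins[i].Upper` but leaves `Count` untouched.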