Skip to content

Commit

Permalink
Merge pull request #2248 from torredil/update-metrics-85123
Browse files Browse the repository at this point in the history
Update metrics to align with Prometheus best practices
  • Loading branch information
k8s-ci-robot authored Dec 6, 2024
2 parents 39822b5 + 212f289 commit 064f186
Show file tree
Hide file tree
Showing 14 changed files with 147 additions and 98 deletions.
1 change: 1 addition & 0 deletions charts/aws-ebs-csi-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ controller:
node:
env: []
envFrom: []
enableMetrics: false
kubeletPath: /var/lib/kubelet
loggingFormat: text
logLevel: 2
Expand Down
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ func main() {
region = md.GetRegion()
}

cloud, err := cloud.NewCloud(region, options.AwsSdkDebugLog, options.UserAgentExtra, options.Batching)
cloud, err := cloud.NewCloud(region, options.AwsSdkDebugLog, options.UserAgentExtra, options.Batching, options.DeprecatedMetrics)
if err != nil {
klog.ErrorS(err, "failed to create cloud service")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
Expand Down
34 changes: 16 additions & 18 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ $ helm repo add prometheus-community https://prometheus-community.github.io/helm
$ helm repo update
$ helm install prometheus prometheus-community/kube-prometheus-stack
```
2. Enable metrics by setting `enableMetrics: true` in [values.yaml](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/charts/aws-ebs-csi-driver/values.yaml).
2. Enable metrics by configuring `controller.enableMetrics` and `node.enableMetrics`.

3. Deploy EBS CSI Driver:
```sh
Expand All @@ -21,26 +21,24 @@ Installing the Prometheus Operator and enabling metrics will deploy a [Service](

## AWS API Metrics

The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `enableMetrics: true` has been configured in the Helm chart.
The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `controller.enableMetrics: true` has been configured in the Helm chart.

The metrics will appear in the following format:
```sh
# HELP cloudprovider_aws_api_request_duration_seconds [ALPHA] Latency of AWS API calls
# TYPE cloudprovider_aws_api_request_duration_seconds histogram
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1
cloudprovider_aws_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574
cloudprovider_aws_api_request_duration_seconds_count{request="AttachVolume"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1
aws_ebs_csi_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574
aws_ebs_csi_api_request_duration_seconds_count{request="AttachVolume"} 1
...
```

Expand Down
8 changes: 4 additions & 4 deletions pkg/cloud/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,12 @@ var _ Cloud = &cloud{}

// NewCloud returns a new instance of AWS cloud
// It panics if session is invalid.
func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batching bool) (Cloud, error) {
c := newEC2Cloud(region, awsSdkDebugLog, userAgentExtra, batching)
func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batching bool, deprecatedMetrics bool) (Cloud, error) {
c := newEC2Cloud(region, awsSdkDebugLog, userAgentExtra, batching, deprecatedMetrics)
return c, nil
}

func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchingEnabled bool) Cloud {
func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchingEnabled bool, deprecatedMetrics bool) Cloud {
cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(region))
if err != nil {
panic(err)
Expand All @@ -358,7 +358,7 @@ func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batc

svc := ec2.NewFromConfig(cfg, func(o *ec2.Options) {
o.APIOptions = append(o.APIOptions,
RecordRequestsMiddleware(),
RecordRequestsMiddleware(deprecatedMetrics),
LogServerErrorsMiddleware(), // This middlware should always be last so it sees an unmangled error
)

Expand Down
13 changes: 7 additions & 6 deletions pkg/cloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,12 @@ func extractVolumeIdentifiers(volumes []types.Volume) (volumeIDs []string, volum

func TestNewCloud(t *testing.T) {
testCases := []struct {
name string
region string
awsSdkDebugLog bool
userAgentExtra string
batchingEnabled bool
name string
region string
awsSdkDebugLog bool
userAgentExtra string
batchingEnabled bool
deprecatedMetrics bool
}{
{
name: "success: with awsSdkDebugLog, userAgentExtra, and batchingEnabled",
Expand All @@ -107,7 +108,7 @@ func TestNewCloud(t *testing.T) {
},
}
for _, tc := range testCases {
ec2Cloud, err := NewCloud(tc.region, tc.awsSdkDebugLog, tc.userAgentExtra, tc.batchingEnabled)
ec2Cloud, err := NewCloud(tc.region, tc.awsSdkDebugLog, tc.userAgentExtra, tc.batchingEnabled, tc.deprecatedMetrics)
if err != nil {
t.Fatalf("error %v", err)
}
Expand Down
17 changes: 13 additions & 4 deletions pkg/cloud/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
)

// RecordRequestsHandler is added to the Complete chain; called after any request.
func RecordRequestsMiddleware() func(*middleware.Stack) error {
func RecordRequestsMiddleware(deprecatedMetrics bool) func(*middleware.Stack) error {
return func(stack *middleware.Stack) error {
return stack.Finalize.Add(middleware.FinalizeMiddlewareFunc("RecordRequestsMiddleware", func(ctx context.Context, input middleware.FinalizeInput, next middleware.FinalizeHandler) (output middleware.FinalizeOutput, metadata middleware.Metadata, err error) {
start := time.Now()
Expand All @@ -44,14 +44,23 @@ func RecordRequestsMiddleware() func(*middleware.Stack) error {
labels = map[string]string{
"operation_name": operationName,
}
metrics.Recorder().IncreaseCount("cloudprovider_aws_api_throttled_requests_total", labels)
metrics.Recorder().IncreaseCount("aws_ebs_csi_api_request_throttles_total", labels)
if deprecatedMetrics {
metrics.Recorder().IncreaseCount("cloudprovider_aws_api_throttled_requests_total", labels)
}
} else {
metrics.Recorder().IncreaseCount("cloudprovider_aws_api_request_errors", labels)
metrics.Recorder().IncreaseCount("aws_ebs_csi_api_request_errors_total", labels)
if deprecatedMetrics {
metrics.Recorder().IncreaseCount("cloudprovider_aws_api_request_errors", labels)
}
}
}
} else {
duration := time.Since(start).Seconds()
metrics.Recorder().ObserveHistogram("cloudprovider_aws_api_request_duration_seconds", duration, labels, nil)
metrics.Recorder().ObserveHistogram("aws_ebs_csi_api_request_duration_seconds", duration, labels, nil)
if deprecatedMetrics {
metrics.Recorder().ObserveHistogram("cloudprovider_aws_api_request_duration_seconds", duration, labels, nil)
}
}
return output, metadata, err
}), middleware.After)
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ type Options struct {
// flag to set the timeout for volume modification requests to be coalesced into a single
// volume modification call to AWS.
ModifyVolumeRequestHandlerTimeout time.Duration
// flag to enable deprecated metrics
DeprecatedMetrics bool

// #### Node options #####

Expand Down Expand Up @@ -113,6 +115,7 @@ func (o *Options) AddFlags(f *flag.FlagSet) {
f.StringVar(&o.UserAgentExtra, "user-agent-extra", "", "Extra string appended to user agent.")
f.BoolVar(&o.Batching, "batching", false, "To enable batching of API calls. This is especially helpful for improving performance in workloads that are sensitive to EC2 rate limits.")
f.DurationVar(&o.ModifyVolumeRequestHandlerTimeout, "modify-volume-request-handler-timeout", DefaultModifyVolumeRequestHandlerTimeout, "Timeout for the window in which volume modification calls must be received in order for them to coalesce into a single volume modification call to AWS. This must be lower than the csi-resizer and volumemodifier timeouts")
f.BoolVar(&o.DeprecatedMetrics, "deprecated-metrics", true, "DEPRECATED: To enable deprecated metrics. This parameter is only for backward compatibility and may be removed in a future release.")
}
// Node options
if o.Mode == AllMode || o.Mode == NodeMode {
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ func TestAddFlags(t *testing.T) {
if err := f.Set("aws-sdk-debug-log", "true"); err != nil {
t.Errorf("error setting aws-sdk-debug-log: %v", err)
}
if err := f.Set("deprecated-metrics", "true"); err != nil {
t.Errorf("error setting deprecated-metrics: %v", err)
}
if err := f.Set("warn-on-invalid-tag", "true"); err != nil {
t.Errorf("error setting warn-on-invalid-tag: %v", err)
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ import (
"k8s.io/klog/v2"
)

const (
namespace = "aws_ebs_csi_"
)

var (
r *metricRecorder // singleton instance of metricRecorder
once sync.Once
Expand Down
Loading

0 comments on commit 064f186

Please sign in to comment.