From 28cc1284cffae842509680ca868ebd10afa2647e Mon Sep 17 00:00:00 2001
From: Drew Sirenko <68304519+AndrewSirenko@users.noreply.github.com>
Date: Fri, 8 Dec 2023 12:52:33 -0500
Subject: [PATCH] Add attachment limit scripts to hack/cluster-debugging-scripts

---
 hack/cluster-debugging-scripts/README.md      |  55 +++++++
 .../device_slot_test.tmpl                     |  51 ++++++
 .../find-attachment-limit                     | 155 ++++++++++++++++++
 .../generate_example_manifest.go              |  87 ++++++++++
 .../get-attachment-breakdown                  | 123 ++++++++++++++
 hack/cluster-debugging-scripts/go.mod         |   3 +
 6 files changed, 474 insertions(+)
 create mode 100644 hack/cluster-debugging-scripts/README.md
 create mode 100644 hack/cluster-debugging-scripts/device_slot_test.tmpl
 create mode 100755 hack/cluster-debugging-scripts/find-attachment-limit
 create mode 100644 hack/cluster-debugging-scripts/generate_example_manifest.go
 create mode 100755 hack/cluster-debugging-scripts/get-attachment-breakdown
 create mode 100644 hack/cluster-debugging-scripts/go.mod

diff --git a/hack/cluster-debugging-scripts/README.md b/hack/cluster-debugging-scripts/README.md
new file mode 100644
index 000000000..b4c0033bd
--- /dev/null
+++ b/hack/cluster-debugging-scripts/README.md
@@ -0,0 +1,55 @@
+# Cluster Debugging Scripts
+
+This folder contains a collection of scripts to help debug clusters.
+
+## FAQ
+
+### How can I validate that the aws-ebs-csi-driver correctly makes use of all available attachment slots for instance type X?
+
+Answer: Perform the following steps (which will create a nodegroup, count the Block Device Mappings + ENI for the underlying instance, deploy pods with EBS PVs until the script finds the maximum amount of volumes the aws-ebs-csi-driver can attach to the instance)
+
+```
+export CLUSTER_NAME="devcluster"
+export NODEGROUP_NAME="ng-attachment-limit-test"
+export INSTANCE_TYPE="m7g.large"
+
+eksctl create nodegroup -c "$CLUSTER_NAME" --nodes 1 -t "$INSTANCE_TYPE" -n "$NODEGROUP_NAME"
+
+./get-attachment-breakdown "$NODEGROUP_NAME"
+
+eksctl delete nodegroup -c "$CLUSTER_NAME" -n "$NODEGROUP_NAME"
+```
+
+By the end of the script, you should see an output similar to this:
+```
+Attachments for ng-f3ecdf71
+BlockDeviceMappings  ENIs  Available-Attachment-Slots(Validated-by-aws-ebs-csi-driver)
+1                    2     25
+```
+
+## Scripts
+
+get-attachment-breakdown: Find the maximum amount of volumes that can be attached to a specified nodegroup node. Additionally, log how many BlockDeviceMappings and ENIs are attached to the instance of the specified nodegroup.
+
+Examples
+```
+./get-attachment-breakdown "ng-f3ecdf71"
+MIN_VOLUME_GUESS=20 MAX_VOLUME_GUESS=40 POD_TIMEOUT_SECONDS=120 ./get-attachment-breakdown "ng-f3ecdf71"
+```
+
+find-attachment-limit: Find the maximum amount of volumes the aws-ebs-csi-driver can attach to a node.
+
+Examples:
+```
+./find-attachment-limit 'some.node.affinity.key:value'
+./find-attachment-limit 'eks.amazonaws.com/nodegroup:test-nodegroup'
+./find-attachment-limit 'node.kubernetes.io/instance-type:m5.large'
+MIN_VOLUME_GUESS=12 MAX_VOLUME_GUESS=30 POD_TIMEOUT_SECONDS=60 ./find-attachment-limit 'node.kubernetes.io/instance-type:m5.large'
+```
+
+generate_example_manifest.go: Generate a yaml file containing a pod associated with a specified amount of PVCs based off of the template file `device_slot_test.tmpl`
+
+Example:
+```
+go run "generate_example_manifest.go" --node-affinity "some.label:value" --volume-count "22" --test-pod-name "test-pod"
+```
diff --git a/hack/cluster-debugging-scripts/device_slot_test.tmpl b/hack/cluster-debugging-scripts/device_slot_test.tmpl
new file mode 100644
index 000000000..53ca40573
--- /dev/null
+++ b/hack/cluster-debugging-scripts/device_slot_test.tmpl
@@ -0,0 +1,51 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: {{ .PodName }}
+spec:
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+            - key: {{ .NodeAffinityKey }}
+              operator: In
+              values:
+                - {{ .NodeAffinityValue }}
+  containers:
+  - name: device-limit-tester-{{ len .Volumes }}-volumes
+    image: centos
+    command: ["/bin/sh"]
+    args: ["-c", "while true; do echo hello; sleep 10;done"]
+    volumeMounts:
+{{- range $index, $value := .Volumes }}
+    - name: persistent-storage-{{ $index }}
+      mountPath: /data-{{ $index }}
+{{- end }}
+  volumes:
+{{- range $index, $value := .Volumes }}
+  - name: persistent-storage-{{ $index }}
+    persistentVolumeClaim:
+      claimName: ebs-claim-{{ $index }}
+{{- end }}
+---
+{{- range $index, $value := .Volumes }}
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ebs-claim-{{ $index }}
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: ebs-sc
+  resources:
+    requests:
+      storage: 4Gi
+---
+{{- end }}
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: ebs-sc
+provisioner: ebs.csi.aws.com
+volumeBindingMode: WaitForFirstConsumer
diff --git a/hack/cluster-debugging-scripts/find-attachment-limit b/hack/cluster-debugging-scripts/find-attachment-limit
new file mode 100755
index 000000000..b85788999
--- /dev/null
+++ b/hack/cluster-debugging-scripts/find-attachment-limit
@@ -0,0 +1,155 @@
+#!/bin/bash
+# Copyright 2023 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---
+#
+# Binary-search for the largest number of EBS-backed PVCs that one pod can
+# mount on a node selected by NODE_AFFINITY ('key:value').
+# The result is logged to stderr, printed alone on stdout and, for backward
+# compatibility, also used as the exit status (which wraps modulo 256).
+
+set -euo pipefail # Exit on any error
+
+# --- Environment Variables
+export MIN_VOLUME_GUESS=${MIN_VOLUME_GUESS:=0}
+export MAX_VOLUME_GUESS=${MAX_VOLUME_GUESS:=130}
+export POD_TIMEOUT_SECONDS=${POD_TIMEOUT_SECONDS:=90}
+export EXTRA_LOGS_FILEPATH=${EXTRA_LOGS_FILEPATH:="/dev/null"}
+
+export TEST_POD_NAME=${TEST_POD_NAME:="attachment-limit-test-pod"}
+
+export SCRIPT_PATH ROOT_DIRECTORY GENERATE_MANIFEST_SCRIPT_FILEPATH
+SCRIPT_PATH=$(dirname "$(realpath "$0")")
+ROOT_DIRECTORY="$SCRIPT_PATH/../.."
+GENERATE_MANIFEST_SCRIPT_FILEPATH="$ROOT_DIRECTORY/hack/cluster-debugging-scripts/generate_example_manifest.go"
+
+# --- Script Tools
+# Write a timestamped INFO line to stderr so stdout stays free for the result.
+log() {
+  printf "%s [INFO] - %s\n" "$(date +"%Y-%m-%d %H:%M:%S")" "${*}" >&2
+}
+
+# Exit with status 1 when a required CLI is not on PATH.
+check_dependencies() {
+  # 'local readonly deps=...' would also declare a variable named 'readonly'.
+  local -r dependencies=("kubectl" "go")
+  local cmd
+
+  for cmd in "${dependencies[@]}"; do
+    if ! command -v "${cmd}" &>/dev/null; then
+      log "${cmd} could not be found, please install it."
+      exit 1
+    fi
+  done
+}
+
+# --- Script
+# Print usage to stdout and exit with status 1.
+usage () {
+  echo "Usage: $0 [NODE_AFFINITY]"
+  echo "Examples:"
+  echo "$0 'eks.amazonaws.com/nodegroup:test-nodegroup'"
+  echo "$0 'node.kubernetes.io/instance-type:m5.large'"
+  echo "You can also override the following environment variable defaults: MIN_VOLUME_GUESS=0 MAX_VOLUME_GUESS=130 POD_TIMEOUT_SECONDS=90 EXTRA_LOGS_FILEPATH='/dev/null'"
+  echo "MIN_VOLUME_GUESS=12 MAX_VOLUME_GUESS=30 POD_TIMEOUT_SECONDS=60 $0 'node.kubernetes.io/instance-type:m5.large'"
+  exit 1
+}
+
+# Require exactly one argument: the 'key:value' node label to schedule onto.
+parse_args () {
+  # Confirm 1 parameter
+  [[ $# -ne 1 ]] && usage
+
+  export NODE_AFFINITY_KEY_VALUE_PAIR=$1
+}
+
+# Delete the objects created from the current manifest, then the manifest file.
+# Does nothing when no manifest file exists, so it is safe as an EXIT trap.
+cleanup() {
+  [[ -n "${MANIFEST_FILE:-}" && -f "$MANIFEST_FILE" ]] || return 0
+
+  local status=0
+  log "Deleting k8s objects associated with manifest $MANIFEST_FILE"
+  kubectl delete -f "$MANIFEST_FILE" >> "$EXTRA_LOGS_FILEPATH" 2>&1 || status=$?
+  # Remove the temp file even if the delete failed, but still report the failure.
+  rm -f -- "$MANIFEST_FILE"
+  return "$status"
+}
+
+# Deploy a pod with $1 PVCs. Sets WAS_POD_CREATED to 0 if the pod became ready
+# within POD_TIMEOUT_SECONDS, otherwise to the status of 'kubectl wait'.
+deploy_manifest() {
+  local -r volume_count=$1
+  log "Attempting to deploy pod with $volume_count PVCs on node with label '$NODE_AFFINITY_KEY_VALUE_PAIR'"
+
+  # Create pod manifest for this guess. The generator opens its template relative
+  # to the working directory, so run it from the directory that contains both.
+  MANIFEST_FILE=$(mktemp)
+  (
+    cd "$(dirname "$GENERATE_MANIFEST_SCRIPT_FILEPATH")" &&
+      go run "$GENERATE_MANIFEST_SCRIPT_FILEPATH" --node-affinity "$NODE_AFFINITY_KEY_VALUE_PAIR" --volume-count "$volume_count" --test-pod-name "$TEST_POD_NAME"
+  ) > "$MANIFEST_FILE"
+
+  # Deploy pod to node. Append so earlier output in EXTRA_LOGS_FILEPATH is kept.
+  log "Creating k8s objects associated with manifest $MANIFEST_FILE"
+  kubectl create -f "$MANIFEST_FILE" >> "$EXTRA_LOGS_FILEPATH"
+
+  # Watch for success vs error code
+  log "Waiting $POD_TIMEOUT_SECONDS seconds for 'pod/$TEST_POD_NAME' to reach condition 'ready'"
+  WAS_POD_CREATED=0
+  kubectl wait --for=condition=ready --timeout="${POD_TIMEOUT_SECONDS}s" pod/"$TEST_POD_NAME" >> "$EXTRA_LOGS_FILEPATH" 2>&1 || WAS_POD_CREATED=$?
+  if [[ $WAS_POD_CREATED == 0 ]]; then
+    log "Pod with $volume_count PVCs successfully deployed"
+  else
+    log "Pod with $volume_count PVCs did not successfully deploy"
+  fi
+
+  cleanup
+}
+
+# Binary-search between MIN_VOLUME_GUESS and MAX_VOLUME_GUESS and report the result.
+main() {
+  check_dependencies
+
+  parse_args "$@"
+
+  export WAS_POD_CREATED=0 # 0 is true in bash
+  export MANIFEST_FILE
+  trap 'cleanup' EXIT
+
+  local min=$MIN_VOLUME_GUESS
+  local max=$MAX_VOLUME_GUESS
+  local current_volume_count
+
+  while (( min < max )); do
+    # Compute the mean between min and max, rounded up to the superior unit
+    current_volume_count=$(( (min + max + 1 ) / 2 ))
+    deploy_manifest "$current_volume_count"
+    if [[ $WAS_POD_CREATED == 0 ]]; then # 0 is True in bash
+      min=$current_volume_count
+    else
+      max=$((current_volume_count - 1))
+    fi
+  done
+
+  export MAX_ATTACHED_VOLUMES="$min"
+  log "Success!"
+  log "Maximum amount of volumes deployed with pod on node with label '$NODE_AFFINITY_KEY_VALUE_PAIR': $MAX_ATTACHED_VOLUMES"
+  trap - EXIT
+  # Machine-readable result for callers; an exit status cannot exceed 255.
+  printf '%s\n' "$MAX_ATTACHED_VOLUMES"
+  return "$MAX_ATTACHED_VOLUMES"
+}
+
+main "$@"
diff --git a/hack/cluster-debugging-scripts/generate_example_manifest.go b/hack/cluster-debugging-scripts/generate_example_manifest.go
new file mode 100644
index 000000000..9b5f09575
--- /dev/null
+++ b/hack/cluster-debugging-scripts/generate_example_manifest.go
@@ -0,0 +1,87 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Command generate_example_manifest writes to stdout a manifest rendered from
+// device_slot_test.tmpl, which is looked up in the current working directory.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+	"text/template"
+)
+
+// Manifest holds the values substituted into the template. Only the indices
+// and the length of Volumes are used, one element per PersistentVolumeClaim.
+type Manifest struct {
+	NodeAffinityKey   string
+	NodeAffinityValue string
+	PodName           string
+	Volumes           []int
+}
+
+func main() {
+	// Parse Command-Line args & flags
+	nodeAffinityPtr := flag.String("node-affinity", "", "node affinity for pod in form of 'key:value'")
+	podNamePtr := flag.String("test-pod-name", "test-pod", "name for pod used in manifest. Default is 'test-pod'")
+	volumeCountPtr := flag.Int("volume-count", 2, "amount of Volumes to provision")
+	flag.Parse()
+
+	// make panics on a negative length; fail with a readable message instead.
+	if *volumeCountPtr < 0 {
+		log.Fatalf("flag '--volume-count' must not be negative, got %d", *volumeCountPtr)
+	}
+
+	nodeAffinityKey, nodeAffinityValue, err := parseNodeAffinityFlag(nodeAffinityPtr)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	manifest := Manifest{
+		NodeAffinityKey:   nodeAffinityKey,
+		NodeAffinityValue: nodeAffinityValue,
+		PodName:           *podNamePtr,
+		Volumes:           make([]int, *volumeCountPtr),
+	}
+
+	// Generate manifest to stdout from template file
+	var tmplFile = "device_slot_test.tmpl"
+	tmpl, err := template.New(tmplFile).ParseFiles(tmplFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	err = tmpl.Execute(os.Stdout, manifest)
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+
+// parseNodeAffinityFlag splits a 'key:value' flag value into key and value.
+// An empty flag yields two empty strings; any other shape is an error.
+func parseNodeAffinityFlag(nodeAffinityPtr *string) (string, string, error) {
+	nodeAffinityKey := ""
+	nodeAffinityValue := ""
+	if len(*nodeAffinityPtr) > 0 {
+		nodeAffinity := strings.Split(*nodeAffinityPtr, ":")
+		if len(nodeAffinity) != 2 {
+			return "", "", fmt.Errorf("flag '--node-affinity' must take the form 'key:value'")
+		}
+		nodeAffinityKey = nodeAffinity[0]
+		nodeAffinityValue = nodeAffinity[1]
+	}
+	return nodeAffinityKey, nodeAffinityValue, nil
+}
diff --git a/hack/cluster-debugging-scripts/get-attachment-breakdown b/hack/cluster-debugging-scripts/get-attachment-breakdown
new file mode 100755
index 000000000..73c5aa744
--- /dev/null
+++ b/hack/cluster-debugging-scripts/get-attachment-breakdown
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Copyright 2023 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---
+#
+# Report how many BlockDeviceMappings and ENIs are attached to the instance of
+# NODEGROUP_NAME, and how many more EBS volumes the aws-ebs-csi-driver can
+# attach to it (measured by running find-attachment-limit).
+
+set -euo pipefail # Exit on any error
+
+# --- Environment Variables
+export MIN_VOLUME_GUESS=${MIN_VOLUME_GUESS:=0}
+export MAX_VOLUME_GUESS=${MAX_VOLUME_GUESS:=130}
+export POD_TIMEOUT_SECONDS=${POD_TIMEOUT_SECONDS:=120}
+export TEST_POD_NAME=${TEST_POD_NAME:="attachment-limit-test-pod"}
+
+export SCRIPT_PATH ROOT_DIRECTORY FIND_ATTACHMENT_LIMIT_FILEPATH
+SCRIPT_PATH=$(dirname "$(realpath "$0")")
+ROOT_DIRECTORY="$SCRIPT_PATH/../.."
+FIND_ATTACHMENT_LIMIT_FILEPATH="$ROOT_DIRECTORY/hack/cluster-debugging-scripts/find-attachment-limit"
+
+# --- Script Tools
+# Color codes for different text colors
+RED='\033[0;31m'    # Red color for errors
+YELLOW='\033[0;33m' # Yellow color for warnings
+NC='\033[0m'        # No color (to reset the text color)
+
+# Colors are passed as %b arguments so they are never part of the printf format.
+log_warning() {
+  printf '%b%s [WARNING] - %s\n%b' "$YELLOW" "$(date +"%Y-%m-%d %H:%M:%S")" "${*}" "$NC" >&2
+}
+
+log_error() {
+  printf '%b%s [ERROR] - %s\n%b' "$RED" "$(date +"%Y-%m-%d %H:%M:%S")" "${*}" "$NC" >&2
+}
+
+log() {
+  printf "%s [INFO] - %s\n" "$(date +"%Y-%m-%d %H:%M:%S")" "${*}" >&2
+}
+
+# Exit with status 1 when a required CLI is not on PATH.
+check_dependencies() {
+  # 'local readonly deps=...' would also declare a variable named 'readonly'.
+  local -r dependencies=("kubectl" "go" "aws" "column")
+  local cmd
+
+  for cmd in "${dependencies[@]}"; do
+    if ! command -v "${cmd}" &>/dev/null; then
+      log "${cmd} could not be found, please install it."
+      exit 1
+    fi
+  done
+}
+
+# --- Script
+# Print usage to stdout and exit with status 1.
+usage () {
+  echo "Usage: $0 [NODEGROUP_NAME]"
+  echo "Ex: $0 'test-nodegroup'"
+  echo "You can also override the following environment variable defaults: MIN_VOLUME_GUESS=0 MAX_VOLUME_GUESS=130 POD_TIMEOUT_SECONDS=120 EXTRA_LOGS_FILEPATH='/dev/null'"
+  echo "MIN_VOLUME_GUESS=12 MAX_VOLUME_GUESS=30 POD_TIMEOUT_SECONDS=60 EXTRA_LOGS_FILEPATH='/dev/null' $0 'test-nodegroup'"
+  exit 1
+}
+
+# Require exactly one argument: the nodegroup name.
+parse_args () {
+  # Confirm 1 parameter
+  [[ $# -ne 1 ]] && usage
+
+  export NODEGROUP_NAME=$1
+}
+
+main() {
+  check_dependencies
+
+  parse_args "$@"
+
+  # Only consider running instances so stopped or terminated ones are neither counted nor inspected.
+  local -r filters=("Name=tag:Name,Values=*$NODEGROUP_NAME*" "Name=instance-state-name,Values=running")
+  local num_nodes_with_same_nodegroup_name block_device_mappings enis max_additional_volumes
+
+  # Flatten with [] so that instances are counted rather than reservations.
+  num_nodes_with_same_nodegroup_name=$(aws ec2 describe-instances --filters "${filters[@]}" --query 'length(Reservations[].Instances[])' --output text)
+  if [[ $num_nodes_with_same_nodegroup_name -eq 0 ]]; then
+    log_error "No running instance found for nodegroup name $NODEGROUP_NAME."
+    exit 1
+  fi
+  [[ $num_nodes_with_same_nodegroup_name -ne 1 ]] && log_warning "There are $num_nodes_with_same_nodegroup_name instances with the same nodegroup name. This script may provide inaccurate numbers."
+
+  log "Currently the instance associated with nodegroup name $NODEGROUP_NAME has the following attachments:"
+  block_device_mappings=$(aws ec2 describe-instances --filters "${filters[@]}" --query 'length(Reservations[0].Instances[0].BlockDeviceMappings)' --output text)
+  log "$block_device_mappings volumes from Block Device Mappings are attached to the instance. (Including the instance's root volume)"
+
+  enis=$(aws ec2 describe-instances --filters "${filters[@]}" --query 'length(Reservations[0].Instances[0].NetworkInterfaces)' --output text)
+  log "$enis Elastic Network Interfaces (ENIs) are attached to the instance. (NOTE: These ENIs may not count towards volume limit for certain Nitro System instance types. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html)"
+
+  log "Checking how many additional EBS volumes are able to be attached via the aws-ebs-csi-driver. This may take a while..."
+  # Read the count from stdout: the exit status alone cannot tell a failed run
+  # (status 1) from "1 volume". The status is the count, hence '|| true'.
+  max_additional_volumes=$("$FIND_ATTACHMENT_LIMIT_FILEPATH" "eks.amazonaws.com/nodegroup:$NODEGROUP_NAME") || true
+  if [[ ! $max_additional_volumes =~ ^[0-9]+$ ]]; then
+    log_error "find-attachment-limit did not report a volume count."
+    exit 1
+  fi
+  log "$max_additional_volumes volumes are able to be attached to the instance."
+
+  echo "Attachments for $NODEGROUP_NAME"
+  printf "BlockDeviceMappings ENIs Available-Attachment-Slots(Validated-by-aws-ebs-csi-driver)\n%s %s %s\n" "$block_device_mappings" "$enis" "$max_additional_volumes" | column -t
+}
+
+main "$@"
diff --git a/hack/cluster-debugging-scripts/go.mod b/hack/cluster-debugging-scripts/go.mod
new file mode 100644
index 000000000..3e12cb300
--- /dev/null
+++ b/hack/cluster-debugging-scripts/go.mod
@@ -0,0 +1,3 @@
+module cluster_debugging
+
+go 1.21.4