Skip to content

Commit

Permalink
Add attachment limit scripts to hack/cluster-debugging-scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewSirenko committed Dec 8, 2023
1 parent b894ce5 commit 28cc128
Show file tree
Hide file tree
Showing 6 changed files with 414 additions and 0 deletions.
55 changes: 55 additions & 0 deletions hack/cluster-debugging-scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Cluster Debugging Scripts

This folder contains a collection of scripts to help debug clusters.

## FAQ

### How can I validate that the aws-ebs-csi-driver correctly makes use of all available attachment slots for instance type X?

Answer: Perform the following steps, which will create a nodegroup, count the Block Device Mappings and ENIs of the underlying instance, and deploy pods with EBS PVs until the script finds the maximum number of volumes the aws-ebs-csi-driver can attach to the instance:

```
export CLUSTER_NAME="devcluster"
export NODEGROUP_NAME="ng-attachment-limit-test"
export INSTANCE_TYPE="m7g.large"
eksctl create nodegroup -c "$CLUSTER_NAME" --nodes 1 -t "$INSTANCE_TYPE" -n "$NODEGROUP_NAME"
./get-attachment-breakdown "$NODEGROUP_NAME"
eksctl delete nodegroup -c "$CLUSTER_NAME" -n "$NODEGROUP_NAME"
```

By the end of the script, you should see an output similar to this:
```
Attachments for ng-f3ecdf71
BlockDeviceMappings ENIs Available-Attachment-Slots(Validated-by-aws-ebs-csi-driver)
1 2 25
```

## Scripts

get-attachment-breakdown: Find the maximum number of volumes that can be attached to a node of the specified nodegroup. Additionally, log how many BlockDeviceMappings and ENIs are attached to the instance of the specified nodegroup.

Examples
```
./get-attachment-breakdown "ng-f3ecdf71"
MIN_VOLUME_GUESS=20 MAX_VOLUME_GUESS=40 POD_TIMEOUT_SECONDS=120 ./get-attachment-breakdown "ng-f3ecdf71"
```

find-attachment-limit: Find the maximum number of volumes the aws-ebs-csi-driver can attach to a node.

Examples:
```
./find-attachment-limit 'some.node.affinity.key:value'
./find-attachment-limit 'eks.amazonaws.com/nodegroup:test-nodegroup'
./find-attachment-limit 'node.kubernetes.io/instance-type:m5.large'
MIN_VOLUME_GUESS=12 MAX_VOLUME_GUESS=30 POD_TIMEOUT_SECONDS=60 ./find-attachment-limit 'node.kubernetes.io/instance-type:m5.large'
```

generate_example_manifest.go: Print to stdout a YAML manifest containing a pod and the specified number of PVCs, rendered from the template file `device_slot_test.tmpl`. Run it from this directory, because the template is opened by its relative path.

Example:
```
go run "generate_example_manifest.go" --node-affinity "some.label:value" --volume-count "22" --test-pod-name "test-pod"
```
51 changes: 51 additions & 0 deletions hack/cluster-debugging-scripts/device_slot_test.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
apiVersion: v1
kind: Pod
metadata:
name: {{ .PodName }}
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: {{ .NodeAffinityKey }}
operator: In
values:
- {{ .NodeAffinityValue }}
containers:
- name: device-limit-tester-{{ len .Volumes }}-volumes
image: centos
command: ["/bin/sh"]
args: ["-c", "while true; do echo hello; sleep 10;done"]
volumeMounts:
{{- range $index, $value := .Volumes }}
- name: persistent-storage-{{ $index }}
mountPath: /data-{{ $index }}
{{- end }}
volumes:
{{- range $index, $value := .Volumes }}
- name: persistent-storage-{{ $index }}
persistentVolumeClaim:
claimName: ebs-claim-{{ $index }}
{{- end }}
---
{{- range $index, $value := .Volumes }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ebs-claim-{{ $index }}
spec:
accessModes:
- ReadWriteOnce
storageClassName: ebs-sc
resources:
requests:
storage: 4Gi
---
{{- end }}
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ebs-sc
provisioner: ebs.csi.aws.com
volumeBindingMode: WaitForFirstConsumer
128 changes: 128 additions & 0 deletions hack/cluster-debugging-scripts/find-attachment-limit
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/bin/bash
# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---

# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# --- Environment Variables
# Each setting keeps the caller's value when one is provided and otherwise
# falls back to the default shown here.
# MIN_VOLUME_GUESS / MAX_VOLUME_GUESS bound the search for the attachment limit.
export MIN_VOLUME_GUESS=${MIN_VOLUME_GUESS:=0}
export MAX_VOLUME_GUESS=${MAX_VOLUME_GUESS:=130}
# Seconds to wait for the test pod to become ready before counting the attempt as failed.
export POD_TIMEOUT_SECONDS=${POD_TIMEOUT_SECONDS:=90}
# Destination for kubectl output that is too noisy for the main log.
export EXTRA_LOGS_FILEPATH=${EXTRA_LOGS_FILEPATH:="/dev/null"}

# Name of the pod created for every attempt.
export TEST_POD_NAME=${TEST_POD_NAME:="attachment-limit-test-pod"}

export SCRIPT_PATH ROOT_DIRECTORY GENERATE_MANIFEST_SCRIPT_FILEPATH
# The inner substitution is quoted so a script location that contains spaces
# is passed to dirname as a single argument.
SCRIPT_PATH=$(dirname "$(realpath "$0")")
ROOT_DIRECTORY="$SCRIPT_PATH/../.."
GENERATE_MANIFEST_SCRIPT_FILEPATH="$ROOT_DIRECTORY/hack/cluster-debugging-scripts/generate_example_manifest.go"

# --- Script Tools
# log MESSAGE...
# Writes one timestamped "[INFO]" line to stderr. All arguments are joined
# into a single message, so stdout stays free for other output.
log() {
  1>&2 printf "%s [INFO] - %s\n" "$(date +"%Y-%m-%d %H:%M:%S")" "$*"
}

# check_dependencies
# Verifies that every external command this script relies on can be found.
# Logs the first missing command and exits the script with status 1.
check_dependencies() {
  # `local -r` is the form that makes the array read-only; `local readonly x=...`
  # would instead declare an extra local variable named "readonly".
  local -r dependencies=("kubectl" "go")
  local cmd

  for cmd in "${dependencies[@]}"; do
    if ! command -v "${cmd}" &>/dev/null; then
      log "${cmd} could not be found, please install it."
      exit 1
    fi
  done
}

# --- Script
# usage
# Prints the invocation help (including the overridable environment
# variables) to stdout, then exits the script with status 1.
usage () {
  printf '%s\n' \
    "Usage: $0 [NODE_AFFINITY]" \
    "Examples:" \
    "$0 'eks.amazonaws.com/nodegroup:test-nodegroup'" \
    "$0 'node.kubernetes.io/instance-type:m5.large'" \
    "You can also override the following environment variable defaults: MIN_VOLUME_GUESS=0 MAX_VOLUME_GUESS=130 POD_TIMEOUT_SECONDS=90 EXTRA_LOGS_FILEPATH='/dev/null'" \
    "MIN_VOLUME_GUESS=12 MAX_VOLUME_GUESS=30 POD_TIMEOUT_SECONDS=60 $0 'node.kubernetes.io/instance-type:m5.large'"
  exit 1
}

# parse_args NODE_AFFINITY
# Requires exactly one argument, a 'key:value' node label, and exports it as
# NODE_AFFINITY_KEY_VALUE_PAIR. Any other argument count prints the usage
# text, which ends the script.
parse_args () {
  if (( $# != 1 )); then
    usage
  fi

  NODE_AFFINITY_KEY_VALUE_PAIR=$1
  export NODE_AFFINITY_KEY_VALUE_PAIR
}

# cleanup
# Deletes the Kubernetes objects described by $MANIFEST_FILE and then removes
# the manifest file itself. Used after every deployment attempt and as the
# EXIT trap.
#
# Returns the exit status of `kubectl delete`, so a failed deletion still
# stops the script under `set -e`. The manifest file is removed either way,
# so a failure does not leave temporary files behind. Does nothing when no
# manifest has been generated yet.
cleanup() {
  local manifest="${MANIFEST_FILE:-}"
  # The EXIT trap can fire before any manifest exists; with `set -u` an
  # unguarded expansion of MANIFEST_FILE would abort the trap.
  if [[ -z "$manifest" ]]; then
    return 0
  fi

  log "Deleting k8s objects associated with manifest $manifest"
  local delete_status=0
  # Append so earlier output in the extra-log file is kept.
  kubectl delete -f "$manifest" >> "$EXTRA_LOGS_FILEPATH" 2>&1 || delete_status=$?

  # -f: an already-missing file is not an error.
  rm -f -- "$manifest"
  return "$delete_status"
}

# deploy_manifest VOLUME_COUNT
# Generates a manifest for one pod with VOLUME_COUNT PVCs, creates it in the
# cluster, waits up to POD_TIMEOUT_SECONDS for the pod to become ready, and
# then cleans everything up again.
#
# Sets the globals:
#   VOLUME_COUNT     the requested number of PVCs
#   MANIFEST_FILE    path of the generated temporary manifest
#   WAS_POD_CREATED  exit status of `kubectl wait` (0 means the pod became ready)
deploy_manifest() {
  VOLUME_COUNT=$1
  log "Attempting to deploy pod with $VOLUME_COUNT PVCs on node with label '$NODE_AFFINITY_KEY_VALUE_PAIR'"

  # Create pod manifest for the current guess.
  # The generator opens its template through a path relative to the working
  # directory, so run it from its own directory. The subshell keeps the
  # caller's working directory unchanged.
  MANIFEST_FILE=$(mktemp)
  (
    cd "$(dirname "$GENERATE_MANIFEST_SCRIPT_FILEPATH")" &&
      go run "$GENERATE_MANIFEST_SCRIPT_FILEPATH" --node-affinity "$NODE_AFFINITY_KEY_VALUE_PAIR" --volume-count "$VOLUME_COUNT" --test-pod-name "$TEST_POD_NAME"
  ) > "$MANIFEST_FILE"

  # Deploy pod to node. Append so output from earlier steps in the extra-log
  # file is kept.
  log "Creating k8s objects associated with manifest $MANIFEST_FILE"
  kubectl create -f "$MANIFEST_FILE" >> "$EXTRA_LOGS_FILEPATH"

  # Watch for success vs error code. errexit is switched off around the wait
  # because a timeout is an expected outcome here, not a fatal error.
  log "Waiting $POD_TIMEOUT_SECONDS seconds for 'pod/$TEST_POD_NAME' to reach condition 'ready'"
  set +e
  kubectl wait --for=condition=ready --timeout="${POD_TIMEOUT_SECONDS}s" pod/"$TEST_POD_NAME" >> "$EXTRA_LOGS_FILEPATH" 2>&1
  WAS_POD_CREATED=$?
  set -e
  if [[ $WAS_POD_CREATED == 0 ]]; then
    log "Pod with $VOLUME_COUNT PVCs successfully deployed"
  else
    log "Pod with $VOLUME_COUNT PVCs did not successfully deploy"
  fi

  cleanup
}

# main NODE_AFFINITY
# Binary-searches the range [MIN_VOLUME_GUESS, MAX_VOLUME_GUESS] for the
# largest volume count with which the test pod still becomes ready.
#
# The result is exported as MAX_ATTACHED_VOLUMES, logged, and also used as the
# function's return status.
main() {
  check_dependencies

  parse_args "$@"

  export WAS_POD_CREATED=0 # 0 is true in bash
  export MANIFEST_FILE
  # Make sure cluster objects and the temp manifest are removed if the script
  # stops part-way through.
  trap 'cleanup' EXIT

  local lower=$MIN_VOLUME_GUESS
  local upper=$MAX_VOLUME_GUESS
  local guess

  until (( lower >= upper )); do
    # Midpoint rounded up, so the loop always makes progress when a guess
    # succeeds and lower is raised to it.
    guess=$(( (lower + upper + 1) / 2 ))
    deploy_manifest "$guess"
    if (( WAS_POD_CREATED != 0 )); then
      # The guess was too high: the answer lies strictly below it.
      upper=$(( guess - 1 ))
    else
      # The guess worked: the answer is at least this large.
      lower=$guess
    fi
  done

  export MAX_ATTACHED_VOLUMES="$lower"
  log "Success!"
  log "Maximum amount of volumes deployed with pod on node with label '$NODE_AFFINITY_KEY_VALUE_PAIR': $MAX_ATTACHED_VOLUMES"
  # The search finished normally, so the EXIT cleanup is no longer needed.
  trap - EXIT
  return "$MAX_ATTACHED_VOLUMES"
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"
76 changes: 76 additions & 0 deletions hack/cluster-debugging-scripts/generate_example_manifest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"flag"
"fmt"
"log"
"os"
"strings"
"text/template"
)

// Manifest carries the values that are substituted into the manifest template.
type Manifest struct {
	// NodeAffinityKey and NodeAffinityValue are the two halves of the
	// 'key:value' pair given through the --node-affinity flag.
	NodeAffinityKey, NodeAffinityValue string
	// PodName is the name given to the generated pod.
	PodName string
	// Volumes has one element per volume to provision.
	Volumes []int
}

// main renders the template file device_slot_test.tmpl to stdout using the
// values given on the command line.
//
// The template is opened by its bare file name, so the program has to be
// started from the directory that contains it. Any invalid flag value or
// template error ends the program through log.Fatal.
func main() {
	// Parse command-line args & flags.
	nodeAffinityPtr := flag.String("node-affinity", "", "node affinity for pod in form of 'key:value'")
	podNamePtr := flag.String("test-pod-name", "test-pod", "name for pod used in manifest. Default is 'test-pod'")
	volumeCountPtr := flag.Int("volume-count", 2, "amount of Volumes to provision")
	flag.Parse()

	// make panics on a negative length; report the bad flag value instead.
	if *volumeCountPtr < 0 {
		log.Fatalf("flag '--volume-count' must not be negative, got %d", *volumeCountPtr)
	}

	nodeAffinityKey, nodeAffinityValue, err := parseNodeAffinityFlag(nodeAffinityPtr)
	if err != nil {
		log.Fatal(err)
	}

	manifest := Manifest{
		NodeAffinityKey:   nodeAffinityKey,
		NodeAffinityValue: nodeAffinityValue,
		PodName:           *podNamePtr,
		// Only the length and indices of Volumes matter to the template.
		Volumes: make([]int, *volumeCountPtr),
	}

	// Generate manifest to stdout from template file.
	const tmplFile = "device_slot_test.tmpl"
	tmpl, err := template.ParseFiles(tmplFile)
	if err != nil {
		log.Fatalf("parsing template %q: %v", tmplFile, err)
	}
	if err := tmpl.Execute(os.Stdout, manifest); err != nil {
		log.Fatalf("rendering template %q: %v", tmplFile, err)
	}
}

// parseNodeAffinityFlag splits the value of the --node-affinity flag, given
// in the form 'key:value', into its key and its value.
//
// An empty flag value is passed through as an empty key and value with no
// error. Any other input must contain exactly one ':' with non-empty text on
// both sides; otherwise an error is returned together with empty strings.
// Empty halves are rejected because they would leave the affinity key or
// value blank in the generated manifest.
func parseNodeAffinityFlag(nodeAffinityPtr *string) (string, string, error) {
	raw := *nodeAffinityPtr
	if raw == "" {
		return "", "", nil
	}

	parts := strings.Split(raw, ":")
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return "", "", fmt.Errorf("flag '--node-affinity' must take the form 'key:value', got %q", raw)
	}
	return parts[0], parts[1], nil
}
Loading

0 comments on commit 28cc128

Please sign in to comment.