From 7748d16529687117e10b8277d4285ac6a1cc9efd Mon Sep 17 00:00:00 2001
From: Andrew Durbin
Date: Wed, 16 Oct 2024 09:46:51 -0600
Subject: [PATCH] Install Descheduler, fix startup readywait

Descheduler will be used for eve-app rebalancing during cluster node
reboots/upgrades in an upcoming PR.

Wait for longhorn daemonsets to be ready before an upcoming PR that
snapshots the single-node /var/lib kube db.

Resolve an intermittent failure to import external-boot-image:
wait for containerd before importing, and apply tighter error checking
on the import.

Signed-off-by: Andrew Durbin
---
 .spdxignore                                |  2 +
 pkg/kube/Dockerfile                        |  4 ++
 pkg/kube/cluster-init.sh                   | 72 ++++++++++++++++------
 pkg/kube/descheduler-job.yaml              | 54 ++++++++++++++++
 pkg/kube/descheduler-policy-configmap.yaml | 24 ++++++++
 5 files changed, 137 insertions(+), 19 deletions(-)
 create mode 100644 pkg/kube/descheduler-job.yaml
 create mode 100644 pkg/kube/descheduler-policy-configmap.yaml

diff --git a/.spdxignore b/.spdxignore
index c089d6d0f4..edc4abf5f4 100644
--- a/.spdxignore
+++ b/.spdxignore
@@ -10,3 +10,5 @@ pkg/rngd/cmd/rngd/vendor/
 pkg/wwan/mmagent/vendor/
 tools/get-deps/vendor/
 pkg/installer/vendor/
+pkg/kube/descheduler-job.yaml
+pkg/kube/descheduler-policy-configmap.yaml
diff --git a/pkg/kube/Dockerfile b/pkg/kube/Dockerfile
index 2922f6bd69..02c6f2bfe9 100644
--- a/pkg/kube/Dockerfile
+++ b/pkg/kube/Dockerfile
@@ -43,6 +43,10 @@ COPY iscsid.conf /etc/iscsi/
 COPY longhorn-generate-support-bundle.sh /usr/bin/
 COPY nsmounter /usr/bin/
 
+# descheduler
+COPY descheduler-job.yaml /etc/
+COPY descheduler-policy-configmap.yaml /etc/
+
 # Containerd config
 RUN mkdir -p /etc/containerd
 COPY config-k3s.toml /etc/containerd/
diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh
index 358cfafc15..cb7228d6eb 100755
--- a/pkg/kube/cluster-init.sh
+++ b/pkg/kube/cluster-init.sh
@@ -300,20 +300,34 @@ check_start_containerd() {
         logmsg "Started k3s-containerd at pid:$containerd_pid"
     fi
     if [ -f /etc/external-boot-image.tar ]; then
+        # Give containerd a moment to start before importing
+        for _ in 1 2 3; do
+            reported_pid=$(/var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock info | jq .server.pid)
+            if [ "$reported_pid" = "$containerd_pid" ]; then
+                logmsg "containerd online, continue to import"
+                break
+            fi
+            sleep 1
+        done
+
         # NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
         # Install external-boot-image image to our eve user containerd registry.
         # This image contains just kernel and initrd to bootstrap a container image as a VM.
         # This is very similar to what we do on kvm based eve to start container as a VM.
         logmsg "Trying to install new external-boot-image"
         # This import happens once per reboot
-        if ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
-            eve_external_boot_img_tag=$(cat /run/eve-release)
-            eve_external_boot_img=docker.io/lfedge/eve-external-boot-image:"$eve_external_boot_img_tag"
-            import_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
-            ctr -a /run/containerd-user/containerd.sock image tag "$import_tag" "$eve_external_boot_img"
-
-            logmsg "Successfully installed external-boot-image $import_tag as $eve_external_boot_img"
-            rm -f /etc/external-boot-image.tar
+        import_name_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
+        import_name=$(echo "$import_name_tag" | cut -d ':' -f 1)
+        eve_external_boot_img_name="docker.io/lfedge/eve-external-boot-image"
+        if [ "$import_name" = "$eve_external_boot_img_name" ]; then
+            if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
+                eve_external_boot_img_tag=$(cat /run/eve-release)
+                eve_external_boot_img="${eve_external_boot_img_name}:${eve_external_boot_img_tag}"
+                if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image tag "$import_name_tag" "$eve_external_boot_img"; then
+                    logmsg "Successfully installed external-boot-image $import_name_tag as $eve_external_boot_img"
+                    rm -f /etc/external-boot-image.tar
+                fi
+            fi
         fi
     fi
 }
@@ -498,21 +512,41 @@ if [ ! -f /var/lib/all_components_initialized ]; then
     fi
 
     if [ ! -f /var/lib/longhorn_initialized ]; then
-        wait_for_item "longhorn"
-        logmsg "Installing longhorn version ${LONGHORN_VERSION}"
-        apply_longhorn_disk_config "$HOSTNAME"
-        lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
-        if [ ! -e $lhCfgPath ]; then
-            curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
+        if [ ! -f /var/lib/longhorn_installing ]; then
+            wait_for_item "longhorn"
+            logmsg "Installing longhorn version ${LONGHORN_VERSION}"
+            apply_longhorn_disk_config "$HOSTNAME"
+            lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
+            if [ ! -e $lhCfgPath ]; then
+                curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
+            fi
+            if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
+                sed -i '/ default-setting.yaml: |-/a\ create-default-disk-labeled-nodes: true' "$lhCfgPath"
+            fi
+            kubectl apply -f "$lhCfgPath"
+            touch /var/lib/longhorn_installing
         fi
-        if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
-            sed -i '/ default-setting.yaml: |-/a\ create-default-disk-labeled-nodes: true' "$lhCfgPath"
+        lhStatus=$(kubectl -n longhorn-system get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
+        if [ "$lhStatus" = "truetruetrue" ]; then
+            logmsg "longhorn ready"
+            rm /var/lib/longhorn_installing
+            touch /var/lib/longhorn_initialized
         fi
-        kubectl apply -f "$lhCfgPath"
-        touch /var/lib/longhorn_initialized
     fi
 
-    if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
+    #
+    # Descheduler
+    #
+    if [ ! -f /var/lib/descheduler_initialized ]; then
+        wait_for_item "descheduler"
+        logmsg "Installing Descheduler"
+        DESCHEDULER_VERSION="v0.29.0"
+        kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/descheduler/${DESCHEDULER_VERSION}/kubernetes/base/rbac.yaml
+        kubectl apply -f /etc/descheduler-policy-configmap.yaml
+        touch /var/lib/descheduler_initialized
+    fi
+
+    if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ] && [ -f /var/lib/descheduler_initialized ]; then
         logmsg "All components initialized"
         touch /var/lib/all_components_initialized
     fi
diff --git a/pkg/kube/descheduler-job.yaml b/pkg/kube/descheduler-job.yaml
new file mode 100644
index 0000000000..6ef92d1cac
--- /dev/null
+++ b/pkg/kube/descheduler-job.yaml
@@ -0,0 +1,54 @@
+---
+# from: https://raw.githubusercontent.com/kubernetes-sigs/descheduler/${DESCHEDULER_VERSION}/kubernetes/job/job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: descheduler-job
+  namespace: kube-system
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: descheduler-pod
+    spec:
+      priorityClassName: system-cluster-critical
+      containers:
+        - name: descheduler
+          image: registry.k8s.io/descheduler/descheduler:v0.29.0
+          volumeMounts:
+            - mountPath: /policy-dir
+              name: policy-volume
+          command:
+            - "/bin/descheduler"
+          args:
+            - "--policy-config-file"
+            - "/policy-dir/policy.yaml"
+            - "--v"
+            - "3"
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "256Mi"
+          livenessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /healthz
+              port: 10258
+              scheme: HTTPS
+            initialDelaySeconds: 3
+            periodSeconds: 10
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+            privileged: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true
+      restartPolicy: "Never"
+      serviceAccountName: descheduler-sa
+      volumes:
+        - name: policy-volume
+          configMap:
+            name: descheduler-policy-configmap
diff --git a/pkg/kube/descheduler-policy-configmap.yaml b/pkg/kube/descheduler-policy-configmap.yaml
new file mode 100644
index 0000000000..8f52cd6c77
--- /dev/null
+++ b/pkg/kube/descheduler-policy-configmap.yaml
@@ -0,0 +1,24 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: descheduler-policy-configmap
+  namespace: kube-system
+data:
+  policy.yaml: |
+    apiVersion: "descheduler/v1alpha2"
+    kind: "DeschedulerPolicy"
+    profiles:
+      - name: ProfileName
+        pluginConfig:
+          - name: "RemovePodsViolatingNodeAffinity"
+            args:
+              namespaces:
+                include:
+                  - "eve-kube-app"
+              nodeAffinityType:
+                - "preferredDuringSchedulingIgnoredDuringExecution"
+        plugins:
+          deschedule:
+            enabled:
+              - "RemovePodsViolatingNodeAffinity"
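
Note (not part of the patch): cluster-init.sh only applies the descheduler RBAC and the
policy ConfigMap; /etc/descheduler-job.yaml is staged for the follow-on rebalancing PR.
A minimal manual sketch of exercising the staged Job on a node is shown below. The file
paths and object names come from the files added here; running the Job by hand and the
wait/log commands are assumptions, not behavior this patch adds.

    # Hypothetical manual run, assuming the RBAC and policy ConfigMap above are already applied
    kubectl apply -f /etc/descheduler-job.yaml
    # Wait for the one-shot descheduler run to finish, then inspect which pods it evicted
    kubectl -n kube-system wait --for=condition=complete job/descheduler-job --timeout=120s
    kubectl -n kube-system logs job/descheduler-job | grep RemovePodsViolatingNodeAffinity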