From c030dfa1653990417f287b4a407f24d8f8a3a07c Mon Sep 17 00:00:00 2001
From: Naiming Shen
Date: Fri, 18 Oct 2024 11:08:55 -0700
Subject: [PATCH] Main changes in kube/cluster-init.sh for multi-node cluster
 handling

- create cluster-utils.sh in pkg/kube to hold a number of shared functions
- get the EdgeNodeClusterStatus from 'zedkube' for the cluster status,
  cluster node IP, cluster prefix, etc.
- run a background process to monitor cluster mode changes
- implement the cluster mode change logic and operations for the transition
  from single-node to cluster, or from cluster back to single-node
- save the kube /var/lib at first-time single-node setup, for later use when
  the mode changes back from cluster mode to single-node
- display the eve-release string in the 'kubectl get node' OS-IMAGE field

Signed-off-by: Naiming Shen
---
 pkg/kube/Dockerfile       |   1 +
 pkg/kube/cluster-init.sh  | 789 +++++++++++++++++++++++++++-----------
 pkg/kube/cluster-utils.sh | 273 +++++++++++++
 3 files changed, 845 insertions(+), 218 deletions(-)
 create mode 100755 pkg/kube/cluster-utils.sh

diff --git a/pkg/kube/Dockerfile b/pkg/kube/Dockerfile
index 2922f6bd69..5d7378e9a0 100644
--- a/pkg/kube/Dockerfile
+++ b/pkg/kube/Dockerfile
@@ -23,6 +23,7 @@ RUN GO111MODULE=on CGO_ENABLED=0 go build -v -ldflags "-s -w" -o /out/usr/bin/ce
 FROM scratch
 COPY --from=build /out/ /
 COPY cluster-init.sh /usr/bin/
+COPY cluster-utils.sh /usr/bin/
 COPY cgconfig.conf /etc

 # k3s
diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh
index 358cfafc15..9c9e415295 100755
--- a/pkg/kube/cluster-init.sh
+++ b/pkg/kube/cluster-init.sh
@@ -1,141 +1,115 @@
 #!/bin/sh
 #
-# Copyright (c) 2023-2024 Zededa, Inc.
+# Copyright (c) 2024 Zededa, Inc.
 # SPDX-License-Identifier: Apache-2.0

-K3S_VERSION=v1.28.5+k3s1
 KUBEVIRT_VERSION=v1.1.0
 LONGHORN_VERSION=v1.6.2
 CDI_VERSION=v1.54.0
 NODE_IP=""
-MAX_K3S_RESTARTS=10
 RESTART_COUNT=0
-K3S_LOG_DIR="/persist/newlog/kube"
+K3S_LOG_DIR="/persist/kubelog"
 INSTALL_LOG="${K3S_LOG_DIR}/k3s-install.log"
 CTRD_LOG="${K3S_LOG_DIR}/containerd-user.log"
 LOG_SIZE=$((5*1024*1024))
 HOSTNAME=""
 VMICONFIG_FILENAME="/run/zedkube/vmiVNC.run"
 VNC_RUNNING=false
-
-logmsg() {
-    local MSG
-    local TIME
-    MSG="$*"
-    TIME=$(date +"%F %T")
-    echo "$TIME : $MSG" >> $INSTALL_LOG
-}
-
-setup_cgroup () {
-    echo "cgroup /sys/fs/cgroup cgroup defaults 0 0" >> /etc/fstab
-}
-
-check_log_file_size() {
-    currentSize=$(wc -c <"$K3S_LOG_DIR/$1")
-    if [ "$currentSize" -gt "$LOG_SIZE" ]; then
-        if [ -f "$K3S_LOG_DIR/$1.2" ]; then
-            cp "$K3S_LOG_DIR/$1.2" "$K3S_LOG_DIR/$1.3"
-        fi
-        if [ -f "$K3S_LOG_DIR/$1.1" ]; then
-            cp "$K3S_LOG_DIR/$1.1" "$K3S_LOG_DIR/$1.2"
-        fi
-        cp "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
-        truncate -s 0 "$K3S_LOG_DIR/$1"
-        logmsg "k3s logfile size $currentSize rotate"
-    fi
-}
-
-save_crash_log() {
-    if [ "$RESTART_COUNT" = "1" ]; then
-        return
-    fi
-    fileBaseName=$1
-    # This pattern will alias with older crashes, but also a simple way to contain log bloat
-    crashLogBaseName="${fileBaseName}.restart.${RESTART_COUNT}.gz"
-    if [ -e "${K3S_LOG_DIR}/${crashLogBaseName}" ]; then
-        rm "${K3S_LOG_DIR}/${crashLogBaseName}"
-    fi
-    gzip -k -9 "${K3S_LOG_DIR}/${fileBaseName}" -c > "${K3S_LOG_DIR}/${crashLogBaseName}"
-}
+ClusterPrefixMask=""
+multus_source_dir="/var/lib/cni/multus/results"
+multus_dest_dir="/run/kube/multus"
+search_multus_string="-net"
+config_file="/etc/rancher/k3s/config.yaml"
+k3s_config_file="/etc/rancher/k3s/k3s-config.yaml"
+k3s_last_start_time=""
+clusterStatusPort="12346"
+K3s_LOG_FILE="k3s.log"
+INITIAL_WAIT_TIME=5
+MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds; exponential backoff cap for k3s restarts
+current_wait_time=$INITIAL_WAIT_TIME
+SAVE_KUBE_VAR_LIB_DIR="/persist/kube-save-var-lib"
+CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing"
+
+# Source the utility script
+. /usr/bin/cluster-utils.sh
+
+# Get the cluster node IP address from the cluster status file
+get_cluster_node_ip() {
+    if [ -z "$1" ]; then
+        enc_data=$(cat "$enc_status_file")
+        clusternodeip=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.IP')
+        echo "$clusternodeip"
+    else
+        echo "$1"
+    fi
 }

-check_network_connection () {
-    while true; do
-        ret=$(curl -o /dev/null -w "%{http_code}" -s "https://get.k3s.io")
-        if [ "$ret" -eq 200 ]; then
-            logmsg "Network is ready."
-            break;
-        else
-            logmsg "Network is not yet ready"
-        fi
-        sleep 5
-    done
-}
+# Get the cluster prefix length from the cluster status file
+get_cluster_prefix_len() {
+    enc_data=$(cat "$enc_status_file")
+    mask=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.Mask')
+    decoded_mask=$(echo "$mask" | base64 -d | od -An -t u1)
+    prefixlen=0
+
+    for byte in $decoded_mask; do
+        case $byte in
+            255) prefixlen=$((prefixlen + 8)) ;;
+            254) prefixlen=$((prefixlen + 7)) ;;
+            252) prefixlen=$((prefixlen + 6)) ;;
+            248) prefixlen=$((prefixlen + 5)) ;;
+            240) prefixlen=$((prefixlen + 4)) ;;
+            224) prefixlen=$((prefixlen + 3)) ;;
+            192) prefixlen=$((prefixlen + 2)) ;;
+            128) prefixlen=$((prefixlen + 1)) ;;
+            0) break ;;
+            *) logmsg "get_cluster_prefix_len, Unexpected byte value: $byte"; exit 1 ;;
+        esac
+    done

-wait_for_default_route() {
-    while read -r iface dest gw flags refcnt use metric mask mtu window irtt; do
-        if [ "$dest" = "00000000" ] && [ "$mask" = "00000000" ]; then
-            logmsg "Default route found"
-            return 0
-        fi
-        logmsg "waiting for default route $iface $dest $gw $flags $refcnt $use $metric $mask $mtu $window $irtt"
-        sleep 1
-    done < /proc/net/route
-    return 1
+    echo "/$prefixlen"
 }
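# For illustration (example values are made-up, not from the patch): if
# zedkube publishes a ClusterIPPrefix with IP "10.244.244.2" and a /24 mask,
# the mask arrives base64-encoded as "////AA==", and the helpers above decode
# it like this:
#
#   $ echo "////AA==" | base64 -d | od -An -t u1   # -> 255 255 255 0
#   $ get_cluster_prefix_len                       # -> /24 (8+8+8 bits)
#   $ get_cluster_node_ip                          # -> 10.244.244.2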
-# Get IP of the interface with the first default route.
-# This will be then used as K3s node IP.
-# XXX This is a temporary solution. Eventually, the user will be able to select
-# the cluster network interface via EdgeDevConfig.
-get_default_intf_IP_prefix() {
-    logmsg "Trying to obtain Node IP..."
-    while [ -z "$NODE_IP" ]; do
-        # Find the default route interface
-        default_interface="$(ip route show default | head -n 1 | awk '/default/ {print $5}')"
-        # Get the IP address of the default route interface
-        NODE_IP="$(ip addr show dev "$default_interface" | awk '/inet / {print $2}' | cut -d "/" -f1)"
-        [ -z "$NODE_IP" ] && sleep 1
-    done
-    logmsg "Node IP Address: $NODE_IP"
-    ip_prefix="$NODE_IP/32"
-    # Fill in the outbound external Interface IP prefix in multus config
-    awk -v new_ip="$ip_prefix" '{gsub("IPAddressReplaceMe", new_ip)}1' /etc/multus-daemonset.yaml > /tmp/multus-daemonset.yaml
-}
+# Set the node IP for multus; this differs between single-node and cluster mode
+assign_multus_nodeip() {
+    if [ -f /var/lib/edge-node-cluster-mode ]; then
+        NODE_IP=$(get_cluster_node_ip "$1")
+        ClusterPrefixMask=$(get_cluster_prefix_len)
+        ip_prefix=$(ipcalc -n "$NODE_IP$ClusterPrefixMask" | cut -d "=" -f2)
+        ip_prefix="$ip_prefix$ClusterPrefixMask"
+        logmsg "Cluster Node IP prefix to multus: $ip_prefix with node-ip $NODE_IP"
+    else
+        while [ -z "$NODE_IP" ]; do
+            # Find the default route interface
+            default_interface="$(ip route show default | head -n 1 | awk '/default/ {print $5}')"

-# kubernetes's name must be lower case and '-' instead of '_'
-convert_to_k8s_compatible() {
-    echo "$1" | tr '[:upper:]_' '[:lower:]-'
-}
+            # Get the IP address of the default route interface
+            NODE_IP="$(ip addr show dev "$default_interface" | awk '/inet / {print $2}' | cut -d "/" -f1)"

-wait_for_device_name() {
-    logmsg "Waiting for DeviceName from controller..."
-    EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json"
-    while [ ! -f $EdgeNodeInfoPath ]; do
-        sleep 5
-    done
-    dName=$(jq -r '.DeviceName' $EdgeNodeInfoPath)
-    if [ -n "$dName" ]; then
-        HOSTNAME=$(convert_to_k8s_compatible "$dName")
-    fi
+            [ -z "$NODE_IP" ] && sleep 1
+        done

-    # we should have the uuid since we got the device name
-    DEVUUID=$(/bin/hostname)
+        ip_prefix="$NODE_IP/32"
+        logmsg "Single Node IP prefix to multus: $ip_prefix with node-ip $NODE_IP"
+    fi

-    if ! grep -q node-name /etc/rancher/k3s/config.yaml; then
-        echo "node-name: $HOSTNAME" >> /etc/rancher/k3s/config.yaml
-    fi
-    logmsg "Hostname: $HOSTNAME"
+    logmsg "Assign node-ip for multus with $ip_prefix"
+    # fill in the outbound external interface IP prefix in the multus config
+    awk -v new_ip="$ip_prefix" '{gsub("IPAddressReplaceMe", new_ip)}1' /etc/multus-daemonset.yaml > /etc/multus-daemonset-new.yaml
 }

 apply_multus_cni() {
-    get_default_intf_IP_prefix
-    kubectl create namespace eve-kube-app
-    logmsg "Apply Multus, Node-IP: $NODE_IP"
-    if ! kubectl apply -f /tmp/multus-daemonset.yaml; then
+    if ! kubectl get namespace eve-kube-app > /dev/null 2>&1; then
+        kubectl create namespace eve-kube-app
+    fi
+    logmsg "Apply multus-daemonset-new.yaml"
+    if ! kubectl apply -f /etc/multus-daemonset-new.yaml > /dev/null 2>&1; then
+        logmsg "Applying Multus failed, bailing out now"
         return 1
     fi
     logmsg "Done applying Multus"
-    ln -s /var/lib/cni/bin/multus /var/lib/rancher/k3s/data/current/bin/multus
+    link_multus_into_k3s
     # need to only do this once
     touch /var/lib/multus_initialized
     return 0
@@ -182,6 +156,8 @@ setup_prereqs () {
     #Needed for iscsi tools
     mkdir -p /run/lock
     mkdir -p "$K3S_LOG_DIR"
+    rm -rf /var/log
+    ln -s "$K3S_LOG_DIR" /var/log
     /usr/sbin/iscsid start
     mount --make-rshared /
     setup_cgroup
@@ -195,6 +171,11 @@ setup_prereqs () {
 }

 config_cluster_roles() {
+    # Remove any previous k3s-debuguser*.pem files. In the single-node to
+    # cluster transition we may not reboot, so there could be more than one
+    # cert file lying around.
+    rm -f /tmp/k3s-debuguser*.pem
+
     # generate user debugging-user certificates
     # 10 year expiration for now
     if ! /usr/bin/cert-gen -l 315360000 --ca-cert /var/lib/rancher/k3s/server/tls/client-ca.crt \
@@ -226,6 +207,26 @@ apply_longhorn_disk_config() {
     kubectl annotate node "$node" node.longhorn.io/default-disks-config='[ { "path":"/persist/vault/volumes", "allowScheduling":true }]'
 }

+# Apply the node-uuid label to the node, since the node name is the EVE device name
+apply_node_uuid_label () {
+    logmsg "set node label with uuid $DEVUUID"
+    kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
+}
+
+reapply_node_labels() {
+    apply_node_uuid_label
+    apply_longhorn_disk_config "$HOSTNAME"
+    # Check that a node with both labels exists; don't assume the applies above worked
+    node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length')
+
+    if [ "$node_count" -gt 0 ]; then
+        logmsg "Node labels re-applied successfully"
+        touch /var/lib/node-labels-initialized
+    else
+        logmsg "Failed to re-apply node labels on $HOSTNAME, uuid $DEVUUID"
+    fi
+}
+
 check_overwrite_nsmounter() {
     ### REMOVE ME+
     # When https://github.com/longhorn/longhorn/issues/6857 is resolved, remove this 'REMOVE ME' section
@@ -255,30 +256,61 @@ longhorn_post_install_config() {
 }

 check_start_k3s() {
+    # The cluster-change code runs in another task loop; if a cluster wait is
+    # ongoing, do not start k3s yet, wait here as well.
+    if [ -f "$CLUSTER_WAIT_FILE" ]; then
+        logmsg "Cluster wait ongoing, wait for it before starting k3s"
+        while [ -f "$CLUSTER_WAIT_FILE" ]; do
+            sleep 5
+        done
+    fi
+
     pgrep -f "k3s server" > /dev/null 2>&1
     if [ $? -eq 1 ]; then
-        if [ $RESTART_COUNT -lt $MAX_K3S_RESTARTS ]; then
-            ## Must be after reboot, or from k3s restart
-            RESTART_COUNT=$((RESTART_COUNT+1))
-            save_crash_log "k3s.log"
-            ln -s /var/lib/k3s/bin/* /usr/bin
-            logmsg "Starting k3s server, restart count: $RESTART_COUNT"
-            # for now, always copy to get the latest
-            nohup /usr/bin/k3s server --config /etc/rancher/k3s/config.yaml &
-            k3s_pid=$!
-            # Give the embedded etcd in k3s priority over io as its fsync latencies are critical
-            ionice -c2 -n0 -p $k3s_pid
-            # Default location where clients will look for config
-            # There is a very small window where this file is not available
-            # while k3s is starting up
-            while [ ! -f /etc/rancher/k3s/k3s.yaml ]; do
-                sleep 5
-            done
-            ln -s /etc/rancher/k3s/k3s.yaml ~/.kube/config
-            mkdir -p /run/.kube/k3s
-            cp /etc/rancher/k3s/k3s.yaml /run/.kube/k3s/k3s.yaml
-            return 1
-        fi
+        # Do exponential backoff for the k3s restart, capped at MAX_WAIT_TIME
+        RESTART_COUNT=$((RESTART_COUNT+1))
+        logmsg "k3s server not running, restart wait time $current_wait_time, restart count: $RESTART_COUNT"
+        sleep $current_wait_time
+        current_wait_time=$((current_wait_time * 2))
+        if [ $current_wait_time -gt $MAX_WAIT_TIME ]; then
+            current_wait_time=$MAX_WAIT_TIME
+        fi
+
+        ## Must be after reboot, or from k3s restart
+        save_crash_log
+        ln -s /var/lib/k3s/bin/* /usr/bin
+        if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
+            copy_cni_plugin_files
+        fi
+
+        # start the k3s server now
+        nohup /usr/bin/k3s server --config "$k3s_config_file" &
+
+        # remember the k3s start time
+        k3s_last_start_time=$(date +%s)
+
+        k3s_pid=$!
+        # Give the embedded etcd in k3s priority over io as its fsync latencies are critical
+        ionice -c2 -n0 -p $k3s_pid
+        # Default location where clients will look for config
+        # There is a very small window where this file is not available
+        # while k3s is starting up
+        counter=0
+        while [ ! -f /etc/rancher/k3s/k3s.yaml ]; do
+            sleep 5
+            counter=$((counter+1))
+            # prevent infinite looping; k3s could have crashed immediately
+            if [ $counter -eq 120 ]; then
+                break
+            fi
+        done
+        mkdir -p /run/.kube/k3s
+        cp /etc/rancher/k3s/k3s.yaml /run/.kube/k3s/k3s.yaml
+        return 1
+    else
+        # k3s is running, reset the wait time to the initial value
+        current_wait_time=$INITIAL_WAIT_TIME
     fi
     return 0
 }
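# Illustrative restart cadence for check_start_k3s above (a sketch, not part
# of the patch): with INITIAL_WAIT_TIME=5 and MAX_WAIT_TIME=600, successive
# waits between k3s restart attempts are
#   5, 10, 20, 40, 80, 160, 320, 600, 600, ... seconds
# and any pass that finds "k3s server" running resets the wait back to 5.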
logmsg "Trying to install new external-boot-image" # This import happens once per reboot - if ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then - eve_external_boot_img_tag=$(cat /run/eve-release) - eve_external_boot_img=docker.io/lfedge/eve-external-boot-image:"$eve_external_boot_img_tag" - import_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]') - ctr -a /run/containerd-user/containerd.sock image tag "$import_tag" "$eve_external_boot_img" - - logmsg "Successfully installed external-boot-image $import_tag as $eve_external_boot_img" - rm -f /etc/external-boot-image.tar + import_name_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]') + import_name=$(echo "$import_name_tag" | cut -d ':' -f 1) + eve_external_boot_img_name="docker.io/lfedge/eve-external-boot-image" + if [ "$import_name" = "$eve_external_boot_img_name" ]; then + if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then + eve_external_boot_img_tag=$(cat /run/eve-release) + eve_external_boot_img="${eve_external_boot_img_name}:${eve_external_boot_img_tag}" + if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image tag "$import_name_tag" "$eve_external_boot_img"; then + logmsg "Successfully installed external-boot-image $import_name_tag as $eve_external_boot_img" + rm -f /etc/external-boot-image.tar + fi + fi fi fi } -trigger_k3s_selfextraction() { - # Analysis of the k3s source shows nearly any cli command will first self-extract a series of binaries. - # In our case we're looking for the containerd binary. - # k3s check-config appears to be the only cli cmd which doesn't: - # - start a long running process/server - # - timeout connecting to a socket - # - manipulate config/certs - - # When run on the shell this does throw some config errors, its unclear if we need this issues fixed: - # - links: aux/ip6tables should link to iptables-detect.sh (fail) - # - links: aux/ip6tables-restore should link to iptables-detect.sh (fail) - # - links: aux/ip6tables-save should link to iptables-detect.sh (fail) - # - links: aux/iptables should link to iptables-detect.sh (fail) - # - links: aux/iptables-restore should link to iptables-detect.sh (fail) - # - links: aux/iptables-save should link to iptables-detect.sh (fail) - # - apparmor: enabled, but apparmor_parser missing (fail) - /usr/bin/k3s check-config >> $INSTALL_LOG 2>&1 -} - -# wait for debugging flag in /persist/k3s/wait_{flagname} if exist -wait_for_item() { - filename="/persist/k3s/wait_$1" - processname="k3s server" - while [ -e "$filename" ]; do - k3sproc="" - if pgrep -x "$processname" > /dev/null; then - k3sproc="k3s server is running" - else - k3sproc="k3s server is NOT running" - fi - logmsg "Found $filename file. $k3sproc, Waiting for 60 seconds..." - sleep 60 - done -} # Return success if all pods are Running/Succeeded and Ready # Used in install time to control api server load @@ -416,57 +428,351 @@ check_and_run_vnc() { fi } -setup_prereqs +# get the EdgeNodeClusterStatus +enc_status_file="/run/zedkube/EdgeNodeClusterStatus/global.json" +cluster_intf="" +is_bootstrap="" +join_serverIP="" +cluster_token="" +cluster_node_ip="" +# for bootstrap node, after reboot to get neighbor node to join +FoundENCStatus=false + +# get the EdgeNodeClusterStatus from zedkube publication +get_enc_status() { + # Read the JSON data from the file, return 0 if successful, 1 if not + if [ ! 
-f "$enc_status_file" ]; then + return 1 + fi + + enc_data=$(cat "$enc_status_file") + cluster_intf=$(echo "$enc_data" | jq -r '.ClusterInterface') + is_bootstrap=$(echo "$enc_data" | jq -r '.BootstrapNode') + join_serverIP=$(echo "$enc_data" | jq -r '.JoinServerIP') + cluster_token=$(echo "$enc_data" | jq -r '.EncryptedClusterToken') + cluster_node_ip=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.IP') + Node_IP=$cluster_node_ip + cluster_node_ip_is_ready=$(echo "$enc_data" | jq -r '.ClusterIPIsReady') + if [ -n "$cluster_intf" ] && [ -n "$join_serverIP" ] && [ -n "$cluster_token" ] &&\ + [ -n "$cluster_node_ip" ] && [ "$cluster_node_ip_is_ready" = "true" ] &&\ + ( [ "$is_bootstrap" = "true" ] || [ "$is_bootstrap" = "false" ] ); then + return 0 + else + return 1 + fi +} + +# When transitioning from single node to cluster mode, need change the controller +# provided token for the cluster +change_to_new_token() { + if [ -n "$cluster_token" ]; then + /usr/bin/k3s token rotate --new-token "$cluster_token" + while true; do + if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then + logmsg "Token change has taken effect." + break + else + logmsg "Token has not taken effect yet. Sleeping for 2 seconds..." + sleep 2 + fi + done + else + # save the content of the token file + current_token=$(cat /var/lib/rancher/k3s/server/token) + + # let k3s generate a new token + /usr/bin/k3s token rotate + logmsg "Rotate Token by k3s." + + # loop to check if the token file has changed + while true; do + if grep -q "$current_token" /var/lib/rancher/k3s/server/token; then + logmsg "Token change has not taken effect yet. Sleeping for 2 seconds..." + sleep 2 + else + logmsg "Token change has taken effect." + break + fi + done + fi +} + +# monitor function to check if the cluster mode has changed, either from single node to cluster +# or from cluster to single node +check_cluster_config_change() { + + # only check the cluster change when it's fully initialized + if [ ! -f /var/lib/all_components_initialized ]; then + return 0 + fi + + if [ ! -f "$enc_status_file" ]; then + #logmsg "EdgeNodeClusterStatus file not found" + if [ ! -f /var/lib/edge-node-cluster-mode ]; then + return 0 + else + # check to see if the persistent config file exists, if yes, then we need to + # wait until zedkube to publish the ENC status file + if [ -f /persist/status/zedagent/EdgeNodeClusterConfig/global.json ]; then + logmsg "EdgeNodeClusterConfig file found, but the EdgeNodeClusterStatus file is missing, wait..." + return 0 + fi + touch /var/lib/convert-to-single-node + reboot + fi + else + # record we have seen this ENC status file + FoundENCStatus=true + if [ ! -f /var/lib/edge-node-cluster-mode ]; then + logmsg "EdgeNodeClusterStatus file found, but the node does not have edge-node-cluster-mode" + logmsg "*** check_cluster_config_change, before while loop. 
cluster_node_ip: $cluster_node_ip" # XXX + while true; do + if get_enc_status; then + logmsg "got the EdgeNodeClusterStatus successfully" + # mark it cluster mode before changing the config file + touch /var/lib/edge-node-cluster-mode + + # rotate the token with the new token + if [ "$is_bootstrap" = "true" ]; then + change_to_new_token + fi + + # remove previous multus config + remove_multus_cni + + # redo the multus config file in /etc/multus-daemonset-new.yaml + logmsg "Reapply Multus CNI for clusternodeip: $cluster_node_ip" + assign_multus_nodeip "$cluster_node_ip" + + # need to reapply node labels later + rm /var/lib/node-labels-initialized + + # kill the process and let the loop to restart k3s + terminate_k3s + # romove the /var/lib/rancher/k3s/server/tls directory files + if [ "$is_bootstrap" = "false" ]; then + rm -rf /var/lib/rancher/k3s/server/tls/* + # redo the debugger user role binding since certs are changed + rm /var/lib/debuguser-initialized + fi + + logmsg "privision config file for node to cluster mode" + provision_cluster_config_file true + + logmsg "WARNING: changing the node to cluster mode, done" + break + else + sleep 10 + fi + done + else + return 0 + fi + fi + logmsg "Check cluster config change done" +} + +monitor_cluster_config_change() { + while true; do + check_cluster_config_change + sleep 15 + done +} + +# provision the config.yaml and bootstrap-config.yaml for cluster node, passing $1 as k3s needs initailizing +provision_cluster_config_file() { +# prepare the config.yaml and bootstrap-config.yaml on node +bootstrapContent=$(cat <<- EOF +cluster-init: true +token: "${cluster_token}" +tls-san: + - "${join_serverIP}" +flannel-iface: "${cluster_intf}" +node-ip: "${cluster_node_ip}" +node-name: "${HOSTNAME}" +EOF + ) +serverContent=$(cat <<- EOF +server: "https://${join_serverIP}:6443" +token: "${cluster_token}" +flannel-iface: "${cluster_intf}" +node-ip: "${cluster_node_ip}" +node-name: "${HOSTNAME}" +EOF + ) + + # we have 2 conditions, one is we are the bootstrap node or not, the other is we are + # the first time configure k3s cluster or not. If both are true, then we need boostrap config + # otherwise, we just need normal server config to join the existing cluster + # check if is_bootstrap is true + if [ "$is_bootstrap" = "true" ]; then + #Bootstrap_Node=true + if [ "$1" = "true" ]; then + cp "$config_file" "$k3s_config_file" + echo "$bootstrapContent" >> "$k3s_config_file" + logmsg "bootstrap config.yaml configured with $join_serverIP and $HOSTNAME" + else # if we are in restart case, and we are the bootstrap node, wait for some other nodes to join + # we go here, means we can not find node to join the cluster, we have waited long enough + # but still put in the server config.yaml for now + logmsg "join the cluster, use server content config.yaml" + cp "$config_file" "$k3s_config_file" + #echo "$bootstrapContent" >> "$k3s_config_file" + echo "$serverContent" >> "$k3s_config_file" + fi + else + # non-bootstrap node, decide if we need to wait for the join server to be ready + #Bootstrap_Node=false + cp "$config_file" "$k3s_config_file" + echo "$serverContent" >> "$k3s_config_file" + logmsg "config.yaml configured with Join-ServerIP $join_serverIP and hostname $HOSTNAME" + if [ "$1" = true ]; then + logmsg "Check if the Endpoint https://$join_serverIP:6443 is in cluster mode, and wait if not..." 
+# Provision config.yaml and bootstrap-config.yaml for a cluster node,
+# passing $1 as whether k3s needs initializing
+provision_cluster_config_file() {
+# prepare the config.yaml and bootstrap-config.yaml on the node
+bootstrapContent=$(cat <<- EOF
+cluster-init: true
+token: "${cluster_token}"
+tls-san:
+  - "${join_serverIP}"
+flannel-iface: "${cluster_intf}"
+node-ip: "${cluster_node_ip}"
+node-name: "${HOSTNAME}"
+EOF
+    )
+serverContent=$(cat <<- EOF
+server: "https://${join_serverIP}:6443"
+token: "${cluster_token}"
+flannel-iface: "${cluster_intf}"
+node-ip: "${cluster_node_ip}"
+node-name: "${HOSTNAME}"
+EOF
+    )
+
+    # There are two conditions: whether we are the bootstrap node, and whether
+    # this is the first time the k3s cluster is configured. If both are true
+    # we need the bootstrap config; otherwise we just need the normal server
+    # config to join the existing cluster.
+    # check if is_bootstrap is true
+    if [ "$is_bootstrap" = "true" ]; then
+        #Bootstrap_Node=true
+        if [ "$1" = "true" ]; then
+            cp "$config_file" "$k3s_config_file"
+            echo "$bootstrapContent" >> "$k3s_config_file"
+            logmsg "bootstrap config.yaml configured with $join_serverIP and $HOSTNAME"
+        else
+            # Restart case on the bootstrap node: we could not find a node to
+            # join and have waited long enough, so put in the server
+            # config.yaml for now.
+            logmsg "join the cluster, use server content config.yaml"
+            cp "$config_file" "$k3s_config_file"
+            echo "$serverContent" >> "$k3s_config_file"
+        fi
+    else
+        # non-bootstrap node; decide if we need to wait for the join server to be ready
+        #Bootstrap_Node=false
+        cp "$config_file" "$k3s_config_file"
+        echo "$serverContent" >> "$k3s_config_file"
+        logmsg "config.yaml configured with Join-ServerIP $join_serverIP and hostname $HOSTNAME"
+        if [ "$1" = true ]; then
+            logmsg "Check if the Endpoint https://$join_serverIP:6443 is in cluster mode, and wait if not..."
+            # Check if the join server is reachable; wait here until it is ready
+            counter=0
+            touch "$CLUSTER_WAIT_FILE"
+            while true; do
+                if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then
+                    counter=$((counter+1))
+                    #logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
+                    # The endpoint answers; now check whether the bootstrap
+                    # server is in single-node or cluster mode
+                    status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status")
+                    if [ $? -ne 0 ]; then
+                        if [ $((counter % 30)) -eq 1 ]; then
+                            logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..."
+                        fi
+                    elif [ "$status" != "cluster" ]; then
+                        if [ $((counter % 30)) -eq 1 ]; then
+                            logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
+                        fi
+                    else
+                        logmsg "Server is in 'cluster' status. done"
+                        rm "$CLUSTER_WAIT_FILE"
+                        break
+                    fi
+                fi
+                sleep 10
+            done
+        else
+            logmsg "restart case with k3s already installed, no need to wait"
+        fi
+    fi
+}
+
 DATESTR=$(date)
 echo "========================== $DATESTR ==========================" >> $INSTALL_LOG
-echo "cluster-init.sh start for $HOSTNAME, uuid $DEVUUID" >> $INSTALL_LOG
 logmsg "Using ZFS persistent storage"

+setup_prereqs
+
+if [ -f /var/lib/convert-to-single-node ]; then
+    logmsg "remove /var/lib and copy the saved single-node /var/lib"
+    restore_var_lib
+    # assign the node-ip to the multus nodeIP for the yaml config file
+    assign_multus_nodeip
+fi
+# since we can wait for a long time, always start containerd first
+check_start_containerd
+logmsg "containerd started"
+
+# task running in the background to check if the cluster config has changed
+monitor_cluster_config_change &
+
+# If this is the first time running the install, we may need to wait for the
+# cluster config and status
+if [ ! -f /var/lib/all_components_initialized ]; then
+    logmsg "First time for k3s install"
+
+    # If we are in edge-node cluster mode, prepare the config.yaml and
+    # bootstrap-config.yaml; for single-node mode we basically use the
+    # existing config.yaml
+    if [ -f /var/lib/edge-node-cluster-mode ]; then
+        provision_cluster_config_file true
+    else
+        logmsg "Single node mode prepare config.yaml for $HOSTNAME"
+
+        # start from the existing config.yaml
+        cp "$config_file" "$k3s_config_file"
+    fi
+
+    # assign the node-ip to multus
+    assign_multus_nodeip "$cluster_node_ip"
+else # a restart case, found all_components_initialized
+    # k3s is already initialized and installed; regenerate the config.yaml
+    if [ -f /var/lib/edge-node-cluster-mode ]; then
+        logmsg "Cluster config case, restarted k3s node, wait for cluster config"
+        while true; do
+            if get_enc_status; then
+                logmsg "got the EdgeNodeClusterStatus successfully"
+                break
+            else
+                sleep 10
+            fi
+        done
+        # got the cluster config, make the config.yaml now
+        logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
+        provision_cluster_config_file false
+        logmsg "provision config.yaml done"
+    else # single node mode
+        logmsg "Single node mode, prepare config.yaml for $HOSTNAME"
+        cp "$config_file" "$k3s_config_file"
+        # append the hostname to the config.yaml
+        if ! grep -q node-name "$k3s_config_file"; then
+            echo "node-name: $HOSTNAME" >> "$k3s_config_file"
+        fi
+    fi
+fi
+
+# use part of /run/eve-release to get the OS-IMAGE string
+get_eve_os_release
+
 #Forever loop every 15 secs
 while true;
 do
 if [ ! -f /var/lib/all_components_initialized ]; then
-    if [ ! -f /var/lib/k3s_initialized ]; then
-        logmsg "Installing K3S version $K3S_VERSION on $HOSTNAME"
-        mkdir -p /var/lib/k3s/bin
-        /usr/bin/curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${K3S_VERSION} INSTALL_K3S_SKIP_ENABLE=true INSTALL_K3S_BIN_DIR=/var/lib/k3s/bin sh -
-        logmsg "Initializing K3S version $K3S_VERSION"
-        ln -s /var/lib/k3s/bin/* /usr/bin
-        trigger_k3s_selfextraction
-        touch /var/lib/k3s_initialized
-    fi
-
-    # Be kind to the API server
-    sleep 1
-
-    check_start_containerd
     if ! check_start_k3s; then
         continue
     fi

-    this_node_ready=$(kubectl get node "$HOSTNAME" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')
-    if [ "$this_node_ready" != "true" ]; then
+    # k3s just started and may have crashed immediately; retry with a bound
+    # instead of waiting forever
+    start_time=$(date +%s)
+    while [ $(($(date +%s) - start_time)) -lt 120 ]; do
+        node_count_ready=$(kubectl get "node/${HOSTNAME}" | grep -cw Ready )
+        if [ "$node_count_ready" -ne 1 ]; then
+            sleep 10
             continue
+        else
+            break
+        fi
+    done
+    if [ "$node_count_ready" -ne 1 ]; then
+        continue
     fi
-    node_uuid_len=$(kubectl get nodes -l node-uuid="$DEVUUID" -o json | jq '.items | length')
-    if [ "$node_uuid_len" -eq 0 ]; then
-        logmsg "set node label with uuid $DEVUUID"
-        kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
-    fi
+
+    # label the node with the device uuid
+    apply_node_uuid_label

     if ! are_all_pods_ready; then
         continue
     fi

-    if [ ! -f /var/lib/cni/bin ]; then
-        copy_cni_plugin_files
-    fi
     if [ ! -f /var/lib/multus_initialized ]; then
+        if [ ! -f /etc/multus-daemonset-new.yaml ]; then
+            assign_multus_nodeip "$cluster_node_ip"
+        fi
         apply_multus_cni
-        continue
+        if [ ! -f /var/lib/multus_initialized ]; then
+            logmsg "Failed to apply multus cni, wait a while"
+            sleep 10
+            continue
+        fi
     fi
     if ! pidof dhcp; then
+        # if dhcp.sock exists, the daemon cannot be restarted
+        if [ -f /run/cni/dhcp.sock ]; then
+            rm /run/cni/dhcp.sock
+        fi
         # launch CNI dhcp service
         /opt/cni/bin/dhcp daemon &
     fi
@@ -482,7 +788,10 @@ if [ ! -f /var/lib/all_components_initialized ]; then
         # This patched version will be removed once the following PR https://github.com/kubevirt/kubevirt/pull/9668 is merged
         logmsg "Installing patched Kubevirt"
         kubectl apply -f /etc/kubevirt-operator.yaml
+        logmsg "Updating replica to 1 for virt-operator and virt-controller"
+        kubectl patch deployment virt-operator -n kubevirt --patch '{"spec":{"replicas": 1 }}'
         kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/kubevirt-cr.yaml
+        kubectl patch KubeVirt kubevirt -n kubevirt --patch '{"spec": {"infra": {"replicas": 1}}}' --type='merge'

         wait_for_item "cdi"
         #CDI (containerzed data importer) is need to convert qcow2/raw formats to Persistent Volumes and Data volumes
@@ -498,39 +807,79 @@ if [ ! -f /var/lib/all_components_initialized ]; then
     fi

     if [ ! -f /var/lib/longhorn_initialized ]; then
-        wait_for_item "longhorn"
-        logmsg "Installing longhorn version ${LONGHORN_VERSION}"
-        apply_longhorn_disk_config "$HOSTNAME"
-        lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
-        if [ ! -e $lhCfgPath ]; then
-            curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
+        if [ ! -f /var/lib/longhorn_installing ]; then
+            wait_for_item "longhorn"
+            logmsg "Installing longhorn version ${LONGHORN_VERSION}"
+            apply_longhorn_disk_config "$HOSTNAME"
+            lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
+            if [ ! -e $lhCfgPath ]; then
+                curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
+            fi
+            if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
+                sed -i '/  default-setting.yaml: |-/a\    create-default-disk-labeled-nodes: true' "$lhCfgPath"
+            fi
+            kubectl apply -f "$lhCfgPath"
+            touch /var/lib/longhorn_installing
         fi
-        if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
-            sed -i '/  default-setting.yaml: |-/a\    create-default-disk-labeled-nodes: true' "$lhCfgPath"
+        lhStatus=$(kubectl -n longhorn-system get daemonsets -o json | jq '.items[].status | .numberReady==.desiredNumberScheduled' | tr -d '\n')
+        if [ "$lhStatus" = "truetruetrue" ]; then
+            logmsg "longhorn ready"
+            rm /var/lib/longhorn_installing
+            touch /var/lib/longhorn_initialized
         fi
-        kubectl apply -f "$lhCfgPath"
-        touch /var/lib/longhorn_initialized
     fi

-    if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
+    if [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
         logmsg "All components initialized"
+        touch /var/lib/node-labels-initialized
         touch /var/lib/all_components_initialized
+        sleep 5
+        logmsg "stop the k3s server and wait to copy /var/lib"
+        terminate_k3s
+        sync
+        sleep 5
+        save_var_lib
+        logmsg "saved the copy of /var/lib, done"
     fi
 else
-    check_start_containerd
     if ! check_start_k3s; then
-        while [ "$(kubectl get node "$HOSTNAME" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')" != "true" ];
-        do
-            sleep 5;
+        start_time=$(date +%s)
+        while [ $(($(date +%s) - start_time)) -lt 120 ]; do
+            node_count_ready=$(kubectl get "node/${HOSTNAME}" | grep -cw Ready )
+            if [ "$node_count_ready" -ne 1 ]; then
+                sleep 10
+                pgrep -f "k3s server" > /dev/null 2>&1
+                if [ $? -eq 1 ]; then
+                    break
+                fi
+                continue
+            else
+                break
+            fi
         done
+        if [ "$node_count_ready" -ne 1 ]; then
+            logmsg "Node not ready, continue to check_start_k3s"
+            continue
+        fi
+    else
+        if [ ! -f /var/lib/node-labels-initialized ]; then
+            reapply_node_labels
+        fi
         # Initialize CNI after k3s reboot
-        if [ ! -f /var/lib/cni/bin ]; then
+        if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
             copy_cni_plugin_files
         fi
         if [ ! -f /var/lib/multus_initialized ]; then
+            if [ ! -f /etc/multus-daemonset-new.yaml ]; then
+                assign_multus_nodeip "$cluster_node_ip"
+            fi
             apply_multus_cni
         fi
         if ! pidof dhcp; then
+            # if dhcp.sock exists, the daemon cannot be restarted
+            if [ -f /run/cni/dhcp.sock ]; then
+                rm /run/cni/dhcp.sock
+            fi
             # launch CNI dhcp service
             /opt/cni/bin/dhcp daemon &
         fi
@@ -538,9 +887,11 @@ else
         if [ ! -f /var/lib/debuguser-initialized ]; then
             config_cluster_roles
         else
-            cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml
+            if [ ! -e /run/.kube/k3s/user.yaml ]; then
+                cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml
+            fi
         fi
-    else
+
         if [ -e /var/lib/longhorn_initialized ]; then
             check_overwrite_nsmounter
         fi
@@ -554,6 +905,8 @@ fi
     check_log_file_size "k3s-install.log"
     check_log_file_size "eve-bridge.log"
     check_log_file_size "containerd-user.log"
+    check_kubeconfig_yaml_files
+    check_and_remove_excessive_k3s_logs
     check_and_run_vnc
     wait_for_item "wait"
     sleep 15
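# A quick sketch of the resulting log layout under /persist/kubelog (the file
# names follow the rotation helpers in cluster-utils.sh below; the timestamp
# is a made-up example):
#   k3s.log, k3s.log.1 .. k3s.log.3         5 MiB generations from check_log_file_size
#   k3s.log.restart.20241018-110855.2.gz    crash slice kept by save_crash_log
#   k3s-install.log, containerd-user.log    rotated the same way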
diff --git a/pkg/kube/cluster-utils.sh b/pkg/kube/cluster-utils.sh
new file mode 100755
index 0000000000..fbc034e38e
--- /dev/null
+++ b/pkg/kube/cluster-utils.sh
@@ -0,0 +1,273 @@
+#!/bin/sh
+#
+# Copyright (c) 2024 Zededa, Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+logmsg() {
+    local MSG
+    local TIME
+    MSG="$*"
+    TIME=$(date +"%F %T")
+    echo "$TIME : $MSG" >> $INSTALL_LOG
+}
+
+check_network_connection () {
+    while true; do
+        ret=$(curl -o /dev/null -w "%{http_code}" -s "https://get.k3s.io")
+        if [ "$ret" -eq 200 ]; then
+            logmsg "Network is ready."
+            break;
+        else
+            logmsg "Network is not yet ready"
+        fi
+        sleep 5
+    done
+}
+
+setup_cgroup () {
+    echo "cgroup /sys/fs/cgroup cgroup defaults 0 0" >> /etc/fstab
+}
+
+check_log_file_size() {
+    currentSize=$(wc -c <"$K3S_LOG_DIR/$1")
+    if [ "$currentSize" -gt "$LOG_SIZE" ]; then
+        if [ -f "$K3S_LOG_DIR/$1.2" ]; then
+            cp -p "$K3S_LOG_DIR/$1.2" "$K3S_LOG_DIR/$1.3"
+        fi
+        if [ -f "$K3S_LOG_DIR/$1.1" ]; then
+            cp -p "$K3S_LOG_DIR/$1.1" "$K3S_LOG_DIR/$1.2"
+        fi
+        # keep the original log file's attributes
+        cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
+        truncate -s 0 "$K3S_LOG_DIR/$1"
+        logmsg "k3s logfile $1, size $currentSize rotate"
+    fi
+}
+
+# Search for the last occurrence of the k3s starting string in the file and
+# gzip the content from that line to the end of the file; gzip the entire
+# file if the string is not found
+gzip_last_restart_part() {
+    fileBaseName=$1
+    targetFile=$2
+    searchString="Starting k3s $K3S_VERSION"
+
+    # Find the line number of the last occurrence of the search string, or 1 if not found
+    lastLine=$(grep -n -F "$searchString" "$fileBaseName" | tail -n 1 | cut -d: -f1)
+    lastLine=${lastLine:-1}
+
+    # Extract the content from the last occurrence of the search string to the end
+    tail -n "+$lastLine" "$fileBaseName" | gzip -k -9 -c > "$targetFile"
+}
+
+save_crash_log() {
+    if [ "$RESTART_COUNT" = "1" ]; then
+        return
+    fi
+
+    # add a timestamp to the filename for clear identification
+    timestamp=$(date +"%Y%m%d-%H%M%S")
+    # This pattern will alias with older crashes, but is also a simple way to contain log bloat
+    crashLogBaseName="${K3s_LOG_FILE}.restart.${timestamp}.${RESTART_COUNT}.gz"
+
+    gzip_last_restart_part "${K3S_LOG_DIR}/${K3s_LOG_FILE}" "${K3S_LOG_DIR}/${crashLogBaseName}"
+
+    # Find and list files matching the pattern
+    matching_files=$(ls "$K3S_LOG_DIR" | grep "${K3s_LOG_FILE}.restart.*.gz")
+    file_count=$(echo "$matching_files" | wc -w)
+
+    logmsg "total $file_count crash logs found in dir $K3S_LOG_DIR, file prefix $K3s_LOG_FILE"
+    if [ "$file_count" -gt 10 ]; then
+        files_to_delete=$(ls -t "${K3S_LOG_DIR}" | grep -E "${K3s_LOG_FILE}.restart.*.gz" | tail -n +11)
+        echo "$files_to_delete" | while read -r file; do
+            rm -f "${K3S_LOG_DIR}/${file}"
+        done
+    fi
+}
+
+# k3s can generate log files such as: k3s-2024-07-30T20-29-31.172.log.gz
+# They seem to be generated by raft operation warnings.
+# This check-and-remove prevents the log files from growing indefinitely:
+# keep the latest 10 log files and delete the rest.
+check_and_remove_excessive_k3s_logs() {
+
+    # Directory to search in
+    search_dir="$K3S_LOG_DIR"
+
+    # Regular expression pattern for date and time in the format YYYY-MM-DDTHH-MM-SS.mmm
+    pattern='k3s-[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2}-[0-9]{2}\.[0-9]{3}\.log\.gz'
+
+    # Find and list files matching the pattern
+    matching_files=$(find "$search_dir" -type f -name 'k3s-*.log.gz' | grep -E "$pattern")
+    file_count=$(echo "$matching_files" | wc -w)
+    if [ "$file_count" -gt 10 ]; then
+        files_to_delete=$(echo "$matching_files" | grep ".log.gz" | tail -n +11)
+        echo "$files_to_delete" | while read -r file; do
+            rm -f "${K3S_LOG_DIR}/${file}"
+        done
+    fi
+}
+
+# Kubernetes names must be lower case, with '-' instead of '_'
+convert_to_k8s_compatible() {
+    echo "$1" | tr '[:upper:]_' '[:lower:]-'
+}
+
+# Check if a string is a valid UUID
+is_valid_uuid() {
+    local uuid=$1
+    if echo "$uuid" | grep -qE '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'; then
+        return 0 # Valid UUID
+    else
+        return 1 # Invalid UUID
+    fi
+}
+
+remove_server_tls_dir() {
+    if [ -d /var/lib/rancher/k3s/server/tls ]; then
+        rm /var/lib/rancher/k3s/server/tls/request-header-ca.key
+        rm /var/lib/rancher/k3s/server/tls/server-ca.key
+        rm /var/lib/rancher/k3s/server/tls/etcd/peer-ca.key
+        rm /var/lib/rancher/k3s/server/tls/etcd/server-ca.crt
+        rm /var/lib/rancher/k3s/server/tls/request-header-ca.crt
+        rm /var/lib/rancher/k3s/server/tls/etcd/server-ca.key
+        rm /var/lib/rancher/k3s/server/cred/ipsec.psk
+        rm /var/lib/rancher/k3s/server/tls/server-ca.crt
+        rm /var/lib/rancher/k3s/server/tls/service.key
+        rm /var/lib/rancher/k3s/server/tls/client-ca.crt
+        rm /var/lib/rancher/k3s/server/tls/client-ca.key
+        rm /var/lib/rancher/k3s/server/tls/etcd/peer-ca.crt
+    fi
+}
+
+remove_multus_cni() {
+    kubectl delete -f /etc/multus-daemonset-new.yaml
+    rm /etc/multus-daemonset-new.yaml
+    rm /var/lib/multus_initialized
+}
+
+# save /var/lib to /persist/kube-save-var-lib
+save_var_lib() {
+    local dest_dir="${SAVE_KUBE_VAR_LIB_DIR}"
+    # Check if the destination directory exists; if not, create it
+    if [ ! -d "$dest_dir" ]; then
+        mkdir -p "$dest_dir"
+    fi
+
+    # Remove everything in the destination directory
+    rm -rf "${dest_dir:?}"/*
+
+    # Copy all contents from /var/lib to the destination directory
+    cp -a /var/lib/. "$dest_dir"
+}
+
+# Restore the contents of /persist/kube-save-var-lib back to /var/lib
+restore_var_lib() {
+    local source_dir="${SAVE_KUBE_VAR_LIB_DIR}"
+    # Remove everything under /var/lib
+    rm -rf /var/lib/*
+
+    # Copy everything from /persist/kube-save-var-lib back to /var/lib
+    if [ -d "$source_dir" ]; then
+        cp -a "${source_dir}/." /var/lib
+    else
+        # the saved files are missing, we have to do the install again
+        Update_CheckNodeComponents
+    fi
+}
+
+# When transitioning from single-node to cluster mode, the k3s.yaml file may
+# need to change, with new certificates
+check_kubeconfig_yaml_files() {
+    file1="/etc/rancher/k3s/k3s.yaml"
+    file2="/run/.kube/k3s/k3s.yaml"
+
+    if ! cmp -s "$file1" "$file2"; then
+        logmsg "k3s.yaml files are different, copying $file1 to $file2"
+        cp "$file1" "$file2"
+    fi
+}
+
+# Get the OS-IMAGE name from /run/eve-release
+get_eve_os_release() {
+    # Wait for /run/eve-release to appear
+    while [ ! -f /run/eve-release ]; do
+        sleep 1
+    done
+
+    # Read the original name from /run/eve-release
+    eve_image_name=$(cat /run/eve-release)
+
+    logmsg "EVE Release: $eve_image_name, write to /etc/os-release"
+    # Write the short name to /etc/os-release
+    echo "PRETTY_NAME=\"$eve_image_name\"" > /etc/os-release
+}
+
+terminate_k3s() {
+    # Find the process ID of 'k3s server'
+    pid=$(pgrep -f 'k3s server')
+
+    # If the process exists, kill it
+    if [ -n "$pid" ]; then
+        logmsg "Killing 'k3s server' process with PID: $pid"
+        kill "$pid"
+    else
+        logmsg "'k3s server' process not found"
+    fi
+}
+
+# wait for a debugging flag in /persist/k3s/wait_{flagname}, if it exists
+wait_for_item() {
+    filename="/persist/k3s/wait_$1"
+    processname="k3s server"
+    while [ -e "$filename" ]; do
+        k3sproc=""
+        if pgrep -x "$processname" > /dev/null; then
+            k3sproc="k3s server is running"
+        else
+            k3sproc="k3s server is NOT running"
+        fi
+        logmsg "Found $filename file. $k3sproc, Waiting for 60 seconds..."
+        sleep 60
+    done
+}
+
+wait_for_device_name() {
+    logmsg "Waiting for DeviceName from controller..."
+    EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json"
+    while [ ! -f $EdgeNodeInfoPath ]; do
+        sleep 5
+    done
+    dName=$(jq -r '.DeviceName' $EdgeNodeInfoPath)
+    if [ -n "$dName" ]; then
+        HOSTNAME=$(convert_to_k8s_compatible "$dName")
+    fi
+
+    # we should have the uuid since we got the device name
+    while true; do
+        DEVUUID=$(/bin/hostname)
+        if is_valid_uuid "$DEVUUID"; then
+            logmsg "got valid Device UUID: $DEVUUID"
+            break
+        else
+            sleep 5
+        fi
+    done
+
+    if ! grep -q node-name /etc/rancher/k3s/config.yaml; then
+        echo "node-name: $HOSTNAME" >> /etc/rancher/k3s/config.yaml
+    fi
+    logmsg "Hostname: $HOSTNAME"
+}
+
+wait_for_default_route() {
+    while read -r iface dest gw flags refcnt use metric mask mtu window irtt; do
+        if [ "$dest" = "00000000" ] && [ "$mask" = "00000000" ]; then
+            logmsg "Default route found"
+            return 0
+        fi
+        logmsg "waiting for default route $iface $dest $gw $flags $refcnt $use $metric $mask $mtu $window $irtt"
+        sleep 1
+    done < /proc/net/route
+    return 1
+}