diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 08ec9eee4..2aad660bc 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -230,4 +230,6 @@ jobs:
         with:
           name: smoketests-${{ matrix.smoke-suite }}-files
           path: |
-            /tmp/${{ matrix.smoke-suite }}-k0smotron.log
+            /tmp/${{ matrix.smoke-suite }}-k0smotron-bootstrap.log
+            /tmp/${{ matrix.smoke-suite }}-k0smotron-control-plane.log
+            /tmp/${{ matrix.smoke-suite }}-k0smotron-infrastructure.log
diff --git a/api/controlplane/v1beta1/k0s_types.go b/api/controlplane/v1beta1/k0s_types.go
index d485dcf3b..68f8d9d1a 100644
--- a/api/controlplane/v1beta1/k0s_types.go
+++ b/api/controlplane/v1beta1/k0s_types.go
@@ -47,6 +47,10 @@ type K0sControlPlaneSpec struct {
 	//+kubebuilder:validation:Optional
 	//+kubebuilder:default=1
 	Replicas int32 `json:"replicas,omitempty"`
+	//+kubebuilder:validation:Optional
+	//+kubebuilder:validation:Enum=rollout;inplace
+	//+kubebuilder:default=inplace
+	UpdateStrategy string `json:"updateStrategy,omitempty"`
 	// Version defines the k0s version to be deployed. You can use a specific k0s version (e.g. v1.27.1+k0s.0) or
 	// just the Kubernetes version (e.g. v1.27.1). If left empty, k0smotron will select one automatically.
 	//+kubebuilder:validation:Optional
diff --git a/config/clusterapi/controlplane/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml b/config/clusterapi/controlplane/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
index 8a2596d36..5312ef971 100644
--- a/config/clusterapi/controlplane/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
+++ b/config/clusterapi/controlplane/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
@@ -206,6 +206,13 @@ spec:
                 default: 1
                 format: int32
                 type: integer
+              updateStrategy:
+                default: inplace
+                enum:
+                - rollout
+                - inplace
+                type: string
               version:
                 description: |-
                   Version defines the k0s version to be deployed. You can use a specific k0s version (e.g. v1.27.1+k0s.0) or
diff --git a/config/crd/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml b/config/crd/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
index 8a2596d36..5312ef971 100644
--- a/config/crd/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
+++ b/config/crd/bases/controlplane.cluster.x-k8s.io_k0scontrolplanes.yaml
@@ -206,6 +206,13 @@ spec:
                 default: 1
                 format: int32
                 type: integer
+              updateStrategy:
+                default: inplace
+                enum:
+                - rollout
+                - inplace
+                type: string
               version:
                 description: |-
                   Version defines the k0s version to be deployed. You can use a specific k0s version (e.g. v1.27.1+k0s.0) or
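A rough illustration of the new field from the Go API, not part of the patch: the import path is an assumption, and the two values shown are the ones the `Enum=rollout;inplace` marker above permits. Per the controller changes below, `inplace` (the default) upgrades controllers through an autopilot plan, while `rollout` replaces the control plane machines.

```go
package main

import (
	"fmt"

	// Assumed import path; adjust to the actual k0smotron module.
	cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1"
)

func main() {
	spec := cpv1beta1.K0sControlPlaneSpec{
		Replicas: 3,
		Version:  "v1.27.1+k0s.0",
		// Omit UpdateStrategy to get the defaulted "inplace" behavior.
		UpdateStrategy: "rollout",
	}
	fmt.Printf("replicas=%d strategy=%s\n", spec.Replicas, spec.UpdateStrategy)
}
```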
diff --git a/docs/resource-reference.md b/docs/resource-reference.md
index b77802a70..12f2af44a 100644
--- a/docs/resource-reference.md
+++ b/docs/resource-reference.md
@@ -1180,6 +1180,15 @@ Resource Types:
             <i>Default</i>: 1<br/>
         </td>
         <td>false</td>
       </tr><tr>
+        <td><b>updateStrategy</b></td>
+        <td>string</td>
+        <td>
+          <br/>
+          <br/>
+            <i>Default</i>: inplace<br/>
+        </td>
+        <td>false</td>
+      </tr><tr>
         <td><b>version</b></td>
         <td>string</td>
diff --git a/internal/controller/bootstrap/controlplane_bootstrap_controller.go b/internal/controller/bootstrap/controlplane_bootstrap_controller.go
index c3786af26..afcf37816 100644
--- a/internal/controller/bootstrap/controlplane_bootstrap_controller.go
+++ b/internal/controller/bootstrap/controlplane_bootstrap_controller.go
@@ -40,6 +40,7 @@ import (
 	"sigs.k8s.io/cluster-api/util"
 	capiutil "sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/annotations"
+	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/secret"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -179,14 +180,19 @@ func (c *ControlPlaneController) Reconcile(ctx context.Context, req ctrl.Request
 		return ctrl.Result{}, fmt.Errorf("control plane endpoint is not set")
 	}
 
-	if strings.HasSuffix(config.Name, "-0") {
+	machines, err := collections.GetFilteredMachinesForCluster(ctx, c.Client, cluster, collections.ControlPlaneMachines(cluster.Name), collections.ActiveMachines)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("error collecting machines: %w", err)
+	}
+
+	if machines.Oldest().Name == config.Name {
 		files, err = c.genInitialControlPlaneFiles(ctx, scope, files)
 		if err != nil {
 			return ctrl.Result{}, fmt.Errorf("error generating initial control plane files: %v", err)
 		}
 		installCmd = createCPInstallCmd(config)
 	} else {
-		files, err = c.genControlPlaneJoinFiles(ctx, scope, config, files)
+		files, err = c.genControlPlaneJoinFiles(ctx, scope, files, machines.Oldest())
 		if err != nil {
 			return ctrl.Result{}, fmt.Errorf("error generating control plane join files: %v", err)
 		}
@@ -289,7 +295,7 @@ func (c *ControlPlaneController) genInitialControlPlaneFiles(ctx context.Context
 	return files, nil
 }
 
-func (c *ControlPlaneController) genControlPlaneJoinFiles(ctx context.Context, scope *Scope, config *bootstrapv1.K0sControllerConfig, files []cloudinit.File) ([]cloudinit.File, error) {
+func (c *ControlPlaneController) genControlPlaneJoinFiles(ctx context.Context, scope *Scope, files []cloudinit.File, firstControllerMachine *clusterv1.Machine) ([]cloudinit.File, error) {
 	log := log.FromContext(ctx).WithValues("K0sControllerConfig cluster", scope.Cluster.Name)
 
 	_, ca, err := c.getCerts(ctx, scope)
@@ -316,7 +322,7 @@
 		return nil, err
 	}
 
-	host, err := c.findFirstControllerIP(ctx, config)
+	host, err := c.findFirstControllerIP(ctx, firstControllerMachine)
 	if err != nil {
 		log.Error(err, "Failed to get controller IP")
 		return nil, err
@@ -526,22 +532,9 @@ func createCPInstallCmdWithJoinToken(config *bootstrapv1.K0sControllerConfig, to
 	return strings.Join(installCmd, " ")
 }
 
-func (c *ControlPlaneController) findFirstControllerIP(ctx context.Context, config *bootstrapv1.K0sControllerConfig) (string, error) {
-	// Dirty first controller name generation
-	nameParts := strings.Split(config.Name, "-")
-	nameParts[len(nameParts)-1] = "0"
-	name := strings.Join(nameParts, "-")
-	machine, machineImpl, err := c.getMachineImplementation(ctx, name, config)
-	if err != nil {
-		return "", fmt.Errorf("error getting machine implementation: %w", err)
-	}
-	addresses, found, err := unstructured.NestedSlice(machineImpl.UnstructuredContent(), "status", "addresses")
-	if err != nil {
-		return "", err
-	}
-
+func (c *ControlPlaneController) findFirstControllerIP(ctx context.Context, firstControllerMachine *clusterv1.Machine) (string, error) {
 	extAddr, intAddr := "", ""
-	for _, addr := range machine.Status.Addresses {
+	for _, addr := range firstControllerMachine.Status.Addresses {
 		if addr.Type == clusterv1.MachineExternalIP {
 			extAddr = addr.Address
 			break
@@ -552,16 +545,29 @@ func (c *ControlPlaneController) findFirstControllerIP(ctx context.Context, conf
 		}
 	}
 
-	if found {
-		for _, addr := range addresses {
-			addrMap, _ := addr.(map[string]interface{})
-			if addrMap["type"] == string(v1.NodeExternalIP) {
-				extAddr = addrMap["address"].(string)
-				break
-			}
-			if addrMap["type"] == string(v1.NodeInternalIP) {
-				intAddr = addrMap["address"].(string)
-				break
+	name := firstControllerMachine.Name
+
+	if extAddr == "" && intAddr == "" {
+		machineImpl, err := c.getMachineImplementation(ctx, firstControllerMachine)
+		if err != nil {
+			return "", fmt.Errorf("error getting machine implementation: %w", err)
+		}
+		addresses, found, err := unstructured.NestedSlice(machineImpl.UnstructuredContent(), "status", "addresses")
+		if err != nil {
+			return "", err
+		}
+
+		if found {
+			for _, addr := range addresses {
+				addrMap, _ := addr.(map[string]interface{})
+				if addrMap["type"] == string(v1.NodeExternalIP) {
+					extAddr = addrMap["address"].(string)
+					break
+				}
+				if addrMap["type"] == string(v1.NodeInternalIP) {
+					intAddr = addrMap["address"].(string)
+					break
+				}
 			}
 		}
 	}
@@ -577,13 +583,7 @@ func (c *ControlPlaneController) findFirstControllerIP(ctx context.Context, conf
 	return "", fmt.Errorf("no address found for machine %s", name)
 }
 
-func (c *ControlPlaneController) getMachineImplementation(ctx context.Context, name string, config *bootstrapv1.K0sControllerConfig) (*clusterv1.Machine, *unstructured.Unstructured, error) {
-	var machine clusterv1.Machine
-	err := c.Get(ctx, client.ObjectKey{Name: name, Namespace: config.Namespace}, &machine)
-	if err != nil {
-		return nil, nil, fmt.Errorf("error getting machine object: %w", err)
-	}
-
+func (c *ControlPlaneController) getMachineImplementation(ctx context.Context, machine *clusterv1.Machine) (*unstructured.Unstructured, error) {
 	infRef := machine.Spec.InfrastructureRef
 
 	machineImpl := new(unstructured.Unstructured)
@@ -593,11 +593,11 @@
 	key := client.ObjectKey{Name: infRef.Name, Namespace: infRef.Namespace}
 
-	err = c.Get(ctx, key, machineImpl)
+	err := c.Get(ctx, key, machineImpl)
 	if err != nil {
-		return nil, nil, fmt.Errorf("error getting machine implementation object: %w", err)
+		return nil, fmt.Errorf("error getting machine implementation object: %w", err)
 	}
 
-	return &machine, machineImpl, nil
+	return machineImpl, nil
 }
 
 func genShutdownServiceFiles() []cloudinit.File {
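A minimal sketch (not the patch itself) of the bootstrap-node selection the controller switches to above: rather than inferring the first controller from a `-0` name suffix, it collects the cluster's active control-plane Machines and treats the oldest as the node that runs the initial `k0s install controller`. The helper name is hypothetical, and unlike the hunk above it guards against an empty collection, where `machines.Oldest()` returns nil.

```go
package bootstrap

import (
	"context"
	"fmt"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// isBootstrapController reports whether the config named configName belongs to
// the oldest active control-plane Machine, i.e. the machine that initializes
// the control plane while all later machines join with a token.
func isBootstrapController(ctx context.Context, c client.Client, cluster *clusterv1.Cluster, configName string) (bool, error) {
	machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster,
		collections.ControlPlaneMachines(cluster.Name), collections.ActiveMachines)
	if err != nil {
		return false, fmt.Errorf("error collecting machines: %w", err)
	}
	oldest := machines.Oldest() // ordered by creation timestamp; nil for an empty collection
	if oldest == nil {
		return false, fmt.Errorf("no active control plane machines for cluster %s", cluster.Name)
	}
	return oldest.Name == configName, nil
}
```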
diff --git a/internal/controller/controlplane/k0s_controlplane_controller.go b/internal/controller/controlplane/k0s_controlplane_controller.go
index fe5ebb3c1..ea634cdbe 100644
--- a/internal/controller/controlplane/k0s_controlplane_controller.go
+++ b/internal/controller/controlplane/k0s_controlplane_controller.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"github.com/Masterminds/semver"
 	"github.com/google/uuid"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -27,6 +28,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apiserver/pkg/storage/names"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/utils/pointer"
@@ -34,6 +36,7 @@ import (
 	kubeadmbootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
 	capiutil "sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/annotations"
+	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/kubeconfig"
 	"sigs.k8s.io/cluster-api/util/secret"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -210,63 +213,40 @@ func (c *K0sController) reconcile(ctx context.Context, cluster *clusterv1.Cluste
 
 func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv1.Cluster, kcp *cpv1beta1.K0sControlPlane) (int32, error) {
 	replicasToReport := kcp.Spec.Replicas
 
-	// TODO: Scale down machines if needed
-	if kcp.Status.Replicas > kcp.Spec.Replicas {
-		kubeClient, err := c.getKubeClient(ctx, cluster)
-		if err != nil {
-			return replicasToReport, fmt.Errorf("error getting cluster client set for deletion: %w", err)
-		}
-		// Remove the last machine and report the new number of replicas to status
-		// On the next reconcile, the next machine will be removed
-		if kcp.Status.Replicas > kcp.Spec.Replicas {
-			// Wait for the previous machine to be deleted to avoid etcd issues
-			previousMachineName := machineName(kcp.Name, int(kcp.Status.Replicas))
-			exist, err := c.machineExist(ctx, previousMachineName, kcp)
-			if err != nil {
-				return kcp.Status.Replicas, fmt.Errorf("error checking machine existance: %w", err)
-			}
-			if exist {
-				return kcp.Status.Replicas, fmt.Errorf("waiting for previous machine to be deleted")
-			}
-
-			replicasToReport = kcp.Status.Replicas - 1
-			name := machineName(kcp.Name, int(kcp.Status.Replicas-1))
-
-			if err := c.markChildControlNodeToLeave(ctx, name, kubeClient); err != nil {
-				return replicasToReport, fmt.Errorf("error marking controlnode to leave: %w", err)
-			}
-
-			if err := c.deleteBootstrapConfig(ctx, name, kcp); err != nil {
-				return replicasToReport, fmt.Errorf("error deleting machine from template: %w", err)
-			}
-
-			if err := c.deleteMachineFromTemplate(ctx, name, cluster, kcp); err != nil {
-				return replicasToReport, fmt.Errorf("error deleting machine from template: %w", err)
-			}
-
-			if err := c.deleteMachine(ctx, name, kcp); err != nil {
-				return replicasToReport, fmt.Errorf("error deleting machine from template: %w", err)
-			}
+	machines, err := collections.GetFilteredMachinesForCluster(ctx, c, cluster, collections.ControlPlaneMachines(cluster.Name), collections.ActiveMachines)
+	if err != nil {
+		return replicasToReport, fmt.Errorf("error collecting machines: %w", err)
+	}
 
-			return replicasToReport, nil
-		}
+	currentReplicas := machines.Len()
+	desiredReplicas := kcp.Spec.Replicas
+	machinesToDelete := 0
+	if currentReplicas > int(desiredReplicas) {
+		machinesToDelete = currentReplicas - int(desiredReplicas)
+		replicasToReport = kcp.Status.Replicas
 	}
 
 	if kcp.Status.Version != "" && kcp.Spec.Version != kcp.Status.Version {
-		kubeClient, err := c.getKubeClient(ctx, cluster)
-		if err != nil {
-			return replicasToReport, fmt.Errorf("error getting cluster client set for machine update: %w", err)
-		}
+		if kcp.Spec.UpdateStrategy == "rollout" {
+			desiredReplicas += kcp.Spec.Replicas
+			machinesToDelete = int(kcp.Spec.Replicas)
+			replicasToReport = desiredReplicas
+		} else {
+			kubeClient, err := c.getKubeClient(ctx, cluster)
+			if err != nil {
+				return replicasToReport, fmt.Errorf("error getting cluster client set for machine update: %w", err)
+			}
 
-		err = c.createAutopilotPlan(ctx, kcp, kubeClient)
-		if err != nil {
-			return replicasToReport, fmt.Errorf("error creating autopilot plan: %w", err)
+			err = c.createAutopilotPlan(ctx, kcp, kubeClient)
+			if err != nil {
+				return replicasToReport, fmt.Errorf("error creating autopilot plan: %w", err)
+			}
 		}
 	}
 
-	for i := 0; i < int(kcp.Spec.Replicas); i++ {
-		name := machineName(kcp.Name, i)
+	for i := currentReplicas; i < int(desiredReplicas); i++ {
+		name := names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%s-%d", kcp.Name, i))
 
 		machineFromTemplate, err := c.createMachineFromTemplate(ctx, name, cluster, kcp)
 		if err != nil {
@@ -291,6 +271,59 @@
 		}
 	}
 
+	ver := semver.MustParse(kcp.Spec.Version)
+	var isNewMachineReady bool
+	for _, m := range machines {
+		if m.Spec.Version != nil && *m.Spec.Version == fmt.Sprintf("%d.%d.%d", ver.Major(), ver.Minor(), ver.Patch()) {
+			continue
+		}
+		if m.Status.Phase == string(clusterv1.MachinePhaseProvisioning) ||
+			m.Status.Phase == string(clusterv1.MachinePhaseProvisioned) ||
+			m.Status.Phase == string(clusterv1.MachinePhaseRunning) {
+			isNewMachineReady = true
+			break
+		}
+	}
+
+	if machinesToDelete > 0 && !isNewMachineReady {
+		return replicasToReport, fmt.Errorf("waiting for new machines")
+	}
+
+	if machinesToDelete > 0 {
+		kubeClient, err := c.getKubeClient(ctx, cluster)
+		if err != nil {
+			return replicasToReport, fmt.Errorf("error getting cluster client set for deletion: %w", err)
+		}
+
+		// Remove the oldest machine and report the new number of replicas in the status.
+		// On the next reconcile, the next machine will be removed.
+		// Wait for the previous machine to be deleted to avoid etcd issues.
+		machine := machines.Oldest()
+		if machine.Status.Phase == string(clusterv1.MachinePhaseDeleting) {
+			return kcp.Status.Replicas, fmt.Errorf("waiting for previous machine to be deleted")
+		}
+
+		replicasToReport--
+		name := machine.Name
+		if err := c.markChildControlNodeToLeave(ctx, name, kubeClient); err != nil {
+			return replicasToReport, fmt.Errorf("error marking controlnode to leave: %w", err)
+		}
+
+		if err := c.deleteBootstrapConfig(ctx, name, kcp); err != nil {
+			return replicasToReport, fmt.Errorf("error deleting bootstrap config: %w", err)
+		}
+
+		if err := c.deleteMachineFromTemplate(ctx, name, cluster, kcp); err != nil {
+			return replicasToReport, fmt.Errorf("error deleting machine from template: %w", err)
+		}
+
+		if err := c.deleteMachine(ctx, name, kcp); err != nil {
+			return replicasToReport, fmt.Errorf("error deleting machine: %w", err)
+		}
+
+		return replicasToReport, nil
+	}
+
 	return replicasToReport, nil
 }
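The loop above skips machines that already run the target version; a self-contained sketch of that normalization, assuming Masterminds/semver's v1 API. `machineUpToDate` is a hypothetical helper — the controller inlines the comparison and uses `MustParse` — and the diff compares `Machine.Spec.Version` against a plain `major.minor.patch` string, since `kcp.Spec.Version` may carry a k0s suffix such as `v1.27.1+k0s.0`.

```go
package main

import (
	"fmt"

	"github.com/Masterminds/semver"
)

// machineUpToDate reports whether a Machine's version already matches the
// desired control plane version, ignoring the "v" prefix and build metadata.
func machineUpToDate(machineVersion *string, kcpVersion string) (bool, error) {
	ver, err := semver.NewVersion(kcpVersion) // NewVersion avoids MustParse's panic in a reconcile loop
	if err != nil {
		return false, fmt.Errorf("invalid control plane version %q: %w", kcpVersion, err)
	}
	want := fmt.Sprintf("%d.%d.%d", ver.Major(), ver.Minor(), ver.Patch())
	return machineVersion != nil && *machineVersion == want, nil
}

func main() {
	v := "1.27.1"
	upToDate, err := machineUpToDate(&v, "v1.27.1+k0s.0")
	fmt.Println(upToDate, err) // true <nil>
}
```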
diff --git a/internal/controller/infrastructure/remote_machine_controller.go b/internal/controller/infrastructure/remote_machine_controller.go
index 76c5b7c63..3fe31cfe5 100644
--- a/internal/controller/infrastructure/remote_machine_controller.go
+++ b/internal/controller/infrastructure/remote_machine_controller.go
@@ -332,7 +332,7 @@ func (r *RemoteMachineController) returnMachineToPool(ctx context.Context, rm *i
 		}
 	}
 	log := log.FromContext(ctx).WithValues("remotemachine", rm.Name)
-	log.Error(fmt.Errorf("no pooled machine found for remote machine"), rm.Namespace, rm.Name)
+	log.Error(fmt.Errorf("no pooled machine found for remote machine"), "failed to find pooled remote machine", "namespace", rm.Namespace, "name", rm.Name)
 	return nil
 }
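For context, the corrected call follows the logr convention used throughout controller-runtime: `Error` takes the error, a constant message, and alternating key/value pairs, whereas the old call passed namespace and name as positional message arguments. A runnable sketch of the same pattern:

```go
package main

import (
	"fmt"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

func main() {
	ctrl.SetLogger(zap.New())
	log := ctrl.Log.WithName("remotemachine")

	err := fmt.Errorf("no pooled machine found for remote machine")
	// The message stays constant; variable data goes into key/value pairs,
	// which keeps the output structured and searchable.
	log.Error(err, "failed to find pooled remote machine", "namespace", "default", "name", "rm-0")
}
```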
diff --git a/inttest/capi-remote-machine-template-update/capi_remote_machine_template_update_test.go b/inttest/capi-remote-machine-template-update/capi_remote_machine_template_update_test.go
index ce9566955..58d8402d2 100644
--- a/inttest/capi-remote-machine-template-update/capi_remote_machine_template_update_test.go
+++ b/inttest/capi-remote-machine-template-update/capi_remote_machine_template_update_test.go
@@ -184,7 +184,7 @@ func (s *RemoteMachineTemplateUpdateSuite) TestCAPIRemoteMachine() {
 	s.Require().NoError(err)
 
 	s.T().Log("waiting for node to be ready")
-	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, "remote-test-0", corev1.ConditionTrue))
+	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, rmName, corev1.ConditionTrue))
 
 	s.T().Log("update cluster")
 	s.updateCluster()
@@ -200,7 +200,7 @@
 	s.Require().NoError(err)
 
 	s.T().Log("waiting for node to be ready in updated cluster")
-	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, "remote-test-0", corev1.ConditionTrue))
+	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, rmName, corev1.ConditionTrue))
 }
 
 func (s *RemoteMachineTemplateUpdateSuite) findRemoteMachines(namespace string) ([]infra.RemoteMachine, error) {
@@ -230,7 +230,7 @@
 }
 
 func (s *RemoteMachineTemplateUpdateSuite) deleteCluster() {
-	out, err := exec.Command("kubectl", "delete", "-f", s.clusterYamlsPath).CombinedOutput()
+	out, err := exec.Command("kubectl", "delete", "cluster", "remote-test-cluster").CombinedOutput()
 	s.Require().NoError(err, "failed to delete cluster objects: %s", string(out))
 }
 
@@ -269,8 +269,8 @@
 	s.Require().NoError(os.WriteFile(s.clusterYamlsPath, bytes, 0644))
 
 	out, err := exec.Command("kubectl", "apply", "-f", s.clusterYamlsPath).CombinedOutput()
+	s.Require().NoError(err, "failed to apply cluster objects: %s", string(out))
 
 	s.Require().NoError(os.WriteFile(s.updatedClusterYamlsPath, []byte(updatedClusterYaml), 0644))
-	s.Require().NoError(err, "failed to update cluster objects: %s", string(out))
 }
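Both test suites now discover the generated RemoteMachine name instead of hard-coding `remote-test-0`, since machine names come from `names.SimpleNameGenerator`. A self-contained sketch of that polling pattern, with the suite helper stubbed out; note the condition returns false while the list is empty so polling continues, which is also what the hunk below does.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// findRemoteMachines stands in for the suite helper added in this patch;
// it is stubbed here (with a made-up name) so the sketch compiles on its own.
func findRemoteMachines(namespace string) ([]string, error) {
	return []string{"remote-test-7k2dq"}, nil
}

func main() {
	ctx := context.Background()
	var rmName string
	// nolint:staticcheck // deprecated helper, but it matches what the tests use
	err := wait.PollImmediateUntilWithContext(ctx, 1*time.Second, func(ctx context.Context) (bool, error) {
		machines, err := findRemoteMachines("default")
		if err != nil {
			return false, err // a real error aborts the poll
		}
		if len(machines) == 0 {
			return false, nil // nothing created yet: keep polling
		}
		rmName = machines[0]
		return true, nil
	})
	if err != nil {
		panic(err)
	}
	fmt.Println("discovered RemoteMachine:", rmName)
}
```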
diff --git a/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go b/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go
index 030352bf6..05abada61 100644
--- a/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go
+++ b/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go
@@ -151,10 +151,26 @@ func (s *RemoteMachineTemplateSuite) TestCAPIRemoteMachine() {
 	s.Require().NoError(err)
 
 	// Verify the RemoteMachine is at expected state
+	var rmName string
+	// nolint:staticcheck
+	err = wait.PollImmediateUntilWithContext(ctx, 1*time.Second, func(ctx context.Context) (bool, error) {
+		rm, err := s.findRemoteMachines("default")
+		if err != nil {
+			return false, err
+		}
+
+		if len(rm) == 0 {
+			return false, nil
+		}
+
+		rmName = rm[0].GetName()
+		return true, nil
+	})
+	s.Require().NoError(err)
 	// nolint:staticcheck
 	err = wait.PollImmediateUntilWithContext(ctx, 1*time.Second, func(ctx context.Context) (bool, error) {
-		rm, err := s.getRemoteMachine("remote-test-0", "default")
+		rm, err := s.getRemoteMachine(rmName, "default")
 		if err != nil {
 			return false, err
 		}
@@ -165,10 +181,10 @@ func (s *RemoteMachineTemplateSuite) TestCAPIRemoteMachine() {
 	s.Require().NoError(err)
 
 	s.T().Log("waiting for node to be ready")
-	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, "remote-test-0", corev1.ConditionTrue))
+	s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, rmName, corev1.ConditionTrue))
 
 	s.T().Log("deleting node from cluster")
-	s.Require().NoError(s.deleteRemoteMachine("remote-test-0", "default"))
+	s.Require().NoError(s.deleteRemoteMachine(rmName, "default"))
 
 	nodes, err := kmcKC.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
 	s.Require().NoError(err)
@@ -176,6 +192,19 @@ func (s *RemoteMachineTemplateSuite) TestCAPIRemoteMachine() {
 }
 
+func (s *RemoteMachineTemplateSuite) findRemoteMachines(namespace string) ([]infra.RemoteMachine, error) {
+	apiPath := fmt.Sprintf("/apis/infrastructure.cluster.x-k8s.io/v1beta1/namespaces/%s/remotemachines", namespace)
+	result, err := s.client.RESTClient().Get().AbsPath(apiPath).DoRaw(s.Context())
+	if err != nil {
+		return nil, err
+	}
+	rm := &infra.RemoteMachineList{}
+	if err := yaml.Unmarshal(result, rm); err != nil {
+		return nil, err
+	}
+	return rm.Items, nil
+}
+
 func (s *RemoteMachineTemplateSuite) getRemoteMachine(name string, namespace string) (*infra.RemoteMachine, error) {
 	apiPath := fmt.Sprintf("/apis/infrastructure.cluster.x-k8s.io/v1beta1/namespaces/%s/remotemachines/%s", namespace, name)
 	result, err := s.client.RESTClient().Get().AbsPath(apiPath).DoRaw(s.Context())