Skip to content
This repository has been archived by the owner on Sep 21, 2023. It is now read-only.

pkg/operator: don't erroneously "update" (kill) unhealthy active node #345

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions pkg/operator/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,9 @@ func (v *Vaults) syncUpgrade(vr *api.VaultService, d *appsv1beta1.Deployment) (e
// If there is one active node belonging to the old version, and all other nodes are
// standby and uptodate, then trigger step-down on active node.
// It maps to the following conditions on Status:
// 1. check standby == updated
// 2. check Available - Updated == Active
// 1. check there is an active node (deployment will update non-active nodes)
// 2. check standby == updated (ensure new active node will be updated)
// 3. ensure there are no sealed nodes (don't seal because of update)
readyToTriggerStepdown := func() bool {
if len(vr.Status.VaultStatus.Active) == 0 {
return false
Expand All @@ -253,14 +254,14 @@ func (v *Vaults) syncUpgrade(vr *api.VaultService, d *appsv1beta1.Deployment) (e
return false
}

ava := append(vr.Status.VaultStatus.Standby, vr.Status.VaultStatus.Sealed...)
if !reflect.DeepEqual(ava, vr.Status.UpdatedNodes) {
if len(vr.Status.VaultStatus.Sealed) != 0 {
return false
}
return true
}()

if readyToTriggerStepdown {
logrus.Infof("Deleting active Vault pod (%s) to complete upgrade", vr.Status.VaultStatus.Active)
// This will send SIGTERM to the active Vault pod. It should release HA lock and exit properly.
// If it failed for some reason, kubelet will send SIGKILL after default grace period (30s) eventually.
// It take longer but the the lock will get released eventually on failure case.
Expand Down
14 changes: 10 additions & 4 deletions pkg/operator/vault_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ func (vs *Vaults) updateLocalVaultCRStatus(ctx context.Context, vr *api.VaultSer
continue
}

// Make a note of any updated pod, even if it's not healthy.
// This ensures that `Vaults.syncUpgrade()` doesn't lose track of
// unhealthy, but already updated nodes. (Otherwise, an already
// updated, active, but unhealthy node could mistakenly be killed to
// "complete" an update.)
// This alone does not indicate the pods have "changed".
if k8sutil.IsVaultVersionMatch(p.Spec, vr.Spec) {
updated = append(updated, p.GetName())
}

vapi, err := vaultutil.NewClient(k8sutil.PodDNSName(p), "8200", tlsConfig)
if err != nil {
logrus.Errorf("failed to update vault replica status: failed creating client for the vault pod (%s/%s): %v", namespace, p.GetName(), err)
Expand All @@ -109,10 +119,6 @@ func (vs *Vaults) updateLocalVaultCRStatus(ctx context.Context, vr *api.VaultSer

changed = true

if k8sutil.IsVaultVersionMatch(p.Spec, vr.Spec) {
updated = append(updated, p.GetName())
}

// TODO: add to vaultutil?
if hr.Initialized && !hr.Sealed && !hr.Standby {
s.VaultStatus.Active = p.GetName()
Expand Down