Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consumed capacity info in SpaceProvisionerConfig #1109

Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
067308a
Make the SPC ready status reflect the ability to place spaces to
metlos Nov 28, 2024
8470d2b
fix small typos and improve the wording in comments
metlos Nov 29, 2024
f30cec6
Address linter complaints.
metlos Nov 29, 2024
a6b7053
Merge branch 'master' into member-info-in-spc-status-2
mfrancisc Dec 2, 2024
2130e70
return error when no ToolchainStatus is found
metlos Dec 3, 2024
466f212
Rename the tests to better reflect what they're doing
metlos Dec 3, 2024
8450dd9
Don't log the error that will be logged again by the callers
metlos Dec 3, 2024
a0b15c8
better naming and explanation why we're doing only a partial re-check…
metlos Dec 3, 2024
1b9a606
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 3, 2024
691752e
Add tests for the GetSpaceCountsFromCountsCache.
metlos Dec 3, 2024
d6db400
Clean up comments
metlos Dec 5, 2024
3b1b557
add a testcase for multiple memory usage threshold breaches
metlos Dec 5, 2024
9e27aa8
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 5, 2024
136c8ed
Fix typos
metlos Dec 5, 2024
7c935b5
remove GetUsageFunc from the SPC, always read the ToolchainStatus
metlos Dec 6, 2024
0bebee7
Make sure the mappers are used with the correct object type
metlos Dec 6, 2024
f97d687
reset the env var to its original value after the test
metlos Dec 6, 2024
087b3bd
Move the update of the space count from the predicate up a level for it
metlos Dec 6, 2024
5972be2
Move the GetSpaceCountFromSpaceProvisionerConfigs helper function into
metlos Dec 6, 2024
226ca14
Simplify the capacity ready state computation.
metlos Dec 6, 2024
845ea69
Merge remote-tracking branch 'origin/member-info-in-spc-status-2' int…
metlos Dec 6, 2024
db8505f
Mirror the consumed capacity recorded in the ToolchainStatus to
metlos Dec 6, 2024
499ef81
Merge branch 'master' into member-info-in-spc-status_controller-part
MatousJobanek Dec 12, 2024
68d6be5
Simplify the logic in refreshStatus of the SPC.
metlos Dec 12, 2024
9a0e4f6
Break up long assertions in the test
metlos Dec 12, 2024
b058ec5
Merge remote-tracking branch 'upstream/master' into member-info-in-sp…
metlos Dec 16, 2024
6ffd1e5
Merge branch 'master' into member-info-in-spc-status_controller-part
metlos Dec 18, 2024
90cf19a
Merge branch 'master' into member-info-in-spc-status_controller-part
fbm3307 Dec 18, 2024
a514b81
Merge branch 'master' into member-info-in-spc-status_controller-part
metlos Dec 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions controllers/spaceprovisionerconfig/mapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,40 @@ package spaceprovisionerconfig

import (
"context"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"k8s.io/apimachinery/pkg/types"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func MapToolchainClusterToSpaceProvisionerConfigs(ctx context.Context, cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
return func(context context.Context, obj runtimeclient.Object) []reconcile.Request {
func MapToolchainClusterToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
func MapToolchainClusterToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
func mapToolchainClusterToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {

minor doubt- i don't see this function being used out of the package or in e2e, why is it declared global?

return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainCluster); !ok {
return nil
}

ret, err := findReferencingProvisionerConfigs(ctx, cl, runtimeclient.ObjectKeyFromObject(obj))
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"toolchainClusterCause", runtimeclient.ObjectKeyFromObject(obj))
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainCluster")
return []reconcile.Request{}
}
return ret
}
}

func MapToolchainStatusToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
func MapToolchainStatusToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {
func mapToolchainStatusToSpaceProvisionerConfigs(cl runtimeclient.Client) func(context.Context, runtimeclient.Object) []reconcile.Request {

Similarly for this function may be we don't require it to be global

return func(ctx context.Context, obj runtimeclient.Object) []reconcile.Request {
if _, ok := obj.(*toolchainv1alpha1.ToolchainStatus); !ok {
return nil
}

ret, err := findAllSpaceProvisionerConfigsInNamespace(ctx, cl, obj.GetNamespace())
if err != nil {
log.FromContext(ctx).Error(err, "failed to list SpaceProvisionerConfig objects while determining what objects to reconcile",
"causeObj", runtimeclient.ObjectKeyFromObject(obj), "causeKind", "ToolchainStatus")
return []reconcile.Request{}
}
return ret
Expand All @@ -39,3 +60,20 @@ func findReferencingProvisionerConfigs(ctx context.Context, cl runtimeclient.Cli
}
return ret, nil
}

func findAllSpaceProvisionerConfigsInNamespace(ctx context.Context, cl runtimeclient.Client, ns string) ([]reconcile.Request, error) {
configs := &toolchainv1alpha1.SpaceProvisionerConfigList{}
if err := cl.List(ctx, configs, runtimeclient.InNamespace(ns)); err != nil {
return nil, err
}
ret := make([]reconcile.Request, 0, len(configs.Items))
for _, cfg := range configs.Items {
ret = append(ret, reconcile.Request{
NamespacedName: types.NamespacedName{
Namespace: cfg.Namespace,
Name: cfg.Name,
},
})
}
return ret, nil
}
51 changes: 49 additions & 2 deletions controllers/spaceprovisionerconfig/mapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -123,7 +123,7 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
}

// when
reqs := MapToolchainClusterToSpaceProvisionerConfigs(context.TODO(), cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
reqs := MapToolchainClusterToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "cluster1",
Namespace: test.HostOperatorNs,
Expand All @@ -134,3 +134,50 @@ func TestMapToolchainClusterToSpaceProvisionerConfigs(t *testing.T) {
require.Empty(t, reqs)
})
}

func TestMapToolchainStatusToSpaceProvisionerConfig(t *testing.T) {
t.Run("finds all SPCs in namespace", func(t *testing.T) {
// given
spc0 := NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
spc1 := NewSpaceProvisionerConfig("spc1", test.HostOperatorNs, ReferencingToolchainCluster("cluster2"))
spc2 := NewSpaceProvisionerConfig("spc2", test.HostOperatorNs, ReferencingToolchainCluster("cluster1"))
cl := test.NewFakeClient(t, spc0, spc1, spc2)

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Equal(t, []reconcile.Request{
requestFromObject(spc0),
requestFromObject(spc1),
requestFromObject(spc2),
}, reqs)
})
t.Run("interprets erors as empty result", func(t *testing.T) {
// given
cl := test.NewFakeClient(t, NewSpaceProvisionerConfig("spc0", test.HostOperatorNs, ReferencingToolchainCluster("cluster1")))
expectedErr := errors.New("expected list error")
cl.MockList = func(ctx context.Context, list runtimeclient.ObjectList, opts ...runtimeclient.ListOption) error {
if _, ok := list.(*toolchainv1alpha1.SpaceProvisionerConfigList); ok {
return expectedErr
}
return cl.Client.List(ctx, list, opts...)
}

// when
reqs := MapToolchainStatusToSpaceProvisionerConfigs(cl)(context.TODO(), &toolchainv1alpha1.ToolchainStatus{
ObjectMeta: metav1.ObjectMeta{
Name: "toolchain-status",
Namespace: test.HostOperatorNs,
},
})

// then
require.Empty(t, reqs)
})
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ import (
"fmt"

toolchainv1alpha1 "github.com/codeready-toolchain/api/api/v1alpha1"
"github.com/codeready-toolchain/host-operator/controllers/toolchainconfig"
"github.com/codeready-toolchain/toolchain-common/pkg/condition"
"github.com/redhat-cop/operator-utils/pkg/util"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -25,19 +27,28 @@ type Reconciler struct {

var _ reconcile.Reconciler = (*Reconciler)(nil)

func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&toolchainv1alpha1.SpaceProvisionerConfig{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
Watches(
&toolchainv1alpha1.ToolchainCluster{},
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(ctx, r.Client)),
handler.EnqueueRequestsFromMapFunc(MapToolchainClusterToSpaceProvisionerConfigs(r.Client)),
).
// we use the same information as the ToolchainStatus specific for the SPCs. Because memory consumption is
// read directly out of the member clusters using remote connections, let's look for it only once
// in ToolchainStatus and just read it out "locally" here without needing to reach out to the member clusters
// again.
Watches(
&toolchainv1alpha1.ToolchainStatus{},
handler.EnqueueRequestsFromMapFunc(MapToolchainStatusToSpaceProvisionerConfigs(r.Client)),
).
Complete(r)
}

//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=spaceprovisionerconfigs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainclusters,verbs=get;list;watch
//+kubebuilder:rbac:groups=toolchain.dev.openshift.com,resources=toolchainstatuses,verbs=get;list;watch

// Reconcile ensures that SpaceProvisionerConfig is valid and points to an existing ToolchainCluster.
func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.Result, error) {
Expand All @@ -58,62 +69,153 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.
return ctrl.Result{}, nil
}

readyCondition, reportedError := r.determineReadyState(ctx, spaceProvisionerConfig)
reportedErr := r.refreshStatus(ctx, spaceProvisionerConfig)

var updated bool
spaceProvisionerConfig.Status.Conditions, updated = condition.AddOrUpdateStatusConditions(spaceProvisionerConfig.Status.Conditions,
readyCondition)
if !updated {
return ctrl.Result{}, reportedError
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
return ctrl.Result{}, err
}

logger.Info("updating SpaceProvisionerConfig", "readyCondition", readyCondition)
if err := r.Client.Status().Update(ctx, spaceProvisionerConfig); err != nil {
if reportedError != nil {
logger.Info("failed to update the status (reported as failed reconciliation) with a previous unreported error during reconciliation", "unreportedError", reportedError)
return ctrl.Result{}, reportedErr
}

func (r *Reconciler) refreshStatus(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) error {
// clear out the consumed capacity - this will advertise to the user that we either failed before it made sense
// to collect it (and therefore we don't know it) or it was not available (and therefore we again don't know it)
spc.Status.ConsumedCapacity = nil

if !spc.Spec.Enabled {
updateReadyCondition(spc, corev1.ConditionFalse, toolchainv1alpha1.SpaceProvisionerConfigDisabledReason, "")
return nil
}

clusterCondition, err := r.determineClusterReadyState(ctx, spc)
if err != nil {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason, err.Error())

// the reconciler reacts on ToolchainCluster changes so it will be triggered once a new TC appears
// we therefore don't need to return error from the reconciler in the case the TC is not found.
if errors.IsNotFound(err) {
return nil
}
reportedError = fmt.Errorf("failed to update the SpaceProvisionerConfig status: %w", err)
return err
}

if clusterCondition != corev1.ConditionTrue {
updateReadyCondition(spc, clusterCondition, toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason, "")
return nil
}

return ctrl.Result{}, reportedError
spc.Status.ConsumedCapacity, err = collectConsumedCapacity(ctx, r.Client, spc.Spec.ToolchainCluster, spc.Namespace)
if err != nil {
updateReadyCondition(spc, corev1.ConditionUnknown, toolchainv1alpha1.SpaceProvisionerConfigFailedToDetermineCapacityReason, err.Error())
return err
}

capacityCondition := r.determineCapacityReadyState(spc)

reason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
if capacityCondition != corev1.ConditionTrue {
reason = toolchainv1alpha1.SpaceProvisionerConfigInsufficientCapacityReason
}

updateReadyCondition(spc, capacityCondition, reason, "")

return nil
}

func (r *Reconciler) determineReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (toolchainv1alpha1.Condition, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
toolchainClusterKey := runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}
var toolchainPresent corev1.ConditionStatus
toolchainPresenceReason := toolchainv1alpha1.SpaceProvisionerConfigValidReason
var reportedError error
toolchainPresenceMessage := ""
if err := r.Client.Get(ctx, toolchainClusterKey, toolchainCluster); err != nil {
if !errors.IsNotFound(err) {
// we need to requeue the reconciliation in this case because we cannot be sure whether the ToolchainCluster
// is really present in the cluster or not. If we did not do that and instead just reported the error in
// the status, we could eventually leave the SPC in an incorrect state once the error condition in the cluster,
// that prevents us from reading the ToolchainCluster, clears. I.e. we need the requeue to keep the promise
// of eventual consistency.

reportedError = fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
toolchainPresenceMessage = reportedError.Error()
// Note that this function merely mirrors the usage information found in the ToolchainStatus. This means that it actually may work
// with slightly stale data because the counter.Counts cache might not have been synced yet. This is ok though because the capacity manager
// doesn't completely rely on the readiness status of the SPC and will re-evaluate the decision taking into the account the contents of
// the counter cache and therefore completely "fresh" data.
func collectConsumedCapacity(ctx context.Context, cl runtimeclient.Client, clusterName string, toolchainStatusNs string) (*toolchainv1alpha1.ConsumedCapacity, error) {
status := &toolchainv1alpha1.ToolchainStatus{}
if err := cl.Get(ctx, types.NamespacedName{Namespace: toolchainStatusNs, Name: toolchainconfig.ToolchainStatusName}, status); err != nil {
return nil, fmt.Errorf("unable to read ToolchainStatus resource: %w", err)
}

for _, m := range status.Status.Members {
if m.ClusterName == clusterName {
cc := toolchainv1alpha1.ConsumedCapacity{}
cc.MemoryUsagePercentPerNodeRole = m.MemberStatus.ResourceUsage.MemoryUsagePerNodeRole
cc.SpaceCount = m.SpaceCount

return &cc, nil
}
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotFoundReason
toolchainPresent = corev1.ConditionFalse
} else {
readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
toolchainPresent = corev1.ConditionFalse
} else {
toolchainPresent = readyCond.Status
}

return nil, nil
}

func (r *Reconciler) determineClusterReadyState(ctx context.Context, spc *toolchainv1alpha1.SpaceProvisionerConfig) (corev1.ConditionStatus, error) {
toolchainCluster := &toolchainv1alpha1.ToolchainCluster{}
if err := r.Client.Get(ctx, runtimeclient.ObjectKey{Name: spc.Spec.ToolchainCluster, Namespace: spc.Namespace}, toolchainCluster); err != nil {
if errors.IsNotFound(err) {
return corev1.ConditionFalse, err
}
if toolchainPresent != corev1.ConditionTrue {
toolchainPresenceReason = toolchainv1alpha1.SpaceProvisionerConfigToolchainClusterNotReadyReason
// IsNotFound is self-explanatory but let's add a little bit of context to the error in other cases
return corev1.ConditionFalse, fmt.Errorf("failed to get the referenced ToolchainCluster: %w", err)
}

readyCond, found := condition.FindConditionByType(toolchainCluster.Status.Conditions, toolchainv1alpha1.ConditionReady)
if !found {
return corev1.ConditionFalse, nil
}

return readyCond.Status, nil
}

func (r *Reconciler) determineCapacityReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Status.ConsumedCapacity == nil {
// we don't know anything about the resource consumption in the member
return corev1.ConditionUnknown
}

// the cluster capacity is ok if it has room for additional spaces and enough free memory

roomForAdditionalSpaces := determineSpaceCountReadyState(spc)
if !roomForAdditionalSpaces {
return corev1.ConditionFalse
}

return determineMemoryUtilizationReadyState(spc)
}

// determineSpaceCountReadyState checks that there is room for additional spaces in the cluster.
// It always knows this fact so returning a bool is ok, in contrast to determinMemoryUtilizationReadyState.
func determineSpaceCountReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) bool {
max := spc.Spec.CapacityThresholds.MaxNumberOfSpaces
return max == 0 || max > uint(spc.Status.ConsumedCapacity.SpaceCount)
}

// determineMemoryUtilizationReadyState checks that the cluster has enough free memory. It may not be able to tell the fact
// if the SPC doesn't contain memory usage information in the status. It therefore can return true, false or
// unknown condition values.
func determineMemoryUtilizationReadyState(spc *toolchainv1alpha1.SpaceProvisionerConfig) corev1.ConditionStatus {
if spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent == 0 {
// 0 max memory utilization means no limit
return corev1.ConditionTrue
}

if len(spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole) == 0 {
// we don't know the memory utilization in the member
return corev1.ConditionUnknown
}

// the memory utilitzation is ok if it is below the threshold in all node types
for _, val := range spc.Status.ConsumedCapacity.MemoryUsagePercentPerNodeRole {
if uint(val) >= spc.Spec.CapacityThresholds.MaxMemoryUtilizationPercent {
return corev1.ConditionFalse
}
}
return corev1.ConditionTrue
}

return toolchainv1alpha1.Condition{
func updateReadyCondition(spc *toolchainv1alpha1.SpaceProvisionerConfig, status corev1.ConditionStatus, reason, message string) {
readyCondition := toolchainv1alpha1.Condition{
Type: toolchainv1alpha1.ConditionReady,
Status: toolchainPresent,
Message: toolchainPresenceMessage,
Reason: toolchainPresenceReason,
}, reportedError
Status: status,
Reason: reason,
Message: message,
}
spc.Status.Conditions, _ = condition.AddOrUpdateStatusConditions(spc.Status.Conditions, readyCondition)
}
Loading
Loading