Skip to content

Commit

Permalink
operator: Add timeout and improve logging for DNS check on proxy startup
Browse files Browse the repository at this point in the history
Signed-off-by: Aaron Wilson <[email protected]>
  • Loading branch information
aaronnw committed Sep 20, 2024
1 parent 35c0248 commit e4790ef
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
16 changes: 15 additions & 1 deletion operator/pkg/controllers/proxy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ package controllers

import (
"context"
"errors"
"fmt"
"net"
"strings"
"time"

aisapi "github.com/NVIDIA/aistore/api"
aisapc "github.com/NVIDIA/aistore/api/apc"
Expand All @@ -22,6 +24,8 @@ import (
logf "sigs.k8s.io/controller-runtime/pkg/log"
)

// dnsLookupTimeout bounds how long checkDNSEntryDefault waits for the
// proxy service DNS lookup before the lookup context is cancelled.
const dnsLookupTimeout = 10 * time.Second

func (r *AIStoreReconciler) ensureProxyPrereqs(ctx context.Context, ais *aisv1.AIStore) (err error) {
var cm *corev1.ConfigMap

Expand Down Expand Up @@ -84,7 +88,7 @@ func (r *AIStoreReconciler) initProxies(ctx context.Context, ais *aisv1.AIStore)

// Check whether the proxy service has a registered DNS entry.
if err = checkDNSEntry(ctx, ais); err != nil {
logger.Error(err, "Failed to lookup DNS entries for primary proxy service")
logger.Info("Failed to find any DNS entries for proxy service", "error", err)
r.recorder.Event(ais, corev1.EventTypeNormal, EventReasonWaiting, "Waiting for proxy service to have registered DNS entries")
return true /*requeue*/, nil
}
Expand All @@ -97,7 +101,17 @@ func checkDNSEntryDefault(ctx context.Context, ais *aisv1.AIStore) error {
nsName := proxy.HeadlessSVCNSName(ais)
clusterDomain := ais.GetClusterDomain()
hostname := fmt.Sprintf("%s.%s.svc.%s", nsName.Name, nsName.Namespace, clusterDomain)

ctx, cancel := context.WithTimeout(ctx, dnsLookupTimeout)
defer cancel()
_, err := net.DefaultResolver.LookupIPAddr(ctx, hostname)
// Log an error if we have an actual error, not just no host found
if err != nil {
var dnsErr *net.DNSError
if errors.As(err, &dnsErr) && !dnsErr.IsNotFound {
logf.FromContext(ctx).Error(dnsErr, "Error looking up DNS entry")
}
}
return err
}

Expand Down
2 changes: 0 additions & 2 deletions operator/pkg/resources/cmn/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,13 @@ func GenerateConfigToSet(ctx context.Context, ais *aisv1.AIStore) (*aiscmn.Confi
// Deep copy to avoid modifying the spec itself
specConfig = ais.Spec.ConfigToUpdate.DeepCopy()
}
logger := logf.FromContext(ctx)
// Override rebalance if the cluster is not ready for it (starting up, scaling, upgrading)
if ais.IsConditionTrue(aisv1.ConditionReadyRebalance) {
// If not provided, reset to default
if !specConfig.IsRebalanceEnabledSet() {
specConfig.UpdateRebalanceEnabled(DefaultAISConf(ctx, ais).IsRebalanceEnabled())
}
} else {
logger.Info("Setting rebalance disabled in spec config because of condition")
specConfig.UpdateRebalanceEnabled(aisapc.Ptr(false))
}

Expand Down

0 comments on commit e4790ef

Please sign in to comment.