From 2b783fb86dacca43c7a72b0a940b667d0183b54e Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 5 Jun 2024 12:31:46 +0200 Subject: [PATCH 1/2] Test that we don't rescale when container memory is unfulfiled --- .../prelude/fastsearch/VespaBackend.java | 4 +- .../autoscale/AllocatableResources.java | 6 ++- .../provision/autoscale/Autoscaler.java | 8 ++-- .../provision/autoscale/ClusterModel.java | 37 ++++++++++--------- .../hosted/provision/autoscale/Load.java | 2 +- .../provision/autoscale/AutoscalingTest.java | 17 +++++++++ .../hosted/provision/autoscale/Loader.java | 2 + 7 files changed, 49 insertions(+), 27 deletions(-) diff --git a/container-search/src/main/java/com/yahoo/prelude/fastsearch/VespaBackend.java b/container-search/src/main/java/com/yahoo/prelude/fastsearch/VespaBackend.java index a463fb9d0e66..47a4530b9d11 100644 --- a/container-search/src/main/java/com/yahoo/prelude/fastsearch/VespaBackend.java +++ b/container-search/src/main/java/com/yahoo/prelude/fastsearch/VespaBackend.java @@ -311,10 +311,10 @@ void traceQuery(String sourceName, String type, Query query, int offset, int hit if (query.getTrace().isTraceable(level + 1) && query.getTrace().getQuery()) { query.trace("Current state of query tree: " + new TextualQueryRepresentation(query.getModel().getQueryTree().getRoot()), - false, level+1); + false, level + 1); } if (query.getTrace().isTraceable(level + 2) && query.getTrace().getQuery()) { - query.trace("YQL+ representation: " + query.yqlRepresentation(), level+2); + query.trace("YQL+ representation: " + query.yqlRepresentation(), level + 2); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java index 6d3de6286018..7609851d7fef 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java @@ -116,6 +116,10 @@ public ClusterResources advertisedResources() { */ public double fulfilment() { return fulfilment; } + public boolean notFulfiled() { + return fulfilment < 0.9999999; + } + private static double fulfilment(ClusterResources realResources, ClusterResources idealResources) { double vcpuFulfilment = Math.min(1, realResources.totalResources().vcpu() / idealResources.totalResources().vcpu()); double memoryGbFulfilment = Math.min(1, realResources.totalResources().memoryGb() / idealResources.totalResources().memoryGb()); @@ -128,7 +132,7 @@ private static double fulfilment(ClusterResources realResources, ClusterResource public boolean preferableTo(AllocatableResources other, ClusterModel model) { // always fulfil as much as possible unless fulfilment is considered to be equal - if (!equal(this.fulfilment(), other.fulfilment()) && (other.fulfilment() < 1 || this.fulfilment() < 1)) + if ((other.fulfilment() < 1 || this.fulfilment() < 1) && ! equal(this.fulfilment(), other.fulfilment())) return this.fulfilment() > other.fulfilment(); return this.cost() * toHours(model.allocationDuration()) + this.costChangingFrom(model) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 29ab6d65b9fe..d0ea406c91e1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -69,11 +69,9 @@ public Autoscaling autoscale(Application application, Cluster cluster, NodeList return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", model); var loadAdjustment = model.loadAdjustment(); - if (enableDetailedLogging) { + if (enableDetailedLogging) log.info("Application: " + application.id().toShortString() + ", loadAdjustment: " + loadAdjustment.toString()); - } - // Ensure we only scale down if we'll have enough headroom to not scale up again given a small load increase var target = allocationOptimizer.findBestAllocation(loadAdjustment, model, limits, enableDetailedLogging); if (target.isEmpty()) @@ -98,8 +96,8 @@ private Autoscaling toAutoscaling(AllocatableResources target, ClusterModel mode return Autoscaling.dontScale(Status.unavailable, "Autoscaling is disabled in single node clusters", model); if (! worthRescaling(model.current().realResources(), target.realResources())) { - if (target.fulfilment() < 0.9999999) - return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents ideal scaling of this cluster", model); + if (target.notFulfiled()) + return Autoscaling.dontScale(Status.insufficient, "Cluster cannot be scaled to achieve ideal load", model); else if ( ! model.safeToScaleDown() && model.idealLoad().any(v -> v < 1.0)) return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", model); else diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 504965f1992e..10207ea87d53 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -125,6 +125,21 @@ public ClusterModel(NodeRepository nodeRepository, this.at = clock.instant(); } + + /** + * The central decision made in autoscaling. + * + * @return the relative load adjustment that should be made to this cluster given available measurements. + * For example, a load adjustment of 2 means we should allocate twice the amount of that resources. + */ + public Load loadAdjustment() { + if (nodeTimeseries().measurementsPerNode() < 0.5) return Load.one(); // Don't change based on very little data + Load adjustment = peakLoad().divide(idealLoad()); + if (! safeToScaleDown()) + adjustment = adjustment.map(v -> v < 1 ? 1 : v); + return adjustment; + } + public Application application() { return application; } public ClusterSpec clusterSpec() { return clusterSpec; } public CloudAccount cloudAccount() { return cluster.cloudAccount().orElse(CloudAccount.empty); } @@ -133,11 +148,7 @@ public ClusterModel(NodeRepository nodeRepository, private ClusterTimeseries clusterTimeseries() { return clusterTimeseries; } /** Returns the instant this model was created. */ - public Instant at() { return at;} - - public boolean isEmpty() { - return nodeTimeseries().isEmpty(); - } + public Instant at() { return at; } /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } @@ -148,9 +159,9 @@ public boolean isEmpty() { */ public Duration allocationDuration() { return allocationDuration; } - public boolean isContent() { - return clusterSpec.type().isContent(); - } + public boolean isEmpty() { return nodeTimeseries().isEmpty(); } + + public boolean isContent() { return clusterSpec.type().isContent(); } /** Returns the predicted duration of data redistribution in this cluster. */ public Duration redistributionDuration() { @@ -177,15 +188,6 @@ public boolean isExclusive() { return nodeRepository.exclusivity().allocation(clusterSpec); } - /** Returns the relative load adjustment that should be made to this cluster given available measurements. */ - public Load loadAdjustment() { - if (nodeTimeseries().measurementsPerNode() < 0.5) return Load.one(); // Don't change based on very little data - Load adjustment = peakLoad().divide(idealLoad()); - if (! safeToScaleDown()) - adjustment = adjustment.map(v -> v < 1 ? 1 : v); - return adjustment; - } - public boolean isStable(NodeRepository nodeRepository) { // The cluster is processing recent changes if (nodes.stream().anyMatch(node -> node.status().wantToRetire() || @@ -218,7 +220,6 @@ public Load loadAdjustmentWith(int nodes, int groups, Load loadAdjustment) { .divide(redundancyAdjustment()); // correct for double redundancy adjustment } - /** * Returns the relative load adjustment accounting for redundancy given these nodes+groups * relative to node nodes+groups in this. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index 22c13795d184..e00bf17c89ff 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -114,7 +114,7 @@ private static double divide(double a, double b) { @Override public String toString() { - return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk," + gpu + " gpu," + gpuMemory + " gpuMemory"; + return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk, " + gpu + " gpu, " + gpuMemory + " gpuMemory"; } public static Load zero() { return new Load(0, 0, 0, 0, 0); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index e0c8199a8827..5b15327556fd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -153,6 +153,23 @@ public void test_container_scaling_down_exclusive() { fixture.autoscale()); } + @Test + public void test_containers_wont_scale_up_on_memory() { + var min = new ClusterResources(2, 1, new NodeResources(4, 8, 50, 0.1)); + var now = new ClusterResources(4, 1, new NodeResources(4, 8, 50, 0.1)); + var max = new ClusterResources(8, 1, new NodeResources(4, 8, 50, 0.1)); + var fixture = DynamicProvisioningTester.fixture() + .awsProdSetup(false) + .clusterType(ClusterSpec.Type.container) + .initialResources(Optional.of(now)) + .capacity(Capacity.from(min, max)) + .build(); + fixture.tester().setScalingDuration(fixture.applicationId(), fixture.clusterSpec.id(), Duration.ofMinutes(5)); + + fixture.loader().applyLoad(new Load(0.1875, 1.0, 0.95, 0, 0), 50); + assertEquals(Autoscaling.Status.insufficient, fixture.autoscale().status()); + } + @Test public void initial_deployment_with_host_sharing_flag() { var min = new ClusterResources(7, 1, new NodeResources(2.0, 10.0, 384.0, 0.1)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java index 8dc3945223fe..8f053e427e21 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java @@ -102,6 +102,8 @@ public void addMemMeasurements(double value, int count) { NodeList nodes = fixture.nodes(); float oneExtraNodeFactor = (float)(nodes.size() - 1.0) / (nodes.size()); Load load = new Load(idealLoad.cpu(), value, idealLoad.disk(), 0, 0).multiply(oneExtraNodeFactor); + System.out.println(" idealLoad: " + idealLoad); + System.out.println("adjusted idealLoad: " + load); for (int i = 0; i < count; i++) { fixture.tester().clock().advance(samplingInterval); for (Node node : nodes) { From 66ee600aa41ebd0a51a4db2d1ca17bfe343c9088 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 5 Jun 2024 12:34:48 +0200 Subject: [PATCH 2/2] Remove printlns --- .../java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java index 8f053e427e21..8dc3945223fe 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java @@ -102,8 +102,6 @@ public void addMemMeasurements(double value, int count) { NodeList nodes = fixture.nodes(); float oneExtraNodeFactor = (float)(nodes.size() - 1.0) / (nodes.size()); Load load = new Load(idealLoad.cpu(), value, idealLoad.disk(), 0, 0).multiply(oneExtraNodeFactor); - System.out.println(" idealLoad: " + idealLoad); - System.out.println("adjusted idealLoad: " + load); for (int i = 0; i < count; i++) { fixture.tester().clock().advance(samplingInterval); for (Node node : nodes) {