diff --git a/config-model/src/main/resources/schema/deployment.rnc b/config-model/src/main/resources/schema/deployment.rnc index 3e751a379d4f..1aaf002b7030 100644 --- a/config-model/src/main/resources/schema/deployment.rnc +++ b/config-model/src/main/resources/schema/deployment.rnc @@ -52,7 +52,8 @@ ParallelInstances = element parallel { Upgrade = element upgrade { attribute policy { xsd:string }? & - attribute revision { xsd:string }? & + attribute revision-target { xsd:string }? & + attribute revision-change { xsd:string }? & attribute rollout { xsd:string }? } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index f36f5be77786..8c933f982779 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -176,10 +176,14 @@ public Map> jobsToRun() { Map> jobs = jobsToRun(changes); // Add test jobs for any outstanding change. - for (InstanceName instance : application.deploymentSpec().instanceNames()) - changes.put(instance, outstandingChange(instance).onTopOf(application.require(instance).change())); - var testJobs = jobsToRun(changes, true).entrySet().stream() - .filter(entry -> ! entry.getKey().type().isProduction()); + Map outstandingChanges = new LinkedHashMap<>(); + for (InstanceName instance : application.deploymentSpec().instanceNames()) { + Change outstanding = outstandingChange(instance); + if (outstanding.hasTargets()) + outstandingChanges.put(instance, outstanding.onTopOf(application.require(instance).change())); + } + var testJobs = jobsToRun(outstandingChanges, true).entrySet().stream() + .filter(entry -> ! entry.getKey().type().isProduction()); return Stream.concat(jobs.entrySet().stream(), testJobs) .collect(collectingAndThen(toMap(Map.Entry::getKey, @@ -347,8 +351,8 @@ private List changes(JobId job, StepStatus step, Change change) { || step.completedAt(change.withoutPlatform(), Optional.of(job)).isPresent()) return List.of(change); - // For a dual change, where both target remain, we determine what to run by looking at when the two parts became ready: - // for deployments, we look at dependencies; for tests, this may be overridden by what is already deployed. + // For a dual change, where both targets remain, we determine what to run by looking at when the two parts became ready: + // for deployments, we look at dependencies; for production tests, this may be overridden by what is already deployed. JobId deployment = new JobId(job.application(), JobType.from(system, job.type().zone(system)).get()); UpgradeRollout rollout = application.deploymentSpec().requireInstance(job.application().instance()).upgradeRollout(); if (job.type().isTest()) { @@ -405,10 +409,13 @@ private List changes(JobId job, StepStatus step, Change change) { // Both changes are ready for this step, and we look to the specified rollout to decide. boolean platformReadyFirst = platformReadyAt.get().isBefore(revisionReadyAt.get()); boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get()); + boolean failingUpgradeOnlyTests = ! jobs().type(systemTest, stagingTest) + .failingHardOn(Versions.from(change.withoutApplication(), application, deploymentFor(job), systemVersion)) + .isEmpty(); switch (rollout) { case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing. return (platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH)) // Assume platform was first if no jobs have run yet. - ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() + ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() || failingUpgradeOnlyTests ? List.of(change) // Platform was first, but is failing. : List.of(change.withoutApplication(), change) // Platform was first, and is OK. : revisionReadyFirst @@ -900,6 +907,11 @@ public int hashCode() { return Objects.hash(versions, readyAt, change); } + @Override + public String toString() { + return change + " with versions " + versions + ", ready at " + readyAt; + } + } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index ed5df62ca5db..aeaa821745b2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -372,11 +372,11 @@ private void abortIfOutdated(DeploymentStatus status, Map { if (jobs.get(job).stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions()) && versions.versions().sourcesMatchIfPresent(last.versions()))) { - log.log(Level.INFO, "Aborting outdated run " + last); - controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + - jobs.get(job).stream() - .map(scheduled -> scheduled.versions().toString()) - .collect(Collectors.joining(", "))); + String blocked = jobs.get(job).stream() + .map(scheduled -> scheduled.versions().toString()) + .collect(Collectors.joining(", ")); + log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked); + controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked); } }); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 5de07bad8593..d06bdc45583b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -17,6 +17,7 @@ import java.util.function.Predicate; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure; /** * A list of deployment jobs that can be filtered in various ways. @@ -102,6 +103,15 @@ public JobList production() { return matching(job -> job.id().type().isProduction()); } + /** Returns the jobs with any runs failing with non-out-of-test-capacity on the given versions — targets only for system test, everything present otherwise. */ + public JobList failingHardOn(Versions versions) { + return matching(job -> ! RunList.from(job) + .on(versions) + .matching(Run::hasFailed) + .not().matching(run -> run.status() == nodeAllocationFailure && run.id().type().environment().isTest()) + .isEmpty()); + } + /** Returns the jobs with any runs matching the given versions — targets only for system test, everything present otherwise. */ public JobList triggeredOn(Versions versions) { return matching(job -> ! RunList.from(job).on(versions).isEmpty()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index b95d34f54145..b2847a29654c 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -698,7 +698,10 @@ public void stepIsCompletePreciselyWhenItShouldBe() { ApplicationVersion revision1 = app1.lastSubmission().get(); app1.submit(applicationPackage); ApplicationVersion revision2 = app1.lastSubmission().get(); - app1.runJob(systemTest).runJob(stagingTest); + app1.runJob(systemTest) // Tests for new revision on version2 + .runJob(stagingTest) + .runJob(systemTest) // Tests for new revision on version1 + .runJob(stagingTest); assertEquals(Change.of(version1).with(revision2), app1.instance().change()); tester.triggerJobs(); app1.assertRunning(productionUsCentral1); @@ -718,9 +721,7 @@ public void stepIsCompletePreciselyWhenItShouldBe() { app1.assertNotRunning(productionUsCentral1); // Last job has a different deployment target, so tests need to run again. - app1.runJob(systemTest) - .runJob(stagingTest) // Eager test of outstanding change, assuming upgrade in west succeeds. - .runJob(productionEuWest1) // Upgrade completes, and revision is the only change. + app1.runJob(productionEuWest1) // Upgrade completes, and revision is the only change. .runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure. .runJob(productionEuWest1); // Finally, west changes revision. assertEquals(Change.empty(), app1.instance().change()); @@ -1433,6 +1434,21 @@ public void testRevisionJoinsUpgradeWithSeparateRollout() { tester.jobs().last(app.instanceId(), productionUsWest1).get().versions()); app.runJob(productionUsWest1); assertEquals(Change.empty(), app.instance().change()); + + // New upgrade fails in staging-test, and revision to fix it is submitted. + var version2 = new Version("7.2"); + tester.controllerTester().upgradeSystem(version2); + tester.upgrader().maintain(); + app.runJob(systemTest).failDeployment(stagingTest); + tester.clock().advance(Duration.ofMinutes(30)); + app.failDeployment(stagingTest); + app.submit(appPackage); + + app.runJob(systemTest).runJob(stagingTest) // Tests run with combined upgrade. + .runJob(productionUsCentral1) // Combined upgrade stays together. + .runJob(productionUsEast3).runJob(productionUsWest1); + assertEquals(Map.of(), app.deploymentStatus().jobsToRun()); + assertEquals(Change.empty(), app.instance().change()); } @Test