Skip to content

Commit

Permalink
Merge pull request #21476 from vespa-engine/jonmv/more-dep-orch-adjus…
Browse files Browse the repository at this point in the history
…tments

Jonmv/more dep orch adjustments
  • Loading branch information
jonmv authored Mar 1, 2022
2 parents 3b60f8a + 3609153 commit 2090ed6
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 17 deletions.
3 changes: 2 additions & 1 deletion config-model/src/main/resources/schema/deployment.rnc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ ParallelInstances = element parallel {

Upgrade = element upgrade {
attribute policy { xsd:string }? &
attribute revision { xsd:string }? &
attribute revision-target { xsd:string }? &
attribute revision-change { xsd:string }? &
attribute rollout { xsd:string }?
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,14 @@ public Map<JobId, List<Job>> jobsToRun() {
Map<JobId, List<Job>> jobs = jobsToRun(changes);

// Add test jobs for any outstanding change.
for (InstanceName instance : application.deploymentSpec().instanceNames())
changes.put(instance, outstandingChange(instance).onTopOf(application.require(instance).change()));
var testJobs = jobsToRun(changes, true).entrySet().stream()
.filter(entry -> ! entry.getKey().type().isProduction());
Map<InstanceName, Change> outstandingChanges = new LinkedHashMap<>();
for (InstanceName instance : application.deploymentSpec().instanceNames()) {
Change outstanding = outstandingChange(instance);
if (outstanding.hasTargets())
outstandingChanges.put(instance, outstanding.onTopOf(application.require(instance).change()));
}
var testJobs = jobsToRun(outstandingChanges, true).entrySet().stream()
.filter(entry -> ! entry.getKey().type().isProduction());

return Stream.concat(jobs.entrySet().stream(), testJobs)
.collect(collectingAndThen(toMap(Map.Entry::getKey,
Expand Down Expand Up @@ -347,8 +351,8 @@ private List<Change> changes(JobId job, StepStatus step, Change change) {
|| step.completedAt(change.withoutPlatform(), Optional.of(job)).isPresent())
return List.of(change);

// For a dual change, where both target remain, we determine what to run by looking at when the two parts became ready:
// for deployments, we look at dependencies; for tests, this may be overridden by what is already deployed.
// For a dual change, where both targets remain, we determine what to run by looking at when the two parts became ready:
// for deployments, we look at dependencies; for production tests, this may be overridden by what is already deployed.
JobId deployment = new JobId(job.application(), JobType.from(system, job.type().zone(system)).get());
UpgradeRollout rollout = application.deploymentSpec().requireInstance(job.application().instance()).upgradeRollout();
if (job.type().isTest()) {
Expand Down Expand Up @@ -405,10 +409,13 @@ private List<Change> changes(JobId job, StepStatus step, Change change) {
// Both changes are ready for this step, and we look to the specified rollout to decide.
boolean platformReadyFirst = platformReadyAt.get().isBefore(revisionReadyAt.get());
boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get());
boolean failingUpgradeOnlyTests = ! jobs().type(systemTest, stagingTest)
.failingHardOn(Versions.from(change.withoutApplication(), application, deploymentFor(job), systemVersion))
.isEmpty();
switch (rollout) {
case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing.
return (platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH)) // Assume platform was first if no jobs have run yet.
? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent()
? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() || failingUpgradeOnlyTests
? List.of(change) // Platform was first, but is failing.
: List.of(change.withoutApplication(), change) // Platform was first, and is OK.
: revisionReadyFirst
Expand Down Expand Up @@ -900,6 +907,11 @@ public int hashCode() {
return Objects.hash(versions, readyAt, change);
}

@Override
public String toString() {
return change + " with versions " + versions + ", ready at " + readyAt;
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -372,11 +372,11 @@ private void abortIfOutdated(DeploymentStatus status, Map<JobId, List<Deployment
.ifPresent(last -> {
if (jobs.get(job).stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions())
&& versions.versions().sourcesMatchIfPresent(last.versions()))) {
log.log(Level.INFO, "Aborting outdated run " + last);
controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " +
jobs.get(job).stream()
.map(scheduled -> scheduled.versions().toString())
.collect(Collectors.joining(", ")));
String blocked = jobs.get(job).stream()
.map(scheduled -> scheduled.versions().toString())
.collect(Collectors.joining(", "));
log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked);
controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked);
}
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import java.util.function.Predicate;

import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure;

/**
* A list of deployment jobs that can be filtered in various ways.
Expand Down Expand Up @@ -102,6 +103,15 @@ public JobList production() {
return matching(job -> job.id().type().isProduction());
}

/** Returns the jobs with any runs failing with non-out-of-test-capacity on the given versions — targets only for system test, everything present otherwise. */
public JobList failingHardOn(Versions versions) {
return matching(job -> ! RunList.from(job)
.on(versions)
.matching(Run::hasFailed)
.not().matching(run -> run.status() == nodeAllocationFailure && run.id().type().environment().isTest())
.isEmpty());
}

/** Returns the jobs with any runs matching the given versions — targets only for system test, everything present otherwise. */
public JobList triggeredOn(Versions versions) {
return matching(job -> ! RunList.from(job).on(versions).isEmpty());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,10 @@ public void stepIsCompletePreciselyWhenItShouldBe() {
ApplicationVersion revision1 = app1.lastSubmission().get();
app1.submit(applicationPackage);
ApplicationVersion revision2 = app1.lastSubmission().get();
app1.runJob(systemTest).runJob(stagingTest);
app1.runJob(systemTest) // Tests for new revision on version2
.runJob(stagingTest)
.runJob(systemTest) // Tests for new revision on version1
.runJob(stagingTest);
assertEquals(Change.of(version1).with(revision2), app1.instance().change());
tester.triggerJobs();
app1.assertRunning(productionUsCentral1);
Expand All @@ -718,9 +721,7 @@ public void stepIsCompletePreciselyWhenItShouldBe() {
app1.assertNotRunning(productionUsCentral1);

// Last job has a different deployment target, so tests need to run again.
app1.runJob(systemTest)
.runJob(stagingTest) // Eager test of outstanding change, assuming upgrade in west succeeds.
.runJob(productionEuWest1) // Upgrade completes, and revision is the only change.
app1.runJob(productionEuWest1) // Upgrade completes, and revision is the only change.
.runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure.
.runJob(productionEuWest1); // Finally, west changes revision.
assertEquals(Change.empty(), app1.instance().change());
Expand Down Expand Up @@ -1433,6 +1434,21 @@ public void testRevisionJoinsUpgradeWithSeparateRollout() {
tester.jobs().last(app.instanceId(), productionUsWest1).get().versions());
app.runJob(productionUsWest1);
assertEquals(Change.empty(), app.instance().change());

// New upgrade fails in staging-test, and revision to fix it is submitted.
var version2 = new Version("7.2");
tester.controllerTester().upgradeSystem(version2);
tester.upgrader().maintain();
app.runJob(systemTest).failDeployment(stagingTest);
tester.clock().advance(Duration.ofMinutes(30));
app.failDeployment(stagingTest);
app.submit(appPackage);

app.runJob(systemTest).runJob(stagingTest) // Tests run with combined upgrade.
.runJob(productionUsCentral1) // Combined upgrade stays together.
.runJob(productionUsEast3).runJob(productionUsWest1);
assertEquals(Map.of(), app.deploymentStatus().jobsToRun());
assertEquals(Change.empty(), app.instance().change());
}

@Test
Expand Down

0 comments on commit 2090ed6

Please sign in to comment.