From 610b2e6ff2d4f5b0e30c73b2965585cea0069712 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Thu, 5 Dec 2024 23:44:36 +0100 Subject: [PATCH 1/2] Write reason to node history for wTF, failed and parked events --- .../java/com/yahoo/vespa/hosted/provision/Node.java | 9 +++------ .../hosted/provision/maintenance/NodeFailer.java | 13 ++++++++----- .../yahoo/vespa/hosted/provision/node/History.java | 12 ++++++++---- .../yahoo/vespa/hosted/provision/node/Nodes.java | 4 ++-- .../hosted/provision/persistence/CuratorDb.java | 4 ++-- .../provision/persistence/NodeSerializer.java | 7 +++++-- 6 files changed, 28 insertions(+), 21 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 7fe7e88c7f5e..32c4904f4136 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -283,12 +283,9 @@ public Optional wireguardPubKey() { /** * Returns a copy of this where wantToFail is set to true and history is updated to reflect this. */ - public Node withWantToFail(boolean wantToFail, Agent agent, Instant at) { - Node node = this.with(status.withWantToFail(wantToFail)); - if (wantToFail) - node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at))); - return node; - + public Node withWantToFail(Agent agent, Instant at, String reason) { + return with(status.withWantToFail(true)) + .with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at, Optional.of(reason)))); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 3f614dddb365..67b6049d8094 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -246,7 +246,7 @@ private void failActive(FailingNode failing) { if (activeChildrenToFail.isEmpty()) { log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason); - markWantToFail(failing.node(), true, lock.parent()); + markWantToFail(failing.node(), true, failing.reason, lock.parent()); redeploy = true; } } @@ -260,7 +260,7 @@ private void failActive(FailingNode failing) { if (failing == null) return; log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason); - markWantToFail(failing.node(), true, lock); + markWantToFail(failing.node(), true, failing.reason, lock); redeploy = true; } } @@ -295,16 +295,19 @@ private void redeploy(Deployment deployment, FailingNode failing) { Optional optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node()); if (optionalNodeMutex.isEmpty()) return; try (var nodeMutex = optionalNodeMutex.get()) { - markWantToFail(nodeMutex.node(), false, nodeMutex); + markWantToFail(nodeMutex.node(), false, failing.reason(), nodeMutex); log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); } } } - private void markWantToFail(Node node, boolean wantToFail, Mutex lock) { + private void markWantToFail(Node node, boolean wantToFail, String reason, Mutex lock) { if (node.status().wantToFail() != wantToFail) { - nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); + node = wantToFail ? + node.withWantToFail(Agent.NodeFailer, clock().instant(), reason) : + node.with(node.status().withWantToFail(false)); + nodeRepository().nodes().write(node, lock); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 538b5ccd6178..552f394471cf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -141,9 +141,10 @@ private ImmutableMap.Builder builderWithout(Event.Type type) } /** Returns a copy of this history with a record of this state transition added, if applicable */ - public History recordStateTransition(Node.State from, Node.State to, Agent agent, Instant at) { + public History recordStateTransition(Node.State from, Node.State to, Agent agent, Optional reason, Instant at) { // If the event is a re-reservation, allow the new one to override the older one. if (from == to && from != Node.State.reserved) return this; + // Only keeping reason for errored states (failed, parked) return switch (to) { case provisioned -> this.with(new Event(Event.Type.provisioned, agent, at)); case deprovisioned -> this.with(new Event(Event.Type.deprovisioned, agent, at)); @@ -151,9 +152,9 @@ public History recordStateTransition(Node.State from, Node.State to, Agent agent case active -> this.with(new Event(Event.Type.activated, agent, at)); case inactive -> this.with(new Event(Event.Type.deactivated, agent, at)); case reserved -> this.with(new Event(Event.Type.reserved, agent, at)); - case failed -> this.with(new Event(Event.Type.failed, agent, at)); + case failed -> this.with(new Event(Event.Type.failed, agent, at, reason)); case dirty -> this.with(new Event(Event.Type.deallocated, agent, at)); - case parked -> this.with(new Event(Event.Type.parked, agent, at)); + case parked -> this.with(new Event(Event.Type.parked, agent, at, reason)); case breakfixed -> this.with(new Event(Event.Type.breakfixed, agent, at)); }; } @@ -180,7 +181,10 @@ public String toString() { } /** An event which may happen to a node */ - public record Event(Type type, Agent agent, Instant at) { + public record Event(Type type, Agent agent, Instant at, Optional reason) { + public Event(Type type, Agent agent, Instant at) { + this(type, agent, at, Optional.empty()); + } public enum Type { // State changes diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 8a331a009a1d..2b088c6069dc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -276,7 +276,7 @@ public List deactivate(List nodes, ApplicationTransaction transactio */ public List fail(List nodes, ApplicationTransaction transaction) { return db.writeTo(Node.State.failed, - nodes.stream().map(n -> n.withWantToFail(false, Agent.application, clock.instant())).toList(), + nodes.stream().map(n -> n.with(n.status().withWantToFail(false))).toList(), Agent.application, Optional.of("Failed by application"), transaction); } @@ -380,7 +380,7 @@ public List failOrMarkRecursively(String hostname, Agent agent, String rea private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) { if (node.state() == Node.State.active) { - node = node.withWantToFail(true, agent, clock.instant()); + node = node.withWantToFail(agent, clock.instant(), reason); write(node, lock); return node; } else { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDb.java index 0f6ec937a146..fccb841095da 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDb.java @@ -134,7 +134,7 @@ public List addNodesInState(LockedNodeList nodes, Node.State expectedState if (node.state() != expectedState) throw new IllegalArgumentException(node + " is not in the " + expectedState + " state"); - node = node.with(node.history().recordStateTransition(null, expectedState, agent, clock.instant())); + node = node.with(node.history().recordStateTransition(null, expectedState, agent, Optional.empty(), clock.instant())); byte[] serialized = nodeSerializer.toJson(node); curatorTransaction.add(CuratorOperations.create(nodePath(node).getAbsolute(), serialized)); } @@ -234,7 +234,7 @@ public List writeTo(Node.State toState, List nodes, newNodeStatus(node, toState), toState, toState.isAllocated() ? node.allocation() : Optional.empty(), - node.history().recordStateTransition(node.state(), toState, agent, clock.instant()), + node.history().recordStateTransition(node.state(), toState, agent, reason, clock.instant()), node.type(), node.reports(), node.modelName(), node.reservedTo(), node.exclusiveToApplicationId(), node.provisionedForApplicationId(), node.hostTTL(), node.hostEmptyAt(), node.exclusiveToClusterType(), node.switchHostname(), node.trustedCertificates(), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index c9eb8d04daa4..167dea8ad247 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -126,7 +126,8 @@ public class NodeSerializer { // History event fields private static final String historyEventTypeKey = "type"; private static final String atKey = "at"; - private static final String agentKey = "agent"; // retired events only + private static final String agentKey = "agent"; + private static final String reasonKey = "reason"; // Network port fields private static final String networkPortsKey = "networkPorts"; @@ -238,6 +239,7 @@ private void toSlime(History.Event event, Cursor object) { object.setString(historyEventTypeKey, toString(event.type())); object.setLong(atKey, event.at().toEpochMilli()); object.setString(agentKey, toString(event.agent())); + event.reason().ifPresent(reason -> object.setString(reasonKey, reason)); } private void toSlime(List addresses, Cursor array, boolean dummyDueToErasure) { @@ -365,7 +367,8 @@ private History.Event eventFromSlime(Inspector object) { if (type == null) return null; Instant at = Instant.ofEpochMilli(object.field(atKey).asLong()); Agent agent = eventAgentFromSlime(object.field(agentKey)); - return new History.Event(type, agent, at); + Optional reason = SlimeUtils.optionalString(object.field(reasonKey)); + return new History.Event(type, agent, at, reason); } private Generation generationFromSlime(Inspector object, String wantedField, String currentField) { From 7b50dc3c9dfa8f9983b095687a18e4e02faf22da Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Mon, 9 Dec 2024 11:09:31 +0100 Subject: [PATCH 2/2] Add event reason to /nodes/v2 API --- .../yahoo/vespa/hosted/provision/restapi/NodesResponse.java | 1 + .../hosted/provision/restapi/responses/dockerhost6.json | 6 ++++-- .../provision/restapi/responses/node4-after-changes.json | 6 ++++-- .../provision/restapi/responses/node5-after-changes.json | 6 ++++-- .../vespa/hosted/provision/restapi/responses/node5.json | 6 ++++-- 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java index 03044319d54f..5b0c0f0e16e0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java @@ -237,6 +237,7 @@ private void toSlime(Collection events, Cursor array) { object.setString("event", event.type().name()); object.setLong("at", event.at().toEpochMilli()); object.setString("agent", event.agent().name()); + event.reason().ifPresent(reason -> object.setString("reason", reason)); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/dockerhost6.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/dockerhost6.json index 0e1892b41905..f30c50b6c27b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/dockerhost6.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/dockerhost6.json @@ -49,7 +49,8 @@ { "event": "failed", "at": 123, - "agent": "operator" + "agent": "operator", + "reason": "MockNodeRepository" }, { "event": "deprovisioned", @@ -76,7 +77,8 @@ { "event": "failed", "at": 123, - "agent": "operator" + "agent": "operator", + "reason": "MockNodeRepository" }, { "event": "deprovisioned", diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json index a3589e760ade..325e250bf48d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json @@ -88,7 +88,8 @@ { "event": "wantToFail", "at": 123, - "agent": "operator" + "agent": "operator", + "reason": "Failed through the nodes/v2 API" }, { "event": "rebooted", @@ -135,7 +136,8 @@ { "event": "wantToFail", "at": 123, - "agent": "operator" + "agent": "operator", + "reason": "Failed through the nodes/v2 API" }, { "event": "rebooted", diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5-after-changes.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5-after-changes.json index fafaf61ef78a..8f3c32a4ce66 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5-after-changes.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5-after-changes.json @@ -48,7 +48,8 @@ { "event": "failed", "at": 123, - "agent": "system" + "agent": "system", + "reason": "MockNodeRepository" } ], "log": [ @@ -70,7 +71,8 @@ { "event": "failed", "at": 123, - "agent": "system" + "agent": "system", + "reason": "MockNodeRepository" } ], "ipAddresses": ["::5:1", "127.0.5.1"], diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5.json index aae4fe84fb86..c7d2d7988edd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node5.json @@ -50,7 +50,8 @@ { "event": "failed", "at": 123, - "agent": "system" + "agent": "system", + "reason": "MockNodeRepository" } ], "log": [ @@ -72,7 +73,8 @@ { "event": "failed", "at": 123, - "agent": "system" + "agent": "system", + "reason": "MockNodeRepository" } ], "ipAddresses": ["::5:1", "127.0.5.1"],