Skip to content

Commit

Permalink
Merge pull request #33011 from vespa-engine/freva/fail-reason
Browse files Browse the repository at this point in the history
Write reason to node history for wTF, failed and parked events
  • Loading branch information
mpolden authored Dec 10, 2024
2 parents 3465fd9 + 7b50dc3 commit 63cb8f5
Show file tree
Hide file tree
Showing 11 changed files with 45 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,9 @@ public Optional<WireguardKeyWithTimestamp> wireguardPubKey() {
/**
* Returns a copy of this where wantToFail is set to true and history is updated to reflect this.
*/
public Node withWantToFail(boolean wantToFail, Agent agent, Instant at) {
Node node = this.with(status.withWantToFail(wantToFail));
if (wantToFail)
node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at)));
return node;

public Node withWantToFail(Agent agent, Instant at, String reason) {
return with(status.withWantToFail(true))
.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at, Optional.of(reason))));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ private void failActive(FailingNode failing) {

if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock.parent());
markWantToFail(failing.node(), true, failing.reason, lock.parent());
redeploy = true;
}
}
Expand All @@ -260,7 +260,7 @@ private void failActive(FailingNode failing) {
if (failing == null) return;

log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock);
markWantToFail(failing.node(), true, failing.reason, lock);
redeploy = true;
}
}
Expand Down Expand Up @@ -295,16 +295,19 @@ private void redeploy(Deployment deployment, FailingNode failing) {
Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
if (optionalNodeMutex.isEmpty()) return;
try (var nodeMutex = optionalNodeMutex.get()) {
markWantToFail(nodeMutex.node(), false, nodeMutex);
markWantToFail(nodeMutex.node(), false, failing.reason(), nodeMutex);
log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
" for " + failing.reason() + ": " + Exceptions.toMessageString(e));
}
}
}

private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
private void markWantToFail(Node node, boolean wantToFail, String reason, Mutex lock) {
if (node.status().wantToFail() != wantToFail) {
nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
node = wantToFail ?
node.withWantToFail(Agent.NodeFailer, clock().instant(), reason) :
node.with(node.status().withWantToFail(false));
nodeRepository().nodes().write(node, lock);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,20 @@ private ImmutableMap.Builder<Event.Type, Event> builderWithout(Event.Type type)
}

/** Returns a copy of this history with a record of this state transition added, if applicable */
public History recordStateTransition(Node.State from, Node.State to, Agent agent, Instant at) {
public History recordStateTransition(Node.State from, Node.State to, Agent agent, Optional<String> reason, Instant at) {
// If the event is a re-reservation, allow the new one to override the older one.
if (from == to && from != Node.State.reserved) return this;
// Only keeping reason for errored states (failed, parked)
return switch (to) {
case provisioned -> this.with(new Event(Event.Type.provisioned, agent, at));
case deprovisioned -> this.with(new Event(Event.Type.deprovisioned, agent, at));
case ready -> this.withoutApplicationEvents().with(new Event(Event.Type.readied, agent, at));
case active -> this.with(new Event(Event.Type.activated, agent, at));
case inactive -> this.with(new Event(Event.Type.deactivated, agent, at));
case reserved -> this.with(new Event(Event.Type.reserved, agent, at));
case failed -> this.with(new Event(Event.Type.failed, agent, at));
case failed -> this.with(new Event(Event.Type.failed, agent, at, reason));
case dirty -> this.with(new Event(Event.Type.deallocated, agent, at));
case parked -> this.with(new Event(Event.Type.parked, agent, at));
case parked -> this.with(new Event(Event.Type.parked, agent, at, reason));
case breakfixed -> this.with(new Event(Event.Type.breakfixed, agent, at));
};
}
Expand All @@ -180,7 +181,10 @@ public String toString() {
}

/** An event which may happen to a node */
public record Event(Type type, Agent agent, Instant at) {
public record Event(Type type, Agent agent, Instant at, Optional<String> reason) {
public Event(Type type, Agent agent, Instant at) {
this(type, agent, at, Optional.empty());
}

public enum Type {
// State changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public List<Node> deactivate(List<Node> nodes, ApplicationTransaction transactio
*/
public List<Node> fail(List<Node> nodes, ApplicationTransaction transaction) {
return db.writeTo(Node.State.failed,
nodes.stream().map(n -> n.withWantToFail(false, Agent.application, clock.instant())).toList(),
nodes.stream().map(n -> n.with(n.status().withWantToFail(false))).toList(),
Agent.application, Optional.of("Failed by application"), transaction);
}

Expand Down Expand Up @@ -380,7 +380,7 @@ public List<Node> failOrMarkRecursively(String hostname, Agent agent, String rea

private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) {
if (node.state() == Node.State.active) {
node = node.withWantToFail(true, agent, clock.instant());
node = node.withWantToFail(agent, clock.instant(), reason);
write(node, lock);
return node;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public List<Node> addNodesInState(LockedNodeList nodes, Node.State expectedState
if (node.state() != expectedState)
throw new IllegalArgumentException(node + " is not in the " + expectedState + " state");

node = node.with(node.history().recordStateTransition(null, expectedState, agent, clock.instant()));
node = node.with(node.history().recordStateTransition(null, expectedState, agent, Optional.empty(), clock.instant()));
byte[] serialized = nodeSerializer.toJson(node);
curatorTransaction.add(CuratorOperations.create(nodePath(node).getAbsolute(), serialized));
}
Expand Down Expand Up @@ -234,7 +234,7 @@ public List<Node> writeTo(Node.State toState, List<Node> nodes,
newNodeStatus(node, toState),
toState,
toState.isAllocated() ? node.allocation() : Optional.empty(),
node.history().recordStateTransition(node.state(), toState, agent, clock.instant()),
node.history().recordStateTransition(node.state(), toState, agent, reason, clock.instant()),
node.type(), node.reports(), node.modelName(), node.reservedTo(),
node.exclusiveToApplicationId(), node.provisionedForApplicationId(), node.hostTTL(), node.hostEmptyAt(),
node.exclusiveToClusterType(), node.switchHostname(), node.trustedCertificates(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ public class NodeSerializer {
// History event fields
private static final String historyEventTypeKey = "type";
private static final String atKey = "at";
private static final String agentKey = "agent"; // retired events only
private static final String agentKey = "agent";
private static final String reasonKey = "reason";

// Network port fields
private static final String networkPortsKey = "networkPorts";
Expand Down Expand Up @@ -238,6 +239,7 @@ private void toSlime(History.Event event, Cursor object) {
object.setString(historyEventTypeKey, toString(event.type()));
object.setLong(atKey, event.at().toEpochMilli());
object.setString(agentKey, toString(event.agent()));
event.reason().ifPresent(reason -> object.setString(reasonKey, reason));
}

private void toSlime(List<String> addresses, Cursor array, boolean dummyDueToErasure) {
Expand Down Expand Up @@ -365,7 +367,8 @@ private History.Event eventFromSlime(Inspector object) {
if (type == null) return null;
Instant at = Instant.ofEpochMilli(object.field(atKey).asLong());
Agent agent = eventAgentFromSlime(object.field(agentKey));
return new History.Event(type, agent, at);
Optional<String> reason = SlimeUtils.optionalString(object.field(reasonKey));
return new History.Event(type, agent, at, reason);
}

private Generation generationFromSlime(Inspector object, String wantedField, String currentField) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ private void toSlime(Collection<History.Event> events, Cursor array) {
object.setString("event", event.type().name());
object.setLong("at", event.at().toEpochMilli());
object.setString("agent", event.agent().name());
event.reason().ifPresent(reason -> object.setString("reason", reason));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
{
"event": "failed",
"at": 123,
"agent": "operator"
"agent": "operator",
"reason": "MockNodeRepository"
},
{
"event": "deprovisioned",
Expand All @@ -76,7 +77,8 @@
{
"event": "failed",
"at": 123,
"agent": "operator"
"agent": "operator",
"reason": "MockNodeRepository"
},
{
"event": "deprovisioned",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@
{
"event": "wantToFail",
"at": 123,
"agent": "operator"
"agent": "operator",
"reason": "Failed through the nodes/v2 API"
},
{
"event": "rebooted",
Expand Down Expand Up @@ -135,7 +136,8 @@
{
"event": "wantToFail",
"at": 123,
"agent": "operator"
"agent": "operator",
"reason": "Failed through the nodes/v2 API"
},
{
"event": "rebooted",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
{
"event": "failed",
"at": 123,
"agent": "system"
"agent": "system",
"reason": "MockNodeRepository"
}
],
"log": [
Expand All @@ -70,7 +71,8 @@
{
"event": "failed",
"at": 123,
"agent": "system"
"agent": "system",
"reason": "MockNodeRepository"
}
],
"ipAddresses": ["::5:1", "127.0.5.1"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@
{
"event": "failed",
"at": 123,
"agent": "system"
"agent": "system",
"reason": "MockNodeRepository"
}
],
"log": [
Expand All @@ -72,7 +73,8 @@
{
"event": "failed",
"at": 123,
"agent": "system"
"agent": "system",
"reason": "MockNodeRepository"
}
],
"ipAddresses": ["::5:1", "127.0.5.1"],
Expand Down

0 comments on commit 63cb8f5

Please sign in to comment.