Skip to content

Commit

Permalink
Merge pull request #27753 from vespa-engine/jonmv/limit-max-deprovisi…
Browse files Browse the repository at this point in the history
…oned-nodes

Jonmv/limit max deprovisioned nodes
  • Loading branch information
jonmv authored Jul 12, 2023
2 parents ccb2515 + fcabb9a commit 6995d69
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@

import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.node.History.Event.Type;

import java.time.Duration;
import java.time.Instant;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

import static java.util.Comparator.comparing;

/**
* This removes hosts from {@link com.yahoo.vespa.hosted.provision.Node.State#deprovisioned}, in dynamically provisioned
* zones, after a grace period.
Expand All @@ -17,21 +24,37 @@
*/
public class DeprovisionedExpirer extends Expirer {

private static final int maxDeprovisionedNodes = 1000;

DeprovisionedExpirer(NodeRepository nodeRepository, Duration expiryTime, Metric metric) {
super(Node.State.deprovisioned, History.Event.Type.deprovisioned, nodeRepository, expiryTime, metric);
}

@Override
protected boolean isExpired(Node node) {
return nodeRepository().zone().cloud().dynamicProvisioning() &&
super.isExpired(node);
return nodeRepository().zone().cloud().dynamicProvisioning() && super.isExpired(node);
}

@Override
protected void expire(List<Node> expired) {
for (var node : expired) {
nodeRepository().nodes().forget(node);
protected NodeList getExpiredNodes() {
List<Node> deprovisioned = nodeRepository().nodes().list(Node.State.deprovisioned)
.sortedBy(comparing(node -> node.history().event(Type.deprovisioned)
.map(History.Event::at)
.orElse(Instant.EPOCH)))
.asList();
Deque<Node> expired = new ArrayDeque<>(deprovisioned);
int kept = 0;
while ( ! expired.isEmpty()) {
if (isExpired(expired.getLast()) || kept++ >= maxDeprovisionedNodes) break; // If we encounter an expired node, the rest are also expired.
expired.removeLast();
}
return NodeList.copyOf(List.copyOf(expired));
}

@Override
protected void expire(List<Node> expired) {
nodeRepository().nodes().performOn(NodeList.copyOf(expired),
(node, lock) -> { nodeRepository().nodes().forget(node); return node; });
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer {

@Override
protected double maintain() {
NodeList expired = nodeRepository().nodes().list(fromState).matching(this::isExpired);
NodeList expired = getExpiredNodes();

if ( ! expired.isEmpty()) {
log.info(fromState + " expirer found " + expired.size() + " expired nodes: " + expired);
Expand All @@ -51,6 +51,10 @@ protected double maintain() {
return 1.0;
}

protected NodeList getExpiredNodes() {
return nodeRepository().nodes().list(fromState).matching(this::isExpired);
}

protected boolean isExpired(Node node) {
return isExpired(node, expiryTime);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ private void failActive(FailingNode failing) {
// so we must release ours before failing the children.
List<FailingNode> activeChildrenToFail = new ArrayList<>();
boolean redeploy = false;
try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { // TODO: recursive lock for right order, only for hosts though
// Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner)))
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ private boolean removeRetiredNodes(ApplicationId application) {
}
boolean redeploy = false;
List<String> nodesToDeactivate = new ArrayList<>();
try (var lock = nodeRepository().applications().lock(application)) {
try (var lock = nodeRepository().applications().lock(application)) { // TODO: take recusrive lock for right order
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
Map<Removal, NodeList> nodesByRemovalReason = activeNodes.owner(application)
.retired()
Expand Down

0 comments on commit 6995d69

Please sign in to comment.