Skip to content

Commit

Permalink
[YUNIKORN-2976] Handle multiple require node allocations per node (#1001
Browse files Browse the repository at this point in the history
)

If an allocation requires a specific node, the scheduler should not
consider any other node. We should allow multiple allocations that
require the same node to reserve the node at the same time. A required
node allocation must be placed on the node before anything else.
If other reservations that do not require the node have been made on it,
remove those existing reservations. Make sure that the
releases are tracked correctly in the partition.

After the removal of the repeat count, reservations can be simplified:
- track reservations using the allocation key
- remove the composite key setup
- remove the collection listener call on reserve or unreserve of a node

Closes: #1001

Signed-off-by: Craig Condit <[email protected]>
  • Loading branch information
wilfred-s authored and craigcondit committed Dec 13, 2024
1 parent f7d0e10 commit b02e15e
Show file tree
Hide file tree
Showing 15 changed files with 929 additions and 674 deletions.
27 changes: 20 additions & 7 deletions pkg/common/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,24 @@ package common

import "errors"

// InvalidQueueName returned when queue name is invalid
var InvalidQueueName = errors.New("invalid queue name, max 64 characters consisting of alphanumeric characters and '-', '_', '#', '@', '/', ':' allowed")
var (
	// InvalidQueueName is the error for a queue name that is invalid.
	InvalidQueueName = errors.New("invalid queue name, max 64 characters consisting of alphanumeric characters and '-', '_', '#', '@', '/', ':' allowed")

	// ErrorReservingAlloc is the error for an ask that is already allocated
	// and still tries to reserve a node.
	ErrorReservingAlloc = errors.New("ask already allocated, no reservation allowed")

	// ErrorDuplicateReserve is the error for a reservation that already
	// exists on the application.
	ErrorDuplicateReserve = errors.New("reservation already exists")

	// ErrorNodeAlreadyReserved is the error for a reservation that fails
	// because the node is already reserved.
	ErrorNodeAlreadyReserved = errors.New("node is already reserved")

	// ErrorNodeNotFitReserve is the error for a reservation that fails
	// because the allocation does not fit on the node even when it is empty.
	ErrorNodeNotFitReserve = errors.New("reservation does not fit on node")
)

const PreemptionPreconditionsFailed = "Preemption preconditions failed"
const PreemptionDoesNotGuarantee = "Preemption queue guarantees check failed"
const PreemptionShortfall = "Preemption helped but short of resources"
const PreemptionDoesNotHelp = "Preemption does not help"
const NoVictimForRequiredNode = "No fit on required node, preemption does not help"
// Fixed message texts used for AllocationLog entries.
const (
	// PreemptionPreconditionsFailed: the preconditions for preemption failed.
	PreemptionPreconditionsFailed = "Preemption preconditions failed"

	// PreemptionDoesNotGuarantee: the queue guarantees check for preemption failed.
	PreemptionDoesNotGuarantee = "Preemption queue guarantees check failed"

	// PreemptionShortfall: preemption helped but resources are still short.
	PreemptionShortfall = "Preemption helped but short of resources"

	// PreemptionDoesNotHelp: preemption does not help.
	PreemptionDoesNotHelp = "Preemption does not help"

	// NoVictimForRequiredNode: no fit on the required node and preemption does not help.
	NoVictimForRequiredNode = "No fit on required node, preemption does not help"
)
20 changes: 2 additions & 18 deletions pkg/scheduler/objects/allocation.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ type Allocation struct {
allowPreemptOther bool
originator bool
tags map[string]string
resKeyWithoutNode string // the reservation key without node
foreign bool
preemptable bool

Expand All @@ -57,9 +56,8 @@ type Allocation struct {
allocLog map[string]*AllocationLogEntry
preemptionTriggered bool
preemptCheckTime time.Time
schedulingAttempted bool // whether scheduler core has tried to schedule this allocation
scaleUpTriggered bool // whether this aloocation has triggered autoscaling or not
resKeyPerNode map[string]string // reservation key for a given node
schedulingAttempted bool // whether scheduler core has tried to schedule this allocation
scaleUpTriggered bool // whether this allocation has triggered autoscaling or not
allocatedResource *resources.Resource
askEvents *schedEvt.AskEvents
userQuotaCheckFailed bool
Expand Down Expand Up @@ -145,8 +143,6 @@ func NewAllocationFromSI(alloc *si.Allocation) *Allocation {
allowPreemptOther: alloc.PreemptionPolicy.GetAllowPreemptOther(),
originator: alloc.Originator,
allocLog: make(map[string]*AllocationLogEntry),
resKeyPerNode: make(map[string]string),
resKeyWithoutNode: reservationKeyWithoutNode(alloc.ApplicationID, alloc.AllocationKey),
askEvents: schedEvt.NewAskEvents(events.GetEventSystem()),
allocated: allocated,
nodeID: nodeID,
Expand Down Expand Up @@ -554,18 +550,6 @@ func (a *Allocation) HasTriggeredScaleUp() bool {
return a.scaleUpTriggered
}

// setReservationKeyForNode stores resKey in the resKeyPerNode map under the
// given node, replacing any value previously stored for that node.
// The allocation's write lock is held while the map is updated.
func (a *Allocation) setReservationKeyForNode(node, resKey string) {
a.Lock()
defer a.Unlock()
a.resKeyPerNode[node] = resKey
}

// getReservationKeyForNode returns the value stored in the resKeyPerNode map
// for the given node. A node without an entry yields the empty string, the
// zero value of the map's string values.
// The allocation's read lock is held while the map is read.
func (a *Allocation) getReservationKeyForNode(node string) string {
a.RLock()
defer a.RUnlock()
return a.resKeyPerNode[node]
}

func (a *Allocation) setHeadroomCheckFailed(headroom *resources.Resource, queue string) {
a.Lock()
defer a.Unlock()
Expand Down
9 changes: 5 additions & 4 deletions pkg/scheduler/objects/allocation_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ func (art AllocationResultType) String() string {
}

type AllocationResult struct {
ResultType AllocationResultType
Request *Allocation
NodeID string
ReservedNodeID string
ResultType AllocationResultType
Request *Allocation
NodeID string
ReservedNodeID string
CancelledReservations int
}

func (ar *AllocationResult) String() string {
Expand Down
1 change: 0 additions & 1 deletion pkg/scheduler/objects/allocation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ func TestNewAsk(t *testing.T) {
askStr := ask.String()
expected := "allocationKey ask-1, applicationID app-1, Resource map[first:10], Allocated false"
assert.Equal(t, askStr, expected, "Strings should have been equal")
assert.Equal(t, "app-1|ask-1", ask.resKeyWithoutNode) //nolint:staticcheck
}

func TestAskAllocateDeallocate(t *testing.T) {
Expand Down
Loading

0 comments on commit b02e15e

Please sign in to comment.