Skip to content

Commit

Permalink
Fix: Correct resource reporting for single gpu on shared gpu worker (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jsun-m authored Jan 3, 2025
1 parent 60f3eee commit ac7c555
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 8 deletions.
13 changes: 5 additions & 8 deletions pkg/worker/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ func (w *Worker) collectAndSendContainerMetrics(ctx context.Context, request *ty
ticker := time.NewTicker(w.config.Monitoring.ContainerMetricsInterval)
defer ticker.Stop()

monitor := NewProcessMonitor(containerPid, spec.Linux.Resources.Devices)
monitor := NewProcessMonitor(containerPid, spec.Linux.Resources.Devices, w.containerCudaManager.GetContainerGPUDevices(request.ContainerId))

for {
select {
Expand Down Expand Up @@ -74,10 +74,11 @@ type ProcessMonitor struct {
lastIO process.IOCountersStat
lastNetIO net.IOCountersStat
gpuInfoClient GPUInfoClient
gpuDeviceIds []int
}

func NewProcessMonitor(pid int, devices []specs.LinuxDeviceCgroup) *ProcessMonitor {
return &ProcessMonitor{pid: int32(pid), devices: devices, gpuInfoClient: &NvidiaInfoClient{}}
// NewProcessMonitor builds a ProcessMonitor for the process identified by pid.
//
// pid is narrowed to int32 for storage. The devices and gpuDeviceIds slices
// are stored as passed in (they are not copied), and the GPU info client is
// always a fresh *NvidiaInfoClient.
func NewProcessMonitor(pid int, devices []specs.LinuxDeviceCgroup, gpuDeviceIds []int) *ProcessMonitor {
	monitor := &ProcessMonitor{
		pid:           int32(pid),
		devices:       devices,
		gpuDeviceIds:  gpuDeviceIds,
		gpuInfoClient: &NvidiaInfoClient{},
	}

	return monitor
}

func (m *ProcessMonitor) GetStatistics() (*ProcessStats, error) {
Expand Down Expand Up @@ -112,12 +113,8 @@ func (m *ProcessMonitor) GetStatistics() (*ProcessStats, error) {

func (m *ProcessMonitor) fetchGPUMemory() *GPUInfoStat {
stat := &GPUInfoStat{}
availableDevices, err := m.gpuInfoClient.AvailableGPUDevices()
if err != nil {
return stat
}

for _, device := range availableDevices {
for _, device := range m.gpuDeviceIds {
stats, err := m.gpuInfoClient.GetGPUMemoryUsage(device)
if err == nil {
stat.MemoryUsed += uint64(stats.UsedCapacity)
Expand Down
10 changes: 10 additions & 0 deletions pkg/worker/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ var (

type GPUManager interface {
AssignGPUDevices(containerId string, gpuCount uint32) (*AssignedGpuDevices, error)
GetContainerGPUDevices(containerId string) []int
UnassignGPUDevices(containerId string)
InjectEnvVars(env []string, options *ContainerOptions) ([]string, bool)
InjectMounts(mounts []specs.Mount) []specs.Mount
Expand Down Expand Up @@ -120,6 +121,15 @@ func (c *ContainerNvidiaManager) AssignGPUDevices(containerId string, gpuCount u
}, nil
}

// GetContainerGPUDevices looks up containerId in the manager's GPU allocation
// map and returns the device IDs stored for it.
//
// When the map has no entry for containerId, an empty, non-nil slice is
// returned, so callers can range over the result without a nil check. The
// stored slice is returned as-is, not copied.
func (c *ContainerNvidiaManager) GetContainerGPUDevices(containerId string) []int {
	if assigned, found := c.gpuAllocationMap.Get(containerId); found {
		return assigned
	}

	return []int{}
}

func (c *ContainerNvidiaManager) chooseDevices(containerId string, requestedGpuCount uint32) ([]int, error) {
c.mu.Lock()
defer c.mu.Unlock()
Expand Down

0 comments on commit ac7c555

Please sign in to comment.