feat(nvidia/query): bump up nvidia-smi cmd timeout, better debugging info #261

Merged: 14 commits, Dec 25, 2024
2 changes: 2 additions & 0 deletions components/accelerator/nvidia/hw-slowdown/component.go
@@ -57,6 +57,8 @@ const (
)

func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
// the default nvidia poller persists the events to the storage
// so we can just read from the storage
events, err := nvidia_clock_events_state.ReadEvents(
ctx,
c.db,
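The new comment documents the read path here: the hw-slowdown component no longer queries the GPU on demand but reads clock events that the default NVIDIA poller has already persisted. A minimal sketch of that read-from-storage pattern, with a hypothetical `clock_events` table and column names standing in for the real clock-events-state schema:

```go
package main

import (
	"context"
	"database/sql"
	"time"
)

// Event is a simplified stand-in for the component event type.
type Event struct {
	Time    time.Time
	Name    string
	Message string
}

// readEventsSince reads previously persisted clock events from a local
// database, mirroring the "poller persists, component reads" pattern.
// The table and column names here are assumptions for illustration only.
func readEventsSince(ctx context.Context, db *sql.DB, since time.Time) ([]Event, error) {
	rows, err := db.QueryContext(ctx,
		`SELECT unix_seconds, name, message FROM clock_events WHERE unix_seconds >= ? ORDER BY unix_seconds ASC`,
		since.Unix(),
	)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var events []Event
	for rows.Next() {
		var ts int64
		var ev Event
		if err := rows.Scan(&ts, &ev.Name, &ev.Message); err != nil {
			return nil, err
		}
		ev.Time = time.Unix(ts, 0).UTC()
		events = append(events, ev)
	}
	return events, rows.Err()
}
```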
@@ -18,10 +18,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
return &Output{}
}

o := &Output{
PersistencedExists: i.PersistencedExists,
PersistencedRunning: i.PersistencedRunning,
}
o := &Output{}

if i.NVML != nil {
for _, device := range i.NVML.DeviceInfos {
@@ -39,9 +36,6 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

type Output struct {
PersistencedExists bool `json:"persistenced_exists"`
PersistencedRunning bool `json:"persistenced_running"`

PersistenceModesSMI []nvidia_query.SMIGPUPersistenceMode `json:"persistence_modes_smi"`
PersistenceModesNVML []nvidia_query_nvml.PersistenceMode `json:"persistence_modes_nvml"`
}
@@ -94,10 +88,6 @@ func (o *Output) Evaluate() (string, bool, error) {

enabled := true
for _, p := range o.PersistenceModesSMI {
if o.PersistencedRunning {
continue
}

// legacy mode (https://docs.nvidia.com/deploy/driver-persistence/index.html#installation)
// "The reason why we cannot immediately deprecate the legacy persistence mode and switch transparently to the NVIDIA Persistence Daemon is because at this time,
// we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box."
@@ -108,10 +98,6 @@ func (o *Output) Evaluate() (string, bool, error) {
}

for _, p := range o.PersistenceModesNVML {
if o.PersistencedRunning {
continue
}

// legacy mode (https://docs.nvidia.com/deploy/driver-persistence/index.html#installation)
// "The reason why we cannot immediately deprecate the legacy persistence mode and switch transparently to the NVIDIA Persistence Daemon is because at this time,
// we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box."
@@ -121,15 +107,6 @@ func (o *Output) Evaluate() (string, bool, error) {
}
}

// does not make the component unhealthy, since persistence mode can still be enabled
// recommend installing nvidia-persistenced since it's the recommended way to enable persistence mode
if !o.PersistencedExists {
reasons = append(reasons, "nvidia-persistenced does not exist (install 'nvidia-persistenced' or run 'nvidia-smi -pm 1')")
}
if !o.PersistencedRunning {
reasons = append(reasons, "nvidia-persistenced exists but not running (start 'nvidia-persistenced' or run 'nvidia-smi -pm 1')")
}

return strings.Join(reasons, "; "), enabled, nil
}

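With the nvidia-persistenced checks removed, Evaluate reports purely on the per-GPU persistence mode readings from SMI and NVML. A rough sketch of the resulting shape, using simplified placeholder types rather than the real Output struct and SMI/NVML result types:

```go
package main

import (
	"fmt"
	"strings"
)

// persistenceMode is a simplified stand-in for the per-GPU SMI/NVML results.
type persistenceMode struct {
	BusID   string
	Enabled bool
}

// evaluate mirrors the post-change logic: no check for a running
// nvidia-persistenced daemon, only the per-GPU mode readings matter.
// Illustrative sketch only; names and messages are assumptions.
func evaluate(smiModes, nvmlModes []persistenceMode) (string, bool) {
	reasons := []string{}
	enabled := true

	for _, p := range smiModes {
		if !p.Enabled {
			enabled = false
			reasons = append(reasons, fmt.Sprintf("persistence mode not enabled on %s per nvidia-smi (run 'nvidia-smi -pm 1')", p.BusID))
		}
	}
	for _, p := range nvmlModes {
		if !p.Enabled {
			enabled = false
			reasons = append(reasons, fmt.Sprintf("persistence mode not enabled on %s per NVML (run 'nvidia-smi -pm 1')", p.BusID))
		}
	}
	if len(reasons) == 0 {
		reasons = append(reasons, "persistence mode enabled on all GPUs")
	}
	return strings.Join(reasons, "; "), enabled
}

func main() {
	reason, ok := evaluate(
		[]persistenceMode{{BusID: "0000:0a:00.0", Enabled: true}},
		[]persistenceMode{{BusID: "0000:0a:00.0", Enabled: false}},
	)
	fmt.Println(ok, reason)
}
```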
20 changes: 0 additions & 20 deletions components/accelerator/nvidia/query/nvidia_persistenced.go

This file was deleted.

15 changes: 8 additions & 7 deletions components/accelerator/nvidia/query/nvidia_smi_query.go
@@ -11,6 +11,7 @@ import (
"strings"

metrics_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state"
"github.com/leptonai/gpud/log"
"github.com/leptonai/gpud/pkg/file"
"github.com/leptonai/gpud/pkg/process"

@@ -25,6 +26,7 @@ func SMIExists() bool {
}

func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
log.Logger.Debugw("finding nvidia-smi")
nvidiaSMIPath, err := file.LocateExecutable("nvidia-smi")
if err != nil {
return nil, fmt.Errorf("nvidia-smi not found (%w)", err)
@@ -38,6 +40,7 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
return nil, err
}

log.Logger.Debugw("starting nvidia-smi", "args", args)
if err := p.Start(ctx); err != nil {
return nil, err
}
@@ -63,9 +66,8 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
// [Sat Oct 12 18:38:44 2024] _nv042330rm+0x10/0x40 [nvidia]
// [Sat Oct 12 18:38:44 2024] ? _nv043429rm+0x23c/0x290
errc := make(chan error, 1)
var output []byte
lines := make([]string, 0)
go func() {
lines := make([]string, 0)
err := process.Read(
ctx,
p,
@@ -76,20 +78,18 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
}),
process.WithWaitForCmd(),
)

errc <- err
output = []byte(strings.Join(lines, "\n"))
}()

select {
case <-ctx.Done():
return nil, ctx.Err()
return nil, fmt.Errorf("nvidia-smi command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), strings.Join(lines, "\n"))

case err := <-errc:
if err != nil {
return nil, fmt.Errorf("nvidia-smi command failed: %w\n\noutput:\n%s", err, string(output))
return nil, fmt.Errorf("nvidia-smi command failed: %w\n\n(partial) output:\n%s", err, strings.Join(lines, "\n"))
}
return output, nil
return []byte(strings.Join(lines, "\n")), nil
}
}

@@ -101,6 +101,7 @@ func GetSMIOutput(ctx context.Context) (*SMIOutput, error) {
if err != nil {
return nil, err
}

o, err := ParseSMIQueryOutput(qb)
if err != nil {
return nil, err
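The substantive change in RunSMI is that the collected output lines now live outside the reader goroutine, so a timeout error can carry the partial nvidia-smi output instead of a bare ctx.Err(). A self-contained sketch of that pattern using plain os/exec rather than gpud's process package; the mutex here is an addition to keep the partial read race-free, and the flag passed in main is only an example:

```go
package main

import (
	"bufio"
	"context"
	"fmt"
	"os/exec"
	"strings"
	"sync"
	"time"
)

// runWithPartialOutput runs a command under ctx and, on timeout or failure,
// returns an error that carries whatever output was collected so far.
func runWithPartialOutput(ctx context.Context, name string, args ...string) (string, error) {
	cmd := exec.CommandContext(ctx, name, args...)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return "", err
	}
	if err := cmd.Start(); err != nil {
		return "", err
	}

	var mu sync.Mutex
	var lines []string
	partial := func() string {
		mu.Lock()
		defer mu.Unlock()
		return strings.Join(lines, "\n")
	}

	errc := make(chan error, 1)
	go func() {
		scanner := bufio.NewScanner(stdout)
		for scanner.Scan() {
			mu.Lock()
			lines = append(lines, scanner.Text())
			mu.Unlock()
		}
		errc <- cmd.Wait()
	}()

	select {
	case <-ctx.Done():
		// CommandContext kills the process; report what we saw before the deadline.
		return "", fmt.Errorf("command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), partial())
	case err := <-errc:
		if err != nil {
			return "", fmt.Errorf("command failed: %w\n\n(partial) output:\n%s", err, partial())
		}
		return partial(), nil
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	out, err := runWithPartialOutput(ctx, "nvidia-smi", "--query")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(out)
}
```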
23 changes: 23 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml.go
@@ -175,9 +175,13 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
}

nvmlLib := nvml.New()

log.Logger.Debugw("initializing nvml library")
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting driver version from nvml library")
driverVersion, err := GetDriverVersion()
if err != nil {
return nil, err
@@ -186,19 +190,25 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
if err != nil {
return nil, err
}

log.Logger.Debugw("checking if clock events are supported")
clockEventsSupported := ClockEventsSupportedVersion(major)
if !clockEventsSupported {
log.Logger.Warnw("old nvidia driver -- skipping clock events, see https://github.com/NVIDIA/go-nvml/pull/123", "version", driverVersion)
}

log.Logger.Debugw("successfully initialized NVML", "driverVersion", driverVersion)

log.Logger.Debugw("creating device library")
deviceLib := device.New(nvmlLib)

log.Logger.Debugw("creating info library")
infoLib := nvinfo.New(
nvinfo.WithNvmlLib(nvmlLib),
nvinfo.WithDeviceLib(deviceLib),
)

log.Logger.Debugw("checking if nvml exists from info library")
nvmlExists, nvmlExistsMsg := infoLib.HasNvml()
if !nvmlExists {
log.Logger.Warnw("nvml not found", "message", nvmlExistsMsg)
@@ -258,6 +268,7 @@ func (inst *instance) Start() error {
inst.mu.Lock()
defer inst.mu.Unlock()

log.Logger.Debugw("creating xid sxid event history table")
ctx, cancel := context.WithTimeout(inst.rootCtx, 10*time.Second)
defer cancel()
if err := nvidia_xid_sxid_state.CreateTableXidSXidEventHistory(ctx, inst.db); err != nil {
@@ -266,6 +277,7 @@ func (inst *instance) Start() error {

// "NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with:
// "error getting device handle for index '6': Unknown Error"
log.Logger.Debugw("getting devices from device library")
devices, err := inst.deviceLib.GetDevices()
if err != nil {
return err
@@ -285,30 +297,38 @@ func (inst *instance) Start() error {
}

// TODO: this returns 0 for all GPUs...
log.Logger.Debugw("getting device minor number")
minorNumber, ret := d.GetMinorNumber()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device minor number: %v", nvml.ErrorString(ret))
}

// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g8789a616b502a78a1013c45cbb86e1bd
log.Logger.Debugw("getting device pci info")
pciInfo, ret := d.GetPciInfo()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device PCI info: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting device name")
name, ret := d.GetName()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting device cores")
cores, ret := d.GetNumGpuCores()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device cores: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting supported event types")
supportedEvents, ret := d.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get supported event types: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("registering events")
ret = d.RegisterEvents(inst.xidEventMask&supportedEvents, inst.xidEventSet)
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to register events: %v", nvml.ErrorString(ret))
@@ -318,6 +338,7 @@ func (inst *instance) Start() error {
inst.xidErrorSupported = false
}

log.Logger.Debugw("checking if gpm metrics are supported")
gpmMetricsSpported, err := GPMSupportedByDevice(d)
if err != nil {
return err
@@ -571,6 +592,8 @@ func StartDefaultInstance(rootCtx context.Context, opts ...OpOption) error {
return nil
}

log.Logger.Debugw("creating a new default nvml instance")

var err error
defaultInstance, err = NewInstance(rootCtx, opts...)
if err != nil {
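Most of the nvml.go changes wrap each NVML call in NewInstance and Start with a Debugw breadcrumb so a hang or failure points at a specific step (init, device enumeration, event registration). For context, a bare-bones sketch of the per-device Xid event registration those logs surround, written directly against the upstream go-nvml bindings rather than gpud's device/info wrappers:

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("failed to initialize NVML: %v", nvml.ErrorString(ret))
	}
	defer func() { _ = nvml.Shutdown() }()

	eventSet, ret := nvml.EventSetCreate()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to create event set: %v", nvml.ErrorString(ret))
	}
	defer func() { _ = eventSet.Free() }()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to count devices: %v", nvml.ErrorString(ret))
	}

	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get device %d: %v", i, nvml.ErrorString(ret))
		}
		name, ret := device.GetName()
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get device name: %v", nvml.ErrorString(ret))
		}

		supported, ret := device.GetSupportedEventTypes()
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get supported event types: %v", nvml.ErrorString(ret))
		}
		// Register only the Xid critical error events this device supports,
		// mirroring the xidEventMask & supportedEvents intersection above.
		ret = device.RegisterEvents(supported&uint64(nvml.EventTypeXidCriticalError), eventSet)
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to register events for %s: %v", name, nvml.ErrorString(ret))
		}
		fmt.Printf("registered Xid events for %s\n", name)
	}
}
```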