feat(nvidia/query): bump up nvidia-smi cmd timeout, better debugging info #261

Merged: 14 commits, Dec 25, 2024
2 changes: 2 additions & 0 deletions components/accelerator/nvidia/hw-slowdown/component.go
@@ -57,6 +57,8 @@ const (
)

func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
// the default nvidia poller persists the events to the storage
// so we can just read from the storage
events, err := nvidia_clock_events_state.ReadEvents(
ctx,
c.db,
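The new comment documents the read path here: the hw-slowdown component no longer queries the GPU on demand but reads clock events that the default NVIDIA poller has already persisted. A minimal sketch of that read-from-storage pattern, with a hypothetical `clock_events` table and column names standing in for the real clock-events-state schema:

```go
package main

import (
	"context"
	"database/sql"
	"time"
)

// Event is a simplified stand-in for the component event type.
type Event struct {
	Time    time.Time
	Name    string
	Message string
}

// readEventsSince reads previously persisted clock events from a local
// database, mirroring the "poller persists, component reads" pattern.
// The table and column names here are assumptions for illustration only.
func readEventsSince(ctx context.Context, db *sql.DB, since time.Time) ([]Event, error) {
	rows, err := db.QueryContext(ctx,
		`SELECT unix_seconds, name, message FROM clock_events WHERE unix_seconds >= ? ORDER BY unix_seconds ASC`,
		since.Unix(),
	)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var events []Event
	for rows.Next() {
		var ts int64
		var ev Event
		if err := rows.Scan(&ts, &ev.Name, &ev.Message); err != nil {
			return nil, err
		}
		ev.Time = time.Unix(ts, 0).UTC()
		events = append(events, ev)
	}
	return events, rows.Err()
}
```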
@@ -18,10 +18,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
return &Output{}
}

o := &Output{
PersistencedExists: i.PersistencedExists,
PersistencedRunning: i.PersistencedRunning,
}
o := &Output{}

if i.NVML != nil {
for _, device := range i.NVML.DeviceInfos {
@@ -39,9 +36,6 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

type Output struct {
PersistencedExists bool `json:"persistenced_exists"`
PersistencedRunning bool `json:"persistenced_running"`

PersistenceModesSMI []nvidia_query.SMIGPUPersistenceMode `json:"persistence_modes_smi"`
PersistenceModesNVML []nvidia_query_nvml.PersistenceMode `json:"persistence_modes_nvml"`
}
@@ -94,10 +88,6 @@ func (o *Output) Evaluate() (string, bool, error) {

enabled := true
for _, p := range o.PersistenceModesSMI {
if o.PersistencedRunning {
continue
}

// legacy mode (https://docs.nvidia.com/deploy/driver-persistence/index.html#installation)
// "The reason why we cannot immediately deprecate the legacy persistence mode and switch transparently to the NVIDIA Persistence Daemon is because at this time,
// we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box."
@@ -108,10 +98,6 @@ func (o *Output) Evaluate() (string, bool, error) {
}

for _, p := range o.PersistenceModesNVML {
if o.PersistencedRunning {
continue
}

// legacy mode (https://docs.nvidia.com/deploy/driver-persistence/index.html#installation)
// "The reason why we cannot immediately deprecate the legacy persistence mode and switch transparently to the NVIDIA Persistence Daemon is because at this time,
// we cannot guarantee that the NVIDIA Persistence Daemon will be running. This would be a feature regression as persistence mode might not be available out-of- the-box."
@@ -121,15 +107,6 @@ func (o *Output) Evaluate() (string, bool, error) {
}
}

// does not make the component unhealthy, since persistence mode can still be enabled
// recommend installing nvidia-persistenced since it's the recommended way to enable persistence mode
if !o.PersistencedExists {
reasons = append(reasons, "nvidia-persistenced does not exist (install 'nvidia-persistenced' or run 'nvidia-smi -pm 1')")
}
if !o.PersistencedRunning {
reasons = append(reasons, "nvidia-persistenced exists but not running (start 'nvidia-persistenced' or run 'nvidia-smi -pm 1')")
}

return strings.Join(reasons, "; "), enabled, nil
}

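With the nvidia-persistenced checks removed, Evaluate reports purely on the per-GPU persistence mode readings from SMI and NVML. A rough sketch of the resulting shape, using simplified placeholder types rather than the real Output struct and SMI/NVML result types:

```go
package main

import (
	"fmt"
	"strings"
)

// persistenceMode is a simplified stand-in for the per-GPU SMI/NVML results.
type persistenceMode struct {
	BusID   string
	Enabled bool
}

// evaluate mirrors the post-change logic: no check for a running
// nvidia-persistenced daemon, only the per-GPU mode readings matter.
// Illustrative sketch only; names and messages are assumptions.
func evaluate(smiModes, nvmlModes []persistenceMode) (string, bool) {
	reasons := []string{}
	enabled := true

	for _, p := range smiModes {
		if !p.Enabled {
			enabled = false
			reasons = append(reasons, fmt.Sprintf("persistence mode not enabled on %s per nvidia-smi (run 'nvidia-smi -pm 1')", p.BusID))
		}
	}
	for _, p := range nvmlModes {
		if !p.Enabled {
			enabled = false
			reasons = append(reasons, fmt.Sprintf("persistence mode not enabled on %s per NVML (run 'nvidia-smi -pm 1')", p.BusID))
		}
	}
	if len(reasons) == 0 {
		reasons = append(reasons, "persistence mode enabled on all GPUs")
	}
	return strings.Join(reasons, "; "), enabled
}

func main() {
	reason, ok := evaluate(
		[]persistenceMode{{BusID: "0000:0a:00.0", Enabled: true}},
		[]persistenceMode{{BusID: "0000:0a:00.0", Enabled: false}},
	)
	fmt.Println(ok, reason)
}
```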
20 changes: 0 additions & 20 deletions components/accelerator/nvidia/query/nvidia_persistenced.go

This file was deleted.

15 changes: 8 additions & 7 deletions components/accelerator/nvidia/query/nvidia_smi_query.go
@@ -11,6 +11,7 @@ import (
"strings"

metrics_clock_events_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/clock-events-state"
"github.com/leptonai/gpud/log"
"github.com/leptonai/gpud/pkg/file"
"github.com/leptonai/gpud/pkg/process"

@@ -25,6 +26,7 @@ func SMIExists() bool {
}

func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
log.Logger.Debugw("finding nvidia-smi")
nvidiaSMIPath, err := file.LocateExecutable("nvidia-smi")
if err != nil {
return nil, fmt.Errorf("nvidia-smi not found (%w)", err)
@@ -38,6 +40,7 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
return nil, err
}

log.Logger.Debugw("starting nvidia-smi", "args", args)
if err := p.Start(ctx); err != nil {
return nil, err
}
@@ -63,9 +66,8 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
// [Sat Oct 12 18:38:44 2024] _nv042330rm+0x10/0x40 [nvidia]
// [Sat Oct 12 18:38:44 2024] ? _nv043429rm+0x23c/0x290
errc := make(chan error, 1)
var output []byte
lines := make([]string, 0)
go func() {
lines := make([]string, 0)
err := process.Read(
ctx,
p,
@@ -76,20 +78,18 @@ func RunSMI(ctx context.Context, args ...string) ([]byte, error) {
}),
process.WithWaitForCmd(),
)

errc <- err
output = []byte(strings.Join(lines, "\n"))
}()

select {
case <-ctx.Done():
return nil, ctx.Err()
return nil, fmt.Errorf("nvidia-smi command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), strings.Join(lines, "\n"))

case err := <-errc:
if err != nil {
return nil, fmt.Errorf("nvidia-smi command failed: %w\n\noutput:\n%s", err, string(output))
return nil, fmt.Errorf("nvidia-smi command failed: %w\n\n(partial) output:\n%s", err, strings.Join(lines, "\n"))
}
return output, nil
return []byte(strings.Join(lines, "\n")), nil
}
}

@@ -101,6 +101,7 @@ func GetSMIOutput(ctx context.Context) (*SMIOutput, error) {
if err != nil {
return nil, err
}

o, err := ParseSMIQueryOutput(qb)
if err != nil {
return nil, err
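The substantive change in RunSMI is that the collected output lines now live outside the reader goroutine, so a timeout error can carry the partial nvidia-smi output instead of a bare ctx.Err(). A self-contained sketch of that pattern using plain os/exec rather than gpud's process package; the mutex here is an addition to keep the partial read race-free, and the flag passed in main is only an example:

```go
package main

import (
	"bufio"
	"context"
	"fmt"
	"os/exec"
	"strings"
	"sync"
	"time"
)

// runWithPartialOutput runs a command under ctx and, on timeout or failure,
// returns an error that carries whatever output was collected so far.
func runWithPartialOutput(ctx context.Context, name string, args ...string) (string, error) {
	cmd := exec.CommandContext(ctx, name, args...)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return "", err
	}
	if err := cmd.Start(); err != nil {
		return "", err
	}

	var mu sync.Mutex
	var lines []string
	partial := func() string {
		mu.Lock()
		defer mu.Unlock()
		return strings.Join(lines, "\n")
	}

	errc := make(chan error, 1)
	go func() {
		scanner := bufio.NewScanner(stdout)
		for scanner.Scan() {
			mu.Lock()
			lines = append(lines, scanner.Text())
			mu.Unlock()
		}
		errc <- cmd.Wait()
	}()

	select {
	case <-ctx.Done():
		// CommandContext kills the process; report what we saw before the deadline.
		return "", fmt.Errorf("command timed out: %w\n\n(partial) output:\n%s", ctx.Err(), partial())
	case err := <-errc:
		if err != nil {
			return "", fmt.Errorf("command failed: %w\n\n(partial) output:\n%s", err, partial())
		}
		return partial(), nil
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	out, err := runWithPartialOutput(ctx, "nvidia-smi", "--query")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(out)
}
```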
23 changes: 23 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml.go
@@ -175,9 +175,13 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
}

nvmlLib := nvml.New()

log.Logger.Debugw("initializing nvml library")
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting driver version from nvml library")
driverVersion, err := GetDriverVersion()
if err != nil {
return nil, err
@@ -186,19 +190,25 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
if err != nil {
return nil, err
}

log.Logger.Debugw("checking if clock events are supported")
clockEventsSupported := ClockEventsSupportedVersion(major)
if !clockEventsSupported {
log.Logger.Warnw("old nvidia driver -- skipping clock events, see https://github.com/NVIDIA/go-nvml/pull/123", "version", driverVersion)
}

log.Logger.Debugw("successfully initialized NVML", "driverVersion", driverVersion)

log.Logger.Debugw("creating device library")
deviceLib := device.New(nvmlLib)

log.Logger.Debugw("creating info library")
infoLib := nvinfo.New(
nvinfo.WithNvmlLib(nvmlLib),
nvinfo.WithDeviceLib(deviceLib),
)

log.Logger.Debugw("checking if nvml exists from info library")
nvmlExists, nvmlExistsMsg := infoLib.HasNvml()
if !nvmlExists {
log.Logger.Warnw("nvml not found", "message", nvmlExistsMsg)
@@ -258,6 +268,7 @@ func (inst *instance) Start() error {
inst.mu.Lock()
defer inst.mu.Unlock()

log.Logger.Debugw("creating xid sxid event history table")
ctx, cancel := context.WithTimeout(inst.rootCtx, 10*time.Second)
defer cancel()
if err := nvidia_xid_sxid_state.CreateTableXidSXidEventHistory(ctx, inst.db); err != nil {
@@ -266,6 +277,7 @@ func (inst *instance) Start() error {

// "NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with:
// "error getting device handle for index '6': Unknown Error"
log.Logger.Debugw("getting devices from device library")
devices, err := inst.deviceLib.GetDevices()
if err != nil {
return err
@@ -285,30 +297,38 @@ func (inst *instance) Start() error {
}

// TODO: this returns 0 for all GPUs...
log.Logger.Debugw("getting device minor number")
minorNumber, ret := d.GetMinorNumber()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device minor number: %v", nvml.ErrorString(ret))
}

// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g8789a616b502a78a1013c45cbb86e1bd
log.Logger.Debugw("getting device pci info")
pciInfo, ret := d.GetPciInfo()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device PCI info: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting device name")
name, ret := d.GetName()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting device cores")
cores, ret := d.GetNumGpuCores()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get device cores: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("getting supported event types")
supportedEvents, ret := d.GetSupportedEventTypes()
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to get supported event types: %v", nvml.ErrorString(ret))
}

log.Logger.Debugw("registering events")
ret = d.RegisterEvents(inst.xidEventMask&supportedEvents, inst.xidEventSet)
if ret != nvml.SUCCESS {
return fmt.Errorf("failed to register events: %v", nvml.ErrorString(ret))
@@ -318,6 +338,7 @@ func (inst *instance) Start() error {
inst.xidErrorSupported = false
}

log.Logger.Debugw("checking if gpm metrics are supported")
gpmMetricsSpported, err := GPMSupportedByDevice(d)
if err != nil {
return err
@@ -571,6 +592,8 @@ func StartDefaultInstance(rootCtx context.Context, opts ...OpOption) error {
return nil
}

log.Logger.Debugw("creating a new default nvml instance")

var err error
defaultInstance, err = NewInstance(rootCtx, opts...)
if err != nil {
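Most of the nvml.go changes wrap each NVML call in NewInstance and Start with a Debugw breadcrumb so a hang or failure points at a specific step (init, device enumeration, event registration). For context, a bare-bones sketch of the per-device Xid event registration those logs surround, written directly against the upstream go-nvml bindings rather than gpud's device/info wrappers:

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("failed to initialize NVML: %v", nvml.ErrorString(ret))
	}
	defer func() { _ = nvml.Shutdown() }()

	eventSet, ret := nvml.EventSetCreate()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to create event set: %v", nvml.ErrorString(ret))
	}
	defer func() { _ = eventSet.Free() }()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("failed to count devices: %v", nvml.ErrorString(ret))
	}

	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get device %d: %v", i, nvml.ErrorString(ret))
		}
		name, ret := device.GetName()
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get device name: %v", nvml.ErrorString(ret))
		}

		supported, ret := device.GetSupportedEventTypes()
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to get supported event types: %v", nvml.ErrorString(ret))
		}
		// Register only the Xid critical error events this device supports,
		// mirroring the xidEventMask & supportedEvents intersection above.
		ret = device.RegisterEvents(supported&uint64(nvml.EventTypeXidCriticalError), eventSet)
		if ret != nvml.SUCCESS {
			log.Fatalf("failed to register events for %s: %v", name, nvml.ErrorString(ret))
		}
		fmt.Printf("registered Xid events for %s\n", name)
	}
}
```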