Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(nvidia): use last successful data in shared poller, shared nvidia-smi/nvml poller to still return data if one operation fails #265

Merged
merged 6 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 20 additions & 27 deletions components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type component struct {
func (c *component) Name() string { return bad_envs_id.Name }

func (c *component) States(ctx context.Context) ([]components.State, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", bad_envs_id.Name)
return []components.State{
Expand All @@ -53,28 +53,19 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

output := ToOutput(allOutput)
return output.States()
}
Expand All @@ -101,24 +92,26 @@ func (c *component) Close() error {
var _ components.OutputProvider = (*component)(nil)

func (c *component) Output() (any, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", bad_envs_id.Name)
return nil, query.ErrNoData
}
if err != nil {
return nil, err
}
if last.Error != nil {
return nil, last.Error
}
if last.Output == nil {
return nil, nil
}

output, ok := last.Output.(*nvidia_query.Output)
allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T, expected *nvidia_query.Output", last.Output)
}
return ToOutput(output), nil
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

return ToOutput(allOutput), nil
}
27 changes: 9 additions & 18 deletions components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ type component struct {
func (c *component) Name() string { return nvidia_clock_speed_id.Name }

func (c *component) States(ctx context.Context) ([]components.State, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_clock_speed_id.Name)
return []components.State{
Expand All @@ -57,28 +57,19 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

if allOutput.SMIExists && len(allOutput.SMIQueryErrors) > 0 {
cs := make([]components.State, 0)
for _, e := range allOutput.SMIQueryErrors {
Expand Down
27 changes: 9 additions & 18 deletions components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ type component struct {
func (c *component) Name() string { return nvidia_ecc_id.Name }

func (c *component) States(ctx context.Context) ([]components.State, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_ecc_id.Name)
return []components.State{
Expand All @@ -57,28 +57,19 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

if allOutput.SMIExists && len(allOutput.SMIQueryErrors) > 0 {
cs := make([]components.State, 0)
for _, e := range allOutput.SMIQueryErrors {
Expand Down
27 changes: 9 additions & 18 deletions components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ type component struct {
func (c *component) Name() string { return Name }

func (c *component) States(ctx context.Context) ([]components.State, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", Name)
return []components.State{
Expand All @@ -53,28 +53,19 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

if allOutput.SMIExists && len(allOutput.SMIQueryErrors) > 0 {
cs := make([]components.State, 0)
for _, e := range allOutput.SMIQueryErrors {
Expand Down
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/error/sxid/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,11 @@ func (o *Output) getEvents(since time.Time) []components.Event {
des := make([]components.Event, 0)
for i, sxidErr := range reason.Errors {
if sxidErr.Time.IsZero() {
log.Logger.Warnw("skipping event because it's too old", "sxid", sxidErr.SXid, "since", since, "event_time", sxidErr.Time.Time)
log.Logger.Debugw("skipping event because it's too old", "sxid", sxidErr.SXid, "since", since, "event_time", sxidErr.Time.Time)
continue
}
if sxidErr.Time.Time.Before(since) {
log.Logger.Warnw("skipping event because it's too old", "sxid", sxidErr.SXid, "since", since, "event_time", sxidErr.Time.Time)
log.Logger.Debugw("skipping event because it's too old", "sxid", sxidErr.SXid, "since", since, "event_time", sxidErr.Time.Time)
continue
}

Expand Down
44 changes: 10 additions & 34 deletions components/accelerator/nvidia/error/xid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,30 +43,6 @@ func (c *component) Name() string { return nvidia_component_error_xid_id.Name }

// Just checks if the xid poller is working.
func (c *component) States(_ context.Context) ([]components.State, error) {
last, err := c.poller.Last()

// no data yet from realtime xid poller
// just return whatever we got from dmesg
if err == query.ErrNoData {
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", nvidia_component_error_xid_id.Name)
return []components.State{
{
Name: StateNameErrorXid,
Healthy: true,
Reason: "no xid error event",
},
}, nil
}

// something went wrong in the poller
// just return an error to surface the issue
if err != nil {
return nil, err
}
if last.Error != nil {
return nil, last.Error
}

return []components.State{
{
Name: StateNameErrorXid,
Expand Down Expand Up @@ -118,7 +94,7 @@ func (c *component) tailScan() (*Output, error) {
o.DmesgErrors = append(o.DmesgErrors, ev)
}

last, err := c.poller.Last()
last, err := c.poller.LastSuccess()

// no data yet from realtime xid poller
// just return whatever we got from dmesg
Expand All @@ -132,22 +108,22 @@ func (c *component) tailScan() (*Output, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return nil, last.Error
}

// no output from the poller
// just return whatever we got from dmesg
if last.Output == nil {
return o, nil
}

ev, ok := last.Output.(*nvidia_query_nvml.XidEvent)
if !ok { // shoild never happen
return nil, fmt.Errorf("invalid output type: %T, expected nvidia_query_nvml.XidEvent", last.Output)
}
if ev != nil && ev.Xid > 0 {
o.NVMLXidEvent = ev

if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}

lastSuccessPollElapsed := time.Now().UTC().Sub(ev.Time.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}
}

return o, nil
Expand Down
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/error/xid/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,11 @@ func (o *Output) getEvents(since time.Time) []components.Event {
des := make([]components.Event, 0)
for i, xidErr := range reason.Errors {
if xidErr.Time.IsZero() {
log.Logger.Warnw("skipping event because it's too old", "xid", xidErr.Xid, "since", since, "event_time", xidErr.Time.Time)
log.Logger.Debugw("skipping event because it's too old", "xid", xidErr.Xid, "since", since, "event_time", xidErr.Time.Time)
continue
}
if xidErr.Time.Time.Before(since) {
log.Logger.Warnw("skipping event because it's too old", "xid", xidErr.Xid, "since", since, "event_time", xidErr.Time.Time)
log.Logger.Debugw("skipping event because it's too old", "xid", xidErr.Xid, "since", since, "event_time", xidErr.Time.Time)
continue
}

Expand Down
27 changes: 9 additions & 18 deletions components/accelerator/nvidia/fabric-manager/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ type component struct {
func (c *component) Name() string { return Name }

func (c *component) States(ctx context.Context) ([]components.State, error) {
last, err := c.poller.Last()
last, err := c.poller.LastSuccess()
if err == query.ErrNoData { // no data
log.Logger.Debugw("nothing found in last state (no data collected yet)", "component", Name)
return []components.State{
Expand All @@ -70,28 +70,19 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if err != nil {
return nil, err
}
if last.Error != nil {
return []components.State{
{
Healthy: false,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
}
if last.Output == nil {
return []components.State{
{
Healthy: false,
Reason: "no output",
},
}, nil
}

allOutput, ok := last.Output.(*nvidia_query.Output)
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if lerr := c.poller.LastError(); lerr != nil {
log.Logger.Warnw("last query failed -- returning cached, possibly stale data", "error", lerr)
}
lastSuccessPollElapsed := time.Now().UTC().Sub(allOutput.Time)
if lastSuccessPollElapsed > 2*c.poller.Config().Interval.Duration {
log.Logger.Warnw("last poll is too old", "elapsed", lastSuccessPollElapsed, "interval", c.poller.Config().Interval.Duration)
}

if !allOutput.FabricManagerExists {
return []components.State{
{
Expand Down
Loading
Loading