Skip to content

Commit

Permalink
fix(nvidia/infiniband): use sysclass ib directory count as default po…
Browse files Browse the repository at this point in the history
…rt state checks, use Infiniband PCI bus count to decide whether Infiniband is enabled or not (#200)

* fix(nvidia/infiniband): use sys class ib dir count as default

Signed-off-by: Gyuho Lee <[email protected]>

* use pci bus count

Signed-off-by: Gyuho Lee <[email protected]>

* remove

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Nov 21, 2024
1 parent 8bea6e9 commit 112f28b
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 281 deletions.
20 changes: 20 additions & 0 deletions components/accelerator/nvidia/infiniband/component_output.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package infiniband

import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"

"github.com/leptonai/gpud/components"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
Expand Down Expand Up @@ -97,14 +99,32 @@ func (o *Output) Evaluate(cfg Config) (string, bool, error) {
if !infiniband.SupportsInfinibandProduct(o.GPUProductName) {
return fmt.Sprintf("%q GPUs do not support infiniband", o.GPUProductName), true, nil
}

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
count, err := infiniband.CountInfinibandPCIBuses(ctx)
if err != nil {
return fmt.Sprintf("failed to count infiniband pci buses: %s", err), false, nil
}
if count == 0 {
return "no infiniband pci buses found", true, nil
}

if o.InfinibandClassExists && o.IbstatExists {
if len(o.Ibstat.Errors) > 0 {
return fmt.Sprintf("infiniband suppported but ibstat errors found: %s", strings.Join(o.Ibstat.Errors, ", ")), false, nil
}
if len(o.Ibstat.Parsed) > 0 {
// no port count is set, use the gpu count as port count
expectedPortCount := cfg.ExpectedPortStates.PortCount

// some H100 machines only have 1 ib port in ib class dir
if expectedPortCount == 0 {
expectedPortCount = infiniband.CountInfinibandClass()
}

// H100 machines with 12 ib ports should default to the GPU count 8
if expectedPortCount == 0 || expectedPortCount > o.GPUCount {
expectedPortCount = o.GPUCount
}

Expand Down
275 changes: 0 additions & 275 deletions components/accelerator/nvidia/infiniband/component_output_test.go

This file was deleted.

55 changes: 52 additions & 3 deletions components/accelerator/nvidia/query/infiniband/infiniband.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
package infiniband

import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
Expand Down Expand Up @@ -41,10 +43,57 @@ func IbstatExists() bool {
return p != ""
}

// Checks if "/sys/class/infiniband" directory exists.
func InfinibandClassExists() bool {
// CountInfinibandPCIBuses returns the number of lines in the output of
// "lspci" that contain "infiniband" (case-insensitive).
//
// It is the programmatic equivalent of:
//
//	lspci | grep -i infiniband
//	1a:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7]
//	3c:00.0 Infiniband controller: Mellanox Technologies MT2910 Family [ConnectX-7]
//
// An error is returned when "lspci" cannot be found in PATH, when the command
// fails (including cancellation or timeout of ctx), or when its output cannot
// be scanned. The returned count is always 0 when the error is non-nil.
func CountInfinibandPCIBuses(ctx context.Context) (int, error) {
	p, err := exec.LookPath("lspci")
	if err != nil {
		return 0, fmt.Errorf("lspci not found (%w)", err)
	}

	// Parse stdout only: diagnostics that lspci writes to stderr must not be
	// mistaken for device lines and inflate the count.
	b, err := exec.CommandContext(ctx, p).Output()
	if err != nil {
		return 0, fmt.Errorf("failed to run lspci (%w)", err)
	}

	count := 0
	s := bufio.NewScanner(bytes.NewReader(b))
	for s.Scan() {
		if strings.Contains(strings.ToLower(s.Text()), "infiniband") {
			count++
		}
	}
	if err := s.Err(); err != nil {
		// A partial count is meaningless to callers, so do not report it.
		return 0, fmt.Errorf("failed to scan lspci output (%w)", err)
	}
	return count, nil
}

// CountInfinibandClass returns the number of entries found directly under
// "/sys/class/infiniband".
// It returns 0 if that path does not exist, is not a directory, or cannot
// be read.
func CountInfinibandClass() int {
	return countInfinibandClass("/sys/class/infiniband")
}

// countInfinibandClass returns the number of entries found directly under dir.
// Every entry is counted, whatever its type. It returns 0 if dir does not
// exist, is not a directory, or cannot be read.
func countInfinibandClass(dir string) int {
	// os.ReadDir already fails for missing paths and for non-directories,
	// so no separate os.Stat check is needed.
	entries, err := os.ReadDir(dir)
	if err != nil {
		return 0
	}
	return len(entries)
}

func RunIbstat(ctx context.Context) (*IbstatOutput, error) {
Expand Down
Loading

0 comments on commit 112f28b

Please sign in to comment.