Skip to content

Commit

Permalink
fix(nvidia/infiniband): adjust default port rate based on GPU product (
Browse files Browse the repository at this point in the history
…#198)

* fix(nvidia/infiniband): adjust default port rate based on GPU product

Signed-off-by: Gyuho Lee <[email protected]>

* remove

Signed-off-by: Gyuho Lee <[email protected]>

* bump up go

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Nov 21, 2024
1 parent b0f3451 commit e43da55
Show file tree
Hide file tree
Showing 13 changed files with 307 additions and 76 deletions.
8 changes: 7 additions & 1 deletion components/accelerator/nvidia/infiniband/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,21 @@ func (o *Output) Evaluate(cfg Config) (string, bool, error) {
return fmt.Sprintf("infiniband suppported but ibstat errors found: %s", strings.Join(o.Ibstat.Errors, ", ")), false, nil
}
if len(o.Ibstat.Parsed) > 0 {
// no port count is set, use the gpu count as port count
expectedPortCount := cfg.ExpectedPortStates.PortCount
if expectedPortCount == 0 {
expectedPortCount = o.GPUCount
}

// no rate is set, use the default rate based on the product
expectedRate := cfg.ExpectedPortStates.Rate
if expectedRate == 0 {
expectedRate = infiniband.SupportsInfinibandPortRate(o.GPUProductName)
}

upCards := o.Ibstat.Parsed.CountByRates(expectedRate, "Active", "LinkUp")
if upCards != expectedPortCount {
return fmt.Sprintf("only %d out of %d ibstat cards are active and link up", upCards, expectedPortCount), false, nil
return fmt.Sprintf("only %d out of %d ibstat cards are active and link up (expected rate: %d Gb/sec)", upCards, expectedPortCount, expectedRate), false, nil
}
}
}
Expand Down
62 changes: 57 additions & 5 deletions components/accelerator/nvidia/infiniband/component_output_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,36 @@ func TestOutputStates(t *testing.T) {
expectedReason: "no infiniband class found or no ibstat exists or no ibstat error found",
},
{
name: "Not all cards active and up",
name: "Not all cards active and up (A100) with default rate",
cfg: Config{
ExpectedPortStates: ExpectedPortStates{
PortCount: 0,
Rate: 400,
ExpectedPortStates: ExpectedPortStates{},
},
o: &Output{
GPUProductName: "NVIDIA A100",
GPUCount: 8,
InfinibandClassExists: true,
IbstatExists: true,
Ibstat: infiniband.IbstatOutput{
Parsed: infiniband.IBStatCards{
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}},
},
},
},
expectedHealthy: false,
expectedReason: "only 6 out of 8 ibstat cards are active and link up (expected rate: 200 Gb/sec)",
},
{
name: "Not all cards active and up (H100) with default rate",
cfg: Config{
ExpectedPortStates: ExpectedPortStates{},
},
o: &Output{
GPUProductName: "NVIDIA H100",
GPUCount: 8,
Expand All @@ -109,7 +132,36 @@ func TestOutputStates(t *testing.T) {
},
},
expectedHealthy: false,
expectedReason: "only 6 out of 8 ibstat cards are active and link up",
expectedReason: "only 6 out of 8 ibstat cards are active and link up (expected rate: 400 Gb/sec)",
},
{
name: "Not all cards active and up (H100) with lower rate",
cfg: Config{
ExpectedPortStates: ExpectedPortStates{
PortCount: 6,
Rate: 200,
},
},
o: &Output{
GPUProductName: "NVIDIA H100",
GPUCount: 8,
InfinibandClassExists: true,
IbstatExists: true,
Ibstat: infiniband.IbstatOutput{
Parsed: infiniband.IBStatCards{
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}},
{Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}},
},
},
},
expectedHealthy: true,
expectedReason: "no infiniband class found or no ibstat exists or no ibstat error found",
},
}

Expand Down
9 changes: 1 addition & 8 deletions components/accelerator/nvidia/infiniband/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ import (
query_config "github.com/leptonai/gpud/components/query/config"
)

const (
DefaultExpectedRate = 400
)

type Config struct {
Query query_config.Config `json:"query"`

Expand All @@ -24,7 +20,7 @@ type ExpectedPortStates struct {
PortCount int `json:"port_count"`

// The expected rate in Gb/sec.
// If not set, it defaults to 400.
// If not set, it defaults to a rate based on the GPU product (e.g., 200 for A100, 400 for H100).
Rate int `json:"rate"`
}

Expand All @@ -45,8 +41,5 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
}

func (cfg *Config) Validate() error {
if cfg.ExpectedPortStates.Rate == 0 {
cfg.ExpectedPortStates.Rate = DefaultExpectedRate
}
return nil
}
57 changes: 0 additions & 57 deletions components/accelerator/nvidia/infiniband/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,60 +53,3 @@ func TestParseConfig(t *testing.T) {
})
}
}

// TestConfig_Validate runs Config.Validate over a table of configurations and
// compares the resulting ExpectedPortStates with the expected ones.
func TestConfig_Validate(t *testing.T) {
	type testCase struct {
		name       string
		config     Config
		wantError  bool
		wantConfig Config
	}

	cases := []testCase{
		{
			name:       "zero expected rate should set default",
			config:     Config{ExpectedPortStates: ExpectedPortStates{Rate: 0}},
			wantConfig: Config{ExpectedPortStates: ExpectedPortStates{Rate: DefaultExpectedRate}},
			wantError:  false,
		},
		{
			name:       "non-zero expected rate should remain unchanged",
			config:     Config{ExpectedPortStates: ExpectedPortStates{Rate: 200}},
			wantConfig: Config{ExpectedPortStates: ExpectedPortStates{Rate: 200}},
			wantError:  false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := tc.config.Validate()

			switch {
			case tc.wantError && err == nil:
				t.Errorf("Validate() error = nil, wantErr = true")
			case tc.wantError:
				// An error was expected and returned; the config is not inspected.
			case err != nil:
				t.Errorf("Validate() error = %v, wantErr = false", err)
			default:
				got, want := tc.config.ExpectedPortStates, tc.wantConfig.ExpectedPortStates
				if !reflect.DeepEqual(got, want) {
					t.Errorf("ExpectedPortStates = %v, want %v", got, want)
				}
			}
		})
	}
}
12 changes: 12 additions & 0 deletions components/accelerator/nvidia/query/infiniband/infiniband.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@ func SupportsInfinibandProduct(gpuProductName string) bool {
return strings.Contains(p, "a100") || strings.Contains(p, "h100")
}

// SupportsInfinibandPortRate returns the default per-port rate in Gb/sec for
// the given GPU product name. The name is matched case-insensitively:
// it returns 200 if the name contains "a100", 400 if it contains "h100",
// and 0 if it contains neither.
func SupportsInfinibandPortRate(gpuProductName string) int {
	p := strings.ToLower(gpuProductName)
	switch {
	case strings.Contains(p, "a100"):
		// Checked first, so a name containing both substrings yields 200.
		return 200
	case strings.Contains(p, "h100"):
		return 400
	default:
		return 0
	}
}

func IbstatExists() bool {
p, err := exec.LookPath("ibstat")
if err != nil {
Expand Down
58 changes: 58 additions & 0 deletions components/accelerator/nvidia/query/infiniband/infiniband_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,61 @@ func TestSupportsInfinibandProduct(t *testing.T) {
})
}
}

// TestSupportsInfinibandPortRate checks the rate that
// SupportsInfinibandPortRate reports for a set of GPU product names.
func TestSupportsInfinibandPortRate(t *testing.T) {
	type testCase struct {
		name        string
		productName string
		want        int
	}

	cases := []testCase{
		// e.g.,
		// "gpu_1x_h100_sxm5" in Lambda Labs
		// "gpu_2x_h100_sxm5" in Lambda Labs
		// "gpu_8x_h100_sxm5" in Lambda Labs
		// H100s in Paperspace
		{name: "H100 supports Infiniband", productName: "NVIDIA H100 80GB HBM3", want: 400},

		// e.g.,
		// "gpu_1x_a100_sxm4" in Lambda Labs
		{name: "A100 40GB supports Infiniband", productName: "NVIDIA A100-SXM4-40GB", want: 200},

		// e.g.,
		// "gpu_8x_a100_80gb_sxm4" in Lambda Labs
		{name: "A100 80GB supports Infiniband", productName: "NVIDIA A100-SXM4-80GB", want: 200},

		// e.g.,
		// "gpu_1x_a10" in Lambda Labs
		{name: "A10 does not support Infiniband", productName: "NVIDIA A10", want: 0},

		{name: "RTX 4090 does not support Infiniband", productName: "NVIDIA GeForce RTX 4090", want: 0},
		{name: "TITAN V does not support Infiniband", productName: "NVIDIA TITAN V", want: 0},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := SupportsInfinibandPortRate(tc.productName)
			if got == tc.want {
				return
			}
			t.Errorf("SupportsInfinibandPortRate(%q) = %v, want %v", tc.productName, got, tc.want)
		})
	}
}
22 changes: 18 additions & 4 deletions components/accelerator/nvidia/query/infiniband/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,28 +58,42 @@ func TestParseIBStatCountByRates(t *testing.T) {
expectedCount int
}{
{
fileName: "testdata/ibstat.47.0.all.active.0",
fileName: "testdata/ibstat.47.0.a100.all.active.0",
rate: 200,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
expectedCount: 9,
},
{
fileName: "testdata/ibstat.47.0.a100.all.active.0",
rate: 100,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
expectedCount: 9,
},
{
fileName: "testdata/ibstat.47.0.h100.all.active.0",
rate: 400,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
expectedCount: 8,
},
{
fileName: "testdata/ibstat.47.0.all.active.1",
fileName: "testdata/ibstat.47.0.h100.all.active.1",
rate: 400,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
expectedCount: 8,
},
{
fileName: "testdata/ibstat.47.0.some.down.0",
fileName: "testdata/ibstat.47.0.h100.some.down.0",
rate: 400,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
expectedCount: 8,
},
{
fileName: "testdata/ibstat.47.0.some.down.1",
fileName: "testdata/ibstat.47.0.h100.some.down.1",
rate: 400,
expectedState: "Active",
expectedPhysicalState: "LinkUp",
Expand Down
Loading

0 comments on commit e43da55

Please sign in to comment.