From e43da552cbe793e1f7bc051e828e602970fd6170 Mon Sep 17 00:00:00 2001 From: Gyuho Lee <6799218+gyuho@users.noreply.github.com> Date: Thu, 21 Nov 2024 23:26:18 +0800 Subject: [PATCH] fix(nvidia/infiniband): adjust default port rate based on GPU product (#198) * fix(nvidia/infiniband): adjust default port rate based on GPU product Signed-off-by: Gyuho Lee * remove Signed-off-by: Gyuho Lee * bump up go Signed-off-by: Gyuho Lee --------- Signed-off-by: Gyuho Lee --- .../nvidia/infiniband/component_output.go | 8 +- .../infiniband/component_output_test.go | 62 ++++++- .../accelerator/nvidia/infiniband/config.go | 9 +- .../nvidia/infiniband/config_test.go | 57 ------- .../nvidia/query/infiniband/infiniband.go | 12 ++ .../query/infiniband/infiniband_test.go | 58 +++++++ .../nvidia/query/infiniband/parse_test.go | 22 ++- .../testdata/ibstat.47.0.a100.all.active.0 | 153 ++++++++++++++++++ ...active.0 => ibstat.47.0.h100.all.active.0} | 0 ...active.1 => ibstat.47.0.h100.all.active.1} | 0 ...me.down.0 => ibstat.47.0.h100.some.down.0} | 0 ...me.down.1 => ibstat.47.0.h100.some.down.1} | 0 go.mod | 2 +- 13 files changed, 307 insertions(+), 76 deletions(-) create mode 100644 components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.a100.all.active.0 rename components/accelerator/nvidia/query/infiniband/testdata/{ibstat.47.0.all.active.0 => ibstat.47.0.h100.all.active.0} (100%) rename components/accelerator/nvidia/query/infiniband/testdata/{ibstat.47.0.all.active.1 => ibstat.47.0.h100.all.active.1} (100%) rename components/accelerator/nvidia/query/infiniband/testdata/{ibstat.47.0.some.down.0 => ibstat.47.0.h100.some.down.0} (100%) rename components/accelerator/nvidia/query/infiniband/testdata/{ibstat.47.0.some.down.1 => ibstat.47.0.h100.some.down.1} (100%) diff --git a/components/accelerator/nvidia/infiniband/component_output.go b/components/accelerator/nvidia/infiniband/component_output.go index 26adef4c..1f99624d 100644 --- 
a/components/accelerator/nvidia/infiniband/component_output.go +++ b/components/accelerator/nvidia/infiniband/component_output.go @@ -102,15 +102,21 @@ func (o *Output) Evaluate(cfg Config) (string, bool, error) { return fmt.Sprintf("infiniband suppported but ibstat errors found: %s", strings.Join(o.Ibstat.Errors, ", ")), false, nil } if len(o.Ibstat.Parsed) > 0 { + // no port count is set, use the gpu count as port count expectedPortCount := cfg.ExpectedPortStates.PortCount if expectedPortCount == 0 { expectedPortCount = o.GPUCount } + + // no rate is set, use the default rate based on the product expectedRate := cfg.ExpectedPortStates.Rate + if expectedRate == 0 { + expectedRate = infiniband.SupportsInfinibandPortRate(o.GPUProductName) + } upCards := o.Ibstat.Parsed.CountByRates(expectedRate, "Active", "LinkUp") if upCards != expectedPortCount { - return fmt.Sprintf("only %d out of %d ibstat cards are active and link up", upCards, expectedPortCount), false, nil + return fmt.Sprintf("only %d out of %d ibstat cards are active and link up (expected rate: %d Gb/sec)", upCards, expectedPortCount, expectedRate), false, nil } } } diff --git a/components/accelerator/nvidia/infiniband/component_output_test.go b/components/accelerator/nvidia/infiniband/component_output_test.go index edfd63ad..5e724c29 100644 --- a/components/accelerator/nvidia/infiniband/component_output_test.go +++ b/components/accelerator/nvidia/infiniband/component_output_test.go @@ -83,13 +83,36 @@ func TestOutputStates(t *testing.T) { expectedReason: "no infiniband class found or no ibstat exists or no ibstat error found", }, { - name: "Not all cards active and up", + name: "Not all cards active and up (A100) with default rate", cfg: Config{ - ExpectedPortStates: ExpectedPortStates{ - PortCount: 0, - Rate: 400, + ExpectedPortStates: ExpectedPortStates{}, + }, + o: &Output{ + GPUProductName: "NVIDIA A100", + GPUCount: 8, + InfinibandClassExists: true, + IbstatExists: true, + Ibstat: 
infiniband.IbstatOutput{ + Parsed: infiniband.IBStatCards{ + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}}, + }, }, }, + expectedHealthy: false, + expectedReason: "only 6 out of 8 ibstat cards are active and link up (expected rate: 200 Gb/sec)", + }, + { + name: "Not all cards active and up (H100) with default rate", + cfg: Config{ + ExpectedPortStates: ExpectedPortStates{}, + }, o: &Output{ GPUProductName: "NVIDIA H100", GPUCount: 8, @@ -109,7 +132,36 @@ func TestOutputStates(t *testing.T) { }, }, expectedHealthy: false, - expectedReason: "only 6 out of 8 ibstat cards are active and link up", + expectedReason: "only 6 out of 8 ibstat cards are active and link up (expected rate: 400 Gb/sec)", + }, + { + name: "Not all cards active and up (H100) with lower rate", + cfg: Config{ + ExpectedPortStates: ExpectedPortStates{ + PortCount: 6, + Rate: 200, + }, + }, + o: &Output{ + GPUProductName: "NVIDIA H100", + GPUCount: 8, + InfinibandClassExists: true, + IbstatExists: true, + Ibstat: infiniband.IbstatOutput{ + Parsed: infiniband.IBStatCards{ + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: 
"Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Active", PhysicalState: "LinkUp", Rate: 200}}, + {Port1: infiniband.IBStatPort{State: "Down", PhysicalState: "Disabled", Rate: 200}}, + }, + }, + }, + expectedHealthy: true, + expectedReason: "no infiniband class found or no ibstat exists or no ibstat error found", }, } diff --git a/components/accelerator/nvidia/infiniband/config.go b/components/accelerator/nvidia/infiniband/config.go index 460537d7..56987b45 100644 --- a/components/accelerator/nvidia/infiniband/config.go +++ b/components/accelerator/nvidia/infiniband/config.go @@ -7,10 +7,6 @@ import ( query_config "github.com/leptonai/gpud/components/query/config" ) -const ( - DefaultExpectedRate = 400 -) - type Config struct { Query query_config.Config `json:"query"` @@ -24,7 +20,7 @@ type ExpectedPortStates struct { PortCount int `json:"port_count"` // The expected rate in Gb/sec. - // If not set, it defaults to 400. + // If not set, it defaults to 200. 
Rate int `json:"rate"` } @@ -45,8 +41,5 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) { } func (cfg *Config) Validate() error { - if cfg.ExpectedPortStates.Rate == 0 { - cfg.ExpectedPortStates.Rate = DefaultExpectedRate - } return nil } diff --git a/components/accelerator/nvidia/infiniband/config_test.go b/components/accelerator/nvidia/infiniband/config_test.go index 6d86fd25..4c389448 100644 --- a/components/accelerator/nvidia/infiniband/config_test.go +++ b/components/accelerator/nvidia/infiniband/config_test.go @@ -53,60 +53,3 @@ func TestParseConfig(t *testing.T) { }) } } - -func TestConfig_Validate(t *testing.T) { - tests := []struct { - name string - config Config - wantError bool - wantConfig Config - }{ - { - name: "zero expected rate should set default", - config: Config{ - ExpectedPortStates: ExpectedPortStates{ - Rate: 0, - }, - }, - wantConfig: Config{ - ExpectedPortStates: ExpectedPortStates{ - Rate: DefaultExpectedRate, - }, - }, - wantError: false, - }, - { - name: "non-zero expected rate should remain unchanged", - config: Config{ - ExpectedPortStates: ExpectedPortStates{ - Rate: 200, - }, - }, - wantConfig: Config{ - ExpectedPortStates: ExpectedPortStates{ - Rate: 200, - }, - }, - wantError: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := tt.config.Validate() - if tt.wantError { - if err == nil { - t.Errorf("Validate() error = nil, wantErr = true") - } - return - } - if err != nil { - t.Errorf("Validate() error = %v, wantErr = false", err) - return - } - if !reflect.DeepEqual(tt.config.ExpectedPortStates, tt.wantConfig.ExpectedPortStates) { - t.Errorf("ExpectedPortStates = %v, want %v", tt.config.ExpectedPortStates, tt.wantConfig.ExpectedPortStates) - } - }) - } -} diff --git a/components/accelerator/nvidia/query/infiniband/infiniband.go b/components/accelerator/nvidia/query/infiniband/infiniband.go index d33be3c4..042f23f9 100644 --- 
a/components/accelerator/nvidia/query/infiniband/infiniband.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband.go @@ -21,6 +21,18 @@ func SupportsInfinibandProduct(gpuProductName string) bool { return strings.Contains(p, "a100") || strings.Contains(p, "h100") } +// SupportsInfinibandPortRate returns the default per-port rate in Gb/sec for a GPU product that supports infiniband (200 for A100, 400 for H100), or 0 otherwise. +func SupportsInfinibandPortRate(gpuProductName string) int { + p := strings.ToLower(gpuProductName) + if strings.Contains(p, "a100") { + return 200 + } + if strings.Contains(p, "h100") { + return 400 + } + return 0 +} + func IbstatExists() bool { p, err := exec.LookPath("ibstat") if err != nil { diff --git a/components/accelerator/nvidia/query/infiniband/infiniband_test.go b/components/accelerator/nvidia/query/infiniband/infiniband_test.go index 237fb5d7..7cb611ac 100644 --- a/components/accelerator/nvidia/query/infiniband/infiniband_test.go +++ b/components/accelerator/nvidia/query/infiniband/infiniband_test.go @@ -164,3 +164,61 @@ func TestSupportsInfinibandProduct(t *testing.T) { }) } } + +func TestSupportsInfinibandPortRate(t *testing.T) { + tests := []struct { + name string + productName string + want int + }{ + { + // e.g., + // "gpu_1x_h100_sxm5" in Lambda Labs + // "gpu_2x_h100_sxm5" in Lambda Labs + // "gpu_8x_h100_sxm5" in Lambda Labs + // H100s in Paperspace + name: "H100 supports Infiniband", + productName: "NVIDIA H100 80GB HBM3", + want: 400, + }, + { + // e.g., + // "gpu_1x_a100_sxm4" in Lambda Labs + name: "A100 40GB supports Infiniband", + productName: "NVIDIA A100-SXM4-40GB", + want: 200, + }, + { + // e.g., + // "gpu_8x_a100_80gb_sxm4" in Lambda Labs + name: "A100 80GB supports Infiniband", + productName: "NVIDIA A100-SXM4-80GB", + want: 200, + }, + { + // e.g., + // "gpu_1x_a10" in Lambda Labs + name: "A10 does not support Infiniband", + productName: "NVIDIA A10", + want: 0, + }, + { + name: "RTX 4090 does not support Infiniband", + productName: "NVIDIA GeForce RTX 4090", + want: 
0, + }, + { + name: "TITAN V does not support Infiniband", + productName: "NVIDIA TITAN V", + want: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := SupportsInfinibandPortRate(tt.productName); got != tt.want { + t.Errorf("SupportsInfinibandPortRate(%q) = %v, want %v", tt.productName, got, tt.want) + } + }) + } +} diff --git a/components/accelerator/nvidia/query/infiniband/parse_test.go b/components/accelerator/nvidia/query/infiniband/parse_test.go index 8fe2a5e5..1baac96b 100644 --- a/components/accelerator/nvidia/query/infiniband/parse_test.go +++ b/components/accelerator/nvidia/query/infiniband/parse_test.go @@ -58,28 +58,42 @@ func TestParseIBStatCountByRates(t *testing.T) { expectedCount int }{ { - fileName: "testdata/ibstat.47.0.all.active.0", + fileName: "testdata/ibstat.47.0.a100.all.active.0", + rate: 200, + expectedState: "Active", + expectedPhysicalState: "LinkUp", + expectedCount: 9, + }, + { + fileName: "testdata/ibstat.47.0.a100.all.active.0", + rate: 100, + expectedState: "Active", + expectedPhysicalState: "LinkUp", + expectedCount: 9, + }, + { + fileName: "testdata/ibstat.47.0.h100.all.active.0", rate: 400, expectedState: "Active", expectedPhysicalState: "LinkUp", expectedCount: 8, }, { - fileName: "testdata/ibstat.47.0.all.active.1", + fileName: "testdata/ibstat.47.0.h100.all.active.1", rate: 400, expectedState: "Active", expectedPhysicalState: "LinkUp", expectedCount: 8, }, { - fileName: "testdata/ibstat.47.0.some.down.0", + fileName: "testdata/ibstat.47.0.h100.some.down.0", rate: 400, expectedState: "Active", expectedPhysicalState: "LinkUp", expectedCount: 8, }, { - fileName: "testdata/ibstat.47.0.some.down.1", + fileName: "testdata/ibstat.47.0.h100.some.down.1", rate: 400, expectedState: "Active", expectedPhysicalState: "LinkUp", diff --git a/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.a100.all.active.0 
b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.a100.all.active.0 new file mode 100644 index 00000000..3339f0a5 --- /dev/null +++ b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.a100.all.active.0 @@ -0,0 +1,153 @@ +CA 'mlx5_0' + CA type: MT4126 + Number of ports: 1 + Firmware version: 22.35.2302 + Hardware version: 0 + Node GUID: 0x1e918efffe7a03a0 + System image GUID: 0xb83fd20300b6eb16 + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 0 + LMC: 0 + SM lid: 0 + Capability mask: 0x00010000 + Port GUID: 0x1c918efffe7a03a0 + Link layer: Ethernet +CA 'mlx5_1' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0x6b9d7f2c99902a63 + System image GUID: 0x946dae03009377da + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1369 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0x6b9d7f2c99902a63 + Link layer: InfiniBand +CA 'mlx5_2' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0xb6f5686b2a0384d3 + System image GUID: 0x946dae0300937c6a + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1372 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0xb6f5686b2a0384d3 + Link layer: InfiniBand +CA 'mlx5_3' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0xbcaf66ff1fb84de3 + System image GUID: 0x946dae03009377ba + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1374 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0xbcaf66ff1fb84de3 + Link layer: InfiniBand +CA 'mlx5_4' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0xff764023a4d54d1f + System image GUID: 0x946dae030093df72 + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1379 + LMC: 0 + SM lid: 83 + 
Capability mask: 0xa651ec48 + Port GUID: 0xff764023a4d54d1f + Link layer: InfiniBand +CA 'mlx5_5' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0x85b89152a49a6c7f + System image GUID: 0x946dae030093e39a + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1383 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0x85b89152a49a6c7f + Link layer: InfiniBand +CA 'mlx5_6' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0xf405e6a5caf90755 + System image GUID: 0x946dae0300937c8a + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1387 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0xf405e6a5caf90755 + Link layer: InfiniBand +CA 'mlx5_7' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0x8127c63ef9e8fc5f + System image GUID: 0x946dae0300937902 + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1392 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0x8127c63ef9e8fc5f + Link layer: InfiniBand +CA 'mlx5_8' + CA type: MT4126 + Number of ports: 1 + Firmware version: 28.37.1014 + Hardware version: 0 + Node GUID: 0x4c1037318e9cd20a + System image GUID: 0x946dae030093e0ba + Port 1: + State: Active + Physical state: LinkUp + Rate: 200 + Base lid: 1396 + LMC: 0 + SM lid: 83 + Capability mask: 0xa651ec48 + Port GUID: 0x4c1037318e9cd20a + Link layer: InfiniBand \ No newline at end of file diff --git a/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.all.active.0 b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.all.active.0 similarity index 100% rename from components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.all.active.0 rename to components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.all.active.0 diff --git 
a/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.all.active.1 b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.all.active.1 similarity index 100% rename from components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.all.active.1 rename to components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.all.active.1 diff --git a/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.some.down.0 b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.some.down.0 similarity index 100% rename from components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.some.down.0 rename to components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.some.down.0 diff --git a/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.some.down.1 b/components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.some.down.1 similarity index 100% rename from components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.some.down.1 rename to components/accelerator/nvidia/query/infiniband/testdata/ibstat.47.0.h100.some.down.1 diff --git a/go.mod b/go.mod index 696bdfcc..0d9fb6c5 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/leptonai/gpud -go 1.23.2 +go 1.23.3 require ( github.com/NVIDIA/go-nvlib v0.7.0