Add NeMoLauncherKubernetesCommandGenStrategy

NVIDIA · Sep 30, 2024 · 0f9a393 · 0f9a393
1 parent 9fdb8fa
commit 0f9a393
Show file tree

Hide file tree

Showing 4 changed files with 246 additions and 0 deletions.
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -68,6 +68,7 @@
 from .schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
 from .schema.test_template.nccl_test.template import NcclTest
 from .schema.test_template.nemo_launcher.grading_strategy import NeMoLauncherGradingStrategy
+from .schema.test_template.nemo_launcher.kubernetes_command_gen_strategy import NeMoLauncherKubernetesCommandGenStrategy
 from .schema.test_template.nemo_launcher.kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy
 from .schema.test_template.nemo_launcher.report_generation_strategy import NeMoLauncherReportGenerationStrategy
 from .schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
@@ -128,6 +129,9 @@
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
 Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
+Registry().add_strategy(
+    CommandGenStrategy, [KubernetesSystem], [NeMoLauncher], NeMoLauncherKubernetesCommandGenStrategy
+)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)

diff --git a/src/cloudai/schema/test_template/nemo_launcher/kubernetes_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/kubernetes_command_gen_strategy.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List
+
+from cloudai import CommandGenStrategy
+
+from .kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy
+
+
+class NeMoLauncherKubernetesCommandGenStrategy(CommandGenStrategy):
+    """Command generation strategy for NeMo Megatron Launcher on Kubernetes systems."""
+
+    def gen_exec_command(
+        self,
+        cmd_args: Dict[str, str],
+        extra_env_vars: Dict[str, str],
+        extra_cmd_args: str,
+        output_path: Path,
+        num_nodes: int,
+        nodes: List[str],
+    ) -> str:
+        final_env_vars = {**self.system.global_env_vars, **extra_env_vars}
+
+        launcher_path = (
+            self.system.install_path
+            / NeMoLauncherKubernetesInstallStrategy.SUBDIR_PATH
+            / NeMoLauncherKubernetesInstallStrategy.REPOSITORY_NAME
+        )
+
+        self.final_cmd_args = {**self.default_cmd_args, **cmd_args}
+        self.final_cmd_args["launcher_scripts_path"] = str(launcher_path / "launcher_scripts")
+
+        self.final_cmd_args.update({f"env_vars.{key}": value for key, value in final_env_vars.items()})
+
+        self.final_cmd_args["training"] = self.final_cmd_args.pop("training.values", None)
+
+        for key in ["repository_url", "repository_commit_hash", "docker_image_url"]:
+            self.final_cmd_args.pop(key, None)
+
+        if self.final_cmd_args.get("data_dir") == "DATA_DIR":
+            raise ValueError(
+                "The 'data_dir' field of the NeMo launcher test contains the placeholder 'DATA_DIR'. "
+                "Please update the test schema TOML file with a valid path to the dataset."
+            )
+
+        cmd_args_str = self._generate_cmd_args_str(self.final_cmd_args)
+
+        full_cmd = f"python {launcher_path}/launcher_scripts/main.py {cmd_args_str}"
+
+        if extra_cmd_args:
+            full_cmd += f" {extra_cmd_args}"
+
+        env_vars_str = " ".join(f"{key}={value}" for key, value in final_env_vars.items())
+        return f"{env_vars_str} {full_cmd}".strip() if env_vars_str else full_cmd.strip()
+
+    def _generate_cmd_args_str(self, args: Dict[str, str]) -> str:
+        """
+        Generate a string of command-line arguments.
+
+        Args:
+            args (Dict[str, str]): The command-line arguments.
+
+        Returns:
+            str: A string of command-line arguments.
+        """
+        cmd_arg_str_parts = []
+        env_var_str_parts = []
+
+        for key, value in args.items():
+            if key.startswith("env_vars."):
+                if isinstance(value, str) and "," in value:
+                    value = f"\\'{value}\\'"
+                env_var_str_parts.append(f"+{key}={value}")
+            else:
+                cmd_arg_str_parts.append(f"{key}={value}")
+
+        return " ".join(cmd_arg_str_parts + env_var_str_parts)
diff --git a/src/cloudai/test_definitions/nemo_launcher.py b/src/cloudai/test_definitions/nemo_launcher.py
@@ -28,11 +28,34 @@ class NumaMapping(BaseModel):
     enable: bool = True
 
 
+class PersistentVolumeClaim(BaseModel):
+    """Persistent volume claim configuration."""
+
+    model_config = ConfigDict(extra="forbid")
+    claim_name: str = "nemo-workspace"
+
+
+class ClusterWorkspace(BaseModel):
+    """Cluster workspace configuration."""
+
+    model_config = ConfigDict(extra="forbid")
+    persistent_volume_claim: PersistentVolumeClaim = Field(default_factory=PersistentVolumeClaim)
+    mount_path: str = "/nemo-workspace"
+
+
+class ClusterVolume(BaseModel):
+    """Cluster volume configuration."""
+
+    model_config = ConfigDict(extra="forbid")
+    workspace: ClusterWorkspace = Field(default_factory=ClusterWorkspace)
+
+
 class Cluster(BaseModel):
     """Cluster configuration."""
 
     model_config = ConfigDict(extra="forbid")
     gpus_per_node: int = 8
+    volumes: ClusterVolume = Field(default_factory=ClusterVolume)
 
 
 class ExpManager(BaseModel):
@@ -50,6 +73,8 @@ class Trainer(BaseModel):
     val_check_interval: int = 100
     log_every_n_steps: Literal["1", "2"] = "1"
     enable_checkpointing: bool = False
+    num_nodes: int = 3
+    devices: int = 8
 
 
 class TrainingModelData(BaseModel):

diff --git a/tests/test_kubernetes_command_gen_strategy.py b/tests/test_kubernetes_command_gen_strategy.py
@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import pytest
+from cloudai.schema.test_template.nemo_launcher.kubernetes_command_gen_strategy import (
+    NeMoLauncherKubernetesCommandGenStrategy,
+)
+from cloudai.systems.kubernetes import KubernetesSystem
+
+
+@pytest.fixture
+def kube_config_tempfile():
+    kube_config_content = """
+    apiVersion: v1
+    kind: Config
+    clusters:
+    - cluster:
+        server: https://127.0.0.1:6443
+      name: local-cluster
+    contexts:
+    - context:
+        cluster: local-cluster
+        user: local-user
+      name: local-context
+    current-context: local-context
+    users:
+    - name: local-user
+      user:
+        token: fake-token
+    """
+
+    home_dir = Path.home()
+    kube_config_dir = home_dir / ".kube"
+    kube_config_path = kube_config_dir / "config"
+
+    kube_config_dir.mkdir(parents=True, exist_ok=True)
+
+    with kube_config_path.open("w") as config_file:
+        config_file.write(kube_config_content)
+
+    yield kube_config_path
+
+
+@pytest.fixture
+def k8s_system(kube_config_tempfile):
+    system_data = {
+        "name": "test-system",
+        "install_path": "/fake/install/path",
+        "output_path": "/fake/output/path",
+        "kube_config_path": kube_config_tempfile,
+        "default_namespace": "default",
+        "default_image": "test-image",
+        "scheduler": "kubernetes",
+        "global_env_vars": {},
+        "monitor_interval": 1,
+    }
+
+    k8s_system = KubernetesSystem(**system_data)
+    k8s_system.model_post_init(None)
+
+    validated_system = KubernetesSystem.model_validate(system_data)
+
+    yield validated_system
+
+
+class TestNeMoLauncherKubernetesCommandGenStrategy__GenExecCommand:
+    @pytest.fixture
+    def nemo_cmd_gen(self, k8s_system: KubernetesSystem) -> NeMoLauncherKubernetesCommandGenStrategy:
+        cmd_args = {"test_arg": "test_value"}
+        strategy = NeMoLauncherKubernetesCommandGenStrategy(k8s_system, cmd_args)
+        strategy.system = k8s_system
+        strategy.default_cmd_args = {
+            "repository_url": "mock_repo_url",
+            "repository_commit_hash": "mock_commit_hash",
+            "docker_image_url": "mock_docker_image",
+            "data_dir": "mock_data_dir",
+            "training.values": "mock_training_values",
+            "training.model.data.data_prefix": "\\mock_prefix",
+        }
+        return strategy
+
+    def test_print_custom_gen_exec_command(self, nemo_cmd_gen: NeMoLauncherKubernetesCommandGenStrategy):
+        extra_env_vars = {}
+        cmd_args = {
+            "launcher_scripts_path": "/fake/install/path/NeMoLauncherKubernetesInstallStrategy/launcher_scripts",
+            "data_dir": "/nemo-workspace/pile",
+            "cluster": "k8s_v2",
+            "cluster.volumes.workspace.persistent_volume_claim.claim_name": "nemo-workspace",
+            "cluster.volumes.workspace.mount_path": "/nemo-workspace",
+            "stages": "[training]",
+            "training": "gpt3/1b_improved",
+            "training.exp_manager.explicit_log_dir": "/nemo-workspace/gpt3/1b_improved/training_gpt3/1b_improved/results",
+            "training.model.data.data_prefix": "[0.5,/nemo-workspace/pile/my-gpt3_00_text_document,0.5,/nemo-workspace/pile/my-gpt3_01_text_document]",
+            "training.trainer.num_nodes": "3",
+            "training.trainer.devices": "8",
+            "training.trainer.max_steps": "1000",
+            "training.trainer.val_check_interval": "100",
+            "training.model.global_batch_size": "48",
+        }
+
+        cmd = nemo_cmd_gen.gen_exec_command(
+            cmd_args=cmd_args,
+            extra_env_vars=extra_env_vars,
+            extra_cmd_args="",
+            output_path=Path("/fake/output/path"),
+            num_nodes=3,
+            nodes=["node1", "node2", "node3"],
+        )
+
+        print(f"Generated Command: {cmd}")