rwth-i6 · curufinwe · Oct 26, 2023 · Aug 18, 2023 · Aug 24, 2023 · Aug 24, 2023
diff --git a/returnn/training.py b/returnn/training.py
@@ -240,25 +240,34 @@ def _get_run_cmd(self):
         ]
 
         if self.horovod_num_processes:
-            # Normally, if the engine (e.g. SGE or Slurm) is configured correctly,
-            # it automatically provides the information on multiple nodes to mpirun,
-            # so it is not needed to explicitly pass on any hostnames here.
-            run_cmd = [
-                "mpirun",
-                "-np",
-                str(self.horovod_num_processes),
-                "-bind-to",
-                "none",
-                "-map-by",
-                "slot",
-                "-mca",
-                "pml",
-                "ob1",
-                "-mca",
-                "btl",
-                "^openib",
-                "--report-bindings",
-            ] + run_cmd
+            if self.returnn_config.get("backend", None) == "torch":
+                # use torchrun to launch DDP training when the backend is torch
+                nnodes = self.multi_node_slots if self.multi_node_slots else 1
+                run_cmd = [
+                    "torchrun",
+                    f"--nnodes={nnodes}",
+                    f"--nproc-per-node={self.horovod_num_processes}",
+                ] + run_cmd[1:]
+            else:
+                # Normally, if the engine (e.g. SGE or Slurm) is configured correctly,
+                # it automatically provides the information on multiple nodes to mpirun,
+                # so it is not needed to explicitly pass on any hostnames here.
+                run_cmd = [
+                    "mpirun",
+                    "-np",
+                    str(self.horovod_num_processes),
+                    "-bind-to",
+                    "none",
+                    "-map-by",
+                    "slot",
+                    "-mca",
+                    "pml",
+                    "ob1",
+                    "-mca",
+                    "btl",
+                    "^openib",
+                    "--report-bindings",
+                ] + run_cmd
 
         return run_cmd