interTwin-eu · jarlsondre · Jan 9, 2025 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/env-files/tensorflow/generic_tf.sh b/env-files/tensorflow/generic_tf.sh
@@ -1,5 +1,15 @@
 #!/bin/bash
 
+# --------------------------------------------------------------------------------------
+# Part of the interTwin Project: https://www.intertwin.eu/
+#
+# Created by: Matteo Bunino
+#
+# Credit:
+# - Jarl Sondre Sæther <[email protected]> - CERN
+# - Matteo Bunino <[email protected]> - CERN
+# --------------------------------------------------------------------------------------
+
 if [ -z "$ENV_NAME" ]; then
   ENV_NAME=".venv-tf"
 fi

diff --git a/env-files/torch/generic_torch.sh b/env-files/torch/generic_torch.sh
@@ -1,4 +1,15 @@
 #!/bin/bash
+
+# --------------------------------------------------------------------------------------
+# Part of the interTwin Project: https://www.intertwin.eu/
+#
+# Created by: Matteo Bunino
+#
+# Credit:
+# - Jarl Sondre Sæther <[email protected]> - CERN
+# - Matteo Bunino <[email protected]> - CERN
+# --------------------------------------------------------------------------------------
+
 if [ -z "$ENV_NAME" ]; then
   ENV_NAME=".venv-pytorch"
 fi

diff --git a/env-files/torch/install-horovod-deepspeed-cuda.sh b/env-files/torch/install-horovod-deepspeed-cuda.sh
@@ -1,5 +1,15 @@
 #!/bin/bash
 
+# --------------------------------------------------------------------------------------
+# Part of the interTwin Project: https://www.intertwin.eu/
+#
+# Created by: Jarl Sondre Sæther
+#
+# Credit:
+# - Jarl Sondre Sæther <[email protected]> - CERN
+# - Matteo Bunino <[email protected]> - CERN
+# --------------------------------------------------------------------------------------
+
 # DeepSpeed variables
 export DS_BUILD_CCL_COMM=1
 export DS_BUILD_UTILS=1

diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py
@@ -372,6 +372,7 @@ def exec_pipeline(
         print(json.dumps(parser.config, indent=2))
         print("#=" * 50)
         print()
+
     pipeline = parser.parse_pipeline(pipeline_nested_key=pipe_key)
     if steps:
         if not re.match(r"\d+(:\d+)?(:\d+)?", steps):

diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py
@@ -1177,7 +1177,7 @@ class EpochTimeTracker:
     """Tracker for epoch execution time during training."""
 
     def __init__(
-        self, strategy_name: str, save_path: Union[Path, str], num_nodes: int
+        self, strategy_name: str, save_path: Path | str, num_nodes: int
     ) -> None:
         if isinstance(save_path, str):
             save_path = Path(save_path)

diff --git a/src/itwinai/scalability.py b/src/itwinai/scalability.py
@@ -108,6 +108,8 @@ def create_absolute_plot(avg_epoch_time_df: pd.DataFrame) -> None:
     ax.grid(True)
 
     output_path = Path("plots/absolute_scalability_plot.png")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    plt.tight_layout()
     plt.savefig(output_path)
     print(f"Saving absolute plot to '{output_path.resolve()}'.")
     sns.reset_orig()

diff --git a/src/itwinai/slurm/slurm_config.yaml b/src/itwinai/slurm/slurm_config.yaml
@@ -3,8 +3,8 @@ account: intertwin
 dist_strat: horovod
 time: 00:11:11
 
-std_out: slurm_jobs/job.out
-err_out: slurm_jobs/err.out
+std_out: slurm_job_logs/job.out
+err_out: slurm_job_logs/err.out
 
 num_nodes: 1
 num_tasks_per_node: 1

diff --git a/src/itwinai/slurm/slurm_script_builder.py b/src/itwinai/slurm/slurm_script_builder.py
@@ -172,6 +172,8 @@ def get_debug_command(self) -> str:
         echo ""
         echo "### Other Variables ###"
         echo "Distributed Strategy: {self.distributed_strategy}"
+        echo "Current working directory: $(pwd)"
+        echo "Which python: $(which python)"
         """
         debug_print_command = debug_print_command.strip()
         return remove_indentation_from_multiline_string(debug_print_command)
@@ -201,10 +203,10 @@ def process_slurm_script(
             self.slurm_script_configuration.job_name = self.generate_identifier()
 
         if self.slurm_script_configuration.std_out is None:
-            std_out_path = Path("slurm_jobs") / (self.generate_identifier() + ".out")
+            std_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".out")
             self.slurm_script_configuration.std_out = std_out_path
         if self.slurm_script_configuration.err_out is None:
-            err_out_path = Path("slurm_jobs") / (self.generate_identifier() + ".err")
+            err_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".err")
             self.slurm_script_configuration.err_out = err_out_path
 
         # Making sure the std out and err out folders exist
@@ -218,9 +220,9 @@ def process_slurm_script(
         # Generate the script using the given configuration
         script = self.slurm_script_configuration.format_script()
         if not submit_slurm_job and not retain_file:
-            print("#" * 30)
+            print("#" * 20, "SLURM Script Preview", "#"*20)
             print(script)
-            print("#" * 30)
+            print("#" * 62)
             return
 
         if file_path is None:
@@ -258,8 +260,8 @@ def run_slurm_script_all_strategies(
 
             # Overriding job_name, std_out and err_out
             self.slurm_script_configuration.job_name = self.generate_identifier()
-            std_out_path = Path("slurm_jobs") / (self.generate_identifier() + ".out")
-            err_out_path = Path("slurm_jobs") / (self.generate_identifier() + ".err")
+            std_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".out")
+            err_out_path = Path("slurm_job_logs") / (self.generate_identifier() + ".err")
             self.slurm_script_configuration.std_out = std_out_path
             self.slurm_script_configuration.err_out = err_out_path
 

diff --git a/src/itwinai/slurm/utils.py b/src/itwinai/slurm/utils.py
@@ -7,6 +7,8 @@
 # - Jarl Sondre Sæther <[email protected]> - CERN
 # --------------------------------------------------------------------------------------
 
+from typing import List
+
 from itwinai.parser import ArgumentParser
 
 
@@ -18,6 +20,31 @@ def remove_indentation_from_multiline_string(multiline_string: str) -> str:
     return "\n".join([line.lstrip() for line in multiline_string.split("\n")])
 
 
+def scalability_nodes_list(value: str | List[int]) -> List[int]:
+    """Parse and validate the node counts for the scalability test.
+
+    Args:
+        value: Comma-separated integers (e.g. "1,2,4,8") or a list of integers.
+
+    Returns:
+        The list of integers that was parsed; a valid list is returned as-is.
+
+    Raises:
+        ValueError: If the value cannot be parsed as comma-separated integers.
+    """
+    if isinstance(value, list):
+        if not all(isinstance(x, int) for x in value):
+            raise ValueError(f"Provided list, '{value}', contains non-integer values.")
+        return value
+
+    try:
+        return [int(n) for n in value.split(",")]
+    except (AttributeError, ValueError) as err:
+        # AttributeError: value is neither a string nor a list (e.g. a bare int).
+        message = f"Invalid input: '{value}', must be formatted as comma-separated integers."
+        raise ValueError(message) from err
+
+
 def get_slurm_job_parser() -> ArgumentParser:
     # Default arguments for the SLURM script configuration
     default_account = "intertwin"
@@ -38,16 +65,11 @@ def get_slurm_job_parser() -> ArgumentParser:
     default_pipe_key = "rnn_training_pipeline"
     default_training_command = None
     default_python_venv = ".venv"
+    default_scalability_nodes = "1,2,4,8"
 
     parser = ArgumentParser(parser_mode="omegaconf")
 
     # Arguments specific to the SLURM script configuration
-    parser.add_argument(
-        "--job_name",
-        type=str,
-        default=default_job_name,
-        help="The name of the SLURM job",
-    )
     parser.add_argument(
         "--job-name",
         type=str,
@@ -142,6 +164,12 @@ def get_slurm_job_parser() -> ArgumentParser:
         default=default_python_venv,
         help="Which python venv to use for running the command.",
     )
+    parser.add_argument(
+        "--scalability-nodes",
+        type=scalability_nodes_list,
+        default=default_scalability_nodes,
+        help="A comma-separated list of node numbers to use for the scalability test.",
+    )
 
     # Boolean arguments where you only need to include the flag and not an actual value
     parser.add_argument(

diff --git a/src/itwinai/torch/monitoring/plotting.py b/src/itwinai/torch/monitoring/plotting.py
@@ -107,6 +107,7 @@ def gpu_bar_plot(
         raise ValueError(
             f"DataFrame is missing the following columns: {missing_columns}"
         )
+
     sns.set_theme()
 
     strategies = data_df["strategy"].unique()
@@ -138,9 +139,9 @@ def gpu_bar_plot(
     ax.set_xticklabels(unique_gpu_counts)
     ax.legend(title="Strategy")
 
-    figure_width = int(1.5 * len(unique_gpu_counts))
-    fig.set_figheight(6)
+    figure_width = max(int(2 * len(unique_gpu_counts)), 8)
     fig.set_figwidth(figure_width)
+    fig.set_figheight(figure_width * 0.8)
 
     sns.reset_orig()
 

diff --git a/src/itwinai/torch/profiling/communication_plot.py b/src/itwinai/torch/profiling/communication_plot.py
@@ -16,8 +16,6 @@
 import seaborn as sns
 from matplotlib.patches import Patch
 
-# from itwinai.scalability import convert_matching_files_to_dataframe
-
 # Doing this because otherwise I get an error about X11 Forwarding which I believe
 # is due to the server trying to pass the image to the client computer
 matplotlib.use("Agg")
@@ -40,9 +38,15 @@ def calculate_comp_and_comm_time(df: pd.DataFrame) -> Tuple[float, float]:
             f"\nMissing columns: {missing_columns}"
         )
 
-    nccl_comm_pattern = (
-        r"ncclKernel_(?:AllReduce|Broadcast|Reduce|AllGather|ReduceScatter|SendRecv)"
-    )
+    comm_types = [
+        "AllReduce",
+        "Broadcast",
+        "Reduce",
+        "AllGather",
+        "Gather",
+        "ReduceScatter",
+    ]
+    nccl_comm_pattern = rf"(?:{'|'.join(comm_types)})"
     cuda_stream_pattern = r"cudaStream(?:WaitEvent|Synchronize)"
 
     # Any operation that is a part of PyTorch's ATen library is considered a computation
@@ -133,10 +137,11 @@ def communication_overhead_stacked_bar_plot(
     ax.legend(handles=ax.get_legend_handles_labels()[0] + [hatch_patch])
 
     # Dynamically adjusting the width of the figure
-    figure_width = int(1.5 * len(gpu_numbers))
-    fig.set_figheight(5)
+    figure_width = max(int(2 * len(gpu_numbers)), 8)
     fig.set_figwidth(figure_width)
+    fig.set_figheight(figure_width * 0.8)
 
+    # Resetting so that seaborn's theme doesn't affect other plots
     sns.reset_orig()
 
     return fig, ax

diff --git a/src/itwinai/torch/profiling/profiler.py b/src/itwinai/torch/profiling/profiler.py
@@ -89,13 +89,15 @@ def profiled_method(self: TorchTrainer, *args, **kwargs) -> Any:
             warmup_epochs=self.profiling_warmup_epochs,
         )
         with profile(
-            activities=[ProfilerActivity.CUDA],
+            activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
             schedule=schedule(
                 wait=wait_epochs,
                 warmup=warmup_epochs,
                 active=active_epochs,
             ),
+            with_modules=True
         ) as profiler:
+            self.profiler = profiler
             result = method(self, *args, **kwargs)
 
         strategy = self.strategy

diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py
@@ -422,7 +422,8 @@ def set_epoch(self, epoch: int) -> None:
         Args:
             epoch (int): epoch number, from 0 to ``epochs-1``.
         """
-        if self.profiler is not None:
+        if self.profiler is not None and epoch > 0:
+            # We don't want to start stepping until after the first epoch
             self.profiler.step()
         self._set_epoch_dataloaders(epoch)