Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3dgan integration #98

Merged
merged 59 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
443b39e
commiting integration of 3dgan scripts
Oct 24, 2023
0f50aaf
ADD: Download dataset
matbun Oct 24, 2023
41d2b66
FIX: DDP distributed training with manual optimization
matbun Oct 25, 2023
ddfa59d
ADD: log with MLFlow
matbun Oct 25, 2023
e89a433
Sqaaas code (#88)
matbun Oct 25, 2023
adc6c91
Sqaaas code (#89)
matbun Oct 27, 2023
291b4f3
ADD: draft predictor and saver
matbun Nov 6, 2023
7da9ba4
ADD: stub for inference pipeline
matbun Nov 6, 2023
c73fb08
ADD: small docs
matbun Nov 6, 2023
1866a81
UPDATE: inference pipeline components
matbun Nov 7, 2023
22aed46
UPDATE: reorg
matbun Nov 7, 2023
0242790
ADD: image generation for inference
matbun Nov 7, 2023
17915b1
update tag
matbun Nov 7, 2023
c3ff733
ADD: threshold
matbun Nov 7, 2023
0a0f56e
ADD: draft inference
matbun Nov 7, 2023
95661c1
ADD: draft inference wf
matbun Nov 7, 2023
94254cf
ADD: working inference workflow
matbun Nov 8, 2023
63a7aa0
ADD: 3D scatter plots
matbun Nov 8, 2023
61c3666
ADD: Dockerfile + refactor
matbun Nov 8, 2023
d690192
ADD: .dockerignore
matbun Nov 8, 2023
a2a9875
Update .dockerignore
matbun Nov 8, 2023
3bcc410
REMOVE: keras dependency
matbun Nov 8, 2023
be0c115
ADD: skip download option
matbun Nov 9, 2023
77d939e
ADD: cern pipeline.yaml
matbun Nov 9, 2023
b603b05
UPDATE: dataset loading function
matbun Nov 9, 2023
bee1317
UPDATE: dataset loading function
matbun Nov 9, 2023
b6c3ee2
UPDATE conf
matbun Nov 9, 2023
466b150
UPDATE refactor
matbun Nov 9, 2023
3e1d6ab
UPDATE refactor
matbun Nov 9, 2023
a814e65
Merge branch 'dev' into 3dgan_integration
matbun Nov 9, 2023
ca60e19
UPDATE training docs
matbun Nov 9, 2023
307ed65
Update readme
matbun Nov 9, 2023
f47f40d
update README
matbun Nov 9, 2023
fc0697e
FIX typo
matbun Nov 9, 2023
a8c9d6d
Update README
matbun Nov 9, 2023
3faa062
Update mkdir
matbun Nov 9, 2023
0935e83
Merge branch 'dev' into 3dgan_integration
matbun Nov 10, 2023
b50c610
UPDATE data paths
matbun Nov 15, 2023
2cedfe7
UPDATE Dockerfile
matbun Nov 16, 2023
1efba3f
UPDATE Dockerfiles
matbun Nov 16, 2023
60ab87d
UPDATE for Singularity execution
matbun Nov 16, 2023
881ae47
FIX version mismatch
matbun Nov 16, 2023
9ab6ec1
UPDATE Singularity docs
matbun Nov 16, 2023
59fd74b
Named steps pipe (#100)
matbun Nov 23, 2023
8f13d92
UPDATE Singularity exec command
matbun Nov 23, 2023
8e19c62
UPDATE: Image version
matbun Nov 23, 2023
d3a2630
UPDATE: load components from pipeline
matbun Nov 23, 2023
33de0b4
ADD: docs
matbun Nov 23, 2023
f2ccfae
Simplify 3DGAN model config
matbun Nov 23, 2023
1af8ba7
ADD: mlflow autologging support for PL trainer
matbun Nov 23, 2023
acf7782
UPDATE container info
matbun Nov 24, 2023
656ab67
Refactor
matbun Dec 1, 2023
b176abf
UPDATE dependencies
matbun Dec 1, 2023
087c7ec
FIX linter problem
matbun Dec 1, 2023
8d9f51f
Simplified workflow configuration (#108)
matbun Dec 13, 2023
dd2c5ea
Simplified workflow configuration (#109)
matbun Dec 13, 2023
debc6a4
ADD integration tests
matbun Dec 13, 2023
9e8eafe
FIX test
matbun Dec 13, 2023
c9b1c17
FIX 3dgan inference test
matbun Dec 13, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"submitit>=1.4.6",
"typing-extensions==4.5.0",
"typing_extensions==4.5.0",
"urllib3>=2.0.5",
"urllib3>=1.26.18",
matbun marked this conversation as resolved.
Show resolved Hide resolved
]

# dynamic = ["version", "description"]
Expand Down
72 changes: 66 additions & 6 deletions src/itwinai/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
from typing import Iterable, Dict, Any, Optional, Tuple, Union
from abc import ABCMeta, abstractmethod
import time
from jsonargparse import ArgumentParser

# import logging
# from logging import Logger as PythonLogger

from .cluster import ClusterEnvironment
from .types import ModelML, DatasetML
from .serialization import ModelLoader
from .utils import load_yaml


class Executable(metaclass=ABCMeta):
Expand Down Expand Up @@ -231,12 +234,12 @@ def save(self, *args, **kwargs):
class Executor(Executable):
"""Sets-up and executes a sequence of Executable steps."""

steps: Iterable[Executable]
steps: Union[Dict[str, Executable], Iterable[Executable]]
constructor_args: Dict

def __init__(
self,
steps: Iterable[Executable],
steps: Union[Dict[str, Executable], Iterable[Executable]],
name: Optional[str] = None,
# logs_dir: Optional[str] = None,
# debug: bool = False,
Expand All @@ -247,9 +250,20 @@ def __init__(
self.steps = steps
self.constructor_args = kwargs

def __getitem__(self, subscript) -> Executor:
def __getitem__(self, subscript: Union[str, int, slice]) -> Executor:
if isinstance(subscript, slice):
s = self.steps[subscript.start:subscript.stop: subscript.step]
# First, convert to list if is a dict
if isinstance(self.steps, dict):
steps = list(self.steps.items())
else:
steps = self.steps
# Second, perform slicing
s = steps[subscript.start:subscript.stop: subscript.step]
# Third, reconstruct dict, if it is a dict
if isinstance(self.steps, dict):
s = dict(s)
# Fourth, return sliced sub-pipeline, preserving its
# initial structure
sliced = self.__class__(
steps=s,
**self.constructor_args
Expand All @@ -270,7 +284,12 @@ def setup(self, parent: Optional[Executor] = None) -> None:
Defaults to None.
"""
super().setup(parent)
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
step.setup(self)
step.is_setup = True

Expand Down Expand Up @@ -303,7 +322,12 @@ def execute(
Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as
(results, config).
"""
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
if not step.is_setup:
raise RuntimeError(
f"Step '{step.name}' was not setup!"
Expand All @@ -318,3 +342,39 @@ def _pack_args(self, args) -> Tuple:
if not isinstance(args, tuple):
args = (args,)
return args


def recursive_replace(config: Dict, target_field: str, new_value: Any) -> None:
    """Recursively replace, in place, the value of every occurrence of
    ``target_field`` found anywhere inside a (possibly nested) config.

    Traverses nested dictionaries and lists. The original version
    returned as soon as one key matched, which skipped the remaining
    sibling keys of that dict and never descended into lists; this
    version visits every node.

    Args:
        config (Dict): configuration tree to mutate in place.
        target_field (str): dictionary key whose value must be replaced.
        new_value (Any): replacement value for every matching key.
    """
    def _replace(node: Any) -> None:
        # Dicts and lists are the only containers a YAML-loaded config
        # can contain; every other node type is a leaf.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == target_field:
                    node[key] = new_value
                else:
                    _replace(value)
        elif isinstance(node, list):
            for item in node:
                _replace(item)

    _replace(config)


def load_pipeline_step(
    pipe: Union[str, Dict],
    step_id: Union[str, int],
    override_keys: Optional[Dict[str, Any]] = None
) -> Executable:
    """Instantiate a single step of a serialized pipeline.

    Args:
        pipe (Union[str, Dict]): pipeline configuration as an in-memory
            dict, or the path to a YAML file containing it.
        step_id (Union[str, int]): key (named steps) or index (list of
            steps) identifying the step inside the pipeline config.
        override_keys (Optional[Dict[str, Any]]): mapping of field name
            to new value, applied recursively to the step config before
            instantiation. Defaults to None.

    Returns:
        Executable: the instantiated pipeline step.
    """
    # Accept either an already-parsed config or a YAML file path.
    pipe_config = load_yaml(pipe) if isinstance(pipe, str) else pipe
    step_config = pipe_config['executor']['init_args']['steps'][step_id]

    # Apply user-requested field overrides, if any.
    for field_name, field_value in (override_keys or {}).items():
        recursive_replace(step_config, field_name, field_value)

    # Let jsonargparse validate the config and build the step object;
    # the config must be nested under a "step" key for the parser.
    parser = ArgumentParser()
    parser.add_subclass_arguments(Executable, "step")
    namespace = parser.parse_object(dict(step=step_config))
    return parser.instantiate_classes(namespace)["step"]
35 changes: 35 additions & 0 deletions use-cases/3dgan/Dockerfile.vega
matbun marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Container image for the 3DGAN use case: CPU-only PyTorch + Lightning,
# itwinai, and the use-case scripts. Default command runs inference.
FROM python:3.9.12

WORKDIR /usr/src/app

RUN pip install --upgrade pip

# Install pytorch (cpuonly)
# Ref:https://pytorch.org/get-started/previous-versions/#linux-and-windows-5
RUN pip install --no-cache-dir torch==1.13.1+cpu torchvision==0.14.1+cpu torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir lightning

# Add 3DGAN custom requirements
COPY use-cases/3dgan/requirements.txt /usr/src/app/
RUN pip install --no-cache-dir -r requirements.txt

# Install itwinai and dependencies (pyproject.toml + src/ copied into the
# build context root so the package can be pip-installed from /usr/src/app)
COPY pyproject.toml /usr/src/app/
COPY src /usr/src/app/
RUN pip install --no-cache-dir /usr/src/app

# Add 3DGAN use case files
COPY use-cases/3dgan/* /usr/src/app/

# # Create results folder
# RUN mkdir -p /tmp/data
# RUN chmod 0777 -R /tmp/data

# Create results folders, world-writable so that runtimes mapping a
# non-root user (e.g. Singularity) can still write outputs there.
# TODO: remove once the problem with file system permissions is solved
RUN mkdir -p /tmp/data/3dgan-generated-data
RUN mkdir -p /tmp/data/exp_data/3dgan_data
RUN chmod 0777 -R /tmp/data

# Run inference
CMD [ "python", "train.py", "-p", "inference-pipeline.yaml"]
39 changes: 36 additions & 3 deletions use-cases/3dgan/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,42 @@
# 3DGAN use case

First of all, from the repository root, create a torch environment:

```bash
make torch-gpu
```

Now, install custom requirements for 3DGAN:

```bash
micromamba activate ./.venv-pytorch
cd use-cases/3dgan
pip install -r requirements.txt
```

**NOTE**: the Python commands below are assumed to be executed from within the
micromamba virtual environment.

## Training

At CERN, use the dedicated configuration file:

```bash
cd use-cases/3dgan
python train.py -p cern-pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p cern-pipeline.yaml
```

Anywhere else, use the general purpose training configuration:

```bash
cd use-cases/3dgan
python train.py -p pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p pipeline.yaml
```

To visualize the logs with MLFlow, run the following in the terminal:
Expand Down Expand Up @@ -88,8 +111,8 @@ Build from project root with
docker buildx build -t itwinai-mnist-torch-inference -f use-cases/3dgan/Dockerfile .

# Ghcr.io
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1 -f use-cases/3dgan/Dockerfile .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2 -f use-cases/3dgan/Dockerfile .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2
```

From wherever a sample of MNIST jpg images is available
Expand All @@ -106,7 +129,7 @@ From wherever a sample of MNIST jpg images is available
```

```bash
docker run -it --rm --name running-inference -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker run -it --rm --name running-inference -v "$PWD":/tmp/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2
```

This command will store the results in a folder called "3dgan-generated-data":
Expand All @@ -120,3 +143,13 @@ This command will store the results in a folder called "3dgan-generated-data":
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.pth
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.jpg
```

### Singularity

Run overriding the working directory (`--pwd /usr/src/app`, restores Docker's WORKDIR)
and providing a writable filesystem (`-B "$PWD":/usr/data`):

```bash
singularity exec -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2 \
bash -c "cd /usr/src/app && python train.py -p pipeline.yaml"
```
4 changes: 2 additions & 2 deletions use-cases/3dgan/inference-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ executor:
class_path: lightning.pytorch.loggers.MLFlowLogger
init_args:
experiment_name: 3DGAN
save_dir: ml_logs/mlflow_logs
save_dir: /usr/data/ml_logs/mlflow_logs
log_model: all
max_epochs: 1
max_steps: 20
Expand All @@ -88,7 +88,7 @@ executor:
loss_weights: [3, 0.1, 25, 0.1]
power: 0.85
lr: 0.001
checkpoint_path: exp_data/3dgan.pth
checkpoint_path: /usr/data/exp_data/3dgan.pth

# Lightning data module configuration
data:
Expand Down
3 changes: 1 addition & 2 deletions use-cases/3dgan/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,7 @@ def __init__(
self.test_history = defaultdict(list)
self.pklfile = checkpoint_path
checkpoint_dir = os.path.dirname(checkpoint_path)
if not os.path.exists(checkpoint_dir):
os.makedirs(checkpoint_dir)
os.makedirs(checkpoint_dir, exist_ok=True)

def BitFlip(self, x, prob=0.05):
"""
Expand Down
8 changes: 5 additions & 3 deletions use-cases/mnist/torch/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ executor:
class_path: itwinai.components.Executor
init_args:
steps:
- class_path: dataloader.MNISTDataModuleTorch
dataloading_step:
class_path: dataloader.MNISTDataModuleTorch
init_args:
save_path: .tmp/

- class_path: itwinai.torch.trainer.TorchTrainerMG
training_step:
class_path: itwinai.torch.trainer.TorchTrainerMG
init_args:
model:
class_path: model.Net
Expand All @@ -25,7 +27,7 @@ executor:
batch_size: 32
pin_memory: True
shuffle: False
epochs: 30
epochs: 2
train_metrics:
accuracy:
class_path: torchmetrics.classification.MulticlassAccuracy
Expand Down