
3dgan integration #98

Merged
59 commits merged on Dec 13, 2023
Changes shown from 54 commits
443b39e
commiting integration of 3dgan scripts
Oct 24, 2023
0f50aaf
ADD: Download dataset
matbun Oct 24, 2023
41d2b66
FIX: DDP distributed training with manual optimization
matbun Oct 25, 2023
ddfa59d
ADD: log with MLFlow
matbun Oct 25, 2023
e89a433
Sqaaas code (#88)
matbun Oct 25, 2023
adc6c91
Sqaaas code (#89)
matbun Oct 27, 2023
291b4f3
ADD: draft predictor and saver
matbun Nov 6, 2023
7da9ba4
ADD: stub for inference pipeline
matbun Nov 6, 2023
c73fb08
ADD: small docs
matbun Nov 6, 2023
1866a81
UPDATE: inference pipeline components
matbun Nov 7, 2023
22aed46
UPDATE: reorg
matbun Nov 7, 2023
0242790
ADD: image generation for inference
matbun Nov 7, 2023
17915b1
update tag
matbun Nov 7, 2023
c3ff733
ADD: threshold
matbun Nov 7, 2023
0a0f56e
ADD: draft inference
matbun Nov 7, 2023
95661c1
ADD: draft inference wf
matbun Nov 7, 2023
94254cf
ADD: working inference workflow
matbun Nov 8, 2023
63a7aa0
ADD: 3D scatter plots
matbun Nov 8, 2023
61c3666
ADD: Dockerfile + refactor
matbun Nov 8, 2023
d690192
ADD: .dockerignore
matbun Nov 8, 2023
a2a9875
Update .dockerignore
matbun Nov 8, 2023
3bcc410
REMOVE: keras dependency
matbun Nov 8, 2023
be0c115
ADD: skip download option
matbun Nov 9, 2023
77d939e
ADD: cern pipeline.yaml
matbun Nov 9, 2023
b603b05
UPDATE: dataset loading function
matbun Nov 9, 2023
bee1317
UPDATE: dataset loading function
matbun Nov 9, 2023
b6c3ee2
UPDATE conf
matbun Nov 9, 2023
466b150
UPDATE refactor
matbun Nov 9, 2023
3e1d6ab
UPDATE refactor
matbun Nov 9, 2023
a814e65
Merge branch 'dev' into 3dgan_integration
matbun Nov 9, 2023
ca60e19
UPDATE training docs
matbun Nov 9, 2023
307ed65
Update readme
matbun Nov 9, 2023
f47f40d
update README
matbun Nov 9, 2023
fc0697e
FIX typo
matbun Nov 9, 2023
a8c9d6d
Update README
matbun Nov 9, 2023
3faa062
Update mkdir
matbun Nov 9, 2023
0935e83
Merge branch 'dev' into 3dgan_integration
matbun Nov 10, 2023
b50c610
UPDATE data paths
matbun Nov 15, 2023
2cedfe7
UPDATE Dockerfile
matbun Nov 16, 2023
1efba3f
UPDATE Dockerfiles
matbun Nov 16, 2023
60ab87d
UPDATE for Singularity execution
matbun Nov 16, 2023
881ae47
FIX version mismatch
matbun Nov 16, 2023
9ab6ec1
UPDATE Singularity docs
matbun Nov 16, 2023
59fd74b
Named steps pipe (#100)
matbun Nov 23, 2023
8f13d92
UPDATE Singularity exec command
matbun Nov 23, 2023
8e19c62
UPDATE: Image version
matbun Nov 23, 2023
d3a2630
UPDATE: load components from pipeline
matbun Nov 23, 2023
33de0b4
ADD: docs
matbun Nov 23, 2023
f2ccfae
Simplify 3DGAN model config
matbun Nov 23, 2023
1af8ba7
ADD: mlflow autologging support for PL trainer
matbun Nov 23, 2023
acf7782
UPDATE container info
matbun Nov 24, 2023
656ab67
Refactor
matbun Dec 1, 2023
b176abf
UPDATE dependencies
matbun Dec 1, 2023
087c7ec
FIX linter problem
matbun Dec 1, 2023
8d9f51f
Simplified workflow configuration (#108)
matbun Dec 13, 2023
dd2c5ea
Simplified workflow configuration (#109)
matbun Dec 13, 2023
debc6a4
ADD integration tests
matbun Dec 13, 2023
9e8eafe
FIX test
matbun Dec 13, 2023
c9b1c17
FIX 3dgan inference test
matbun Dec 13, 2023
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -34,7 +34,9 @@ dependencies = [
"submitit>=1.4.6",
"typing-extensions==4.5.0",
"typing_extensions==4.5.0",
"urllib3>=2.0.5",
"urllib3>=1.26.18",
"lightning>=2.0.0",
"torchmetrics>=1.2.0",
]

# dynamic = ["version", "description"]
110 changes: 104 additions & 6 deletions src/itwinai/components.py
@@ -2,12 +2,15 @@
from typing import Iterable, Dict, Any, Optional, Tuple, Union
from abc import ABCMeta, abstractmethod
import time
from jsonargparse import ArgumentParser

# import logging
# from logging import Logger as PythonLogger

from .cluster import ClusterEnvironment
from .types import ModelML, DatasetML
from .serialization import ModelLoader
from .utils import load_yaml


class Executable(metaclass=ABCMeta):
@@ -231,12 +234,12 @@ def save(self, *args, **kwargs):
class Executor(Executable):
"""Sets-up and executes a sequence of Executable steps."""

steps: Iterable[Executable]
steps: Union[Dict[str, Executable], Iterable[Executable]]
constructor_args: Dict

def __init__(
self,
steps: Iterable[Executable],
steps: Union[Dict[str, Executable], Iterable[Executable]],
name: Optional[str] = None,
# logs_dir: Optional[str] = None,
# debug: bool = False,
@@ -247,9 +250,20 @@ def __init__(
self.steps = steps
self.constructor_args = kwargs

def __getitem__(self, subscript) -> Executor:
def __getitem__(self, subscript: Union[str, int, slice]) -> Executor:
if isinstance(subscript, slice):
s = self.steps[subscript.start:subscript.stop: subscript.step]
# First, convert to list if is a dict
if isinstance(self.steps, dict):
steps = list(self.steps.items())
else:
steps = self.steps
# Second, perform slicing
s = steps[subscript.start:subscript.stop: subscript.step]
# Third, reconstruct dict, if it is a dict
if isinstance(self.steps, dict):
s = dict(s)
# Fourth, return sliced sub-pipeline, preserving its
# initial structure
sliced = self.__class__(
steps=s,
**self.constructor_args
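The dict-aware slicing added to `__getitem__` above can be sketched in isolation — convert the ordered dict of steps to a list of pairs, slice, and rebuild the dict (hypothetical step names, plain strings standing in for `Executable` instances):

```python
# Slicing an ordered dict of pipeline steps, as __getitem__ does above:
# dicts are not sliceable directly, but their item lists are, and
# Python dicts preserve insertion order when reconstructed.
steps = {"download": "step-a", "train": "step-b", "save": "step-c"}

items = list(steps.items())  # [('download', 'step-a'), ...]
sub = dict(items[0:2])       # first two steps, order preserved

print(sub)  # {'download': 'step-a', 'train': 'step-b'}
```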
Expand All @@ -270,7 +284,12 @@ def setup(self, parent: Optional[Executor] = None) -> None:
Defaults to None.
"""
super().setup(parent)
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
step.setup(self)
step.is_setup = True

@@ -303,7 +322,12 @@ def execute(
Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as
(results, config).
"""
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
if not step.is_setup:
raise RuntimeError(
f"Step '{step.name}' was not setup!"
@@ -318,3 +342,77 @@ def _pack_args(self, args) -> Tuple:
if not isinstance(args, tuple):
args = (args,)
return args


def add_replace_field(
config: Dict,
key_chain: str,
value: Any
) -> None:
"""Replace or add (if not present) a field in a dictionary, following a
path of dot-separated keys. In-place operation.

Args:
config (Dict): dictionary to be modified.
key_chain (str): path of dot-separated keys to specify the location
of the new value (e.g., 'foo.bar.line' adds/overwrites the value
located at config['foo']['bar']['line']).
value (Any): the value to insert.
"""
sub_config = config
for idx, k in enumerate(key_chain.split('.')):
if idx >= len(key_chain.split('.')) - 1:
# Last key reached
break
if not isinstance(sub_config.get(k), dict):
sub_config[k] = dict()
sub_config = sub_config[k]
sub_config[k] = value
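As a quick illustration (hypothetical keys and values), `add_replace_field` both overwrites existing leaves and creates missing intermediate dictionaries. The helper below mirrors the definition above so the snippet is self-contained:

```python
def add_replace_field(config, key_chain, value):
    # Mirrors the implementation above: walk the dot-separated keys,
    # creating intermediate dicts as needed, then set the final leaf.
    sub_config = config
    keys = key_chain.split('.')
    for k in keys[:-1]:
        if not isinstance(sub_config.get(k), dict):
            sub_config[k] = {}
        sub_config = sub_config[k]
    sub_config[keys[-1]] = value

cfg = {"foo": {"bar": {"line": 1}}}
add_replace_field(cfg, "foo.bar.line", 42)       # overwrite existing leaf
add_replace_field(cfg, "foo.new.leaf", "hello")  # create missing path

print(cfg)
# {'foo': {'bar': {'line': 42}, 'new': {'leaf': 'hello'}}}
```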


def load_pipeline_step(
pipe: Union[str, Dict],
step_id: Union[str, int],
override_keys: Optional[Dict[str, Any]] = None,
verbose: bool = False
) -> Executable:
"""Instantiates a specific step from a pipeline configuration file, given
its ID (index if steps are a list, key if steps are a dictionary). It
allows overriding the step configuration with user-defined values.

Args:
pipe (Union[str, Dict]): pipeline configuration. Either a path to a
YAML file (if string), or a configuration in memory (if dict object).
step_id (Union[str, int]): step identifier: list index if steps are
represented as a list, string key if steps are represented as a
dictionary.
override_keys (Optional[Dict[str, Any]], optional): if given, maps key
path to the value to add/override. A key path is a string of
dot-separated keys (e.g., 'foo.bar.line' adds/overwrites the value
located at pipe['foo']['bar']['line']). Defaults to None.
verbose (bool, optional): if True, prints the new configuration
(obtained after overriding) to the console. Defaults to False.

Returns:
Executable: an instance of the selected step in the pipeline.
"""
if isinstance(pipe, str):
# Load pipe from YAML file path
pipe = load_yaml(pipe)
step_dict_config = pipe['executor']['init_args']['steps'][step_id]

# Override fields
if override_keys is not None:
for key_chain, value in override_keys.items():
add_replace_field(step_dict_config, key_chain, value)
if verbose:
import json
print(f"NEW STEP <ID:{step_id}> CONFIG:")
print(json.dumps(step_dict_config, indent=4))

# Wrap config under "step" field and parse it
step_dict_config = dict(step=step_dict_config)
step_parser = ArgumentParser()
step_parser.add_subclass_arguments(Executable, "step")
parsed_namespace = step_parser.parse_object(step_dict_config)
return step_parser.instantiate_classes(parsed_namespace)["step"]
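Before any jsonargparse parsing happens, the step selection and override in `load_pipeline_step` are plain dictionary operations. A sketch with a hypothetical in-memory pipeline config (class paths and values are illustrative only):

```python
# Shape of the config that load_pipeline_step expects, with steps as a
# dict keyed by step name (hypothetical values):
pipe = {
    "executor": {"init_args": {"steps": {
        "dataloader": {
            "class_path": "dataloader.Lightning3DGANDownloader",
            "init_args": {"data_path": "exp_data/", "data_url": None},
        },
    }}},
}

# Selecting a step by its ID is a nested lookup...
step_cfg = pipe["executor"]["init_args"]["steps"]["dataloader"]

# ...and override_keys={"init_args.data_path": "/tmp/3dgan"} would
# rewrite the nested field before the step is instantiated:
step_cfg["init_args"]["data_path"] = "/tmp/3dgan"

print(step_cfg["init_args"]["data_path"])  # /tmp/3dgan
```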
77 changes: 77 additions & 0 deletions src/itwinai/torch/mlflow.py
@@ -0,0 +1,77 @@
from typing import Dict, Optional
import os

import mlflow
import yaml


def _get_mlflow_logger_conf(pl_config: Dict) -> Optional[Dict]:
"""Extract MLFlowLogger configuration from a PyTorch Lightning
configuration file, if present.

Args:
pl_config (Dict): lightning configuration loaded in memory.

Returns:
Optional[Dict]: if present, the MLFlowLogger constructor arguments
(under 'init_args' key).
"""
if isinstance(pl_config['trainer']['logger'], list):
# If multiple loggers are provided
for logger_conf in pl_config['trainer']['logger']:
if logger_conf['class_path'].endswith('MLFlowLogger'):
return logger_conf['init_args']
elif pl_config['trainer']['logger']['class_path'].endswith('MLFlowLogger'):
return pl_config['trainer']['logger']['init_args']


def _mlflow_log_pl_config(pl_config: Dict, local_yaml_path: str) -> None:
os.makedirs(os.path.dirname(local_yaml_path), exist_ok=True)
with open(local_yaml_path, 'w') as outfile:
yaml.dump(pl_config, outfile, default_flow_style=False)
mlflow.log_artifact(local_yaml_path)


def init_lightning_mlflow(
pl_config: Dict,
default_experiment_name: str = 'Default',
**autolog_kwargs
) -> None:
"""Initialize MLflow for PyTorch Lightning, also setting up
auto-logging (mlflow.pytorch.autolog(...)). Creates a new MLflow
run and attaches it to the MLflow auto-logger.

Args:
pl_config (Dict): pytorch lightning configuration loaded in memory.
default_experiment_name (str, optional): used as experiment name
if it is not given in the lightning conf. Defaults to 'Default'.
**autolog_kwargs (kwargs): args for mlflow.pytorch.autolog(...).
"""
mlflow_conf: Optional[Dict] = _get_mlflow_logger_conf(pl_config)
if not mlflow_conf:
return

tracking_uri = mlflow_conf.get('tracking_uri')
if not tracking_uri:
save_path = mlflow_conf.get('save_dir')
tracking_uri = "file://" + os.path.abspath(save_path)

experiment_name = mlflow_conf.get('experiment_name')
if not experiment_name:
experiment_name = default_experiment_name

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.pytorch.autolog(**autolog_kwargs)
mlflow.start_run()

mlflow_conf['experiment_name'] = experiment_name
mlflow_conf['run_id'] = mlflow.active_run().info.run_id

_mlflow_log_pl_config(pl_config, '.tmp/pl_config.yml')


def teardown_lightning_mlflow() -> None:
"""End active mlflow run, if any."""
if mlflow.active_run() is not None:
mlflow.end_run()
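For reference, `_get_mlflow_logger_conf` simply scans the Lightning `trainer.logger` section, which may hold either a single logger or a list. A self-contained sketch (hypothetical logger entries; the logic is mirrored from the function above):

```python
from typing import Dict, Optional

def find_mlflow_conf(pl_config: Dict) -> Optional[Dict]:
    # Mirrors _get_mlflow_logger_conf above: handle both a single
    # logger and a list of loggers, matching on the class-path suffix.
    logger = pl_config["trainer"]["logger"]
    if isinstance(logger, list):
        for conf in logger:
            if conf["class_path"].endswith("MLFlowLogger"):
                return conf["init_args"]
    elif logger["class_path"].endswith("MLFlowLogger"):
        return logger["init_args"]
    return None

pl_config = {"trainer": {"logger": [
    {"class_path": "lightning.pytorch.loggers.CSVLogger",
     "init_args": {"save_dir": "csv_logs"}},
    {"class_path": "lightning.pytorch.loggers.MLFlowLogger",
     "init_args": {"experiment_name": "3dgan", "save_dir": "ml_logs"}},
]}}

print(find_mlflow_conf(pl_config)["experiment_name"])  # 3dgan
```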
File renamed without changes.
41 changes: 37 additions & 4 deletions use-cases/3dgan/README.md
@@ -1,19 +1,42 @@
# 3DGAN use case

First of all, from the repository root, create a torch environment:

```bash
make torch-gpu
```

Now, install custom requirements for 3DGAN:

```bash
micromamba activate ./.venv-pytorch
cd use-cases/3dgan
pip install -r requirements.txt
```

**NOTE**: Python commands below are assumed to be executed from within the
micromamba virtual environment.

## Training

At CERN, use the dedicated configuration file:

```bash
cd use-cases/3dgan
python train.py -p cern-pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p cern-pipeline.yaml
```

Anywhere else, use the general purpose training configuration:

```bash
cd use-cases/3dgan
python train.py -p pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p pipeline.yaml
```

To visualize the logs with MLflow, run the following in the terminal:
@@ -85,11 +108,11 @@ Build from project root with

```bash
# Local
docker buildx build -t itwinai-mnist-torch-inference -f use-cases/3dgan/Dockerfile .
docker buildx build -t itwinai-mnist-torch-inference -f use-cases/3dgan/Dockerfile.inference .

# Ghcr.io
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1 -f use-cases/3dgan/Dockerfile .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.3 -f use-cases/3dgan/Dockerfile.inference .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.3
```

From wherever a sample of MNIST jpg images is available
@@ -106,7 +129,7 @@ From wherever a sample of MNIST jpg images is available
```

```bash
docker run -it --rm --name running-inference -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker run -it --rm --name running-inference -v "$PWD":/tmp/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.3
```

This command will store the results in a folder called "3dgan-generated-data":
@@ -120,3 +143,13 @@ This command will store the results in a folder called "3dgan-generated-data":
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.pth
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.jpg
```

### Singularity

Run the container restoring Docker's `WORKDIR` (here via `cd /usr/src/app`)
and providing a writable bind mount (`-B "$PWD":/usr/data`):

```bash
singularity exec -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.3 \
bash -c "cd /usr/src/app && python train.py -p inference-pipeline.yaml"
```
36 changes: 18 additions & 18 deletions use-cases/3dgan/cern-pipeline.yaml
@@ -4,7 +4,7 @@ executor:
steps:
- class_path: dataloader.Lightning3DGANDownloader
init_args:
data_path: /eos/user/k/ktsolaki/data/3dgan_data # exp_data/
data_path: /eos/user/k/ktsolaki/data/3dgan_data
data_url: null # https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX

- class_path: trainer.Lightning3DGANTrainer
@@ -17,22 +17,22 @@ executor:
accumulate_grad_batches: 1
barebones: false
benchmark: null
# callbacks:
# # - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
# # init_args:
# # monitor: val_loss
# # patience: 2
# - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor
# init_args:
# logging_interval: step
# # - class_path: lightning.pytorch.callbacks.ModelCheckpoint
# # init_args:
# # dirpath: checkpoints
# # filename: best-checkpoint
# # mode: min
# # monitor: val_loss
# # save_top_k: 1
# # verbose: true
callbacks:
- class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
init_args:
monitor: val_generator_loss
patience: 2
- class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor
init_args:
logging_interval: step
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
dirpath: checkpoints
filename: best-checkpoint
mode: min
monitor: val_generator_loss
save_top_k: 1
verbose: true
check_val_every_n_epoch: 1
default_root_dir: null
detect_anomaly: false
@@ -92,4 +92,4 @@ executor:
datapath: /eos/user/k/ktsolaki/data/3dgan_data/*.h5 # exp_data/*/*.h5
batch_size: 128
num_workers: 0
max_samples: 3000
max_samples: 10000