From 57ea4f85b94b3cbdc5b9f100a5bad3a5f4846b98 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Wed, 6 Dec 2023 17:19:58 +0100 Subject: [PATCH 01/26] Add draft example --- experimental/train.yaml | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 experimental/train.yaml diff --git a/experimental/train.yaml b/experimental/train.yaml new file mode 100644 index 00000000..8b2beb13 --- /dev/null +++ b/experimental/train.yaml @@ -0,0 +1,40 @@ +version: 0.0.1 +name: Experiment name +description: This is a textual description +authors: + - author1 + - author2 + +vars: + images_dataset_path: some/path/disk + mlflow_tracking_uri: http://localhost:5000 + training_lr: 0.001 + +steps: + preprocessing-step: + logic: + class_path: itwinai.torch.Preprocessor + init_args: + save_path: ${vars.images_dataset_path} + after: null + env: null + + training-step: + logic: + class_path: itwinai.torch.Trainer + init_args: + lr: ${vars.training_lr} + tracking_uri: ${vars.mlflow_tracking_uri} + after: preprocessing-step + env: null + + sth_step: + logic: python inference.py -p pipeline.yaml + after: [preprocessing-step, training-step] + env: docker+ghcr.io/intertwin-eu/itwinai:training-0.0.1 + + sth_step2: + logic: python train.py -p pipeline.yaml + after: null + env: conda+path/to/my/local/env + From b096c4a0608ce1e5c1af4e4a90c892ea3b42ddf0 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Wed, 6 Dec 2023 17:21:33 +0100 Subject: [PATCH 02/26] UPDATE credits field --- experimental/train.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/train.yaml b/experimental/train.yaml index 8b2beb13..906f4e26 100644 --- a/experimental/train.yaml +++ b/experimental/train.yaml @@ -1,7 +1,7 @@ version: 0.0.1 name: Experiment name description: This is a textual description -authors: +credits: - author1 - author2 From 8a802014dd50be29983b02085e69bb0c37e85002 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Thu, 7 Dec 2023 18:16:19 +0100 Subject: [PATCH 03/26] ADD docs --- experimental/train.yaml | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/experimental/train.yaml b/experimental/train.yaml index 906f4e26..c21d4141 100644 --- a/experimental/train.yaml +++ b/experimental/train.yaml @@ -1,3 +1,5 @@ +# AI workflow metadata/header. +# They are optional and easily extensible in the future. version: 0.0.1 name: Experiment name description: This is a textual description @@ -5,14 +7,25 @@ credits: - author1 - author2 +# Provide a unified place where this *template* can be configured. +# Variables which can be overridden at runtime as env vars, e.g.: +# - Execution environment details (e.g., path in container vs. in laptop, MLFlow tracking URI) +# - Tunable parameters (e.g., learning rate) +# - Intrinsically dynamic values (e.g., MLFLow run ID is a random value) +# These variables are interpolated with OmegaConf. vars: images_dataset_path: some/path/disk mlflow_tracking_uri: http://localhost:5000 training_lr: 0.001 +# Runner-independent workflow steps. +# Each step is designed to be minimal, but easily extensible +# to accommodate future needs by adding new fields. +# The only required field is 'command'. New fields can be added +# to support future workflow executors. 
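+# Two more optional fields are used by the steps below: 'after', which
+# lists the step(s) that must complete before a step can run (a single
+# name, a list of names, or null), and 'env', which selects the execution
+# environment, e.g. 'docker+<image>' or 'conda+<path>' (null means the
+# current environment).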
steps: preprocessing-step: - logic: + command: class_path: itwinai.torch.Preprocessor init_args: save_path: ${vars.images_dataset_path} @@ -20,7 +33,7 @@ steps: env: null training-step: - logic: + command: class_path: itwinai.torch.Trainer init_args: lr: ${vars.training_lr} @@ -29,12 +42,12 @@ steps: env: null sth_step: - logic: python inference.py -p pipeline.yaml + command: python inference.py -p pipeline.yaml after: [preprocessing-step, training-step] env: docker+ghcr.io/intertwin-eu/itwinai:training-0.0.1 sth_step2: - logic: python train.py -p pipeline.yaml + command: python train.py -p pipeline.yaml after: null env: conda+path/to/my/local/env - + From b128fe074166a0740375af5fefb74291e22b40be Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Fri, 8 Dec 2023 11:16:11 +0100 Subject: [PATCH 04/26] REFACTOR components and pipeline code --- experimental/{ => workflow}/train.yaml | 0 src/itwinai/components.py | 307 +++++++------------- src/itwinai/{ => experimental}/executors.py | 14 +- src/itwinai/pipeline.py | 84 ++++++ src/itwinai/serialization.py | 4 +- src/itwinai/types.py | 8 +- tests/test_components.py | 4 +- use-cases/3dgan/train.py | 6 +- use-cases/cyclones/executor.py | 6 +- use-cases/cyclones/train.py | 4 +- use-cases/mnist/tensorflow/train.py | 6 +- use-cases/mnist/torch-lightning/train.py | 6 +- use-cases/mnist/torch/train.py | 6 +- use-cases/zebra2horse/train.py | 2 +- 14 files changed, 216 insertions(+), 241 deletions(-) rename experimental/{ => workflow}/train.yaml (100%) rename src/itwinai/{ => experimental}/executors.py (92%) create mode 100644 src/itwinai/pipeline.py diff --git a/experimental/train.yaml b/experimental/workflow/train.yaml similarity index 100% rename from experimental/train.yaml rename to experimental/workflow/train.yaml diff --git a/src/itwinai/components.py b/src/itwinai/components.py index c1e6e372..6af9aca7 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -1,58 +1,38 @@ -from __future__ import annotations -from typing import Iterable, Dict, Any, Optional, Tuple, Union +""" +This module provides the base classes to define modular and reproducible ML +workflows. The base component classes provide a template to follow for +extending existing components or creating new ones. +""" + + +from typing import Any, Optional, Tuple, Union, Callable from abc import ABCMeta, abstractmethod import time +import functools # import logging # from logging import Logger as PythonLogger -from .cluster import ClusterEnvironment -from .types import ModelML, DatasetML +from .types import MLModel, MLDataset, MLArtifact from .serialization import ModelLoader -class Executable(metaclass=ABCMeta): - """Base Executable class. - - Args: - name (Optional[str], optional): unique identifier for a step. - Defaults to None. - logs_path (Optional[str], optional): where to store the logs - produced by Python logging. Defaults to None. - """ - name: str = 'unnamed' - is_setup: bool = False - cluster: ClusterEnvironment = None - parent: Executor = None - # logs_dir: str = None - # log_file: str = None - # console: PythonLogger = None +def monitor_exec(method: Callable) -> Callable: + """Decorator for execute method of a component class. + Computes execution time and gives some information about + the execution of the component. 
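+
+    A sketch of the intended usage (hypothetical subclass):
+
+    >>> class MyComponent(BaseComponent):
+    >>>     @monitor_exec
+    >>>     def execute(self, dataset):
+    >>>         return dataset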
-    def __init__(
-        self,
-        name: Optional[str] = None,
-        # logs_dir: Optional[str] = None,
-        # debug: bool = False,
-        **kwargs
-    ) -> None:
-        self.name = name if name is not None else self.__class__.__name__
-        # self.logs_dir = logs_dir
-        # self.debug = debug
-
-    def __call__(
-        self,
-        *args: Any,
-        config: Optional[Dict] = None,
-        **kwargs: Any
-    ) -> Tuple[Optional[Tuple], Optional[Dict]]:
-        # WAIT! This method SHOULD NOT be overridden. This is just a wrapper.
-        # Override execute() instead!
+    Args:
+        method (Callable): class method.
+    """
+    @functools.wraps(method)
+    def monitored_method(self: BaseComponent, *args, **kwargs) -> Any:
         msg = f"Starting execution of '{self.name}'..."
         self._printout(msg)
         start_t = time.time()
         try:
             # print(f'ARGS: {args}')
             # print(f'KWARGS: {kwargs}')
-            result = self.execute(*args, **kwargs, config=config)
+            result = method(self, *args, **kwargs)
         finally:
             self.cleanup()
         self.exec_t = time.time() - start_t
@@ -60,25 +40,31 @@ def __call__(
         self._printout(msg)
         return result
 
-    @abstractmethod
-    def execute(
-        self,
-        *args,
-        config: Optional[Dict] = None,
-        **kwargs
-    ) -> Tuple[Optional[Tuple], Optional[Dict]]:
-        """"Execute some operations.
+    return monitored_method
+
+
+class BaseComponent(metaclass=ABCMeta):
+    """Base component class.
 
     Args:
-            args (Any, optional): generic input of the executable step.
-            config (Dict, optional): key-value configuration.
+        name (Optional[str], optional): unique identifier for a step.
             Defaults to None.
-
-        Returns:
-            Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as
-            (results, config).
         """
-        return args, config
+    name: str = 'unnamed'
+
+    def __init__(
+        self,
+        name: Optional[str] = None,
+        # logs_dir: Optional[str] = None,
+        # debug: bool = False,
+        **kwargs
+    ) -> None:
+        self.name = name if name is not None else self.__class__.__name__
+
+    @abstractmethod
+    @monitor_exec
+    def execute(self, *args, **kwargs) -> Any:
+        """Execute some operations."""
 
     # def setup_console(self):
     #     """Setup Python logging"""
@@ -101,48 +87,37 @@ def execute(
     #     )
     #     self.console = logging.getLogger(self.name)
 
-    def setup(self, parent: Optional[Executor] = None) -> None:
-        """Inherit properties from parent Executor instance.
-
-        Args:
-            parent (Optional[Executor], optional): parent executor.
-                Defaults to None.
-        """
-        if parent is None:
-            # # Setup Python logging ("console")
-            # self.logs_dir = '.logs'
-            # os.makedirs(self.logs_dir, exist_ok=True)
-            # self.setup_console()
-            self.is_setup = True
-            return
-        if self.cluster is None:
-            self.cluster = parent.cluster
-
-        # # Python logging ("console")
-        # if self.logs_dir is None:
-        #     self.logs_dir = parent.logs_dir
-        # if self.log_file is None:
-        #     self.log_file = parent.log_file
-        # if self.console is None:
-        #     self.console = logging.getLogger(self.name)
-
-        self.is_setup = True
-
     def cleanup(self):
-        pass
+        """Cleanup resources allocated by this component."""
 
-    def _printout(self, msg: str):
+    @staticmethod
+    def _printout(msg: str):
         msg = f"# {msg} #"
         print("#"*len(msg))
         print(msg)
         print("#"*len(msg))
 
 
-class Trainer(Executable):
+class Trainer(BaseComponent):
     """Trains a machine learning model."""
+
     @abstractmethod
-    def train(self, *args, **kwargs):
-        pass
+    @monitor_exec
+    def execute(
+        self,
+        train_dataset: MLDataset,
+        validation_dataset: MLDataset
+    ) -> Tuple[MLDataset, MLDataset, MLModel]:
+        """Trains a machine learning model.
+
+        Args:
+            train_dataset (MLDataset): training dataset.
+            validation_dataset (MLDataset): validation dataset.
+
+        Returns:
+            Tuple[MLDataset, MLDataset, MLModel]: training dataset,
+            validation dataset, trained model.
+        """
 
     @abstractmethod
     def save_state(self):
@@ -153,44 +128,27 @@ def load_state(self):
         pass
 
 
-class Predictor(Executable):
+class Predictor(BaseComponent):
     """Applies a pre-trained machine learning model to unseen data."""
-    model: ModelML
+    model: MLModel
 
     def __init__(
         self,
-        model: Union[ModelML, ModelLoader],
+        model: Union[MLModel, ModelLoader],
         name: Optional[str] = None,
         **kwargs
     ) -> None:
         super().__init__(name, **kwargs)
         self.model = model() if isinstance(model, ModelLoader) else model
 
-    def execute(
-        self,
-        predict_dataset: DatasetML,
-        config: Optional[Dict] = None,
-    ) -> Tuple[Optional[Tuple], Optional[Dict]]:
-        """"Execute some operations.
-
-        Args:
-            predict_dataset (DatasetML): dataset object for inference.
-            config (Dict, optional): key-value configuration.
-                Defaults to None.
-
-        Returns:
-            Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as
-            (results, config).
-        """
-        return self.predict(predict_dataset), config
-
     @abstractmethod
-    def predict(
+    @monitor_exec
+    def execute(
         self,
-        predict_dataset: DatasetML,
-        model: Optional[ModelML] = None
-    ) -> Iterable[Any]:
+        predict_dataset: MLDataset,
+        model: Optional[MLModel] = None
+    ) -> MLDataset:
         """Applies a machine learning model on a dataset of samples.
 
         Args:
@@ -199,122 +157,51 @@ def predict(
             if given. Defaults to None.
 
         Returns:
-            Iterable[Any]: predictions with the same cardinality of the
+            MLDataset: predictions with the same cardinality as the
             input dataset.
         """
 
 
-class DataGetter(Executable):
-    @abstractmethod
-    def load(self, *args, **kwargs):
-        pass
+class DataGetter(BaseComponent):
+    """Retrieves a dataset."""
 
-
-class DataPreproc(Executable):
+    @abstractmethod
-    def preproc(self, *args, **kwargs):
-        pass
+    @monitor_exec
+    def execute(self) -> MLDataset:
+        """Retrieves a dataset.
 
+        Returns:
+            MLDataset: retrieved dataset.
+        """
 
-# class StatGetter(Executable):
-#     @abstractmethod
-#     def stats(self, *args, **kwargs):
-#         pass
 
+class DataPreproc(BaseComponent):
+    """Performs dataset pre-processing."""
 
-class Saver(Executable):
+    @abstractmethod
-    def save(self, *args, **kwargs):
-        pass
+    @monitor_exec
+    def execute(self, dataset: MLDataset) -> MLDataset:
+        """Pre-processes a dataset.
 
-
-class Executor(Executable):
-    """Sets-up and executes a sequence of Executable steps."""
+        Args:
+            dataset (MLDataset): dataset.
 
-    steps: Iterable[Executable]
-    constructor_args: Dict
+        Returns:
+            MLDataset: pre-processed dataset.
+        """
 
-    def __init__(
-        self,
-        steps: Iterable[Executable],
-        name: Optional[str] = None,
-        # logs_dir: Optional[str] = None,
-        # debug: bool = False,
-        **kwargs
-    ):
-        # super().__init__(name=name, logs_dir=logs_dir, debug=debug, **kwargs)
-        super().__init__(name=name, **kwargs)
-        self.steps = steps
-        self.constructor_args = kwargs
-
-    def __getitem__(self, subscript) -> Executor:
-        if isinstance(subscript, slice):
-            s = self.steps[subscript.start:subscript.stop: subscript.step]
-            sliced = self.__class__(
-                steps=s,
-                **self.constructor_args
-            )
-            return sliced
-        else:
-            return self.steps[subscript]
-
-    def __len__(self) -> int:
-        return len(self.steps)
-
-    def setup(self, parent: Optional[Executor] = None) -> None:
-        """Inherit properties from parent Executor instance, then
-        propagates its properties to its own child steps.
+class Saver(BaseComponent):
+    """Saves artifact to disk."""
 
-        Args:
-            parent (Optional[Executor], optional): parent executor.
-            Defaults to None.
- """ - super().setup(parent) - for step in self.steps: - step.setup(self) - step.is_setup = True - - # def setup(self, config: Dict = None): - # """Pass a key-value based configuration down the pipeline, - # to propagate information computed at real-time. - - # Args: - # config (Dict, optional): key-value configuration. - # Defaults to None. - # """ - # for step in self.steps: - # config = step.setup(config) +class Saver(BaseComponent): + """Saves artifact to disk.""" - def execute( - self, - *args, - config: Optional[Dict] = None, - **kwargs - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - """"Execute some operations. + @abstractmethod + @monitor_exec + def execute(self, artifact: MLArtifact) -> MLArtifact: + """Saves an ML artifact to disk. Args: - args (Tuple, optional): generic input of the first executable step - in the pipeline. - config (Dict, optional): key-value configuration. - Defaults to None. + artifact (MLArtifact): artifact to save. Returns: - Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as - (results, config). + MLArtifact: the same input artifact, after saving it. """ - for step in self.steps: - if not step.is_setup: - raise RuntimeError( - f"Step '{step.name}' was not setup!" - ) - args = self._pack_args(args) - args, config = step(*args, **kwargs, config=config) - - return args, config - - def _pack_args(self, args) -> Tuple: - args = () if args is None else args - if not isinstance(args, tuple): - args = (args,) - return args diff --git a/src/itwinai/executors.py b/src/itwinai/experimental/executors.py similarity index 92% rename from src/itwinai/executors.py rename to src/itwinai/experimental/executors.py index d94e1c0f..2c89f1c3 100644 --- a/src/itwinai/executors.py +++ b/src/itwinai/experimental/executors.py @@ -8,11 +8,11 @@ from ray import air, tune from jsonargparse import ArgumentParser -from .components import Executor, Executable -from .utils import parse_pipe_config +from ..components import Pipeline, BaseComponent +from ..utils import parse_pipe_config -class LocalExecutor(Executor): +class LocalExecutor(Pipeline): def __init__(self, pipeline, class_dict): # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() @@ -40,7 +40,7 @@ def setup(self, args): args = executable.setup(args) -class RayExecutor(Executor): +class RayExecutor(Pipeline): def __init__(self, pipeline, class_dict, param_space): self.class_dict = class_dict self.param_space = param_space @@ -91,10 +91,10 @@ def setup(self, args): pass -class ParallelExecutor(Executor): +class ParallelExecutor(Pipeline): """Execute a pipeline in parallel: multiprocessing and multi-node.""" - def __init__(self, steps: Iterable[Executable]): + def __init__(self, steps: Iterable[BaseComponent]): super().__init__(steps) def setup(self, config: Dict = None): @@ -112,7 +112,7 @@ class HPCExecutor(ParallelExecutor): network access. """ - def __init__(self, steps: Iterable[Executable]): + def __init__(self, steps: Iterable[BaseComponent]): super().__init__(steps) def setup(self, config: Dict = None): diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py new file mode 100644 index 00000000..4f929a1e --- /dev/null +++ b/src/itwinai/pipeline.py @@ -0,0 +1,84 @@ +""" +This module provides the functionalities to execute workflows defined in +in form of pipelines. 
+""" +from __future__ import annotations +from typing import Iterable, Dict, Any, Tuple +import inspect +from .components import BaseComponent, monitor_exec + + +class Pipeline(BaseComponent): + """Executes a set of components arranged as a pipeline.""" + + steps: Iterable[BaseComponent] + constructor_args: Dict + + def __init__( + self, + steps: Iterable[BaseComponent], + **kwargs + ): + super().__init__(**kwargs) + self.steps = steps + self.constructor_args = kwargs + + def __getitem__(self, subscript) -> Pipeline: + if isinstance(subscript, slice): + s = self.steps[subscript.start:subscript.stop: subscript.step] + sliced = self.__class__( + steps=s, + name=self.name, + **self.constructor_args + ) + return sliced + else: + return self.steps[subscript] + + def __len__(self) -> int: + return len(self.steps) + + @monitor_exec + def execute(self, *args) -> Any: + """"Execute components sequentially.""" + for step in self.steps: + step: BaseComponent + args = self._pack_args(args) + self.validate_args(args, step) + args = step.execute(*args) + + return args + + @staticmethod + def _pack_args(args) -> Tuple: + """Wraps args in a tuple, if needed.""" + args = () if args is None else args + if not isinstance(args, tuple): + args = (args,) + return args + + @staticmethod + def validate_args(input_args: Tuple, component: BaseComponent): + """Verify that the number of input args provided to some component + match with the number of the non-default args in the component. + + Args: + input_args (Tuple): input args to be fed to the component. + component (BaseComponent): component to be executed. + + Raises: + RuntimeError: in case of args mismatch. + """ + comp_params = inspect.signature(component.execute).parameters.items() + non_default_par = list(filter( + lambda p: p[0] != 'self' and p[1].default == inspect._empty, + comp_params + )) + non_default_par_names = list(map(lambda p: p[0], non_default_par)) + if len(non_default_par) != len(input_args): + raise RuntimeError( + "Mismatch into the number of non-default parameters " + f"of execute method of '{component.name}' component " + f"({non_default_par_names}), and the number of arguments " + f"it received as input: {input_args}." 
+ ) diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index a7b70cd3..b0253ae1 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -1,4 +1,4 @@ -from .types import ModelML +from .types import MLModel import abc @@ -10,5 +10,5 @@ def __init__(self, model_uri: str) -> None: self.model_uri = model_uri @abc.abstractmethod - def __call__(self) -> ModelML: + def __call__(self) -> MLModel: """Loads model from model URI.""" diff --git a/src/itwinai/types.py b/src/itwinai/types.py index 9c302eb1..977068b9 100644 --- a/src/itwinai/types.py +++ b/src/itwinai/types.py @@ -3,9 +3,13 @@ """ -class DatasetML: +class MLArtifact: + """A framework-independent machine learning artifact.""" + + +class MLDataset(MLArtifact): """A framework-independent machine learning dataset.""" -class ModelML: +class MLModel(MLArtifact): """A framework-independent machine learning model.""" diff --git a/tests/test_components.py b/tests/test_components.py index f7396214..f51c483b 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -1,8 +1,8 @@ -from itwinai.components import Executor +from itwinai.components import Pipeline def test_slice(): - p = Executor(['step1', 'step2', 'step3'], pippo=2) + p = Pipeline(['step1', 'step2', 'step3'], pippo=2) assert len(p[:1]) == 1 assert p[:1][0] == 'step1' assert len(p[1:]) == 2 diff --git a/use-cases/3dgan/train.py b/use-cases/3dgan/train.py index d04596be..512015e6 100644 --- a/use-cases/3dgan/train.py +++ b/use-cases/3dgan/train.py @@ -15,7 +15,7 @@ import argparse -from itwinai.components import Executor +from itwinai.components import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser @@ -38,12 +38,12 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Executor, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "executor") # Parse, Instantiate pipe parsed = parse_pipe_config(args.pipeline, pipe_parser) pipe = pipe_parser.instantiate_classes(parsed) - executor: Executor = getattr(pipe, 'executor') + executor: Pipeline = getattr(pipe, 'executor') if args.download_only: print('Downloading datasets and exiting...') diff --git a/use-cases/cyclones/executor.py b/use-cases/cyclones/executor.py index 9c00af43..67946615 100644 --- a/use-cases/cyclones/executor.py +++ b/use-cases/cyclones/executor.py @@ -5,14 +5,14 @@ from typing import Tuple, Dict, Optional, Iterable from lib.macros import PATCH_SIZE as patch_size, SHAPE as shape -from itwinai.components import Executor, Executable +from itwinai.components import Pipeline, BaseComponent -class CycloneExecutor(Executor): +class CycloneExecutor(Pipeline): def __init__( self, run_name: str, - steps: Iterable[Executable], + steps: Iterable[BaseComponent], name: Optional[str] = None ): super().__init__(steps=steps, name=name) diff --git a/use-cases/cyclones/train.py b/use-cases/cyclones/train.py index 82a6d15d..a3ab63dd 100644 --- a/use-cases/cyclones/train.py +++ b/use-cases/cyclones/train.py @@ -13,7 +13,7 @@ import argparse -from itwinai.components import Executor +from itwinai.components import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser from executor import CycloneExecutor @@ -38,7 +38,7 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Executor, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "executor") # Parse, Instantiate pipe parsed = 
parse_pipe_config(args.pipeline, pipe_parser) diff --git a/use-cases/mnist/tensorflow/train.py b/use-cases/mnist/tensorflow/train.py index 65e12c78..7e7d71ac 100644 --- a/use-cases/mnist/tensorflow/train.py +++ b/use-cases/mnist/tensorflow/train.py @@ -13,7 +13,7 @@ import argparse -from itwinai.components import Executor +from itwinai.components import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser @@ -36,12 +36,12 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Executor, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "executor") # Parse, Instantiate pipe parsed = parse_pipe_config(args.pipeline, pipe_parser) pipe = pipe_parser.instantiate_classes(parsed) - executor: Executor = getattr(pipe, 'executor') + executor: Pipeline = getattr(pipe, 'executor') if args.download_only: print('Downloading datasets and exiting...') diff --git a/use-cases/mnist/torch-lightning/train.py b/use-cases/mnist/torch-lightning/train.py index 50c91988..09ab30a9 100644 --- a/use-cases/mnist/torch-lightning/train.py +++ b/use-cases/mnist/torch-lightning/train.py @@ -15,7 +15,7 @@ import argparse -from itwinai.components import Executor +from itwinai.components import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser @@ -38,12 +38,12 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Executor, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "executor") # Parse, Instantiate pipe parsed = parse_pipe_config(args.pipeline, pipe_parser) pipe = pipe_parser.instantiate_classes(parsed) - executor: Executor = getattr(pipe, 'executor') + executor: Pipeline = getattr(pipe, 'executor') if args.download_only: print('Downloading datasets and exiting...') diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py index 50c91988..09ab30a9 100644 --- a/use-cases/mnist/torch/train.py +++ b/use-cases/mnist/torch/train.py @@ -15,7 +15,7 @@ import argparse -from itwinai.components import Executor +from itwinai.components import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser @@ -38,12 +38,12 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Executor, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "executor") # Parse, Instantiate pipe parsed = parse_pipe_config(args.pipeline, pipe_parser) pipe = pipe_parser.instantiate_classes(parsed) - executor: Executor = getattr(pipe, 'executor') + executor: Pipeline = getattr(pipe, 'executor') if args.download_only: print('Downloading datasets and exiting...') diff --git a/use-cases/zebra2horse/train.py b/use-cases/zebra2horse/train.py index 08a91fd2..c33b9402 100644 --- a/use-cases/zebra2horse/train.py +++ b/use-cases/zebra2horse/train.py @@ -2,7 +2,7 @@ from trainer import Zebra2HorseTrainer from dataloader import Zebra2HorseDataLoader -from itwinai.executors import LocalExecutor # , RayExecutor +from itwinai.experimental.executors import LocalExecutor # , RayExecutor if __name__ == "__main__": From 8e6fabb0e85c843ece2311fc8c603db5ca2c9400 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Fri, 8 Dec 2023 11:19:58 +0100 Subject: [PATCH 05/26] UPDATE docstring --- src/itwinai/components.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 
6af9aca7..37611593 100644
--- a/src/itwinai/components.py
+++ b/src/itwinai/components.py
@@ -44,7 +44,12 @@ class BaseComponent(metaclass=ABCMeta):
-    """Base component class.
+    """Base component class. Each component provides a simple interface
+    to foster modularity in machine learning code. Each component class
+    implements the `execute` method, which receives some input ML artifacts
+    (e.g., datasets), performs some operations and returns new artifacts.
+    The components are meant to be assembled in complex ML workflows,
+    represented as pipelines.
 
     Args:
         name (Optional[str], optional): unique identifier for a step.

From 49f3bed450d6904a5330ab35be41f5d67a36cdeb Mon Sep 17 00:00:00 2001
From: Matteo Bunino
Date: Fri, 8 Dec 2023 13:48:47 +0100
Subject: [PATCH 06/26] UPDATE mnist torch uc

---
 src/itwinai/components.py                     |  2 +-
 src/itwinai/pipeline.py                       | 17 +++++
 src/itwinai/torch/inference.py                |  5 +-
 src/itwinai/torch/trainer.py                  |  8 +--
 use-cases/mnist/torch/dataloader.py           | 65 ++++---------------
 use-cases/mnist/torch/inference-pipeline.yaml |  4 +-
 use-cases/mnist/torch/pipeline.yaml           |  6 +-
 use-cases/mnist/torch/saver.py                | 21 ++----
 use-cases/mnist/torch/train.py                |  9 ++-
 9 files changed, 53 insertions(+), 84 deletions(-)

diff --git a/src/itwinai/components.py b/src/itwinai/components.py
index 37611593..b13479a1 100644
--- a/src/itwinai/components.py
+++ b/src/itwinai/components.py
@@ -4,7 +4,7 @@ extending existing components or creating new ones.
 """
 
-
+from __future__ import annotations
 from typing import Any, Optional, Tuple, Union, Callable
 from abc import ABCMeta, abstractmethod
 import time
diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py
index 4f929a1e..6fedeea0 100644
--- a/src/itwinai/pipeline.py
+++ b/src/itwinai/pipeline.py
@@ -1,6 +1,23 @@
 """
 This module provides the functionalities to execute workflows defined in
 in form of pipelines.
+
+TODO:
+- Define input and output for components, as in KubeFlow, so that it is
+not ambiguous when creating a DAG how to split/merge outputs/inputs.
+An alternative is to define additional splitter/merger blocks to manage
+the routing of inputs/outputs:
+
+>>> class Router:
+>>>     ...
+>>> class Splitter(Router):
+>>>     ...
+>>> class Merger(Router):
+>>>     ...
+
+- Create a CLI parser allowing to execute pipelines directly from their
+config file serialization, with dynamic override of fields, as done with
+Lightning CLI.
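+
+>>> # A possible CLI invocation (hypothetical at this stage; the exact
+>>> # flags below are illustrative only):
+>>> itwinai exec-pipeline --config my-pipe.yaml \
+>>>     -o pipeline.init_args.steps.0.init_args.lr=0.001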
""" from __future__ import annotations from typing import Iterable, Dict, Any, Tuple diff --git a/src/itwinai/torch/inference.py b/src/itwinai/torch/inference.py index 4d7797c6..39c16b04 100644 --- a/src/itwinai/torch/inference.py +++ b/src/itwinai/torch/inference.py @@ -8,7 +8,7 @@ from ..utils import dynamically_import_class from .utils import clear_key -from ..components import Predictor +from ..components import Predictor, monitor_exec from .types import TorchDistributedStrategy as StrategyT from .types import Metric, Batch from ..serialization import ModelLoader @@ -122,7 +122,8 @@ def __init__( # else validation_metrics # ) - def predict( + @monitor_exec + def execute( self, test_dataset: Dataset, model: nn.Module = None, diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py index 6d8a1771..f965cb2e 100644 --- a/src/itwinai/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -17,7 +17,7 @@ import torch.nn as nn from torch.optim.optimizer import Optimizer -from ..components import Trainer +from ..components import Trainer, monitor_exec from .utils import seed_worker, par_allgather_obj, clear_key from .types import ( Batch, Loss, LrScheduler, Metric @@ -309,6 +309,7 @@ def set_seed(self, seed: Optional[int] = None): if self.cluster.is_cuda_available(): torch.cuda.manual_seed(seed) + @monitor_exec def execute( self, train_dataset: Dataset, @@ -316,8 +317,7 @@ def execute( model: nn.Module = None, optimizer: Optimizer = None, lr_scheduler: LrScheduler = None, - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: + ) -> Any: self.train_dataset = train_dataset self.validation_dataset = validation_dataset @@ -337,7 +337,7 @@ def execute( result = self._train(0) # Return value compliant with Executable.execute format - return ((result,), config) + return result def _train( self, diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index 39e9b56b..609b3770 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -1,6 +1,6 @@ """Dataloader for Torch-based MNIST use case.""" -from typing import Dict, Optional, Tuple, Callable, Any +from typing import Optional, Tuple, Callable, Any import os import shutil @@ -8,59 +8,32 @@ from torch.utils.data import Dataset from torchvision import transforms, datasets -from itwinai.components import DataGetter +from itwinai.components import DataGetter, monitor_exec class MNISTDataModuleTorch(DataGetter): """Download MNIST dataset for torch.""" - def __init__( - self, - save_path: str = '.tmp/', - # batch_size: int = 32, - # pin_memory: bool = True, - # num_workers: int = 4 - ) -> None: + def __init__(self, save_path: str = '.tmp/',) -> None: super().__init__() self.save_path = save_path - # self.batch_size = batch_size - # self.pin_memory = pin_memory - # self.num_workers = num_workers - def load(self): - self.train_dataset = datasets.MNIST( + @monitor_exec + def execute(self) -> Tuple[Dataset, Dataset]: + train_dataset = datasets.MNIST( self.save_path, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) - self.val_dataset = datasets.MNIST( + validation_dataset = datasets.MNIST( self.save_path, train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) - - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Tuple[Dataset, Dataset], Optional[Dict]]: - self.load() - print("Train and valid 
datasets loaded.") - # train_dataloder = DataLoader( - # self.train_dataset, - # batch_size=self.batch_size, - # pin_memory=self.pin_memory, - # num_workers=self.num_workers - # ) - # validation_dataloader = DataLoader( - # self.val_dataset, - # batch_size=self.batch_size, - # pin_memory=self.pin_memory, - # num_workers=self.num_workers - # ) - # return (train_dataloder, validation_dataloader) - return (self.train_dataset, self.val_dataset), config + print("Train and validation datasets loaded.") + return train_dataset, validation_dataset class InferenceMNIST(Dataset): @@ -100,11 +73,6 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: """ img_id, img = list(self.data.items())[index] - # doing this so that it is consistent with all other datasets - # to return a PIL Image - # print(type(img)) - # img = Image.fromarray(img.numpy(), mode="L") - if self.transform is not None: img = self.transform(img) @@ -136,21 +104,12 @@ def generate_jpg_sample( class MNISTPredictLoader(DataGetter): - def __init__( - self, - test_data_path: str - ) -> None: + def __init__(self, test_data_path: str) -> None: super().__init__() self.test_data_path = test_data_path - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Tuple[Dataset, Dataset], Optional[Dict]]: - data = self.load() - return data, config - - def load(self) -> Dataset: + @monitor_exec + def execute(self) -> Dataset: return InferenceMNIST( root=self.test_data_path, transform=transforms.Compose([ diff --git a/use-cases/mnist/torch/inference-pipeline.yaml b/use-cases/mnist/torch/inference-pipeline.yaml index ba4f5e86..5edf6ce9 100644 --- a/use-cases/mnist/torch/inference-pipeline.yaml +++ b/use-cases/mnist/torch/inference-pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.MNISTPredictLoader diff --git a/use-cases/mnist/torch/pipeline.yaml b/use-cases/mnist/torch/pipeline.yaml index 9bb7fb98..2c631675 100644 --- a/use-cases/mnist/torch/pipeline.yaml +++ b/use-cases/mnist/torch/pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.MNISTDataModuleTorch @@ -25,7 +25,7 @@ executor: batch_size: 32 pin_memory: True shuffle: False - epochs: 30 + epochs: 2 train_metrics: accuracy: class_path: torchmetrics.classification.MulticlassAccuracy diff --git a/use-cases/mnist/torch/saver.py b/use-cases/mnist/torch/saver.py index fd54c0cf..ad4ff9ab 100644 --- a/use-cases/mnist/torch/saver.py +++ b/use-cases/mnist/torch/saver.py @@ -2,12 +2,12 @@ This module is used during inference to save predicted labels to file. """ -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict import os import shutil import csv -from itwinai.components import Saver +from itwinai.components import Saver, monitor_exec class TorchMNISTLabelSaver(Saver): @@ -27,23 +27,17 @@ def __init__( else [f'Digit {i}' for i in range(10)] ) - def execute( - self, - predicted_classes: Dict[str, int], - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: + @monitor_exec + def execute(self, predicted_classes: Dict[str, int],) -> Dict[str, int]: """Translate predictions from class idx to class label and save them to disk. Args: predicted_classes (Dict[str, int]): maps unique item ID to the predicted class ID. - config (Optional[Dict], optional): inherited configuration. 
- Defaults to None. Returns: - Tuple[Optional[Tuple], Optional[Dict]]: propagation of inherited - configuration and saver return value. + Dict[str, int]: predicted classes. """ if os.path.exists(self.save_dir): shutil.rmtree(self.save_dir) @@ -54,12 +48,11 @@ def execute( itm_name: self.class_labels[cls_idx] for itm_name, cls_idx in predicted_classes.items() } - result = self.save(predicted_labels) - return ((result,), config) - def save(self, predicted_labels: Dict[str, str]) -> None: + # Save to disk filepath = os.path.join(self.save_dir, self.predictions_file) with open(filepath, 'w') as csv_file: writer = csv.writer(csv_file) for key, value in predicted_labels.items(): writer.writerow([key, value]) + return predicted_labels diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py index 09ab30a9..182d1c29 100644 --- a/use-cases/mnist/torch/train.py +++ b/use-cases/mnist/torch/train.py @@ -15,7 +15,7 @@ import argparse -from itwinai.components import Pipeline +from itwinai.pipeline import Pipeline from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser @@ -38,12 +38,12 @@ # Create parser for the pipeline (ordered) pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, "executor") + pipe_parser.add_subclass_arguments(Pipeline, "pipeline") # Parse, Instantiate pipe parsed = parse_pipe_config(args.pipeline, pipe_parser) pipe = pipe_parser.instantiate_classes(parsed) - executor: Pipeline = getattr(pipe, 'executor') + executor: Pipeline = getattr(pipe, 'pipeline') if args.download_only: print('Downloading datasets and exiting...') @@ -51,5 +51,4 @@ else: print('Downloading datasets (if not already done) and running...') executor = executor - executor.setup() - executor() + executor.execute() From 9c7f1a470267b6f2d48408f0c1de19e1a5b80bfa Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Fri, 8 Dec 2023 14:56:20 +0100 Subject: [PATCH 07/26] ADD config file parser draft --- experimental/cli/example.yaml | 9 +++++++++ experimental/cli/mycode.py | 17 +++++++++++++++++ experimental/cli/parser.py | 27 +++++++++++++++++++++++++++ src/itwinai/parser.py | 4 ++++ 4 files changed, 57 insertions(+) create mode 100644 experimental/cli/example.yaml create mode 100644 experimental/cli/mycode.py create mode 100644 experimental/cli/parser.py create mode 100644 src/itwinai/parser.py diff --git a/experimental/cli/example.yaml b/experimental/cli/example.yaml new file mode 100644 index 00000000..ef6a342e --- /dev/null +++ b/experimental/cli/example.yaml @@ -0,0 +1,9 @@ +server: + class_path: mycode.ServerOptions + init_args: + host: localhost + port: 80 +client: + class_path: mycode.ClientOptions + init_args: + url: http://${server.init_args.host}:${server.init_args.port}/ \ No newline at end of file diff --git a/experimental/cli/mycode.py b/experimental/cli/mycode.py new file mode 100644 index 00000000..6259ad7c --- /dev/null +++ b/experimental/cli/mycode.py @@ -0,0 +1,17 @@ +# from dataclasses import dataclass + + +class ServerOptions: + host: str + port: int + + def __init__(self, host: str, port: int) -> None: + self.host = host + self.port = port + + +class ClientOptions: + url: str + + def __init__(self, url: str) -> None: + self.url = url diff --git a/experimental/cli/parser.py b/experimental/cli/parser.py new file mode 100644 index 00000000..aacbb4ff --- /dev/null +++ b/experimental/cli/parser.py @@ -0,0 +1,27 @@ +""" +Example of dynamic override of config files with (sub)class arguments, +and variable interpolation with omegaconf. 
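+
+(The example assumes 'example.yaml' and 'mycode.py', both added in this
+patch, which provide the config file and the ServerOptions/ClientOptions
+classes; with the default values, this script should print the
+interpolated URL 'http://localhost:80/' followed by the port.)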
+ +Run with: +>>> python parser.py + +Or (after clearing the arguments in parse_args(...)): +>>> python parser.py --config example.yaml --server.port 212 +See the help page of each class: +>>> python parser.py --server.help mycode.ServerOptions +""" + +from jsonargparse import ArgumentParser, ActionConfigFile +from mycode import ServerOptions, ClientOptions + +parser = ArgumentParser(parser_mode="omegaconf") +parser.add_subclass_arguments(ServerOptions, "server") +parser.add_subclass_arguments(ClientOptions, "client") +parser.add_argument("--config", action=ActionConfigFile) + +# Example of dynamic CLI override +# cfg = parser.parse_args(["--config=example.yaml", "--server.port=212"]) +cfg = parser.parse_args() +cfg = parser.instantiate_classes(cfg) +print(cfg.client.url) +print(cfg.server.port) diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py new file mode 100644 index 00000000..8f790000 --- /dev/null +++ b/src/itwinai/parser.py @@ -0,0 +1,4 @@ +""" +Provide functionalities to manage configuration files, including parsing, +execution, and dynamic override of fields. +""" From c75d4e9e7cd6fe462fac72140b896fb99f5dfc57 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Fri, 8 Dec 2023 23:18:43 +0100 Subject: [PATCH 08/26] ADD itwinaiCLI and ConfigParser --- experimental/cli/itwinai-conf.yaml | 14 ++ experimental/cli/itwinaicli.py | 29 ++++ experimental/cli/mycode.py | 22 ++- experimental/cli/parser-bk.py | 46 ++++++ experimental/cli/parser.py | 22 +-- src/itwinai/parser.py | 248 +++++++++++++++++++++++++++++ src/itwinai/pipeline.py | 28 +++- src/itwinai/utils.py | 22 ++- 8 files changed, 410 insertions(+), 21 deletions(-) create mode 100644 experimental/cli/itwinai-conf.yaml create mode 100644 experimental/cli/itwinaicli.py create mode 100644 experimental/cli/parser-bk.py diff --git a/experimental/cli/itwinai-conf.yaml b/experimental/cli/itwinai-conf.yaml new file mode 100644 index 00000000..0cb662df --- /dev/null +++ b/experimental/cli/itwinai-conf.yaml @@ -0,0 +1,14 @@ +pipeline: + class_path: itwinai.pipeline.Pipeline + steps: [server, client] + +server: + class_path: mycode.ServerOptions + init_args: + host: localhost + port: 80 + +client: + class_path: mycode.ClientOptions + init_args: + url: http://${server.init_args.host}:${server.init_args.port}/ \ No newline at end of file diff --git a/experimental/cli/itwinaicli.py b/experimental/cli/itwinaicli.py new file mode 100644 index 00000000..34fc2411 --- /dev/null +++ b/experimental/cli/itwinaicli.py @@ -0,0 +1,29 @@ +""" +>>> python itwinaicli.py --config itwinai-conf.yaml --help +>>> python itwinaicli.py --config itwinai-conf.yaml --server.port 333 +""" + + +from itwinai.parser import ConfigParser +from itwinai.parser import ItwinaiCLI + +cli = ItwinaiCLI() +print(cli.pipeline) +print(cli.pipeline.steps) +print(cli.pipeline.steps['server'].port) + + +parser = ConfigParser( + config='itwinai-conf.yaml', + override_keys={ + 'server.init_args.port': 777 + } +) +pipeline = parser.parse_pipeline() +print(pipeline) +print(pipeline.steps) +print(pipeline.steps['server'].port) + +server = parser.parse_step('server') +print(server) +print(server.port) diff --git a/experimental/cli/mycode.py b/experimental/cli/mycode.py index 6259ad7c..5da07624 100644 --- a/experimental/cli/mycode.py +++ b/experimental/cli/mycode.py @@ -1,7 +1,8 @@ # from dataclasses import dataclass +from itwinai.components import BaseComponent -class ServerOptions: +class ServerOptions(BaseComponent): host: str port: int @@ -9,9 +10,26 @@ def __init__(self, host: str, 
port: int) -> None: self.host = host self.port = port + def execute(): + ... -class ClientOptions: + +class ClientOptions(BaseComponent): url: str def __init__(self, url: str) -> None: self.url = url + + def execute(): + ... + + +class ServerOptions2(BaseComponent): + host: str + port: int + + def __init__(self, client: ClientOptions) -> None: + self.client = client + + def execute(): + ... diff --git a/experimental/cli/parser-bk.py b/experimental/cli/parser-bk.py new file mode 100644 index 00000000..8f87bf37 --- /dev/null +++ b/experimental/cli/parser-bk.py @@ -0,0 +1,46 @@ +""" +Provide functionalities to manage configuration files, including parsing, +execution, and dynamic override of fields. +""" + +from typing import Any +from jsonargparse import ArgumentParser, ActionConfigFile, Namespace + +from .components import BaseComponent + + +class ItwinaiCLI: + _parser: ArgumentParser + pipeline: BaseComponent + + def __init__( + self, + pipeline_nested_key: str = "pipeline", + args: Any = None, + parser_mode: str = "omegaconf" + ) -> None: + self.pipeline_nested_key = pipeline_nested_key + self.args = args + self.parser_mode = parser_mode + self._init_parser() + self._parse_args() + pipeline_inst = self._parser.instantiate_classes(self._config) + self.pipeline = pipeline_inst[self.pipeline_nested_key] + + def _init_parser(self): + self._parser = ArgumentParser(parser_mode=self.parser_mode) + self._parser.add_argument( + "-c", "--config", action=ActionConfigFile, + required=True, + help="Path to a configuration file in json or yaml format." + ) + self._parser.add_subclass_arguments( + baseclass=BaseComponent, + nested_key=self.pipeline_nested_key + ) + + def _parse_args(self): + if isinstance(self.args, (dict, Namespace)): + self._config = self._parser.parse_object(self.args) + else: + self._config = self._parser.parse_args(self.args) diff --git a/experimental/cli/parser.py b/experimental/cli/parser.py index aacbb4ff..f400466f 100644 --- a/experimental/cli/parser.py +++ b/experimental/cli/parser.py @@ -14,14 +14,16 @@ from jsonargparse import ArgumentParser, ActionConfigFile from mycode import ServerOptions, ClientOptions -parser = ArgumentParser(parser_mode="omegaconf") -parser.add_subclass_arguments(ServerOptions, "server") -parser.add_subclass_arguments(ClientOptions, "client") -parser.add_argument("--config", action=ActionConfigFile) +if __name__ == "__main__": + parser = ArgumentParser(parser_mode="omegaconf") + parser.add_subclass_arguments(ServerOptions, "server") + parser.add_subclass_arguments(ClientOptions, "client") + parser.add_argument("--config", action=ActionConfigFile) -# Example of dynamic CLI override -# cfg = parser.parse_args(["--config=example.yaml", "--server.port=212"]) -cfg = parser.parse_args() -cfg = parser.instantiate_classes(cfg) -print(cfg.client.url) -print(cfg.server.port) + # Example of dynamic CLI override + # cfg = parser.parse_args(["--config=example.yaml", "--server.port=212"]) + cfg = parser.parse_args() + cfg = parser.instantiate_classes(cfg) + print(cfg.client) + print(cfg.client.url) + print(cfg.server.port) diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index 8f790000..65df62da 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -2,3 +2,251 @@ Provide functionalities to manage configuration files, including parsing, execution, and dynamic override of fields. 
""" + +import sys +from typing import Dict, Any, Union, Optional +from jsonargparse import ArgumentParser, ActionConfigFile +import json +from omegaconf import OmegaConf + +from .components import BaseComponent +from .pipeline import Pipeline +from .utils import load_yaml, dynamically_import_class + + +def add_replace_field( + config: Dict, + key_chain: str, + value: Any +) -> None: + """Replace or add (if not present) a field in a dictionary, following a + path of dot-separated keys. Inplace operation. + Args: + config (Dict): dictionary to be modified. + key_chain (str): path of nested (dot-separated) keys to specify the + location + of the new value (e.g., 'foo.bar.line' adds/overwrites the value + located at config['foo']['bar']['line']). + value (Any): the value to insert. + """ + sub_config = config + for idx, k in enumerate(key_chain.split('.')): + if idx >= len(key_chain.split('.')) - 1: + # Last key reached + break + if not isinstance(sub_config.get(k), dict): + sub_config[k] = dict() + sub_config = sub_config[k] + sub_config[k] = value + + +class ConfigParser: + """Parses a configuration file, merging the steps into + the pipeline and returning a pipeline object. + It also provides functionalities for dynamic override + of fields by means of nested key notation. + + Example: + + >>> from itwinai.parser import ConfigParser + >>> + >>> parser = ConfigParser( + >>> config='itwinai-conf.yaml', + >>> override_keys={ + >>> 'server.init_args.port': 777 + >>> } + >>> ) + >>> pipeline = parser.parse_pipeline() + >>> print(pipeline) + >>> print(pipeline.steps) + >>> print(pipeline.steps['server'].port) + >>> + >>> server = parser.parse_step('server') + >>> print(server) + >>> print(server.port) + """ + + config: Dict + pipeline: Pipeline + + def __init__( + self, + config: Union[str, Dict], + override_keys: Optional[Dict[str, Any]] = None + ) -> None: + self.config = config + self.override_keys = override_keys + if isinstance(self.config, str): + self.config = load_yaml(self.config) + self._dynamic_override_keys() + self._omegaconf_interpolate() + + def _dynamic_override_keys(self): + if self.override_keys is not None: + for key_chain, value in self.override_keys.items(): + add_replace_field(self.config, key_chain, value) + + def _omegaconf_interpolate(self) -> None: + """Performs variable interpolation with OmegaConf on internal + configuration file. + """ + conf = OmegaConf.create(self.config) + self.config = OmegaConf.to_container(conf, resolve=True) + + def parse_pipeline( + self, + pipeline_nested_key: str = "pipeline", + verbose: bool = False + ) -> Pipeline: + """Merges steps into pipeline and parses it. + + Args: + pipeline_nested_key (str, optional): nested key in the + configuration file identifying the pipeline object. + Defaults to "pipeline". + verbose (bool): if True, prints the assembled pipeline + to console formatted as JSON. + + Returns: + Pipeline: instantiated pipeline. 
+ """ + pipe_parser = ArgumentParser() + pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) + pipe_dict = self.config[pipeline_nested_key] + + # Pop steps list from pipeline dictionary + steps_list = pipe_dict['steps'] + del pipe_dict['steps'] + + # Link steps with respective dictionaries + if not pipe_dict.get('init_args'): + pipe_dict['init_args'] = {} + steps_dict = pipe_dict['init_args']['steps'] = {} + for step_name in steps_list: + steps_dict[step_name] = self.config[step_name] + pipe_dict = {pipeline_nested_key: pipe_dict} + + if verbose: + print("Assembled pipeline:") + print(json.dumps(pipe_dict, indent=4)) + + # Parse pipeline dict once merged with steps + conf = pipe_parser.parse_object(pipe_dict) + pipe = pipe_parser.instantiate_classes(conf) + self.pipeline = pipe[pipeline_nested_key] + return self.pipeline + + def parse_step( + self, + step_name: str, + verbose: bool = False + ) -> BaseComponent: + step_dict_config = self.config[step_name] + + if verbose: + print(f"STEP '{step_name}' CONFIG:") + print(json.dumps(step_dict_config, indent=4)) + + # Wrap config under "step" field and parse it + step_dict_config = {'step': step_dict_config} + step_parser = ArgumentParser() + step_parser.add_subclass_arguments(BaseComponent, "step") + parsed_namespace = step_parser.parse_object(step_dict_config) + return step_parser.instantiate_classes(parsed_namespace)["step"] + + +class ItwinaiCLI: + """CLI tool for executing a configuration file, with dynamic + override of fields and variable interpolation with Omegaconf. + + Example: + + >>> # train.py + >>> from itwinai.parser import ItwinaiCLI + >>> cli = ItwinaiCLI() + >>> cli.pipeline.execute() + + >>> # pipeline.yaml + >>> pipeline: + >>> class_path: itwinai.pipeline.Pipeline + >>> steps: [server, client] + >>> + >>> server: + >>> class_path: mycode.ServerOptions + >>> init_args: + >>> host: localhost + >>> port: 80 + >>> + >>> client: + >>> class_path: mycode.ClientOptions + >>> init_args: + >>> url: http://${server.init_args.host}:${server.init_args.port}/ + + From command line: + + >>> python train.py --config itwinai-conf.yaml --help + >>> python train.py --config itwinai-conf.yaml + >>> python train.py --config itwinai-conf.yaml --server.port 8080 + """ + _parser: ArgumentParser + _config: Dict + pipeline: Pipeline + + def __init__( + self, + pipeline_nested_key: str = "pipeline", + parser_mode: str = "omegaconf" + ) -> None: + self.pipeline_nested_key = pipeline_nested_key + self.parser_mode = parser_mode + self._init_parser() + self._parser.add_argument(f"--{self.pipeline_nested_key}", type=dict) + self._add_steps_arguments() + self._config = self._parser.parse_args() + + # Merge steps into pipeline and parse it + del self._config['config'] + pipe_parser = ConfigParser(config=self._config.as_dict()) + self.pipeline = pipe_parser.parse_pipeline( + pipeline_nested_key=self.pipeline_nested_key + ) + + def _init_parser(self): + self._parser = ArgumentParser(parser_mode=self.parser_mode) + self._parser.add_argument( + "-c", "--config", action=ActionConfigFile, + required=True, + help="Path to a configuration file in json or yaml format." + ) + + def _add_steps_arguments(self): + """Pre-parses the configuration file, dynamically adding all the + component classes under 'steps' as arguments of the parser. + """ + if "--config" not in sys.argv: + raise ValueError( + "--config parameter has to be specified with a " + "valid path to a configuration file." 
+ ) + config_path = sys.argv.index("--config") + 1 + config_path = sys.argv[config_path] + config = load_yaml(config_path) + + # Add steps to parser + steps = filter( + lambda itm: itm[0] != self.pipeline_nested_key, + config.items() + ) + steps = { + step_name: step_data['class_path'] + for step_name, step_data in steps + } + + for st_nested_key, step_class_str in steps.items(): + step_class = dynamically_import_class(step_class_str) + self._add_step_arguments( + step_class=step_class, nested_key=st_nested_key) + + def _add_step_arguments(self, step_class, nested_key): + self._parser.add_subclass_arguments( + baseclass=step_class, nested_key=nested_key) diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py index 6fedeea0..29aad61f 100644 --- a/src/itwinai/pipeline.py +++ b/src/itwinai/pipeline.py @@ -20,7 +20,7 @@ fields, as done with Lightning CLI. """ from __future__ import annotations -from typing import Iterable, Dict, Any, Tuple +from typing import Iterable, Dict, Any, Tuple, Union import inspect from .components import BaseComponent, monitor_exec @@ -28,21 +28,32 @@ class Pipeline(BaseComponent): """Executes a set of components arranged as a pipeline.""" - steps: Iterable[BaseComponent] + steps: Union[Dict[str, BaseComponent], Iterable[BaseComponent]] constructor_args: Dict def __init__( self, - steps: Iterable[BaseComponent], + steps: Union[Dict[str, BaseComponent], Iterable[BaseComponent]], **kwargs ): super().__init__(**kwargs) self.steps = steps self.constructor_args = kwargs - def __getitem__(self, subscript) -> Pipeline: + def __getitem__(self, subscript: Union[str, int, slice]) -> Pipeline: if isinstance(subscript, slice): - s = self.steps[subscript.start:subscript.stop: subscript.step] + # First, convert to list if is a dict + if isinstance(self.steps, dict): + steps = list(self.steps.items()) + else: + steps = self.steps + # Second, perform slicing + s = steps[subscript.start:subscript.stop: subscript.step] + # Third, reconstruct dict, if it is a dict + if isinstance(self.steps, dict): + s = dict(s) + # Fourth, return sliced sub-pipeline, preserving its + # initial structure sliced = self.__class__( steps=s, name=self.name, @@ -58,7 +69,12 @@ def __len__(self) -> int: @monitor_exec def execute(self, *args) -> Any: """"Execute components sequentially.""" - for step in self.steps: + if isinstance(self.steps, dict): + steps = list(self.steps.values()) + else: + steps = self.steps + + for step in steps: step: BaseComponent args = self._pack_args(args) self.validate_args(args, step) diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index 1314423a..cbd527f9 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -67,9 +67,25 @@ def dynamically_import_class(name: str) -> Type: Returns: __class__: class type. """ - module, class_name = name.rsplit(".", 1) - mod = __import__(module, fromlist=[class_name]) - klass = getattr(mod, class_name) + try: + module, class_name = name.rsplit(".", 1) + mod = __import__(module, fromlist=[class_name]) + klass = getattr(mod, class_name) + except ModuleNotFoundError as err: + print( + f"Module not found when trying to dynamically import '{name}'. " + "Make sure that the module's file is reachable from your current " + "directory." + ) + raise err + except Exception as err: + print( + f"Exception occurred when trying to dynamically import '{name}'. " + "Make sure that the module's file is reachable from your current " + "directory and that the class is present in that module." 
+ ) + raise err + return klass From dba8547e5527edcd66d7068b8b8365b9e7d70b1f Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Mon, 11 Dec 2023 11:11:45 +0100 Subject: [PATCH 09/26] ADD docs --- src/itwinai/components.py | 79 +++++++++++++++++++++++++++++++++++++++ src/itwinai/parser.py | 26 ++++++++++++- 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/src/itwinai/components.py b/src/itwinai/components.py index b13479a1..dafc2770 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -2,8 +2,87 @@ This module provides the base classes to define modular and reproducible ML workflows. The base component classes provide a template to follow for extending existing components or creating new ones. + +There are two ways of creating workflows: simple and advanced workflows. + +Simple workflows can be obtained by creating a sequence of components +wrapped in a Pipeline object, which executes them in cascade, passing the +output of a component as the input of the following one. It is responsibility +of the user to prevent mismatches among outputs and inputs of component +sequences. This pipeline can be configured +both in terms of parameters and structure, with a configuration file +representing the whole pipeline. This configuration file can be executed +using itwinai CLI without the need of python files. + +Example: + +>>> from itwinai.components import DataGetter, Saver +>>> from itwinai.pipeline import Pipeline +>>> +>>> my_pipe = Pipeline({"getter": DataGetter(...), "data_saver": Saver(...)}) +>>> my_pipe.execute() +>>> my_pipe.to_yaml("training_pipe.yaml") +>>> +>>> # The pipeline can be parsed back to Python with: +>>> from itwinai.parser import PipeParser +>>> my_pipe = PipeParser("training_pipe.yaml") +>>> my_pipe.execute() +>>> +>>> # Run the pipeline from configuration file with dynamic override +>>> itwinai exec-pipeline --config training_pipe.yaml \ +>>> --override pipeline.init_args.steps.data_saver.some_param 42 + + +Advanced workflows foresee more complicated connections between the +components and it is very difficult to define a structure beforehand +without risking of over-constraining the user. Therefore, advanced +workflows are defined by explicitly connecting component outputs to +to the inputs of other components, without a wrapper Pipeline object. +In this case, the configuration files enable the user to persist the +parameters passed to the argument parser, enabling reuse through +configuration files, with the possibility of dynamic overrides of parameters. + +Example: + +>>> from jsonargparse import ArgumentParser, ActionConfigFile +>>> +>>> parser = ArgumentParser(description='PyTorch MNIST Example') +>>> parser.add_argument('--batch-size', type=int, default=64, +>>> help='input batch size for training (default: 64)') +>>> parser.add_argument('--epochs', type=int, default=10, +>>> help='number of epochs to train (default: 10)') +>>> parser.add_argument('--lr', type=float, default=0.01, +>>> help='learning rate (default: 0.01)') +>>> parser.add_argument( +>>> "-c", "--config", action=ActionConfigFile, +>>> required=True, +>>> help="Path to a configuration file in json or yaml format." +>>> ) +>>> args = parser.parse_args() +>>> +>>> from itwinai.components import ( +>>> DataGetter, Saver, DataSplitter, Trainer +>>> ) +>>> getter = DataGetter(...) +>>> splitter = DataSplitter(...) +>>> data_saver = Saver(...) +>>> model_saver = Saver(...) 
+>>> trainer = Trainer(
+>>>     batch_size=args.batch_size, lr=args.lr, epochs=args.epochs
+>>> )
+>>>
+>>> # Compose workflow
+>>> my_dataset = getter.execute()
+>>> train_set, valid_set, test_set = splitter.execute(my_dataset)
+>>> data_saver.execute("train_dataset.pkl", train_set)
+>>> _, _, _, trained_model = trainer.execute(train_set, valid_set)
+>>> model_saver.execute(trained_model)
+>>>
+>>> # Run the script using a previous configuration with dynamic override
+>>> python my_train.py --config training_pipe.yaml --lr 0.002
 """
+
 from __future__ import annotations
 from typing import Any, Optional, Tuple, Union, Callable
 from abc import ABCMeta, abstractmethod
 import time
diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py
index 65df62da..a2ab5abd 100644
--- a/src/itwinai/parser.py
+++ b/src/itwinai/parser.py
@@ -40,6 +40,14 @@ def add_replace_field(
             sub_config[k] = value
 
 
+class ConfigParser2:
+    ...
+
+
+class ItwinaiCLI2:
+    ...
+
+
 class ConfigParser:
     """Parses a configuration file, merging the steps into
     the pipeline and returning a pipeline object.
@@ -48,10 +56,26 @@ class ConfigParser:
 
     Example:
 
+    >>> # pipeline.yaml
+    >>> pipeline:
+    >>>   class_path: itwinai.pipeline.Pipeline
+    >>>   steps: [server, client]
+    >>>
+    >>> server:
+    >>>   class_path: mycode.ServerOptions
+    >>>   init_args:
+    >>>     host: localhost
+    >>>     port: 80
+    >>>
+    >>> client:
+    >>>   class_path: mycode.ClientOptions
+    >>>   init_args:
+    >>>     url: http://${server.init_args.host}:${server.init_args.port}/
+
     >>> from itwinai.parser import ConfigParser
     >>>
     >>> parser = ConfigParser(
-    >>>     config='itwinai-conf.yaml',
+    >>>     config='pipeline.yaml',
     >>>     override_keys={
     >>>         'server.init_args.port': 777
     >>>     }

From f9499d3526bfb70c2a84b5efaa7399d87009716b Mon Sep 17 00:00:00 2001
From: Matteo Bunino
Date: Mon, 11 Dec 2023 18:05:56 +0100
Subject: [PATCH 10/26] ADD pipeline parser and serializer plus tests

---
 experimental/cli/itwinaicli.py        |   4 +-
 pyproject.toml                        |   3 +-
 src/itwinai/cli.py                    |  47 +++++++
 src/itwinai/components.py             |  28 ++--
 src/itwinai/parser.py                 | 191 ++++++++++++++++++++++++--
 src/itwinai/pipeline.py               |  12 +-
 src/itwinai/serialization.py          | 134 +++++++++++++++++-
 src/itwinai/tests/__init__.py         |   1 +
 src/itwinai/tests/dummy_components.py |  31 +++++
 tests/components/conftest.py          |  72 ++++++++++
 tests/components/test_components.py   |   0
 tests/components/test_pipe_parser.py  | 144 +++++++++++++++++++
 tests/components/test_pipeline.py     |  58 ++++++++
 tests/test_components.py              |   9 --
 14 files changed, 693 insertions(+), 41 deletions(-)
 create mode 100644 src/itwinai/tests/__init__.py
 create mode 100644 src/itwinai/tests/dummy_components.py
 create mode 100644 tests/components/conftest.py
 create mode 100644 tests/components/test_components.py
 create mode 100644 tests/components/test_pipe_parser.py
 create mode 100644 tests/components/test_pipeline.py
 delete mode 100644 tests/test_components.py

diff --git a/experimental/cli/itwinaicli.py b/experimental/cli/itwinaicli.py
index 34fc2411..6a22bfb1 100644
--- a/experimental/cli/itwinaicli.py
+++ b/experimental/cli/itwinaicli.py
@@ -4,7 +4,7 @@
 """
 
 
-from itwinai.parser import ConfigParser
+from itwinai.parser import ConfigParser2
 from itwinai.parser import ItwinaiCLI
 
 cli = ItwinaiCLI()
@@ -13,7 +13,7 @@
 print(cli.pipeline.steps['server'].port)
 
 
-parser = ConfigParser(
+parser = ConfigParser2(
     config='itwinai-conf.yaml',
     override_keys={
         'server.init_args.port': 777
diff --git a/pyproject.toml b/pyproject.toml
index 7a780b7c..8d50247a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,8 @@ dependencies = [
"typing-extensions==4.5.0", "typing_extensions==4.5.0", "urllib3>=2.0.5", + "rich>=13.5.3", + "typer>=0.9.0", ] # dynamic = ["version", "description"] @@ -43,7 +45,6 @@ dependencies = [ # TODO: add torch and tensorflow # torch = [] # tf = [] -cli = ["rich>=13.5.3", "typer>=0.9.0"] dev = [ "pytest>=7.4.2", "pytest-mock>=3.11.1", diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py index bc1b852e..12954fbf 100644 --- a/src/itwinai/cli.py +++ b/src/itwinai/cli.py @@ -10,12 +10,59 @@ # NOTE: import libs in the command"s function, not here. # Otherwise this will slow the whole CLI. +from typing import Optional, List +from typing_extensions import Annotated +from pathlib import Path import typer app = typer.Typer() +@app.command() +def exec_pipeline( + config: Annotated[Path, typer.Option( + help="Path to the configuration file of the pipeline to execute." + )], + pipe_key: Annotated[str, typer.Option( + help=("Key in the configuration file identifying " + "the pipeline object to execute.") + )] = "pipeline", + overrides_list: Annotated[ + Optional[List[str]], typer.Option( + "--override", "-o", + help=( + "Nested key to dynamically override elements in the " + "configuration file with the " + "corresponding new value, joined by '='. It is also possible " + "to index elements in lists using their list index. " + "Example: [...] " + "-o pipeline.init_args.trainer.init_args.lr=0.001 " + "-o pipeline.my_list.2.batch_size=64 " + ) + ) + ] = None +): + """Execute a pipeline from configuration file. + Allows dynamic override of fields. + """ + # Add working directory to python path so that the interpreter is able + # to find the local python files imported from the pipeline file + import os + import sys + sys.path.append(os.getcwd()) + + # Parse and execute pipeline + from itwinai.parser import ConfigParser + overrides = { + k: v for k, v + in map(lambda x: (x.split('=')[0], x.split('=')[1]), overrides_list) + } + parser = ConfigParser(config=config, override_keys=overrides) + pipeline = parser.parse_pipeline(pipeline_nested_key=pipe_key) + pipeline.execute() + + @app.command() def mlflow_ui( path: str = typer.Option("ml-logs/", help="Path to logs storage."), diff --git a/src/itwinai/components.py b/src/itwinai/components.py index dafc2770..21fa6802 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -84,15 +84,15 @@ from __future__ import annotations -from typing import Any, Optional, Tuple, Union, Callable -from abc import ABCMeta, abstractmethod +from typing import Any, Optional, Tuple, Union, Callable, Dict +from abc import ABC, abstractmethod import time import functools # import logging # from logging import Logger as PythonLogger from .types import MLModel, MLDataset, MLArtifact -from .serialization import ModelLoader +from .serialization import ModelLoader, Serializable def monitor_exec(method: Callable) -> Callable: @@ -122,7 +122,7 @@ def monitored_method(self: BaseComponent, *args, **kwargs) -> Any: return monitored_method -class BaseComponent(metaclass=ABCMeta): +class BaseComponent(ABC, Serializable): """Base component class. Each component provides a simple interface to foster modularity in machine learning code. Each component class implements the `execute` method, which received some input ML artifacts @@ -134,16 +134,26 @@ class BaseComponent(metaclass=ABCMeta): name (Optional[str], optional): unique identifier for a step. Defaults to None. 
""" - name: str = 'unnamed' + _name: str = 'unnamed' + parameters: Dict[Any, Any] = None def __init__( self, name: Optional[str] = None, # logs_dir: Optional[str] = None, # debug: bool = False, - **kwargs ) -> None: - self.name = name if name is not None else self.__class__.__name__ + self.save_parameters(name=name) + + @property + def name(self) -> str: + return ( + self._name if self._name is not None else self.__class__.__name__ + ) + + @name.setter + def name(self, name: str) -> None: + self._name = name @abstractmethod @monitor_exec @@ -221,9 +231,9 @@ def __init__( self, model: Union[MLModel, ModelLoader], name: Optional[str] = None, - **kwargs ) -> None: - super().__init__(name, **kwargs) + super().__init__(name=name) + self.save_parameters(model=model, name=name) self.model = model() if isinstance(model, ModelLoader) else model @abstractmethod diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index a2ab5abd..3cda509e 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -8,6 +8,7 @@ from jsonargparse import ArgumentParser, ActionConfigFile import json from omegaconf import OmegaConf +from pathlib import Path from .components import BaseComponent from .pipeline import Pipeline @@ -34,22 +35,186 @@ def add_replace_field( if idx >= len(key_chain.split('.')) - 1: # Last key reached break - if not isinstance(sub_config.get(k), dict): + + if isinstance(sub_config, (list, tuple)): + k = int(k) + next_elem = sub_config[k] + else: + next_elem = sub_config.get(k) + + if not isinstance(next_elem, (dict, list, tuple)): sub_config[k] = dict() + sub_config = sub_config[k] + if isinstance(sub_config, (list, tuple)): + k = int(k) sub_config[k] = value -class ConfigParser2: - ... +# def add_replace_field( +# config: Dict, +# key_chain: str, +# value: Any +# ) -> None: +# """Replace or add (if not present) a field in a dictionary, following a +# path of dot-separated keys. Inplace operation. +# Args: +# config (Dict): dictionary to be modified. +# key_chain (str): path of nested (dot-separated) keys to specify the +# location +# of the new value (e.g., 'foo.bar.line' adds/overwrites the value +# located at config['foo']['bar']['line']). +# value (Any): the value to insert. +# """ +# sub_config = config +# for idx, k in enumerate(key_chain.split('.')): +# if idx >= len(key_chain.split('.')) - 1: +# # Last key reached +# break +# if not isinstance(sub_config.get(k), dict): +# sub_config[k] = dict() +# sub_config = sub_config[k] +# sub_config[k] = value -class ItwinaiCLI2: - ... +class ConfigParser: + """ + Parses a pipeline from a configuration file. + It also provides functionalities for dynamic override + of fields by means of nested key notation. 
+    Example:
+
+    >>> # pipeline.yaml file
+    >>> pipeline:
+    >>>   class_path: itwinai.pipeline.Pipeline
+    >>>   init_args:
+    >>>     steps:
+    >>>       - class_path: dataloader.MNISTDataModuleTorch
+    >>>         init_args:
+    >>>           save_path: .tmp/
+    >>>
+    >>>       - class_path: itwinai.torch.trainer.TorchTrainerMG
+    >>>         init_args:
+    >>>           model:
+    >>>             class_path: model.Net
+    >>>           loss:
+    >>>             class_path: torch.nn.NLLLoss
+    >>>             init_args:
+    >>>               reduction: mean
+
+    >>> from itwinai.parser import ConfigParser
+    >>>
+    >>> parser = ConfigParser(
+    >>>     config='pipeline.yaml',
+    >>>     override_keys={
+    >>>         'pipeline.init_args.steps.0.init_args.save_path': '/save/path'
+    >>>     }
+    >>> )
+    >>> pipeline = parser.parse_pipeline()
+    >>> print(pipeline)
+    >>> print(pipeline.steps)
+    >>>
+    >>> dataloader = parser.parse_step(0)
+    >>> print(dataloader)
+    >>> print(dataloader.save_path)
+    """
+
+    config: Dict
+    pipeline: Pipeline
+
+    def __init__(
+        self,
+        config: Union[str, Dict],
+        override_keys: Optional[Dict[str, Any]] = None
+    ) -> None:
+        self.config = config
+        self.override_keys = override_keys
+        if isinstance(self.config, (str, Path)):
+            self.config = load_yaml(self.config)
+        self._dynamic_override_keys()
+        self._omegaconf_interpolate()
+
+    def _dynamic_override_keys(self):
+        if self.override_keys is not None:
+            for key_chain, value in self.override_keys.items():
+                add_replace_field(self.config, key_chain, value)
+
+    def _omegaconf_interpolate(self) -> None:
+        """Performs variable interpolation with OmegaConf on internal
+        configuration file.
+        """
+        conf = OmegaConf.create(self.config)
+        self.config = OmegaConf.to_container(conf, resolve=True)
+
+    def parse_pipeline(
+        self,
+        pipeline_nested_key: str = "pipeline",
+        verbose: bool = False
+    ) -> Pipeline:
+        """Parses the pipeline from the loaded configuration.
+
+        Args:
+            pipeline_nested_key (str, optional): nested key in the
+                configuration file identifying the pipeline object.
+                Defaults to "pipeline".
+            verbose (bool): if True, prints the assembled pipeline
+                to console formatted as JSON.
+
+        Returns:
+            Pipeline: instantiated pipeline.
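+
+        Example (hypothetical, assuming ``parser`` was created from a
+        configuration like the one in the class docstring above):
+
+        >>> pipeline = parser.parse_pipeline(pipeline_nested_key="pipeline")
+        >>> pipeline.execute()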
+ """ + pipe_parser = ArgumentParser() + pipe_parser.add_subclass_arguments(Pipeline, "pipeline") + + pipe_dict = self.config + for key in pipeline_nested_key.split('.'): + pipe_dict = pipe_dict[key] + # pipe_dict = self.config[pipeline_nested_key] + pipe_dict = {"pipeline": pipe_dict} + + if verbose: + print("Assembled pipeline:") + print(json.dumps(pipe_dict, indent=4)) + + # Parse pipeline dict once merged with steps + conf = pipe_parser.parse_object(pipe_dict) + pipe = pipe_parser.instantiate_classes(conf) + self.pipeline = pipe["pipeline"] + return self.pipeline + + def parse_step( + self, + step_name: str, + pipeline_nested_key: str = "pipeline", + verbose: bool = False + ) -> BaseComponent: + pipeline_dict = self.config + for key in pipeline_nested_key.split('.'): + pipeline_dict = pipeline_dict[key] + + step_dict_config = pipeline_dict['init_args']['steps'][step_name] + + if verbose: + print(f"STEP '{step_name}' CONFIG:") + print(json.dumps(step_dict_config, indent=4)) + + # Wrap config under "step" field and parse it + step_dict_config = {'step': step_dict_config} + step_parser = ArgumentParser() + step_parser.add_subclass_arguments(BaseComponent, "step") + parsed_namespace = step_parser.parse_object(step_dict_config) + return step_parser.instantiate_classes(parsed_namespace)["step"] + + +class ConfigParser2: + """ + Deprecated: this pipeline structure does not allow for + nested pipelines. However, it is more readable and the linking + from name to step data could be achieved with OmegaConf. This + could be reused in the future: left as example. + + Parses a configuration file, merging the steps into the pipeline and returning a pipeline object. It also provides functionalities for dynamic override of fields by means of nested key notation. @@ -72,9 +237,9 @@ class ConfigParser: >>> init_args: >>> url: http://${server.init_args.host}:${server.init_args.port}/ - >>> from itwinai.parser import ConfigParser + >>> from itwinai.parser import ConfigParser2 >>> - >>> parser = ConfigParser( + >>> parser = ConfigParser2( >>> config='pipeline.yaml', >>> override_keys={ >>> 'server.init_args.port': 777 @@ -180,7 +345,11 @@ def parse_step( class ItwinaiCLI: - """CLI tool for executing a configuration file, with dynamic + """ + Deprecated: the dynamic override does not work with nested parameters + and may be confusing. + + CLI tool for executing a configuration file, with dynamic override of fields and variable interpolation with Omegaconf. Example: @@ -230,7 +399,7 @@ def __init__( # Merge steps into pipeline and parse it del self._config['config'] - pipe_parser = ConfigParser(config=self._config.as_dict()) + pipe_parser = ConfigParser2(config=self._config.as_dict()) self.pipeline = pipe_parser.parse_pipeline( pipeline_nested_key=self.pipeline_nested_key ) diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py index 29aad61f..acc9a3ee 100644 --- a/src/itwinai/pipeline.py +++ b/src/itwinai/pipeline.py @@ -20,7 +20,7 @@ fields, as done with Lightning CLI. 
""" from __future__ import annotations -from typing import Iterable, Dict, Any, Tuple, Union +from typing import Iterable, Dict, Any, Tuple, Union, Optional import inspect from .components import BaseComponent, monitor_exec @@ -29,16 +29,15 @@ class Pipeline(BaseComponent): """Executes a set of components arranged as a pipeline.""" steps: Union[Dict[str, BaseComponent], Iterable[BaseComponent]] - constructor_args: Dict def __init__( self, steps: Union[Dict[str, BaseComponent], Iterable[BaseComponent]], - **kwargs + name: Optional[str] = None ): - super().__init__(**kwargs) + super().__init__(name=name) + self.save_parameters(steps=steps, name=name) self.steps = steps - self.constructor_args = kwargs def __getitem__(self, subscript: Union[str, int, slice]) -> Pipeline: if isinstance(subscript, slice): @@ -56,8 +55,7 @@ def __getitem__(self, subscript: Union[str, int, slice]) -> Pipeline: # initial structure sliced = self.__class__( steps=s, - name=self.name, - **self.constructor_args + name=self.name ) return sliced else: diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index b0253ae1..2b7de63d 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -1,8 +1,138 @@ -from .types import MLModel +from typing import Dict, Any import abc +import json +import inspect + +from .types import MLModel + + +def is_jsonable(x): + try: + json.dumps(x) + return True + except Exception: + return False + + +def fullname(o): + klass = o.__class__ + module = klass.__module__ + if module == 'builtins': + return klass.__qualname__ # avoid outputs like 'builtins.str' + return module + '.' + klass.__qualname__ + + +class SerializationError(Exception): + ... + + +class Serializable: + parameters: Dict[Any, Any] = None + + def save_parameters(self, **kwargs) -> None: + """Simplified way to store constructor arguments in as class + attributes. Keeps track of the parameters to enable + YAML/JSON serialization. + """ + if self.parameters is None: + self.parameters = {} + self.parameters.update(kwargs) + + for k, v in kwargs.items(): + self.__setattr__(k, v) + + def update_parameters(self, **kwargs) -> None: + """Updates stored parameters.""" + self.save_parameters(**kwargs) + + def to_dict(self) -> Dict: + """Returns a dict serialization of the current object.""" + self._validate_parameters() + class_path = fullname(self) + init_args = dict() + for par_name, par in self._saved_constructor_parameters().items(): + init_args[par_name] = self._recursive_serialization(par, par_name) + return dict(class_path=class_path, init_args=init_args) + + def _validate_parameters(self) -> None: + if self.parameters is None: + raise SerializationError( + f"{self.__class__.__name__} cannot be serialized " + "because its constructor arguments were not saved. " + "Please add 'self.save_parameters(param_1=param_1, " + "..., param_n=param_n)' as first instruction of its " + "constructor." + ) + + init_params = inspect.signature(self.__init__).parameters.items() + + # Check that all non-default parameters are in self.parameters + non_default_par = list(filter( + lambda p: p[0] != 'self' and p[1].default == inspect._empty, + init_params + )) + non_default_par_names = list(map(lambda p: p[0], non_default_par)) + for par_name in non_default_par_names: + if self.parameters.get(par_name) is None: + raise SerializationError( + f"Required parameter '{par_name}' of " + f"{self.__class__.__name__} class not present in " + "saved parameters. 
" + "Please add 'self.save_parameters(param_1=param_1, " + "..., param_n=param_n)' as first instruction of its " + f"constructor, including also '{par_name}'." + ) + + # # Check that all params in self.parameters match with the signature + # init_par_nam = set(map(lambda x: x[0], init_params)) + # sav_par_nam = set(self.parameters.keys()) + # if len(init_par_nam.intersection(sav_par_nam)) != len(sav_par_nam): + # raise SerializationError( + # "Some parameters saved with " + # "'self.save_parameters(param_1=param_1, " + # "..., param_n=param_n)' " + # "Are unused not present in the constructor of " + # f"'{self.__class__.__name__}' class. Please remove them." + # ) + + def _saved_constructor_parameters(self) -> Dict[str, Any]: + """Extracts the current constructor parameters from all + the saved parameters, as some of them may had been added by + superclasses. + + Returns: + Dict[str, Any]: subset of saved parameters containing only + the constructor parameters for this class. + """ + init_params = inspect.signature(self.__init__).parameters.items() + init_par_nam = map(lambda x: x[0], init_params) + return { + par_name: self.parameters[par_name] for par_name in init_par_nam + if self.parameters.get(par_name, inspect._empty) != inspect._empty + } + + def _recursive_serialization(self, item: Any, item_name: str) -> Any: + if isinstance(item, (tuple, list)): + return [self._recursive_serialization(x, item_name) for x in item] + elif isinstance(item, dict): + return { + k: self._recursive_serialization(v, item_name) + for k, v in item.items() + } + elif is_jsonable(item): + return item + elif isinstance(item, Serializable): + return item.to_dict() + else: + raise SerializationError( + f"{self.__class__.__name__} cannot be serialized " + f"because its constructor argument '{item_name}' " + "is not a Python built-in type and it does not " + "extend 'itwinai.serialization.Serializable' class." + ) -class ModelLoader(abc.ABC): +class ModelLoader(abc.ABC, Serializable): """Loads a machine learning model from somewhere.""" def __init__(self, model_uri: str) -> None: diff --git a/src/itwinai/tests/__init__.py b/src/itwinai/tests/__init__.py new file mode 100644 index 00000000..9eaae9a1 --- /dev/null +++ b/src/itwinai/tests/__init__.py @@ -0,0 +1 @@ +from .dummy_components import * diff --git a/src/itwinai/tests/dummy_components.py b/src/itwinai/tests/dummy_components.py new file mode 100644 index 00000000..1daa4c08 --- /dev/null +++ b/src/itwinai/tests/dummy_components.py @@ -0,0 +1,31 @@ +from typing import Optional +from ..components import BaseComponent + + +class FakePreproc(BaseComponent): + def __init__(self, max_items: int, name: Optional[str] = None + ) -> None: + super().__init__(name) + self.save_parameters(max_items=max_items, name=name) + + def execute(self): + ... + + +class FakeTrainer(BaseComponent): + def __init__(self, lr: float, batch_size: int, name: Optional[str] = None + ) -> None: + super().__init__(name) + self.save_parameters(lr=lr, batch_size=batch_size, name=name) + + def execute(self): + ... + + +class FakeSaver(BaseComponent): + def __init__(self, save_path: str, name: Optional[str] = None) -> None: + super().__init__(name) + self.save_parameters(save_path=save_path, name=name) + + def execute(self): + ... 
diff --git a/tests/components/conftest.py b/tests/components/conftest.py new file mode 100644 index 00000000..0ba66af1 --- /dev/null +++ b/tests/components/conftest.py @@ -0,0 +1,72 @@ +import pytest + +pytest.PIPE_LIST_YAML = """ +my-list-pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + - class_path: itwinai.tests.dummy_components.FakePreproc + init_args: + max_items: 32 + name: my-preproc + + - class_path: itwinai.tests.dummy_components.FakeTrainer + init_args: + lr: 0.001 + batch_size: 32 + name: my-trainer + + - class_path: itwinai.tests.dummy_components.FakeSaver + init_args: + save_path: ./some/path + name: my-saver +""" + +pytest.PIPE_DICT_YAML = """ +my-dict-pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + preproc-step: + class_path: itwinai.tests.dummy_components.FakePreproc + init_args: + max_items: 32 + name: my-preproc + + train-step: + class_path: itwinai.tests.dummy_components.FakeTrainer + init_args: + lr: 0.001 + batch_size: 32 + name: my-trainer + + save-step: + class_path: itwinai.tests.dummy_components.FakeSaver + init_args: + save_path: ./some/path + name: my-saver +""" + +pytest.NESTED_PIPELINE = """ +some: + field: + nst-pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + - class_path: itwinai.tests.FakePreproc + init_args: + max_items: 32 + name: my-preproc + + - class_path: itwinai.tests.FakeTrainer + init_args: + lr: 0.001 + batch_size: 32 + name: my-trainer + + - class_path: itwinai.tests.FakeSaver + init_args: + save_path: ./some/path + name: my-saver +""" diff --git a/tests/components/test_components.py b/tests/components/test_components.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/components/test_pipe_parser.py b/tests/components/test_pipe_parser.py new file mode 100644 index 00000000..293461c5 --- /dev/null +++ b/tests/components/test_pipe_parser.py @@ -0,0 +1,144 @@ +import yaml +import pytest + +from itwinai.components import BaseComponent +from itwinai.parser import ConfigParser, add_replace_field + + +def test_add_replace_field(): + conf = {} + add_replace_field(conf, "some.key.chain", 123) + target1 = dict(some=dict(key=dict(chain=123))) + assert conf == target1 + + add_replace_field(conf, "some.key.chain", 222) + target2 = dict(some=dict(key=dict(chain=222))) + assert conf == target2 + + add_replace_field(conf, "some.key.field", 333) + target3 = dict(some=dict(key=dict(chain=222, field=333))) + assert conf == target3 + + conf['some']['list'] = [1, 2, 3] + add_replace_field(conf, "some.list.0", 3) + target4 = dict(some=dict( + key=dict(chain=222, field=333), + list=[3, 2, 3] + )) + assert conf == target4 + + print('-'*55) + + add_replace_field(conf, "some.list.0.some.el", 7) + target5 = dict(some=dict( + key=dict(chain=222, field=333), + list=[dict(some=dict(el=7)), 2, 3] + )) + assert conf == target5 + + conf2 = dict(first=dict(list1=[[0, 1], [2, 3]], el=0)) + add_replace_field(conf2, "first.list1.1.0", 77) + target6 = dict(first=dict(list1=[[0, 1], [77, 3]], el=0)) + assert conf2 == target6 + + conf3 = dict(first=dict( + list1=[[0, dict(nst=("el", dict(ciao="ciao")))], [2, 3]], el=0)) + add_replace_field(conf3, "first.list1.0.1.nst.1.ciao", "hello") + target7 = dict(first=dict( + list1=[[0, dict(nst=("el", dict(ciao="hello")))], [2, 3]], el=0)) + assert conf3 == target7 + + add_replace_field(conf3, "first.list1.0.1.nst.1.ciao.I.am.john", True) + target8 = dict(first=dict( + list1=[ + [0, dict(nst=("el", dict(ciao=dict(I=dict(am=dict(john=True))))))], + [2, 3] 
+        ], el=0))
+    assert conf3 == target8
+
+
+def test_parse_list_pipeline():
+    """Parse a pipeline from config file,
+    where the pipeline is defined as a list of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_LIST_YAML)
+    parser = ConfigParser(config=config)
+    pipe = parser.parse_pipeline(
+        pipeline_nested_key="my-list-pipeline"
+    )
+
+    assert isinstance(pipe.steps, list)
+    for step in pipe.steps:
+        assert isinstance(step, BaseComponent)
+
+
+def test_parse_dict_pipeline():
+    """Parse a pipeline from config file,
+    where the pipeline is defined as a dict of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_DICT_YAML)
+    parser = ConfigParser(config=config)
+    pipe = parser.parse_pipeline(
+        pipeline_nested_key="my-dict-pipeline"
+    )
+
+    assert isinstance(pipe.steps, dict)
+    for step in pipe.steps.values():
+        assert isinstance(step, BaseComponent)
+
+
+def test_parse_non_existing_pipeline():
+    """Parse a pipeline from config file,
+    where the pipeline key is wrong.
+    """
+    config = yaml.safe_load(pytest.PIPE_DICT_YAML)
+    parser = ConfigParser(config=config)
+    with pytest.raises(KeyError):
+        _ = parser.parse_pipeline(
+            pipeline_nested_key="non-existing-pipeline"
+        )
+
+
+def test_parse_nested_pipeline():
+    """Parse a pipeline from config file,
+    where the pipeline key is nested.
+    """
+    config = yaml.safe_load(pytest.NESTED_PIPELINE)
+    parser = ConfigParser(config=config)
+    _ = parser.parse_pipeline(
+        pipeline_nested_key="some.field.nst-pipeline"
+    )
+
+
+def test_dynamic_override_parser_pipeline_dict():
+    """Parse a pipeline from config file,
+    and verify that dynamic override works
+    in a pipeline composed of a dict of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_DICT_YAML)
+
+    override_keys = {
+        "my-dict-pipeline.init_args.steps.preproc-step.init_args.max_items": 33
+    }
+    parser = ConfigParser(config=config, override_keys=override_keys)
+    pipe = parser.parse_pipeline(
+        pipeline_nested_key="my-dict-pipeline"
+    )
+    assert pipe.steps['preproc-step'].max_items == 33
+
+
+def test_dynamic_override_parser_pipeline_list():
+    """Parse a pipeline from config file,
+    and verify that dynamic override works
+    in a pipeline composed of a list of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_LIST_YAML)
+
+    override_keys = {
+        "my-list-pipeline.init_args.steps.0.init_args.max_items": 42
+    }
+    parser = ConfigParser(config=config, override_keys=override_keys)
+    pipe = parser.parse_pipeline(
+        pipeline_nested_key="my-list-pipeline"
+    )
+    assert pipe.steps[0].max_items == 42
diff --git a/tests/components/test_pipeline.py b/tests/components/test_pipeline.py
new file mode 100644
index 00000000..b07376da
--- /dev/null
+++ b/tests/components/test_pipeline.py
@@ -0,0 +1,58 @@
+import yaml
+import pytest
+
+from itwinai.pipeline import Pipeline
+from itwinai.parser import ConfigParser
+
+
+def test_slice_into_sub_pipelines():
+    """Test slicing the pipeline to obtain
+    sub-pipelines as Pipeline objects.
+ """ + p = Pipeline(['step1', 'step2', 'step3']) + sub_pipe1, sub_pipe2 = p[:1], p[1:] + assert isinstance(sub_pipe1, Pipeline) + assert isinstance(sub_pipe2, Pipeline) + assert len(sub_pipe1) == 1 + assert sub_pipe1[0] == "step1" + assert len(sub_pipe2) == 2 + + p = Pipeline(dict(step1="step1", step2="step2", step3="step3")) + sub_pipe1, sub_pipe2 = p[:1], p[1:] + assert isinstance(sub_pipe1, Pipeline) + assert isinstance(sub_pipe2, Pipeline) + assert len(sub_pipe1) == 1 + assert sub_pipe1["step1"] == "step1" + assert len(sub_pipe2) == 2 + + +def test_serialization_pipe_list(): + """Test dict serialization of pipeline + defined as list of BaseComponent objects. + """ + config = yaml.safe_load(pytest.PIPE_LIST_YAML) + parser = ConfigParser(config=config) + pipe = parser.parse_pipeline( + pipeline_nested_key="my-list-pipeline" + ) + + dict_pipe = pipe.to_dict() + del dict_pipe['init_args']['name'] + dict_pipe = {"my-list-pipeline": dict_pipe} + assert dict_pipe == config + + +def test_serialization_pipe_dict(): + """Test dict serialization of pipeline + defined as dict of BaseComponent objects. + """ + config = yaml.safe_load(pytest.PIPE_DICT_YAML) + parser = ConfigParser(config=config) + pipe = parser.parse_pipeline( + pipeline_nested_key="my-dict-pipeline" + ) + + dict_pipe = pipe.to_dict() + del dict_pipe['init_args']['name'] + dict_pipe = {"my-dict-pipeline": dict_pipe} + assert dict_pipe == config diff --git a/tests/test_components.py b/tests/test_components.py deleted file mode 100644 index f51c483b..00000000 --- a/tests/test_components.py +++ /dev/null @@ -1,9 +0,0 @@ -from itwinai.components import Pipeline - - -def test_slice(): - p = Pipeline(['step1', 'step2', 'step3'], pippo=2) - assert len(p[:1]) == 1 - assert p[:1][0] == 'step1' - assert len(p[1:]) == 2 - assert p[1:].constructor_args['pippo'] == 2 From cbd8ba5b908e21ce722039112156cbe4923475ef Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Mon, 11 Dec 2023 18:11:56 +0100 Subject: [PATCH 11/26] UPDATE docs --- src/itwinai/parser.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index 3cda509e..e577fbb3 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -21,7 +21,8 @@ def add_replace_field( value: Any ) -> None: """Replace or add (if not present) a field in a dictionary, following a - path of dot-separated keys. Inplace operation. + path of dot-separated keys. Adding is not supported for list items. + Inplace operation. Args: config (Dict): dictionary to be modified. key_chain (str): path of nested (dot-separated) keys to specify the @@ -51,32 +52,6 @@ def add_replace_field( sub_config[k] = value -# def add_replace_field( -# config: Dict, -# key_chain: str, -# value: Any -# ) -> None: -# """Replace or add (if not present) a field in a dictionary, following a -# path of dot-separated keys. Inplace operation. -# Args: -# config (Dict): dictionary to be modified. -# key_chain (str): path of nested (dot-separated) keys to specify the -# location -# of the new value (e.g., 'foo.bar.line' adds/overwrites the value -# located at config['foo']['bar']['line']). -# value (Any): the value to insert. 
-#     """
-#     sub_config = config
-#     for idx, k in enumerate(key_chain.split('.')):
-#         if idx >= len(key_chain.split('.')) - 1:
-#             # Last key reached
-#             break
-#         if not isinstance(sub_config.get(k), dict):
-#             sub_config[k] = dict()
-#         sub_config = sub_config[k]
-#     sub_config[k] = value


From 464f5270da1bfaeafa2665b9558a2552ff88cdfb Mon Sep 17 00:00:00 2001
From: Matteo Bunino
Date: Mon, 11 Dec 2023 22:25:32 +0100
Subject: [PATCH 12/26] ADD adapter component and tests (incl parser)

---
 src/itwinai/components.py            | 69 ++++++++++++++++++++++++++-
 src/itwinai/parser.py                |  6 +--
 tests/components/test_components.py  | 57 +++++++++++++++++++++
 tests/components/test_pipe_parser.py | 76 +++++++++++++++++++++++++++-
 4 files changed, 202 insertions(+), 6 deletions(-)

diff --git a/src/itwinai/components.py b/src/itwinai/components.py
index 21fa6802..889083e0 100644
--- a/src/itwinai/components.py
+++ b/src/itwinai/components.py
@@ -84,7 +84,7 @@


 from __future__ import annotations
-from typing import Any, Optional, Tuple, Union, Callable, Dict
+from typing import Any, Optional, Tuple, Union, Callable, Dict, List
 from abc import ABC, abstractmethod
 import time
 import functools
@@ -299,3 +299,70 @@ def execute(self, artifact: MLArtifact) -> MLArtifact:
         Returns:
             MLArtifact: the same input artifact, after saving it.
         """
+
+
+class Adapter(BaseComponent):
+    """Connects two components in a sequential pipeline, allowing
+    finer control over how intermediate results are propagated
+    among the components.
+
+    Args:
+        policy (List[Any]): list with the same length as the output of this
+            component, describing how to map the input args to the output.
+        name (Optional[str], optional): name of the component.
+            Defaults to None.
+
+    The adapter allows the user to define a policy with which inputs are
+    re-arranged before being propagated to the next component.
+    Some examples: [policy]: (input) -> (output)
+    - ["INPUT_ARG#2", "INPUT_ARG#1", "INPUT_ARG#0"]: (11,22,33) -> (33,22,11)
+    - ["INPUT_ARG#0", "INPUT_ARG#2", None]: (11, 22, 33) -> (11, 33, None)
+    - []: (11, 22, 33) -> ()
+    - [42, "INPUT_ARG#2", "hello"]: (11,22,33,44,55) -> (42, 33, "hello")
+    - [None, 33, 3.14]: () -> (None, 33, 3.14)
+    - [None, 33, 3.14]: ("double", 44, None, True) -> (None, 33, 3.14)
+    """
+
+    policy: List[Any]
+    INPUT_PREFIX: str = "INPUT_ARG#"
+
+    def __init__(self, policy: List[Any], name: Optional[str] = None) -> None:
+        super().__init__(name)
+        self.save_parameters(policy=policy)
+
+    @monitor_exec
+    def execute(self, *args) -> Tuple:
+        """Produces an output tuple by arranging input arguments according
+        to the policy specified in the constructor.
+
+        Args:
+            args (Tuple): input arguments.
+
+        Returns:
+            Tuple: input args arranged according to some policy.
+        """
+        result = []
+        for itm in self.policy:
+            if isinstance(itm, str) and itm.startswith(self.INPUT_PREFIX):
+                arg_idx = int(itm[len(self.INPUT_PREFIX):])
+                if arg_idx >= len(args):
+                    max_idx = max(map(
+                        lambda itm: int(itm[len(self.INPUT_PREFIX):]),
+                        filter(
+                            lambda el: (
+                                isinstance(el, str)
+                                and el.startswith(self.INPUT_PREFIX)
+                            ),
+                            self.policy
+                        )))
+                    raise IndexError(
+                        f"The args received as input by '{self.name}' "
+                        "are not consistent with the given adapter policy "
+                        "because input args are too few! "
+                        f"Input args are {len(args)} but the policy foresees "
+                        f"at least {max_idx+1} items."
+                    )
+                result.append(args[arg_idx])
+            else:
+                result.append(itm)
+        return tuple(result)
diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py
index e577fbb3..42c4a480 100644
--- a/src/itwinai/parser.py
+++ b/src/itwinai/parser.py
@@ -160,7 +160,7 @@ def parse_pipeline(

     def parse_step(
         self,
-        step_name: str,
+        step_idx: Union[str, int],
         pipeline_nested_key: str = "pipeline",
         verbose: bool = False
     ) -> BaseComponent:
@@ -168,10 +168,10 @@ def parse_step(
         for key in pipeline_nested_key.split('.'):
             pipeline_dict = pipeline_dict[key]

-        step_dict_config = pipeline_dict['init_args']['steps'][step_name]
+        step_dict_config = pipeline_dict['init_args']['steps'][step_idx]

         if verbose:
-            print(f"STEP '{step_name}' CONFIG:")
+            print(f"STEP '{step_idx}' CONFIG:")
             print(json.dumps(step_dict_config, indent=4))

         # Wrap config under "step" field and parse it
diff --git a/tests/components/test_components.py b/tests/components/test_components.py
index e69de29b..6a0a86ce 100644
--- a/tests/components/test_components.py
+++ b/tests/components/test_components.py
@@ -0,0 +1,57 @@
+import pytest
+
+from itwinai.components import Adapter
+
+
+def test_adapter():
+    """Test Adapter component."""
+    prefix = Adapter.INPUT_PREFIX
+    adapter = Adapter(
+        policy=[f"{prefix}{3-i}" for i in range(4)]
+    )
+    result = adapter.execute(0, 1, 2, 3)
+    assert result == (3, 2, 1, 0)
+
+    result = adapter.execute(*tuple(range(100)))
+    assert result == (3, 2, 1, 0)
+
+    adapter = Adapter(
+        policy=[f"{prefix}0" for i in range(4)]
+    )
+    result = adapter.execute(0, 1, 2, 3)
+    assert result == (0, 0, 0, 0)
+
+    adapter = Adapter(
+        policy=[f"{prefix}{i%2}" for i in range(4)]
+    )
+    result = adapter.execute(0, 1, 2, 3)
+    assert result == (0, 1, 0, 1)
+
+    adapter = Adapter(
+        policy=[f"{prefix}2", "hello", "world", 3.14]
+    )
+    result = adapter.execute(0, 1, 2, 3)
+    assert result == (2, "hello", "world", 3.14)
+
+    adapter = Adapter(
+        policy=[1, 3, 5, 7, 11]
+    )
+    result = adapter.execute(0, 1, 2, 3)
+    assert result == (1, 3, 5, 7, 11)
+
+    adapter = Adapter(
+        policy=[f"{prefix}{9-i}" for i in range(10)]
+    )
+    with pytest.raises(IndexError) as exc_info:
+        result = adapter.execute(0, 1)
+    assert str(exc_info.value) == (
+        "The args received as input by 'Adapter' are not consistent with "
+        "the given adapter policy because input args are too few! Input "
+        "args are 2 but the policy foresees at least 10 items."
+    )
+
+    adapter = Adapter(
+        policy=[]
+    )
+    result = adapter.execute(*tuple(range(100)))
+    assert result == ()
diff --git a/tests/components/test_pipe_parser.py b/tests/components/test_pipe_parser.py
index 293461c5..f26d105d 100644
--- a/tests/components/test_pipe_parser.py
+++ b/tests/components/test_pipe_parser.py
@@ -3,6 +3,7 @@

 from itwinai.components import BaseComponent
 from itwinai.parser import ConfigParser, add_replace_field
+from itwinai.tests import FakeTrainer, FakePreproc, FakeSaver


 def test_add_replace_field():
@@ -27,8 +28,6 @@ def test_add_replace_field():
     ))
     assert conf == target4

-    print('-'*55)
-
     add_replace_field(conf, "some.list.0.some.el", 7)
     target5 = dict(some=dict(
         key=dict(chain=222, field=333),
@@ -142,3 +141,76 @@ def test_dynamic_override_parser_pipeline_list():
         pipeline_nested_key="my-list-pipeline"
     )
     assert pipe.steps[0].max_items == 42
+
+
+def test_parse_step_list_pipeline():
+    """Parse a pipeline step from config file,
+    where the pipeline is defined as a list of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_LIST_YAML)
+    parser = ConfigParser(config=config)
+    step = parser.parse_step(
+        step_idx=1,
+        pipeline_nested_key="my-list-pipeline"
+    )
+
+    assert isinstance(step, BaseComponent)
+    assert isinstance(step, FakeTrainer)
+
+    with pytest.raises(IndexError):
+        _ = parser.parse_step(
+            step_idx=12,
+            pipeline_nested_key="my-list-pipeline"
+        )
+    with pytest.raises(TypeError):
+        _ = parser.parse_step(
+            step_idx='my-step-name',
+            pipeline_nested_key="my-list-pipeline"
+        )
+
+
+def test_parse_step_dict_pipeline():
+    """Parse a pipeline step from config file,
+    where the pipeline is defined as a dict of components.
+    """
+    config = yaml.safe_load(pytest.PIPE_DICT_YAML)
+    parser = ConfigParser(config=config)
+    step = parser.parse_step(
+        step_idx='preproc-step',
+        pipeline_nested_key="my-dict-pipeline"
+    )
+
+    assert isinstance(step, BaseComponent)
+    assert isinstance(step, FakePreproc)
+
+    with pytest.raises(KeyError):
+        _ = parser.parse_step(
+            step_idx='unk-step',
+            pipeline_nested_key="my-dict-pipeline"
+        )
+    with pytest.raises(KeyError):
+        _ = parser.parse_step(
+            step_idx=0,
+            pipeline_nested_key="my-dict-pipeline"
+        )
+
+
+def test_parse_step_nested_pipeline():
+    """Parse a pipeline step from config file,
+    where the pipeline is nested under some field.
+    """
+    config = yaml.safe_load(pytest.NESTED_PIPELINE)
+    parser = ConfigParser(config=config)
+    step = parser.parse_step(
+        step_idx=2,
+        pipeline_nested_key="some.field.nst-pipeline"
+    )
+
+    assert isinstance(step, BaseComponent)
+    assert isinstance(step, FakeSaver)
+
+    with pytest.raises(KeyError):
+        _ = parser.parse_step(
+            step_idx=2,
+            pipeline_nested_key="my-pipeline"
+        )

From 0ce72363ed2f06543a6ae8338e8ecdf1020b9676 Mon Sep 17 00:00:00 2001
From: Matteo Bunino
Date: Tue, 12 Dec 2023 10:34:05 +0100
Subject: [PATCH 13/26] ADD splitter component, improve pipeline, tests

---
 src/itwinai/components.py             | 39 ++++++++++++-
 src/itwinai/pipeline.py               | 29 +++++-----
 src/itwinai/tests/dummy_components.py | 58 ++++++++++++++++++-
 src/itwinai/utils.py                  | 60 ++++++++++++++++++-
 tests/components/test_components.py   | 20 +++++++
 tests/components/test_pipeline.py     | 25 ++++++++
 tests/test_utils.py                   | 83 ++++++++++++++++++++++++++-
 7 files changed, 296 insertions(+), 18 deletions(-)

diff --git a/src/itwinai/components.py b/src/itwinai/components.py
index 889083e0..49aea905 100644
--- a/src/itwinai/components.py
+++ b/src/itwinai/components.py
@@ -327,8 +327,8 @@ class Adapter(BaseComponent):
     INPUT_PREFIX: str = "INPUT_ARG#"

     def __init__(self, policy: List[Any], name: Optional[str] = None) -> None:
-        super().__init__(name)
-        self.save_parameters(policy=policy)
+        super().__init__(name=name)
+        self.save_parameters(policy=policy, name=name)

     @monitor_exec
     def execute(self, *args) -> Tuple:
@@ -366,3 +366,38 @@ def execute(self, *args) -> Tuple:
         else:
             result.append(itm)
         return tuple(result)
+
+
+class DataSplitter(BaseComponent):
+    """Splits a dataset into train, validation, and test splits."""
+
+    def __init__(
+        self,
+        train_proportion: float,
+        validation_proportion: float,
+        test_proportion: float,
+        name: Optional[str] = None
+    ) -> None:
+        super().__init__(name)
+        self.save_parameters(
+            train_proportion=train_proportion,
+            validation_proportion=validation_proportion,
+            test_proportion=test_proportion,
+            name=name
+        )
+
+    @abstractmethod
+    @monitor_exec
+    def execute(
+        self,
+        dataset: MLDataset
+    ) -> Tuple[MLDataset, MLDataset, MLDataset]:
+        """Splits a dataset into train, validation and test splits.
+ + Args: + dataset (MLDataset): input dataset. + + Returns: + Tuple[MLDataset, MLDataset, MLDataset]: tuple of + train, validation and test splits. + """ diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py index acc9a3ee..92cfb8eb 100644 --- a/src/itwinai/pipeline.py +++ b/src/itwinai/pipeline.py @@ -21,8 +21,9 @@ """ from __future__ import annotations from typing import Iterable, Dict, Any, Tuple, Union, Optional -import inspect + from .components import BaseComponent, monitor_exec +from .utils import SignatureInspector class Pipeline(BaseComponent): @@ -100,16 +101,18 @@ def validate_args(input_args: Tuple, component: BaseComponent): Raises: RuntimeError: in case of args mismatch. """ - comp_params = inspect.signature(component.execute).parameters.items() - non_default_par = list(filter( - lambda p: p[0] != 'self' and p[1].default == inspect._empty, - comp_params - )) - non_default_par_names = list(map(lambda p: p[0], non_default_par)) - if len(non_default_par) != len(input_args): - raise RuntimeError( - "Mismatch into the number of non-default parameters " - f"of execute method of '{component.name}' component " - f"({non_default_par_names}), and the number of arguments " - f"it received as input: {input_args}." + inspector = SignatureInspector(component.execute) + if inspector.min_params_num > len(input_args): + raise TypeError( + f"Component '{component.name}' received too few " + f"input arguments: {input_args}. Expected at least " + f"{inspector.min_params_num}, with names: " + f"{inspector.required_params}." + ) + if (inspector.max_params_num != inspector.INFTY + and len(input_args) > inspector.max_params_num): + raise TypeError( + f"Component '{component.name}' received too many " + f"input arguments: {input_args}. Expected at most " + f"{inspector.max_params_num}." ) diff --git a/src/itwinai/tests/dummy_components.py b/src/itwinai/tests/dummy_components.py index 1daa4c08..6f28afbe 100644 --- a/src/itwinai/tests/dummy_components.py +++ b/src/itwinai/tests/dummy_components.py @@ -1,5 +1,41 @@ from typing import Optional -from ..components import BaseComponent +from ..components import BaseComponent, monitor_exec + + +class FakeGetter(BaseComponent): + def __init__(self, data_uri: str, name: Optional[str] = None + ) -> None: + super().__init__(name) + self.save_parameters(data_uri=data_uri, name=name) + + def execute(self): + ... + + +class FakeGetterExec(FakeGetter): + result: str = "dataset" + + @monitor_exec + def execute(self): + return self.result + + +class FakeSplitter(BaseComponent): + def __init__(self, train_prop: float, name: Optional[str] = None + ) -> None: + super().__init__(name) + self.save_parameters(train_prop=train_prop, name=name) + + def execute(self): + ... + + +class FakeSplitterExec(FakeSplitter): + result: tuple = ("train_dataset", "val_dataset", "test_dataset") + + @monitor_exec + def execute(self, dataset): + return self.result class FakePreproc(BaseComponent): @@ -12,6 +48,12 @@ def execute(self): ... +class FakePreprocExec(FakePreproc): + @monitor_exec + def execute(self, train_dataset, val_dataset, test_dataset): + return train_dataset, val_dataset, test_dataset + + class FakeTrainer(BaseComponent): def __init__(self, lr: float, batch_size: int, name: Optional[str] = None ) -> None: @@ -22,6 +64,14 @@ def execute(self): ... 
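+# Like the *Exec variants above, the following dummies override `execute`
+# (wrapped by monitor_exec) to return canned values, so that tests can assert
+# how a Pipeline routes arguments from one step to the next.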
+class FakeTrainerExec(FakeTrainer): + model: str = "trained_model" + + @monitor_exec + def execute(self, train_dataset, val_dataset, test_dataset): + return train_dataset, val_dataset, test_dataset, self.model + + class FakeSaver(BaseComponent): def __init__(self, save_path: str, name: Optional[str] = None) -> None: super().__init__(name) @@ -29,3 +79,9 @@ def __init__(self, save_path: str, name: Optional[str] = None) -> None: def execute(self): ... + + +class FakeSaverExec(FakeSaver): + @monitor_exec + def execute(self, artifact): + return artifact diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index cbd527f9..52279aeb 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -1,8 +1,10 @@ """ Utilities for itwinai package. """ -from typing import Dict, Type +from typing import Dict, Type, Callable, Tuple import os +import sys +import inspect from collections.abc import MutableMapping import yaml from omegaconf import OmegaConf @@ -123,3 +125,59 @@ def parse_pipe_config(yaml_file, parser): raise exc return parser.parse_object(config) + + +class SignatureInspector: + """Provides the functionalities to inspect the signature of a function + or a method. + + Args: + func (Callable): function to be inspected. + """ + + INFTY: int = sys.maxsize + + def __init__(self, func: Callable) -> None: + self.func = func + self.func_params = inspect.signature(func).parameters.items() + + @property + def has_varargs(self) -> bool: + """Checks if the function has ``*args`` parameter.""" + return any(map( + lambda p: p[1].kind == p[1].VAR_POSITIONAL, + self.func_params + )) + + @property + def has_kwargs(self) -> bool: + """Checks if the function has ``**kwargs`` parameter.""" + return any(map( + lambda p: p[1].kind == p[1].VAR_KEYWORD, + self.func_params + )) + + @property + def required_params(self) -> Tuple[str]: + """Names of required parameters. Class method's 'self' is skipped.""" + required_params = list(filter( + lambda p: (p[0] != 'self' and p[1].default == inspect._empty + and p[1].kind != p[1].VAR_POSITIONAL + and p[1].kind != p[1].VAR_KEYWORD), + self.func_params + )) + return tuple(map(lambda p: p[0], required_params)) + + @property + def min_params_num(self) -> int: + """Minimum number of arguments required.""" + return len(self.required_params) + + @property + def max_params_num(self) -> int: + """Max number of supported input arguments. + If no limit, ``SignatureInspector.INFTY`` is returned. + """ + if self.has_kwargs or self.has_varargs: + return self.INFTY + return len(self.func_params) diff --git a/tests/components/test_components.py b/tests/components/test_components.py index 6a0a86ce..3f4b7874 100644 --- a/tests/components/test_components.py +++ b/tests/components/test_components.py @@ -1,6 +1,10 @@ import pytest from itwinai.components import Adapter +from itwinai.pipeline import Pipeline +from itwinai.tests import ( + FakeGetterExec, FakeSplitterExec, FakeTrainerExec, FakeSaverExec +) def test_adapter(): @@ -55,3 +59,19 @@ def test_adapter(): ) result = adapter.execute(*tuple(range(100))) assert result == () + + +@pytest.mark.integration +def test_adapter_integration_pipeline(): + """Test integration of Adapter component in the pipeline, + connecting other components. 
+ """ + pipeline = Pipeline([ + FakeGetterExec(data_uri='http://...'), + FakeSplitterExec(train_prop=.7), + FakeTrainerExec(lr=1e-3, batch_size=32), + Adapter(policy=[f"{Adapter.INPUT_PREFIX}-1"]), + FakeSaverExec(save_path="my_model.pth") + ]) + saved_model = pipeline.execute() + assert saved_model == FakeTrainerExec.model diff --git a/tests/components/test_pipeline.py b/tests/components/test_pipeline.py index b07376da..7710da59 100644 --- a/tests/components/test_pipeline.py +++ b/tests/components/test_pipeline.py @@ -3,6 +3,9 @@ from itwinai.pipeline import Pipeline from itwinai.parser import ConfigParser +from itwinai.tests import ( + FakeGetterExec, FakeSplitterExec, FakeTrainerExec, FakeSaverExec +) def test_slice_into_sub_pipelines(): @@ -56,3 +59,25 @@ def test_serialization_pipe_dict(): del dict_pipe['init_args']['name'] dict_pipe = {"my-dict-pipeline": dict_pipe} assert dict_pipe == config + + +def test_arguments_mismatch(): + """Test mismatch of arguments passed among components in a pipeline.""" + pipeline = Pipeline([ + FakeGetterExec(data_uri='http://...'), + FakeSplitterExec(train_prop=.7), + FakeTrainerExec(lr=1e-3, batch_size=32), + # Adapter(policy=[f"{Adapter.INPUT_PREFIX}-1"]), + FakeSaverExec(save_path="my_model.pth") + ]) + # Too many arguments for saver + with pytest.raises(TypeError): + _ = pipeline.execute() + + pipeline = Pipeline([ + FakeGetterExec(data_uri='http://...'), + FakeTrainerExec(lr=1e-3, batch_size=32), + ]) + # Too few arguments for trainer + with pytest.raises(TypeError): + _ = pipeline.execute() diff --git a/tests/test_utils.py b/tests/test_utils.py index bbeb61fa..5fb7b936 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ Tests for itwinai.utils module. """ -from itwinai.utils import flatten_dict +from itwinai.utils import flatten_dict, SignatureInspector def test_flatten_dict(): @@ -16,3 +16,84 @@ def test_flatten_dict(): assert flattened.get("b.b1") == 2 assert flattened.get("b.b2") == 3 assert len(flattened) == 3 + + +def test_signature_inspector(): + """Test SignatureInspector class.""" + def f(): + ... + + inspector = SignatureInspector(f) + assert not inspector.has_varargs + assert not inspector.has_kwargs + assert inspector.required_params == () + assert inspector.min_params_num == 0 + assert inspector.max_params_num == 0 + + def f(*args): + ... + + inspector = SignatureInspector(f) + assert inspector.has_varargs + assert not inspector.has_kwargs + assert inspector.required_params == () + assert inspector.min_params_num == 0 + assert inspector.max_params_num == SignatureInspector.INFTY + + def f(foo, *args): + ... + + inspector = SignatureInspector(f) + assert inspector.has_varargs + assert not inspector.has_kwargs + assert inspector.required_params == ("foo",) + assert inspector.min_params_num == 1 + assert inspector.max_params_num == SignatureInspector.INFTY + + def f(foo, bar=123): + ... + + inspector = SignatureInspector(f) + assert not inspector.has_varargs + assert not inspector.has_kwargs + assert inspector.required_params == ("foo",) + assert inspector.min_params_num == 1 + assert inspector.max_params_num == 2 + + def f(foo, *args, bar=123): + ... + + inspector = SignatureInspector(f) + assert inspector.has_varargs + assert not inspector.has_kwargs + assert inspector.required_params == ("foo",) + assert inspector.min_params_num == 1 + assert inspector.max_params_num == SignatureInspector.INFTY + + def f(*args, **kwargs): + ... 
+ + inspector = SignatureInspector(f) + assert inspector.has_varargs + assert inspector.has_kwargs + assert inspector.required_params == () + assert inspector.min_params_num == 0 + assert inspector.max_params_num == SignatureInspector.INFTY + + def f(foo, /, bar, *arg, **kwargs): + ... + inspector = SignatureInspector(f) + assert inspector.has_varargs + assert inspector.has_kwargs + assert inspector.required_params == ("foo", "bar") + assert inspector.min_params_num == 2 + assert inspector.max_params_num == SignatureInspector.INFTY + + def f(foo, /, bar, *, hello, **kwargs): + ... + inspector = SignatureInspector(f) + assert not inspector.has_varargs + assert inspector.has_kwargs + assert inspector.required_params == ("foo", "bar", "hello") + assert inspector.min_params_num == 3 + assert inspector.max_params_num == SignatureInspector.INFTY From 0a20b8291f56b007f1bcf996581f80f48310050d Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 10:36:54 +0100 Subject: [PATCH 14/26] UPDATE test --- tests/components/test_pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/components/test_pipeline.py b/tests/components/test_pipeline.py index 7710da59..a61198b6 100644 --- a/tests/components/test_pipeline.py +++ b/tests/components/test_pipeline.py @@ -70,14 +70,14 @@ def test_arguments_mismatch(): # Adapter(policy=[f"{Adapter.INPUT_PREFIX}-1"]), FakeSaverExec(save_path="my_model.pth") ]) - # Too many arguments for saver - with pytest.raises(TypeError): + with pytest.raises(TypeError) as exc_info: _ = pipeline.execute() + assert "received too many input arguments" in str(exc_info.value) pipeline = Pipeline([ FakeGetterExec(data_uri='http://...'), FakeTrainerExec(lr=1e-3, batch_size=32), ]) - # Too few arguments for trainer - with pytest.raises(TypeError): + with pytest.raises(TypeError) as exc_info: _ = pipeline.execute() + assert "received too few input arguments" in str(exc_info.value) From 60d9c8ac6f2c85a6586b072197830ca8278aae2b Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 11:05:38 +0100 Subject: [PATCH 15/26] REMOVE todos --- src/itwinai/pipeline.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/itwinai/pipeline.py b/src/itwinai/pipeline.py index 92cfb8eb..1391bfef 100644 --- a/src/itwinai/pipeline.py +++ b/src/itwinai/pipeline.py @@ -1,23 +1,6 @@ """ This module provides the functionalities to execute workflows defined in in form of pipelines. - -TODO: -- Define input and output for components, as in KubeFlow, so that it is -not ambiguous when creating a DAG how to split/merge outputs/inputs. -An alternative is to define additional splitter/merger blocks to manage -the routing of inputs/outputs: - ->>> class Router: ->>> ... ->>> class Splitter(Router): ->>> ... ->>> class Merger(Router): ->>> ... - -- Create a CLI parser allowing to execute pipelines directly from their -config file serialization, directly from CLI, with dynamic override of -fields, as done with Lightning CLI. 
""" from __future__ import annotations from typing import Iterable, Dict, Any, Tuple, Union, Optional From 9df7a7f56a21affc804a7b27432a4c10d4d82848 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 11:38:00 +0100 Subject: [PATCH 16/26] ADD component tests --- src/itwinai/serialization.py | 48 ++++++++++++----------- tests/components/test_components.py | 61 +++++++++++++++++++++++++++++ tests/use-cases/conftest.py | 1 - tutorials/ml-workflows/basic.py | 12 ++++++ 4 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 tutorials/ml-workflows/basic.py diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index 2b7de63d..9f7f6278 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -1,9 +1,12 @@ -from typing import Dict, Any +from typing import Dict, Any, Union import abc import json +import yaml +from pathlib import Path import inspect from .types import MLModel +from .utils import SignatureInspector def is_jsonable(x): @@ -64,15 +67,8 @@ def _validate_parameters(self) -> None: "constructor." ) - init_params = inspect.signature(self.__init__).parameters.items() - - # Check that all non-default parameters are in self.parameters - non_default_par = list(filter( - lambda p: p[0] != 'self' and p[1].default == inspect._empty, - init_params - )) - non_default_par_names = list(map(lambda p: p[0], non_default_par)) - for par_name in non_default_par_names: + init_inspector = SignatureInspector(self.__init__) + for par_name in init_inspector.required_params: if self.parameters.get(par_name) is None: raise SerializationError( f"Required parameter '{par_name}' of " @@ -83,18 +79,6 @@ def _validate_parameters(self) -> None: f"constructor, including also '{par_name}'." ) - # # Check that all params in self.parameters match with the signature - # init_par_nam = set(map(lambda x: x[0], init_params)) - # sav_par_nam = set(self.parameters.keys()) - # if len(init_par_nam.intersection(sav_par_nam)) != len(sav_par_nam): - # raise SerializationError( - # "Some parameters saved with " - # "'self.save_parameters(param_1=param_1, " - # "..., param_n=param_n)' " - # "Are unused not present in the constructor of " - # f"'{self.__class__.__name__}' class. Please remove them." - # ) - def _saved_constructor_parameters(self) -> Dict[str, Any]: """Extracts the current constructor parameters from all the saved parameters, as some of them may had been added by @@ -112,7 +96,7 @@ def _saved_constructor_parameters(self) -> Dict[str, Any]: } def _recursive_serialization(self, item: Any, item_name: str) -> Any: - if isinstance(item, (tuple, list)): + if isinstance(item, (tuple, list, set)): return [self._recursive_serialization(x, item_name) for x in item] elif isinstance(item, dict): return { @@ -131,6 +115,24 @@ def _recursive_serialization(self, item: Any, item_name: str) -> Any: "extend 'itwinai.serialization.Serializable' class." ) + def to_json(self, file_path: Union[str, Path]) -> None: + """Save a component to JSON file. + + Args: + file_path (Union[str, Path]): JSON file path. + """ + with open(file_path, "w") as fp: + json.dump(self.to_dict(), fp) + + def to_yaml(self, file_path: Union[str, Path]) -> None: + """Save a component to YAML file. + + Args: + file_path (Union[str, Path]): YAML file path. 
+        """
+        with open(file_path, "w") as fp:
+            yaml.dump(self.to_dict(), fp)
+

 class ModelLoader(abc.ABC, Serializable):
     """Loads a machine learning model from somewhere."""
diff --git a/tests/components/test_components.py b/tests/components/test_components.py
index 3f4b7874..91236389 100644
--- a/tests/components/test_components.py
+++ b/tests/components/test_components.py
@@ -5,6 +5,67 @@
 from itwinai.tests import (
     FakeGetterExec, FakeSplitterExec, FakeTrainerExec, FakeSaverExec
 )
+from itwinai.serialization import SerializationError
+
+
+def test_serializable():
+    """Test serialization of components."""
+    comp = FakeGetterExec(data_uri='http://...')
+    dict_serializ = comp.to_dict()
+    assert isinstance(dict_serializ, dict)
+    assert comp.name == "FakeGetterExec"
+    assert dict_serializ == dict(
+        class_path="itwinai.tests.dummy_components.FakeGetterExec",
+        init_args=dict(data_uri='http://...', name=None)
+    )
+
+    # List
+    comp = FakeGetterExec(data_uri=[1, 2, 3])
+    dict_serializ = comp.to_dict()
+    assert isinstance(dict_serializ, dict)
+    assert comp.name == "FakeGetterExec"
+    assert dict_serializ == dict(
+        class_path="itwinai.tests.dummy_components.FakeGetterExec",
+        init_args=dict(data_uri=[1, 2, 3], name=None)
+    )
+
+    # Tuple
+    comp = FakeGetterExec(data_uri=(1, 2, 3))
+    dict_serializ = comp.to_dict()
+    assert isinstance(dict_serializ, dict)
+    assert comp.name == "FakeGetterExec"
+    assert dict_serializ == dict(
+        class_path="itwinai.tests.dummy_components.FakeGetterExec",
+        init_args=dict(data_uri=[1, 2, 3], name=None)
+    )
+
+    # Set
+    comp = FakeGetterExec(data_uri={1, 2, 3})
+    dict_serializ = comp.to_dict()
+    assert isinstance(dict_serializ, dict)
+    assert comp.name == "FakeGetterExec"
+    assert dict_serializ == dict(
+        class_path="itwinai.tests.dummy_components.FakeGetterExec",
+        init_args=dict(data_uri=[1, 2, 3], name=None)
+    )
+
+    # Dict
+    comp = FakeGetterExec(data_uri=dict(foo=12, bar="123", hl=3.14))
+    dict_serializ = comp.to_dict()
+    assert isinstance(dict_serializ, dict)
+    assert comp.name == "FakeGetterExec"
+    assert dict_serializ == dict(
+        class_path="itwinai.tests.dummy_components.FakeGetterExec",
+        init_args=dict(data_uri=dict(foo=12, bar="123", hl=3.14), name=None)
+    )
+
+    # Non serializable obj
+    class NonSerializable:
+        ...
+
+    comp = FakeGetterExec(data_uri=NonSerializable())
+    with pytest.raises(SerializationError):
+        dict_serializ = comp.to_dict()


 def test_adapter():
diff --git a/tests/use-cases/conftest.py b/tests/use-cases/conftest.py
index c965799c..5c36e2ee 100644
--- a/tests/use-cases/conftest.py
+++ b/tests/use-cases/conftest.py
@@ -8,7 +8,6 @@
 FNAMES = [
     'pipeline.yaml',
     'startscript',
-    'train.py',
 ]


diff --git a/tutorials/ml-workflows/basic.py b/tutorials/ml-workflows/basic.py
new file mode 100644
index 00000000..e13830c8
--- /dev/null
+++ b/tutorials/ml-workflows/basic.py
@@ -0,0 +1,12 @@
+"""
+The simplest workflow that you can write is a sequential pipeline of steps,
+where the outputs of a component are fed as input to the following component,
+employing a scikit-learn-like Pipeline.
+
+This allows exporting the Pipeline from Python code to a configuration file, to
+persist both parameters and workflow structure. Exporting to a configuration file
+assumes that each component class resides in a separate Python file, so that
+the pipeline configuration is agnostic to the current Python script.
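+
+For instance, an exported configuration could look like this (a sketch; the
+component module and class names below are placeholders, while the overall
+layout follows the itwinai serializer):
+
+    pipeline:
+      class_path: itwinai.pipeline.Pipeline
+      init_args:
+        steps:
+        - class_path: my_components.MyDataGetter
+          init_args:
+            data_size: 100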
+
+Once the Pipeline has been exported to a configuration file
+"""

From d51ae900a3ce1bdc62de93f93a18137fbbb0833f Mon Sep 17 00:00:00 2001
From: Matteo Bunino 
Date: Tue, 12 Dec 2023 11:55:09 +0100
Subject: [PATCH 17/26] ADD serializer tests

---
 src/itwinai/serialization.py        | 13 ++++++++++++-
 tests/components/test_components.py | 23 +++++++++++++++++++++--
 tutorials/ml-workflows/basic.py     | 12 +++++++++++-
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py
index 9f7f6278..3e7c3650 100644
--- a/src/itwinai/serialization.py
+++ b/src/itwinai/serialization.py
@@ -51,7 +51,7 @@ def update_parameters(self, **kwargs) -> None:
     def to_dict(self) -> Dict:
         """Returns a dict serialization of the current object."""
         self._validate_parameters()
-        class_path = fullname(self)
+        class_path = self._get_class_path()
         init_args = dict()
         for par_name, par in self._saved_constructor_parameters().items():
             init_args[par_name] = self._recursive_serialization(par, par_name)
@@ -79,6 +79,17 @@ def _validate_parameters(self) -> None:
                     f"constructor, including also '{par_name}'."
                 )

+    def _get_class_path(self) -> str:
+        class_path = fullname(self)
+        if "<locals>" in class_path:
+            raise SerializationError(
+                f"{self.__class__.__name__} is "
+                "defined locally, which is not supported for serialization. "
+                "Move the class to a separate Python file and import it "
+                "from there."
+            )
+        return class_path
+
     def _saved_constructor_parameters(self) -> Dict[str, Any]:
         """Extracts the current constructor parameters from all
         the saved parameters, as some of them may have been added by
diff --git a/tests/components/test_components.py b/tests/components/test_components.py
index 91236389..0f101868 100644
--- a/tests/components/test_components.py
+++ b/tests/components/test_components.py
@@ -1,6 +1,7 @@
+from typing import Optional
 import pytest

-from itwinai.components import Adapter
+from itwinai.components import Trainer, Adapter
 from itwinai.pipeline import Pipeline
 from itwinai.tests import (
     FakeGetterExec, FakeSplitterExec, FakeTrainerExec, FakeSaverExec
@@ -64,8 +65,26 @@ class NonSerializable:
         ...

     comp = FakeGetterExec(data_uri=NonSerializable())
-    with pytest.raises(SerializationError):
+    with pytest.raises(SerializationError) as exc_info:
         dict_serializ = comp.to_dict()
+    assert ("is not a Python built-in type and it does not extend"
+            in str(exc_info.value))
+
+    # Local component class
+    class MyTrainer(Trainer):
+        def execute(self):
+            ...
+
+        def save_state(self):
+            ...
+
+        def load_state(self):
+            ...

+    comp = MyTrainer()
+    with pytest.raises(SerializationError) as exc_info:
+        dict_serializ = comp.to_dict()
+    assert ("is defined locally, which is not supported for serialization."
+            in str(exc_info.value))


 def test_adapter():
diff --git a/tutorials/ml-workflows/basic.py b/tutorials/ml-workflows/basic.py
index e13830c8..1480d455 100644
--- a/tutorials/ml-workflows/basic.py
+++ b/tutorials/ml-workflows/basic.py
@@ -8,5 +8,15 @@
 assumes that each component class resides in a separate Python file, so that
 the pipeline configuration is agnostic to the current Python script.

-Once the Pipeline has been exported to a configuration file
+Once the Pipeline has been exported to a configuration file (YAML), it can
+be executed directly from CLI:
+
+>>> itwinai exec-pipeline --config my-pipeline.yaml --override nested.key=42
+
+The itwinai CLI allows for dynamic override of configuration fields, by means
+of nested key notation.
Also list indices are supported: + +>>> itwinai exec-pipeline --config my-pipe.yaml --override nested.list.2.0=42 + """ +from itwinai \ No newline at end of file From 4f5522c607bd840d6df5428749fb38a01f132c8d Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 11:55:37 +0100 Subject: [PATCH 18/26] FIX linter --- tests/components/test_components.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/components/test_components.py b/tests/components/test_components.py index 0f101868..364b4917 100644 --- a/tests/components/test_components.py +++ b/tests/components/test_components.py @@ -1,4 +1,3 @@ -from typing import Optional import pytest from itwinai.components import Trainer, Adapter From 041c41b3d53009dde17b187d6ce09c357a16d1b7 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 14:50:45 +0100 Subject: [PATCH 19/26] ADD basic workflow tutorial --- src/itwinai/components.py | 3 + src/itwinai/serialization.py | 10 ++-- tutorials/ml-workflows/advanced_workflow.py | 0 tutorials/ml-workflows/basic.py | 22 -------- tutorials/ml-workflows/basic_components.py | 53 ++++++++++++++++++ tutorials/ml-workflows/basic_workflow.py | 62 +++++++++++++++++++++ 6 files changed, 124 insertions(+), 26 deletions(-) create mode 100644 tutorials/ml-workflows/advanced_workflow.py delete mode 100644 tutorials/ml-workflows/basic.py create mode 100644 tutorials/ml-workflows/basic_components.py create mode 100644 tutorials/ml-workflows/basic_workflow.py diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 49aea905..053e97a1 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -370,6 +370,9 @@ def execute(self, *args) -> Tuple: class DataSplitter(BaseComponent): """Splits a dataset into train, validation, and test splits.""" + train_proportion: float + validation_proportion: float + test_proportion: float def __init__( self, diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index 3e7c3650..d7269350 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -126,23 +126,25 @@ def _recursive_serialization(self, item: Any, item_name: str) -> Any: "extend 'itwinai.serialization.Serializable' class." ) - def to_json(self, file_path: Union[str, Path]) -> None: + def to_json(self, file_path: Union[str, Path], nested_key: str) -> None: """Save a component to JSON file. Args: file_path (Union[str, Path]): JSON file path. + nested_key (str): root field containing the serialized object. """ with open(file_path, "w") as fp: - json.dump(self.to_dict(), fp) + json.dump({nested_key: self.to_dict()}, fp) - def to_yaml(self, file_path: Union[str, Path]) -> None: + def to_yaml(self, file_path: Union[str, Path], nested_key: str) -> None: """Save a component to YAML file. Args: file_path (Union[str, Path]): YAML file path. + nested_key (str): root field containing the serialized object. 
""" with open(file_path, "w") as fp: - yaml.dump(self.to_dict(), fp) + yaml.dump({nested_key: self.to_dict()}, fp) class ModelLoader(abc.ABC, Serializable): diff --git a/tutorials/ml-workflows/advanced_workflow.py b/tutorials/ml-workflows/advanced_workflow.py new file mode 100644 index 00000000..e69de29b diff --git a/tutorials/ml-workflows/basic.py b/tutorials/ml-workflows/basic.py deleted file mode 100644 index 1480d455..00000000 --- a/tutorials/ml-workflows/basic.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -The most simple workflow that you can write is a sequential pipeline of steps, -where the outputs of a component are fed as input to the following component, -employing a scikit-learn-like Pipeline. - -This allows to export the Pipeline form Python code to configuration file, to -persist both parameters and workflow structure. Exporting to configuration file -assumes that each component class resides in a separate python file, so that -the pipeline configuration is agnostic from the current python script. - -Once the Pipeline has been exported to configuration file (YAML), it can -be executed directly from CLI: - ->>> itwinai exec-pipeline --config my-pipeline.yaml --override nested.key=42 - -The itwinai CLI allows for dynamic override of configuration fields, by means -of nested key notation. Also list indices are supported: - ->>> itwinai exec-pipeline --config my-pipe.yaml --override nested.list.2.0=42 - -""" -from itwinai \ No newline at end of file diff --git a/tutorials/ml-workflows/basic_components.py b/tutorials/ml-workflows/basic_components.py new file mode 100644 index 00000000..5198a53e --- /dev/null +++ b/tutorials/ml-workflows/basic_components.py @@ -0,0 +1,53 @@ +""" +Here we show how to implement component interfaces in a simple way. +""" +from typing import List, Optional, Tuple +from itwinai.components import ( + DataGetter, DataSplitter, Trainer, monitor_exec +) + + +class MyDataGetter(DataGetter): + def __init__(self, data_size: int, name: Optional[str] = None) -> None: + super().__init__(name) + self.save_parameters(data_size=data_size) + + @monitor_exec + def execute(self) -> List[int]: + """Return a list dataset. + + Returns: + List[int]: dataset + """ + return list(range(self.data_size)) + + +class MyDatasetSplitter(DataSplitter): + @monitor_exec + def execute( + self, + dataset: List[int] + ) -> Tuple[List[int], List[int], List[int]]: + train_n = int(len(dataset)*self.train_proportion) + valid_n = int(len(dataset)*self.validation_proportion) + train_set = dataset[:train_n] + vaild_set = dataset[train_n:train_n+valid_n] + test_set = dataset[train_n+valid_n:] + return train_set, vaild_set, test_set + + +class MyTrainer(Trainer): + @monitor_exec + def execute( + self, + train_set: List[int], + vaild_set: List[int], + test_set: List[int] + ) -> Tuple[List[int], List[int], List[int], str]: + return train_set, vaild_set, test_set, "my_trained_model" + + def save_state(self): + return super().save_state() + + def load_state(self): + return super().load_state() diff --git a/tutorials/ml-workflows/basic_workflow.py b/tutorials/ml-workflows/basic_workflow.py new file mode 100644 index 00000000..072a40a0 --- /dev/null +++ b/tutorials/ml-workflows/basic_workflow.py @@ -0,0 +1,62 @@ +""" +The most simple workflow that you can write is a sequential pipeline of steps, +where the outputs of a component are fed as input to the following component, +employing a scikit-learn-like Pipeline. 
+
+This allows exporting the Pipeline from Python code to a configuration file, to
+persist both parameters and workflow structure. Exporting to a configuration file
+assumes that each component class resides in a separate Python file, so that
+the pipeline configuration is agnostic to the current Python script.
+
+Once the Pipeline has been exported to a configuration file (YAML), it can
+be executed directly from CLI:
+
+>>> itwinai exec-pipeline --config my-pipeline.yaml --override nested.key=42
+
+The itwinai CLI allows for dynamic override of configuration fields, by means
+of nested key notation. Also list indices are supported:
+
+>>> itwinai exec-pipeline --config my-pipe.yaml --override nested.list.2.0=42
+
+"""
+from itwinai.pipeline import Pipeline
+from itwinai.parser import ConfigParser
+
+from basic_components import MyDataGetter, MyDatasetSplitter, MyTrainer
+
+pipeline = Pipeline([
+    MyDataGetter(data_size=100),
+    MyDatasetSplitter(
+        train_proportion=.5, validation_proportion=.25, test_proportion=0.25
+    ),
+    MyTrainer()
+])
+
+# Run pipeline
+_, _, _, trained_model = pipeline.execute()
+print("Trained model: ", trained_model)
+print("\n" + "="*50 + "\n")
+
+# Serialize pipeline to YAML
+pipeline.to_yaml("basic_pipeline_example.yaml", "pipeline")
+
+# Below, we show how to run a pre-existing pipeline stored as
+# a configuration file, with the possibility of dynamically
+# override some fields
+
+# Load pipeline from saved YAML (dynamic serialization)
+parser = ConfigParser(
+    config="basic_pipeline_example.yaml",
+    override_keys={
+        "pipeline.init_args.steps.0.init_args.data_size": 200
+    }
+)
+pipeline = parser.parse_pipeline()
+print(f"MyDataGetter's data_size is now: {pipeline.steps[0].data_size}\n")
+
+# Run parsed pipeline, with new data_size for MyDataGetter
+_, _, _, trained_model = pipeline.execute()
+print("Trained model (2): ", trained_model)
+
+# Save new pipeline we YAML file
+pipeline.to_yaml("basic_pipeline_example_v2.yaml", "pipeline")

From 331efac4354a024cbfdb8a82a1f7091109ec5f8d Mon Sep 17 00:00:00 2001
From: Matteo Bunino 
Date: Tue, 12 Dec 2023 15:49:40 +0100
Subject: [PATCH 20/26] ADD basic intermediate tutorial

---
 tutorials/ml-workflows/advanced_workflow.py   |  0
 tutorials/ml-workflows/basic_components.py    | 20 ++++++
 .../ml-workflows/basic_pipeline_example.yaml  | 18 +++++
 .../basic_pipeline_example_v2.yaml            | 18 +++++
 .../ml-workflows/tutorial_0_basic_workflow.py | 66 +++++++++++++++
 ...py => tutorial_1_intermediate_workflow.py} | 36 +++++---
 .../tutorial_2_advanced_workflow.py           |  7 ++
 7 files changed, 157 insertions(+), 8 deletions(-)
 delete mode 100644 tutorials/ml-workflows/advanced_workflow.py
 create mode 100644 tutorials/ml-workflows/basic_pipeline_example.yaml
 create mode 100644 tutorials/ml-workflows/basic_pipeline_example_v2.yaml
 create mode 100644 tutorials/ml-workflows/tutorial_0_basic_workflow.py
 rename tutorials/ml-workflows/{basic_workflow.py => tutorial_1_intermediate_workflow.py} (53%)
 create mode 100644 tutorials/ml-workflows/tutorial_2_advanced_workflow.py

diff --git a/tutorials/ml-workflows/advanced_workflow.py b/tutorials/ml-workflows/advanced_workflow.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tutorials/ml-workflows/basic_components.py b/tutorials/ml-workflows/basic_components.py
index 5198a53e..dc9c00f9 100644
--- a/tutorials/ml-workflows/basic_components.py
+++ b/tutorials/ml-workflows/basic_components.py
@@ -28,6 +28,15 @@ def execute(
         self,
         dataset: List[int]
     ) -> Tuple[List[int], List[int], List[int]]:
+        
"""Splits a list dataset into train, validation and test datasets. + + Args: + dataset (List[int]): input list dataset. + + Returns: + Tuple[List[int], List[int], List[int]]: train, validation, and + test datasets. + """ train_n = int(len(dataset)*self.train_proportion) valid_n = int(len(dataset)*self.validation_proportion) train_set = dataset[:train_n] @@ -44,6 +53,17 @@ def execute( vaild_set: List[int], test_set: List[int] ) -> Tuple[List[int], List[int], List[int], str]: + """Dummy ML trainer mocking a ML training algorithm. + + Args: + train_set (List[int]): training dataset. + vaild_set (List[int]): validation dataset. + test_set (List[int]): test dataset. + + Returns: + Tuple[List[int], List[int], List[int], str]: train, validation, + test datasets, and trained model. + """ return train_set, vaild_set, test_set, "my_trained_model" def save_state(self): diff --git a/tutorials/ml-workflows/basic_pipeline_example.yaml b/tutorials/ml-workflows/basic_pipeline_example.yaml new file mode 100644 index 00000000..c4596e5d --- /dev/null +++ b/tutorials/ml-workflows/basic_pipeline_example.yaml @@ -0,0 +1,18 @@ +pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + name: null + steps: + - class_path: basic_components.MyDataGetter + init_args: + data_size: 100 + name: null + - class_path: basic_components.MyDatasetSplitter + init_args: + name: null + test_proportion: 0.25 + train_proportion: 0.5 + validation_proportion: 0.25 + - class_path: basic_components.MyTrainer + init_args: + name: null diff --git a/tutorials/ml-workflows/basic_pipeline_example_v2.yaml b/tutorials/ml-workflows/basic_pipeline_example_v2.yaml new file mode 100644 index 00000000..d54a3bd7 --- /dev/null +++ b/tutorials/ml-workflows/basic_pipeline_example_v2.yaml @@ -0,0 +1,18 @@ +pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + name: null + steps: + - class_path: basic_components.MyDataGetter + init_args: + data_size: 200 + name: null + - class_path: basic_components.MyDatasetSplitter + init_args: + name: null + test_proportion: 0.25 + train_proportion: 0.5 + validation_proportion: 0.25 + - class_path: basic_components.MyTrainer + init_args: + name: null diff --git a/tutorials/ml-workflows/tutorial_0_basic_workflow.py b/tutorials/ml-workflows/tutorial_0_basic_workflow.py new file mode 100644 index 00000000..dbd04768 --- /dev/null +++ b/tutorials/ml-workflows/tutorial_0_basic_workflow.py @@ -0,0 +1,66 @@ +""" +The most simple workflow that you can write is a sequential pipeline of steps, +where the outputs of a component are fed as input to the following component, +employing a scikit-learn-like Pipeline. + +In itwinai, a step is also called "component" and is implemented by extending +the ``itwinai.components.BaseComponent`` class. Each component implements +the `execute(...)` method, which provides a unified interface to interact with +each component. + +The aim of itwinai components is to provide reusable machine learning best +practices, and some common operations are already encoded in some abstract +components. Some examples are: +- ``DataGetter``: has no input and returns a dataset, collected from somewhere +(e.g., downloaded). +- ``DataSplitter``: splits an input dataset into train, validation and test. +- ``DataPreproc``: perform preprocessing on train, validation, and test +datasets. +- ``Trainer``: trains an ML model and returns the trained model. +- ``Saver``: saved an ML artifact (e.g., dataset, model) to disk. 
+
+In this tutorial you will see how to create new components and how they
+are assembled into sequential pipelines. Newly created components are
+in a separate file called 'basic_components.py'.
+"""
+from itwinai.pipeline import Pipeline
+
+# Import the custom components from file
+from basic_components import MyDataGetter, MyDatasetSplitter, MyTrainer
+
+# Assemble them in a scikit-learn like pipeline
+pipeline = Pipeline([
+    MyDataGetter(data_size=100),
+    MyDatasetSplitter(
+        train_proportion=.5, validation_proportion=.25, test_proportion=0.25
+    ),
+    MyTrainer()
+])
+
+# Inspect steps
+print(pipeline[0])
+print(pipeline[2].name)
+print(pipeline[1].train_proportion)
+
+# Run pipeline
+_, _, _, trained_model = pipeline.execute()
+print("Trained model: ", trained_model)
+
+# You can also create a Pipeline from a dict of components, which simplifies
+# their retrieval by name
+pipeline = Pipeline({
+    "datagetter": MyDataGetter(data_size=100),
+    "splitter": MyDatasetSplitter(
+        train_proportion=.5, validation_proportion=.25, test_proportion=0.25
+    ),
+    "trainer": MyTrainer()
+})
+
+# Inspect steps
+print(pipeline["datagetter"])
+print(pipeline["trainer"].name)
+print(pipeline["splitter"].train_proportion)
+
+# Run pipeline
+_, _, _, trained_model = pipeline.execute()
+print("Trained model: ", trained_model)
diff --git a/tutorials/ml-workflows/basic_workflow.py b/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py
similarity index 53%
rename from tutorials/ml-workflows/basic_workflow.py
rename to tutorials/ml-workflows/tutorial_1_intermediate_workflow.py
index 072a40a0..4feaeb94 100644
--- a/tutorials/ml-workflows/basic_workflow.py
+++ b/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py
@@ -1,12 +1,20 @@
 """
-The simplest workflow that you can write is a sequential pipeline of steps,
-where the outputs of a component are fed as input to the following component,
-employing a scikit-learn-like Pipeline.
+In the previous tutorial we saw how to create new components and assemble them
+into a Pipeline for simplified workflow execution. The Pipeline executes
+the components in the order in which they are given, *assuming* that the
+outputs of a component will fit as inputs of the following component.
+This is not always true, so you can use the ``Adapter`` component to
+compensate for mismatches. This component lets you define a policy to
+rearrange intermediate results between two components.

-This allows exporting the Pipeline from Python code to a configuration file, to
-persist both parameters and workflow structure. Exporting to a configuration file
-assumes that each component class resides in a separate Python file, so that
-the pipeline configuration is agnostic to the current Python script.
+Moreover, it is good for reproducibility to keep track of the pipeline
+configuration used to achieve some outstanding ML results. It would be a shame
+to forget how you achieved state-of-the-art results!
+
+itwinai allows exporting the Pipeline from Python code to a configuration file,
+to persist both parameters and workflow structure. Exporting to a configuration
+file assumes that each component class resides in a separate Python file, so
+that the pipeline configuration is agnostic to the current Python script.
Once the Pipeline has been exported to a configuration file (YAML), it can
 be executed directly from CLI:

@@ -19,6 +27,7 @@
 >>> itwinai exec-pipeline --config my-pipe.yaml --override nested.list.2.0=42

 """
+import subprocess
 from itwinai.pipeline import Pipeline
 from itwinai.parser import ConfigParser

@@ -58,5 +67,16 @@
 _, _, _, trained_model = pipeline.execute()
 print("Trained model (2): ", trained_model)

-# Save new pipeline we YAML file
+# Save new pipeline to YAML file
 pipeline.to_yaml("basic_pipeline_example_v2.yaml", "pipeline")
+
+print("\n" + "="*50 + "\n")
+
+# Emulate pipeline execution from CLI, with dynamic override of
+# pipeline configuration fields
+subprocess.run(
+    ["itwinai", "exec-pipeline", "--config", "basic_pipeline_example_v2.yaml",
+     "--override", "pipeline.init_args.steps.0.init_args.data_size=300",
+     "--override", "pipeline.init_args.steps.1.init_args.train_proportion=0.4"
+    ]
+)
diff --git a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py
new file mode 100644
index 00000000..36a28fce
--- /dev/null
+++ b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py
@@ -0,0 +1,7 @@
+"""
+In the first tutorial we saw how to define simple sequential workflows by
+means of the Pipeline object, which feeds the outputs of the previous component
+as inputs of the following one.
+
+In this tutorial we show how to create
+"""

From 0cd6c92f3c3187893fb7a798097c7c3b73f23a8a Mon Sep 17 00:00:00 2001
From: Matteo Bunino 
Date: Tue, 12 Dec 2023 16:43:52 +0100
Subject: [PATCH 21/26] ADD advanced tutorial

---
 src/itwinai/parser.py                         | 76 ++++++++++--
 tutorials/ml-workflows/basic_components.py    | 22 +++-
 .../ml-workflows/basic_pipeline_example.yaml  |  8 ++
 .../basic_pipeline_example_v2.yaml            |  8 ++
 .../ml-workflows/tutorial_0_basic_workflow.py | 67 ++++++-----
 .../tutorial_1_intermediate_workflow.py       | 110 ++++++++++--------
 .../tutorial_2_advanced_workflow.py           | 41 ++++++-
 7 files changed, 242 insertions(+), 90 deletions(-)

diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py
index 42c4a480..d6c4db70 100644
--- a/src/itwinai/parser.py
+++ b/src/itwinai/parser.py
@@ -3,10 +3,14 @@
 execution, and dynamic override of fields.
 """

+import logging
+import os
 import sys
-from typing import Dict, Any, Union, Optional
-from jsonargparse import ArgumentParser, ActionConfigFile
+from typing import Dict, Any, List, Type, Union, Optional
+from jsonargparse import ArgumentParser as JAPArgumentParser
+from jsonargparse import ActionConfigFile
 import json
+from jsonargparse._formatters import DefaultHelpFormatter
 from omegaconf import OmegaConf
 from pathlib import Path

@@ -139,7 +143,7 @@ def parse_pipeline(
         Returns:
             Pipeline: instantiated pipeline.
""" - pipe_parser = ArgumentParser() + pipe_parser = JAPArgumentParser() pipe_parser.add_subclass_arguments(Pipeline, "pipeline") pipe_dict = self.config @@ -176,12 +180,68 @@ def parse_step( # Wrap config under "step" field and parse it step_dict_config = {'step': step_dict_config} - step_parser = ArgumentParser() + step_parser = JAPArgumentParser() step_parser.add_subclass_arguments(BaseComponent, "step") parsed_namespace = step_parser.parse_object(step_dict_config) return step_parser.instantiate_classes(parsed_namespace)["step"] +class ArgumentParser(JAPArgumentParser): + def __init__( + self, + *args, + env_prefix: Union[bool, str] = True, + formatter_class: Type[DefaultHelpFormatter] = DefaultHelpFormatter, + exit_on_error: bool = True, + logger: Union[bool, str, dict, logging.Logger] = False, + version: Optional[str] = None, + print_config: Optional[str] = "--print_config", + parser_mode: str = "yaml", + dump_header: Optional[List[str]] = None, + default_config_files: Optional[List[Union[str, os.PathLike]]] = None, + default_env: bool = False, + default_meta: bool = True, + **kwargs, + ) -> None: + """Initializer for ArgumentParser instance. + + All the arguments from the initializer of `argparse.ArgumentParser + `_ + are supported. Additionally it accepts: + + Args: + env_prefix: Prefix for environment variables. ``True`` to derive + from ``prog``. + formatter_class: Class for printing help messages. + logger: Configures the logger, see :class:`.LoggerProperty`. + version: Program version which will be printed by the --version + argument. + print_config: Add this as argument to print config, set None to + disable. + parser_mode: Mode for parsing config files: ``'yaml'``, + ``'jsonnet'`` or ones added via :func:`.set_loader`. + dump_header: Header to include as comment when dumping a config + object. + default_config_files: Default config file locations, e.g. + :code:`['~/.config/myapp/*.yaml']`. + default_env: Set the default value on whether to parse environment + variables. + default_meta: Set the default value on whether to include metadata + in config objects. + """ + super().__init__( + *args, env_prefix=env_prefix, formatter_class=formatter_class, + exit_on_error=exit_on_error, logger=logger, version=version, + print_config=print_config, parser_mode=parser_mode, + dump_header=dump_header, default_config_files=default_config_files, + default_env=default_env, + default_meta=default_meta, **kwargs) + self.add_argument( + "-c", "--config", action=ActionConfigFile, + help="Path to a configuration file in json or yaml format." + ) + + class ConfigParser2: """ Deprecated: this pipeline structure does not allow for @@ -274,7 +334,7 @@ def parse_pipeline( Returns: Pipeline: instantiated pipeline. 
""" - pipe_parser = ArgumentParser() + pipe_parser = JAPArgumentParser() pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) pipe_dict = self.config[pipeline_nested_key] @@ -313,7 +373,7 @@ def parse_step( # Wrap config under "step" field and parse it step_dict_config = {'step': step_dict_config} - step_parser = ArgumentParser() + step_parser = JAPArgumentParser() step_parser.add_subclass_arguments(BaseComponent, "step") parsed_namespace = step_parser.parse_object(step_dict_config) return step_parser.instantiate_classes(parsed_namespace)["step"] @@ -356,7 +416,7 @@ class ItwinaiCLI: >>> python train.py --config itwinai-conf.yaml >>> python train.py --config itwinai-conf.yaml --server.port 8080 """ - _parser: ArgumentParser + _parser: JAPArgumentParser _config: Dict pipeline: Pipeline @@ -380,7 +440,7 @@ def __init__( ) def _init_parser(self): - self._parser = ArgumentParser(parser_mode=self.parser_mode) + self._parser = JAPArgumentParser(parser_mode=self.parser_mode) self._parser.add_argument( "-c", "--config", action=ActionConfigFile, required=True, diff --git a/tutorials/ml-workflows/basic_components.py b/tutorials/ml-workflows/basic_components.py index dc9c00f9..49e74180 100644 --- a/tutorials/ml-workflows/basic_components.py +++ b/tutorials/ml-workflows/basic_components.py @@ -1,9 +1,9 @@ """ Here we show how to implement component interfaces in a simple way. """ -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Any from itwinai.components import ( - DataGetter, DataSplitter, Trainer, monitor_exec + DataGetter, DataSplitter, Trainer, Saver, monitor_exec ) @@ -46,6 +46,10 @@ def execute( class MyTrainer(Trainer): + def __init__(self, lr: float = 1e-3, name: Optional[str] = None) -> None: + super().__init__(name) + self.save_parameters(name=name, lr=lr) + @monitor_exec def execute( self, @@ -71,3 +75,17 @@ def save_state(self): def load_state(self): return super().load_state() + + +class MySaver(Saver): + @monitor_exec + def execute(self, artifact: Any) -> Any: + """Saves an artifact to disk. + + Args: + artifact (Any): artifact to save (e.g., dataset, model). + + Returns: + Any: input artifact. 
+ """ + return artifact diff --git a/tutorials/ml-workflows/basic_pipeline_example.yaml b/tutorials/ml-workflows/basic_pipeline_example.yaml index c4596e5d..5c433b74 100644 --- a/tutorials/ml-workflows/basic_pipeline_example.yaml +++ b/tutorials/ml-workflows/basic_pipeline_example.yaml @@ -16,3 +16,11 @@ pipeline: - class_path: basic_components.MyTrainer init_args: name: null + - class_path: itwinai.components.Adapter + init_args: + name: null + policy: + - INPUT_ARG#-1 + - class_path: basic_components.MySaver + init_args: + name: null diff --git a/tutorials/ml-workflows/basic_pipeline_example_v2.yaml b/tutorials/ml-workflows/basic_pipeline_example_v2.yaml index d54a3bd7..d4d1352e 100644 --- a/tutorials/ml-workflows/basic_pipeline_example_v2.yaml +++ b/tutorials/ml-workflows/basic_pipeline_example_v2.yaml @@ -16,3 +16,11 @@ pipeline: - class_path: basic_components.MyTrainer init_args: name: null + - class_path: itwinai.components.Adapter + init_args: + name: null + policy: + - INPUT_ARG#-1 + - class_path: basic_components.MySaver + init_args: + name: null diff --git a/tutorials/ml-workflows/tutorial_0_basic_workflow.py b/tutorials/ml-workflows/tutorial_0_basic_workflow.py index dbd04768..98861777 100644 --- a/tutorials/ml-workflows/tutorial_0_basic_workflow.py +++ b/tutorials/ml-workflows/tutorial_0_basic_workflow.py @@ -28,39 +28,44 @@ # Import the custom components from file from basic_components import MyDataGetter, MyDatasetSplitter, MyTrainer -# Assemble them in a scikit-learn like pipeline -pipeline = Pipeline([ - MyDataGetter(data_size=100), - MyDatasetSplitter( - train_proportion=.5, validation_proportion=.25, test_proportion=0.25 - ), - MyTrainer() -]) +if __name__ == "__main__": + # Assemble them in a scikit-learn like pipeline + pipeline = Pipeline([ + MyDataGetter(data_size=100), + MyDatasetSplitter( + train_proportion=.5, + validation_proportion=.25, + test_proportion=0.25 + ), + MyTrainer() + ]) -# Inspect steps -print(pipeline[0]) -print(pipeline[2].name) -print(pipeline[1].train_proportion) + # Inspect steps + print(pipeline[0]) + print(pipeline[2].name) + print(pipeline[1].train_proportion) -# Run pipeline -_, _, _, trained_model = pipeline.execute() -print("Trained model: ", trained_model) + # Run pipeline + _, _, _, trained_model = pipeline.execute() + print("Trained model: ", trained_model) -# You can also create a Pipeline from a dict of components, which simplifies -# their retrieval by name -pipeline = Pipeline({ - "datagetter": MyDataGetter(data_size=100), - "splitter": MyDatasetSplitter( - train_proportion=.5, validation_proportion=.25, test_proportion=0.25 - ), - "trainer": MyTrainer() -}) + # You can also create a Pipeline from a dict of components, which + # simplifies their retrieval by name + pipeline = Pipeline({ + "datagetter": MyDataGetter(data_size=100), + "splitter": MyDatasetSplitter( + train_proportion=.5, + validation_proportion=.25, + test_proportion=0.25 + ), + "trainer": MyTrainer() + }) -# Inspect steps -print(pipeline["datagetter"]) -print(pipeline["trainer"].name) -print(pipeline["splitter"].train_proportion) + # Inspect steps + print(pipeline["datagetter"]) + print(pipeline["trainer"].name) + print(pipeline["splitter"].train_proportion) -# Run pipeline -_, _, _, trained_model = pipeline.execute() -print("Trained model: ", trained_model) + # Run pipeline + _, _, _, trained_model = pipeline.execute() + print("Trained model: ", trained_model) diff --git a/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py 
b/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py index 4feaeb94..6604df13 100644 --- a/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py +++ b/tutorials/ml-workflows/tutorial_1_intermediate_workflow.py @@ -30,53 +30,69 @@ import subprocess from itwinai.pipeline import Pipeline from itwinai.parser import ConfigParser +from itwinai.components import Adapter -from basic_components import MyDataGetter, MyDatasetSplitter, MyTrainer - -pipeline = Pipeline([ - MyDataGetter(data_size=100), - MyDatasetSplitter( - train_proportion=.5, validation_proportion=.25, test_proportion=0.25 - ), - MyTrainer() -]) - -# Run pipeline -_, _, _, trained_model = pipeline.execute() -print("Trained model: ", trained_model) -print("\n" + "="*50 + "\n") - -# Serialize pipeline to YAML -pipeline.to_yaml("basic_pipeline_example.yaml", "pipeline") - -# Below, we show how to run a pre-existing pipeline stored as -# a configuration file, with the possibility of dynamically -# override some fields - -# Load pipeline from saved YAML (dynamic serialization) -parser = ConfigParser( - config="basic_pipeline_example.yaml", - override_keys={ - "pipeline.init_args.steps.0.init_args.data_size": 200 - } +from basic_components import ( + MyDataGetter, MyDatasetSplitter, MyTrainer, MySaver ) -pipeline = parser.parse_pipeline() -print(f"MyDataGetter's data_size is now: {pipeline.steps[0].data_size}\n") -# Run parsed pipeline, with new data_size for MyDataGetter -_, _, _, trained_model = pipeline.execute() -print("Trained model (2): ", trained_model) - -# Save new pipeline to YAML file -pipeline.to_yaml("basic_pipeline_example_v2.yaml", "pipeline") - -print("\n" + "="*50 + "\n") - -# Emulate pipeline execution from CLI, with dynamic override of -# pipeline configuration fields -subprocess.run( - ["itwinai", "exec-pipeline", "--config", "basic_pipeline_example_v2.yaml", - "--override", "pipeline.init_args.steps.0.init_args.data_size=300", - "--override", "pipeline.init_args.steps.1.init_args.train_proportion=0.4" - ] -) +if __name__ == "__main__": + + # In this pipeline, the MyTrainer produces 4 elements as output: train, + # validation, test datasets, and trained model. The Adapter selects the + # trained model only, and forwards it to the saver, which expects a single + # item as input. 
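+    # The f-string below expands to the policy string "INPUT_ARG#-1":
+    # negative indices count from the end of the previous component's
+    # outputs, so -1 keeps only the trained model returned last by MyTrainer.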
+    pipeline = Pipeline([
+        MyDataGetter(data_size=100),
+        MyDatasetSplitter(
+            train_proportion=.5,
+            validation_proportion=.25,
+            test_proportion=0.25
+        ),
+        MyTrainer(),
+        Adapter(policy=[f"{Adapter.INPUT_PREFIX}-1"]),
+        MySaver()
+    ])
+
+    # Run pipeline
+    trained_model = pipeline.execute()
+    print("Trained model: ", trained_model)
+    print("\n" + "="*50 + "\n")
+
+    # Serialize pipeline to YAML
+    pipeline.to_yaml("basic_pipeline_example.yaml", "pipeline")
+
+    # Below, we show how to run a pre-existing pipeline stored as
+    # a configuration file, with the possibility of dynamically
+    # override some fields
+
+    # Load pipeline from saved YAML (dynamic serialization)
+    parser = ConfigParser(
+        config="basic_pipeline_example.yaml",
+        override_keys={
+            "pipeline.init_args.steps.0.init_args.data_size": 200
+        }
+    )
+    pipeline = parser.parse_pipeline()
+    print(f"MyDataGetter's data_size is now: {pipeline.steps[0].data_size}\n")
+
+    # Run parsed pipeline, with new data_size for MyDataGetter
+    trained_model = pipeline.execute()
+    print("Trained model (2): ", trained_model)
+
+    # Save new pipeline to YAML file
+    pipeline.to_yaml("basic_pipeline_example_v2.yaml", "pipeline")
+
+    print("\n" + "="*50 + "\n")
+
+    # Emulate pipeline execution from CLI, with dynamic override of
+    # pipeline configuration fields
+    subprocess.run(
+        ["itwinai", "exec-pipeline", "--config",
+         "basic_pipeline_example_v2.yaml",
+         "--override",
+         "pipeline.init_args.steps.0.init_args.data_size=300",
+         "--override",
+         "pipeline.init_args.steps.1.init_args.train_proportion=0.4"
+        ]
+    )
diff --git a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py
index 36a28fce..e7661328 100644
--- a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py
+++ b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py
@@ -1,7 +1,44 @@
 """
-In the first tutorial we saw how to define simple sequential workflows by
+In the first two tutorials we saw how to define simple sequential workflows by
 means of the Pipeline object, which feeds the outputs of the previous component
 as inputs of the following one.

-In this tutorial we show how to create
+In this tutorial we show how to create more complex workflows, with
+non-sequential data flows. Here, components can be arranged as a directed
+acyclic graph (DAG). Under the DAG assumption, outputs of each block can be fed
+as input potentially to any other component, granting great flexibility to the
+experimenter.
+
+The trade-off for improved flexibility is a change in the way we define
+configuration files. From now on, it will only be possible to configure the
+parameters used by the training script, but not its structure through the
+Pipeline.
+
+itwinai provides a wrapper of jsonargparse's ArgumentParser which supports
+configuration files by default.
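+
+A matching configuration file for this script could look like this (a sketch;
+jsonargparse maps each argument to a top-level key named after its
+destination):
+
+    data_size: 20
+    train_prop: 0.7
+    val_prop: 0.2
+    lr: 1.0e-05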
+ +To run as usual: +>>> python my_script.py -d 20 --train-prop 0.7 --val-prop 0.2 --lr 1e-5 + +To reuse the parameters saved in a configuration file and override some +parameter (e.g., learning rate): +>>> python my_script.py --config my_config_file.yaml --lr 2e-3 + """ +from itwinai.parser import ArgumentParser + +if __name__ == "__main__": + parser = ArgumentParser(description="itwinai advanced workflows tutorial") + parser.add_argument( + "--data-size", "-d", type=int, required=True, + help="Dataset cardinality.") + parser.add_argument( + "--train-prop", type=float, required=True, + help="Train split proportion.") + parser.add_argument( + "--val-prop", type=float, required=True, + help="Validation split proportion.") + parser.add_argument( + "--lr", type=float, help="Training learning rate.") + args = parser.parse_args() + # parser.save(args, "test_conf2.yaml", format='yaml') From e4297b1d05f83262d851157941d21d04dc87d6fb Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 16:59:40 +0100 Subject: [PATCH 22/26] UPDATE advanced tutorial --- .../ml-workflows/basic_pipeline_example.yaml | 26 ----------- .../basic_pipeline_example_v2.yaml | 26 ----------- .../tutorial_2_advanced_workflow.py | 44 ++++++++++++++++++- 3 files changed, 42 insertions(+), 54 deletions(-) delete mode 100644 tutorials/ml-workflows/basic_pipeline_example.yaml delete mode 100644 tutorials/ml-workflows/basic_pipeline_example_v2.yaml diff --git a/tutorials/ml-workflows/basic_pipeline_example.yaml b/tutorials/ml-workflows/basic_pipeline_example.yaml deleted file mode 100644 index 5c433b74..00000000 --- a/tutorials/ml-workflows/basic_pipeline_example.yaml +++ /dev/null @@ -1,26 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - name: null - steps: - - class_path: basic_components.MyDataGetter - init_args: - data_size: 100 - name: null - - class_path: basic_components.MyDatasetSplitter - init_args: - name: null - test_proportion: 0.25 - train_proportion: 0.5 - validation_proportion: 0.25 - - class_path: basic_components.MyTrainer - init_args: - name: null - - class_path: itwinai.components.Adapter - init_args: - name: null - policy: - - INPUT_ARG#-1 - - class_path: basic_components.MySaver - init_args: - name: null diff --git a/tutorials/ml-workflows/basic_pipeline_example_v2.yaml b/tutorials/ml-workflows/basic_pipeline_example_v2.yaml deleted file mode 100644 index d4d1352e..00000000 --- a/tutorials/ml-workflows/basic_pipeline_example_v2.yaml +++ /dev/null @@ -1,26 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - name: null - steps: - - class_path: basic_components.MyDataGetter - init_args: - data_size: 200 - name: null - - class_path: basic_components.MyDatasetSplitter - init_args: - name: null - test_proportion: 0.25 - train_proportion: 0.5 - validation_proportion: 0.25 - - class_path: basic_components.MyTrainer - init_args: - name: null - - class_path: itwinai.components.Adapter - init_args: - name: null - policy: - - INPUT_ARG#-1 - - class_path: basic_components.MySaver - init_args: - name: null diff --git a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py index e7661328..97bff880 100644 --- a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py +++ b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py @@ -22,10 +22,24 @@ To reuse the parameters saved in a configuration file and override some parameter (e.g., learning rate): ->>> python my_script.py --config my_config_file.yaml --lr 2e-3 
+>>> python my_script.py --config advanced_tutorial_conf.yaml --lr 2e-3 """ +from typing import Any from itwinai.parser import ArgumentParser +from itwinai.components import Predictor, monitor_exec + +from basic_components import ( + MyDataGetter, MyDatasetSplitter, MyTrainer, MySaver +) + + +class MyPredictor(Predictor): + @monitor_exec + def execute(self, dataset, model) -> Any: + # do some predictions with model on dataset... + return dataset + if __name__ == "__main__": parser = ArgumentParser(description="itwinai advanced workflows tutorial") @@ -41,4 +55,30 @@ parser.add_argument( "--lr", type=float, help="Training learning rate.") args = parser.parse_args() - # parser.save(args, "test_conf2.yaml", format='yaml') + + # Save parsed arguments to configuration file. + # Previous configurations are overwritten, which is not good, + # but the versioning of configuration files is out of the scope + # of this tutorial. + parser.save( + args, "advanced_tutorial_conf.yaml", format='yaml', overwrite=True) + + # Define workflow components + getter = MyDataGetter(data_size=args.data_size) + splitter = MyDatasetSplitter( + train_proportion=args.train_prop, + validation_proportion=args.val_prop, + test_proportion=1-args.train_prop-args.val_prop + ) + trainer = MyTrainer(lr=args.lr) + saver = MySaver() + predictor = MyPredictor(model=None) + + # Define ML workflow + dataset = getter.execute() + train_spl, val_spl, test_spl = splitter.execute(dataset) + _, _, _, trained_model = trainer.execute(train_spl, val_spl, test_spl) + _ = saver.execute(trained_model) + predictions = predictor.execute(test_spl, trained_model) + print() + print("Predictions: " + str(predictions)) From c0cfd1e015392d62ff5690c73b3ac3a13bc48daa Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 18:02:58 +0100 Subject: [PATCH 23/26] UPDATE use cases --- src/itwinai/parser.py | 8 +- src/itwinai/serialization.py | 4 +- src/itwinai/tensorflow/trainer.py | 19 ++- src/itwinai/torch/inference.py | 1 + src/itwinai/torch/trainer.py | 1 + use-cases/3dgan/cern-pipeline.yaml | 4 +- use-cases/3dgan/dataloader.py | 27 ++--- use-cases/3dgan/inference-pipeline.yaml | 4 +- use-cases/3dgan/pipeline.yaml | 4 +- use-cases/3dgan/saver.py | 28 +---- use-cases/3dgan/train.py | 25 ++-- use-cases/3dgan/trainer.py | 37 ++---- use-cases/3dgan/utils.py | 108 ------------------ use-cases/mnist/tensorflow/dataloader.py | 32 +++--- use-cases/mnist/tensorflow/pipeline.yaml | 6 +- use-cases/mnist/tensorflow/train.py | 25 ++-- use-cases/mnist/tensorflow/trainer.py | 25 ++-- use-cases/mnist/torch-lightning/dataloader.py | 25 ++-- use-cases/mnist/torch-lightning/pipeline.yaml | 4 +- use-cases/mnist/torch-lightning/train.py | 25 ++-- use-cases/mnist/torch-lightning/trainer.py | 15 +-- use-cases/mnist/torch/dataloader.py | 2 + use-cases/mnist/torch/saver.py | 1 + use-cases/mnist/torch/train.py | 24 ++-- 24 files changed, 129 insertions(+), 325 deletions(-) delete mode 100644 use-cases/3dgan/utils.py diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index d6c4db70..b627bce7 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -62,6 +62,12 @@ class ConfigParser: It also provides functionalities for dynamic override of fields by means of nested key notation. + Args: + config (Union[str, Dict]): path to YAML configuration file + or dict storing a configuration. + override_keys (Optional[Dict[str, Any]], optional): dict mapping + nested keys to the value to override. Defaults to None. 
+
+    Example:


    >>> # pipeline.yaml file
@@ -379,7 +385,7 @@ def parse_step(
         return step_parser.instantiate_classes(parsed_namespace)["step"]


-class ItwinaiCLI:
+class ItwinaiCLI2:
     """
     Deprecated: the dynamic override does not work with nested
     parameters and may be confusing.
diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py
index 3e7c3650..099385f3 100644
--- a/src/itwinai/serialization.py
+++ b/src/itwinai/serialization.py
@@ -41,8 +41,8 @@ def save_parameters(self, **kwargs) -> None:
         self.parameters = {}
         self.parameters.update(kwargs)

-        for k, v in kwargs.items():
-            self.__setattr__(k, v)
+        # for k, v in kwargs.items():
+        #     self.__setattr__(k, v)

     def update_parameters(self, **kwargs) -> None:
         """Updates stored parameters."""
diff --git a/src/itwinai/tensorflow/trainer.py b/src/itwinai/tensorflow/trainer.py
index 3f51f000..367ceeff 100644
--- a/src/itwinai/tensorflow/trainer.py
+++ b/src/itwinai/tensorflow/trainer.py
@@ -4,7 +4,7 @@
 from jsonargparse import ArgumentParser
 import tensorflow as tf

-from ..components import Trainer
+from ..components import Trainer, monitor_exec


 def import_class(name):
@@ -38,6 +38,11 @@ def __init__(
         strategy
     ):
         super().__init__()
+        self.save_parameters(
+            epochs=epochs, batch_size=batch_size,
+            callbacks=callbacks, model_dict=model_dict,
+            compile_conf=compile_conf, strategy=strategy
+        )
         self.strategy = strategy
         self.epochs = epochs
         self.batch_size = batch_size
@@ -96,7 +101,8 @@ def instantiate_compile_conf(conf: Dict) -> Dict:
             conf[item_name] = instance_from_dict(item)
         return conf

-    def train(self, train_dataset, validation_dataset):
+    @monitor_exec
+    def execute(self, train_dataset, validation_dataset) -> Any:
         # Set batch size to the dataset
         # train = train.batch(self.batch_size, drop_remainder=True)
         # test = test.batch(self.batch_size, drop_remainder=True)
@@ -169,7 +175,8 @@ def train(self, train_dataset, validation_dataset):
 #         # TODO: move loss, optimizer and metrics instantiation under
 #         # here
 #         # Ref:
-#         # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit
+#         # https://www.tensorflow.org/guide/distributed_training\
+#         #use_tfdistributestrategy_with_keras_modelfit
 #     else:
 #         self.model = parser.instantiate_classes(model_dict).model
 #         self.model.compile(**compile_conf)
@@ -191,8 +198,10 @@ def train(self, train_dataset, validation_dataset):
 #     n_test = test.cardinality().numpy()

 #     # TODO: read
-#     # https://github.com/tensorflow/tensorflow/issues/56773#issuecomment-1188693881
-#     # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit
+#     # https://github.com/tensorflow/tensorflow/issues/56773\
+#     #issuecomment-1188693881
+#     # https://www.tensorflow.org/guide/distributed_training\
+#     #use_tfdistributestrategy_with_keras_modelfit

 #     # Distribute dataset
 #     if self.strategy:
diff --git a/src/itwinai/torch/inference.py b/src/itwinai/torch/inference.py
index 39c16b04..ca4ec752 100644
--- a/src/itwinai/torch/inference.py
+++ b/src/itwinai/torch/inference.py
@@ -93,6 +93,7 @@ def __init__(
         name: str = None
     ) -> None:
         super().__init__(model=model, name=name)
+        self.save_parameters(**locals())
         self.model = self.model.eval()
         # self.seed = seed
         # self.strategy = strategy
diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py
index f965cb2e..e9ee2f2a 100644
--- a/src/itwinai/torch/trainer.py
+++ b/src/itwinai/torch/trainer.py
@@ -205,6 +205,7 @@ def __init__(
             Makes the model a DDP model.
""" super().__init__() + self.save_parameters(**locals()) self.model = model self.loss = loss self.epochs = epochs diff --git a/use-cases/3dgan/cern-pipeline.yaml b/use-cases/3dgan/cern-pipeline.yaml index 7d251ae5..3af5c892 100644 --- a/use-cases/3dgan/cern-pipeline.yaml +++ b/use-cases/3dgan/cern-pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.Lightning3DGANDownloader diff --git a/use-cases/3dgan/dataloader.py b/use-cases/3dgan/dataloader.py index d6e5a880..5565b3cc 100644 --- a/use-cases/3dgan/dataloader.py +++ b/use-cases/3dgan/dataloader.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, Dict +from typing import Optional import os from lightning.pytorch.utilities.types import EVAL_DATALOADERS @@ -10,21 +10,23 @@ import h5py import gdown -from itwinai.components import DataGetter +from itwinai.components import DataGetter, monitor_exec class Lightning3DGANDownloader(DataGetter): def __init__( - self, - data_path: str, - data_url: Optional[str] = None, - name: Optional[str] = None, - **kwargs) -> None: - super().__init__(name, **kwargs) + self, + data_path: str, + data_url: Optional[str] = None, + name: Optional[str] = None, + ) -> None: + self.save_parameters(**locals()) + super().__init__(name) self.data_path = data_path self.data_url = data_url - def load(self): + @monitor_exec + def execute(self): # Download data if not os.path.exists(self.data_path): if self.data_url is None: @@ -36,13 +38,6 @@ def load(self): output=self.data_path ) - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[None, Optional[Dict]]: - self.load() - return None, config - class ParticlesDataset(Dataset): def __init__(self, datapath: str, max_samples: Optional[int] = None): diff --git a/use-cases/3dgan/inference-pipeline.yaml b/use-cases/3dgan/inference-pipeline.yaml index 3939b206..1088b883 100644 --- a/use-cases/3dgan/inference-pipeline.yaml +++ b/use-cases/3dgan/inference-pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.Lightning3DGANDownloader diff --git a/use-cases/3dgan/pipeline.yaml b/use-cases/3dgan/pipeline.yaml index 676424aa..ca502e30 100644 --- a/use-cases/3dgan/pipeline.yaml +++ b/use-cases/3dgan/pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.Lightning3DGANDownloader diff --git a/use-cases/3dgan/saver.py b/use-cases/3dgan/saver.py index 7aa72429..d1bf0a9f 100644 --- a/use-cases/3dgan/saver.py +++ b/use-cases/3dgan/saver.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple, Optional +from typing import Dict import os import shutil @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt import numpy as np -from itwinai.components import Saver +from itwinai.components import Saver, monitor_exec class ParticleImagesSaver(Saver): @@ -17,30 +17,12 @@ def __init__( self, save_dir: str = '3dgan-generated' ) -> None: + self.save_parameters(**locals()) super().__init__() self.save_dir = save_dir - def execute( - self, - generated_images: Dict[str, Tensor], - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - """Saves generated images to disk. - - Args: - generated_images (Dict[str, Tensor]): maps unique item ID to - the generated image. 
- config (Optional[Dict], optional): inherited configuration. - Defaults to None. - - Returns: - Tuple[Optional[Tuple], Optional[Dict]]: propagation of inherited - configuration and saver return value. - """ - result = self.save(generated_images) - return ((result,), config) - - def save(self, generated_images: Dict[str, Tensor]) -> None: + @monitor_exec + def execute(self, generated_images: Dict[str, Tensor]) -> None: """Saves generated images to disk. Args: diff --git a/use-cases/3dgan/train.py b/use-cases/3dgan/train.py index 512015e6..d12ee05e 100644 --- a/use-cases/3dgan/train.py +++ b/use-cases/3dgan/train.py @@ -15,13 +15,10 @@ import argparse -from itwinai.components import Pipeline -from itwinai.utils import parse_pipe_config -from jsonargparse import ArgumentParser +from itwinai.parser import ConfigParser if __name__ == "__main__": - # Create CLI Parser parser = argparse.ArgumentParser() parser.add_argument( "-p", "--pipeline", type=str, required=True, @@ -36,20 +33,12 @@ ) args = parser.parse_args() - # Create parser for the pipeline (ordered) - pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, "executor") - - # Parse, Instantiate pipe - parsed = parse_pipe_config(args.pipeline, pipe_parser) - pipe = pipe_parser.instantiate_classes(parsed) - executor: Pipeline = getattr(pipe, 'executor') + # Create parser for the pipeline + pipe_parser = ConfigParser(config=args.pipeline) + pipeline = pipe_parser.parse_pipeline() if args.download_only: print('Downloading datasets and exiting...') - executor = executor[:1] - else: - print('Downloading datasets (if not already done) and running...') - executor = executor - executor.setup() - executor() + pipeline = pipeline[:1] + + pipeline.execute() diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index faf7dc32..5ac9e7c3 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -1,31 +1,33 @@ import os import sys -from typing import Union, Dict, Tuple, Optional, Any +from typing import Union, Dict, Optional, Any import torch from torch import Tensor import lightning as pl from lightning.pytorch.cli import LightningCLI -from itwinai.components import Trainer, Predictor +from itwinai.components import Trainer, Predictor, monitor_exec from itwinai.serialization import ModelLoader from itwinai.torch.inference import TorchModelLoader from itwinai.torch.types import Batch +from itwinai.utils import load_yaml from model import ThreeDGAN from dataloader import ParticlesDataModule -from utils import load_yaml class Lightning3DGANTrainer(Trainer): def __init__(self, config: Union[Dict, str]): + self.save_parameters(**locals()) super().__init__() if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) self.conf = config - def train(self) -> Any: + @monitor_exec + def execute(self) -> Any: old_argv = sys.argv sys.argv = ['some_script_placeholder.py'] cli = LightningCLI( @@ -43,13 +45,6 @@ def train(self) -> Any: sys.argv = old_argv cli.trainer.fit(cli.model, datamodule=cli.datamodule) - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Any, Optional[Dict]]: - result = self.train() - return result, config - def save_state(self): return super().save_state() @@ -93,13 +88,15 @@ def __init__( config: Union[Dict, str], name: Optional[str] = None ): + self.save_parameters(**locals()) super().__init__(model, name) if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) self.conf = config - def 
predict( + @monitor_exec + def execute( self, datamodule: Optional[pl.LightningDataModule] = None, model: Optional[pl.LightningModule] = None @@ -152,19 +149,3 @@ def transform_predictions(self, batch: Batch) -> Batch: Post-process the predictions of the torch model. """ return batch.squeeze(1) - - def execute( - self, - config: Optional[Dict] = None, - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - """"Execute some operations. - - Args: - config (Dict, optional): key-value configuration. - Defaults to None. - - Returns: - Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as - (results, config). - """ - return self.predict(), config diff --git a/use-cases/3dgan/utils.py b/use-cases/3dgan/utils.py deleted file mode 100644 index d04f9e63..00000000 --- a/use-cases/3dgan/utils.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Utilities for itwinai package. -""" -import os -import yaml - -from collections.abc import MutableMapping -from typing import Dict -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig - - -def load_yaml(path: str) -> Dict: - """Load YAML file as dict. - - Args: - path (str): path to YAML file. - - Raises: - exc: yaml.YAMLError for loading/parsing errors. - - Returns: - Dict: nested dict representation of parsed YAML file. - """ - with open(path, "r", encoding="utf-8") as yaml_file: - try: - loaded_config = yaml.safe_load(yaml_file) - except yaml.YAMLError as exc: - print(exc) - raise exc - return loaded_config - - -def load_yaml_with_deps_from_file(path: str) -> DictConfig: - """ - Load YAML file with OmegaConf and merge it with its dependencies - specified in the `conf-dependencies` field. - Assume that the dependencies live in the same folder of the - YAML file which is importing them. - - Args: - path (str): path to YAML file. - - Raises: - exc: yaml.YAMLError for loading/parsing errors. - - Returns: - DictConfig: nested representation of parsed YAML file. - """ - yaml_conf = load_yaml(path) - use_case_dir = os.path.dirname(path) - deps = [] - if yaml_conf.get("conf-dependencies"): - for dependency in yaml_conf["conf-dependencies"]: - deps.append(load_yaml(os.path.join(use_case_dir, dependency))) - - return OmegaConf.merge(yaml_conf, *deps) - - -def load_yaml_with_deps_from_dict(dict_conf, use_case_dir) -> DictConfig: - deps = [] - - if dict_conf.get("conf-dependencies"): - for dependency in dict_conf["conf-dependencies"]: - deps.append(load_yaml(os.path.join(use_case_dir, dependency))) - - return OmegaConf.merge(dict_conf, *deps) - - -def dynamically_import_class(name: str): - """ - Dynamically import class by module path. - Adapted from https://stackoverflow.com/a/547867 - - Args: - name (str): path to the class (e.g., mypackage.mymodule.MyClass) - - Returns: - __class__: class object. - """ - module, class_name = name.rsplit(".", 1) - mod = __import__(module, fromlist=[class_name]) - klass = getattr(mod, class_name) - return klass - - -def flatten_dict( - d: MutableMapping, parent_key: str = "", sep: str = "." -) -> MutableMapping: - """Flatten dictionary - - Args: - d (MutableMapping): nested dictionary to flatten - parent_key (str, optional): prefix for all keys. Defaults to ''. - sep (str, optional): separator for nested key concatenation. - Defaults to '.'. - - Returns: - MutableMapping: flattened dictionary with new keys. 
- """ - items = [] - for k, v in d.items(): - new_key = parent_key + sep + k if parent_key else k - if isinstance(v, MutableMapping): - items.extend(flatten_dict(v, new_key, sep=sep).items()) - else: - items.append((new_key, v)) - return dict(items) diff --git a/use-cases/mnist/tensorflow/dataloader.py b/use-cases/mnist/tensorflow/dataloader.py index 920e0dba..af42d957 100644 --- a/use-cases/mnist/tensorflow/dataloader.py +++ b/use-cases/mnist/tensorflow/dataloader.py @@ -1,31 +1,32 @@ -from typing import Optional, Dict, Tuple +from typing import Tuple import tensorflow.keras as keras import tensorflow as tf -from itwinai.components import DataGetter, DataPreproc +from itwinai.components import DataGetter, DataPreproc, monitor_exec class MNISTDataGetter(DataGetter): def __init__(self): super().__init__() + self.save_parameters(**locals()) - def load(self): - return keras.datasets.mnist.load_data() - - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - train, test = self.load() - return ([train, test],), config + @monitor_exec + def execute(self) -> Tuple: + train, test = keras.datasets.mnist.load_data() + return train, test class MNISTDataPreproc(DataPreproc): def __init__(self, classes: int): super().__init__() + self.save_parameters(**locals()) self.classes = classes - def preproc(self, datasets) -> Tuple: + @monitor_exec + def execute( + self, + *datasets, + ) -> Tuple: options = tf.data.Options() options.experimental_distribute.auto_shard_policy = ( tf.data.experimental.AutoShardPolicy.FILE) @@ -37,10 +38,3 @@ def preproc(self, datasets) -> Tuple: sliced = sliced.with_options(options) preprocessed.append(sliced) return tuple(preprocessed) - - def execute( - self, - datasets, - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - return self.preproc(datasets), config diff --git a/use-cases/mnist/tensorflow/pipeline.yaml b/use-cases/mnist/tensorflow/pipeline.yaml index aa34e0d4..9fced327 100644 --- a/use-cases/mnist/tensorflow/pipeline.yaml +++ b/use-cases/mnist/tensorflow/pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.MNISTDataGetter @@ -29,7 +29,7 @@ executor: input_shape: [ 28, 28, 1 ] output_shape: 10 - strategy: + strategy: class_path: tensorflow.python.distribute.mirrored_strategy.MirroredStrategy logger: diff --git a/use-cases/mnist/tensorflow/train.py b/use-cases/mnist/tensorflow/train.py index 7e7d71ac..26a90f81 100644 --- a/use-cases/mnist/tensorflow/train.py +++ b/use-cases/mnist/tensorflow/train.py @@ -13,13 +13,10 @@ import argparse -from itwinai.components import Pipeline -from itwinai.utils import parse_pipe_config -from jsonargparse import ArgumentParser +from itwinai.parser import ConfigParser if __name__ == "__main__": - # Create CLI Parser parser = argparse.ArgumentParser() parser.add_argument( "-p", "--pipeline", type=str, required=True, @@ -34,20 +31,12 @@ ) args = parser.parse_args() - # Create parser for the pipeline (ordered) - pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, "executor") - - # Parse, Instantiate pipe - parsed = parse_pipe_config(args.pipeline, pipe_parser) - pipe = pipe_parser.instantiate_classes(parsed) - executor: Pipeline = getattr(pipe, 'executor') + # Create parser for the pipeline + pipe_parser = ConfigParser(config=args.pipeline) + pipeline = pipe_parser.parse_pipeline() if args.download_only: 
print('Downloading datasets and exiting...') - executor = executor[:1] - else: - print('Downloading datasets (if not already done) and running...') - executor = executor - executor.setup() - executor() + pipeline = pipeline[:1] + + pipeline.execute() diff --git a/use-cases/mnist/tensorflow/trainer.py b/use-cases/mnist/tensorflow/trainer.py index dfbc06c7..4b653c04 100644 --- a/use-cases/mnist/tensorflow/trainer.py +++ b/use-cases/mnist/tensorflow/trainer.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple, Any +from typing import Dict, List, Optional, Any # from tensorflow.keras.optimizers import Optimizer # from tensorflow.keras.losses import Loss @@ -6,6 +6,7 @@ from itwinai.tensorflow.trainer import TensorflowTrainer from itwinai.loggers import Logger +from itwinai.components import monitor_exec class MNISTTrainer(TensorflowTrainer): @@ -19,29 +20,21 @@ def __init__( strategy: Optional[MirroredStrategy] = None, logger: Optional[List[Logger]] = None ): - # Configurable - self.logger = logger if logger is not None else [] - compile_conf = dict(loss=loss, optimizer=optimizer) - print(f'STRATEGY: {strategy}') super().__init__( epochs=epochs, batch_size=batch_size, callbacks=[], model_dict=model, - compile_conf=compile_conf, + compile_conf=dict(loss=loss, optimizer=optimizer), strategy=strategy ) + self.save_parameters(**locals()) + print(f'STRATEGY: {strategy}') + self.logger = logger if logger is not None else [] - def train(self, train_dataset, validation_dataset) -> Any: - return super().train(train_dataset, validation_dataset) - - def execute( - self, - train_dataset, - validation_dataset, - config: Optional[Dict] = None, - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - return (self.train(train_dataset, validation_dataset),), config + @monitor_exec + def execute(self, train_dataset, validation_dataset) -> Any: + return super().execute(train_dataset, validation_dataset) def load_state(self): return super().load_state() diff --git a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index 28ec236d..ce693560 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -1,20 +1,21 @@ -from typing import Optional, Tuple, Dict +from typing import Optional import lightning as L from torchvision.datasets import MNIST from torch.utils.data import DataLoader, random_split from torchvision import transforms -from itwinai.components import DataGetter +from itwinai.components import DataGetter, monitor_exec class LightningMNISTDownloader(DataGetter): def __init__( - self, - data_path: str, - name: Optional[str] = None, - **kwargs) -> None: - super().__init__(name, **kwargs) + self, + data_path: str, + name: Optional[str] = None + ) -> None: + super().__init__(name) + self.save_parameters(**locals()) self.data_path = data_path self._downloader = MNISTDataModule( data_path=self.data_path, download=True, @@ -22,19 +23,13 @@ def __init__( batch_size=1, train_prop=.5, ) - def load(self): + @monitor_exec + def execute(self) -> None: # Simulate dataset creation to force data download self._downloader.setup(stage='fit') self._downloader.setup(stage='test') self._downloader.setup(stage='predict') - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[None, Optional[Dict]]: - self.load() - return None, config - class MNISTDataModule(L.LightningModule): def __init__( diff --git a/use-cases/mnist/torch-lightning/pipeline.yaml b/use-cases/mnist/torch-lightning/pipeline.yaml index 
33ae0a94..cf754b2f 100644 --- a/use-cases/mnist/torch-lightning/pipeline.yaml +++ b/use-cases/mnist/torch-lightning/pipeline.yaml @@ -1,5 +1,5 @@ -executor: - class_path: itwinai.components.Executor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: steps: - class_path: dataloader.LightningMNISTDownloader diff --git a/use-cases/mnist/torch-lightning/train.py b/use-cases/mnist/torch-lightning/train.py index 09ab30a9..97f53093 100644 --- a/use-cases/mnist/torch-lightning/train.py +++ b/use-cases/mnist/torch-lightning/train.py @@ -15,13 +15,10 @@ import argparse -from itwinai.components import Pipeline -from itwinai.utils import parse_pipe_config -from jsonargparse import ArgumentParser +from itwinai.parser import ConfigParser if __name__ == "__main__": - # Create CLI Parser parser = argparse.ArgumentParser() parser.add_argument( "-p", "--pipeline", type=str, required=True, @@ -36,20 +33,12 @@ ) args = parser.parse_args() - # Create parser for the pipeline (ordered) - pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, "executor") - - # Parse, Instantiate pipe - parsed = parse_pipe_config(args.pipeline, pipe_parser) - pipe = pipe_parser.instantiate_classes(parsed) - executor: Pipeline = getattr(pipe, 'executor') + # Create parser for the pipeline + pipe_parser = ConfigParser(config=args.pipeline) + pipeline = pipe_parser.parse_pipeline() if args.download_only: print('Downloading datasets and exiting...') - executor = executor[:1] - else: - print('Downloading datasets (if not already done) and running...') - executor = executor - executor.setup() - executor() + pipeline = pipeline[:1] + + pipeline.execute() diff --git a/use-cases/mnist/torch-lightning/trainer.py b/use-cases/mnist/torch-lightning/trainer.py index 72454cea..ce546674 100644 --- a/use-cases/mnist/torch-lightning/trainer.py +++ b/use-cases/mnist/torch-lightning/trainer.py @@ -1,7 +1,7 @@ import os -from typing import Union, Dict, Tuple, Optional, Any +from typing import Union, Dict, Any -from itwinai.components import Trainer +from itwinai.components import Trainer, monitor_exec from itwinai.torch.models.mnist import MNISTModel from dataloader import MNISTDataModule from lightning.pytorch.cli import LightningCLI @@ -11,12 +11,14 @@ class LightningMNISTTrainer(Trainer): def __init__(self, config: Union[Dict, str]): super().__init__() + self.save_parameters(**locals()) if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) self.conf = config - def train(self) -> Any: + @monitor_exec + def execute(self) -> Any: cli = LightningCLI( args=self.conf, model_class=MNISTModel, @@ -31,13 +33,6 @@ def train(self) -> Any: ) cli.trainer.fit(cli.model, datamodule=cli.datamodule) - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Any, Optional[Dict]]: - result = self.train() - return result, config - def save_state(self): return super().save_state() diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index 609b3770..56e35acb 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -16,6 +16,7 @@ class MNISTDataModuleTorch(DataGetter): def __init__(self, save_path: str = '.tmp/',) -> None: super().__init__() + self.save_parameters(**locals()) self.save_path = save_path @monitor_exec @@ -106,6 +107,7 @@ def generate_jpg_sample( class MNISTPredictLoader(DataGetter): def __init__(self, test_data_path: str) -> None: super().__init__() + self.save_parameters(**locals()) 
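        # Note on the line above: locals() inside __init__ always contains
        # 'self', and save_parameters is declared as
        # 'def save_parameters(self, **kwargs)' later in this series, so
        # expanding locals() with '**' passes 'self' twice and raises a
        # TypeError. A minimal, hypothetical illustration:
        #
        #     class Step:
        #         def save_parameters(self, **kwargs):
        #             self.parameters = kwargs
        #         def __init__(self, lr: float = 0.01):
        #             self.save_parameters(**locals())
        #             # TypeError: save_parameters() got multiple
        #             # values for argument 'self'
        #
        # The "UPDATE save parameters" patch below resolves this by adding
        # locals2params(), which pops 'self' before the expansion.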
self.test_data_path = test_data_path @monitor_exec diff --git a/use-cases/mnist/torch/saver.py b/use-cases/mnist/torch/saver.py index ad4ff9ab..7e79e90f 100644 --- a/use-cases/mnist/torch/saver.py +++ b/use-cases/mnist/torch/saver.py @@ -20,6 +20,7 @@ def __init__( class_labels: Optional[List] = None ) -> None: super().__init__() + self.save_parameters(**locals()) self.save_dir = save_dir self.predictions_file = predictions_file self.class_labels = ( diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py index 182d1c29..97f53093 100644 --- a/use-cases/mnist/torch/train.py +++ b/use-cases/mnist/torch/train.py @@ -15,13 +15,10 @@ import argparse -from itwinai.pipeline import Pipeline -from itwinai.utils import parse_pipe_config -from jsonargparse import ArgumentParser +from itwinai.parser import ConfigParser if __name__ == "__main__": - # Create CLI Parser parser = argparse.ArgumentParser() parser.add_argument( "-p", "--pipeline", type=str, required=True, @@ -36,19 +33,12 @@ ) args = parser.parse_args() - # Create parser for the pipeline (ordered) - pipe_parser = ArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, "pipeline") - - # Parse, Instantiate pipe - parsed = parse_pipe_config(args.pipeline, pipe_parser) - pipe = pipe_parser.instantiate_classes(parsed) - executor: Pipeline = getattr(pipe, 'pipeline') + # Create parser for the pipeline + pipe_parser = ConfigParser(config=args.pipeline) + pipeline = pipe_parser.parse_pipeline() if args.download_only: print('Downloading datasets and exiting...') - executor = executor[:1] - else: - print('Downloading datasets (if not already done) and running...') - executor = executor - executor.execute() + pipeline = pipeline[:1] + + pipeline.execute() From 390a9119c0e52a4547b52ee3088958beb9f5f901 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 21:27:11 +0100 Subject: [PATCH 24/26] UPDATE save parameters --- src/itwinai/components.py | 62 ++++++++++++++++--- src/itwinai/serialization.py | 17 +++++ src/itwinai/tensorflow/trainer.py | 6 +- src/itwinai/tests/dummy_components.py | 6 ++ src/itwinai/torch/inference.py | 2 +- src/itwinai/torch/trainer.py | 2 +- use-cases/3dgan/dataloader.py | 2 +- use-cases/3dgan/saver.py | 2 +- use-cases/3dgan/trainer.py | 4 +- use-cases/cyclones/dataloader.py | 1 + use-cases/cyclones/trainer.py | 13 +--- use-cases/mnist/tensorflow/dataloader.py | 4 +- use-cases/mnist/tensorflow/trainer.py | 2 +- use-cases/mnist/torch-lightning/dataloader.py | 2 +- use-cases/mnist/torch-lightning/trainer.py | 2 +- use-cases/mnist/torch/dataloader.py | 4 +- use-cases/mnist/torch/saver.py | 2 +- 17 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 053e97a1..16fb074c 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -134,7 +134,7 @@ class BaseComponent(ABC, Serializable): name (Optional[str], optional): unique identifier for a step. Defaults to None. 
""" - _name: str = 'unnamed' + _name: str = None parameters: Dict[Any, Any] = None def __init__( @@ -144,6 +144,7 @@ def __init__( # debug: bool = False, ) -> None: self.save_parameters(name=name) + self.name = name @property def name(self) -> str: @@ -329,6 +330,8 @@ class Adapter(BaseComponent): def __init__(self, policy: List[Any], name: Optional[str] = None) -> None: super().__init__(name=name) self.save_parameters(policy=policy, name=name) + self.name = name + self.policy = policy @monitor_exec def execute(self, *args) -> Tuple: @@ -370,15 +373,15 @@ def execute(self, *args) -> Tuple: class DataSplitter(BaseComponent): """Splits a dataset into train, validation, and test splits.""" - train_proportion: float - validation_proportion: float - test_proportion: float + _train_proportion: Union[int, float] + _validation_proportion: Union[int, float] + _test_proportion: Union[int, float] def __init__( self, - train_proportion: float, - validation_proportion: float, - test_proportion: float, + train_proportion: Union[int, float], + validation_proportion: Union[int, float], + test_proportion: Union[int, float], name: Optional[str] = None ) -> None: super().__init__(name) @@ -388,6 +391,51 @@ def __init__( test_proportion=test_proportion, name=name ) + self.train_proportion = train_proportion + self.validation_proportion = validation_proportion + self.test_proportion = test_proportion + + @property + def train_proportion(self) -> Union[int, float]: + """Training set proportion.""" + return self._train_proportion + + @train_proportion.setter + def train_proportion(self, prop: Union[int, float]) -> None: + if isinstance(prop, float) and not 0.0 <= prop <= 1.0: + raise ValueError( + "Train proportion should be in the interval [0.0, 1.0] " + f"if given as float. Received {prop}" + ) + self._train_proportion = prop + + @property + def validation_proportion(self) -> Union[int, float]: + """Validation set proportion.""" + return self._validation_proportion + + @validation_proportion.setter + def validation_proportion(self, prop: Union[int, float]) -> None: + if isinstance(prop, float) and not 0.0 <= prop <= 1.0: + raise ValueError( + "Validation proportion should be in the interval [0.0, 1.0] " + f"if given as float. Received {prop}" + ) + self._validation_proportion = prop + + @property + def test_proportion(self) -> Union[int, float]: + """Test set proportion.""" + return self._ttest_proportion + + @test_proportion.setter + def test_proportion(self, prop: Union[int, float]) -> None: + if isinstance(prop, float) and not 0.0 <= prop <= 1.0: + raise ValueError( + "Test proportion should be in the interval [0.0, 1.0] " + f"if given as float. Received {prop}" + ) + self._test_proportion = prop @abstractmethod @monitor_exec diff --git a/src/itwinai/serialization.py b/src/itwinai/serialization.py index 099385f3..9c1c8563 100644 --- a/src/itwinai/serialization.py +++ b/src/itwinai/serialization.py @@ -44,6 +44,23 @@ def save_parameters(self, **kwargs) -> None: # for k, v in kwargs.items(): # self.__setattr__(k, v) + @staticmethod + def locals2params(locals: Dict, pop_self: bool = True) -> Dict: + """Remove ``self`` from the output of ``locals()``. + + Args: + locals (Dict): output of ``locals()`` called in the constructor + of a class. + pop_self (bool, optional): whether to remove ``self``. + Defaults to True. + + Returns: + Dict: cleaned ``locals()``. 
+ """ + if pop_self: + locals.pop('self', None) + return locals + def update_parameters(self, **kwargs) -> None: """Updates stored parameters.""" self.save_parameters(**kwargs) diff --git a/src/itwinai/tensorflow/trainer.py b/src/itwinai/tensorflow/trainer.py index 367ceeff..f1a10214 100644 --- a/src/itwinai/tensorflow/trainer.py +++ b/src/itwinai/tensorflow/trainer.py @@ -38,11 +38,7 @@ def __init__( strategy ): super().__init__() - self.save_parameters( - strategy=strategy, epochs=epochs, batch_size=batch_size, - callbacks=callbacks, model_dict=model_dict, - compile_conf=compile_conf, strategy=strategy - ) + self.save_parameters(**self.locals2params(locals())) self.strategy = strategy self.epochs = epochs self.batch_size = batch_size diff --git a/src/itwinai/tests/dummy_components.py b/src/itwinai/tests/dummy_components.py index 6f28afbe..d2df54de 100644 --- a/src/itwinai/tests/dummy_components.py +++ b/src/itwinai/tests/dummy_components.py @@ -7,6 +7,7 @@ def __init__(self, data_uri: str, name: Optional[str] = None ) -> None: super().__init__(name) self.save_parameters(data_uri=data_uri, name=name) + self.data_uri = data_uri def execute(self): ... @@ -25,6 +26,7 @@ def __init__(self, train_prop: float, name: Optional[str] = None ) -> None: super().__init__(name) self.save_parameters(train_prop=train_prop, name=name) + self.train_prop = train_prop def execute(self): ... @@ -43,6 +45,7 @@ def __init__(self, max_items: int, name: Optional[str] = None ) -> None: super().__init__(name) self.save_parameters(max_items=max_items, name=name) + self.max_items = max_items def execute(self): ... @@ -59,6 +62,8 @@ def __init__(self, lr: float, batch_size: int, name: Optional[str] = None ) -> None: super().__init__(name) self.save_parameters(lr=lr, batch_size=batch_size, name=name) + self.lr = lr + self.batch_size = batch_size def execute(self): ... @@ -76,6 +81,7 @@ class FakeSaver(BaseComponent): def __init__(self, save_path: str, name: Optional[str] = None) -> None: super().__init__(name) self.save_parameters(save_path=save_path, name=name) + self.save_path = save_path def execute(self): ... diff --git a/src/itwinai/torch/inference.py b/src/itwinai/torch/inference.py index ca4ec752..02882f06 100644 --- a/src/itwinai/torch/inference.py +++ b/src/itwinai/torch/inference.py @@ -93,7 +93,7 @@ def __init__( name: str = None ) -> None: super().__init__(model=model, name=name) - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.model = self.model.eval() # self.seed = seed # self.strategy = strategy diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py index e9ee2f2a..31794c49 100644 --- a/src/itwinai/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -205,7 +205,7 @@ def __init__( Makes the model a DDP model. 
""" super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.model = model self.loss = loss self.epochs = epochs diff --git a/use-cases/3dgan/dataloader.py b/use-cases/3dgan/dataloader.py index 5565b3cc..f21e57d9 100644 --- a/use-cases/3dgan/dataloader.py +++ b/use-cases/3dgan/dataloader.py @@ -20,7 +20,7 @@ def __init__( data_url: Optional[str] = None, name: Optional[str] = None, ) -> None: - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) super().__init__(name) self.data_path = data_path self.data_url = data_url diff --git a/use-cases/3dgan/saver.py b/use-cases/3dgan/saver.py index d1bf0a9f..fd9bd710 100644 --- a/use-cases/3dgan/saver.py +++ b/use-cases/3dgan/saver.py @@ -17,7 +17,7 @@ def __init__( self, save_dir: str = '3dgan-generated' ) -> None: - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) super().__init__() self.save_dir = save_dir diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index 5ac9e7c3..c9cf47f7 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -19,7 +19,7 @@ class Lightning3DGANTrainer(Trainer): def __init__(self, config: Union[Dict, str]): - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) super().__init__() if isinstance(config, str) and os.path.isfile(config): # Load from YAML @@ -88,7 +88,7 @@ def __init__( config: Union[Dict, str], name: Optional[str] = None ): - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) super().__init__(model, name) if isinstance(config, str) and os.path.isfile(config): # Load from YAML diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 8c837822..423eab77 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -43,6 +43,7 @@ def __init__( data_path: str = "tmp_data" ): super().__init__() + self.save_parameters(**self.locals2params(locals())) self.batch_size = batch_size self.split_ratio = split_ratio self.epochs = epochs diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py index 8760e4bc..e6ed9506 100644 --- a/use-cases/cyclones/trainer.py +++ b/use-cases/cyclones/trainer.py @@ -29,6 +29,7 @@ def __init__( cores: int = None, ): super().__init__() + self.save_parameters(**self.locals2params(locals())) # Configurable self.cores = cores self.model_backup = model_backup @@ -43,7 +44,7 @@ def __init__( # Optimizers, Losses self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate) - def train(self, train_data, validation_data): + def execute(self, train_data, validation_data): train_dataset, n_train = train_data valid_dataset, n_valid = validation_data @@ -103,16 +104,6 @@ def train(self, train_data, validation_data): model.save(self.last_model_name) logging.debug("Saved training history") - def execute( - self, - train_dataset, - validation_dataset, - config: Optional[Dict] = None, - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - config = self.setup_config(config) - train_result = self.train(train_dataset, validation_dataset) - return (train_result,), config - def setup_config(self, config: Optional[Dict] = None) -> Dict: config = config if config is not None else {} self.experiment_dir = config["experiment_dir"] diff --git a/use-cases/mnist/tensorflow/dataloader.py b/use-cases/mnist/tensorflow/dataloader.py index af42d957..cc95153e 100644 --- a/use-cases/mnist/tensorflow/dataloader.py +++ 
b/use-cases/mnist/tensorflow/dataloader.py @@ -8,7 +8,7 @@ class MNISTDataGetter(DataGetter): def __init__(self): super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) @monitor_exec def execute(self) -> Tuple: @@ -19,7 +19,7 @@ def execute(self) -> Tuple: class MNISTDataPreproc(DataPreproc): def __init__(self, classes: int): super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.classes = classes @monitor_exec diff --git a/use-cases/mnist/tensorflow/trainer.py b/use-cases/mnist/tensorflow/trainer.py index 4b653c04..17ef19a5 100644 --- a/use-cases/mnist/tensorflow/trainer.py +++ b/use-cases/mnist/tensorflow/trainer.py @@ -28,7 +28,7 @@ def __init__( compile_conf=dict(loss=loss, optimizer=optimizer), strategy=strategy ) - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) print(f'STRATEGY: {strategy}') self.logger = logger if logger is not None else [] diff --git a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index ce693560..1f062fe5 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -15,7 +15,7 @@ def __init__( name: Optional[str] = None ) -> None: super().__init__(name) - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.data_path = data_path self._downloader = MNISTDataModule( data_path=self.data_path, download=True, diff --git a/use-cases/mnist/torch-lightning/trainer.py b/use-cases/mnist/torch-lightning/trainer.py index ce546674..128cf5c6 100644 --- a/use-cases/mnist/torch-lightning/trainer.py +++ b/use-cases/mnist/torch-lightning/trainer.py @@ -11,7 +11,7 @@ class LightningMNISTTrainer(Trainer): def __init__(self, config: Union[Dict, str]): super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index 56e35acb..e4243763 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -16,7 +16,7 @@ class MNISTDataModuleTorch(DataGetter): def __init__(self, save_path: str = '.tmp/',) -> None: super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.save_path = save_path @monitor_exec @@ -107,7 +107,7 @@ def generate_jpg_sample( class MNISTPredictLoader(DataGetter): def __init__(self, test_data_path: str) -> None: super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.test_data_path = test_data_path @monitor_exec diff --git a/use-cases/mnist/torch/saver.py b/use-cases/mnist/torch/saver.py index 7e79e90f..e1ce56ac 100644 --- a/use-cases/mnist/torch/saver.py +++ b/use-cases/mnist/torch/saver.py @@ -20,7 +20,7 @@ def __init__( class_labels: Optional[List] = None ) -> None: super().__init__() - self.save_parameters(**locals()) + self.save_parameters(**self.locals2params(locals())) self.save_dir = save_dir self.predictions_file = predictions_file self.class_labels = ( From fe55006807ae2a007f8a9c43074c9668dd4966c2 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Tue, 12 Dec 2023 21:41:16 +0100 Subject: [PATCH 25/26] FIX linter --- src/itwinai/components.py | 2 +- src/itwinai/parser.py | 475 
+++++++++++++++++----------------- src/itwinai/tests/__init__.py | 12 +- 3 files changed, 249 insertions(+), 240 deletions(-) diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 16fb074c..12c775f2 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -449,6 +449,6 @@ def execute( dataset (MLDataset): input dataset. Returns: - Tuple[MLDataset, MLDataset, MLDataset]: tuple of + Tuple[MLDataset, MLDataset, MLDataset]: tuple of train, validation and test splits. """ diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index b627bce7..8e393652 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -5,7 +5,6 @@ import logging import os -import sys from typing import Dict, Any, List, Type, Union, Optional from jsonargparse import ArgumentParser as JAPArgumentParser from jsonargparse import ActionConfigFile @@ -16,7 +15,7 @@ from .components import BaseComponent from .pipeline import Pipeline -from .utils import load_yaml, dynamically_import_class +from .utils import load_yaml def add_replace_field( @@ -248,239 +247,239 @@ def __init__( ) -class ConfigParser2: - """ - Deprecated: this pipeline structure does not allow for - nested pipelines. However, it is more readable and the linking - from name to step data could be achieved with OmegaConf. This - could be reused in the future: left as example. - - Parses a configuration file, merging the steps into - the pipeline and returning a pipeline object. - It also provides functionalities for dynamic override - of fields by means of nested key notation. - - Example: - - >>> # pipeline.yaml - >>> pipeline: - >>> class_path: itwinai.pipeline.Pipeline - >>> steps: [server, client] - >>> - >>> server: - >>> class_path: mycode.ServerOptions - >>> init_args: - >>> host: localhost - >>> port: 80 - >>> - >>> client: - >>> class_path: mycode.ClientOptions - >>> init_args: - >>> url: http://${server.init_args.host}:${server.init_args.port}/ - - >>> from itwinai.parser import ConfigParser2 - >>> - >>> parser = ConfigParser2( - >>> config='pipeline.yaml', - >>> override_keys={ - >>> 'server.init_args.port': 777 - >>> } - >>> ) - >>> pipeline = parser.parse_pipeline() - >>> print(pipeline) - >>> print(pipeline.steps) - >>> print(pipeline.steps['server'].port) - >>> - >>> server = parser.parse_step('server') - >>> print(server) - >>> print(server.port) - """ - - config: Dict - pipeline: Pipeline - - def __init__( - self, - config: Union[str, Dict], - override_keys: Optional[Dict[str, Any]] = None - ) -> None: - self.config = config - self.override_keys = override_keys - if isinstance(self.config, str): - self.config = load_yaml(self.config) - self._dynamic_override_keys() - self._omegaconf_interpolate() - - def _dynamic_override_keys(self): - if self.override_keys is not None: - for key_chain, value in self.override_keys.items(): - add_replace_field(self.config, key_chain, value) - - def _omegaconf_interpolate(self) -> None: - """Performs variable interpolation with OmegaConf on internal - configuration file. - """ - conf = OmegaConf.create(self.config) - self.config = OmegaConf.to_container(conf, resolve=True) - - def parse_pipeline( - self, - pipeline_nested_key: str = "pipeline", - verbose: bool = False - ) -> Pipeline: - """Merges steps into pipeline and parses it. - - Args: - pipeline_nested_key (str, optional): nested key in the - configuration file identifying the pipeline object. - Defaults to "pipeline". - verbose (bool): if True, prints the assembled pipeline - to console formatted as JSON. 
- - Returns: - Pipeline: instantiated pipeline. - """ - pipe_parser = JAPArgumentParser() - pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) - pipe_dict = self.config[pipeline_nested_key] - - # Pop steps list from pipeline dictionary - steps_list = pipe_dict['steps'] - del pipe_dict['steps'] - - # Link steps with respective dictionaries - if not pipe_dict.get('init_args'): - pipe_dict['init_args'] = {} - steps_dict = pipe_dict['init_args']['steps'] = {} - for step_name in steps_list: - steps_dict[step_name] = self.config[step_name] - pipe_dict = {pipeline_nested_key: pipe_dict} - - if verbose: - print("Assembled pipeline:") - print(json.dumps(pipe_dict, indent=4)) - - # Parse pipeline dict once merged with steps - conf = pipe_parser.parse_object(pipe_dict) - pipe = pipe_parser.instantiate_classes(conf) - self.pipeline = pipe[pipeline_nested_key] - return self.pipeline - - def parse_step( - self, - step_name: str, - verbose: bool = False - ) -> BaseComponent: - step_dict_config = self.config[step_name] - - if verbose: - print(f"STEP '{step_name}' CONFIG:") - print(json.dumps(step_dict_config, indent=4)) - - # Wrap config under "step" field and parse it - step_dict_config = {'step': step_dict_config} - step_parser = JAPArgumentParser() - step_parser.add_subclass_arguments(BaseComponent, "step") - parsed_namespace = step_parser.parse_object(step_dict_config) - return step_parser.instantiate_classes(parsed_namespace)["step"] - - -class ItwinaiCLI2: - """ - Deprecated: the dynamic override does not work with nested parameters - and may be confusing. - - CLI tool for executing a configuration file, with dynamic - override of fields and variable interpolation with Omegaconf. - - Example: - - >>> # train.py - >>> from itwinai.parser import ItwinaiCLI - >>> cli = ItwinaiCLI() - >>> cli.pipeline.execute() - - >>> # pipeline.yaml - >>> pipeline: - >>> class_path: itwinai.pipeline.Pipeline - >>> steps: [server, client] - >>> - >>> server: - >>> class_path: mycode.ServerOptions - >>> init_args: - >>> host: localhost - >>> port: 80 - >>> - >>> client: - >>> class_path: mycode.ClientOptions - >>> init_args: - >>> url: http://${server.init_args.host}:${server.init_args.port}/ - - From command line: - - >>> python train.py --config itwinai-conf.yaml --help - >>> python train.py --config itwinai-conf.yaml - >>> python train.py --config itwinai-conf.yaml --server.port 8080 - """ - _parser: JAPArgumentParser - _config: Dict - pipeline: Pipeline - - def __init__( - self, - pipeline_nested_key: str = "pipeline", - parser_mode: str = "omegaconf" - ) -> None: - self.pipeline_nested_key = pipeline_nested_key - self.parser_mode = parser_mode - self._init_parser() - self._parser.add_argument(f"--{self.pipeline_nested_key}", type=dict) - self._add_steps_arguments() - self._config = self._parser.parse_args() - - # Merge steps into pipeline and parse it - del self._config['config'] - pipe_parser = ConfigParser2(config=self._config.as_dict()) - self.pipeline = pipe_parser.parse_pipeline( - pipeline_nested_key=self.pipeline_nested_key - ) - - def _init_parser(self): - self._parser = JAPArgumentParser(parser_mode=self.parser_mode) - self._parser.add_argument( - "-c", "--config", action=ActionConfigFile, - required=True, - help="Path to a configuration file in json or yaml format." - ) - - def _add_steps_arguments(self): - """Pre-parses the configuration file, dynamically adding all the - component classes under 'steps' as arguments of the parser. 
- """ - if "--config" not in sys.argv: - raise ValueError( - "--config parameter has to be specified with a " - "valid path to a configuration file." - ) - config_path = sys.argv.index("--config") + 1 - config_path = sys.argv[config_path] - config = load_yaml(config_path) - - # Add steps to parser - steps = filter( - lambda itm: itm[0] != self.pipeline_nested_key, - config.items() - ) - steps = { - step_name: step_data['class_path'] - for step_name, step_data in steps - } - - for st_nested_key, step_class_str in steps.items(): - step_class = dynamically_import_class(step_class_str) - self._add_step_arguments( - step_class=step_class, nested_key=st_nested_key) - - def _add_step_arguments(self, step_class, nested_key): - self._parser.add_subclass_arguments( - baseclass=step_class, nested_key=nested_key) +# class ConfigParser2: +# """ +# Deprecated: this pipeline structure does not allow for +# nested pipelines. However, it is more readable and the linking +# from name to step data could be achieved with OmegaConf. This +# could be reused in the future: left as example. + +# Parses a configuration file, merging the steps into +# the pipeline and returning a pipeline object. +# It also provides functionalities for dynamic override +# of fields by means of nested key notation. + +# Example: + +# >>> # pipeline.yaml +# >>> pipeline: +# >>> class_path: itwinai.pipeline.Pipeline +# >>> steps: [server, client] +# >>> +# >>> server: +# >>> class_path: mycode.ServerOptions +# >>> init_args: +# >>> host: localhost +# >>> port: 80 +# >>> +# >>> client: +# >>> class_path: mycode.ClientOptions +# >>> init_args: +# >>> url: http://${server.init_args.host}:${server.init_args.port}/ + +# >>> from itwinai.parser import ConfigParser2 +# >>> +# >>> parser = ConfigParser2( +# >>> config='pipeline.yaml', +# >>> override_keys={ +# >>> 'server.init_args.port': 777 +# >>> } +# >>> ) +# >>> pipeline = parser.parse_pipeline() +# >>> print(pipeline) +# >>> print(pipeline.steps) +# >>> print(pipeline.steps['server'].port) +# >>> +# >>> server = parser.parse_step('server') +# >>> print(server) +# >>> print(server.port) +# """ + +# config: Dict +# pipeline: Pipeline + +# def __init__( +# self, +# config: Union[str, Dict], +# override_keys: Optional[Dict[str, Any]] = None +# ) -> None: +# self.config = config +# self.override_keys = override_keys +# if isinstance(self.config, str): +# self.config = load_yaml(self.config) +# self._dynamic_override_keys() +# self._omegaconf_interpolate() + +# def _dynamic_override_keys(self): +# if self.override_keys is not None: +# for key_chain, value in self.override_keys.items(): +# add_replace_field(self.config, key_chain, value) + +# def _omegaconf_interpolate(self) -> None: +# """Performs variable interpolation with OmegaConf on internal +# configuration file. +# """ +# conf = OmegaConf.create(self.config) +# self.config = OmegaConf.to_container(conf, resolve=True) + +# def parse_pipeline( +# self, +# pipeline_nested_key: str = "pipeline", +# verbose: bool = False +# ) -> Pipeline: +# """Merges steps into pipeline and parses it. + +# Args: +# pipeline_nested_key (str, optional): nested key in the +# configuration file identifying the pipeline object. +# Defaults to "pipeline". +# verbose (bool): if True, prints the assembled pipeline +# to console formatted as JSON. + +# Returns: +# Pipeline: instantiated pipeline. 
+# """ +# pipe_parser = JAPArgumentParser() +# pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) +# pipe_dict = self.config[pipeline_nested_key] + +# # Pop steps list from pipeline dictionary +# steps_list = pipe_dict['steps'] +# del pipe_dict['steps'] + +# # Link steps with respective dictionaries +# if not pipe_dict.get('init_args'): +# pipe_dict['init_args'] = {} +# steps_dict = pipe_dict['init_args']['steps'] = {} +# for step_name in steps_list: +# steps_dict[step_name] = self.config[step_name] +# pipe_dict = {pipeline_nested_key: pipe_dict} + +# if verbose: +# print("Assembled pipeline:") +# print(json.dumps(pipe_dict, indent=4)) + +# # Parse pipeline dict once merged with steps +# conf = pipe_parser.parse_object(pipe_dict) +# pipe = pipe_parser.instantiate_classes(conf) +# self.pipeline = pipe[pipeline_nested_key] +# return self.pipeline + +# def parse_step( +# self, +# step_name: str, +# verbose: bool = False +# ) -> BaseComponent: +# step_dict_config = self.config[step_name] + +# if verbose: +# print(f"STEP '{step_name}' CONFIG:") +# print(json.dumps(step_dict_config, indent=4)) + +# # Wrap config under "step" field and parse it +# step_dict_config = {'step': step_dict_config} +# step_parser = JAPArgumentParser() +# step_parser.add_subclass_arguments(BaseComponent, "step") +# parsed_namespace = step_parser.parse_object(step_dict_config) +# return step_parser.instantiate_classes(parsed_namespace)["step"] + + +# class ItwinaiCLI2: +# """ +# Deprecated: the dynamic override does not work with nested parameters +# and may be confusing. + +# CLI tool for executing a configuration file, with dynamic +# override of fields and variable interpolation with Omegaconf. + +# Example: + +# >>> # train.py +# >>> from itwinai.parser import ItwinaiCLI +# >>> cli = ItwinaiCLI() +# >>> cli.pipeline.execute() + +# >>> # pipeline.yaml +# >>> pipeline: +# >>> class_path: itwinai.pipeline.Pipeline +# >>> steps: [server, client] +# >>> +# >>> server: +# >>> class_path: mycode.ServerOptions +# >>> init_args: +# >>> host: localhost +# >>> port: 80 +# >>> +# >>> client: +# >>> class_path: mycode.ClientOptions +# >>> init_args: +# >>> url: http://${server.init_args.host}:${server.init_args.port}/ + +# From command line: + +# >>> python train.py --config itwinai-conf.yaml --help +# >>> python train.py --config itwinai-conf.yaml +# >>> python train.py --config itwinai-conf.yaml --server.port 8080 +# """ +# _parser: JAPArgumentParser +# _config: Dict +# pipeline: Pipeline + +# def __init__( +# self, +# pipeline_nested_key: str = "pipeline", +# parser_mode: str = "omegaconf" +# ) -> None: +# self.pipeline_nested_key = pipeline_nested_key +# self.parser_mode = parser_mode +# self._init_parser() +# self._parser.add_argument(f"--{self.pipeline_nested_key}", type=dict) +# self._add_steps_arguments() +# self._config = self._parser.parse_args() + +# # Merge steps into pipeline and parse it +# del self._config['config'] +# pipe_parser = ConfigParser2(config=self._config.as_dict()) +# self.pipeline = pipe_parser.parse_pipeline( +# pipeline_nested_key=self.pipeline_nested_key +# ) + +# def _init_parser(self): +# self._parser = JAPArgumentParser(parser_mode=self.parser_mode) +# self._parser.add_argument( +# "-c", "--config", action=ActionConfigFile, +# required=True, +# help="Path to a configuration file in json or yaml format." +# ) + +# def _add_steps_arguments(self): +# """Pre-parses the configuration file, dynamically adding all the +# component classes under 'steps' as arguments of the parser. 
+# """ +# if "--config" not in sys.argv: +# raise ValueError( +# "--config parameter has to be specified with a " +# "valid path to a configuration file." +# ) +# config_path = sys.argv.index("--config") + 1 +# config_path = sys.argv[config_path] +# config = load_yaml(config_path) + +# # Add steps to parser +# steps = filter( +# lambda itm: itm[0] != self.pipeline_nested_key, +# config.items() +# ) +# steps = { +# step_name: step_data['class_path'] +# for step_name, step_data in steps +# } + +# for st_nested_key, step_class_str in steps.items(): +# step_class = dynamically_import_class(step_class_str) +# self._add_step_arguments( +# step_class=step_class, nested_key=st_nested_key) + +# def _add_step_arguments(self, step_class, nested_key): +# self._parser.add_subclass_arguments( +# baseclass=step_class, nested_key=nested_key) diff --git a/src/itwinai/tests/__init__.py b/src/itwinai/tests/__init__.py index 9eaae9a1..5486fb7a 100644 --- a/src/itwinai/tests/__init__.py +++ b/src/itwinai/tests/__init__.py @@ -1 +1,11 @@ -from .dummy_components import * +from .dummy_components import ( + FakeGetter, FakeGetterExec, FakePreproc, FakePreprocExec, + FakeSaver, FakeSaverExec, FakeSplitter, FakeSplitterExec, + FakeTrainer, FakeTrainerExec +) + +_ = ( + FakeGetter, FakeGetterExec, FakePreproc, FakePreprocExec, + FakeSaver, FakeSaverExec, FakeSplitter, FakeSplitterExec, + FakeTrainer, FakeTrainerExec +) From 489b697ac87985c1b2076cb83bce3ef93e722038 Mon Sep 17 00:00:00 2001 From: Matteo Bunino Date: Wed, 13 Dec 2023 11:52:28 +0100 Subject: [PATCH 26/26] FIX cyclones use case workflow --- src/itwinai/components.py | 2 +- src/itwinai/tests/dummy_components.py | 8 +- .../tutorial_2_advanced_workflow.py | 16 +-- use-cases/cyclones/.gitignore | 2 + use-cases/cyclones/dataloader.py | 42 +++----- use-cases/cyclones/executor.py | 75 -------------- use-cases/cyclones/pipeline.yaml | 15 ++- use-cases/cyclones/train.py | 97 +++++++++++++++---- use-cases/cyclones/trainer.py | 27 +++--- 9 files changed, 135 insertions(+), 149 deletions(-) create mode 100644 use-cases/cyclones/.gitignore delete mode 100644 use-cases/cyclones/executor.py diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 12c775f2..49b965b2 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -426,7 +426,7 @@ def validation_proportion(self, prop: Union[int, float]) -> None: @property def test_proportion(self) -> Union[int, float]: """Test set proportion.""" - return self._ttest_proportion + return self._test_proportion @test_proportion.setter def test_proportion(self, prop: Union[int, float]) -> None: diff --git a/src/itwinai/tests/dummy_components.py b/src/itwinai/tests/dummy_components.py index d2df54de..b60f1df0 100644 --- a/src/itwinai/tests/dummy_components.py +++ b/src/itwinai/tests/dummy_components.py @@ -58,8 +58,12 @@ def execute(self, train_dataset, val_dataset, test_dataset): class FakeTrainer(BaseComponent): - def __init__(self, lr: float, batch_size: int, name: Optional[str] = None - ) -> None: + def __init__( + self, + lr: float, + batch_size: int, + name: Optional[str] = None + ) -> None: super().__init__(name) self.save_parameters(lr=lr, batch_size=batch_size, name=name) self.lr = lr diff --git a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py index 97bff880..6c437fb2 100644 --- a/tutorials/ml-workflows/tutorial_2_advanced_workflow.py +++ b/tutorials/ml-workflows/tutorial_2_advanced_workflow.py @@ -34,9 +34,9 @@ ) -class 
MyPredictor(Predictor): +class MyEnsemblePredictor(Predictor): @monitor_exec - def execute(self, dataset, model) -> Any: + def execute(self, dataset, model_ensemble) -> Any: # do some predictions with model on dataset... return dataset @@ -70,15 +70,17 @@ def execute(self, dataset, model) -> Any: validation_proportion=args.val_prop, test_proportion=1-args.train_prop-args.val_prop ) - trainer = MyTrainer(lr=args.lr) + trainer1 = MyTrainer(lr=args.lr) + trainer2 = MyTrainer(lr=args.lr) saver = MySaver() - predictor = MyPredictor(model=None) + predictor = MyEnsemblePredictor(model=None) # Define ML workflow dataset = getter.execute() train_spl, val_spl, test_spl = splitter.execute(dataset) - _, _, _, trained_model = trainer.execute(train_spl, val_spl, test_spl) - _ = saver.execute(trained_model) - predictions = predictor.execute(test_spl, trained_model) + _, _, _, trained_model1 = trainer1.execute(train_spl, val_spl, test_spl) + _, _, _, trained_model2 = trainer2.execute(train_spl, val_spl, test_spl) + _ = saver.execute(trained_model1) + predictions = predictor.execute(test_spl, [trained_model1, trained_model2]) print() print("Predictions: " + str(predictions)) diff --git a/use-cases/cyclones/.gitignore b/use-cases/cyclones/.gitignore new file mode 100644 index 00000000..255b69f5 --- /dev/null +++ b/use-cases/cyclones/.gitignore @@ -0,0 +1,2 @@ +data +experiments \ No newline at end of file diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 423eab77..ee19b805 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -1,8 +1,7 @@ -import logging from os import listdir from os.path import join, exists -from itwinai.components import DataGetter -from typing import List, Dict, Optional, Tuple +from itwinai.components import DataGetter, monitor_exec +from typing import List, Dict from lib.macros import ( PatchType, LabelNoCyclone, @@ -29,6 +28,7 @@ class TensorflowDataGetter(DataGetter): def __init__( self, + data_url: str, patch_type: PatchType, shuffle: bool, split_ratio: List[float], @@ -38,12 +38,14 @@ def __init__( target_scale: bool, label_no_cyclone: LabelNoCyclone, aug_type: AugmentationType, - experiment: dict, + experiment: Dict, + global_config: Dict, shuffle_buffer: int = None, data_path: str = "tmp_data" ): super().__init__() self.save_parameters(**self.locals2params(locals())) + self.data_url = data_url self.batch_size = batch_size self.split_ratio = split_ratio self.epochs = epochs @@ -53,6 +55,7 @@ def __init__( self.aug_type = aug_type.value self.patch_type = patch_type.value self.augment = augment + self.global_config = global_config self.shuffle = shuffle self.data_path = data_path self.drv_vars, self.coo_vars = ( @@ -88,6 +91,9 @@ def __init__( else: self.aug_fns = {} + # Parse global config + self.setup_config(self.global_config) + def split_files(self, files, ratio): n = len(files) return ( @@ -95,7 +101,8 @@ def split_files(self, files, ratio): files[int(ratio[0] * n): int((ratio[0] + ratio[1]) * n)], ) - def load(self): + @monitor_exec + def execute(self): # divide into train, valid and test dataset files train_c_fs, valid_c_fs = self.split_files( files=self.cyclone_files, ratio=self.split_ratio @@ -161,30 +168,16 @@ def load(self): patch_type=self.patch_type, aug_type=self.aug_type, ) - return train_dataset, valid_dataset + return train_dataset, valid_dataset, self.channels - def execute( - self, - config: Optional[Dict] = None - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - config = 
self.setup_config(config) - train, test = self.load() - logging.debug("Train, valid and test datasets loaded.") - return (train, test), config - - def setup_config(self, config: Optional[Dict] = None) -> Dict: - config = config if config is not None else {} + def setup_config(self, config: Dict) -> None: self.shape = config["shape"] root_dir = config["root_dir"] # Download data - url = ( - "https://drive.google.com/drive/folders/" - "15DEq33MmtRvIpe2bNCg44lnfvEiHcPaf" - ) if not exists(join(root_dir, self.data_path)): gdown.download_folder( - url=url, quiet=False, + url=self.data_url, quiet=False, output=join(root_dir, self.data_path) ) @@ -229,8 +222,3 @@ def setup_config(self, config: Optional[Dict] = None) -> Dict: PatchType.RANDOM.value) ] ) - - config["epochs"] = self.epochs - config["batch_size"] = self.batch_size - config["channels"] = self.channels - return config diff --git a/use-cases/cyclones/executor.py b/use-cases/cyclones/executor.py deleted file mode 100644 index 67946615..00000000 --- a/use-cases/cyclones/executor.py +++ /dev/null @@ -1,75 +0,0 @@ -import logging -from os.path import join -from os import makedirs -from datetime import datetime -from typing import Tuple, Dict, Optional, Iterable - -from lib.macros import PATCH_SIZE as patch_size, SHAPE as shape -from itwinai.components import Pipeline, BaseComponent - - -class CycloneExecutor(Pipeline): - def __init__( - self, - run_name: str, - steps: Iterable[BaseComponent], - name: Optional[str] = None - ): - super().__init__(steps=steps, name=name) - self.run_name = run_name - - def execute( - self, - root_dir, - config: Optional[Dict] = None, - ) -> Tuple[Optional[Tuple], Optional[Dict]]: - self.root_dir = root_dir - print(f" Data will be stored at: {self.root_dir}") - config = self.setup_config(config) - super().execute(config=config) - - def setup_config(self, config: Optional[Dict] = None) -> Dict: - config = config if config is not None else {} - - # Paths, Folders - FORMATTED_DATETIME = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) - MODEL_BACKUP_DIR = join(self.root_dir, "models/") - EXPERIMENTS_DIR = join(self.root_dir, "experiments") - RUN_DIR = join(EXPERIMENTS_DIR, self.run_name + - "_" + FORMATTED_DATETIME) - SCALER_DIR = join(RUN_DIR, "scalers") - TENSORBOARD_DIR = join(RUN_DIR, "tensorboard") - CHECKPOINTS_DIR = join(RUN_DIR, "checkpoints") - - # Files - LOG_FILE = join(RUN_DIR, "run.log") - - # Create folders - makedirs(EXPERIMENTS_DIR, exist_ok=True) - makedirs(RUN_DIR, exist_ok=True) - makedirs(SCALER_DIR, exist_ok=True) - makedirs(TENSORBOARD_DIR, exist_ok=True) - makedirs(CHECKPOINTS_DIR, exist_ok=True) - - config = { - "root_dir": self.root_dir, - "experiment_dir": EXPERIMENTS_DIR, - "run_dir": RUN_DIR, - "scaler_dir": SCALER_DIR, - "tensorboard_dir": TENSORBOARD_DIR, - "checkpoints_dir": CHECKPOINTS_DIR, - "backup_dir": MODEL_BACKUP_DIR, - "log_file": LOG_FILE, - "shape": shape, - "patch_size": patch_size, - } - self.args = config - - # initialize logger - logging.basicConfig( - format="[%(asctime)s] %(levelname)s : %(message)s", - level=logging.DEBUG, - filename=LOG_FILE, - datefmt="%Y-%m-%d %H:%M:%S", - ) - return config diff --git a/use-cases/cyclones/pipeline.yaml b/use-cases/cyclones/pipeline.yaml index de52df9b..97cfc083 100644 --- a/use-cases/cyclones/pipeline.yaml +++ b/use-cases/cyclones/pipeline.yaml @@ -1,10 +1,11 @@ -executor: - class_path: executor.CycloneExecutor +pipeline: + class_path: itwinai.pipeline.Pipeline init_args: - run_name: 'default' steps: - - class_path: 
dataloader.TensorflowDataGetter + download-step: + class_path: dataloader.TensorflowDataGetter init_args: + data_url: https://drive.google.com/drive/folders/15DEq33MmtRvIpe2bNCg44lnfvEiHcPaf patch_type: NEAREST shuffle: False split_ratio: [0.75, 0.25] @@ -19,8 +20,12 @@ executor: 'COO_VARS_1': ['patch_cyclone'], 'MSK_VAR_1': None } - - class_path: trainer.TensorflowTrainer + + training-step: + class_path: trainer.TensorflowTrainer init_args: + epochs: ${pipeline.init_args.steps.download-step.init_args.epochs} + batch_size: ${pipeline.init_args.steps.download-step.init_args.batch_size} network: VGG_V1 activation: LINEAR regularization_strength: NONE diff --git a/use-cases/cyclones/train.py b/use-cases/cyclones/train.py index a3ab63dd..0146dddf 100644 --- a/use-cases/cyclones/train.py +++ b/use-cases/cyclones/train.py @@ -11,22 +11,76 @@ """ +from typing import Dict import argparse +import logging +from os.path import join +from os import makedirs +from datetime import datetime -from itwinai.components import Pipeline -from itwinai.utils import parse_pipe_config -from jsonargparse import ArgumentParser -from executor import CycloneExecutor +from itwinai.parser import ConfigParser, ArgumentParser + +from lib.macros import PATCH_SIZE, SHAPE + + +def setup_config(args) -> Dict: + config = {} + + # Paths, Folders + FORMATTED_DATETIME = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + MODEL_BACKUP_DIR = join(args.root_dir, "models/") + EXPERIMENTS_DIR = join(args.root_dir, "experiments") + RUN_DIR = join(EXPERIMENTS_DIR, args.run_name + + "_" + FORMATTED_DATETIME) + SCALER_DIR = join(RUN_DIR, "scalers") + TENSORBOARD_DIR = join(RUN_DIR, "tensorboard") + CHECKPOINTS_DIR = join(RUN_DIR, "checkpoints") + + # Files + LOG_FILE = join(RUN_DIR, "run.log") + + # Create folders + makedirs(EXPERIMENTS_DIR, exist_ok=True) + makedirs(RUN_DIR, exist_ok=True) + makedirs(SCALER_DIR, exist_ok=True) + makedirs(TENSORBOARD_DIR, exist_ok=True) + makedirs(CHECKPOINTS_DIR, exist_ok=True) + + config = { + "root_dir": args.root_dir, + "experiment_dir": EXPERIMENTS_DIR, + "run_dir": RUN_DIR, + "scaler_dir": SCALER_DIR, + "tensorboard_dir": TENSORBOARD_DIR, + "checkpoints_dir": CHECKPOINTS_DIR, + "backup_dir": MODEL_BACKUP_DIR, + "log_file": LOG_FILE, + "shape": SHAPE, + "patch_size": PATCH_SIZE, + # "epochs": args.epochs, + # "batch_size": args.batch_size + } + + # initialize logger + logging.basicConfig( + format="[%(asctime)s] %(levelname)s : %(message)s", + level=logging.DEBUG, + filename=LOG_FILE, + datefmt="%Y-%m-%d %H:%M:%S", + ) + return config if __name__ == "__main__": - # Create CLI Parser - parser = argparse.ArgumentParser() + parser = ArgumentParser() parser.add_argument( "-p", "--pipeline", type=str, required=True, help='Configuration file to the pipeline to execute.' 
diff --git a/use-cases/cyclones/train.py b/use-cases/cyclones/train.py
index a3ab63dd..0146dddf 100644
--- a/use-cases/cyclones/train.py
+++ b/use-cases/cyclones/train.py
@@ -11,22 +11,76 @@
 """
 
+from typing import Dict
 import argparse
+import logging
+from os.path import join
+from os import makedirs
+from datetime import datetime
 
-from itwinai.components import Pipeline
-from itwinai.utils import parse_pipe_config
-from jsonargparse import ArgumentParser
-from executor import CycloneExecutor
+from itwinai.parser import ConfigParser, ArgumentParser
+
+from lib.macros import PATCH_SIZE, SHAPE
+
+
+def setup_config(args) -> Dict:
+    config = {}
+
+    # Paths, Folders
+    FORMATTED_DATETIME = str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
+    MODEL_BACKUP_DIR = join(args.root_dir, "models/")
+    EXPERIMENTS_DIR = join(args.root_dir, "experiments")
+    RUN_DIR = join(EXPERIMENTS_DIR, args.run_name +
+                   "_" + FORMATTED_DATETIME)
+    SCALER_DIR = join(RUN_DIR, "scalers")
+    TENSORBOARD_DIR = join(RUN_DIR, "tensorboard")
+    CHECKPOINTS_DIR = join(RUN_DIR, "checkpoints")
+
+    # Files
+    LOG_FILE = join(RUN_DIR, "run.log")
+
+    # Create folders
+    makedirs(EXPERIMENTS_DIR, exist_ok=True)
+    makedirs(RUN_DIR, exist_ok=True)
+    makedirs(SCALER_DIR, exist_ok=True)
+    makedirs(TENSORBOARD_DIR, exist_ok=True)
+    makedirs(CHECKPOINTS_DIR, exist_ok=True)
+
+    config = {
+        "root_dir": args.root_dir,
+        "experiment_dir": EXPERIMENTS_DIR,
+        "run_dir": RUN_DIR,
+        "scaler_dir": SCALER_DIR,
+        "tensorboard_dir": TENSORBOARD_DIR,
+        "checkpoints_dir": CHECKPOINTS_DIR,
+        "backup_dir": MODEL_BACKUP_DIR,
+        "log_file": LOG_FILE,
+        "shape": SHAPE,
+        "patch_size": PATCH_SIZE,
+        # "epochs": args.epochs,
+        # "batch_size": args.batch_size
+    }
+
+    # initialize logger
+    logging.basicConfig(
+        format="[%(asctime)s] %(levelname)s : %(message)s",
+        level=logging.DEBUG,
+        filename=LOG_FILE,
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    return config
 
 
 if __name__ == "__main__":
-    # Create CLI Parser
-    parser = argparse.ArgumentParser()
+    parser = ArgumentParser()
     parser.add_argument(
         "-p", "--pipeline", type=str, required=True,
         help='Configuration file to the pipeline to execute.'
     )
     parser.add_argument("-r", "--root_dir", type=str, default='./data')
+    parser.add_argument("-n", "--run_name", default="noname", type=str)
+    parser.add_argument("-e", "--epochs", default=1, type=int)
+    parser.add_argument("-b", "--batch_size", default=32, type=int)
     parser.add_argument(
         '-d', '--download-only',
         action=argparse.BooleanOptionalAction,
@@ -35,21 +89,24 @@
         '(suggested on login nodes of HPC systems)')
     )
     args = parser.parse_args()
+    global_config = setup_config(args)
 
-    # Create parser for the pipeline (ordered)
-    pipe_parser = ArgumentParser()
-    pipe_parser.add_subclass_arguments(Pipeline, "executor")
-
-    # Parse, Instantiate pipe
-    parsed = parse_pipe_config(args.pipeline, pipe_parser)
-    pipe = pipe_parser.instantiate_classes(parsed)
-    executor: CycloneExecutor = getattr(pipe, 'executor')
+    # Create parser for the pipeline
+    downloader_params = "pipeline.init_args.steps.download-step.init_args."
+    trainer_params = "pipeline.init_args.steps.training-step.init_args."
+    pipe_parser = ConfigParser(
+        config=args.pipeline,
+        override_keys={
+            downloader_params + "epochs": args.epochs,
+            downloader_params + "batch_size": args.batch_size,
+            downloader_params + "global_config": global_config,
+            trainer_params + "global_config": global_config
+        }
+    )
+    pipeline = pipe_parser.parse_pipeline()
 
     if args.download_only:
         print('Downloading datasets and exiting...')
-        executor = executor[:1]
-    else:
-        print('Downloading datasets (if not already done) and running...')
-        executor = executor
-    executor.setup()
-    executor(root_dir=args.root_dir)
+        pipeline = pipeline[:1]
+
+    pipeline.execute()
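Note that epochs and batch_size do not appear under download-step in pipeline.yaml: the new train.py injects them (plus the run-specific global_config) through override_keys before the ${...} interpolations in training-step are resolved, so the -e/-b CLI flags take precedence over the YAML. ConfigParser itself is not part of this excerpt; a rough sketch of the override mechanism, assuming an OmegaConf-backed implementation (hypothetical helper, not the itwinai API):

    from typing import Any, Dict

    from omegaconf import OmegaConf


    def load_with_overrides(config_path: str,
                            override_keys: Dict[str, Any]):
        # Load the YAML pipeline definition, then overwrite selected
        # nodes addressed by dotted paths, e.g.
        # "pipeline.init_args.steps.training-step.init_args.epochs".
        cfg = OmegaConf.load(config_path)
        for dotted_key, value in override_keys.items():
            OmegaConf.update(cfg, dotted_key, value)
        return cfg

Applying overrides before instantiation means a single YAML file can serve as a template across runs, with only the run-dependent values supplied at the command line.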
diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py
index e6ed9506..2fb3c1bc 100644
--- a/use-cases/cyclones/trainer.py
+++ b/use-cases/cyclones/trainer.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, Tuple
+from typing import Dict, Any
 import logging
 from os.path import join, exists
 
@@ -6,7 +6,7 @@
 from lib.strategy import get_mirrored_strategy
 from lib.utils import get_network_config, load_model
-from itwinai.components import Trainer
+from itwinai.components import Trainer, monitor_exec
 from lib.callbacks import ProcessBenchmark
 from lib.macros import (
     Network,
@@ -24,13 +24,18 @@ def __init__(
         regularization_strength: RegularizationStrength,
         learning_rate: float,
         loss: Losses,
+        epochs: int,
+        batch_size: int,
+        global_config: Dict[str, Any],
         kernel_size: int = None,
         model_backup: str = None,
         cores: int = None,
     ):
         super().__init__()
         self.save_parameters(**self.locals2params(locals()))
-        # Configurable
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.global_config = global_config
         self.cores = cores
         self.model_backup = model_backup
         self.network = network.value
@@ -44,7 +49,11 @@ def __init__(
         # Optimizers, Losses
         self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
 
-    def execute(self, train_data, validation_data):
+        # Parse global config
+        self.setup_config(self.global_config)
+
+    @monitor_exec
+    def execute(self, train_data, validation_data, channels) -> None:
         train_dataset, n_train = train_data
         valid_dataset, n_valid = validation_data
 
@@ -69,7 +78,7 @@ def execute(self, train_data, validation_data):
             activation=self.activation,
             regularizer=self.regularizer,
             kernel_size=self.kernel_size,
-            channels=self.channels,
+            channels=channels,
         )
         logging.debug("New model created")
         else:
@@ -104,14 +113,10 @@ def execute(self, train_data, validation_data):
             model.save(self.last_model_name)
         logging.debug("Saved training history")
 
-    def setup_config(self, config: Optional[Dict] = None) -> Dict:
-        config = config if config is not None else {}
+    def setup_config(self, config: Dict) -> None:
         self.experiment_dir = config["experiment_dir"]
         self.run_dir = config["run_dir"]
-        self.epochs = config["epochs"]
-        self.batch_size = config["batch_size"]
         self.patch_size = config["patch_size"]
-        self.channels = config["channels"]
 
         # Paths
         CHECKPOINTS_DIR = join(self.run_dir, "checkpoints")
@@ -150,8 +155,6 @@ def setup_config(self, config: Optional[Dict] = None) -> Dict:
         self.best_model_name = join(self.model_backup, "best_model.h5")
         self.last_model_name = join(self.run_dir, "last_model.h5")
 
-        return config
-
     def load_state(self):
         return super().load_state()
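Finally, the trainer's execute is now wrapped by @monitor_exec from itwinai.components, so per-step bookkeeping such as execution timing no longer clutters the training code. As a hedged sketch of what a decorator of this kind can look like (stand-in name; the real implementation may report more than wall-clock time):

    import functools
    import time
    from typing import Any, Callable


    def timed_exec(method: Callable) -> Callable:
        """Report the wall-clock duration of a component's execute()."""
        @functools.wraps(method)
        def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            start = time.perf_counter()
            try:
                return method(self, *args, **kwargs)
            finally:
                elapsed = time.perf_counter() - start
                print(f"{type(self).__name__}.execute took {elapsed:.2f}s")
        return wrapper

Also note the signature change: execute(train_data, validation_data, channels) now receives channels from the dataloader step instead of reading it from the shared config dict, which keeps the dataflow between pipeline steps explicit.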