diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json index b3ca0584..8a003c54 100644 --- a/.github/linters/.jscpd.json +++ b/.github/linters/.jscpd.json @@ -1,6 +1,6 @@ { "threshold": 2.0, "ignore": [ - "**/itwinai/backend/loggers.py" + "**/itwinai/loggers.py" ] } \ No newline at end of file diff --git a/.github/workflows/workflows-dt.yml b/.github/workflows/workflows-dt.yml index 55b0748c..51883411 100644 --- a/.github/workflows/workflows-dt.yml +++ b/.github/workflows/workflows-dt.yml @@ -21,11 +21,15 @@ jobs: shell: bash -l {0} run: make torch-cpu + - name: Make tensorflow env + shell: bash -l {0} + run: make tf-2.13 + - name: Install dev version shell: bash -l {0} run: micromamba run -p ./.venv-pytorch pip install .[dev] - name: Run pytest for workflows shell: bash -l {0} - run: micromamba run -p ./.venv-pytorch pytest -v ./tests/ -m "not distributed" + run: micromamba run -p ./.venv-pytorch pytest -v ./tests/ -m "not slurm" diff --git a/.gitignore b/.gitignore index 57eda978..41349b5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ TODO -data +/data nohup* lightning_logs mlruns diff --git a/README.md b/README.md index a66de09a..55c8484a 100644 --- a/README.md +++ b/README.md @@ -96,11 +96,25 @@ adding the `dev` extra: pip install -e .[dev] ``` -To **run tests** on itwinai package: +#### Test with `pytest` + +To run tests on itwinai package: ```bash # Activate env micromamba activate ./.venv-pytorch # or ./.venv-tf -pytest -v tests/ +pytest -v -m "not slurm" tests/ +``` + +However, some tests are intended to be executed only on an HPC system, +where SLURM is available. They are marked with "slurm" tag. 
To run also +those tests, use the dedicated job script: + +```bash +sbatch tests/slurm_tests_startscript + +# Upon completion, check the output: +cat job.err +cat job.out ``` diff --git a/pyproject.toml b/pyproject.toml index fd5c42f0..7a780b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,5 +70,6 @@ itwinai = "itwinai.cli:app" [tool.pytest.ini_options] markers = [ "integration: integration tests (deselect with '-m \"not integration\"')", - "distributed: test distributed ML on multiple GPUs/nodes (deselect with '-m \"not distributed\"')", + "slurm: needs SLURM and HPC resources (multiple GPUs/nodes). (deselect with '-m \"not slurm\"')", + "functional: functional tests. (deselect with '-m \"not functional\"')", ] diff --git a/src/itwinai/backend/torch/_utils.py b/src/itwinai/backend/torch/_utils.py deleted file mode 100644 index 86154ddf..00000000 --- a/src/itwinai/backend/torch/_utils.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Hashable, Dict - - -def clear_key( - my_dict: Dict, - dict_name: str, - key: Hashable, - complain: bool = True -) -> Dict: - """Remove key from dictionary if present and complain. - - Args: - my_dict (Dict): Dictionary. - dict_name (str): name of the dictionary. - key (Hashable): Key to remove. 
- """ - if key in my_dict: - if complain: - print( - f"Field '{key}' should not be present " - f"in dictionary '{dict_name}'" - ) - del my_dict[key] - return my_dict diff --git a/src/itwinai/backend/utils.py b/src/itwinai/backend/utils.py deleted file mode 100644 index deaab45a..00000000 --- a/src/itwinai/backend/utils.py +++ /dev/null @@ -1,13 +0,0 @@ -import yaml - - -# Parse (part of) YAML loaded in memory -def parse_pipe_config(yaml_file, parser): - with open(yaml_file, "r", encoding="utf-8") as f: - try: - config = yaml.safe_load(f) - except yaml.YAMLError as exc: - print(exc) - raise exc - - return parser.parse_object(config) diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py index 5d8ca0b0..bc1b852e 100644 --- a/src/itwinai/cli.py +++ b/src/itwinai/cli.py @@ -10,271 +10,12 @@ # NOTE: import libs in the command"s function, not here. # Otherwise this will slow the whole CLI. -# from typing import Optional -import os -import sys import typer app = typer.Typer() -@app.command() -def train( - train_dataset: str = typer.Option("unk", help="Path to training dataset."), - config: str = typer.Option( - "unk", help="Path to training configuration file."), - ml_logs: str = typer.Option("ml-logs/", help="Path to logs storage."), -): - """ - Train a neural network defined as a Pytorch Lightning model. 
- """ - import copy - import shutil - import mlflow - from lightning.pytorch.cli import LightningCLI - from omegaconf import DictConfig, OmegaConf - - from itwinai.utils import load_yaml_with_deps, check_server, flatten_dict - from itwinai.plmodels.base import ( - ItwinaiBasePlModule, ItwinaiBasePlDataModule - ) - - # Replicate args under cli field, to be used in imported config files - cli_conf = dict( - cli=dict(train_dataset=train_dataset, config=config, ml_logs=ml_logs) - ) - cli_conf = OmegaConf.create(cli_conf) - - # os.makedirs(ml_logs, exist_ok=True) - train_config: DictConfig = load_yaml_with_deps(config) - train_config = OmegaConf.merge(train_config, cli_conf) - # print(OmegaConf.to_yaml(train_config)) - train_config = OmegaConf.to_container(train_config, resolve=True) - - # Setup logger - if os.path.exists("checkpoints"): - # Remove old checkpoints - shutil.rmtree("checkpoints") - - # Check if MLflow server is reachable - if not check_server(ml_logs): - raise RuntimeError("MLFlow server not reachable!") - - log_conf = train_config["logger"] - # mlflow.set_tracking_uri('file:' + ml_logs) - mlflow.set_tracking_uri(ml_logs) - mlflow.set_experiment(log_conf["experiment_name"]) - mlflow.pytorch.autolog( - log_every_n_epoch=log_conf["log_every_n_epoch"], - log_every_n_step=log_conf["log_every_n_steps"], - registered_model_name=log_conf["registered_model_name"], - ) - # Note: we use autolog and MlFlowLogger combined: - # - MlFlow logger provides better flexibility - # - autolog takes care of repetitive operations - # Ref: https://github.com/Lightning-AI/lightning/discussions/11197 - - # Load training configuration - lightning_conf = train_config["train"]["conf"] - # lightning_conf = OmegaConf.to_container(lightning_conf, resolve=True) - - # Start Mlflow run - with mlflow.start_run(description=log_conf["description"]): - # Log hyperparameters - config_params = copy.copy(train_config) - config_params["cli.train_dataset"] = train_dataset - 
config_params["cli.ml_logs"] = ml_logs - config_params["cli.config"] = config - mlflow.log_params(flatten_dict(config_params)) - - # Save config file used for this specific training run - # for reproducibility - mlflow.log_artifact(config) - - # Update lightning MLFlow logger constructor args - # Infer MlFlow conf from pre-configured mlflow client - lightning_conf["trainer"]["logger"]["init_args"].update( - dict( - experiment_name=mlflow.get_experiment( - mlflow.active_run().info.experiment_id - ).name, - tracking_uri=mlflow.get_tracking_uri(), - log_model="all", - run_id=mlflow.active_run().info.run_id, - save_dir=None, - ) - ) - # Append CSVLogger in front: - # https://github.com/Lightning-AI/lightning/issues/16310#issuecomment-1404177131 - csv_log_conf = dict( - class_path="lightning.pytorch.loggers.CSVLogger", - init_args=dict(save_dir="./.tmp"), - ) - lightning_conf["trainer"]["logger"] = [ - csv_log_conf, - lightning_conf["trainer"]["logger"], - ] - - # Reset argv before using Lightning CLI - old_argv = sys.argv - sys.argv = ["some_script_placeholder.py"] - print(lightning_conf) - cli = LightningCLI( - args=lightning_conf, - model_class=ItwinaiBasePlModule, - datamodule_class=ItwinaiBasePlDataModule, - run=False, - save_config_kwargs={ - "overwrite": True, - "config_filename": "pl-training.yml", - }, - subclass_mode_model=True, - subclass_mode_data=True, - ) - print(cli.trainer.log_dir) - sys.argv = old_argv - # Train + validation, and test - cli.trainer.fit(cli.model, datamodule=cli.datamodule) - cli.trainer.test( - dataloaders=cli.datamodule, datamodule=cli.datamodule, - ckpt_path="best" - ) - - # Save updated lightning conf as an mlflow artifact - mlflow.log_artifact(os.path.join( - cli.trainer.log_dir, "pl-training.yml")) - - -@app.command() -def predict( - input_dataset: str = typer.Option( - "unk", help="Path to dataset of unseen data to make predictions." 
- ), - config: str = typer.Option( - "unk", help="Path to inference configuration file."), - predictions_location: str = typer.Option( - "preds/", help="Where to save predictions." - ), - ml_logs: str = typer.Option("ml-logs/", help="Path to MLFLow logs."), -): - """ - Apply a pre-trained neural network to a set of unseen data. - """ - import logging - import mlflow - from mlflow.exceptions import MlflowException - from lightning.pytorch.cli import LightningCLI - from lightning.pytorch.trainer.trainer import Trainer - import torch - from omegaconf import DictConfig, OmegaConf - - from itwinai.utils import load_yaml_with_deps, load_yaml - from itwinai.plmodels.base import ( - ItwinaiBasePlModule, ItwinaiBasePlDataModule - ) - - # Replicate args under cli field, to be used in imported config files - cli_conf = dict( - cli=dict( - input_dataset=input_dataset, - config=config, - predictions_location=predictions_location, - ml_logs=ml_logs, - ) - ) - cli_conf = OmegaConf.create(cli_conf) - - os.makedirs(predictions_location, exist_ok=True) - ml_conf: DictConfig = load_yaml_with_deps(config) - ml_conf = OmegaConf.merge(ml_conf, cli_conf) - # print(OmegaConf.to_yaml(ml_conf)) - ml_conf = OmegaConf.to_container(ml_conf, resolve=True) - ml_conf = ml_conf["inference"] - - os.makedirs(predictions_location, exist_ok=True) - - # mlflow.set_tracking_uri('file:' + ml_logs) - mlflow.set_tracking_uri(ml_logs) - - # Check if run ID exists - try: - mlflow.get_run(ml_conf["run_id"]) - # mlflow_client.get_run(ml_conf['run_id']) - except MlflowException: - logging.warning( - f"Run ID '{ml_conf['run_id']}' not found! " - "Using latest run available for experiment " - f"'{ml_conf['experiment_name']}' instead." 
- ) - runs = mlflow.search_runs( - experiment_names=[ml_conf["experiment_name"]], - ) - new_run_id = runs[runs.status == "FINISHED"].iloc[0]["run_id"] - ml_conf["run_id"] = new_run_id - logging.warning(f"Using Run ID: '{new_run_id}'") - - # Download training configuration - train_conf_path = mlflow.artifacts.download_artifacts( - run_id=ml_conf["run_id"], - artifact_path=ml_conf["train_config_artifact_path"], - dst_path="tmp/", - tracking_uri=mlflow.get_tracking_uri(), - ) - - # Download last ckpt - ckpt_path = mlflow.artifacts.download_artifacts( - run_id=ml_conf["run_id"], - artifact_path=ml_conf["ckpt_path"], - dst_path="tmp/", - tracking_uri=mlflow.get_tracking_uri(), - ) - - # Instantiate PL model - lightning_conf = load_yaml(train_conf_path) - if ml_conf["conf"] is not None: - # Override training configuration with the one - # provided during inference. - # Example: predictions dataset is different - # from training dataset - lightning_conf.update(ml_conf["conf"]) - - # Reset argv before using Lightning CLI - old_argv = sys.argv - sys.argv = ["some_script_placeholder.py"] - cli = LightningCLI( - args=lightning_conf, - model_class=ItwinaiBasePlModule, - run=False, - subclass_mode_model=True, - subclass_mode_data=True, - save_config_callback=None, - ) - sys.argv = old_argv - - # Load best model - loaded_model = cli.model.load_from_checkpoint( - ckpt_path, lightning_conf["model"]["init_args"] - ) - - # Load Data module - loaded_data_module: ItwinaiBasePlDataModule = cli.datamodule - - trainer = Trainer(logger=cli.trainer.logger) - - # Predict - predictions = trainer.predict( - loaded_model, datamodule=loaded_data_module - ) # , ckpt_path='best') - pred_class_names = loaded_data_module.preds_to_names( - torch.cat(predictions)) - - # Save list of predictions as class names - preds_file = os.path.join(predictions_location, "predictions.txt") - with open(preds_file, "w") as preds_file: - preds_file.write("\n".join(pred_class_names)) - - @app.command() def mlflow_ui( 
path: str = typer.Option("ml-logs/", help="Path to logs storage."), @@ -288,77 +29,36 @@ def mlflow_ui( @app.command() -def datasets( - use_case: str = typer.Option( - "./use-cases/mnist", help="Path to use case files."), +def mlflow_server( + path: str = typer.Option("ml-logs/", help="Path to logs storage."), + port: int = typer.Option( + 5000, help="Port on which the server is listening."), ): - """List datasets of an use case.""" - from rich.console import Console - from rich.table import Table - from itwinai.utils import load_yaml - from pathlib import Path - from omegaconf import OmegaConf - - datasets_reg = load_yaml(os.path.join(use_case, "meta.yml")) - # datasets_reg = OmegaConf.create(datasets_reg) - datasets_reg = OmegaConf.to_container( - OmegaConf.create(datasets_reg), resolve=True) - - rows = [] - columns = ["Name", "Description", "Location"] - for ds_name, ds_info in datasets_reg["datasets"].items(): - rows.append([ds_name, ds_info["doc"], ds_info["location"]]) + """ + Spawn Mlflow server. 
+ """ + import subprocess - use_case_dir = os.path.basename(Path(use_case)) - table = Table(title=f"Datasets registry for '{use_case_dir}' use case") - for column in columns: - table.add_column(column) - for row in rows: - table.add_row(*row, style="bright_green") - console = Console() - console.print(table) + subprocess.run( + f"mlflow server --backend-store-uri {path} --port {port}".split()) @app.command() -def workflows( - use_case: str = typer.Option( - "./use-cases/mnist", help="Path to use case files."), +def kill_mlflow_server( + port: int = typer.Option( + 5000, help="Port on which the server is listening."), ): - """List workflows of an use case.""" - from omegaconf import OmegaConf - from rich.console import Console - from rich.table import Table - from pathlib import Path - from itwinai.utils import load_yaml_with_deps - - use_case_dir = os.path.basename(Path(use_case)) - wf_files = filter(lambda itm: itm.endswith( - "-workflow.yml"), os.listdir(use_case)) - columns = ["Step", "Description", "Command", "Env location", "Env file"] - for workflow_file in wf_files: - wf = load_yaml_with_deps(os.path.join(use_case, workflow_file)) - wf_name = workflow_file.split(".")[0] - rows = [] - for step in wf.steps: - step = OmegaConf.to_container(step, resolve=True) - step_name, step_data = list(step.items())[0] - rows.append( - [ - step_name, - step_data["doc"], - step_data["command"], - step_data["env"]["prefix"], - step_data["env"]["file"], - ] - ) + """ + Kill Mlflow server. 
+ """ + import subprocess - table = Table(title=f"'{wf_name}' for '{use_case_dir}' use case") - for column in columns: - table.add_column(column) - for row in rows: - table.add_row(*row, style="bright_green") - console = Console() - console.print(table) + subprocess.run( + f"kill -9 $(lsof -t -i:{port})", + shell=True, + check=True, + stderr=subprocess.DEVNULL + ) if __name__ == "__main__": diff --git a/src/itwinai/backend/cluster.py b/src/itwinai/cluster.py similarity index 100% rename from src/itwinai/backend/cluster.py rename to src/itwinai/cluster.py diff --git a/src/itwinai/backend/components.py b/src/itwinai/components.py similarity index 100% rename from src/itwinai/backend/components.py rename to src/itwinai/components.py diff --git a/src/itwinai/backend/executors.py b/src/itwinai/executors.py similarity index 96% rename from src/itwinai/backend/executors.py rename to src/itwinai/executors.py index 035224eb..d94e1c0f 100644 --- a/src/itwinai/backend/executors.py +++ b/src/itwinai/executors.py @@ -20,9 +20,9 @@ def __init__(self, pipeline, class_dict): pipe_parser.add_subclass_arguments(v, k) # Parse, Instantiate pipe - if type(pipeline) == str: + if isinstance(pipeline, str): parsed = parse_pipe_config(pipeline, pipe_parser) - elif type(pipeline) == dict: + elif isinstance(pipeline, dict): parsed = pipe_parser.parse_object(pipeline) else: raise "Type of pipeline is not supported" @@ -57,7 +57,7 @@ def worker_fn(self, config, pipeline, class_dict): # Should have same structure pipe and params def replace(pipe, params): for param in params: - if type(pipe[param]) != dict: + if not isinstance(pipe[param], dict): pipe[param] = params[param] else: replace(pipe[param], params[param]) diff --git a/src/itwinai/backend/loggers.py b/src/itwinai/loggers.py similarity index 91% rename from src/itwinai/backend/loggers.py rename to src/itwinai/loggers.py index 10b142bd..1116c503 100644 --- a/src/itwinai/backend/loggers.py +++ b/src/itwinai/loggers.py @@ -113,7 +113,7 @@ 
def should_log( return True -class SimpleLogger(Logger): +class ConsoleLogger(Logger): """Simple logger for quick tests.""" def __init__( @@ -160,22 +160,22 @@ def log( self.run_path, identifier ) - print(f"SimpleLogger: Serializing to {identifier}...") + print(f"ConsoleLogger: Serializing to {identifier}...") shutil.copyfile(item, identifier) else: identifier = os.path.join( os.path.basename(self.run_path), identifier ) - print(f"SimpleLogger: Serializing to {identifier}...") + print(f"ConsoleLogger: Serializing to {identifier}...") self.serialize(item, identifier) elif kind == 'torch': identifier = os.path.join(self.run_path, identifier) - print(f"SimpleLogger: Saving to {identifier}...") + print(f"ConsoleLogger: Saving to {identifier}...") import torch torch.save(item, identifier) else: - print(f"SimpleLogger: {identifier} = {item}") + print(f"ConsoleLogger: {identifier} = {item}") class MLFlowLogger(Logger): @@ -415,3 +415,36 @@ def log( return # TODO: complete + + +class LoggersCollection(Logger): + """Contains a list of loggers. 
Never tested.""" + + def __init__( + self, + loggers: List[Logger] + ) -> None: + super().__init__(savedir='/.tmp_mllogs_LoggersCollection', log_freq=0) + self.loggers = loggers + + def should_log(self, batch_idx: int = None) -> bool: + return True + + def log( + self, + item: Union[Any, List[Any]], + identifier: Union[str, List[str]], + kind: str = 'metric', + step: Optional[int] = None, + batch_idx: Optional[int] = None, + **kwargs + ) -> None: + for logger in self.loggers: + logger.log( + item=item, + identifier=identifier, + kind=kind, + step=step, + batch_idx=batch_idx, + **kwargs + ) diff --git a/src/itwinai/backend/__init__.py b/src/itwinai/tensorflow/__init__.py similarity index 100% rename from src/itwinai/backend/__init__.py rename to src/itwinai/tensorflow/__init__.py diff --git a/src/itwinai/backend/tensorflow/__init__.py b/src/itwinai/tensorflow/data/__init__.py similarity index 100% rename from src/itwinai/backend/tensorflow/__init__.py rename to src/itwinai/tensorflow/data/__init__.py diff --git a/src/itwinai/backend/torch/__init__.py b/src/itwinai/tensorflow/models/__init__.py similarity index 100% rename from src/itwinai/backend/torch/__init__.py rename to src/itwinai/tensorflow/models/__init__.py diff --git a/src/itwinai/models/tensorflow/mnist.py b/src/itwinai/tensorflow/models/mnist.py similarity index 100% rename from src/itwinai/models/tensorflow/mnist.py rename to src/itwinai/tensorflow/models/mnist.py diff --git a/src/itwinai/backend/tensorflow/trainer.py b/src/itwinai/tensorflow/trainer.py similarity index 65% rename from src/itwinai/backend/tensorflow/trainer.py rename to src/itwinai/tensorflow/trainer.py index ec0f4e09..3f51f000 100644 --- a/src/itwinai/backend/tensorflow/trainer.py +++ b/src/itwinai/tensorflow/trainer.py @@ -138,76 +138,76 @@ def train(self, train_dataset, validation_dataset): return history -class TensorflowTrainer2(Trainer): - def __init__( - self, - epochs, - batch_size, - callbacks, - model_dict: Dict, - 
compile_conf, - strategy - ): - super().__init__() - self.strategy = strategy - self.epochs = epochs - self.batch_size = batch_size - self.callbacks = callbacks - - # Handle the parsing - model_class = import_class(model_dict["class_path"]) - parser = ArgumentParser() - parser.add_subclass_arguments(model_class, "model") - model_dict = {"model": model_dict} - - # Create distributed TF vars - if self.strategy: - with self.strategy.scope(): - self.model = parser.instantiate_classes(model_dict).model - print(self.model) - self.model.compile(**compile_conf) - # TODO: move loss, optimizer and metrics instantiation under - # here - # Ref: - # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit - else: - self.model = parser.instantiate_classes(model_dict).model - self.model.compile(**compile_conf) - - self.num_devices = ( - self.strategy.num_replicas_in_sync if self.strategy else 1) - print(f"Strategy is working with: {self.num_devices} devices") - - def train(self, train_dataset, validation_dataset): - # TODO: FIX Steps sizes in model.fit - train, test = train_dataset, validation_dataset - - # Set batch size to the dataset - train = train.batch(self.batch_size, drop_remainder=True) - test = test.batch(self.batch_size, drop_remainder=True) - - # Number of samples - n_train = train.cardinality().numpy() - n_test = test.cardinality().numpy() - - # TODO: read - # https://github.com/tensorflow/tensorflow/issues/56773#issuecomment-1188693881 - # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit - - # Distribute dataset - if self.strategy: - train = self.strategy.experimental_distribute_dataset(train) - test = self.strategy.experimental_distribute_dataset(test) - - # train the model - history = self.model.fit( - train, - validation_data=test, - steps_per_epoch=int(n_train // self.num_devices), - validation_steps=int(n_test // self.num_devices), - epochs=self.epochs, - 
callbacks=self.callbacks, - ) - - print("Model trained") - return history +# class TensorflowTrainer2(Trainer): +# def __init__( +# self, +# epochs, +# batch_size, +# callbacks, +# model_dict: Dict, +# compile_conf, +# strategy +# ): +# super().__init__() +# self.strategy = strategy +# self.epochs = epochs +# self.batch_size = batch_size +# self.callbacks = callbacks + +# # Handle the parsing +# model_class = import_class(model_dict["class_path"]) +# parser = ArgumentParser() +# parser.add_subclass_arguments(model_class, "model") +# model_dict = {"model": model_dict} + +# # Create distributed TF vars +# if self.strategy: +# with self.strategy.scope(): +# self.model = parser.instantiate_classes(model_dict).model +# print(self.model) +# self.model.compile(**compile_conf) +# # TODO: move loss, optimizer and metrics instantiation under +# # here +# # Ref: +# # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit +# else: +# self.model = parser.instantiate_classes(model_dict).model +# self.model.compile(**compile_conf) + +# self.num_devices = ( +# self.strategy.num_replicas_in_sync if self.strategy else 1) +# print(f"Strategy is working with: {self.num_devices} devices") + +# def train(self, train_dataset, validation_dataset): +# # TODO: FIX Steps sizes in model.fit +# train, test = train_dataset, validation_dataset + +# # Set batch size to the dataset +# train = train.batch(self.batch_size, drop_remainder=True) +# test = test.batch(self.batch_size, drop_remainder=True) + +# # Number of samples +# n_train = train.cardinality().numpy() +# n_test = test.cardinality().numpy() + +# # TODO: read +# # https://github.com/tensorflow/tensorflow/issues/56773#issuecomment-1188693881 +# # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit + +# # Distribute dataset +# if self.strategy: +# train = self.strategy.experimental_distribute_dataset(train) +# test = 
self.strategy.experimental_distribute_dataset(test) + +# # train the model +# history = self.model.fit( +# train, +# validation_data=test, +# steps_per_epoch=int(n_train // self.num_devices), +# validation_steps=int(n_test // self.num_devices), +# epochs=self.epochs, +# callbacks=self.callbacks, +# ) + +# print("Model trained") +# return history diff --git a/src/itwinai/backend/tensorflow/utils.py b/src/itwinai/tensorflow/utils.py similarity index 100% rename from src/itwinai/backend/tensorflow/utils.py rename to src/itwinai/tensorflow/utils.py diff --git a/src/itwinai/models/__init__.py b/src/itwinai/torch/__init__.py similarity index 100% rename from src/itwinai/models/__init__.py rename to src/itwinai/torch/__init__.py diff --git a/src/itwinai/backend/torch/cluster.py b/src/itwinai/torch/cluster.py similarity index 100% rename from src/itwinai/backend/torch/cluster.py rename to src/itwinai/torch/cluster.py diff --git a/src/itwinai/models/tensorflow/__init__.py b/src/itwinai/torch/data/__init__.py similarity index 100% rename from src/itwinai/models/tensorflow/__init__.py rename to src/itwinai/torch/data/__init__.py diff --git a/src/itwinai/models/torch/__init__.py b/src/itwinai/torch/models/__init__.py similarity index 100% rename from src/itwinai/models/torch/__init__.py rename to src/itwinai/torch/models/__init__.py diff --git a/src/itwinai/models/torch/mnist.py b/src/itwinai/torch/models/mnist.py similarity index 100% rename from src/itwinai/models/torch/mnist.py rename to src/itwinai/torch/models/mnist.py diff --git a/src/itwinai/backend/torch/trainer.py b/src/itwinai/torch/trainer.py similarity index 99% rename from src/itwinai/backend/torch/trainer.py rename to src/itwinai/torch/trainer.py index 864ce07e..6d8a1771 100644 --- a/src/itwinai/backend/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -23,8 +23,8 @@ Batch, Loss, LrScheduler, Metric ) from .types import TorchDistributedStrategy as StrategyT -from ..loggers import LogMixin, Logger, 
SimpleLogger -from ...utils import dynamically_import_class +from ..loggers import LogMixin, Logger, ConsoleLogger +from ..utils import dynamically_import_class from ..cluster import ClusterEnvironment @@ -265,7 +265,7 @@ def __init__( ) # Loggers - self.logger = logger if logger is not None else SimpleLogger() + self.logger = logger if logger is not None else ConsoleLogger() # Metrics self.train_metrics = ( diff --git a/src/itwinai/backend/torch/types.py b/src/itwinai/torch/types.py similarity index 100% rename from src/itwinai/backend/torch/types.py rename to src/itwinai/torch/types.py diff --git a/src/itwinai/backend/torch/utils.py b/src/itwinai/torch/utils.py similarity index 100% rename from src/itwinai/backend/torch/utils.py rename to src/itwinai/torch/utils.py diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index be794644..1314423a 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -9,26 +9,6 @@ from omegaconf.dictconfig import DictConfig -def check_server(uri: str) -> bool: - """Check if an HTTP server is reachable - - Args: - uri (str): Server URL - - Returns: - bool: True if reachable. - """ - import requests - from requests import ConnectionError - - success = True - try: - _ = requests.get(uri) - except ConnectionError: - success = False - return success - - def load_yaml(path: str) -> Dict: """Load YAML file as dict. @@ -115,3 +95,15 @@ def flatten_dict( else: items.append((new_key, v)) return dict(items) + + +# Parse (part of) YAML loaded in memory +def parse_pipe_config(yaml_file, parser): + with open(yaml_file, "r", encoding="utf-8") as f: + try: + config = yaml.safe_load(f) + except yaml.YAMLError as exc: + print(exc) + raise exc + + return parser.parse_object(config) diff --git a/tests/backend/torch/torch_dist_trainer2.py b/tests/backend/torch/torch_dist_trainer2.py deleted file mode 100644 index bfd0d3a3..00000000 --- a/tests/backend/torch/torch_dist_trainer2.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Test Trainer class. 
To run this script, use the following command: - ->>> torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 --rdzv_backend=c10d \ - --rdzv_endpoint=localhost:29400 test_trainer.py - -""" - -from torch import nn -import torch.nn.functional as F -from torch.utils.data import DataLoader -from torchvision import transforms, datasets - -from itwinai.backend.torch.trainer import TorchTrainer2 - - -class Net(nn.Module): - - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=0) - - -if __name__ == '__main__': - train_set = datasets.MNIST( - '.tmp/', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - val_set = datasets.MNIST( - '.tmp/', train=False, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - trainer = TorchTrainer2( - model=Net(), - train_dataloader=DataLoader(train_set, batch_size=32, pin_memory=True), - validation_dataloader=DataLoader( - val_set, batch_size=32, pin_memory=True), - strategy='ddp', - backend='nccl', - loss='NLLLoss', - epochs=20, - checkpoint_every=1 - ) - trainer.execute() diff --git a/tests/slurm_tests_startscript b/tests/slurm_tests_startscript new file mode 100644 index 00000000..f7540fab --- /dev/null +++ b/tests/slurm_tests_startscript @@ -0,0 +1,32 @@ +#!/bin/bash + +# general configuration of the job +#SBATCH --job-name=PrototypeTest +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH 
--output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# configure node and process count on the CM +#SBATCH --partition=batch +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gpus-per-node=4 + +# SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# load modules +ml --force purge +ml Stages/2023 StdEnv/2023 NVHPC/23.1 OpenMPI/4.1.4 cuDNN/8.6.0.163-CUDA-11.7 Python/3.10.4 HDF5 libaio/0.3.112 GCC/11.3.0 + +# shellcheck source=/dev/null +source ~/.bashrc + +# from repo's root dir +srun micromamba run -p ./.venv-pytorch pytest -v -m slurm tests/ \ No newline at end of file diff --git a/tests/backend/test_components.py b/tests/test_components.py similarity index 81% rename from tests/backend/test_components.py rename to tests/test_components.py index 9c4f3d37..f7396214 100644 --- a/tests/backend/test_components.py +++ b/tests/test_components.py @@ -1,4 +1,4 @@ -from itwinai.backend.components import Executor +from itwinai.components import Executor def test_slice(): diff --git a/tests/backend/torch/distribtued_decorator.py b/tests/torch/distribtued_decorator.py similarity index 98% rename from tests/backend/torch/distribtued_decorator.py rename to tests/torch/distribtued_decorator.py index 7af056c6..fd086154 100644 --- a/tests/backend/torch/distribtued_decorator.py +++ b/tests/torch/distribtued_decorator.py @@ -15,7 +15,7 @@ import torch.optim as optim from torch.optim.lr_scheduler import StepLR -from itwinai.backend.torch.trainer import distributed +from itwinai.torch.trainer import distributed class Net(nn.Module): diff --git a/tests/backend/torch/test_distribtued_training.py b/tests/torch/test_distribtued_training.py similarity index 68% rename from tests/backend/torch/test_distribtued_training.py rename to tests/torch/test_distribtued_training.py index f73fdf02..829fa9a2 100644 --- a/tests/backend/torch/test_distribtued_training.py +++ 
b/tests/torch/test_distribtued_training.py @@ -4,21 +4,22 @@ import pytest -@pytest.mark.distributed +@pytest.mark.slurm def test_distributed_decorator(): """Test function decorator. Needs torchrun cmd.""" - cmd = ("micromamba run -p ./ai/.venv-pytorch " + cmd = ("micromamba run -p ./.venv-pytorch " "torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 " "--rdzv_backend=c10d --rdzv_endpoint=localhost:29400 " - "tests/backend/torch/distribtued_decorator.py") + "tests/torch/distribtued_decorator.py") subprocess.run(cmd.split(), check=True) -@pytest.mark.distributed +@pytest.mark.skip(reason="TorchTrainer not implemented yet") +@pytest.mark.slurm def test_distributed_trainer(): """Test vanilla torch distributed trainer. Needs torchrun cmd.""" - cmd = ("micromamba run -p ./ai/.venv-pytorch " + cmd = ("micromamba run -p ./.venv-pytorch " "torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 " "--rdzv_backend=c10d --rdzv_endpoint=localhost:29400 " - "tests/backend/torch/torch_dist_trainer.py") + "tests/torch/torch_dist_trainer.py") subprocess.run(cmd.split(), check=True) diff --git a/tests/backend/torch/torch_dist_trainer.py b/tests/torch/torch_dist_trainer.py similarity index 97% rename from tests/backend/torch/torch_dist_trainer.py rename to tests/torch/torch_dist_trainer.py index e2ba90af..63871f4a 100644 --- a/tests/backend/torch/torch_dist_trainer.py +++ b/tests/torch/torch_dist_trainer.py @@ -11,7 +11,7 @@ from torch.utils.data import DataLoader from torchvision import transforms, datasets -from itwinai.backend.torch.trainer import TorchTrainer +from itwinai.torch.trainer import TorchTrainer class Net(nn.Module): diff --git a/tests/use-cases/conftest.py b/tests/use-cases/conftest.py new file mode 100644 index 00000000..c965799c --- /dev/null +++ b/tests/use-cases/conftest.py @@ -0,0 +1,37 @@ +import os +import pytest +import subprocess + +pytest.TORCH_PREFIX = './.venv-pytorch' +pytest.TF_PREFIX = './.venv-tf' + +FNAMES = [ + 'pipeline.yaml', + 'startscript', + 
'train.py', +] + + +@pytest.fixture +def check_folder_structure(): + """ + Verify that the use case folder complies with some predefined + structure. + """ + def _check_structure(root: str): + for fname in FNAMES: + fpath = os.path.join(root, fname) + assert os.path.isfile(fpath), f"'{fname}' is missing in '{root}'" + return _check_structure + + +@pytest.fixture +def install_requirements(): + """Install requirements.txt, if present in root folder.""" + def _install_reqs(root: str, env_prefix: str): + req_path = os.path.join(root, 'requirements.txt') + if os.path.isfile(req_path): + cmd = (f"micromamba run -p {env_prefix} " + f"pip install -r {req_path}") + subprocess.run(cmd.split(), check=True) + return _install_reqs diff --git a/tests/use-cases/test_cyclones.py b/tests/use-cases/test_cyclones.py index d729d794..6b11db1e 100644 --- a/tests/use-cases/test_cyclones.py +++ b/tests/use-cases/test_cyclones.py @@ -8,23 +8,20 @@ import pytest import subprocess -# TODO: add tests for use case folder format: -# - structure -# - naming convention -# - file exist +CYCLONES_PATH = "use-cases/cyclones" -@pytest.mark.skip(reason="workflow changed") -@pytest.mark.integration -def test_cyclones_train(): +def test_structure_cyclones(check_folder_structure): + """Test cyclones folder structure.""" + check_folder_structure(CYCLONES_PATH) + + +@pytest.mark.functional +def test_cyclones_train_tf(install_requirements): """ - Test MNIST training workflow(s) by running it end-to-end. + Test Cyclones tensorflow trainer by running it end-to-end. 
""" - workflows = [ - "./use-cases/cyclones/workflows/training-workflow.yml", - ] - - for workflow in workflows: - cmd = f"micromamba run -p ./.venv python run-workflow.py -f {workflow}" - subprocess.run(cmd.split(), check=True) - subprocess.run(cmd.split() + ["--cwl"], check=True) + install_requirements(CYCLONES_PATH, pytest.TF_PREFIX) + cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " + f"{CYCLONES_PATH}/train.py -p {CYCLONES_PATH}/pipeline.yaml") + subprocess.run(cmd.split(), check=True) diff --git a/tests/use-cases/test_mnist.py b/tests/use-cases/test_mnist.py index d0d47b6f..d32aab1c 100644 --- a/tests/use-cases/test_mnist.py +++ b/tests/use-cases/test_mnist.py @@ -8,15 +8,62 @@ import pytest import subprocess -# TODO: add tests for use case folder format: -# - structure -# - naming convention -# - file exist +TORCH_PATH = "use-cases/mnist/torch" +LIGHTNING_PATH = "use-cases/mnist/torch-lightning" +TF_PATH = "use-cases/mnist/tensorflow" -@pytest.mark.skip(reason="workflow changed") +def test_structure_mnist_torch(check_folder_structure): + """Test MNIST folder structure for torch native trainer.""" + check_folder_structure(TORCH_PATH) + + +def test_structure_mnist_lightning(check_folder_structure): + """Test MNIST folder structure for torch lightning trainer.""" + check_folder_structure(LIGHTNING_PATH) + + +def test_structure_mnist_tf(check_folder_structure): + """Test MNIST folder structure for tensorflow trainer.""" + check_folder_structure(TF_PATH) + + +@pytest.mark.functional +def test_mnist_train_torch(install_requirements): + """ + Test MNIST torch native trainer by running it end-to-end. 
+ """ + install_requirements(TORCH_PATH, pytest.TORCH_PREFIX) + cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " + f"{TORCH_PATH}/train.py -p {TORCH_PATH}/pipeline.yaml") + subprocess.run(cmd.split(), check=True) + + +@pytest.mark.functional +def test_mnist_train_lightning(install_requirements): + """ + Test MNIST torch lightning trainer by running it end-to-end. + """ + install_requirements(LIGHTNING_PATH, pytest.TORCH_PREFIX) + cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " + f"{LIGHTNING_PATH}/train.py -p {LIGHTNING_PATH}/pipeline.yaml") + subprocess.run(cmd.split(), check=True) + + +@pytest.mark.functional +def test_mnist_train_tf(install_requirements): + """ + Test MNIST tensorflow trainer by running it end-to-end. + """ + install_requirements(TF_PATH, pytest.TF_PREFIX) + cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " + f"{TF_PATH}/train.py -p {TF_PATH}/pipeline.yaml") + subprocess.run(cmd.split(), check=True) + + +@pytest.mark.skip(reason="workflow changed. Left as example") @pytest.mark.integration -def test_mnist_train(): +def test_mnist_train_legacy(): """ Test MNIST training workflow(s) by running it end-to-end. 
""" diff --git a/src/itwinai/models/tensorflow/cyclones_vgg.py b/use-cases/cyclones/cyclones_vgg.py similarity index 100% rename from src/itwinai/models/tensorflow/cyclones_vgg.py rename to use-cases/cyclones/cyclones_vgg.py diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 367c6c5d..8c837822 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -1,7 +1,7 @@ import logging from os import listdir from os.path import join, exists -from itwinai.backend.components import DataGetter +from itwinai.components import DataGetter from typing import List, Dict, Optional, Tuple from lib.macros import ( PatchType, @@ -40,6 +40,7 @@ def __init__( aug_type: AugmentationType, experiment: dict, shuffle_buffer: int = None, + data_path: str = "tmp_data" ): super().__init__() self.batch_size = batch_size @@ -52,6 +53,7 @@ def __init__( self.patch_type = patch_type.value self.augment = augment self.shuffle = shuffle + self.data_path = data_path self.drv_vars, self.coo_vars = ( experiment["DRV_VARS_1"], experiment["COO_VARS_1"], @@ -179,15 +181,16 @@ def setup_config(self, config: Optional[Dict] = None) -> Dict: "https://drive.google.com/drive/folders/" "15DEq33MmtRvIpe2bNCg44lnfvEiHcPaf" ) - if not exists(join(root_dir, "data")): + if not exists(join(root_dir, self.data_path)): gdown.download_folder( url=url, quiet=False, - output=join(root_dir, "data") + output=join(root_dir, self.data_path) ) # Scalar fields self.root_dir = root_dir - self.dataset_dir = join(root_dir, "data", "tfrecords", "trainval/") + self.dataset_dir = join(root_dir, self.data_path, + "tfrecords", "trainval/") self.scaler_file = join(config["scaler_dir"], "minmax.tfrecord") # get records filenames diff --git a/use-cases/cyclones/executor.py b/use-cases/cyclones/executor.py index b2fa1f80..9c00af43 100644 --- a/use-cases/cyclones/executor.py +++ b/use-cases/cyclones/executor.py @@ -5,7 +5,7 @@ from typing import Tuple, Dict, Optional, Iterable 
from lib.macros import PATCH_SIZE as patch_size, SHAPE as shape -from itwinai.backend.components import Executor, Executable +from itwinai.components import Executor, Executable class CycloneExecutor(Executor): diff --git a/use-cases/cyclones/lib/utils.py b/use-cases/cyclones/lib/utils.py index 7fc461aa..4586ebc7 100644 --- a/use-cases/cyclones/lib/utils.py +++ b/use-cases/cyclones/lib/utils.py @@ -3,8 +3,8 @@ import time from .macros import Network -from itwinai.models.tensorflow.cyclones_vgg import ( - custom_VGG_V1, custom_VGG_V2, custom_VGG_V3, ModelV5 +from cyclones_vgg import ( + custom_VGG_V1, custom_VGG_V2, custom_VGG_V3 # , ModelV5 ) @@ -61,12 +61,12 @@ def get_network_config(network, **kwargs): patch_size=kwargs['patch_size'], channels=kwargs['channels'], activation=kwargs['activation'], regularizer=kwargs['regularizer']) - elif network == Network.MODEL_V5.value: - print('Using Model V5') - model = ModelV5( - patch_size=kwargs['patch_size'], channels=kwargs['channels'], - last_activation=kwargs['activation'], - kernel_size=kwargs['kernel_size']) + # elif network == Network.MODEL_V5.value: + # print('Using Model V5') + # model = ModelV5( + # patch_size=kwargs['patch_size'], channels=kwargs['channels'], + # last_activation=kwargs['activation'], + # kernel_size=kwargs['kernel_size']) return model diff --git a/use-cases/cyclones/train.py b/use-cases/cyclones/train.py index cc1b19fa..82a6d15d 100644 --- a/use-cases/cyclones/train.py +++ b/use-cases/cyclones/train.py @@ -13,8 +13,8 @@ import argparse -from itwinai.backend.components import Executor -from itwinai.backend.utils import parse_pipe_config +from itwinai.components import Executor +from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser from executor import CycloneExecutor diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py index 12a68f2f..8760e4bc 100644 --- a/use-cases/cyclones/trainer.py +++ b/use-cases/cyclones/trainer.py @@ -6,7 +6,7 @@ from 
lib.strategy import get_mirrored_strategy from lib.utils import get_network_config, load_model -from itwinai.backend.components import Trainer +from itwinai.components import Trainer from lib.callbacks import ProcessBenchmark from lib.macros import ( Network, diff --git a/use-cases/mnist/tensorflow/dataloader.py b/use-cases/mnist/tensorflow/dataloader.py index 1f9dad7b..920e0dba 100644 --- a/use-cases/mnist/tensorflow/dataloader.py +++ b/use-cases/mnist/tensorflow/dataloader.py @@ -2,7 +2,7 @@ import tensorflow.keras as keras import tensorflow as tf -from itwinai.backend.components import DataGetter, DataPreproc +from itwinai.components import DataGetter, DataPreproc class MNISTDataGetter(DataGetter): diff --git a/use-cases/mnist/tensorflow/pipeline.yaml b/use-cases/mnist/tensorflow/pipeline.yaml index 2dc582ef..aa34e0d4 100644 --- a/use-cases/mnist/tensorflow/pipeline.yaml +++ b/use-cases/mnist/tensorflow/pipeline.yaml @@ -1,5 +1,5 @@ executor: - class_path: itwinai.backend.components.Executor + class_path: itwinai.components.Executor init_args: steps: - class_path: dataloader.MNISTDataGetter @@ -24,7 +24,7 @@ executor: learning_rate: 0.001 model: - class_path: itwinai.models.tensorflow.mnist.MNIST_Model + class_path: itwinai.tensorflow.models.mnist.MNIST_Model init_args: input_shape: [ 28, 28, 1 ] output_shape: 10 @@ -33,8 +33,8 @@ executor: class_path: tensorflow.python.distribute.mirrored_strategy.MirroredStrategy logger: - - class_path: itwinai.backend.loggers.SimpleLogger - - class_path: itwinai.backend.loggers.MLFlowLogger + - class_path: itwinai.loggers.ConsoleLogger + - class_path: itwinai.loggers.MLFlowLogger init_args: experiment_name: MNIST classifier log_freq: batch diff --git a/use-cases/mnist/tensorflow/train.py b/use-cases/mnist/tensorflow/train.py index 9d19a853..65e12c78 100644 --- a/use-cases/mnist/tensorflow/train.py +++ b/use-cases/mnist/tensorflow/train.py @@ -13,8 +13,8 @@ import argparse -from itwinai.backend.components import Executor -from 
itwinai.backend.utils import parse_pipe_config +from itwinai.components import Executor +from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser diff --git a/use-cases/mnist/tensorflow/trainer.py b/use-cases/mnist/tensorflow/trainer.py index 93900e52..dfbc06c7 100644 --- a/use-cases/mnist/tensorflow/trainer.py +++ b/use-cases/mnist/tensorflow/trainer.py @@ -4,8 +4,8 @@ # from tensorflow.keras.losses import Loss from tensorflow.python.distribute.mirrored_strategy import MirroredStrategy -from itwinai.backend.tensorflow.trainer import TensorflowTrainer -from itwinai.backend.loggers import Logger +from itwinai.tensorflow.trainer import TensorflowTrainer +from itwinai.loggers import Logger class MNISTTrainer(TensorflowTrainer): diff --git a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index 26cef92c..28ec236d 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -5,7 +5,7 @@ from torch.utils.data import DataLoader, random_split from torchvision import transforms -from itwinai.backend.components import DataGetter +from itwinai.components import DataGetter class LightningMNISTDownloader(DataGetter): diff --git a/use-cases/mnist/torch-lightning/pipeline.yaml b/use-cases/mnist/torch-lightning/pipeline.yaml index 708f910c..33ae0a94 100644 --- a/use-cases/mnist/torch-lightning/pipeline.yaml +++ b/use-cases/mnist/torch-lightning/pipeline.yaml @@ -1,5 +1,5 @@ executor: - class_path: itwinai.backend.components.Executor + class_path: itwinai.components.Executor init_args: steps: - class_path: dataloader.LightningMNISTDownloader @@ -67,7 +67,7 @@ executor: # Lightning Model configuration model: - class_path: itwinai.models.torch.mnist.MNISTModel + class_path: itwinai.torch.models.mnist.MNISTModel init_args: hidden_size: 64 diff --git a/use-cases/mnist/torch-lightning/train.py b/use-cases/mnist/torch-lightning/train.py index 1afd23a0..50c91988 
100644 --- a/use-cases/mnist/torch-lightning/train.py +++ b/use-cases/mnist/torch-lightning/train.py @@ -1,32 +1,3 @@ -# import argparse - -# from trainer import TorchTrainer -# from itwinai.backend.torch.executor import TorchExecutor -# from itwinai.backend.utils import parse_pipe_config -# from jsonargparse import ArgumentParser - - -# if __name__ == "__main__": -# # Create CLI Parser -# parser = argparse.ArgumentParser() -# parser.add_argument("-p", "--pipeline", type=str) -# args = parser.parse_args() - -# # Create parser for the pipeline (ordered) -# pipe_parser = ArgumentParser() -# pipe_parser.add_subclass_arguments(TorchTrainer, "trainer") - -# # Parse, Instantiate pipe -# parsed = parse_pipe_config(args.pipeline, pipe_parser) -# pipe = pipe_parser.instantiate_classes(parsed) -# # Make pipe as a list -# pipe = [getattr(pipe, arg) for arg in vars(pipe)] - -# # Execute pipe -# executor = TorchExecutor() -# executor.setup(pipe) -# executor.execute(pipe) - """ Training pipeline. To run this script, use the following commands. 
@@ -44,8 +15,8 @@ import argparse -from itwinai.backend.components import Executor -from itwinai.backend.utils import parse_pipe_config +from itwinai.components import Executor +from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser diff --git a/use-cases/mnist/torch-lightning/trainer.py b/use-cases/mnist/torch-lightning/trainer.py index 9c23751d..72454cea 100644 --- a/use-cases/mnist/torch-lightning/trainer.py +++ b/use-cases/mnist/torch-lightning/trainer.py @@ -1,8 +1,8 @@ import os from typing import Union, Dict, Tuple, Optional, Any -from itwinai.backend.components import Trainer -from itwinai.models.torch.mnist import MNISTModel +from itwinai.components import Trainer +from itwinai.torch.models.mnist import MNISTModel from dataloader import MNISTDataModule from lightning.pytorch.cli import LightningCLI from utils import load_yaml diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index c4dd2fde..56ef807d 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -5,7 +5,7 @@ from torch.utils.data import Dataset from torchvision import transforms, datasets -from itwinai.backend.components import DataGetter +from itwinai.components import DataGetter class MNISTDataModuleTorch(DataGetter): diff --git a/use-cases/mnist/torch/pipeline.yaml b/use-cases/mnist/torch/pipeline.yaml index 14ca3dd3..df29c3fe 100644 --- a/use-cases/mnist/torch/pipeline.yaml +++ b/use-cases/mnist/torch/pipeline.yaml @@ -1,12 +1,12 @@ executor: - class_path: itwinai.backend.components.Executor + class_path: itwinai.components.Executor init_args: steps: - class_path: dataloader.MNISTDataModuleTorch init_args: save_path: .tmp/ - - class_path: itwinai.backend.torch.trainer.TorchTrainerMG + - class_path: itwinai.torch.trainer.TorchTrainerMG init_args: model: class_path: model.Net @@ -40,15 +40,15 @@ executor: init_args: num_classes: 10 logger: - - class_path: itwinai.backend.loggers.SimpleLogger - 
- class_path: itwinai.backend.loggers.MLFlowLogger + - class_path: itwinai.loggers.ConsoleLogger + - class_path: itwinai.loggers.MLFlowLogger init_args: experiment_name: MNIST classifier log_freq: batch strategy: ddp checkpoint_every: 1 cluster: - class_path: itwinai.backend.torch.cluster.LocalCluster + class_path: itwinai.torch.cluster.LocalCluster init_args: gpus: '0,1,2' backend: nccl diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py index f09523ba..50c91988 100644 --- a/use-cases/mnist/torch/train.py +++ b/use-cases/mnist/torch/train.py @@ -15,8 +15,8 @@ import argparse -from itwinai.backend.components import Executor -from itwinai.backend.utils import parse_pipe_config +from itwinai.components import Executor +from itwinai.utils import parse_pipe_config from jsonargparse import ArgumentParser diff --git a/src/itwinai/models/tensorflow/cyclegan.py b/use-cases/zebra2horse/cyclegan.py similarity index 100% rename from src/itwinai/models/tensorflow/cyclegan.py rename to use-cases/zebra2horse/cyclegan.py diff --git a/use-cases/zebra2horse/dataloader.py b/use-cases/zebra2horse/dataloader.py index 9f97c58b..0970d270 100644 --- a/use-cases/zebra2horse/dataloader.py +++ b/use-cases/zebra2horse/dataloader.py @@ -4,7 +4,7 @@ import tensorflow as tf import tensorflow_datasets as tfds -from itwinai.backend.components import DataGetter +from itwinai.components import DataGetter class Zebra2HorseDataLoader(DataGetter): diff --git a/use-cases/zebra2horse/pipeline.yaml b/use-cases/zebra2horse/pipeline.yaml index 213f44d1..ff00ef28 100644 --- a/use-cases/zebra2horse/pipeline.yaml +++ b/use-cases/zebra2horse/pipeline.yaml @@ -9,16 +9,16 @@ trainer: epochs: 10 batch_size: 1 model: - class_path: itwinai.models.tensorflow.cyclegan.CycleGAN + class_path: cyclegan.CycleGAN init_args: generator_G: - class_path: itwinai.models.tensorflow.cyclegan.Generator + class_path: cyclegan.Generator generator_F: - class_path: itwinai.models.tensorflow.cyclegan.Generator 
+ class_path: cyclegan.Generator discriminator_X: - class_path: itwinai.models.tensorflow.cyclegan.Discriminator + class_path: cyclegan.Discriminator discriminator_Y: - class_path: itwinai.models.tensorflow.cyclegan.Discriminator + class_path: cyclegan.Discriminator compile_conf: gen_G_optimizer: { class_name: "Adam", diff --git a/src/itwinai/models/tensorflow/pix2pix.py b/use-cases/zebra2horse/pix2pix.py similarity index 100% rename from src/itwinai/models/tensorflow/pix2pix.py rename to use-cases/zebra2horse/pix2pix.py diff --git a/use-cases/zebra2horse/train.py b/use-cases/zebra2horse/train.py index 28a32631..08a91fd2 100644 --- a/use-cases/zebra2horse/train.py +++ b/use-cases/zebra2horse/train.py @@ -2,7 +2,7 @@ from trainer import Zebra2HorseTrainer from dataloader import Zebra2HorseDataLoader -from itwinai.backend.executors import LocalExecutor # , RayExecutor +from itwinai.executors import LocalExecutor # , RayExecutor if __name__ == "__main__": diff --git a/use-cases/zebra2horse/trainer.py b/use-cases/zebra2horse/trainer.py index 02055e09..1ae74896 100644 --- a/use-cases/zebra2horse/trainer.py +++ b/use-cases/zebra2horse/trainer.py @@ -2,8 +2,8 @@ import tensorflow as tf import tensorflow.keras as keras -from itwinai.backend.tensorflow.trainer import TensorflowTrainer -from itwinai.backend.loggers import Logger +from itwinai.tensorflow.trainer import TensorflowTrainer +from itwinai.loggers import Logger class Zebra2HorseTrainer(TensorflowTrainer):