Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3dgan integration #98

Merged
merged 59 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
443b39e
commiting integration of 3dgan scripts
Oct 24, 2023
0f50aaf
ADD: Download dataset
matbun Oct 24, 2023
41d2b66
FIX: DDP distributed training with manual optimization
matbun Oct 25, 2023
ddfa59d
ADD: log with MLFlow
matbun Oct 25, 2023
e89a433
Sqaaas code (#88)
matbun Oct 25, 2023
adc6c91
Sqaaas code (#89)
matbun Oct 27, 2023
291b4f3
ADD: draft predictor and saver
matbun Nov 6, 2023
7da9ba4
ADD: stub for inference pipeline
matbun Nov 6, 2023
c73fb08
ADD: small docs
matbun Nov 6, 2023
1866a81
UPDATE: inference pipeline components
matbun Nov 7, 2023
22aed46
UPDATE: reorg
matbun Nov 7, 2023
0242790
ADD: image generation for inference
matbun Nov 7, 2023
17915b1
update tag
matbun Nov 7, 2023
c3ff733
ADD: threshold
matbun Nov 7, 2023
0a0f56e
ADD: draft inference
matbun Nov 7, 2023
95661c1
ADD: draft inference wf
matbun Nov 7, 2023
94254cf
ADD: working inference workflow
matbun Nov 8, 2023
63a7aa0
ADD: 3D scatter plots
matbun Nov 8, 2023
61c3666
ADD: Dockerfile + refactor
matbun Nov 8, 2023
d690192
ADD: .dockerignore
matbun Nov 8, 2023
a2a9875
Update .dockerignore
matbun Nov 8, 2023
3bcc410
REMOVE: keras dependency
matbun Nov 8, 2023
be0c115
ADD: skip download option
matbun Nov 9, 2023
77d939e
ADD: cern pipeline.yaml
matbun Nov 9, 2023
b603b05
UPDATE: dataset loading function
matbun Nov 9, 2023
bee1317
UPDATE: dataset loading function
matbun Nov 9, 2023
b6c3ee2
UPDATE conf
matbun Nov 9, 2023
466b150
UPDATE refactor
matbun Nov 9, 2023
3e1d6ab
UPDATE refactor
matbun Nov 9, 2023
a814e65
Merge branch 'dev' into 3dgan_integration
matbun Nov 9, 2023
ca60e19
UPDATE training docs
matbun Nov 9, 2023
307ed65
Update readme
matbun Nov 9, 2023
f47f40d
update README
matbun Nov 9, 2023
fc0697e
FIX typo
matbun Nov 9, 2023
a8c9d6d
Update README
matbun Nov 9, 2023
3faa062
Update mkdir
matbun Nov 9, 2023
0935e83
Merge branch 'dev' into 3dgan_integration
matbun Nov 10, 2023
b50c610
UPDATE data paths
matbun Nov 15, 2023
2cedfe7
UPDATE Dockerfile
matbun Nov 16, 2023
1efba3f
UPDATE Dockerfiles
matbun Nov 16, 2023
60ab87d
UPDATE for Singularity execution
matbun Nov 16, 2023
881ae47
FIX version mismatch
matbun Nov 16, 2023
9ab6ec1
UPDATE Singularity docs
matbun Nov 16, 2023
59fd74b
Named steps pipe (#100)
matbun Nov 23, 2023
8f13d92
UPDATE Singularity exec command
matbun Nov 23, 2023
8e19c62
UPDATE: Image version
matbun Nov 23, 2023
d3a2630
UPDATE: load components from pipeline
matbun Nov 23, 2023
33de0b4
ADD: docs
matbun Nov 23, 2023
f2ccfae
Simplify 3DGAN model config
matbun Nov 23, 2023
1af8ba7
ADD: mlflow autologging support for PL trainer
matbun Nov 23, 2023
acf7782
UPDATE container info
matbun Nov 24, 2023
656ab67
Refactor
matbun Dec 1, 2023
b176abf
UPDATE dependencies
matbun Dec 1, 2023
087c7ec
FIX linter problem
matbun Dec 1, 2023
8d9f51f
Simplified workflow configuration (#108)
matbun Dec 13, 2023
dd2c5ea
Simplified workflow configuration (#109)
matbun Dec 13, 2023
debc6a4
ADD integration tests
matbun Dec 13, 2023
9e8eafe
FIX test
matbun Dec 13, 2023
c9b1c17
FIX 3dgan inference test
matbun Dec 13, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"submitit>=1.4.6",
"typing-extensions==4.5.0",
"typing_extensions==4.5.0",
"urllib3>=2.0.5",
"urllib3>=1.26.18",
matbun marked this conversation as resolved.
Show resolved Hide resolved
]

# dynamic = ["version", "description"]
Expand Down
72 changes: 66 additions & 6 deletions src/itwinai/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
from typing import Iterable, Dict, Any, Optional, Tuple, Union
from abc import ABCMeta, abstractmethod
import time
from jsonargparse import ArgumentParser

# import logging
# from logging import Logger as PythonLogger

from .cluster import ClusterEnvironment
from .types import ModelML, DatasetML
from .serialization import ModelLoader
from .utils import load_yaml


class Executable(metaclass=ABCMeta):
Expand Down Expand Up @@ -231,12 +234,12 @@ def save(self, *args, **kwargs):
class Executor(Executable):
"""Sets-up and executes a sequence of Executable steps."""

steps: Iterable[Executable]
steps: Union[Dict[str, Executable], Iterable[Executable]]
constructor_args: Dict

def __init__(
self,
steps: Iterable[Executable],
steps: Union[Dict[str, Executable], Iterable[Executable]],
name: Optional[str] = None,
# logs_dir: Optional[str] = None,
# debug: bool = False,
Expand All @@ -247,9 +250,20 @@ def __init__(
self.steps = steps
self.constructor_args = kwargs

def __getitem__(self, subscript) -> Executor:
def __getitem__(self, subscript: Union[str, int, slice]) -> Executor:
if isinstance(subscript, slice):
s = self.steps[subscript.start:subscript.stop: subscript.step]
# First, convert to list if is a dict
if isinstance(self.steps, dict):
steps = list(self.steps.items())
else:
steps = self.steps
# Second, perform slicing
s = steps[subscript.start:subscript.stop: subscript.step]
# Third, reconstruct dict, if it is a dict
if isinstance(self.steps, dict):
s = dict(s)
# Fourth, return sliced sub-pipeline, preserving its
# initial structure
sliced = self.__class__(
steps=s,
**self.constructor_args
Expand All @@ -270,7 +284,12 @@ def setup(self, parent: Optional[Executor] = None) -> None:
Defaults to None.
"""
super().setup(parent)
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
step.setup(self)
step.is_setup = True

Expand Down Expand Up @@ -303,7 +322,12 @@ def execute(
Tuple[Optional[Tuple], Optional[Dict]]: tuple structured as
(results, config).
"""
for step in self.steps:
if isinstance(self.steps, dict):
steps = list(self.steps.values())
else:
steps = self.steps

for step in steps:
if not step.is_setup:
raise RuntimeError(
f"Step '{step.name}' was not setup!"
Expand All @@ -318,3 +342,39 @@ def _pack_args(self, args) -> Tuple:
if not isinstance(args, tuple):
args = (args,)
return args


def recursive_replace(config: Dict, target_field: str, new_value: Any) -> None:
    """Recursively replace, in place, the value of every occurrence of
    ``target_field`` found anywhere inside a (possibly nested) config.

    Traverses nested dictionaries and lists. The original version
    returned as soon as one key matched, which skipped the remaining
    sibling keys of that dict and never descended into lists; this
    version visits every node.

    Args:
        config (Dict): configuration tree to mutate in place.
        target_field (str): dictionary key whose value must be replaced.
        new_value (Any): replacement value for every matching key.
    """
    def _replace(node: Any) -> None:
        # Dicts and lists are the only containers a YAML-loaded config
        # can contain; every other node type is a leaf.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == target_field:
                    node[key] = new_value
                else:
                    _replace(value)
        elif isinstance(node, list):
            for item in node:
                _replace(item)

    _replace(config)


def load_pipeline_step(
    pipe: Union[str, Dict],
    step_id: Union[str, int],
    override_keys: Optional[Dict[str, Any]] = None
) -> Executable:
    """Instantiate a single step of a serialized pipeline.

    Args:
        pipe (Union[str, Dict]): pipeline configuration as an in-memory
            dict, or the path to a YAML file containing it.
        step_id (Union[str, int]): key (named steps) or index (list of
            steps) identifying the step inside the pipeline config.
        override_keys (Optional[Dict[str, Any]]): mapping of field name
            to new value, applied recursively to the step config before
            instantiation. Defaults to None.

    Returns:
        Executable: the instantiated pipeline step.
    """
    # Accept either an already-parsed config or a YAML file path.
    pipe_config = load_yaml(pipe) if isinstance(pipe, str) else pipe
    step_config = pipe_config['executor']['init_args']['steps'][step_id]

    # Apply user-requested field overrides, if any.
    for field_name, field_value in (override_keys or {}).items():
        recursive_replace(step_config, field_name, field_value)

    # Let jsonargparse validate the config and build the step object;
    # the config must be nested under a "step" key for the parser.
    parser = ArgumentParser()
    parser.add_subclass_arguments(Executable, "step")
    namespace = parser.parse_object(dict(step=step_config))
    return parser.instantiate_classes(namespace)["step"]
35 changes: 35 additions & 0 deletions use-cases/3dgan/Dockerfile.vega
matbun marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Container image for the 3DGAN use case: CPU-only PyTorch + Lightning,
# itwinai, and the use-case scripts. Default command runs inference.
FROM python:3.9.12

WORKDIR /usr/src/app

RUN pip install --upgrade pip

# Install pytorch (cpuonly)
# Ref:https://pytorch.org/get-started/previous-versions/#linux-and-windows-5
RUN pip install --no-cache-dir torch==1.13.1+cpu torchvision==0.14.1+cpu torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir lightning

# Add 3DGAN custom requirements
COPY use-cases/3dgan/requirements.txt /usr/src/app/
RUN pip install --no-cache-dir -r requirements.txt

# Install itwinai and dependencies (pyproject.toml + src/ copied into the
# build context root so the package can be pip-installed from /usr/src/app)
COPY pyproject.toml /usr/src/app/
COPY src /usr/src/app/
RUN pip install --no-cache-dir /usr/src/app

# Add 3DGAN use case files
COPY use-cases/3dgan/* /usr/src/app/

# # Create results folder
# RUN mkdir -p /tmp/data
# RUN chmod 0777 -R /tmp/data

# Create results folders, world-writable so that runtimes mapping a
# non-root user (e.g. Singularity) can still write outputs there.
# TODO: remove once the problem with file system permissions is solved
RUN mkdir -p /tmp/data/3dgan-generated-data
RUN mkdir -p /tmp/data/exp_data/3dgan_data
RUN chmod 0777 -R /tmp/data

# Run inference
CMD [ "python", "train.py", "-p", "inference-pipeline.yaml"]
39 changes: 36 additions & 3 deletions use-cases/3dgan/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,42 @@
# 3DGAN use case

First of all, from the repository root, create a torch environment:

```bash
make torch-gpu
```

Now, install custom requirements for 3DGAN:

```bash
micromamba activate ./.venv-pytorch
cd use-cases/3dgan
pip install -r requirements.txt
```

**NOTE**: the Python commands below are assumed to be executed from within the
micromamba virtual environment.

## Training

At CERN, use the dedicated configuration file:

```bash
cd use-cases/3dgan
python train.py -p cern-pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p cern-pipeline.yaml
```

Anywhere else, use the general purpose training configuration:

```bash
cd use-cases/3dgan
python train.py -p pipeline.yaml

# Or better:
micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu train.py -p pipeline.yaml
```

To visualize the logs with MLFlow, run the following in the terminal:
Expand Down Expand Up @@ -88,8 +111,8 @@ Build from project root with
docker buildx build -t itwinai-mnist-torch-inference -f use-cases/3dgan/Dockerfile .

# Ghcr.io
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1 -f use-cases/3dgan/Dockerfile .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker buildx build -t ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2 -f use-cases/3dgan/Dockerfile .
docker push ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2
```

From wherever a sample of MNIST jpg images is available
Expand All @@ -106,7 +129,7 @@ From wherever a sample of MNIST jpg images is available
```

```bash
docker run -it --rm --name running-inference -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.1
docker run -it --rm --name running-inference -v "$PWD":/tmp/data ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2
```

This command will store the results in a folder called "3dgan-generated-data":
Expand All @@ -120,3 +143,13 @@ This command will store the results in a folder called "3dgan-generated-data":
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.pth
| │ ├── energy=1.664689540863037&angle=1.4906378984451294.jpg
```

### Singularity

Run overriding the working directory (`--pwd /usr/src/app`, restores Docker's WORKDIR)
and providing a writable filesystem (`-B "$PWD":/usr/data`):

```bash
singularity exec -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai-3dgan-inference:0.0.2 \
bash -c "cd /usr/src/app && python train.py -p pipeline.yaml"
```
4 changes: 2 additions & 2 deletions use-cases/3dgan/inference-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ executor:
class_path: lightning.pytorch.loggers.MLFlowLogger
init_args:
experiment_name: 3DGAN
save_dir: ml_logs/mlflow_logs
save_dir: /usr/data/ml_logs/mlflow_logs
log_model: all
max_epochs: 1
max_steps: 20
Expand All @@ -88,7 +88,7 @@ executor:
loss_weights: [3, 0.1, 25, 0.1]
power: 0.85
lr: 0.001
checkpoint_path: exp_data/3dgan.pth
checkpoint_path: /usr/data/exp_data/3dgan.pth

# Lightning data module configuration
data:
Expand Down
3 changes: 1 addition & 2 deletions use-cases/3dgan/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,7 @@ def __init__(
self.test_history = defaultdict(list)
self.pklfile = checkpoint_path
checkpoint_dir = os.path.dirname(checkpoint_path)
if not os.path.exists(checkpoint_dir):
os.makedirs(checkpoint_dir)
os.makedirs(checkpoint_dir, exist_ok=True)

def BitFlip(self, x, prob=0.05):
"""
Expand Down
8 changes: 5 additions & 3 deletions use-cases/mnist/torch/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ executor:
class_path: itwinai.components.Executor
init_args:
steps:
- class_path: dataloader.MNISTDataModuleTorch
dataloading_step:
class_path: dataloader.MNISTDataModuleTorch
init_args:
save_path: .tmp/

- class_path: itwinai.torch.trainer.TorchTrainerMG
training_step:
class_path: itwinai.torch.trainer.TorchTrainerMG
init_args:
model:
class_path: model.Net
Expand All @@ -25,7 +27,7 @@ executor:
batch_size: 32
pin_memory: True
shuffle: False
epochs: 30
epochs: 2
train_metrics:
accuracy:
class_path: torchmetrics.classification.MulticlassAccuracy
Expand Down