Skip to content

Commit

Permalink
gordo-dataset~=5.0.0rc6 (#1263)
Browse files Browse the repository at this point in the history
* gordo-dataset~=5.0.0rc6

* Remove ConfigurationError

* Update pyzmq

* Refactoring Machine. Update test dependencies

* First attempt to fix all unit-tests

* All unit-tests fixed
  • Loading branch information
koropets authored Jul 27, 2022
1 parent c7328b1 commit 15ab9b3
Show file tree
Hide file tree
Showing 33 changed files with 301 additions and 123 deletions.
6 changes: 6 additions & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ spec:

- name: ct-23-0001 #1st machine
dataset:
data_provider:
type: RandomDataProvider
tags: #list of tags for 1st machine
- GRA-TAG 1
- GRA-TAG 2
Expand All @@ -21,6 +23,8 @@ spec:

- name: ct-23-0002 #2nd machine
dataset:
data_provider:
type: RandomDataProvider
resolution: 2T
tags: #list of tags for 2nd machine
- GRA-TAG 1
Expand All @@ -38,6 +42,8 @@ spec:

- name: ct-23-0003 #3rd machine
dataset:
data_provider:
type: RandomDataProvider
tags: #list of tags for 3rd machine
- GRA-TAG 1
- GRA-TAG 2
Expand Down
50 changes: 35 additions & 15 deletions gordo/builder/build_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
parse_version,
)
from gordo_dataset.base import GordoBaseDataset
from gordo_dataset.import_utils import BackCompatibleLocations
from gordo.machine.model.base import GordoBase
from gordo.machine.model.utils import metric_wrapper
from gordo.workflow.config_elements.normalized_config import NormalizedConfig
Expand All @@ -45,7 +46,12 @@


class ModelBuilder:
def __init__(self, machine: Machine):
def __init__(
self,
machine: Machine,
back_compatibles: Optional[BackCompatibleLocations] = None,
default_data_provider: Optional[str] = None,
):
"""
Build a model for a given :class:`gordo.machine.Machine`
Expand All @@ -57,7 +63,7 @@ def __init__(self, machine: Machine):
-------
>>> from gordo_dataset.sensor_tag import SensorTag
>>> from gordo.machine import Machine
>>> machine = Machine(
>>> machine = Machine.from_config(dict(
... name="special-model-name",
... model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
... dataset={
Expand All @@ -68,14 +74,20 @@ def __init__(self, machine: Machine):
... "target_tag_list": [SensorTag("Tag 3"), SensorTag("Tag 4")]
... },
... project_name='test-proj',
... )
... ))
>>> builder = ModelBuilder(machine=machine)
>>> model, machine = builder.build()
"""
# Avoid overwriting the passed machine; copying doesn't work if it holds a
# reference to a loaded Tensorflow model, so .to_dict() serializes it to
# a primitive dict representation instead.
self.machine = Machine(**machine.to_dict())
self.machine = Machine.from_dict(
machine.to_dict(),
back_compatibles=back_compatibles,
default_data_provider=default_data_provider,
)
self.back_compatibles = back_compatibles
self.default_data_provider = default_data_provider

@property
def cached_model_path(self) -> Union[os.PathLike, str, None]:
Expand Down Expand Up @@ -146,7 +158,11 @@ def build(

metadata["runtime"] = self.machine.runtime

machine = Machine(**metadata)
machine = Machine.from_dict(
metadata,
back_compatibles=self.back_compatibles,
default_data_provider=self.default_data_provider,
)

# Otherwise build and cache the model
else:
Expand Down Expand Up @@ -205,14 +221,18 @@ def _build(self) -> Tuple[sklearn.base.BaseEstimator, Machine]:

cv_duration_sec = None

machine: Machine = Machine(
name=self.machine.name,
dataset=self.machine.dataset.to_dict(),
metadata=self.machine.metadata,
model=self.machine.model,
project_name=self.machine.project_name,
evaluation=self.machine.evaluation,
runtime=self.machine.runtime,
machine: Machine = Machine.from_dict(
dict(
name=self.machine.name,
dataset=self.machine.dataset.to_dict(),
metadata=self.machine.metadata.to_dict(),
model=self.machine.model,
project_name=self.machine.project_name,
evaluation=self.machine.evaluation,
runtime=self.machine.runtime,
),
back_compatibles=self.back_compatibles,
default_data_provider=self.default_data_provider,
)

split_metadata: Dict[str, Any] = dict()
Expand Down Expand Up @@ -566,7 +586,7 @@ def calculate_cache_key(self, machine: Machine) -> str:
-------
>>> from gordo.machine import Machine
>>> from gordo_dataset.sensor_tag import SensorTag
>>> machine = Machine(
>>> machine = Machine.from_config(dict(
... name="special-model-name",
... model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
... dataset={
Expand All @@ -577,7 +597,7 @@ def calculate_cache_key(self, machine: Machine) -> str:
... "target_tag_list": [SensorTag("Tag 3"), SensorTag("Tag 4")]
... },
... project_name='test-proj'
... )
... ))
>>> builder = ModelBuilder(machine)
>>> len(builder.cache_key)
128
Expand Down
2 changes: 0 additions & 2 deletions gordo/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from gordo_dataset.data_providers.providers import NoSuitableDataProviderError
from gordo_dataset.sensor_tag import SensorTagNormalizationError
from gordo_dataset.base import ConfigurationError
from gordo_dataset.exceptions import ConfigException, InsufficientDataError
from gunicorn.glogging import Logger
from typing import Tuple, List, Any, cast
Expand All @@ -38,7 +37,6 @@
(SensorTagNormalizationError, 60),
(NoSuitableDataProviderError, 70),
(InsufficientDataError, 80),
(ConfigurationError, 81),
(ImportError, 85),
(ReporterException, 90),
(ConfigException, 100),
Expand Down
97 changes: 65 additions & 32 deletions gordo/machine/machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
import json
import logging
from datetime import datetime
from typing import Dict, Any, Union, Optional, List
from typing import Dict, Any, Optional, List, cast

import numpy as np
import yaml

from gordo_dataset.base import GordoBaseDataset
from gordo_dataset.sensor_tag import SensorTag
from gordo_dataset.import_utils import BackCompatibleLocations
from gordo.machine.validators import (
ValidUrlString,
ValidMetadata,
Expand Down Expand Up @@ -38,44 +39,47 @@ class Machine:
runtime = ValidMachineRuntime()
_strict = True

@staticmethod
def prepare_evaluation(evaluation: Optional[dict]) -> dict:
if evaluation is None:
evaluation = dict(cv_mode="full_build")
return evaluation

def __init__(
self,
name: str,
model: dict,
dataset: Union[GordoBaseDataset, dict],
dataset: GordoBaseDataset,
project_name: str,
evaluation: Optional[dict] = None,
metadata: Optional[Union[dict, Metadata]] = None,
runtime=None,
metadata: Optional[Metadata] = None,
runtime: Optional[dict] = None,
):

if runtime is None:
runtime = dict()
if evaluation is None:
evaluation = dict(cv_mode="full_build")
if metadata is None:
metadata = dict()
metadata = cast(Any, Metadata).from_dict({})
self.name = name
self.model = model
self.dataset = (
dataset
if isinstance(dataset, GordoBaseDataset)
else GordoBaseDataset.from_dict(dataset)
)
self.dataset = dataset
self.runtime = runtime
self.evaluation = evaluation
self.metadata = (
metadata
if isinstance(metadata, Metadata)
else Metadata.from_dict(metadata) # type: ignore
)
self.evaluation = self.prepare_evaluation(evaluation)
self.metadata = metadata
self.project_name = project_name

# host validation
self.host = f"gordoserver-{self.project_name}-{self.name}"

# TODO TypedDict for config argument
@classmethod
def from_config( # type: ignore
cls, config: Dict[str, Any], project_name: str, config_globals=None
cls,
config: Dict[str, Any],
project_name: Optional[str] = None,
config_globals=None,
back_compatibles: Optional[BackCompatibleLocations] = None,
default_data_provider: Optional[str] = None,
):
"""
Construct an instance from a block of YAML config file which represents
Expand All @@ -89,6 +93,9 @@ def from_config( # type: ignore
Name of the project this Machine belongs to.
config_globals:
The block of config within the YAML file within `globals`
back_compatibles: Optional[BackCompatibleLocations]
See the `gordo_dataset.import_utils.prepare_back_compatible_locations()` function for reference.
default_data_provider: Optional[str]
Returns
-------
Expand All @@ -100,15 +107,20 @@ def from_config( # type: ignore
name = config["name"]
model = config.get("model") or config_globals.get("model")

if project_name is None:
project_name = config.get("project_name", None)
if project_name is None:
raise ValueError("project_name is empty")

local_runtime = config.get("runtime", dict())
runtime = patch_dict(config_globals.get("runtime", dict()), local_runtime)

dataset_config = patch_dict(
dataset = patch_dict(
config.get("dataset", dict()), config_globals.get("dataset", dict())
)
dataset = GordoBaseDataset.from_dict(dataset_config)
config_evaluation = cls.prepare_evaluation(config.get("evaluation"))
evaluation = patch_dict(
config_globals.get("evaluation", dict()), config.get("evaluation", dict())
config_globals.get("evaluation", dict()), config_evaluation
)

metadata = Metadata(
Expand All @@ -117,14 +129,18 @@ def from_config( # type: ignore
"machine-metadata": config.get("metadata", dict()),
}
)
return cls(
name,
model,
dataset,
metadata=metadata,
runtime=runtime,
project_name=project_name,
evaluation=evaluation,
return cls.from_dict(
{
"name": name,
"model": model,
"dataset": dataset,
"project_name": project_name,
"evaluation": evaluation,
"metadata": metadata,
"runtime": runtime,
},
back_compatibles=back_compatibles,
default_data_provider=default_data_provider,
)

def normalize_sensor_tags(self, tag_list: TagsList) -> List[SensorTag]:
Expand Down Expand Up @@ -153,13 +169,30 @@ def __str__(self):
def __eq__(self, other):
return self.to_dict() == other.to_dict()

# TODO TypedDict for d argument
@classmethod
def from_dict(cls, d: dict) -> "Machine":
def from_dict(
cls,
d: dict[str, Any],
back_compatibles: Optional[BackCompatibleLocations] = None,
default_data_provider: Optional[str] = None,
) -> "Machine":
"""
Get an instance from a dict taken from :func:`~Machine.to_dict`
"""
# No special treatment required, just here for consistency.
return cls(**d)
args: dict[str, Any] = {}
for k, v in d.items():
if k == "dataset" and isinstance(v, dict):
v = GordoBaseDataset.from_dict(
v,
back_compatibles=back_compatibles,
default_data_provider=default_data_provider,
)
if k == "metadata" and isinstance(v, dict):
v = cast(Any, Metadata).from_dict(v)
args[k] = v
return cls(**args)

def to_dict(self):
"""
Expand Down
9 changes: 8 additions & 1 deletion gordo/workflow/config_elements/normalized_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from gordo.workflow.workflow_generator.helpers import patch_dict
from gordo.machine import Machine
from gordo import __version__
from gordo_dataset.import_utils import BackCompatibleLocations
from packaging.version import parse
from pydantic import parse_obj_as, BaseModel

Expand Down Expand Up @@ -108,6 +109,8 @@ def __init__(
project_name: str,
gordo_version: Optional[str] = None,
model_builder_env: Optional[dict] = None,
back_compatibles: Optional[BackCompatibleLocations] = None,
default_data_provider: Optional[str] = None,
):
if gordo_version is None:
gordo_version = __version__
Expand Down Expand Up @@ -137,7 +140,11 @@ def __init__(
self.project_name = project_name
self.machines: List[Machine] = [
Machine.from_config(
conf, project_name=project_name, config_globals=patched_globals
conf,
project_name=project_name,
config_globals=patched_globals,
back_compatibles=back_compatibles,
default_data_provider=default_data_provider,
)
for conf in config["machines"]
]
Expand Down
9 changes: 5 additions & 4 deletions requirements/full_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with python 3.9
# To update, run:
#
# pip-compile --extra-index-url=<index_url> --no-emit-index-url --output-file=full_requirements.txt mlflow_requirements.in postgres_requirements.in requirements.in
# pip-compile --no-emit-index-url --output-file=full_requirements.txt mlflow_requirements.in postgres_requirements.in requirements.in
#
absl-py==0.11.0
# via
Expand Down Expand Up @@ -150,9 +150,9 @@ google-auth-oauthlib==0.4.1
# via tensorboard
google-pasta==0.2.0
# via tensorflow
gordo-client==5.2.0rc0
gordo-client==5.2.0rc2
# via -r requirements.in
gordo-dataset==5.0.0rc3
gordo-dataset==5.0.0rc6
# via
# -r requirements.in
# gordo-client
Expand Down Expand Up @@ -297,6 +297,7 @@ packaging==20.7
# mlflow
# mlflow-skinny
# numexpr
# xarray
pandas==1.4.2
# via
# catboost
Expand Down Expand Up @@ -504,7 +505,7 @@ wrapt==1.11.2
# via
# gordo-client
# tensorflow
xarray==0.20.2
xarray==2022.3.0
# via gordo-dataset
zipp==2.0.0
# via importlib-metadata
Expand Down
Loading

0 comments on commit 15ab9b3

Please sign in to comment.