add columnar data to data types (but don't use yet) #663

Merged · 12 commits · Aug 21, 2024
74 changes: 43 additions & 31 deletions src/power_grid_model/_utils.py
@@ -18,16 +18,17 @@
from power_grid_model.core.data_handling import OutputType, process_output_component_types
from power_grid_model.core.dataset_definitions import ComponentType
from power_grid_model.data_types import (
-    BatchArray,
+    BatchComponentData,
     BatchDataset,
     BatchList,
     Dataset,
     DenseBatchArray,
     PythonDataset,
     SingleArray,
+    SingleComponentData,
     SingleDataset,
     SinglePythonDataset,
-    SparseBatchArray,
+    SparseBatchData,
)
from power_grid_model.typing import ComponentAttributeMapping
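
For orientation, the renamed imports encode the widened data taxonomy. A minimal sketch of the shapes involved, assuming structured dtypes (the authoritative aliases live in `power_grid_model.data_types`):

```python
# Sketch of the shapes behind the new type names (assumed; see
# power_grid_model.data_types for the real aliases). Row-based data stays a
# structured ndarray; the *ComponentData/*BatchData names additionally admit
# columnar dicts, which this PR introduces but does not use yet.
import numpy as np

dtype = np.dtype([("id", "i4")])

single_array = np.zeros(3, dtype=dtype)      # SingleArray: 1-D, one scenario
dense_batch = np.zeros((2, 3), dtype=dtype)  # DenseBatchArray: scenario x element
sparse_batch = {                             # sparse batch: indptr/data pair
    "indptr": np.array([0, 2, 3]),
    "data": np.zeros(3, dtype=dtype),
}
columnar = {"id": np.array([1, 2, 3])}       # columnar dict: one array per attribute
```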

@@ -120,7 +121,7 @@ def get_and_verify_batch_sizes(batch_data: BatchDataset) -> int:
return n_batch_size


-def get_batch_size(batch_data: BatchArray) -> int:
+def get_batch_size(batch_data: BatchComponentData) -> int:
"""
Determine the number of batches and verify the data structure while we're at it.

@@ -135,19 +136,21 @@ def get_batch_size(batch_data: BatchArray) -> int:
         # we assume that it is a single batch.
         if batch_data.ndim == 1:
             return 1
-        n_batches = batch_data.shape[0]
-    elif isinstance(batch_data, dict):
+        return batch_data.shape[0]
+
+    if isinstance(batch_data, dict):
         # If the batch data is a dictionary, we assume that it is an indptr/data structure (otherwise it is an
         # invalid dictionary). There is always one indptr more than there are batches.
         if "indptr" not in batch_data:
             raise ValueError("Invalid batch data format, expected 'indptr' and 'data' entries")
-        n_batches = batch_data["indptr"].size - 1
-    else:
-        # If the batch data is not a numpy array and not a dictionary, it is invalid
-        raise ValueError(
-            "Invalid batch data format, expected a 2-d numpy array or a dictionary with an 'indptr' and 'data' entry"
-        )
-    return n_batches
+        indptr = batch_data["indptr"]
+        if isinstance(indptr, np.ndarray):
+            return indptr.size - 1
+
+    # If the batch data is not a numpy array and not a dictionary, it is invalid
+    raise ValueError(
+        "Invalid batch data format, expected a 2-d numpy array or a dictionary with an 'indptr' and 'data' entry"
+    )
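
A usage sketch of the rewritten dispatch (`get_batch_size` is a private helper, imported here only for illustration; any structured component dtype works):

```python
# Usage sketch for the three branches of get_batch_size above.
import numpy as np

from power_grid_model._utils import get_batch_size  # private helper, for illustration

dtype = np.dtype([("id", "i4")])

assert get_batch_size(np.zeros(3, dtype=dtype)) == 1       # 1-D array: one scenario
assert get_batch_size(np.zeros((5, 3), dtype=dtype)) == 5  # 2-D array: batch axis first
sparse = {"indptr": np.array([0, 2, 2, 3]), "data": np.zeros(3, dtype=dtype)}
assert get_batch_size(sparse) == 3                         # sparse: indptr.size - 1
```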


def split_numpy_array_in_batches(data: DenseBatchArray | SingleArray, component: ComponentType) -> list[np.ndarray]:
@@ -177,7 +180,7 @@ def split_numpy_array_in_batches(data: DenseBatchArray | SingleArray, component:
)


-def split_sparse_batches_in_batches(batch_data: SparseBatchArray, component: ComponentType) -> list[np.ndarray]:
+def split_sparse_batches_in_batches(batch_data: SparseBatchData, component: ComponentType) -> list[SingleComponentData]:
"""
    Split a single numpy array representing a compressed sparse structure into one or more batches

@@ -199,27 +202,36 @@ def split_sparse_batches_in_batches(batch_data: SparseBatchArray, component: Com
data = batch_data["data"]
indptr = batch_data["indptr"]

-    if not isinstance(data, np.ndarray) or data.ndim != 1:
-        raise TypeError(
-            f"Invalid data type {type(data).__name__} in sparse batch data for '{component}' "
-            "(should be a 1D Numpy structured array (i.e. a single 'table'))."
-        )
+    def _split_buffer(buffer: np.ndarray, scenario: int) -> SingleArray:
+        if not isinstance(buffer, np.ndarray) or buffer.ndim != 1:
+            raise TypeError(
+                f"Invalid data type {type(buffer).__name__} in sparse batch data for '{component}' "
+                "(should be a 1D Numpy structured array (i.e. a single 'table'))."
+            )

-    if not isinstance(indptr, np.ndarray) or indptr.ndim != 1 or not np.issubdtype(indptr.dtype, np.integer):
-        raise TypeError(
-            f"Invalid indptr data type {type(indptr).__name__} in batch data for '{component}' "
-            "(should be a 1D Numpy array (i.e. a single 'list'), "
-            "containing indices (i.e. integers))."
-        )
+        if not isinstance(indptr, np.ndarray) or indptr.ndim != 1 or not np.issubdtype(indptr.dtype, np.integer):
+            raise TypeError(
+                f"Invalid indptr data type {type(indptr).__name__} in batch data for '{component}' "
+                "(should be a 1D Numpy array (i.e. a single 'list'), "
+                "containing indices (i.e. integers))."
+            )

-    if indptr[0] != 0 or indptr[-1] != len(data) or any(indptr[i] > indptr[i + 1] for i in range(len(indptr) - 1)):
-        raise TypeError(
-            f"Invalid indptr in batch data for '{component}' "
-            f"(should start with 0, end with the number of objects ({len(data)}) "
-            "and be monotonic increasing)."
-        )
+        if indptr[0] != 0 or indptr[-1] != len(buffer) or indptr[scenario] > indptr[scenario + 1]:
+            raise TypeError(
+                f"Invalid indptr in batch data for '{component}' "
+                f"(should start with 0, end with the number of objects ({len(buffer)}) "
+                "and be monotonic increasing)."
+            )
+
+        return buffer[indptr[scenario] : indptr[scenario + 1]]
+
+    def _get_scenario(scenario: int) -> SingleComponentData:
+        if isinstance(data, dict):
+            # return {attribute: _split_buffer(attribute_data, scenario) for attribute, attribute_data in data.items()}
+            raise NotImplementedError()  # TODO(mgovers): uncomment when columnar data support is added
+        return _split_buffer(data, scenario)

-    return [data[indptr[i] : indptr[i + 1]] for i in range(len(indptr) - 1)]
+    return [_get_scenario(i) for i in range(len(indptr) - 1)]
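
A sketch of what the refactored splitter returns for a row-based sparse batch (the `component` argument only feeds error messages here; `ComponentType.sym_load` is just an example):

```python
# Splitting a row-based sparse batch into per-scenario slices (sketch).
# Columnar dicts take the commented-out branch above and currently raise
# NotImplementedError.
import numpy as np

from power_grid_model._utils import split_sparse_batches_in_batches
from power_grid_model.core.dataset_definitions import ComponentType

batch = {
    "indptr": np.array([0, 2, 2, 3]),  # scenario 0: 2 rows, scenario 1: empty, scenario 2: 1 row
    "data": np.zeros(3, dtype=[("id", "i4")]),
}
scenarios = split_sparse_batches_in_batches(batch, ComponentType.sym_load)
assert [len(s) for s in scenarios] == [2, 0, 1]
```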


def convert_dataset_to_python_dataset(data: Dataset) -> PythonDataset:
42 changes: 28 additions & 14 deletions src/power_grid_model/core/buffer_handling.py
@@ -8,14 +8,22 @@


from dataclasses import dataclass
-from typing import Mapping, Optional
+from typing import Optional, cast

import numpy as np

from power_grid_model.core.error_handling import VALIDATOR_MSG
from power_grid_model.core.index_integer import IdxC, IdxNp
from power_grid_model.core.power_grid_core import IdxPtr, VoidPtr
from power_grid_model.core.power_grid_meta import ComponentMetaData
+from power_grid_model.data_types import (
+    ComponentData,
+    DenseBatchArray,
+    DenseBatchData,
+    SingleArray,
+    SparseBatchArray,
+    SparseBatchData,
+)


@dataclass
@@ -72,12 +80,12 @@ def _get_indptr_view(indptr: np.ndarray) -> IdxPtr: # type: ignore[valid-type]
return np.ascontiguousarray(indptr, dtype=IdxNp).ctypes.data_as(IdxPtr)


-def _get_uniform_buffer_properties(data: np.ndarray) -> BufferProperties:
+def _get_uniform_buffer_properties(data: SingleArray | DenseBatchArray) -> BufferProperties:
"""
Extract the properties of the uniform batch dataset component.

Args:
-        data (np.ndarray): the dataset component.
+        data (SingleArray | DenseBatchArray): the dataset component.

Raises:
KeyError: if the dataset component is not sparse.
Expand Down Expand Up @@ -105,12 +113,12 @@ def _get_uniform_buffer_properties(data: np.ndarray) -> BufferProperties:
)


-def _get_sparse_buffer_properties(data: Mapping[str, np.ndarray]) -> BufferProperties:
+def _get_sparse_buffer_properties(data: SparseBatchArray) -> BufferProperties:
"""
Extract the properties of the sparse batch dataset component.

Args:
-        data (Mapping[str, np.ndarray]): the sparse dataset component.
+        data (SparseBatchArray): the sparse dataset component.

Raises:
KeyError: if the dataset component is not sparse.
@@ -147,12 +155,12 @@ def _get_sparse_buffer_properties(data: Mapping[str, np.ndarray]) -> BufferPrope
)


-def get_buffer_properties(data: np.ndarray | Mapping[str, np.ndarray]) -> BufferProperties:
+def get_buffer_properties(data: ComponentData) -> BufferProperties:
"""
Extract the properties of the dataset component

Args:
-        data (np.ndarray | Mapping[str, np.ndarray]): the dataset component.
+        data (ComponentData): the dataset component.

Raises:
ValueError: if the dataset component contains conflicting or bad data.
@@ -163,7 +171,10 @@ def get_buffer_properties(data: np.ndarray | Mapping[str, np.ndarray]) -> Buffer
if isinstance(data, np.ndarray):
return _get_uniform_buffer_properties(data)

-    return _get_sparse_buffer_properties(data)
+    if isinstance(data.get("indptr"), np.ndarray) and isinstance(data.get("data"), np.ndarray):
+        return _get_sparse_buffer_properties(cast(SparseBatchArray, data))

+    raise NotImplementedError()  # TODO(mgovers): implement columnar data handling
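
The same sparse-vs-columnar test reappears in `get_buffer_view` below. As a sketch, the dispatch rule boils down to this hypothetical predicate (not part of the PR):

```python
# Hypothetical predicate (illustration only): after the ndarray fast path, a
# mapping is treated as a sparse batch only when both "indptr" and "data" are
# ndarrays; any other mapping is assumed to be columnar data, which still
# raises NotImplementedError.
import numpy as np

def looks_sparse(component_data: dict) -> bool:
    return isinstance(component_data.get("indptr"), np.ndarray) and isinstance(
        component_data.get("data"), np.ndarray
    )
```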


def _get_uniform_buffer_view(data: np.ndarray, schema: ComponentMetaData) -> CBuffer:
Expand All @@ -188,7 +199,7 @@ def _get_uniform_buffer_view(data: np.ndarray, schema: ComponentMetaData) -> CBu
)


-def _get_sparse_buffer_view(data: Mapping[str, np.ndarray], schema: ComponentMetaData) -> CBuffer:
+def _get_sparse_buffer_view(data: SparseBatchArray, schema: ComponentMetaData) -> CBuffer:
"""
Get a C API compatible view on a sparse buffer.

@@ -213,7 +224,7 @@ def _get_sparse_buffer_view(data: Mapping[str, np.ndarray], schema: ComponentMet
)


-def get_buffer_view(data: np.ndarray | Mapping[str, np.ndarray], schema: ComponentMetaData) -> CBuffer:
+def get_buffer_view(data: ComponentData, schema: ComponentMetaData) -> CBuffer:
"""
Get a C API compatible view on a buffer.

@@ -227,10 +238,13 @@ def get_buffer_view(data: np.ndarray | Mapping[str, np.ndarray], schema: Compone
if isinstance(data, np.ndarray):
return _get_uniform_buffer_view(data, schema)

-    return _get_sparse_buffer_view(data, schema)
+    if isinstance(data.get("indptr"), np.ndarray) and isinstance(data.get("data"), np.ndarray):
+        return _get_sparse_buffer_view(cast(SparseBatchArray, data), schema)

+    raise NotImplementedError()  # TODO(mgovers): implement columnar data handling


-def create_buffer(properties: BufferProperties, schema: ComponentMetaData) -> np.ndarray | dict[str, np.ndarray]:
+def create_buffer(properties: BufferProperties, schema: ComponentMetaData) -> ComponentData:
"""
Create a buffer with the provided properties and type.

@@ -250,7 +264,7 @@ def create_buffer(properties: BufferProperties, schema: ComponentMetaData) -> np
return _create_uniform_buffer(properties=properties, schema=schema)


-def _create_uniform_buffer(properties: BufferProperties, schema: ComponentMetaData) -> np.ndarray:
+def _create_uniform_buffer(properties: BufferProperties, schema: ComponentMetaData) -> DenseBatchData:
"""
Create a uniform buffer with the provided properties and type.

@@ -275,7 +289,7 @@ def _create_uniform_buffer(properties: BufferProperties, schema: ComponentMetaDa
return np.empty(shape=shape, dtype=schema.dtype)


-def _create_sparse_buffer(properties: BufferProperties, schema: ComponentMetaData) -> dict[str, np.ndarray]:
+def _create_sparse_buffer(properties: BufferProperties, schema: ComponentMetaData) -> SparseBatchData:
"""
Create a sparse buffer with the provided properties and type.

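
The function body is collapsed in this view. For context, a minimal sketch of the layout such a buffer plausibly takes, assuming `data` sized to the total element count and an int64 `indptr` (not the PR's exact code):

```python
# Assumed sparse-buffer layout (sketch only; the real body is collapsed above):
# an uninitialised row table plus an indptr one entry longer than the batch.
import numpy as np

def sparse_buffer_sketch(dtype: np.dtype, n_total_elements: int, batch_size: int) -> dict:
    return {
        "data": np.empty(n_total_elements, dtype=dtype),
        "indptr": np.zeros(batch_size + 1, dtype=np.int64),
    }
```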
13 changes: 5 additions & 8 deletions src/power_grid_model/core/data_handling.py
@@ -8,14 +8,11 @@


from enum import Enum
-from typing import Mapping
-
-import numpy as np

from power_grid_model.core.dataset_definitions import ComponentType, DatasetType
from power_grid_model.core.power_grid_dataset import CConstDataset, CMutableDataset
from power_grid_model.core.power_grid_meta import initialize_array, power_grid_meta_data
-from power_grid_model.data_types import Dataset
+from power_grid_model.data_types import Dataset, SingleDataset
from power_grid_model.enum import CalculationType
from power_grid_model.typing import ComponentAttributeMapping, _ComponentAttributeMappingDict

@@ -56,7 +53,7 @@ def get_output_type(*, calculation_type: CalculationType, symmetric: bool) -> Ou
raise NotImplementedError()


-def prepare_input_view(input_data: Mapping[ComponentType, np.ndarray]) -> CConstDataset:
+def prepare_input_view(input_data: SingleDataset) -> CConstDataset:
"""
    Create a view of the input data in a format compatible with the PGM core library.

@@ -70,7 +67,7 @@ def prepare_input_view(input_data: Mapping[ComponentType, np.ndarray]) -> CConst
return CConstDataset(input_data, dataset_type=DatasetType.input)


-def prepare_update_view(update_data: Mapping[ComponentType, np.ndarray | Mapping[str, np.ndarray]]) -> CConstDataset:
+def prepare_update_view(update_data: Dataset) -> CConstDataset:
"""
    Create a view of the update data, or an empty view if not provided, in a format compatible with the PGM core library.

@@ -84,7 +81,7 @@ def prepare_update_view(update_data: Mapping[ComponentType, np.ndarray | Mapping
return CConstDataset(update_data, dataset_type=DatasetType.update)
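
For reference, the shapes `prepare_update_view` now accepts under the widened `Dataset` annotation (a sketch; the component and sizes are arbitrary examples):

```python
# The three accepted update_data layouts (sketch).
import numpy as np

from power_grid_model import initialize_array

update_single = {"sym_load": initialize_array("update", "sym_load", 3)}      # one scenario
update_dense = {"sym_load": initialize_array("update", "sym_load", (2, 3))}  # dense batch
update_sparse = {                                                            # sparse batch
    "sym_load": {
        "indptr": np.array([0, 2, 3]),
        "data": initialize_array("update", "sym_load", 3),
    }
}
```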


-def prepare_output_view(output_data: Mapping[ComponentType, np.ndarray], output_type: OutputType) -> CMutableDataset:
+def prepare_output_view(output_data: Dataset, output_type: OutputType) -> CMutableDataset:
"""
    Create a view of the output data in a format compatible with the PGM core library.

@@ -166,7 +163,7 @@ def process_output_component_types(
"""
# limit all component count to user specified component types in output and convert to a dict
if output_component_types is None:
-        output_component_types = {k: None for k in available_components}
+        output_component_types = {ComponentType[k]: None for k in available_components}
elif isinstance(output_component_types, (list, set)):
output_component_types = {k: None for k in output_component_types}
elif not isinstance(output_component_types, dict) or not all(
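
The rest of the hunk is collapsed. For context, a sketch of the normalisation this function performs; the `ComponentType[k]` fix above converts the string keys of `available_components` into enum keys (assuming `ComponentType` is a string-keyed enum):

```python
# Sketch: every accepted spelling of output_component_types collapses to a
# dict of {ComponentType: attributes-or-None}.
from power_grid_model.core.dataset_definitions import ComponentType

available_components = ["node", "line"]

# None -> all available components, all attributes (note the enum lookup)
normalized = {ComponentType[k]: None for k in available_components}
assert ComponentType.node in normalized

# list/set input -> listed components, all attributes
normalized_from_list = {k: None for k in [ComponentType.node, ComponentType.line]}
```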
30 changes: 7 additions & 23 deletions src/power_grid_model/core/power_grid_dataset.py
@@ -27,7 +27,7 @@
power_grid_core as pgc,
)
from power_grid_model.core.power_grid_meta import DatasetMetaData, power_grid_meta_data
-from power_grid_model.data_types import Dataset
+from power_grid_model.data_types import ComponentData, Dataset
from power_grid_model.errors import PowerGridError


@@ -186,11 +186,7 @@ class CMutableDataset:
_mutable_dataset: MutableDatasetPtr
_buffer_views: list[CBuffer]

-    def __new__(
-        cls,
-        data: Mapping[ComponentType, np.ndarray] | Mapping[ComponentType, np.ndarray | Mapping[str, np.ndarray]],
-        dataset_type: Any = None,
-    ):
+    def __new__(cls, data: Dataset, dataset_type: Any = None):
instance = super().__new__(cls)
instance._mutable_dataset = MutableDatasetPtr()
instance._buffer_views = []
@@ -243,10 +239,7 @@ def get_buffer_views(self) -> list[CBuffer]:
"""
return self._buffer_views

-    def _add_data(
-        self,
-        data: Mapping[ComponentType, np.ndarray] | Mapping[ComponentType, np.ndarray | Mapping[str, np.ndarray]],
-    ):
+    def _add_data(self, data: Dataset):
"""
Add Power Grid Model data to the mutable dataset view.

@@ -261,12 +254,7 @@ def _add_data(
for component, component_data in data.items():
self._add_component_data(component, component_data, allow_unknown=False)

-    def _add_component_data(
-        self,
-        component: ComponentType,
-        data: np.ndarray | Mapping[str, np.ndarray],
-        allow_unknown: bool = False,
-    ):
+    def _add_component_data(self, component: ComponentType, data: ComponentData, allow_unknown: bool = False):
"""
Add Power Grid Model data for a single component to the mutable dataset view.

@@ -301,7 +289,7 @@ def _register_buffer(self, component: ComponentType, buffer: CBuffer):
)
assert_no_error()

-    def _validate_properties(self, data: np.ndarray | Mapping[str, np.ndarray]):
+    def _validate_properties(self, data: ComponentData):
properties = get_buffer_properties(data)
if properties.is_batch != self._is_batch:
raise ValueError(
@@ -328,11 +316,7 @@ class CConstDataset:
_const_dataset: ConstDatasetPtr
_buffer_views: list[CBuffer]

-    def __new__(
-        cls,
-        data: Mapping[ComponentType, np.ndarray] | Mapping[ComponentType, np.ndarray | Mapping[str, np.ndarray]],
-        dataset_type: Optional[DatasetType] = None,
-    ):
+    def __new__(cls, data: Dataset, dataset_type: Optional[DatasetType] = None):
instance = super().__new__(cls)
instance._const_dataset = ConstDatasetPtr()

@@ -419,7 +403,7 @@ def get_data(self) -> Dataset:
"""
return self._data

-    def get_component_data(self, component: ComponentType) -> np.ndarray | Mapping[str, np.ndarray]:
+    def get_component_data(self, component: ComponentType) -> ComponentData:
"""
Retrieve Power Grid Model data from the dataset for a specific component.
