Skip to content

Commit

Permalink
Fix instance to pandas prop name conflict (#2050)
Browse files Browse the repository at this point in the history
  • Loading branch information
haakonvt authored Nov 27, 2024
1 parent 5e7b77e commit f8302f4
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 8 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

## [7.69.1] - 2024-11-25
## [7.69.2] - 2024-11-28
### Improved
- Handle conversion of instance lists like NodeList to pandas DataFrame in scenarios where: a) properties are expanded
into columns, b) the view ID prefix has be removed and c) one or more user properties have a naming conflict with
base properties. This no longer raises a documented error by pandas, but gives a warning instead.

## [7.69.1] - 2024-11-27
### Fixed
- Convenience methods for `TimeSeries` (defined through Data Modeling with `instance_id`) now works as
intended: `count`, `latest` and `first`.
Expand Down
2 changes: 1 addition & 1 deletion cognite/client/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.69.1"
__version__ = "7.69.2"
__api_subversion__ = "20230101"
31 changes: 26 additions & 5 deletions cognite/client/data_classes/data_modeling/instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def to_pandas( # type: ignore [override]
camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). Does not affect properties if expanded.
convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
expand_properties (bool): Expand the properties into separate rows. Note: Will change default to True in the next major version.
remove_property_prefix (bool): Remove view ID prefix from row names of expanded properties (in index). Requires data to be from a single view.
remove_property_prefix (bool): Attempt to remove the view ID prefix from row names of expanded properties (in index). Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept.
**kwargs (Any): For backwards compatibility.
Returns:
Expand All @@ -455,10 +455,20 @@ def to_pandas( # type: ignore [override]
view_id, *extra = self.properties.keys()
# We only do/allow this if we have a single source:
if not extra:
prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
prefix = "{}.{}/{}.".format(*view_id.as_tuple())
prop_df.columns = prop_df.columns.str.removeprefix(prefix)

if isinstance(self, TypedInstance):
attr_name_mapping = self._get_descriptor_property_name_mapping()
prop_df = prop_df.rename(columns=attr_name_mapping)

if any(overlapping := col.index.intersection(prop_df.columns)):
warnings.warn(
"One or more expanded property names overlapped with base properties. "
f"These rows (index) will not have their view ID prefix removed: {sorted(overlapping)}",
RuntimeWarning,
)
prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping})
else:
warnings.warn(
"Can't remove view ID prefix from expanded property rows as source was not unique",
Expand Down Expand Up @@ -1089,7 +1099,7 @@ def to_pandas( # type: ignore [override]
camel_case (bool): Convert column names to camel case (e.g. `externalId` instead of `external_id`). Does not apply to properties.
convert_timestamps (bool): Convert known columns storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
expand_properties (bool): Expand the properties into separate columns. Note: Will change default to True in the next major version.
remove_property_prefix (bool): Remove view ID prefix from columns names of expanded properties. Requires data to be from a single view.
remove_property_prefix (bool): Attempt to remove the view ID prefix from columns names of expanded properties. Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept.
**kwargs (Any): For backwards compatibility.
Returns:
Expand All @@ -1107,16 +1117,27 @@ def to_pandas( # type: ignore [override]
if not expand_properties or "properties" not in df.columns:
return df

prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2)
pd = local_import("pandas")
prop_df = pd.json_normalize(df.pop("properties"), max_level=2)
if remove_property_prefix and not prop_df.empty:
typed_view_ids = {item.get_source() for item in self if isinstance(item, TypedInstance)} # type: ignore [attr-defined]
view_id, *extra = typed_view_ids | set(vid for item in self for vid in item.properties)
# We only do/allow this if we have a single source:
if not extra:
prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
prefix = "{}.{}/{}.".format(*view_id.as_tuple())
prop_df.columns = prop_df.columns.str.removeprefix(prefix)

if len(self) > 0 and isinstance(self[0], TypedInstance):
attr_name_mapping = self[0]._get_descriptor_property_name_mapping()
prop_df = prop_df.rename(columns=attr_name_mapping)

if any(overlapping := df.columns.intersection(prop_df.columns)):
warnings.warn(
"One or more expanded property names overlapped with base properties. "
f"These columns will not have their view ID prefix removed: {sorted(overlapping)}",
RuntimeWarning,
)
prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping})
else:
warnings.warn(
"Can't remove view ID prefix from expanded property columns as multiple sources exist",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.69.1"
version = "7.69.2"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,31 @@ def node_dumped() -> dict[str, Any]:
}


@pytest.fixture
def node_overlapping_props() -> dict[str, Any]:
return {
"space": "sp",
"externalId": "copy pastarino",
"version": 22,
"lastUpdatedTime": 123456789,
"createdTime": 17326520,
"instanceType": "node",
"properties": {
"my-space": {
"my-view/v8": {
"name": "huh A NAME",
"type": "numeric",
"lastUpdatedTime": "not a number (🐫)",
"last_updated_time": "not a number (🐍)",
"unit": {"space": "cdf_cdm_units", "externalId": "temperature:deg_c"},
"externalId": "overlaps-ext-id",
}
}
},
"type": {"space": "asdf", "externalId": "im-a-time-series-type"},
}


@pytest.fixture
def edge_dumped(node_dumped: dict[str, Any]) -> dict[str, Any]:
return {
Expand Down Expand Up @@ -481,6 +506,73 @@ def test_expand_properties__list_class_empty_properties(

assert "properties" not in expanded_with_empty_properties.columns

def test_instances_to_pandas_expand_true_with_overlapping_props(
self, node_dumped: dict, node_overlapping_props: dict
) -> None:
# We don't test Edge/EdgeList as it uses the same to_pandas implementation as NodeList
nodes: NodeList = NodeList.load([node_dumped, node_overlapping_props])
node = Node.load(node_overlapping_props)
# We need to test with camel_case T/F as e.g. base prop. lastUpdatedTime only overlaps when camel_case=True
# but user properties do not vary with camel_case
kw: dict = {"expand_properties": True, "convert_timestamps": False}
without_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=True, **kw)
without_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=False, **kw)
with_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=True, **kw)
with_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=False, **kw)

without_prefix_camel = node.to_pandas(remove_property_prefix=True, camel_case=True, **kw)
without_prefix_snake = node.to_pandas(remove_property_prefix=True, camel_case=False, **kw)
with_prefix_camel = node.to_pandas(remove_property_prefix=False, camel_case=True, **kw)
with_prefix_snake = node.to_pandas(remove_property_prefix=False, camel_case=False, **kw)

# Base property should never be affected:
prefix = "my-space.my-view/v8"
for df in without_prefix_camel_lst, without_prefix_snake_lst, with_prefix_camel_lst, with_prefix_snake_lst:
base_type = df.at[1, "type"]
assert isinstance(base_type, dict) and base_type["space"] == "asdf"
assert df.at[1, f"{prefix}.type"] == "numeric"

for df in without_prefix_camel, without_prefix_snake, with_prefix_camel, with_prefix_snake:
base_type = df.at["type", "value"]
assert isinstance(base_type, dict) and base_type["space"] == "asdf"
assert df.at[f"{prefix}.type", "value"] == "numeric"

# Check a base property that varies with camel_case:
assert without_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789
assert without_prefix_snake_lst.at[1, "last_updated_time"] == 123456789
assert with_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789
assert with_prefix_snake_lst.at[1, "last_updated_time"] == 123456789

assert without_prefix_camel.at["lastUpdatedTime", "value"] == 123456789
assert without_prefix_snake.at["last_updated_time", "value"] == 123456789
assert with_prefix_camel.at["lastUpdatedTime", "value"] == 123456789
assert with_prefix_snake.at["last_updated_time", "value"] == 123456789

# Check the overlapping user property
assert without_prefix_camel_lst.at[1, "last_updated_time"] == "not a number (🐍)"
assert without_prefix_snake_lst.at[1, "lastUpdatedTime"] == "not a number (🐫)"
assert without_prefix_camel_lst.at[1, f"{prefix}.lastUpdatedTime"] == "not a number (🐫)"
assert without_prefix_snake_lst.at[1, f"{prefix}.last_updated_time"] == "not a number (🐍)"

assert without_prefix_camel.at["last_updated_time", "value"] == "not a number (🐍)"
assert without_prefix_snake.at["lastUpdatedTime", "value"] == "not a number (🐫)"
assert without_prefix_camel.at[f"{prefix}.lastUpdatedTime", "value"] == "not a number (🐫)"
assert without_prefix_snake.at[f"{prefix}.last_updated_time", "value"] == "not a number (🐍)"

for df in with_prefix_camel_lst, with_prefix_snake_lst:
for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")):
assert df.at[1, f"{prefix}.{prop_name}"] == f"not a number ({case})"
# Ensure the property doesn't exist without prefix - or that it's the base
# property when camel_case aligns with base prop.
assert prop_name not in df.columns or df.at[1, prop_name] == 123456789

for df in with_prefix_camel, with_prefix_snake:
for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")):
assert df.at[f"{prefix}.{prop_name}", "value"] == f"not a number ({case})"
# Ensure the property doesn't exist without prefix - or that it's the base
# property when camel_case aligns with base prop.
assert prop_name not in df.index or df.at[prop_name, "value"] == 123456789

def test_node_with_single_property_to_pandas_with_expand_props(self) -> None:
# Bug prior to 7.62.6 made to_pandas(expand_properties=True) fail on nodes with a single property
# due to how squeeze works in pandas, even a DataFrame will be forced into a scalar (to be fair,
Expand Down

0 comments on commit f8302f4

Please sign in to comment.