diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ccefddf48..686438ff65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,13 @@ Changes are grouped as follows - `Fixed` for any bug fixes. - `Security` in case of vulnerabilities. -## [7.69.1] - 2024-11-25 +## [7.69.2] - 2024-11-28 +### Improved +- Handle conversion of instance lists like NodeList to pandas DataFrame in scenarios where: a) properties are expanded + into columns, b) the view ID prefix has be removed and c) one or more user properties have a naming conflict with + base properties. This no longer raises a documented error by pandas, but gives a warning instead. + +## [7.69.1] - 2024-11-27 ### Fixed - Convenience methods for `TimeSeries` (defined through Data Modeling with `instance_id`) now works as intended: `count`, `latest` and `first`. diff --git a/cognite/client/_version.py b/cognite/client/_version.py index 32b309b089..d394953f39 100644 --- a/cognite/client/_version.py +++ b/cognite/client/_version.py @@ -1,4 +1,4 @@ from __future__ import annotations -__version__ = "7.69.1" +__version__ = "7.69.2" __api_subversion__ = "20230101" diff --git a/cognite/client/data_classes/data_modeling/instances.py b/cognite/client/data_classes/data_modeling/instances.py index 890f31fb6a..7ded525062 100644 --- a/cognite/client/data_classes/data_modeling/instances.py +++ b/cognite/client/data_classes/data_modeling/instances.py @@ -428,7 +428,7 @@ def to_pandas( # type: ignore [override] camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). Does not affect properties if expanded. convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties. expand_properties (bool): Expand the properties into separate rows. Note: Will change default to True in the next major version. - remove_property_prefix (bool): Remove view ID prefix from row names of expanded properties (in index). Requires data to be from a single view. + remove_property_prefix (bool): Attempt to remove the view ID prefix from row names of expanded properties (in index). Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept. **kwargs (Any): For backwards compatibility. Returns: @@ -455,10 +455,20 @@ def to_pandas( # type: ignore [override] view_id, *extra = self.properties.keys() # We only do/allow this if we have a single source: if not extra: - prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple())) + prefix = "{}.{}/{}.".format(*view_id.as_tuple()) + prop_df.columns = prop_df.columns.str.removeprefix(prefix) + if isinstance(self, TypedInstance): attr_name_mapping = self._get_descriptor_property_name_mapping() prop_df = prop_df.rename(columns=attr_name_mapping) + + if any(overlapping := col.index.intersection(prop_df.columns)): + warnings.warn( + "One or more expanded property names overlapped with base properties. " + f"These rows (index) will not have their view ID prefix removed: {sorted(overlapping)}", + RuntimeWarning, + ) + prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping}) else: warnings.warn( "Can't remove view ID prefix from expanded property rows as source was not unique", @@ -1089,7 +1099,7 @@ def to_pandas( # type: ignore [override] camel_case (bool): Convert column names to camel case (e.g. `externalId` instead of `external_id`). Does not apply to properties. convert_timestamps (bool): Convert known columns storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties. expand_properties (bool): Expand the properties into separate columns. Note: Will change default to True in the next major version. - remove_property_prefix (bool): Remove view ID prefix from columns names of expanded properties. Requires data to be from a single view. + remove_property_prefix (bool): Attempt to remove the view ID prefix from columns names of expanded properties. Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept. **kwargs (Any): For backwards compatibility. Returns: @@ -1107,16 +1117,27 @@ def to_pandas( # type: ignore [override] if not expand_properties or "properties" not in df.columns: return df - prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2) + pd = local_import("pandas") + prop_df = pd.json_normalize(df.pop("properties"), max_level=2) if remove_property_prefix and not prop_df.empty: typed_view_ids = {item.get_source() for item in self if isinstance(item, TypedInstance)} # type: ignore [attr-defined] view_id, *extra = typed_view_ids | set(vid for item in self for vid in item.properties) # We only do/allow this if we have a single source: if not extra: - prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple())) + prefix = "{}.{}/{}.".format(*view_id.as_tuple()) + prop_df.columns = prop_df.columns.str.removeprefix(prefix) + if len(self) > 0 and isinstance(self[0], TypedInstance): attr_name_mapping = self[0]._get_descriptor_property_name_mapping() prop_df = prop_df.rename(columns=attr_name_mapping) + + if any(overlapping := df.columns.intersection(prop_df.columns)): + warnings.warn( + "One or more expanded property names overlapped with base properties. " + f"These columns will not have their view ID prefix removed: {sorted(overlapping)}", + RuntimeWarning, + ) + prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping}) else: warnings.warn( "Can't remove view ID prefix from expanded property columns as multiple sources exist", diff --git a/pyproject.toml b/pyproject.toml index 1dc147dd34..bff80801b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "cognite-sdk" -version = "7.69.1" +version = "7.69.2" description = "Cognite Python SDK" readme = "README.md" documentation = "https://cognite-sdk-python.readthedocs-hosted.com" diff --git a/tests/tests_unit/test_data_classes/test_data_models/test_instances.py b/tests/tests_unit/test_data_classes/test_data_models/test_instances.py index e2a321def3..d23269566a 100644 --- a/tests/tests_unit/test_data_classes/test_data_models/test_instances.py +++ b/tests/tests_unit/test_data_classes/test_data_models/test_instances.py @@ -331,6 +331,31 @@ def node_dumped() -> dict[str, Any]: } +@pytest.fixture +def node_overlapping_props() -> dict[str, Any]: + return { + "space": "sp", + "externalId": "copy pastarino", + "version": 22, + "lastUpdatedTime": 123456789, + "createdTime": 17326520, + "instanceType": "node", + "properties": { + "my-space": { + "my-view/v8": { + "name": "huh A NAME", + "type": "numeric", + "lastUpdatedTime": "not a number (🐫)", + "last_updated_time": "not a number (🐍)", + "unit": {"space": "cdf_cdm_units", "externalId": "temperature:deg_c"}, + "externalId": "overlaps-ext-id", + } + } + }, + "type": {"space": "asdf", "externalId": "im-a-time-series-type"}, + } + + @pytest.fixture def edge_dumped(node_dumped: dict[str, Any]) -> dict[str, Any]: return { @@ -481,6 +506,73 @@ def test_expand_properties__list_class_empty_properties( assert "properties" not in expanded_with_empty_properties.columns + def test_instances_to_pandas_expand_true_with_overlapping_props( + self, node_dumped: dict, node_overlapping_props: dict + ) -> None: + # We don't test Edge/EdgeList as it uses the same to_pandas implementation as NodeList + nodes: NodeList = NodeList.load([node_dumped, node_overlapping_props]) + node = Node.load(node_overlapping_props) + # We need to test with camel_case T/F as e.g. base prop. lastUpdatedTime only overlaps when camel_case=True + # but user properties do not vary with camel_case + kw: dict = {"expand_properties": True, "convert_timestamps": False} + without_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=True, **kw) + without_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=False, **kw) + with_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=True, **kw) + with_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=False, **kw) + + without_prefix_camel = node.to_pandas(remove_property_prefix=True, camel_case=True, **kw) + without_prefix_snake = node.to_pandas(remove_property_prefix=True, camel_case=False, **kw) + with_prefix_camel = node.to_pandas(remove_property_prefix=False, camel_case=True, **kw) + with_prefix_snake = node.to_pandas(remove_property_prefix=False, camel_case=False, **kw) + + # Base property should never be affected: + prefix = "my-space.my-view/v8" + for df in without_prefix_camel_lst, without_prefix_snake_lst, with_prefix_camel_lst, with_prefix_snake_lst: + base_type = df.at[1, "type"] + assert isinstance(base_type, dict) and base_type["space"] == "asdf" + assert df.at[1, f"{prefix}.type"] == "numeric" + + for df in without_prefix_camel, without_prefix_snake, with_prefix_camel, with_prefix_snake: + base_type = df.at["type", "value"] + assert isinstance(base_type, dict) and base_type["space"] == "asdf" + assert df.at[f"{prefix}.type", "value"] == "numeric" + + # Check a base property that varies with camel_case: + assert without_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789 + assert without_prefix_snake_lst.at[1, "last_updated_time"] == 123456789 + assert with_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789 + assert with_prefix_snake_lst.at[1, "last_updated_time"] == 123456789 + + assert without_prefix_camel.at["lastUpdatedTime", "value"] == 123456789 + assert without_prefix_snake.at["last_updated_time", "value"] == 123456789 + assert with_prefix_camel.at["lastUpdatedTime", "value"] == 123456789 + assert with_prefix_snake.at["last_updated_time", "value"] == 123456789 + + # Check the overlapping user property + assert without_prefix_camel_lst.at[1, "last_updated_time"] == "not a number (🐍)" + assert without_prefix_snake_lst.at[1, "lastUpdatedTime"] == "not a number (🐫)" + assert without_prefix_camel_lst.at[1, f"{prefix}.lastUpdatedTime"] == "not a number (🐫)" + assert without_prefix_snake_lst.at[1, f"{prefix}.last_updated_time"] == "not a number (🐍)" + + assert without_prefix_camel.at["last_updated_time", "value"] == "not a number (🐍)" + assert without_prefix_snake.at["lastUpdatedTime", "value"] == "not a number (🐫)" + assert without_prefix_camel.at[f"{prefix}.lastUpdatedTime", "value"] == "not a number (🐫)" + assert without_prefix_snake.at[f"{prefix}.last_updated_time", "value"] == "not a number (🐍)" + + for df in with_prefix_camel_lst, with_prefix_snake_lst: + for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")): + assert df.at[1, f"{prefix}.{prop_name}"] == f"not a number ({case})" + # Ensure the property doesn't exist without prefix - or that it's the base + # property when camel_case aligns with base prop. + assert prop_name not in df.columns or df.at[1, prop_name] == 123456789 + + for df in with_prefix_camel, with_prefix_snake: + for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")): + assert df.at[f"{prefix}.{prop_name}", "value"] == f"not a number ({case})" + # Ensure the property doesn't exist without prefix - or that it's the base + # property when camel_case aligns with base prop. + assert prop_name not in df.index or df.at[prop_name, "value"] == 123456789 + def test_node_with_single_property_to_pandas_with_expand_props(self) -> None: # Bug prior to 7.62.6 made to_pandas(expand_properties=True) fail on nodes with a single property # due to how squeeze works in pandas, even a DataFrame will be forced into a scalar (to be fair,