Fix instance to pandas prop name conflict (#2050)

cognitedata · Nov 27, 2024 · f8302f4 · f8302f4
1 parent 5e7b77e
commit f8302f4
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,7 +17,13 @@ Changes are grouped as follows
 - `Fixed` for any bug fixes.
 - `Security` in case of vulnerabilities.
 
-## [7.69.1] - 2024-11-25
+## [7.69.2] - 2024-11-28
+### Improved
+- Handle conversion of instance lists like NodeList to pandas DataFrame in scenarios where: a) properties are expanded
+  into columns, b) the view ID prefix has be removed and c) one or more user properties have a naming conflict with
+  base properties. This no longer raises a documented error by pandas, but gives a warning instead.
+
+## [7.69.1] - 2024-11-27
 ### Fixed
 - Convenience methods for `TimeSeries` (defined through Data Modeling with `instance_id`) now works as
   intended: `count`, `latest` and `first`.

diff --git a/cognite/client/_version.py b/cognite/client/_version.py
@@ -1,4 +1,4 @@
 from __future__ import annotations
 
-__version__ = "7.69.1"
+__version__ = "7.69.2"
 __api_subversion__ = "20230101"
diff --git a/cognite/client/data_classes/data_modeling/instances.py b/cognite/client/data_classes/data_modeling/instances.py
@@ -428,7 +428,7 @@ def to_pandas(  # type: ignore [override]
             camel_case (bool): Convert attribute names to camel case (e.g. `externalId` instead of `external_id`). Does not affect properties if expanded.
             convert_timestamps (bool): Convert known attributes storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
             expand_properties (bool): Expand the properties into separate rows. Note: Will change default to True in the next major version.
-            remove_property_prefix (bool): Remove view ID prefix from row names of expanded properties (in index). Requires data to be from a single view.
+            remove_property_prefix (bool): Attempt to remove the view ID prefix from row names of expanded properties (in index). Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept.
             **kwargs (Any): For backwards compatibility.
 
         Returns:
@@ -455,10 +455,20 @@ def to_pandas(  # type: ignore [override]
             view_id, *extra = self.properties.keys()
             # We only do/allow this if we have a single source:
             if not extra:
-                prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
+                prefix = "{}.{}/{}.".format(*view_id.as_tuple())
+                prop_df.columns = prop_df.columns.str.removeprefix(prefix)
+
                 if isinstance(self, TypedInstance):
                     attr_name_mapping = self._get_descriptor_property_name_mapping()
                     prop_df = prop_df.rename(columns=attr_name_mapping)
+
+                if any(overlapping := col.index.intersection(prop_df.columns)):
+                    warnings.warn(
+                        "One or more expanded property names overlapped with base properties. "
+                        f"These rows (index) will not have their view ID prefix removed: {sorted(overlapping)}",
+                        RuntimeWarning,
+                    )
+                    prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping})
             else:
                 warnings.warn(
                     "Can't remove view ID prefix from expanded property rows as source was not unique",
@@ -1089,7 +1099,7 @@ def to_pandas(  # type: ignore [override]
             camel_case (bool): Convert column names to camel case (e.g. `externalId` instead of `external_id`). Does not apply to properties.
             convert_timestamps (bool): Convert known columns storing CDF timestamps (milliseconds since epoch) to datetime. Does not affect properties.
             expand_properties (bool): Expand the properties into separate columns. Note: Will change default to True in the next major version.
-            remove_property_prefix (bool): Remove view ID prefix from columns names of expanded properties. Requires data to be from a single view.
+            remove_property_prefix (bool): Attempt to remove the view ID prefix from columns names of expanded properties. Requires data to be from a single view and that all property names do not conflict with base properties (e.g. 'space' or 'type'). In such cases, a warning is issued and the prefix is kept.
             **kwargs (Any): For backwards compatibility.
 
         Returns:
@@ -1107,16 +1117,27 @@ def to_pandas(  # type: ignore [override]
         if not expand_properties or "properties" not in df.columns:
             return df
 
-        prop_df = local_import("pandas").json_normalize(df.pop("properties"), max_level=2)
+        pd = local_import("pandas")
+        prop_df = pd.json_normalize(df.pop("properties"), max_level=2)
         if remove_property_prefix and not prop_df.empty:
             typed_view_ids = {item.get_source() for item in self if isinstance(item, TypedInstance)}  # type: ignore [attr-defined]
             view_id, *extra = typed_view_ids | set(vid for item in self for vid in item.properties)
             # We only do/allow this if we have a single source:
             if not extra:
-                prop_df.columns = prop_df.columns.str.removeprefix("{}.{}/{}.".format(*view_id.as_tuple()))
+                prefix = "{}.{}/{}.".format(*view_id.as_tuple())
+                prop_df.columns = prop_df.columns.str.removeprefix(prefix)
+
                 if len(self) > 0 and isinstance(self[0], TypedInstance):
                     attr_name_mapping = self[0]._get_descriptor_property_name_mapping()
                     prop_df = prop_df.rename(columns=attr_name_mapping)
+
+                if any(overlapping := df.columns.intersection(prop_df.columns)):
+                    warnings.warn(
+                        "One or more expanded property names overlapped with base properties. "
+                        f"These columns will not have their view ID prefix removed: {sorted(overlapping)}",
+                        RuntimeWarning,
+                    )
+                    prop_df = prop_df.rename(columns={col: f"{prefix}{col}" for col in overlapping})
             else:
                 warnings.warn(
                     "Can't remove view ID prefix from expanded property columns as multiple sources exist",

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "cognite-sdk"
 
-version = "7.69.1"
+version = "7.69.2"
 description = "Cognite Python SDK"
 readme = "README.md"
 documentation = "https://cognite-sdk-python.readthedocs-hosted.com"

diff --git a/tests/tests_unit/test_data_classes/test_data_models/test_instances.py b/tests/tests_unit/test_data_classes/test_data_models/test_instances.py
@@ -331,6 +331,31 @@ def node_dumped() -> dict[str, Any]:
     }
 
 
+@pytest.fixture
+def node_overlapping_props() -> dict[str, Any]:
+    return {
+        "space": "sp",
+        "externalId": "copy pastarino",
+        "version": 22,
+        "lastUpdatedTime": 123456789,
+        "createdTime": 17326520,
+        "instanceType": "node",
+        "properties": {
+            "my-space": {
+                "my-view/v8": {
+                    "name": "huh A NAME",
+                    "type": "numeric",
+                    "lastUpdatedTime": "not a number (🐫)",
+                    "last_updated_time": "not a number (🐍)",
+                    "unit": {"space": "cdf_cdm_units", "externalId": "temperature:deg_c"},
+                    "externalId": "overlaps-ext-id",
+                }
+            }
+        },
+        "type": {"space": "asdf", "externalId": "im-a-time-series-type"},
+    }
+
+
 @pytest.fixture
 def edge_dumped(node_dumped: dict[str, Any]) -> dict[str, Any]:
     return {
@@ -481,6 +506,73 @@ def test_expand_properties__list_class_empty_properties(
 
         assert "properties" not in expanded_with_empty_properties.columns
 
+    def test_instances_to_pandas_expand_true_with_overlapping_props(
+        self, node_dumped: dict, node_overlapping_props: dict
+    ) -> None:
+        # We don't test Edge/EdgeList as it uses the same to_pandas implementation as NodeList
+        nodes: NodeList = NodeList.load([node_dumped, node_overlapping_props])
+        node = Node.load(node_overlapping_props)
+        # We need to test with camel_case T/F as e.g. base prop. lastUpdatedTime only overlaps when camel_case=True
+        # but user properties do not vary with camel_case
+        kw: dict = {"expand_properties": True, "convert_timestamps": False}
+        without_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=True, **kw)
+        without_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=True, camel_case=False, **kw)
+        with_prefix_camel_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=True, **kw)
+        with_prefix_snake_lst = nodes.to_pandas(remove_property_prefix=False, camel_case=False, **kw)
+
+        without_prefix_camel = node.to_pandas(remove_property_prefix=True, camel_case=True, **kw)
+        without_prefix_snake = node.to_pandas(remove_property_prefix=True, camel_case=False, **kw)
+        with_prefix_camel = node.to_pandas(remove_property_prefix=False, camel_case=True, **kw)
+        with_prefix_snake = node.to_pandas(remove_property_prefix=False, camel_case=False, **kw)
+
+        # Base property should never be affected:
+        prefix = "my-space.my-view/v8"
+        for df in without_prefix_camel_lst, without_prefix_snake_lst, with_prefix_camel_lst, with_prefix_snake_lst:
+            base_type = df.at[1, "type"]
+            assert isinstance(base_type, dict) and base_type["space"] == "asdf"
+            assert df.at[1, f"{prefix}.type"] == "numeric"
+
+        for df in without_prefix_camel, without_prefix_snake, with_prefix_camel, with_prefix_snake:
+            base_type = df.at["type", "value"]
+            assert isinstance(base_type, dict) and base_type["space"] == "asdf"
+            assert df.at[f"{prefix}.type", "value"] == "numeric"
+
+        # Check a base property that varies with camel_case:
+        assert without_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789
+        assert without_prefix_snake_lst.at[1, "last_updated_time"] == 123456789
+        assert with_prefix_camel_lst.at[1, "lastUpdatedTime"] == 123456789
+        assert with_prefix_snake_lst.at[1, "last_updated_time"] == 123456789
+
+        assert without_prefix_camel.at["lastUpdatedTime", "value"] == 123456789
+        assert without_prefix_snake.at["last_updated_time", "value"] == 123456789
+        assert with_prefix_camel.at["lastUpdatedTime", "value"] == 123456789
+        assert with_prefix_snake.at["last_updated_time", "value"] == 123456789
+
+        # Check the overlapping user property
+        assert without_prefix_camel_lst.at[1, "last_updated_time"] == "not a number (🐍)"
+        assert without_prefix_snake_lst.at[1, "lastUpdatedTime"] == "not a number (🐫)"
+        assert without_prefix_camel_lst.at[1, f"{prefix}.lastUpdatedTime"] == "not a number (🐫)"
+        assert without_prefix_snake_lst.at[1, f"{prefix}.last_updated_time"] == "not a number (🐍)"
+
+        assert without_prefix_camel.at["last_updated_time", "value"] == "not a number (🐍)"
+        assert without_prefix_snake.at["lastUpdatedTime", "value"] == "not a number (🐫)"
+        assert without_prefix_camel.at[f"{prefix}.lastUpdatedTime", "value"] == "not a number (🐫)"
+        assert without_prefix_snake.at[f"{prefix}.last_updated_time", "value"] == "not a number (🐍)"
+
+        for df in with_prefix_camel_lst, with_prefix_snake_lst:
+            for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")):
+                assert df.at[1, f"{prefix}.{prop_name}"] == f"not a number ({case})"
+                # Ensure the property doesn't exist without prefix - or that it's the base
+                # property when camel_case aligns with base prop.
+                assert prop_name not in df.columns or df.at[1, prop_name] == 123456789
+
+        for df in with_prefix_camel, with_prefix_snake:
+            for prop_name, case in zip(("last_updated_time", "lastUpdatedTime"), ("🐍", "🐫")):
+                assert df.at[f"{prefix}.{prop_name}", "value"] == f"not a number ({case})"
+                # Ensure the property doesn't exist without prefix - or that it's the base
+                # property when camel_case aligns with base prop.
+                assert prop_name not in df.index or df.at[prop_name, "value"] == 123456789
+
     def test_node_with_single_property_to_pandas_with_expand_props(self) -> None:
         # Bug prior to 7.62.6 made to_pandas(expand_properties=True) fail on nodes with a single property
         # due to how squeeze works in pandas, even a DataFrame will be forced into a scalar (to be fair,