From 03f6169841ccd92714dd904e76e8531887dbe3c3 Mon Sep 17 00:00:00 2001 From: Topher Cawlfield <4094385+tcawlfield@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:19:14 -0600 Subject: [PATCH] feat: 3154 from_parquet should be able to read partial columns (#3156) * feat: 3154 - Adding unit test and fix * Taking this opportunity to fix an issue with 2772 test This had been bugging me, and with a better understanding now I can improve the unit tests. Mostly for future readability and correctness. * Removing commented-out test return value --------- Co-authored-by: Ianna Osborne --- src/awkward/_connect/pyarrow/table_conv.py | 33 +++++++++++++-- .../test_2772_parquet_extn_array_metadata.py | 17 ++++++++ tests/test_3154_parquet_subcolumn_select.py | 40 +++++++++++++++++++ 3 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 tests/test_3154_parquet_subcolumn_select.py diff --git a/src/awkward/_connect/pyarrow/table_conv.py b/src/awkward/_connect/pyarrow/table_conv.py index c8c1e50152..53f286c730 100644 --- a/src/awkward/_connect/pyarrow/table_conv.py +++ b/src/awkward/_connect/pyarrow/table_conv.py @@ -1,3 +1,5 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE + from __future__ import annotations import json @@ -128,11 +130,34 @@ def native_arrow_field_to_akarraytype( fields = _fields_of_strg_type(storage_type) if len(fields) > 0: # We need to replace storage_type with one that contains AwkwardArrowTypes. - awkwardized_fields = [ - native_arrow_field_to_akarraytype(field, meta) # Recurse - for field, meta in zip(fields, metadata["subfield_metadata"]) - ] + sub_meta = metadata["subfield_metadata"] + awkwardized_fields = None # Temporary + if len(sub_meta) == len(fields): + awkwardized_fields = [ + native_arrow_field_to_akarraytype(field, meta) # Recurse + for field, meta in zip(fields, metadata["subfield_metadata"]) + ] + elif len(fields) < len(sub_meta): + # If a user has read a partial column, we can have fewer Arrow fields than the original. + sub_meta_dict = {sm["field_name"]: sm for sm in sub_meta} + awkwardized_fields = [] + for field in fields: + if field.name in sub_meta_dict: + awkwardized_fields.append( + native_arrow_field_to_akarraytype( + field, sub_meta_dict[field.name] + ) + ) + else: + raise ValueError( + f"Cannot find Awkward metadata for sub-field {field.name}" + ) + else: + raise ValueError( + f"Not enough fields in Awkward metadata. Have {len(sub_meta)} need at least {len(fields)}." + ) storage_type = _make_pyarrow_type_like(storage_type, awkwardized_fields) + ak_type = AwkwardArrowType._from_metadata_object(storage_type, metadata) return pyarrow.field(ntv_field.name, type=ak_type, nullable=ntv_field.nullable) diff --git a/tests/test_2772_parquet_extn_array_metadata.py b/tests/test_2772_parquet_extn_array_metadata.py index b45e1111d3..fd4c9fede6 100644 --- a/tests/test_2772_parquet_extn_array_metadata.py +++ b/tests/test_2772_parquet_extn_array_metadata.py @@ -3,6 +3,7 @@ from __future__ import annotations +import io import os import numpy as np @@ -137,6 +138,22 @@ def test_array_conversions(akarray, as_dict): rt_array = ak.from_arrow(as_extn, highlevel=True) assert to_list(rt_array) == to_list(akarray) + # Deeper test of types + akarray_high = ak.Array(akarray) + if akarray_high.type.content.parameters.get("__categorical__", False) == as_dict: + # as_dict is supposed to go hand-in-hand with __categorical__: True, and if it + # does not, we do not round-trip perfectly. So only test when this is set correctly. + assert rt_array.type == akarray_high.type + + ak_type_str_orig = io.StringIO() + ak_type_str_rtrp = io.StringIO() + akarray_high.type.show(stream=ak_type_str_orig) + rt_array.type.show(stream=ak_type_str_rtrp) + if ak_type_str_orig.getvalue() != ak_type_str_rtrp.getvalue(): + print(" Original type:", ak_type_str_orig.getvalue()) + print(" Rnd-trip type:", ak_type_str_rtrp.getvalue()) + assert ak_type_str_orig.getvalue() == ak_type_str_rtrp.getvalue() + def test_table_conversion(): ak_tbl_like = ak.Array( diff --git a/tests/test_3154_parquet_subcolumn_select.py b/tests/test_3154_parquet_subcolumn_select.py new file mode 100644 index 0000000000..46f0429dd2 --- /dev/null +++ b/tests/test_3154_parquet_subcolumn_select.py @@ -0,0 +1,40 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE +# ruff: noqa: E402 + +from __future__ import annotations + +import os + +import pytest + +import awkward as ak + +pa = pytest.importorskip("pyarrow") +pq = pytest.importorskip("pyarrow.parquet") + + +def test_parquet_subcolumn_select(tmp_path): + ak_tbl = ak.Array( + { + "a": [ + {"lbl": "item 1", "idx": 11, "ids": [1, 2, 3]}, + {"lbl": "item 2", "idx": 12, "ids": [51, 52]}, + {"lbl": "item 3", "idx": 13, "ids": [61, 62, 63, 64]}, + ], + "b": [ + [[111, 112], [121, 122]], + [[211, 212], [221, 222]], + [[311, 312], [321, 322]], + ], + } + ) + parquet_file = os.path.join(tmp_path, "test_3514.parquet") + ak.to_parquet(ak_tbl, parquet_file) + + selection = ak.from_parquet(parquet_file, columns=["a.ids", "b"]) + assert selection["a"].to_list() == [ + {"ids": [1, 2, 3]}, + {"ids": [51, 52]}, + {"ids": [61, 62, 63, 64]}, + ] + assert selection["b"].to_list() == ak_tbl["b"].to_list()