From 8fe999224e5d6b5403c8cd2fc81c0066314e90f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Wed, 15 Jan 2025 17:06:36 -0300 Subject: [PATCH] Rewrite tests --- dev/provision.py | 24 ------------ tests/conftest.py | 4 +- .../integration/test_statistics_operations.py | 37 +++++++++++++++---- 3 files changed, 31 insertions(+), 34 deletions(-) diff --git a/dev/provision.py b/dev/provision.py index a4dd213e7d..b358da6593 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -401,27 +401,3 @@ ) spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id") spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'") - - spark.sql( - f""" - CREATE OR REPLACE TABLE {catalog_name}.default.test_table_statistics_operations ( - number integer - ) - USING iceberg - TBLPROPERTIES ( - 'format-version'='2' - ); - """ - ) - spark.sql( - f""" - INSERT INTO {catalog_name}.default.test_table_statistics_operations - VALUES (1) - """ - ) - spark.sql( - f""" - INSERT INTO {catalog_name}.default.test_table_statistics_operations - VALUES (2) - """ - ) diff --git a/tests/conftest.py b/tests/conftest.py index 453c62cbe7..c8dde01563 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1010,7 +1010,7 @@ def generate_snapshot( "file-footer-size-in-bytes": 42, "blob-metadata": [ { - "type": "ndv", + "type": "apache-datasketches-theta-v1", "snapshot-id": 3051729675574597004, "sequence-number": 1, "fields": [1], @@ -1024,7 +1024,7 @@ def generate_snapshot( "file-footer-size-in-bytes": 42, "blob-metadata": [ { - "type": "ndv", + "type": "deletion-vector-v1", "snapshot-id": 3055729675574597004, "sequence-number": 1, "fields": [1], diff --git a/tests/integration/test_statistics_operations.py b/tests/integration/test_statistics_operations.py index de9a8e691f..361bfebb63 100644 --- a/tests/integration/test_statistics_operations.py +++ b/tests/integration/test_statistics_operations.py @@ -14,24 +14,45 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from typing import TYPE_CHECKING + import pytest -from pyiceberg.catalog import Catalog +from pyiceberg.exceptions import NoSuchTableError from pyiceberg.table.statistics import BlobMetadata, StatisticsFile +if TYPE_CHECKING: + import pyarrow as pa + + from pyiceberg.catalog import Catalog + from pyiceberg.schema import Schema + from pyiceberg.table import Table + + +def _create_table_with_schema(catalog: "Catalog", schema: "Schema") -> "Table": + tbl_name = "default.test_table_statistics_operations" + + try: + catalog.drop_table(tbl_name) + except NoSuchTableError: + pass + return catalog.create_table(identifier=tbl_name, schema=schema) + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) -def test_manage_statistics(catalog: Catalog) -> None: - identifier = "default.test_table_statistics_operations" - tbl = catalog.load_table(identifier) +def test_manage_statistics(catalog: "Catalog", arrow_table_with_null: "pa.Table") -> None: + tbl = _create_table_with_schema(catalog, arrow_table_with_null.schema) + + tbl.append(arrow_table_with_null) + tbl.append(arrow_table_with_null) add_snapshot_id_1 = tbl.history()[0].snapshot_id add_snapshot_id_2 = tbl.history()[1].snapshot_id - def create_statistics_file(snapshot_id: int) -> StatisticsFile: + def create_statistics_file(snapshot_id: int, type_name: str) -> StatisticsFile: blob_metadata = BlobMetadata( - type="boring-type", + type=type_name, snapshot_id=snapshot_id, sequence_number=2, fields=[1], @@ -48,8 +69,8 @@ def create_statistics_file(snapshot_id: int) -> StatisticsFile: return statistics_file - statistics_file_snap_1 = create_statistics_file(add_snapshot_id_1) - statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2) + statistics_file_snap_1 = create_statistics_file(add_snapshot_id_1, "apache-datasketches-theta-v1") + statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2, "deletion-vector-v1") with tbl.update_statistics() as update: update.set_statistics(add_snapshot_id_1, statistics_file_snap_1)