From 4b139eec2d7b85c4eb46115fc18725bdf62c5602 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 20 Dec 2024 21:55:18 +0000 Subject: [PATCH 01/13] URL-encode partition field names in file locations --- pyiceberg/partitioning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 5f9178ebf9..90b60c8def 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -236,7 +236,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: value_str = quote(value_str, safe="") value_strs.append(value_str) - field_strs.append(partition_field.name) + field_strs.append(quote(partition_field.name, safe="")) path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) return path From 638a43fa1c73e019d5713d73c473435b419ed244 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 20 Dec 2024 22:04:14 +0000 Subject: [PATCH 02/13] Separate into variable --- pyiceberg/partitioning.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 90b60c8def..872b084f05 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -236,7 +236,9 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: value_str = quote(value_str, safe="") value_strs.append(value_str) - field_strs.append(quote(partition_field.name, safe="")) + + field_str = quote(partition_field.name, safe="") + field_strs.append(field_str) path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) return path From 31269521739562ab30cd6141baff7760b4df7845 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 20 Dec 2024 23:33:51 +0000 Subject: [PATCH 03/13] Add test --- tests/integration/test_partitioning_key.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 29f664909c..330f9f428d 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -203,10 +203,11 @@ # """ ), ( - [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], + [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp#field")], [datetime(2023, 1, 1, 12, 0, 1, 999)], - Record(timestamp_field=1672574401000999), - "timestamp_field=2023-01-01T12%3A00%3A01.000999", + Record(**{"timestamp#field": 1672574401000999}), # type: ignore + # Special characters in both partition field name and value must be URL-encoded: + "timestamp%23field=2023-01-01T12%3A00%3A01.000999", f"""CREATE TABLE {identifier} ( timestamp_field timestamp_ntz, string_field string From 65e2c3970fad4af8c17c9da4a069fbafeb91f194 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 20 Dec 2024 23:58:50 +0000 Subject: [PATCH 04/13] Revert to main --- tests/integration/test_partitioning_key.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 330f9f428d..29f664909c 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -203,11 +203,10 @@ # """ ), ( - [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp#field")], + [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], [datetime(2023, 1, 1, 12, 0, 1, 999)], - Record(**{"timestamp#field": 1672574401000999}), # type: ignore - # Special characters in both partition field name and value must be URL-encoded: - "timestamp%23field=2023-01-01T12%3A00%3A01.000999", + Record(timestamp_field=1672574401000999), + "timestamp_field=2023-01-01T12%3A00%3A01.000999", f"""CREATE TABLE {identifier} ( timestamp_field timestamp_ntz, string_field string From 64e7748e7c34bef2775d8126d7fe22581a744244 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sat, 21 Dec 2024 00:22:40 +0000 Subject: [PATCH 05/13] Failing test --- tests/integration/test_partitioning_key.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 29f664909c..5ec314502e 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -70,6 +70,7 @@ NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False), NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False), NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False), + NestedField(field_id=15, name="special#string#field", field_type=StringType(), required=False), ) @@ -722,6 +723,25 @@ (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data'); """, ), + # Test that special characters are URL-encoded + ( + [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")], + ["special string"], + Record(**{"special#string#field": "special string"}), # type: ignore + "special%23string%23field=special%20string", + f"""CREATE TABLE {identifier} ( + `special#string#field` string + ) + USING iceberg + PARTITIONED BY ( + identity(`special#string#field`) + ) + """, + f"""INSERT INTO {identifier} + VALUES + ('special string') + """, + ), ], ) @pytest.mark.integration From 18c7674be757894c26c45a7f3160ac25003f54a7 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sat, 21 Dec 2024 19:24:55 +0000 Subject: [PATCH 06/13] Disable justication from test --- tests/integration/test_partitioning_key.py | 29 +++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 5ec314502e..a483f668d2 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -729,18 +729,23 @@ ["special string"], Record(**{"special#string#field": "special string"}), # type: ignore "special%23string%23field=special%20string", - f"""CREATE TABLE {identifier} ( - `special#string#field` string - ) - USING iceberg - PARTITIONED BY ( - identity(`special#string#field`) - ) - """, - f"""INSERT INTO {identifier} - VALUES - ('special string') - """, + # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification + # (comparing expected value with Spark behavior) would fail: PyIceberg produces + # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string']. + None, + None, + # f"""CREATE TABLE {identifier} ( + # `special#string#field` string + # ) + # USING iceberg + # PARTITIONED BY ( + # identity(`special#string#field`) + # ) + # """, + # f"""INSERT INTO {identifier} + # VALUES + # ('special string') + # """, ), ], ) From 3756e4efadee75d366024a5c9bc3d3558c26916a Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sat, 21 Dec 2024 21:00:08 +0000 Subject: [PATCH 07/13] Use `quote_plus` instead of `quote` to match Java behaviour --- pyiceberg/partitioning.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 872b084f05..c9b6316f59 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -30,7 +30,7 @@ Tuple, TypeVar, ) -from urllib.parse import quote +from urllib.parse import quote_plus from pydantic import ( BeforeValidator, @@ -234,10 +234,10 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: partition_field = self.fields[pos] value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) - value_str = quote(value_str, safe="") + value_str = quote_plus(value_str, safe="") value_strs.append(value_str) - field_str = quote(partition_field.name, safe="") + field_str = quote_plus(partition_field.name, safe="") field_strs.append(field_str) path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) From f5a35de144c857b0105b6460012198a5b60ed3a3 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sat, 21 Dec 2024 21:20:39 +0000 Subject: [PATCH 08/13] Temporarily update test to pass --- tests/integration/test_partitioning_key.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index a483f668d2..29994b9b58 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -728,7 +728,7 @@ [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")], ["special string"], Record(**{"special#string#field": "special string"}), # type: ignore - "special%23string%23field=special%20string", + "special%23string%23field=special+string", # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification # (comparing expected value with Spark behavior) would fail: PyIceberg produces # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string']. From f1f5f4c7d8291ad4c5c8561a5c6cb41c49d97293 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sun, 22 Dec 2024 22:32:15 -0500 Subject: [PATCH 09/13] Uncomment test --- tests/integration/test_partitioning_key.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 29994b9b58..a43cf95c4b 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -732,20 +732,20 @@ # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification # (comparing expected value with Spark behavior) would fail: PyIceberg produces # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string']. - None, - None, - # f"""CREATE TABLE {identifier} ( - # `special#string#field` string - # ) - # USING iceberg - # PARTITIONED BY ( - # identity(`special#string#field`) - # ) - # """, - # f"""INSERT INTO {identifier} - # VALUES - # ('special string') - # """, + # None, + # None, + f"""CREATE TABLE {identifier} ( + `special#string#field` string + ) + USING iceberg + PARTITIONED BY ( + identity(`special#string#field`) + ) + """, + f"""INSERT INTO {identifier} + VALUES + ('special string') + """, ), ], ) From 8a106e64a3a2226545db3e9a179cc654eb476757 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 23 Dec 2024 08:52:59 -0500 Subject: [PATCH 10/13] Add unit test --- tests/table/test_partitioning.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index d7425bc351..4c28a3945d 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -16,7 +16,8 @@ # under the License. from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.transforms import BucketTransform, TruncateTransform +from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform +from pyiceberg.typedef import Record from pyiceberg.types import ( IntegerType, NestedField, @@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None: ) +def test_partition_spec_to_path() -> None: + schema = Schema( + NestedField(field_id=1, name="str", field_type=StringType(), required=False), + NestedField(field_id=2, name="other_str", field_type=StringType(), required=False), + NestedField(field_id=3, name="int", field_type=IntegerType(), required=True), + ) + + spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"), + PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"), + PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"), + spec_id=3, + ) + + record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10}) # type: ignore + + # Both partition names fields and values should be URL encoded, with spaces mapping to plus signs, to match the Java + # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204 + assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" + + def test_partition_type(table_schema_simple: Schema) -> None: spec = PartitionSpec( PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"), From 1bb379b88e9d5f88773deb0f1bb8f70a57127cb5 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 23 Dec 2024 13:51:59 -0500 Subject: [PATCH 11/13] Fix typo in comment --- tests/table/test_partitioning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 4c28a3945d..127d57a798 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -135,7 +135,7 @@ def test_partition_spec_to_path() -> None: record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10}) # type: ignore - # Both partition names fields and values should be URL encoded, with spaces mapping to plus signs, to match the Java + # Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204 assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" From 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 23 Dec 2024 18:25:20 -0500 Subject: [PATCH 12/13] Add `make_name_compatible` suggestion so test passes --- tests/integration/test_partitioning_key.py | 62 +++++++++++++++++----- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index a43cf95c4b..57a0b52439 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -18,7 +18,7 @@ import uuid from datetime import date, datetime, timedelta, timezone from decimal import Decimal -from typing import Any, List +from typing import Any, Callable, List, Optional import pytest from pyspark.sql import SparkSession @@ -78,7 +78,7 @@ @pytest.mark.parametrize( - "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", + "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name", [ # # Identity Transform ( @@ -99,6 +99,7 @@ VALUES (false, 'Boolean field set to false'); """, + None, ), ( [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")], @@ -118,6 +119,7 @@ VALUES ('sample_string', 'Another string value') """, + None, ), ( [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")], @@ -137,6 +139,7 @@ VALUES (42, 'Associated string value for int 42') """, + None, ), ( [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")], @@ -156,6 +159,7 @@ VALUES (1234567890123456789, 'Associated string value for long 1234567890123456789') """, + None, ), ( [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")], @@ -179,6 +183,7 @@ # VALUES # (3.14, 'Associated string value for float 3.14') # """ + None, ), ( [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")], @@ -202,6 +207,7 @@ # VALUES # (6.282, 'Associated string value for double 6.282') # """ + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -221,6 +227,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -240,6 +247,7 @@ VALUES (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -264,6 +272,7 @@ # VALUES # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') # """ + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")], @@ -288,6 +297,7 @@ # VALUES # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # """ + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")], @@ -307,6 +317,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01') """, + None, ), ( [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")], @@ -326,6 +337,7 @@ VALUES ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479') """, + None, ), ( [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")], @@ -345,6 +357,7 @@ VALUES (CAST('example' AS BINARY), 'Associated string value for binary `example`') """, + None, ), ( [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")], @@ -364,6 +377,7 @@ VALUES (123.45, 'Associated string value for decimal 123.45') """, + None, ), # # Year Month Day Hour Transform # Month Transform @@ -385,6 +399,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")], @@ -404,6 +419,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")], @@ -423,6 +439,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # Year Transform ( @@ -443,6 +460,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")], @@ -462,6 +480,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")], @@ -481,6 +500,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # # Day Transform ( @@ -501,6 +521,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")], @@ -520,6 +541,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")], @@ -539,6 +561,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # Hour Transform ( @@ -559,6 +582,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")], @@ -578,6 +602,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), # Truncate Transform ( @@ -598,6 +623,7 @@ VALUES (12345, 'Sample data for int'); """, + None, ), ( [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")], @@ -617,6 +643,7 @@ VALUES (4294967297, 'Sample data for long'); """, + None, ), ( [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")], @@ -636,6 +663,7 @@ VALUES ('abcdefg', 'Another sample for string'); """, + None, ), ( [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")], @@ -655,6 +683,7 @@ VALUES (678.90, 'Associated string value for decimal 678.90') """, + None, ), ( [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")], @@ -674,6 +703,7 @@ VALUES (binary('HELLOICEBERG'), 'Sample data for binary'); """, + None, ), # Bucket Transform ( @@ -694,6 +724,7 @@ VALUES (10, 'Integer with value 10'); """, + None, ), # Test multiple field combinations could generate the Partition record and hive partition path correctly ( @@ -722,30 +753,27 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data'); """, + None, ), # Test that special characters are URL-encoded ( - [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")], + [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")], ["special string"], - Record(**{"special#string#field": "special string"}), # type: ignore - "special%23string%23field=special+string", - # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification - # (comparing expected value with Spark behavior) would fail: PyIceberg produces - # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string']. - # None, - # None, + Record(**{"special#string+field": "special string"}), # type: ignore + "special%23string%2Bfield=special+string", f"""CREATE TABLE {identifier} ( - `special#string#field` string + `special#string+field` string ) USING iceberg PARTITIONED BY ( - identity(`special#string#field`) + identity(`special#string+field`) ) """, f"""INSERT INTO {identifier} VALUES ('special string') """, + lambda name: name.replace("#", "_x23").replace("+", "_x2B"), ), ], ) @@ -759,6 +787,7 @@ def test_partition_key( expected_hive_partition_path_slice: str, spark_create_table_sql_for_justification: str, spark_data_insert_sql_for_justification: str, + make_compatible_name: Optional[Callable[[str], str]], ) -> None: partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)] spec = PartitionSpec(*partition_fields) @@ -793,5 +822,12 @@ def test_partition_key( spark_path_for_justification = ( snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path ) - assert spark_partition_for_justification == expected_partition_record + # Special characters in partition value are sanitized when written to the data file's partition field + # Use `make_compatible_name` to match the sanitize behavior + sanitized_record = ( + Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()}) + if make_compatible_name + else expected_partition_record + ) + assert spark_partition_for_justification == sanitized_record assert expected_hive_partition_path_slice in spark_path_for_justification From f32b3aa89fa029d968f68169b7e097cb29ea4c4d Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Tue, 24 Dec 2024 08:04:59 -0500 Subject: [PATCH 13/13] Fix typo in schema field name --- tests/integration/test_partitioning_key.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 57a0b52439..1ac808c7d0 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -70,7 +70,7 @@ NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False), NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False), NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False), - NestedField(field_id=15, name="special#string#field", field_type=StringType(), required=False), + NestedField(field_id=15, name="special#string+field", field_type=StringType(), required=False), )