From 4b139eec2d7b85c4eb46115fc18725bdf62c5602 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Fri, 20 Dec 2024 21:55:18 +0000
Subject: [PATCH 01/13] URL-encode partition field names in file locations

---
 pyiceberg/partitioning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index 5f9178ebf9..90b60c8def 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -236,7 +236,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
 
             value_str = quote(value_str, safe="")
             value_strs.append(value_str)
-            field_strs.append(partition_field.name)
+            field_strs.append(quote(partition_field.name, safe=""))
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path

From 638a43fa1c73e019d5713d73c473435b419ed244 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Fri, 20 Dec 2024 22:04:14 +0000
Subject: [PATCH 02/13] Separate into variable

---
 pyiceberg/partitioning.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index 90b60c8def..872b084f05 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -236,7 +236,9 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
 
             value_str = quote(value_str, safe="")
             value_strs.append(value_str)
-            field_strs.append(quote(partition_field.name, safe=""))
+
+            field_str = quote(partition_field.name, safe="")
+            field_strs.append(field_str)
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path

From 31269521739562ab30cd6141baff7760b4df7845 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Fri, 20 Dec 2024 23:33:51 +0000
Subject: [PATCH 03/13] Add test

---
 tests/integration/test_partitioning_key.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 29f664909c..330f9f428d 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -203,10 +203,11 @@
             # """
         ),
         (
-            [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
+            [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp#field")],
             [datetime(2023, 1, 1, 12, 0, 1, 999)],
-            Record(timestamp_field=1672574401000999),
-            "timestamp_field=2023-01-01T12%3A00%3A01.000999",
+            Record(**{"timestamp#field": 1672574401000999}),  # type: ignore
+            # Special characters in both partition field name and value must be URL-encoded:
+            "timestamp%23field=2023-01-01T12%3A00%3A01.000999",
             f"""CREATE TABLE {identifier} (
                 timestamp_field timestamp_ntz,
                 string_field string

From 65e2c3970fad4af8c17c9da4a069fbafeb91f194 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Fri, 20 Dec 2024 23:58:50 +0000
Subject: [PATCH 04/13] Revert to main

---
 tests/integration/test_partitioning_key.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 330f9f428d..29f664909c 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -203,11 +203,10 @@
             # """
         ),
         (
-            [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp#field")],
+            [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
             [datetime(2023, 1, 1, 12, 0, 1, 999)],
-            Record(**{"timestamp#field": 1672574401000999}),  # type: ignore
-            # Special characters in both partition field name and value must be URL-encoded:
-            "timestamp%23field=2023-01-01T12%3A00%3A01.000999",
+            Record(timestamp_field=1672574401000999),
+            "timestamp_field=2023-01-01T12%3A00%3A01.000999",
             f"""CREATE TABLE {identifier} (
                 timestamp_field timestamp_ntz,
                 string_field string

From 64e7748e7c34bef2775d8126d7fe22581a744244 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Sat, 21 Dec 2024 00:22:40 +0000
Subject: [PATCH 05/13] Failing test

---
 tests/integration/test_partitioning_key.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 29f664909c..5ec314502e 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -70,6 +70,7 @@
     NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False),
     NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False),
     NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False),
+    NestedField(field_id=15, name="special#string#field", field_type=StringType(), required=False),
 )
 
 
@@ -722,6 +723,25 @@
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
         ),
+        # Test that special characters are URL-encoded
+        (
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            ["special string"],
+            Record(**{"special#string#field": "special string"}),  # type: ignore
+            "special%23string%23field=special%20string",
+            f"""CREATE TABLE {identifier} (
+                `special#string#field` string
+            )
+            USING iceberg
+            PARTITIONED BY (
+                identity(`special#string#field`)
+            )
+            """,
+            f"""INSERT INTO {identifier}
+            VALUES
+            ('special string')
+            """,
+        ),
     ],
 )
 @pytest.mark.integration

From 18c7674be757894c26c45a7f3160ac25003f54a7 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Sat, 21 Dec 2024 19:24:55 +0000
Subject: [PATCH 06/13] Disable justication from test

---
 tests/integration/test_partitioning_key.py | 29 +++++++++++++---------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 5ec314502e..a483f668d2 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -729,18 +729,23 @@
             ["special string"],
             Record(**{"special#string#field": "special string"}),  # type: ignore
             "special%23string%23field=special%20string",
-            f"""CREATE TABLE {identifier} (
-                `special#string#field` string
-            )
-            USING iceberg
-            PARTITIONED BY (
-                identity(`special#string#field`)
-            )
-            """,
-            f"""INSERT INTO {identifier}
-            VALUES
-            ('special string')
-            """,
+            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
+            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
+            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
+            None,
+            None,
+            # f"""CREATE TABLE {identifier} (
+            #     `special#string#field` string
+            # )
+            # USING iceberg
+            # PARTITIONED BY (
+            #     identity(`special#string#field`)
+            # )
+            # """,
+            # f"""INSERT INTO {identifier}
+            # VALUES
+            # ('special string')
+            # """,
         ),
     ],
 )

From 3756e4efadee75d366024a5c9bc3d3558c26916a Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Sat, 21 Dec 2024 21:00:08 +0000
Subject: [PATCH 07/13] Use `quote_plus` instead of `quote` to match Java
 behaviour

---
 pyiceberg/partitioning.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index 872b084f05..c9b6316f59 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -30,7 +30,7 @@
     Tuple,
     TypeVar,
 )
-from urllib.parse import quote
+from urllib.parse import quote_plus
 
 from pydantic import (
     BeforeValidator,
@@ -234,10 +234,10 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])
 
-            value_str = quote(value_str, safe="")
+            value_str = quote_plus(value_str, safe="")
             value_strs.append(value_str)
 
-            field_str = quote(partition_field.name, safe="")
+            field_str = quote_plus(partition_field.name, safe="")
             field_strs.append(field_str)
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])

From f5a35de144c857b0105b6460012198a5b60ed3a3 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Sat, 21 Dec 2024 21:20:39 +0000
Subject: [PATCH 08/13] Temporarily update test to pass

---
 tests/integration/test_partitioning_key.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index a483f668d2..29994b9b58 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -728,7 +728,7 @@
             [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
             ["special string"],
             Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special%20string",
+            "special%23string%23field=special+string",
             # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
             # (comparing expected value with Spark behavior) would fail: PyIceberg produces
             # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].

From f1f5f4c7d8291ad4c5c8561a5c6cb41c49d97293 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Sun, 22 Dec 2024 22:32:15 -0500
Subject: [PATCH 09/13] Uncomment test

---
 tests/integration/test_partitioning_key.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 29994b9b58..a43cf95c4b 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -732,20 +732,20 @@
             # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
             # (comparing expected value with Spark behavior) would fail: PyIceberg produces
             # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            None,
-            None,
-            # f"""CREATE TABLE {identifier} (
-            #     `special#string#field` string
-            # )
-            # USING iceberg
-            # PARTITIONED BY (
-            #     identity(`special#string#field`)
-            # )
-            # """,
-            # f"""INSERT INTO {identifier}
-            # VALUES
-            # ('special string')
-            # """,
+            # None,
+            # None,
+            f"""CREATE TABLE {identifier} (
+                `special#string#field` string
+            )
+            USING iceberg
+            PARTITIONED BY (
+                identity(`special#string#field`)
+            )
+            """,
+            f"""INSERT INTO {identifier}
+            VALUES
+            ('special string')
+            """,
         ),
     ],
 )

From 8a106e64a3a2226545db3e9a179cc654eb476757 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Mon, 23 Dec 2024 08:52:59 -0500
Subject: [PATCH 10/13] Add unit test

---
 tests/table/test_partitioning.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
index d7425bc351..4c28a3945d 100644
--- a/tests/table/test_partitioning.py
+++ b/tests/table/test_partitioning.py
@@ -16,7 +16,8 @@
 # under the License.
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
-from pyiceberg.transforms import BucketTransform, TruncateTransform
+from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform
+from pyiceberg.typedef import Record
 from pyiceberg.types import (
     IntegerType,
     NestedField,
@@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None:
     )
 
 
+def test_partition_spec_to_path() -> None:
+    schema = Schema(
+        NestedField(field_id=1, name="str", field_type=StringType(), required=False),
+        NestedField(field_id=2, name="other_str", field_type=StringType(), required=False),
+        NestedField(field_id=3, name="int", field_type=IntegerType(), required=True),
+    )
+
+    spec = PartitionSpec(
+        PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"),
+        PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"),
+        PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"),
+        spec_id=3,
+    )
+
+    record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10})  # type: ignore
+
+    # Both partition names fields and values should be URL encoded, with spaces mapping to plus signs, to match the Java
+    # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204
+    assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10"
+
+
 def test_partition_type(table_schema_simple: Schema) -> None:
     spec = PartitionSpec(
         PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"),

From 1bb379b88e9d5f88773deb0f1bb8f70a57127cb5 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Mon, 23 Dec 2024 13:51:59 -0500
Subject: [PATCH 11/13] Fix typo in comment

---
 tests/table/test_partitioning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
index 4c28a3945d..127d57a798 100644
--- a/tests/table/test_partitioning.py
+++ b/tests/table/test_partitioning.py
@@ -135,7 +135,7 @@ def test_partition_spec_to_path() -> None:
 
     record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10})  # type: ignore
 
-    # Both partition names fields and values should be URL encoded, with spaces mapping to plus signs, to match the Java
+    # Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java
     # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204
     assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10"
 

From 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Mon, 23 Dec 2024 18:25:20 -0500
Subject: [PATCH 12/13] Add `make_name_compatible` suggestion so test passes

---
 tests/integration/test_partitioning_key.py | 62 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index a43cf95c4b..57a0b52439 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, List
+from typing import Any, Callable, List, Optional
 
 import pytest
 from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
     [
         # # Identity Transform
         (
@@ -99,6 +99,7 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -118,6 +119,7 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -137,6 +139,7 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -156,6 +159,7 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -179,6 +183,7 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -202,6 +207,7 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -221,6 +227,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -240,6 +247,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -264,6 +272,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -288,6 +297,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -307,6 +317,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -326,6 +337,7 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -345,6 +357,7 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -364,6 +377,7 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
+            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -385,6 +399,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -404,6 +419,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -423,6 +439,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Year Transform
         (
@@ -443,6 +460,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -462,6 +480,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -481,6 +500,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # # Day Transform
         (
@@ -501,6 +521,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -520,6 +541,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -539,6 +561,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Hour Transform
         (
@@ -559,6 +582,7 @@
                 VALUES
                 (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
                 """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -578,6 +602,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         # Truncate Transform
         (
@@ -598,6 +623,7 @@
                 VALUES
                 (12345, 'Sample data for int');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -617,6 +643,7 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -636,6 +663,7 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -655,6 +683,7 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -674,6 +703,7 @@
                 VALUES
                 (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
+            None,
         ),
         # Bucket Transform
         (
@@ -694,6 +724,7 @@
             VALUES
             (10, 'Integer with value 10');
             """,
+            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -722,30 +753,27 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
+            None,
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
             ["special string"],
-            Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special+string",
-            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
-            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
-            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            # None,
-            # None,
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
             f"""CREATE TABLE {identifier} (
-                `special#string#field` string
+                `special#string+field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string#field`)
+                identity(`special#string+field`)
             )
             """,
             f"""INSERT INTO {identifier}
             VALUES
             ('special string')
             """,
+            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -759,6 +787,7 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
+    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -793,5 +822,12 @@ def test_partition_key(
         spark_path_for_justification = (
             snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
         )
-        assert spark_partition_for_justification == expected_partition_record
+        # Special characters in partition value are sanitized when written to the data file's partition field
+        # Use `make_compatible_name` to match the sanitize behavior
+        sanitized_record = (
+            Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+            if make_compatible_name
+            else expected_partition_record
+        )
+        assert spark_partition_for_justification == sanitized_record
         assert expected_hive_partition_path_slice in spark_path_for_justification

From f32b3aa89fa029d968f68169b7e097cb29ea4c4d Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar <smaheshwar@palantir.com>
Date: Tue, 24 Dec 2024 08:04:59 -0500
Subject: [PATCH 13/13] Fix typo in schema field name

---
 tests/integration/test_partitioning_key.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 57a0b52439..1ac808c7d0 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -70,7 +70,7 @@
     NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False),
     NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False),
     NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False),
-    NestedField(field_id=15, name="special#string#field", field_type=StringType(), required=False),
+    NestedField(field_id=15, name="special#string+field", field_type=StringType(), required=False),
 )