apache · kevinjqliu · Jan 2, 2025 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
@@ -30,7 +30,7 @@
     Tuple,
     TypeVar,
 )
-from urllib.parse import quote
+from urllib.parse import quote_plus
 
 from pydantic import (
     BeforeValidator,
@@ -234,9 +234,11 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])
 
-            value_str = quote(value_str, safe="")
+            value_str = quote_plus(value_str, safe="")
             value_strs.append(value_str)
-            field_strs.append(partition_field.name)
+
+            field_str = quote_plus(partition_field.name, safe="")
+            field_strs.append(field_str)
-            field_str = quote_plus(partition_field.name, safe="")
-            field_strs.append(field_str)
+            field_strs.append(quote_plus(partition_field.name, safe=""))
-            field_str = quote_plus(partition_field.name, safe="")
-            field_strs.append(field_str)
+            field_strs.append(quote_plus(partition_field.name, safe=""))
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
@@ -70,6 +70,7 @@
     NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False),
     NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False),
     NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False),
+    NestedField(field_id=15, name="special#string#field", field_type=StringType(), required=False),
 )
 
 
@@ -722,6 +723,30 @@
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
         ),
+        # Test that special characters are URL-encoded
+        (
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            ["special string"],
+            Record(**{"special#string#field": "special string"}),  # type: ignore
+            "special%23string%23field=special+string",
+            # Spark currently sanitizes special characters in column names differently from PyIceberg, so the
+            # justification (comparing the expected value with Spark behavior) would fail: PyIceberg produces
+            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
+            # None,
+            # None,
+            f"""CREATE TABLE {identifier} (
+                `special#string#field` string
+            )
+            USING iceberg
+            PARTITIONED BY (
+                identity(`special#string#field`)
+            )
+            """,
+            f"""INSERT INTO {identifier}
+            VALUES
+            ('special string')
+            """,
+        ),
     ],
 )
 @pytest.mark.integration

diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
@@ -16,7 +16,8 @@
 # under the License.
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
-from pyiceberg.transforms import BucketTransform, TruncateTransform
+from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform
+from pyiceberg.typedef import Record
 from pyiceberg.types import (
     IntegerType,
     NestedField,
@@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None:
     )
 
 
+def test_partition_spec_to_path() -> None:
+    schema = Schema(
+        NestedField(field_id=1, name="str", field_type=StringType(), required=False),
+        NestedField(field_id=2, name="other_str", field_type=StringType(), required=False),
+        NestedField(field_id=3, name="int", field_type=IntegerType(), required=True),
+    )
+
+    spec = PartitionSpec(
+        PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"),
+        PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"),
+        PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"),
+        spec_id=3,
+    )
+
+    record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10})  # type: ignore
+
+    # Partition field names and values should both be URL-encoded, with spaces mapped to plus signs, to match the Java
+    # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204
+    assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10"
+
+
 def test_partition_type(table_schema_simple: Schema) -> None:
     spec = PartitionSpec(
         PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"),