-
Notifications
You must be signed in to change notification settings - Fork 194
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
URL-encode partition field names in file locations #1457
Changes from 10 commits
4b139ee
638a43f
3126952
65e2c39
64e7748
18c7674
3756e4e
f5a35de
f1f5f4c
8a106e6
1bb379b
61cdd08
f32b3aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -30,7 +30,7 @@ | |||||||
Tuple, | ||||||||
TypeVar, | ||||||||
) | ||||||||
from urllib.parse import quote | ||||||||
from urllib.parse import quote_plus | ||||||||
|
||||||||
from pydantic import ( | ||||||||
BeforeValidator, | ||||||||
|
@@ -234,9 +234,11 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: | |||||||
partition_field = self.fields[pos] | ||||||||
value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) | ||||||||
|
||||||||
value_str = quote(value_str, safe="") | ||||||||
value_str = quote_plus(value_str, safe="") | ||||||||
value_strs.append(value_str) | ||||||||
field_strs.append(partition_field.name) | ||||||||
|
||||||||
field_str = quote_plus(partition_field.name, safe="") | ||||||||
field_strs.append(field_str) | ||||||||
Comment on lines
+240
to
+241
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: I would just collapse these:
Suggested change
|
||||||||
|
||||||||
path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) | ||||||||
return path | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,8 @@ | |
# under the License. | ||
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec | ||
from pyiceberg.schema import Schema | ||
from pyiceberg.transforms import BucketTransform, TruncateTransform | ||
from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform | ||
from pyiceberg.typedef import Record | ||
from pyiceberg.types import ( | ||
IntegerType, | ||
NestedField, | ||
|
@@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None: | |
) | ||
|
||
|
||
def test_partition_spec_to_path() -> None: | ||
schema = Schema( | ||
NestedField(field_id=1, name="str", field_type=StringType(), required=False), | ||
NestedField(field_id=2, name="other_str", field_type=StringType(), required=False), | ||
NestedField(field_id=3, name="int", field_type=IntegerType(), required=True), | ||
) | ||
|
||
spec = PartitionSpec( | ||
PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"), | ||
PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"), | ||
PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"), | ||
spec_id=3, | ||
) | ||
|
||
record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10}) # type: ignore | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. mypy complains here and elsewhere, but I think it's fine. |
||
|
||
# Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java | ||
# behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204 | ||
assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cross-checked with the Java implementation (integration tests will do this eventually), in particular with respect to |
||
|
||
|
||
def test_partition_type(table_schema_simple: Schema) -> None: | ||
spec = PartitionSpec( | ||
PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It defaults to `utf-8`, so that's good 👍