From 733ccdfc4dec69cef6efd4e9f10aea1d43d08104 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:44:36 +0100 Subject: [PATCH] feat: convert to give time zone in .str.to_datetime when values are offset-aware --- .../src/chunked_array/temporal/datetime.rs | 20 +++++++-- .../src/chunked_array/temporal/mod.rs | 2 +- .../src/dsl/function_expr/datetime.rs | 3 ++ .../src/dsl/function_expr/strings.rs | 30 ++++++-------- .../src/chunkedarray/string/infer.rs | 21 +++------- .../src/chunkedarray/string/mod.rs | 7 +++- py-polars/polars/expr/string.py | 12 +++++- py-polars/polars/series/string.py | 12 +++++- py-polars/tests/unit/dataframe/test_df.py | 12 +++--- .../tests/unit/datatypes/test_temporal.py | 2 +- .../namespaces/temporal/test_to_datetime.py | 41 +++++++++++++++++++ .../operations/namespaces/test_strptime.py | 11 ++--- 12 files changed, 121 insertions(+), 52 deletions(-) diff --git a/crates/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs index bd3e6fae1c47..adacd1c790a1 100644 --- a/crates/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -206,17 +206,31 @@ impl DatetimeChunked { } /// Change the underlying [`TimeUnit`]. This does not modify the data. - pub fn set_time_unit(&mut self, tu: TimeUnit) { - self.2 = Some(Datetime(tu, self.time_zone().clone())) + pub fn set_time_unit(&mut self, time_unit: TimeUnit) { + self.2 = Some(Datetime(time_unit, self.time_zone().clone())) } /// Change the underlying [`TimeZone`]. This does not modify the data. + /// This does not validate the time zone - it's up to the caller to verify that it's + /// already been validated. #[cfg(feature = "timezones")] pub fn set_time_zone(&mut self, time_zone: TimeZone) -> PolarsResult<()> { - validate_time_zone(&time_zone)?; self.2 = Some(Datetime(self.time_unit(), Some(time_zone))); Ok(()) } + + /// Change the underlying [`TimeUnit`] and [`TimeZone`]. This does not modify the data. + /// This does not validate the time zone - it's up to the caller to verify that it's + /// already been validated. + #[cfg(feature = "timezones")] + pub fn set_time_unit_and_time_zone( + &mut self, + time_unit: TimeUnit, + time_zone: TimeZone, + ) -> PolarsResult<()> { + self.2 = Some(Datetime(time_unit, Some(time_zone))); + Ok(()) + } } #[cfg(test)] diff --git a/crates/polars-core/src/chunked_array/temporal/mod.rs b/crates/polars-core/src/chunked_array/temporal/mod.rs index de58e421dc0f..d9f50fe9ad96 100644 --- a/crates/polars-core/src/chunked_array/temporal/mod.rs +++ b/crates/polars-core/src/chunked_array/temporal/mod.rs @@ -39,7 +39,7 @@ static FIXED_OFFSET_PATTERN: &str = r#"(?x) static FIXED_OFFSET_RE: Lazy = Lazy::new(|| Regex::new(FIXED_OFFSET_PATTERN).unwrap()); #[cfg(feature = "timezones")] -pub(crate) fn validate_time_zone(tz: &str) -> PolarsResult<()> { +pub fn validate_time_zone(tz: &str) -> PolarsResult<()> { match tz.parse::() { Ok(_) => Ok(()), Err(_) => { diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index 1d473012c411..f6bda2902b9d 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -2,6 +2,8 @@ use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS, SECON #[cfg(feature = "timezones")] use chrono_tz::Tz; #[cfg(feature = "timezones")] +use polars_core::chunked_array::temporal::validate_time_zone; +#[cfg(feature = "timezones")] use polars_time::base_utc_offset as base_utc_offset_fn; #[cfg(feature = "timezones")] use polars_time::dst_offset as dst_offset_fn; @@ -343,6 +345,7 @@ pub(super) fn convert_time_zone(s: &Series, time_zone: &TimeZone) -> PolarsResul match s.dtype() { DataType::Datetime(_, _) => { let mut ca = s.datetime()?.clone(); + validate_time_zone(time_zone)?; ca.set_time_zone(time_zone.clone())?; Ok(ca.into_series()) }, diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 2ca0b93771b5..56be47b16213 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -3,22 +3,22 @@ use std::borrow::Cow; use arrow::legacy::utils::CustomIterTools; #[cfg(feature = "timezones")] use once_cell::sync::Lazy; -#[cfg(feature = "regex")] -use regex::{escape, Regex}; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; - #[cfg(feature = "timezones")] -static TZ_AWARE_RE: Lazy = - Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap()); - +use polars_core::chunked_array::temporal::validate_time_zone; use polars_core::utils::handle_casting_failures; #[cfg(feature = "dtype-struct")] use polars_utils::format_smartstring; +use regex::{escape, Regex}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use super::*; use crate::{map, map_as_slice}; +#[cfg(feature = "timezones")] +static TZ_AWARE_RE: Lazy = + Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap()); + #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum StringFunction { @@ -652,16 +652,10 @@ fn to_datetime( Some(format) => TZ_AWARE_RE.is_match(format), _ => false, }; - if let (Some(tz), true) = (time_zone, tz_aware) { - if tz != "UTC" { - polars_bail!( - ComputeError: - "if using strftime/to_datetime with a time-zone-aware format, the output will be in UTC. Please either drop the time zone from the function call, or set it to UTC. \ - If you are trying to convert the output to a different time zone, please use `convert_time_zone`." - ) - } - }; - + #[cfg(feature = "timezones")] + if let Some(time_zone) = time_zone { + validate_time_zone(time_zone)?; + } let out = if options.exact { datetime_strings .as_datetime( diff --git a/crates/polars-time/src/chunkedarray/string/infer.rs b/crates/polars-time/src/chunkedarray/string/infer.rs index dbd10651fd29..5f0f3d7daf96 100644 --- a/crates/polars-time/src/chunkedarray/string/infer.rs +++ b/crates/polars-time/src/chunkedarray/string/infer.rs @@ -452,25 +452,16 @@ pub(crate) fn to_datetime( .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single)) .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?; let mut infer = DatetimeInfer::::try_from_with_unit(pattern, Some(tu))?; - if pattern == Pattern::DatetimeYMDZ - && tz.is_some() - && tz.map(|x| x.as_str()) != Some("UTC") - { - polars_bail!(ComputeError: "offset-aware datetimes are converted to UTC. \ - Please either drop the time zone from the function call, or set it to UTC. \ - To convert to a different time zone, please use `convert_time_zone`.") - } match pattern { #[cfg(feature = "timezones")] Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| { let mut ca = ca.clone(); - ca.set_time_unit(tu); - polars_ops::prelude::replace_time_zone( - &ca, - Some("UTC"), - _ambiguous, - NonExistent::Raise, - ) + // `tz` has already been validated. + ca.set_time_unit_and_time_zone( + tu, + tz.cloned().unwrap_or_else(|| "UTC".to_string()), + )?; + Ok(ca) })?, _ => infer.coerce_string(ca).datetime().map(|ca| { let mut ca = ca.clone(); diff --git a/crates/polars-time/src/chunkedarray/string/mod.rs b/crates/polars-time/src/chunkedarray/string/mod.rs index 6f56986385fe..71022d0be81a 100644 --- a/crates/polars-time/src/chunkedarray/string/mod.rs +++ b/crates/polars-time/src/chunkedarray/string/mod.rs @@ -221,7 +221,7 @@ pub trait StringMethods: AsString { NonExistent::Raise, ), #[cfg(feature = "timezones")] - (true, _) => Ok(ca.into_datetime(tu, Some("UTC".to_string()))), + (true, tz) => Ok(ca.into_datetime(tu, tz.cloned().or_else(|| Some("UTC".to_string())))), _ => Ok(ca.into_datetime(tu, None)), } } @@ -305,7 +305,10 @@ pub trait StringMethods: AsString { Ok(string_ca .apply_generic(|opt_s| convert.eval(opt_s?, use_cache)) .with_name(string_ca.name()) - .into_datetime(tu, Some("UTC".to_string()))) + .into_datetime( + tu, + Some(tz.map(|x| x.to_string()).unwrap_or("UTC".to_string())), + )) } #[cfg(not(feature = "timezones"))] { diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 16c3ce349346..8c6023a1a321 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -109,7 +109,17 @@ def to_datetime( `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is found, the default is `"us"`. time_zone - Time zone for the resulting Datetime column. + Time zone for the resulting Datetime column. Rules are: + + - If inputs are tz-naive and `time_zone` is None, the result time zone is + `None`. + - If inputs are offset-aware and `time_zone` is None, inputs are converted + to `'UTC'` and the result time zone is `'UTC'`. + - If inputs are offset-aware and `time_zone` is given, inputs are converted + to `time_zone` and the result time zone is `time_zone`. + - If inputs are tz-naive and `time_zone` is given, input time zones are + replaced with (not converted to!) `time_zone`, and the result time zone + is `time_zone`. strict Raise an error if any conversion fails. exact diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 0788886cd871..c0c6adba6308 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -99,7 +99,17 @@ def to_datetime( `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is found, the default is `"us"`. time_zone - Time zone for the resulting Datetime column. + Time zone for the resulting Datetime column. Rules are: + + - If inputs are tz-naive and `time_zone` is None, the result time zone is + `None`. + - If inputs are offset-aware and `time_zone` is None, inputs are converted + to `'UTC'` and the result time zone is `'UTC'`. + - If inputs are offset-aware and `time_zone` is given, inputs are converted + to `time_zone` and the result time zone is `time_zone`. + - If inputs are tz-naive and `time_zone` is given, input time zones are + replaced with (not converted to!) `time_zone`, and the result time zone + is `time_zone`. strict Raise an error if any conversion fails. exact diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index f9f78d1a9bc1..c76a94251fcc 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -18,7 +18,7 @@ import polars.selectors as cs from polars._utils.construction import iterable_to_pydf from polars.datatypes import DTYPE_TEMPORAL_UNITS, INTEGER_DTYPES -from polars.exceptions import ComputeError, TimeZoneAwareConstructorWarning +from polars.exceptions import TimeZoneAwareConstructorWarning from polars.testing import ( assert_frame_equal, assert_frame_not_equal, @@ -2502,10 +2502,12 @@ def test_init_vs_strptime_consistency_raises() -> None: [datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))], dtype=pl.Datetime("us", "US/Pacific"), ) - with pytest.raises(ComputeError, match=msg): - pl.Series(["2020-01-01 00:00-08:00"]).str.strptime( - pl.Datetime("us", "US/Pacific") - ) + result = ( + pl.Series(["2020-01-01 00:00-08:00"]) + .str.strptime(pl.Datetime("us", "US/Pacific")) + .item() + ) + assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific")) def test_init_physical_with_timezone() -> None: diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 85762f216438..fea1a324b604 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1205,7 +1205,7 @@ def test_strptime_with_invalid_tz() -> None: pl.Series(["2020-01-01 03:00:00"]).str.strptime(pl.Datetime("us", "foo")) with pytest.raises( ComputeError, - match="Please either drop the time zone from the function call, or set it to UTC", + match="unable to parse time zone: 'foo'", ): pl.Series(["2020-01-01 03:00:00+01:00"]).str.strptime( pl.Datetime("us", "foo"), "%Y-%m-%d %H:%M:%S%z" diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_to_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_to_datetime.py index 599c0b9582c0..95612b07c6ac 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_to_datetime.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_to_datetime.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from datetime import datetime from typing import TYPE_CHECKING @@ -8,7 +9,16 @@ from hypothesis import given import polars as pl +from polars.dependencies import _ZONEINFO_AVAILABLE from polars.exceptions import ComputeError +from polars.testing import assert_series_equal + +if sys.version_info >= (3, 9): + from zoneinfo import ZoneInfo +elif _ZONEINFO_AVAILABLE: + # Import from submodule due to typing issue with backports.zoneinfo package: + # https://github.com/pganssle/zoneinfo/issues/125 + from backports.zoneinfo._zoneinfo import ZoneInfo if TYPE_CHECKING: from hypothesis.strategies import DrawFn @@ -152,3 +162,34 @@ def test_cast_to_time_and_combine(d: datetime, tu: TimeUnit) -> None: assert [d.date() for d in datetimes] == res["dt"].to_list() assert [d.time() for d in datetimes] == res["tm"].to_list() assert datetimes == res["dtm"].to_list() + + +def test_to_datetime_aware_values_aware_dtype() -> None: + s = pl.Series(["2020-01-01T01:12:34+01:00"]) + expected = pl.Series([datetime(2020, 1, 1, 5, 57, 34)]).dt.replace_time_zone( + "Asia/Kathmandu" + ) + + # When Polars infers the format + result = s.str.to_datetime(time_zone="Asia/Kathmandu") + assert_series_equal(result, expected) + + # When the format is provided + result = s.str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="Asia/Kathmandu") + assert_series_equal(result, expected) + + # With `exact=False` + result = s.str.to_datetime( + format="%Y-%m-%dT%H:%M:%S%z", time_zone="Asia/Kathmandu", exact=False + ) + assert_series_equal(result, expected) + + # Check consistency with Series constructor + # TODO: remove `raises`, after https://github.com/pola-rs/polars/pull/16828. + with pytest.raises(ValueError, match="Please either drop"): + result = pl.Series( + [datetime(2020, 1, 1, 5, 57, 34, tzinfo=ZoneInfo("Asia/Kathmandu"))], + dtype=pl.Datetime("us", "Asia/Kathmandu"), + ) + # TODO: uncomment, after https://github.com/pola-rs/polars/pull/16828. + # assert_series_equal(result, expected) diff --git a/py-polars/tests/unit/operations/namespaces/test_strptime.py b/py-polars/tests/unit/operations/namespaces/test_strptime.py index ab49e64dbb88..07bf026faeeb 100644 --- a/py-polars/tests/unit/operations/namespaces/test_strptime.py +++ b/py-polars/tests/unit/operations/namespaces/test_strptime.py @@ -300,11 +300,12 @@ def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None: def test_infer_tz_aware_raises() -> None: - msg = "Please either drop the time zone from the function call, or set it to UTC" - with pytest.raises(ComputeError, match=msg): - pl.Series(["2020-01-02T04:00:00+02:00"]).str.to_datetime( - time_unit="us", time_zone="Europe/Vienna" - ) + result = ( + pl.Series(["2020-01-02T04:00:00+02:00"]) + .str.to_datetime(time_unit="us", time_zone="Europe/Vienna") + .item() + ) + assert result == datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna")) @pytest.mark.parametrize(