Skip to content

Commit

Permalink
Handle strings with partial date/time specifications and time zones i…
Browse files Browse the repository at this point in the history
…n Presto's from_iso8601_timestamp and cast to TSwTZ from Varchar (#11412)

Summary:
Pull Request resolved: #11412

Today in from_iso8601_timestamp and when casting to TimestampWithTimeZone from Varchar,
Velox doesn't support specifying a time zone, unless theres is a timestamp also specified.  Presto
Java allows specifying just a date and a time zone in these cases. The root cause was that tryParseTimestampString in TimestampConversion wouldn't allow trailing
characters after a date unless they were a time (trailing characters after a time were allowed).  For
from_iso8601_timestamp, I had to update tryParseDateString to allow just specifying the year or the
year and month with a time zone as well.

In addition I noticed from_iso8601_timestamp does not allow specifying just the hour portion of the
timestamp with a time zone (without a time zone this is allowed).  Again, this is allowed in Presto
Java. The root cause was tryParseTimeString not allowing trailing characters after the hour unless
they are a minute in Iso8601 parse mode.

Reviewed By: bikramSingh91

Differential Revision: D65370931

fbshipit-source-id: 9fcb67ed57cf89f7642cc8a6ab6082c2643f219f
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Nov 5, 2024
1 parent c95f1e0 commit f4a1223
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 15 deletions.
2 changes: 1 addition & 1 deletion velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ TEST_F(CastExprTest, stringToTimestamp) {
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "1970-01-01T00:00")),
"Cannot cast VARCHAR '1970-01-01T00:00' to TIMESTAMP. Unable to parse timestamp value");
"Cannot cast VARCHAR '1970-01-01T00:00' to TIMESTAMP. Unknown timezone value: \"T00:00\"");
VELOX_ASSERT_THROW(
(evaluateOnce<Timestamp, std::string>(
"cast(c0 as timestamp)", "201915-04-23 11:46:00.000")),
Expand Down
62 changes: 62 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4091,6 +4091,13 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Timestamp) {
for (const auto& timezone : timezones) {
setQueryTimeZone(timezone.timezone->name());

EXPECT_EQ(
TimestampWithTimezone(
kMillisInDay + 11 * kMillisInHour + 38 * kMillisInMinute +
56 * kMillisInSecond + 123 - 3 * kMillisInHour,
tz::locateZone("+03:00")),
fromIso("1970-01-02T11:38:56.123+03:00"));

EXPECT_EQ(
TimestampWithTimezone(
kMillisInDay + 11 * kMillisInHour + 38 * kMillisInMinute +
Expand Down Expand Up @@ -4142,6 +4149,28 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Timestamp) {
TimestampWithTimezone(-timezone.offset, timezone.timezone),
fromIso("1970"));

// Trailing time separator.
EXPECT_EQ(
TimestampWithTimezone(-timezone.offset, timezone.timezone),
fromIso("1970-01-01T"));
EXPECT_EQ(
TimestampWithTimezone(-timezone.offset, timezone.timezone),
fromIso("1970-01T"));
EXPECT_EQ(
TimestampWithTimezone(-timezone.offset, timezone.timezone),
fromIso("1970T"));

// No time but with a time zone.
EXPECT_EQ(
TimestampWithTimezone(-1 * kMillisInHour, tz::locateZone("+01:00")),
fromIso("1970-01-01T+01:00"));
EXPECT_EQ(
TimestampWithTimezone(2 * kMillisInHour, tz::locateZone("-02:00")),
fromIso("1970-01T-02:00"));
EXPECT_EQ(
TimestampWithTimezone(-14 * kMillisInHour, tz::locateZone("+14:00")),
fromIso("1970T+14:00"));

// No date.
EXPECT_EQ(
TimestampWithTimezone(
Expand Down Expand Up @@ -4174,6 +4203,39 @@ TEST_F(DateTimeFunctionsTest, fromIso8601Timestamp) {
TimestampWithTimezone(
11 * kMillisInHour - timezone.offset, timezone.timezone),
fromIso("T11"));

// No date but with a time zone.
EXPECT_EQ(
TimestampWithTimezone(
11 * kMillisInHour + 38 * kMillisInMinute + 56 * kMillisInSecond +
123 + 14 * kMillisInHour,
tz::locateZone("-14:00")),
fromIso("T11:38:56.123-14:00"));

EXPECT_EQ(
TimestampWithTimezone(
11 * kMillisInHour + 38 * kMillisInMinute + 56 * kMillisInSecond +
123 - 11 * kMillisInHour,
tz::locateZone("+11:00")),
fromIso("T11:38:56,123+11:00"));

EXPECT_EQ(
TimestampWithTimezone(
11 * kMillisInHour + 38 * kMillisInMinute + 56 * kMillisInSecond +
12 * kMillisInHour,
tz::locateZone("-12:00")),
fromIso("T11:38:56-12:00"));

EXPECT_EQ(
TimestampWithTimezone(
11 * kMillisInHour + 38 * kMillisInMinute - 7 * kMillisInHour,
tz::locateZone("+07:00")),
fromIso("T11:38+07:00"));

EXPECT_EQ(
TimestampWithTimezone(
11 * kMillisInHour + 8 * kMillisInHour, tz::locateZone("-08:00")),
fromIso("T11-08:00"));
}

VELOX_ASSERT_THROW(
Expand Down
25 changes: 20 additions & 5 deletions velox/functions/prestosql/tests/TimestampWithTimeZoneCastTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarchar) {
"2012-10-31 01:00:47 -06:00",
"1994-05-06 15:49 Europe/Vienna",
"1979-02-24 08:33:31 Pacific/Chatham",
"1979-02-24 08:33:31 +13:45"});
"1979-02-24 08:33:31 +13:45",
"1979-02-24 Atlantic/Bermuda"});

// Varchar representations above hold local time (relative to specified time
// zone). For instance, the first string represents a wall-clock displaying
Expand All @@ -83,17 +84,19 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarchar) {
auto denverUTC = parseTimestamp("2012-10-31 07:00:47").toMillis();
auto viennaUTC = parseTimestamp("1994-05-06 13:49:00").toMillis();
auto chathamUTC = parseTimestamp("1979-02-23 18:48:31").toMillis();
auto bermudaUTC = parseTimestamp("1979-02-24 04:00:00").toMillis();

auto timestamps = std::vector<int64_t>{
0, denverUTC, denverUTC, viennaUTC, chathamUTC, chathamUTC};
0, denverUTC, denverUTC, viennaUTC, chathamUTC, chathamUTC, bermudaUTC};

auto timezones = std::vector<TimeZoneKey>{
{0,
(int16_t)tz::getTimeZoneID("America/Denver"),
(int16_t)tz::getTimeZoneID("-06:00"),
(int16_t)tz::getTimeZoneID("Europe/Vienna"),
(int16_t)tz::getTimeZoneID("Pacific/Chatham"),
(int16_t)tz::getTimeZoneID("+13:45")}};
(int16_t)tz::getTimeZoneID("+13:45"),
(int16_t)tz::getTimeZoneID("Atlantic/Bermuda")}};

const auto expected = makeTimestampWithTimeZoneVector(
timestamps.size(),
Expand Down Expand Up @@ -166,6 +169,12 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
const auto invalidStringVector4 = makeNullableFlatVector<StringView>(
{"2012-10-31 35:00:47 America/Los_Angeles"});

const auto invalidStringVector5 =
makeNullableFlatVector<StringView>({"2012 America/Los_Angeles"});

const auto invalidStringVector6 =
makeNullableFlatVector<StringView>({"2012-10 America/Los_Angeles"});

auto millis = parseTimestamp("2012-10-31 07:00:47").toMillis();
auto timestamps = std::vector<int64_t>{millis};

Expand All @@ -185,10 +194,16 @@ TEST_F(TimestampWithTimeZoneCastTest, fromVarcharInvalidInput) {
"Unknown timezone value: \"America/California\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector3, expected),
"Unable to parse timestamp value: \"2012-10-31foo01:00:47 America/Los_Angeles\"");
"Unknown timezone value: \"foo01:00:47\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector4, expected),
"Unable to parse timestamp value: \"2012-10-31 35:00:47 America/Los_Angeles\"");
"Unknown timezone value: \"35:00:47\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector5, expected),
"Unable to parse timestamp value: \"2012 America/Los_Angeles\"");
VELOX_ASSERT_THROW(
testCast(invalidStringVector6, expected),
"Unable to parse timestamp value: \"2012-10 America/Los_Angeles\"");
}

TEST_F(TimestampWithTimeZoneCastTest, toTimestamp) {
Expand Down
30 changes: 21 additions & 9 deletions velox/type/TimestampConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ bool tryParseDateString(

// No month or day.
if ((mode == ParseMode::kSparkCast || mode == ParseMode::kIso8601) &&
pos == len) {
(pos == len || buf[pos] == 'T')) {
Expected<int64_t> expected = daysSinceEpochFromDate(year, 1, 1);
if (expected.hasError()) {
return false;
Expand Down Expand Up @@ -308,7 +308,7 @@ bool tryParseDateString(

// No day.
if ((mode == ParseMode::kSparkCast || mode == ParseMode::kIso8601) &&
pos == len) {
(pos == len || buf[pos] == 'T')) {
Expected<int64_t> expected = daysSinceEpochFromDate(year, month, 1);
if (expected.hasError()) {
return false;
Expand Down Expand Up @@ -443,6 +443,7 @@ bool tryParseTimeString(
size_t& pos,
int64_t& result,
TimestampParseMode parseMode) {
static constexpr int sep = ':';
int32_t hour = 0, min = 0, sec = 0, micros = 0;
pos = 0;

Expand Down Expand Up @@ -470,7 +471,7 @@ bool tryParseTimeString(
return false;
}

if (pos >= len) {
if (pos >= len || buf[pos] != sep) {
if (parseMode == TimestampParseMode::kIso8601) {
result = fromTime(hour, 0, 0, 0);
return true;
Expand All @@ -479,8 +480,7 @@ bool tryParseTimeString(
}

// Fetch the separator.
int sep = buf[pos++];
if (sep != ':') {
if (buf[pos++] != sep) {
// Invalid separator.
return false;
}
Expand Down Expand Up @@ -529,8 +529,11 @@ bool tryParseTimeString(
return true;
}

// String format is "YYYY-MM-DD hh:mm:ss.microseconds" (seconds and microseconds
// are optional). ISO 8601
// Parses a variety of timestamp strings, depending on the value of `parseMode`.
// Consumes as much of the string as it can and sets `result` to the
// timestamp from whatever it successfully parses. `pos` is set to the position
// of first character that was not consumed. Returns true if it successfully
// parsed at least a date, `result` is only set if true is returned.
bool tryParseTimestampString(
const char* buf,
size_t len,
Expand All @@ -551,6 +554,10 @@ bool tryParseTimestampString(

if (parseMode == TimestampParseMode::kIso8601 && pos < len &&
buf[pos] == 'T') {
if (pos == len - 1) {
// If the string is just 'T'.
return false;
}
// No date. Assume 1970-01-01.
} else if (!tryParseDateString(
buf,
Expand All @@ -575,7 +582,11 @@ bool tryParseTimestampString(
size_t timePos = 0;
if (!tryParseTimeString(
buf + pos, len - pos, timePos, microsSinceMidnight, parseMode)) {
return false;
// The rest of the string is not a valid time, but it could be relevant to
// the caller (e.g. it could be a time zone), return the date we parsed and
// let them decide what to do with the rest.
result = fromDatetime(daysSinceEpoch, 0);
return true;
}

pos += timePos;
Expand Down Expand Up @@ -860,7 +871,8 @@ fromTimestampWithTimezoneString(

const tz::TimeZone* timeZone = nullptr;

if (pos < len && characterIsSpace(str[pos])) {
if (pos < len && parseMode != TimestampParseMode::kIso8601 &&
characterIsSpace(str[pos])) {
pos++;
}

Expand Down
18 changes: 18 additions & 0 deletions velox/type/tests/TimestampConversionTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,24 @@ TEST(DateTimeUtilTest, fromTimestampWithTimezoneString) {
EXPECT_EQ(
parseTimestampWithTimezone("1970-01-01 00:00:01 America/Los_Angeles"),
std::make_pair(Timestamp(1, 0), tz::locateZone("America/Los_Angeles")));

EXPECT_EQ(
parseTimestampWithTimezone("1970-01-01 Pacific/Fiji"),
std::make_pair(Timestamp(0, 0), tz::locateZone("Pacific/Fiji")));

EXPECT_EQ(
parseTimestampWithTimezone(
"1970-01-01T+01:00", TimestampParseMode::kIso8601),
std::make_pair(Timestamp(0, 0), tz::locateZone("+01:00")));

EXPECT_EQ(
parseTimestampWithTimezone(
"1970-01T+14:00", TimestampParseMode::kIso8601),
std::make_pair(Timestamp(0, 0), tz::locateZone("+14:00")));

EXPECT_EQ(
parseTimestampWithTimezone("1970T-06:00", TimestampParseMode::kIso8601),
std::make_pair(Timestamp(0, 0), tz::locateZone("-06:00")));
}

TEST(DateTimeUtilTest, toGMT) {
Expand Down

0 comments on commit f4a1223

Please sign in to comment.