From 9f15dda4a2da26e577b5827897647246c91822c0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 27 May 2026 08:22:24 -0600 Subject: [PATCH] feat: add GetTimestamp support via codegen dispatcher Routes Spark's GetTimestamp expression through the existing codegen dispatcher so that to_timestamp(s, fmt), to_date(s, fmt), to_timestamp_ntz(s, fmt), and try_to_timestamp(s, fmt) execute natively in Comet instead of falling back to Spark. GetTimestamp depends on Spark's full SimpleDateFormat / TimestampFormatter parsing semantics, including time parser policies, ANSI mode, timezone handling, and locale awareness. datafusion-spark has no equivalent function. The codegen dispatcher runs Spark's own doGenCode inside the Comet pipeline, which guarantees identical behavior across Spark 3.4, 3.5, and 4.0. --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/get_timestamp.sql | 95 +++++++++++++++++++ .../datetime/get_timestamp_ansi.sql | 46 +++++++++ 4 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp_ansi.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index b818b61b1b..26b7b36162 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -233,6 +233,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[FromUnixTime] -> CometFromUnixTime, classOf[FromUTCTimestamp] -> CometFromUTCTimestamp, classOf[ToUTCTimestamp] -> CometToUTCTimestamp, + classOf[GetTimestamp] -> CometGetTimestamp, classOf[LastDay] -> CometLastDay, classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, diff --git 
a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index e2995274ad..846d093915 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, GetTimestamp, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -789,3 +789,5 @@ object CometUnixMillis extends CometCodegenDispatch[UnixMillis] object CometUnixMicros extends CometCodegenDispatch[UnixMicros] object CometToUnixTimestamp extends CometCodegenDispatch[ToUnixTimestamp] + +object CometGetTimestamp extends CometCodegenDispatch[GetTimestamp] diff --git 
a/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp.sql b/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp.sql new file mode 100644 index 0000000000..f394bed13b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp.sql @@ -0,0 +1,95 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes GetTimestamp through the codegen dispatcher. +-- GetTimestamp is generated by to_timestamp(string, format), to_date(string, format), +-- to_timestamp_ntz(string, format) and try_to_timestamp(string, format). 
+-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_get_timestamp(s string) USING parquet + +statement +INSERT INTO test_get_timestamp VALUES + ('2024-06-15 10:30:45'), + ('1970-01-01 00:00:00'), + ('1969-12-31 23:59:59'), + ('2024-13-01 00:00:00'), + ('garbage'), + (''), + (NULL) + +-- to_timestamp(string, format) -> GetTimestamp with TimestampType output +query +SELECT to_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_get_timestamp + +-- to_date(string, format) -> Cast(GetTimestamp(...), DateType) +query +SELECT to_date(s, 'yyyy-MM-dd HH:mm:ss') FROM test_get_timestamp + +-- try_to_timestamp(string, format) -> GetTimestamp with failOnError=false +query +SELECT try_to_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_get_timestamp + +-- literal arguments +query +SELECT to_timestamp('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss') + +query +SELECT to_date('2024-06-15', 'yyyy-MM-dd') + +query +SELECT try_to_timestamp('foo', 'yyyy-MM-dd') + +query +SELECT to_timestamp(NULL, 'yyyy-MM-dd HH:mm:ss') + +-- date-only format +statement +CREATE TABLE test_get_timestamp_dates(s string) USING parquet + +statement +INSERT INTO test_get_timestamp_dates VALUES + ('2024-06-15'), + ('1970-01-01'), + (NULL) + +query +SELECT to_timestamp(s, 'yyyy-MM-dd') FROM test_get_timestamp_dates + +query +SELECT to_date(s, 'yyyy-MM-dd') FROM test_get_timestamp_dates + +-- column-as-format (non-literal format) +statement +CREATE TABLE test_get_timestamp_fmt(s string, fmt string) USING parquet + +statement +INSERT INTO test_get_timestamp_fmt VALUES + ('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss'), + ('2024-06-15', 'yyyy-MM-dd'), + ('06/15/2024', 'MM/dd/yyyy'), + (NULL, 'yyyy-MM-dd'), + ('2024-06-15', NULL) + +query +SELECT to_timestamp(s, fmt) FROM test_get_timestamp_fmt + +-- to_timestamp_ntz(string, format) -> GetTimestamp with TimestampNTZType output +query +SELECT to_timestamp_ntz(s, 'yyyy-MM-dd HH:mm:ss') FROM 
test_get_timestamp diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp_ansi.sql new file mode 100644 index 0000000000..007fb1a17e --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/get_timestamp_ansi.sql @@ -0,0 +1,46 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: GetTimestamp throws on parse failure. The codegen dispatcher inherits +-- the throw from Spark's own GetTimestamp.doGenCode. The time parser policy is pinned +-- to CORRECTED so the JDK java.time formatter (and the CANNOT_PARSE_TIMESTAMP error class) +-- is exercised regardless of the runtime default. +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true +-- The CANNOT_PARSE_TIMESTAMP error class was standardized in Spark 3.5. 
+-- MinSparkVersion: 3.5 + +query expect_error(CANNOT_PARSE_TIMESTAMP) +SELECT to_timestamp('not a date', 'yyyy-MM-dd') + +query expect_error(CANNOT_PARSE_TIMESTAMP) +SELECT to_timestamp('2024-13-99', 'yyyy-MM-dd') + +query expect_error(CANNOT_PARSE_TIMESTAMP) +SELECT to_date('not a date', 'yyyy-MM-dd') + +-- try_to_timestamp does NOT throw under ANSI mode (failOnError=false) +query +SELECT try_to_timestamp('not a date', 'yyyy-MM-dd') + +-- Sentinel: confirms Comet ran the expression natively. If the dispatcher silently rejects +-- GetTimestamp, the error queries above pass vacuously via Spark fallback. This valid +-- query uses checkSparkAnswerAndOperator and fails if Comet did not execute it natively. +query +SELECT to_timestamp('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss')