diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index ccb816c668..c6337c0e41 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -215,86 +215,214 @@ ### datetime_funcs - [x] add_months + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `inputTypes = Seq(DateType, IntegerType)`; returns `DateType`; codegen delegates to `DateTimeUtils.dateAddMonths`. + - Spark 4.0.1 (audited 2026-05-27): `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true` on `AddMonthsBase`; behaviour and codegen unchanged. - [x] convert_timezone + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. Ternary `(sourceTz, targetTz, sourceTs)`; `inputTypes = Seq(StringType, StringType, TimestampNTZType)`; delegates to `DateTimeUtils.convertTimestampNtzToAnotherTz`. + - Spark 4.0.1 (audited 2026-05-27): timezone `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; behaviour unchanged for ASCII timezone strings. + - Known divergence: Comet composes `to_utc_timestamp` then `from_utc_timestamp` and its native timezone parser only accepts IANA zone IDs and `+HH:MM` offsets, so legacy forms like `GMT+1`, `UTC+1`, or three-letter abbreviations throw a native parse error at execution (https://github.com/apache/datafusion-comet/issues/2013). - [ ] curdate - [ ] current_date - [ ] current_time - [ ] current_timestamp - [x] current_timezone - [x] date_add + - Spark 3.4.3 (audited 2026-05-27): baseline. `(DateType, IntegerType|ShortType|ByteType) -> DateType`; `nullSafeEval` returns `startDays + d.intValue()` with Java int wrap-around; no ANSI branch. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. 
+ - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. - [x] date_diff + - Spark 3.4.3 (audited 2026-05-27): baseline. `(DateType, DateType) -> IntegerType`; `nullSafeEval` is `endDays - startDays` with Java int wrap-around. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. + - Known divergence: the native impl uses non-wrapping `i32 -`, which would panic in debug builds on extreme inputs (Spark wraps); practically unreachable for date inputs. - [x] date_format + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `(TimestampType, StringType) -> StringType`; format string is parsed via `TimestampFormatter` (`DateTimeFormatter` under `CORRECTED` policy, `SimpleDateFormat` under `LEGACY` policy). + - Spark 4.0.1 (audited 2026-05-27): trait set updated to use `DefaultStringProducingExpression`; `nullIntolerant` becomes a field; format `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`. Behaviour unchanged for ASCII format strings. + - Known divergence: only a curated allow-list of `SimpleDateFormat` patterns runs natively (via DataFusion `to_char`). Non-UTC session timezones with an allow-listed format require `spark.comet.expr.dateFormat.allowIncompatible=true`. Non-literal formats, formats outside the allow-list, and the default disabled-codegen path route through Spark's `DateFormatClass.doGenCode` only when `spark.comet.exec.scalaUDF.codegen.enabled=true`; otherwise the operator falls back to Spark. `spark.sql.legacy.timeParserPolicy=LEGACY` is honoured only on the codegen-dispatch / Spark-fallback paths; the native allow-list assumes corrected semantics. - [x] date_from_unix_date + - Spark 3.4.3 (audited 2026-05-27): baseline. 
`IntegerType -> DateType`; `nullSafeEval` is the identity on days-since-epoch. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. - [x] date_part - [x] date_sub + - Spark 3.4.3 (audited 2026-05-27): baseline. Mirror of `DateAdd` (`startDays - d.intValue()`); same input types and wrap-around behaviour; no ANSI branch. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. - [x] date_trunc + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `inputTypes = Seq(StringType, TimestampType)`; format parsed by `parseTruncLevel` (case-insensitive) and supports `YEAR`/`YYYY`/`YY`, `QUARTER`, `MONTH`/`MM`/`MON`, `WEEK`, `DAY`/`DD`, `HOUR`, `MINUTE`, `SECOND`, `MILLISECOND`, `MICROSECOND`. Unknown levels return NULL. Truncation is `TimeZoneAware` and uses `zoneId` for day-and-coarser units. + - Spark 4.0.1 (audited 2026-05-27): format `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; truncation semantics unchanged for ASCII format strings. + - Known divergence: Comet returns incorrect results in non-UTC session timezones for day-and-coarser units (https://github.com/apache/datafusion-comet/issues/2649); marked `Incompatible` when the resolved zone is not `UTC` / `Etc/UTC`. Non-literal format strings are marked `Incompatible` because an unrecognised level raises a native execution error instead of returning NULL; literal formats outside the supported list are `Unsupported` and fall back to Spark. - [x] dateadd + - Spark 3.4.3 (audited 2026-05-27): SQL alias for `date_add`; see that entry. + - Spark 3.5.8 (audited 2026-05-27): SQL alias for `date_add`; see that entry. + - Spark 4.0.1 (audited 2026-05-27): SQL alias for `date_add`; see that entry. 
- [x] datediff + - Spark 3.4.3 (audited 2026-05-27): SQL alias for `date_diff`; see that entry. + - Spark 3.5.8 (audited 2026-05-27): SQL alias for `date_diff`; see that entry. + - Spark 4.0.1 (audited 2026-05-27): SQL alias for `date_diff`; see that entry. - [x] datepart - [x] day + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. SQL alias for `DayOfMonth`; delegates to `DateTimeUtils.getDayOfMonth` via `LocalDate.getDayOfMonth` (1..31). + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [ ] dayname - [x] dayofmonth + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `DayOfMonth extends GetDateField`; delegates to `DateTimeUtils.getDayOfMonth` (1..31). + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [x] dayofweek + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `DayOfWeek extends GetDateField`; returns 1..7 with Sunday=1 via `LocalDate.getDayOfWeek.plus(1).getValue`. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [x] dayofyear + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `DayOfYear extends GetDateField`; returns 1..366 via `LocalDate.getDayOfYear`. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [x] extract - [x] from_unixtime + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. 
`BinaryExpression` with `inputTypes = Seq(LongType, StringType)`; returns `StringType`; uses session `zoneId` to format the resulting timestamp. + - Spark 4.0.1 (audited 2026-05-27): now `DefaultStringProducingExpression`; format `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; `nullIntolerant` set via override instead of the `NullIntolerant` trait. + - Known divergence: Comet only honours the default format pattern `yyyy-MM-dd HH:mm:ss`; any other format falls back to Spark. Implemented by composing DataFusion's `from_unixtime` and `to_char`, so DataFusion's valid timestamp range differs from Spark (https://github.com/apache/datafusion/issues/16594) and Spark datetime patterns are not honoured even when supplied (https://github.com/apache/datafusion/issues/16577). - [x] from_utc_timestamp - Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8. - Spark 3.5.8 (audited 2026-05-12): baseline. - Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings. - Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution. - [x] hour + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `case class Hour` extends `GetTimeField`. + - Spark 4.0.1 (audited 2026-05-27): `case class Hour` is unchanged; parent `GetTimeField` trait refactored to override `nullIntolerant: Boolean = true` instead of mixing in `NullIntolerant` (no behavioural change). + - Known divergence: for `TimestampNTZType` inputs Comet's native path applies session-timezone conversion (Spark treats `TIMESTAMP_NTZ` as wall-clock and ignores session timezone), so the returned hour can differ. Marked `Incompatible` and gated by `spark.comet.expr.allowIncompatible` (https://github.com/apache/datafusion-comet/issues/3180). 
- [x] last_day + - Spark 3.4.3 (audited 2026-05-27): baseline. `DateType -> DateType`; computes `DateTimeUtils.getLastDayOfMonth`; no ANSI branch. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. - [x] localtimestamp - [x] make_date + - Spark 3.4.3 (audited 2026-05-27): baseline. `(IntegerType, IntegerType, IntegerType) -> DateType`; under `spark.sql.ansi.enabled=true` invalid `(year, month, day)` throws `ansiDateTimeError`, else returns NULL. Documented valid year range is 1 to 9999. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): error helper renamed to `ansiDateTimeArgumentOutOfRange`; behaviour otherwise unchanged. + - Known divergence: `SparkMakeDate` in `native/spark-expr/src/datetime_funcs/make_date.rs` always returns NULL on invalid input and never raises, so Comet diverges from Spark when `spark.sql.ansi.enabled=true`. It also accepts year 0 and negative years (chrono's proleptic calendar) which Spark rejects. - [ ] make_dt_interval - [ ] make_interval - [ ] make_time - [x] make_timestamp + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. Septenary `(year, month, day, hour, min, sec[, timezone])` with `sec: DecimalType(16,6)`; honours `spark.sql.ansi.enabled` (throws on invalid input, else NULL); timezone input is `StringType`; result type follows `spark.sql.timestampType`. + - Spark 4.0.1 (audited 2026-05-27): timezone input widened to `StringTypeWithCollation(supportsTrimCollation = true)`; ANSI error helpers renamed (`ansiDateTimeArgumentOutOfRange`, `invalidFractionOfSecondError(value)`); `NullIntolerant` trait replaced by `nullIntolerant: Boolean = true`; new sibling `TryMakeTimestamp` added but routed through a separate expression. 
- [ ] make_timestamp_ltz - [ ] make_timestamp_ntz - [ ] make_ym_interval - [x] minute + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `case class Minute` extends `GetTimeField`. + - Spark 4.0.1 (audited 2026-05-27): `case class Minute` is unchanged; parent `GetTimeField` trait refactored to override `nullIntolerant: Boolean = true` instead of mixing in `NullIntolerant` (no behavioural change). + - Known divergence: for `TimestampNTZType` inputs Comet's native path applies session-timezone conversion (Spark treats `TIMESTAMP_NTZ` as wall-clock and ignores session timezone), so the returned minute can differ. Marked `Incompatible` and gated by `spark.comet.expr.allowIncompatible` (https://github.com/apache/datafusion-comet/issues/3180). - [x] month + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Month extends GetDateField`; delegates to `DateTimeUtils.getMonth` via `LocalDate.getMonthValue` (1..12). + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [ ] monthname - [x] months_between + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. Ternary `(date1: Timestamp, date2: Timestamp, roundOff: Boolean)` returning `DoubleType`; `TimeZoneAwareExpression`; codegen delegates to `DateTimeUtils.monthsBetween`. + - Spark 4.0.1 (audited 2026-05-27): `NullIntolerant` trait dropped in favour of `nullIntolerant: Boolean = true` override; signature and runtime behaviour unchanged. - [x] next_day + - Spark 3.4.3 (audited 2026-05-27): baseline. `(DateType, StringType) -> DateType`; under `spark.sql.ansi.enabled=true` an unrecognised `dayOfWeek` throws `ansiIllegalArgumentError`, else returns NULL. 
Allowed tokens come from `DateTimeUtils.getDayOfWeekFromString` (`SU/SUN/SUNDAY`, `MO/MON/MONDAY`, ...), case-insensitive via `Locale.ROOT`, no trimming. + - Spark 3.5.8 (audited 2026-05-27): identical to 3.4.3. + - Spark 4.0.1 (audited 2026-05-27): error type changed to `SparkIllegalArgumentException`; `inputTypes` now uses `StringTypeWithCollation(supportsTrimCollation = true)`. + - Known divergence: `datafusion-spark::SparkNextDay` returns NULL for malformed `dayOfWeek` regardless of `spark.sql.ansi.enabled`, so ANSI mode does not throw. It also `trim()`s the day-of-week argument before matching, so `' MO '` succeeds natively while Spark would treat it as invalid. - [ ] now - [x] quarter + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Quarter extends GetDateField`; returns 1..4 via `IsoFields.QUARTER_OF_YEAR`. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [x] second + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `case class Second` extends `GetTimeField`. + - Spark 4.0.1 (audited 2026-05-27): `case class Second` is unchanged; parent `GetTimeField` trait refactored to override `nullIntolerant: Boolean = true` instead of mixing in `NullIntolerant` (no behavioural change). + - Known divergence: for `TimestampNTZType` inputs Comet's native path applies session-timezone conversion (Spark treats `TIMESTAMP_NTZ` as wall-clock and ignores session timezone), so the returned second can differ. Marked `Incompatible` and gated by `spark.comet.expr.allowIncompatible` (https://github.com/apache/datafusion-comet/issues/3180). - [ ] session_window - [ ] time_diff - [ ] time_trunc - [x] timestamp_micros + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. 
`MicrosToTimestamp` extends `IntegralToTimestampBase` with `upScaleFactor = 1`; accepts `IntegralType`, returns `TimestampType`; codegen is identity. + - Spark 4.0.1 (audited 2026-05-27): `IntegralToTimestampBase` drops the `NullIntolerant` trait in favour of `nullIntolerant: Boolean = true`; behaviour unchanged. - [x] timestamp_millis + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MillisToTimestamp` extends `IntegralToTimestampBase` with `upScaleFactor = MICROS_PER_MILLIS (1000)`; multiply overflow throws via `Math.multiplyExact`. + - Spark 4.0.1 (audited 2026-05-27): same as 3.5.8 modulo the `NullIntolerant` trait/method refactor in `IntegralToTimestampBase`. - [x] timestamp_seconds + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `inputTypes = Seq(NumericType)` accepting integral, decimal, float, and double; integral values use `Math.multiplyExact` (overflow throws); float and double return NULL on NaN or Infinity. + - Spark 4.0.1 (audited 2026-05-27): `nullIntolerant` set via override instead of the `NullIntolerant` trait; otherwise identical to 3.5.8. + - Known divergence: Comet's Rust impl supports only Int32, Int64, Float32, and Float64. `DecimalType`, `ByteType`, and `ShortType` fall back to Spark. Int64 overflow returns a `ComputeError` matching Spark's `ArithmeticException`. NaN and Infinity map to NULL on the float and double paths. - [ ] to_date - [ ] to_time - [ ] to_timestamp - [ ] to_timestamp_ltz - [ ] to_timestamp_ntz - [x] to_unix_timestamp + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `inputTypes = Seq(TypeCollection(StringType, DateType, TimestampType, TimestampNTZType), StringType)`; returns `LongType` seconds; honours `spark.sql.ansi.enabled` (throws on parse error, else NULL); `TimeZoneAwareExpression`. 
+ - Spark 4.0.1 (audited 2026-05-27): both the value and the format argument become `StringTypeWithCollation(supportsTrimCollation = true)`; a new `suggestedFuncOnFail = "try_to_timestamp"` field is added on `ToTimestamp` (advisory). + - Known divergence: routed through the JVM codegen dispatcher rather than a native kernel, so behaviour is bit-identical to Spark only when `spark.comet.exec.scalaUDF.codegen.enabled=true`; when the flag is off the operator falls back to Spark. - [x] to_utc_timestamp - Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8. - Spark 3.5.8 (audited 2026-05-12): baseline. - Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings. - Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution. - [x] trunc + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `inputTypes = Seq(DateType, StringType)`; `parseTruncLevel` is case-insensitive and accepts `YEAR`/`YYYY`/`YY`, `QUARTER`, `MONTH`/`MM`/`MON`, `WEEK`. Unknown or sub-week levels (`DAY`, `HOUR`, ...) return NULL because `MIN_LEVEL_OF_DATE_TRUNC` is `TRUNC_TO_WEEK`. + - Spark 4.0.1 (audited 2026-05-27): format `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; truncation semantics unchanged for ASCII format strings. + - Known divergence: Comet's native kernel raises an execution error for unknown format strings instead of returning NULL, so non-literal formats are flagged `Incompatible`. Sub-week formats such as `DAY`/`DD` are rejected with `Unsupported` (Spark would return NULL) and fall back to Spark. - [ ] try_make_interval - [ ] try_make_timestamp - [ ] try_to_date - [ ] try_to_time - [ ] try_to_timestamp - [x] unix_date + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. 
+ - Spark 3.5.8 (audited 2026-05-27): baseline. `UnixDate(child)`: `inputTypes = Seq(DateType)`, `dataType = IntegerType`; `nullSafeEval` returns the underlying days-since-epoch int unchanged. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait is replaced by `nullIntolerant: Boolean = true`. - [x] unix_micros + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UnixMicros` extends `TimestampToLongBase` with `scaleFactor = 1`; codegen reduces to identity on the underlying micros. + - Spark 4.0.1 (audited 2026-05-27): same as 3.5.8 modulo the `NullIntolerant` trait/method refactor. - [x] unix_millis + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UnixMillis` extends `TimestampToLongBase` with `scaleFactor = MICROS_PER_MILLIS`; floor-divides timestamp micros by 1000. + - Spark 4.0.1 (audited 2026-05-27): same as 3.5.8 modulo the `NullIntolerant` trait/method refactor. - [x] unix_seconds + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UnixSeconds` extends `TimestampToLongBase` with `scaleFactor = MICROS_PER_SECOND`; accepts `TimestampType` only; returns `LongType`; floor-divides micros by the scale factor. + - Spark 4.0.1 (audited 2026-05-27): `TimestampToLongBase` swaps the `NullIntolerant` trait for `nullIntolerant: Boolean = true`; numerics unchanged. - [x] unix_timestamp + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. Inherits from `UnixTime` / `ToTimestamp`; `inputTypes = Seq(TypeCollection(StringType, DateType, TimestampType, TimestampNTZType), StringType)`; result is `LongType`; honours `failOnError` for ANSI parse errors on the string path. 
+ - Spark 4.0.1 (audited 2026-05-27): `ToTimestamp.inputTypes` widens the string slot to `StringTypeWithCollation(supportsTrimCollation = true)`; behaviour unchanged for non-collated strings. + - Known divergence: Comet's native path only accepts `TimestampType`, `DateType`, and `TimestampNTZType` (string inputs fall back to Spark). For `TimestampType` and `DateType` the session timezone is applied via `array_with_timezone`; for `TimestampNTZType` the microsecond value is divided directly without timezone adjustment. - [x] weekday + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `WeekDay extends GetDateField`; returns 0..6 with Monday=0 via `LocalDate.getDayOfWeek.ordinal()`. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [x] weekofyear + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `WeekOfYear extends GetDateField`; returns the ISO-8601 week-of-week-based-year via `IsoFields.WEEK_OF_WEEK_BASED_YEAR` (Monday start, week 1 has more than 3 days). Comet maps this to DataFusion's `datepart('week', ...)` which uses Arrow's `iso_week().week()`, matching Spark. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. - [ ] window - [ ] window_time - [x] year + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Year extends GetDateField`; delegates to `DateTimeUtils.getYear` via `LocalDate.getYear`. + - Spark 4.0.1 (audited 2026-05-27): identical semantics; `GetDateField` drops the `NullIntolerant` mixin in favour of `nullIntolerant: Boolean = true`. 
### generator_funcs diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index e2995274ad..01018ff712 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -23,7 +23,7 @@ import java.util.Locale import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} +import org.apache.spark.sql.types.{DataType, DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String import org.apache.comet.CometConf @@ -179,23 +179,24 @@ object CometQuarter extends CometExpressionSerde[Quarter] with CometExprGetDateF } } -object CometHour extends CometExpressionSerde[Hour] { +private object TimeFieldSerde { + val timestampNtzIncompatReason: String = + "Incorrectly applies timezone conversion to TimestampNTZ inputs" + + " (https://github.com/apache/datafusion-comet/issues/3180)" - val incompatReason: String = "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)" + def supportLevelForChild(childType: DataType): SupportLevel = childType match { + case TimestampNTZType => Incompatible(Some(timestampNtzIncompatReason)) 
+ case _ => Compatible() + } +} - override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) +object CometHour extends CometExpressionSerde[Hour] { - override def getSupportLevel(expr: Hour): SupportLevel = { - if (expr.child.dataType == TimestampNTZType) { - Incompatible( - Some( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)")) - } else { - Compatible() - } - } + override def getIncompatibleReasons(): Seq[String] = + Seq(TimeFieldSerde.timestampNtzIncompatReason) + + override def getSupportLevel(expr: Hour): SupportLevel = + TimeFieldSerde.supportLevelForChild(expr.child.dataType) override def convert( expr: Hour, @@ -224,20 +225,11 @@ object CometHour extends CometExpressionSerde[Hour] { object CometMinute extends CometExpressionSerde[Minute] { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)") - - override def getSupportLevel(expr: Minute): SupportLevel = { - if (expr.child.dataType == TimestampNTZType) { - Incompatible( - Some( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)")) - } else { - Compatible() - } - } + override def getIncompatibleReasons(): Seq[String] = + Seq(TimeFieldSerde.timestampNtzIncompatReason) + + override def getSupportLevel(expr: Minute): SupportLevel = + TimeFieldSerde.supportLevelForChild(expr.child.dataType) override def convert( expr: Minute, @@ -266,20 +258,11 @@ object CometMinute extends CometExpressionSerde[Minute] { object CometSecond extends CometExpressionSerde[Second] { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)") - - override def getSupportLevel(expr: Second): 
SupportLevel = { - if (expr.child.dataType == TimestampNTZType) { - Incompatible( - Some( - "Incorrectly applies timezone conversion to TimestampNTZ inputs" + - " (https://github.com/apache/datafusion-comet/issues/3180)")) - } else { - Compatible() - } - } + override def getIncompatibleReasons(): Seq[String] = + Seq(TimeFieldSerde.timestampNtzIncompatReason) + + override def getSupportLevel(expr: Second): SupportLevel = + TimeFieldSerde.supportLevelForChild(expr.child.dataType) override def convert( expr: Second, @@ -437,6 +420,11 @@ object CometMakeDate extends CometScalarFunction[MakeDate]("make_date") object CometSecondsToTimestamp extends CometScalarFunction[SecondsToTimestamp]("seconds_to_timestamp") { + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only `IntegerType`, `LongType`, `FloatType`, and `DoubleType` inputs are supported." + + " `DecimalType`, `ByteType`, and `ShortType` fall back to Spark.") + override def getSupportLevel(expr: SecondsToTimestamp): SupportLevel = expr.child.dataType match { case IntegerType | LongType | FloatType | DoubleType => Compatible() @@ -482,8 +470,14 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = Seq("year", "yyyy", "yy", "quarter", "mon", "month", "mm", "week") - override def getIncompatibleReasons(): Seq[String] = Seq( - "Non-literal format strings will throw an exception instead of returning NULL") + private val nonLiteralFormatIncompatReason: String = + "Non-literal format strings will throw an exception instead of returning NULL" + + private def unsupportedFormatReason(fmt: Any): String = + s"Format $fmt is not supported. 
Only the following formats are supported: " + + supportedFormats.mkString(", ") + + override def getIncompatibleReasons(): Seq[String] = Seq(nonLiteralFormatIncompatReason) override def getUnsupportedReasons(): Seq[String] = Seq( "Only the following formats are supported: " + supportedFormats.mkString(", ")) @@ -494,11 +488,10 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { if (supportedFormats.contains(fmt.toString.toLowerCase(Locale.ROOT))) { Compatible() } else { - Unsupported(Some(s"Format $fmt is not supported")) + Unsupported(Some(unsupportedFormatReason(fmt))) } case _ => - Incompatible( - Some("Invalid format strings will throw an exception instead of returning NULL")) + Incompatible(Some(nonLiteralFormatIncompatReason)) } } @@ -521,10 +514,6 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] { object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Produces incorrect results when used with non-UTC timezones. Compatible when timezone is" + - " UTC. (https://github.com/apache/datafusion-comet/issues/2649)") - val supportedFormats: Seq[String] = Seq( "year", @@ -543,6 +532,23 @@ object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] { "millisecond", "microsecond") + private val nonUtcIncompatReason: String = + "Produces incorrect results when used with non-UTC timezones. Compatible when timezone is" + + " UTC. (https://github.com/apache/datafusion-comet/issues/2649)" + + private val nonLiteralFormatIncompatReason: String = + "Non-literal format strings will throw an exception instead of returning NULL" + + private def unsupportedFormatReason(fmt: Any): String = + s"Format $fmt is not supported. 
Only the following formats are supported: " + + supportedFormats.mkString(", ") + + override def getIncompatibleReasons(): Seq[String] = + Seq(nonUtcIncompatReason, nonLiteralFormatIncompatReason) + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only the following formats are supported: " + supportedFormats.mkString(", ")) + override def getSupportLevel(expr: TruncTimestamp): SupportLevel = { val timezone = expr.timeZoneId.getOrElse("UTC") val isUtc = timezone == "UTC" || timezone == "Etc/UTC" @@ -552,17 +558,13 @@ object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] { if (isUtc) { Compatible() } else { - Incompatible( - Some( - s"Incorrect results in non-UTC timezone '$timezone'" + - " (https://github.com/apache/datafusion-comet/issues/2649)")) + Incompatible(Some(nonUtcIncompatReason)) } } else { - Unsupported(Some(s"Format $fmt is not supported")) + Unsupported(Some(unsupportedFormatReason(fmt))) } case _ => - Incompatible( - Some("Invalid format strings will throw an exception instead of returning NULL")) + Incompatible(Some(nonLiteralFormatIncompatReason)) } } @@ -700,24 +702,27 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { * without applying any session timezone offset. 
*/ object CometHours extends CometExpressionSerde[Hours] { + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only `TimestampType` and `TimestampNTZType` inputs are supported.") + + override def getSupportLevel(expr: Hours): SupportLevel = expr.child.dataType match { + case TimestampType | TimestampNTZType => Compatible() + case other => Unsupported(Some(s"Hours does not support input type: $other")) + } + override def convert( expr: Hours, inputs: Seq[Attribute], binding: Boolean): Option[ExprOuterClass.Expr] = { - val optExpr = expr.child.dataType match { - case TimestampType | TimestampNTZType => - exprToProtoInternal(expr.child, inputs, binding).map { childExpr => - val builder = ExprOuterClass.HoursTransform.newBuilder() - builder.setChild(childExpr) + val optExpr = exprToProtoInternal(expr.child, inputs, binding).map { childExpr => + val builder = ExprOuterClass.HoursTransform.newBuilder() + builder.setChild(childExpr) - ExprOuterClass.Expr - .newBuilder() - .setHoursTransform(builder) - .build() - } - case other => - withInfo(expr, s"Hours does not support input type: $other") - None + ExprOuterClass.Expr + .newBuilder() + .setHoursTransform(builder) + .build() } optExprWithInfo(optExpr, expr, expr.child) } @@ -734,6 +739,16 @@ object CometHours extends CometExpressionSerde[Hours] { * The first cast respects the session timezone to correctly determine the date boundary. */ object CometDays extends CometExpressionSerde[Days] { + + override def getUnsupportedReasons(): Seq[String] = Seq( + "Only `DateType` and `TimestampType` inputs are supported." 
+ + " `TimestampNTZType` is not supported.") + + override def getSupportLevel(expr: Days): SupportLevel = expr.child.dataType match { + case DateType | TimestampType => Compatible() + case other => Unsupported(Some(s"Days does not support input type: $other")) + } + override def convert( expr: Days, inputs: Seq[Attribute], @@ -748,9 +763,7 @@ object CometDays extends CometExpressionSerde[Days] { childExpr.flatMap { child => CometCast.castToProto(expr, Some(timezone), DateType, child, CometEvalMode.LEGACY) } - case other => - withInfo(expr, s"Days does not support input type: $other") - None + case _ => None } // Convert DateType to IntegerType (days since epoch) diff --git a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala index e5eeb5b848..fe7bacbf49 100644 --- a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala @@ -29,12 +29,26 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn // https://github.com/apache/datafusion/issues/16594 object CometFromUnixTime extends CometExpressionSerde[FromUnixTime] { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Only supports the default datetime format pattern `yyyy-MM-dd HH:mm:ss`." 
+ - " DataFusion's valid timestamp range differs from Spark" + - " (https://github.com/apache/datafusion/issues/16594)") + private val incompatReason: String = + "DataFusion's valid timestamp range differs from Spark" + + " (https://github.com/apache/datafusion/issues/16594)" - override def getSupportLevel(expr: FromUnixTime): SupportLevel = Incompatible(None) + private val unsupportedFormatReason: String = + "Only the default datetime format pattern `yyyy-MM-dd HH:mm:ss` is supported;" + + " other patterns fall back to Spark" + + " (https://github.com/apache/datafusion/issues/16577)" + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) + + override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedFormatReason) + + override def getSupportLevel(expr: FromUnixTime): SupportLevel = { + if (expr.format != Literal(TimestampFormatter.defaultPattern)) { + Unsupported(Some(unsupportedFormatReason)) + } else { + Incompatible(Some(incompatReason)) + } + } override def convert( expr: FromUnixTime, @@ -48,10 +62,7 @@ object CometFromUnixTime extends CometExpressionSerde[FromUnixTime] { val formatExpr = exprToProtoInternal(Literal("%Y-%m-%d %H:%M:%S"), inputs, binding) val timeZone = exprToProtoInternal(Literal(expr.timeZoneId.orNull), inputs, binding) - if (expr.format != Literal(TimestampFormatter.defaultPattern)) { - withInfo(expr, "Datetime pattern format is unsupported") - None - } else if (secExpr.isDefined && formatExpr.isDefined) { + if (secExpr.isDefined && formatExpr.isDefined) { val timestampExpr = scalarFunctionExprToProto("from_unixtime", Seq(secExpr, timeZone): _*) val optExpr = scalarFunctionExprToProto("to_char", Seq(timestampExpr, formatExpr): _*) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time.sql index a7b0960570..0a2206f0c0 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time.sql +++ 
b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time.sql @@ -24,12 +24,12 @@ INSERT INTO test_from_unix_time VALUES (0), (1718451045), (-1), (NULL), (2147483 query expect_fallback(not fully compatible with Spark) SELECT from_unixtime(t) FROM test_from_unix_time -query expect_fallback(not fully compatible with Spark) +query expect_fallback(Only the default datetime format pattern) SELECT from_unixtime(t, 'yyyy-MM-dd') FROM test_from_unix_time -- literal arguments query expect_fallback(not fully compatible with Spark) SELECT from_unixtime(0) -query expect_fallback(not fully compatible with Spark) +query expect_fallback(Only the default datetime format pattern) SELECT from_unixtime(1718451045, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_date_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_date_ansi.sql new file mode 100644 index 0000000000..04a28e3e1c --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_date_ansi.sql @@ -0,0 +1,36 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. 
+ +-- ANSI mode: Spark's MakeDate wraps the java.time.DateTimeException from LocalDate.of in +-- ansiDateTimeArgumentOutOfRange (4.0) / ansiDateTimeError (3.4/3.5) when +-- spark.sql.ansi.enabled=true. Comet's native SparkMakeDate always returns NULL on +-- invalid input and never raises, so it does not throw under ANSI. The ignored queries +-- below capture the divergence; remove ignore(...) when +-- https://github.com/apache/datafusion-comet/issues/4451 is fixed. +-- Config: spark.sql.ansi.enabled=true + +-- February 30 is not a valid date. +query ignore(https://github.com/apache/datafusion-comet/issues/4451) +SELECT make_date(2024, 2, 30) + +-- Month 13 is out of range. +query ignore(https://github.com/apache/datafusion-comet/issues/4451) +SELECT make_date(2024, 13, 1) + +-- Day 0 is out of range. +query ignore(https://github.com/apache/datafusion-comet/issues/4451) +SELECT make_date(2024, 6, 0) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/next_day.sql b/spark/src/test/resources/sql-tests/expressions/datetime/next_day.sql index 057c9daea6..65cccb9356 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/next_day.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/next_day.sql @@ -72,3 +72,8 @@ SELECT next_day(date('2023-01-01'), 'Monday'), next_day(date('2023-01-01'), 'Sun -- null handling query SELECT next_day(NULL, 'Monday'), next_day(date('2023-01-01'), NULL) + +-- Comet's native impl trims whitespace before matching the day name; Spark does not, so +-- ' MO ' is invalid in Spark (NULL) but matches Monday in Comet. 
+query ignore(https://github.com/apache/datafusion-comet/issues/4450) +SELECT next_day(date('2024-01-01'), ' MO '), next_day(date('2024-01-01'), 'MO ') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/next_day_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/next_day_ansi.sql new file mode 100644 index 0000000000..9f8f8e435f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/next_day_ansi.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: Spark's NextDay throws SparkIllegalArgumentException on a malformed +-- dayOfWeek. Comet's native impl (datafusion-spark::SparkNextDay) always returns NULL, +-- so it does not throw under ANSI. No expect_error assertion exists yet: the queries below +-- should be switched to expect_error once https://github.com/apache/datafusion-comet/issues/4449 +-- is fixed; until then they are ignored so the suite stays green. +-- Config: spark.sql.ansi.enabled=true + +-- Comet returns NULL where Spark throws. 
+query ignore(https://github.com/apache/datafusion-comet/issues/4449) +SELECT next_day(date('2024-01-01'), 'NOT_A_DAY') + +query ignore(https://github.com/apache/datafusion-comet/issues/4449) +SELECT next_day(date('2024-01-01'), '') diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 20ad90a91c..0700713b4a 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -65,7 +65,7 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH // Comet should fall back to Spark if format is not a literal checkSparkAnswerAndFallbackReason( "SELECT c0, trunc(c0, c1) from tbl order by c0, c1", - "Invalid format strings will throw an exception instead of returning NULL") + "Non-literal format strings will throw an exception instead of returning NULL") } test("date_trunc (TruncTimestamp) - reading from DataFrame") { @@ -89,7 +89,7 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH // Comet should fall back to Spark if format is not a literal checkSparkAnswerAndFallbackReason( "SELECT c0, date_trunc(fmt, c0) from tbl order by c0, fmt", - "Invalid format strings will throw an exception instead of returning NULL") + "Non-literal format strings will throw an exception instead of returning NULL") } } @@ -117,7 +117,7 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH // Comet should fall back to Spark if format is not a literal checkSparkAnswerAndFallbackReason( "SELECT c0, date_trunc(fmt, c0) from tbl order by c0, fmt", - "Invalid format strings will throw an exception instead of returning NULL") + "Non-literal format strings will throw an exception instead of returning NULL") } } }