diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index a6cced4e12..53b5c4ea24 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -147,6 +147,15 @@ Cast operations in Comet fall into three levels of support: Spark. - **N/A**: Spark does not support this cast. +### String to Timestamp + +Comet's native `CAST(string AS TIMESTAMP)` implementation supports all timestamp formats accepted +by Apache Spark, including ISO 8601 date-time strings, date-only strings, time-only strings +(`HH:MM:SS`), embedded timezone offsets (e.g. `+07:30`, `GMT-01:00`, `UTC`), named timezone +suffixes (e.g. `Europe/Moscow`), and the full Spark timestamp year range +(-290308 to 294247). Note that `CAST(string AS DATE)` is only compatible for years between +262143 BC and 262142 AD due to an underlying library limitation. + ### Legacy Mode diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index d1487aaea4..c100e3994d 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -406,7 +406,12 @@ impl PhysicalPlanner { Ok(Arc::new(Cast::new( child, datatype, - SparkCastOptions::new(eval_mode, &expr.timezone, expr.allow_incompat), + { + let mut opts = + SparkCastOptions::new(eval_mode, &expr.timezone, expr.allow_incompat); + opts.is_spark4_plus = expr.is_spark4_plus; + opts + }, spark_expr.expr_id, query_context, ))) diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto index c12b29df19..5701577463 100644 --- a/native/proto/src/proto/expr.proto +++ b/native/proto/src/proto/expr.proto @@ -266,6 +266,9 @@ message Cast { string timezone = 3; EvalMode eval_mode = 4; bool allow_incompat = 5; + // True when running against Spark 4.0+. Controls version-specific cast behaviour + // such as the handling of leading whitespace before T-prefixed time-only strings. 
+ bool is_spark4_plus = 6; } message BinaryExpr { diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index 6e5a80c84a..38a5e4a8ed 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -131,6 +131,9 @@ pub struct SparkCastOptions { pub timezone: String, /// Allow casts that are supported but not guaranteed to be 100% compatible pub allow_incompat: bool, + /// True when running against Spark 4.0+. Enables version-specific cast behaviour + /// such as the handling of leading whitespace before T-prefixed time-only strings. + pub is_spark4_plus: bool, /// Support casting unsigned ints to signed ints (used by Parquet SchemaAdapter) pub allow_cast_unsigned_ints: bool, /// We also use the cast logic for adapting Parquet schemas, so this flag is used @@ -148,6 +151,7 @@ impl SparkCastOptions { eval_mode, timezone: timezone.to_string(), allow_incompat, + is_spark4_plus: false, allow_cast_unsigned_ints: false, is_adapting_schema: false, null_string: "null".to_string(), @@ -160,6 +164,7 @@ impl SparkCastOptions { eval_mode, timezone: "".to_string(), allow_incompat, + is_spark4_plus: false, allow_cast_unsigned_ints: false, is_adapting_schema: false, null_string: "null".to_string(), @@ -296,9 +301,13 @@ pub(crate) fn cast_array( let cast_result = match (&from_type, to_type) { (Utf8, Boolean) => spark_cast_utf8_to_boolean::(&array, eval_mode), (LargeUtf8, Boolean) => spark_cast_utf8_to_boolean::(&array, eval_mode), - (Utf8, Timestamp(_, _)) => { - cast_string_to_timestamp(&array, to_type, eval_mode, &cast_options.timezone) - } + (Utf8, Timestamp(_, _)) => cast_string_to_timestamp( + &array, + to_type, + eval_mode, + &cast_options.timezone, + cast_options.is_spark4_plus, + ), (Utf8, Date32) => cast_string_to_date(&array, to_type, eval_mode), (Date32, Int32) => { // Date32 is stored as days since epoch (i32), so this is a simple reinterpret cast diff --git 
a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index cdff90a4ea..fbad964ec3 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -25,7 +25,7 @@ use arrow::datatypes::{ i256, is_validate_decimal_precision, DataType, Date32Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMicrosecondType, }; -use chrono::{DateTime, NaiveDate, TimeZone, Timelike}; +use chrono::{DateTime, LocalResult, NaiveDate, NaiveTime, Offset, TimeZone, Timelike}; use num::traits::CheckedNeg; use num::{CheckedSub, Integer}; use regex::Regex; @@ -36,7 +36,7 @@ use std::sync::{Arc, LazyLock}; macro_rules! cast_utf8_to_timestamp { // $tz is a Timezone:Tz object and contains the session timezone. // $to_tz_str is a string containing the to_type timezone - ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr) => {{ + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr, $is_spark4_plus:expr) => {{ let len = $array.len(); let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str); let mut cast_err: Option = None; @@ -44,11 +44,29 @@ macro_rules! cast_utf8_to_timestamp { if $array.is_null(i) { cast_array.append_null() } else { - match $cast_method($array.value(i).trim(), $eval_mode, $tz) { + // we use trim_end instead of trim because strings with leading spaces are interpreted differently + // by Spark in cases where the string has only the time component starting with T. + // The string " T2" results in null while "T2" results in a valid timestamp. 
+ match $cast_method($array.value(i).trim_end(), $eval_mode, $tz, $is_spark4_plus) { Ok(Some(cast_value)) => cast_array.append_value(cast_value), Ok(None) => cast_array.append_null(), Err(e) => { if $eval_mode == EvalMode::Ansi { + // Replace the error value with the raw (untrimmed) input to match + // Spark's behavior: Spark reports the original string in CAST_INVALID_INPUT. + let raw_value = $array.value(i).to_string(); + let e = match e { + SparkError::InvalidInputInCastToDatetime { + from_type, + to_type, + .. + } => SparkError::InvalidInputInCastToDatetime { + value: raw_value, + from_type, + to_type, + }, + other => other, + }; cast_err = Some(e); break; } @@ -671,6 +689,7 @@ pub(crate) fn cast_string_to_timestamp( to_type: &DataType, eval_mode: EvalMode, timezone_str: &str, + is_spark4_plus: bool, ) -> SparkResult { let string_array = array .as_any() @@ -689,7 +708,8 @@ pub(crate) fn cast_string_to_timestamp( TimestampMicrosecondType, timestamp_parser, tz, - to_tz + to_tz, + is_spark4_plus )? } _ => unreachable!("Invalid data type {:?} in cast from string", to_type), @@ -988,8 +1008,11 @@ fn get_timestamp_values( .parse::() .unwrap_or_default(); - // NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142. - if !(-262143..=262142).contains(&year) { + // Guard against years that cannot produce a valid i64 microsecond timestamp. + // The Long.MaxValue/MinValue boundaries correspond to years 294247 / -290308. + // We allow a slightly wider range and let parse_timestamp_to_micros perform the + // exact overflow check via checked arithmetic. 
+ if !(-290309..=294248).contains(&year) { return Ok(None); } @@ -998,7 +1021,13 @@ fn get_timestamp_values( let hour = parts.next().map_or(0, |h| h.parse::().unwrap_or(0)); let minute = parts.next().map_or(0, |m| m.parse::().unwrap_or(0)); let second = parts.next().map_or(0, |s| s.parse::().unwrap_or(0)); - let microsecond = parts.next().map_or(0, |ms| ms.parse::().unwrap_or(0)); + let microsecond = parts.next().map_or(0, |ms| { + // Truncate to at most 6 digits then scale to fill the microsecond field. + // E.g. ".123" -> 123 * 10^3 = 123_000 µs; ".1234567" -> truncated to 123_456 µs. + let ms = &ms[..ms.len().min(6)]; + let n = ms.len(); + ms.parse::().unwrap_or(0) * 10u32.pow((6 - n) as u32) + }); let mut timestamp_info = TimeStampInfo::default(); @@ -1046,31 +1075,133 @@ fn get_timestamp_values( parse_timestamp_to_micros(timestamp_info, tz) } +/// Howard Hinnant's algorithm: proleptic Gregorian days since 1970-01-01 for any i64 year. +/// Works correctly for positive and negative years via Euclidean floor division. +/// Spark uses Java's equivalent [LocalDate.toEpochDay](https://github.com/openjdk/jdk/blob/cddee6d6eb3e048635c380a32bd2f6ebfd2c18b5/src/java.base/share/classes/java/time/LocalDate.java#L1954) +fn days_from_civil(y: i64, m: i64, d: i64) -> i64 { + let (y, m) = if m <= 2 { (y - 1, m + 9) } else { (y, m - 3) }; + let era = if y >= 0 { y / 400 } else { (y - 399) / 400 }; + let yoe = y - era * 400; // year of era [0, 399] + let doy = (153 * m + 2) / 5 + d - 1; // day of year [0, 365] + let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // day of era [0, 146096] + era * 146097 + doe - 719468 +} + fn parse_timestamp_to_micros( timestamp_info: &TimeStampInfo, tz: &T, ) -> SparkResult> { - let datetime = tz.with_ymd_and_hms( + // Build NaiveDateTime explicitly so we can pattern-match LocalResult variants and + // handle the DST spring-forward gap case. 
+ let naive_date_opt = NaiveDate::from_ymd_opt( timestamp_info.year, timestamp_info.month, timestamp_info.day, + ); + + // NaiveTime is used for the common path; also validates hour/min/sec. + let naive_time = match NaiveTime::from_hms_opt( timestamp_info.hour, timestamp_info.minute, timestamp_info.second, - ); - - // Spark uses the offset before daylight savings change so we need to use earliest() - // Return None for LocalResult::None which is the invalid time in a DST spring forward gap). - let tz_datetime = match datetime.earliest() { - Some(dt) => dt - .with_timezone(tz) - .with_nanosecond(timestamp_info.microsecond * 1000), - None => return Ok(None), + ) { + Some(t) => t, + None => return Ok(None), // invalid time components }; - match tz_datetime { - Some(dt) => Ok(Some(dt.timestamp_micros())), - None => Ok(None), + if let Some(naive_date) = naive_date_opt { + let local_naive = naive_date.and_time(naive_time); + + // Resolve local datetime to UTC, handling DST transitions. + // We compute base_micros with second precision (local_naive has no sub-second component), + // then add microseconds at the end to avoid calling with_nanosecond(), which internally + // calls from_local_datetime().single() and returns None for ambiguous (fall-back) times. + let base_micros: Option = match tz.from_local_datetime(&local_naive) { + // Unambiguous local time. + LocalResult::Single(dt) => Some(dt.timestamp_micros()), + // DST fall-back overlap: Spark picks the earlier UTC instant (pre-transition offset). + LocalResult::Ambiguous(earlier, _) => Some(earlier.timestamp_micros()), + // DST spring-forward gap: the local time does not exist. 
+ // Java's ZonedDateTime.of() advances by the gap length, which is equivalent to + // utc = local_naive − pre_gap_offset + LocalResult::None => { + let probe = local_naive - chrono::Duration::hours(3); + let pre_offset = match tz.from_local_datetime(&probe) { + LocalResult::Single(dt) => dt.offset().fix(), + LocalResult::Ambiguous(dt, _) => dt.offset().fix(), + LocalResult::None => return Ok(None), + }; + let offset_secs = pre_offset.local_minus_utc() as i64; + let utc_naive = local_naive - chrono::Duration::seconds(offset_secs); + Some(utc_naive.and_utc().timestamp_micros()) + } + }; + + Ok(base_micros.map(|m| m + timestamp_info.microsecond as i64)) + } else { + // NaiveDate::from_ymd_opt returned None. This means either: + // (a) invalid calendar date (Feb 29 on non-leap year, month 13, etc.) + // (b) year outside chrono's representable range (> 262143 or < -262144) + // + // For case (b) we fall back to Howard Hinnant's direct arithmetic, which works + // for any year that fits in i64. This covers the Long.MaxValue / Long.MinValue + // boundary timestamps (year 294247 / -290308). + let year = timestamp_info.year as i64; + if (-262144..=262143).contains(&year) { + // Year is in chrono's range but date was rejected -> truly invalid date. + return Ok(None); + } + // Validate month and day manually for extreme years. + let m = timestamp_info.month; + let d = timestamp_info.day; + if !(1..=12).contains(&m) { + return Ok(None); + } + let max_day = match m { + 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31, + 4 | 6 | 9 | 11 => 30, + 2 => { + let leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0); + if leap { + 29 + } else { + 28 + } + } + _ => return Ok(None), + }; + if d < 1 || d > max_day { + return Ok(None); + } + // Compute the timezone offset using epoch as a surrogate probe point. + // Extreme-year timestamps are only valid with a UTC-like fixed offset (any DST + // zone would overflow). Using epoch gives us the standard offset. 
+ let epoch_probe = NaiveDate::from_ymd_opt(1970, 1, 1) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap(); + let tz_offset_secs: i64 = match tz.from_local_datetime(&epoch_probe) { + LocalResult::Single(dt) => dt.offset().fix().local_minus_utc() as i64, + LocalResult::Ambiguous(dt, _) => dt.offset().fix().local_minus_utc() as i64, + LocalResult::None => 0, + }; + // Compute seconds since epoch via direct calendar arithmetic. + // Use i128 for the intermediate multiply-by-1_000_000 step: the seconds value can be + // just outside the i64 range while the final microseconds result is still within range + // (e.g., Long.MinValue boundary: seconds = -9_223_372_036_855, result = i64::MIN). + let days = days_from_civil(year, m as i64, d as i64); + let time_secs = timestamp_info.hour as i64 * 3600 + + timestamp_info.minute as i64 * 60 + + timestamp_info.second as i64; + let total_secs = days + .checked_mul(86400) + .and_then(|s| s.checked_add(time_secs)) + .and_then(|s| s.checked_sub(tz_offset_secs)); + let utc_micros = total_secs.and_then(|s| { + let micros128 = s as i128 * 1_000_000 + timestamp_info.microsecond as i128; + i64::try_from(micros128).ok() + }); + Ok(utc_micros) } } @@ -1109,40 +1240,229 @@ fn timestamp_parser( value: &str, eval_mode: EvalMode, tz: &T, + is_spark4_plus: bool, ) -> SparkResult> { - let value = value.trim(); - if value.is_empty() { + let trimmed = value.trim(); + if trimmed.is_empty() { return Ok(None); } + // Spark 4.0+ rejects leading whitespace for ALL T-prefixed time-only strings + // (T, T:, T::, T::.), but accepts trailing whitespace. + // Spark 3.x trims all whitespace first, so leading whitespace is accepted there. + // Check the raw (pre-trim) value for leading whitespace before any T-time-only match. 
+ if is_spark4_plus + && value.len() > value.trim_start().len() + && (RE_TIME_ONLY_H.is_match(trimmed) + || RE_TIME_ONLY_HM.is_match(trimmed) + || RE_TIME_ONLY_HMS.is_match(trimmed) + || RE_TIME_ONLY_HMSU.is_match(trimmed)) + { + return if eval_mode == EvalMode::Ansi { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP".to_string(), + }) + } else { + Ok(None) + }; + } + let value = trimmed; + // Spark accepts a leading '+' year sign on full date-time strings (e.g. "+2020-01-01T12:34:56") + // but rejects it on time-only strings (e.g. "+12:12:12" -> null). + // Detect: '+' followed by at least one digit and then a '-' separator -> year prefix -> strip '+'. + // Anything else starting with '+' (time-only, bare number, etc.) -> null. + let value = if let Some(rest) = value.strip_prefix('+') { + let first_non_digit = rest.find(|c: char| !c.is_ascii_digit()); + match first_non_digit { + Some(i) if i >= 1 && rest.as_bytes()[i] == b'-' => rest, + _ => return Ok(None), + } + } else { + value + }; - // Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset. - if let Some((stripped, offset_secs)) = extract_offset_suffix(value) { - let fixed_tz = chrono::FixedOffset::east_opt(offset_secs) - .ok_or_else(|| SparkError::Internal("Invalid timezone offset".to_string()))?; - return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz); + // Only attempt offset-suffix extraction when the value does not already match a + // base pattern. This prevents the '-' in plain date strings like "2015-03-18" + // from being misidentified as a negative-offset sign. 
+ let has_direct_match = RE_YEAR.is_match(value) + || RE_MONTH.is_match(value) + || RE_DAY.is_match(value) + || RE_HOUR.is_match(value) + || RE_MINUTE.is_match(value) + || RE_SECOND.is_match(value) + || RE_MICROSECOND.is_match(value) + || RE_TIME_ONLY_H.is_match(value) + || RE_TIME_ONLY_HM.is_match(value) + || RE_TIME_ONLY_HMS.is_match(value) + || RE_TIME_ONLY_HMSU.is_match(value) + || RE_BARE_HM.is_match(value) + || RE_BARE_HMS.is_match(value) + || RE_BARE_HMSU.is_match(value); + + if !has_direct_match { + if let Some((stripped, suffix_tz)) = extract_offset_suffix(value) { + return timestamp_parser_with_tz(stripped, eval_mode, &suffix_tz); + } } timestamp_parser_with_tz(value, eval_mode, tz) } -/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the -/// stripped string and the offset in seconds. Returns `None` if no offset suffix is present. -fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> { +/// Parses the portion of an offset string AFTER any "UTC"/"GMT"/"UT" prefix (or the +/// full bare +/- offset including its sign character). Returns the offset in whole seconds, +/// or `None` for any malformed, out-of-range, or trailing-garbage input. +/// +/// Accepted formats (H = 1–2 digit hour, M = 1–2 digit minute): +/// "" -> 0 (bare "UTC" / "GMT" / "UT") +/// "+H" -> +H*3600 (hour-only, e.g. "+0" from "UTC+0") +/// "+HH" -> same +/// "+HHMM" -> +H*3600+M*60 (4 digits, no colon) +/// "+H:M" -> same (with colon, any digit count 1-2 each) +/// "+HH:MM" -> same +/// (negative with '-' analogously) +/// +/// Hours must be 0–18 and minutes 0–59. A trailing colon ("+8:") is rejected. 
+fn parse_sign_offset(s: &str) -> Option { + if s.is_empty() { + return Some(0); + } + let (sign, rest) = match s.as_bytes().first() { + Some(&b'+') => (1i32, &s[1..]), + Some(&b'-') => (-1i32, &s[1..]), + _ => return None, + }; + if rest.is_empty() { + return None; // lone '+' or '-' + } + let (h, m) = if let Some(colon_pos) = rest.find(':') { + let h_str = &rest[..colon_pos]; + let m_str = &rest[colon_pos + 1..]; + if m_str.is_empty() { + return None; // trailing colon: "+8:" + } + let h: i32 = h_str.parse().ok()?; + // Note: "+HH:MM:SS" (with seconds) is not handled; Spark accepts it but it is rare. + let m: i32 = m_str.parse().ok()?; + (h, m) + } else { + match rest.len() { + 1 | 2 => (rest.parse::().ok()?, 0), + 4 => ( + rest[..2].parse::().ok()?, + rest[2..].parse::().ok()?, + ), + _ => return None, + } + }; + if !(0..=18).contains(&h) || !(0..=59).contains(&m) { + return None; + } + Some(sign * (h * 3600 + m * 60)) +} + +/// Constructs a `timezone::Tz` from an offset measured in seconds. +/// E.g. `+7*3600 + 30*60` -> `"+07:30"`. +fn tz_from_offset_secs(secs: i32) -> Option { + let abs = secs.abs(); + let h = abs / 3600; + let m = (abs % 3600) / 60; + let sign = if secs >= 0 { '+' } else { '-' }; + timezone::Tz::from_str(&format!("{}{:02}:{:02}", sign, h, m)).ok() +} + +/// Returns the last (rightmost) byte position where `needle` starts inside `haystack`. +fn rfind_str(haystack: &str, needle: &str) -> Option { + let hb = haystack.as_bytes(); + let nb = needle.as_bytes(); + if nb.len() > hb.len() { + return None; + } + (0..=(hb.len() - nb.len())) + .rev() + .find(|&i| hb[i..].starts_with(nb)) +} + +/// If `value` ends with a recognised timezone suffix, returns `(datetime_prefix, Tz)`. +/// Returns `None` when no suffix is found. +/// +/// Recognised forms (in matching priority order): +/// Z -> UTC+0 +/// UTC / " UTC" -> UTC+0 (or UTC +/- offset, e.g. 
"UTC+0", " UTC+07:30") +/// GMT / " GMT" -> UTC+0 (or GMT +/- offset) +/// UT / " UT" -> UTC+0 (or UT +/- offset) +/// Named IANA zone -> e.g. " Europe/Moscow" +/// Bare +/-offset -> e.g. "+07:30", "-1:0", "+0730" +/// +/// **The caller must ensure the value does not already match a base timestamp pattern.** +/// Without that guard a bare '-' in "2015-03-18" would be misread as a -18:00 offset. +fn extract_offset_suffix(value: &str) -> Option<(&str, timezone::Tz)> { + // 1. Z suffix if let Some(stripped) = value.strip_suffix('Z') { - return Some((stripped, 0)); - } - // Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits) - if value.len() >= 6 { - let suffix_start = value.len() - 6; - let suffix = &value[suffix_start..]; - let sign_byte = suffix.as_bytes()[0]; - if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] == b':' { - if let (Ok(h), Ok(m)) = (suffix[1..3].parse::(), suffix[4..6].parse::()) { - let sign = if sign_byte == b'+' { 1i32 } else { -1i32 }; - return Some((&value[..suffix_start], sign * (h * 3600 + m * 60))); + return Some((stripped, tz_from_offset_secs(0)?)); + } + + // 2. Named text-prefix forms: "UTC", "GMT", "UT" (optionally space-prefixed), + // each optionally followed by a bare +/-offset. + // Longest first so " UTC" is tried before " UT", etc. + for prefix in &[" UTC", "UTC", " GMT", "GMT", " UT", "UT"] { + if let Some(pos) = rfind_str(value, prefix) { + let offset_str = &value[pos + prefix.len()..]; + if let Some(secs) = parse_sign_offset(offset_str) { + return Some((&value[..pos], tz_from_offset_secs(secs)?)); + } + } + } + + // 3. Java SHORT_IDS fixed-offset abbreviations recognised by ZoneId.of() via SHORT_IDS map. + // Only three have purely fixed offsets (no '/'): + // EST -> -05:00 (-18 000 s) + // MST -> -07:00 (-25 200 s) + // HST -> -10:00 (-36 000 s) + // These may appear with or without a leading space; no sub-offset is allowed after them. 
+ for (abbr, offset_secs) in &[ + (" EST", -18_000i32), + ("EST", -18_000), + (" MST", -25_200), + ("MST", -25_200), + (" HST", -36_000), + ("HST", -36_000), + ] { + if let Some(pos) = rfind_str(value, abbr) { + if pos + abbr.len() == value.len() { + return Some((&value[..pos], tz_from_offset_secs(*offset_secs)?)); + } + } + } + + // 4. Named IANA timezone: a space followed by a slash-containing word at the end. + // e.g. "2015-03-18T12:03:17.123456 Europe/Moscow" + if let Some(space_pos) = value.rfind(' ') { + let tz_name = &value[space_pos + 1..]; + if tz_name.contains('/') { + if let Ok(tz) = timezone::Tz::from_str(tz_name) { + return Some((&value[..space_pos], tz)); } } } + + // 5. Bare +/-offset: find the rightmost '+' or '-' and try to parse everything + // from that position to the end as a complete valid offset. + let last_sign = { + let p = value.rfind('+'); + let m = value.rfind('-'); + match (p, m) { + (Some(p), Some(m)) => Some(p.max(m)), + (a, b) => a.or(b), + } + }; + if let Some(pos) = last_sign { + let offset_str = &value[pos..]; + if let Some(secs) = parse_sign_offset(offset_str) { + return Some((&value[..pos], tz_from_offset_secs(secs)?)); + } + } + None } @@ -1158,19 +1478,19 @@ static RE_MINUTE: LazyLock = static RE_SECOND: LazyLock = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}$").unwrap()); static RE_MICROSECOND: LazyLock = - LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap()); + LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d+$").unwrap()); static RE_TIME_ONLY_H: LazyLock = LazyLock::new(|| Regex::new(r"^T\d{1,2}$").unwrap()); static RE_TIME_ONLY_HM: LazyLock = - LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}$").unwrap()); + LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{1,2}$").unwrap()); static RE_TIME_ONLY_HMS: LazyLock = - LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}$").unwrap()); + LazyLock::new(|| 
Regex::new(r"^T\d{1,2}:\d{1,2}:\d{1,2}$").unwrap()); static RE_TIME_ONLY_HMSU: LazyLock = - LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap()); -static RE_BARE_HM: LazyLock = LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}$").unwrap()); + LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{1,2}:\d{1,2}\.\d+$").unwrap()); +static RE_BARE_HM: LazyLock = LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{1,2}$").unwrap()); static RE_BARE_HMS: LazyLock = - LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}$").unwrap()); + LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{1,2}:\d{1,2}$").unwrap()); static RE_BARE_HMSU: LazyLock = - LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap()); + LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{1,2}:\d{1,2}\.\d+$").unwrap()); fn timestamp_parser_with_tz( value: &str, @@ -1345,9 +1665,12 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult> return return_result(date_str, eval_mode); } - //assign a sign to the date - if bytes[j] == b'-' || bytes[j] == b'+' { - sign = if bytes[j] == b'-' { -1 } else { 1 }; + // assign a sign to the date; both '-' and '+' are accepted (Spark stringToDate line 357-360) + if bytes[j] == b'-' { + sign = -1; + j += 1; + } else if bytes[j] == b'+' { + // sign remains 1 (positive) j += 1; } @@ -1446,7 +1769,8 @@ mod tests { TimestampMicrosecondType, timestamp_parser, tz, - "UTC" + "UTC", + true ) .unwrap(); @@ -1477,7 +1801,8 @@ mod tests { TimestampMicrosecondType, timestamp_parser, tz, - "UTC" + "UTC", + true ); assert!( result.is_err(), @@ -1485,6 +1810,40 @@ mod tests { ); } + #[test] + fn test_cast_string_to_timestamp_ansi_error_trimmed_value() { + // The error value in InvalidInputInCastToDatetime must match the raw input + // (including trailing whitespace) to match Spark's CAST_INVALID_INPUT behavior. 
+ let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("91\n3 "), // trailing spaces after a newline in the middle + ])); + let tz = &timezone::Tz::from_str("UTC").unwrap(); + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Ansi; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser, + tz, + "UTC", + true + ); + match result { + Err(SparkError::InvalidInputInCastToDatetime { value, .. }) => { + assert_eq!( + value, "91\n3 ", + "ANSI error value should match the raw (untrimmed) input to match Spark behavior" + ); + } + other => panic!("Expected InvalidInputInCastToDatetime error, got {other:?}"), + } + } + #[test] fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data @@ -1512,156 +1871,449 @@ mod tests { Ok(()) } + #[test] + fn extreme_year_boundary_test() { + let tz = &timezone::Tz::from_str("UTC").unwrap(); + // Long.MaxValue = 9223372036854775807 μs -> 294247-01-10T04:00:54.775807Z + assert_eq!( + timestamp_parser("294247-01-10T04:00:54.775807Z", EvalMode::Legacy, tz, true).unwrap(), + Some(i64::MAX), + ); + // Long.MinValue = -9223372036854775808 μs -> -290308-12-21T19:59:05.224192Z + assert_eq!( + timestamp_parser("-290308-12-21T19:59:05.224192Z", EvalMode::Legacy, tz, true).unwrap(), + Some(i64::MIN), + ); + // One beyond Long.MaxValue -> null (overflow) + assert_eq!( + timestamp_parser("294247-01-10T04:00:54.775808Z", EvalMode::Legacy, tz, true).unwrap(), + None, + ); + // One before Long.MinValue -> null (overflow) + assert_eq!( + timestamp_parser("-290308-12-21T19:59:05.224191Z", EvalMode::Legacy, tz, true).unwrap(), + None, + ); + } + + #[test] + fn test_leading_whitespace_t_hm() { + let tz = &timezone::Tz::from_str("UTC").unwrap(); + // Spark 4.0+ rejects leading whitespace for ALL T-prefixed time-only patterns. 
+ for ws_input in &[" T2:30", "\tT2:30", "\nT2:30", " T2", "\tT2", "\nT2"] { + assert!( + timestamp_parser(ws_input, EvalMode::Legacy, tz, true) + .unwrap() + .is_none(), + "'{ws_input}' should be null in Legacy mode on Spark 4.0+" + ); + // In ANSI mode the same inputs must raise an error (not silently return null). + assert!( + timestamp_parser(ws_input, EvalMode::Ansi, tz, true).is_err(), + "'{ws_input}' should error in ANSI mode on Spark 4.0+" + ); + // Spark 3.x trims all whitespace first, so leading whitespace is valid. + assert!( + timestamp_parser(ws_input, EvalMode::Legacy, tz, false) + .unwrap() + .is_some(), + "'{ws_input}' should be valid in Legacy mode on Spark 3.x" + ); + } + // Without leading whitespace, these must be valid on all versions. + for ok_input in &["T2:30", "T2"] { + assert!( + timestamp_parser(ok_input, EvalMode::Legacy, tz, true) + .unwrap() + .is_some(), + "'{ok_input}' should be valid" + ); + } + } + + #[test] + fn plus_sign_year_test() { + let tz = &timezone::Tz::from_str("UTC").unwrap(); + // Spark accepts '+year' prefix on full date-time strings for TIMESTAMP casts. + // "+2020-01-01T12:34:56" -> 2020-01-01T12:34:56 UTC = 1577882096 seconds. + assert_eq!( + timestamp_parser("+2020-01-01T12:34:56", EvalMode::Legacy, tz, true).unwrap(), + Some(1577882096000000), + "+year on full datetime should parse the same as without the + prefix" + ); + // But '+' on a time-only string is rejected (Spark returns null). 
+ assert_eq!( + timestamp_parser("+12:12:12", EvalMode::Legacy, tz, true).unwrap(), + None, + "+hour:min:sec must return null" + ); + } + #[test] #[cfg_attr(miri, ignore)] // test takes too long with miri fn timestamp_parser_test() { let tz = &timezone::Tz::from_str("UTC").unwrap(); // write for all formats assert_eq!( - timestamp_parser("2020", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020", EvalMode::Legacy, tz, true).unwrap(), Some(1577836800000000) // this is in milliseconds ); assert_eq!( - timestamp_parser("2020-01", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01", EvalMode::Legacy, tz, true).unwrap(), Some(1577836800000000) ); assert_eq!( - timestamp_parser("2020-01-01", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01", EvalMode::Legacy, tz, true).unwrap(), Some(1577836800000000) ); assert_eq!( - timestamp_parser("2020-01-01T12", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12", EvalMode::Legacy, tz, true).unwrap(), Some(1577880000000000) ); assert_eq!( - timestamp_parser("2020-01-01T12:34", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12:34", EvalMode::Legacy, tz, true).unwrap(), Some(1577882040000000) ); assert_eq!( - timestamp_parser("2020-01-01T12:34:56", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12:34:56", EvalMode::Legacy, tz, true).unwrap(), Some(1577882096000000) ); assert_eq!( - timestamp_parser("2020-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12:34:56.123456", EvalMode::Legacy, tz, true).unwrap(), Some(1577882096123456) ); assert_eq!( - timestamp_parser("0100", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100", EvalMode::Legacy, tz, true).unwrap(), Some(-59011459200000000) ); assert_eq!( - timestamp_parser("0100-01", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01", EvalMode::Legacy, tz, true).unwrap(), Some(-59011459200000000) ); assert_eq!( - timestamp_parser("0100-01-01", 
EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01-01", EvalMode::Legacy, tz, true).unwrap(), Some(-59011459200000000) ); assert_eq!( - timestamp_parser("0100-01-01T12", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01-01T12", EvalMode::Legacy, tz, true).unwrap(), Some(-59011416000000000) ); assert_eq!( - timestamp_parser("0100-01-01T12:34", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01-01T12:34", EvalMode::Legacy, tz, true).unwrap(), Some(-59011413960000000) ); assert_eq!( - timestamp_parser("0100-01-01T12:34:56", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01-01T12:34:56", EvalMode::Legacy, tz, true).unwrap(), Some(-59011413904000000) ); assert_eq!( - timestamp_parser("0100-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("0100-01-01T12:34:56.123456", EvalMode::Legacy, tz, true).unwrap(), Some(-59011413903876544) ); assert_eq!( - timestamp_parser("10000", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000", EvalMode::Legacy, tz, true).unwrap(), Some(253402300800000000) ); assert_eq!( - timestamp_parser("10000-01", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01", EvalMode::Legacy, tz, true).unwrap(), Some(253402300800000000) ); assert_eq!( - timestamp_parser("10000-01-01", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01-01", EvalMode::Legacy, tz, true).unwrap(), Some(253402300800000000) ); assert_eq!( - timestamp_parser("10000-01-01T12", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01-01T12", EvalMode::Legacy, tz, true).unwrap(), Some(253402344000000000) ); assert_eq!( - timestamp_parser("10000-01-01T12:34", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01-01T12:34", EvalMode::Legacy, tz, true).unwrap(), Some(253402346040000000) ); assert_eq!( - timestamp_parser("10000-01-01T12:34:56", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01-01T12:34:56", EvalMode::Legacy, tz, true).unwrap(), Some(253402346096000000) ); 
assert_eq!( - timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy, tz, true).unwrap(), Some(253402346096123456) ); // Space separator (same values as T separator) assert_eq!( - timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz, true).unwrap(), Some(1577880000000000) ); assert_eq!( - timestamp_parser("2020-01-01 12:34", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01 12:34", EvalMode::Legacy, tz, true).unwrap(), Some(1577882040000000) ); assert_eq!( - timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy, tz, true).unwrap(), Some(1577882096000000) ); assert_eq!( - timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy, tz, true).unwrap(), Some(1577882096123456) ); // Z suffix (UTC) assert_eq!( - timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy, tz, true).unwrap(), Some(1577882096000000) ); // Positive offset suffix assert_eq!( - timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy, tz).unwrap(), + timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy, tz, true).unwrap(), Some(1577862296000000) // 12:34:56 UTC+5:30 = 07:04:56 UTC ); // T-prefixed time-only with colon - assert!(timestamp_parser("T12:34", EvalMode::Legacy, tz) + assert!(timestamp_parser("T12:34", EvalMode::Legacy, tz, true) .unwrap() .is_some()); - assert!(timestamp_parser("T12:34:56", EvalMode::Legacy, tz) - .unwrap() - .is_some()); - assert!(timestamp_parser("T12:34:56.123456", EvalMode::Legacy, tz) + assert!(timestamp_parser("T12:34:56", EvalMode::Legacy, tz, true) .unwrap() .is_some()); + assert!( + timestamp_parser("T12:34:56.123456", EvalMode::Legacy, 
tz, true) + .unwrap() + .is_some() + ); // Bare time-only (hour:minute without T prefix) - assert!(timestamp_parser("12:34", EvalMode::Legacy, tz) + assert!(timestamp_parser("12:34", EvalMode::Legacy, tz, true) .unwrap() .is_some()); - assert!(timestamp_parser("12:34:56", EvalMode::Legacy, tz) + assert!(timestamp_parser("12:34:56", EvalMode::Legacy, tz, true) .unwrap() .is_some()); // Negative year - assert!(timestamp_parser("-0001", EvalMode::Legacy, tz) + assert!(timestamp_parser("-0001", EvalMode::Legacy, tz, true) .unwrap() .is_some()); assert!( - timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz) + timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz, true) .unwrap() .is_some() ); } + #[test] + #[cfg_attr(miri, ignore)] + fn timestamp_parser_fraction_scaling_test() { + let tz = &timezone::Tz::from_str("UTC").unwrap(); + // Base: "2020-01-01T12:34:56" = 1577882096000000 µs (confirmed by timestamp_parser_test) + let base = 1577882096000000i64; + + // 3-digit fraction: ".123" -> 123_000 µs + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.123", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 123_000) + ); + // 1-digit fraction: ".1" -> 100_000 µs + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.1", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 100_000) + ); + // 4-digit fraction: ".1000" -> 100_000 µs (trailing zeros not extra precision) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.1000", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 100_000) + ); + // 5-digit fraction: ".12312" -> 123_120 µs + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.12312", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 123_120) + ); + // 6-digit fraction (exact): unchanged + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.123456", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 123_456) + ); + // >6 digits: truncated to 6 + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.123456789", EvalMode::Legacy, tz, 
true).unwrap(), + Some(base + 123_456) + ); + // Fraction after Z-stripped offset + assert_eq!( + timestamp_parser("2020-01-01T12:34:56.123Z", EvalMode::Legacy, tz, true).unwrap(), + Some(base + 123_000) + ); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn timestamp_parser_tz_offset_formats_test() { + let tz = &timezone::Tz::from_str("UTC").unwrap(); + // All of these represent 2020-01-01T12:34:56 UTC = 1577882096000000 µs. + let utc = 1577882096000000i64; + // +05:30 offset -> UTC = 12:34:56 − 5h30m = 07:04:56 UTC = 1577862296000000 µs + let plus530 = 1577862296000000i64; + + // +/-HHMM (no colon) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56+0000", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56+0530", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // +/-H:MM (single-digit hour) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56+5:30", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56+0:00", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + // +/-H:M (single-digit both) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56+5:3", EvalMode::Legacy, tz, true).unwrap(), + Some(1577863916000000) // 12:34:56 − 5h3m = 07:31:56 UTC = 1577836800+27116 + ); + // bare UTC / " UTC" + assert_eq!( + timestamp_parser("2020-01-01T12:34:56UTC", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 UTC", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + // UTC+offset + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 UTC+5:30", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // UTC+0 (single-digit zero) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 UTC+0", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + // GMT+/-HH:MM (no space) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56GMT+00:00", EvalMode::Legacy, tz, true).unwrap(), + 
Some(utc) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56GMT+05:30", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // " GMT+/-..." (space-prefixed) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 GMT+05:30", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // " GMT+/-HHMM" (space + GMT + no colon) + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 GMT+0530", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // " UT+/-HH:MM" + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 UT+05:30", EvalMode::Legacy, tz, true).unwrap(), + Some(plus530) + ); + // Bare "UT" (no leading space) — Spark accepts "UT" as a UTC alias. + assert_eq!( + timestamp_parser("2020-01-01T12:34:56UT", EvalMode::Legacy, tz, true).unwrap(), + Some(utc) + ); + // Java SHORT_IDS: EST (-05:00), MST (-07:00), HST (-10:00) + // 2020-01-01T12:34:56 EST = 2020-01-01T17:34:56 UTC = 1577900096 s + let est_utc = utc + 5 * 3600 * 1_000_000; + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 EST", EvalMode::Legacy, tz, true).unwrap(), + Some(est_utc) + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56EST", EvalMode::Legacy, tz, true).unwrap(), + Some(est_utc) + ); + // 2020-01-01T12:34:56 MST = 2020-01-01T19:34:56 UTC + let mst_utc = utc + 7 * 3600 * 1_000_000; + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 MST", EvalMode::Legacy, tz, true).unwrap(), + Some(mst_utc) + ); + // 2020-01-01T12:34:56 HST = 2020-01-01T22:34:56 UTC + let hst_utc = utc + 10 * 3600 * 1_000_000; + assert_eq!( + timestamp_parser("2020-01-01T12:34:56 HST", EvalMode::Legacy, tz, true).unwrap(), + Some(hst_utc) + ); + // Named IANA zone " Europe/Moscow" (UTC+3 in winter 2020) + // 2020-01-01T12:34:56 Europe/Moscow = 2020-01-01T09:34:56 UTC = 1577871296000000 µs + assert_eq!( + timestamp_parser( + "2020-01-01T12:34:56 Europe/Moscow", + EvalMode::Legacy, + tz, + true + ) + .unwrap(), + Some(1577871296000000) + ); + // Plain date strings must NOT be affected by the 
offset-extraction logic. + assert_eq!( + timestamp_parser("2020-01-01", EvalMode::Legacy, tz, true).unwrap(), + Some(1577836800000000) + ); + // Invalid offset formats -> null + assert_eq!( + timestamp_parser("2020-01-01T12:34:56-8:", EvalMode::Legacy, tz, true).unwrap(), + None + ); + assert_eq!( + timestamp_parser("2020-01-01T12:34:56-20:0", EvalMode::Legacy, tz, true).unwrap(), + None // h=20 > 18 invalid + ); + // Positive year-sign prefix is accepted for timestamps (see plus_sign_year_test) + assert_eq!( + timestamp_parser("+2020-01-01T12:34:56", EvalMode::Legacy, tz, true).unwrap(), + Some(1577882096000000) + ); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn timestamp_parser_dst_test() { + // DST spring-forward: America/New_York springs forward 2020-03-08 02:00 -> 03:00. + // 02:30 does not exist; Spark advances to 03:30 EDT (UTC-4) = 07:30 UTC. + // 2020-03-08T07:30:00Z = 1577836800 + 67*86400 + 27000 = 1583652600 seconds. + let ny_tz = &timezone::Tz::from_str("America/New_York").unwrap(); + assert_eq!( + timestamp_parser("2020-03-08 02:30:00", EvalMode::Legacy, ny_tz, true).unwrap(), + Some(1583652600000000) + ); + // Just before gap: 01:59:59 EST (UTC-5) = 06:59:59 UTC = 1583650799 seconds. + assert_eq!( + timestamp_parser("2020-03-08 01:59:59", EvalMode::Legacy, ny_tz, true).unwrap(), + Some(1583650799000000) + ); + // Just after gap: 03:00:00 EDT (UTC-4) = 07:00:00 UTC = 1583650800 seconds. + assert_eq!( + timestamp_parser("2020-03-08 03:00:00", EvalMode::Legacy, ny_tz, true).unwrap(), + Some(1583650800000000) + ); + + // DST fall-back: 2020-11-01 02:00 EDT -> 01:00 EST. Ambiguous: [01:00, 02:00). + // Spark picks the earlier UTC instant (pre-transition = EDT = UTC-4). + // 01:30 EDT (UTC-4) = 05:30 UTC. + // 2020-11-01 = 2020-01-01 + 305 days = 1577836800 + 305*86400 = 1604188800 seconds. 
+ assert_eq!( + timestamp_parser("2020-11-01 01:30:00", EvalMode::Legacy, ny_tz, true).unwrap(), + Some(1604208600000000) // 1604188800 + 5*3600 + 30*60 = 1604208600 + ); + } + #[test] fn date_parser_test() { for date in &[ "2020", "2020-01", "2020-01-01", + "+2020-01-01", // Spark accepts '+' year prefix on dates "02020-01-01", "002020-01-01", "0002020-01-01", diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 2188f8e9af..3ebc5197cc 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampType} import org.apache.comet.CometConf -import org.apache.comet.CometSparkSessionExtensions.withInfo +import org.apache.comet.CometSparkSessionExtensions.{isSpark40Plus, withInfo} import org.apache.comet.serde.{CometExpressionSerde, Compatible, ExprOuterClass, Incompatible, SupportLevel, Unsupported} import org.apache.comet.serde.ExprOuterClass.Expr import org.apache.comet.serde.QueryPlanSerde.{evalModeToProto, exprToProtoInternal, serializeDataType} @@ -118,6 +118,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { .getConfString(CometConf.getExprAllowIncompatConfigKey(classOf[Cast]), "false") .toBoolean) castBuilder.setTimezone(timeZoneId.getOrElse("UTC")) + castBuilder.setIsSpark4Plus(isSpark40Plus) Some( ExprOuterClass.Expr .newBuilder() @@ -216,10 +217,8 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case DataTypes.DateType => // https://github.com/apache/datafusion-comet/issues/327 Compatible(Some("Only supports years between 262143 BC and 262142 AD")) - case DataTypes.TimestampType if timeZoneId.exists(tz => tz != "UTC") => - Incompatible(Some(s"Cast will 
use UTC instead of $timeZoneId")) case DataTypes.TimestampType => - Incompatible(Some("Not all valid formats are supported")) + Compatible() case _ => unsupported(DataTypes.StringType, toType) } diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index f3eb9033b5..8a71c08eb8 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -994,15 +994,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { "213170-06-15T12:34", "213170-06-15T12:34:56", "213170-06-15T12:34:56.123456") - castTimestampTest(values.toDF("a"), DataTypes.TimestampType) + castTimestampTest(values.toDF("a"), DataTypes.TimestampType, assertNative = true) } } - ignore("cast StringType to TimestampType") { - // TODO: enable once string→timestamp is marked Compatible in CometCast.canCastFromString. - // All Spark timestamp formats are now supported natively (space separator, Z/offset suffix, - // T-prefixed and bare H:M time-only, negative years). The fuzz filter below can be removed - // when enabling the native path. 
+ test("cast StringType to TimestampType") { withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "UTC")) { val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ gen.generateStrings( dataSize, @@ -1012,16 +1008,6 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - test("cast StringType to TimestampType disabled for non-UTC timezone") { - withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Denver")) { - val values = Seq("2020-01-01T12:34:56.123456", "T2").toDF("a") - castFallbackTest( - values.toDF("a"), - DataTypes.TimestampType, - "Cast will use UTC instead of Some(America/Denver)") - } - } - test("cast StringType to TimestampType - subset of supported values") { withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "Asia/Kathmandu") { val values = Seq( @@ -1064,6 +1050,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { "2020-01-01T12:34:56Z", "2020-01-01T12:34:56+05:30", "2020-01-01T12:34:56-08:00", + // Single-digit hour offset (extract_offset_suffix supports ±H:MM) + "2020-01-01T12:34:56+5:30", // T-prefixed time-only with colon "T12:34", "T12:34:56", @@ -1073,7 +1061,45 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { "12:34:56", // Negative year "-0001-01-01T12:34:56") - castTimestampTest(values.toDF("a"), DataTypes.TimestampType) + castTimestampTest(values.toDF("a"), DataTypes.TimestampType, assertNative = true) + } + } + + test("cast StringType to TimestampType - T-hour-only whitespace handling") { + // Spark 4.0+ changed whitespace handling for T-prefixed time-only strings: + // - Spark 3.x: trims all whitespace first, so " T2" → valid timestamp + // - Spark 4.0+: raw bytes are used; leading whitespace causes the T-check to fail → null + // Comet matches the behaviour of whichever Spark version is running (controlled via + // is_spark4_plus in the Cast proto, set from CometSparkSessionExtensions.isSpark40Plus). 
+ // This test compares Comet output against Spark output for all cases — no hard-coded + // null/valid assertions needed. + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { + val values = Seq( + // Bare T-hour-only: no leading whitespace (valid on all versions) + "T2", // single-digit hour + "T23", // two-digit hour + "T0", // midnight + // Bare T-hour-only: trailing whitespace only (valid on all versions) + "T2 ", // trailing space + "T2\t", // trailing tab + "T2\n", // trailing newline + // Bare T-hour-only: leading whitespace (null on 4.0+, valid on 3.x) + " T2", // leading space + "\tT2", // leading tab + "\nT2", // leading newline + "\r\nT2", // leading CRLF + "\t T2", // tab then space + " T2", // double space + // T-hour:minute with leading whitespace (null on 4.0+, valid on 3.x) + " T2:30", + "\tT2:30", + "\nT2:30", + // Full datetime: leading whitespace (valid on all versions — full trim applies) + " 2020-01-01T12:34:56", + "\t2020-01-01T12:34:56", + "\n2020-01-01T12:34:56", + "\r\n2020-01-01T12:34:56") + castTimestampTest(values.toDF("a"), DataTypes.TimestampType, assertNative = true) } } @@ -1569,7 +1595,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (None, Some(e)) => throw e case (Some(e), None) => - fail(s"Comet should have failed with ${e.getCause.getMessage}") + val msg = if (e.getCause != null) e.getCause.getMessage else e.getMessage + fail(s"Comet should have failed with $msg") case (Some(sparkException), Some(cometException)) => val sparkMessage = if (sparkException.getCause != null) sparkException.getCause.getMessage diff --git a/spark/src/test/scala/org/apache/comet/CometDateTimeUtilsSuite.scala b/spark/src/test/scala/org/apache/comet/CometDateTimeUtilsSuite.scala index 770766134f..0d900e8105 100644 --- a/spark/src/test/scala/org/apache/comet/CometDateTimeUtilsSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometDateTimeUtilsSuite.scala @@ -53,15 +53,17 @@ class CometDateTimeUtilsSuite 
extends CometTestBase { } /** - * Same as checkCastToTimestamp but casts the result to STRING before collecting. Use for - * extreme-year values (e.g. year 294247 or -290308) where collect() overflows in - * toJavaTimestamp due to Gregorian/Julian calendar rebasing in the test harness. + * Same as checkCastToTimestamp but casts the result to BIGINT (seconds since epoch) before + * collecting. Use for extreme-year values (e.g. year 294247 or -290308) where both + * toJavaTimestamp and timestamp-to-string fail in the test harness due to calendar range + * limits. Casting to LongType is pure arithmetic on the i64 epoch value and has no range + * restriction. */ - private def checkCastToTimestampAsString(values: Seq[String]): Unit = { + private def checkCastToTimestampAsLong(values: Seq[String]): Unit = { withTempPath { dir => val df = roundtripParquet(values.toDF("a"), dir).coalesce(1) checkSparkAnswer( - df.withColumn("ts", col("a").cast(DataTypes.TimestampType).cast(DataTypes.StringType))) + df.withColumn("ts", col("a").cast(DataTypes.TimestampType).cast(DataTypes.LongType))) } } @@ -224,6 +226,8 @@ class CometDateTimeUtilsSuite extends CometTestBase { // These look like time-only but with a leading sign — invalid "+12:12:12", "-12:12:12", + // Positive year-sign prefix IS accepted by Spark for timestamps (same value as without +) + "+2020-01-01T12:34:56", // Empty / whitespace "", " ", @@ -245,9 +249,10 @@ class CometDateTimeUtilsSuite extends CometTestBase { "2021-01-01T12:30:4294967297+4294967297:30")) // Extreme-year boundary cases: collecting a TimestampType value for year 294247 or -290308 - // overflows in toJavaTimestamp due to Gregorian/Julian rebasing in the test harness. - // Cast to STRING first to avoid that while still verifying correct parsing. - checkCastToTimestampAsString( + // overflows in toJavaTimestamp, and casting to StringType fails in Comet (native engine also + // limits timestamp-to-string to ±262143 CE). 
Cast to LongType (seconds since epoch) instead — + // it is pure i64 arithmetic with no calendar range restriction. + checkCastToTimestampAsLong( Seq( // Long.MaxValue boundary — valid, equals Long.MaxValue microseconds "294247-01-10T04:00:54.775807Z", @@ -269,8 +274,33 @@ class CometDateTimeUtilsSuite extends CometTestBase { checkCastToTimestamp(Seq("2019-10-31T10:59:23Z:::")) } + test("DST spring-forward gap and fall-back overlap") { + // America/New_York: spring-forward 2020-03-08 02:00 -> 03:00 (gap [02:00, 03:00)). + // Spark advances the non-existent time forward by the gap duration: + // "2020-03-08 02:30:00" -> 03:30 EDT (UTC-4) = 2020-03-08T07:30:00Z. + // + // Fall-back 2020-11-01 02:00 EDT -> 01:00 EST (overlap [01:00, 02:00)). + // Spark picks the earlier (pre-transition) UTC instant: + // "2020-11-01 01:30:00" (EDT, UTC-4) = 2020-11-01T05:30:00Z. + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/New_York") { + checkCastToTimestamp( + Seq( + // Spring-forward gap: 02:30 does not exist, Spark advances to 03:30 EDT + "2020-03-08 02:30:00", + // Boundary: just before the gap + "2020-03-08 01:59:59", + // Boundary: just after the gap (first valid post-transition time) + "2020-03-08 03:00:00", + // Fall-back overlap: 01:30 exists twice; Spark picks earlier UTC (EDT = UTC-4) + "2020-11-01 01:30:00", + // Boundary: just before fall-back + "2020-11-01 01:59:59", + // Just after fall-back (unambiguous EST) + "2020-11-01 02:00:00")) + } + } + test("SPARK-37326: cast string to TIMESTAMP_NTZ rejects timezone offsets") { - // A value with a timezone offset should be null for TIMESTAMP_NTZ. checkCastToTimestampNTZ( Seq( // Has offset — null