diff --git a/src/repr/src/strconv.rs b/src/repr/src/strconv.rs
index 6b14bf717cdf2..847b4b77bddea 100644
--- a/src/repr/src/strconv.rs
+++ b/src/repr/src/strconv.rs
@@ -354,6 +354,319 @@ where
     format_float(buf, f)
 }
 
+// ---------------------------------------------------------------------------
+// Fast-path ISO 8601 timestamp/date/time parsers
+//
+// For the overwhelming majority of real-world data (standard ISO 8601 format
+// like "2024-06-15 14:30:25.123456"), these bypass the general-purpose
+// ParsedDateTime tokenizer+pattern-matcher, which allocates a VecDeque of
+// tokens, builds a 264-byte ParsedDateTime struct, and tries multiple format
+// templates. The fast path does direct byte-level extraction with zero heap
+// allocations. Non-standard formats fall back to the general parser.
+// ---------------------------------------------------------------------------
+
+/// Parse 2 ASCII digits at `buf[off..off+2]` as a u32 in [0, 99]; panics if `buf` is too short.
+#[inline(always)]
+fn parse_2digit(buf: &[u8], off: usize) -> Option<u32> {
+    let d0 = buf[off].wrapping_sub(b'0'); // any non-digit byte wraps to a value > 9
+    let d1 = buf[off + 1].wrapping_sub(b'0');
+    if d0 > 9 || d1 > 9 {
+        return None;
+    }
+    Some(u32::from(d0) * 10 + u32::from(d1))
+}
+
+/// Parse 4 ASCII digits at `buf[off..off+4]` as an i32 in [0, 9999]; panics if `buf` is too short.
+#[allow(clippy::as_conversions)] // u32 ≤ 9999 always fits in i32
+#[inline(always)]
+fn parse_4digit(buf: &[u8], off: usize) -> Option<i32> {
+    let d0 = u32::from(buf[off].wrapping_sub(b'0')); // any non-digit byte wraps to a value > 9
+    let d1 = u32::from(buf[off + 1].wrapping_sub(b'0'));
+    let d2 = u32::from(buf[off + 2].wrapping_sub(b'0'));
+    let d3 = u32::from(buf[off + 3].wrapping_sub(b'0'));
+    if d0 > 9 || d1 > 9 || d2 > 9 || d3 > 9 {
+        return None;
+    }
+    Some((d0 * 1000 + d1 * 100 + d2 * 10 + d3) as i32)
+}
+
+/// Parse fractional seconds (1-9 digits after '.') into nanoseconds.
+#[inline]
+fn parse_frac_nanos(buf: &[u8], start: usize, end: usize) -> Option<u32> {
+    let frac_len = end - start;
+    if frac_len == 0 || frac_len > 9 {
+        return None;
+    }
+    let mut frac: u32 = 0;
+    for &byte in &buf[start..end] {
+        let d = byte.wrapping_sub(b'0');
+        if d > 9 {
+            return None;
+        }
+        frac = frac * 10 + u32::from(d);
+    }
+    // Pad to nanoseconds: multiply by 10^(9 - frac_len)
+    static NANOS_SCALE: [u32; 10] = [
+        1_000_000_000, // 0 digits (unused)
+        100_000_000,   // 1 digit
+        10_000_000,    // 2 digits
+        1_000_000,     // 3 digits
+        100_000,       // 4 digits
+        10_000,        // 5 digits
+        1_000,         // 6 digits
+        100,           // 7 digits
+        10,            // 8 digits
+        1,             // 9 digits
+    ];
+    Some(frac * NANOS_SCALE[frac_len])
+}
+
+/// Try to parse "YYYY-MM-DD" from `buf` at offset 0. Returns `(NaiveDate, next_offset)`.
+#[inline]
+fn try_parse_date_bytes(buf: &[u8]) -> Option<(NaiveDate, usize)> {
+    if buf.len() < 10 {
+        return None;
+    }
+    let year = parse_4digit(buf, 0)?;
+    if year == 0 {
+        return None; // Year 0 is not valid; fall back to general parser for error message
+    }
+    if buf[4] != b'-' {
+        return None;
+    }
+    let month = parse_2digit(buf, 5)?;
+    if buf[7] != b'-' {
+        return None;
+    }
+    let day = parse_2digit(buf, 8)?;
+    let date = NaiveDate::from_ymd_opt(year, month, day)?;
+    Some((date, 10))
+}
+
+/// Try to parse "HH:MM:SS[.fffffffff]" from `buf` at offset `off`.
+/// Returns `(NaiveTime, next_offset)`.
+#[inline]
+fn try_parse_time_bytes(buf: &[u8], off: usize) -> Option<(NaiveTime, usize)> {
+    if buf.len() < off + 8 {
+        return None;
+    }
+    let hour = parse_2digit(buf, off)?;
+    if buf[off + 2] != b':' {
+        return None;
+    }
+    let minute = parse_2digit(buf, off + 3)?;
+    if buf[off + 5] != b':' {
+        return None;
+    }
+    let mut second = parse_2digit(buf, off + 6)?;
+    let mut pos = off + 8;
+
+    let mut nanos = if pos < buf.len() && buf[pos] == b'.' {
+        pos += 1; // skip '.'
+        let frac_start = pos;
+        // Scan digits; zero digits or more than nine make `parse_frac_nanos` return None
+        while pos < buf.len() && buf[pos].is_ascii_digit() {
+            pos += 1;
+        }
+        parse_frac_nanos(buf, frac_start, pos)?
+    } else {
+        0
+    };
+
+    // Handle leap seconds: chrono represents second=60 as second=59 with
+    // nanos >= 1_000_000_000.
+    if second == 60 {
+        second = 59;
+        nanos = nanos.saturating_add(1_000_000_000);
+    }
+
+    let time = NaiveTime::from_hms_nano_opt(hour, minute, second, nanos)?;
+    Some((time, pos))
+}
+
+/// Try to parse a fixed-offset timezone like "+00", "+00:00", "-05:30", or compact "+0530"
+/// from `buf` at offset `off`. Returns `(FixedOffset, next_offset)`.
+#[allow(clippy::as_conversions)] // u32 ≤ 99 always fits in i32
+#[inline]
+fn try_parse_tz_offset(buf: &[u8], off: usize) -> Option<(chrono::FixedOffset, usize)> {
+    if off >= buf.len() {
+        return None;
+    }
+    let sign = match buf[off] {
+        b'+' => 1i32,
+        b'-' => -1i32,
+        _ => return None,
+    };
+    let pos = off + 1;
+    if buf.len() < pos + 2 {
+        return None;
+    }
+    let tz_hours = parse_2digit(buf, pos)? as i32;
+    if tz_hours > 15 {
+        return None; // Invalid timezone hour; fall back to general parser for error message
+    }
+    let mut end = pos + 2;
+
+    let tz_minutes = if end < buf.len() && buf[end] == b':' {
+        end += 1;
+        if buf.len() < end + 2 {
+            return None;
+        }
+        let m = parse_2digit(buf, end)? as i32;
+        if m >= 60 {
+            return None; // Invalid timezone minute; fall back to general parser for error message
+        }
+        end += 2;
+        m
+    } else if end + 2 <= buf.len() {
+        // Try compact format like "+0530" (no colon); otherwise leave `end` after the hours
+        if let Some(m) = parse_2digit(buf, end) {
+            if m >= 60 {
+                return None;
+            }
+            end += 2;
+            m as i32
+        } else {
+            0
+        }
+    } else {
+        0
+    };
+
+    let total_secs = sign * (tz_hours * 3600 + tz_minutes * 60);
+    let offset = chrono::FixedOffset::east_opt(total_secs)?;
+    Some((offset, end))
+}
+
+/// Fast-path: try to parse "YYYY-MM-DD HH:MM:SS[.fff...]" as NaiveDateTime.
+/// Returns None if the format doesn't match (falls back to general parser).
+#[inline]
+fn try_parse_timestamp_fast(s: &str) -> Option<NaiveDateTime> {
+    let buf = s.as_bytes();
+    let (date, date_end) = try_parse_date_bytes(buf)?;
+    if date_end >= buf.len() {
+        return None;
+    }
+    // Accept ' ', 'T', or 't' as the date-time separator
+    let sep = buf[date_end];
+    if !matches!(sep, b' ' | b'T' | b't') {
+        return None;
+    }
+    let (time, time_end) = try_parse_time_bytes(buf, date_end + 1)?;
+    // Must consume entire string
+    if time_end != buf.len() {
+        return None;
+    }
+    Some(date.and_time(time))
+}
+
+/// Fast-path: try to parse `"YYYY-MM-DD HH:MM:SS[.fff...]{+|-}HH[:MM]"` as `DateTime<Utc>`.
+/// Returns None if the format doesn't match (falls back to general parser).
+#[inline]
+fn try_parse_timestamptz_fast(s: &str) -> Option<DateTime<Utc>> {
+    let buf = s.as_bytes();
+    let (date, date_end) = try_parse_date_bytes(buf)?;
+    if date_end >= buf.len() {
+        return None;
+    }
+    let sep = buf[date_end];
+    if !matches!(sep, b' ' | b'T' | b't') {
+        return None;
+    }
+    let (time, time_end) = try_parse_time_bytes(buf, date_end + 1)?;
+    // Must have a timezone offset remaining
+    if time_end >= buf.len() {
+        return None;
+    }
+    let (offset, tz_end) = try_parse_tz_offset(buf, time_end)?;
+    // Must consume entire string
+    if tz_end != buf.len() {
+        return None;
+    }
+    let dt = date.and_time(time);
+    Some(DateTime::from_naive_utc_and_offset(dt - offset, Utc))
+}
+
+/// Fast-path: try to parse "YYYY-MM-DD" (exactly 10 bytes) as NaiveDate.
+#[inline]
+fn try_parse_date_fast(s: &str) -> Option<NaiveDate> {
+    let buf = s.as_bytes();
+    if buf.len() != 10 {
+        return None;
+    }
+    let (date, _) = try_parse_date_bytes(buf)?;
+    Some(date)
+}
+
+/// Fast-path: try to parse "HH:MM:SS[.fff...]" as NaiveTime; the whole string must be consumed.
+#[inline]
+fn try_parse_time_fast(s: &str) -> Option<NaiveTime> {
+    let buf = s.as_bytes();
+    let (time, end) = try_parse_time_bytes(buf, 0)?;
+    if end != buf.len() {
+        return None;
+    }
+    Some(time)
+}
+
+/// Parse a timestamp using only the general ParsedDateTime path (for benchmarking).
+#[doc(hidden)]
+pub fn parse_timestamp_general(s: &str) -> Result<CheckedTimestamp<NaiveDateTime>, ParseError> {
+    match parse_timestamp_string(s) {
+        Ok((date, time, _)) => CheckedTimestamp::from_timestamplike(date.and_time(time))
+            .map_err(|_| ParseError::out_of_range("timestamp", s)),
+        Err(e) => Err(ParseError::invalid_input_syntax("timestamp", s).with_details(e)),
+    }
+}
+
+/// Parse a timestamptz using only the general ParsedDateTime path (for benchmarking).
+#[doc(hidden)]
+pub fn parse_timestamptz_general(s: &str) -> Result<CheckedTimestamp<DateTime<Utc>>, ParseError> {
+    parse_timestamp_string(s)
+        .and_then(|(date, time, timezone)| {
+            use Timezone::*;
+            let mut dt = date.and_time(time);
+            let offset = match timezone {
+                FixedOffset(offset) => offset,
+                Tz(tz) => match tz.offset_from_local_datetime(&dt).latest() {
+                    Some(offset) => offset.fix(),
+                    None => {
+                        dt += Duration::try_hours(1).unwrap();
+                        tz.offset_from_local_datetime(&dt)
+                            .latest()
+                            .ok_or_else(|| "invalid timezone conversion".to_owned())?
+                            .fix()
+                    }
+                },
+            };
+            Ok(DateTime::from_naive_utc_and_offset(dt - offset, Utc))
+        })
+        .map_err(|e| {
+            ParseError::invalid_input_syntax("timestamp with time zone", s).with_details(e)
+        })
+        .and_then(|ts| {
+            CheckedTimestamp::from_timestamplike(ts)
+                .map_err(|_| ParseError::out_of_range("timestamp with time zone", s))
+        })
+}
+
+/// Parse a date using only the general ParsedDateTime path (for benchmarking).
+#[doc(hidden)]
+pub fn parse_date_general(s: &str) -> Result<Date, ParseError> {
+    match parse_timestamp_string(s) {
+        Ok((date, _, _)) => Date::try_from(date).map_err(|_| ParseError::out_of_range("date", s)),
+        Err(e) => Err(ParseError::invalid_input_syntax("date", s).with_details(e)),
+    }
+}
+
+/// Parse a time using only the general ParsedDateTime path (for benchmarking).
+#[doc(hidden)]
+pub fn parse_time_general(s: &str) -> Result<NaiveTime, ParseError> {
+    ParsedDateTime::build_parsed_datetime_time(s)
+        .and_then(|pdt| pdt.compute_time())
+        .map_err(|e| ParseError::invalid_input_syntax("time", s).with_details(e))
+}
+
 /// Use the following grammar to parse `s` into:
 ///
 /// - `NaiveDate`
@@ -406,6 +719,11 @@ fn parse_timestamp_string(s: &str) -> Result<(NaiveDate, NaiveTime, Timezone), S
 
 /// Parses a [`Date`] from `s`.
 pub fn parse_date(s: &str) -> Result<Date, ParseError> {
+    // Fast path for "YYYY-MM-DD" (10 chars, the overwhelmingly common format)
+    if let Some(date) = try_parse_date_fast(s) {
+        return Date::try_from(date).map_err(|_| ParseError::out_of_range("date", s));
+    }
+    // Fall back to general parser for exotic formats
     match parse_timestamp_string(s) {
         Ok((date, _, _)) => Date::try_from(date).map_err(|_| ParseError::out_of_range("date", s)),
         Err(e) => Err(ParseError::invalid_input_syntax("date", s).with_details(e)),
@@ -419,9 +737,17 @@ where
 {
     let d: NaiveDate = d.into();
     let (year_ad, year) = d.year_ce();
-    write!(buf, "{:04}-{}", year, d.format("%m-%d"));
+    write_year(buf, year);
+    // Build "-MM-DD" in a stack buffer (6 bytes).
+    let mut tmp = [0u8; 6];
+    tmp[0] = b'-';
+    write_u2(&mut tmp, 1, d.month());
+    tmp[3] = b'-';
+    write_u2(&mut tmp, 4, d.day());
+    // SAFETY: all bytes are ASCII digits or '-'.
+    buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp) });
     if !year_ad {
-        write!(buf, " BC");
+        buf.write_str(" BC");
     }
     Nestable::Yes
 }
@@ -434,23 +760,49 @@ where
 /// [ [ ] ]
 /// ```
 pub fn parse_time(s: &str) -> Result<NaiveTime, ParseError> {
+    // Fast path for "HH:MM:SS[.ffffff]"
+    if let Some(time) = try_parse_time_fast(s) {
+        return Ok(time);
+    }
+    // Fall back to general parser
     ParsedDateTime::build_parsed_datetime_time(s)
         .and_then(|pdt| pdt.compute_time())
         .map_err(|e| ParseError::invalid_input_syntax("time", s).with_details(e))
 }
 
-/// Writes a [`NaiveDateTime`] timestamp to `buf`.
+/// Writes a [`NaiveTime`] to `buf`.
 pub fn format_time<F>(buf: &mut F, t: NaiveTime) -> Nestable
 where
     F: FormatBuffer,
 {
-    write!(buf, "{}", t.format("%H:%M:%S"));
-    format_nanos_to_micros(buf, t.nanosecond());
+    // Chrono represents leap seconds as second=59, nanos >= 1_000_000_000.
+    let nanos = t.nanosecond();
+    let (second, frac_nanos) = if nanos >= 1_000_000_000 {
+        (t.second() + 1, nanos - 1_000_000_000)
+    } else {
+        (t.second(), nanos)
+    };
+    // Build "HH:MM:SS" directly in a stack buffer (8 bytes).
+    let mut tmp = [0u8; 8];
+    write_u2(&mut tmp, 0, t.hour());
+    tmp[2] = b':';
+    write_u2(&mut tmp, 3, t.minute());
+    tmp[5] = b':';
+    write_u2(&mut tmp, 6, second);
+    // SAFETY: all bytes are ASCII digits or ':'.
+    buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp) });
+    format_nanos_to_micros(buf, frac_nanos);
     Nestable::Yes
 }
 
 /// Parses a `NaiveDateTime` from `s`.
 pub fn parse_timestamp(s: &str) -> Result<CheckedTimestamp<NaiveDateTime>, ParseError> {
+    // Fast path for "YYYY-MM-DD HH:MM:SS[.ffffff]"
+    if let Some(ndt) = try_parse_timestamp_fast(s) {
+        return CheckedTimestamp::from_timestamplike(ndt)
+            .map_err(|_| ParseError::out_of_range("timestamp", s));
+    }
+    // Fall back to general parser for exotic formats
     match parse_timestamp_string(s) {
         Ok((date, time, _)) => CheckedTimestamp::from_timestamplike(date.and_time(time))
             .map_err(|_| ParseError::out_of_range("timestamp", s)),
@@ -463,11 +815,33 @@ pub fn format_timestamp<F>(buf: &mut F, ts: &NaiveDateTime) -> Nestable
 where
     F: FormatBuffer,
 {
+    // Chrono represents leap seconds as second=59, nanos >= 1_000_000_000. Fold the extra
+    // second into the seconds field so the fraction passed on stays below one second.
+    let nanos = ts.nanosecond();
+    let (second, frac_nanos) = if nanos >= 1_000_000_000 {
+        (ts.second() + 1, nanos - 1_000_000_000)
+    } else {
+        (ts.second(), nanos)
+    };
     let (year_ad, year) = ts.year_ce();
-    write!(buf, "{:04}-{}", year, ts.format("%m-%d %H:%M:%S"));
-    format_nanos_to_micros(buf, ts.and_utc().timestamp_subsec_nanos());
+    write_year(buf, year);
+    // Build "-MM-DD HH:MM:SS" in a stack buffer (15 bytes).
+    let mut tmp = [0u8; 15];
+    tmp[0] = b'-';
+    write_u2(&mut tmp, 1, ts.month());
+    tmp[3] = b'-';
+    write_u2(&mut tmp, 4, ts.day());
+    tmp[6] = b' ';
+    write_u2(&mut tmp, 7, ts.hour());
+    tmp[9] = b':';
+    write_u2(&mut tmp, 10, ts.minute());
+    tmp[12] = b':';
+    write_u2(&mut tmp, 13, second);
+    // SAFETY: all bytes are ASCII digits, '-', ' ', or ':'.
+    buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp) });
+    format_nanos_to_micros(buf, frac_nanos);
     if !year_ad {
-        write!(buf, " BC");
+        buf.write_str(" BC");
     }
     // This always needs escaping because of the whitespace
     Nestable::MayNeedEscaping
@@ -475,6 +849,12 @@ where
 
 /// Parses a `DateTime<Utc>` from `s`. See `mz_expr::scalar::func::timezone_timestamp` for timezone anomaly considerations.
 pub fn parse_timestamptz(s: &str) -> Result<CheckedTimestamp<DateTime<Utc>>, ParseError> {
+    // Fast path for "YYYY-MM-DD HH:MM:SS[.fff...]{+|-}HH[:MM]"
+    if let Some(dt) = try_parse_timestamptz_fast(s) {
+        return CheckedTimestamp::from_timestamplike(dt)
+            .map_err(|_| ParseError::out_of_range("timestamp with time zone", s));
+    }
+    // Fall back to general parser for named timezones, exotic formats, etc.
     parse_timestamp_string(s)
         .and_then(|(date, time, timezone)| {
             use Timezone::*;
@@ -508,12 +888,33 @@ pub fn format_timestamptz<F>(buf: &mut F, ts: &DateTime<Utc>) -> Nestable
 where
     F: FormatBuffer,
 {
+    // Chrono represents leap seconds as second=59, nanos >= 1_000_000_000.
+    let nanos = ts.nanosecond();
+    let (second, frac_nanos) = if nanos >= 1_000_000_000 {
+        (ts.second() + 1, nanos - 1_000_000_000)
+    } else {
+        (ts.second(), nanos)
+    };
     let (year_ad, year) = ts.year_ce();
-    write!(buf, "{:04}-{}", year, ts.format("%m-%d %H:%M:%S"));
-    format_nanos_to_micros(buf, ts.timestamp_subsec_nanos());
-    write!(buf, "+00");
+    write_year(buf, year);
+    // Build "-MM-DD HH:MM:SS" in a stack buffer (15 bytes).
+    let mut tmp = [0u8; 15];
+    tmp[0] = b'-';
+    write_u2(&mut tmp, 1, ts.month());
+    tmp[3] = b'-';
+    write_u2(&mut tmp, 4, ts.day());
+    tmp[6] = b' ';
+    write_u2(&mut tmp, 7, ts.hour());
+    tmp[9] = b':';
+    write_u2(&mut tmp, 10, ts.minute());
+    tmp[12] = b':';
+    write_u2(&mut tmp, 13, second);
+    // SAFETY: all bytes are ASCII digits, '-', ' ', or ':'.
+    buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp) });
+    format_nanos_to_micros(buf, frac_nanos);
+    buf.write_str("+00");
     if !year_ad {
-        write!(buf, " BC");
+        buf.write_str(" BC");
     }
     // This always needs escaping because of the whitespace
     Nestable::MayNeedEscaping
@@ -743,6 +1144,35 @@ where
     Nestable::Yes
 }
 
+/// Writes a 2-digit zero-padded value (0-99) into `buf` at `offset`.
+#[allow(clippy::as_conversions)] // u32 single digit (0-9) always fits in u8
+#[inline(always)]
+fn write_u2(buf: &mut [u8], offset: usize, val: u32) {
+    buf[offset] = b'0' + (val / 10) as u8;
+    buf[offset + 1] = b'0' + (val % 10) as u8;
+}
+
+/// Writes a year with at least 4-digit zero-padding directly to a FormatBuffer.
+#[allow(clippy::as_conversions)] // u32 single digit (0-9) always fits in u8
+#[inline]
+fn write_year<F: FormatBuffer>(buf: &mut F, year: u32) {
+    // Common case: years 0-9999, zero-padded to four digits.
+    if year <= 9999 {
+        let tmp = [
+            b'0' + (year / 1000) as u8,
+            b'0' + ((year / 100) % 10) as u8,
+            b'0' + ((year / 10) % 10) as u8,
+            b'0' + (year % 10) as u8,
+        ];
+        // SAFETY: all bytes are ASCII digits.
+        buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp) });
+    } else {
+        // Rare: years > 9999 (e.g. year 20000).
+        write!(buf, "{}", year);
+    }
+}
+
+#[allow(clippy::as_conversions)] // u32 single digit (0-9) always fits in u8
 fn format_nanos_to_micros<F>(buf: &mut F, nanos: u32)
 where
     F: FormatBuffer,
@@ -753,13 +1183,24 @@ where
         if rem >= 500 {
             micros += 1;
         }
-        // strip trailing zeros
-        let mut width = 6;
-        while micros % 10 == 0 {
-            width -= 1;
-            micros /= 10;
+        // Build ".NNNNNN" in a stack buffer (".NNNNNNN" if micros overflows six digits, e.g.
+        // when rounding up or when a caller passes leap-second nanos), then trim trailing zeros.
+        let digits: usize = if micros >= 1_000_000 { 7 } else { 6 };
+        let buf_len = 1 + digits;
+        let mut tmp = [b'0'; 8]; // max ".0000000"
+        tmp[0] = b'.';
+        let mut m = micros;
+        for i in (1..=digits).rev() {
+            tmp[i] = b'0' + (m % 10) as u8;
+            m /= 10;
+        }
+        // Strip trailing zeros (but keep at least one fractional digit).
+        let mut end = buf_len;
+        while end > 2 && tmp[end - 1] == b'0' {
+            end -= 1;
         }
-        write!(buf, ".{:0width$}", micros, width = width);
+        // SAFETY: all bytes are ASCII digits or '.'.
+        buf.write_str(unsafe { std::str::from_utf8_unchecked(&tmp[..end]) });
     }
 }
 