diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index b5a382ca6a832..38a763504282f 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -18,9 +18,8 @@ use clap::{ColorChoice, Parser, ValueEnum}; use datafusion::common::instant::Instant; use datafusion::common::utils::get_available_parallelism; -use datafusion::common::{ - DataFusionError, HashMap, Result, exec_datafusion_err, exec_err, -}; +use datafusion::common::{DataFusionError, Result, exec_datafusion_err, exec_err}; +use datafusion_sqllogictest::TestFile; use datafusion_sqllogictest::{ CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip, Filter, TestContext, df_value_validator, read_dir_recursive, setup_scratch_dir, @@ -44,13 +43,12 @@ use crate::postgres_container::{ }; use datafusion::common::runtime::SpawnedTask; use futures::FutureExt; -use std::ffi::OsStr; use std::fs; use std::io::{IsTerminal, stderr, stdout}; use std::path::{Path, PathBuf}; use std::str::FromStr; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::{Arc, LazyLock}; use std::time::Duration; #[cfg(feature = "postgres")] @@ -59,6 +57,7 @@ mod postgres_container; const TEST_DIRECTORY: &str = "test_files/"; const DATAFUSION_TESTING_TEST_DIRECTORY: &str = "../../datafusion-testing/data/"; const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_"; +const TPCH_PREFIX: &str = "tpch"; const SQLITE_PREFIX: &str = "sqlite"; const ERRS_PER_FILE_LIMIT: usize = 10; const TIMING_DEBUG_SLOW_FILES_ENV: &str = "SLT_TIMING_DEBUG_SLOW_FILES"; @@ -77,55 +76,6 @@ struct FileTiming { elapsed: Duration, } -/// TEST PRIORITY -/// -/// Heuristically prioritize some test to run earlier. -/// -/// Prioritizes test to run earlier if they are known to be long running (as -/// each test file itself is run sequentially, but multiple test files are run -/// in parallel. -/// -/// Tests not listed here will run after the listed tests in an arbitrary order. -/// -/// You can find the top longest running tests by running `--timing-summary` mode. -/// For example -/// -/// ```shell -/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top -/// ... -/// Per-file elapsed summary (deterministic): -/// 1. 5.375s push_down_filter_regression.slt -/// 2. 3.174s aggregate.slt -/// 3. 3.158s imdb.slt -/// 4. 2.793s joins.slt -/// 5. 2.505s array.slt -/// 6. 2.265s aggregate_skip_partial.slt -/// 7. 2.260s window.slt -/// 8. 1.677s group_by.slt -/// 9. 0.973s datetime/timestamps.slt -/// 10. 0.822s cte.slt -/// ``` -static TEST_PRIORITY: LazyLock> = LazyLock::new(|| { - [ - (PathBuf::from("push_down_filter_regression.slt"), 0), // longest running, so run first. - (PathBuf::from("aggregate.slt"), 1), - (PathBuf::from("joins.slt"), 2), - (PathBuf::from("imdb.slt"), 3), - (PathBuf::from("array.slt"), 4), - (PathBuf::from("aggregate_skip_partial.slt"), 5), - (PathBuf::from("window.slt"), 6), - (PathBuf::from("group_by.slt"), 7), - (PathBuf::from("datetime/timestamps.slt"), 8), - (PathBuf::from("cte.slt"), 9), - ] - .into_iter() - .collect() -}); - -/// Default priority for tests not in the TEST_PRIORITY map. Tests with lower -/// priority values run first. -static DEFAULT_PRIORITY: usize = 100; - pub fn main() -> Result<()> { tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -832,91 +782,35 @@ async fn run_complete_file_with_postgres( plan_err!("Can not run with postgres as postgres feature is not enabled") } -/// Represents a parsed test file -#[derive(Debug)] -struct TestFile { - /// The absolute path to the file - pub path: PathBuf, - /// The relative path of the file (used for display) - pub relative_path: PathBuf, -} - -impl TestFile { - fn new(path: PathBuf) -> Self { - let p = path.to_string_lossy(); - let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) { - p.strip_prefix(TEST_DIRECTORY).unwrap() - } else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) { - p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap() - } else { - "" - }); - - Self { - path, - relative_path, - } - } - - fn is_slt_file(&self) -> bool { - self.path.extension() == Some(OsStr::new("slt")) - } - - fn check_sqlite(&self, options: &Options) -> bool { - if !self.relative_path.starts_with(SQLITE_PREFIX) { - return true; - } - - options.include_sqlite - } - - fn check_tpch(&self, options: &Options) -> bool { - if !self.relative_path.starts_with("tpch") { - return true; - } +fn read_test_files(options: &Options) -> Result> { + let prefixes: &[&str] = if options.include_sqlite { + &[TEST_DIRECTORY, DATAFUSION_TESTING_TEST_DIRECTORY] + } else { + &[TEST_DIRECTORY] + }; - options.include_tpch - } -} + let directories = prefixes + .iter() + .map(|prefix| { + read_dir_recursive(prefix).map_err(|e| { + exec_datafusion_err!("Error reading test directory {prefix}: {e}") + }) + }) + .collect::>>()?; -fn read_test_files(options: &Options) -> Result> { - let mut paths = read_dir_recursive(TEST_DIRECTORY)? + let mut paths = directories .into_iter() - .map(TestFile::new) + .flatten() + .map(|p| TestFile::new(p, prefixes)) .filter(|f| options.check_test_file(&f.path)) .filter(|f| f.is_slt_file()) - .filter(|f| f.check_tpch(options)) - .filter(|f| f.check_sqlite(options)) + .filter(|f| !f.relative_path_starts_with(TPCH_PREFIX) || options.include_tpch) + .filter(|f| !f.relative_path_starts_with(SQLITE_PREFIX) || options.include_sqlite) .filter(|f| options.check_pg_compat_file(f.path.as_path())) .collect::>(); - if options.include_sqlite { - let mut sqlite_paths = read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)? - .into_iter() - .map(TestFile::new) - .filter(|f| options.check_test_file(&f.path)) - .filter(|f| f.is_slt_file()) - .filter(|f| f.check_sqlite(options)) - .filter(|f| options.check_pg_compat_file(f.path.as_path())) - .collect::>(); - - paths.append(&mut sqlite_paths) - } - Ok(sort_tests(paths)) -} - -/// Sort the tests heuristically by order of "priority" -/// -/// Prioritizes test to run earlier if they are known to be long running (as -/// each test file itself is run sequentially, but multiple test files are run -/// in parallel. -fn sort_tests(mut tests: Vec) -> Vec { - tests.sort_by_key(|f| { - TEST_PRIORITY - .get(&f.relative_path) - .unwrap_or(&DEFAULT_PRIORITY) - }); - tests + paths.sort_unstable(); + Ok(paths) } /// Parsed command line options diff --git a/datafusion/sqllogictest/src/lib.rs b/datafusion/sqllogictest/src/lib.rs index f228ee89abbe8..bb12c58bdcc20 100644 --- a/datafusion/sqllogictest/src/lib.rs +++ b/datafusion/sqllogictest/src/lib.rs @@ -27,6 +27,7 @@ //! DataFusion sqllogictest driver mod engines; +mod test_file; pub use engines::CurrentlyExecutingSqlTracker; pub use engines::DFColumnType; @@ -46,4 +47,5 @@ mod util; pub use filters::*; pub use test_context::TestContext; +pub use test_file::TestFile; pub use util::*; diff --git a/datafusion/sqllogictest/src/test_file.rs b/datafusion/sqllogictest/src/test_file.rs new file mode 100644 index 0000000000000..c44cae133639b --- /dev/null +++ b/datafusion/sqllogictest/src/test_file.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::ffi::OsStr; +use std::path::{Path, PathBuf}; +use std::sync::LazyLock; + +/// Represents a parsed test file +/// +/// Note there is a custom Ord implementation that sorts test files by: +/// 1. Hard coded test priority (lower runs first), +/// 2. Relative path as deterministic tie-breaker. +#[derive(Debug, PartialEq, Eq)] +pub struct TestFile { + /// The absolute path to the file + pub path: PathBuf, + /// The relative path of the file (used for display) + pub relative_path: PathBuf, +} + +impl TestFile { + /// Create a new [`TestFile`] from the given path, stripping any of the + /// known test directory prefixes for the relative path. + pub fn new(path: PathBuf, prefixes: &[&str]) -> Self { + let p = path.to_string_lossy(); + for prefix in prefixes { + if p.starts_with(prefix) { + let relative_path = PathBuf::from(p.strip_prefix(prefix).unwrap()); + return Self { + path, + relative_path, + }; + } + } + let relative_path = PathBuf::from(""); + + Self { + path, + relative_path, + } + } + + /// Returns true if the file has a .slt extension, indicating it is a sqllogictest file. + pub fn is_slt_file(&self) -> bool { + self.path.extension() == Some(OsStr::new("slt")) + } + + /// Returns true if the relative path starts with the given prefix, which + /// can be used to filter tests by subdirectory or filename patterns. + pub fn relative_path_starts_with(&self, prefix: impl AsRef) -> bool { + self.relative_path.starts_with(prefix) + } +} + +impl PartialOrd for TestFile { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for TestFile { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + let self_path = &self.relative_path; + let other_path = &other.relative_path; + + let priority_self = TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY); + let priority_other = TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY); + + priority_self + .cmp(priority_other) + .then_with(|| self_path.cmp(other_path)) // Tie-breaker: lexicographic order of relative paths. + // Final tie-breaker keeps Ord consistent with Eq when relative paths collide. + .then_with(|| self.path.cmp(&other.path)) + } +} + +/// TEST PRIORITY +/// +/// Heuristically prioritize some test to run earlier. +/// +/// Prioritizes test to run earlier if they are known to be long running (as +/// each test file itself is run sequentially, but multiple test files are run +/// in parallel. +/// +/// Tests not listed here will run after the listed tests in deterministic +/// lexicographic order by relative path. +/// +/// You can find the top longest running tests by running `--timing-summary` +/// mode. For example +/// +/// ```shell +/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top +/// ... +/// Per-file elapsed summary (deterministic): +/// 1. 3.568s aggregate.slt +/// 2. 3.464s joins.slt +/// 3. 3.336s imdb.slt +/// 4. 3.085s push_down_filter_regression.slt +/// 5. 2.926s aggregate_skip_partial.slt +/// 6. 2.453s array.slt +/// 7. 2.399s window.slt +/// 8. 2.198s group_by.slt +/// 9. 1.281s clickbench.slt +/// 10. 1.058s datetime/timestamps.slt +/// ``` +const TEST_PRIORITY_ENTRIES: &[&str] = &[ + "aggregate.slt", // longest-running files go first + "joins.slt", + "imdb.slt", + "push_down_filter_regression.slt", + "aggregate_skip_partial.slt", + "array.slt", + "window.slt", + "group_by.slt", + "clickbench.slt", + "datetime/timestamps.slt", +]; + +/// Default priority for tests not in the priority map. Tests with lower +/// priority values run first. +const DEFAULT_PRIORITY: usize = 100; + +static TEST_PRIORITY: LazyLock> = LazyLock::new(|| { + TEST_PRIORITY_ENTRIES + .iter() + .enumerate() + .map(|(priority, path)| (PathBuf::from(path), priority)) + .collect() +}); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn prioritized_files_are_first() { + let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"]; + input.extend(TEST_PRIORITY_ENTRIES.iter()); + input.push("q_unlisted.slt"); + + let mut sorted = to_test_files(input); + sorted.sort_unstable(); + + println!("Sorted input: {sorted:?}"); + + // the prioritized files should be first, in the order specified by TEST_PRIORITY_ENTRIES + for file in sorted.iter().take(TEST_PRIORITY_ENTRIES.len()) { + assert!( + TEST_PRIORITY.contains_key(&file.relative_path), + "Expected prioritized file {file:?} not found in input {sorted:?}" + ); + } + // last three files should be the unlisted ones in deterministic order + let expected_files = + to_test_files(["a_unlisted.slt", "q_unlisted.slt", "z_unlisted.slt"]); + assert!( + sorted.ends_with(&expected_files), + "Expected unlisted files {expected_files:?} at the end in deterministic order of {sorted:?}" + ); + } + + fn to_test_files<'a>(files: impl IntoIterator) -> Vec { + files + .into_iter() + .map(|f| TestFile { + path: PathBuf::from(f), + relative_path: PathBuf::from(f), + }) + .collect() + } +}