
Commit d3ad9d2

refactor based on PR comments
1 parent 1d3505e commit d3ad9d2

7 files changed: +62 -62 lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 6 deletions
@@ -65,12 +65,6 @@ jobs:
           path: testdata/tpcds/data/**
           retention-days: 7
           if-no-files-found: ignore
-      - name: Clean up test data
-        run: |
-          rm -rf testdata/tpcds/data/*
-          rm -f $HOME/.local/bin/duckdb
-          rm -rf /home/runner/.duckdb
-          df -h
 
   format-check:
     runs-on: ubuntu-latest

src/test_utils/property_based.rs

Lines changed: 25 additions & 22 deletions
@@ -1,18 +1,17 @@
 use arrow::{
-    array::{ArrayRef, UInt32Array},
+    array::{ArrayRef, Float16Array, Float32Array, Float64Array, UInt32Array},
     compute::{SortColumn, concat_batches, lexsort_to_indices},
     record_batch::RecordBatch,
 };
 use datafusion::{
     common::{internal_datafusion_err, internal_err},
     error::{DataFusionError, Result},
-    execution::context::SessionContext,
     physical_expr::LexOrdering,
     physical_plan::ExecutionPlan,
 };
 use std::sync::Arc;
 
-/// compares the set of record batches for equality
+/// compares the set of record batches for equality
 pub async fn compare_result_set(
     actual_result: &Result<Vec<RecordBatch>>,
     expected_result: &Result<Vec<RecordBatch>>,
@@ -21,10 +20,7 @@ pub async fn compare_result_set(
         Ok(batches) => batches,
         Err(e) => {
             if expected_result.is_ok() {
-                return internal_err!(
-                    "expected no error but got: {}",
-                    e
-                );
+                return internal_err!("expected no error but got: {}", e);
             }
             return Ok(()); // Both errored, so the query is valid
         }
@@ -34,10 +30,7 @@ pub async fn compare_result_set(
         Ok(batches) => batches,
         Err(e) => {
             if actual_result.is_ok() {
-                return internal_err!(
-                    "expected error but got none, error: {}",
-                    e
-                );
+                return internal_err!("expected error but got none, error: {}", e);
             }
             return Ok(()); // Both errored, so the query is valid
         }
@@ -47,7 +40,7 @@ pub async fn compare_result_set(
         .map_err(|e| internal_datafusion_err!("result sets were not equal: {}", e))
 }
 
-// Ensures that the plans have the same ordering properties and that the actual result is sorted
+// Ensures that the plans have the same ordering properties and that the actual result is sorted
 // correctly.
 pub async fn compare_ordering(
     actual_physical_plan: Arc<dyn ExecutionPlan>,
@@ -203,6 +196,12 @@ fn batch_rows_to_strings(batches: &[RecordBatch]) -> Vec<String> {
 
         if array.is_null(row_idx) {
             row_values.push("NULL".to_string());
+        } else if let Some(arr) = array.as_any().downcast_ref::<Float16Array>() {
+            row_values.push(format!("{:.1$}", arr.value(row_idx), 2));
+        } else if let Some(arr) = array.as_any().downcast_ref::<Float32Array>() {
+            row_values.push(format!("{:.1$}", arr.value(row_idx), 2));
+        } else if let Some(arr) = array.as_any().downcast_ref::<Float64Array>() {
+            row_values.push(format!("{:.1$}", arr.value(row_idx), 2));
         } else {
             // Use Arrow's deterministic string representation
             let value_str = array_value_to_string(array, row_idx)
@@ -282,6 +281,8 @@ mod tests {
 
     use arrow::array::{Int32Array, StringArray};
     use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::physical_plan::collect;
+    use datafusion::prelude::SessionContext;
 
     use std::sync::Arc;
 
@@ -438,25 +439,27 @@ mod tests {
 
         // Query which sorted by id should pass.
         let ordered_query = "SELECT * FROM test_table ORDER BY id";
+
         let df = actual_ctx.sql(ordered_query).await.unwrap();
-        let result = df.collect().await;
+        let task_ctx = actual_ctx.task_ctx();
+        let actual_plan = df.create_physical_plan().await.unwrap();
+        let results = collect(actual_plan.clone(), task_ctx).await;
+
+        let df = expected_ctx.sql(ordered_query).await.unwrap();
+        let expected_plan = df.create_physical_plan().await.unwrap();
+
         assert!(
-            compare_ordering(&actual_ctx, &expected_ctx, ordered_query, &result)
+            compare_ordering(actual_plan.clone(), expected_plan.clone(), &results)
                 .await
                 .is_ok()
         );
 
         // This should fail because the batch is not sorted by value
         let result = Ok(vec![batch]);
         assert!(
-            compare_ordering(
-                &actual_ctx,
-                &expected_ctx,
-                "SELECT * FROM test_table ORDER BY value",
-                &result
-            )
-            .await
-            .is_err()
+            compare_ordering(actual_plan.clone(), expected_plan.clone(), &result)
                .await
                .is_err()
         );
     }
 }
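The new Float16/Float32/Float64 arms normalize floating-point cells to two decimal places before rows are compared as strings, so distributed and single-node plans that aggregate in different orders do not fail on last-bit differences. A minimal standalone sketch of the downcast-and-format pattern follows; the normalize_float_cell helper name is hypothetical and not part of this commit, and Float16 is omitted because it needs the half crate:

use arrow::array::{Array, ArrayRef, Float32Array, Float64Array};

// Hypothetical helper mirroring the technique added to batch_rows_to_strings:
// render float cells with fixed precision so 1.0000001 and 1.0000002 compare equal.
fn normalize_float_cell(array: &ArrayRef, row_idx: usize) -> Option<String> {
    if array.is_null(row_idx) {
        return Some("NULL".to_string());
    }
    if let Some(arr) = array.as_any().downcast_ref::<Float32Array>() {
        return Some(format!("{:.2}", arr.value(row_idx)));
    }
    if let Some(arr) = array.as_any().downcast_ref::<Float64Array>() {
        return Some(format!("{:.2}", arr.value(row_idx)));
    }
    None // non-float types keep their exact string representation
}

Note that format!("{:.2}", x) is equivalent to the format!("{:.1$}", x, 2) form used in the diff; rounding to two decimals is a blunt tolerance, applied uniformly to all three float widths.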

src/test_utils/tpcds.rs

Lines changed: 4 additions & 15 deletions
@@ -70,12 +70,8 @@ pub fn queries() -> Result<Vec<(String, String)>> {
 
 /// Load a single TPC-DS query by ID (1-99).
 pub fn get_test_tpcds_query(id: usize) -> Result<String> {
-    if id < 1 || id > 99 {
-        return internal_err!("Query ID must be between 1 and 99, got {}", id);
-    }
-
     let queries_dir = get_queries_dir();
-
+
     if !queries_dir.exists() {
         return internal_err!(
             "TPC-DS queries directory not found: {}",
@@ -84,21 +80,14 @@ pub fn get_test_tpcds_query(id: usize) -> Result<String> {
     }
 
     let query_file = queries_dir.join(format!("q{}.sql", id));
-
+
     if !query_file.exists() {
-        return internal_err!(
-            "Query file not found: {}",
-            query_file.display()
-        );
+        return internal_err!("Query file not found: {}", query_file.display());
     }
 
     let query_sql = fs::read_to_string(&query_file)
         .map_err(|e| {
-            internal_datafusion_err!(
-                "Failed to read query file {}: {}",
-                query_file.display(),
-                e
-            )
+            internal_datafusion_err!("Failed to read query file {}: {}", query_file.display(), e)
         })?
         .trim()
         .to_string();
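With the explicit 1-99 bounds check removed, an out-of-range ID now surfaces as the "Query file not found" error from the file-existence check rather than a dedicated message. A hedged usage sketch, assuming the public path shown in the tests/tpcds_test.rs imports:

use datafusion::error::Result;
use datafusion_distributed::test_utils::tpcds::get_test_tpcds_query;

fn main() -> Result<()> {
    // Reads testdata/tpcds/queries/q1.sql and returns the trimmed SQL text.
    let sql = get_test_tpcds_query(1)?;
    assert!(!sql.is_empty());

    // IDs outside 1-99 now fail at the file-existence check
    // ("Query file not found: .../q100.sql") rather than a bounds check.
    assert!(get_test_tpcds_query(100).is_err());
    Ok(())
}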

testdata/tpcds/README.md

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@ This directory contains 99 TPC-DS queries from https://github.com/duckdb/duckdb
 
 ## Modifications for DataFusion Compatibility
 
-- Query 57 was modified to add explicit ORDER BY d_moy to avg() window function. DataFusion requires explicit ordering with PARTITION BY.
-- Query 72 was modified to support data functions in datafusion
+- Queries 47 and 57 were modified to add an explicit ORDER BY d_moy to the avg() window function. DataFusion requires explicit ordering in window functions with PARTITION BY for deterministic results.
+- Query 72 was modified to support date functions in DataFusion.
 
 `generate.sh {SCALE_FACTOR}` is a script which can generate TPC-DS parquet data. Requires the duckdb CLI: https://duckdb.org/install/

testdata/tpcds/queries/q47.sql

Lines changed: 4 additions & 1 deletion
@@ -1,3 +1,5 @@
+-- TPC-DS Query 47
+-- Modified: Added ORDER BY d_moy to avg() window function for DataFusion compatibility
 WITH v1 AS
   (SELECT i_category,
           i_brand,
@@ -10,7 +12,8 @@ WITH v1 AS
           i_brand,
           s_store_name,
           s_company_name,
-          d_year) avg_monthly_sales,
+          d_year
+          ORDER BY d_moy) avg_monthly_sales,
    rank() OVER (PARTITION BY i_category,
                 i_brand,
                 s_store_name,

testdata/tpcds/queries/q72.sql

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ LEFT OUTER JOIN catalog_returns ON (cr_item_sk = cs_item_sk
   AND cr_order_number = cs_order_number)
 WHERE d1.d_week_seq = d2.d_week_seq
   AND inv_quantity_on_hand < cs_quantity
-  AND d3.d_date > d1.d_date + INTERVAL '5' DAY -- DuckDB: day + 5
+  AND d3.d_date > d1.d_date + INTERVAL '5' DAY -- Modified: original DuckDB syntax is d1.d_date + 5
   AND hd_buy_potential = '>10000'
   AND d1.d_year = 1999
   AND cd_marital_status = 'D'

tests/tpcds_test.rs

Lines changed: 26 additions & 15 deletions
@@ -2,17 +2,17 @@
 mod tests {
     use datafusion::common::runtime::JoinSet;
     use datafusion::error::Result;
+    use datafusion::physical_plan::{ExecutionPlan, collect};
     use datafusion::prelude::SessionContext;
-    use datafusion::physical_plan::{collect, ExecutionPlan};
     use datafusion_distributed::test_utils::{
         localhost::start_localhost_context,
         property_based::{compare_ordering, compare_result_set},
-        tpcds::{generate_tpcds_data, get_test_tpcds_query, register_tables, get_data_dir},
+        tpcds::{generate_tpcds_data, get_data_dir, get_test_tpcds_query, register_tables},
     };
 
     use datafusion::arrow::array::RecordBatch;
 
-    use datafusion_distributed::{DefaultSessionBuilder, DistributedExt, display_plan_ascii};
+    use datafusion_distributed::{DefaultSessionBuilder, DistributedExt};
     use std::env;
     use std::fs;
     use std::sync::Arc;
@@ -25,7 +25,8 @@ mod tests {
         INIT_TEST_TPCDS_TABLES
             .get_or_init(|| async {
                 if !fs::exists(get_data_dir()).unwrap_or(false) {
-                    let scale_factor = env::var("SCALE_FACTOR").unwrap_or_else(|_| "0.01".to_string());
+                    let scale_factor =
+                        env::var("SCALE_FACTOR").unwrap_or_else(|_| "0.01".to_string());
                     generate_tpcds_data(scale_factor.as_str()).unwrap();
                 }
             })
@@ -41,7 +42,8 @@ mod tests {
         let (mut distributed_ctx, worker_tasks) =
             start_localhost_context(NUM_WORKERS, DefaultSessionBuilder).await;
         distributed_ctx.set_distributed_files_per_task(FILES_PER_TASK)?;
-        distributed_ctx.set_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)?;
+        distributed_ctx
+            .set_distributed_cardinality_effect_task_scale_factor(CARDINALITY_TASK_COUNT_FACTOR)?;
         register_tables(&distributed_ctx).await?;
 
         // Create single node context to compare results to.
@@ -51,11 +53,13 @@ mod tests {
         Ok((distributed_ctx, single_node_ctx, worker_tasks))
     }
 
-    async fn run(ctx: &SessionContext, query_sql: &str) -> (Arc<dyn ExecutionPlan>, Result<Vec<RecordBatch>>) {
+    async fn run(
+        ctx: &SessionContext,
+        query_sql: &str,
+    ) -> (Arc<dyn ExecutionPlan>, Result<Vec<RecordBatch>>) {
         let df = ctx.sql(&query_sql).await.unwrap();
         let task_ctx = ctx.task_ctx();
-        let plan = df.create_physical_plan().await.unwrap();
-        println!("{}", display_plan_ascii(plan.as_ref(), false));
+        let plan = df.create_physical_plan().await.unwrap();
         (plan.clone(), collect(plan, task_ctx).await) // Collect execution errors, do not unwrap.
     }
 
@@ -65,17 +69,24 @@ mod tests {
         let query_sql = get_test_tpcds_query(query_id)?;
         let (distributed_ctx, single_node_ctx, _handles) = setup().await?;
 
-        let (distributed_physical_plan, distributed_results) = run(&distributed_ctx, &query_sql).await;
-        println!("execution complete");
-        let (single_node_physical_plan, single_node_results) = run(&single_node_ctx, &query_sql).await;
-
-        // println!(display(&distributed_physical_plan));
+        let (single_node_physical_plan, single_node_results) =
+            run(&single_node_ctx, &query_sql).await;
+        let (distributed_physical_plan, distributed_results) =
+            run(&distributed_ctx, &query_sql).await;
 
         let compare_result = tokio::try_join!(
             compare_result_set(&distributed_results, &single_node_results),
-            compare_ordering(distributed_physical_plan, single_node_physical_plan, &distributed_results),
+            compare_ordering(
+                distributed_physical_plan,
+                single_node_physical_plan,
+                &distributed_results
+            ),
+        );
+        assert!(
+            compare_result.is_ok(),
+            "Query {query_id} failed: {}",
+            compare_result.unwrap_err()
         );
-        assert!(compare_result.is_ok(), "Query {query_id} failed: {}", compare_result.unwrap_err());
         Ok(())
     }
 
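The refactored test drives both checks through tokio::try_join!, which awaits the comparison futures together and returns the first Err, so a result-set mismatch and an ordering violation both fail the same assert. A minimal sketch of that pattern with hypothetical check functions (not the crate's API):

use datafusion::error::{DataFusionError, Result};

async fn check_rows() -> Result<()> {
    Ok(())
}

async fn check_order() -> Result<()> {
    Err(DataFusionError::Internal("rows out of order".into()))
}

#[tokio::main]
async fn main() {
    // try_join! polls both futures; the first Err short-circuits and is returned.
    let outcome = tokio::try_join!(check_rows(), check_order());
    assert!(outcome.is_err());
}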
