From 21ce7b695085761e797faa2add3bb4eca9d4b32a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 22 May 2026 07:29:44 -0600
Subject: [PATCH 1/3] feat(ci): add scripts to run Spark SQL test suite locally
 for Spark 4.1

Add bash scripts under dev/ci/spark-sql-tests/ that reproduce the
spark_sql_test.yml GitHub Actions workflow on a developer machine for
Apache Spark 4.1. They run Spark's own SQL test suites with Comet
enabled, which is useful for debugging a Spark SQL test failure locally
instead of waiting on CI.

- config.sh: shared configuration and the seven CI module-shard
  definitions, copied from spark_sql_test.yml
- setup-spark.sh: maintains a persistent apache/spark checkout and
  applies dev/diffs/4.1.1.diff, preserving build artifacts across runs
- run.sh: builds Comet, runs the selected module shard(s), and prints a
  PASS/FAIL summary
- README.md: usage, prerequisites, and environment variables

Only Spark 4.1 is supported for now.

[skip ci]
---
 .gitignore                            |   1 +
 dev/ci/spark-sql-tests/README.md      |  95 ++++++++++++++
 dev/ci/spark-sql-tests/config.sh      |  89 +++++++++++++
 dev/ci/spark-sql-tests/run.sh         | 172 ++++++++++++++++++++++++++
 dev/ci/spark-sql-tests/setup-spark.sh |  72 +++++++++++
 5 files changed, 429 insertions(+)
 create mode 100644 dev/ci/spark-sql-tests/README.md
 create mode 100644 dev/ci/spark-sql-tests/config.sh
 create mode 100755 dev/ci/spark-sql-tests/run.sh
 create mode 100755 dev/ci/spark-sql-tests/setup-spark.sh

diff --git a/.gitignore b/.gitignore
index a3c97ff992..9abb2c791e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ output
 docs/comet-*/
 docs/build/
 docs/temp/
+dev/ci/spark-sql-tests/logs/
diff --git a/dev/ci/spark-sql-tests/README.md b/dev/ci/spark-sql-tests/README.md
new file mode 100644
index 0000000000..8e0c499e76
--- /dev/null
+++ b/dev/ci/spark-sql-tests/README.md
@@ -0,0 +1,95 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Local Spark SQL Tests
+
+These scripts reproduce the `spark_sql_test.yml` GitHub Actions workflow on a
+developer machine for **Apache Spark 4.1**. They run Spark's own SQL test
+suites with Comet enabled, which is useful for debugging a Spark SQL test
+failure locally instead of waiting on CI.
+
+## Prerequisites
+
+- JDK 17 with `JAVA_HOME` set. Spark 4.1 also runs on newer JDKs, but CI uses 17.
+- A Rust toolchain, plus `protobuf-compiler` and `clang`, for the Comet native build.
+- Git, and enough disk space for an `apache/spark` checkout and its build output.
+
+## Usage
+
+Run from the repository root (or adjust the path; the script works from any directory):
+
+```sh
+dev/ci/spark-sql-tests/run.sh [module]
+```
+
+`module` is one of the seven CI shards, or `all` (the default):
+
+| Module       | Spark suites |
+|--------------|--------------|
+| `catalyst`   | `catalyst/test` |
+| `sql_core-1` | `sql` suites excluding `ExtendedSQLTest` / `SlowSQLTest` |
+| `sql_core-2` | `sql` `ExtendedSQLTest` suites |
+| `sql_core-3` | `sql` `SlowSQLTest` suites |
+| `sql_hive-1` | `hive` suites excluding `ExtendedHiveTest` / `SlowHiveTest` |
+| `sql_hive-2` | `hive` `ExtendedHiveTest` suites |
+| `sql_hive-3` | `hive` `SlowHiveTest` suites |
+
+Examples:
+
+```sh
+# Run a single shard
+dev/ci/spark-sql-tests/run.sh sql_core-1
+
+# Run all seven shards sequentially
+dev/ci/spark-sql-tests/run.sh
+
+# Re-run a shard without rebuilding Comet or re-applying the Spark diff
+SKIP_BUILD=1 SKIP_SPARK_SETUP=1 dev/ci/spark-sql-tests/run.sh sql_core-1
+```
+
+The first run clones `apache/spark` and builds both Comet and Spark, which
+takes a while. A full `all` run takes several hours, the same as CI. Per-module
+output is written to `dev/ci/spark-sql-tests/logs/<module>.log`, and a
+PASS/FAIL summary is printed at the end.
+
+## Environment variables
+
+| Variable           | Default                                   | Effect |
+|--------------------|-------------------------------------------|--------|
+| `SKIP_BUILD`       | unset                                     | `1` skips the Comet build and reuses existing artifacts. |
+| `SKIP_SPARK_SETUP` | unset                                     | `1` skips the Spark clone/reset/diff step. |
+| `COMET_SPARK_DIR`  | `~/.cache/datafusion-comet/apache-spark`  | Persistent Spark checkout location. |
+| `SPARK_REF`        | `v4.1.1`                                  | Git ref checked out for the Spark sources. |
+| `SBT_MEM`          | `4096`                                    | sbt heap size in MB. |
+| `LC_ALL`           | `C.UTF-8`                                 | Locale for the sbt run. Use `en_US.UTF-8` on macOS if `C.UTF-8` is unavailable. |
+
+## How it works
+
+1. `run.sh` builds Comet with `PROFILES=-Pspark-4.1 make release` (unless
+   `SKIP_BUILD=1`), then purges partial Maven cache entries so sbt's resolver
+   does not choke on POM-only artifacts.
+2. `setup-spark.sh` maintains a persistent `apache/spark` checkout: it clones
+   the `v4.1.1` tag on first use, and on every run resets it to a clean state
+   and applies `dev/diffs/4.1.1.diff`. Spark's compiled `target/` artifacts are
+   preserved across runs so rebuilds are incremental.
+3. `run.sh` runs the selected module shard(s) with `build/sbt`, using the same
+   environment and arguments as the `spark_sql_test.yml` workflow.
+
+Only Spark 4.1 is supported for now. The CI workflow's optional Comet
+fallback-reason log collection (`workflow_dispatch`) is not reproduced.
diff --git a/dev/ci/spark-sql-tests/config.sh b/dev/ci/spark-sql-tests/config.sh
new file mode 100644
index 0000000000..51de6f8ed3
--- /dev/null
+++ b/dev/ci/spark-sql-tests/config.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Shared configuration for the local Spark SQL test scripts. This file is
+# sourced by setup-spark.sh and run.sh; it is not meant to be run directly.
+#
+# The variables below are consumed by the sourcing scripts, so shellcheck
+# cannot see their use when checking this file in isolation.
+# shellcheck disable=SC2034
+
+# --- Spark version under test ----------------------------------------------
+SPARK_VERSION="4.1.1"
+SPARK_SHORT="4.1"
+
+# Git ref checked out for the Spark sources. Defaults to the released tag.
+SPARK_REF="${SPARK_REF:-v${SPARK_VERSION}}"
+
+# JDK major version the CI workflow uses for this Spark version.
+REQUIRED_JDK="17"
+
+# --- Paths -----------------------------------------------------------------
+# Persistent apache/spark checkout. Reused across runs to avoid re-cloning.
+COMET_SPARK_DIR="${COMET_SPARK_DIR:-$HOME/.cache/datafusion-comet/apache-spark}"
+
+# Directory containing these scripts, and the Comet repository root.
+COMET_SQL_TEST_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+COMET_REPO_ROOT="$(git -C "$COMET_SQL_TEST_DIR" rev-parse --show-toplevel)"
+
+# --- sbt / locale ----------------------------------------------------------
+# sbt heap size in MB. Higher than CI's 3072 since local machines are not
+# constrained to 7 GB GitHub runners.
+SBT_MEM="${SBT_MEM:-4096}"
+
+# Locale for the sbt run. CI uses C.UTF-8; macOS users may need en_US.UTF-8.
+export LC_ALL="${LC_ALL:-C.UTF-8}"
+
+# --- Module shards ---------------------------------------------------------
+# The seven module shards, copied verbatim from
+# .github/workflows/spark_sql_test.yml. Order matches the CI matrix.
+SPARK_SQL_MODULES=(
+  catalyst
+  sql_core-1
+  sql_core-2
+  sql_core-3
+  sql_hive-1
+  sql_hive-2
+  sql_hive-3
+)
+
+# module_sbt_args <module>
+# Prints, on stdout, the one build/sbt command string that runs the named
+# module shard. Returns status 1 and prints nothing for an unknown module.
+module_sbt_args() {
+  local shard="$1"
+  case "$shard" in
+    catalyst)
+      printf '%s\n' 'catalyst/test' ;;
+    sql_core-1)
+      printf '%s\n' 'sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest' ;;
+    sql_core-2)
+      printf '%s\n' 'sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest' ;;
+    sql_core-3)
+      printf '%s\n' 'sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest' ;;
+    sql_hive-1)
+      printf '%s\n' 'hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest' ;;
+    sql_hive-2)
+      printf '%s\n' 'hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest' ;;
+    sql_hive-3)
+      printf '%s\n' 'hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest' ;;
+    *) return 1 ;;
+  esac
+}
diff --git a/dev/ci/spark-sql-tests/run.sh b/dev/ci/spark-sql-tests/run.sh
new file mode 100755
index 0000000000..d9d4caca4e
--- /dev/null
+++ b/dev/ci/spark-sql-tests/run.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Runs Apache Spark's SQL test suites locally with Comet enabled, reproducing
+# the spark_sql_test.yml GitHub Actions workflow for Spark 4.1.
+#
+# -e is intentionally not set: when running all module shards, one failing
+# shard must not stop the rest. Build and setup failures are checked
+# explicitly below.
+
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=config.sh
+source "$SCRIPT_DIR/config.sh"
+
+usage() {
+  cat <<EOF
+Usage: $(basename "$0") [module]
+
+Run Apache Spark SQL test suites locally with Comet enabled (Spark $SPARK_VERSION).
+
+Arguments:
+  module   One of: ${SPARK_SQL_MODULES[*]}
+           or 'all' to run every shard sequentially (default).
+
+Environment variables:
+  SKIP_BUILD=1        Skip the Comet build; reuse existing artifacts.
+  SKIP_SPARK_SETUP=1  Skip the Spark clone/reset/diff step.
+  COMET_SPARK_DIR     Spark checkout path (default: \$HOME/.cache/datafusion-comet/apache-spark).
+  SPARK_REF           Git ref for the Spark sources (default: v$SPARK_VERSION).
+  SBT_MEM             sbt heap size in MB (default: 4096).
+  LC_ALL              Locale for the sbt run (default: C.UTF-8; use en_US.UTF-8 on macOS).
+EOF
+}
+
+module="${1:-all}"
+case "$module" in
+  -h|--help) usage; exit 0 ;;
+esac
+
+# Resolve the list of modules to run.
+modules_to_run=()
+if [ "$module" = "all" ]; then
+  modules_to_run=("${SPARK_SQL_MODULES[@]}")
+elif module_sbt_args "$module" >/dev/null 2>&1; then
+  modules_to_run=("$module")
+else
+  echo "ERROR: unknown module '$module'" >&2
+  echo >&2
+  usage >&2
+  exit 1
+fi
+
+# --- JDK version check (warning only; skips JAVA_TOOL_OPTIONS banner lines) --
+jdk_version="$(java -version 2>&1 | sed -nE 's/.*version "([0-9]+).*/\1/p' | head -n1)"
+if [ "$jdk_version" != "$REQUIRED_JDK" ]; then
+  echo "WARNING: active JDK reports major version '$jdk_version'; Spark $SPARK_VERSION CI uses JDK $REQUIRED_JDK." >&2
+  echo "         Set JAVA_HOME to a JDK $REQUIRED_JDK install to match CI exactly." >&2
+fi
+
+# --- Build Comet -----------------------------------------------------------
+if [ "${SKIP_BUILD:-}" = "1" ]; then
+  echo "SKIP_BUILD=1: skipping Comet build."
+else
+  echo "Building Comet (PROFILES=-Pspark-$SPARK_SHORT make release) ..."
+  if ! ( cd "$COMET_REPO_ROOT" && PROFILES="-Pspark-$SPARK_SHORT" make release ); then
+    echo "ERROR: Comet build failed." >&2
+    exit 1
+  fi
+fi
+
+# --- Purge partial Maven cache entries -------------------------------------
+# Mirrors .github/actions/setup-spark-builder/action.yaml. Comet's Maven phase
+# downloads POMs for transitive artifacts whose JARs it never needs. sbt's
+# Coursier resolver then treats the POM-only entry as "found locally" and
+# fails on the missing JAR instead of fetching it remotely. Delete those
+# partial entries so sbt re-fetches the full artifact.
+maven_repo="$HOME/.m2/repository"
+if [ -d "$maven_repo" ]; then
+  echo "Purging partial Maven cache entries ..."
+  find "$maven_repo" -name '*.pom' | while IFS= read -r pom; do
+    [ -f "${pom%.pom}.jar" ] && continue
+    # -E with a plain '|' is POSIX; '\|' in a basic regex is not portable.
+    grep -qE '<packaging>(jar|bundle)</packaging>' "$pom" 2>/dev/null || continue
+    rm -f "$pom" "${pom}.sha1" "${pom%.pom}.pom.lastUpdated" \
+      "$(dirname "$pom")/_remote.repositories"
+  done
+fi
+
+# --- Set up the Spark checkout ---------------------------------------------
+if [ "${SKIP_SPARK_SETUP:-}" = "1" ]; then
+  echo "SKIP_SPARK_SETUP=1: using the existing Spark checkout as-is."
+  if [ ! -d "$COMET_SPARK_DIR/.git" ]; then
+    echo "ERROR: SKIP_SPARK_SETUP=1 but no Spark checkout at $COMET_SPARK_DIR" >&2
+    exit 1
+  fi
+else
+  if ! "$SCRIPT_DIR/setup-spark.sh"; then
+    echo "ERROR: Spark setup failed." >&2
+    exit 1
+  fi
+fi
+
+# --- Run the selected module shards ----------------------------------------
+log_dir="$SCRIPT_DIR/logs"
+mkdir -p "$log_dir"
+
+results=()
+overall_status=0
+
+for m in "${modules_to_run[@]}"; do
+  sbt_args="$(module_sbt_args "$m")"
+  log_file="$log_dir/${m}.log"
+  echo
+  echo "=================================================================="
+  echo "Module:   $m"
+  echo "sbt args: $sbt_args"
+  echo "Log file: $log_file"
+  echo "=================================================================="
+
+  # Stale Parquet cache workaround (mirrors spark_sql_test.yml).
+  rm -rf "$maven_repo/org/apache/parquet"
+
+  (
+    cd "$COMET_SPARK_DIR" || exit 1
+    NOLINT_ON_COMPILE=true \
+    ENABLE_COMET=true \
+    ENABLE_COMET_ONHEAP=true \
+    ENABLE_COMET_LOG_FALLBACK_REASONS=false \
+    SERIAL_SBT_TESTS=1 \
+      build/sbt -Dsbt.log.noformat=true -mem "$SBT_MEM" \
+        'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
+        "$sbt_args"
+  ) 2>&1 | tee "$log_file"
+  status="${PIPESTATUS[0]}"
+
+  if [ "$status" -eq 0 ]; then
+    results+=("PASS  $m")
+  else
+    results+=("FAIL  $m (sbt exit $status)")
+    overall_status=1
+  fi
+done
+
+# --- Summary ---------------------------------------------------------------
+echo
+echo "=================================================================="
+echo "Spark SQL test summary (Spark $SPARK_VERSION)"
+echo "=================================================================="
+for line in "${results[@]}"; do
+  echo "  $line"
+done
+echo "Logs written to: $log_dir"
+exit "$overall_status"
diff --git a/dev/ci/spark-sql-tests/setup-spark.sh b/dev/ci/spark-sql-tests/setup-spark.sh
new file mode 100755
index 0000000000..5d31aeb85b
--- /dev/null
+++ b/dev/ci/spark-sql-tests/setup-spark.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Maintains the persistent apache/spark checkout used by the local Spark SQL
+# test scripts, and applies the Comet diff. Idempotent and safe to re-run.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=config.sh
+source "$SCRIPT_DIR/config.sh"
+
+DIFF_FILE="${COMET_REPO_ROOT}/dev/diffs/${SPARK_VERSION}.diff"
+[[ -f "$DIFF_FILE" ]] || {
+  echo "ERROR: Comet diff not found: $DIFF_FILE" >&2
+  exit 1
+}
+
+if [[ -d "$COMET_SPARK_DIR/.git" ]]; then
+  echo "Reusing existing Spark checkout at $COMET_SPARK_DIR"
+else
+  echo "Cloning apache/spark ($SPARK_REF) into $COMET_SPARK_DIR ..."
+  mkdir -p "$(dirname "$COMET_SPARK_DIR")"
+  git clone --depth 1 --branch "$SPARK_REF" \
+    https://github.com/apache/spark.git "$COMET_SPARK_DIR"
+fi
+
+# Decide what to hard-reset onto. The checkout may have been created with a
+# different SPARK_REF; if so the ref is shallow-fetched and FETCH_HEAD is used.
+if git -C "$COMET_SPARK_DIR" rev-parse --verify --quiet "${SPARK_REF}^{commit}" >/dev/null; then
+  reset_target="$SPARK_REF"
+else
+  echo "Ref $SPARK_REF not present locally; fetching ..."
+  git -C "$COMET_SPARK_DIR" fetch --depth 1 origin "$SPARK_REF"
+  reset_target="FETCH_HEAD"
+fi
+
+echo "Resetting Spark checkout to a clean $SPARK_REF ..."
+# A hard reset undoes edits to tracked files left by an earlier diff apply.
+git -C "$COMET_SPARK_DIR" reset --hard "$reset_target"
+# Drop untracked files the earlier diff created. -x is deliberately omitted so
+# gitignored build output (Spark's target/) is kept and reused across runs.
+git -C "$COMET_SPARK_DIR" clean -fd
+
+echo "Applying $DIFF_FILE ..."
+# Dry-run first: if the diff has drifted, report that in plain terms instead
+# of surfacing git apply's raw output.
+git -C "$COMET_SPARK_DIR" apply --check "$DIFF_FILE" 2>/dev/null || {
+  echo "ERROR: $DIFF_FILE does not apply cleanly to $SPARK_REF." >&2
+  echo "       The Comet diff and the Spark ref may have drifted out of sync." >&2
+  exit 1
+}
+git -C "$COMET_SPARK_DIR" apply "$DIFF_FILE"
+
+echo "Spark checkout ready: $COMET_SPARK_DIR ($SPARK_REF + Comet diff)"

From d58c54114635b0a589d629ff086acdf2c400bdc0 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 22 May 2026 09:50:37 -0600
Subject: [PATCH 2/3] fix(ci): skip Spark 4.1 Python data source probe in local
 SQL tests

Spark 4.1's DataSourceManager probes for Python data sources during
query analysis by spawning a python3 worker. The CI amd64/rust
container has no python3, so the probe is skipped there. On a developer
machine that has python3 the worker can hang indefinitely, since the
JVM-side read has no idle timeout by default, stalling suites such as
GlobalTempViewSuite.

Point PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON at a nonexistent
interpreter so the probe is skipped, matching CI. The value is
overridable for developers who want to run the Python-dependent suites.
---
 dev/ci/spark-sql-tests/README.md |  9 +++++++++
 dev/ci/spark-sql-tests/run.sh    | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/dev/ci/spark-sql-tests/README.md b/dev/ci/spark-sql-tests/README.md
index 8e0c499e76..0a21d57286 100644
--- a/dev/ci/spark-sql-tests/README.md
+++ b/dev/ci/spark-sql-tests/README.md
@@ -78,6 +78,15 @@ PASS/FAIL summary is printed at the end.
 | `SPARK_REF`        | `v4.1.1`                                  | Git ref checked out for the Spark sources. |
 | `SBT_MEM`          | `4096`                                    | sbt heap size in MB. |
 | `LC_ALL`           | `C.UTF-8`                                 | Locale for the sbt run. Use `en_US.UTF-8` on macOS if `C.UTF-8` is unavailable. |
+| `PYSPARK_PYTHON`   | a nonexistent path                        | Python interpreter for Spark. The default skips Spark 4.1's Python data source probe, which can hang on machines that have `python3`. Export a real interpreter to run the Python-dependent suites. |
+
+> **Note on Python:** Spark 4.1 probes for Python data sources during query
+> analysis by spawning a Python worker. The CI `amd64/rust` container has no
+> `python3`, so the probe is skipped. On a developer machine that has `python3`
+> the worker can hang indefinitely (the JVM-side read has no idle timeout),
+> stalling suites such as `GlobalTempViewSuite`. `run.sh` therefore points
+> `PYSPARK_PYTHON` / `PYSPARK_DRIVER_PYTHON` at a nonexistent path by default so
+> the probe is skipped, matching CI.
 
 ## How it works
 
diff --git a/dev/ci/spark-sql-tests/run.sh b/dev/ci/spark-sql-tests/run.sh
index d9d4caca4e..0127c5e479 100755
--- a/dev/ci/spark-sql-tests/run.sh
+++ b/dev/ci/spark-sql-tests/run.sh
@@ -48,6 +48,10 @@ Environment variables:
   SPARK_REF           Git ref for the Spark sources (default: v$SPARK_VERSION).
   SBT_MEM             sbt heap size in MB (default: 4096).
   LC_ALL              Locale for the sbt run (default: C.UTF-8; use en_US.UTF-8 on macOS).
+  PYSPARK_PYTHON      Python interpreter for Spark. Defaults to a nonexistent
+                      path so Spark 4.1's Python data source probe is skipped
+                      (it can hang on machines that have python3). Export a
+                      real interpreter to run the Python-dependent suites.
 EOF
 }
 
@@ -139,6 +143,16 @@ for m in "${modules_to_run[@]}"; do
   # Stale Parquet cache workaround (mirrors spark_sql_test.yml).
   rm -rf "$maven_repo/org/apache/parquet"
 
+  # Spark 4.1's DataSourceManager probes for Python data sources during query
+  # analysis by spawning a Python worker. The CI amd64/rust container has no
+  # python3, so the probe is skipped there. On a developer machine that does
+  # have python3 (every macOS install does) the worker can hang indefinitely:
+  # the JVM-side read has no idle timeout by default, so suites such as
+  # GlobalTempViewSuite stall forever instead of failing fast. Point PySpark at
+  # a nonexistent interpreter so the probe is skipped, matching CI. A developer
+  # who wants the Python suites can export PYSPARK_PYTHON themselves.
+  no_python="/nonexistent/comet-disable-python-datasources"
+
   (
     cd "$COMET_SPARK_DIR" || exit 1
     NOLINT_ON_COMPILE=true \
@@ -146,6 +160,8 @@ for m in "${modules_to_run[@]}"; do
     ENABLE_COMET_ONHEAP=true \
     ENABLE_COMET_LOG_FALLBACK_REASONS=false \
     SERIAL_SBT_TESTS=1 \
+    PYSPARK_DRIVER_PYTHON="${PYSPARK_DRIVER_PYTHON:-$no_python}" \
+    PYSPARK_PYTHON="${PYSPARK_PYTHON:-$no_python}" \
       build/sbt -Dsbt.log.noformat=true -mem "$SBT_MEM" \
         'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
         "$sbt_args"

From a729cd572917d4aa6a8e1ea3f670ab0d3738698f Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Fri, 22 May 2026 10:03:11 -0600
Subject: [PATCH 3/3] feat(ci): make Spark version configurable in local SQL
 tests [skip ci]

The local Spark SQL test scripts hardcoded Spark 4.1.1. Select the
version with a SPARK_VERSION env var instead, supporting all four
versions from the spark_sql_test.yml CI matrix: 3.4.3, 3.5.8, 4.0.2,
and 4.1.1 (default 4.1.1).

config.sh derives SPARK_SHORT and the CI JDK per version, and mirrors
the matrix test-group isolation: every version runs with
SERIAL_SBT_TESTS=1 except Spark 4.0, which forks a dedicated JVM per
leak-prone Parquet/Orc suite. run.sh builds the sbt environment as an
array so the 4.0 case omits SERIAL_SBT_TESTS entirely.

The Spark checkout and logs are namespaced by version
(apache-spark-<version>, logs/<version>/) so switching versions does
not reset away each version's build artifacts or overwrite logs.
---
 dev/ci/spark-sql-tests/README.md | 51 ++++++++++++++++++---------
 dev/ci/spark-sql-tests/config.sh | 40 +++++++++++++++++----
 dev/ci/spark-sql-tests/run.sh    | 60 +++++++++++++++++++++-----------
 3 files changed, 107 insertions(+), 44 deletions(-)

diff --git a/dev/ci/spark-sql-tests/README.md b/dev/ci/spark-sql-tests/README.md
index 0a21d57286..41afe46f9d 100644
--- a/dev/ci/spark-sql-tests/README.md
+++ b/dev/ci/spark-sql-tests/README.md
@@ -20,13 +20,24 @@ under the License.
 # Local Spark SQL Tests
 
 These scripts reproduce the `spark_sql_test.yml` GitHub Actions workflow on a
-developer machine for **Apache Spark 4.1**. They run Spark's own SQL test
-suites with Comet enabled, which is useful for debugging a Spark SQL test
-failure locally instead of waiting on CI.
+developer machine. They run Spark's own SQL test suites with Comet enabled,
+which is useful for debugging a Spark SQL test failure locally instead of
+waiting on CI.
+
+The Spark version is selected with `SPARK_VERSION` and defaults to `4.1.1`.
+Supported versions, each mirroring a CI matrix config:
+
+| `SPARK_VERSION` | JDK used by CI |
+|-----------------|----------------|
+| `3.4.3`         | 11             |
+| `3.5.8`         | 11             |
+| `4.0.2`         | 21             |
+| `4.1.1`         | 17             |
 
 ## Prerequisites
 
-- JDK 17 with `JAVA_HOME` set. Spark 4.1 also runs on newer JDKs, but CI uses 17.
+- A JDK with `JAVA_HOME` set, matching the Spark version under test (see the
+  table above). `run.sh` warns if the active JDK differs from the one CI uses.
 - A Rust toolchain, plus `protobuf-compiler` and `clang`, for the Comet native build.
 - Git, and enough disk space for an `apache/spark` checkout and its build output.
 
@@ -61,26 +72,30 @@ dev/ci/spark-sql-tests/run.sh
 
 # Re-run a shard without rebuilding Comet or re-applying the Spark diff
 SKIP_BUILD=1 SKIP_SPARK_SETUP=1 dev/ci/spark-sql-tests/run.sh sql_core-1
+
+# Test a different Spark version
+SPARK_VERSION=4.0.2 dev/ci/spark-sql-tests/run.sh sql_core-1
 ```
 
 The first run clones `apache/spark` and builds both Comet and Spark, which
 takes a while. A full `all` run takes several hours, the same as CI. Per-module
-output is written to `dev/ci/spark-sql-tests/logs/<module>.log`, and a
+output is written to `dev/ci/spark-sql-tests/logs/<version>/<module>.log`, and a
 PASS/FAIL summary is printed at the end.
 
 ## Environment variables
 
 | Variable           | Default                                   | Effect |
 |--------------------|-------------------------------------------|--------|
+| `SPARK_VERSION`    | `4.1.1`                                   | Spark version to test: `3.4.3`, `3.5.8`, `4.0.2`, or `4.1.1`. |
 | `SKIP_BUILD`       | unset                                     | `1` skips the Comet build and reuses existing artifacts. |
 | `SKIP_SPARK_SETUP` | unset                                     | `1` skips the Spark clone/reset/diff step. |
-| `COMET_SPARK_DIR`  | `~/.cache/datafusion-comet/apache-spark`  | Persistent Spark checkout location. |
-| `SPARK_REF`        | `v4.1.1`                                  | Git ref checked out for the Spark sources. |
+| `COMET_SPARK_DIR`  | `~/.cache/datafusion-comet/apache-spark-<version>` | Persistent Spark checkout location, namespaced by version. |
+| `SPARK_REF`        | `v<version>`                              | Git ref checked out for the Spark sources. |
 | `SBT_MEM`          | `4096`                                    | sbt heap size in MB. |
 | `LC_ALL`           | `C.UTF-8`                                 | Locale for the sbt run. Use `en_US.UTF-8` on macOS if `C.UTF-8` is unavailable. |
-| `PYSPARK_PYTHON`   | a nonexistent path                        | Python interpreter for Spark. The default skips Spark 4.1's Python data source probe, which can hang on machines that have `python3`. Export a real interpreter to run the Python-dependent suites. |
+| `PYSPARK_PYTHON`   | a nonexistent path                        | Python interpreter for Spark. The default skips Spark 4.x's Python data source probe, which can hang on machines that have `python3`. Export a real interpreter to run the Python-dependent suites. |
 
-> **Note on Python:** Spark 4.1 probes for Python data sources during query
+> **Note on Python:** Spark 4.x probes for Python data sources during query
 > analysis by spawning a Python worker. The CI `amd64/rust` container has no
 > `python3`, so the probe is skipped. On a developer machine that has `python3`
 > the worker can hang indefinitely (the JVM-side read has no idle timeout),
@@ -90,15 +105,17 @@ PASS/FAIL summary is printed at the end.
 
 ## How it works
 
-1. `run.sh` builds Comet with `PROFILES=-Pspark-4.1 make release` (unless
+1. `run.sh` builds Comet with `PROFILES=-Pspark-<short> make release` (unless
    `SKIP_BUILD=1`), then purges partial Maven cache entries so sbt's resolver
    does not choke on POM-only artifacts.
-2. `setup-spark.sh` maintains a persistent `apache/spark` checkout: it clones
-   the `v4.1.1` tag on first use, and on every run resets it to a clean state
-   and applies `dev/diffs/4.1.1.diff`. Spark's compiled `target/` artifacts are
-   preserved across runs so rebuilds are incremental.
+2. `setup-spark.sh` maintains a persistent `apache/spark` checkout per version:
+   it clones the `v<version>` tag on first use, and on every run resets it to a
+   clean state and applies `dev/diffs/<version>.diff`. Spark's compiled
+   `target/` artifacts are preserved across runs so rebuilds are incremental.
 3. `run.sh` runs the selected module shard(s) with `build/sbt`, using the same
-   environment and arguments as the `spark_sql_test.yml` workflow.
+   environment and arguments as the `spark_sql_test.yml` workflow, including the
+   per-version test-group isolation (Spark 4.0 forks a dedicated JVM per
+   leak-prone Parquet/Orc suite; other versions run serially).
 
-Only Spark 4.1 is supported for now. The CI workflow's optional Comet
-fallback-reason log collection (`workflow_dispatch`) is not reproduced.
+The CI workflow's optional Comet fallback-reason log collection
+(`workflow_dispatch`) is not reproduced.
diff --git a/dev/ci/spark-sql-tests/config.sh b/dev/ci/spark-sql-tests/config.sh
index 51de6f8ed3..7bfcba695e 100644
--- a/dev/ci/spark-sql-tests/config.sh
+++ b/dev/ci/spark-sql-tests/config.sh
@@ -26,18 +26,46 @@
 # shellcheck disable=SC2034
 
 # --- Spark version under test ----------------------------------------------
-SPARK_VERSION="4.1.1"
-SPARK_SHORT="4.1"
+# Override with SPARK_VERSION=<full-version>. Each supported version has a
+# matching dev/diffs/<version>.diff and mirrors a spark_sql_test.yml CI config.
+SPARK_VERSION="${SPARK_VERSION:-4.1.1}"
+
+# Per-version settings copied from the spark_sql_test.yml CI matrix: the short
+# version (Maven/sbt profile suffix) and the JDK major version CI uses.
+case "$SPARK_VERSION" in
+  3.4.3) SPARK_SHORT="3.4"; REQUIRED_JDK="11" ;;
+  3.5.8) SPARK_SHORT="3.5"; REQUIRED_JDK="11" ;;
+  4.0.2) SPARK_SHORT="4.0"; REQUIRED_JDK="21" ;;
+  4.1.1) SPARK_SHORT="4.1"; REQUIRED_JDK="17" ;;
+  *)
+    echo "ERROR: unsupported SPARK_VERSION '$SPARK_VERSION'." >&2
+    echo "       Supported versions: 3.4.3, 3.5.8, 4.0.2, 4.1.1" >&2
+    exit 1
+    ;;
+esac
 
 # Git ref checked out for the Spark sources. Defaults to the released tag.
 SPARK_REF="${SPARK_REF:-v${SPARK_VERSION}}"
 
-# JDK major version the CI workflow uses for this Spark version.
-REQUIRED_JDK="17"
+# Test-group isolation, mirroring spark_sql_test.yml. Every CI config sets
+# SERIAL_SBT_TESTS=1 except Spark 4.0 (JDK 21), which instead leaves it unset
+# and forks a dedicated JVM per leak-prone Parquet/ORC suite to work around a
+# cross-suite file-stream leak under JDK 21 (Comet issue #4327). run.sh reads
+# DEDICATED_JVM_SUITES: when non-empty it passes DEDICATED_JVM_SBT_TESTS and
+# omits SERIAL_SBT_TESTS; when empty it passes SERIAL_SBT_TESTS=1.
+DEDICATED_JVM_SUITES=""
+if [ "$SPARK_SHORT" = "4.0" ]; then
+  DEDICATED_JVM_SUITES="\
+org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV1Suite,\
+org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormatV2Suite,\
+org.apache.spark.sql.execution.datasources.orc.OrcSourceV1Suite,\
+org.apache.spark.sql.execution.datasources.orc.OrcSourceV2Suite"
+fi
 
 # --- Paths -----------------------------------------------------------------
-# Persistent apache/spark checkout. Reused across runs to avoid re-cloning.
-COMET_SPARK_DIR="${COMET_SPARK_DIR:-$HOME/.cache/datafusion-comet/apache-spark}"
+# Persistent apache/spark checkout, namespaced by Spark version so switching
+# versions does not discard each version's compiled target/ artifacts.
+COMET_SPARK_DIR="${COMET_SPARK_DIR:-$HOME/.cache/datafusion-comet/apache-spark-${SPARK_VERSION}}"
 
 # Directory containing these scripts, and the Comet repository root.
 COMET_SQL_TEST_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
diff --git a/dev/ci/spark-sql-tests/run.sh b/dev/ci/spark-sql-tests/run.sh
index 0127c5e479..10cb776534 100755
--- a/dev/ci/spark-sql-tests/run.sh
+++ b/dev/ci/spark-sql-tests/run.sh
@@ -19,7 +19,8 @@
 #
 
 # Runs Apache Spark's SQL test suites locally with Comet enabled, reproducing
-# the spark_sql_test.yml GitHub Actions workflow for Spark 4.1.
+# the spark_sql_test.yml GitHub Actions workflow. The Spark version is selected
+# with SPARK_VERSION (see config.sh); it defaults to 4.1.1.
 #
 # -e is intentionally not set: when running all module shards, one failing
 # shard must not stop the rest. Build and setup failures are checked
@@ -42,14 +43,17 @@ Arguments:
            or 'all' to run every shard sequentially (default).
 
 Environment variables:
+  SPARK_VERSION       Spark version to test: 3.4.3, 3.5.8, 4.0.2, or 4.1.1
+                      (default: 4.1.1).
   SKIP_BUILD=1        Skip the Comet build; reuse existing artifacts.
   SKIP_SPARK_SETUP=1  Skip the Spark clone/reset/diff step.
-  COMET_SPARK_DIR     Spark checkout path (default: \$HOME/.cache/datafusion-comet/apache-spark).
+  COMET_SPARK_DIR     Spark checkout path
+                      (default: \$HOME/.cache/datafusion-comet/apache-spark-<version>).
   SPARK_REF           Git ref for the Spark sources (default: v$SPARK_VERSION).
   SBT_MEM             sbt heap size in MB (default: 4096).
   LC_ALL              Locale for the sbt run (default: C.UTF-8; use en_US.UTF-8 on macOS).
   PYSPARK_PYTHON      Python interpreter for Spark. Defaults to a nonexistent
-                      path so Spark 4.1's Python data source probe is skipped
+                      path so Spark 4.x's Python data source probe is skipped
                       (it can hang on machines that have python3). Export a
                       real interpreter to run the Python-dependent suites.
 EOF
@@ -124,9 +128,21 @@ else
 fi
 
 # --- Run the selected module shards ----------------------------------------
-log_dir="$SCRIPT_DIR/logs"
+# Logs are namespaced by Spark version so runs of different versions do not
+# overwrite each other.
+log_dir="$SCRIPT_DIR/logs/$SPARK_VERSION"
 mkdir -p "$log_dir"
 
+# Spark 4.x's DataSourceManager probes for Python data sources during query
+# analysis by spawning a Python worker. The CI amd64/rust container has no
+# python3, so the probe is skipped there. On a developer machine that does
+# have python3 (every macOS install does) the worker can hang indefinitely:
+# the JVM-side read has no idle timeout by default, so suites such as
+# GlobalTempViewSuite stall forever instead of failing fast. Point PySpark at
+# a nonexistent interpreter so the probe is skipped, matching CI. A developer
+# who wants the Python suites can export PYSPARK_PYTHON themselves.
+no_python="/nonexistent/comet-disable-python-datasources"
+
 results=()
 overall_status=0
 
@@ -143,25 +159,27 @@ for m in "${modules_to_run[@]}"; do
   # Stale Parquet cache workaround (mirrors spark_sql_test.yml).
   rm -rf "$maven_repo/org/apache/parquet"
 
-  # Spark 4.1's DataSourceManager probes for Python data sources during query
-  # analysis by spawning a Python worker. The CI amd64/rust container has no
-  # python3, so the probe is skipped there. On a developer machine that does
-  # have python3 (every macOS install does) the worker can hang indefinitely:
-  # the JVM-side read has no idle timeout by default, so suites such as
-  # GlobalTempViewSuite stall forever instead of failing fast. Point PySpark at
-  # a nonexistent interpreter so the probe is skipped, matching CI. A developer
-  # who wants the Python suites can export PYSPARK_PYTHON themselves.
-  no_python="/nonexistent/comet-disable-python-datasources"
-
   (
     cd "$COMET_SPARK_DIR" || exit 1
-    NOLINT_ON_COMPILE=true \
-    ENABLE_COMET=true \
-    ENABLE_COMET_ONHEAP=true \
-    ENABLE_COMET_LOG_FALLBACK_REASONS=false \
-    SERIAL_SBT_TESTS=1 \
-    PYSPARK_DRIVER_PYTHON="${PYSPARK_DRIVER_PYTHON:-$no_python}" \
-    PYSPARK_PYTHON="${PYSPARK_PYTHON:-$no_python}" \
+
+    # Environment shared by every Spark version, PYSPARK_* overrides included.
+    sbt_env=(
+      NOLINT_ON_COMPILE=true
+      ENABLE_COMET=true
+      ENABLE_COMET_ONHEAP=true
+      ENABLE_COMET_LOG_FALLBACK_REASONS=false
+      PYSPARK_DRIVER_PYTHON="${PYSPARK_DRIVER_PYTHON:-$no_python}"
+      PYSPARK_PYTHON="${PYSPARK_PYTHON:-$no_python}"
+    )
+    # Per-version test-group isolation (see config.sh): Spark 4.0 forks a
+    # dedicated JVM per leak-prone suite; every other version runs serially.
+    if [ -n "$DEDICATED_JVM_SUITES" ]; then
+      sbt_env+=("DEDICATED_JVM_SBT_TESTS=$DEDICATED_JVM_SUITES")
+    else
+      sbt_env+=("SERIAL_SBT_TESTS=1")
+    fi
+
+    env "${sbt_env[@]}" \
       build/sbt -Dsbt.log.noformat=true -mem "$SBT_MEM" \
         'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
         "$sbt_args"