From f13fc87137de38b6cb5870607c728787e0f22646 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 09:27:02 -0600 Subject: [PATCH] chore(audit): audit any_value and expand tests --- .../spark_expressions_support.md | 3 + .../expressions/aggregate/any_value.sql | 104 ++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 spark/src/test/resources/sql-tests/expressions/aggregate/any_value.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 65f941210a..962afc3d34 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -33,6 +33,9 @@ - [x] any - [x] any_value + - Spark 3.4.3 (2026-05-26) + - Spark 3.5.8 (2026-05-26) + - Spark 4.0.1 (2026-05-26) - [ ] approx_count_distinct - [ ] approx_percentile - [ ] approx_top_k diff --git a/spark/src/test/resources/sql-tests/expressions/aggregate/any_value.sql b/spark/src/test/resources/sql-tests/expressions/aggregate/any_value.sql new file mode 100644 index 0000000000..421d93b08f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/aggregate/any_value.sql @@ -0,0 +1,104 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. 
See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- any_value is a RuntimeReplaceableAggregate that Spark's optimizer rewrites
+-- to first(expr, ignoreNulls) before physical planning, so these tests exercise
+-- the same Comet path as first_last.sql but pin the any_value name explicitly.
+
+-- ============================================================
+-- Setup: shared tables (keep row order stable; any_value presumably follows scan order)
+-- ============================================================
+
+statement
+CREATE TABLE test_any_value(i int, grp string) USING parquet
+
+statement
+INSERT INTO test_any_value VALUES (1, 'a'), (2, 'a'), (3, 'a'), (NULL, 'b'), (4, 'b')
+
+statement
+CREATE TABLE test_any_value_ignore_nulls(id int, val int, grp string) USING parquet
+
+statement
+INSERT INTO test_any_value_ignore_nulls VALUES
+  (1, NULL, 'a'),
+  (2, 10, 'a'),
+  (3, 20, 'a'),
+  (4, NULL, 'b'),
+  (5, 30, 'b'),
+  (6, NULL, 'b')
+
+statement
+CREATE TABLE test_any_value_all_nulls(val int, grp string) USING parquet
+
+statement
+INSERT INTO test_any_value_all_nulls VALUES (NULL, 'a'), (NULL, 'a'), (NULL, 'b'), (1, 'b')
+
+-- ============================================================
+-- any_value: basic one-argument form (nulls are not skipped by default)
+-- ============================================================
+
+query
+SELECT any_value(i) FROM test_any_value
+
+query
+SELECT grp, any_value(i) FROM test_any_value GROUP BY grp ORDER BY grp
+
+-- ============================================================
+-- any_value with an explicit isIgnoreNull boolean literal (second argument)
+-- ============================================================
+
+-- any_value(expr, true) ignores nulls: ungrouped first, then per group.
+query
+SELECT any_value(val, true) FROM test_any_value_ignore_nulls
+
+query
+SELECT grp, any_value(val, true) FROM test_any_value_ignore_nulls GROUP BY grp ORDER BY grp
+
+-- any_value(expr, false) respects nulls, same as the one-argument default.
+query
+SELECT grp, any_value(val, false) FROM test_any_value_ignore_nulls GROUP BY grp ORDER BY grp
+
+-- ============================================================
+-- any_value: all-null group with ignoreNulls
+-- ============================================================
+
+-- Group 'a' is all NULL, so it stays NULL even with ignoreNulls; group 'b' yields 1.
+query
+SELECT grp, any_value(val, true) FROM test_any_value_all_nulls GROUP BY grp ORDER BY grp
+
+-- ============================================================
+-- any_value alongside other aggregates (count and sum over the same column)
+-- ============================================================
+
+query
+SELECT grp,
+  any_value(val, true),
+  count(val),
+  sum(val)
+FROM test_any_value_ignore_nulls GROUP BY grp ORDER BY grp
+
+-- ============================================================
+-- any_value matches first when called identically
+-- ============================================================
+
+-- Same input and same ignoreNulls flag must agree: Spark rewrites any_value(x, b) to
+-- first(x, b). Null-safe <=> is required: the first val is NULL, and NULL = NULL is NULL.
+query
+SELECT
+  any_value(val, true) <=> first(val, true) AS ignore_nulls_match,
+  any_value(val, false) <=> first(val, false) AS respect_nulls_match
+FROM test_any_value_ignore_nulls