From c0229030af3dc7020149a2a29ceda4bcbb8d97e4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Apr 2026 17:11:55 -0600 Subject: [PATCH 1/3] fix: audit array_insert expression for correctness and test coverage Fix nullable() metadata bug in ArrayInsert where only src_array_expr was checked but not pos_expr, causing incorrect nullability reporting when the position column is nullable. Expand SQL test coverage from 1 query to 27+ queries across multiple types and edge cases. --- .../skills/audit-comet-expression/SKILL.md | 13 + .../contributor-guide/expression-audit-log.md | 32 +++ .../src/array_funcs/array_insert.rs | 3 +- .../expressions/array/array_insert.sql | 226 +++++++++++++++++- .../expressions/array/array_insert_legacy.sql | 57 +++++ 5 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 docs/source/contributor-guide/expression-audit-log.md create mode 100644 spark/src/test/resources/sql-tests/expressions/array/array_insert_legacy.sql diff --git a/.claude/skills/audit-comet-expression/SKILL.md b/.claude/skills/audit-comet-expression/SKILL.md index b0f6a7edd0..9a2090af85 100644 --- a/.claude/skills/audit-comet-expression/SKILL.md +++ b/.claude/skills/audit-comet-expression/SKILL.md @@ -311,6 +311,19 @@ After implementing tests, tell the user how to run them: --- +## Step 8: Update the Expression Audit Log + +After completing the audit (whether or not tests were added), append a row to the audit log at +`docs/source/contributor-guide/expression-audit-log.md`. + +The row should include: +- Expression name +- Spark versions checked (e.g. 
3.4.3, 3.5.8, 4.0.1) +- Today's date +- A brief summary of findings (behavioral differences, bugs found/fixed, tests added, known incompatibilities) + +--- + ## Output Format Present the audit as: diff --git a/docs/source/contributor-guide/expression-audit-log.md b/docs/source/contributor-guide/expression-audit-log.md new file mode 100644 index 0000000000..0708c18f2f --- /dev/null +++ b/docs/source/contributor-guide/expression-audit-log.md @@ -0,0 +1,32 @@ + + +# Expression Audit Log + +This document tracks which Comet expressions have been audited against their Spark +implementations for correctness and test coverage. + +Each audit compares the Comet implementation against the Spark source code for the listed +versions, reviews existing test coverage, identifies gaps, and adds missing tests where needed. + +## Audited Expressions + +| Expression | Spark Versions Checked | Date | Findings | +|---|---|---|---| +| `array_insert` | 3.4.3, 3.5.8, 4.0.1 | 2026-04-02 | No behavioral differences across Spark versions. Fixed `nullable()` metadata bug (did not account for `pos_expr`). Added SQL tests for multiple types (string, boolean, double, float, long, short, byte), literal arguments, null handling, negative indices, out-of-bounds padding, special float values (NaN, Infinity), multibyte UTF-8, and legacy negative index mode. Known incompatibility: pos=0 error message differs from Spark's `INVALID_INDEX_OF_ZERO`. | diff --git a/native/spark-expr/src/array_funcs/array_insert.rs b/native/spark-expr/src/array_funcs/array_insert.rs index 505ee56f0b..75917c9bf5 100644 --- a/native/spark-expr/src/array_funcs/array_insert.rs +++ b/native/spark-expr/src/array_funcs/array_insert.rs @@ -105,7 +105,8 @@ impl PhysicalExpr for ArrayInsert { } fn nullable(&self, input_schema: &Schema) -> DataFusionResult<bool> { - self.src_array_expr.nullable(input_schema) + Ok(self.src_array_expr.nullable(input_schema)? + || self.pos_expr.nullable(input_schema)?) 
} fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult<ColumnarValue> { diff --git a/spark/src/test/resources/sql-tests/expressions/array/array_insert.sql b/spark/src/test/resources/sql-tests/expressions/array/array_insert.sql index 2d5754ea8e..06294bc318 100644 --- a/spark/src/test/resources/sql-tests/expressions/array/array_insert.sql +++ b/spark/src/test/resources/sql-tests/expressions/array/array_insert.sql @@ -16,12 +16,234 @@ -- under the License. -- ConfigMatrix: parquet.enable.dictionary=false,true +-- Config: spark.comet.expression.ArrayInsert.allowIncompatible=true + +-- ============================================================ +-- Integer arrays with column arguments +-- ============================================================ statement CREATE TABLE test_array_insert(arr array<int>, pos int, val int) USING parquet statement -INSERT INTO test_array_insert VALUES (array(1, 2, 3), 2, 10), (array(1, 2, 3), 1, 10), (array(1, 2, 3), 4, 10), (array(), 1, 10), (NULL, 1, 10) +INSERT INTO test_array_insert VALUES + (array(1, 2, 3), 2, 10), + (array(1, 2, 3), 1, 10), + (array(1, 2, 3), 4, 10), + (array(), 1, 10), + (NULL, 1, 10), + (array(1, 2, 3), NULL, 10), + (array(1, 2, 3), 3, NULL) -query spark_answer_only +-- basic column arguments +query SELECT array_insert(arr, pos, val) FROM test_array_insert + +-- ============================================================ +-- Literal arguments (all-literal queries test native eval +-- because CometSqlFileTestSuite disables constant folding) +-- ============================================================ + +-- basic insert at various positions +query +SELECT array_insert(array(1, 2, 3), 2, 10) + +-- prepend (pos=1) +query +SELECT array_insert(array(1, 2, 3), 1, 10) + +-- append (pos = len+1) +query +SELECT array_insert(array(1, 2, 3), 4, 10) + +-- insert into empty array +query +SELECT array_insert(array(), 1, 10) + +-- ============================================================ +-- NULL handling +-- 
============================================================ + +-- null array +query +SELECT array_insert(CAST(NULL AS ARRAY<INT>), 1, 10) + +-- null position +query +SELECT array_insert(array(1, 2, 3), CAST(NULL AS INT), 10) + +-- null value (insert null into array) +query +SELECT array_insert(array(1, 2, 3), 2, CAST(NULL AS INT)) + +-- null value appended beyond end +query +SELECT array_insert(array(1, 2, 3, NULL), 4, CAST(NULL AS INT)) + +-- array with existing nulls +query +SELECT array_insert(array(1, NULL, 3), 2, 10) + +-- ============================================================ +-- Positive out-of-bounds (null padding) +-- ============================================================ + +-- one beyond end +query +SELECT array_insert(array(1, 2, 3), 5, 99) + +-- far beyond end +query +SELECT array_insert(array(1, 2, 3), 10, 99) + +-- ============================================================ +-- Negative indices (non-legacy mode, which is the default) +-- ============================================================ + +-- -1 appends after last element +query +SELECT array_insert(array(1, 2, 3), -1, 10) + +-- -2 inserts before last element +query +SELECT array_insert(array(1, 2, 3), -2, 10) + +-- -3 inserts before second-to-last +query +SELECT array_insert(array(1, 2, 3), -3, 10) + +-- -4 inserts before first element (len = 3, so -4 = before start) +query +SELECT array_insert(array(1, 2, 3), -4, 10) + +-- negative beyond start (null padding) +query +SELECT array_insert(array(1, 2, 3), -5, 10) + +-- far negative beyond start +query +SELECT array_insert(array(1, 2, 3), -10, 10) + +-- ============================================================ +-- pos=0 error +-- Note: Spark throws INVALID_INDEX_OF_ZERO, Comet throws a different message. +-- Cannot use expect_error because patterns differ. This is a known incompatibility. 
+-- ============================================================ + +-- ============================================================ +-- String arrays +-- ============================================================ + +statement +CREATE TABLE test_array_insert_str(arr array<string>, pos int, val string) USING parquet + +statement +INSERT INTO test_array_insert_str VALUES + (array('a', 'b', 'c'), 2, 'd'), + (array('a', 'b', 'c'), 1, 'd'), + (array('a', 'b', 'c'), 4, 'd'), + (array(), 1, 'x'), + (NULL, 1, 'x'), + (array('a', NULL, 'c'), 2, 'z'), + (array('a', 'b', 'c'), 3, NULL) + +-- column arguments with strings +query +SELECT array_insert(arr, pos, val) FROM test_array_insert_str + +-- literal string arrays +query +SELECT array_insert(array('hello', 'world'), 1, 'first') + +query +SELECT array_insert(array('hello', 'world'), 3, 'last') + +-- empty strings +query +SELECT array_insert(array('', 'a', ''), 2, '') + +-- multibyte UTF-8 characters +query +SELECT array_insert(array('hello', 'world'), 2, 'café') + +query +SELECT array_insert(array('abc', 'def'), 1, '中文') + +-- ============================================================ +-- Boolean arrays +-- ============================================================ + +statement +CREATE TABLE test_array_insert_bool(arr array<boolean>, pos int, val boolean) USING parquet + +statement +INSERT INTO test_array_insert_bool VALUES + (array(true, false, true), 2, false), + (array(true, false), 3, true), + (NULL, 1, true), + (array(true), 1, NULL) + +query +SELECT array_insert(arr, pos, val) FROM test_array_insert_bool + +query +SELECT array_insert(array(true, false, true), 3, true) + +-- ============================================================ +-- Double arrays +-- ============================================================ + +statement +CREATE TABLE test_array_insert_double(arr array<double>, pos int, val double) USING parquet + +statement +INSERT INTO test_array_insert_double VALUES + (array(1.1, 2.2, 3.3), 2, 4.4), + (array(1.1, 2.2), 3, 
0.0), + (NULL, 1, 1.0), + (array(1.1, 2.2), 1, NULL) + +query +SELECT array_insert(arr, pos, val) FROM test_array_insert_double + +-- special float values +query +SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('NaN' AS DOUBLE)) + +query +SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('Infinity' AS DOUBLE)) + +query +SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('-Infinity' AS DOUBLE)) + +-- negative zero +query +SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 1, CAST(-0.0 AS DOUBLE)) + +-- ============================================================ +-- Long arrays +-- ============================================================ + +query +SELECT array_insert(array(CAST(1 AS BIGINT), CAST(2 AS BIGINT)), 2, CAST(3 AS BIGINT)) + +-- ============================================================ +-- Short arrays +-- ============================================================ + +query +SELECT array_insert(array(CAST(1 AS SMALLINT), CAST(2 AS SMALLINT)), 2, CAST(3 AS SMALLINT)) + +-- ============================================================ +-- Byte arrays +-- ============================================================ + +query +SELECT array_insert(array(CAST(1 AS TINYINT), CAST(2 AS TINYINT)), 2, CAST(3 AS TINYINT)) + +-- ============================================================ +-- Float arrays +-- ============================================================ + +query +SELECT array_insert(array(CAST(1.1 AS FLOAT), CAST(2.2 AS FLOAT)), 2, CAST(3.3 AS FLOAT)) diff --git a/spark/src/test/resources/sql-tests/expressions/array/array_insert_legacy.sql b/spark/src/test/resources/sql-tests/expressions/array/array_insert_legacy.sql new file mode 100644 index 0000000000..9e18afda25 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/array/array_insert_legacy.sql @@ -0,0 +1,57 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or 
more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Tests for array_insert with legacy negative index mode enabled. +-- In legacy mode, -1 inserts BEFORE the last element (not after). + +-- ConfigMatrix: parquet.enable.dictionary=false,true +-- Config: spark.comet.expression.ArrayInsert.allowIncompatible=true +-- Config: spark.sql.legacy.negativeIndexInArrayInsert=true + +-- -1 inserts before last element in legacy mode +query +SELECT array_insert(array(1, 2, 3), -1, 10) + +-- -2 inserts before second-to-last +query +SELECT array_insert(array(1, 2, 3), -2, 10) + +-- -3 inserts before first element +query +SELECT array_insert(array(1, 2, 3), -3, 10) + +-- negative beyond start with null padding (legacy mode pads differently) +query +SELECT array_insert(array(1, 2, 3), -5, 10) + +-- -2 on a gapped array inserts before the second-to-last element +query +SELECT array_insert(array(1, 3, 4), -2, 2) + +-- column-based test +statement +CREATE TABLE test_ai_legacy(arr array<int>, pos int, val int) USING parquet + +statement +INSERT INTO test_ai_legacy VALUES + (array(1, 2, 3), -1, 10), + (array(4, 5), -1, 20), + (array(1, 2, 3), -4, 10), + (NULL, -1, 10) + +query +SELECT array_insert(arr, pos, val) FROM test_ai_legacy From c1d8d8cf56fd94ea45e20192c7a95714859cbb19 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Apr 2026 
17:18:41 -0600 Subject: [PATCH 2/3] chore: apply prettier and cargo fmt formatting --- docs/source/contributor-guide/expression-audit-log.md | 6 +++--- native/spark-expr/src/array_funcs/array_insert.rs | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/contributor-guide/expression-audit-log.md b/docs/source/contributor-guide/expression-audit-log.md index 0708c18f2f..088e4ea766 100644 --- a/docs/source/contributor-guide/expression-audit-log.md +++ b/docs/source/contributor-guide/expression-audit-log.md @@ -27,6 +27,6 @@ versions, reviews existing test coverage, identifies gaps, and adds missing test ## Audited Expressions -| Expression | Spark Versions Checked | Date | Findings | -|---|---|---|---| -| `array_insert` | 3.4.3, 3.5.8, 4.0.1 | 2026-04-02 | No behavioral differences across Spark versions. Fixed `nullable()` metadata bug (did not account for `pos_expr`). Added SQL tests for multiple types (string, boolean, double, float, long, short, byte), literal arguments, null handling, negative indices, out-of-bounds padding, special float values (NaN, Infinity), multibyte UTF-8, and legacy negative index mode. Known incompatibility: pos=0 error message differs from Spark's `INVALID_INDEX_OF_ZERO`. | +| Expression | Spark Versions Checked | Date | Findings | +| -------------- | ---------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `array_insert` | 3.4.3, 3.5.8, 4.0.1 | 2026-04-02 | No behavioral differences across Spark versions. Fixed `nullable()` metadata bug (did not account for `pos_expr`). 
Added SQL tests for multiple types (string, boolean, double, float, long, short, byte), literal arguments, null handling, negative indices, out-of-bounds padding, special float values (NaN, Infinity), multibyte UTF-8, and legacy negative index mode. Known incompatibility: pos=0 error message differs from Spark's `INVALID_INDEX_OF_ZERO`. | diff --git a/native/spark-expr/src/array_funcs/array_insert.rs b/native/spark-expr/src/array_funcs/array_insert.rs index 75917c9bf5..bce00483bc 100644 --- a/native/spark-expr/src/array_funcs/array_insert.rs +++ b/native/spark-expr/src/array_funcs/array_insert.rs @@ -105,8 +105,7 @@ impl PhysicalExpr for ArrayInsert { } fn nullable(&self, input_schema: &Schema) -> DataFusionResult { - Ok(self.src_array_expr.nullable(input_schema)? - || self.pos_expr.nullable(input_schema)?) + Ok(self.src_array_expr.nullable(input_schema)? || self.pos_expr.nullable(input_schema)?) } fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult { From 08c141f5209a5cec7bcdf4829670c3a05b21ea32 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 2 Apr 2026 17:21:05 -0600 Subject: [PATCH 3/3] chore: apply prettier formatting to skill file --- .claude/skills/audit-comet-expression/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.claude/skills/audit-comet-expression/SKILL.md b/.claude/skills/audit-comet-expression/SKILL.md index 9a2090af85..ab14ffe841 100644 --- a/.claude/skills/audit-comet-expression/SKILL.md +++ b/.claude/skills/audit-comet-expression/SKILL.md @@ -317,6 +317,7 @@ After completing the audit (whether or not tests were added), append a row to th `docs/source/contributor-guide/expression-audit-log.md`. The row should include: + - Expression name - Spark versions checked (e.g. 3.4.3, 3.5.8, 4.0.1) - Today's date