From ba2d7758b4eb518ce72aad8859c4a6e7b139627b Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Thu, 2 Apr 2026 16:11:39 -0600
Subject: [PATCH] test: improve array_distinct test coverage and
 incompatibility description

Expand SQL file tests for array_distinct from 2 queries on array<int>
to comprehensive coverage across int, bigint, string, boolean, double,
float, decimal, and nested array types. Add edge case coverage for NULL
handling, NaN/Infinity deduplication, boundary values, and negative
zero.

Add descriptive reason to the Incompatible support level so users
understand that output elements are sorted rather than preserving
insertion order.
---
Reviewer note (ignored by git-am, not part of the commit): this patch
was re-flowed from a whitespace-collapsed copy in which every
angle-bracketed token had been stripped. Array element types
(array<int>, array<bigint>, array<string>, array<boolean>,
array<double>, array<float>) were restored from each section's name and
values. The decimal precision/scale (10,2), the leading indentation of
Scala context lines and SQL continuation rows, and the author e-mail
address (still missing from the From: header) are NOT recoverable from
that copy -- TODO confirm against the upstream commit before applying.

 .../scala/org/apache/comet/serde/arrays.scala |   3 +-
 .../expressions/array/array_distinct.sql      | 154 +++++++++++++++++-
 2 files changed, 153 insertions(+), 4 deletions(-)

diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala
index 47a6e91421..efff731eec 100644
--- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala
@@ -189,7 +189,8 @@ object CometArrayContains extends CometExpressionSerde[ArrayContains] {
 
 object CometArrayDistinct extends CometExpressionSerde[ArrayDistinct] {
 
-  override def getSupportLevel(expr: ArrayDistinct): SupportLevel = Incompatible(None)
+  override def getSupportLevel(expr: ArrayDistinct): SupportLevel =
+    Incompatible(Some("Output elements are sorted rather than preserving insertion order"))
 
   override def convert(
       expr: ArrayDistinct,
diff --git a/spark/src/test/resources/sql-tests/expressions/array/array_distinct.sql b/spark/src/test/resources/sql-tests/expressions/array/array_distinct.sql
index 43c18e9889..f9d63df075 100644
--- a/spark/src/test/resources/sql-tests/expressions/array/array_distinct.sql
+++ b/spark/src/test/resources/sql-tests/expressions/array/array_distinct.sql
@@ -17,15 +17,163 @@
 
 -- ConfigMatrix: parquet.enable.dictionary=false,true
 
+-- ===== INT arrays =====
+
 statement
-CREATE TABLE test_array_distinct(arr array<int>) USING parquet
+CREATE TABLE test_array_distinct_int(arr array<int>) USING parquet
 
 statement
-INSERT INTO test_array_distinct VALUES (array(1, 2, 2, 3, 3)), (array()), (NULL), (array(NULL, 1, NULL, 2)), (array(1))
+INSERT INTO test_array_distinct_int VALUES
+  (array(1, 2, 2, 3, 3)),
+  (array()),
+  (NULL),
+  (array(NULL, 1, NULL, 2)),
+  (array(1)),
+  (array(NULL, NULL, NULL)),
+  (array(-2147483648, 2147483647, -2147483648, 0)),
+  (array(0, -1, -1, 0, 1))
 
+-- column argument
 query spark_answer_only
-SELECT array_distinct(arr) FROM test_array_distinct
+SELECT array_distinct(arr) FROM test_array_distinct_int
 
 -- literal arguments
 query spark_answer_only
 SELECT array_distinct(array(1, 2, 2, 3, 3))
+
+-- all NULLs
+query spark_answer_only
+SELECT array_distinct(array(CAST(NULL AS INT), CAST(NULL AS INT)))
+
+-- NULL input
+query spark_answer_only
+SELECT array_distinct(CAST(NULL AS array<int>))
+
+-- boundary values
+query spark_answer_only
+SELECT array_distinct(array(-2147483648, 2147483647, -2147483648, 2147483647, 0))
+
+-- ===== LONG arrays =====
+
+statement
+CREATE TABLE test_array_distinct_long(arr array<bigint>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_long VALUES
+  (array(1, 2, 2, 3, 3)),
+  (NULL),
+  (array(NULL, 1, NULL, 2)),
+  (array(-9223372036854775808, 9223372036854775807, -9223372036854775808))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_long
+
+-- boundary values
+query spark_answer_only
+SELECT array_distinct(array(CAST(-9223372036854775808 AS BIGINT), CAST(9223372036854775807 AS BIGINT), CAST(-9223372036854775808 AS BIGINT)))
+
+-- ===== STRING arrays =====
+
+statement
+CREATE TABLE test_array_distinct_string(arr array<string>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_string VALUES
+  (array('b', 'a', 'a', 'c', 'b')),
+  (array('')),
+  (NULL),
+  (array(NULL, 'a', NULL, 'a')),
+  (array('', '', NULL, '')),
+  (array('hello', 'world', 'hello'))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_string
+
+-- empty string and NULL distinction
+query spark_answer_only
+SELECT array_distinct(array('', NULL, '', NULL, 'a'))
+
+-- ===== BOOLEAN arrays =====
+
+statement
+CREATE TABLE test_array_distinct_bool(arr array<boolean>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_bool VALUES
+  (array(true, false, false, true)),
+  (array(true, true)),
+  (NULL),
+  (array(NULL, true, NULL, false))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_bool
+
+-- ===== DOUBLE arrays =====
+
+statement
+CREATE TABLE test_array_distinct_double(arr array<double>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_double VALUES
+  (array(1.123, 0.1234, 1.121, 1.123, 0.1234)),
+  (NULL),
+  (array(NULL, 1.0, NULL, 2.0))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_double
+
+-- NaN deduplication
+query spark_answer_only
+SELECT array_distinct(array(CAST('NaN' AS DOUBLE), CAST('NaN' AS DOUBLE), 1.0, 1.0))
+
+-- NaN with NULL
+query spark_answer_only
+SELECT array_distinct(array(CAST('NaN' AS DOUBLE), NULL, CAST('NaN' AS DOUBLE), NULL, 1.0))
+
+-- Infinity
+query spark_answer_only
+SELECT array_distinct(array(CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('Infinity' AS DOUBLE), 0.0))
+
+-- negative zero
+query spark_answer_only
+SELECT array_distinct(array(0.0, -0.0, 1.0))
+
+-- ===== FLOAT arrays =====
+
+statement
+CREATE TABLE test_array_distinct_float(arr array<float>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_float VALUES
+  (array(CAST(1.123 AS FLOAT), CAST(0.1234 AS FLOAT), CAST(1.121 AS FLOAT), CAST(1.123 AS FLOAT))),
+  (NULL),
+  (array(CAST(NULL AS FLOAT), CAST(1.0 AS FLOAT), CAST(NULL AS FLOAT)))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_float
+
+-- Float NaN deduplication
+query spark_answer_only
+SELECT array_distinct(array(CAST('NaN' AS FLOAT), CAST('NaN' AS FLOAT), CAST(1.0 AS FLOAT)))
+
+-- ===== DECIMAL arrays =====
+
+statement
+CREATE TABLE test_array_distinct_decimal(arr array<decimal(10,2)>) USING parquet
+
+statement
+INSERT INTO test_array_distinct_decimal VALUES
+  (array(1.10, 2.20, 1.10, 3.30)),
+  (NULL),
+  (array(NULL, 1.10, NULL, 1.10))
+
+query spark_answer_only
+SELECT array_distinct(arr) FROM test_array_distinct_decimal
+
+-- ===== Nested array (array of arrays) =====
+
+query spark_answer_only
+SELECT array_distinct(array(array(1, 2), array(3, 4), array(1, 2), array(3, 4)))
+
+query spark_answer_only
+SELECT array_distinct(array(array(1, 2), CAST(NULL AS array<int>), array(1, 2), CAST(NULL AS array<int>)))