Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
768b3e9
impl map_from_entries
Dec 14, 2025
c68c342
Revert "impl map_from_entries"
Dec 16, 2025
d887555
Merge branch 'apache:main' into main
kazantsev-maksim Dec 16, 2025
231aa90
Merge branch 'apache:main' into main
kazantsev-maksim Dec 17, 2025
9500bbb
Merge branch 'apache:main' into main
kazantsev-maksim Dec 24, 2025
9577481
Merge branch 'apache:main' into main
kazantsev-maksim Dec 28, 2025
3791557
Merge branch 'apache:main' into main
kazantsev-maksim Jan 2, 2026
7c2f082
Merge branch 'apache:main' into main
kazantsev-maksim Jan 3, 2026
609a605
Merge branch 'apache:main' into main
kazantsev-maksim Jan 6, 2026
a151b2c
Merge branch 'apache:main' into main
kazantsev-maksim Jan 7, 2026
ad3e7f5
Merge branch 'apache:main' into main
kazantsev-maksim Jan 10, 2026
ea92e4b
Merge branch 'apache:main' into main
kazantsev-maksim Jan 14, 2026
8dfeca3
Merge branch 'apache:main' into main
kazantsev-maksim Jan 17, 2026
559741e
Merge branch 'apache:main' into main
kazantsev-maksim Jan 20, 2026
ebda14e
Merge branch 'apache:main' into main
kazantsev-maksim Jan 21, 2026
408152e
Merge branch 'apache:main' into main
kazantsev-maksim Jan 23, 2026
d7857b2
Merge branch 'apache:main' into main
kazantsev-maksim Jan 24, 2026
aef41be
Merge branch 'apache:main' into main
kazantsev-maksim Jan 29, 2026
5ac1c58
Merge branch 'apache:main' into main
kazantsev-maksim Jan 30, 2026
9ae8e23
Merge branch 'apache:main' into main
kazantsev-maksim Feb 1, 2026
5ca3888
Merge branch 'apache:main' into main
kazantsev-maksim Feb 4, 2026
160a817
Merge branch 'apache:main' into main
kazantsev-maksim Feb 5, 2026
88fc313
Merge branch 'apache:main' into main
kazantsev-maksim Feb 7, 2026
e14c180
Merge branch 'apache:main' into main
kazantsev-maksim Feb 13, 2026
610a885
Merge branch 'apache:main' into main
kazantsev-maksim Feb 20, 2026
f8acb2c
Merge branch 'apache:main' into main
kazantsev-maksim Feb 21, 2026
ec94897
Merge branch 'apache:main' into main
kazantsev-maksim Feb 26, 2026
43405e4
Merge branch 'apache:main' into main
kazantsev-maksim Feb 27, 2026
47b4915
Merge branch 'apache:main' into main
kazantsev-maksim Mar 1, 2026
26e2682
Merge branch 'apache:main' into main
kazantsev-maksim Mar 3, 2026
6cb5f07
Merge branch 'apache:main' into main
kazantsev-maksim Mar 4, 2026
ec194fb
Merge branch 'apache:main' into main
kazantsev-maksim Mar 31, 2026
256fccb
Merge branch 'apache:main' into main
kazantsev-maksim Apr 3, 2026
912c8f9
Merge branch 'apache:main' into main
kazantsev-maksim Apr 3, 2026
561a664
Merge branch 'apache:main' into main
kazantsev-maksim Apr 8, 2026
d926ef4
Merge branch 'apache:main' into main
kazantsev-maksim Apr 11, 2026
671412c
Merge branch 'apache:main' into main
kazantsev-maksim Apr 17, 2026
c9f52d1
Merge branch 'apache:main' into main
kazantsev-maksim Apr 22, 2026
67f72d9
Merge branch 'apache:main' into main
kazantsev-maksim Apr 23, 2026
314e594
Merge branch 'apache:main' into main
kazantsev-maksim Apr 24, 2026
ac8292f
Merge branch 'apache:main' into main
kazantsev-maksim May 1, 2026
c9c140e
Merge branch 'apache:main' into main
kazantsev-maksim May 7, 2026
decca58
Merge branch 'apache:main' into main
kazantsev-maksim May 13, 2026
0919b33
Merge branch 'apache:main' into main
kazantsev-maksim May 16, 2026
21a5771
WORK
May 19, 2026
7495e21
Merge branch 'apache:main' into main
kazantsev-maksim May 19, 2026
57076f4
feat: impl json_array_length
May 19, 2026
0dfa19c
Merge branch 'main' into json_array_length
kazantsev-maksim May 19, 2026
0a37a60
Merge branch 'apache:main' into main
kazantsev-maksim May 21, 2026
060bf07
Merge remote-tracking branch 'refs/remotes/origin/main' into json_arr…
May 21, 2026
abbba84
Merge branch 'apache:main' into main
kazantsev-maksim May 25, 2026
e65284f
Merge remote-tracking branch 'refs/remotes/origin/main' into json_arr…
May 25, 2026
678e417
fix
May 25, 2026
88b5d71
fix PR issues
May 26, 2026
690f79d
Merge remote-tracking branch 'origin/json_array_length' into json_arr…
May 26, 2026
d4936a9
fix PR issues
May 26, 2026
aaf5509
Merge branch 'main' into json_array_length
andygrove May 26, 2026
09dc0d4
fix PR issues
May 27, 2026
2c5e2cf
Merge remote-tracking branch 'origin/json_array_length' into json_arr…
May 27, 2026
a0fad37
Merge branch 'main' into json_array_length
kazantsev-maksim May 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions native/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions native/spark-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ num = { workspace = true }
regex = { workspace = true }
# preserve_order: needed for get_json_object to match Spark's JSON key ordering
serde_json = { version = "1.0", features = ["preserve_order"] }
serde = { version = "1.0", features = ["derive"] }
datafusion-comet-common = { workspace = true }
datafusion-comet-jni-bridge = { workspace = true }
jni = "0.22.4"
Expand Down
2 changes: 2 additions & 0 deletions native/spark-expr/src/comet_scalar_funcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

use crate::hash_funcs::*;
use crate::json_funcs::JsonArrayLength;
use crate::map_funcs::spark_map_sort;
use crate::math_funcs::abs::abs;
use crate::math_funcs::checked_arithmetic::{checked_add, checked_div, checked_mul, checked_sub};
Expand Down Expand Up @@ -221,6 +222,7 @@ fn all_scalar_functions() -> Vec<Arc<ScalarUDF>> {
Arc::new(ScalarUDF::new_from_impl(SparkMakeTime::default())),
Arc::new(ScalarUDF::new_from_impl(SparkSecondsToTimestamp::default())),
Arc::new(ScalarUDF::new_from_impl(SparkSizeFunc::default())),
Arc::new(ScalarUDF::new_from_impl(JsonArrayLength::default())),
]
}

Expand Down
159 changes: 159 additions & 0 deletions native/spark-expr/src/json_funcs/json_array_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Array, ArrayRef, Int32Builder, OffsetSizeTrait};
use arrow::datatypes::DataType;
use datafusion::common::cast::as_generic_string_array;
use datafusion::common::{exec_err, Result, ScalarValue};
use datafusion::logical_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
};

use std::any::Any;

use serde::de::{IgnoredAny, SeqAccess, Visitor};
use serde::Deserializer;
use std::fmt;
use std::sync::Arc;

/// Scalar UDF named `json_array_length`.
///
/// Accepts one string argument (`Utf8` or `LargeUtf8`) and produces an
/// `Int32` holding the number of top-level elements of the JSON array in that
/// string, or NULL when the input is NULL or cannot be read as a JSON array.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct JsonArrayLength {
    /// Accepted argument types and volatility, exposed through
    /// `ScalarUDFImpl::signature`.
    signature: Signature,
}

impl Default for JsonArrayLength {
    /// Equivalent to [`JsonArrayLength::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl JsonArrayLength {
    /// Builds the UDF with a signature of exactly one `Utf8`/`LargeUtf8`
    /// argument and `Immutable` volatility.
    pub fn new() -> Self {
        Self {
            // Exactly one argument: the implementation rejects any other
            // arity at execution time, so the signature declares arity 1
            // instead of a variadic (one-or-more) argument list.
            signature: Signature::uniform(
                1,
                vec![DataType::Utf8, DataType::LargeUtf8],
                Volatility::Immutable,
            ),
        }
    }
}

impl ScalarUDFImpl for JsonArrayLength {
fn as_any(&self) -> &dyn Any {
self
}

fn name(&self) -> &str {
"json_array_length"
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
Ok(DataType::Int32)
}

fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
spark_json_array_length(&args.args)
}
}

/// Evaluates `json_array_length` for a single argument.
///
/// An array argument yields an array result and a scalar argument yields a
/// scalar result. Any argument count other than one is an execution error.
fn spark_json_array_length(args: &[ColumnarValue]) -> Result<ColumnarValue> {
    match args {
        [ColumnarValue::Array(values)] => {
            spark_json_array_length_array(values).map(ColumnarValue::Array)
        }
        [ColumnarValue::Scalar(value)] => {
            spark_json_array_length_scalar(value).map(ColumnarValue::Scalar)
        }
        // Both variants are handled above for the one-argument case, so this
        // arm is only reached when the argument count is not exactly one.
        _ => exec_err!("json_array_length function takes exactly one argument"),
    }
}

/// Dispatches an array argument on its string flavour: `Utf8` uses `i32`
/// offsets and `LargeUtf8` uses `i64` offsets. Any other data type is an
/// execution error.
fn spark_json_array_length_array(array: &ArrayRef) -> Result<ArrayRef> {
    let input_type = array.data_type();
    if *input_type == DataType::Utf8 {
        return spark_json_array_length_array_inner::<i32>(array);
    }
    if *input_type == DataType::LargeUtf8 {
        return spark_json_array_length_array_inner::<i64>(array);
    }
    exec_err!("Unsupported data type {input_type:?} for function `json_array_length`")
}

/// Handles a scalar argument: `Utf8` and `LargeUtf8` scalars are evaluated,
/// every other scalar type is an execution error.
fn spark_json_array_length_scalar(scalar: &ScalarValue) -> Result<ScalarValue> {
    // Both string variants carry an `Option<String>`, so they share one path.
    let json_text = match scalar {
        ScalarValue::Utf8(text) | ScalarValue::LargeUtf8(text) => text,
        unsupported => {
            return exec_err!(
                "Unsupported data type {unsupported:?} for function `json_array_length`"
            );
        }
    };
    spark_json_array_length_scalar_inner(json_text)
}

fn spark_json_array_length_scalar_inner(json_str: &Option<String>) -> Result<ScalarValue> {
let array_length = json_str
.clone()
.and_then(|json_str| get_json_array_length(&json_str));
Ok(ScalarValue::Int32(array_length))
}

fn spark_json_array_length_array_inner<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
let str_array = as_generic_string_array::<T>(array)?;
let mut builder = Int32Builder::with_capacity(str_array.len());
for row_idx in 0..str_array.len() {
if str_array.is_null(row_idx) {
builder.append_null();
} else {
let json_str = str_array.value(row_idx);
if let Some(json_array_length) = get_json_array_length(json_str) {
builder.append_value(json_array_length);
} else {
builder.append_null()
}
}
}
Ok(Arc::new(builder.finish()))
}

struct ArrayItemCounter;

impl<'de> Visitor<'de> for ArrayItemCounter {
type Value = i32;

fn expecting(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str("a JSON array")
}

fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let mut len = 0i32;
while seq.next_element::<IgnoredAny>()?.is_some() {
len += 1;
}
Ok(len)
}
}

/// Returns the number of top-level elements when `json` deserializes as a
/// sequence, or `None` when deserialization fails for any reason.
fn get_json_array_length(json: &str) -> Option<i32> {
    let mut json_reader = serde_json::Deserializer::from_str(json);
    match json_reader.deserialize_seq(ArrayItemCounter) {
        Ok(element_count) => Some(element_count),
        // The error detail is intentionally dropped; callers only see `None`.
        Err(_) => None,
    }
}
2 changes: 2 additions & 0 deletions native/spark-expr/src/json_funcs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
// under the License.

mod from_json;
mod json_array_length;
mod to_json;

pub use from_json::FromJson;
pub use json_array_length::JsonArrayLength;
pub use to_json::ToJson;
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
classOf[Cast] -> CometCast)

// Spark JSON expression classes mapped to the Comet serde handler for each.
private val jsonExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
classOf[LengthOfJsonArray] -> CometLengthOfJsonArray)

private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
// TODO PromotePrecision
classOf[Alias] -> CometAlias,
Expand All @@ -291,7 +294,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
mathExpressions ++ hashExpressions ++ stringExpressions ++
conditionalExpressions ++ mapExpressions ++ predicateExpressions ++
structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++
temporalExpressions ++ conversionExpressions ++ urlExpressions
temporalExpressions ++ conversionExpressions ++ urlExpressions ++ jsonExpressions

/**
* Mapping of Spark aggregate expression class to Comet expression handler.
Expand Down
36 changes: 36 additions & 0 deletions spark/src/main/scala/org/apache/comet/serde/json.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.serde

import org.apache.spark.sql.catalyst.expressions.LengthOfJsonArray

/**
 * Serde handler for Spark's `LengthOfJsonArray`, declared as the scalar function
 * `json_array_length`.
 *
 * Every `LengthOfJsonArray` expression is reported as `Incompatible`, with
 * `IncompatibleReason` as the explanation.
 */
object CometLengthOfJsonArray
extends CometScalarFunction[LengthOfJsonArray]("json_array_length") {

private val IncompatibleReason: String =
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"Spark's lenient JSON parser allows single quotes, unescaped controls, " +
"and trailing content, " +
"while Comet's serde_json requires strict JSON."

override def getIncompatibleReasons(): Seq[String] = Seq(IncompatibleReason)

override def getSupportLevel(expr: LengthOfJsonArray): SupportLevel = Incompatible(
Some(IncompatibleReason))
}
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,20 @@ trait CometExprShim extends CommonStringExprs {
case _ => None
}

// A StaticInvoke of JsonExpressionUtils.lengthOfJsonArray with a single argument is
// rebuilt as a LengthOfJsonArray expression and serialized through the regular path.
// NOTE(review): presumably Spark itself rewrites LengthOfJsonArray into this
// StaticInvoke before Comet sees the plan -- confirm.
case s: StaticInvoke =>
(s.staticObject, s.functionName, s.arguments) match {
case (cls, "lengthOfJsonArray", Seq(child)) if cls == classOf[JsonExpressionUtils] =>
val lengthOfJsonArray = LengthOfJsonArray(child)
val exprProto = exprToProtoInternal(lengthOfJsonArray, inputs, binding)
if (exprProto.isEmpty) {
// Serialization failed: copy the explain-info tag recorded on the rebuilt
// expression onto the original StaticInvoke node.
lengthOfJsonArray
.getTagValue(CometExplainInfo.EXTENSION_INFO)
.foreach(reasons => s.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons))
}
exprProto
// Any other StaticInvoke is not handled here.
case _ => None
}

case ms: MapSort =>
val keyType = ms.dataType.asInstanceOf[MapType].keyType
if (!supportedScalarSortElementType(keyType)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ package org.apache.comet.shims

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.Sum
import org.apache.spark.sql.catalyst.expressions.json.StructsToJsonEvaluator
import org.apache.spark.sql.catalyst.expressions.json.{JsonExpressionUtils, StructsToJsonEvaluator}
import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, StaticInvoke}
import org.apache.spark.sql.catalyst.expressions.url.ParseUrlEvaluator
import org.apache.spark.sql.catalyst.util.DateTimeUtils
Expand Down Expand Up @@ -191,6 +191,20 @@ trait CometExprShim extends CommonStringExprs {
case _ => None
}

// A StaticInvoke of JsonExpressionUtils.lengthOfJsonArray with a single argument is
// rebuilt as a LengthOfJsonArray expression and serialized through the regular path.
// NOTE(review): presumably Spark itself rewrites LengthOfJsonArray into this
// StaticInvoke before Comet sees the plan -- confirm.
case s: StaticInvoke =>
(s.staticObject, s.functionName, s.arguments) match {
case (cls, "lengthOfJsonArray", Seq(child)) if cls == classOf[JsonExpressionUtils] =>
val lengthOfJsonArray = LengthOfJsonArray(child)
val exprProto = exprToProtoInternal(lengthOfJsonArray, inputs, binding)
if (exprProto.isEmpty) {
// Serialization failed: copy the explain-info tag recorded on the rebuilt
// expression onto the original StaticInvoke node.
lengthOfJsonArray
.getTagValue(CometExplainInfo.EXTENSION_INFO)
.foreach(reasons => s.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons))
}
exprProto
// Any other StaticInvoke is not handled here.
case _ => None
}

case ms: MapSort =>
val keyType = ms.dataType.asInstanceOf[MapType].keyType
if (!supportedScalarSortElementType(keyType)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ trait CometExprShim extends CommonStringExprs {
case _ => None
}

// A StaticInvoke of JsonExpressionUtils.lengthOfJsonArray with a single argument is
// rebuilt as a LengthOfJsonArray expression and serialized through the regular path.
// NOTE(review): presumably Spark itself rewrites LengthOfJsonArray into this
// StaticInvoke before Comet sees the plan -- confirm.
case s: StaticInvoke =>
(s.staticObject, s.functionName, s.arguments) match {
case (cls, "lengthOfJsonArray", Seq(child)) if cls == classOf[JsonExpressionUtils] =>
val lengthOfJsonArray = LengthOfJsonArray(child)
val exprProto = exprToProtoInternal(lengthOfJsonArray, inputs, binding)
if (exprProto.isEmpty) {
// Serialization failed: copy the explain-info tag recorded on the rebuilt
// expression onto the original StaticInvoke node.
lengthOfJsonArray
.getTagValue(CometExplainInfo.EXTENSION_INFO)
.foreach(reasons => s.setTagValue(CometExplainInfo.EXTENSION_INFO, reasons))
}
exprProto
// Any other StaticInvoke is not handled here.
case _ => None
}

case ms: MapSort =>
val keyType = ms.dataType.asInstanceOf[MapType].keyType
if (!supportedScalarSortElementType(keyType)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- Fixture: one string column holding well-formed arrays (flat, nested, mixed types),
-- malformed JSON, JSON objects, empty/blank strings and NULL.
statement
CREATE TABLE test_json_array_length(j string) USING parquet

statement
INSERT INTO test_json_array_length VALUES
('[1,2,3,4]'),
('[]'),
('[1]'),
(NULL),
('[1,2,3,{"f1":1,"f2":[5,6]},4]'),
('[[1,2],[3,4],[5,6]]'),
('[{"a":1},{"b":2},{"c":3}]'),
('[1,2'),
('[1,2,3,]'),
('not a json'),
('{"object": "not array"}'),
(''),
(' '),
('[true, false, null]'),
('["string1", "string2", "string3"]'),
('[1, "mixed", true, null, {"key":"value"}]'),
('[1,2,3,4,5,6,7,8,9,10]'),
('["line1\nline2", "tab\tseparated", "quote\"here"]'),
('{"outer": [1,2,3], "inner": [[1,2],[3,4]]}'),
('{"arrays": {"first": [1,2], "second": [3,4,5]}}'),
('[{"arr": [1,2,3]}, {"arr": [4,5]}]')

-- Column input: evaluates the function over every fixture row.
query spark_answer_only
SELECT json_array_length(j) FROM test_json_array_length

-- Literal input: well-formed array.
query spark_answer_only
SELECT json_array_length('[1,2,3,4]')

-- Literal input: text that is not JSON.
query spark_answer_only
SELECT json_array_length('not an array')

-- Literal input: a JSON object rather than an array.
query spark_answer_only
SELECT json_array_length('{"key":"value"}')
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add examples of the incompatible behavior, such as single quotes around keys and values?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added


-- Literal NULL input.
query spark_answer_only
SELECT json_array_length(NULL)

-- Literal empty array.
query spark_answer_only
SELECT json_array_length('[]')

-- Single-quoted JSON keys and values: the directive expects a fallback with the
-- reason text given in parentheses.
query expect_fallback(Spark's lenient JSON parser allows single quotes, unescaped controls, and trailing content, while Comet's serde_json requires strict JSON.)
SELECT json_array_length("[{'key':'value'}]")
Loading