Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions be/src/exprs/function/array/function_array_combinations.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "common/compiler_util.h"
#include "common/status.h"
#include "core/assert_cast.h"
#include "core/column/column_array.h"
#include "core/column/column_const.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_array.h"
#include "core/data_type/data_type_decimal.h"
#include "core/data_type/data_type_nullable.h"
#include "core/types.h"
#include "exprs/function/function.h"
#include "exprs/function/function_helpers.h"
#include "exprs/function/simple_function_factory.h"

namespace doris {
// array_combinations([1, 2, 3],2) -> [[1,2], [1,3], [2,3]]
// array_combinations([1, NULL, 3, NULL, 5],4) -> [[1,NULL,3,NULL], [1,NULL,3,5], [NULL,3,NULL,5]]

class FunctionArrayCombinations : public IFunction {
public:
// SQL-visible function name; returned by get_name().
static constexpr auto name = "array_combinations";
// Factory helper: returns a new shared instance of this function.
static FunctionPtr create() { return std::make_shared<FunctionArrayCombinations>(); }
// Fixed arity (see get_number_of_arguments), so the function is not variadic.
bool is_variadic() const override { return false; }
String get_name() const override { return name; }

// Two arguments: the source array and the combination length.
size_t get_number_of_arguments() const override { return 2; }

// NOTE(review): presumably makes the IFunction base handle NULL arguments before
// execute_impl is called — confirm against IFunction.
bool use_default_implementation_for_nulls() const override { return true; }

DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
    // Result type: Array(Nullable(Array(Nullable(T)))), where T is the element
    // type of the input array (arguments[0]).
    const auto* input_array_type = assert_cast<const DataTypeArray*>(arguments[0].get());
    auto nullable_element_type = make_nullable(input_array_type->get_nested_type());
    auto combination_type = std::make_shared<DataTypeArray>(nullable_element_type);
    return std::make_shared<DataTypeArray>(make_nullable(combination_type));
}

Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                    uint32_t result, size_t input_rows_count) const override {
    // The second argument is declared always-constant
    // (get_arguments_that_are_always_constant), so row 0 carries the
    // combination length for the whole block.
    const ColumnPtr length_column = block.get_by_position(arguments[1]).column;
    const Int64 combination_length = length_column->get_int(0);

    // Reject lengths outside [1, MAX_COMBINATION_LENGTH] before touching the data.
    if (combination_length < 1 || combination_length > MAX_COMBINATION_LENGTH) {
        return Status::InvalidArgument(
                fmt::format("execute failed, function {}'s second argument must be bigger than "
                            "0 and not bigger than 5",
                            get_name()));
    }

    // Strip a possible Nullable wrapper and view the argument as an array column.
    const auto array_column = block.get_by_position(arguments[0]).column;
    const auto* src_arr = assert_cast<const ColumnArray*>(remove_nullable(array_column).get());
    const auto& offsets =
            assert_cast<const ColumnArray::ColumnOffsets&>(src_arr->get_offsets_column());

    // Only publish the result column when generation succeeded; the status
    // (OK or error) is returned either way.
    ColumnPtr combinations_column;
    Status status = vector_const(src_arr, input_rows_count, combinations_column, offsets,
                                 combination_length);
    if (status.ok()) {
        block.replace_by_position(result, std::move(combinations_column));
    }
    return status;
}

// Argument index 1 (the combination length) is declared as always constant.
ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; }

private:
// Largest accepted combination length (second argument); enforced in execute_impl.
static const size_t MAX_COMBINATION_LENGTH = 5;
// Largest number of combinations allowed for a single row; enforced in vector_const.
static const size_t MAX_COMBINATION_COUNT = 100000;

// Then combinationCount(n, k) = combinationCount(n-1, k-1) * n/k (https://en.wikipedia.org/wiki/Combination#Number_of_k-combinations)
// The formula is recursive. Here, instead of starting with k=combinationCount, n=arrayLength and recursing,
// we start with k=0 n=(arrayLength-combinationLength) and proceed "bottom up".
size_t _combination_count(size_t array_length, size_t combination_length) const {
size_t combinations = 1;

for (size_t i = 1; i <= combination_length; i++) {
combinations = combinations * (array_length - combination_length + i) / i;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Signed/unsigned comparison

Loop variable i is int (signed) but combination_length is size_t (unsigned). This triggers -Wsign-compare warnings. Consider using size_t i instead:

for (size_t i = 1; i <= combination_length; i++) {

return combinations;
}
Comment on lines +89 to +97
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we pre-calculate the max array length corresponding to each k value, rather than calculating it for each row? Like:

static constexpr size_t MAX_COMBINATION_COUNT[6] = {-1, 100000, 500, ....}
if (array_length > MAX_COMBINATION_COUNT[combination_length]) {
    ...
}

Copy link
Author

@daju233 daju233 Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's OK, but I think combination_count also has other uses, like resizing the offsets, so I don't know whether pre-calculating would actually reduce the overhead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to do that before calling _combination_count, to avoid overflow.


// Builds the initial index vector for combination enumeration:
// {0, 1, ..., combination_length - 1} followed by one extra slot holding `length`.
// _next_combination reads that extra slot as the upper bound for the last index.
ALWAYS_INLINE std::vector<size_t> _first_combination(Int64 combination_length,
                                                     size_t length) const {
    std::vector<size_t> indices;
    indices.reserve(combination_length + 1);
    for (Int64 pos = 0; pos < combination_length; ++pos) {
        indices.push_back(static_cast<size_t>(pos));
    }
    // Trailing sentinel: the array length.
    indices.push_back(length);
    return indices;
}

// Advances `comb` in place to the next k-combination in colex order.
//
// `comb` holds combination_length strictly increasing indices in
// comb[0 .. combination_length - 1]. comb[combination_length] is read as well
// (it bounds the last index), so the vector must have at least
// combination_length + 1 entries.
//
// Scan from the lowest position upward and increment the first index that can
// grow without reaching its right-hand neighbour, i.e. without breaking the
// strictly increasing invariant. All lower positions are then reset to their
// minimal values (0, 1, 2, ...), which makes the result the smallest
// combination greater than the current one.
//
// Returns true if `comb` was advanced, false if no position could be
// incremented (the current combination is the last one).
bool _next_combination(std::vector<size_t>& comb, Int64 combination_length) const {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a comment to explain for this function

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

explain why this function could generate next comb

for (size_t i = 0; i < combination_length; ++i) {
// Position i can grow only if comb[i] + 1 stays below the next entry.
if (comb[i] + 1 < comb[i + 1]) {
++comb[i];
// Reset positions [0, i) to 0, 1, ..., i - 1.
std::iota(comb.begin(), comb.begin() + i, 0);
return true;
}
}
return false;
}

// Generates, for every input row, all combinations of `combination_length`
// elements of that row's array and stores them in `res`.
//
// nested_src_column_ptr: source array column (execute_impl passes it with any
//                        Nullable wrapper already removed).
// input_rows_count:      number of rows to process.
// res:                   output; on success set to an array column whose elements
//                        are nullable arrays (one inner array per combination).
// offsets:               offsets column of the source array.
// combination_length:    number of elements in each combination.
//
// A row whose array has fewer than combination_length elements produces an
// empty outer array. Returns InvalidArgument when a single row would produce
// more than MAX_COMBINATION_COUNT combinations; otherwise Status::OK().
Status vector_const(const ColumnArray* nested_src_column_ptr, size_t input_rows_count,
ColumnPtr& res, const ColumnArray::ColumnOffsets& offsets,
Int64 combination_length) const {
const auto& src_data = nested_src_column_ptr->get_data();
const auto& src_offsets = offsets.get_data();

// Inner array column: one entry per generated combination, with an element
// column cloned (empty) from the source element column.
auto inner_data = src_data.clone_empty();
auto inner_offsets = ColumnArray::ColumnOffsets::create();
auto inner_arr = ColumnArray::create(std::move(inner_data), std::move(inner_offsets));
auto* inner = assert_cast<ColumnArray*>(inner_arr.get());

// Outer offsets: one entry per input row, counting combinations emitted so far.
auto outer_offsets = ColumnArray::ColumnOffsets::create();
auto& outer_offsets_data = outer_offsets->get_data();
outer_offsets_data.resize(input_rows_count);

// prev_off: start of the current row inside src_data.
// outer_off: running total of combinations written to the inner column.
size_t prev_off = 0, outer_off = 0;

for (size_t row = 0; row < input_rows_count; ++row) {
size_t curr_off = src_offsets[row];
size_t row_len = curr_off - prev_off;

// Not enough elements for a single combination: emit an empty outer array.
if (combination_length <= 0 || combination_length > row_len) {
outer_offsets_data[row] = outer_off;
prev_off = curr_off;
continue;
}

size_t combination_count = _combination_count(row_len, combination_length);
if (combination_count > MAX_COMBINATION_COUNT) {
return Status::InvalidArgument(
fmt::format("execute failed, function {}'s total size of sub-groups "
"generated must be smaller than 100,000",
get_name()));
}
std::vector comb = _first_combination(combination_length, row_len);
// Reserve room for this row's output up front.
inner->get_data().reserve(inner->get_data().size() +
combination_count * combination_length);
inner->get_offsets().reserve(inner->get_offsets().size() + combination_count);
outer_off += combination_count;
// Emit the current combination, then advance until none is left.
do {
for (int i = 0; i < combination_length; ++i) {
// comb[i] is relative to the row; prev_off turns it into an index into src_data.
size_t idx = prev_off + comb[i];
inner->get_data().insert_from(src_data, idx);
}
inner->get_offsets().push_back(inner->get_data().size());
} while (_next_combination(comb, combination_length));

outer_offsets_data[row] = outer_off;
prev_off = curr_off;
}
// Read the size before inner_arr is moved from.
size_t inner_size = inner_arr->size();
// Wrap the inner arrays as nullable with an all-zero null map.
auto nullable_arr =
ColumnNullable::create(std::move(inner_arr), ColumnUInt8::create(inner_size, 0));
res = ColumnArray::create(std::move(nullable_arr), std::move(outer_offsets));

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug (Undefined Behavior): Use-after-move

std::move(inner_arr) is in the first argument and inner_arr->size() is in the second argument of the same function call. C++ does not guarantee evaluation order of function arguments. If the move is evaluated first, inner_arr becomes null and ->size() is a null pointer dereference.

Fix:

auto inner_size = inner_arr->size();
auto nullable_arr = ColumnNullable::create(std::move(inner_arr),
                                           ColumnUInt8::create(inner_size, 0));

return Status::OK();
}
};

// Registers FunctionArrayCombinations with the given function factory.
void register_function_array_combinations(SimpleFunctionFactory& factory) {
factory.register_function<FunctionArrayCombinations>();
}
} // namespace doris
2 changes: 2 additions & 0 deletions be/src/exprs/function/array/function_array_register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ void register_function_array_filter_function(SimpleFunctionFactory&);
void register_function_array_splits(SimpleFunctionFactory&);
void register_function_array_contains_all(SimpleFunctionFactory&);
void register_function_array_match(SimpleFunctionFactory&);
void register_function_array_combinations(SimpleFunctionFactory&);

void register_function_array(SimpleFunctionFactory& factory) {
register_function_array_flatten(factory);
Expand Down Expand Up @@ -95,6 +96,7 @@ void register_function_array(SimpleFunctionFactory& factory) {
register_function_array_splits(factory);
register_function_array_contains_all(factory);
register_function_array_match(factory);
register_function_array_combinations(factory);
}

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Array;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayApply;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayAvg;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCombinations;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCompact;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayConcat;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayContains;
Expand Down Expand Up @@ -623,6 +624,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(ArrayFirst.class, "array_first"),
scalar(ArrayFirstIndex.class, "array_first_index"),
scalar(ArrayFlatten.class, "array_flatten"),
scalar(ArrayCombinations.class, "array_combinations"),
scalar(ArrayIntersect.class, "array_intersect"),
scalar(ArrayJoin.class, "array_join"),
scalar(ArrayLast.class, "array_last"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral;
import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.ArrayType;
import org.apache.doris.nereids.types.BigIntType;
import org.apache.doris.nereids.types.coercion.AnyDataType;
import org.apache.doris.nereids.types.coercion.FollowToAnyDataType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * ScalarFunction 'array_combinations': takes an array and an integer length and
 * returns an array of arrays with the same element type as the input array.
 */
public class ArrayCombinations extends ScalarFunction
        implements ExplicitlyCastableSignature, PropagateNullable {

    /** Single signature: (ARRAY&lt;T&gt;, BIGINT) -&gt; ARRAY&lt;ARRAY&lt;T&gt;&gt;. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(ArrayType.of(ArrayType.of(new FollowToAnyDataType(0))))
                    .args(ArrayType.of(new AnyDataType(0)), BigIntType.INSTANCE));

    /**
     * constructor with 2 arguments.
     */
    public ArrayCombinations(Expression arg0, Expression arg1) {
        super("array_combinations", arg0, arg1);
    }

    /**
     * withChildren.
     */
    @Override
    public ArrayCombinations withChildren(List<Expression> children) {
        Preconditions.checkArgument(children.size() == 2);
        Expression arrayArg = children.get(0);
        Expression lengthArg = children.get(1);
        return new ArrayCombinations(arrayArg, lengthArg);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    @Override
    public void checkLegalityBeforeTypeCoercion() {
        // The second argument must be an integer-like literal or a NULL literal.
        Expression lengthArg = child(1);
        boolean isAcceptedLiteral = lengthArg instanceof IntegerLikeLiteral
                || lengthArg instanceof NullLiteral;
        if (isAcceptedLiteral) {
            return;
        }
        throw new AnalysisException("Array_Combinations's second argument must be a constant literal.");
    }

    @Override
    public void checkLegalityAfterRewrite() {
        // Same requirement applies after rewrite.
        checkLegalityBeforeTypeCoercion();
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitArrayCombinations(this, context);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Array;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayApply;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayAvg;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCombinations;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayCompact;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayConcat;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ArrayContains;
Expand Down Expand Up @@ -823,6 +824,10 @@ default R visitArrayFlatten(ArrayFlatten arrayFlatten, C context) {
return visitScalarFunction(arrayFlatten, context);
}

// Default handling for array_combinations: delegate to the generic scalar-function visit.
default R visitArrayCombinations(ArrayCombinations arrayCombinations, C context) {
return visitScalarFunction(arrayCombinations, context);
}

default R visitArrayMap(ArrayMap arrayMap, C context) {
return visitScalarFunction(arrayMap, context);
}
Expand Down
17 changes: 17 additions & 0 deletions regression-test/data/nereids_function_p0/scalar_function/Array.out
Original file line number Diff line number Diff line change
Expand Up @@ -16128,3 +16128,20 @@ false false
-- !sql --
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

-- !sql --
[["foo", "bar"], ["foo", "baz"], ["bar", "baz"]]

-- !sql --
[[1, 2], [1, 3], [2, 3]]

-- !sql --
[[1, 2], [1, 2], [2, 2]]

-- !sql --
[]

-- !sql --
[[null], [null]]

-- !sql --
[[[[1, 2, 3, 4, 5]], [[6, 7], [8, 9]]]]
Loading
Loading