From b5ebc1f81506c97368d55b76d0682f9187a4e150 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 14 Nov 2025 14:21:15 -0800 Subject: [PATCH 1/4] Support split eval function Signed-off-by: Kai Huang # Conflicts: # core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java # integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java # ppl/src/main/antlr/OpenSearchPPLLexer.g4 # ppl/src/main/antlr/OpenSearchPPLParser.g4 # ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java # ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java --- .../function/BuiltinFunctionName.java | 1 + .../CollectionUDF/SplitFunctionImp.java | 73 +++++++++++ .../expression/function/PPLFuncImpTable.java | 8 ++ docs/user/ppl/functions/collection.rst | 54 ++++++++ .../remote/CalciteArrayFunctionIT.java | 123 ++++++++++++++++++ ppl/src/main/antlr/OpenSearchPPLLexer.g4 | 1 + ppl/src/main/antlr/OpenSearchPPLParser.g4 | 1 + .../calcite/CalcitePPLArrayFunctionTest.java | 54 ++++++++ .../ppl/utils/PPLQueryDataAnonymizerTest.java | 16 +++ 9 files changed, 331 insertions(+) create mode 100644 core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java index d30af69d32e..49c500caeeb 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java @@ -75,6 +75,7 @@ public enum BuiltinFunctionName { MVAPPEND(FunctionName.of("mvappend")), MVJOIN(FunctionName.of("mvjoin")), MVINDEX(FunctionName.of("mvindex")), + SPLIT(FunctionName.of("split")), MVDEDUP(FunctionName.of("mvdedup")), FORALL(FunctionName.of("forall")), EXISTS(FunctionName.of("exists")), diff --git 
a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java new file mode 100644 index 00000000000..0672772c0f8 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java @@ -0,0 +1,73 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.CollectionUDF; + +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlOperator; +import org.apache.calcite.sql.fun.SqlLibraryOperators; +import org.apache.calcite.sql.fun.SqlStdOperatorTable; +import org.opensearch.sql.expression.function.PPLFuncImpTable; + +/** + * SPLIT function implementation that splits strings by delimiter. + * + *

Usage: split(str, delimiter) + * + *

Returns an array of strings split on the delimiter. + * + *

Special behavior: + * + *

+ * + *

Implementation notes: + * + *

+ */ +public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp { + + @Override + public RexNode resolve(RexBuilder builder, RexNode... args) { + RexNode str = args[0]; + RexNode delimiter = args[1]; + + // Check if delimiter is empty string + // If empty, split into individual characters using a workaround + // If not empty, use Calcite's SPLIT function + + // Create condition: delimiter = '' + RexNode emptyString = builder.makeLiteral(""); + RexNode isEmptyDelimiter = builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString); + + // For empty delimiter: split into characters + // Pattern: Insert a delimiter between each character using regex + // 'abcd' -> 'a|b|c|d' -> split on '|' + RexNode regexPattern = builder.makeLiteral("(?<=.)(?=.)"); + RexNode replacement = builder.makeLiteral("|"); + + // Use REGEXP_REPLACE to insert delimiter between characters + SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3; + RexNode withDelimiters = builder.makeCall(regexpReplace, str, regexPattern, replacement); + + // Then split on the inserted delimiter + RexNode pipeDelimiter = builder.makeLiteral("|"); + RexNode splitChars = builder.makeCall(SqlLibraryOperators.SPLIT, withDelimiters, pipeDelimiter); + + // For non-empty delimiter: use standard SPLIT + RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter); + + // Use CASE to choose between the two approaches + // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END + return builder.makeCall(SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit); + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index a2ea4cfcb30..5fd33746c7d 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -194,6 
+194,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP; import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP; @@ -286,6 +287,7 @@ import org.opensearch.sql.exception.ExpressionEvaluationException; import org.opensearch.sql.executor.QueryType; import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp; +import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp; public class PPLFuncImpTable { private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class); @@ -976,6 +978,12 @@ void populate() { builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter), PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER)); + // Register SPLIT with custom logic for empty delimiter + register( + SPLIT, + new SplitFunctionImp(), + PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER)); + // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization register( MVINDEX, diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst index 34c02074641..726e2f30f0c 100644 --- a/docs/user/ppl/functions/collection.rst +++ b/docs/user/ppl/functions/collection.rst @@ -186,6 +186,60 @@ Example:: | 120 | +--------+ +SPLIT +----- + +Description +>>>>>>>>>>> + +Usage: split(str, delimiter) splits the string values on the delimiter and returns the string values as a multivalue field (array). Use an empty string ("") to split the original string into one value per character. 
If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array. + +Argument type: str: STRING, delimiter: STRING + +Return type: ARRAY of STRING + +Example:: + + os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1 + fetched rows / total rows = 1/1 + +--------------------------------------+ + | result | + |--------------------------------------| + | [buttercup,rarity,tenderhoof,dash] | + +--------------------------------------+ + + os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1 + fetched rows / total rows = 1/1 + +------------------+ + | result | + |------------------| + | [1a2b3c4,567890] | + +------------------+ + + os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1 + fetched rows / total rows = 1/1 + +-----------+ + | result | + |-----------| + | [a,b,c,d] | + +-----------+ + + os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1 + fetched rows / total rows = 1/1 + +--------------+ + | result | + |--------------| + | [name,value] | + +--------------+ + + os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1 + fetched rows / total rows = 1/1 + +---------+ + | result | + |---------| + | [hello] | + +---------+ + MVJOIN ------ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java index 52a6e181e20..c0a8bfd6d88 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java @@ -567,4 +567,127 @@ public void testMvdedupPreservesOrder() throws IOException { // Should preserve first occurrence 
order: z, a, b, c verifyDataRows(actual, rows(List.of("z", "a", "b", "c"))); } + + @Test + public void testSplitWithSemicolonDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result =" + + " split(test, ';') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh"))); + } + + @Test + public void testSplitWithMultiCharDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("1a2b3c4", "567890"))); + } + + @Test + public void testSplitWithEmptyDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + // Empty delimiter splits into individual characters + verifyDataRows(actual, rows(List.of("a", "b", "c", "d"))); + } + + @Test + public void testSplitWithColonDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'name::value', result = split(test, '::') | head 1 |" + + " fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("name", "value"))); + } + + @Test + public void testSplitWithCommaDelimiter() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'apple,banana,cherry', result = split(test, ',') | head 1" + + " | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + 
verifyDataRows(actual, rows(List.of("apple", "banana", "cherry"))); + } + + @Test + public void testSplitWithFieldReference() throws IOException { + // Test split on a real field using employer field which may contain space-separated words + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval result = split(employer, ' ') | head 1 | fields employer, result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("employer", "string"), schema("result", "array")); + // Verify that the result is an array + JSONArray dataRows = actual.getJSONArray("datarows"); + assertTrue(dataRows.length() > 0); + JSONArray firstRow = dataRows.getJSONArray(0); + // The second element should be an array + assertTrue(firstRow.get(1) instanceof JSONArray); + } + + @Test + public void testSplitWithEmptyString() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = '', result = split(test, ',') | head 1 | fields result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + // Empty string should return empty array + verifyDataRows(actual, rows(List.of())); + } + + @Test + public void testSplitNoDelimiterFound() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'hello', result = split(test, ',') | head 1 | fields" + + " result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + // If delimiter not found, should return array with original string + verifyDataRows(actual, rows(List.of("hello"))); + } + + @Test + public void testSplitMultipleOccurrences() throws IOException { + JSONObject actual = + executeQuery( + String.format( + "source=%s | eval test = 'a-b-c-d-e-f', result = split(test, '-') | head 1 | fields" + + " result", + TEST_INDEX_BANK)); + + verifySchema(actual, schema("result", "array")); + verifyDataRows(actual, rows(List.of("a", "b", "c", "d", "e", "f"))); + } } diff --git 
a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index ebe0fcb4f21..22f1b96bbb9 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -443,6 +443,7 @@ MVAPPEND: 'MVAPPEND'; MVJOIN: 'MVJOIN'; MVINDEX: 'MVINDEX'; MVDEDUP: 'MVDEDUP'; +SPLIT: 'SPLIT'; FORALL: 'FORALL'; FILTER: 'FILTER'; TRANSFORM: 'TRANSFORM'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 22121a1b1aa..a36647d4dd1 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -1097,6 +1097,7 @@ collectionFunctionName | MVJOIN | MVINDEX | MVDEDUP + | SPLIT | FORALL | EXISTS | FILTER diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java index 176fb534f37..7fd132a751e 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java @@ -290,4 +290,58 @@ public void testMvdedupPreservesOrder() { + "LIMIT 1"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testSplitWithSemicolonDelimiter() { + String ppl = + "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head" + + " 1 | fields result"; + RelNode root = getRelNode(ppl); + + String expectedResult = "result=[buttercup, rarity, tenderhoof]\n"; + verifyResult(root, expectedResult); + } + + @Test + public void testSplitWithMultiCharDelimiter() { + String ppl = + "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |" + + " fields result"; + RelNode root = getRelNode(ppl); + + String expectedResult = "result=[1a2b3c4, 567890]\n"; + verifyResult(root, expectedResult); + } + + @Test + public void testSplitWithEmptyDelimiter() { + String ppl = + 
"source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result"; + RelNode root = getRelNode(ppl); + + // With empty delimiter, should split into individual characters + String expectedResult = "result=[a, b, c, d]\n"; + verifyResult(root, expectedResult); + } + + @Test + public void testSplitWithColonDelimiter() { + String ppl = + "source=EMP | eval test = 'name::value', result = split(test, '::') | head 1 | fields" + + " result"; + RelNode root = getRelNode(ppl); + + String expectedResult = "result=[name, value]\n"; + verifyResult(root, expectedResult); + } + + @Test + public void testSplitWithFieldReference() { + String ppl = "source=EMP | eval result = split(ENAME, 'A') | head 1 | fields result"; + RelNode root = getRelNode(ppl); + + // Just verify it parses and executes correctly + // Actual result depends on the ENAME field value + getRelNode(ppl); // Verify parsing succeeds + } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 0f59e98e74b..796156aae85 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -829,6 +829,22 @@ public void testMvindex() { anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result")); } + @Test + public void testSplit() { + // Test split with delimiter + assertEquals( + "source=table | eval identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('a;b;c', ';') | fields result")); + // Test split with field reference + assertEquals( + "source=table | eval identifier=split(identifier,***) | fields + identifier", + anonymize("source=t | eval result=split(text, ',') | fields result")); + // Test split with empty delimiter (splits into characters) + assertEquals( + "source=table | eval 
identifier=split(***,***) | fields + identifier", + anonymize("source=t | eval result=split('abcd', '') | fields result")); + } + @Test public void testMvdedup() { // Test mvdedup with array containing duplicates From 8b62cf80c8cddbe785ea0116cb148a7501c8356d Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 14 Nov 2025 14:29:56 -0800 Subject: [PATCH 2/4] doctest Signed-off-by: Kai Huang --- docs/user/ppl/functions/collection.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst index 726e2f30f0c..fdea75d3e81 100644 --- a/docs/user/ppl/functions/collection.rst +++ b/docs/user/ppl/functions/collection.rst @@ -202,11 +202,11 @@ Example:: os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1 fetched rows / total rows = 1/1 - +--------------------------------------+ - | result | - |--------------------------------------| - | [buttercup,rarity,tenderhoof,dash] | - +--------------------------------------+ + +------------------------------------+ + | result | + |------------------------------------| + | [buttercup,rarity,tenderhoof,dash] | + +------------------------------------+ os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1 fetched rows / total rows = 1/1 From 0f070e2acdcdaeb2d44659c1f3139290caa2af74 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 14 Nov 2025 14:58:26 -0800 Subject: [PATCH 3/4] Update test cases Signed-off-by: Kai Huang --- .../remote/CalciteArrayFunctionIT.java | 84 ------------------- .../calcite/CalcitePPLArrayFunctionTest.java | 75 +++++++++++------ 2 files changed, 49 insertions(+), 110 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java index c0a8bfd6d88..31556e518b9 
100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java @@ -606,88 +606,4 @@ public void testSplitWithEmptyDelimiter() throws IOException { // Empty delimiter splits into individual characters verifyDataRows(actual, rows(List.of("a", "b", "c", "d"))); } - - @Test - public void testSplitWithColonDelimiter() throws IOException { - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval test = 'name::value', result = split(test, '::') | head 1 |" - + " fields result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("result", "array")); - verifyDataRows(actual, rows(List.of("name", "value"))); - } - - @Test - public void testSplitWithCommaDelimiter() throws IOException { - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval test = 'apple,banana,cherry', result = split(test, ',') | head 1" - + " | fields result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("result", "array")); - verifyDataRows(actual, rows(List.of("apple", "banana", "cherry"))); - } - - @Test - public void testSplitWithFieldReference() throws IOException { - // Test split on a real field using employer field which may contain space-separated words - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval result = split(employer, ' ') | head 1 | fields employer, result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("employer", "string"), schema("result", "array")); - // Verify that the result is an array - JSONArray dataRows = actual.getJSONArray("datarows"); - assertTrue(dataRows.length() > 0); - JSONArray firstRow = dataRows.getJSONArray(0); - // The second element should be an array - assertTrue(firstRow.get(1) instanceof JSONArray); - } - - @Test - public void testSplitWithEmptyString() throws IOException { - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval test = 
'', result = split(test, ',') | head 1 | fields result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("result", "array")); - // Empty string should return empty array - verifyDataRows(actual, rows(List.of())); - } - - @Test - public void testSplitNoDelimiterFound() throws IOException { - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval test = 'hello', result = split(test, ',') | head 1 | fields" - + " result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("result", "array")); - // If delimiter not found, should return array with original string - verifyDataRows(actual, rows(List.of("hello"))); - } - - @Test - public void testSplitMultipleOccurrences() throws IOException { - JSONObject actual = - executeQuery( - String.format( - "source=%s | eval test = 'a-b-c-d-e-f', result = split(test, '-') | head 1 | fields" - + " result", - TEST_INDEX_BANK)); - - verifySchema(actual, schema("result", "array")); - verifyDataRows(actual, rows(List.of("a", "b", "c", "d", "e", "f"))); - } } diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java index 7fd132a751e..ea78499b9a9 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java @@ -298,8 +298,24 @@ public void testSplitWithSemicolonDelimiter() { + " 1 | fields result"; RelNode root = getRelNode(ppl); - String expectedResult = "result=[buttercup, rarity, tenderhoof]\n"; - verifyResult(root, expectedResult); + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR]," + + " result=[CASE(=(';', '')," + + " 
SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR, '(?<=.)(?=.)', '|')," + + " '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN ';' = '' THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', " + + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END " + + "`result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); } @Test @@ -309,8 +325,22 @@ public void testSplitWithMultiCharDelimiter() { + " fields result"; RelNode root = getRelNode(ppl); - String expectedResult = "result=[1a2b3c4, 567890]\n"; - verifyResult(root, expectedResult); + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR]," + + " result=[CASE(=('def':VARCHAR, ''), SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR," + + " '(?<=.)(?=.)', '|'), '|'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', " + + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); } @Test @@ -320,28 +350,21 @@ public void testSplitWithEmptyDelimiter() { RelNode root = getRelNode(ppl); // With empty delimiter, should split into individual characters - String expectedResult = "result=[a, b, c, d]\n"; - verifyResult(root, expectedResult); - } - - @Test - public void testSplitWithColonDelimiter() { - String ppl = - "source=EMP | eval test = 'name::value', result = split(test, '::') | head 1 | fields" - + 
" result"; - RelNode root = getRelNode(ppl); - - String expectedResult = "result=[name, value]\n"; - verifyResult(root, expectedResult); - } - - @Test - public void testSplitWithFieldReference() { - String ppl = "source=EMP | eval result = split(ENAME, 'A') | head 1 | fields result"; - RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(result=[$9])\n" + + " LogicalSort(fetch=[1])\n" + + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR]," + + " result=[CASE(=('':VARCHAR, ''), SPLIT(REGEXP_REPLACE('abcd':VARCHAR," + + " '(?<=.)(?=.)', '|'), '|'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); - // Just verify it parses and executes correctly - // Actual result depends on the ENAME field value - getRelNode(ppl); // Verify parsing succeeds + String expectedSparkSql = + "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|') " + + "ELSE SPLIT('abcd', '') END `result`\n" + + "FROM `scott`.`EMP`\n" + + "LIMIT 1"; + verifyPPLToSparkSQL(root, expectedSparkSql); } } From 444cb295d03328aeb3d169b075245c392bcef9f9 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 19 Nov 2025 15:28:16 -0800 Subject: [PATCH 4/4] Update to not use UDF Signed-off-by: Kai Huang --- .../CollectionUDF/SplitFunctionImp.java | 73 ------------------- .../expression/function/PPLFuncImpTable.java | 25 ++++++- .../calcite/CalcitePPLArrayFunctionTest.java | 22 +++--- 3 files changed, 34 insertions(+), 86 deletions(-) delete mode 100644 core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java diff --git a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java deleted file mode 100644 index 0672772c0f8..00000000000 
--- a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.function.CollectionUDF; - -import org.apache.calcite.rex.RexBuilder; -import org.apache.calcite.rex.RexNode; -import org.apache.calcite.sql.SqlOperator; -import org.apache.calcite.sql.fun.SqlLibraryOperators; -import org.apache.calcite.sql.fun.SqlStdOperatorTable; -import org.opensearch.sql.expression.function.PPLFuncImpTable; - -/** - * SPLIT function implementation that splits strings by delimiter. - * - *

Usage: split(str, delimiter) - * - *

Returns an array of strings split on the delimiter. - * - *

Special behavior: - * - *

    - *
  • Empty delimiter ("") splits into individual characters - *
  • If delimiter not found, returns array with original string - *
  • Empty string returns empty array - *
- * - *

Implementation notes: - * - *

    - *
  • Uses Calcite's SPLIT for non-empty delimiters - *
  • Uses custom character splitting for empty delimiter via REGEXP_REPLACE - *
- */ -public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp { - - @Override - public RexNode resolve(RexBuilder builder, RexNode... args) { - RexNode str = args[0]; - RexNode delimiter = args[1]; - - // Check if delimiter is empty string - // If empty, split into individual characters using a workaround - // If not empty, use Calcite's SPLIT function - - // Create condition: delimiter = '' - RexNode emptyString = builder.makeLiteral(""); - RexNode isEmptyDelimiter = builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString); - - // For empty delimiter: split into characters - // Pattern: Insert a delimiter between each character using regex - // 'abcd' -> 'a|b|c|d' -> split on '|' - RexNode regexPattern = builder.makeLiteral("(?<=.)(?=.)"); - RexNode replacement = builder.makeLiteral("|"); - - // Use REGEXP_REPLACE to insert delimiter between characters - SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3; - RexNode withDelimiters = builder.makeCall(regexpReplace, str, regexPattern, replacement); - - // Then split on the inserted delimiter - RexNode pipeDelimiter = builder.makeLiteral("|"); - RexNode splitChars = builder.makeCall(SqlLibraryOperators.SPLIT, withDelimiters, pipeDelimiter); - - // For non-empty delimiter: use standard SPLIT - RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter); - - // Use CASE to choose between the two approaches - // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END - return builder.makeCall(SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit); - } -} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index 5fd33746c7d..2a54efaecbc 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -287,7 
+287,6 @@ import org.opensearch.sql.exception.ExpressionEvaluationException; import org.opensearch.sql.executor.QueryType; import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp; -import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp; public class PPLFuncImpTable { private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class); @@ -979,9 +978,31 @@ void populate() { PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER)); // Register SPLIT with custom logic for empty delimiter + // Case 1: Delimiter is not empty string, use SPLIT + // Case 2: Delimiter is empty string, use REGEXP_EXTRACT_ALL with '(?s).' pattern register( SPLIT, - new SplitFunctionImp(), + (FunctionImp2) + (builder, str, delimiter) -> { + // Create condition: delimiter = '' + RexNode emptyString = builder.makeLiteral(""); + RexNode isEmptyDelimiter = + builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString); + + // For empty delimiter: emit one element per character via REGEXP_EXTRACT_ALL.
+ // '(?s)' enables DOTALL so line terminators are kept as elements, not dropped + RexNode dotPattern = builder.makeLiteral("(?s)."); + RexNode splitChars = + builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern); + + // For non-empty delimiter: use standard SPLIT + RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter); + + // Use CASE to choose between the two approaches + // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END + return builder.makeCall( + SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit); + }, PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER)); // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java index ea78499b9a9..96529adea24 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java @@ -304,14 +304,14 @@ public void testSplitWithSemicolonDelimiter() { + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR]," + " result=[CASE(=(';', '')," - + " SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR, '(?<=.)(?=.)', '|')," - + " '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n" + + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '(?s).')," + + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT CASE WHEN ';' = '' THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', " - + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END " + "SELECT CASE WHEN ';' = '' 
THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', " + + "'(?s).') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END " + "`result`\n" + "FROM `scott`.`EMP`\n" + "LIMIT 1"; @@ -330,14 +330,14 @@ public void testSplitWithMultiCharDelimiter() { + " LogicalSort(fetch=[1])\n" + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR]," - + " result=[CASE(=('def':VARCHAR, ''), SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR," - + " '(?<=.)(?=.)', '|'), '|'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n" + + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR," + + " '(?s).'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', " - + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n" + "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', " + + "'(?s).') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n" + "FROM `scott`.`EMP`\n" + "LIMIT 1"; verifyPPLToSparkSQL(root, expectedSparkSql); @@ -355,13 +355,13 @@ public void testSplitWithEmptyDelimiter() { + " LogicalSort(fetch=[1])\n" + " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4]," + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR]," - + " result=[CASE(=('':VARCHAR, ''), SPLIT(REGEXP_REPLACE('abcd':VARCHAR," - + " '(?<=.)(?=.)', '|'), '|'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n" + + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR," + + " '(?s).'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|') " + "SELECT CASE WHEN '' = '' 
THEN REGEXP_EXTRACT_ALL('abcd', '(?s).') " + "ELSE SPLIT('abcd', '') END `result`\n" + "FROM `scott`.`EMP`\n" + "LIMIT 1";