Skip to content

Commit effe78d

Browse files
Support split eval function (#4814) (#4918)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent d994ca9 commit effe78d

File tree

8 files changed

+218
-0
lines changed

8 files changed

+218
-0
lines changed

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
7575
MVAPPEND(FunctionName.of("mvappend")),
7676
MVJOIN(FunctionName.of("mvjoin")),
7777
MVINDEX(FunctionName.of("mvindex")),
78+
SPLIT(FunctionName.of("split")),
7879
MVDEDUP(FunctionName.of("mvdedup")),
7980
FORALL(FunctionName.of("forall")),
8081
EXISTS(FunctionName.of("exists")),

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@
195195
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
196196
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
197197
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
198+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
198199
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
199200
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
200201
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
@@ -998,6 +999,34 @@ void populate() {
998999
builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
9991000
PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
10001001

1002+
// Register SPLIT with custom logic for empty delimiter
1003+
// Case 1: Delimiter is not empty string, use SPLIT
1004+
// Case 2: Delimiter is empty string, use REGEXP_EXTRACT_ALL with '.' pattern
1005+
register(
1006+
SPLIT,
1007+
(FunctionImp2)
1008+
(builder, str, delimiter) -> {
1009+
// Create condition: delimiter = ''
1010+
RexNode emptyString = builder.makeLiteral("");
1011+
RexNode isEmptyDelimiter =
1012+
builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
1013+
1014+
// For empty delimiter: split into characters using REGEXP_EXTRACT_ALL with '.'
1015+
// pattern. This matches each individual character.
1016+
RexNode dotPattern = builder.makeLiteral(".");
1017+
RexNode splitChars =
1018+
builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern);
1019+
1020+
// For non-empty delimiter: use standard SPLIT
1021+
RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
1022+
1023+
// Use CASE to choose between the two approaches
1024+
// CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
1025+
return builder.makeCall(
1026+
SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
1027+
},
1028+
PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
1029+
10011030
// Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
10021031
register(
10031032
MVINDEX,

docs/user/ppl/functions/collection.rst

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,60 @@ Example::
186186
| 120 |
187187
+--------+
188188

189+
SPLIT
190+
-----
191+
192+
Description
193+
>>>>>>>>>>>
194+
195+
Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.
196+
197+
Argument type: str: STRING, delimiter: STRING
198+
199+
Return type: ARRAY of STRING
200+
201+
Example::
202+
203+
os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
204+
fetched rows / total rows = 1/1
205+
+------------------------------------+
206+
| result |
207+
|------------------------------------|
208+
| [buttercup,rarity,tenderhoof,dash] |
209+
+------------------------------------+
210+
211+
os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
212+
fetched rows / total rows = 1/1
213+
+------------------+
214+
| result |
215+
|------------------|
216+
| [1a2b3c4,567890] |
217+
+------------------+
218+
219+
os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
220+
fetched rows / total rows = 1/1
221+
+-----------+
222+
| result |
223+
|-----------|
224+
| [a,b,c,d] |
225+
+-----------+
226+
227+
os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
228+
fetched rows / total rows = 1/1
229+
+--------------+
230+
| result |
231+
|--------------|
232+
| [name,value] |
233+
+--------------+
234+
235+
os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
236+
fetched rows / total rows = 1/1
237+
+---------+
238+
| result |
239+
|---------|
240+
| [hello] |
241+
+---------+
242+
189243
MVJOIN
190244
------
191245

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,4 +567,43 @@ public void testMvdedupPreservesOrder() throws IOException {
567567
// Should preserve first occurrence order: z, a, b, c
568568
verifyDataRows(actual, rows(List.of("z", "a", "b", "c")));
569569
}
570+
571+
@Test
572+
public void testSplitWithSemicolonDelimiter() throws IOException {
573+
JSONObject actual =
574+
executeQuery(
575+
String.format(
576+
"source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
577+
+ " split(test, ';') | head 1 | fields result",
578+
TEST_INDEX_BANK));
579+
580+
verifySchema(actual, schema("result", "array"));
581+
verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
582+
}
583+
584+
@Test
585+
public void testSplitWithMultiCharDelimiter() throws IOException {
586+
JSONObject actual =
587+
executeQuery(
588+
String.format(
589+
"source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
590+
+ " fields result",
591+
TEST_INDEX_BANK));
592+
593+
verifySchema(actual, schema("result", "array"));
594+
verifyDataRows(actual, rows(List.of("1a2b3c4", "567890")));
595+
}
596+
597+
@Test
598+
public void testSplitWithEmptyDelimiter() throws IOException {
599+
JSONObject actual =
600+
executeQuery(
601+
String.format(
602+
"source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
603+
TEST_INDEX_BANK));
604+
605+
verifySchema(actual, schema("result", "array"));
606+
// Empty delimiter splits into individual characters
607+
verifyDataRows(actual, rows(List.of("a", "b", "c", "d")));
608+
}
570609
}

ppl/src/main/antlr/OpenSearchPPLLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ MVAPPEND: 'MVAPPEND';
448448
MVJOIN: 'MVJOIN';
449449
MVINDEX: 'MVINDEX';
450450
MVDEDUP: 'MVDEDUP';
451+
SPLIT: 'SPLIT';
451452
FORALL: 'FORALL';
452453
FILTER: 'FILTER';
453454
TRANSFORM: 'TRANSFORM';

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,7 @@ collectionFunctionName
11041104
| MVJOIN
11051105
| MVINDEX
11061106
| MVDEDUP
1107+
| SPLIT
11071108
| FORALL
11081109
| EXISTS
11091110
| FILTER

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,4 +290,81 @@ public void testMvdedupPreservesOrder() {
290290
+ "LIMIT 1";
291291
verifyPPLToSparkSQL(root, expectedSparkSql);
292292
}
293+
294+
@Test
295+
public void testSplitWithSemicolonDelimiter() {
296+
String ppl =
297+
"source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
298+
+ " 1 | fields result";
299+
RelNode root = getRelNode(ppl);
300+
301+
String expectedLogical =
302+
"LogicalProject(result=[$9])\n"
303+
+ " LogicalSort(fetch=[1])\n"
304+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
305+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
306+
+ " result=[CASE(=(';', ''),"
307+
+ " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.'),"
308+
+ " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
309+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
310+
verifyLogical(root, expectedLogical);
311+
312+
String expectedSparkSql =
313+
"SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', "
314+
+ "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
315+
+ "`result`\n"
316+
+ "FROM `scott`.`EMP`\n"
317+
+ "LIMIT 1";
318+
verifyPPLToSparkSQL(root, expectedSparkSql);
319+
}
320+
321+
@Test
322+
public void testSplitWithMultiCharDelimiter() {
323+
String ppl =
324+
"source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
325+
+ " fields result";
326+
RelNode root = getRelNode(ppl);
327+
328+
String expectedLogical =
329+
"LogicalProject(result=[$9])\n"
330+
+ " LogicalSort(fetch=[1])\n"
331+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
332+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
333+
+ " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR,"
334+
+ " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
335+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
336+
verifyLogical(root, expectedLogical);
337+
338+
String expectedSparkSql =
339+
"SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', "
340+
+ "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
341+
+ "FROM `scott`.`EMP`\n"
342+
+ "LIMIT 1";
343+
verifyPPLToSparkSQL(root, expectedSparkSql);
344+
}
345+
346+
@Test
347+
public void testSplitWithEmptyDelimiter() {
348+
String ppl =
349+
"source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result";
350+
RelNode root = getRelNode(ppl);
351+
352+
// With empty delimiter, should split into individual characters
353+
String expectedLogical =
354+
"LogicalProject(result=[$9])\n"
355+
+ " LogicalSort(fetch=[1])\n"
356+
+ " LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
357+
+ " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
358+
+ " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR,"
359+
+ " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
360+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
361+
verifyLogical(root, expectedLogical);
362+
363+
String expectedSparkSql =
364+
"SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') "
365+
+ "ELSE SPLIT('abcd', '') END `result`\n"
366+
+ "FROM `scott`.`EMP`\n"
367+
+ "LIMIT 1";
368+
verifyPPLToSparkSQL(root, expectedSparkSql);
369+
}
293370
}

ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,22 @@ public void testMvindex() {
845845
anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
846846
}
847847

848+
@Test
849+
public void testSplit() {
850+
// Test split with delimiter
851+
assertEquals(
852+
"source=table | eval identifier=split(***,***) | fields + identifier",
853+
anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));
854+
// Test split with field reference
855+
assertEquals(
856+
"source=table | eval identifier=split(identifier,***) | fields + identifier",
857+
anonymize("source=t | eval result=split(text, ',') | fields result"));
858+
// Test split with empty delimiter (splits into characters)
859+
assertEquals(
860+
"source=table | eval identifier=split(***,***) | fields + identifier",
861+
anonymize("source=t | eval result=split('abcd', '') | fields result"));
862+
}
863+
848864
@Test
849865
public void testMvdedup() {
850866
// Test mvdedup with array containing duplicates

0 commit comments

Comments
 (0)