From b5ebc1f81506c97368d55b76d0682f9187a4e150 Mon Sep 17 00:00:00 2001
From: Kai Huang <ahkcs@amazon.com>
Date: Fri, 14 Nov 2025 14:21:15 -0800
Subject: [PATCH 1/4] Support split eval function

Signed-off-by: Kai Huang <ahkcs@amazon.com>

# Conflicts:
#	core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
#	integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
#	ppl/src/main/antlr/OpenSearchPPLLexer.g4
#	ppl/src/main/antlr/OpenSearchPPLParser.g4
#	ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
#	ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
---
 .../function/BuiltinFunctionName.java         |   1 +
 .../CollectionUDF/SplitFunctionImp.java       |  73 +++++++++++
 .../expression/function/PPLFuncImpTable.java  |   8 ++
 docs/user/ppl/functions/collection.rst        |  54 ++++++++
 .../remote/CalciteArrayFunctionIT.java        | 123 ++++++++++++++++++
 ppl/src/main/antlr/OpenSearchPPLLexer.g4      |   1 +
 ppl/src/main/antlr/OpenSearchPPLParser.g4     |   1 +
 .../calcite/CalcitePPLArrayFunctionTest.java  |  54 ++++++++
 .../ppl/utils/PPLQueryDataAnonymizerTest.java |  16 +++
 9 files changed, 331 insertions(+)
 create mode 100644 core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
index d30af69d32e..49c500caeeb 100644
--- a/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
+++ b/core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java
@@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
   MVAPPEND(FunctionName.of("mvappend")),
   MVJOIN(FunctionName.of("mvjoin")),
   MVINDEX(FunctionName.of("mvindex")),
+  SPLIT(FunctionName.of("split")),
   MVDEDUP(FunctionName.of("mvdedup")),
   FORALL(FunctionName.of("forall")),
   EXISTS(FunctionName.of("exists")),
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java
new file mode 100644
index 00000000000..0672772c0f8
--- /dev/null
+++ b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.expression.function.CollectionUDF;
+
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.calcite.sql.fun.SqlLibraryOperators;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.opensearch.sql.expression.function.PPLFuncImpTable;
+
+/**
+ * SPLIT function implementation that splits strings by delimiter.
+ *
+ * <p>Usage: split(str, delimiter)
+ *
+ * <p>Returns an array of strings split on the delimiter.
+ *
+ * <p>Special behavior:
+ *
+ * <ul>
+ *   <li>Empty delimiter ("") splits into individual characters
+ *   <li>If delimiter not found, returns array with original string
+ *   <li>Empty string returns empty array
+ * </ul>
+ *
+ * <p>Implementation notes:
+ *
+ * <ul>
+ *   <li>Uses Calcite's SPLIT for non-empty delimiters
+ *   <li>Uses custom character splitting for empty delimiter via REGEXP_REPLACE
+ * </ul>
+ */
+public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp {
+
+  @Override
+  public RexNode resolve(RexBuilder builder, RexNode... args) {
+    RexNode str = args[0];
+    RexNode delimiter = args[1];
+
+    // Check if delimiter is empty string
+    // If empty, split into individual characters using a workaround
+    // If not empty, use Calcite's SPLIT function
+
+    // Create condition: delimiter = ''
+    RexNode emptyString = builder.makeLiteral("");
+    RexNode isEmptyDelimiter = builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
+
+    // For empty delimiter: split into characters
+    // Pattern: Insert a delimiter between each character using regex
+    // 'abcd' -> 'a|b|c|d' -> split on '|'
+    RexNode regexPattern = builder.makeLiteral("(?<=.)(?=.)");
+    RexNode replacement = builder.makeLiteral("|");
+
+    // Use REGEXP_REPLACE to insert delimiter between characters
+    SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3;
+    RexNode withDelimiters = builder.makeCall(regexpReplace, str, regexPattern, replacement);
+
+    // Then split on the inserted delimiter
+    RexNode pipeDelimiter = builder.makeLiteral("|");
+    RexNode splitChars = builder.makeCall(SqlLibraryOperators.SPLIT, withDelimiters, pipeDelimiter);
+
+    // For non-empty delimiter: use standard SPLIT
+    RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
+
+    // Use CASE to choose between the two approaches
+    // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
+    return builder.makeCall(SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
+  }
+}
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
index a2ea4cfcb30..5fd33746c7d 100644
--- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
+++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
@@ -194,6 +194,7 @@
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
+import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
@@ -286,6 +287,7 @@
 import org.opensearch.sql.exception.ExpressionEvaluationException;
 import org.opensearch.sql.executor.QueryType;
 import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp;
+import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp;
 
 public class PPLFuncImpTable {
   private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class);
@@ -976,6 +978,12 @@ void populate() {
                   builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
           PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
 
+      // Register SPLIT with custom logic for empty delimiter
+      register(
+          SPLIT,
+          new SplitFunctionImp(),
+          PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
+
       // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
       register(
           MVINDEX,
diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst
index 34c02074641..726e2f30f0c 100644
--- a/docs/user/ppl/functions/collection.rst
+++ b/docs/user/ppl/functions/collection.rst
@@ -186,6 +186,60 @@ Example::
     | 120    |
     +--------+
 
+SPLIT
+-----
+
+Description
+>>>>>>>>>>>
+
+Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.
+
+Argument type: str: STRING, delimiter: STRING
+
+Return type: ARRAY of STRING
+
+Example::
+
+    os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +--------------------------------------+
+    | result                               |
+    |--------------------------------------|
+    | [buttercup,rarity,tenderhoof,dash]   |
+    +--------------------------------------+
+
+    os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +------------------+
+    | result           |
+    |------------------|
+    | [1a2b3c4,567890] |
+    +------------------+
+
+    os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +-----------+
+    | result    |
+    |-----------|
+    | [a,b,c,d] |
+    +-----------+
+
+    os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +--------------+
+    | result       |
+    |--------------|
+    | [name,value] |
+    +--------------+
+
+    os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +---------+
+    | result  |
+    |---------|
+    | [hello] |
+    +---------+
+
 MVJOIN
 ------
 
diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
index 52a6e181e20..c0a8bfd6d88 100644
--- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
+++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
@@ -567,4 +567,127 @@ public void testMvdedupPreservesOrder() throws IOException {
     // Should preserve first occurrence order: z, a, b, c
     verifyDataRows(actual, rows(List.of("z", "a", "b", "c")));
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
+                    + " split(test, ';') | head 1 | fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
+                    + " fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("1a2b3c4", "567890")));
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    // Empty delimiter splits into individual characters
+    verifyDataRows(actual, rows(List.of("a", "b", "c", "d")));
+  }
+
+  @Test
+  public void testSplitWithColonDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'name::value', result = split(test, '::') | head 1 |"
+                    + " fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("name", "value")));
+  }
+
+  @Test
+  public void testSplitWithCommaDelimiter() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'apple,banana,cherry', result = split(test, ',') | head 1"
+                    + " | fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("apple", "banana", "cherry")));
+  }
+
+  @Test
+  public void testSplitWithFieldReference() throws IOException {
+    // Test split on a real field using employer field which may contain space-separated words
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval result = split(employer, ' ') | head 1 | fields employer, result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("employer", "string"), schema("result", "array"));
+    // Verify that the result is an array
+    JSONArray dataRows = actual.getJSONArray("datarows");
+    assertTrue(dataRows.length() > 0);
+    JSONArray firstRow = dataRows.getJSONArray(0);
+    // The second element should be an array
+    assertTrue(firstRow.get(1) instanceof JSONArray);
+  }
+
+  @Test
+  public void testSplitWithEmptyString() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = '', result = split(test, ',') | head 1 | fields result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    // Empty string should return empty array
+    verifyDataRows(actual, rows(List.of()));
+  }
+
+  @Test
+  public void testSplitNoDelimiterFound() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'hello', result = split(test, ',') | head 1 | fields"
+                    + " result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    // If delimiter not found, should return array with original string
+    verifyDataRows(actual, rows(List.of("hello")));
+  }
+
+  @Test
+  public void testSplitMultipleOccurrences() throws IOException {
+    JSONObject actual =
+        executeQuery(
+            String.format(
+                "source=%s | eval test = 'a-b-c-d-e-f', result = split(test, '-') | head 1 | fields"
+                    + " result",
+                TEST_INDEX_BANK));
+
+    verifySchema(actual, schema("result", "array"));
+    verifyDataRows(actual, rows(List.of("a", "b", "c", "d", "e", "f")));
+  }
 }
diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4
index ebe0fcb4f21..22f1b96bbb9 100644
--- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4
+++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4
@@ -443,6 +443,7 @@ MVAPPEND:                           'MVAPPEND';
 MVJOIN:                             'MVJOIN';
 MVINDEX:                            'MVINDEX';
 MVDEDUP:                            'MVDEDUP';
+SPLIT:                              'SPLIT';
 FORALL:                             'FORALL';
 FILTER:                             'FILTER';
 TRANSFORM:                          'TRANSFORM';
diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4
index 22121a1b1aa..a36647d4dd1 100644
--- a/ppl/src/main/antlr/OpenSearchPPLParser.g4
+++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4
@@ -1097,6 +1097,7 @@ collectionFunctionName
     | MVJOIN
     | MVINDEX
     | MVDEDUP
+    | SPLIT
     | FORALL
     | EXISTS
     | FILTER
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
index 176fb534f37..7fd132a751e 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
@@ -290,4 +290,58 @@ public void testMvdedupPreservesOrder() {
             + "LIMIT 1";
     verifyPPLToSparkSQL(root, expectedSparkSql);
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() {
+    String ppl =
+        "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
+            + " 1 | fields result";
+    RelNode root = getRelNode(ppl);
+
+    String expectedResult = "result=[buttercup, rarity, tenderhoof]\n";
+    verifyResult(root, expectedResult);
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() {
+    String ppl =
+        "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
+            + " fields result";
+    RelNode root = getRelNode(ppl);
+
+    String expectedResult = "result=[1a2b3c4, 567890]\n";
+    verifyResult(root, expectedResult);
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() {
+    String ppl =
+        "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result";
+    RelNode root = getRelNode(ppl);
+
+    // With empty delimiter, should split into individual characters
+    String expectedResult = "result=[a, b, c, d]\n";
+    verifyResult(root, expectedResult);
+  }
+
+  @Test
+  public void testSplitWithColonDelimiter() {
+    String ppl =
+        "source=EMP | eval test = 'name::value', result = split(test, '::') | head 1 | fields"
+            + " result";
+    RelNode root = getRelNode(ppl);
+
+    String expectedResult = "result=[name, value]\n";
+    verifyResult(root, expectedResult);
+  }
+
+  @Test
+  public void testSplitWithFieldReference() {
+    String ppl = "source=EMP | eval result = split(ENAME, 'A') | head 1 | fields result";
+    RelNode root = getRelNode(ppl);
+
+    // Just verify it parses and executes correctly
+    // Actual result depends on the ENAME field value
+    getRelNode(ppl); // Verify parsing succeeds
+  }
 }
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
index 0f59e98e74b..796156aae85 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java
@@ -829,6 +829,22 @@ public void testMvindex() {
         anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
   }
 
+  @Test
+  public void testSplit() {
+    // Test split with delimiter
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));
+    // Test split with field reference
+    assertEquals(
+        "source=table | eval identifier=split(identifier,***) | fields + identifier",
+        anonymize("source=t | eval result=split(text, ',') | fields result"));
+    // Test split with empty delimiter (splits into characters)
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('abcd', '') | fields result"));
+  }
+
   @Test
   public void testMvdedup() {
     // Test mvdedup with array containing duplicates

From 8b62cf80c8cddbe785ea0116cb148a7501c8356d Mon Sep 17 00:00:00 2001
From: Kai Huang <ahkcs@amazon.com>
Date: Fri, 14 Nov 2025 14:29:56 -0800
Subject: [PATCH 2/4] Fix doctest table alignment in SPLIT example

Signed-off-by: Kai Huang <ahkcs@amazon.com>
---
 docs/user/ppl/functions/collection.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/user/ppl/functions/collection.rst b/docs/user/ppl/functions/collection.rst
index 726e2f30f0c..fdea75d3e81 100644
--- a/docs/user/ppl/functions/collection.rst
+++ b/docs/user/ppl/functions/collection.rst
@@ -202,11 +202,11 @@ Example::
 
     os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
     fetched rows / total rows = 1/1
-    +--------------------------------------+
-    | result                               |
-    |--------------------------------------|
-    | [buttercup,rarity,tenderhoof,dash]   |
-    +--------------------------------------+
+    +------------------------------------+
+    | result                             |
+    |------------------------------------|
+    | [buttercup,rarity,tenderhoof,dash] |
+    +------------------------------------+
 
     os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
     fetched rows / total rows = 1/1

From 0f070e2acdcdaeb2d44659c1f3139290caa2af74 Mon Sep 17 00:00:00 2001
From: Kai Huang <ahkcs@amazon.com>
Date: Fri, 14 Nov 2025 14:58:26 -0800
Subject: [PATCH 3/4] Update test cases

Signed-off-by: Kai Huang <ahkcs@amazon.com>
---
 .../remote/CalciteArrayFunctionIT.java        | 84 -------------------
 .../calcite/CalcitePPLArrayFunctionTest.java  | 75 +++++++++++------
 2 files changed, 49 insertions(+), 110 deletions(-)

diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
index c0a8bfd6d88..31556e518b9 100644
--- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
+++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteArrayFunctionIT.java
@@ -606,88 +606,4 @@ public void testSplitWithEmptyDelimiter() throws IOException {
     // Empty delimiter splits into individual characters
     verifyDataRows(actual, rows(List.of("a", "b", "c", "d")));
   }
-
-  @Test
-  public void testSplitWithColonDelimiter() throws IOException {
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval test = 'name::value', result = split(test, '::') | head 1 |"
-                    + " fields result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("result", "array"));
-    verifyDataRows(actual, rows(List.of("name", "value")));
-  }
-
-  @Test
-  public void testSplitWithCommaDelimiter() throws IOException {
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval test = 'apple,banana,cherry', result = split(test, ',') | head 1"
-                    + " | fields result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("result", "array"));
-    verifyDataRows(actual, rows(List.of("apple", "banana", "cherry")));
-  }
-
-  @Test
-  public void testSplitWithFieldReference() throws IOException {
-    // Test split on a real field using employer field which may contain space-separated words
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval result = split(employer, ' ') | head 1 | fields employer, result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("employer", "string"), schema("result", "array"));
-    // Verify that the result is an array
-    JSONArray dataRows = actual.getJSONArray("datarows");
-    assertTrue(dataRows.length() > 0);
-    JSONArray firstRow = dataRows.getJSONArray(0);
-    // The second element should be an array
-    assertTrue(firstRow.get(1) instanceof JSONArray);
-  }
-
-  @Test
-  public void testSplitWithEmptyString() throws IOException {
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval test = '', result = split(test, ',') | head 1 | fields result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("result", "array"));
-    // Empty string should return empty array
-    verifyDataRows(actual, rows(List.of()));
-  }
-
-  @Test
-  public void testSplitNoDelimiterFound() throws IOException {
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval test = 'hello', result = split(test, ',') | head 1 | fields"
-                    + " result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("result", "array"));
-    // If delimiter not found, should return array with original string
-    verifyDataRows(actual, rows(List.of("hello")));
-  }
-
-  @Test
-  public void testSplitMultipleOccurrences() throws IOException {
-    JSONObject actual =
-        executeQuery(
-            String.format(
-                "source=%s | eval test = 'a-b-c-d-e-f', result = split(test, '-') | head 1 | fields"
-                    + " result",
-                TEST_INDEX_BANK));
-
-    verifySchema(actual, schema("result", "array"));
-    verifyDataRows(actual, rows(List.of("a", "b", "c", "d", "e", "f")));
-  }
 }
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
index 7fd132a751e..ea78499b9a9 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
@@ -298,8 +298,24 @@ public void testSplitWithSemicolonDelimiter() {
             + " 1 | fields result";
     RelNode root = getRelNode(ppl);
 
-    String expectedResult = "result=[buttercup, rarity, tenderhoof]\n";
-    verifyResult(root, expectedResult);
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
+            + " result=[CASE(=(';', ''),"
+            + " SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR, '(?<=.)(?=.)', '|'),"
+            + " '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT CASE WHEN ';' = '' THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', "
+            + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
+            + "`result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
   }
 
   @Test
@@ -309,8 +325,22 @@ public void testSplitWithMultiCharDelimiter() {
             + " fields result";
     RelNode root = getRelNode(ppl);
 
-    String expectedResult = "result=[1a2b3c4, 567890]\n";
-    verifyResult(root, expectedResult);
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
+            + " result=[CASE(=('def':VARCHAR, ''), SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR,"
+            + " '(?<=.)(?=.)', '|'), '|'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
+
+    String expectedSparkSql =
+        "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', "
+            + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
   }
 
   @Test
@@ -320,28 +350,21 @@ public void testSplitWithEmptyDelimiter() {
     RelNode root = getRelNode(ppl);
 
     // With empty delimiter, should split into individual characters
-    String expectedResult = "result=[a, b, c, d]\n";
-    verifyResult(root, expectedResult);
-  }
-
-  @Test
-  public void testSplitWithColonDelimiter() {
-    String ppl =
-        "source=EMP | eval test = 'name::value', result = split(test, '::') | head 1 | fields"
-            + " result";
-    RelNode root = getRelNode(ppl);
-
-    String expectedResult = "result=[name, value]\n";
-    verifyResult(root, expectedResult);
-  }
-
-  @Test
-  public void testSplitWithFieldReference() {
-    String ppl = "source=EMP | eval result = split(ENAME, 'A') | head 1 | fields result";
-    RelNode root = getRelNode(ppl);
+    String expectedLogical =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
+            + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
+            + " result=[CASE(=('':VARCHAR, ''), SPLIT(REGEXP_REPLACE('abcd':VARCHAR,"
+            + " '(?<=.)(?=.)', '|'), '|'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    verifyLogical(root, expectedLogical);
 
-    // Just verify it parses and executes correctly
-    // Actual result depends on the ENAME field value
-    getRelNode(ppl); // Verify parsing succeeds
+    String expectedSparkSql =
+        "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|') "
+            + "ELSE SPLIT('abcd', '') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    verifyPPLToSparkSQL(root, expectedSparkSql);
   }
 }

From 444cb295d03328aeb3d169b075245c392bcef9f9 Mon Sep 17 00:00:00 2001
From: Kai Huang <ahkcs@amazon.com>
Date: Wed, 19 Nov 2025 15:28:16 -0800
Subject: [PATCH 4/4] Update to not use UDF

Signed-off-by: Kai Huang <ahkcs@amazon.com>
---
 .../CollectionUDF/SplitFunctionImp.java       | 73 -------------------
 .../expression/function/PPLFuncImpTable.java  | 25 ++++++-
 .../calcite/CalcitePPLArrayFunctionTest.java  | 22 +++---
 3 files changed, 34 insertions(+), 86 deletions(-)
 delete mode 100644 core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java

diff --git a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java b/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java
deleted file mode 100644
index 0672772c0f8..00000000000
--- a/core/src/main/java/org/opensearch/sql/expression/function/CollectionUDF/SplitFunctionImp.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-package org.opensearch.sql.expression.function.CollectionUDF;
-
-import org.apache.calcite.rex.RexBuilder;
-import org.apache.calcite.rex.RexNode;
-import org.apache.calcite.sql.SqlOperator;
-import org.apache.calcite.sql.fun.SqlLibraryOperators;
-import org.apache.calcite.sql.fun.SqlStdOperatorTable;
-import org.opensearch.sql.expression.function.PPLFuncImpTable;
-
-/**
- * SPLIT function implementation that splits strings by delimiter.
- *
- * <p>Usage: split(str, delimiter)
- *
- * <p>Returns an array of strings split on the delimiter.
- *
- * <p>Special behavior:
- *
- * <ul>
- *   <li>Empty delimiter ("") splits into individual characters
- *   <li>If delimiter not found, returns array with original string
- *   <li>Empty string returns empty array
- * </ul>
- *
- * <p>Implementation notes:
- *
- * <ul>
- *   <li>Uses Calcite's SPLIT for non-empty delimiters
- *   <li>Uses custom character splitting for empty delimiter via REGEXP_REPLACE
- * </ul>
- */
-public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp {
-
-  @Override
-  public RexNode resolve(RexBuilder builder, RexNode... args) {
-    RexNode str = args[0];
-    RexNode delimiter = args[1];
-
-    // Check if delimiter is empty string
-    // If empty, split into individual characters using a workaround
-    // If not empty, use Calcite's SPLIT function
-
-    // Create condition: delimiter = ''
-    RexNode emptyString = builder.makeLiteral("");
-    RexNode isEmptyDelimiter = builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
-
-    // For empty delimiter: split into characters
-    // Pattern: Insert a delimiter between each character using regex
-    // 'abcd' -> 'a|b|c|d' -> split on '|'
-    RexNode regexPattern = builder.makeLiteral("(?<=.)(?=.)");
-    RexNode replacement = builder.makeLiteral("|");
-
-    // Use REGEXP_REPLACE to insert delimiter between characters
-    SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3;
-    RexNode withDelimiters = builder.makeCall(regexpReplace, str, regexPattern, replacement);
-
-    // Then split on the inserted delimiter
-    RexNode pipeDelimiter = builder.makeLiteral("|");
-    RexNode splitChars = builder.makeCall(SqlLibraryOperators.SPLIT, withDelimiters, pipeDelimiter);
-
-    // For non-empty delimiter: use standard SPLIT
-    RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
-
-    // Use CASE to choose between the two approaches
-    // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
-    return builder.makeCall(SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
-  }
-}
diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
index 5fd33746c7d..2a54efaecbc 100644
--- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
+++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java
@@ -287,7 +287,6 @@
 import org.opensearch.sql.exception.ExpressionEvaluationException;
 import org.opensearch.sql.executor.QueryType;
 import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp;
-import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp;
 
 public class PPLFuncImpTable {
   private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class);
@@ -979,9 +978,31 @@ void populate() {
           PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
 
       // Register SPLIT with custom logic for empty delimiter
+      // Case 1: Delimiter is a non-empty string: use SPLIT
+      // Case 2: Delimiter is the empty string: use REGEXP_EXTRACT_ALL with the '.' pattern
       register(
           SPLIT,
-          new SplitFunctionImp(),
+          (FunctionImp2)
+              (builder, str, delimiter) -> {
+                // Create condition: delimiter = ''
+                RexNode emptyString = builder.makeLiteral("");
+                RexNode isEmptyDelimiter =
+                    builder.makeCall(SqlStdOperatorTable.EQUALS, delimiter, emptyString);
+
+                // For empty delimiter: split into characters using REGEXP_EXTRACT_ALL with the
+                // '.' pattern, which matches each individual character
+                RexNode dotPattern = builder.makeLiteral(".");
+                RexNode splitChars =
+                    builder.makeCall(SqlLibraryOperators.REGEXP_EXTRACT_ALL, str, dotPattern);
+
+                // For non-empty delimiter: use standard SPLIT
+                RexNode normalSplit = builder.makeCall(SqlLibraryOperators.SPLIT, str, delimiter);
+
+                // Use CASE to choose between the two approaches
+                // CASE WHEN isEmptyDelimiter THEN splitChars ELSE normalSplit END
+                return builder.makeCall(
+                    SqlStdOperatorTable.CASE, isEmptyDelimiter, splitChars, normalSplit);
+              },
           PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
 
       // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
index ea78499b9a9..96529adea24 100644
--- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
+++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLArrayFunctionTest.java
@@ -304,14 +304,14 @@ public void testSplitWithSemicolonDelimiter() {
             + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
             + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
             + " result=[CASE(=(';', ''),"
-            + " SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR, '(?<=.)(?=.)', '|'),"
-            + " '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
+            + " REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof':VARCHAR, '.'),"
+            + " SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
             + "      LogicalTableScan(table=[[scott, EMP]])\n";
     verifyLogical(root, expectedLogical);
 
     String expectedSparkSql =
-        "SELECT CASE WHEN ';' = '' THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', "
-            + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
+        "SELECT CASE WHEN ';' = '' THEN REGEXP_EXTRACT_ALL('buttercup;rarity;tenderhoof', "
+            + "'.') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
             + "`result`\n"
             + "FROM `scott`.`EMP`\n"
             + "LIMIT 1";
@@ -330,14 +330,14 @@ public void testSplitWithMultiCharDelimiter() {
             + "  LogicalSort(fetch=[1])\n"
             + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
             + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
-            + " result=[CASE(=('def':VARCHAR, ''), SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR,"
-            + " '(?<=.)(?=.)', '|'), '|'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
+            + " result=[CASE(=('def':VARCHAR, ''), REGEXP_EXTRACT_ALL('1a2b3c4def567890':VARCHAR,"
+            + " '.'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
             + "      LogicalTableScan(table=[[scott, EMP]])\n";
     verifyLogical(root, expectedLogical);
 
     String expectedSparkSql =
-        "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', "
-            + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
+        "SELECT CASE WHEN 'def' = '' THEN REGEXP_EXTRACT_ALL('1a2b3c4def567890', "
+            + "'.') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
             + "FROM `scott`.`EMP`\n"
             + "LIMIT 1";
     verifyPPLToSparkSQL(root, expectedSparkSql);
@@ -355,13 +355,13 @@ public void testSplitWithEmptyDelimiter() {
             + "  LogicalSort(fetch=[1])\n"
             + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
             + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
-            + " result=[CASE(=('':VARCHAR, ''), SPLIT(REGEXP_REPLACE('abcd':VARCHAR,"
-            + " '(?<=.)(?=.)', '|'), '|'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
+            + " result=[CASE(=('':VARCHAR, ''), REGEXP_EXTRACT_ALL('abcd':VARCHAR,"
+            + " '.'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
             + "      LogicalTableScan(table=[[scott, EMP]])\n";
     verifyLogical(root, expectedLogical);
 
     String expectedSparkSql =
-        "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|') "
+        "SELECT CASE WHEN '' = '' THEN REGEXP_EXTRACT_ALL('abcd', '.') "
             + "ELSE SPLIT('abcd', '') END `result`\n"
             + "FROM `scott`.`EMP`\n"
             + "LIMIT 1";