From fe4a7ab1ab3140dff30c95cc84552d570240c9a9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 7 Nov 2024 20:29:03 +0800 Subject: [PATCH 1/5] Add list_cat, list_concat --- python/datafusion/functions.py | 18 ++++++++++++++++++ python/tests/test_functions.py | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 907f801af..000b8532a 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -147,6 +147,8 @@ "length", "levenshtein", "list_append", + "list_cat", + "list_concat", "list_dims", "list_distinct", "list_element", @@ -1142,6 +1144,22 @@ def array_distinct(array: Expr) -> Expr: return Expr(f.array_distinct(array.expr)) +def list_cat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + """ + return array_concat(*args) + + +def list_concat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + """ + return array_concat(*args) + + def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index c65c633a4..30870316e 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -289,6 +289,14 @@ def py_flatten(arr): lambda col: f.array_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], ], + [ + lambda col: f.list_cat(col, col), + lambda data: [np.concatenate([arr, arr]) for arr in data], + ], + [ + lambda col: f.list_concat(col, col), + lambda data: [np.concatenate([arr, arr]) for arr in data], + ], [ lambda col: f.array_dims(col), lambda data: [[len(r)] for r in data], From e53d708e7f8994728aa3c9cd842894c79ccff44a Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 7 Nov 2024 20:33:27 +0800 Subject: [PATCH 2/5] Add list_repeat --- python/datafusion/functions.py | 9 +++++++++ python/tests/test_functions.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 000b8532a..f2ae6e337 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -164,6 +164,7 @@ "list_prepend", "list_push_back", "list_push_front", + "list_repeat", "list_remove", "list_remove_all", "list_remove_n", @@ -1384,6 +1385,14 @@ def array_repeat(element: Expr, count: Expr) -> Expr: return Expr(f.array_repeat(element.expr, count.expr)) +def list_repeat(element: Expr, count: Expr) -> Expr: + """Returns an array containing ``element`` ``count`` times. + + This is an alias for :py:func:`array_repeat`. 
+ """ + return array_repeat(element, count) + + def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``.""" return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 30870316e..4436fd618 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -445,6 +445,10 @@ def py_flatten(arr): lambda col: f.array_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], ], + [ + lambda col: f.list_repeat(col, literal(2)), + lambda data: [[arr] * 2 for arr in data], + ], [ lambda col: f.array_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], From 7a6bcd4bf76e39861f64d7131fec13f614fe73d2 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 7 Nov 2024 21:49:41 +0800 Subject: [PATCH 3/5] docs: add examples for list_cat, list_concat, and list_repeat functions --- .../common-operations/expressions.rst | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index b2a83c89f..cf3bc3816 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -110,6 +110,35 @@ This function returns an integer indicating the total number of elements in the In this example, the `num_elements` column will contain `3` for both rows. +To concatenate two arrays, you can use the function :py:func:`datafusion.functions.list_cat` or :py:func:`datafusion.functions.list_concat`. +These functions return a new array that is the concatenation of the input arrays. + +.. 
ipython:: python + + from datafusion import SessionContext, col + from datafusion.functions import list_cat, list_concat + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]}) + df.select(list_cat(col("a"), col("b")).alias("concatenated_array")) + +In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`. + +To repeat the elements of an array a specified number of times, you can use the function :py:func:`datafusion.functions.list_repeat`. +This function returns a new array with the elements repeated. + +.. ipython:: python + + from datafusion import SessionContext, col + from datafusion.functions import list_repeat + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + df.select(list_repeat(col("a"), 2).alias("repeated_array")) + +In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. + + Structs ------- From 28e677da92cbef3916af8f0c93d3b5224e061d48 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 8 Nov 2024 09:57:29 +0800 Subject: [PATCH 4/5] Amend list_repeat code example - literal --- docs/source/user-guide/common-operations/expressions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index cf3bc3816..65006be1f 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -129,12 +129,12 @@ This function returns a new array with the elements repeated. .. 
ipython:: python - from datafusion import SessionContext, col + from datafusion import SessionContext, col, literal from datafusion.functions import list_repeat ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]]}) - df.select(list_repeat(col("a"), 2).alias("repeated_array")) + df.select(list_repeat(col("a"), literal(2)).alias("repeated_array")) In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. From 0aee7c78696e40e912a9a055b2c150afd63b6671 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 12 Nov 2024 09:57:29 +0800 Subject: [PATCH 5/5] Amend list_ to array_ in documentation --- .../user-guide/common-operations/expressions.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 65006be1f..e94e1a6b5 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -110,31 +110,31 @@ This function returns an integer indicating the total number of elements in the In this example, the `num_elements` column will contain `3` for both rows. -To concatenate two arrays, you can use the function :py:func:`datafusion.functions.list_cat` or :py:func:`datafusion.functions.list_concat`. +To concatenate two arrays, you can use the function :py:func:`datafusion.functions.array_cat` or :py:func:`datafusion.functions.array_concat`. These functions return a new array that is the concatenation of the input arrays. .. 
ipython:: python from datafusion import SessionContext, col - from datafusion.functions import list_cat, list_concat + from datafusion.functions import array_cat, array_concat ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]}) - df.select(list_cat(col("a"), col("b")).alias("concatenated_array")) + df.select(array_cat(col("a"), col("b")).alias("concatenated_array")) In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`. -To repeat the elements of an array a specified number of times, you can use the function :py:func:`datafusion.functions.list_repeat`. +To create an array that contains a given value repeated a specified number of times, you can use the function :py:func:`datafusion.functions.array_repeat`. This function returns a new array with the elements repeated. .. ipython:: python from datafusion import SessionContext, col, literal - from datafusion.functions import list_repeat + from datafusion.functions import array_repeat ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]]}) - df.select(list_repeat(col("a"), literal(2)).alias("repeated_array")) + df.select(array_repeat(col("a"), literal(2)).alias("repeated_array")) In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`.