From 8e1d5eb5b09643bbf965da8ff376ef1c716bb3f3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 25 Apr 2025 14:24:06 -0400 Subject: [PATCH 1/4] This is a breaking change to move the module datafusion.udf to datafusion.user_defined --- .../user-guide/common-operations/udf-and-udfa.rst | 12 ++++++------ examples/python-udwf.py | 2 +- python/datafusion/__init__.py | 10 +++++++++- python/datafusion/context.py | 2 +- python/datafusion/{udf.py => user_defined.py} | 0 python/tests/test_imports.py | 2 +- python/tests/test_udwf.py | 2 +- 7 files changed, 19 insertions(+), 11 deletions(-) rename python/datafusion/{udf.py => user_defined.py} (100%) diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index ffd7a05cb..e22338305 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -26,7 +26,7 @@ Scalar Functions When writing a user-defined function that can operate on a row by row basis, these are called Scalar Functions. You can define your own scalar function by calling -:py:func:`~datafusion.udf.ScalarUDF.udf` . +:py:func:`~datafusion.user_defined.ScalarUDF.udf` . The basic definition of a scalar UDF is a python function that takes one or more `pyarrow `_ arrays and returns a single array as @@ -93,9 +93,9 @@ converting to Python objects to do the evaluation. Aggregate Functions ------------------- -The :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined +The :py:func:`~datafusion.user_defined.AggregateUDF.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs). To use this you must implement an -:py:class:`~datafusion.udf.Accumulator` that determines how the aggregation is performed. +:py:class:`~datafusion.user_defined.Accumulator` that determines how the aggregation is performed. When defining a UDAF there are four methods you need to implement. The ``update`` function takes the array(s) of input and updates the internal state of the accumulator. You should define this function @@ -153,8 +153,8 @@ Window Functions ---------------- To implement a User-Defined Window Function (UDWF) you must call the -:py:func:`~datafusion.udf.WindowUDF.udwf` function using a class that implements the abstract -class :py:class:`~datafusion.udf.WindowEvaluator`. +:py:func:`~datafusion.user_defined.WindowUDF.udwf` function using a class that implements the abstract +class :py:class:`~datafusion.user_defined.WindowEvaluator`. There are three methods of evaluation of UDWFs. @@ -207,7 +207,7 @@ determine which evaluate functions are called. import pyarrow as pa from datafusion import udwf, col, SessionContext - from datafusion.udf import WindowEvaluator + from datafusion.user_defined import WindowEvaluator class ExponentialSmooth(WindowEvaluator): def __init__(self, alpha: float) -> None: diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 98d118bf2..645ded188 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -22,7 +22,7 @@ from datafusion import col, lit, udwf from datafusion import functions as f from datafusion.expr import WindowFrame -from datafusion.udf import WindowEvaluator +from datafusion.user_defined import WindowEvaluator # This example creates five different examples of user defined window functions in order # to demonstrate the variety of ways a user may need to implement. diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 60d0d61b4..9ae36fece 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -49,7 +49,15 @@ from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream -from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf +from .user_defined import ( + Accumulator, + AggregateUDF, + ScalarUDF, + WindowUDF, + udaf, + udf, + udwf, +) __version__ = importlib_metadata.version(__name__) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 1429a4975..940f597cc 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -30,7 +30,7 @@ from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream -from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF +from datafusion.user_defined import AggregateUDF, ScalarUDF, WindowUDF from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal from ._internal import SessionConfig as SessionConfigInternal diff --git a/python/datafusion/udf.py b/python/datafusion/user_defined.py similarity index 100% rename from python/datafusion/udf.py rename to python/datafusion/user_defined.py diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 9ef7ed89a..fca94b35a 100644 --- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -107,7 +107,7 @@ def test_class_module_is_datafusion(): AggregateUDF, ScalarUDF, ]: - assert klass.__module__ == "datafusion.udf" + assert klass.__module__ == "datafusion.user_defined" # expressions for klass in [Expr, Column, Literal, BinaryExpr, AggregateFunction]: diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 4190e7d64..5aaf00664 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -22,7 +22,7 @@ from datafusion import SessionContext, column, lit, udwf from datafusion import functions as f from datafusion.expr import WindowFrame -from datafusion.udf import WindowEvaluator +from datafusion.user_defined import WindowEvaluator class ExponentialSmoothDefault(WindowEvaluator): From c62d6877fbd9de5f2f575edd1ed9d8f60268c31b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 25 Apr 2025 14:47:32 -0400 Subject: [PATCH 2/4] Documentation updates to make the formatting more consistent with the rest of the site and to correct some errors when building the rst files using autoapi --- docs/source/conf.py | 1 + python/datafusion/user_defined.py | 92 ++++++++++++++----------------- 2 files changed, 41 insertions(+), 52 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0be03d81d..0ca124fd1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -71,6 +71,7 @@ autoapi_member_order = "groupwise" suppress_warnings = ["autoapi.python_import_resolution"] autoapi_python_class_content = "both" +autoapi_keep_files = False # set to True for debugging generated files def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index e93a34ca5..f7302b01a 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -134,19 +134,18 @@ def udf( def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). - This class can be used both as a **function** and as a **decorator**. + This class can be used both as either a function or a decorator. Usage: - - **As a function**: Call `udf(func, input_types, return_type, volatility, - name)`. - - **As a decorator**: Use `@udf(input_types, return_type, volatility, - name)`. In this case, do **not** pass `func` explicitly. + - As a function: ``udf(func, input_types, return_type, volatility, name)``. + - As a decorator: ``@udf(input_types, return_type, volatility, name)``. + When used a decorator, do **not** pass ``func`` explicitly. Args: - func (Callable, optional): **Only needed when calling as a function.** - Skip this argument when using `udf` as a decorator. + func (Callable, optional): Only needed when calling as a function. + Skip this argument when using ``udf`` as a decorator. input_types (list[pa.DataType]): The data types of the arguments - to `func`. This list must be of the same length as the number of + to ``func``. This list must be of the same length as the number of arguments. return_type (_R): The data type of the return value from the function. volatility (Volatility | str): See `Volatility` for allowed values. @@ -156,21 +155,18 @@ def udf(*args: Any, **kwargs: Any): # noqa: D417 A user-defined function that can be used in SQL expressions, data aggregation, or window function calls. - Example: - **Using `udf` as a function:** - ``` + Example: Using ``udf`` as a function:: + def double_func(x): return x * 2 double_udf = udf(double_func, [pa.int32()], pa.int32(), "volatile", "double_it") - ``` - **Using `udf` as a decorator:** - ``` + Example: Using ``udf`` as a decorator:: + @udf([pa.int32()], pa.int32(), "volatile", "double_it") def double_udf(x): return x * 2 - ``` """ def _function( @@ -306,24 +302,22 @@ def udaf( def udaf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Aggregate Function (UDAF). - This class allows you to define an **aggregate function** that can be used in + This class allows you to define an aggregate function that can be used in data aggregation or window function calls. Usage: - - **As a function**: Call `udaf(accum, input_types, return_type, state_type, - volatility, name)`. - - **As a decorator**: Use `@udaf(input_types, return_type, state_type, - volatility, name)`. - When using `udaf` as a decorator, **do not pass `accum` explicitly**. - - **Function example:** - - If your `:py:class:Accumulator` can be instantiated with no arguments, you - can simply pass it's type as `accum`. If you need to pass additional - arguments to it's constructor, you can define a lambda or a factory method. - During runtime the `:py:class:Accumulator` will be constructed for every - instance in which this UDAF is used. The following examples are all valid. - ``` + - As a function: ``udaf(accum, input_types, return_type, state_type, volatility, name)``. + - As a decorator: ``@udaf(input_types, return_type, state_type, volatility, name)``. + When using ``udaf`` as a decorator, do not pass ``accum`` explicitly. + + Function example: + + If your :py:class:`Accumulator` can be instantiated with no arguments, you + can simply pass it's type as `accum`. If you need to pass additional + arguments to it's constructor, you can define a lambda or a factory method. + During runtime the :py:class:`Accumulator` will be constructed for every + instance in which this UDAF is used. The following examples are all valid:: + import pyarrow as pa import pyarrow.compute as pc @@ -352,18 +346,16 @@ def sum_bias_10() -> Summarize: "immutable") udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), [pa.float64()], "immutable") - ``` - **Decorator example:** - ``` + Decorator example::: + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") def udf4() -> Summarize: return Summarize(10.0) - ``` Args: - accum: The accumulator python function. **Only needed when calling as a - function. Skip this argument when using `udaf` as a decorator.** + accum: The accumulator python function. Only needed when calling as a + function. Skip this argument when using ``udaf`` as a decorator. input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. @@ -373,7 +365,7 @@ def udf4() -> Summarize: Returns: A user-defined aggregate function, which can be used in either data aggregation or window function calls. - """ + """ # noqa: E501 W505 def _function( accum: Callable[[], Accumulator], @@ -644,17 +636,15 @@ def udwf( def udwf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Window Function (UDWF). - This class can be used both as a **function** and as a **decorator**. + This class can be used both as either a function or a decorator. Usage: - - **As a function**: Call `udwf(func, input_types, return_type, volatility, - name)`. - - **As a decorator**: Use `@udwf(input_types, return_type, volatility, - name)`. When using `udwf` as a decorator, **do not pass `func` - explicitly**. - - **Function example:** - ``` + - As a function: ``udwf(func, input_types, return_type, volatility, name)``. + - As a decorator: ``@udwf(input_types, return_type, volatility, name)``. + When using ``udwf`` as a decorator, do not pass ``func`` explicitly. + + Function example:: + import pyarrow as pa class BiasedNumbers(WindowEvaluator): @@ -672,18 +662,16 @@ def bias_10() -> BiasedNumbers: udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") - ``` - **Decorator example:** - ``` + Decorator example:: + @udwf(pa.int64(), pa.int64(), "immutable") def biased_numbers() -> BiasedNumbers: return BiasedNumbers(10) - ``` Args: - func: **Only needed when calling as a function. Skip this argument when - using `udwf` as a decorator.** + func: Only needed when calling as a function. Skip this argument when + using ``udwf`` as a decorator. input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. From d4a454652d69959f35cf7faead488169969107c6 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 27 Apr 2025 10:01:58 -0400 Subject: [PATCH 3/4] Set async test parameters that generate a warning when unset in pytest --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d86b657ec..728cedb2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,10 @@ exclude = [".github/**", "ci/**", ".asf.yaml"] locked = true features = ["substrait"] +[tool.pytest.ini_options] +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" + # Enable docstring linting using the google style guide [tool.ruff.lint] select = ["ALL" ] From 17adb72f40232b0f2f0aac78d0f6733a41771c99 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 27 Apr 2025 10:02:22 -0400 Subject: [PATCH 4/4] Add deprecation warning --- python/datafusion/udf.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 python/datafusion/udf.py diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py new file mode 100644 index 000000000..c7265fa09 --- /dev/null +++ b/python/datafusion/udf.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Deprecated module for user defined functions.""" + +import warnings + +from datafusion.user_defined import * # noqa: F403 + +warnings.warn( + "The module 'udf' is deprecated and will be removed in the next release. " + "Please use 'user_defined' instead.", + DeprecationWarning, + stacklevel=2, +)