
Commit 015dd76

Pass Field information back and forth when using scalar UDFs (#1299)
* Pass Field information back and forth when using scalar UDFs
* Add ArrowArrayExportable class and use it to create pyarrow arrays for python UDFs
* Minor user documentation update
* Update naming from type to field where appropriate
* Add unit test to check field inputs
* Update docstring
* Add text to user documentation on passing field information for scalar UDFs
* Minor change requested in code review
* Make type hints match outer
1 parent 3227276 commit 015dd76

File tree

8 files changed: +348 −113 lines changed


docs/source/user-guide/common-operations/udf-and-udfa.rst

Lines changed: 11 additions & 0 deletions
@@ -90,6 +90,17 @@ converting to Python objects to do the evaluation.
 
     df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show()
 
+In this example we passed the PyArrow ``DataType`` when we defined the function
+by calling ``udf()``. If you need additional control, such as specifying
+metadata or nullability of the input or output, you can instead specify a
+PyArrow ``Field``.
+
+If you need to write a custom function but do not want to incur the performance
+cost of converting to Python objects and back, a more advanced approach is to
+write Rust based UDFs and to expose them to Python. There is an example in the
+`DataFusion blog <https://datafusion.apache.org/blog/2024/11/19/datafusion-python-udf-comparisons/>`_
+describing how to do this.
+
 Aggregate Functions
 -------------------
 
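
For readers following along, a minimal sketch of what the ``Field``-based definition described in the added documentation could look like (the function and field names below are illustrative, not part of the commit):

    import pyarrow as pa
    import pyarrow.compute as pc
    from datafusion import udf

    # A Field lets you state nullability and attach metadata, which a bare DataType cannot.
    in_field = pa.field("values", pa.int64(), nullable=False, metadata={"unit": "ms"})
    out_field = pa.field("doubled", pa.int64(), nullable=False)

    @udf([in_field], out_field, "immutable")
    def double_ms(arr: pa.Array) -> pa.Array:
        # Operates on the Arrow array directly; no per-row conversion to Python objects.
        return pc.multiply(arr, 2)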

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ dev = [
     "maturin>=1.8.1",
     "numpy>1.25.0;python_version<'3.14'",
     "numpy>=2.3.2;python_version>='3.14'",
+    "pyarrow>=19.0.0",
     "pre-commit>=4.3.0",
     "pyyaml>=6.0.3",
     "pytest>=7.4.4",

python/datafusion/user_defined.py

Lines changed: 53 additions & 26 deletions
@@ -34,7 +34,7 @@
     from _typeshed import CapsuleType as _PyCapsule
 
     _R = TypeVar("_R", bound=pa.DataType)
-from collections.abc import Callable
+from collections.abc import Callable, Sequence
 
 
 class Volatility(Enum):
@@ -81,6 +81,27 @@ def __str__(self) -> str:
         return self.name.lower()
 
 
+def data_type_or_field_to_field(value: pa.DataType | pa.Field, name: str) -> pa.Field:
+    """Helper function to return a Field from either a Field or DataType."""
+    if isinstance(value, pa.Field):
+        return value
+    return pa.field(name, type=value)
+
+
+def data_types_or_fields_to_field_list(
+    inputs: Sequence[pa.Field | pa.DataType] | pa.Field | pa.DataType,
+) -> list[pa.Field]:
+    """Helper function to return a list of Fields."""
+    if isinstance(inputs, pa.DataType):
+        return [pa.field("value", type=inputs)]
+    if isinstance(inputs, pa.Field):
+        return [inputs]
+
+    return [
+        data_type_or_field_to_field(v, f"value_{idx}") for (idx, v) in enumerate(inputs)
+    ]
+
+
 class ScalarUDFExportable(Protocol):
     """Type hint for object that has __datafusion_scalar_udf__ PyCapsule."""
 
@@ -103,8 +124,8 @@ def __init__(
         self,
         name: str,
         func: Callable[..., _R],
-        input_types: pa.DataType | list[pa.DataType],
-        return_type: _R,
+        input_fields: list[pa.Field],
+        return_field: _R,
         volatility: Volatility | str,
     ) -> None:
         """Instantiate a scalar user-defined function (UDF).
@@ -114,10 +135,10 @@ def __init__(
         if hasattr(func, "__datafusion_scalar_udf__"):
             self._udf = df_internal.ScalarUDF.from_pycapsule(func)
             return
-        if isinstance(input_types, pa.DataType):
-            input_types = [input_types]
+        if isinstance(input_fields, pa.DataType):
+            input_fields = [input_fields]
         self._udf = df_internal.ScalarUDF(
-            name, func, input_types, return_type, str(volatility)
+            name, func, input_fields, return_field, str(volatility)
         )
 
     def __repr__(self) -> str:
@@ -136,8 +157,8 @@ def __call__(self, *args: Expr) -> Expr:
     @overload
     @staticmethod
     def udf(
-        input_types: list[pa.DataType],
-        return_type: _R,
+        input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+        return_field: pa.DataType | pa.Field,
         volatility: Volatility | str,
         name: str | None = None,
     ) -> Callable[..., ScalarUDF]: ...
@@ -146,8 +167,8 @@ def udf(
     @staticmethod
     def udf(
         func: Callable[..., _R],
-        input_types: list[pa.DataType],
-        return_type: _R,
+        input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+        return_field: pa.DataType | pa.Field,
         volatility: Volatility | str,
         name: str | None = None,
     ) -> ScalarUDF: ...
@@ -163,20 +184,24 @@ def udf(*args: Any, **kwargs: Any):  # noqa: D417
         This class can be used both as either a function or a decorator.
 
         Usage:
-            - As a function: ``udf(func, input_types, return_type, volatility, name)``.
-            - As a decorator: ``@udf(input_types, return_type, volatility, name)``.
+            - As a function: ``udf(func, input_fields, return_field, volatility, name)``.
+            - As a decorator: ``@udf(input_fields, return_field, volatility, name)``.
         When used a decorator, do **not** pass ``func`` explicitly.
 
+        In lieu of passing a PyArrow Field, you can pass a DataType for simplicity.
+        When you do so, it will be assumed that the nullability of the inputs and
+        output are True and that they have no metadata.
+
         Args:
             func (Callable, optional): Only needed when calling as a function.
                 Skip this argument when using `udf` as a decorator. If you have a Rust
                 backed ScalarUDF within a PyCapsule, you can pass this parameter
                 and ignore the rest. They will be determined directly from the
                 underlying function. See the online documentation for more information.
-            input_types (list[pa.DataType]): The data types of the arguments
-                to ``func``. This list must be of the same length as the number of
-                arguments.
-            return_type (_R): The data type of the return value from the function.
+            input_fields (list[pa.Field | pa.DataType]): The data types or Fields
+                of the arguments to ``func``. This list must be of the same length
+                as the number of arguments.
+            return_field (_R): The field of the return value from the function.
             volatility (Volatility | str): See `Volatility` for allowed values.
             name (Optional[str]): A descriptive name for the function.
 
@@ -196,12 +221,12 @@ def double_func(x):
             @udf([pa.int32()], pa.int32(), "volatile", "double_it")
             def double_udf(x):
                 return x * 2
-        """
+        """  # noqa: W505 E501
 
         def _function(
             func: Callable[..., _R],
-            input_types: list[pa.DataType],
-            return_type: _R,
+            input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+            return_field: pa.DataType | pa.Field,
             volatility: Volatility | str,
             name: str | None = None,
         ) -> ScalarUDF:
@@ -213,23 +238,25 @@ def _function(
                 name = func.__qualname__.lower()
             else:
                 name = func.__class__.__name__.lower()
+            input_fields = data_types_or_fields_to_field_list(input_fields)
+            return_field = data_type_or_field_to_field(return_field, "value")
             return ScalarUDF(
                 name=name,
                 func=func,
-                input_types=input_types,
-                return_type=return_type,
+                input_fields=input_fields,
+                return_field=return_field,
                 volatility=volatility,
             )
 
         def _decorator(
-            input_types: list[pa.DataType],
-            return_type: _R,
+            input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field,
+            return_field: _R,
             volatility: Volatility | str,
             name: str | None = None,
         ) -> Callable:
            def decorator(func: Callable) -> Callable:
                 udf_caller = ScalarUDF.udf(
-                    func, input_types, return_type, volatility, name
+                    func, input_fields, return_field, volatility, name
                 )
 
                 @functools.wraps(func)
@@ -260,8 +287,8 @@ def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF:
         return ScalarUDF(
             name=name,
             func=func,
-            input_types=None,
-            return_type=None,
+            input_fields=None,
+            return_field=None,
             volatility=None,
         )
 
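
As a quick illustration of the two helpers added above, here is a hedged sketch of how mixed ``DataType``/``Field`` inputs are normalized (assuming the helpers stay importable from ``datafusion.user_defined``):

    import pyarrow as pa
    from datafusion.user_defined import (
        data_type_or_field_to_field,
        data_types_or_fields_to_field_list,
    )

    # Bare DataTypes are wrapped in Fields named "value_<idx>" (nullable, no metadata);
    # Fields pass through untouched.
    fields = data_types_or_fields_to_field_list(
        [pa.int64(), pa.field("b", pa.string(), nullable=False)]
    )
    assert fields[0] == pa.field("value_0", pa.int64())
    assert fields[1].name == "b" and not fields[1].nullable

    # A lone DataType for the return value becomes a Field named "value".
    assert data_type_or_field_to_field(pa.float64(), "value") == pa.field("value", pa.float64())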

python/tests/test_udf.py

Lines changed: 85 additions & 1 deletion
@@ -17,7 +17,8 @@
 
 import pyarrow as pa
 import pytest
-from datafusion import column, udf
+from datafusion import SessionContext, column, udf
+from datafusion import functions as f
 
 
 @pytest.fixture
@@ -124,3 +125,86 @@ def udf_with_param(values: pa.Array) -> pa.Array:
     result = df2.collect()[0].column(0)
 
     assert result == pa.array([False, True, True])
+
+
+def test_udf_with_metadata(ctx) -> None:
+    from uuid import UUID
+
+    @udf([pa.string()], pa.uuid(), "stable")
+    def uuid_from_string(uuid_string):
+        return pa.array((UUID(s).bytes for s in uuid_string.to_pylist()), pa.uuid())
+
+    @udf([pa.uuid()], pa.int64(), "stable")
+    def uuid_version(uuid):
+        return pa.array(s.version for s in uuid.to_pylist())
+
+    batch = pa.record_batch({"idx": pa.array(range(5))})
+    results = (
+        ctx.create_dataframe([[batch]])
+        .with_column("uuid_string", f.uuid())
+        .with_column("uuid", uuid_from_string(column("uuid_string")))
+        .select(uuid_version(column("uuid").alias("uuid_version")))
+        .collect()
+    )
+
+    assert results[0][0].to_pylist() == [4, 4, 4, 4, 4]
+
+
+def test_udf_with_nullability(ctx: SessionContext) -> None:
+    import pyarrow.compute as pc
+
+    field_nullable_i64 = pa.field("with_nulls", type=pa.int64(), nullable=True)
+    field_non_nullable_i64 = pa.field("no_nulls", type=pa.int64(), nullable=False)
+
+    @udf([field_nullable_i64], field_nullable_i64, "stable")
+    def nullable_abs(input_col):
+        return pc.abs(input_col)
+
+    @udf([field_non_nullable_i64], field_non_nullable_i64, "stable")
+    def non_nullable_abs(input_col):
+        return pc.abs(input_col)
+
+    batch = pa.record_batch(
+        {
+            "with_nulls": pa.array([-2, None, 0, 1, 2]),
+            "no_nulls": pa.array([-2, -1, 0, 1, 2]),
+        },
+        schema=pa.schema(
+            [
+                field_nullable_i64,
+                field_non_nullable_i64,
+            ]
+        ),
+    )
+    ctx.register_record_batches("t", [[batch]])
+    df = ctx.table("t")
+
+    # Input matches expected, nullable
+    df_result = df.select(nullable_abs(column("with_nulls")))
+    returned_field = df_result.schema().field(0)
+    assert returned_field.nullable
+    results = df_result.collect()
+    assert results[0][0].to_pylist() == [2, None, 0, 1, 2]
+
+    # Input coercible to expected, nullable
+    df_result = df.select(nullable_abs(column("no_nulls")))
+    returned_field = df_result.schema().field(0)
+    assert returned_field.nullable
+    results = df_result.collect()
+    assert results[0][0].to_pylist() == [2, 1, 0, 1, 2]
+
+    # Input matches expected, no nulls
+    df_result = df.select(non_nullable_abs(column("no_nulls")))
+    returned_field = df_result.schema().field(0)
+    assert not returned_field.nullable
+    results = df_result.collect()
+    assert results[0][0].to_pylist() == [2, 1, 0, 1, 2]
+
+    # Invalid - requires non-nullable input but that is not possible
+    df_result = df.select(non_nullable_abs(column("with_nulls")))
+    returned_field = df_result.schema().field(0)
+    assert not returned_field.nullable
+
+    with pytest.raises(Exception) as e_info:
+        _results = df_result.collect()
+    assert "InvalidArgumentError" in str(e_info)

src/array.rs

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::{Array, ArrayRef};
+use arrow::datatypes::{Field, FieldRef};
+use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+use arrow::pyarrow::ToPyArrow;
+use pyo3::prelude::{PyAnyMethods, PyCapsuleMethods};
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyAny, PyResult, Python};
+
+use crate::errors::PyDataFusionResult;
+use crate::utils::validate_pycapsule;
+
+/// A Python object which implements the Arrow PyCapsule for importing
+/// into other libraries.
+#[pyclass(name = "ArrowArrayExportable", module = "datafusion", frozen)]
+#[derive(Clone)]
+pub struct PyArrowArrayExportable {
+    array: ArrayRef,
+    field: FieldRef,
+}
+
+#[pymethods]
+impl PyArrowArrayExportable {
+    #[pyo3(signature = (requested_schema=None))]
+    fn __arrow_c_array__<'py>(
+        &'py self,
+        py: Python<'py>,
+        requested_schema: Option<Bound<'py, PyCapsule>>,
+    ) -> PyDataFusionResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> {
+        let field = if let Some(schema_capsule) = requested_schema {
+            validate_pycapsule(&schema_capsule, "arrow_schema")?;
+
+            let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() };
+            let desired_field = Field::try_from(schema_ptr)?;
+
+            Arc::new(desired_field)
+        } else {
+            Arc::clone(&self.field)
+        };
+
+        let ffi_schema = FFI_ArrowSchema::try_from(&field)?;
+        let schema_capsule = PyCapsule::new(py, ffi_schema, Some(cr"arrow_schema".into()))?;
+
+        let ffi_array = FFI_ArrowArray::new(&self.array.to_data());
+        let array_capsule = PyCapsule::new(py, ffi_array, Some(cr"arrow_array".into()))?;
+
+        Ok((schema_capsule, array_capsule))
+    }
+}
+
+impl ToPyArrow for PyArrowArrayExportable {
+    fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let module = py.import("pyarrow")?;
+        let method = module.getattr("array")?;
+        let array = method.call((self.clone(),), None)?;
+        Ok(array)
+    }
+}
+
+impl PyArrowArrayExportable {
+    pub fn new(array: ArrayRef, field: FieldRef) -> Self {
+        Self { array, field }
+    }
+}
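
The ``ToPyArrow`` implementation above simply hands the object to ``pyarrow.array``, which imports it through the Arrow PyCapsule interface. A rough Python analogue of that handshake, assuming a pyarrow release that consumes ``__arrow_c_array__`` (the ``Wrapper`` class is purely illustrative):

    import pyarrow as pa

    class Wrapper:
        """Illustrative object exposing __arrow_c_array__, as ArrowArrayExportable does."""

        def __init__(self, array: pa.Array) -> None:
            self._array = array

        def __arrow_c_array__(self, requested_schema=None):
            # Must return (schema_capsule, array_capsule); here we delegate to
            # pyarrow's own implementation instead of building FFI structs by hand.
            return self._array.__arrow_c_array__(requested_schema)

    wrapped = Wrapper(pa.array([1, 2, 3]))
    imported = pa.array(wrapped)  # pyarrow imports the data via the two capsules
    assert imported.equals(pa.array([1, 2, 3]))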

src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ pub mod store;
 pub mod table;
 pub mod unparser;
 
+mod array;
 #[cfg(feature = "substrait")]
 pub mod substrait;
 #[allow(clippy::borrow_deref_ref)]
