From d5f0aa73e1c0ecace121a648ddb1ec9271252b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Tue, 21 Oct 2025 20:43:38 +0100 Subject: [PATCH 1/7] add SQL expression support for `with_columns` --- python/datafusion/dataframe.py | 29 +++++++++++++++++++++----- python/tests/test_dataframe.py | 38 ++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 645598b59..00fe943bf 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -545,13 +545,13 @@ def with_column(self, name: str, expr: Expr | str) -> DataFrame: return DataFrame(self.df.with_column(name, ensure_expr(expr))) def with_columns( - self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr + self, *exprs: Expr | str | Iterable[Expr | str], **named_exprs: Expr | str ) -> DataFrame: """Add columns to the DataFrame. - By passing expressions, iterables of expressions, or named expressions. + By passing expressions, iterables of expressions, string SQL expressions, or named expressions. All expressions must be :class:`~datafusion.expr.Expr` objects created via - :func:`datafusion.col` or :func:`datafusion.lit`. + :func:`datafusion.col` or :func:`datafusion.lit` or SQL expressions. To pass named expressions use the form ``name=Expr``. Example usage: The following will add 4 columns labeled ``a``, ``b``, ``c``, @@ -565,14 +565,33 @@ def with_columns( ) Args: - exprs: Either a single expression or an iterable of expressions to add. + exprs: Either a single expression, an iterable of expressions to add or string SQL expressions. named_exprs: Named expressions in the form of ``name=expr`` Returns: DataFrame with the new columns added. 
""" - expressions = ensure_expr_list(exprs) + expressions = [] + for expr in exprs: + if isinstance(expr, str): + expr = self.parse_sql_expr(expr) + expressions.append(ensure_expr(expr)) + elif isinstance(expr, Iterable) and not isinstance( + expr, (Expr, str, bytes, bytearray) + ): + expressions.extend( + [ + self.parse_sql_expr(e).expr + if isinstance(e, str) + else ensure_expr(e) + for e in expr + ] + ) + else: + expressions.append(ensure_expr(expr)) + for alias, expr in named_exprs.items(): + expr = self.parse_sql_expr(expr) if isinstance(expr, str) else expr ensure_expr(expr) expressions.append(expr.alias(alias).expr) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index b2333382f..24c40e2c3 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -538,15 +538,35 @@ def test_with_columns(df): assert result.column(6) == pa.array([5, 7, 9]) -def test_with_columns_invalid_expr(df): - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns("a") - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(c="a") - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(["a"]) - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(c=["a"]) +def test_with_columns_str(df): + df = df.with_columns( + "a + b as c", + "a + b as d", + [ + "a + b as e", + "a + b as f", + ], + g=("a + b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.schema.field(0).name == "a" + assert result.schema.field(1).name == "b" + assert result.schema.field(2).name == "c" + assert result.schema.field(3).name == "d" + assert result.schema.field(4).name == "e" + assert result.schema.field(5).name == "f" + assert result.schema.field(6).name == "g" + + assert result.column(0) == pa.array([1, 2, 3]) + assert result.column(1) == pa.array([4, 5, 6]) + assert result.column(2) == 
pa.array([5, 7, 9]) + assert result.column(3) == pa.array([5, 7, 9]) + assert result.column(4) == pa.array([5, 7, 9]) + assert result.column(5) == pa.array([5, 7, 9]) + assert result.column(6) == pa.array([5, 7, 9]) def test_cast(df): From 4aae0eb2e3865c46adecf820384507bcd242b836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Tue, 21 Oct 2025 20:56:11 +0100 Subject: [PATCH 2/7] fix ruff errors --- python/datafusion/dataframe.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 00fe943bf..01c9ed960 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -549,7 +549,8 @@ def with_columns( ) -> DataFrame: """Add columns to the DataFrame. - By passing expressions, iterables of expressions, string SQL expressions, or named expressions. + By passing expressions, iterables of expressions, string SQL expressions, + or named expressions. All expressions must be :class:`~datafusion.expr.Expr` objects created via :func:`datafusion.col` or :func:`datafusion.lit` or SQL expressions. To pass named expressions use the form ``name=Expr``. @@ -565,7 +566,8 @@ def with_columns( ) Args: - exprs: Either a single expression, an iterable of expressions to add or string SQL expressions. + exprs: Either a single expression, an iterable of expressions to add or + string SQL expressions. 
named_exprs: Named expressions in the form of ``name=expr`` Returns: @@ -574,8 +576,7 @@ def with_columns( expressions = [] for expr in exprs: if isinstance(expr, str): - expr = self.parse_sql_expr(expr) - expressions.append(ensure_expr(expr)) + expressions.append(self.parse_sql_expr(expr).expr) elif isinstance(expr, Iterable) and not isinstance( expr, (Expr, str, bytes, bytearray) ): @@ -591,9 +592,9 @@ def with_columns( expressions.append(ensure_expr(expr)) for alias, expr in named_exprs.items(): - expr = self.parse_sql_expr(expr) if isinstance(expr, str) else expr - ensure_expr(expr) - expressions.append(expr.alias(alias).expr) + e = self.parse_sql_expr(expr) if isinstance(expr, str) else expr + ensure_expr(e) + expressions.append(e.alias(alias).expr) return DataFrame(self.df.with_columns(expressions)) From 25809a0b82292395c59ab64367a8a162d5b312f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Wed, 22 Oct 2025 17:07:55 +0100 Subject: [PATCH 3/7] Update python/datafusion/dataframe.py Co-authored-by: Hendrik Makait --- python/datafusion/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 01c9ed960..ae4262d54 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -567,7 +567,7 @@ def with_columns( Args: exprs: Either a single expression, an iterable of expressions to add or - string SQL expressions. + SQL expression strings. 
named_exprs: Named expressions in the form of ``name=expr`` Returns: From a37c5da1ec3e433745bc90d2bfc2f8857ff4441f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Wed, 22 Oct 2025 17:08:07 +0100 Subject: [PATCH 4/7] Update python/datafusion/dataframe.py Co-authored-by: Hendrik Makait --- python/datafusion/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index ae4262d54..72fc11051 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -552,7 +552,7 @@ def with_columns( By passing expressions, iterables of expressions, string SQL expressions, or named expressions. All expressions must be :class:`~datafusion.expr.Expr` objects created via - :func:`datafusion.col` or :func:`datafusion.lit` or SQL expressions. + :func:`datafusion.col` or :func:`datafusion.lit`, or SQL expression strings. To pass named expressions use the form ``name=Expr``. Example usage: The following will add 4 columns labeled ``a``, ``b``, ``c``, From e8332d6529e230ae92674060573d9c0f96255537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Wed, 22 Oct 2025 17:23:17 +0100 Subject: [PATCH 5/7] remove parentheses --- python/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 24c40e2c3..c3a5253c4 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -546,7 +546,7 @@ def test_with_columns_str(df): "a + b as e", "a + b as f", ], - g=("a + b"), + g="a + b", ) # execute and collect the first (and only) batch From b13216c296dd1caf2ff6997264b898b7bdda34be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Fri, 24 Oct 2025 16:06:15 +0100 Subject: [PATCH 6/7] update example --- python/datafusion/dataframe.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 72fc11051..bc9177f77 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -565,6 +565,14 @@ def with_columns( d=lit(3) ) + Equivalent example using just SQL strings: + + df = df.with_columns( + "x as a", + ["1 as b", "y as c"], + d="3" + ) + Args: exprs: Either a single expression, an iterable of expressions to add or SQL expression strings. From 2e4aff2bef0d264e3ed1d0cc86e10b42c815de56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Fri, 24 Oct 2025 20:38:24 +0100 Subject: [PATCH 7/7] fix indent --- python/datafusion/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index bc9177f77..eed30f577 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -565,7 +565,7 @@ def with_columns( d=lit(3) ) - Equivalent example using just SQL strings: + Equivalent example using just SQL strings: df = df.with_columns( "x as a",