From 276351c3ae6ca0d198bfb03cc78fd11fb4e98844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Mon, 13 Oct 2025 18:12:15 +0100 Subject: [PATCH 1/3] feat: expose select_exprs method on DataFrame --- python/datafusion/dataframe.py | 9 +++++++++ python/tests/test_dataframe.py | 31 +++++++++++++++++++++++++++++++ src/dataframe.rs | 7 +++++++ 3 files changed, 47 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 16765656a..2f8bfd6c2 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -404,6 +404,15 @@ def select_columns(self, *args: str) -> DataFrame: DataFrame only containing the specified columns. """ return self.select(*args) + + def select_exprs(self, *args: str) -> DataFrame: + """Project arbitrary list of expression strings into a new DataFrame. Method will parse string expressions into logical plan expressions. + The output DataFrame has one column for each element in exprs. + + Returns: + DataFrame only containing the specified columns. + """ + return self.df.select_exprs(*args) def select(self, *exprs: Expr | str) -> DataFrame: """Project arbitrary expressions into a new :py:class:`DataFrame`. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index cd85221c5..8bb01c15b 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -221,6 +221,37 @@ def test_select(df): assert result.column(1) == pa.array([1, 2, 3]) +def test_select_exprs(df): + df_1 = df.select_exprs( + "a + b", + "a - b", + ) + + # execute and collect the first (and only) batch + result = df_1.collect()[0] + + assert result.column(0) == pa.array([5, 7, 9]) + assert result.column(1) == pa.array([-3, -3, -3]) + + df_2 = df.select_exprs("b", "a") + + # execute and collect the first (and only) batch + result = df_2.collect()[0] + + assert result.column(0) == pa.array([4, 5, 6]) + assert result.column(1) == pa.array([1, 2, 3]) + + df_3 = df.select_exprs( + "abs(a + b)", + "abs(a - b)", + ) + + # execute and collect the first (and only) batch + result = df_3.collect()[0] + + assert result.column(0) == pa.array([5, 7, 9]) + assert result.column(1) == pa.array([3, 3, 3]) + def test_drop_quoted_columns(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["ID_For_Students"]) diff --git a/src/dataframe.rs b/src/dataframe.rs index c23c0c97f..f603c28b4 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -435,6 +435,13 @@ impl PyDataFrame { Ok(Self::new(df)) } + #[pyo3(signature = (*args))] + fn select_exprs(&self, args: Vec<String>) -> PyDataFusionResult<Self> { + let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>(); + let df = self.df.as_ref().clone().select_exprs(&args)?; + Ok(Self::new(df)) + } + #[pyo3(signature = (*args))] fn select(&self, args: Vec<PyExpr>) -> PyDataFusionResult<Self> { let expr: Vec<Expr> = args.into_iter().map(|e| e.into()).collect(); From 17862b571e6d69d2286862fe877eefcc2c11b454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Milenkovi=C4=87?= Date: Mon, 13 Oct 2025 18:27:08 +0100 Subject: [PATCH 2/3] change python doc --- python/datafusion/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff
--git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 2f8bfd6c2..7f01f882d 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -406,8 +406,9 @@ def select_columns(self, *args: str) -> DataFrame: return self.select(*args) def select_exprs(self, *args: str) -> DataFrame: - """Project arbitrary list of expression strings into a new DataFrame. Method will parse string expressions into logical plan expressions. - The output DataFrame has one column for each element in exprs. + """Project arbitrary list of expression strings into a new DataFrame. + Method will parse string expressions into logical plan expressions. + The output DataFrame has one column for each element in exprs. Returns: DataFrame only containing the specified columns. From 2f56fe1c7e6f4a235be534eb2d6447d248c61a2a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 13 Oct 2025 15:55:42 -0400 Subject: [PATCH 3/3] ruff linting --- python/datafusion/dataframe.py | 9 +++++---- python/tests/test_dataframe.py | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 7f01f882d..5b4a8aeaf 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -404,11 +404,12 @@ def select_columns(self, *args: str) -> DataFrame: DataFrame only containing the specified columns. """ return self.select(*args) - + def select_exprs(self, *args: str) -> DataFrame: - """Project arbitrary list of expression strings into a new DataFrame. - Method will parse string expressions into logical plan expressions. - The output DataFrame has one column for each element in exprs. + """Project arbitrary list of expression strings into a new DataFrame. + + This method will parse string expressions into logical plan expressions. + The output DataFrame has one column for each expression. Returns: DataFrame only containing the specified columns. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 8bb01c15b..a420e8f21 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -252,6 +252,7 @@ def test_select_exprs(df): assert result.column(0) == pa.array([5, 7, 9]) assert result.column(1) == pa.array([3, 3, 3]) + def test_drop_quoted_columns(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["ID_For_Students"])