From 80611e6a2657f9ec668fc0c3f78b0bb6622a1c4d Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Tue, 11 Mar 2025 19:59:10 +0530 Subject: [PATCH 01/10] Renamed Expr to RawExpr --- python/datafusion/expr.py | 8 ++++---- src/expr.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3639abec6..079e14b21 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -193,7 +193,7 @@ class Expr: :ref:`Expressions` in the online documentation for more information. """ - def __init__(self, expr: expr_internal.Expr) -> None: + def __init__(self, expr: expr_internal.RawExpr) -> None: """This constructor should not be called by the end user.""" self.expr = expr @@ -383,7 +383,7 @@ def literal(value: Any) -> Expr: value = pa.scalar(value, type=pa.string_view()) if not isinstance(value, pa.Scalar): value = pa.scalar(value) - return Expr(expr_internal.Expr.literal(value)) + return Expr(expr_internal.RawExpr.literal(value)) @staticmethod def string_literal(value: str) -> Expr: @@ -398,13 +398,13 @@ def string_literal(value: str) -> Expr: """ if isinstance(value, str): value = pa.scalar(value, type=pa.string()) - return Expr(expr_internal.Expr.literal(value)) + return Expr(expr_internal.RawExpr.literal(value)) return Expr.literal(value) @staticmethod def column(value: str) -> Expr: """Creates a new expression representing a column.""" - return Expr(expr_internal.Expr.column(value)) + return Expr(expr_internal.RawExpr.column(value)) def alias(self, name: str) -> Expr: """Assign a name to the expression.""" diff --git a/src/expr.rs b/src/expr.rs index e750be6a4..d3c528eb4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -101,7 +101,7 @@ pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; /// A PyExpr that can be used on a DataFrame -#[pyclass(name = "Expr", module = "datafusion.expr", subclass)] +#[pyclass(name = "RawExpr", module = "datafusion.expr", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, From 3c15d555e20b6ef727d6dc9fbaba234a5f1570c6 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 12 Mar 2025 00:39:02 +0530 Subject: [PATCH 02/10] Fixed CI test for exported classes to include RawExpr as well --- python/datafusion/expr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 079e14b21..b7a370ad0 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -86,6 +86,7 @@ Partitioning = expr_internal.Partitioning Placeholder = expr_internal.Placeholder Projection = expr_internal.Projection +RawExpr = expr_internal.RawExpr Repartition = expr_internal.Repartition ScalarSubquery = expr_internal.ScalarSubquery ScalarVariable = expr_internal.ScalarVariable @@ -102,6 +103,7 @@ __all__ = [ "Expr", + "RawExpr", "Column", "Literal", "BinaryExpr", From 8affdc0fc5d9b704eb41c34c7302c25b2f1fc943 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 12 Mar 2025 01:13:07 +0530 Subject: [PATCH 03/10] Fixed CI test for exported classes to check if Expr class covers RawExpr --- python/datafusion/expr.py | 6 +++++- python/tests/test_wrapper_coverage.py | 10 ++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index b7a370ad0..213910035 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -86,7 +86,6 @@ Partitioning = expr_internal.Partitioning Placeholder = expr_internal.Placeholder Projection = expr_internal.Projection -RawExpr = expr_internal.RawExpr Repartition = expr_internal.Repartition ScalarSubquery = expr_internal.ScalarSubquery ScalarVariable = expr_internal.ScalarVariable @@ -199,6 +198,11 @@ def __init__(self, expr: expr_internal.RawExpr) -> None: """This constructor should not be called by the end user.""" self.expr = expr + @property + def raw_expr(self) -> expr_internal.RawExpr: + """Returns the underlying RawExpr instance.""" + return self.expr + def to_variant(self) -> Any: """Convert this expression into a python object if possible.""" return self.expr.to_variant() diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index ac064ba95..aaa8d6660 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -34,9 +34,15 @@ def missing_exports(internal_obj, wrapped_obj) -> None: return for attr in dir(internal_obj): - if attr in ["_global_ctx"]: + # Skip internal context and RawExpr (which is handled by Expr class) + if attr in ["_global_ctx", "RawExpr"]: + continue + + # Check if RawExpr functionality is covered by Expr class + if attr == "RawExpr" and hasattr(wrapped_obj, "Expr"): + expr_class = getattr(wrapped_obj, "Expr") + assert hasattr(expr_class, "raw_expr"), "Expr class must provide raw_expr property" continue - assert attr in dir(wrapped_obj) internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) From 25291095ab7a02037da2eb4cee1b487d0e7c3944 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 12 Mar 2025 02:05:52 +0530 Subject: [PATCH 04/10] Generalized Raw* class checking --- python/tests/test_wrapper_coverage.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index aaa8d6660..bd4ab6901 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -34,14 +34,13 @@ def missing_exports(internal_obj, wrapped_obj) -> None: return for attr in dir(internal_obj): - # Skip internal context and RawExpr (which is handled by Expr class) - if attr in ["_global_ctx", "RawExpr"]: + if attr in ["_global_ctx"]: continue - # Check if RawExpr functionality is covered by Expr class - if attr == "RawExpr" and hasattr(wrapped_obj, "Expr"): - expr_class = getattr(wrapped_obj, "Expr") - assert hasattr(expr_class, "raw_expr"), "Expr class must provide raw_expr property" + # Check if Raw* classes have corresponding wrapper classes + if attr.startswith("Raw"): + base_class = attr[3:] # Remove "Raw" prefix + assert hasattr(wrapped_obj, base_class), f"Missing implementation for {attr}" continue internal_attr = getattr(internal_obj, attr) From 42daa1bf7892370c1b9df82741b72ab2c918bed2 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 12 Mar 2025 10:41:39 +0530 Subject: [PATCH 05/10] fixes --- python/datafusion/expr.py | 6 ------ python/tests/test_wrapper_coverage.py | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 213910035..079e14b21 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -102,7 +102,6 @@ __all__ = [ "Expr", - "RawExpr", "Column", "Literal", "BinaryExpr", @@ -198,11 +197,6 @@ def __init__(self, expr: expr_internal.RawExpr) -> None: """This constructor should not be called by the end user.""" self.expr = expr - @property - def raw_expr(self) -> expr_internal.RawExpr: - """Returns the underlying RawExpr instance.""" - return self.expr - def to_variant(self) -> Any: """Convert this expression into a python object if possible.""" return self.expr.to_variant() diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index bd4ab6901..cb44dbcdc 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -36,13 +36,15 @@ def missing_exports(internal_obj, wrapped_obj) -> None: for attr in dir(internal_obj): if attr in ["_global_ctx"]: continue - + # Check if Raw* classes have corresponding wrapper classes if attr.startswith("Raw"): base_class = attr[3:] # Remove "Raw" prefix - assert hasattr(wrapped_obj, base_class), f"Missing implementation for {attr}" + assert hasattr(wrapped_obj, base_class) continue + assert attr in dir(wrapped_obj) + internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) From f995cb8a0cd479b03091c8a2bfa3de66c11b5c18 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Wed, 12 Mar 2025 17:38:52 +0530 Subject: [PATCH 06/10] fixes --- python/tests/test_wrapper_coverage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index cb44dbcdc..c9ea5267b 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -38,7 +38,7 @@ def missing_exports(internal_obj, wrapped_obj) -> None: continue # Check if Raw* classes have corresponding wrapper classes - if attr.startswith("Raw"): + elif attr.startswith("Raw"): base_class = attr[3:] # Remove "Raw" prefix assert hasattr(wrapped_obj, base_class) continue @@ -58,6 +58,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: if isinstance(internal_attr, list): assert isinstance(wrapped_attr, list) for val in internal_attr: + if isinstance(val, str) and val.startswith("Raw"): + continue assert val in wrapped_attr elif hasattr(internal_attr, "__dict__"): missing_exports(internal_attr, wrapped_attr) From 4200a39da857e2e2a710751419819b21b0c1c2b0 Mon Sep 17 00:00:00 2001 From: Spaarsh-root Date: Fri, 14 Mar 2025 02:05:25 +0530 Subject: [PATCH 07/10] fixed the CI test to not look for Raw classes in the datafusion module --- python/tests/test_wrapper_coverage.py | 28 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index c9ea5267b..b8daa2e54 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -28,19 +28,21 @@ def missing_exports(internal_obj, wrapped_obj) -> None: + """ + Special handling for: + - Raw* classes: Internal implementation details that shouldn't be exposed + - _global_ctx: Internal implementation detail + - __self__, __class__: Python special attributes + """ # Special case enums - just make sure they exist since dir() # and other functions get overridden. if isinstance(wrapped_obj, EnumType): return + # iterate through all the classes in datafusion._internal for attr in dir(internal_obj): - if attr in ["_global_ctx"]: - continue - - # Check if Raw* classes have corresponding wrapper classes - elif attr.startswith("Raw"): - base_class = attr[3:] # Remove "Raw" prefix - assert hasattr(wrapped_obj, base_class) + # Skip internal implementation details that shouldn't be exposed in public API + if attr in ["_global_ctx"] or attr.startswith("Raw"): continue assert attr in dir(wrapped_obj) @@ -55,11 +57,17 @@ def missing_exports(internal_obj, wrapped_obj) -> None: if attr in ["__self__", "__class__"]: continue + + # check if the class found in the internal module has a + # wrapper exposed in the public module, datafusion if isinstance(internal_attr, list): assert isinstance(wrapped_attr, list) for val in internal_attr: + # Skip Raw* classes as they are internal if isinstance(val, str) and val.startswith("Raw"): + print("Skipping Raw* class: ", val) continue + assert val in wrapped_attr elif hasattr(internal_attr, "__dict__"): missing_exports(internal_attr, wrapped_attr) @@ -68,7 +76,9 @@ def missing_exports(internal_obj, wrapped_obj) -> None: def test_datafusion_missing_exports() -> None: """Check for any missing python exports. - This test verifies that every exposed class, attribute, and function in - the internal (pyo3) module is also exposed in our python wrappers. + This test verifies that every exposed class, attribute, + and function in the internal (pyo3) module - datafusion._internal + is also exposed in our python wrappers - datafusion - + i.e., the ones exposed to the public. """ missing_exports(datafusion._internal, datafusion) From 580ecd57d75cd0933dfcf0ae5a0c31af3732bd0f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 14 Mar 2025 08:11:27 -0400 Subject: [PATCH 08/10] Add additional text to unit test describing operation and ensure wrapped Raw classes are checked --- python/tests/test_wrapper_coverage.py | 47 +++++++++++++++++---------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index b8daa2e54..40b4eb5bc 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -29,47 +29,60 @@ def missing_exports(internal_obj, wrapped_obj) -> None: """ + Identify if any of the rust exposted structs or functions do not have wrappers. + Special handling for: - Raw* classes: Internal implementation details that shouldn't be exposed - _global_ctx: Internal implementation detail - __self__, __class__: Python special attributes """ - # Special case enums - just make sure they exist since dir() - # and other functions get overridden. + # Special case enums - EnumType overrides a some of the internal functions, + # so check all of the values exist and move on if isinstance(wrapped_obj, EnumType): + expected_values = [v for v in dir(internal_obj) if not v.startswith("__")] + for value in expected_values: + assert value in dir(wrapped_obj) return - # iterate through all the classes in datafusion._internal - for attr in dir(internal_obj): + for internal_attr_name in dir(internal_obj): # Skip internal implementation details that shouldn't be exposed in public API - if attr in ["_global_ctx"] or attr.startswith("Raw"): + if internal_attr_name in ["_global_ctx"]: continue - assert attr in dir(wrapped_obj) + wrapped_attr_name = ( + internal_attr_name[3:] + if internal_attr_name.startswith("Raw") + else internal_attr_name + ) + assert wrapped_attr_name in dir(wrapped_obj) - internal_attr = getattr(internal_obj, attr) - wrapped_attr = getattr(wrapped_obj, attr) + internal_attr = getattr(internal_obj, internal_attr_name) + wrapped_attr = getattr(wrapped_obj, wrapped_attr_name) + # There are some auto generated attributes that can be None, such as + # __kwdefaults__ and __doc__. As long as these are None on the internal + # object, it's okay to skip them. However if they do exist on the internal + # object they must also exist on the wrapped object. if internal_attr is not None: if wrapped_attr is None: - print("Missing attribute: ", attr) + print("Missing attribute: ", internal_attr_name) assert False - if attr in ["__self__", "__class__"]: + if internal_attr_name in ["__self__", "__class__"]: continue - # check if the class found in the internal module has a - # wrapper exposed in the public module, datafusion if isinstance(internal_attr, list): assert isinstance(wrapped_attr, list) + + # We have cases like __all__ that are a list and we want to be certain that + # every value in the list in the internal object is also in the wrapper list for val in internal_attr: - # Skip Raw* classes as they are internal if isinstance(val, str) and val.startswith("Raw"): - print("Skipping Raw* class: ", val) - continue - - assert val in wrapped_attr + assert val[3:] in wrapped_attr + else: + assert val in wrapped_attr elif hasattr(internal_attr, "__dict__"): + # Check all submodules recursively missing_exports(internal_attr, wrapped_attr) From 03d3398bd4fee4eacd883ca4d14b0d2cfcbac3d5 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 14 Mar 2025 08:14:34 -0400 Subject: [PATCH 09/10] New ruff rule on main --- python/tests/test_wrapper_coverage.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index 40b4eb5bc..aef111cfc 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -19,6 +19,7 @@ import datafusion.functions import datafusion.object_store import datafusion.substrait +import pytest # EnumType introduced in 3.11. 3.10 and prior it was called EnumMeta. try: @@ -45,10 +46,6 @@ def missing_exports(internal_obj, wrapped_obj) -> None: return for internal_attr_name in dir(internal_obj): - # Skip internal implementation details that shouldn't be exposed in public API - if internal_attr_name in ["_global_ctx"]: - continue - wrapped_attr_name = ( internal_attr_name[3:] if internal_attr_name.startswith("Raw") @@ -65,8 +62,7 @@ def missing_exports(internal_obj, wrapped_obj) -> None: # object they must also exist on the wrapped object. if internal_attr is not None: if wrapped_attr is None: - print("Missing attribute: ", internal_attr_name) - assert False + pytest.fail(f"Missing attribute: {internal_attr_name}") if internal_attr_name in ["__self__", "__class__"]: continue From a669bcf906f06bf34349022af5da50a602cb94d3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 14 Mar 2025 08:21:25 -0400 Subject: [PATCH 10/10] Resolve ruff errors --- python/tests/test_wrapper_coverage.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index aef111cfc..a2de2d32b 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -28,7 +28,7 @@ from enum import EnumMeta as EnumType -def missing_exports(internal_obj, wrapped_obj) -> None: +def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901 """ Identify if any of the rust exposted structs or functions do not have wrappers. @@ -46,11 +46,7 @@ def missing_exports(internal_obj, wrapped_obj) -> None: return for internal_attr_name in dir(internal_obj): - wrapped_attr_name = ( - internal_attr_name[3:] - if internal_attr_name.startswith("Raw") - else internal_attr_name - ) + wrapped_attr_name = internal_attr_name.removeprefix("Raw") assert wrapped_attr_name in dir(wrapped_obj) internal_attr = getattr(internal_obj, internal_attr_name)