From 0ea87dfa20ed35a9787cf83d1e29b62fd69ddb37 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 15 Sep 2025 22:40:05 -0700 Subject: [PATCH 1/2] add iceberg as data source --- docs/source/user-guide/data-sources.rst | 37 +++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index 7d07c67df..f14f38ab3 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -172,10 +172,41 @@ which can lead to a significant performance difference. df = ctx.table("my_delta_table") df.show() -Iceberg -------- +Apache Iceberg +-------------- -Coming soon! +DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface. + +This requires either the `pyiceberg `_ library (>=0.10.0) or the `pyiceberg-core `_ library (>=0.5.0). + +* The ``pyiceberg-core`` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings. +* The ``pyiceberg`` library utilizes the ``pyiceberg-core`` python bindings under the hood and provides a native way for Python users to interact with the DataFusion. + +.. 
code-block:: python + + from datafusion import SessionContext + from pyiceberg.catalog import load_catalog + import pyarrow as pa + + # Load catalog and create/load a table + catalog = load_catalog("catalog", type="in-memory") + catalog.create_namespace_if_not_exists("default") + + # Create some sample data + data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]}) + iceberg_table = catalog.create_table("default.test", schema=data.schema) + iceberg_table.append(data) + + # Register the table with DataFusion + ctx = SessionContext() + ctx.register_table_provider("test", iceberg_table) + + # Query the table using DataFusion + ctx.table("test").show() + + +Note that the DataFusion integration relies on features from the `Iceberg Rust <https://github.com/apache/iceberg-rust/>`_ implementation instead of the `PyIceberg <https://github.com/apache/iceberg-python/>`_ implementation. +Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion. Custom Table Provider --------------------- From 48a95ced8471f3bd01d4069344cc151bab3e7960 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Tue, 16 Sep 2025 08:21:51 -0700 Subject: [PATCH 2/2] fix warning --- docs/source/user-guide/data-sources.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index f14f38ab3..a9b119b93 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -177,7 +177,7 @@ Apache Iceberg DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface. -This requires either the `pyiceberg `_ library (>=0.10.0) or the `pyiceberg-core `_ library (>=0.5.0). +This requires either the `pyiceberg `__ library (>=0.10.0) or the `pyiceberg-core `__ library (>=0.5.0). * The ``pyiceberg-core`` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings.
* The ``pyiceberg`` library utilizes the ``pyiceberg-core`` python bindings under the hood and provides a native way for Python users to interact with the DataFusion.