Skip to content
Open
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,9 @@ Other enhancements
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
- Switched wheel upload to **PyPI Trusted Publishing** (OIDC) for release-tag pushes in ``wheels.yml``. (:issue:`61718`)
-
- Added a new :meth:`DataFrame.from_arrow` method to import any Arrow-compatible
tabular data object into a pandas :class:`DataFrame` through the
`Arrow PyCapsule Protocol <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`__ (:issue:`59631`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
Expand Down
40 changes: 40 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,4 +533,44 @@ def closed(self) -> bool:

SliceType: TypeAlias = Hashable | None


# Arrow PyCapsule Interface
# from https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints


class ArrowArrayExportable(Protocol):
    """
    Structural type for objects that define ``__arrow_c_array__``.

    Having this method marks an object as Arrow-compatible: it implements
    the `Arrow PyCapsule Protocol`_, which exposes the
    `Arrow C Data Interface`_ in Python so that Arrow data can be
    exchanged between libraries without copying.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html

    """

    def __arrow_c_array__(
        self,
        requested_schema: object | None = None,
    ) -> tuple[object, object]:
        # Protocol stub: implementers supply the actual export.
        ...


class ArrowStreamExportable(Protocol):
    """
    An object with an ``__arrow_c_stream__`` method.

    This method indicates the object is an Arrow-compatible object implementing
    the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Stream Interface`_
    in Python), enabling zero-copy Arrow data interchange across libraries.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Stream Interface: https://arrow.apache.org/docs/format/CStreamInterface.html

    """

    # Protocol stub: implementers supply the actual export.
    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...


__all__ = ["type_t"]
52 changes: 52 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@
AnyAll,
AnyArrayLike,
ArrayLike,
ArrowArrayExportable,
ArrowStreamExportable,
Axes,
Axis,
AxisInt,
Expand Down Expand Up @@ -1836,6 +1838,56 @@ def __rmatmul__(self, other) -> DataFrame:
# ----------------------------------------------------------------------
# IO methods (to / from other formats)

@classmethod
def from_arrow(
    cls, data: ArrowArrayExportable | ArrowStreamExportable
) -> DataFrame:
    """
    Construct a DataFrame from a tabular Arrow object.

    This function accepts any Arrow-compatible tabular object implementing
    the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
    or ``__arrow_c_stream__`` method).

    This function currently relies on ``pyarrow`` (version 14.0.0 or
    later) to convert the tabular object in Arrow format to pandas.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    .. versionadded:: 3.0

    Parameters
    ----------
    data : pyarrow.Table or Arrow-compatible table
        Any tabular object implementing the Arrow PyCapsule Protocol
        (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
        method).

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``data`` is not a ``pyarrow.Table`` and has neither an
        ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.

    """
    pa = import_optional_dependency("pyarrow", min_version="14.0.0")

    if isinstance(data, pa.Table):
        pa_table = data
    else:
        if not (
            hasattr(data, "__arrow_c_array__")
            or hasattr(data, "__arrow_c_stream__")
        ):
            # explicitly test this, because otherwise we would accept various
            # other input types through the pa.table(..) call
            raise TypeError(
                "Expected an Arrow-compatible tabular object (i.e. having an "
                "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
                f"'{type(data).__name__}' instead."
            )
        pa_table = pa.table(data)

    return pa_table.to_pandas()

@classmethod
def from_dict(
cls,
Expand Down
44 changes: 44 additions & 0 deletions pandas/tests/frame/test_arrow_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pa = pytest.importorskip("pyarrow")

Expand Down Expand Up @@ -45,3 +46,46 @@ def test_dataframe_to_arrow(using_infer_string):
table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
expected = expected.cast(schema)
assert table.equals(expected)


class ArrowArrayWrapper:
    """Thin wrapper exposing only ``__arrow_c_array__`` by delegation."""

    def __init__(self, batch):
        # Hold on to the wrapped object; it performs the actual export.
        self.array = batch

    def __arrow_c_array__(self, requested_schema=None):
        """Forward the call to the wrapped object's ``__arrow_c_array__``."""
        wrapped = self.array
        return wrapped.__arrow_c_array__(requested_schema)


class ArrowStreamWrapper:
    """Thin wrapper exposing only ``__arrow_c_stream__`` by delegation."""

    def __init__(self, table):
        # Hold on to the wrapped object; it performs the actual export.
        self.stream = table

    def __arrow_c_stream__(self, requested_schema=None):
        """Forward the call to the wrapped object's ``__arrow_c_stream__``."""
        wrapped = self.stream
        return wrapped.__arrow_c_stream__(requested_schema)


@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_from_arrow():
    # Check that DataFrame.from_arrow accepts pyarrow objects as well as
    # third-party objects that only implement the PyCapsule dunder methods,
    # and rejects inputs that implement neither.

    # objects with __arrow_c_stream__
    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    result = pd.DataFrame.from_arrow(table)
    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    tm.assert_frame_equal(result, expected)

    # not only pyarrow objects are supported
    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
    tm.assert_frame_equal(result, expected)

    # objects with __arrow_c_array__
    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])

    # pass the record batch itself (previously this re-tested ``table``)
    result = pd.DataFrame.from_arrow(batch)
    tm.assert_frame_equal(result, expected)

    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
    tm.assert_frame_equal(result, expected)

    # only accept actual Arrow objects
    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})
Loading