From 77759a9a247902dcecb7888a2d78419825fb4c7a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 21 Jan 2025 17:26:09 +0000 Subject: [PATCH] Backport PR #60739: ENH: pandas.api.interchange.from_dataframe now uses the Arrow PyCapsule Interface if available, only falling back to the Dataframe Interchange Protocol if that fails --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/interchange/from_dataframe.py | 27 +++++++++++++++++++++++ pandas/tests/interchange/test_impl.py | 14 +++++++++--- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index f7bd8a8f191ae..65a1c55f674b4 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -36,6 +36,7 @@ Other enhancements when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) +- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 53f18883ea3ad..c0df1c17e3a7c 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -36,6 +36,21 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + From pandas 2.3 onwards, `from_dataframe` uses the PyCapsule Interface, + only falling back to the interchange protocol if that fails. + + .. warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- df : DataFrameXchg @@ -67,6 +82,18 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: if isinstance(df, pd.DataFrame): return df + if hasattr(df, "__arrow_c_stream__"): + try: + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + except ImportError: + # fallback to _from_dataframe + pass + else: + try: + return pa.table(df).to_pandas(zero_copy_only=not allow_copy) + except pa.ArrowInvalid as e: + raise RuntimeError(e) from e + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c32b31c297c5d..5563ee8b4caed 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -288,7 +288,7 @@ def test_empty_pyarrow(data): expected = pd.DataFrame(data) arrow_df = pa_from_dataframe(expected) result = from_dataframe(arrow_df) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) def test_multi_chunk_pyarrow() -> None: @@ -298,8 +298,7 @@ def test_multi_chunk_pyarrow() -> None: table = pa.table([n_legs], names=names) with pytest.raises( RuntimeError, - match="To join chunks a copy is required which is " - "forbidden by allow_copy=False", + match="Cannot do zero copy conversion into multi-column DataFrame block", ): pd.api.interchange.from_dataframe(table, allow_copy=False) @@ -606,3 +605,12 @@ def test_empty_dataframe(): result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) expected = pd.DataFrame({"a": []}, dtype="int8") tm.assert_frame_equal(result, expected) + + +def test_from_dataframe_list_dtype(): + pa = pytest.importorskip("pyarrow", "14.0.0") + data = {"a": [[1, 2], [4, 5, 6]]} + tbl = pa.table(data) + result = from_dataframe(tbl) + expected = pd.DataFrame(data) + tm.assert_frame_equal(result, expected)