Skip to content

Arrow support #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: "3.10"
- name: Upgrade pip
run: |
pip install pip --upgrade
Expand All @@ -23,7 +23,8 @@ jobs:
run: |
pip install ray git+https://github.com/modin-project/modin
pip install vaex # use stable as no nightly builds and long build time
pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow --force-reinstall
pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple pandas --ignore-installed --no-deps
- name: Run tests
run: |
pytest tests/ -v --ci
pytest tests/ -vv --ci
20 changes: 10 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,13 @@ def pytest_configure(config):
"test_signatures.py::test_buffer_method[cudf-__dlpack__]",
"test_signatures.py::test_buffer_method[cudf-__dlpack_device__]",
# https://github.com/vaexio/vaex/issues/2083
# https://github.com/vaexio/vaex/issues/2093
# https://github.com/vaexio/vaex/issues/2113
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pandas]",
# https://github.com/modin-project/modin/issues/6143
# https://github.com/data-apis/dataframe-interchange-tests/pull/21#issuecomment-1495914398
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-pyarrow.Table]",
# https://github.com/rapidsai/cudf/issues/11389
"test_column_object.py::test_dtype[cudf]",
# Raises RuntimeError, which is technically correct, but the spec will
# require TypeError soon.
# See https://github.com/data-apis/dataframe-api/pull/74
"test_column_object.py::test_describe_categorical[modin]",
# https://github.com/vaexio/vaex/issues/2113
"test_column_object.py::test_describe_categorical[vaex]",
# https://github.com/modin-project/modin/issues/4687
"test_column_object.py::test_null_count[modin]",
# https://github.com/vaexio/vaex/issues/2121
Expand All @@ -68,9 +63,14 @@ def pytest_configure(config):
"test_column_object.py::test_dtype[vaex]",
# SEGFAULT
"test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-vaex]",
# modin flakiness
# modin flakiness - probably from monkeypatching done in wrappers.py
"test_from_dataframe.py::test_from_dataframe_roundtrip[pandas-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pandas]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-vaex]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[vaex-modin]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[modin-pyarrow.Table]",
"test_from_dataframe.py::test_from_dataframe_roundtrip[pyarrow.Table-modin]",
"test_meta.py::test_frame_equal[modin]",
]
assert not any(case in ci_xfail_ids for case in ci_skip_ids) # sanity check
Expand Down
12 changes: 10 additions & 2 deletions tests/test_column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,16 @@ def test_null_count(libinfo: LibraryInfo, data: st.DataObject):
null_count = col.null_count
if null_count is not None:
assert isinstance(null_count, int)
if mock_col.nominal_dtype != NominalDtype.UTF8: # TODO: test string cols
assert null_count == sum(np.isnan(mock_col.array))
if mock_col.nominal_dtype == NominalDtype.UTF8: # TODO: test string cols
return
nullinfo = col.describe_null
assert isinstance(nullinfo, tuple) and len(nullinfo) == 2 # sanity check
kind, value = nullinfo
nan_count = sum(np.isnan(mock_col.array))
if kind == 0: # non-nullable
assert null_count in [0, nan_count] # XXX: should null_count always be 0?
else:
assert null_count == nan_count


@given(data=st.data())
Expand Down
19 changes: 12 additions & 7 deletions tests/test_dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,16 @@ def test_get_chunks(libinfo: LibraryInfo, data: st.DataObject):
df = data.draw(libinfo.interchange_dataframes(), label="df")
_n_chunks = df.num_chunks()
assert isinstance(_n_chunks, int) # sanity check
n_chunks = data.draw(
st.none() | st.integers(1, 2).map(lambda n: n * _n_chunks), label="n_chunks"
)
if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
args = []
if _n_chunks == 0:
df.get_chunks()
else:
args = [n_chunks]
df.get_chunks(*args)
assert _n_chunks >= 1 # sanity check
n_chunks_strat = st.sampled_from([None, 1])
if _n_chunks > 1:
n_chunks_strat |= st.integers(1, 2).map(lambda n: n * _n_chunks)
n_chunks = data.draw(n_chunks_strat, label="n_chunks")
if n_chunks is None and not data.draw(st.booleans(), label="pass n_chunks"):
args = []
else:
args = [n_chunks]
df.get_chunks(*args)
50 changes: 49 additions & 1 deletion tests/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@
def test_ci_has_correct_library_params(pytestconfig):
if not pytestconfig.getoption("--ci"):
pytest.skip("only intended for --ci runs")
assert set(libname_to_libinfo.keys()) == {"pandas", "vaex", "modin"}
assert set(libname_to_libinfo.keys()) == {
"pandas",
"vaex",
"modin",
"pyarrow.Table",
"pyarrow.RecordBatch",
}


@given(utf8_strings())
Expand Down Expand Up @@ -50,3 +56,45 @@ def test_strategy(libinfo: LibraryInfo, func_name: str, data: st.DataObject):
def test_frame_equal(libinfo: LibraryInfo, data: st.DataObject):
df = data.draw(libinfo.toplevel_dataframes(), label="df")
assert libinfo.frame_equal(df, df)


def test_pandas_frame_equal_string_object_columns():
    """Check pandas ``frame_equal`` across object and ``StringDtype`` columns.

    Builds two single-column frames holding the same string, one with the
    default ``object`` dtype and one with ``pd.StringDtype()``, and asserts
    that ``libinfo.frame_equal`` reports them equal in both argument orders.

    The test is skipped when pandas cannot be imported or has no entry in
    ``libname_to_libinfo``.
    """
    try:
        import pandas as pd

        libinfo = libname_to_libinfo["pandas"]
    except (KeyError, ImportError) as e:
        # Only ImportError carries a ``.msg`` attribute; using it on a
        # KeyError would raise AttributeError instead of skipping. Build the
        # reason from the exception itself so both cases skip cleanly.
        pytest.skip(f"{type(e).__name__}: {e}")
    df1 = pd.DataFrame({"foo": ["bar"]})
    assert df1["foo"].dtype == object  # sanity check
    df2 = pd.DataFrame({"foo": pd.Series(["bar"], dtype=pd.StringDtype())})
    # Equality must hold regardless of which frame is passed first.
    assert libinfo.frame_equal(df1, df2)
    assert libinfo.frame_equal(df2, df1)


@pytest.mark.parametrize("container_name", ["Table", "RecordBatch"])
def test_pyarrow_frame_equal_string_columns(container_name):
    """Check pyarrow ``frame_equal`` across string and large_string columns.

    For the given pyarrow container (``Table`` or ``RecordBatch``), builds two
    frames with the same values: a plain string column ``"a"`` and a
    dictionary-encoded column ``"b"``. The second frame uses
    ``pa.large_string()`` for both the plain values and the dictionary values.
    Asserts ``libinfo.frame_equal`` reports them equal in both argument orders.

    The test is skipped when pyarrow cannot be imported or the container has
    no entry in ``libname_to_libinfo``.
    """
    try:
        import pyarrow as pa

        libinfo = libname_to_libinfo[f"pyarrow.{container_name}"]
    except (KeyError, ImportError) as e:
        # Only ImportError carries a ``.msg`` attribute; using it on a
        # KeyError would raise AttributeError instead of skipping. Build the
        # reason from the exception itself so both cases skip cleanly.
        pytest.skip(f"{type(e).__name__}: {e}")

    container_class = getattr(pa, container_name)
    df1 = container_class.from_pydict(
        {
            "a": pa.array(["foo"]),
            "b": pa.DictionaryArray.from_arrays(pa.array([0]), pa.array(["bar"])),
        }
    )
    df2 = container_class.from_pydict(
        {
            "a": pa.array(["foo"], type=pa.large_string()),
            "b": pa.DictionaryArray.from_arrays(
                pa.array([0]), pa.array(["bar"], type=pa.large_string())
            ),
        }
    )
    # Equality must hold regardless of which frame is passed first.
    assert libinfo.frame_equal(df1, df2)
    assert libinfo.frame_equal(df2, df1)
5 changes: 4 additions & 1 deletion tests/test_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@


def _test_signature(func, stub):
sig = signature(func)
try:
sig = signature(func)
except ValueError:
pytest.skip("Signature not inspectable")
stub_sig = signature(stub)
params = list(sig.parameters.values())
df_stub_params = list(stub_sig.parameters.values())
Expand Down
Loading