-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: groupby with CategoricalIndex doesn't include unobserved categories #49373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
86cc43d
eb40033
57b8e07
8273de1
6f779a1
513c322
81fbdce
30c3253
e65dd2e
efc6303
35e22e4
27f39ed
df0ce75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -496,11 +496,17 @@ def __init__( | |
# In extant tests, the new self.grouping_vector matches | ||
# `index.get_level_values(ilevel)` whenever | ||
# mapper is None and isinstance(index, MultiIndex) | ||
# TODO: Can you have two levels with the same name? | ||
if isinstance(index, MultiIndex): | ||
index = index.get_level_values(ilevel) | ||
( | ||
self.grouping_vector, # Index | ||
self._codes, | ||
self._group_index, | ||
) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna) | ||
) = index._get_grouper_for_level(mapper, dropna=dropna) | ||
# We've modified the passed index; make sure it isn't used | ||
# in the remainder of this method | ||
del index | ||
|
||
# a passed Grouper like, directly get the grouper in the same way | ||
# as single grouper groupby, use the group_info to get codes | ||
|
@@ -524,15 +530,6 @@ def __init__( | |
# use Index instead of ndarray so we can recover the name | ||
self.grouping_vector = Index(ng, name=newgrouper.result_index.name) | ||
|
||
elif is_categorical_dtype(self.grouping_vector): | ||
# a passed Categorical | ||
self._passed_categorical = True | ||
|
||
self._orig_cats = self.grouping_vector.categories | ||
self.grouping_vector, self._all_grouper = recode_for_groupby( | ||
self.grouping_vector, sort, observed | ||
) | ||
|
||
elif not isinstance( | ||
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray) | ||
): | ||
|
@@ -562,6 +559,14 @@ def __init__( | |
# TODO 2022-10-08 we only have one test that gets here and | ||
# values are already in nanoseconds in that case. | ||
self.grouping_vector = Series(self.grouping_vector).to_numpy() | ||
elif is_categorical_dtype(self.grouping_vector): | ||
Review comment: Any particular reason this was moved from above? Reply: Previously, this block was in an [reply truncated in source] |
||
# a passed Categorical | ||
self._passed_categorical = True | ||
|
||
self._orig_cats = self.grouping_vector.categories | ||
self.grouping_vector, self._all_grouper = recode_for_groupby( | ||
self.grouping_vector, sort, observed | ||
) | ||
|
||
def __repr__(self) -> str: | ||
return f"Grouping({self.name})" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -487,6 +487,60 @@ def test_observed_groups(observed): | |
tm.assert_dict_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"keys, expected_values, expected_index_levels", | ||
[ | ||
("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")), | ||
( | ||
["a", "b"], | ||
[7, 8, 0, 0, 0, 9, 0, 0, 0], | ||
[CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])], | ||
), | ||
( | ||
["a", "a2"], | ||
[15, 0, 0, 0, 9, 0, 0, 0, 0], | ||
[ | ||
CategoricalIndex([1, 2, 3], name="a"), | ||
CategoricalIndex([1, 2, 3], name="a"), | ||
], | ||
), | ||
], | ||
) | ||
@pytest.mark.parametrize("test_series", [True, False]) | ||
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series): | ||
# GH#49354 - ensure unobserved cats occur when grouping by index levels | ||
df = DataFrame( | ||
{ | ||
"a": Categorical([1, 1, 2], categories=[1, 2, 3]), | ||
"a2": Categorical([1, 1, 2], categories=[1, 2, 3]), | ||
"b": [4, 5, 6], | ||
"c": [7, 8, 9], | ||
} | ||
).set_index(["a", "a2"]) | ||
if "b" not in keys: | ||
# Only keep b when it is used for grouping for consistent columns in the result | ||
df = df.drop(columns="b") | ||
|
||
gb = df.groupby(keys, observed=False) | ||
if test_series: | ||
gb = gb["c"] | ||
result = gb.sum() | ||
|
||
if len(keys) == 1: | ||
index = expected_index_levels | ||
else: | ||
codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]] | ||
index = MultiIndex( | ||
expected_index_levels, | ||
codes=codes, | ||
names=keys, | ||
) | ||
expected = DataFrame({"c": expected_values}, index=index) | ||
if test_series: | ||
expected = expected["c"] | ||
tm.assert_equal(result, expected) | ||
|
||
|
||
def test_observed_groups_with_nan(observed): | ||
# GH 24740 | ||
df = DataFrame( | ||
|
@@ -1235,10 +1289,10 @@ def df_cat(df): | |
@pytest.mark.parametrize("operation", ["agg", "apply"]) | ||
def test_seriesgroupby_observed_true(df_cat, operation): | ||
# GH 24880 | ||
Review comment: Nit: Could you add the GH reference related to why this test changed? Reply: Done. |
||
lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A") | ||
lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B") | ||
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A") | ||
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B") | ||
index = MultiIndex.from_arrays([lev_a, lev_b]) | ||
expected = Series(data=[1, 3, 2, 4], index=index, name="C") | ||
expected = Series(data=[2, 4, 1, 3], index=index, name="C") | ||
|
||
grouped = df_cat.groupby(["A", "B"], observed=True)["C"] | ||
result = getattr(grouped, operation)(sum) | ||
|
@@ -1272,16 +1326,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): | |
True, | ||
MultiIndex.from_arrays( | ||
[ | ||
Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"), | ||
Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"), | ||
Index( | ||
["one", "one", "two", "two", "one", "one", "three", "three"], | ||
["one", "one", "three", "three", "one", "one", "two", "two"], | ||
dtype="category", | ||
name="B", | ||
), | ||
Index(["min", "max"] * 4), | ||
] | ||
), | ||
[1, 1, 3, 3, 2, 2, 4, 4], | ||
[2, 2, 4, 4, 1, 1, 3, 3], | ||
), | ||
( | ||
False, | ||
|
@@ -1857,7 +1911,7 @@ def test_category_order_reducer( | |
if ( | ||
reduction_func in ("idxmax", "idxmin") | ||
and not observed | ||
and index_kind == "range" | ||
and index_kind != "multi" | ||
): | ||
msg = "GH#10694 - idxmax/min fail with unused categories" | ||
request.node.add_marker(pytest.mark.xfail(reason=msg)) | ||
|
@@ -2005,10 +2059,13 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde | |
|
||
|
||
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) | ||
def test_many_categories(as_index, sort, index_kind, ordered): | ||
def test_many_categories(request, as_index, sort, index_kind, ordered): | ||
# GH#48749 - Test when the grouper has many categories | ||
if index_kind != "range" and not as_index: | ||
pytest.skip(reason="Result doesn't have categories, nothing to test") | ||
if index_kind == "multi" and as_index and not sort and ordered: | ||
msg = "GH#48749 - values are unsorted even though the Categorical is ordered" | ||
request.node.add_marker(pytest.mark.xfail(reason=msg)) | ||
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
||
categories = np.arange(9999, -1, -1) | ||
grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered) | ||
df = DataFrame({"a": grouper, "b": range(4)}) | ||
|
@@ -2025,11 +2082,7 @@ def test_many_categories(as_index, sort, index_kind, ordered): | |
result = gb.sum() | ||
|
||
# Test is setup so that data and index are the same values | ||
# TODO: GH#49223 - Order of values should be the same for all index_kinds | ||
if index_kind == "range": | ||
data = [3, 2, 1] if ordered else [2, 1, 3] | ||
else: | ||
data = [3, 2, 1] if sort else [2, 1, 3] | ||
data = [3, 2, 1] if sort or ordered else [2, 1, 3] | ||
|
||
index = CategoricalIndex( | ||
data, categories=grouper.categories, ordered=ordered, name="a" | ||
|
Uh oh!
There was an error while loading. Please reload this page.