Skip to content

[ENH] variable args for complete #857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
- [DOC] Delete Read the Docs project and remove all readthedocs.io references from the repo. Issue #863. @loganthomas
- [DOC] Updated various documentation sources to reflect pyjanitor-dev ownership. @loganthomas
- [INF] Fix `isort` automatic checks. Issue #845. @loganthomas
- [ENH] `complete` function now uses variable args (*args) - @samukweku
- [ENH] Set `expand_column`'s `sep` default to `"|"`, same as `pandas.Series.str.get_dummies`. Issue #876. @Zeroto521
- [ENH] Deprecate `limit` from fill_direction. fill_direction now uses kwargs. @samukweku

Expand Down
49 changes: 22 additions & 27 deletions janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1109,7 +1109,7 @@ def coalesce(
if target_column_name is None:
target_column_name = column_names[0]
# bfill/ffill combo is faster than combine_first
outcome = df.filter(column_names).bfill(1).ffill(1).iloc[:, 0]
outcome = df.filter(column_names).bfill(axis=1).ffill(axis=1).iloc[:, 0]
if outcome.hasnans and (default_value is not None):
outcome = outcome.fillna(default_value)
return df.assign(**{target_column_name: outcome})
Expand Down Expand Up @@ -5316,7 +5316,7 @@ def fill_direction(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
fill_types = {fill.name for fill in FILLTYPE}
for column_name, fill_type in kwargs.items():
check("column_name", column_name, [str])
check("fill_details", fill_type, [str])
check("fill_type", fill_type, [str])
if fill_type.upper() not in fill_types:
raise ValueError(
"""
Expand Down Expand Up @@ -5486,7 +5486,7 @@ def groupby_topk(
@pf.register_dataframe_method
def complete(
df: pd.DataFrame,
columns: List[Union[List, Tuple, Dict, str]] = None,
*columns,
by: Optional[Union[list, str]] = None,
) -> pd.DataFrame:
"""
Expand All @@ -5513,11 +5513,10 @@ def complete(
1 2 2 b 2 5
2 1 2 b 3 6

To find all the unique combinations of `group`, `item_id`, and `item_name`,
including combinations not present in the data, each variable should be
passed in a list to the `columns` parameter::
Find all the unique combinations of `group`, `item_id`, and `item_name`,
including combinations not present in the data::

df.complete(columns = ['group', 'item_id', 'item_name'])
df.complete('group', 'item_id', 'item_name')

group item_id item_name value1 value2
0 1 1 a 1.0 4.0
Expand All @@ -5530,10 +5529,10 @@ def complete(
7 2 2 b 2.0 5.0

To expose just the missing values based only on the existing data,
`item_id` and `item_name` can be wrapped in a tuple, while `group`
is passed in as a separate variable::
`item_id` and `item_name` column names can be wrapped in a list/tuple,
while `group` is passed in as a separate variable::

df.complete(columns = ["group", ("item_id", "item_name")])
df.complete("group", ("item_id", "item_name"))
group item_id item_name value1 value2
0 1 1 a 1.0 4.0
1 1 2 b 3.0 6.0
Expand All @@ -5556,7 +5555,7 @@ def complete(
Note that Year 2000 and Agarum pairing is missing. Let's make it
explicit::

df.complete(columns = ['Year', 'Taxon'])
df.complete('Year', 'Taxon')

Year Taxon Abundance
0 1999 Agarum 1.0
Expand All @@ -5568,7 +5567,7 @@ def complete(

The null value can be replaced with the Pandas `fillna` argument::

df.complete(columns = ['Year', 'Taxon']).fillna(0)
df.complete('Year', 'Taxon').fillna(0)

Year Taxon Abundance
0 1999 Agarum 1.0
Expand All @@ -5584,7 +5583,7 @@ def complete(

new_year_values = lambda year: range(year.min(), year.max() + 1)

df.complete(columns = [{"Year": new_year_values}, "Taxon"])
df.complete({"Year": new_year_values}, "Taxon")

Year Taxon Abundance
0 1999 Agarum 1.0
Expand Down Expand Up @@ -5615,7 +5614,7 @@ def complete(
Let's get all the missing years per state::

df.complete(
columns = [{'year': new_year_values}],
{'year': new_year_values},
by='state'
)

Expand Down Expand Up @@ -5650,11 +5649,9 @@ def complete(

df = jn.complete(
df = df,
columns= [
column_label,
(column1, column2, ...),
{column1: new_values, ...}
],
column_label,
(column1, column2, ...),
{column1: new_values, ...},
by = label/list_of_labels
)

Expand All @@ -5664,26 +5661,24 @@ def complete(

df = (
pd.DataFrame(...)
.complete(columns=[
.complete(
column_label,
(column1, column2, ...),
{column1: new_values, ...},
],
by = label/list_of_labels
)
by = label/list_of_labels
)

:param df: A pandas dataframe.
:param columns: This is a list containing the columns to be
:param *columns: This is a sequence containing the columns to be
completed. It could be column labels (string type),
a list/tuple of column labels, or a dictionary that pairs
column labels with new values.
:param by: label or list of labels to group by.
The explicit missing values are returned per group.
:returns: A pandas dataframe with modified column(s).
:raises TypeError: if `columns` is not a list.
:raises ValueError: if entry in `columns` is not a
:raises ValueError: if entry in `*columns` is not a
str/dict/list/tuple.
:raises ValueError: if entry in `columns` is a dict/list/tuple
:raises ValueError: if entry in `*columns` is a dict/list/tuple
and is empty.

.. # noqa: DAR402
Expand Down
6 changes: 3 additions & 3 deletions janitor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,8 +695,6 @@ def _data_checks_complete(
"""
)

check("columns", columns, [list])

columns = [
list(grouping) if isinstance(grouping, tuple) else grouping
for grouping in columns
Expand Down Expand Up @@ -807,6 +805,8 @@ def _base_complete(
unique_index = df_index.is_unique
columns_to_stack = None

# stack...unstack implemented here if conditions are met
# usually faster than reindex
if all_strings and (not any_nulls) and (len(columns) > 1) and unique_index:
if df_empty:
df["dummy"] = 1
Expand All @@ -826,7 +826,7 @@ def _base_complete(
indexer = df_index.union(indexer, sort=None)
df = df.reindex(indexer)

else:
else: # reindex not possible on duplicate indices
df = df.join(pd.DataFrame([], index=indexer), how="outer")

return df
Expand Down
69 changes: 30 additions & 39 deletions tests/functions/test_complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,16 @@ def test_MultiIndex_column(df1):
df = df1
df.columns = [["A", "B", "C"], list(df.columns)]
with pytest.raises(ValueError):
df1.complete(["Year", "Taxon"])
df1.complete("Year", "Taxon")


def test_column_duplicated(df1):
"""Raise ValueError if column is duplicated in `columns`"""
with pytest.raises(ValueError):
df1.complete(
columns=[
"Year",
"Taxon",
            {"Year": lambda x: range(x.min(), x.max() + 1)},
]
"Year",
"Taxon",
{"Year": lambda x: range(x.min().x.max() + 1)},
)


Expand All @@ -57,14 +55,14 @@ def test_type_columns(df1):
def test_fill_value_is_a_dict(df1):
"""Raise error if fill_value is not a dictionary"""
with pytest.raises(TypeError):
df1.complete(columns=["Year", "Taxon"])
df1.complete("Year", "Taxon")


@pytest.mark.xfail(reason="fill_value dropped. fillna better.")
def test_wrong_column_fill_value(df1):
"""Raise ValueError if column in `fill_value` does not exist."""
with pytest.raises(ValueError):
df1.complete(columns=["Taxon", "Year"])
df1.complete("Taxon", "Year")


def test_wrong_data_type_dict(df1):
Expand All @@ -73,7 +71,7 @@ def test_wrong_data_type_dict(df1):
is not a 1-dimensional object.
"""
with pytest.raises(ValueError):
df1.complete(columns=[{"Year": pd.DataFrame([2005, 2006, 2007])}])
df1.complete({"Year": pd.DataFrame([2005, 2006, 2007])})


def test_not_list_like_type_dict(df1):
Expand All @@ -82,7 +80,7 @@ def test_not_list_like_type_dict(df1):
is not a list-like object.
"""
with pytest.raises(ValueError):
df1.complete(columns=[{"Year": "2001, 2002, 2003"}])
df1.complete({"Year": "2001, 2002, 2003"})


def test_MultiIndex_type_dict(df1):
Expand All @@ -92,9 +90,7 @@ def test_MultiIndex_type_dict(df1):
"""
with pytest.raises(ValueError):
df1.complete(
columns=[
{"Year": pd.MultiIndex.from_tuples([(1, 2001), (2, 2002)])}
]
{"Year": pd.MultiIndex.from_tuples([(1, 2001), (2, 2002)])}
)


Expand All @@ -104,7 +100,7 @@ def test_empty_type_dict(df1):
is empty.
"""
with pytest.raises(ValueError):
df1.complete(columns=[{"Year": pd.Index([])}])
df1.complete({"Year": pd.Index([])})


frame = pd.DataFrame(
Expand Down Expand Up @@ -138,14 +134,14 @@ def test_empty_type_dict(df1):
def test_wrong_columns(frame, wrong_columns):
"""Test that ValueError is raised if wrong column is supplied."""
with pytest.raises(ValueError):
frame.complete(columns=wrong_columns)
frame.complete(*wrong_columns)


@pytest.mark.parametrize("frame,empty_sub_cols", empty_sub_columns)
def test_empty_subcols(frame, empty_sub_cols):
"""Raise ValueError for an empty group in columns"""
with pytest.raises(ValueError):
frame.complete(columns=empty_sub_cols)
frame.complete(*empty_sub_cols)


@pytest.mark.xfail(reason="fill_value dropped. fillna is better.")
Expand All @@ -166,7 +162,7 @@ def test_fill_value(df1):
}
)

result = df1.complete(columns=["Year", "Taxon"]).fillna({"Abundance": 0})
result = df1.complete("Year", "Taxon").fillna({"Abundance": 0})
assert_frame_equal(result, output1)


Expand Down Expand Up @@ -214,7 +210,7 @@ def test_fill_value_all_years(df1, df1_output):
"""

result = df1.complete(
columns=[{"Year": lambda x: range(x.min(), x.max() + 1)}, "Taxon"]
{"Year": lambda x: range(x.min(), x.max() + 1)}, "Taxon"
).fillna(0)
assert_frame_equal(result, df1_output)

Expand All @@ -226,10 +222,8 @@ def test_dict_series(df1, df1_output):
"""

result = df1.complete(
columns=[
{"Year": lambda x: pd.Series(range(x.min(), x.max() + 1))},
"Taxon",
]
{"Year": lambda x: pd.Series(range(x.min(), x.max() + 1))},
"Taxon",
).fillna(0)
assert_frame_equal(result, df1_output)

Expand All @@ -241,14 +235,12 @@ def test_dict_series_duplicates(df1, df1_output):
"""

result = df1.complete(
columns=[
{
"Year": pd.Series(
[1999, 2000, 2000, 2001, 2002, 2002, 2002, 2003, 2004]
)
},
"Taxon",
]
{
"Year": pd.Series(
[1999, 2000, 2000, 2001, 2002, 2002, 2002, 2003, 2004]
)
},
"Taxon",
).fillna(0)
assert_frame_equal(result, df1_output)

Expand All @@ -261,7 +253,7 @@ def test_dict_values_outside_range(df1):
in the dictionary's values.
"""
result = df1.complete(
columns=[("Taxon", "Abundance"), {"Year": np.arange(2005, 2007)}]
("Taxon", "Abundance"), {"Year": np.arange(2005, 2007)}
)
expected = pd.DataFrame(
[
Expand Down Expand Up @@ -550,7 +542,7 @@ def test_dict_values_outside_range(df1):
@pytest.mark.parametrize("df,columns,output", complete_parameters)
def test_complete(df, columns, output):
"Test the complete function, with and without groupings."
assert_frame_equal(df.complete(columns), output)
assert_frame_equal(df.complete(*columns), output)


# https://stackoverflow.com/questions/32874239/
Expand Down Expand Up @@ -591,7 +583,7 @@ def test_grouping_first_columns():
"choice": [5, 6, 7, 5, 6, 7, 5, 6, 7],
}
)
result = df2.complete(columns=[("id", "c", "d"), "choice"])
result = df2.complete(("id", "c", "d"), "choice")
assert_frame_equal(result, output2)


Expand Down Expand Up @@ -619,7 +611,8 @@ def test_complete_multiple_groupings():
)

result = df3.complete(
columns=[("meta", "domain1"), ("project_id", "question_count")],
("meta", "domain1"),
("project_id", "question_count"),
).fillna({"tag_count": 0})
assert_frame_equal(result, output3)

Expand Down Expand Up @@ -669,10 +662,8 @@ def test_dict_tuple(df1, output_dict_tuple):
"""

result = df1.complete(
columns=[
{"Year": lambda x: range(x.min(), x.max() + 1)},
("Taxon", "Abundance"),
]
{"Year": lambda x: range(x.min(), x.max() + 1)},
("Taxon", "Abundance"),
)

assert_frame_equal(result, output_dict_tuple)
Expand All @@ -689,7 +680,7 @@ def test_complete_groupby():
)

result = df.complete(
columns=[{"year": lambda x: range(x.min(), x.max() + 1)}],
{"year": lambda x: range(x.min(), x.max() + 1)},
by="state",
)

Expand Down