diff --git a/CHANGELOG.md b/CHANGELOG.md index da28862c5..f46886aab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - [DOC] Delete Read the Docs project and remove all readthedocs.io references from the repo. Issue #863. @loganthomas - [DOC] Updated various documentation sources to reflect pyjanitor-dev ownership. @loganthomas - [INF] Fix `isort` automatic checks. Issue #845. @loganthomas +- [ENH] `complete` function now uses variable args (*args) - @samukweku - [EHN] Set `expand_column`'s `sep` default is `"|"`, same to `pandas.Series.str.get_dummies`. Issue #876. @Zeroto521 - [ENH] Deprecate `limit` from fill_direction. fill_direction now uses kwargs. @samukweku diff --git a/janitor/functions.py b/janitor/functions.py index 6cbe42059..ef1e4b9ee 100644 --- a/janitor/functions.py +++ b/janitor/functions.py @@ -1109,7 +1109,7 @@ def coalesce( if target_column_name is None: target_column_name = column_names[0] # bfill/ffill combo is faster than combine_first - outcome = df.filter(column_names).bfill(1).ffill(1).iloc[:, 0] + outcome = df.filter(column_names).bfill(axis=1).ffill(axis=1).iloc[:, 0] if outcome.hasnans and (default_value is not None): outcome = outcome.fillna(default_value) return df.assign(**{target_column_name: outcome}) @@ -5316,7 +5316,7 @@ def fill_direction(df: pd.DataFrame, **kwargs) -> pd.DataFrame: fill_types = {fill.name for fill in FILLTYPE} for column_name, fill_type in kwargs.items(): check("column_name", column_name, [str]) - check("fill_details", fill_type, [str]) + check("fill_type", fill_type, [str]) if fill_type.upper() not in fill_types: raise ValueError( """ @@ -5486,7 +5486,7 @@ def groupby_topk( @pf.register_dataframe_method def complete( df: pd.DataFrame, - columns: List[Union[List, Tuple, Dict, str]] = None, + *columns, by: Optional[Union[list, str]] = None, ) -> pd.DataFrame: """ @@ -5513,11 +5513,10 @@ def complete( 1 2 2 b 2 5 2 1 2 b 3 6 - To find all the unique combinations of `group`, `item_id`, and 
`item_name`, - including combinations not present in the data, each variable should be - passed in a list to the `columns` parameter:: + Find all the unique combinations of `group`, `item_id`, and `item_name`, + including combinations not present in the data:: - df.complete(columns = ['group', 'item_id', 'item_name']) + df.complete('group', 'item_id', 'item_name') group item_id item_name value1 value2 0 1 1 a 1.0 4.0 @@ -5530,10 +5529,10 @@ def complete( 7 2 2 b 2.0 5.0 To expose just the missing values based only on the existing data, - `item_id` and `item_name` can be wrapped in a tuple, while `group` - is passed in as a separate variable:: + `item_id` and `item_name` column names can be wrapped in a list/tuple, + while `group` is passed in as a separate variable:: - df.complete(columns = ["group", ("item_id", "item_name")]) + df.complete("group", ("item_id", "item_name")) group item_id item_name value1 value2 0 1 1 a 1.0 4.0 1 1 2 b 3.0 6.0 @@ -5556,7 +5555,7 @@ def complete( Note that Year 2000 and Agarum pairing is missing. 
Let's make it explicit:: - df.complete(columns = ['Year', 'Taxon']) + df.complete('Year', 'Taxon') Year Taxon Abundance 0 1999 Agarum 1.0 @@ -5568,7 +5567,7 @@ def complete( - The null value can be replaced with the Pandas `fillna` argument:: + The null value can be replaced with the Pandas `fillna` method:: - df.complete(columns = ['Year', 'Taxon']).fillna(0) + df.complete('Year', 'Taxon').fillna(0) Year Taxon Abundance 0 1999 Agarum 1.0 @@ -5584,7 +5583,7 @@ def complete( new_year_values = lambda year: range(year.min(), year.max() + 1) - df.complete(columns = [{"Year": new_year_values}, "Taxon"]) + df.complete({"Year": new_year_values}, "Taxon") Year Taxon Abundance 0 1999 Agarum 1.0 @@ -5615,7 +5614,7 @@ def complete( Let's get all the missing years per state:: df.complete( - columns = [{'year': new_year_values}], + {'year': new_year_values}, by='state' ) @@ -5650,11 +5649,9 @@ def complete( df = jn.complete( - df = df, - columns= [ - column_label, - (column1, column2, ...), - {column1: new_values, ...} - ], + df, + column_label, + (column1, column2, ...), + {column1: new_values, ...}, by = label/list_of_labels ) @@ -5664,26 +5661,25 @@ def complete( df = ( pd.DataFrame(...) - .complete(columns=[ + .complete( column_label, (column1, column2, ...), {column1: new_values, ...}, - ], - by = label/list_of_labels - ) + by = label/list_of_labels + ) + ) :param df: A pandas dataframe. - :param columns: This is a list containing the columns to be + :param *columns: This is a sequence containing the columns to be completed. It could be column labels (string type), a list/tuple of column labels, or a dictionary that pairs column labels with new values. :param by: label or list of labels to group by. The explicit missing values are returned per group. :returns: A pandas dataframe with modified column(s). - :raises TypeError: if `columns` is not a list. - :raises ValueError: if entry in `columns` is not a + :raises ValueError: if entry in `*columns` is not a str/dict/list/tuple. 
- :raises ValueError: if entry in `columns` is a dict/list/tuple + :raises ValueError: if entry in `*columns` is a dict/list/tuple and is empty. .. # noqa: DAR402 diff --git a/janitor/utils.py b/janitor/utils.py index 72941cd1a..fb439c9a5 100644 --- a/janitor/utils.py +++ b/janitor/utils.py @@ -695,8 +695,6 @@ def _data_checks_complete( """ ) - check("columns", columns, [list]) - columns = [ list(grouping) if isinstance(grouping, tuple) else grouping for grouping in columns @@ -807,6 +805,8 @@ def _base_complete( unique_index = df_index.is_unique columns_to_stack = None + # stack...unstack implemented here if conditions are met + # usually faster than reindex if all_strings and (not any_nulls) and (len(columns) > 1) and unique_index: if df_empty: df["dummy"] = 1 @@ -826,7 +826,7 @@ def _base_complete( indexer = df_index.union(indexer, sort=None) df = df.reindex(indexer) - else: + else: # reindex not possible on duplicate indices df = df.join(pd.DataFrame([], index=indexer), how="outer") return df diff --git a/tests/functions/test_complete.py b/tests/functions/test_complete.py index aef4254b8..621249aa5 100644 --- a/tests/functions/test_complete.py +++ b/tests/functions/test_complete.py @@ -32,18 +32,16 @@ def test_MultiIndex_column(df1): df = df1 df.columns = [["A", "B", "C"], list(df.columns)] with pytest.raises(ValueError): - df1.complete(["Year", "Taxon"]) + df1.complete("Year", "Taxon") def test_column_duplicated(df1): """Raise ValueError if column is duplicated in `columns`""" with pytest.raises(ValueError): df1.complete( - columns=[ - "Year", - "Taxon", - {"Year": lambda x: range(x.min().x.max() + 1)}, - ] + "Year", + "Taxon", + {"Year": lambda x: range(x.min().x.max() + 1)}, ) @@ -57,14 +55,14 @@ def test_type_columns(df1): def test_fill_value_is_a_dict(df1): """Raise error if fill_value is not a dictionary""" with pytest.raises(TypeError): - df1.complete(columns=["Year", "Taxon"]) + df1.complete("Year", "Taxon") @pytest.mark.xfail(reason="fill_value 
dropped. fillna better.") def test_wrong_column_fill_value(df1): """Raise ValueError if column in `fill_value` does not exist.""" with pytest.raises(ValueError): - df1.complete(columns=["Taxon", "Year"]) + df1.complete("Taxon", "Year") def test_wrong_data_type_dict(df1): @@ -73,7 +71,7 @@ def test_wrong_data_type_dict(df1): is not a 1-dimensional object. """ with pytest.raises(ValueError): - df1.complete(columns=[{"Year": pd.DataFrame([2005, 2006, 2007])}]) + df1.complete({"Year": pd.DataFrame([2005, 2006, 2007])}) def test_not_list_like_type_dict(df1): @@ -82,7 +80,7 @@ def test_not_list_like_type_dict(df1): is not a list-like object. """ with pytest.raises(ValueError): - df1.complete(columns=[{"Year": "2001, 2002, 2003"}]) + df1.complete({"Year": "2001, 2002, 2003"}) def test_MultiIndex_type_dict(df1): @@ -92,9 +90,7 @@ def test_MultiIndex_type_dict(df1): """ with pytest.raises(ValueError): df1.complete( - columns=[ - {"Year": pd.MultiIndex.from_tuples([(1, 2001), (2, 2002)])} - ] + {"Year": pd.MultiIndex.from_tuples([(1, 2001), (2, 2002)])} ) @@ -104,7 +100,7 @@ def test_empty_type_dict(df1): is empty. """ with pytest.raises(ValueError): - df1.complete(columns=[{"Year": pd.Index([])}]) + df1.complete({"Year": pd.Index([])}) frame = pd.DataFrame( @@ -138,14 +134,14 @@ def test_empty_type_dict(df1): def test_wrong_columns(frame, wrong_columns): """Test that ValueError is raised if wrong column is supplied.""" with pytest.raises(ValueError): - frame.complete(columns=wrong_columns) + frame.complete(*wrong_columns) @pytest.mark.parametrize("frame,empty_sub_cols", empty_sub_columns) def test_empty_subcols(frame, empty_sub_cols): """Raise ValueError for an empty group in columns""" with pytest.raises(ValueError): - frame.complete(columns=empty_sub_cols) + frame.complete(*empty_sub_cols) @pytest.mark.xfail(reason="fill_value dropped. 
fillna is better.") @@ -166,7 +162,7 @@ def test_fill_value(df1): } ) - result = df1.complete(columns=["Year", "Taxon"]).fillna({"Abundance": 0}) + result = df1.complete("Year", "Taxon").fillna({"Abundance": 0}) assert_frame_equal(result, output1) @@ -214,7 +210,7 @@ def test_fill_value_all_years(df1, df1_output): """ result = df1.complete( - columns=[{"Year": lambda x: range(x.min(), x.max() + 1)}, "Taxon"] + {"Year": lambda x: range(x.min(), x.max() + 1)}, "Taxon" ).fillna(0) assert_frame_equal(result, df1_output) @@ -226,10 +222,8 @@ def test_dict_series(df1, df1_output): """ result = df1.complete( - columns=[ - {"Year": lambda x: pd.Series(range(x.min(), x.max() + 1))}, - "Taxon", - ] + {"Year": lambda x: pd.Series(range(x.min(), x.max() + 1))}, + "Taxon", ).fillna(0) assert_frame_equal(result, df1_output) @@ -241,14 +235,12 @@ def test_dict_series_duplicates(df1, df1_output): """ result = df1.complete( - columns=[ - { - "Year": pd.Series( - [1999, 2000, 2000, 2001, 2002, 2002, 2002, 2003, 2004] - ) - }, - "Taxon", - ] + { + "Year": pd.Series( + [1999, 2000, 2000, 2001, 2002, 2002, 2002, 2003, 2004] + ) + }, + "Taxon", ).fillna(0) assert_frame_equal(result, df1_output) @@ -261,7 +253,7 @@ def test_dict_values_outside_range(df1): in the dictionary's values. """ result = df1.complete( - columns=[("Taxon", "Abundance"), {"Year": np.arange(2005, 2007)}] + ("Taxon", "Abundance"), {"Year": np.arange(2005, 2007)} ) expected = pd.DataFrame( [ @@ -550,7 +542,7 @@ def test_dict_values_outside_range(df1): @pytest.mark.parametrize("df,columns,output", complete_parameters) def test_complete(df, columns, output): "Test the complete function, with and without groupings." 
- assert_frame_equal(df.complete(columns), output) + assert_frame_equal(df.complete(*columns), output) # https://stackoverflow.com/questions/32874239/ @@ -591,7 +583,7 @@ def test_grouping_first_columns(): "choice": [5, 6, 7, 5, 6, 7, 5, 6, 7], } ) - result = df2.complete(columns=[("id", "c", "d"), "choice"]) + result = df2.complete(("id", "c", "d"), "choice") assert_frame_equal(result, output2) @@ -619,7 +611,8 @@ def test_complete_multiple_groupings(): ) result = df3.complete( - columns=[("meta", "domain1"), ("project_id", "question_count")], + ("meta", "domain1"), + ("project_id", "question_count"), ).fillna({"tag_count": 0}) assert_frame_equal(result, output3) @@ -669,10 +662,8 @@ def test_dict_tuple(df1, output_dict_tuple): """ result = df1.complete( - columns=[ - {"Year": lambda x: range(x.min(), x.max() + 1)}, - ("Taxon", "Abundance"), - ] + {"Year": lambda x: range(x.min(), x.max() + 1)}, + ("Taxon", "Abundance"), ) assert_frame_equal(result, output_dict_tuple) @@ -689,7 +680,7 @@ def test_complete_groupby(): ) result = df.complete( - columns=[{"year": lambda x: range(x.min(), x.max() + 1)}], + {"year": lambda x: range(x.min(), x.max() + 1)}, by="state", )