diff --git a/AUTHORS.rst b/AUTHORS.rst index 50d54d206..dea16ed2b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -100,3 +100,4 @@ Contributors - `@MollyCroke `_ | `contributions `_ - `@ericclessantostv `_ | `contributions `_ - `@nvamsikrishna05 `_ | `contributions `_ +- `@fireddd `_ | `contributions `_ diff --git a/CHANGELOG.md b/CHANGELOG.md index 29cfd7345..d441733a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - [INF] Update pre-commit hooks and remove mutable references. Issue #844. @loganthomas - [INF] Add GitHub Release pointer to auto-release script. Issue #818. @loganthomas - [INF] Updated black version in github actions code-checks to match pre-commit hooks. @nvamsikrishna05 +- [ENH] Add reset_index flag to row_to_names function. @fireddd - [ENH] Updated `label_encode` to use pandas factorize instead of scikit-learn LabelEncoder. @nvamsikrishna05 - [INF] Removed the scikit-learn package from the dependencies from environment-dev.yml and base.in files. @nvamsikrishna05 diff --git a/janitor/functions.py b/janitor/functions.py index 44b03c7fd..dc23f566d 100644 --- a/janitor/functions.py +++ b/janitor/functions.py @@ -24,11 +24,15 @@ import numpy as np import pandas as pd -from pandas.api.types import is_numeric_dtype import pandas_flavor as pf from multipledispatch import dispatch from natsort import index_natsorted -from pandas.api.types import is_bool_dtype, is_list_like, union_categoricals +from pandas.api.types import ( + is_bool_dtype, + is_list_like, + is_numeric_dtype, + union_categoricals, +) from pandas.errors import OutOfBoundsDatetime from scipy.stats import mode @@ -48,10 +52,10 @@ _replace_original_empty_string_with_none, _select_columns, _strip_underscores, + asCategorical, check, check_column, deprecated_alias, - asCategorical, ) @@ -2289,6 +2293,7 @@ def row_to_names( row_number: int = None, remove_row: bool = False, remove_rows_above: bool = False, + reset_index: bool = False, ) -> pd.DataFrame: """Elevates a row to be the 
column names of a DataFrame. @@ -2307,6 +2312,7 @@ def row_to_names( row_number=0, remove_row=False, remove_rows_above=False, + reset_index=False, ) ) @@ -2316,6 +2322,8 @@ def row_to_names( Defaults to False. :param remove_rows_above: Whether the rows above the selected row should be removed from the DataFrame. Defaults to False. + :param reset_index: Whether the index should be reset on the returned + DataFrame. Defaults to False. :returns: A pandas DataFrame with set column names. """ # :Setup: @@ -2353,7 +2361,31 @@ def row_to_names( # 6 1 1 1 rabbit Cambridge # 7 2 2 2 leopard Shanghai - # :Example: Move first row to column names and remove row: + # :Example: Move first row to column names and + # remove row while resetting the index: + + # .. code-block:: python + + # example_dataframe = pd.DataFrame(data_dict) + # example_dataframe.row_to_names(0, remove_row=True,\ + # reset_index=True) + + # :Output: + + # .. code-block:: python + + # 1 1 1 rabbit Cambridge + # 0 2 2 2 leopard Shanghai + # 1 3 3 3 lion Basel + # 2 1 1 1 rabbit Cambridge + # 3 2 2 2 leopard Shanghai + # 4 3 3 3 lion Basel + # 5 1 1 1 rabbit Cambridge + # 6 2 2 2 leopard Shanghai + # 7 3 3 3 lion Basel + + # :Example: Move first row to column names and remove + # row without resetting the index: # .. code-block:: python @@ -2374,14 +2406,37 @@ def row_to_names( # 7 2 2 2 leopard Shanghai # 8 3 3 3 lion Basel - # :Example: Move first row to column names, remove row, \ - # and remove rows above selected row: + # :Example: Move third row to column names, remove row, + # and remove rows above selected row while resetting + # the index: + + # .. code-block:: python + + # example_dataframe = pd.DataFrame(data_dict) + # example_dataframe.row_to_names(2, remove_row=True, \ + # remove_rows_above=True, reset_index=True) + + # :Output: + + # .. 
code-block:: python + + # 3 3 3 lion Basel + # 0 1 1 1 rabbit Cambridge + # 1 2 2 2 leopard Shanghai + # 2 3 3 3 lion Basel + # 3 1 1 1 rabbit Cambridge + # 4 2 2 2 leopard Shanghai + # 5 3 3 3 lion Basel + + # :Example: Move third row to column names, remove row, + # and remove rows above selected row without resetting + # the index: # .. code-block:: python # example_dataframe = pd.DataFrame(data_dict) # example_dataframe.row_to_names(2, remove_row=True, \ - # remove_rows_above=True) + # remove_rows_above=True) # :Output: @@ -2397,6 +2452,14 @@ def row_to_names( check("row_number", row_number, [int]) + if not reset_index: + warnings.warn( + "The function row_to_names will, in the official 1.0 release, " + "change its behaviour to reset the dataframe's index by default. " + "You can prepare for this change right now by explicitly setting " + "`reset_index=True` when calling on `row_to_names`." + ) + df.columns = df.iloc[row_number, :] df.columns.name = None @@ -2406,6 +2469,9 @@ def row_to_names( if remove_rows_above: df = df.drop(df.index[range(row_number)]) + if reset_index: + df.reset_index(drop=True, inplace=True) + return df diff --git a/tests/functions/test_row_to_names.py b/tests/functions/test_row_to_names.py index 33a9df16b..f3154dab1 100644 --- a/tests/functions/test_row_to_names.py +++ b/tests/functions/test_row_to_names.py @@ -1,3 +1,4 @@ +import pandas as pd import pytest @@ -21,6 +22,15 @@ def test_row_to_names_delete_this_row(dataframe): assert df.iloc[2, 4] == "Cambridge" +@pytest.mark.functions +def test_row_to_names_delete_the_row_without_resetting_index(dataframe): + """Test that executes row_to_names while deleting the given row + index while not resetting the index""" + df = dataframe.row_to_names(2, remove_row=True) + expected_index = pd.Index([0, 1, 3, 4, 5, 6, 7, 8]) + pd.testing.assert_index_equal(df.index, expected_index) + + @pytest.mark.functions def test_row_to_names_delete_above(dataframe): df = dataframe.row_to_names(2, remove_rows_above=True) @@ -29,3 
+39,30 @@ def test_row_to_names_delete_above(dataframe): assert df.iloc[0, 2] == 3 assert df.iloc[0, 3] == "lion" assert df.iloc[0, 4] == "Basel" + + +@pytest.mark.functions +def test_row_to_names_delete_above_without_resetting_index(dataframe): + """Test that executes row_to_names while deleting all the rows + above the given row index while not resetting the index""" + df = dataframe.row_to_names(2, remove_rows_above=True) + expected_index = pd.Index([2, 3, 4, 5, 6, 7, 8]) + pd.testing.assert_index_equal(df.index, expected_index) + + +@pytest.mark.functions +def test_row_to_names_delete_above_with_resetting_index(dataframe): + """Test that executes row_to_names while deleting all the rows + above the given row index while resetting the index""" + df = dataframe.row_to_names(2, remove_rows_above=True, reset_index=True) + expected_index = pd.RangeIndex(start=0, stop=7, step=1) + pd.testing.assert_index_equal(df.index, expected_index) + + +@pytest.mark.functions +def test_row_to_names_delete_the_row_with_resetting_index(dataframe): + """Test that executes row_to_names while deleting the given row + index while resetting the index""" + df = dataframe.row_to_names(2, remove_row=True, reset_index=True) + expected_index = pd.RangeIndex(start=0, stop=8, step=1) + pd.testing.assert_index_equal(df.index, expected_index)