Skip to content

[ENH] Fill direction #879

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- [DOC] Delete Read the Docs project and remove all readthedocs.io references from the repo. Issue #863. @loganthomas
- [DOC] Updated various documentation sources to reflect pyjanitor-dev ownership. @loganthomas
- [INF] Fix `isort` automatic checks. Issue #845. @loganthomas
- [ENH] Deprecate `limit` from `fill_direction`. `fill_direction` now takes its column/direction pairs as keyword arguments. @samukweku

## [v0.21.0] - 2021-07-16

Expand Down
Binary file modified examples/notebooks/dirty_data.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion examples/notebooks/inflating_converting_currency.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -271,4 +271,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
187 changes: 90 additions & 97 deletions janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
from scipy.stats import mode

from .errors import JanitorError
from enum import Enum
from operator import methodcaller
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh my, that is such Python trickery, @samukweku 😄 🤣

from .utils import (
_clean_accounting_column,
_computations_as_categorical,
Expand Down Expand Up @@ -324,7 +326,6 @@ def clean_names(
df = df.rename(columns=_strip_accents)

df = df.rename(columns=lambda x: re.sub("_+", "_", x)) # noqa: PD005

df = _strip_underscores(df, strip_underscores)

df = df.rename(columns=lambda x: x[:truncate_limit])
Expand All @@ -337,7 +338,6 @@ def clean_names(

def _change_case(col: str, case_type: str) -> str:
"""Change case of a column name."""

case_types = ["preserve", "upper", "lower", "snake"]
if case_type.lower() not in case_types:
raise JanitorError(f"case_type must be one of: {case_types}")
Expand All @@ -355,7 +355,6 @@ def _change_case(col: str, case_type: str) -> str:

def _remove_special(col_name: Hashable) -> str:
"""Remove special characters from column name."""

return "".join(
item for item in str(col_name) if item.isalnum() or "_" in item
)
Expand All @@ -380,7 +379,6 @@ def _camel2snake(col_name: str) -> str:

def _normalize_1(col_name: Hashable) -> str:
"""Perform normalization of column name."""

result = str(col_name)
for search, replace in FIXES:
result = re.sub(search, replace, result) # noqa: PD005
Expand Down Expand Up @@ -2523,7 +2521,7 @@ def row_to_names(
df = df.drop(df.index[range(row_number)])

if reset_index:
df.reset_index(drop=["index"], inplace=True)
df = df.reset_index(drop=["index"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉 I like this change!


return df

Expand Down Expand Up @@ -5210,72 +5208,63 @@ def process_text(


@pf.register_dataframe_method
def fill_direction(
df: pd.DataFrame,
directions: Dict[Hashable, str] = None,
limit: Optional[int] = None,
) -> pd.DataFrame:
def fill_direction(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
"""
Provide a method-chainable function for filling missing values
in selected columns.

Missing values are filled using the next or previous entry.
The columns are paired with the directions in a dictionary.
It is a wrapper for ``pd.Series.ffill`` and ``pd.Series.bfill``.
It is a wrapper for ``pd.Series.ffill`` and ``pd.Series.bfill``,
and pairs the column name with one of `up`, `down`, `updown`,
and `downup`.

.. code-block:: python

import pandas as pd
import numpy as np
import janitor as jn

df = pd.DataFrame({"text": ["ragnar", np.nan, "sammywemmy",
np.nan, "ginger"],
"code" : [np.nan, 2, 3, np.nan, 5]})

df

text code
0 ragnar NaN
1 NaN 2.0
2 sammywemmy 3.0
3 NaN NaN
4 ginger 5.0
text code
0 ragnar NaN
1 NaN 2.0
2 sammywemmy 3.0
3 NaN NaN
4 ginger 5.0



Fill on a single column::

df.fill_direction({"text" : "up"})
df.fill_direction(code = 'up')

text code
0 ragnar NaN
1 sammywemmy 2.0
2 sammywemmy 3.0
3 ginger NaN
4 ginger 5.0
text code
0 ragnar 2.0
1 NaN 2.0
2 sammywemmy 3.0
3 NaN 5.0
4 ginger 5.0

Fill on multiple columns::

df.fill_direction({"text" : "down", "code" : "down"})
df.fill_direction(text = 'down', code = 'down')

text code
0 ragnar NaN
1 ragnar 2.0
2 sammywemmy 3.0
3 sammywemmy 3.0
4 ginger 5.0
text code
0 ragnar NaN
1 ragnar 2.0
2 sammywemmy 3.0
3 sammywemmy 3.0
4 ginger 5.0

Fill multiple columns in different directions::

df.fill_direction({"text" : "up", "code" : "down"})
df.fill_direction(text = 'up', code = 'down')

text code
0 ragnar NaN
1 sammywemmy 2.0
2 sammywemmy 3.0
3 ginger 3.0
4 ginger 5.0
text code
0 ragnar NaN
1 sammywemmy 2.0
2 sammywemmy 3.0
3 ginger 3.0
4 ginger 5.0

Functional usage syntax:

Expand All @@ -5286,12 +5275,10 @@ def fill_direction(

df = pd.DataFrame(...)
df = jn.fill_direction(
df = df,
directions = {column_1 : direction_1,
column_2 : direction_2,
...},
limit = None # limit must be None or greater than 0
)
df = df,
column_1 = direction_1,
column_2 = direction_2,
)

Method-chaining usage syntax:

Expand All @@ -5300,68 +5287,75 @@ def fill_direction(
import pandas as pd
import janitor as jn

df = (
pd.DataFrame(...)
.fill_direction(
directions = {column_1 : direction_1,
column_2 : direction_2,
...},
limit = None # limit must be None or greater than 0
)
)
df = (
pd.DataFrame(...)
.fill_direction(
column_1 = direction_1,
column_2 = direction_2,
)
)


:param df: A pandas dataframe.
:param directions: Key - value pairs of columns and directions. Directions
can be either `down` (default), `up`, `updown` (fill up then down) and
:param kwargs: Key - value pairs of columns and directions. Each
direction must be one of `down`, `up`, `updown` (fill up then down),
or `downup` (fill down then up).
:param limit: number of consecutive null values to forward/backward fill.
Value must `None` or greater than 0.
:returns: A pandas dataframe with modified column(s).
:raises ValueError: if column supplied is not in the dataframe.
:raises ValueError: if direction supplied is not one of `down`, `up`,
`updown`, or `downup`.

.. # noqa: DAR402
"""
df = df.copy()
if not directions:
return df

check("directions", directions, [dict])

if limit is not None:
check("limit", limit, [int])
# pandas raises error if limit is not greater than zero
# so no need for a check on pyjanitor's end

check_column(df, directions)
if not kwargs:
return df

for _, direction in directions.items():
if direction not in {"up", "down", "updown", "downup"}:
fill_types = {fill.name for fill in FILLTYPE}
for column_name, fill_type in kwargs.items():
check("column_name", column_name, [str])
check("fill_details", fill_type, [str])
if fill_type.upper() not in fill_types:
raise ValueError(
"""
The direction should be a string and should be one of
`up`, `down`, `updown`, or `downup`.
fill_type should be one of
up, down, updown, or downup.
"""
)

# TODO: option to specify limit per column; current implementation
# is one `limit` for all the columns. Might need refactoring, or an
# API change.
for column, direction in directions.items():
if direction == "up":
df.loc[:, column] = df.loc[:, column].bfill(limit=limit)
elif direction == "down":
df.loc[:, column] = df.loc[:, column].ffill(limit=limit)
elif direction == "updown":
df.loc[:, column] = (
df.loc[:, column].bfill(limit=limit).ffill(limit=limit)
)
else: # downup
df.loc[:, column] = (
df.loc[:, column].ffill(limit=limit).bfill(limit=limit)
)
return df
check_column(df, kwargs)

new_values = {}
for column_name, fill_type in kwargs.items():
direction = FILLTYPE[f"{fill_type.upper()}"].value
if len(direction) == 1:
direction = methodcaller(direction[0])
output = direction(df[column_name])
else:
direction = [methodcaller(entry) for entry in direction]
output = _chain_func(df[column_name], *direction)
new_values[column_name] = output

return df.assign(**new_values)


class FILLTYPE(Enum):
    """Fill directions accepted by ``fill_direction``.

    Each member's value is a tuple of pandas Series method names,
    listed in the order in which they are to be applied.
    """

    # Single-step fills.
    UP = ("bfill",)
    DOWN = ("ffill",)
    # Two-step fills: the first method runs before the second.
    UPDOWN = ("bfill", "ffill")
    DOWNUP = ("ffill", "bfill")


def _chain_func(column: pd.Series, *funcs):
"""
Apply series of functions consecutively
to a Series.
https://blog.finxter.com/how-to-chain-multiple-function-calls-in-python/
"""
new_value = column.copy()
for func in funcs:
new_value = func(new_value)
return new_value


@pf.register_dataframe_method
Expand Down Expand Up @@ -5617,7 +5611,6 @@ def complete(
Let's get all the missing years per state::

df.complete(

columns = [{'year': new_year_values}],
by='state'
)
Expand Down
14 changes: 14 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[pytest]
# always check coverage of janitor.
addopts = --cov=janitor --cov-report term-missing --cov-report xml --durations=0
markers =
functions: tests for general functions
biology: tests for biology
chemistry: tests for chemistry
finance: tests for finance
utils: utility tests
engineering: tests for engineering
ml: tests for machine learning
spark_functions: tests for pyspark functions
xarray: tests for xarray functions
timeseries: tests for timeseries
4 changes: 2 additions & 2 deletions tests/functions/test_clean_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def test_charac():

df = df.clean_names(strip_underscores=True, case_type="lower")

assert "current_accountbalance_in_%_of_gdp" in df.columns.values
assert "current_accountbalance_in_%_of_gdp" in df.columns


@pytest.mark.functions
Expand All @@ -205,4 +205,4 @@ def test_space():

df = df.clean_names(strip_underscores=True, case_type="lower")

assert ("in %" in df.columns.values) is False
assert ("in %" in df.columns) is False
2 changes: 1 addition & 1 deletion tests/functions/test_convert_excel_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ def test_convert_excel_date_with_string_data():
).clean_names()

with pytest.raises(ValueError):
df.convert_excel_date("hire_date_str")
df.convert_excel_date("certification")
Loading