Skip to content

Clean names remove outer underscores #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,38 @@
import re


def clean_names(df):
def _strip_underscores(df, strip_underscores=None):
"""
Strip underscores from the beginning, end or both of the
of the DataFrames column names.

.. code-block:: python

df = _strip_underscores(df, strip_underscores='left')

:param df: The pandas DataFrame object.
:param strip_underscores: (optional) Removes the outer underscores from all
column names. Default None keeps outer underscores. Values can be
either 'left', 'right' or 'both' or the respective shorthand 'l', 'r'
and True.
:returns: A pandas DataFrame.
"""
underscore_options = [None, 'left', 'right', 'both', 'l', 'r', True]
if strip_underscores not in underscore_options:
raise JanitorError(
"""strip_underscores must be one of: %s""" % underscore_options
)

if strip_underscores in ['left', 'l']:
df = df.rename(columns=lambda x: x.lstrip('_'))
elif strip_underscores in ['right', 'r']:
df = df.rename(columns=lambda x: x.rstrip('_'))
elif strip_underscores == 'both' or strip_underscores is True:
df = df.rename(columns=lambda x: x.strip('_'))
return df


def clean_names(df, strip_underscores=None):
"""
Clean column names.

Expand All @@ -29,6 +60,10 @@ def clean_names(df):
df = jn.DataFrame(df).clean_names()

:param df: The pandas DataFrame object.
:param strip_underscores: (optional) Removes the outer underscores from all
column names. Default None keeps outer underscores. Values can be
either 'left', 'right' or 'both' or the respective shorthand 'l', 'r'
and True.
:returns: A pandas DataFrame.
"""
df = df.rename(
Expand All @@ -47,6 +82,7 @@ def clean_names(df):
)

df = df.rename(columns=lambda x: re.sub('_+', '_', x))
df = _strip_underscores(df, strip_underscores)
return df


Expand Down Expand Up @@ -190,7 +226,7 @@ def get_features_targets(df, target_columns, feature_columns=None):
if isinstance(target_columns, str):
xcols = [c for c in df.columns if target_columns != c]
elif (isinstance(target_columns, list)
or isinstance(target_columns, tuple)):
or isinstance(target_columns, tuple)):
xcols = [c for c in df.columns if c not in target_columns]
X = df[xcols]
return X, Y
Expand Down
88 changes: 88 additions & 0 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,91 @@ def test_multiindex_clean_names_pipe(multiindex_dataframe):

expected_columns = pd.MultiIndex(levels=levels, labels=labels)
assert set(df.columns) == set(expected_columns)


def test_clean_names_strip_underscores_both(multiindex_dataframe):
    """``strip_underscores='both'`` removes leading and trailing underscores."""
    # Add a leading underscore to every column label so that left-stripping
    # has something to remove.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    # Clean the prefixed frame; passing the untouched fixture here would
    # discard the rename above and leave left-stripping untested.
    df = clean_names(df, strip_underscores='both')

    levels = [
        ['a', 'bell_chart', 'decorated_elephant'],
        ['b', 'normal_distribution', 'r_i_p_rhino']
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)


def test_clean_names_strip_underscores_true(multiindex_dataframe):
    """``strip_underscores=True`` strips underscores from both ends."""
    # Add a leading underscore to every column label so that left-stripping
    # has something to remove.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    # Clean the prefixed frame; passing the untouched fixture here would
    # discard the rename above and leave left-stripping untested.
    df = clean_names(df, strip_underscores=True)

    levels = [
        ['a', 'bell_chart', 'decorated_elephant'],
        ['b', 'normal_distribution', 'r_i_p_rhino']
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)


def test_clean_names_strip_underscores_right(multiindex_dataframe):
df = clean_names(multiindex_dataframe, strip_underscores='right')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw the following on line 192:

df = multiindex_dataframe.rename(columns=lambda x: '_' + x)

Do you think we need the same before line 179?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only need that for 'l' and 'left'. The original multiindex_dataframe has a right/trailing underscore on r_i_p_rhino_. There were no leading/left underscores so I added some with df = multiindex_dataframe.rename(columns=lambda x: '_' + x) . Is this ok?

BTW, it was actually the trailing underscore in r_i_p_rhino_ that made me think that this strip_underscores option was needed!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I should add this line to 'both' so that we fully test a mix of left, right and both-sided stripping. Will add this now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only need that for 'l' and 'left'. The original multiindex_dataframe has a right/trailing underscore on r_i_p_rhino_. There were no leading/left underscores so I added some with df = multiindex_dataframe.rename(columns=lambda x: '_' + x) . Is this ok?

Ok got it! Thanks for pointing it out, I should have read the code a bit more closely.


levels = [
['a', 'bell_chart', 'decorated_elephant'],
['b', 'normal_distribution', 'r_i_p_rhino']
]

labels = [[1, 0, 2], [1, 0, 2]]

expected_columns = pd.MultiIndex(levels=levels, labels=labels)
assert set(df.columns) == set(expected_columns)


def test_clean_names_strip_underscores_r(multiindex_dataframe):
    """The 'r' shorthand yields column names without trailing underscores."""
    cleaned = clean_names(multiindex_dataframe, strip_underscores='r')

    # Expected labels for both column levels after cleaning.
    wanted = pd.MultiIndex(
        levels=[
            ['a', 'bell_chart', 'decorated_elephant'],
            ['b', 'normal_distribution', 'r_i_p_rhino'],
        ],
        labels=[[1, 0, 2], [1, 0, 2]],
    )
    assert set(cleaned.columns) == set(wanted)


def test_clean_names_strip_underscores_left(multiindex_dataframe):
    """``strip_underscores='left'`` removes only leading underscores."""
    # Add a leading underscore to every column label so that left-stripping
    # has something to remove.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    # Clean the prefixed frame; passing the untouched fixture here would
    # discard the rename above and leave left-stripping untested.
    df = clean_names(df, strip_underscores='left')

    # The trailing underscore on 'r_i_p_rhino_' is expected to survive.
    levels = [
        ['a', 'bell_chart', 'decorated_elephant'],
        ['b', 'normal_distribution', 'r_i_p_rhino_']
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)


def test_clean_names_strip_underscores_l(multiindex_dataframe):
    """The 'l' shorthand removes only leading underscores."""
    # Add a leading underscore to every column label so that left-stripping
    # has something to remove.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    # Clean the prefixed frame; passing the untouched fixture here would
    # discard the rename above and leave left-stripping untested.
    df = clean_names(df, strip_underscores='l')

    # The trailing underscore on 'r_i_p_rhino_' is expected to survive.
    levels = [
        ['a', 'bell_chart', 'decorated_elephant'],
        ['b', 'normal_distribution', 'r_i_p_rhino_']
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)