Skip to content

Updated column name cleaning functions. #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from .errors import JanitorError

+ import re
+

def clean_names(df):
"""
Expand All @@ -29,8 +31,26 @@ def clean_names(df):
:param df: The pandas DataFrame object.
:returns: A pandas DataFrame.
"""
- columns = [c.lower().replace(' ', '_') for c in df.columns]
- df.columns = columns
+ columns = [(c.lower()
+ .replace(' ', '_')
+ .replace('/', '_')
+ .replace(':', '_')
+ .replace("'", '')
+ .replace('’', '')
+ .replace(',', '_')
+ .replace('?', '_')
+ .replace('-', '_')
+ .replace('(', '_')
+ .replace(')', '_')
+ .replace('.', '_')
+ ) for c in df.columns]
+
+ newcolumns = []
+ for col in columns:
+ # NOTE: Replace repeating underscores with single ones
+ newcol = re.sub('[_]{2,}', '_', col)
+ newcolumns.append(newcol)
+ df.columns = newcolumns
return df


Expand Down
10 changes: 5 additions & 5 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def dataframe():
data = {
'a': [1, 2, 3],
- 'Bell Chart': [1, 2, 3],
+ 'Bell__Chart': [1, 2, 3],
'decorated-elephant': [1, 2, 3],
}
df = pd.DataFrame(data)
Expand All @@ -28,20 +28,20 @@ def null_df():

def test_clean_names_functional(dataframe):
df = clean_names(dataframe)
- expected_columns = ['a', 'bell_chart', 'decorated-elephant']
+ expected_columns = ['a', 'bell_chart', 'decorated_elephant']

assert set(df.columns) == set(expected_columns)


def test_clean_names_method_chain(dataframe):
df = jn.DataFrame(dataframe).clean_names()
- expected_columns = ['a', 'bell_chart', 'decorated-elephant']
+ expected_columns = ['a', 'bell_chart', 'decorated_elephant']
assert set(df.columns) == set(expected_columns)


def test_clean_names_pipe(dataframe):
df = dataframe.pipe(clean_names)
- expected_columns = ['a', 'bell_chart', 'decorated-elephant']
+ expected_columns = ['a', 'bell_chart', 'decorated_elephant']
assert set(df.columns) == set(expected_columns)


Expand Down Expand Up @@ -82,7 +82,7 @@ def test_get_features_targets(dataframe):
def test_rename_column(dataframe):
dataframe = jn.DataFrame(dataframe).clean_names()
df = dataframe.rename_column('a', 'index')
- assert set(df.columns) == set(['index', 'bell_chart', 'decorated-elephant']) # noqa: E501
+ assert set(df.columns) == set(['index', 'bell_chart', 'decorated_elephant']) # noqa: E501


def test_coalesce():
Expand Down