Skip to content

[ENH] Add snake option to clean_names #509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ v0.18.1 (on deck)
- [DOC] clarified how new functions should be implemented by @shandou
- [ENH] add optional removal of accents on functions.clean_names, enabled by
default by @mralbu
- [ENH] add camelCase conversion to snake_case on ``clean_names`` by @ericmjl,
h/t @jtaylor for sharing the original implementation

For changes that happened prior to v0.18.1,
please consult the closed PRs,
Expand Down
43 changes: 33 additions & 10 deletions janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,10 @@ def clean_names(
either 'left', 'right' or 'both' or the respective shorthand 'l', 'r'
and True.
:param case_type: (optional) Whether to make columns lower or uppercase.
Current case may be preserved with 'preserve'. Default 'lower'
makes all characters lowercase.
Current case may be preserved with 'preserve',
while snake case conversion (from CamelCase or camelCase only)
can be turned on using "snake".
Default 'lower' makes all characters lowercase.
:param remove_special: (optional) Remove special characters from columns.
Only letters, numbers and underscores are preserved.
:returns: A pandas DataFrame.
Expand All @@ -170,11 +172,11 @@ def clean_names(
"""
original_column_names = list(df.columns)

assert case_type.lower() in {
"preserve",
"upper",
"lower",
}, "case_type argument must be one of ('preserve', 'upper', 'lower')"
case_types = {"preserve", "upper", "lower", "snake"}

assert (
case_type.lower() in case_types
), f"case_type argument must be one of {case_types}"

if case_type.lower() != "preserve":
if case_type.lower() == "upper":
Expand All @@ -183,10 +185,10 @@ def clean_names(
elif case_type.lower() == "lower":
df = df.rename(columns=lambda x: x.lower())

df = df.rename(columns=_normalize_1)
elif case_type.lower() == "snake":
df = df.rename(columns=_camel2snake)

def _remove_special(col):
return "".join(item for item in col if item.isalnum() or "_" in item)
df = df.rename(columns=_normalize_1)

if remove_special:
df = df.rename(columns=_remove_special)
Expand All @@ -203,6 +205,27 @@ def _remove_special(col):
return df


def _remove_special(col_name):
"""Remove special characters from column name."""
return "".join(item for item in col_name if item.isalnum() or "_" in item)


_underscorer1 = re.compile(r"(.)([A-Z][a-z]+)")
_underscorer2 = re.compile("([a-z0-9])([A-Z])")


def _camel2snake(col_name: str) -> str:
"""
Convert camelcase names to snake case.

Implementation taken from: https://gist.github.com/jaytaylor/3660565
by @jtaylor
"""

subbed = _underscorer1.sub(r"\1_\2", col_name)
return _underscorer2.sub(r"\1_\2", subbed).lower()


# (regex pattern, replacement) pairs: the first maps spaces and the listed
# punctuation characters (/ : , ? ( ) . -) to "_"; the second maps straight
# and curly apostrophes to the empty string.
# NOTE(review): presumably consumed by the column-name normaliser; the
# consumer is not visible here, so confirm before relying on ordering.
FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", "")]


Expand Down
29 changes: 29 additions & 0 deletions tests/functions/test_clean_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,32 @@ def test_clean_names_preserve_case_true(multiindex_dataframe):

expected_columns = pd.MultiIndex(levels=levels, codes=codes)
assert set(df.columns) == set(expected_columns)


@pytest.mark.functions
@given(df=df_strategy())
def test_clean_names_camelcase_to_snake(df):
    """Check that a single CamelCase column name is converted to snake case."""
    renamed = df.select_columns(["a"]).rename_column("a", "AColumnName")
    cleaned = renamed.clean_names(case_type="snake")
    assert list(cleaned.columns) == ["a_column_name"]


@pytest.mark.functions
def test_clean_names_camelcase_to_snake_multi(dataframe):
    """
    Check snake-case conversion across several differently styled columns.

    Covers a camelCase name, a CamelCase name with a trailing digit, and a
    name that is already snake case, with underscore stripping and
    special-character removal enabled.

    The function name carries a ``_multi`` suffix so that it does not share
    a name with another test in this module; two module-level functions with
    the same name would leave only the later definition bound, and the
    earlier test would silently never run.
    """
    df = (
        dataframe.select_columns(["a", "Bell__Chart", "decorated-elephant"])
        .rename_column("a", "snakesOnAPlane")
        .rename_column("Bell__Chart", "SnakesOnAPlane2")
        .rename_column("decorated-elephant", "snakes_on_a_plane3")
        .clean_names(
            case_type="snake", strip_underscores=True, remove_special=True
        )
    )
    assert list(df.columns) == [
        "snakes_on_a_plane",
        "snakes_on_a_plane2",
        "snakes_on_a_plane3",
    ]