diff --git a/AUTHORS.rst b/AUTHORS.rst index cf1883228..8265fa70c 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -76,3 +76,4 @@ Contributors - `@puruckertom `_ | `contributions `_ - `@thomasjpfan `_ | `contributions `_ - `@jiafengkevinchen `_ | `contributions `_ +- `@mralbu `_ | `contributions `_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f8a624d9e..2acf25092 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,7 +3,7 @@ v0.18.1 (on deck) - [ENH] add preserve_position kwarg to deconcatenate_column with tests by @shandou and @ericmjl - [DOC] add contributions that did not leave ``git`` traces by @ericmjl - [ENH] add inflation adjustment in finance submodule by @rahosbach - +- [ENH] add optional removal of accents on functions.clean_names, enabled by default by @mralbu For changes that happened prior to v0.18.1, please consult the closed PRs, diff --git a/janitor/functions.py b/janitor/functions.py index 4b425ff0f..74e0a45fc 100644 --- a/janitor/functions.py +++ b/janitor/functions.py @@ -2,6 +2,7 @@ import datetime as dt import re +import unicodedata import warnings from fnmatch import translate from functools import partial, reduce @@ -121,6 +122,7 @@ def clean_names( strip_underscores: str = None, case_type: str = "lower", remove_special: bool = False, + strip_accents: bool = True, preserve_original_columns: bool = True, ) -> pd.DataFrame: """ @@ -189,6 +191,9 @@ def _remove_special(col): if remove_special: df = df.rename(columns=_remove_special) + if strip_accents: + df = df.rename(columns=_strip_accents) + df = df.rename(columns=lambda x: re.sub("_+", "_", x)) df = _strip_underscores(df, strip_underscores) @@ -208,6 +213,18 @@ def _normalize_1(col_name: str) -> str: return result +def _strip_accents(col_name: str) -> str: + """ + Removes accents from a DataFrame column name. + .. _StackOverflow: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string # noqa: E501 + """ + return "".join( + l + for l in unicodedata.normalize("NFD", col_name) + if not unicodedata.combining(l) + ) + + @pf.register_dataframe_method def remove_empty(df: pd.DataFrame) -> pd.DataFrame: """ diff --git a/tests/functions/test_clean_names.py b/tests/functions/test_clean_names.py index 707350a2e..85430e7d3 100644 --- a/tests/functions/test_clean_names.py +++ b/tests/functions/test_clean_names.py @@ -104,6 +104,14 @@ def test_clean_names_strip_underscores( assert set(df.columns) == set(expected_columns) +@pytest.mark.functions +def test_clean_names_strip_accents(): + df = pd.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) + df = df.clean_names(strip_accents=True) + expected_columns = ["joao", "лукася", "kafer"] + assert set(df.columns) == set(expected_columns) + + @pytest.mark.functions def test_incorrect_strip_underscores(multiindex_dataframe): with pytest.raises(JanitorError):