Skip to content

Commit a8bfabc

Browse files
mralbuericmjl
authored and committed
[ENH] Add optional removal of accents on functions.clean_names, enabled by default. (#506)
1 parent f5c21d0 commit a8bfabc

File tree

4 files changed

+27
-1
lines changed

4 files changed

+27
-1
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,4 @@ Contributors
7676
- `@puruckertom <https://github.com/puruckertom>`_ | `contributions <https://github.com/ericmjl/pyjanitor/pulls?utf8=%E2%9C%93&q=is%3Apr+author%3Apuruckertom>`_
7777
- `@thomasjpfan <https://github.com/thomasjpfan>`_ | `contributions <https://github.com/ericmjl/pyjanitor/issues?q=is%3Aclosed+mentions%3Athomasjpfan>`_
7878
- `@jiafengkevinchen <https://github.com/jiafengkevinchen>`_ | `contributions <https://github.com/ericmjl/pyjanitor/pull/480#issue-298730562>`_
79+
- `@mralbu <https://github.com/mralbu>`_ | `contributions <https://github.com/ericmjl/pyjanitor/issues/502>`_

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ v0.18.1 (on deck)
33
- [ENH] add preserve_position kwarg to deconcatenate_column with tests by @shandou and @ericmjl
44
- [DOC] add contributions that did not leave ``git`` traces by @ericmjl
55
- [ENH] add inflation adjustment in finance submodule by @rahosbach
6-
6+
- [ENH] add optional removal of accents on functions.clean_names, enabled by default by @mralbu
77

88
For changes that happened prior to v0.18.1,
99
please consult the closed PRs,

janitor/functions.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import datetime as dt
44
import re
5+
import unicodedata
56
import warnings
67
from fnmatch import translate
78
from functools import partial, reduce
@@ -121,6 +122,7 @@ def clean_names(
121122
strip_underscores: str = None,
122123
case_type: str = "lower",
123124
remove_special: bool = False,
125+
strip_accents: bool = True,
124126
preserve_original_columns: bool = True,
125127
) -> pd.DataFrame:
126128
"""
@@ -189,6 +191,9 @@ def _remove_special(col):
189191
if remove_special:
190192
df = df.rename(columns=_remove_special)
191193

194+
if strip_accents:
195+
df = df.rename(columns=_strip_accents)
196+
192197
df = df.rename(columns=lambda x: re.sub("_+", "_", x))
193198
df = _strip_underscores(df, strip_underscores)
194199

@@ -208,6 +213,18 @@ def _normalize_1(col_name: str) -> str:
208213
return result
209214

210215

216+
def _strip_accents(col_name: str) -> str:
217+
"""
218+
Removes accents from a DataFrame column name.
219+
.. _StackOverflow: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string # noqa: E501
220+
"""
221+
return "".join(
222+
l
223+
for l in unicodedata.normalize("NFD", col_name)
224+
if not unicodedata.combining(l)
225+
)
226+
227+
211228
@pf.register_dataframe_method
212229
def remove_empty(df: pd.DataFrame) -> pd.DataFrame:
213230
"""

tests/functions/test_clean_names.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ def test_clean_names_strip_underscores(
104104
assert set(df.columns) == set(expected_columns)
105105

106106

107+
@pytest.mark.functions
def test_clean_names_strip_accents():
    """
    Column names containing accented Latin and Cyrillic characters come back
    lower-cased and without accents when ``strip_accents=True``.
    """
    accented = {"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}
    cleaned = pd.DataFrame(accented).clean_names(strip_accents=True)
    # Order is irrelevant here; only the resulting set of names is checked.
    assert set(cleaned.columns) == {"joao", "лукася", "kafer"}
113+
114+
107115
@pytest.mark.functions
108116
def test_incorrect_strip_underscores(multiindex_dataframe):
109117
with pytest.raises(JanitorError):

0 commit comments

Comments
 (0)