Skip to content

Commit e5c9c76

Browse files
authored
Merge pull request #13 from JoshuaC3/clean_names-remove-outer-underscores
Clean names remove outer underscores
2 parents cb7ff17 + be35c09 commit e5c9c76

File tree

2 files changed

+126
-2
lines changed

2 files changed

+126
-2
lines changed

janitor/functions.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,38 @@
88
import re
99

1010

11-
def clean_names(df):
11+
def _strip_underscores(df, strip_underscores=None):
    """
    Strip underscores from the beginning, end or both ends of the
    DataFrame's column names.

    .. code-block:: python

        df = _strip_underscores(df, strip_underscores='left')

    :param df: The pandas DataFrame object.
    :param strip_underscores: (optional) Removes the outer underscores from all
        column names. Default None keeps outer underscores. Values can be
        either 'left', 'right' or 'both' or the respective shorthand 'l', 'r'
        and True.
    :returns: A pandas DataFrame.
    :raises JanitorError: if ``strip_underscores`` is not one of the
        accepted values.
    """
    underscore_options = [None, 'left', 'right', 'both', 'l', 'r', True]
    if strip_underscores not in underscore_options:
        raise JanitorError(
            """strip_underscores must be one of: %s""" % underscore_options
        )

    if strip_underscores in ('left', 'l'):
        df = df.rename(columns=lambda x: x.lstrip('_'))
    elif strip_underscores in ('right', 'r'):
        df = df.rename(columns=lambda x: x.rstrip('_'))
    elif strip_underscores in ('both', True):
        # Membership (equality) rather than ``is True``: the validation above
        # also compares by equality, so any value it lets through as "True"
        # (e.g. the integer 1) must strip both sides instead of silently
        # leaving the column names untouched.
        df = df.rename(columns=lambda x: x.strip('_'))
    return df
40+
41+
42+
def clean_names(df, strip_underscores=None):
1243
"""
1344
Clean column names.
1445
@@ -29,6 +60,10 @@ def clean_names(df):
2960
df = jn.DataFrame(df).clean_names()
3061
3162
:param df: The pandas DataFrame object.
63+
:param strip_underscores: (optional) Removes the outer underscores from all
64+
column names. Default None keeps outer underscores. Values can be
65+
either 'left', 'right' or 'both' or the respective shorthand 'l', 'r'
66+
and True.
3267
:returns: A pandas DataFrame.
3368
"""
3469
df = df.rename(
@@ -47,6 +82,7 @@ def clean_names(df):
4782
)
4883

4984
df = df.rename(columns=lambda x: re.sub('_+', '_', x))
85+
df = _strip_underscores(df, strip_underscores)
5086
return df
5187

5288

@@ -190,7 +226,7 @@ def get_features_targets(df, target_columns, feature_columns=None):
190226
if isinstance(target_columns, str):
191227
xcols = [c for c in df.columns if target_columns != c]
192228
elif (isinstance(target_columns, list)
193-
or isinstance(target_columns, tuple)):
229+
or isinstance(target_columns, tuple)):
194230
xcols = [c for c in df.columns if c not in target_columns]
195231
X = df[xcols]
196232
return X, Y

tests/test_functions.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,91 @@ def test_multiindex_clean_names_pipe(multiindex_dataframe):
159159

160160
expected_columns = pd.MultiIndex(levels=levels, labels=labels)
161161
assert set(df.columns) == set(expected_columns)
162+
163+
164+
def test_clean_names_strip_underscores_both(multiindex_dataframe):
    """``strip_underscores='both'`` removes leading and trailing underscores."""
    # Prefix every label with an underscore so there is something to strip
    # on the left, and make sure the *renamed* frame is what gets cleaned.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    df = clean_names(df, strip_underscores='both')

    # Built with from_tuples so the expectation does not depend on the
    # keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino'),
    ])
    assert set(df.columns) == set(expected_columns)
177+
178+
179+
def test_clean_names_strip_underscores_true(multiindex_dataframe):
    """``strip_underscores=True`` behaves like ``'both'``."""
    # Prefix every label with an underscore so there is something to strip
    # on the left, and make sure the *renamed* frame is what gets cleaned.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    df = clean_names(df, strip_underscores=True)

    # Built with from_tuples so the expectation does not depend on the
    # keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino'),
    ])
    assert set(df.columns) == set(expected_columns)
192+
193+
194+
def test_clean_names_strip_underscores_right(multiindex_dataframe):
    """``strip_underscores='right'`` removes trailing underscores."""
    df = clean_names(multiindex_dataframe, strip_underscores='right')

    # Built with from_tuples so the expectation does not depend on the
    # keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino'),
    ])
    assert set(df.columns) == set(expected_columns)
206+
207+
208+
def test_clean_names_strip_underscores_r(multiindex_dataframe):
    """The shorthand ``'r'`` behaves like ``'right'``."""
    df = clean_names(multiindex_dataframe, strip_underscores='r')

    # Built with from_tuples so the expectation does not depend on the
    # keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino'),
    ])
    assert set(df.columns) == set(expected_columns)
220+
221+
222+
def test_clean_names_strip_underscores_left(multiindex_dataframe):
    """``strip_underscores='left'`` removes only leading underscores."""
    # Prefix every label with an underscore so there is something to strip
    # on the left, and make sure the *renamed* frame is what gets cleaned.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    df = clean_names(df, strip_underscores='left')

    # The trailing underscore on 'r_i_p_rhino_' must survive a left-only
    # strip. Built with from_tuples so the expectation does not depend on
    # the keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino_'),
    ])
    assert set(df.columns) == set(expected_columns)
235+
236+
237+
def test_clean_names_strip_underscores_l(multiindex_dataframe):
    """The shorthand ``'l'`` behaves like ``'left'``."""
    # Prefix every label with an underscore so there is something to strip
    # on the left, and make sure the *renamed* frame is what gets cleaned.
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    df = clean_names(df, strip_underscores='l')

    # The trailing underscore on 'r_i_p_rhino_' must survive a left-only
    # strip. Built with from_tuples so the expectation does not depend on
    # the keyword names of the MultiIndex constructor.
    expected_columns = pd.MultiIndex.from_tuples([
        ('bell_chart', 'normal_distribution'),
        ('a', 'b'),
        ('decorated_elephant', 'r_i_p_rhino_'),
    ])
    assert set(df.columns) == set(expected_columns)

0 commit comments

Comments
 (0)