Skip to content

Commit 67535f0

Browse files
samukweku and samukweku authored
[ENH] Process text (#878)
* remove _process_text in utils
* linting
* lint fixes
* fix linting
* add missing dataframes to conditional_join example
* minor updates
* add better error message for type check in conditional_join
* updates for not equal
* updates
* updates
* update changelog
* update to tests

Co-authored-by: samukweku <root@044f9633530c>
1 parent ab6ae49 commit 67535f0

File tree

5 files changed

+101
-204
lines changed

5 files changed

+101
-204
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
- [ENH] Deprecate `aggfunc` from `pivot_wider`; aggregation can be chained with pandas' `groupby`.
77
- [BUG] Fix conditional join issue for multiple conditions, where pd.eval fails to evaluate if numexpr is installed. #898 @samukweku
88
- [ENH] Added `case_when` to handle multiple conditionals and replacement values. Issue #736. @robertmitchellv
9+
- [ENH] Deprecate `new_column_names` and `merge_frame` from `process_text`. Only existing columns are supported. @samukweku
910

1011

1112
## [v0.21.1] - 2021-08-29

janitor/functions.py

Lines changed: 30 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
_currency_column_to_numeric,
4949
_data_checks_pivot_longer,
5050
_data_checks_pivot_wider,
51-
_process_text,
5251
_replace_empty_string_with_none,
5352
_replace_original_empty_string_with_none,
5453
_select_columns,
@@ -5060,21 +5059,17 @@ def expand_grid(
50605059
def process_text(
50615060
df: pd.DataFrame,
50625061
column_name: str,
5063-
new_column_names: Optional[Union[str, list]] = None,
5064-
merge_frame: Optional[bool] = False,
5065-
string_function: Optional[str] = None,
5062+
string_function: str,
50665063
**kwargs: str,
50675064
) -> pd.DataFrame:
50685065
"""
50695066
Apply a Pandas string method to an existing column and return a dataframe.
50705067
50715068
This function aims to make string cleaning easy, while chaining,
50725069
by simply passing the string method name to the ``process_text`` function.
5073-
This modifies an existing column and can also be used to create a new
5074-
column.
5070+
This modifies an existing column; it does not create a new column.
5071+
New columns can be created via pyjanitor's `transform_columns`.
50755072
5076-
.. note:: In versions < 0.20.11, this function did not support the
5077-
creation of new columns.
50785073
50795074
A list of all the string methods in Pandas can be accessed `here
50805075
<https://pandas.pydata.org/docs/user_guide/text.html#method-summary>`__.
@@ -5086,10 +5081,10 @@ def process_text(
50865081
import pandas as pd
50875082
import janitor as jn
50885083
5089-
df = pd.DataFrame({"text" : ["Ragnar",
5090-
"sammywemmy",
5091-
"ginger"],
5092-
"code" : [1, 2, 3]})
5084+
text code
5085+
0 Ragnar 1
5086+
1 sammywemmy 2
5087+
2 ginger 3
50935088
50945089
df.process_text(column_name = "text",
50955090
string_function = "lower")
@@ -5114,22 +5109,6 @@ def process_text(
51145109
1 NaN 2
51155110
2 NaN 3
51165111
5117-
A new column can be created, leaving the existing column unmodified::
5118-
5119-
df.process_text(
5120-
column_name = "text",
5121-
new_column_names = "new_text",
5122-
string_function = "extract",
5123-
pat = r"(ag)",
5124-
flags = re.IGNORECASE
5125-
)
5126-
5127-
text code new_text
5128-
0 Ragnar 1 ag
5129-
1 sammywemmy 2 NaN
5130-
2 ginger 3 NaN
5131-
5132-
51335112
Functional usage syntax:
51345113
51355114
.. code-block:: python
@@ -5141,8 +5120,6 @@ def process_text(
51415120
df = jn.process_text(
51425121
df = df,
51435122
column_name,
5144-
new_column_names = None/string/list_of_strings,
5145-
merge_frame = True/False,
51465123
string_function = "string_func_name_here",
51475124
kwargs
51485125
)
@@ -5158,8 +5135,6 @@ def process_text(
51585135
pd.DataFrame(...)
51595136
.process_text(
51605137
column_name,
5161-
new_column_names = None/string/list_of_strings,
5162-
merge_frame = True/False
51635138
string_function = "string_func_name_here",
51645139
kwargs
51655140
)
@@ -5168,77 +5143,39 @@ def process_text(
51685143
51695144
:param df: A pandas dataframe.
51705145
:param column_name: String column to be operated on.
5171-
:param new_column_names: Name(s) to assign to the new column(s) created
5172-
from the text processing. `new_column_names` can be a string, if
5173-
the result of the text processing is a Series or string; if the
5174-
result of the text processing is a dataframe, then `new_column_names`
5175-
is treated as a prefix for each of the columns in the new dataframe.
5176-
`new_column_names` can also be a list of strings to act as new
5177-
column names for the new dataframe. The existing `column_name`
5178-
stays unmodified if `new_column_names` is not None.
5179-
:param merge_frame: This comes into play if the result of the text
5180-
processing is a dataframe. If `True`, the resulting dataframe
5181-
will be merged with the original dataframe, else the resulting
5182-
dataframe, not the original dataframe, will be returned.
51835146
:param string_function: Pandas string method to be applied.
51845147
:param kwargs: Keyword arguments for parameters of the `string_function`.
51855148
:returns: A pandas dataframe with modified column(s).
51865149
:raises KeyError: if ``string_function`` is not a Pandas string method.
5187-
:raises TypeError: if wrong ``arg`` or ``kwarg`` is supplied.
5150+
:raises TypeError: if the wrong ``kwarg`` is supplied.
51885151
:raises ValueError: if `column_name` not found in dataframe.
5189-
:raises ValueError: if `new_column_names` is not None and is found in
5190-
dataframe.
51915152
51925153
.. # noqa: DAR402
51935154
"""
5194-
df = df.copy()
5195-
51965155
check("column_name", column_name, [str])
5156+
check("string_function", string_function, [str])
51975157
check_column(df, [column_name])
51985158

5199-
# new_column_names should not already exist in the dataframe
5200-
if new_column_names:
5201-
check("new_column_names", new_column_names, [list, str])
5202-
if isinstance(new_column_names, str):
5203-
check_column(df, [new_column_names], present=False)
5204-
else:
5205-
check_column(df, new_column_names, present=False)
5206-
5207-
if merge_frame:
5208-
check("merge_frame", merge_frame, [bool])
5209-
52105159
pandas_string_methods = [
52115160
func.__name__
52125161
for _, func in inspect.getmembers(pd.Series.str, inspect.isfunction)
52135162
if not func.__name__.startswith("_")
52145163
]
52155164

5216-
if not string_function:
5217-
return df
5218-
52195165
if string_function not in pandas_string_methods:
52205166
raise KeyError(f"{string_function} is not a Pandas string method.")
52215167

5222-
if string_function == "extractall" and merge_frame:
5223-
# create unique indices
5224-
# comes in handy for executing joins if there are
5225-
# duplicated indices in the original dataframe
5226-
df = df.set_index(np.arange(len(df)), append=True) # extra_index_line
5227-
52285168
result = getattr(df[column_name].str, string_function)(**kwargs)
52295169

5230-
# TODO: Support for str.cat with `join` parameter
5231-
# need a robust way to handle the results
5232-
# if there is a `join` parameter, as this could create more
5233-
# or less rows with varying indices or even duplicate indices
5234-
5235-
return _process_text(
5236-
result,
5237-
df=df,
5238-
column_name=column_name,
5239-
new_column_names=new_column_names,
5240-
merge_frame=merge_frame,
5241-
)
5170+
if isinstance(result, pd.DataFrame):
5171+
raise ValueError(
5172+
"""
5173+
The outcome of the processed text is a DataFrame,
5174+
which is not supported in `process_text`.
5175+
"""
5176+
)
5177+
5178+
return df.assign(**{column_name: result})
52425179

52435180

52445181
@pf.register_dataframe_method
@@ -6715,6 +6652,18 @@ def conditional_join(
67156652
Join on just equality is also possible, but should be avoided -
67166653
Pandas merge/join is more efficient::
67176654
6655+
df1
6656+
col_a col_b
6657+
0 1 A
6658+
1 2 B
6659+
2 3 C
6660+
6661+
df2
6662+
col_a col_c
6663+
0 0 Z
6664+
1 2 X
6665+
2 3 Y
6666+
67186667
df1.conditional_join(
67196668
df2,
67206669
('col_a', 'col_a', '=='),

janitor/utils.py

Lines changed: 27 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1922,7 +1922,7 @@ def _computations_pivot_wider(
19221922
# check dtype of `names_from` is string
19231923
names_from_all_strings = df.filter(names_from).agg(is_string_dtype).all()
19241924

1925-
if names_sort is True:
1925+
if names_sort:
19261926
# Categorical dtypes created only for `names_from`
19271927
# since that is what will become the new column names
19281928
dtypes = {
@@ -1941,7 +1941,7 @@ def _computations_pivot_wider(
19411941
df = df.reorder_levels(order=levels_order, axis="columns")
19421942

19431943
# an empty df is likely because
1944-
# there are no `values_from`
1944+
# there is no `values_from`
19451945
if any((df.empty, flatten_levels is False)):
19461946
return df
19471947

@@ -1953,7 +1953,7 @@ def _computations_pivot_wider(
19531953
else:
19541954
df.columns = df.columns.astype(str)
19551955

1956-
if names_sep is not None and (isinstance(df.columns, pd.MultiIndex)):
1956+
if (names_sep is not None) and (isinstance(df.columns, pd.MultiIndex)):
19571957
df.columns = df.columns.map(names_sep.join)
19581958

19591959
if names_glue:
@@ -2460,108 +2460,6 @@ def _column_sel_dispatch(columns_to_select, df): # noqa: F811
24602460
return filtered_columns
24612461

24622462

2463-
@functools.singledispatch
2464-
def _process_text(result: str, df, column_name, new_column_names, merge_frame):
2465-
"""
2466-
Base function for `process_text` when `result` is of ``str`` type.
2467-
"""
2468-
if new_column_names:
2469-
return df.assign(**{new_column_names: result})
2470-
df[column_name] = result
2471-
return df
2472-
2473-
2474-
@_process_text.register
2475-
def _sub_process_text(
2476-
result: pd.Series, df, column_name, new_column_names, merge_frame
2477-
):
2478-
"""
2479-
Base function for `process_text` when `result` is of ``pd.Series`` type.
2480-
"""
2481-
if new_column_names:
2482-
return df.assign(**{new_column_names: result})
2483-
df[column_name] = result
2484-
return df
2485-
2486-
2487-
@_process_text.register # noqa: F811
2488-
def _sub_process_text( # noqa: F811
2489-
result: pd.DataFrame, df, column_name, new_column_names, merge_frame
2490-
): # noqa: F811
2491-
"""
2492-
Base function for `process_text` when `result` is of ``pd.DataFrame`` type.
2493-
"""
2494-
result = _process_text_result_is_frame(new_column_names, result)
2495-
if not merge_frame:
2496-
return result
2497-
return _process_text_result_MultiIndex(result.index, result, df)
2498-
2499-
2500-
@functools.singledispatch
2501-
def _process_text_result_is_frame(new_column_names: str, result):
2502-
"""
2503-
Function to modify `result` columns from `process_text` if
2504-
`result` is a dataframe. Applies only if `new_column_names`
2505-
is a string type.
2506-
"""
2507-
if new_column_names:
2508-
return result.add_prefix(new_column_names)
2509-
return result
2510-
2511-
2512-
@_process_text_result_is_frame.register
2513-
def _sub_process_text_result_is_frame(new_column_names: list, result):
2514-
"""
2515-
Function to modify `result` columns from `process_text` if
2516-
`result` is a dataframe. Applies only if `new_column_names`
2517-
is a list type.
2518-
"""
2519-
if len(new_column_names) != len(result.columns):
2520-
raise ValueError(
2521-
"""
2522-
The length of `new_column_names` does not
2523-
match the number of columns in the new
2524-
dataframe generated from the text processing.
2525-
"""
2526-
)
2527-
result.columns = new_column_names
2528-
return result
2529-
2530-
2531-
@functools.singledispatch
2532-
def _process_text_result_MultiIndex(index: pd.Index, result, df):
2533-
"""
2534-
Function to modify `result` columns from `process_text` if
2535-
`result` is a dataframe and it has a single Index.
2536-
"""
2537-
return pd.concat([df, result], axis="columns")
2538-
2539-
2540-
@_process_text_result_MultiIndex.register
2541-
def _sub_process_text_result_MultiIndex(index: pd.MultiIndex, result, df):
2542-
"""
2543-
Function to modify `result` columns from `process_text` if
2544-
`result` is a dataframe and it has a MultiIndex.
2545-
At the moment, this function is primarily to cater for `str.extractall`,
2546-
since at the moment,
2547-
this is the only string method that returns a MultiIndex.
2548-
The function may be modified,
2549-
if another string function that returns a MultIndex
2550-
is added to Pandas string methods.
2551-
2552-
For this function, `df` has been converted to a MultiIndex,
2553-
with the extra index added to create unique indices.
2554-
This comes in handy when merging back the dataframe,
2555-
especially if `result` returns duplicate indices.
2556-
"""
2557-
result = result.reset_index(level="match")
2558-
df = df.join(result, how="outer")
2559-
# droplevel gets rid of the extra index added at the start
2560-
# (# extra_index_line)
2561-
df = df.droplevel(-1).set_index("match", append=True)
2562-
return df
2563-
2564-
25652463
class JOINOPERATOR(Enum):
25662464
"""
25672465
List of operators used in conditional_join.
@@ -2794,20 +2692,41 @@ def _conditional_join_type_check(
27942692
Strings can only be compared
27952693
on the equal(`==`) operator.
27962694
"""
2695+
error_msg_dtype = """
2696+
The left column ({l_name}) is a {dtype} type,
2697+
while the right column ({r_name}) is not.
2698+
Kindly ensure both columns are the same dtype.
2699+
"""
27972700
if is_string_dtype(left_column):
27982701
if not is_string_dtype(right_column):
2799-
raise ValueError(error_msg)
2702+
mapper = {
2703+
"l_name": left_column.name,
2704+
"dtype": "string",
2705+
"r_name": right_column.name,
2706+
}
2707+
raise ValueError(error_msg_dtype.format(**mapper))
28002708
if op != JOINOPERATOR.STRICTLY_EQUAL.value:
28012709
raise ValueError(error_msg_string)
28022710
return None
28032711
if is_numeric_dtype(left_column):
28042712
if not is_numeric_dtype(right_column):
2805-
raise ValueError(error_msg)
2713+
mapper = {
2714+
"l_name": left_column.name,
2715+
"dtype": "numeric",
2716+
"r_name": right_column.name,
2717+
}
2718+
raise ValueError(error_msg_dtype.format(**mapper))
28062719
return None
28072720
if is_datetime64_dtype(left_column):
28082721
if not is_datetime64_dtype(right_column):
2809-
raise ValueError(error_msg)
2722+
mapper = {
2723+
"l_name": left_column.name,
2724+
"dtype": "datetime",
2725+
"r_name": right_column.name,
2726+
}
2727+
raise ValueError(error_msg_dtype.format(**mapper))
28102728
return None
2729+
raise ValueError(error_msg)
28112730

28122731

28132732
def _interval_ranges(indices: np.ndarray, right: np.ndarray) -> np.ndarray:

0 commit comments

Comments
 (0)