48
48
_currency_column_to_numeric ,
49
49
_data_checks_pivot_longer ,
50
50
_data_checks_pivot_wider ,
51
- _process_text ,
52
51
_replace_empty_string_with_none ,
53
52
_replace_original_empty_string_with_none ,
54
53
_select_columns ,
@@ -5060,21 +5059,17 @@ def expand_grid(
5060
5059
def process_text (
5061
5060
df : pd .DataFrame ,
5062
5061
column_name : str ,
5063
- new_column_names : Optional [Union [str , list ]] = None ,
5064
- merge_frame : Optional [bool ] = False ,
5065
- string_function : Optional [str ] = None ,
5062
+ string_function : str ,
5066
5063
** kwargs : str ,
5067
5064
) -> pd .DataFrame :
5068
5065
"""
5069
5066
Apply a Pandas string method to an existing column and return a dataframe.
5070
5067
5071
5068
This function aims to make string cleaning easy, while chaining,
5072
5069
by simply passing the string method name to the ``process_text`` function.
5073
- This modifies an existing column and can also be used to create a new
5074
- column .
5070
+ This modifies an existing column; it does not create a new column.
5071
+ New columns can be created via pyjanitor's `transform_columns` .
5075
5072
5076
- .. note:: In versions < 0.20.11, this function did not support the
5077
- creation of new columns.
5078
5073
5079
5074
A list of all the string methods in Pandas can be accessed `here
5080
5075
<https://pandas.pydata.org/docs/user_guide/text.html#method-summary>`__.
@@ -5086,10 +5081,10 @@ def process_text(
5086
5081
import pandas as pd
5087
5082
import janitor as jn
5088
5083
5089
- df = pd.DataFrame({" text" : ["Ragnar",
5090
- "sammywemmy",
5091
- "ginger"],
5092
- "code" : [1, 2, 3]})
5084
+ text code
5085
+ 0 Ragnar 1
5086
+ 1 sammywemmy 2
5087
+ 2 ginger 3
5093
5088
5094
5089
df.process_text(column_name = "text",
5095
5090
string_function = "lower")
@@ -5114,22 +5109,6 @@ def process_text(
5114
5109
1 NaN 2
5115
5110
2 NaN 3
5116
5111
5117
- A new column can be created, leaving the existing column unmodified::
5118
-
5119
- df.process_text(
5120
- column_name = "text",
5121
- new_column_names = "new_text",
5122
- string_function = "extract",
5123
- pat = r"(ag)",
5124
- flags = re.IGNORECASE
5125
- )
5126
-
5127
- text code new_text
5128
- 0 Ragnar 1 ag
5129
- 1 sammywemmy 2 NaN
5130
- 2 ginger 3 NaN
5131
-
5132
-
5133
5112
Functional usage syntax:
5134
5113
5135
5114
.. code-block:: python
@@ -5141,8 +5120,6 @@ def process_text(
5141
5120
df = jn.process_text(
5142
5121
df = df,
5143
5122
column_name,
5144
- new_column_names = None/string/list_of_strings,
5145
- merge_frame = True/False,
5146
5123
string_function = "string_func_name_here",
5147
5124
kwargs
5148
5125
)
@@ -5158,8 +5135,6 @@ def process_text(
5158
5135
pd.DataFrame(...)
5159
5136
.process_text(
5160
5137
column_name,
5161
- new_column_names = None/string/list_of_strings,
5162
- merge_frame = True/False
5163
5138
string_function = "string_func_name_here",
5164
5139
kwargs
5165
5140
)
@@ -5168,77 +5143,39 @@ def process_text(
5168
5143
5169
5144
:param df: A pandas dataframe.
5170
5145
:param column_name: String column to be operated on.
5171
- :param new_column_names: Name(s) to assign to the new column(s) created
5172
- from the text processing. `new_column_names` can be a string, if
5173
- the result of the text processing is a Series or string; if the
5174
- result of the text processing is a dataframe, then `new_column_names`
5175
- is treated as a prefix for each of the columns in the new dataframe.
5176
- `new_column_names` can also be a list of strings to act as new
5177
- column names for the new dataframe. The existing `column_name`
5178
- stays unmodified if `new_column_names` is not None.
5179
- :param merge_frame: This comes into play if the result of the text
5180
- processing is a dataframe. If `True`, the resulting dataframe
5181
- will be merged with the original dataframe, else the resulting
5182
- dataframe, not the original dataframe, will be returned.
5183
5146
:param string_function: Pandas string method to be applied.
5184
5147
:param kwargs: Keyword arguments for parameters of the `string_function`.
5185
5148
:returns: A pandas dataframe with modified column(s).
5186
5149
:raises KeyError: if ``string_function`` is not a Pandas string method.
5187
- :raises TypeError: if wrong ``arg`` or ``kwarg`` is supplied.
5150
+ :raises TypeError: if the wrong ``kwarg`` is supplied.
5188
5151
:raises ValueError: if `column_name` not found in dataframe.
5189
- :raises ValueError: if `new_column_names` is not None and is found in
5190
- dataframe.
5191
5152
5192
5153
.. # noqa: DAR402
5193
5154
"""
5194
- df = df .copy ()
5195
-
5196
5155
check ("column_name" , column_name , [str ])
5156
+ check ("string_function" , string_function , [str ])
5197
5157
check_column (df , [column_name ])
5198
5158
5199
- # new_column_names should not already exist in the dataframe
5200
- if new_column_names :
5201
- check ("new_column_names" , new_column_names , [list , str ])
5202
- if isinstance (new_column_names , str ):
5203
- check_column (df , [new_column_names ], present = False )
5204
- else :
5205
- check_column (df , new_column_names , present = False )
5206
-
5207
- if merge_frame :
5208
- check ("merge_frame" , merge_frame , [bool ])
5209
-
5210
5159
pandas_string_methods = [
5211
5160
func .__name__
5212
5161
for _ , func in inspect .getmembers (pd .Series .str , inspect .isfunction )
5213
5162
if not func .__name__ .startswith ("_" )
5214
5163
]
5215
5164
5216
- if not string_function :
5217
- return df
5218
-
5219
5165
if string_function not in pandas_string_methods :
5220
5166
raise KeyError (f"{ string_function } is not a Pandas string method." )
5221
5167
5222
- if string_function == "extractall" and merge_frame :
5223
- # create unique indices
5224
- # comes in handy for executing joins if there are
5225
- # duplicated indices in the original dataframe
5226
- df = df .set_index (np .arange (len (df )), append = True ) # extra_index_line
5227
-
5228
5168
result = getattr (df [column_name ].str , string_function )(** kwargs )
5229
5169
5230
- # TODO: Support for str.cat with `join` parameter
5231
- # need a robust way to handle the results
5232
- # if there is a `join` parameter, as this could create more
5233
- # or less rows with varying indices or even duplicate indices
5234
-
5235
- return _process_text (
5236
- result ,
5237
- df = df ,
5238
- column_name = column_name ,
5239
- new_column_names = new_column_names ,
5240
- merge_frame = merge_frame ,
5241
- )
5170
+ if isinstance (result , pd .DataFrame ):
5171
+ raise ValueError (
5172
+ """
5173
+ The outcome of the processed text is a DataFrame,
5174
+ which is not supported in `process_text`.
5175
+ """
5176
+ )
5177
+
5178
+ return df .assign (** {column_name : result })
5242
5179
5243
5180
5244
5181
@pf .register_dataframe_method
@@ -6715,6 +6652,18 @@ def conditional_join(
6715
6652
Join on just equality is also possible, but should be avoided -
6716
6653
Pandas merge/join is more efficient::
6717
6654
6655
+ df1
6656
+ col_a col_b
6657
+ 0 1 A
6658
+ 1 2 B
6659
+ 2 3 C
6660
+
6661
+ df2
6662
+ col_a col_c
6663
+ 0 0 Z
6664
+ 1 2 X
6665
+ 2 3 Y
6666
+
6718
6667
df1.conditional_join(
6719
6668
df2,
6720
6669
('col_a', 'col_a', '=='),
0 commit comments