[ENH] Process text #878

Merged
merged 34 commits on Oct 2, 2021
Changes from all commits (34 commits)
86cd0a9
Merge pull request #1 from pyjanitor-devs/dev
samukweku Apr 23, 2021
7ee2e19
Merge branch 'pyjanitor-devs:dev' into dev
samukweku May 24, 2021
32be96c
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Jun 5, 2021
513ef04
updates
samukweku Jul 26, 2021
f3d9b11
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 1, 2021
4f98e9d
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 15, 2021
51886ca
remove _process_text in utils
samukweku Aug 17, 2021
e41aa25
linting
samukweku Aug 17, 2021
37af6a6
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 19, 2021
facb52c
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 19, 2021
5f3c8e3
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 20, 2021
057c39b
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 20, 2021
cadd762
updates
samukweku Aug 20, 2021
2370b1c
lint fixes
samukweku Aug 20, 2021
c25235a
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Aug 22, 2021
7b19168
Merge branch 'dev' of https://github.com/samukweku/pyjanitor into pro…
samukweku Aug 22, 2021
5a90734
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Sep 2, 2021
7af469e
Merge branch 'dev' of https://github.com/samukweku/pyjanitor into pro…
samukweku Sep 2, 2021
d0fb585
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Sep 5, 2021
6e2e166
Merge branch 'dev' of https://github.com/samukweku/pyjanitor into pro…
samukweku Sep 5, 2021
8bb8709
fix linting
samukweku Sep 5, 2021
cdef368
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Sep 5, 2021
ccbab57
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Sep 12, 2021
5c5a14c
Merge branch 'dev' of https://github.com/samukweku/pyjanitor into pro…
Sep 12, 2021
d805199
add missing dataframes to conditional_join example
samukweku Sep 13, 2021
af07add
minor updates
samukweku Sep 13, 2021
4793560
add better error message for type check in conditional_join
samukweku Sep 14, 2021
b6562f4
updates for not equal
samukweku Sep 14, 2021
ac9ebd1
updates
samukweku Sep 14, 2021
f7da06c
updates
samukweku Sep 14, 2021
274c963
update changelog
samukweku Sep 14, 2021
81503e7
update to tests
samukweku Sep 14, 2021
c4a47ba
Merge branch 'pyjanitor-devs:dev' into dev
samukweku Oct 2, 2021
932fed8
Merge branch 'dev' of https://github.com/samukweku/pyjanitor into pro…
samukweku Oct 2, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@
- [ENH] Deprecate `aggfunc` from `pivot_wider`; aggregation can be chained with pandas' `groupby`.
- [BUG] Fix conditional join issue for multiple conditions, where pd.eval fails to evaluate if numexpr is installed. #898 @samukweku
- [ENH] Added `case_when` to handle multiple conditionals and replacement values. Issue #736. @robertmitchellv
- [ENH] Deprecate `new_column_names` and `merge_frame` from `process_text`. Only existing columns are supported. @samukweku


## [v0.21.1] - 2021-08-29
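For context on the `process_text` deprecation noted in the changelog above, here is a minimal sketch of the call style that remains after this change, using the docstring's own example data. The fallback via pandas `assign` for creating a new column is an assumption on my part; the updated docstring points to pyjanitor's `transform_columns` for the same purpose, but its signature is not shown in this diff:

    import pandas as pd
    import janitor  # noqa: F401  (registers the DataFrame methods)

    df = pd.DataFrame(
        {"text": ["Ragnar", "sammywemmy", "ginger"], "code": [1, 2, 3]}
    )

    # Supported: modify the existing column in place.
    lowered = df.process_text(column_name="text", string_function="lower")

    # No longer supported: new_column_names= / merge_frame= are deprecated.
    # To keep "text" untouched and add a new column, plain pandas also works:
    with_new_column = df.assign(text_lower=df["text"].str.lower())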
111 changes: 30 additions & 81 deletions janitor/functions.py
@@ -48,7 +48,6 @@
_currency_column_to_numeric,
_data_checks_pivot_longer,
_data_checks_pivot_wider,
_process_text,
_replace_empty_string_with_none,
_replace_original_empty_string_with_none,
_select_columns,
@@ -5060,21 +5059,17 @@ def expand_grid(
def process_text(
df: pd.DataFrame,
column_name: str,
new_column_names: Optional[Union[str, list]] = None,
merge_frame: Optional[bool] = False,
string_function: Optional[str] = None,
string_function: str,
**kwargs: str,
) -> pd.DataFrame:
"""
Apply a Pandas string method to an existing column and return a dataframe.

This function aims to make string cleaning easy, while chaining,
by simply passing the string method name to the ``process_text`` function.
This modifies an existing column and can also be used to create a new
column.
This modifies an existing column; it does not create a new column.
New columns can be created via pyjanitor's `transform_columns`.

.. note:: In versions < 0.20.11, this function did not support the
creation of new columns.

A list of all the string methods in Pandas can be accessed `here
<https://pandas.pydata.org/docs/user_guide/text.html#method-summary>`__.
@@ -5086,10 +5081,10 @@ def process_text(
import pandas as pd
import janitor as jn

df = pd.DataFrame({"text" : ["Ragnar",
"sammywemmy",
"ginger"],
"code" : [1, 2, 3]})
         text  code
0      Ragnar     1
1  sammywemmy     2
2      ginger     3

df.process_text(column_name = "text",
string_function = "lower")
@@ -5114,22 +5109,6 @@ def process_text(
1 NaN 2
2 NaN 3

A new column can be created, leaving the existing column unmodified::

df.process_text(
column_name = "text",
new_column_names = "new_text",
string_function = "extract",
pat = r"(ag)",
flags = re.IGNORECASE
)

         text  code new_text
0      Ragnar     1       ag
1  sammywemmy     2      NaN
2      ginger     3      NaN


Functional usage syntax:

.. code-block:: python
@@ -5141,8 +5120,6 @@ def process_text(
df = jn.process_text(
df = df,
column_name,
new_column_names = None/string/list_of_strings,
merge_frame = True/False,
string_function = "string_func_name_here",
kwargs
)
Expand All @@ -5158,8 +5135,6 @@ def process_text(
pd.DataFrame(...)
.process_text(
column_name,
new_column_names = None/string/list_of_strings,
merge_frame = True/False
string_function = "string_func_name_here",
kwargs
)
@@ -5168,77 +5143,39 @@ def process_text(

:param df: A pandas dataframe.
:param column_name: String column to be operated on.
:param new_column_names: Name(s) to assign to the new column(s) created
from the text processing. `new_column_names` can be a string, if
the result of the text processing is a Series or string; if the
result of the text processing is a dataframe, then `new_column_names`
is treated as a prefix for each of the columns in the new dataframe.
`new_column_names` can also be a list of strings to act as new
column names for the new dataframe. The existing `column_name`
stays unmodified if `new_column_names` is not None.
:param merge_frame: This comes into play if the result of the text
processing is a dataframe. If `True`, the resulting dataframe
will be merged with the original dataframe, else the resulting
dataframe, not the original dataframe, will be returned.
:param string_function: Pandas string method to be applied.
:param kwargs: Keyword arguments for parameters of the `string_function`.
:returns: A pandas dataframe with modified column(s).
:raises KeyError: if ``string_function`` is not a Pandas string method.
:raises TypeError: if wrong ``arg`` or ``kwarg`` is supplied.
:raises TypeError: if the wrong ``kwarg`` is supplied.
:raises ValueError: if `column_name` not found in dataframe.
:raises ValueError: if `new_column_names` is not None and is found in
dataframe.

.. # noqa: DAR402
"""
df = df.copy()

check("column_name", column_name, [str])
check("string_function", string_function, [str])
check_column(df, [column_name])

# new_column_names should not already exist in the dataframe
if new_column_names:
check("new_column_names", new_column_names, [list, str])
if isinstance(new_column_names, str):
check_column(df, [new_column_names], present=False)
else:
check_column(df, new_column_names, present=False)

if merge_frame:
check("merge_frame", merge_frame, [bool])

pandas_string_methods = [
func.__name__
for _, func in inspect.getmembers(pd.Series.str, inspect.isfunction)
if not func.__name__.startswith("_")
]

if not string_function:
return df

if string_function not in pandas_string_methods:
raise KeyError(f"{string_function} is not a Pandas string method.")

if string_function == "extractall" and merge_frame:
# create unique indices
# comes in handy for executing joins if there are
# duplicated indices in the original dataframe
df = df.set_index(np.arange(len(df)), append=True) # extra_index_line

result = getattr(df[column_name].str, string_function)(**kwargs)

# TODO: Support for str.cat with `join` parameter
# need a robust way to handle the results
# if there is a `join` parameter, as this could create more
# or less rows with varying indices or even duplicate indices

return _process_text(
result,
df=df,
column_name=column_name,
new_column_names=new_column_names,
merge_frame=merge_frame,
)
if isinstance(result, pd.DataFrame):
raise ValueError(
"""
The outcome of the processed text is a DataFrame,
which is not supported in `process_text`.
"""
)

return df.assign(**{column_name: result})
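
As a quick illustration of the new guard at the end of `process_text` (a sketch, assuming the same example frame as in the docstring): string methods that return a Series still replace the column, while methods that return a DataFrame, such as `extractall`, are now rejected with the ValueError above instead of being merged or returned:

    import pandas as pd
    import janitor  # noqa: F401

    df = pd.DataFrame(
        {"text": ["Ragnar", "sammywemmy", "ginger"], "code": [1, 2, 3]}
    )

    # Series result: the "text" column is replaced.
    df.process_text(column_name="text", string_function="lower")

    # DataFrame result: rejected by the new check.
    try:
        df.process_text(
            column_name="text", string_function="extractall", pat=r"(ag)"
        )
    except ValueError as err:
        print(err)  # explains that DataFrame outcomes are not supported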


@pf.register_dataframe_method
@@ -6715,6 +6652,18 @@ def conditional_join(
Join on just equality is also possible, but should be avoided -
Pandas merge/join is more efficient::

df1
   col_a col_b
0      1     A
1      2     B
2      3     C

df2
   col_a col_c
0      0     Z
1      2     X
2      3     Y

df1.conditional_join(
df2,
('col_a', 'col_a', '=='),
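Since the docstring above recommends plain pandas for pure equality joins, here is a rough sketch of the merge counterpart for `df1` and `df2` (assuming the default inner join; the exact output shape of `conditional_join` may differ, for example in how overlapping column names are presented):

    import pandas as pd

    df1 = pd.DataFrame({"col_a": [1, 2, 3], "col_b": ["A", "B", "C"]})
    df2 = pd.DataFrame({"col_a": [0, 2, 3], "col_c": ["Z", "X", "Y"]})

    # The equality-only condition ('col_a', 'col_a', '==') maps to an ordinary merge.
    df1.merge(df2, on="col_a", how="inner")
    #    col_a col_b col_c
    # 0      2     B     X
    # 1      3     C     Y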
135 changes: 27 additions & 108 deletions janitor/utils.py
@@ -1922,7 +1922,7 @@ def _computations_pivot_wider(
# check dtype of `names_from` is string
names_from_all_strings = df.filter(names_from).agg(is_string_dtype).all()

if names_sort is True:
if names_sort:
# Categorical dtypes created only for `names_from`
# since that is what will become the new column names
dtypes = {
Expand All @@ -1941,7 +1941,7 @@ def _computations_pivot_wider(
df = df.reorder_levels(order=levels_order, axis="columns")

# an empty df is likely because
# there are no `values_from`
# there is no `values_from`
if any((df.empty, flatten_levels is False)):
return df

@@ -1953,7 +1953,7 @@
else:
df.columns = df.columns.astype(str)

if names_sep is not None and (isinstance(df.columns, pd.MultiIndex)):
if (names_sep is not None) and (isinstance(df.columns, pd.MultiIndex)):
df.columns = df.columns.map(names_sep.join)

if names_glue:
@@ -2460,108 +2460,6 @@ def _column_sel_dispatch(columns_to_select, df):  # noqa: F811
return filtered_columns


@functools.singledispatch
def _process_text(result: str, df, column_name, new_column_names, merge_frame):
"""
Base function for `process_text` when `result` is of ``str`` type.
"""
if new_column_names:
return df.assign(**{new_column_names: result})
df[column_name] = result
return df


@_process_text.register
def _sub_process_text(
result: pd.Series, df, column_name, new_column_names, merge_frame
):
"""
Base function for `process_text` when `result` is of ``pd.Series`` type.
"""
if new_column_names:
return df.assign(**{new_column_names: result})
df[column_name] = result
return df


@_process_text.register # noqa: F811
def _sub_process_text( # noqa: F811
result: pd.DataFrame, df, column_name, new_column_names, merge_frame
): # noqa: F811
"""
Base function for `process_text` when `result` is of ``pd.DataFrame`` type.
"""
result = _process_text_result_is_frame(new_column_names, result)
if not merge_frame:
return result
return _process_text_result_MultiIndex(result.index, result, df)


@functools.singledispatch
def _process_text_result_is_frame(new_column_names: str, result):
"""
Function to modify `result` columns from `process_text` if
`result` is a dataframe. Applies only if `new_column_names`
is a string type.
"""
if new_column_names:
return result.add_prefix(new_column_names)
return result


@_process_text_result_is_frame.register
def _sub_process_text_result_is_frame(new_column_names: list, result):
"""
Function to modify `result` columns from `process_text` if
`result` is a dataframe. Applies only if `new_column_names`
is a list type.
"""
if len(new_column_names) != len(result.columns):
raise ValueError(
"""
The length of `new_column_names` does not
match the number of columns in the new
dataframe generated from the text processing.
"""
)
result.columns = new_column_names
return result


@functools.singledispatch
def _process_text_result_MultiIndex(index: pd.Index, result, df):
"""
Function to modify `result` columns from `process_text` if
`result` is a dataframe and it has a single Index.
"""
return pd.concat([df, result], axis="columns")


@_process_text_result_MultiIndex.register
def _sub_process_text_result_MultiIndex(index: pd.MultiIndex, result, df):
"""
Function to modify `result` columns from `process_text` if
`result` is a dataframe and it has a MultiIndex.
At the moment, this function is primarily to cater for `str.extractall`,
since at the moment,
this is the only string method that returns a MultiIndex.
The function may be modified,
if another string function that returns a MultIndex
is added to Pandas string methods.

For this function, `df` has been converted to a MultiIndex,
with the extra index added to create unique indices.
This comes in handy when merging back the dataframe,
especially if `result` returns duplicate indices.
"""
result = result.reset_index(level="match")
df = df.join(result, how="outer")
# droplevel gets rid of the extra index added at the start
# (# extra_index_line)
df = df.droplevel(-1).set_index("match", append=True)
return df


class JOINOPERATOR(Enum):
"""
List of operators used in conditional_join.
@@ -2794,20 +2692,41 @@ def _conditional_join_type_check(
Strings can only be compared
on the equal(`==`) operator.
"""
error_msg_dtype = """
The left column ({l_name}) is a {dtype} type,
while the right column ({r_name}) is not.
Kindly ensure both columns are the same dtype.
"""
if is_string_dtype(left_column):
if not is_string_dtype(right_column):
raise ValueError(error_msg)
mapper = {
"l_name": left_column.name,
"dtype": "string",
"r_name": right_column.name,
}
raise ValueError(error_msg_dtype.format(**mapper))
if op != JOINOPERATOR.STRICTLY_EQUAL.value:
raise ValueError(error_msg_string)
return None
if is_numeric_dtype(left_column):
if not is_numeric_dtype(right_column):
raise ValueError(error_msg)
mapper = {
"l_name": left_column.name,
"dtype": "numeric",
"r_name": right_column.name,
}
raise ValueError(error_msg_dtype.format(**mapper))
return None
if is_datetime64_dtype(left_column):
if not is_datetime64_dtype(right_column):
raise ValueError(error_msg)
mapper = {
"l_name": left_column.name,
"dtype": "datetime",
"r_name": right_column.name,
}
raise ValueError(error_msg_dtype.format(**mapper))
return None
raise ValueError(error_msg)
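
To show what the more specific dtype-mismatch messages look like from the caller's side, a small sketch (column names and data here are illustrative, and it assumes validation reaches the dtype check before any computation):

    import pandas as pd
    import janitor  # noqa: F401

    left = pd.DataFrame({"value": [1, 2, 3]})
    right = pd.DataFrame({"val": ["1", "2", "3"]})  # string dtype, not numeric

    try:
        left.conditional_join(right, ("value", "val", ">="))
    except ValueError as err:
        # The new error_msg_dtype names the offending columns and the expected dtype,
        # e.g. "The left column (value) is a numeric type,
        #       while the right column (val) is not.
        #       Kindly ensure both columns are the same dtype."
        print(err)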


def _interval_ranges(indices: np.ndarray, right: np.ndarray) -> np.ndarray: