pyjanitor-devs · nvamsikrishna05 · Aug 20, 2021 · Apr 23, 2021 · May 24, 2021 · Jun 5, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@
 -   [ENH] `complete` function now uses variable args (*args) - @samukweku
 -   [EHN] Set `expand_column`'s `sep` default is `"|"`, same to `pandas.Series.str.get_dummies`. Issue #876. @Zeroto521
 -   [ENH] Deprecate `limit` from fill_direction. fill_direction now uses kwargs. @samukweku
+-   [ENH] Added `conditional_join` function that supports joins on non-equi operators. @samukweku
 
 ## [v0.21.0] - 2021-07-16
 
@@ -227,7 +228,6 @@ who have helped make `pyjanitor`
 the package that it is today.
 
 [Unreleased]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.21.0...HEAD
-
 [v0.21.0]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.20.14...v0.21.0
 
 [v0.20.14]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.20.13...v0.20.14

diff --git a/docs/reference/general_functions.rst b/docs/reference/general_functions.rst
@@ -95,3 +95,4 @@ Other
     complete
     pivot_longer
     pivot_wider
+    conditional_join
diff --git a/janitor/functions.py b/janitor/functions.py
@@ -58,6 +58,9 @@
     check,
     check_column,
     deprecated_alias,
+    _conditional_join_preliminary_checks,
+    _conditional_join_compute,
+    _cond_join_suffixes,
 )
 
 
@@ -6476,3 +6479,321 @@ def pivot_wider(
     )
 
     return df
+
+
+@pf.register_dataframe_method
+def conditional_join(
+    df: pd.DataFrame,
+    right: Union[pd.DataFrame, pd.Series],
+    *conditions,
+    how: str = "inner",
+    sort_by_appearance: bool = False,
+    suffixes=("_x", "_y"),
+) -> pd.DataFrame:
+    """Join `df` to `right` on equality and/or inequality conditions.
+
+    This is a convenience function that operates similarly to ``pd.merge``,
+    but allows joins on inequality operators, or a combination of equi
+    and non-equi joins.
+
+    If the join is solely on equality, `pd.merge` function
+    is more efficient and should be used instead.
+    If you are interested in nearest joins, or rolling joins,
+    `pd.merge_asof` covers that. There is also the IntervalIndex,
+    which can be more efficient for range joins, if the intervals
+    do not overlap.
+
+    This function returns rows, if any, where values from `df` meet the
+    condition(s) for values from `right`. The conditions are passed in
+    as a variable argument of tuples, where the tuple is of
+    the form ``(left_on, right_on, op)``; `left_on` is the column
+    label from `df`, `right_on` is the column label from `right`,
+    while `op` is the operator.
+
+    The operator can be any of `==`, `!=`, `<=`, `<`, `>=`, `>`.
+
+    If the join operator is a non-equi operator, a binary search is used
+    to get the relevant rows; this avoids a cartesian join, and makes the
+    process less memory intensive. If it is an equality operator, it simply
+    uses pandas' `merge` or `get_indexer_for` method to retrieve the relevant
+    rows.
+
+    The join is done only on the columns.
+    MultiIndex columns are not supported.
+
+    Only numeric, date and string columns are supported.
+
+    If joining on strings, only the `==` operator is supported.
+
+    Only `inner`, `left`, and `right` joins are supported.
+
+
+    Example:
+
+    df1::
+
+            id  value_1
+        0   1        2
+        1   1        5
+        2   1        7
+        3   2        1
+        4   2        3
+        5   3        4
+
+
+    df2::
+
+            id  value_2A  value_2B
+        0   1         0         1
+        1   1         3         5
+        2   1         7         9
+        3   1        12        15
+        4   2         0         1
+        5   2         2         4
+        6   2         3         6
+        7   3         1         3
+
+    Join on equi and non-equi operators is possible::
+
+        df1.conditional_join(
+                df2,
+                ('id', 'id', '=='),
+                ('value_1', 'value_2A', '>='),
+                ('value_1', 'value_2B', '<='),
+                sort_by_appearance = True
+            )
+
+            id_x  value_1  id_y  value_2A  value_2B
+        0     1        5     1         3         5
+        1     1        7     1         7         9
+        2     2        1     2         0         1
+        3     2        3     2         2         4
+        4     2        3     2         3         6
+
+    The default join is `inner`. Left and right joins are supported as well::
+
+        df1.conditional_join(
+                df2,
+                ('id', 'id', '=='),
+                ('value_1', 'value_2A', '>='),
+                ('value_1', 'value_2B', '<='),
+                how='left',
+                sort_by_appearance = True
+            )
+
+            id_x  value_1  id_y  value_2A  value_2B
+        0     1        2   NaN       NaN       NaN
+        1     1        5   1.0       3.0       5.0
+        2     1        7   1.0       7.0       9.0
+        3     2        1   2.0       0.0       1.0
+        4     2        3   2.0       2.0       4.0
+        5     2        3   2.0       3.0       6.0
+        6     3        4   NaN       NaN       NaN
+
+
+        df1.conditional_join(
+                df2,
+                ('id', 'id', '=='),
+                ('value_1', 'value_2A', '>='),
+                ('value_1', 'value_2B', '<='),
+                how='right',
+                sort_by_appearance = True
+            )
+
+            id_x  value_1  id_y  value_2A  value_2B
+        0   NaN      NaN     1         0         1
+        1   1.0      5.0     1         3         5
+        2   1.0      7.0     1         7         9
+        3   NaN      NaN     1        12        15
+        4   2.0      1.0     2         0         1
+        5   2.0      3.0     2         2         4
+        6   2.0      3.0     2         3         6
+        7   NaN      NaN     3         1         3
+
+
+    Join on just the non-equi joins is also possible::
+
+        df1.conditional_join(
+                df2,
+                ('value_1', 'value_2A', '>'),
+                ('value_1', 'value_2B', '<'),
+                how='inner',
+                sort_by_appearance = True
+            )
+
+            id_x  value_1  id_y  value_2A  value_2B
+        0     1        2     3         1         3
+        1     1        5     2         3         6
+        2     2        3     2         2         4
+        3     3        4     1         3         5
+        4     3        4     2         3         6
+
+    The default for the `suffixes` parameter is ``(_x, _y)``.
+    One of the suffixes can be set as ``None``;
+    this avoids a suffix on the columns from the
+    relevant dataframe::
+
+        df1.conditional_join(
+                df2,
+                ('value_1', 'value_2A', '>'),
+                ('value_1', 'value_2B', '<'),
+                how='inner',
+                sort_by_appearance = True,
+                suffixes = (None, '_y')
+            )
+
+            id  value_1  id_y  value_2A  value_2B
+        0   1        2     3         1         3
+        1   1        5     2         3         6
+        2   2        3     2         2         4
+        3   3        4     1         3         5
+        4   3        4     2         3         6
+
+    Join on just equality is also possible, but should be avoided -
+    Pandas merge/join is more efficient::
+
+        df1.conditional_join(
+                df2,
+                ('col_a', 'col_a', '=='),
+                sort_by_appearance = True
+            )
+
+             col_a_x col_b  col_a_y col_c
+        0        2     B        2     X
+        1        3     C        3     Y
+
+    Join on not equal -> ``!=`` ::
+
+        df1.conditional_join(
+                df2,
+                ('col_a', 'col_a', '!='),
+                sort_by_appearance = True
+            )
+
+             col_a_x col_b  col_a_y col_c
+        0        1     A        0     Z
+        1        1     A        2     X
+        2        1     A        3     Y
+        3        2     B        0     Z
+        4        2     B        3     Y
+        5        3     C        0     Z
+        6        3     C        2     X
+
+
+    If the order from `right` is not important,
+    `sort_by_appearance` can be set to ``False``
+    (this is the default)::
+
+        df1.conditional_join(
+                df2,
+                ('col_a', 'col_a', '>'),
+                sort_by_appearance = False
+            )
+
+             col_a_x col_b  col_a_y col_c
+        0        1     A        0     Z
+        1        2     B        0     Z
+        2        3     C        0     Z
+        3        3     C        2     X
+
+
+    .. note:: If `df` or `right` has labeled indices,
+              they will be lost after the merge,
+              and replaced with an integer index.
+              If you wish to preserve the labeled indices,
+              you can convert them to columns
+              before running the conditional join.
+
+    .. note:: All the columns from `df` and `right`
+              are returned in the final output.
+
+    Functional usage syntax:
+
+    .. code-block:: python
+
+        import pandas as pd
+        import janitor as jn
+
+        df = pd.DataFrame(...)
+        right = pd.DataFrame(...)
+
+        df = jn.conditional_join(
+                df,
+                right,
+                *conditions,
+                sort_by_appearance = True/False,
+                suffixes = ("_x", "_y"),
+                )
+
+    Method chaining syntax:
+
+    .. code-block:: python
+
+        df = df.conditional_join(
+                right,
+                *conditions,
+                sort_by_appearance = True/False,
+                suffixes = ("_x", "_y"),
+                )
+
+
+    :param df: A Pandas dataframe.
+    :param right: Named Series or DataFrame to join to.
+    :param conditions: Variable argument of tuple(s) of the form
+        ``(left_on, right_on, op)``, where `left_on` is the column
+        label from `df`, `right_on` is the column label from `right`,
+        while `op` is the operator. The operator can be any of
+        `==`, `!=`, `<=`, `<`, `>=`, `>`.
+    :param how: Indicates the type of join to be performed.
+        It can be one of `inner`, `left`, `right`.
+        Full join is not supported. Defaults to `inner`.
+    :param sort_by_appearance: Default is `False`. If True,
+        values from `right` that meet the join condition will be returned
+        in the final dataframe in the same order that they were before the
+        join.
+    :param suffixes: tuple, default is ``(_x, _y)``.
+        A sequence of length 2, where each element is optionally a string,
+        indicating the suffix to add to the overlapping column names
+        in `df` and `right`. Pass a value of ``None``
+        instead of a string to indicate that the column name
+        from `df` or `right` should be left as-is, with no suffix.
+        At least one of the values must not be ``None``.
+    :returns: A pandas DataFrame of the two merged Pandas objects.
+    :raises ValueError: if columns from `df` or `right` is a MultiIndex.
+    :raises ValueError: if `right` is an unnamed Series.
+    :raises ValueError: if condition in *conditions is not a tuple.
+    :raises ValueError: if condition is not length 3.
+    :raises ValueError: if `left_on` and `right_on` in condition are not
+        both numeric, or string, or datetime.
+
+
+    .. # noqa: DAR402
+    """
+
+    (
+        df,
+        right,
+        conditions,
+        how,
+        sort_by_appearance,
+        suffixes,
+    ) = _conditional_join_preliminary_checks(
+        df,
+        right,
+        conditions,
+        how,
+        sort_by_appearance,
+        suffixes,
+    )
+
+    df, right, conditions = _cond_join_suffixes(
+        df, right, conditions, suffixes
+    )
+
+    # the numeric indexes play a crucial part in position tracking
+    df.index = np.arange(len(df))
+    right.index = np.arange(len(right))
+
+    return _conditional_join_compute(
+        df, right, conditions, how, sort_by_appearance
+    )
diff --git a/janitor/testing_utils/strategies.py b/janitor/testing_utils/strategies.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 from hypothesis import strategies as st
-from hypothesis.extra.pandas import column, data_frames, range_indexes
+from hypothesis.extra.pandas import column, data_frames, range_indexes, series
 
 
 def nulldf_strategy():
@@ -69,3 +69,36 @@ def categoricaldf_strategy():
 
 def names_strategy():
     return st.lists(elements=st.sampled_from(names))
+
+
+def conditional_df():
+    """Strategy yielding five-column dataframes for tests_conditional_join."""
+    return data_frames(
+        [
+            column(name="A", dtype=int),
+            column(name="B", elements=st.floats(allow_nan=True)),
+            column(name="C", elements=st.text(max_size=20)),
+            column(name="D", dtype=bool),
+            column(name="E", dtype="datetime64[ns]"),
+        ]
+    )
+
+
+def conditional_series():
+    """Strategy yielding integer Series for tests_conditional_join."""
+    return series(dtype=int)
+
+
+def conditional_right():
+    """Strategy yielding seven-column dataframes for tests_conditional_join."""
+    return data_frames(
+        [
+            column(name="Integers", dtype=int),
+            column(name="Numeric", elements=st.floats(allow_nan=True)),
+            column(name="Floats", elements=st.floats(max_value=20)),
+            column(name="Strings", dtype=str),
+            column(name="Booleans", dtype=np.bool_),
+            column(name="Dates", dtype="datetime64[ns]"),
+            column(name="Dates_Right", dtype="datetime64[ns]"),
+        ]
+    )