|
58 | 58 | check,
|
59 | 59 | check_column,
|
60 | 60 | deprecated_alias,
|
| 61 | + _conditional_join_preliminary_checks, |
| 62 | + _conditional_join_compute, |
| 63 | + _cond_join_suffixes, |
61 | 64 | )
|
62 | 65 |
|
63 | 66 |
|
@@ -6476,3 +6479,321 @@ def pivot_wider(
|
6476 | 6479 | )
|
6477 | 6480 |
|
6478 | 6481 | return df
|
| 6482 | + |
| 6483 | + |
@pf.register_dataframe_method
def conditional_join(
    df: pd.DataFrame,
    right: Union[pd.DataFrame, pd.Series],
    *conditions,
    how: str = "inner",
    sort_by_appearance: bool = False,
    suffixes=("_x", "_y"),
) -> pd.DataFrame:
    """
    Join two pandas objects on equality and/or inequality conditions.

    This is a convenience function that operates similarly to ``pd.merge``,
    but allows joins on inequality operators, or a combination of equi
    and non-equi joins.

    If the join is solely on equality, `pd.merge` function
    is more efficient and should be used instead.
    If you are interested in nearest joins, or rolling joins,
    `pd.merge_asof` covers that. There is also the IntervalIndex,
    which can be more efficient for range joins, if the intervals
    do not overlap.

    This function returns rows, if any, where values from `df` meet the
    condition(s) for values from `right`. The conditions are passed in
    as a variable argument of tuples, where the tuple is of
    the form ``(left_on, right_on, op)``; `left_on` is the column
    label from `df`, `right_on` is the column label from `right`,
    while `op` is the operator.

    The operator can be any of `==`, `!=`, `<=`, `<`, `>=`, `>`.

    If the join operator is a non-equi operator, a binary search is used
    to get the relevant rows; this avoids a cartesian join, and makes the
    process less memory intensive. If it is an equality operator, it simply
    uses pandas' `merge` or `get_indexer_for` method to retrieve the relevant
    rows.

    The join is done only on the columns.
    MultiIndex columns are not supported.

    Only numeric, date and string columns are supported.

    If joining on strings, only the `==` operator is supported.

    Only `inner`, `left`, and `right` joins are supported.

    .. note:: Because `conditions` is a variable positional argument,
        `right` must be passed positionally, ahead of the condition
        tuples; keyword arguments (`how`, `sort_by_appearance`,
        `suffixes`) come last.

    Example :

    df1::

           id  value_1
        0   1        2
        1   1        5
        2   1        7
        3   2        1
        4   2        3
        5   3        4

    df2::

           id  value_2A  value_2B
        0   1         0         1
        1   1         3         5
        2   1         7         9
        3   1        12        15
        4   2         0         1
        5   2         2         4
        6   2         3         6
        7   3         1         3

    Join on equi and non-equi operators is possible::

        df1.conditional_join(
            df2,
            ('id', 'id', '=='),
            ('value_1', 'value_2A', '>='),
            ('value_1', 'value_2B', '<='),
            sort_by_appearance = True
        )

           id_x  value_1  id_y  value_2A  value_2B
        0     1        5     1         3         5
        1     1        7     1         7         9
        2     2        1     2         0         1
        3     2        3     2         2         4
        4     2        3     2         3         6

    The default join is `inner`. left and right joins are supported as well::

        df1.conditional_join(
            df2,
            ('id', 'id', '=='),
            ('value_1', 'value_2A', '>='),
            ('value_1', 'value_2B', '<='),
            how='left',
            sort_by_appearance = True
        )

           id_x  value_1  id_y  value_2A  value_2B
        0     1        2   NaN       NaN       NaN
        1     1        5   1.0       3.0       5.0
        2     1        7   1.0       7.0       9.0
        3     2        1   2.0       0.0       1.0
        4     2        3   2.0       2.0       4.0
        5     2        3   2.0       3.0       6.0
        6     3        4   NaN       NaN       NaN

        df1.conditional_join(
            df2,
            ('id', 'id', '=='),
            ('value_1', 'value_2A', '>='),
            ('value_1', 'value_2B', '<='),
            how='right',
            sort_by_appearance = True
        )

           id_x  value_1  id_y  value_2A  value_2B
        0   NaN      NaN     1         0         1
        1   1.0      5.0     1         3         5
        2   1.0      7.0     1         7         9
        3   NaN      NaN     1        12        15
        4   2.0      1.0     2         0         1
        5   2.0      3.0     2         2         4
        6   2.0      3.0     2         3         6
        7   NaN      NaN     3         1         3

    Join on just the non-equi joins is also possible::

        df1.conditional_join(
            df2,
            ('value_1', 'value_2A', '>'),
            ('value_1', 'value_2B', '<'),
            how='inner',
            sort_by_appearance = True
        )

           id_x  value_1  id_y  value_2A  value_2B
        0     1        2     3         1         3
        1     1        5     2         3         6
        2     2        3     2         2         4
        3     3        4     1         3         5
        4     3        4     2         3         6

    The default for the `suffixes` parameter is ``(_x, _y)``,
    One of the suffixes can be set as ``None``;
    this avoids a suffix on the columns from the
    relevant dataframe::

        df1.conditional_join(
            df2,
            ('value_1', 'value_2A', '>'),
            ('value_1', 'value_2B', '<'),
            how='inner',
            sort_by_appearance = True,
            suffixes = (None, '_y')
        )

           id  value_1  id_y  value_2A  value_2B
        0   1        2     3         1         3
        1   1        5     2         3         6
        2   2        3     2         2         4
        3   3        4     1         3         5
        4   3        4     2         3         6

    The remaining examples use a different pair of frames, each of which
    has a ``col_a`` column.

    Join on just equality is also possible, but should be avoided -
    Pandas merge/join is more efficient::

        df1.conditional_join(
            df2,
            ('col_a', 'col_a', '=='),
            sort_by_appearance = True
        )

           col_a_x col_b  col_a_y col_c
        0        2     B        2     X
        1        3     C        3     Y

    Join on not equal -> ``!=`` ::

        df1.conditional_join(
            df2,
            ('col_a', 'col_a', '!='),
            sort_by_appearance = True
        )

           col_a_x col_b  col_a_y col_c
        0        1     A        0     Z
        1        1     A        2     X
        2        1     A        3     Y
        3        2     B        0     Z
        4        2     B        3     Y
        5        3     C        0     Z
        6        3     C        2     X

    If the order from `right` is not important,
    `sort_by_appearance` can be set to ``False``
    (this is the default)::

        df1.conditional_join(
            df2,
            ('col_a', 'col_a', '>'),
            sort_by_appearance = False
        )

           col_a_x col_b  col_a_y col_c
        0        1     A        0     Z
        1        2     B        0     Z
        2        3     C        0     Z
        3        3     C        2     X

    .. note:: If `df` or `right` has labeled indices,
        it will be lost after the merge,
        and replaced with an integer index.
        If you wish to preserve the labeled indices,
        you can convert them to columns
        before running the conditional join.

    .. note:: All the columns from `df` and `right`
        are returned in the final output.

    Functional usage syntax:

    .. code-block:: python

        import pandas as pd
        import janitor as jn

        df = pd.DataFrame(...)
        right = pd.DataFrame(...)

        df = jn.conditional_join(
            df,
            right,
            *conditions,
            sort_by_appearance = True/False,
            suffixes = ("_x", "_y"),
        )

    Method chaining syntax:

    .. code-block:: python

        df = df.conditional_join(
            right,
            *conditions,
            sort_by_appearance = True/False,
            suffixes = ("_x", "_y"),
        )

    :param df: A Pandas dataframe.
    :param right: Named Series or DataFrame to join to.
    :param conditions: Variable argument of tuple(s) of the form
        ``(left_on, right_on, op)``, where `left_on` is the column
        label from `df`, `right_on` is the column label from `right`,
        while `op` is the operator. The operator can be any of
        `==`, `!=`, `<=`, `<`, `>=`, `>`.
    :param how: Indicates the type of join to be performed.
        It can be one of `inner`, `left`, `right`.
        Full join is not supported. Defaults to `inner`.
    :param sort_by_appearance: Default is `False`. If True,
        values from `right` that meet the join condition will be returned
        in the final dataframe in the same order that they were before the
        join.
    :param suffixes: tuple, default is ``(_x, _y)``.
        A sequence of length 2, where each element is optionally a string,
        indicating the suffix to add to the overlapping column names
        in `df` and `right`. Pass a value of ``None``
        instead of a string to indicate that the column name
        from `df` or `right` should be left as-is, with no suffix.
        At least one of the values must not be ``None``.
    :returns: A pandas DataFrame of the two merged Pandas objects.
    :raises ValueError: if columns from `df` or `right` is a MultiIndex.
    :raises ValueError: if `right` is an unnamed Series.
    :raises ValueError: if condition in *conditions is not a tuple.
    :raises ValueError: if condition is not length 3.
    :raises ValueError: if `left_on` and `right_on` in condition are not
        both numeric, or string, or datetime.


    .. # noqa: DAR402
    """

    # Validate/normalise every argument up front; the helper hands back
    # the (possibly converted) values in the same order they were passed.
    (
        df,
        right,
        conditions,
        how,
        sort_by_appearance,
        suffixes,
    ) = _conditional_join_preliminary_checks(
        df,
        right,
        conditions,
        how,
        sort_by_appearance,
        suffixes,
    )

    # Resolve clashing column labels via `suffixes`; the conditions are
    # returned as well since they may need to track renamed labels.
    df, right, conditions = _cond_join_suffixes(
        df, right, conditions, suffixes
    )

    # NOTE(review): the helpers above are defined elsewhere and may hand
    # back the caller's own objects unchanged. Rebind to shallow copies
    # so that resetting the index below can never clobber the index of
    # the frames the caller passed in (data is shared, not duplicated).
    df = df.copy(deep=False)
    right = right.copy(deep=False)

    # the numeric indexes play a crucial part in position tracking
    df.index = np.arange(len(df))
    right.index = np.arange(len(right))

    return _conditional_join_compute(
        df, right, conditions, how, sort_by_appearance
    )
0 commit comments