13
13
Hashable ,
14
14
Iterable ,
15
15
List ,
16
- NamedTuple ,
17
16
Optional ,
18
17
Pattern ,
19
18
Set ,
52
51
_replace_original_empty_string_with_none ,
53
52
_select_columns ,
54
53
_strip_underscores ,
55
- asCategorical ,
56
54
check ,
57
55
check_column ,
58
56
deprecated_alias ,
@@ -473,23 +471,6 @@ def get_dupes(
473
471
return df [dupes == True ] # noqa: E712
474
472
475
473
476
- def As_Categorical (
477
- categories : Optional [List ] = None ,
478
- order : Optional [str ] = None ,
479
- ) -> NamedTuple :
480
- """
481
- Helper function for `encode_categorical`. It makes creating the
482
- `categories` and `order` more explicit. Inspired by pd.NamedAgg.
483
- :param categories: list-like object to create new categorical column.
484
- :param order: string object that can be either "sort" or "appearance".
485
- If "sort", the `categories` argument will be sorted with np.sort;
486
- if "apperance", the `categories` argument will be used as is.
487
- :returns: A namedtuple of (`categories`, `order`).
488
- """
489
-
490
- return asCategorical (categories = categories , order = order )
491
-
492
-
493
474
@pf .register_dataframe_method
494
475
@deprecated_alias (columns = "column_names" )
495
476
def encode_categorical (
@@ -503,9 +484,6 @@ def encode_categorical(
503
484
Categories and order can be explicitly specified via the `kwargs` option, which is a
504
485
pairing of column name and a tuple of (categories, order).
505
486
506
- The `janitor.As_Categorical` function is provided to make it clearer what the arguments
507
- to the function are.
508
-
509
487
It is syntactic sugar around `pd.Categorical`.
510
488
511
489
This method does not mutate the original DataFrame.
@@ -594,21 +572,6 @@ def encode_categorical(
594
572
if the `order` is "sort", the categories argument is sorted in ascending order;
595
573
if `order` is ``None``, then the categories argument is applied unordered.
596
574
597
- The ``janitor.As_Categorical`` function can also be used to make clearer
598
- what the arguments to the function are::
599
-
600
- df = (pd.DataFrame(...)
601
- .encode_categorical(
602
- col1 = As_Categorical(
603
- categories = [3, 2, 1, 4],
604
- order = "appearance"
605
- ),
606
- col2 = As_Categorical(
607
- categories = ['a','d','c','b'],
608
- order = "sort"
609
- )
610
- )
611
- )
612
575
613
576
A User Warning will be generated if some or all of the unique values
614
577
in the column are not present in the provided `categories` argument.
@@ -617,7 +580,7 @@ def encode_categorical(
617
580
618
581
df = (pd.DataFrame(...)
619
582
.encode_categorical(
620
- col1 = As_Categorical (
583
+ col1 = (
621
584
categories = [4, 5, 6],
622
585
order = "appearance"
623
586
)
@@ -663,10 +626,9 @@ def encode_categorical(
663
626
df = jn.encode_categorical(
664
627
df,
665
628
col1 = (categories, order),
666
- col2 = jn.As_Categorical(
667
- categories = [values],
668
- order="sort"/"appearance"/None
669
- )
629
+ col2 = (categories = [values],
630
+ order="sort" # or "appearance" or None
631
+ )
670
632
)
671
633
672
634
Method chaining syntax:
@@ -684,31 +646,22 @@ def encode_categorical(
684
646
pd.DataFrame(...)
685
647
.encode_categorical(
686
648
col1 = (categories, order),
687
- col2 = jn.As_Categorical(
688
- categories = [values]/None,
689
- order="sort"/"appearance"/None
690
- )
649
+ col2 = (categories = [values]/None,
650
+ order="sort" # or "appearance" or None
651
+ )
691
652
)
692
653
693
654
694
655
:param df: The pandas DataFrame object.
695
656
:param column_names: A column name or an iterable (list or
696
657
tuple) of column names.
697
658
:param kwargs: A pairing of column name to a tuple of (`categories`, `order`).
698
- There is also the `janitor.As_Categorical` function, which creates a
699
- namedtuple of (`categories`, `order`) to make it clearer what the arguments
700
- are. This is useful in creating categorical columns that are ordered, or
659
+ This is useful in creating categorical columns that are ordered, or
701
660
if the user needs to explicitly specify the categories.
702
661
:returns: A pandas DataFrame.
703
- :raises JanitorError: if a column specified within ``column_names``
704
- is not found in the DataFrame.
705
- :raises JanitorError: if ``column_names`` is not hashable
706
- nor iterable.
707
662
:raises ValueError: if both ``column_names`` and ``kwargs`` are provided.
708
663
""" # noqa: E501
709
664
710
- df = df .copy ()
711
-
712
665
if all ((column_names , kwargs )):
713
666
raise ValueError (
714
667
"""
@@ -719,28 +672,17 @@ def encode_categorical(
719
672
# column_names deal with only category dtype (unordered)
720
673
# kwargs takes care of scenarios where user wants an ordered category
721
674
# or user supplies specific categories to create the categorical
722
- if column_names :
723
- if isinstance (column_names , (list , Tuple )):
724
- for col in column_names :
725
- if col not in df .columns :
726
- raise JanitorError (
727
- f"{ col } missing from DataFrame columns!"
728
- )
729
- df [col ] = pd .Categorical (df [col ])
730
- elif isinstance (column_names , Hashable ):
731
- if column_names not in df .columns :
732
- raise JanitorError (
733
- f"{ column_names } missing from DataFrame columns!"
734
- )
735
- df [column_names ] = pd .Categorical (df [column_names ])
736
- else :
737
- raise JanitorError (
738
- "kwarg `column_names` must be hashable or iterable!"
739
- )
740
- return df
675
+ if column_names is not None :
676
+ check ("column_names" , column_names , [list , tuple , Hashable ])
677
+ if isinstance (column_names , (list , tuple )):
678
+ check_column (df , column_names )
679
+ dtypes = {col : "category" for col in column_names }
680
+ return df .astype (dtypes )
681
+ if isinstance (column_names , Hashable ):
682
+ check_column (df , [column_names ])
683
+ return df .astype ({column_names : "category" })
741
684
742
- df = _computations_as_categorical (df , ** kwargs )
743
- return df
685
+ return _computations_as_categorical (df , ** kwargs )
744
686
745
687
746
688
@pf .register_dataframe_method
@@ -6359,10 +6301,43 @@ def conditional_join(
6359
6301
The join is done only on the columns.
6360
6302
MultiIndex columns are not supported.
6361
6303
6362
- For non-equi joins, onnly numeric and date columns are supported.
6304
+ For non-equi joins, only numeric and date columns are supported.
6363
6305
6364
6306
Only `inner`, `left`, and `right` joins are supported.
6365
6307
6308
+ Functional usage syntax:
6309
+
6310
+ .. code-block:: python
6311
+
6312
+ import pandas as pd
6313
+ import janitor as jn
6314
+
6315
+ df = pd.DataFrame(...)
6316
+ right = pd.DataFrame(...)
6317
+
6318
+ df = jn.conditional_join(
6319
+ df,
6320
+ right,
6321
+ (col_from_df, col_from_right, join_operator),
6322
+ (col_from_df, col_from_right, join_operator),
6323
+ ...,
6324
+ how = 'inner', # or left/right
6325
+ sort_by_appearance = True # or False
6326
+ )
6327
+
6328
+ Method chaining syntax:
6329
+
6330
+ .. code-block:: python
6331
+
6332
+ df.conditional_join(
6333
+ right,
6334
+ (col_from_df, col_from_right, join_operator),
6335
+ (col_from_df, col_from_right, join_operator),
6336
+ ...,
6337
+ how = 'inner', # or left/right
6338
+ sort_by_appearance = True # or False
6339
+ )
6340
+
6366
6341
6367
6342
:param df: A Pandas DataFrame.
6368
6343
:param right: Named Series or DataFrame to join to.
0 commit comments