diff --git a/AUTHORS.rst b/AUTHORS.rst index f71b79928..7a1404dff 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -55,7 +55,7 @@ Contributors - `@loganthomas `_ | `contributions `_ - `@kulini `_ | `contributions `_ - `@dwgoltra `_ | `contributions `_ -- `@shandou `_ | `contributions `_ +- `@shandou `_ | `contributions `_ - `@samwalkow `_ | `contributions `_ - `@portc13 `_ | `contributions `_ - `@DSNortsev `_ | `contributions `_ diff --git a/examples/notebooks/medium_franchise.ipynb b/examples/notebooks/medium_franchise.ipynb index 1d5a67eec..a6ad415fa 100644 --- a/examples/notebooks/medium_franchise.ipynb +++ b/examples/notebooks/medium_franchise.ipynb @@ -7,7 +7,7 @@ "# Tidy Up Web-Scraped Media Franchise Data\n", "\n", "## Background\n", - "This example combines functionalities of [pyjanitor](https://anaconda.org/conda-forge/pyjanitor) and [pandas-flavor](https://anaconda.org/conda-forge/pandas-flavor) to showcase an explicit--and thus reproducible--workflow enabled by dataframe __*method chaining*__.\n", + "This example combines functionalities of [pyjanitor](https://anaconda.org/conda-forge/pyjanitor) and [pandas-flavor](https://anaconda.org/conda-forge/pandas-flavor) to showcase an explicit--and thus reproducible--workflow enabled by dataframe __method chaining__.\n", "\n", "The data cleaning workflow largely follows the [R example](https://github.com/rfordatascience/tidytuesday/blob/master/data/2019/2019-07-02/revenue.R) from [the tidytuesday project](https://github.com/rfordatascience/tidytuesday). The raw data is scraped from [Wikipedia page](https://en.wikipedia.org/wiki/List_of_highest-grossing_media_franchises) titled \"*List of highest-grossing media franchises*\". 
The workflow is presented both in multi-step (section1) and in one-shot (section 2) fashions.\n", "\n", @@ -44,7 +44,7 @@ "\n", "## Python implementation\n", "\n", - "### Prepration" + "### Preparation" ] }, { @@ -52,8 +52,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:11.833664Z", - "start_time": "2019-07-19T04:34:11.093511Z" + "end_time": "2019-07-20T06:26:58.073741Z", + "start_time": "2019-07-20T06:26:57.348190Z" } }, "outputs": [], @@ -70,8 +70,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:11.838545Z", - "start_time": "2019-07-19T04:34:11.835963Z" + "end_time": "2019-07-20T06:26:58.078657Z", + "start_time": "2019-07-20T06:26:58.076034Z" } }, "outputs": [], @@ -105,8 +105,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.346107Z", - "start_time": "2019-07-19T04:34:11.840801Z" + "end_time": "2019-07-20T06:26:58.476918Z", + "start_time": "2019-07-20T06:26:58.081346Z" } }, "outputs": [ @@ -214,11 +214,11 @@ "source": [ "#### Rename columns\n", "R snippet:\n", - ">```R\n", - ">clean_money <- df %>% \n", - "> set_names(nm = c(\"franchise\", \"year_created\", \"total_revenue\", \"revenue_items\",\n", - "> \"original_media\", \"creators\", \"owners\"))\n", - ">```" + "```R\n", + "clean_money <- df %>% \n", + " set_names(nm = c(\"franchise\", \"year_created\", \"total_revenue\", \"revenue_items\",\n", + " \"original_media\", \"creators\", \"owners\"))\n", + "```" ] }, { @@ -226,8 +226,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.354447Z", - "start_time": "2019-07-19T04:34:12.348171Z" + "end_time": "2019-07-20T06:26:58.487280Z", + "start_time": "2019-07-20T06:26:58.479897Z" } }, "outputs": [], @@ -314,8 +314,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.369814Z", - "start_time": "2019-07-19T04:34:12.357413Z" + "end_time": 
"2019-07-20T06:26:58.500590Z", + "start_time": "2019-07-20T06:26:58.488818Z" } }, "outputs": [ @@ -443,8 +443,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.386894Z", - "start_time": "2019-07-19T04:34:12.371620Z" + "end_time": "2019-07-20T06:26:58.517156Z", + "start_time": "2019-07-20T06:26:58.502408Z" } }, "outputs": [ @@ -574,8 +574,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.397654Z", - "start_time": "2019-07-19T04:34:12.389122Z" + "end_time": "2019-07-20T06:26:58.528543Z", + "start_time": "2019-07-20T06:26:58.519225Z" } }, "outputs": [], @@ -656,8 +656,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.467859Z", - "start_time": "2019-07-19T04:34:12.399969Z" + "end_time": "2019-07-20T06:26:58.565078Z", + "start_time": "2019-07-20T06:26:58.531203Z" } }, "outputs": [ @@ -793,8 +793,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.474179Z", - "start_time": "2019-07-19T04:34:12.470242Z" + "end_time": "2019-07-20T06:26:58.573802Z", + "start_time": "2019-07-20T06:26:58.567195Z" } }, "outputs": [], @@ -833,8 +833,8 @@ "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.509476Z", - "start_time": "2019-07-19T04:34:12.475892Z" + "end_time": "2019-07-20T06:26:58.606774Z", + "start_time": "2019-07-20T06:26:58.576010Z" } }, "outputs": [ @@ -975,8 +975,8 @@ "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.516987Z", - "start_time": "2019-07-19T04:34:12.511117Z" + "end_time": "2019-07-20T06:26:58.614835Z", + "start_time": "2019-07-20T06:26:58.608821Z" } }, "outputs": [], @@ -1055,8 +1055,8 @@ "execution_count": 12, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.536140Z", - "start_time": "2019-07-19T04:34:12.518663Z" + "end_time": "2019-07-20T06:26:58.636212Z", + "start_time": "2019-07-20T06:26:58.616698Z" 
} }, "outputs": [ @@ -1180,8 +1180,8 @@ "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:12.983936Z", - "start_time": "2019-07-19T04:34:12.537990Z" + "end_time": "2019-07-20T06:26:59.015550Z", + "start_time": "2019-07-20T06:26:58.638120Z" } }, "outputs": [], @@ -1249,8 +1249,8 @@ "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:13.003367Z", - "start_time": "2019-07-19T04:34:12.986028Z" + "end_time": "2019-07-20T06:26:59.038340Z", + "start_time": "2019-07-20T06:26:59.017594Z" } }, "outputs": [ @@ -1341,8 +1341,8 @@ "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:13.016011Z", - "start_time": "2019-07-19T04:34:13.005577Z" + "end_time": "2019-07-20T06:26:59.050321Z", + "start_time": "2019-07-20T06:26:59.041266Z" } }, "outputs": [ @@ -1436,8 +1436,8 @@ "execution_count": 16, "metadata": { "ExecuteTime": { - "end_time": "2019-07-19T04:34:13.037967Z", - "start_time": "2019-07-19T04:34:13.017671Z" + "end_time": "2019-07-20T06:26:59.071102Z", + "start_time": "2019-07-20T06:26:59.051957Z" } }, "outputs": [ diff --git a/janitor/functions.py b/janitor/functions.py index 769ea3164..8db6e2fc0 100644 --- a/janitor/functions.py +++ b/janitor/functions.py @@ -908,8 +908,9 @@ def concatenate_columns( def deconcatenate_column( df: pd.DataFrame, column_name, - new_column_names: Union[str, Iterable[str], Any], + new_column_names: Union[List[str], Tuple[str]], sep: str, + preserve_position: bool = False, ) -> pd.DataFrame: """ De-concatenates a single column into multiple columns. @@ -918,41 +919,72 @@ def deconcatenate_column( Used to quickly split columns out of a single column. 
+ The keyword argument `preserve_position` takes `True` or `False` boolean + that controls whether the `new_column_names` will take the original + position of the to-be-deconcatenated `column_name`: + + - When `preserve_position=False` (default), `df.columns` change from + `[..., column_name, ...]` to `[..., column_name, ..., new_column_names]`. + In other words, the deconcatenated new columns are appended to the right + of the original dataframe and the original `column_name` is NOT dropped. + - When `preserve_position=True`, `df.columns` change from + `[..., column_name, ...]` to `[..., new_column_names, ...]`. + In other words, the deconcatenated new column will REPLACE the original + `column_name` at its original position, and `column_name` itself + is dropped. + This method does not mutate the original DataFrame. Functional usage example: .. code-block:: python - df = deconcatenate_columns(df, - column_name='id', - new_column_names=['col1', 'col2'], - sep='-') + df = deconcatenate_column( + df, column_name='id', new_column_names=['col1', 'col2'], + sep='-', preserve_position=True + ) Method chaining example: .. code-block:: python df = (pd.DataFrame(...). - deconcatenate_columns(column_name='id', - new_column_names=['col1', 'col2'], - sep='-')) + deconcatenate_column( + column_name='id', new_column_names=['col1', 'col2'], + sep='-', preserve_position=True + )) :param df: A pandas DataFrame. :param column_name: The column to split. :param new_column_names: A list of new column names post-splitting. :param sep: The separator delimiting the column's data. + :param preserve_position: Boolean for whether or not to preserve original + position of the column upon de-concatenation, defaults to `False` :returns: A pandas DataFrame with a deconcatenated column. 
""" assert ( column_name in df.columns ), f"column name {column_name} not present in dataframe" # noqa: E501 deconcat = df[column_name].str.split(sep, expand=True) + if preserve_position: + # Keep a copy of the original dataframe + df_original = df.copy() assert ( len(new_column_names) == deconcat.shape[1] ), "number of new column names not correct." deconcat.columns = new_column_names - return df.join(deconcat) + df = pd.concat([df, deconcat], axis=1) + if preserve_position: + cols = list(df_original.columns) + index_original = cols.index(column_name) + for i, col_new in enumerate(new_column_names): + cols.insert(index_original + i, col_new) + df = df[cols].drop(columns=column_name) + assert ( + len(df.columns) + == len(df_original.columns) + len(new_column_names) - 1 + ), "number of columns after deconcatenation is incorrect" + return df @pf.register_dataframe_method diff --git a/tests/functions/test_deconcatenate_column.py b/tests/functions/test_deconcatenate_column.py index 326d28ae7..9c90e59de 100644 --- a/tests/functions/test_deconcatenate_column.py +++ b/tests/functions/test_deconcatenate_column.py @@ -3,12 +3,12 @@ @pytest.mark.functions def test_deconcatenate_column(dataframe): - df = dataframe.concatenate_columns( + df_orig = dataframe.concatenate_columns( column_names=["a", "decorated-elephant"], sep="-", new_column_name="index", ) - df = df.deconcatenate_column( + df = df_orig.deconcatenate_column( column_name="index", new_column_names=["A", "B"], sep="-" ) assert "A" in df.columns diff --git a/tests/functions/test_deconcatenate_column_preserve_position.py b/tests/functions/test_deconcatenate_column_preserve_position.py new file mode 100644 index 000000000..35b720907 --- /dev/null +++ b/tests/functions/test_deconcatenate_column_preserve_position.py @@ -0,0 +1,29 @@ +import pytest + + +@pytest.mark.functions +def test_deconcatenate_column_preserve_position(dataframe): + df_original = dataframe.concatenate_columns( + column_names=["a", 
"decorated-elephant"], + sep="-", + new_column_name="index", + ) + index_original = list(df_original.columns).index("index") + df = df_original.deconcatenate_column( + column_name="index", + new_column_names=["col1", "col2"], + sep="-", + preserve_position=True, + ) + assert "index" not in df.columns, "column_name not dropped" + assert "col1" in df.columns, "new column not present" + assert "col2" in df.columns, "new column not present" + assert len(df_original.columns) + 1 == len( + df.columns + ), "Number of columns inconsistent" + assert ( + list(df.columns).index("col1") == index_original + ), "Position not preserved" + assert ( + list(df.columns).index("col2") == index_original + 1 + ), "Position not preserved"