diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 931ec895cc73f..09b7bff31d323 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -267,6 +267,7 @@ I/O - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`). - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) +- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names when multiple header rows and index columns are specified (:issue:`34673`) Period diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 57e4e87644fed..b23b5fe5b34a8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -504,6 +504,8 @@ def parse( header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) + has_index_names = is_list_like(header) and len(header) > 1 + if is_list_like(index_col): # Forward fill values for MultiIndex index. if header is None: @@ -513,6 +515,12 @@ def parse( else: offset = 1 + max(header) + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + # Check if we have an empty dataset # before trying to collect data. 
if offset < len(data): @@ -525,8 +533,6 @@ def parse( else: last = data[row][col] - has_index_names = is_list_like(header) and len(header) > 1 - # GH 12292 : error when read one empty column from excel file try: parser = TextParser( diff --git a/pandas/tests/io/data/excel/testmultiindex.ods b/pandas/tests/io/data/excel/testmultiindex.ods index b7f03900e6617..deb88bdad1694 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.ods and b/pandas/tests/io/data/excel/testmultiindex.ods differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xls b/pandas/tests/io/data/excel/testmultiindex.xls index 4329992642c8c..08dc78ea34d56 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xls and b/pandas/tests/io/data/excel/testmultiindex.xls differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsb b/pandas/tests/io/data/excel/testmultiindex.xlsb index b66d6dab17ee0..f5f62d305640f 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsb and b/pandas/tests/io/data/excel/testmultiindex.xlsb differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsm b/pandas/tests/io/data/excel/testmultiindex.xlsm index ebbca4856562f..8bd16b016608c 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsm and b/pandas/tests/io/data/excel/testmultiindex.xlsm differ diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsx b/pandas/tests/io/data/excel/testmultiindex.xlsx index afe1758a7a132..56fc6f20b711a 100644 Binary files a/pandas/tests/io/data/excel/testmultiindex.xlsx and b/pandas/tests/io/data/excel/testmultiindex.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8b1a96f694e71..110b79adb5646 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -841,6 +841,43 @@ def test_read_excel_multiindex(self, read_ext): ) tm.assert_frame_equal(actual, expected) + @pytest.mark.parametrize( + "sheet_name,idx_lvl2", + [ + 
("both_name_blank_after_mi_name", [np.nan, "b", "a", "b"]), + ("both_name_multiple_blanks", [np.nan] * 4), + ], + ) + def test_read_excel_multiindex_blank_after_name( + self, read_ext, sheet_name, idx_lvl2 + ): + # GH34673 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb (GH4679") + + mi_file = "testmultiindex" + read_ext + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + index=MultiIndex.from_arrays( + (["foo", "foo", "bar", "bar"], idx_lvl2), + names=["ilvl1", "ilvl2"], + ), + ) + result = pd.read_excel( + mi_file, + sheet_name=sheet_name, + index_col=[0, 1], + header=[0, 1], + ) + tm.assert_frame_equal(result, expected) + def test_read_excel_multiindex_header_only(self, read_ext): # see gh-11733. #