Skip to content

Commit 57b8e07

Browse files
committed
Merge branch 'main' of https://github.com/pandas-dev/pandas into groupby_cat_unobserved
2 parents eb40033 + f1bb3b2 commit 57b8e07

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+279
-764
lines changed

asv_bench/asv.conf.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
"openpyxl": [],
5555
"xlsxwriter": [],
5656
"xlrd": [],
57-
"xlwt": [],
5857
"odfpy": [],
5958
"jinja2": [],
6059
},

asv_bench/benchmarks/io/excel.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def _generate_dataframe():
3333

3434
class WriteExcel:
3535

36-
params = ["openpyxl", "xlsxwriter", "xlwt"]
36+
params = ["openpyxl", "xlsxwriter"]
3737
param_names = ["engine"]
3838

3939
def setup(self, engine):
@@ -68,10 +68,9 @@ def time_write_excel_style(self, engine):
6868

6969
class ReadExcel:
7070

71-
params = ["xlrd", "openpyxl", "odf"]
71+
params = ["openpyxl", "odf"]
7272
param_names = ["engine"]
7373
fname_excel = "spreadsheet.xlsx"
74-
fname_excel_xls = "spreadsheet.xls"
7574
fname_odf = "spreadsheet.ods"
7675

7776
def _create_odf(self):
@@ -92,13 +91,10 @@ def setup_cache(self):
9291
self.df = _generate_dataframe()
9392

9493
self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
95-
self.df.to_excel(self.fname_excel_xls, sheet_name="Sheet1")
9694
self._create_odf()
9795

9896
def time_read_excel(self, engine):
99-
if engine == "xlrd":
100-
fname = self.fname_excel_xls
101-
elif engine == "odf":
97+
if engine == "odf":
10298
fname = self.fname_odf
10399
else:
104100
fname = self.fname_excel
@@ -107,9 +103,7 @@ def time_read_excel(self, engine):
107103

108104
class ReadExcelNRows(ReadExcel):
109105
def time_read_excel(self, engine):
110-
if engine == "xlrd":
111-
fname = self.fname_excel_xls
112-
elif engine == "odf":
106+
if engine == "odf":
113107
fname = self.fname_odf
114108
else:
115109
fname = self.fname_excel

ci/code_checks.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ import pandas
4747
4848
blocklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis',
4949
'lxml', 'matplotlib', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy',
50-
'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'}
50+
'tables', 'urllib.request', 'xlrd', 'xlsxwriter'}
5151
5252
# GH#28227 for some of these check for top-level modules, while others are
5353
# more specific (e.g. urllib.request)

ci/deps/actions-310.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,4 @@ dependencies:
5151
- xarray
5252
- xlrd
5353
- xlsxwriter
54-
- xlwt
5554
- zstandard

ci/deps/actions-38-downstream_compat.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ dependencies:
5151
- xarray
5252
- xlrd
5353
- xlsxwriter
54-
- xlwt
5554
- zstandard
5655

5756
# downstream packages

ci/deps/actions-38-minimum_versions.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,4 @@ dependencies:
5353
- xarray=0.19.0
5454
- xlrd=2.0.1
5555
- xlsxwriter=1.4.3
56-
- xlwt=1.3.0
5756
- zstandard=0.15.2

ci/deps/actions-38.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,4 @@ dependencies:
5050
- xarray
5151
- xlrd
5252
- xlsxwriter
53-
- xlwt
5453
- zstandard

ci/deps/actions-39.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,4 @@ dependencies:
5151
- xarray
5252
- xlrd
5353
- xlsxwriter
54-
- xlwt
5554
- zstandard

ci/deps/circle-38-arm64.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,4 @@ dependencies:
5151
- xarray
5252
- xlrd
5353
- xlsxwriter
54-
- xlwt
5554
- zstandard

doc/scripts/eval_performance.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from timeit import repeat as timeit
2+
3+
import numpy as np
4+
import seaborn as sns
5+
6+
from pandas import DataFrame
7+
8+
setup_common = """from pandas import DataFrame
9+
from numpy.random import randn
10+
df = DataFrame(randn(%d, 3), columns=list('abc'))
11+
%s"""
12+
13+
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
14+
15+
16+
def bench_with(n, times=10, repeat=3, engine="numexpr"):
17+
return (
18+
np.array(
19+
timeit(
20+
"df.eval(s, engine=%r)" % engine,
21+
setup=setup_common % (n, setup_with),
22+
repeat=repeat,
23+
number=times,
24+
)
25+
)
26+
/ times
27+
)
28+
29+
30+
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
31+
32+
33+
def bench_subset(n, times=20, repeat=3, engine="numexpr"):
34+
return (
35+
np.array(
36+
timeit(
37+
"df.query(s, engine=%r)" % engine,
38+
setup=setup_common % (n, setup_subset),
39+
repeat=repeat,
40+
number=times,
41+
)
42+
)
43+
/ times
44+
)
45+
46+
47+
def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
48+
r = np.logspace(mn, mx, num=num).round().astype(int)
49+
50+
ev = DataFrame(np.empty((num, len(engines))), columns=engines)
51+
qu = ev.copy(deep=True)
52+
53+
ev["size"] = qu["size"] = r
54+
55+
for engine in engines:
56+
for i, n in enumerate(r):
57+
if verbose & (i % 10 == 0):
58+
print("engine: %r, i == %d" % (engine, i))
59+
ev_times = bench_with(n, times=1, repeat=1, engine=engine)
60+
ev.loc[i, engine] = np.mean(ev_times)
61+
qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
62+
qu.loc[i, engine] = np.mean(qu_times)
63+
64+
return ev, qu
65+
66+
67+
def plot_perf(df, engines, title, filename=None):
68+
from matplotlib.pyplot import figure
69+
70+
sns.set()
71+
sns.set_palette("Set2")
72+
73+
fig = figure(figsize=(4, 3), dpi=120)
74+
ax = fig.add_subplot(111)
75+
76+
for engine in engines:
77+
ax.loglog(df["size"], df[engine], label=engine, lw=2)
78+
79+
ax.set_xlabel("Number of Rows")
80+
ax.set_ylabel("Time (s)")
81+
ax.set_title(title)
82+
ax.legend(loc="best")
83+
ax.tick_params(top=False, right=False)
84+
85+
fig.tight_layout()
86+
87+
if filename is not None:
88+
fig.savefig(filename)
89+
90+
91+
if __name__ == "__main__":
92+
import os
93+
94+
pandas_dir = os.path.dirname(
95+
os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
96+
)
97+
static_path = os.path.join(pandas_dir, "doc", "source", "_static")
98+
99+
join = lambda p: os.path.join(static_path, p)
100+
101+
fn = join("eval-query-perf-data.h5")
102+
103+
engines = "python", "numexpr"
104+
105+
ev, qu = bench(verbose=True) # only this one
106+
107+
plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
108+
plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
-24.7 KB
Binary file not shown.

doc/source/_static/eval-perf.png

10.8 KB
Loading
-21.2 KB
Binary file not shown.

doc/source/_static/query-perf.png

8.79 KB
Loading

doc/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@
236236
if ".dev" in version:
237237
switcher_version = "dev"
238238
elif "rc" in version:
239-
switcher_version = version.split("rc")[0] + " (rc)"
239+
switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)"
240240

241241
html_theme_options = {
242242
"external_links": [],

doc/source/development/contributing_environment.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ To test out code changes, you'll need to build pandas from source, which
1010
requires a C/C++ compiler and Python environment. If you're making documentation
1111
changes, you can skip to :ref:`contributing to the documentation <contributing_documentation>` but if you skip
1212
creating the development environment you won't be able to build the documentation
13-
locally before pushing your changes.
13+
locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks <contributing.pre-commit>`.
1414

1515
.. contents:: Table of contents:
1616
:local:

doc/source/getting_started/install.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,6 @@ Can be managed as optional_extra with ``pandas[excel]``.
336336
Dependency Minimum Version optional_extra Notes
337337
========================= ================== =============== =============================================================
338338
xlrd 2.0.1 excel Reading Excel
339-
xlwt 1.3.0 excel Writing Excel
340339
xlsxwriter 1.4.3 excel Writing Excel
341340
openpyxl 3.0.7 excel Reading / writing for xlsx files
342341
pyxlsb 1.0.8 excel Reading for xlsb files

doc/source/user_guide/enhancingperf.rst

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
690690
df["a"] = 1
691691
df
692692
693-
The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
694-
whether the query modifies the original frame.
695-
696-
.. ipython:: python
697-
698-
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
699-
df.query("a > 2")
700-
df.query("a > 2", inplace=True)
701-
df
702-
703693
Local variables
704694
~~~~~~~~~~~~~~~
705695

706696
You must *explicitly reference* any local variable that you want to use in an
707-
expression by placing the ``@`` character in front of the name. For example,
697+
expression by placing the ``@`` character in front of the name. This mechanism is
698+
the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
708699

709700
.. ipython:: python
710701
@@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of
820811
:func:`pandas.eval` as function of the size of the frame involved in the
821812
computation. The two lines are two different engines.
822813

814+
..
815+
The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
823816
824817
.. image:: ../_static/eval-perf.png
825818

826-
827-
.. note::
828-
829-
Operations with smallish objects (around 15k-20k rows) are faster using
830-
plain Python:
831-
832-
.. image:: ../_static/eval-perf-small.png
833-
819+
You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
834820

835821
This plot was created using a :class:`DataFrame` with 3 columns each containing
836822
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/indexing.rst

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
12401240
renaming your columns to something less ambiguous.
12411241

12421242

1243+
The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines
1244+
whether the query modifies the original frame.
1245+
1246+
.. ipython:: python
1247+
1248+
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
1249+
df.query("a > 2")
1250+
df.query("a > 2", inplace=True)
1251+
df
1252+
1253+
12431254
:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
12441255
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12451256

@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
14381449
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
14391450
large frames.
14401451

1452+
..
1453+
The query-perf.png figure below was generated with /doc/scripts/eval_performance.py
1454+
14411455
.. image:: ../_static/query-perf.png
14421456

1443-
.. note::
14441457

1445-
You will only see the performance benefits of using the ``numexpr`` engine
1446-
with ``DataFrame.query()`` if your frame has more than approximately 200,000
1447-
rows.
14481458

1449-
.. image:: ../_static/query-perf-small.png
1459+
You will only see the performance benefits of using the ``numexpr`` engine
1460+
with ``DataFrame.query()`` if your frame has more than approximately 100,000
1461+
rows.
1462+
1463+
14501464

14511465
This plot was created using a ``DataFrame`` with 3 columns each containing
14521466
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/io.rst

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3466,8 +3466,6 @@ See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.
34663466

34673467
.. warning::
34683468

3469-
The `xlwt <https://xlwt.readthedocs.io/en/latest/>`__ package for writing old-style ``.xls``
3470-
excel files is no longer maintained.
34713469
The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
34723470
old-style ``.xls`` files.
34733471

@@ -3481,12 +3479,6 @@ See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.
34813479
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
34823480
This is no longer supported, switch to using ``openpyxl`` instead.
34833481

3484-
Attempting to use the ``xlwt`` engine will raise a ``FutureWarning``
3485-
unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``.
3486-
While this option is now deprecated and will also raise a ``FutureWarning``,
3487-
it can be globally set and the warning suppressed. Users are recommended to
3488-
write ``.xlsx`` files using the ``openpyxl`` engine instead.
3489-
34903482
.. _io.excel_reader:
34913483

34923484
Reading Excel files
@@ -3788,7 +3780,7 @@ written. For example:
37883780
37893781
df.to_excel("path_to_file.xlsx", sheet_name="Sheet1")
37903782
3791-
Files with a ``.xls`` extension will be written using ``xlwt`` and those with a
3783+
Files with a
37923784
``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or
37933785
``openpyxl``.
37943786

@@ -3849,35 +3841,26 @@ pandas supports writing Excel files to buffer-like objects such as ``StringIO``
38493841
Excel writer engines
38503842
''''''''''''''''''''
38513843

3852-
.. deprecated:: 1.2.0
3853-
3854-
As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
3855-
maintained, the ``xlwt`` engine will be removed from a future version
3856-
of pandas. This is the only engine in pandas that supports writing to
3857-
``.xls`` files.
3858-
38593844
pandas chooses an Excel writer via two methods:
38603845

38613846
1. the ``engine`` keyword argument
38623847
2. the filename extension (via the default specified in config options)
38633848

38643849
By default, pandas uses `XlsxWriter`_ for ``.xlsx`` and `openpyxl`_
3865-
for ``.xlsm``, and `xlwt`_ for ``.xls`` files. If you have multiple
3850+
for ``.xlsm``. If you have multiple
38663851
engines installed, you can set the default engine through :ref:`setting the
38673852
config options <options>` ``io.excel.xlsx.writer`` and
38683853
``io.excel.xls.writer``. pandas will fall back on `openpyxl`_ for ``.xlsx``
38693854
files if `XlsxWriter`_ is not available.
38703855

38713856
.. _XlsxWriter: https://xlsxwriter.readthedocs.io
38723857
.. _openpyxl: https://openpyxl.readthedocs.io/
3873-
.. _xlwt: http://www.python-excel.org
38743858

38753859
To specify which writer you want to use, you can pass an engine keyword
38763860
argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
38773861

38783862
* ``openpyxl``: version 3.0.7 or higher is required
38793863
* ``xlsxwriter``
3880-
* ``xlwt``
38813864

38823865
.. code-block:: python
38833866

0 commit comments

Comments
 (0)