diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a62942c7cd948..d39a33e7a88c3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,11 +2,12 @@ name: CI
on:
push:
- branches: [master]
+ branches:
+ - master
+ - 1.3.x
pull_request:
branches:
- master
- - 1.2.x
- 1.3.x
env:
@@ -124,15 +125,15 @@ jobs:
echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts
- if: github.event_name == 'push'
+ if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}
- name: Upload web
run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas
- if: github.event_name == 'push'
+ if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}
- name: Upload dev docs
run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev
- if: github.event_name == 'push'
+ if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}}
- name: Move docs into site directory
run: mv doc/build/html web/build/docs
diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml
index d2aa76a3e6110..a2818c85ffa33 100644
--- a/.github/workflows/database.yml
+++ b/.github/workflows/database.yml
@@ -6,7 +6,6 @@ on:
pull_request:
branches:
- master
- - 1.2.x
- 1.3.x
paths-ignore:
- "doc/**"
diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml
index fa5cf8ead57bd..a132b1486ae12 100644
--- a/.github/workflows/posix.yml
+++ b/.github/workflows/posix.yml
@@ -2,11 +2,12 @@ name: Posix
on:
push:
- branches: [master]
+ branches:
+ - master
+ - 1.3.x
pull_request:
branches:
- master
- - 1.2.x
- 1.3.x
paths-ignore:
- "doc/**"
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 723347913ac38..51b52105b483a 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -3,7 +3,9 @@ name: pre-commit
on:
pull_request:
push:
- branches: [master]
+ branches:
+ - master
+ - 1.3.x
jobs:
pre-commit:
diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml
index 4ef5b16e71e71..85ef356e7fa14 100644
--- a/.github/workflows/python-dev.yml
+++ b/.github/workflows/python-dev.yml
@@ -4,9 +4,11 @@ on:
push:
branches:
- master
+ - 1.3.x
pull_request:
branches:
- master
+ - 1.3.x
paths-ignore:
- "doc/**"
diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml
index 0c2e30a74bbdb..acb574f2ab8c5 100644
--- a/.github/workflows/sdist.yml
+++ b/.github/workflows/sdist.yml
@@ -4,10 +4,10 @@ on:
push:
branches:
- master
+ - 1.3.x
pull_request:
branches:
- master
- - 1.2.x
- 1.3.x
paths-ignore:
- "doc/**"
diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py
index 427af9307f2c9..5d7a76bc01d49 100644
--- a/asv_bench/benchmarks/algos/isin.py
+++ b/asv_bench/benchmarks/algos/isin.py
@@ -1,7 +1,5 @@
import numpy as np
-from pandas.compat.numpy import np_version_under1p20
-
from pandas import (
Categorical,
NaT,
@@ -280,10 +278,6 @@ class IsInLongSeriesLookUpDominates:
def setup(self, dtype, MaxNumber, series_type):
N = 10 ** 7
- # https://github.com/pandas-dev/pandas/issues/39844
- if not np_version_under1p20 and dtype in ("Int64", "Float64"):
- raise NotImplementedError
-
if series_type == "random_hits":
array = np.random.randint(0, MaxNumber, N)
if series_type == "random_misses":
@@ -294,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type):
array = np.arange(N) + MaxNumber
self.series = Series(array).astype(dtype)
- self.values = np.arange(MaxNumber).astype(dtype)
+
+ self.values = np.arange(MaxNumber).astype(dtype.lower())
def time_isin(self, dtypes, MaxNumber, series_type):
self.series.isin(self.values)
@@ -310,16 +305,12 @@ class IsInLongSeriesValuesDominate:
def setup(self, dtype, series_type):
N = 10 ** 7
- # https://github.com/pandas-dev/pandas/issues/39844
- if not np_version_under1p20 and dtype in ("Int64", "Float64"):
- raise NotImplementedError
-
if series_type == "random":
vals = np.random.randint(0, 10 * N, N)
if series_type == "monotone":
vals = np.arange(N)
- self.values = vals.astype(dtype)
+ self.values = vals.astype(dtype.lower())
M = 10 ** 6 + 1
self.series = Series(np.arange(M)).astype(dtype)
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index c32eda4928da7..e5834f311d259 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -232,6 +232,22 @@ def time_to_html_mixed(self):
self.df2.to_html()
+class ToDict:
+ params = [["dict", "list", "series", "split", "records", "index"]]
+ param_names = ["orient"]
+
+ def setup(self, orient):
+ data = np.random.randint(0, 1000, size=(10000, 4))
+ self.int_df = DataFrame(data)
+ self.datetimelike_df = self.int_df.astype("timedelta64[ns]")
+
+ def time_to_dict_ints(self, orient):
+ self.int_df.to_dict(orient=orient)
+
+ def time_to_dict_datetimelike(self, orient):
+ self.datetimelike_df.to_dict(orient=orient)
+
+
class ToNumpy:
def setup(self):
N = 10000
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5ba4471c8d303..edc9c42fe805a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -3,16 +3,17 @@ trigger:
branches:
include:
- master
- - 1.2.x
- 1.3.x
paths:
exclude:
- 'doc/*'
pr:
-- master
-- 1.2.x
-- 1.3.x
+ autoCancel: true
+ branches:
+ include:
+ - master
+ - 1.3.x
variables:
PYTEST_WORKERS: auto
@@ -22,12 +23,12 @@ jobs:
- template: ci/azure/posix.yml
parameters:
name: macOS
- vmImage: macOS-10.14
+ vmImage: macOS-10.15
- template: ci/azure/windows.yml
parameters:
name: Windows
- vmImage: vs2017-win2016
+ vmImage: windows-2019
- job: py37_32bit
pool:
diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml
index a9e4113bf9d18..cfdcf266236e6 100644
--- a/ci/deps/actions-37-db.yaml
+++ b/ci/deps/actions-37-db.yaml
@@ -15,7 +15,7 @@ dependencies:
- beautifulsoup4
- botocore>=1.11
- dask
- - fastparquet>=0.4.0
+ - fastparquet>=0.4.0, < 0.7.0
- fsspec>=0.7.4, <2021.6.0
- gcsfs>=0.6.0
- geopandas
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index b74f1af8ee0f6..03f2bc84bcc01 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -12,11 +12,30 @@ dependencies:
- hypothesis>=3.58.0
# pandas dependencies
+ - beautifulsoup4
+ - bottleneck
+ - fsspec>=0.8.0, <2021.6.0
+ - gcsfs
+ - html5lib
+ - jinja2
+ - lxml
+ - matplotlib
+ - moto>=1.3.14
+ - flask
+ - numexpr
- numpy
+ - openpyxl
+ - pyarrow
+ - pytables
- python-dateutil
- pytz
-
- # optional dependencies
- - pytables
+ - s3fs>=0.4.2
- scipy
- - pyarrow=1.0
+ - sqlalchemy
+ - xlrd
+ - xlsxwriter
+ - xlwt
+ - pyreadstat
+ - pip
+ - pip:
+ - pyxlsb
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 70aa46e8a5851..902daf102ccda 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
# pandas dependencies
- blosc
- bottleneck
- - fastparquet>=0.4.0
+ - fastparquet>=0.4.0, <0.7.0
- flask
- fsspec>=0.8.0, <2021.6.0
- matplotlib=3.1.3
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index ee061e7b7d3e6..e58779c090d8f 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -445,6 +445,12 @@ provides a familiar ``DataFrame`` interface for out-of-core, parallel and distri
Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow.
+`Ibis `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ibis offers a standard way to write analytics code that can be run on multiple engines. It helps bridge the gap between local Python environments (like pandas) and remote storage and execution systems such as Hadoop components (HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.).
+
+
`Koalas `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst
index 76c922fcef638..54c67674b890c 100644
--- a/doc/source/user_guide/boolean.rst
+++ b/doc/source/user_guide/boolean.rst
@@ -12,6 +12,11 @@
Nullable Boolean data type
**************************
+.. note::
+
+ BooleanArray is currently experimental. Its API or implementation may
+ change without warning.
+
.. versionadded:: 1.0.0
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index aa9a1ba6d6bf0..c78d972f33d65 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -302,28 +302,63 @@ For more about ``boundscheck`` and ``wraparound``, see the Cython docs on
.. _enhancingperf.numba:
-Using Numba
------------
+Numba (JIT compilation)
+-----------------------
-A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba.
+An alternative to statically compiling Cython code is to use a dynamic just-in-time (JIT) compiler with `Numba `__.
-Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters.
+Numba allows you to write a pure Python function which can be JIT compiled to native machine instructions, similar in performance to C, C++ and Fortran,
+by decorating your function with ``@jit``.
-Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack.
+Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool).
+Numba supports compilation of Python to run on either CPU or GPU hardware and is designed to integrate with the Python scientific software stack.
.. note::
- You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`.
+   The ``@jit`` compilation will add overhead to the runtime of the function, so performance benefits may not be realized, especially when using small data sets.
+ Consider `caching `__ your function to avoid compilation overhead each time your function is run.
-.. note::
+Numba can be used in 2 ways with pandas:
+
+#. Specify the ``engine="numba"`` keyword in select pandas methods
+#. Define your own Python function decorated with ``@jit`` and pass the underlying NumPy array of :class:`Series` or :class:`DataFrame` (using ``to_numpy()``) into the function
+
+pandas Numba Engine
+~~~~~~~~~~~~~~~~~~~
+
+If Numba is installed, one can specify ``engine="numba"`` in select pandas methods to execute the method using Numba.
+Methods that support ``engine="numba"`` will also have an ``engine_kwargs`` keyword that accepts a dictionary whose
+``"nogil"``, ``"nopython"`` and ``"parallel"`` keys take boolean values that are passed into the ``@jit`` decorator.
+If ``engine_kwargs`` is not specified, it defaults to ``{"nogil": False, "nopython": True, "parallel": False}``.
+
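+A minimal sketch (assuming Numba is installed) of passing ``engine_kwargs`` to a rolling
+aggregation that supports the Numba engine:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    s = pd.Series(range(1_000_000))
+    # engine_kwargs is forwarded to the @jit decorator used by the Numba engine
+    s.rolling(10).mean(
+        engine="numba",
+        engine_kwargs={"nopython": True, "nogil": True, "parallel": False},
+    )
+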
+In terms of performance, **the first time a function is run using the Numba engine will be slow**
+as Numba will have some function compilation overhead. However, the JIT compiled functions are cached,
+and subsequent calls will be fast. In general, the Numba engine is performant with
+a large number of data points (e.g. 1+ million).
- As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below.
+.. code-block:: ipython
+
+ In [1]: data = pd.Series(range(1_000_000)) # noqa: E225
+
+ In [2]: roll = data.rolling(10)
-Jit
-~~~
+ In [3]: def f(x):
+ ...: return np.sum(x) + 5
+ # Run the first time, compilation time will affect performance
+ In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)
+ 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
+ # Function is cached and performance will improve
+ In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+ 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-We demonstrate how to use Numba to just-in-time compile our code. We simply
-take the plain Python code from above and annotate with the ``@jit`` decorator.
+ In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+ 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+Custom Function Examples
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+A custom Python function decorated with ``@jit`` can be used with pandas objects by passing their NumPy array
+representations with ``to_numpy()``.
.. code-block:: python
@@ -360,8 +395,6 @@ take the plain Python code from above and annotate with the ``@jit`` decorator.
)
return pd.Series(result, index=df.index, name="result")
-Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
-nicer interface by passing/returning pandas objects.
.. code-block:: ipython
@@ -370,19 +403,9 @@ nicer interface by passing/returning pandas objects.
In this example, using Numba was faster than Cython.
-Numba as an argument
-~~~~~~~~~~~~~~~~~~~~
-
-Additionally, we can leverage the power of `Numba `__
-by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
-` for an extensive example.
-
-Vectorize
-~~~~~~~~~
-
Numba can also be used to write vectorized functions that do not require the user to explicitly
loop over the observations of a vector; a vectorized function will be applied to each row automatically.
-Consider the following toy example of doubling each observation:
+Consider the following example of doubling each observation:
.. code-block:: python
@@ -414,25 +437,23 @@ Consider the following toy example of doubling each observation:
Caveats
~~~~~~~
-.. note::
-
- Numba will execute on any function, but can only accelerate certain classes of functions.
-
Numba is best at accelerating functions that apply numerical functions to NumPy
-arrays. When passed a function that only uses operations it knows how to
-accelerate, it will execute in ``nopython`` mode.
-
-If Numba is passed a function that includes something it doesn't know how to
-work with -- a category that currently includes sets, lists, dictionaries, or
-string functions -- it will revert to ``object mode``. In ``object mode``,
-Numba will execute but your code will not speed up significantly. If you would
+arrays. If you try to ``@jit`` a function that contains unsupported `Python `__
+or `NumPy `__
+code, compilation will revert to `object mode `__, which
+will most likely not speed up your function. If you would
prefer that Numba throw an error if it cannot compile a function in a way that
speeds up your code, pass Numba the argument
-``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on
+``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on
troubleshooting Numba modes, see the `Numba troubleshooting page
`__.
-Read more in the `Numba docs `__.
+Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe
+behavior. You can first `specify a safe threading layer `__
+before running a JIT function with ``parallel=True``.
+
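+A minimal sketch (``"workqueue"`` is one of Numba's built-in threading layers) of selecting a
+threading layer before compiling a parallel function:
+
+.. code-block:: python
+
+    import numpy as np
+    from numba import config, njit, prange
+
+    config.THREADING_LAYER = "workqueue"  # must be set before the first parallel compilation
+
+    @njit(parallel=True)
+    def summed(x):
+        total = 0.0
+        for i in prange(len(x)):
+            total += x[i]
+        return total
+
+    summed(np.arange(1_000_000.0))
+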
+Generally, if you encounter a segfault (``SIGSEGV``) while using Numba, please report the issue
+to the `Numba issue tracker `__.
.. _enhancingperf.eval:
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 870ec6763c72f..90d8ec4f95727 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -1106,11 +1106,9 @@ Numba Accelerated Routines
.. versionadded:: 1.1
If `Numba `__ is installed as an optional dependency, the ``transform`` and
-``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
-argument is a dictionary of keyword arguments that will be passed into the
-`numba.jit decorator `__.
-These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
-and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
+``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments.
+See :ref:`enhancing performance with Numba ` for general usage of the arguments
+and performance considerations.
The function signature must start with ``values, index`` **exactly** as the data belonging to each group
will be passed into ``values``, and the group index will be passed into ``index``.
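+
+For example, a minimal sketch (with a hypothetical DataFrame ``df`` and grouping column ``"key"``)
+of a function with the required signature:
+
+.. code-block:: python
+
+    def group_mean(values, index):  # index must be accepted even if unused
+        return values.mean()
+
+    df.groupby("key").aggregate(group_mean, engine="numba")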
@@ -1121,52 +1119,6 @@ will be passed into ``values``, and the group index will be passed into ``index`
data and group index will be passed as NumPy arrays to the JITed user defined function, and no
alternative execution attempts will be tried.
-.. note::
-
- In terms of performance, **the first time a function is run using the Numba engine will be slow**
- as Numba will have some function compilation overhead. However, the compiled functions are cached,
- and subsequent calls will be fast. In general, the Numba engine is performant with
- a larger amount of data points (e.g. 1+ million).
-
-.. code-block:: ipython
-
- In [1]: N = 10 ** 3
-
- In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
-
- In [3]: df = pd.DataFrame(data, columns=[0, 1])
-
- In [4]: def f_numba(values, index):
- ...: total = 0
- ...: for i, value in enumerate(values):
- ...: if i % 2:
- ...: total += value + 5
- ...: else:
- ...: total += value * 2
- ...: return total
- ...:
-
- In [5]: def f_cython(values):
- ...: total = 0
- ...: for i, value in enumerate(values):
- ...: if i % 2:
- ...: total += value + 5
- ...: else:
- ...: total += value * 2
- ...: return total
- ...:
-
- In [6]: groupby = df.groupby(0)
- # Run the first time, compilation time will affect performance
- In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
- 2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
- # Function is cached and performance will improve
- In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
- 4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
- In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
- 18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
Other useful features
---------------------
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index 0d6dcaa3726e6..3e533cbadc5f7 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -262,26 +262,24 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other
.. code-block:: ipython
In [2]: from pandas.api.indexers import BaseIndexer
- ...:
- ...: class CustomIndexer(BaseIndexer):
- ...:
- ...: def get_window_bounds(self, num_values, min_periods, center, closed):
- ...: start = np.empty(num_values, dtype=np.int64)
- ...: end = np.empty(num_values, dtype=np.int64)
- ...: for i in range(num_values):
- ...: if self.use_expanding[i]:
- ...: start[i] = 0
- ...: end[i] = i + 1
- ...: else:
- ...: start[i] = i
- ...: end[i] = i + self.window_size
- ...: return start, end
- ...:
-
- In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
-
- In [4]: df.rolling(indexer).sum()
- Out[4]:
+
+ In [3]: class CustomIndexer(BaseIndexer):
+ ...: def get_window_bounds(self, num_values, min_periods, center, closed):
+ ...: start = np.empty(num_values, dtype=np.int64)
+ ...: end = np.empty(num_values, dtype=np.int64)
+ ...: for i in range(num_values):
+ ...: if self.use_expanding[i]:
+ ...: start[i] = 0
+ ...: end[i] = i + 1
+ ...: else:
+ ...: start[i] = i
+ ...: end[i] = i + self.window_size
+ ...: return start, end
+
+ In [4]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+ In [5]: df.rolling(indexer).sum()
+ Out[5]:
values
0 0.0
1 1.0
@@ -365,45 +363,21 @@ Numba engine
Additionally, :meth:`~Rolling.apply` can leverage `Numba `__
if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
+See :ref:`enhancing performance with Numba ` for general usage of the arguments and performance considerations.
+
Numba will be applied in potentially two routines:
#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
#. The engine will JIT the for loop where the apply function is applied to each window.
-.. versionadded:: 1.3.0
-
-``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments.
-
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
`numba.jit decorator `__.
These keyword arguments will be applied to *both* the passed function (if a standard Python function)
-and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported,
-and their default values are set to ``False``, ``True`` and ``False`` respectively.
-
-.. note::
+and the apply for loop over each window.
- In terms of performance, **the first time a function is run using the Numba engine will be slow**
- as Numba will have some function compilation overhead. However, the compiled functions are cached,
- and subsequent calls will be fast. In general, the Numba engine is performant with
- a larger amount of data points (e.g. 1+ million).
-
-.. code-block:: ipython
-
- In [1]: data = pd.Series(range(1_000_000))
-
- In [2]: roll = data.rolling(10)
-
- In [3]: def f(x):
- ...: return np.sum(x) + 5
- # Run the first time, compilation time will affect performance
- In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999
- 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
- # Function is cached and performance will improve
- In [5]: %timeit roll.apply(f, engine='numba', raw=True)
- 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+.. versionadded:: 1.3.0
- In [6]: %timeit roll.apply(f, engine='cython', raw=True)
- 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments.
.. _window.cov_corr:
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 986cf43b80494..08f46a53cf2f1 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -16,6 +16,7 @@ Version 1.3
.. toctree::
:maxdepth: 2
+ v1.3.1
v1.3.0
Version 1.2
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index b87274307431b..03dfe475475a1 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -338,19 +338,20 @@ maps labels to their new names along the default axis, is allowed to be passed b
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> df = pd.DataFrame([[1]])
- >>> df.rename({0: 1}, {0: 2})
+ In [1]: df = pd.DataFrame([[1]])
+ In [2]: df.rename({0: 1}, {0: 2})
+ Out[2]:
FutureWarning: ...Use named arguments to resolve ambiguity...
2
1 1
*pandas 1.0.0*
-.. code-block:: python
+.. code-block:: ipython
- >>> df.rename({0: 1}, {0: 2})
+ In [3]: df.rename({0: 1}, {0: 2})
Traceback (most recent call last):
...
TypeError: rename() takes from 1 to 2 positional arguments but 3 were given
@@ -359,26 +360,28 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> df.rename({0: 1}, index={0: 2})
+ In [4]: df.rename({0: 1}, index={0: 2})
+ Out[4]:
0
1 1
- >>> df.rename(mapper={0: 1}, index={0: 2})
+ In [5]: df.rename(mapper={0: 1}, index={0: 2})
+ Out[5]:
0
2 1
*pandas 1.0.0*
-.. code-block:: python
+.. code-block:: ipython
- >>> df.rename({0: 1}, index={0: 2})
+ In [6]: df.rename({0: 1}, index={0: 2})
Traceback (most recent call last):
...
TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns'
- >>> df.rename(mapper={0: 1}, index={0: 2})
+ In [7]: df.rename(mapper={0: 1}, index={0: 2})
Traceback (most recent call last):
...
TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns'
@@ -405,12 +408,12 @@ Extended verbose info output for :class:`~pandas.DataFrame`
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> df = pd.DataFrame({"int_col": [1, 2, 3],
+ In [1]: df = pd.DataFrame({"int_col": [1, 2, 3],
... "text_col": ["a", "b", "c"],
... "float_col": [0.0, 0.1, 0.2]})
- >>> df.info(verbose=True)
+ In [2]: df.info(verbose=True)
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
@@ -440,14 +443,16 @@ Extended verbose info output for :class:`~pandas.DataFrame`
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> pd.array(["a", None])
+ In [1]: pd.array(["a", None])
+ Out[1]:
['a', None]
Length: 2, dtype: object
- >>> pd.array([1, None])
+ In [2]: pd.array([1, None])
+ Out[2]:
[1, None]
Length: 2, dtype: object
@@ -470,15 +475,17 @@ As a reminder, you can specify the ``dtype`` to disable all inference.
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> a = pd.array([1, 2, None], dtype="Int64")
- >>> a
+ In [1]: a = pd.array([1, 2, None], dtype="Int64")
+ In [2]: a
+ Out[2]:
[1, 2, NaN]
Length: 3, dtype: Int64
- >>> a[2]
+ In [3]: a[2]
+ Out[3]:
nan
*pandas 1.0.0*
@@ -499,9 +506,10 @@ will now raise.
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> np.asarray(a, dtype="float")
+ In [1]: np.asarray(a, dtype="float")
+ Out[1]:
array([ 1., 2., nan])
*pandas 1.0.0*
@@ -525,9 +533,10 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> pd.Series(a).sum(skipna=False)
+ In [1]: pd.Series(a).sum(skipna=False)
+ Out[1]:
nan
*pandas 1.0.0*
@@ -543,9 +552,10 @@ integer dtype for the values.
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
+ In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype
+ Out[1]:
dtype('int64')
*pandas 1.0.0*
@@ -565,15 +575,17 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> a = pd.array([1, 2, None], dtype="Int64")
- >>> a
+ In [1]: a = pd.array([1, 2, None], dtype="Int64")
+ In [2]: a
+ Out[2]:
[1, 2, NaN]
Length: 3, dtype: Int64
- >>> a > 1
+ In [3]: a > 1
+ Out[3]:
array([False, True, False])
*pandas 1.0.0*
@@ -640,9 +652,10 @@ scalar values in the result are instances of the extension dtype's scalar type.
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> df.resample("2D").agg(lambda x: 'a').A.dtype
+    In [1]: df.resample("2D").agg(lambda x: 'a').A.dtype
+ Out[1]:
CategoricalDtype(categories=['a', 'b'], ordered=False)
*pandas 1.0.0*
@@ -657,9 +670,10 @@ depending on how the results are cast back to the original dtype.
*pandas 0.25.x*
-.. code-block:: python
+.. code-block:: ipython
- >>> df.resample("2D").agg(lambda x: 'c')
+    In [1]: df.resample("2D").agg(lambda x: 'c')
+ Out[1]:
A
0 NaN
@@ -871,10 +885,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`.
To use pandas formatters with a matplotlib plot, specify
-.. code-block:: python
+.. code-block:: ipython
- >>> import pandas as pd
- >>> pd.options.plotting.matplotlib.register_converters = True
+ In [1]: import pandas as pd
+ In [2]: pd.options.plotting.matplotlib.register_converters = True
Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters
automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot``
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
index bfe30d52e2aff..34e28eab6d4bf 100644
--- a/doc/source/whatsnew/v1.2.1.rst
+++ b/doc/source/whatsnew/v1.2.1.rst
@@ -52,20 +52,23 @@ DataFrame / Series combination) would ignore the indices, only match
the inputs by shape, and use the index/columns of the first DataFrame for
the result:
-.. code-block:: python
+.. code-block:: ipython
- >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1])
- ... df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2])
- >>> df1
+ In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1])
+ In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2])
+ In [3]: df1
+ Out[3]:
a b
0 1 3
1 2 4
- >>> df2
+ In [4]: df2
+ Out[4]:
a b
1 1 3
2 2 4
- >>> np.add(df1, df2)
+ In [5]: np.add(df1, df2)
+ Out[5]:
a b
0 2 6
1 4 8
@@ -73,9 +76,10 @@ the result:
This contrasts with how other pandas operations work, which first align
the inputs:
-.. code-block:: python
+.. code-block:: ipython
- >>> df1 + df2
+ In [6]: df1 + df2
+ Out[6]:
a b
0 NaN NaN
1 3.0 7.0
@@ -94,9 +98,10 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so.
To avoid the warning and keep the current behaviour of ignoring the indices,
convert one of the arguments to a NumPy array:
-.. code-block:: python
+.. code-block:: ipython
- >>> np.add(df1, np.asarray(df2))
+ In [7]: np.add(df1, np.asarray(df2))
+ Out[7]:
a b
0 2 6
1 4 8
@@ -104,10 +109,11 @@ convert one of the arguments to a NumPy array:
To obtain the future behaviour and silence the warning, you can align manually
before passing the arguments to the ufunc:
-.. code-block:: python
+.. code-block:: ipython
- >>> df1, df2 = df1.align(df2)
- >>> np.add(df1, df2)
+ In [8]: df1, df2 = df1.align(df2)
+ In [9]: np.add(df1, df2)
+ Out[9]:
a b
0 NaN NaN
1 3.0 7.0
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 59ec974aab0a4..ed66861efad93 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -1233,4 +1233,4 @@ Other
Contributors
~~~~~~~~~~~~
-.. contributors:: v1.2.5..v1.3.0|HEAD
+.. contributors:: v1.2.5..v1.3.0
diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst
new file mode 100644
index 0000000000000..0297aeecf01a6
--- /dev/null
+++ b/doc/source/whatsnew/v1.3.1.rst
@@ -0,0 +1,51 @@
+.. _whatsnew_131:
+
+What's new in 1.3.1 (July 25, 2021)
+-----------------------------------
+
+These are the changes in pandas 1.3.1. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_131.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Pandas could not be built on PyPy (:issue:`42355`)
+- :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`)
+- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42248`)
+- Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`)
+- Fixed regression in :meth:`DataFrame.astype` changing the order of noncontiguous data (:issue:`42396`)
+- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`)
+- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when the ``orient`` argument is one of "records", "dict", or "split" (:issue:`42352`)
+- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
+- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
+- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`)
+- Bug in :class:`Series` constructor not accepting a ``dask.Array`` (:issue:`38645`)
+- Fixed regression for ``SettingWithCopyWarning`` displaying incorrect stacklevel (:issue:`42570`)
+- Fixed regression for :func:`merge_asof` raising ``KeyError`` when one of the ``by`` columns is in the index (:issue:`34488`)
+- Fixed regression in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`)
+- Fixed regression in :meth:`SeriesGroupBy.value_counts` that resulted in an ``IndexError`` when called on a Series with one row (:issue:`42618`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_131.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Fixed bug in :meth:`DataFrame.transpose` dropping values when the DataFrame had an Extension Array dtype and a duplicate index (:issue:`42380`)
+- Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`)
+- Fixed bug in :meth:`.Styler.set_sticky` not handling index names correctly in the single-index columns case (:issue:`42537`)
+- Fixed bug in :meth:`DataFrame.copy` failing to consolidate blocks in the result (:issue:`42579`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_131.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.3.0..v1.3.1|HEAD
diff --git a/environment.yml b/environment.yml
index 2c06c321fdbc4..500b8148a94d8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -99,7 +99,7 @@ dependencies:
- xlwt
- odfpy
- - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
+ - fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- python-snappy # required by pyarrow
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index 6c1ca3deba047..cfd4695a5335b 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -569,7 +569,12 @@ cdef class BlockManager:
public bint _known_consolidated, _is_consolidated
public ndarray _blknos, _blklocs
- def __cinit__(self, blocks, axes, verify_integrity=True):
+ def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
+ # None as defaults for unpickling GH#42345
+ if blocks is None:
+ # This adds 1-2 microseconds to DataFrame(np.array([]))
+ return
+
if isinstance(blocks, list):
# Backward compat for e.g. pyarrow
blocks = tuple(blocks)
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 077d2e60cc3a4..80b5954fa0d91 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -51,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ...
def is_float_array(values: np.ndarray, skipna: bool = False): ...
def is_integer_array(values: np.ndarray, skipna: bool = False): ...
def is_bool_array(values: np.ndarray, skipna: bool = False): ...
+def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ...
def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ...
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 95e4a58bcb3c8..5ad686494f238 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2979,6 +2979,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray:
return result
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
+ cdef:
+ Py_ssize_t i, n = len(keys)
+ object val
+ ndarray[object] output = np.empty(n, dtype='O')
+
+ if n == 0:
+ # kludge, for Series
+ return np.empty(0, dtype='f8')
+
+ for i in range(n):
+ val = keys[i]
+ if val in mapping:
+ output[i] = mapping[val]
+ else:
+ output[i] = default
+
+ return maybe_convert_objects(output)
+
+
def is_bool_list(obj: list) -> bool:
"""
Check if this list contains only bool or np.bool_ objects.
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index 04a6bf48c50c2..c0fca76ef701e 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -272,6 +272,8 @@ Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
}
+#define _PandasHASH_IMAG 1000003UL
+
// replaces _Py_HashDouble with _Pandas_HashDouble
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
@@ -279,7 +281,7 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
return -1;
}
- Py_uhash_t combined = realhash + _PyHASH_IMAG * imaghash;
+ Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
if (combined == (Py_uhash_t)-1) {
return -2;
}
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index c4b9fab28c27e..3a152bd5889b7 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
from pandas.core.arrays import BooleanArray
- result = isin(self._data, values)
+ # algorithms.isin will eventually convert values to an ndarray, so no extra
+ # cost to doing it here first
+ values_arr = np.asarray(values)
+ result = isin(self._data, values_arr)
+
if self._hasna:
- if libmissing.NA in values:
- result += self._mask
- else:
- result *= np.invert(self._mask)
+ values_have_NA = is_object_dtype(values_arr.dtype) and any(
+ val is self.dtype.na_value for val in values_arr
+ )
+
+ # For now, NA does not propagate so set result according to presence of NA,
+ # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
+ result[self._mask] = values_have_NA
+
mask = np.zeros_like(self, dtype=bool)
return BooleanArray(result, mask, copy=False)
diff --git a/pandas/core/common.py b/pandas/core/common.py
index ebe5dd8568418..3a3af87ff788e 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -143,7 +143,11 @@ def is_bool_indexer(key: Any) -> bool:
return True
elif isinstance(key, list):
# check if np.array(key).dtype would be bool
- return len(key) > 0 and lib.is_bool_list(key)
+ if len(key) > 0:
+ if type(key) is not list:
+ # GH#42461 cython will raise TypeError if we pass a subclass
+ key = list(key)
+ return lib.is_bool_list(key)
return False
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 7e7205d1351b3..68d7f6c6f8a22 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -560,8 +560,11 @@ def sanitize_array(
raise TypeError(f"'{type(data).__name__}' type is unordered")
# materialize e.g. generators, convert e.g. tuples, abc.ValueView
- # TODO: non-standard array-likes we can convert to ndarray more efficiently?
- data = list(data)
+ if hasattr(data, "__array__"):
+ # e.g. dask array GH#38645
+ data = np.asarray(data)
+ else:
+ data = list(data)
if dtype is not None or len(data) == 0:
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 52254ff4cdb9b..de725b74be031 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -58,7 +58,6 @@
is_complex_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float,
@@ -93,8 +92,6 @@
)
if TYPE_CHECKING:
- from typing import Literal
-
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
@@ -182,9 +179,7 @@ def maybe_box_native(value: Scalar) -> Scalar:
-------
scalar or Series
"""
- if is_datetime_or_timedelta_dtype(value):
- value = maybe_box_datetimelike(value)
- elif is_float(value):
+ if is_float(value):
# error: Argument 1 to "float" has incompatible type
# "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
# expected "Union[SupportsFloat, _SupportsIndex, str]"
@@ -196,6 +191,8 @@ def maybe_box_native(value: Scalar) -> Scalar:
value = int(value) # type: ignore[arg-type]
elif is_bool(value):
value = bool(value)
+ elif isinstance(value, (np.datetime64, np.timedelta64)):
+ value = maybe_box_datetimelike(value)
return value
@@ -781,6 +778,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj,
return dtype, val
+def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
+ """
+ Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
+
+ Parameters
+ ----------
+ d: dict-like object
+
+ Returns
+ -------
+ dict
+ """
+ return {maybe_box_datetimelike(key): value for key, value in d.items()}
+
+
def infer_dtype_from_array(
arr, pandas_dtype: bool = False
) -> tuple[DtypeObj, ArrayLike]:
@@ -1079,14 +1091,11 @@ def astype_nansafe(
The dtype was a datetime64/timedelta64 dtype, but it had no unit.
"""
if arr.ndim > 1:
- # Make sure we are doing non-copy ravel and reshape.
- flags = arr.flags
- flat = arr.ravel("K")
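+        # GH#42396: ravel() and reshape() both default to C order, so values
+        # round-trip correctly even when the original array is not contiguous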
+ flat = arr.ravel()
result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna)
- order: Literal["C", "F"] = "F" if flags.f_contiguous else "C"
# error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
# attribute "reshape"
- return result.reshape(arr.shape, order=order) # type: ignore[union-attr]
+ return result.reshape(arr.shape) # type: ignore[union-attr]
# We get here with 0-dim from sparse
arr = np.atleast_1d(arr)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 954ea24d0d8fc..21e90e5bc507d 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3344,8 +3344,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
values = self.values
new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
- result = self._constructor(
- dict(zip(self.index, new_values)), index=self.columns
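+        # GH#42380: build via _from_arrays so duplicate labels in self.index are
+        # not collapsed by constructing an intermediate dict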
+ result = type(self)._from_arrays(
+ new_values, index=self.columns, columns=self.index
)
else:
@@ -3754,7 +3754,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
# try to set first as we want an invalid
# value exception to occur first
if len(self):
- self._check_setitem_copy()
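+            # GH#42570: pass stacklevel so SettingWithCopyWarning points at user code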
+ self._check_setitem_copy(stacklevel=5)
def _iset_item(self, loc: int, value) -> None:
arraylike = self._sanitize_column(value)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 18c84d9aa88bf..97e8832a8de43 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -759,7 +759,7 @@ def apply_series_value_counts():
# new values are where sorted labels change
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
inc = np.r_[True, lchanges]
- if not len(lchanges):
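+        # GH#42618: fall back to the (empty) lchanges only when val itself is empty;
+        # for a single row, lchanges is empty but inc must keep its leading True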
+ if not len(val):
inc = lchanges
inc[idx] = True # group boundaries are also new values
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 9642b30ab91ca..34d0137c26fda 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -123,12 +123,16 @@ def concat_arrays(to_concat: list) -> ArrayLike:
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
- single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1
+ dtypes = {x.dtype for x in to_concat_no_proxy}
+ single_dtype = len(dtypes) == 1
- if not single_dtype:
- target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
- else:
+ if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
+ elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
+ # GH#42092
+ target_dtype = np.find_common_type(list(dtypes), [])
+ else:
+ target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
if target_dtype.kind in ["m", "M"]:
# for datetimelike use DatetimeArray/TimedeltaArray concatenation
@@ -592,6 +596,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
all(
is_dtype_equal(ju.block.dtype, join_units[0].block.dtype)
+ # GH#42092 we only want the dtype_equal check for non-numeric blocks
+ # (for now, may change but that would need a deprecation)
+ or ju.block.dtype.kind in ["b", "i", "u"]
for ju in join_units
)
and
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 7bef7ae9b39d7..5e327adfb8905 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -26,6 +26,7 @@
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
+ dict_compat,
maybe_cast_to_datetime,
maybe_convert_platform,
maybe_infer_to_datetimelike,
@@ -59,7 +60,6 @@
TimedeltaArray,
)
from pandas.core.construction import (
- create_series_with_explicit_dtype,
ensure_wrapped_if_datetimelike,
extract_array,
range_to_ndarray,
@@ -67,7 +67,9 @@
)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
+ DatetimeIndex,
Index,
+ TimedeltaIndex,
ensure_index,
get_objs_combined_axis,
union_indexes,
@@ -556,6 +558,7 @@ def convert(v):
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
+ oindex = None
homogenized = []
for val in data:
@@ -570,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
val = val._values
else:
if isinstance(val, dict):
- # see test_constructor_subclass_dict
- # test_constructor_dict_datetime64_index
- val = create_series_with_explicit_dtype(val, index=index)._values
+ # GH#41785 this _should_ be equivalent to (but faster than)
+ # val = create_series_with_explicit_dtype(val, index=index)._values
+ if oindex is None:
+ oindex = index.astype("O")
+
+ if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
+ # see test_constructor_dict_datetime64_index
+ val = dict_compat(val)
+ else:
+ # see test_constructor_subclass_dict
+ val = dict(val)
+ val = lib.fast_multiget(val, oindex._values, default=np.nan)
val = sanitize_array(
val, index, dtype=dtype, copy=False, raise_cast_failure=False
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index cc07caac31c0c..1c4fb98ee16bb 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -594,6 +594,9 @@ def copy_func(ax):
res = self.apply("copy", deep=deep)
res.axes = new_axes
+
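+        # GH#42579: a deep copy should return consolidated blocks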
+ if deep:
+ res._consolidate_inplace()
return res
def consolidate(self: T) -> T:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 143999a4677b3..2991484ec7e06 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1775,16 +1775,26 @@ def _validate_specification(self) -> None:
raise MergeError("missing right_by")
# GH#29130 Check that merge keys do not have dtype object
- lo_dtype = (
- self.left[self.left_on[0]].dtype
- if not self.left_index
- else self.left.index.dtype
- )
- ro_dtype = (
- self.right[self.right_on[0]].dtype
- if not self.right_index
- else self.right.index.dtype
- )
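+        # GH#34488: the merge key may be a column label or an index level name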
+ if not self.left_index:
+ left_on = self.left_on[0]
+ lo_dtype = (
+ self.left[left_on].dtype
+ if left_on in self.left.columns
+ else self.left.index.get_level_values(left_on)
+ )
+ else:
+ lo_dtype = self.left.index.dtype
+
+ if not self.right_index:
+ right_on = self.right_on[0]
+ ro_dtype = (
+ self.right[right_on].dtype
+ if right_on in self.right.columns
+ else self.right.index.get_level_values(right_on)
+ )
+ else:
+ ro_dtype = self.right.index.dtype
+
if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
raise MergeError(
f"Incompatible merge dtype, {repr(ro_dtype)} and "
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 0edb150bdc273..9203cf625d43a 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -399,7 +399,8 @@ def _unstack_multiple(data, clocs, fill_value=None):
return result
- dummy = data.copy()
+ # GH#42579 deep=False to avoid consolidating
+ dummy = data.copy(deep=False)
dummy.index = dummy_index
unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 43738831981d2..2c191a2cc9a68 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -5074,9 +5074,9 @@ def between(self, left, right, inclusive="both") -> Series:
4 False
dtype: bool
- With `inclusive` set to ``False`` boundary values are excluded:
+ With `inclusive` set to ``"neither"`` boundary values are excluded:
- >>> s.between(1, 4, inclusive=False)
+ >>> s.between(1, 4, inclusive="neither")
0 True
1 False
2 False
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 014a702618bda..7414fdf190936 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -194,9 +194,9 @@ def _maybe_cache(
if len(unique_dates) < len(arg):
cache_dates = convert_listlike(unique_dates, format)
cache_array = Series(cache_dates, index=unique_dates)
- if not cache_array.is_unique:
- # GH#39882 in case of None and NaT we get duplicates
- cache_array = cache_array.drop_duplicates()
+ # GH#39882 and GH#35888 in case of None and NaT we get duplicates
+ if not cache_array.index.is_unique:
+ cache_array = cache_array[~cache_array.index.duplicated()]
return cache_array
diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py
index df69553a74683..b80a73a930818 100644
--- a/pandas/core/window/doc.py
+++ b/pandas/core/window/doc.py
@@ -94,8 +94,8 @@ def create_section_header(header: str) -> str:
).replace("\n", "", 1)
numba_notes = (
- "See :ref:`window.numba_engine` for extended documentation "
- "and performance considerations for the Numba engine.\n\n"
+ "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
+ "extended documentation and performance considerations for the Numba engine.\n\n"
)
window_agg_numba_parameters = dedent(
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 0360b0f9307c5..660ca9c58a5b0 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -703,7 +703,7 @@ def to_latex(
>>> df = pd.DataFrame([[1]])
>>> df.style.set_properties(
... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"}
- ... ).to_latex(css_convert=True)
+ ... ).to_latex(convert_css=True)
\begin{tabular}{lr}
{} & {0} \\
0 & {\bfseries}{\Huge{1}} \\
@@ -1452,7 +1452,8 @@ def set_sticky(
Whether to make the index or column headers sticky.
pixel_size : int, optional
Required to configure the width of index cells or the height of column
- header cells when sticking a MultiIndex. Defaults to 75 and 25 respectively.
+ header cells when sticking a MultiIndex (or with a named Index).
+ Defaults to 75 and 25 respectively.
levels : list of int
If ``axis`` is a MultiIndex the specific levels to stick. If ``None`` will
stick all levels.
@@ -1460,6 +1461,16 @@ def set_sticky(
Returns
-------
self : Styler
+
+ Notes
+ -----
+ This method uses the CSS 'position: sticky;' property to display. It is
+ designed to work with visible axes, therefore both:
+
+ - `styler.set_sticky(axis="index").hide_index()`
+ - `styler.set_sticky(axis="columns").hide_columns()`
+
+ may produce strange behaviour due to CSS controls with missing elements.
"""
if axis in [0, "index"]:
axis, obj, tag, pos = 0, self.data.index, "tbody", "left"
@@ -1471,15 +1482,42 @@ def set_sticky(
raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}")
if not isinstance(obj, pd.MultiIndex):
- return self.set_table_styles(
- [
+ # handling MultiIndexes requires different CSS
+ props = "position:sticky; background-color:white;"
+
+ if axis == 1:
+                # stick the first <tr> of <thead> and, if index names, the second <tr>
+                # if self._hide_columns then no <thead><tr> here will exist: no conflict
+ styles: CSSStyles = [
{
- "selector": f"{tag} th",
- "props": f"position:sticky; {pos}:0px; background-color:white;",
+ "selector": "thead tr:first-child",
+ "props": props + "top:0px; z-index:2;",
}
- ],
- overwrite=False,
- )
+ ]
+ if not self.index.names[0] is None:
+ styles[0]["props"] = (
+ props + f"top:0px; z-index:2; height:{pixel_size}px;"
+ )
+ styles.append(
+ {
+ "selector": "thead tr:nth-child(2)",
+ "props": props
+ + f"top:{pixel_size}px; z-index:2; height:{pixel_size}px; ",
+ }
+ )
+ else:
+                # stick the first <th> of each <tr> in both <thead> and <tbody>
+                # if self._hide_index then no <th> will exist in <tbody>: no conflict
+                # but <th> will exist in <thead>: conflict with initial element
+ styles = [
+ {
+ "selector": "tr th:first-child",
+ "props": props + "left:0px; z-index:1;",
+ }
+ ]
+
+ return self.set_table_styles(styles, overwrite=False)
+
else:
range_idx = list(range(obj.nlevels))
diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index 5be6ae0382d87..c8454e88970fa 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -195,14 +195,18 @@ def handle_indexes(self) -> None:
This method will add indexes into attr_cols or elem_cols.
"""
+ if not self.index:
+ return
+
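+        # GH#42458: frame_dicts is keyed by the original index labels, which need
+        # not start at 0, so look up the first key instead of assuming 0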
+ first_key = next(iter(self.frame_dicts))
indexes: list[str] = [
- x for x in self.frame_dicts[0].keys() if x not in self.orig_cols
+ x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
]
- if self.attr_cols and self.index:
+ if self.attr_cols:
self.attr_cols = indexes + self.attr_cols
- if self.elem_cols and self.index:
+ if self.elem_cols:
self.elem_cols = indexes + self.elem_cols
def get_prefix_uri(self) -> str:
@@ -307,7 +311,7 @@ def build_tree(self) -> bytes:
self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(self.frame_dicts[0].keys())
+ self.elem_cols = list(self.d.keys())
self.build_elems()
else:
@@ -477,7 +481,7 @@ def build_tree(self) -> bytes:
self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(self.frame_dicts[0].keys())
+ self.elem_cols = list(self.d.keys())
self.build_elems()
else:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 4eb42640d9b70..20e035891a625 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1673,12 +1673,7 @@ def read(
if self.dtyplist[i] is not None:
col = data.columns[i]
dtype = data[col].dtype
- # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
- # "object"
- if (
- dtype != np.dtype(object) # type: ignore[type-var]
- and dtype != self.dtyplist[i]
- ):
+ if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
requires_type_conversion = True
data_formatted.append(
(col, Series(data[col], ix, self.dtyplist[i]))
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 14266a2c29a7f..995f404dc49d3 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -53,6 +53,17 @@ def test_apply_axis1_with_ea():
tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+ "data, dtype",
+ [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
+)
+def test_agg_axis1_duplicate_index(data, dtype):
+ # GH 42380
+ expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
+ result = expected.agg(lambda x: x, axis=1)
+ tm.assert_frame_equal(result, expected)
+
+
def test_apply_mixed_datetimelike():
# mixed datetimelike
# GH 7778
diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py
index 5ba278368834c..246f33d27476c 100644
--- a/pandas/tests/base/test_transpose.py
+++ b/pandas/tests/base/test_transpose.py
@@ -1,6 +1,10 @@
import numpy as np
import pytest
+from pandas import (
+ CategoricalDtype,
+ DataFrame,
+)
import pandas._testing as tm
@@ -25,3 +29,28 @@ def test_numpy_transpose(index_or_series_obj):
with pytest.raises(ValueError, match=msg):
np.transpose(obj, axes=1)
+
+
+@pytest.mark.parametrize(
+ "data, transposed_data, index, columns, dtype",
+ [
+ ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
+ ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
+ ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
+ ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
+ ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
+ (
+ [[1, 2], [3, 4]],
+ [[1, 3], [2, 4]],
+ ["a", "a"],
+ ["b", "b"],
+ CategoricalDtype([1, 2, 3, 4]),
+ ),
+ ],
+)
+def test_duplicate_labels(data, transposed_data, index, columns, dtype):
+ # GH 42380
+ df = DataFrame(data, index=index, columns=columns, dtype=dtype)
+ result = df.T
+ expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py
new file mode 100644
index 0000000000000..13dc82d779f95
--- /dev/null
+++ b/pandas/tests/dtypes/cast/test_dict_compat.py
@@ -0,0 +1,14 @@
+import numpy as np
+
+from pandas.core.dtypes.cast import dict_compat
+
+from pandas import Timestamp
+
+
+def test_dict_compat():
+ data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2}
+ data_unchanged = {1: 2, 3: 4, 5: 6}
+ expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2}
+ assert dict_compat(data_datetime64) == expected
+ assert dict_compat(expected) == expected
+ assert dict_compat(data_unchanged) == data_unchanged
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index f098582ca04c6..1f1991214aad0 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -670,6 +670,26 @@ def test_astype_bytes(self):
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
assert result.dtypes[0] == np.dtype("S3")
+ @pytest.mark.parametrize(
+ "index_slice",
+ [
+ np.s_[:2, :2],
+ np.s_[:1, :2],
+ np.s_[:2, :1],
+ np.s_[::2, ::2],
+ np.s_[::1, ::2],
+ np.s_[::2, ::1],
+ ],
+ )
+ def test_astype_noncontiguous(self, index_slice):
+ # GH#42396
+ data = np.arange(16).reshape(4, 4)
+ df = DataFrame(data)
+
+ result = df.iloc[index_slice].astype("int16")
+ expected = df.iloc[index_slice]
+ tm.assert_frame_equal(result, expected, check_dtype=False)
+
class TestAstypeCategorical:
def test_astype_from_categorical3(self):
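The new test above pins down GH#42396: astype on a non-contiguous slice must keep the values aligned with the source frame. A hedged standalone sketch of the same check, with arbitrary data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(16).reshape(4, 4))
    sliced = df.iloc[::2, ::2]  # non-contiguous selection of rows and columns

    # The cast should only change the dtype, not reorder or corrupt the values.
    result = sliced.astype("int16")
    assert result.to_numpy().tolist() == sliced.to_numpy().tolist()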
diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py
index be52cf55fccb2..1c0b0755e7d94 100644
--- a/pandas/tests/frame/methods/test_copy.py
+++ b/pandas/tests/frame/methods/test_copy.py
@@ -1,5 +1,8 @@
+import numpy as np
import pytest
+import pandas.util._test_decorators as td
+
from pandas import DataFrame
import pandas._testing as tm
@@ -41,3 +44,20 @@ def test_copy(self, float_frame, float_string_frame):
# copy objects
copy = float_string_frame.copy()
assert copy._mgr is not float_string_frame._mgr
+
+ @td.skip_array_manager_invalid_test
+ def test_copy_consolidates(self):
+ # GH#42477
+ df = DataFrame(
+ {
+ "a": np.random.randint(0, 100, size=55),
+ "b": np.random.randint(0, 100, size=55),
+ }
+ )
+
+ for i in range(0, 10):
+ df.loc[:, f"n_{i}"] = np.random.randint(0, 100, size=55)
+
+ assert len(df._mgr.blocks) == 11
+ result = df.copy()
+ assert len(result._mgr.blocks) == 1
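A short sketch of the behaviour the new test asserts (GH#42477), assuming the default BlockManager backend; the sizes are arbitrary and _mgr is internal API, used here only to observe block counts:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(10), "b": np.arange(10)})
    for i in range(10):
        df.loc[:, f"n_{i}"] = np.arange(10)  # each insertion lands in its own block

    print(len(df._mgr.blocks))           # 11: one original block plus 10 inserts
    print(len(df.copy()._mgr.blocks))    # 1: copy(deep=True) consolidates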
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index 8bb07b7163f2e..54f672cb69800 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -122,23 +122,27 @@ def test_series_groupby_value_counts_with_grouper():
tm.assert_series_equal(result, expected)
-def test_series_groupby_value_counts_empty():
+@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
+def test_series_groupby_value_counts_empty(columns):
# GH39172
- df = DataFrame(columns=["A", "B"])
- dfg = df.groupby("A")
+ df = DataFrame(columns=columns)
+ dfg = df.groupby(columns[:-1])
- result = dfg["B"].value_counts()
- expected = Series([], name="B", dtype=result.dtype)
- expected.index = MultiIndex.from_arrays([[]] * 2, names=["A", "B"])
+ result = dfg[columns[-1]].value_counts()
+ expected = Series([], name=columns[-1], dtype=result.dtype)
+ expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)
tm.assert_series_equal(result, expected)
- df = DataFrame(columns=["A", "B", "C"])
- dfg = df.groupby(["A", "B"])
- result = dfg["C"].value_counts()
- expected = Series([], name="C", dtype=result.dtype)
- expected.index = MultiIndex.from_arrays([[]] * 3, names=["A", "B", "C"])
+@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
+def test_series_groupby_value_counts_one_row(columns):
+ # GH42618
+ df = DataFrame(data=[range(len(columns))], columns=columns)
+ dfg = df.groupby(columns[:-1])
+
+ result = dfg[columns[-1]].value_counts()
+ expected = df.value_counts().rename(columns[-1])
tm.assert_series_equal(result, expected)
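A hedged sketch of the single-row case covered by the new test (GH#42618): SeriesGroupBy.value_counts should agree with DataFrame.value_counts even when the frame has only one row:

    import pandas as pd

    df = pd.DataFrame([[0, 1]], columns=["A", "B"])

    result = df.groupby("A")["B"].value_counts()
    expected = df.value_counts().rename("B")
    pd.testing.assert_series_equal(result, expected)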
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index af781f0b58f85..826649358e663 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -12,7 +12,6 @@
from pandas.compat import (
IS64,
- PY310,
np_datetime64_compat,
)
from pandas.util._test_decorators import async_mark
@@ -1004,24 +1003,6 @@ def test_isin_nan_common_object(self, request, nulls_fixture, nulls_fixture2):
and math.isnan(nulls_fixture)
and math.isnan(nulls_fixture2)
):
- if PY310:
- if (
- # Failing cases are
- # np.nan, float('nan')
- # float('nan'), np.nan
- # float('nan'), float('nan')
- # Since only float('nan'), np.nan is float
- # Use not np.nan to identify float('nan')
- nulls_fixture is np.nan
- and nulls_fixture2 is not np.nan
- or nulls_fixture is not np.nan
- ):
- request.applymarker(
- # This test is flaky :(
- pytest.mark.xfail(
- reason="Failing on Python 3.10 GH41940", strict=False
- )
- )
tm.assert_numpy_array_equal(
Index(["a", nulls_fixture]).isin([nulls_fixture2]),
np.array([False, True]),
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
index a38c652953fab..90bda69eaf139 100644
--- a/pandas/tests/indexing/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -435,6 +435,16 @@ def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
)
tm.assert_frame_equal(df, expected)
+ @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})])
+ def test_detect_chained_assignment_warning_stacklevel(self, rhs):
+ # GH#42570
+ df = DataFrame(np.arange(25).reshape(5, 5))
+ chained = df.loc[:3]
+ with option_context("chained_assignment", "warn"):
+ with tm.assert_produces_warning(com.SettingWithCopyWarning) as t:
+ chained[2] = rhs
+ assert t[0].filename == __file__
+
# TODO(ArrayManager) fast_xs with array-like scalars is not yet working
@td.skip_array_manager_not_yet_implemented
def test_chained_getitem_with_lists(self):
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 0f4a30cfa9cf9..38a6209283080 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -461,6 +461,9 @@ def test_copy(self, mgr):
# DatetimeTZBlock has DatetimeIndex values
assert cp_blk.values._data.base is blk.values._data.base
+ # copy(deep=True) consolidates, so the block-wise assertions will
+ # fail if mgr is not consolidated
+ mgr._consolidate_inplace()
cp = mgr.copy(deep=True)
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
diff --git a/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl b/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl
new file mode 100644
index 0000000000000..255a745dd9021
Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/1.2.4/empty_frame_v1_2_4-GH#42345.pkl differ
diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py
index 495dc82f0e7bd..4e71cb4c46626 100644
--- a/pandas/tests/io/formats/style/test_html.py
+++ b/pandas/tests/io/formats/style/test_html.py
@@ -272,17 +272,35 @@ def test_caption_as_sequence(styler):
@pytest.mark.parametrize("index", [False, True])
@pytest.mark.parametrize("columns", [False, True])
-def test_sticky_basic(styler, index, columns):
+@pytest.mark.parametrize("index_name", [True, False])
+def test_sticky_basic(styler, index, columns, index_name):
+ if index_name:
+ styler.index.name = "some text"
if index:
styler.set_sticky(axis=0)
if columns:
styler.set_sticky(axis=1)
res = styler.set_uuid("").to_html()
- cs1 = "tbody th {\n position: sticky;\n left: 0px;\n background-color: white;\n}"
- assert (cs1 in res) is index
- cs2 = "thead th {\n position: sticky;\n top: 0px;\n background-color: white;\n}"
- assert (cs2 in res) is columns
+
+ css_for_index = (
+ "tr th:first-child {\n position: sticky;\n background-color: white;\n "
+ "left: 0px;\n z-index: 1;\n}"
+ )
+ assert (css_for_index in res) is index
+
+ css_for_cols_1 = (
+ "thead tr:first-child {\n position: sticky;\n background-color: white;\n "
+ "top: 0px;\n z-index: 2;\n"
+ )
+ css_for_cols_1 += " height: 25px;\n}" if index_name else "}"
+ assert (css_for_cols_1 in res) is columns
+
+ css_for_cols_2 = (
+ "thead tr:nth-child(2) {\n position: sticky;\n background-color: white;\n "
+ "top: 25px;\n z-index: 2;\n height: 25px;\n}"
+ )
+ assert (css_for_cols_2 in res) is (index_name and columns)
@pytest.mark.parametrize("index", [False, True])
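The reworked assertions reflect the CSS that Styler.set_sticky now emits: a sticky first column for axis=0, sticky header rows for axis=1, and an extra fixed-height header row when the index is named. A hedged usage sketch (column names illustrative; Styler requires jinja2):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    df.index.name = "some text"  # a named index adds the second sticky header row

    html = df.style.set_sticky(axis=0).set_sticky(axis=1).set_uuid("").to_html()
    assert "position: sticky" in html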
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index e78448a2c32d3..86891367e9bd6 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -191,8 +191,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers, request):
buf = StringIO(data)
- with tm.assert_produces_warning(warning_type):
- df = parser.read_csv(buf)
+ try:
+ with tm.assert_produces_warning(warning_type):
+ df = parser.read_csv(buf)
+ except AssertionError as err:
+ # 2021-02-21 this occasionally fails on the CI with an unexpected
+ # ResourceWarning that we have been unable to track down,
+ # see GH#38630
+ if "ResourceWarning" not in str(err) or parser.engine != "python":
+ raise
+
+ # Check the main assertion of the test before re-raising
+ assert df.a.dtype == object
+
+ mark = pytest.mark.xfail(
+ reason="ResourceWarning for unclosed SSL Socket, GH#38630"
+ )
+ request.node.add_marker(mark)
+ raise
assert df.a.dtype == object
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 7cf9d7e9a1925..c4423e6cc4ead 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -163,9 +163,9 @@ def compare_index_period(result, expected, typ, version):
tm.assert_index_equal(result.shift(2), expected.shift(2))
-files = glob.glob(
- os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle")
-)
+here = os.path.dirname(__file__)
+legacy_dirname = os.path.join(here, "data", "legacy_pickle")
+files = glob.glob(os.path.join(legacy_dirname, "*", "*.pickle"))
@pytest.fixture(params=files)
@@ -635,3 +635,13 @@ def test_pickle_big_dataframe_compression(protocol, compression):
partial(pd.read_pickle, compression=compression),
)
tm.assert_frame_equal(df, result)
+
+
+def test_pickle_frame_v124_unpickle_130():
+ # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x
+ path = os.path.join(legacy_dirname, "1.2.4", "empty_frame_v1_2_4-GH#42345.pkl")
+ with open(path, "rb") as fd:
+ df = pickle.load(fd)
+
+ expected = pd.DataFrame()
+ tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index 1e2973075f98e..bdee797f3340c 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -12,7 +12,10 @@
from pandas.compat import PY38
import pandas.util._test_decorators as td
-from pandas import DataFrame
+from pandas import (
+ DataFrame,
+ Index,
+)
import pandas._testing as tm
from pandas.io.common import get_handle
@@ -291,6 +294,45 @@ def test_index_false_rename_row_root(datapath, parser):
assert output == expected
+@pytest.mark.parametrize(
+ "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]]
+)
+def test_index_false_with_offset_input_index(parser, offset_index):
+ """
+ Tests that the output does not contain the `<index>` field when the index of the
+ input DataFrame has an offset.
+
+ This is a regression test for issue #42458.
+ """
+
+ expected = """\
+ <?xml version='1.0' encoding='utf-8'?>
+ <data>
+ <row>
+ <shape>square</shape>
+ <degrees>360</degrees>
+ <sides>4.0</sides>
+ </row>
+ <row>
+ <shape>circle</shape>
+ <degrees>360</degrees>
+ <sides/>
+ </row>
+ <row>
+ <shape>triangle</shape>
+ <degrees>180</degrees>
+ <sides>3.0</sides>
+ </row>
+ </data>"""
+
+ offset_geom_df = geom_df.copy()
+ offset_geom_df.index = Index(offset_index)
+ output = offset_geom_df.to_xml(index=False, parser=parser)
+ output = equalize_decl(output)
+
+ assert output == expected
+
+
# NA_REP
na_expected = """\
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
index 3636139c19eef..91c246fc9ee2d 100644
--- a/pandas/tests/reshape/concat/test_dataframe.py
+++ b/pandas/tests/reshape/concat/test_dataframe.py
@@ -170,3 +170,13 @@ def test_concat_dataframe_keys_bug(self, sort):
# it works
result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
assert list(result.columns) == [("t1", "value"), ("t2", "value")]
+
+ def test_concat_bool_with_int(self):
+ # GH#42092 we may want to change this to return object, but that
+ # would need a deprecation
+ df1 = DataFrame(Series([True, False, True, True], dtype="bool"))
+ df2 = DataFrame(Series([1, 0, 1], dtype="int64"))
+
+ result = concat([df1, df2])
+ expected = concat([df1.astype("int64"), df2])
+ tm.assert_frame_equal(result, expected)
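A hedged sketch of the behaviour the new test freezes (GH#42092): concatenating a bool frame with an int64 frame currently upcasts the booleans to int64 rather than falling back to object:

    import pandas as pd

    df1 = pd.DataFrame(pd.Series([True, False, True, True], dtype="bool"))
    df2 = pd.DataFrame(pd.Series([1, 0, 1], dtype="int64"))

    result = pd.concat([df1, df2])
    print(result.dtypes)  # 0    int64 -- the bools are cast, not coerced to object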
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 6746158179964..310cf2debadc6 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -1437,3 +1437,50 @@ def test_merge_asof_index_behavior(kwargs):
index=index,
)
tm.assert_frame_equal(result, expected)
+
+
+def test_merge_asof_numeric_column_in_index():
+ # GH#34488
+ left = pd.DataFrame({"b": [10, 11, 12]}, index=Index([1, 2, 3], name="a"))
+ right = pd.DataFrame({"c": [20, 21, 22]}, index=Index([0, 2, 3], name="a"))
+
+ result = merge_asof(left, right, left_on="a", right_on="a")
+ expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_merge_asof_numeric_column_in_multiindex():
+ # GH#34488
+ left = pd.DataFrame(
+ {"b": [10, 11, 12]},
+ index=pd.MultiIndex.from_arrays([[1, 2, 3], ["a", "b", "c"]], names=["a", "z"]),
+ )
+ right = pd.DataFrame(
+ {"c": [20, 21, 22]},
+ index=pd.MultiIndex.from_arrays([[1, 2, 3], ["x", "y", "z"]], names=["a", "y"]),
+ )
+
+ result = merge_asof(left, right, left_on="a", right_on="a")
+ expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_merge_asof_numeric_column_in_index_object_dtype():
+ # GH#34488
+ left = pd.DataFrame({"b": [10, 11, 12]}, index=Index(["1", "2", "3"], name="a"))
+ right = pd.DataFrame({"c": [20, 21, 22]}, index=Index(["m", "n", "o"], name="a"))
+
+ with pytest.raises(
+ MergeError,
+ match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
+ ):
+ merge_asof(left, right, left_on="a", right_on="a")
+
+ left = left.reset_index().set_index(["a", "b"])
+ right = right.reset_index().set_index(["a", "c"])
+
+ with pytest.raises(
+ MergeError,
+ match=r"Incompatible merge dtype, .*, both sides must have numeric dtype",
+ ):
+ merge_asof(left, right, left_on="a", right_on="a")
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index d33721c796efa..653ea88ed62ac 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas.compat import PY310
-
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
@@ -430,8 +428,6 @@ def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
result = get_dummies(**get_dummies_kwargs)
tm.assert_frame_equal(result, expected)
- # This is flaky on Python 3.10
- @pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940", strict=False)
def test_get_dummies_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
@@ -471,7 +467,6 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse):
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940", strict=False)
def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py
index 9f6cdbb81bd89..7dfda0463ecaf 100644
--- a/pandas/tests/scalar/timedelta/test_arithmetic.py
+++ b/pandas/tests/scalar/timedelta/test_arithmetic.py
@@ -10,7 +10,6 @@
import numpy as np
import pytest
-from pandas.compat import is_numpy_dev
from pandas.errors import OutOfBoundsTimedelta
import pandas as pd
@@ -18,7 +17,6 @@
NaT,
Timedelta,
Timestamp,
- compat,
offsets,
)
import pandas._testing as tm
@@ -434,15 +432,7 @@ def test_td_div_numeric_scalar(self):
"nan",
[
np.nan,
- pytest.param(
- np.float64("NaN"),
- marks=pytest.mark.xfail(
- # Works on numpy dev only in python 3.9
- is_numpy_dev and not compat.PY39,
- raises=RuntimeWarning,
- reason="https://github.com/pandas-dev/pandas/issues/31992",
- ),
- ),
+ np.float64("NaN"),
float("nan"),
],
)
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 898a769dfac48..d3a3434872826 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values):
expected = Series([True, False])
tm.assert_series_equal(result, expected)
+ @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
+ @pytest.mark.parametrize(
+ "data,values,expected",
+ [
+ ([0, 1, 0], [1], [False, True, False]),
+ ([0, 1, 0], [1, pd.NA], [False, True, False]),
+ ([0, pd.NA, 0], [1, 0], [True, False, True]),
+ ([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
+ ([0, 1, pd.NA], [1, np.nan], [False, True, False]),
+ ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
+ ],
+ )
+ def test_isin_masked_types(self, dtype, data, values, expected):
+ # GH#42405
+ ser = Series(data, dtype=dtype)
+
+ result = ser.isin(values)
+ expected = Series(expected, dtype="boolean")
+
+ tm.assert_series_equal(result, expected)
+
@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
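A hedged sketch of the nullable-dtype behaviour parametrised above (GH#42405): Series.isin on the masked "Int64" dtype returns the "boolean" dtype, and pd.NA matches only pd.NA, not np.nan:

    import numpy as np
    import pandas as pd

    ser = pd.Series([0, 1, pd.NA], dtype="Int64")

    print(ser.isin([1, pd.NA]))   # [False, True, True], dtype: boolean
    print(ser.isin([1, np.nan]))  # [False, True, False]: pd.NA does not match np.nan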
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 490052c793fdc..4df95d895e475 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -9,10 +9,7 @@
algos as libalgos,
hashtable as ht,
)
-from pandas.compat import (
- PY310,
- np_array_datetime64_compat,
-)
+from pandas.compat import np_array_datetime64_compat
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import (
@@ -786,8 +783,6 @@ def test_different_nans(self):
expected = np.array([np.nan])
tm.assert_numpy_array_equal(result, expected)
- # Flaky on Python 3.10 -> Don't make strict
- @pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940", strict=False)
def test_first_nan_kept(self):
# GH 22295
# create different nans from bit-patterns:
@@ -993,8 +988,6 @@ def __hash__(self):
# different objects -> False
tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))
- # Flaky on Python 3.10 -> Don't make strict
- @pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940", strict=False)
def test_different_nans(self):
# GH 22160
# all nans are handled as equivalent
@@ -1037,8 +1030,6 @@ def test_empty(self, empty):
result = algos.isin(vals, empty)
tm.assert_numpy_array_equal(expected, result)
- # Flaky on Python 3.10 -> Don't make strict
- @pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940", strict=False)
def test_different_nan_objects(self):
# GH 22119
comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 93c95b3004876..12664e4463343 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -165,3 +165,28 @@ def test_non_bool_array_with_na(self):
# in particular, this should not raise
arr = np.array(["A", "B", np.nan], dtype=object)
assert not com.is_bool_indexer(arr)
+
+ def test_list_subclass(self):
+ # GH#42433
+
+ class MyList(list):
+ pass
+
+ val = MyList(["a"])
+
+ assert not com.is_bool_indexer(val)
+
+ val = MyList([True])
+ assert com.is_bool_indexer(val)
+
+ def test_frozenlist(self):
+ # GH#42461
+ data = {"col1": [1, 2], "col2": [3, 4]}
+ df = pd.DataFrame(data=data)
+
+ frozen = df.index.names[1:]
+ assert not com.is_bool_indexer(frozen)
+
+ result = df[frozen]
+ expected = df[[]]
+ tm.assert_frame_equal(result, expected)
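A hedged sketch of the FrozenList case added above (GH#42461): slices of df.index.names are list subclasses and must not be mistaken for boolean masks when used to select columns:

    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    frozen = df.index.names[1:]  # empty FrozenList for a single-level index

    # Selecting with it should yield an empty column selection, not a
    # boolean-index length-mismatch error.
    result = df[frozen]
    print(result.shape)  # (2, 0)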
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 121ca99785831..9da7951c199ca 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -957,18 +957,40 @@ def test_to_datetime_cache_scalar(self):
expected = Timestamp("20130101 00:00:00")
assert result == expected
- def test_convert_object_to_datetime_with_cache(self):
+ @pytest.mark.parametrize(
+ "datetimelikes,expected_values",
+ (
+ (
+ (None, np.nan) + (NaT,) * start_caching_at,
+ (NaT,) * (start_caching_at + 2),
+ ),
+ (
+ (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at,
+ (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at,
+ ),
+ (
+ (None,)
+ + (NaT,) * start_caching_at
+ + ("2012 July 26", Timestamp("2012-07-26")),
+ (NaT,) * (start_caching_at + 1)
+ + (Timestamp("2012-07-26"), Timestamp("2012-07-26")),
+ ),
+ ),
+ )
+ def test_convert_object_to_datetime_with_cache(
+ self, datetimelikes, expected_values
+ ):
# GH#39882
ser = Series(
- [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")],
+ datetimelikes,
dtype="object",
)
- result = to_datetime(ser, errors="coerce")
- expected = Series(
- [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")],
+ result_series = to_datetime(ser, errors="coerce")
+ expected_series = Series(
+ expected_values,
dtype="datetime64[ns]",
)
- tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result_series, expected_series)
@pytest.mark.parametrize(
"date, format",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index a0d4c8e02acf6..712c9a4b89273 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -64,7 +64,7 @@ xlrd
xlsxwriter
xlwt
odfpy
-fastparquet>=0.3.2
+fastparquet>=0.3.2, <0.7.0
pyarrow>=0.17.0
python-snappy
pyqt5>=5.9.2
|