From 907235dca56779f37e2155095d361ad9458055c7 Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke@gmail.com>
Date: Thu, 5 Mar 2020 11:38:51 +0100
Subject: [PATCH] Vectorize patch extraction in Axes3D.plot_surface

When rstride and cstride divide the numbers of rows and columns minus 1,
all polygons have the same number of vertices which can be recovered
using some stride tricks on NumPy arrays. On a toy benchmark of a
800x800 grid this allows to reduce the time it takes to produce this
list of polygons from ~10s to ~60ms.
---
 lib/matplotlib/cbook/__init__.py   | 101 +++++++++++++++++++++++++++++
 lib/matplotlib/tests/test_cbook.py |  28 ++++++++
 lib/mpl_toolkits/mplot3d/axes3d.py |  62 ++++++++++++------
 3 files changed, 171 insertions(+), 20 deletions(-)

diff --git a/lib/matplotlib/cbook/__init__.py b/lib/matplotlib/cbook/__init__.py
index 5c9a419ed353..89ad5080fdd1 100644
--- a/lib/matplotlib/cbook/__init__.py
+++ b/lib/matplotlib/cbook/__init__.py
@@ -2002,6 +2002,107 @@ def _array_perimeter(arr):
     ))
 
 
+def _unfold(arr, axis, size, step):
+    """
+    Append an extra dimension containing sliding windows along *axis*.
+
+    All windows are of size *size* and begin with every *step* elements.
+
+    Parameters
+    ----------
+    arr : ndarray, shape (N_1, ..., N_k)
+        The input array
+    axis : int
+        Axis along which the windows are extracted
+    size : int
+        Size of the windows
+    step : int
+        Stride between first elements of subsequent windows.
+
+    Returns
+    -------
+    windows : ndarray, shape (N_1, ..., 1 + (N_axis-size)/step, ..., N_k, size)
+
+    Examples
+    --------
+    >>> i, j = np.ogrid[:3,:7]
+    >>> a = i*10 + j
+    >>> a
+    array([[ 0,  1,  2,  3,  4,  5,  6],
+           [10, 11, 12, 13, 14, 15, 16],
+           [20, 21, 22, 23, 24, 25, 26]])
+    >>> _unfold(a, axis=1, size=3, step=2)
+    array([[[ 0,  1,  2],
+            [ 2,  3,  4],
+            [ 4,  5,  6]],
+
+           [[10, 11, 12],
+            [12, 13, 14],
+            [14, 15, 16]],
+
+           [[20, 21, 22],
+            [22, 23, 24],
+            [24, 25, 26]]])
+    """
+    new_shape = [*arr.shape, size]
+    new_strides = [*arr.strides, arr.strides[axis]]
+    new_shape[axis] = (new_shape[axis] - size) // step + 1
+    new_strides[axis] = new_strides[axis] * step
+    return np.lib.stride_tricks.as_strided(arr,
+                                           shape=new_shape,
+                                           strides=new_strides,
+                                           writeable=False)
+
+
+def _array_patch_perimeters(x, rstride, cstride):
+    """
+    Extract perimeters of patches from *arr*.
+
+    Extracted patches are of size (*rstride* + 1) x (*cstride* + 1) and
+    share perimeters with their neighbors. The ordering of the vertices matches
+    that returned by ``_array_perimeter``.
+
+    Parameters
+    ----------
+    x : ndarray, shape (N, M)
+        Input array
+    rstride : int
+        Vertical (row) stride between corresponding elements of each patch
+    cstride : int
+        Horizontal (column) stride between corresponding elements of each patch
+
+    Returns
+    -------
+    patches : ndarray, shape (N/rstride * M/cstride, 2 * (rstride + cstride))
+    """
+    assert rstride > 0 and cstride > 0
+    assert (x.shape[0] - 1) % rstride == 0
+    assert (x.shape[1] - 1) % cstride == 0
+    # We build up each perimeter from four half-open intervals. Here is an
+    # illustrated explanation for rstride == cstride == 3
+    #
+    #       T T T R
+    #       L     R
+    #       L     R
+    #       L B B B
+    #
+    # where T means that this element will be in the top array, R for right,
+    # B for bottom and L for left. Each of the arrays below has a shape of:
+    #
+    #    (number of perimeters that can be extracted vertically,
+    #     number of perimeters that can be extracted horizontally,
+    #     cstride for top and bottom and rstride for left and right)
+    #
+    # Note that _unfold doesn't incur any memory copies, so the only costly
+    # operation here is the np.concatenate.
+    top = _unfold(x[:-1:rstride, :-1], 1, cstride, cstride)
+    bottom = _unfold(x[rstride::rstride, 1:], 1, cstride, cstride)[..., ::-1]
+    right = _unfold(x[:-1, cstride::cstride], 0, rstride, rstride)
+    left = _unfold(x[1:, :-1:cstride], 0, rstride, rstride)[..., ::-1]
+    return (np.concatenate((top, right, bottom, left), axis=2)
+              .reshape(-1, 2 * (rstride + cstride)))
+
+
 @contextlib.contextmanager
 def _setattr_cm(obj, **kwargs):
     """Temporarily set some attributes; restore original state at context exit.
diff --git a/lib/matplotlib/tests/test_cbook.py b/lib/matplotlib/tests/test_cbook.py
index 12b6edde1fc8..273b9dcf7c48 100644
--- a/lib/matplotlib/tests/test_cbook.py
+++ b/lib/matplotlib/tests/test_cbook.py
@@ -591,3 +591,31 @@ def test_warn_external(recwarn):
     cbook._warn_external("oops")
     assert len(recwarn) == 1
     assert recwarn[0].filename == __file__
+
+
+def test_array_patch_perimeters():
+    # This compares the old implementation as a reference for the
+    # vectorized one.
+    def check(x, rstride, cstride):
+        rows, cols = x.shape
+        row_inds = [*range(0, rows-1, rstride), rows-1]
+        col_inds = [*range(0, cols-1, cstride), cols-1]
+        polys = []
+        for rs, rs_next in zip(row_inds[:-1], row_inds[1:]):
+            for cs, cs_next in zip(col_inds[:-1], col_inds[1:]):
+                # +1 ensures we share edges between polygons
+                ps = cbook._array_perimeter(x[rs:rs_next+1, cs:cs_next+1]).T
+                polys.append(ps)
+        polys = np.asarray(polys)
+        assert np.array_equal(polys,
+                              cbook._array_patch_perimeters(
+                                  x, rstride=rstride, cstride=cstride))
+
+    def divisors(n):
+        return [i for i in range(1, n + 1) if n % i == 0]
+
+    for rows, cols in [(5, 5), (7, 14), (13, 9)]:
+        x = np.arange(rows * cols).reshape(rows, cols)
+        for rstride, cstride in itertools.product(divisors(rows - 1),
+                                                  divisors(cols - 1)):
+            check(x, rstride=rstride, cstride=cstride)
diff --git a/lib/mpl_toolkits/mplot3d/axes3d.py b/lib/mpl_toolkits/mplot3d/axes3d.py
index 14a94a744cce..94d60e414810 100644
--- a/lib/mpl_toolkits/mplot3d/axes3d.py
+++ b/lib/mpl_toolkits/mplot3d/axes3d.py
@@ -1444,6 +1444,17 @@ def plot_surface(self, X, Y, Z, *args, norm=None, vmin=None,
            the input data is larger, it will be downsampled (by slicing) to
            these numbers of points.
 
+        .. note::
+
+           To maximize rendering speed consider setting *rstride* and *cstride*
+           to divisors of the number of rows minus 1 and columns minus 1
+           respectively. For example, given 51 rows rstride can be any of the
+           divisors of 50.
+
+           Similarly, a setting of *rstride* and *cstride* equal to 1 (or
+           *rcount* and *ccount* equal the number of rows and columns) can use
+           the optimized path.
+
         Parameters
         ----------
         X, Y, Z : 2d arrays
@@ -1547,25 +1558,33 @@ def plot_surface(self, X, Y, Z, *args, norm=None, vmin=None,
                         "semantic or raise an error in matplotlib 3.3. "
                         "Please use shade=False instead.")
 
-        # evenly spaced, and including both endpoints
-        row_inds = list(range(0, rows-1, rstride)) + [rows-1]
-        col_inds = list(range(0, cols-1, cstride)) + [cols-1]
-
         colset = []  # the sampled facecolor
-        polys = []
-        for rs, rs_next in zip(row_inds[:-1], row_inds[1:]):
-            for cs, cs_next in zip(col_inds[:-1], col_inds[1:]):
-                ps = [
-                    # +1 ensures we share edges between polygons
-                    cbook._array_perimeter(a[rs:rs_next+1, cs:cs_next+1])
-                    for a in (X, Y, Z)
-                ]
-                # ps = np.stack(ps, axis=-1)
-                ps = np.array(ps).T
-                polys.append(ps)
-
-                if fcolors is not None:
-                    colset.append(fcolors[rs][cs])
+        if (rows - 1) % rstride == 0 and \
+           (cols - 1) % cstride == 0 and \
+           fcolors is None:
+            polys = np.stack(
+                [cbook._array_patch_perimeters(a, rstride, cstride)
+                 for a in (X, Y, Z)],
+                axis=-1)
+        else:
+            # evenly spaced, and including both endpoints
+            row_inds = list(range(0, rows-1, rstride)) + [rows-1]
+            col_inds = list(range(0, cols-1, cstride)) + [cols-1]
+
+            polys = []
+            for rs, rs_next in zip(row_inds[:-1], row_inds[1:]):
+                for cs, cs_next in zip(col_inds[:-1], col_inds[1:]):
+                    ps = [
+                        # +1 ensures we share edges between polygons
+                        cbook._array_perimeter(a[rs:rs_next+1, cs:cs_next+1])
+                        for a in (X, Y, Z)
+                    ]
+                    # ps = np.stack(ps, axis=-1)
+                    ps = np.array(ps).T
+                    polys.append(ps)
+
+                    if fcolors is not None:
+                        colset.append(fcolors[rs][cs])
 
         # note that the striding causes some polygons to have more coordinates
         # than others
@@ -1578,8 +1597,11 @@ def plot_surface(self, X, Y, Z, *args, norm=None, vmin=None,
             polyc.set_facecolors(colset)
             polyc.set_edgecolors(colset)
         elif cmap:
-            # doesn't vectorize because polys is jagged
-            avg_z = np.array([ps[:, 2].mean() for ps in polys])
+            # can't always vectorize, because polys might be jagged
+            if isinstance(polys, np.ndarray):
+                avg_z = polys[..., 2].mean(axis=-1)
+            else:
+                avg_z = np.array([ps[:, 2].mean() for ps in polys])
             polyc.set_array(avg_z)
             if vmin is not None or vmax is not None:
                 polyc.set_clim(vmin, vmax)