From 40132e7255809e2c3b92dedfab2e571817da245a Mon Sep 17 00:00:00 2001
From: Matthew Petroff <matthew@mpetroff.net>
Date: Sun, 12 Jul 2020 01:10:18 +0000
Subject: [PATCH 1/5] Use indexed color for PNG images in PDF files when
 possible.

When PNG images have 256 colors or fewer, convert them to indexed color before
saving them in a PDF. This can result in a significant reduction in file size.
---
 lib/matplotlib/backends/backend_pdf.py | 49 +++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
index 9317771c08b3..a0baea83a3a6 100644
--- a/lib/matplotlib/backends/backend_pdf.py
+++ b/lib/matplotlib/backends/backend_pdf.py
@@ -1521,28 +1521,33 @@ def _unpack(self, im):
                 alpha = None
             return rgb, alpha
 
-    def _writePng(self, data):
+    def _writePng(self, img):
         """
-        Write the image *data* into the pdf file using png
+        Write the image *img* into the pdf file using png
         predictors with Flate compression.
         """
         buffer = BytesIO()
-        if data.shape[-1] == 1:
-            data = data.squeeze(axis=-1)
-        Image.fromarray(data).save(buffer, format="png")
+        img.save(buffer, format="png")
         buffer.seek(8)
+        png_data = bit_depth = palette = None
         while True:
             length, type = struct.unpack(b'!L4s', buffer.read(8))
-            if type == b'IDAT':
+            if type in [b'IHDR', b'PLTE', b'IDAT']:
                 data = buffer.read(length)
                 if len(data) != length:
                     raise RuntimeError("truncated data")
-                self.currentstream.write(data)
+                if type == b'IHDR':
+                    bit_depth = int(data[8])
+                elif type == b'PLTE':
+                    palette = data
+                elif type == b'IDAT':
+                    png_data = data
             elif type == b'IEND':
                 break
             else:
                 buffer.seek(length, 1)
             buffer.seek(4, 1)   # skip CRC
+        return png_data, bit_depth, palette
 
     def _writeImg(self, data, id, smask=None):
         """
@@ -1561,6 +1566,34 @@ def _writeImg(self, data, id, smask=None):
         if smask:
             obj['SMask'] = smask
         if mpl.rcParams['pdf.compression']:
+            if data.shape[-1] == 1:
+                data = data.squeeze(axis=-1)
+            img = Image.fromarray(data)
+            img_colors = img.getcolors(maxcolors=256)
+            if img_colors is not None:
+                # Convert to indexed color if there are 256 colors or fewer
+                # This can significantly reduce the file size
+                num_colors = len(img_colors)
+                img = img.convert(mode='P', dither=Image.NONE,
+                                  palette=Image.ADAPTIVE, colors=num_colors)
+                data, bit_depth, palette = self._writePng(img)
+                if bit_depth is None or palette is None:
+                    raise RuntimeError("invalid PNG header")
+                palette = palette[:num_colors * 3]  # Trim padding
+                if colors == 1:
+                    # The PNG format uses an RGB palette for all indexed color
+                    # images, but the PDF format allows for grayscale palettes.
+                    # Thus, we convert the palette.
+                    palette = palette[::3]
+                palette = pdfRepr(palette)
+                colorspace = obj['ColorSpace'].pdfRepr()
+                obj['ColorSpace'] = Verbatim(b'[/Indexed ' + colorspace + b' '
+                                             + str(num_colors - 1).encode()
+                                             + b' ' + palette + b']')
+                obj['BitsPerComponent'] = bit_depth
+                colors = 1
+            else:
+                data, _, _ = self._writePng(img)
             png = {'Predictor': 10, 'Colors': colors, 'Columns': width}
         else:
             png = None
@@ -1571,7 +1604,7 @@ def _writeImg(self, data, id, smask=None):
             png=png
             )
         if png:
-            self._writePng(data)
+            self.currentstream.write(data)
         else:
             self.currentstream.write(data.tobytes())
         self.endStream()

From eb74d35cebcb3ee95b957a6364d64da93c367694 Mon Sep 17 00:00:00 2001
From: Matthew Petroff <matthew@mpetroff.net>
Date: Sun, 12 Jul 2020 02:36:37 +0000
Subject: [PATCH 2/5] Don't use indexed color for grayscale.

Using indexed color for grayscale images seems to cause problems with images
used for alpha transparency. It also doesn't have any benefit, since Pillow
seems to always use 8-bit indexed color, even when 1-, 2-, or 4-bit indexed
color is possible.
---
 lib/matplotlib/backends/backend_pdf.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
index a0baea83a3a6..bfb0097ab192 100644
--- a/lib/matplotlib/backends/backend_pdf.py
+++ b/lib/matplotlib/backends/backend_pdf.py
@@ -1570,7 +1570,7 @@ def _writeImg(self, data, id, smask=None):
                 data = data.squeeze(axis=-1)
             img = Image.fromarray(data)
             img_colors = img.getcolors(maxcolors=256)
-            if img_colors is not None:
+            if colors == 3 and img_colors is not None:
                 # Convert to indexed color if there are 256 colors or fewer
                 # This can significantly reduce the file size
                 num_colors = len(img_colors)
@@ -1580,14 +1580,8 @@ def _writeImg(self, data, id, smask=None):
                 if bit_depth is None or palette is None:
                     raise RuntimeError("invalid PNG header")
                 palette = palette[:num_colors * 3]  # Trim padding
-                if colors == 1:
-                    # The PNG format uses an RGB palette for all indexed color
-                    # images, but the PDF format allows for grayscale palettes.
-                    # Thus, we convert the palette.
-                    palette = palette[::3]
                 palette = pdfRepr(palette)
-                colorspace = obj['ColorSpace'].pdfRepr()
-                obj['ColorSpace'] = Verbatim(b'[/Indexed ' + colorspace + b' '
+                obj['ColorSpace'] = Verbatim(b'[/Indexed /DeviceRGB '
                                              + str(num_colors - 1).encode()
                                              + b' ' + palette + b']')
                 obj['BitsPerComponent'] = bit_depth

From eabfc26c5ab346924505b16e61c2774e3f73a515 Mon Sep 17 00:00:00 2001
From: Matthew Petroff <matthew@mpetroff.net>
Date: Sun, 12 Jul 2020 04:49:28 +0000
Subject: [PATCH 3/5] Fix issue with large PNG images in PDF.

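PNG files may split the data of large images across multiple IDAT chunks, but
a PDF image stream is contiguous, so the data of all IDAT chunks must be
concatenated instead of keeping only the last chunk.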
---
 lib/matplotlib/backends/backend_pdf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
index bfb0097ab192..bf8332cfbe55 100644
--- a/lib/matplotlib/backends/backend_pdf.py
+++ b/lib/matplotlib/backends/backend_pdf.py
@@ -1529,7 +1529,8 @@ def _writePng(self, img):
         buffer = BytesIO()
         img.save(buffer, format="png")
         buffer.seek(8)
-        png_data = bit_depth = palette = None
+        png_data = b''
+        bit_depth = palette = None
         while True:
             length, type = struct.unpack(b'!L4s', buffer.read(8))
             if type in [b'IHDR', b'PLTE', b'IDAT']:
@@ -1541,7 +1542,7 @@ def _writePng(self, img):
                 elif type == b'PLTE':
                     palette = data
                 elif type == b'IDAT':
-                    png_data = data
+                    png_data += data
             elif type == b'IEND':
                 break
             else:

From ad16cb66a39b0204fa7415ca60483a42655fc8f5 Mon Sep 17 00:00:00 2001
From: Matthew Petroff <matthew@mpetroff.net>
Date: Sun, 12 Jul 2020 21:20:20 +0000
Subject: [PATCH 4/5] Increase tolerance so subpixel issue doesn't cause test
 failure.

There are ten pixels that differ due to how the subpixel calculation is done.
---
 lib/matplotlib/tests/test_image.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/matplotlib/tests/test_image.py b/lib/matplotlib/tests/test_image.py
index 91ab8295956e..8f11d961584b 100644
--- a/lib/matplotlib/tests/test_image.py
+++ b/lib/matplotlib/tests/test_image.py
@@ -732,7 +732,11 @@ def test_log_scale_image():
     ax.set(yscale='log')
 
 
-@image_comparison(['rotate_image'], remove_text=True)
+# Increased tolerance is needed for the PDF test to avoid failure. After the
+# PDF backend was modified to use indexed color, there are ten pixels that
+# differ due to how the subpixel calculation is done when converting the PDF
+# files to PNG images.
+@image_comparison(['rotate_image'], remove_text=True, tol=0.35)
 def test_rotate_image():
     delta = 0.25
     x = y = np.arange(-3.0, 3.0, delta)

From c9599a249318b5f27d359d5075173aa8bc84441b Mon Sep 17 00:00:00 2001
From: Matthew Petroff <matthew@mpetroff.net>
Date: Tue, 14 Jul 2020 13:45:13 +0000
Subject: [PATCH 5/5] Clarify variable names.

---
 lib/matplotlib/backends/backend_pdf.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
index bf8332cfbe55..71dfdc91be64 100644
--- a/lib/matplotlib/backends/backend_pdf.py
+++ b/lib/matplotlib/backends/backend_pdf.py
@@ -1557,12 +1557,13 @@ def _writeImg(self, data, id, smask=None):
         (alpha channel) *smask*, which should be either None or a ``(height,
         width, 1)`` array.
         """
-        height, width, colors = data.shape
+        height, width, color_channels = data.shape
         obj = {'Type': Name('XObject'),
                'Subtype': Name('Image'),
                'Width': width,
                'Height': height,
-               'ColorSpace': Name({1: 'DeviceGray', 3: 'DeviceRGB'}[colors]),
+               'ColorSpace': Name({1: 'DeviceGray',
+                                   3: 'DeviceRGB'}[color_channels]),
                'BitsPerComponent': 8}
         if smask:
             obj['SMask'] = smask
@@ -1571,13 +1572,13 @@ def _writeImg(self, data, id, smask=None):
                 data = data.squeeze(axis=-1)
             img = Image.fromarray(data)
             img_colors = img.getcolors(maxcolors=256)
-            if colors == 3 and img_colors is not None:
+            if color_channels == 3 and img_colors is not None:
                 # Convert to indexed color if there are 256 colors or fewer
                 # This can significantly reduce the file size
                 num_colors = len(img_colors)
                 img = img.convert(mode='P', dither=Image.NONE,
                                   palette=Image.ADAPTIVE, colors=num_colors)
-                data, bit_depth, palette = self._writePng(img)
+                png_data, bit_depth, palette = self._writePng(img)
                 if bit_depth is None or palette is None:
                     raise RuntimeError("invalid PNG header")
                 palette = palette[:num_colors * 3]  # Trim padding
@@ -1586,10 +1587,10 @@ def _writeImg(self, data, id, smask=None):
                                              + str(num_colors - 1).encode()
                                              + b' ' + palette + b']')
                 obj['BitsPerComponent'] = bit_depth
-                colors = 1
+                color_channels = 1
             else:
-                data, _, _ = self._writePng(img)
-            png = {'Predictor': 10, 'Colors': colors, 'Columns': width}
+                png_data, _, _ = self._writePng(img)
+            png = {'Predictor': 10, 'Colors': color_channels, 'Columns': width}
         else:
             png = None
         self.beginStream(
@@ -1599,7 +1600,7 @@ def _writeImg(self, data, id, smask=None):
             png=png
             )
         if png:
-            self.currentstream.write(data)
+            self.currentstream.write(png_data)
         else:
             self.currentstream.write(data.tobytes())
         self.endStream()