From 40132e7255809e2c3b92dedfab2e571817da245a Mon Sep 17 00:00:00 2001 From: Matthew Petroff Date: Sun, 12 Jul 2020 01:10:18 +0000 Subject: [PATCH 1/5] Use indexed color for PNG images in PDF files when possible. When PNG images have 256 colors or fewer, convert them to index color before saving them in a PDF. This can result in a significant reduction in file size. --- lib/matplotlib/backends/backend_pdf.py | 49 +++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index 9317771c08b3..a0baea83a3a6 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1521,28 +1521,33 @@ def _unpack(self, im): alpha = None return rgb, alpha - def _writePng(self, data): + def _writePng(self, img): """ - Write the image *data* into the pdf file using png + Write the image *img* into the pdf file using png predictors with Flate compression. """ buffer = BytesIO() - if data.shape[-1] == 1: - data = data.squeeze(axis=-1) - Image.fromarray(data).save(buffer, format="png") + img.save(buffer, format="png") buffer.seek(8) + png_data = bit_depth = palette = None while True: length, type = struct.unpack(b'!L4s', buffer.read(8)) - if type == b'IDAT': + if type in [b'IHDR', b'PLTE', b'IDAT']: data = buffer.read(length) if len(data) != length: raise RuntimeError("truncated data") - self.currentstream.write(data) + if type == b'IHDR': + bit_depth = int(data[8]) + elif type == b'PLTE': + palette = data + elif type == b'IDAT': + png_data = data elif type == b'IEND': break else: buffer.seek(length, 1) buffer.seek(4, 1) # skip CRC + return png_data, bit_depth, palette def _writeImg(self, data, id, smask=None): """ @@ -1561,6 +1566,34 @@ def _writeImg(self, data, id, smask=None): if smask: obj['SMask'] = smask if mpl.rcParams['pdf.compression']: + if data.shape[-1] == 1: + data = data.squeeze(axis=-1) + img = Image.fromarray(data) + img_colors 
= img.getcolors(maxcolors=256) + if img_colors is not None: + # Convert to indexed color if there are 256 colors or fewer + # This can significantly reduce the file size + num_colors = len(img_colors) + img = img.convert(mode='P', dither=Image.NONE, + palette=Image.ADAPTIVE, colors=num_colors) + data, bit_depth, palette = self._writePng(img) + if bit_depth is None or palette is None: + raise RuntimeError("invalid PNG header") + palette = palette[:num_colors * 3] # Trim padding + if colors == 1: + # The PNG format uses an RGB palette for all indexed color + # images, but the PDF format allows for grayscale palettes. + # Thus, we convert the palette. + palette = palette[::3] + palette = pdfRepr(palette) + colorspace = obj['ColorSpace'].pdfRepr() + obj['ColorSpace'] = Verbatim(b'[/Indexed ' + colorspace + b' ' + + str(num_colors - 1).encode() + + b' ' + palette + b']') + obj['BitsPerComponent'] = bit_depth + colors = 1 + else: + data, _, _ = self._writePng(img) png = {'Predictor': 10, 'Colors': colors, 'Columns': width} else: png = None @@ -1571,7 +1604,7 @@ def _writeImg(self, data, id, smask=None): png=png ) if png: - self._writePng(data) + self.currentstream.write(data) else: self.currentstream.write(data.tobytes()) self.endStream() From eb74d35cebcb3ee95b957a6364d64da93c367694 Mon Sep 17 00:00:00 2001 From: Matthew Petroff Date: Sun, 12 Jul 2020 02:36:37 +0000 Subject: [PATCH 2/5] Don't use indexed color for grayscale. It seems to cause problems with images used for alpha transparency. It also doesn't have any benefit, since Pillow seems to always use 8-bit indexed color, even when 1-, 2-, or 4-bit indexed color is possible. 
--- lib/matplotlib/backends/backend_pdf.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index a0baea83a3a6..bfb0097ab192 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1570,7 +1570,7 @@ def _writeImg(self, data, id, smask=None): data = data.squeeze(axis=-1) img = Image.fromarray(data) img_colors = img.getcolors(maxcolors=256) - if img_colors is not None: + if colors == 3 and img_colors is not None: # Convert to indexed color if there are 256 colors or fewer # This can significantly reduce the file size num_colors = len(img_colors) @@ -1580,14 +1580,8 @@ def _writeImg(self, data, id, smask=None): if bit_depth is None or palette is None: raise RuntimeError("invalid PNG header") palette = palette[:num_colors * 3] # Trim padding - if colors == 1: - # The PNG format uses an RGB palette for all indexed color - # images, but the PDF format allows for grayscale palettes. - # Thus, we convert the palette. - palette = palette[::3] palette = pdfRepr(palette) - colorspace = obj['ColorSpace'].pdfRepr() - obj['ColorSpace'] = Verbatim(b'[/Indexed ' + colorspace + b' ' + obj['ColorSpace'] = Verbatim(b'[/Indexed /DeviceRGB ' + str(num_colors - 1).encode() + b' ' + palette + b']') obj['BitsPerComponent'] = bit_depth From eabfc26c5ab346924505b16e61c2774e3f73a515 Mon Sep 17 00:00:00 2001 From: Matthew Petroff Date: Sun, 12 Jul 2020 04:49:28 +0000 Subject: [PATCH 3/5] Fix issue with large PNG images in PDF. PNGs separate large images into chunks, but PDFs do not. 
--- lib/matplotlib/backends/backend_pdf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index bfb0097ab192..bf8332cfbe55 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1529,7 +1529,8 @@ def _writePng(self, img): buffer = BytesIO() img.save(buffer, format="png") buffer.seek(8) - png_data = bit_depth = palette = None + png_data = b'' + bit_depth = palette = None while True: length, type = struct.unpack(b'!L4s', buffer.read(8)) if type in [b'IHDR', b'PLTE', b'IDAT']: @@ -1541,7 +1542,7 @@ def _writePng(self, img): elif type == b'PLTE': palette = data elif type == b'IDAT': - png_data = data + png_data += data elif type == b'IEND': break else: From ad16cb66a39b0204fa7415ca60483a42655fc8f5 Mon Sep 17 00:00:00 2001 From: Matthew Petroff Date: Sun, 12 Jul 2020 21:20:20 +0000 Subject: [PATCH 4/5] Increase tolerance so subpixel issue doesn't cause test failure. There are ten pixels that differ due to how the subpixel calculation is done. --- lib/matplotlib/tests/test_image.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/matplotlib/tests/test_image.py b/lib/matplotlib/tests/test_image.py index 91ab8295956e..8f11d961584b 100644 --- a/lib/matplotlib/tests/test_image.py +++ b/lib/matplotlib/tests/test_image.py @@ -732,7 +732,11 @@ def test_log_scale_image(): ax.set(yscale='log') -@image_comparison(['rotate_image'], remove_text=True) +# Increased tolerance is needed for PDF test to avoid failure. After the PDF +# backend was modified to use indexed color, there are ten pixels that differ +# due to how the subpixel calculation is done when converting the PDF files to +# PNG images. 
+@image_comparison(['rotate_image'], remove_text=True, tol=0.35) def test_rotate_image(): delta = 0.25 x = y = np.arange(-3.0, 3.0, delta) From c9599a249318b5f27d359d5075173aa8bc84441b Mon Sep 17 00:00:00 2001 From: Matthew Petroff Date: Tue, 14 Jul 2020 13:45:13 +0000 Subject: [PATCH 5/5] Clarify variable names. --- lib/matplotlib/backends/backend_pdf.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index bf8332cfbe55..71dfdc91be64 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -1557,12 +1557,13 @@ def _writeImg(self, data, id, smask=None): (alpha channel) *smask*, which should be either None or a ``(height, width, 1)`` array. """ - height, width, colors = data.shape + height, width, color_channels = data.shape obj = {'Type': Name('XObject'), 'Subtype': Name('Image'), 'Width': width, 'Height': height, - 'ColorSpace': Name({1: 'DeviceGray', 3: 'DeviceRGB'}[colors]), + 'ColorSpace': Name({1: 'DeviceGray', + 3: 'DeviceRGB'}[color_channels]), 'BitsPerComponent': 8} if smask: obj['SMask'] = smask @@ -1571,13 +1572,13 @@ def _writeImg(self, data, id, smask=None): data = data.squeeze(axis=-1) img = Image.fromarray(data) img_colors = img.getcolors(maxcolors=256) - if colors == 3 and img_colors is not None: + if color_channels == 3 and img_colors is not None: # Convert to indexed color if there are 256 colors or fewer # This can significantly reduce the file size num_colors = len(img_colors) img = img.convert(mode='P', dither=Image.NONE, palette=Image.ADAPTIVE, colors=num_colors) - data, bit_depth, palette = self._writePng(img) + png_data, bit_depth, palette = self._writePng(img) if bit_depth is None or palette is None: raise RuntimeError("invalid PNG header") palette = palette[:num_colors * 3] # Trim padding @@ -1586,10 +1587,10 @@ def _writeImg(self, data, id, smask=None): + str(num_colors - 1).encode() 
+ b' ' + palette + b']') obj['BitsPerComponent'] = bit_depth - colors = 1 + color_channels = 1 else: - data, _, _ = self._writePng(img) - png = {'Predictor': 10, 'Colors': colors, 'Columns': width} + png_data, _, _ = self._writePng(img) + png = {'Predictor': 10, 'Colors': color_channels, 'Columns': width} else: png = None self.beginStream( @@ -1599,7 +1600,7 @@ def _writeImg(self, data, id, smask=None): png=png ) if png: - self.currentstream.write(data) + self.currentstream.write(png_data) else: self.currentstream.write(data.tobytes()) self.endStream()