Skip to content

[MRG+2] MAINT: Refactor the converted-image cache #7764

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)


def pytest_addoption(parser):
group = parser.getgroup("matplotlib", "matplotlib custom options")
group.addoption("--conversion-cache-max-size", action="store",
help="conversion cache maximum size in bytes")
group.addoption("--conversion-cache-report-misses",
action="store_true",
help="report conversion cache misses")
5 changes: 3 additions & 2 deletions lib/matplotlib/sphinxext/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)

from matplotlib.testing.conftest import (mpl_test_settings,
pytest_configure, pytest_unconfigure)
from matplotlib.testing.conftest import (
mpl_test_settings, pytest_configure, pytest_unconfigure,
pytest_sessionfinish, pytest_terminal_summary)
209 changes: 209 additions & 0 deletions lib/matplotlib/testing/_conversion_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
"""
A cache of png files keyed by the MD5 hashes of corresponding svg and
pdf files, to reduce test suite running times for svg and pdf files
that stay exactly the same from one run to the next.

The test setup in matplotlib.tests.conftest sets this up for image
comparison tests and allows specifying the maximum cache size.
"""

from __future__ import (absolute_import, division, print_function,
unicode_literals)

import hashlib
import shutil
import os
import warnings

from matplotlib import _get_cachedir
from matplotlib import cbook
from matplotlib import checkdep_ghostscript
from matplotlib import checkdep_inkscape


class _ConversionCache(object):
"""A cache that stores png files converted from svg or pdf formats.

The image comparison test cases compare svg and pdf files by
converting them to png files. When a test case has produced a
file, e.g. result.pdf, it queries this cache by the pathname
'/path/to/result_images/result.pdf'. The cache computes a hash of
the file (and the version of the external software used to convert
the file) and if a result by that hash value is available, writes
the data to the output location supplied by the caller. Otherwise
the test case has to run the conversion and can then insert the
result into the cache.

Parameters
----------
directory : str, optional
Files are stored in this directory. Defaults to `'test_cache'` in
the overall Matplotlib cache directory.
max_size : int, optional
The flush method will delete files until their combined size is
under this limit, in bytes. Defaults to 100 megabytes.

"""

def __init__(self, directory=None, max_size=int(1e8)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe document self.hits and self.gets here?
(I don't have better names yet, thought I haven't really thought about it extensively)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some comments.

# gets is the set of filenames queried from the cache
self.gets = set()
# hits is the set of filenames found in the cache when queried
self.hits = set()
if directory is not None:
self.cachedir = directory
else:
self.cachedir = self.get_cache_dir()
self.ensure_cache_dir()
if not isinstance(max_size, int):
raise ValueError("max_size is of type %s, expected int" %
type(max_size))
self.max_size = max_size
self.cached_ext = '.png'
self.converter_version = {}
_, gs_version = checkdep_ghostscript()
if gs_version is not None:
self.converter_version['.pdf'] = gs_version.encode('utf-8')
is_version = checkdep_inkscape()
if is_version is not None:
self.converter_version['.svg'] = is_version.encode('utf-8')
self.hash_cache = {}

def get(self, filename, newname):
"""Query the cache.

Parameters
----------
filename : str
Full path to the original file.
newname : str
Path to which the result should be written.

Returns
-------
bool
True if the file was found in the cache and is now written
to `newname`.
"""
self.gets.add(filename)
hash_value = self._get_file_hash(filename)
cached_file = os.path.join(self.cachedir, hash_value + self.cached_ext)
with cbook.Locked(self.cachedir):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not a problem right now but it may be preferable to at some point use (and thus, first, implement...) per-file locking rather than per directory; see #7776 (comment).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to hold the locks as briefly as possible, so for example the hashes get computed without holding the lock and only the shutil.copyfile call happens with the lock held. The expire method is the exception, but that one really needs to lock the whole directory to guarantee its file-size computations are accurate. The TeX cache, which #7776 is about, stays locked for the duration of a whole LaTeX run, which can easily take seconds.

if os.path.exists(cached_file):
shutil.copyfile(cached_file, newname)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we {sym,hard}link instead of copying?
(On Windows, admin rights are needed to symlink but not to hardlink. Python only added Windows support for either in Py3.2 (https://docs.python.org/3.6/library/os.html#os.link, https://docs.python.org/3.6/library/os.html#os.symlink).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would probably be faster and save a little disk space, but it would introduce the possibility to change the cached file later by modifying the linked file, which could lead to hard-to-find bugs. It's unlikely that the testing code would do this on purpose, but perhaps some day someone will write a recursive directory-deletion function that truncates the files first. (I have written such a function myself elsewhere because of... a long story.)

self.hits.add(filename)
return True
else:
return False

def put(self, original, converted):
"""Insert a file into the cache.

Parameters
----------
original : str
Full path to the original file.
converted : str
Full path to the png file converted from the original.
"""
hash_value = self._get_file_hash(original)
cached_file = os.path.join(self.cachedir, hash_value + self.cached_ext)
with cbook.Locked(self.cachedir):
shutil.copyfile(converted, cached_file)

def _get_file_hash(self, path, block_size=2 ** 20):
if path in self.hash_cache:
return self.hash_cache[path]
_, ext = os.path.splitext(path)
version_tag = self.converter_version.get(ext)
if version_tag is None:
warnings.warn(
("Don't know the external converter for files with extension "
"%s, cannot ensure cache invalidation on version update. "
"Either the relevant conversion program is missing or caching"
" is attempted for an unknown file type.")
% ext)
result = self._get_file_hash_static(path, block_size, version_tag)
self.hash_cache[path] = result
return result

@staticmethod
def _get_file_hash_static(path, block_size, version_tag):
# the parts of _get_file_hash that are called from the deprecated
# compare.get_file_hash; can merge into _get_file_hash once that
# function is removed
md5 = hashlib.md5()
with open(path, 'rb') as fd:
while True:
data = fd.read(block_size)
if not data:
break
md5.update(data)
if version_tag is not None:
md5.update(version_tag)
return md5.hexdigest()

def report(self):
"""Return information about the cache.

Returns
-------
r : dict
`r['gets']` is the set of files queried,
`r['hits']` is the set of files found in the cache
"""
return dict(hits=self.hits, gets=self.gets)

def expire(self):
"""Delete cached files until the disk usage is under the limit.

Orders files by access time, so the least recently used files
get deleted first.
"""
with cbook.Locked(self.cachedir):
stats = {filename: os.stat(os.path.join(self.cachedir, filename))
for filename in os.listdir(self.cachedir)
if filename.endswith(self.cached_ext)}
usage = sum(f.st_size for f in stats.values())
to_free = usage - self.max_size
if to_free <= 0:
return

files = sorted(stats.keys(),
key=lambda f: stats[f].st_atime,
reverse=True)
while to_free > 0:
filename = files.pop()
os.remove(os.path.join(self.cachedir, filename))
to_free -= stats[filename].st_size

@staticmethod
def get_cache_dir():
cachedir = _get_cachedir()
if cachedir is None:
raise _CacheError('No suitable configuration directory')
cachedir = os.path.join(cachedir, 'test_cache')
return cachedir

def ensure_cache_dir(self):
if not os.path.exists(self.cachedir):
try:
cbook.mkdirs(self.cachedir)
except OSError as e:
raise _CacheError("Error creating cache directory %s: %s"
% (self.cachedir, str(e)))
if not os.access(self.cachedir, os.W_OK):
raise _CacheError("Cache directory %s not writable"
% self.cachedir)


class _CacheError(Exception):
def __init__(self, message):
self.message = message

def __str__(self):
return self.message


# A global cache instance, set by the appropriate test runner.
_conversion_cache = None
106 changes: 40 additions & 66 deletions lib/matplotlib/testing/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,16 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import six

import hashlib
import os
import shutil

import numpy as np

import matplotlib
from matplotlib import cbook
from matplotlib.compat import subprocess
from matplotlib.testing import _conversion_cache as ccache
from matplotlib.testing.exceptions import ImageComparisonFailure
from matplotlib import _png
from matplotlib import _get_cachedir
from matplotlib import cbook
from distutils import version

__all__ = ['compare_float', 'compare_images', 'comparable_formats']

Expand Down Expand Up @@ -76,40 +71,6 @@ def compare_float(expected, actual, relTol=None, absTol=None):
return msg or None


def get_cache_dir():
cachedir = _get_cachedir()
if cachedir is None:
raise RuntimeError('Could not find a suitable configuration directory')
cache_dir = os.path.join(cachedir, 'test_cache')
if not os.path.exists(cache_dir):
try:
cbook.mkdirs(cache_dir)
except IOError:
return None
if not os.access(cache_dir, os.W_OK):
return None
return cache_dir


def get_file_hash(path, block_size=2 ** 20):
md5 = hashlib.md5()
with open(path, 'rb') as fd:
while True:
data = fd.read(block_size)
if not data:
break
md5.update(data)

if path.endswith('.pdf'):
from matplotlib import checkdep_ghostscript
md5.update(checkdep_ghostscript()[1].encode('utf-8'))
elif path.endswith('.svg'):
from matplotlib import checkdep_inkscape
md5.update(checkdep_inkscape().encode('utf-8'))

return md5.hexdigest()


def make_external_conversion_command(cmd):
def convert(old, new):
cmdline = cmd(old, new)
Expand Down Expand Up @@ -160,16 +121,39 @@ def comparable_formats():
return ['png'] + list(converter)


def convert(filename, cache):
@cbook.deprecated('2.1')
def get_cache_dir():
return ccache._ConversionCache.get_cache_dir()


@cbook.deprecated('2.1')
def get_file_hash(path, block_size=2 ** 20):
if path.endswith('.pdf'):
from matplotlib import checkdep_ghostscript
version_tag = checkdep_ghostscript()[1].encode('utf-8')
elif path.endswith('.svg'):
from matplotlib import checkdep_inkscape
version_tag = checkdep_inkscape().encode('utf-8')
else:
version_tag = None
return ccache._ConversionCache._get_file_hash_static(
path, block_size, version_tag)


def convert(filename, cache=None):
"""
Convert the named file into a png file. Returns the name of the
created file.

If *cache* is True, the result of the conversion is cached in
`matplotlib._get_cachedir() + '/test_cache/'`. The caching is based
on a hash of the exact contents of the input file. The is no limit
on the size of the cache, so it may need to be manually cleared
periodically.
Parameters
----------
filename : str
cache : _ConversionCache, optional

Returns
-------
str
The converted file.

"""
base, extension = filename.rsplit('.', 1)
Expand All @@ -190,23 +174,12 @@ def convert(filename, cache):
# is out of date.
if (not os.path.exists(newname) or
os.stat(newname).st_mtime < os.stat(filename).st_mtime):
if cache:
cache_dir = get_cache_dir()
else:
cache_dir = None

if cache_dir is not None:
hash_value = get_file_hash(filename)
new_ext = os.path.splitext(newname)[1]
cached_file = os.path.join(cache_dir, hash_value + new_ext)
if os.path.exists(cached_file):
shutil.copyfile(cached_file, newname)
return newname

in_cache = cache and cache.get(filename, newname)
if in_cache:
return newname
converter[extension](filename, newname)

if cache_dir is not None:
shutil.copyfile(newname, cached_file)
if cache:
cache.put(filename, newname)

return newname

Expand Down Expand Up @@ -269,7 +242,7 @@ def calculate_rms(expectedImage, actualImage):
return rms


def compare_images(expected, actual, tol, in_decorator=False):
def compare_images(expected, actual, tol, in_decorator=False, cache=None):
"""
Compare two "image" files checking differences within a tolerance.

Expand All @@ -290,6 +263,7 @@ def compare_images(expected, actual, tol, in_decorator=False):
in_decorator : bool
If called from image_comparison decorator, this should be
True. (default=False)
cache : matplotlib.testing._conversion_cache._ConversionCache, optional

Example
-------
Expand All @@ -311,8 +285,8 @@ def compare_images(expected, actual, tol, in_decorator=False):
raise IOError('Baseline image %r does not exist.' % expected)

if extension != 'png':
actual = convert(actual, False)
expected = convert(expected, True)
actual = convert(actual, cache)
expected = convert(expected, cache)

# open the image files and remove the alpha channel (if it exists)
expectedImage = _png.read_png_int(expected)
Expand Down
Loading