-
-
Notifications
You must be signed in to change notification settings - Fork 7.9k
[MRG+2] MAINT: Refactor the converted-image cache #7764
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1b24a16
650ddf2
3b5f636
556fd82
9f5a06d
33f2528
2136a26
6995a27
9ea66dd
3751f01
a70081e
85f54de
0d51e03
9de06c0
6007b37
e52cdd2
e5184bb
f47cbe4
996f22b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
|
||
def pytest_addoption(parser): | ||
group = parser.getgroup("matplotlib", "matplotlib custom options") | ||
group.addoption("--conversion-cache-max-size", action="store", | ||
help="conversion cache maximum size in bytes") | ||
group.addoption("--conversion-cache-report-misses", | ||
action="store_true", | ||
help="report conversion cache misses") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
from matplotlib.testing.conftest import (mpl_test_settings, | ||
pytest_configure, pytest_unconfigure) | ||
from matplotlib.testing.conftest import ( | ||
mpl_test_settings, pytest_configure, pytest_unconfigure, | ||
pytest_sessionfinish, pytest_terminal_summary) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
""" | ||
A cache of png files keyed by the MD5 hashes of corresponding svg and | ||
pdf files, to reduce test suite running times for svg and pdf files | ||
that stay exactly the same from one run to the next. | ||
|
||
The test setup in matplotlib.tests.conftest sets this up for image | ||
comparison tests and allows specifying the maximum cache size. | ||
""" | ||
|
||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
import hashlib | ||
import shutil | ||
import os | ||
import warnings | ||
|
||
from matplotlib import _get_cachedir | ||
from matplotlib import cbook | ||
from matplotlib import checkdep_ghostscript | ||
from matplotlib import checkdep_inkscape | ||
|
||
|
||
class _ConversionCache(object): | ||
"""A cache that stores png files converted from svg or pdf formats. | ||
|
||
The image comparison test cases compare svg and pdf files by | ||
converting them to png files. When a test case has produced a | ||
file, e.g. result.pdf, it queries this cache by the pathname | ||
'/path/to/result_images/result.pdf'. The cache computes a hash of | ||
the file (and the version of the external software used to convert | ||
the file) and if a result by that hash value is available, writes | ||
the data to the output location supplied by the caller. Otherwise | ||
the test case has to run the conversion and can then insert the | ||
result into the cache. | ||
|
||
Parameters | ||
---------- | ||
directory : str, optional | ||
Files are stored in this directory. Defaults to `'test_cache'` in | ||
the overall Matplotlib cache directory. | ||
max_size : int, optional | ||
The flush method will delete files until their combined size is | ||
under this limit, in bytes. Defaults to 100 megabytes. | ||
|
||
""" | ||
|
||
def __init__(self, directory=None, max_size=int(1e8)): | ||
# gets is the set of filenames queried from the cache | ||
self.gets = set() | ||
# hits is the set of filenames found in the cache when queried | ||
self.hits = set() | ||
if directory is not None: | ||
self.cachedir = directory | ||
else: | ||
self.cachedir = self.get_cache_dir() | ||
self.ensure_cache_dir() | ||
if not isinstance(max_size, int): | ||
raise ValueError("max_size is of type %s, expected int" % | ||
type(max_size)) | ||
self.max_size = max_size | ||
self.cached_ext = '.png' | ||
self.converter_version = {} | ||
_, gs_version = checkdep_ghostscript() | ||
if gs_version is not None: | ||
self.converter_version['.pdf'] = gs_version.encode('utf-8') | ||
is_version = checkdep_inkscape() | ||
if is_version is not None: | ||
self.converter_version['.svg'] = is_version.encode('utf-8') | ||
self.hash_cache = {} | ||
|
||
def get(self, filename, newname): | ||
"""Query the cache. | ||
|
||
Parameters | ||
---------- | ||
filename : str | ||
Full path to the original file. | ||
newname : str | ||
Path to which the result should be written. | ||
|
||
Returns | ||
------- | ||
bool | ||
True if the file was found in the cache and is now written | ||
to `newname`. | ||
""" | ||
self.gets.add(filename) | ||
hash_value = self._get_file_hash(filename) | ||
cached_file = os.path.join(self.cachedir, hash_value + self.cached_ext) | ||
with cbook.Locked(self.cachedir): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably not a problem right now but it may be preferable to at some point use (and thus, first, implement...) per-file locking rather than per directory; see #7776 (comment). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to hold the locks as briefly as possible, so for example the hashes get computed without holding the lock and only the shutil.copyfile call happens with the lock held. The expire method is the exception, but that one really needs to lock the whole directory to guarantee its file-size computations are accurate. The TeX cache, which #7776 is about, stays locked for the duration of a whole LaTeX run, which can easily take seconds. |
||
if os.path.exists(cached_file): | ||
shutil.copyfile(cached_file, newname) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we {sym,hard}link instead of copying? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would probably be faster and save a little disk space, but it would introduce the possibility to change the cached file later by modifying the linked file, which could lead to hard-to-find bugs. It's unlikely that the testing code would do this on purpose, but perhaps some day someone will write a recursive directory-deletion function that truncates the files first. (I have written such a function myself elsewhere because of... a long story.) |
||
self.hits.add(filename) | ||
return True | ||
else: | ||
return False | ||
|
||
def put(self, original, converted): | ||
"""Insert a file into the cache. | ||
|
||
Parameters | ||
---------- | ||
original : str | ||
Full path to the original file. | ||
converted : str | ||
Full path to the png file converted from the original. | ||
""" | ||
hash_value = self._get_file_hash(original) | ||
cached_file = os.path.join(self.cachedir, hash_value + self.cached_ext) | ||
with cbook.Locked(self.cachedir): | ||
shutil.copyfile(converted, cached_file) | ||
|
||
def _get_file_hash(self, path, block_size=2 ** 20): | ||
if path in self.hash_cache: | ||
return self.hash_cache[path] | ||
_, ext = os.path.splitext(path) | ||
version_tag = self.converter_version.get(ext) | ||
if version_tag is None: | ||
warnings.warn( | ||
("Don't know the external converter for files with extension " | ||
"%s, cannot ensure cache invalidation on version update. " | ||
"Either the relevant conversion program is missing or caching" | ||
" is attempted for an unknown file type.") | ||
% ext) | ||
result = self._get_file_hash_static(path, block_size, version_tag) | ||
self.hash_cache[path] = result | ||
return result | ||
|
||
@staticmethod | ||
def _get_file_hash_static(path, block_size, version_tag): | ||
# the parts of _get_file_hash that are called from the deprecated | ||
# compare.get_file_hash; can merge into _get_file_hash once that | ||
# function is removed | ||
md5 = hashlib.md5() | ||
with open(path, 'rb') as fd: | ||
while True: | ||
data = fd.read(block_size) | ||
if not data: | ||
break | ||
md5.update(data) | ||
if version_tag is not None: | ||
md5.update(version_tag) | ||
return md5.hexdigest() | ||
|
||
def report(self): | ||
"""Return information about the cache. | ||
|
||
Returns | ||
------- | ||
r : dict | ||
`r['gets']` is the set of files queried, | ||
`r['hits']` is the set of files found in the cache | ||
""" | ||
return dict(hits=self.hits, gets=self.gets) | ||
|
||
def expire(self): | ||
"""Delete cached files until the disk usage is under the limit. | ||
|
||
Orders files by access time, so the least recently used files | ||
get deleted first. | ||
""" | ||
with cbook.Locked(self.cachedir): | ||
stats = {filename: os.stat(os.path.join(self.cachedir, filename)) | ||
for filename in os.listdir(self.cachedir) | ||
if filename.endswith(self.cached_ext)} | ||
usage = sum(f.st_size for f in stats.values()) | ||
to_free = usage - self.max_size | ||
if to_free <= 0: | ||
return | ||
|
||
files = sorted(stats.keys(), | ||
key=lambda f: stats[f].st_atime, | ||
reverse=True) | ||
while to_free > 0: | ||
filename = files.pop() | ||
os.remove(os.path.join(self.cachedir, filename)) | ||
to_free -= stats[filename].st_size | ||
|
||
@staticmethod | ||
def get_cache_dir(): | ||
cachedir = _get_cachedir() | ||
if cachedir is None: | ||
raise _CacheError('No suitable configuration directory') | ||
cachedir = os.path.join(cachedir, 'test_cache') | ||
return cachedir | ||
|
||
def ensure_cache_dir(self): | ||
if not os.path.exists(self.cachedir): | ||
try: | ||
cbook.mkdirs(self.cachedir) | ||
except OSError as e: | ||
raise _CacheError("Error creating cache directory %s: %s" | ||
% (self.cachedir, str(e))) | ||
if not os.access(self.cachedir, os.W_OK): | ||
raise _CacheError("Cache directory %s not writable" | ||
% self.cachedir) | ||
|
||
|
||
class _CacheError(Exception): | ||
def __init__(self, message): | ||
self.message = message | ||
|
||
def __str__(self): | ||
return self.message | ||
|
||
|
||
# A global cache instance, set by the appropriate test runner. | ||
_conversion_cache = None |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe document self.hits and self.gets here?
(I don't have better names yet, thought I haven't really thought about it extensively)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added some comments.