Skip to content

Commit 645c8da

Browse files
committed
Cache kpsewhich results persistently
And allow batching them. This commit does not yet use the batching but makes it possible.
1 parent f2308fc commit 645c8da

File tree

4 files changed

+347
-18
lines changed

4 files changed

+347
-18
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
dviread changes
2+
---------------
3+
4+
The ``format`` keyword argument to ``dviread.find_tex_file`` has been
5+
deprecated. The function without the ``format`` argument, as well as
6+
the new ``dviread.find_tex_files`` function, cache their results in
7+
``texsupport.N.db`` in the cache directory to speed up dvi file
8+
processing.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
TeX support cache
2+
-----------------
3+
4+
The ``usetex`` feature sends snippets of TeX code to LaTeX and related
5+
external tools for processing. This causes a nontrivial number of
6+
helper processes to be spawned, which can be slow on some platforms.
7+
A new cache database helps reduce the need to spawn these helper
8+
processes, which should improve ``usetex`` processing speed.
9+
10+
The new cache files
11+
~~~~~~~~~~~~~~~~~~~
12+
13+
The cache database is stored in a file named ``texsupport.N.db`` in the
14+
standard cache directory (traditionally ``$HOME/.matplotlib`` but
15+
possibly ``$HOME/.cache/matplotlib``), where ``N`` stands for a version
16+
number. The version number is incremented when new kinds of items are
17+
added to the caching code, in order to avoid version clashes when
18+
using multiple different versions of Matplotlib. The auxiliary files
19+
``texsupport.N.db-wal`` and ``texsupport.N.db-shm`` help coordinate usage
20+
of the cache between concurrently running instances. All of these
21+
cache files may be deleted when Matplotlib is not running, and
22+
subsequent calls to the ``usetex`` code will recompute the TeX results.

lib/matplotlib/dviread.py

Lines changed: 233 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@
2424
import os
2525
import re
2626
import struct
27+
import sqlite3
2728
import sys
2829
import textwrap
2930

3031
import numpy as np
3132

32-
from matplotlib import cbook, rcParams
33+
from matplotlib import cbook, get_cachedir, rcParams
3334
from matplotlib.compat import subprocess
3435

3536
_log = logging.getLogger(__name__)
@@ -980,45 +981,259 @@ def _parse(self, file):
980981
return re.findall(br'/([^][{}<>\s]+)', data)
981982

982983

983-
def find_tex_file(filename, format=None):
984+
# Raised by TeXSupportCache.__init__ when an existing cache database reports
# a schema version other than the one this code expects.
class TeXSupportCacheError(Exception):
985+
pass
986+
987+
988+
class TeXSupportCache:
    """A persistent cache of data about the support files needed to
    process dvi files produced by TeX. Currently holds results from
    :program:`kpsewhich`, in future versions could hold pre-parsed font
    data etc.

    Usage::

      # create or get the singleton instance
      cache = TeXSupportCache.get_cache()
      with cache.connection as transaction:
          cache.update_pathnames(
              {"pdftex.map": "/usr/local/pdftex.map",
               "cmsy10.pfb": "/usr/local/fonts/cmsy10.pfb"},
              transaction)
      pathnames = cache.get_pathnames(["pdftex.map", "cmr10.pfb"])
      # now pathnames = {"pdftex.map": "/usr/local/pdftex.map"}

      # optional after inserting new data, may improve query performance:
      cache.optimize()

    Parameters
    ----------

    filename : str, optional
        File in which to store the cache. Defaults to ``texsupport.N.db`` in
        the standard cache directory where N is the current schema version.

    Attributes
    ----------

    connection
        This database connection object has a context manager to set up
        a transaction. Transactions are passed into methods that write to
        the database.

    Raises
    ------

    TeXSupportCacheError
        If the database file already carries a nonzero schema version
        that differs from `schema_version`.
    """

    # A one-element tuple; a bare parenthesised string only works because
    # Python happens to accept a single str as one slot name.
    __slots__ = ('connection',)
    schema_version = 1  # written to PRAGMA user_version by _create
    instance = None
    # Largest number of names bound into one SELECT.  Kept well below 999,
    # the default host-parameter limit of older SQLite builds.
    _max_query_variables = 500

    @classmethod
    def get_cache(cls):
        "Return the singleton instance of the cache, at the default location"
        if cls.instance is None:
            cls.instance = cls()
        return cls.instance

    def __init__(self, filename=None):
        if filename is None:
            filename = os.path.join(get_cachedir(), 'texsupport.%d.db'
                                    % self.schema_version)

        self.connection = sqlite3.connect(
            filename, isolation_level="DEFERRED")
        with self.connection as conn:
            conn.execute("PRAGMA journal_mode=WAL;")
            version, = conn.execute("PRAGMA user_version;").fetchone()

        # A user_version of 0 is treated as a database that has not been
        # set up yet; any other unexpected value is refused.
        if version == 0:
            self._create()
        elif version != self.schema_version:
            # Do not leave the connection open behind a failed constructor.
            self.connection.close()
            raise TeXSupportCacheError(
                "support database %s has version %d, expected %d"
                % (filename, version, self.schema_version))

    def _create(self):
        """Create the database."""
        with self.connection as conn:
            conn.executescript(
                """
                PRAGMA page_size=4096;
                CREATE TABLE file_path(
                    filename TEXT PRIMARY KEY NOT NULL,
                    pathname TEXT
                ) WITHOUT ROWID;
                """
                # Stamp the file with the same version __init__ checks for.
                "PRAGMA user_version=%d;" % self.schema_version)

    def optimize(self):
        """Optional optimization phase after updating data.
        Executes sqlite's `PRAGMA optimize` statement, which can call
        `ANALYZE` or other functions that can improve future query performance
        by spending some time up-front."""
        with self.connection as conn:
            conn.execute("PRAGMA optimize;")

    def get_pathnames(self, filenames):
        """Query the cache for pathnames related to `filenames`.

        Parameters
        ----------
        filenames : iterable of str
            May be any iterable, including a one-shot generator.

        Returns
        -------
        mapping from str to (str or None)
            For those filenames that exist in the cache, the mapping
            includes either the related pathname or None to indicate that
            the named file does not exist.
        """
        # Materialize once: the names are needed both to size the
        # placeholder list and as the bound parameters.
        filenames = list(filenames)
        result = {}
        step = self._max_query_variables
        for start in range(0, len(filenames), step):
            chunk = filenames[start:start + step]
            rows = self.connection.execute(
                "SELECT filename, pathname FROM file_path WHERE filename IN "
                "(%s)"
                % ','.join('?' * len(chunk)),
                chunk).fetchall()
            result.update(rows)
        return result

    def update_pathnames(self, mapping, transaction):
        """Update the cache with the given filename-to-pathname mapping

        Parameters
        ----------
        mapping : mapping from str to (str or None)
            Mapping from filenames to the corresponding full pathnames
            or None to indicate that the named file does not exist.
        transaction : obtained via the context manager of self.connection
        """
        transaction.executemany(
            "INSERT OR REPLACE INTO file_path (filename, pathname) "
            "VALUES (?, ?)",
            mapping.items())
1109+
1110+
1111+
def find_tex_files(filenames, cache=None):
    """Find multiple files in the texmf tree. This can be more efficient
    than `find_tex_file` because it makes only one call to `kpsewhich`.

    Calls :program:`kpsewhich` which is an interface to the kpathsea
    library [1]_. Most existing TeX distributions on Unix-like systems use
    kpathsea. It is also available as part of MikTeX, a popular
    distribution on Windows.

    The results are cached into the TeX support database. In case of
    mistaken results, deleting the database resets the cache.

    Parameters
    ----------
    filenames : iterable of (string or bytestring)
        Names of the files to look up. Bytestrings are decoded as UTF-8.
    cache : TeXSupportCache, optional
        Cache instance to use, defaults to the singleton instance of the class.

    Returns
    -------
    dict mapping str to (str or None)
        One entry per distinct requested name (after decoding). The value
        is the pathname taken from the cache or reported by
        :program:`kpsewhich`, or None when no pathname was found.

    References
    ----------

    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.

    """

    # we expect these to always be ascii encoded, but use utf-8
    # out of caution
    filenames = [f.decode('utf-8', errors='replace')
                 if isinstance(f, bytes) else f
                 for f in filenames]
    if cache is None:
        cache = TeXSupportCache.get_cache()
    result = cache.get_pathnames(filenames)

    # Only names the cache knows nothing about go to kpsewhich, and each
    # of them only once even if it was requested several times.
    missing = []
    handled = set(result)
    for name in filenames:
        if name not in handled:
            handled.add(name)
            missing.append(name)
    if not missing:
        return result

    cmd = ['kpsewhich'] + missing
    _log.debug('find_tex_files: %s', cmd)
    pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    output = pipe.communicate()[0].decode('ascii').splitlines()
    _log.debug('find_tex_files result: %s', output)
    mapping = _match(missing, output)
    # Negative results (None) are stored as well, so a missing file is not
    # searched for again on the next call.
    with cache.connection as transaction:
        cache.update_pathnames(mapping, transaction)
    result.update(mapping)

    return result
1161+
1162+
1163+
def _match(filenames, pathnames):
    """
    Match filenames to pathnames in lists that are in matching order,
    except that some filenames may lack pathnames.

    Parameters
    ----------
    filenames : iterable of str
        The names that were looked up, in request order.
    pathnames : iterable of str
        The pathnames that were found, in the same relative order as
        their filenames. A pathname that belongs to none of the remaining
        filenames is ignored.

    Returns
    -------
    dict mapping str to (str or None)
        Every filename mapped to its pathname, or to None if no pathname
        was matched to it.
    """
    # Materialize so that a one-shot iterable can be both used for the
    # keys of the result and scanned below.
    filenames = list(filenames)
    result = {f: None for f in filenames}
    position = 0  # index of the first filename not yet matched or skipped
    for pathname in pathnames:
        for index in range(position, len(filenames)):
            if _is_pathname_of(pathname, filenames[index]):
                result[filenames[index]] = pathname
                position = index + 1
                break
        # No break: the line fits no remaining filename.  Leave `position`
        # alone so that one stray line cannot make every later filename
        # look like a missing file.
    return result


def _is_pathname_of(pathname, filename):
    """Return whether `pathname` is `filename` or ends in that name
    preceded by a directory separator."""
    if pathname == filename:
        return True
    # Accept '/' in addition to the platform separators, so that paths
    # written with forward slashes are recognized on every platform.
    separators = {os.path.sep, '/'}
    if os.path.altsep:
        separators.add(os.path.altsep)
    return pathname.endswith(tuple(sep + filename for sep in separators))
1179+
1180+
1181+
def find_tex_file(filename, format=None, cache=None):
    """
    Find a file in the texmf tree.

    Calls :program:`kpsewhich` which is an interface to the kpathsea
    library [1]_. Most existing TeX distributions on Unix-like systems use
    kpathsea. It is also available as part of MikTeX, a popular
    distribution on Windows.

    When `format` is not given, the results are cached into a database
    whose location defaults to :file:`texsupport.N.db` in the Matplotlib
    cache directory, where N is the cache schema version. In case of
    mistaken results, deleting this file resets the cache.

    Parameters
    ----------
    filename : string or bytestring
    format : string or bytestring, DEPRECATED
        Used as the value of the `--format` option to :program:`kpsewhich`.
        Could be e.g. 'tfm' or 'vf' to limit the search to that type of files.
        Deprecated to allow batching multiple filenames into one kpsewhich
        call, since any format option would apply to all filenames at once.
        Lookups that pass `format` bypass the cache.
    cache : TeXSupportCache, optional
        Cache instance to use, defaults to the singleton instance of the class.

    Returns
    -------
    str or None
        The pathname of the file. When nothing is found, the result is
        None on the cached path and whatever :program:`kpsewhich` printed
        (normally an empty string) on the deprecated `format` path.

    References
    ----------

    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.
    """

    if format is not None:
        cbook.warn_deprecated(
            "3.0",
            "The format option to find_tex_file is deprecated "
            "to allow batching multiple filenames into one call. "
            "Omitting the option should not change the result, as "
            "kpsewhich uses the filename extension to choose the path.")
        # we expect these to always be ascii encoded, but use utf-8
        # out of caution
        if isinstance(filename, bytes):
            filename = filename.decode('utf-8', errors='replace')
        if isinstance(format, bytes):
            format = format.decode('utf-8', errors='replace')

        # `format` is known to be set in this branch, so the option is
        # always passed; this legacy lookup is never cached.
        cmd = ['kpsewhich', '--format=' + format, filename]
        _log.debug('find_tex_file(%s): %s', filename, cmd)
        pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        result = pipe.communicate()[0].rstrip()
        _log.debug('find_tex_file result: %s', result)
        return result.decode('ascii')

    # A single-name batch lookup yields a one-entry mapping; return its
    # only value.
    return list(find_tex_files([filename], cache).values())[0]
10221237

10231238

10241239
# With multiple text objects per figure (e.g., tick labels) we may end

0 commit comments

Comments
 (0)