Handle dvi font names as ASCII bytestrings

Dvi is a binary format that includes some ASCII strings such as TeX names of some fonts. The associated files such as psfonts.map need to be ASCII too. This patch changes their handling to keep them as binary strings all the time. This avoids the ugly workaround try: result = some_mapping[texname] except KeyError: result = some_mapping[texname.decode('ascii')] which is essentially saying that texname is sometimes a string, sometimes a bytestring. The workaround masks real KeyErrors, leading to incomprehensible error messages such as in #6516.
matplotlib · QuLogic · Feb 26, 2017 · Aug 25, 2016 · Aug 25, 2016 · Aug 25, 2016
commit 705b021fb4959edb0d316ecaad7be7c63d765c7d
diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py
@@ -510,7 +510,7 @@ class DviFont(object):
        Name of the font as used internally by TeX and friends. This
        is usually very different from any external font names, and
        :class:`dviread.PsfontsMap` can be used to find the external
-       name of the font.
+       name of the font. ASCII bytestring.
 
     .. attribute:: size
 
@@ -526,8 +526,7 @@ class DviFont(object):
     __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')
 
     def __init__(self, scale, tfm, texname, vf):
-        if six.PY3 and isinstance(texname, bytes):
-            texname = texname.decode('ascii')
+        assert(isinstance(texname, bytes))
         self._scale, self._tfm, self.texname, self._vf = \
             scale, tfm, texname, vf
         self.size = scale * (72.0 / (72.27 * 2**16))
@@ -807,42 +806,42 @@ def __init__(self, filename):
             self._parse(file)
 
     def __getitem__(self, texname):
-        try:
-            result = self._font[texname]
-        except KeyError:
-            result = self._font[texname.decode('ascii')]
+        assert(isinstance(texname, bytes))
+        result = self._font[texname]
         fn, enc = result.filename, result.encoding
-        if fn is not None and not fn.startswith('/'):
+        if fn is not None and not fn.startswith(b'/'):
             fn = find_tex_file(fn)
-        if enc is not None and not enc.startswith('/'):
+        if enc is not None and not enc.startswith(b'/'):
             enc = find_tex_file(result.encoding)
         return result._replace(filename=fn, encoding=enc)
 
     def _parse(self, file):
-        """Parse each line into words."""
+        """Parse each line into words and process them."""
+
         for line in file:
+            line = six.b(line)
             line = line.strip()
-            if line == '' or line.startswith('%'):
+            if line == b'' or line.startswith(b'%'):
                 continue
             words, pos = [], 0
             while pos < len(line):
-                if line[pos] == '"':   # double quoted word
+                if line[pos:pos+1] == b'"':   # double quoted word
                     pos += 1
-                    end = line.index('"', pos)
+                    end = line.index(b'"', pos)
                     words.append(line[pos:end])
                     pos = end + 1
                 else:                  # ordinary word
-                    end = line.find(' ', pos+1)
+                    end = line.find(b' ', pos+1)
                     if end == -1:
                         end = len(line)
                     words.append(line[pos:end])
                     pos = end
-                while pos < len(line) and line[pos] == ' ':
+                while pos < len(line) and line[pos:pos+1] == b' ':
                     pos += 1
             self._register(words)
 
     def _register(self, words):
-        """Register a font described by "words".
+        """Register a font described by "words", a sequence of bytestrings.
 
         The format is, AFAIK: texname fontname [effects and filenames]
         Effects are PostScript snippets like ".177 SlantFont",
@@ -861,19 +860,23 @@ def _register(self, words):
         # http://tex.stackexchange.com/questions/10826/
         # http://article.gmane.org/gmane.comp.tex.pdftex/4914
 
+        # input must be bytestrings (the file format is ASCII)
+        for word in words:
+            assert(isinstance(word, bytes))
+
         texname, psname = words[:2]
-        effects, encoding, filename = '', None, None
+        effects, encoding, filename = b'', None, None
         for word in words[2:]:
-            if not word.startswith('<'):
+            if not word.startswith(b'<'):
                 effects = word
             else:
-                word = word.lstrip('<')
-                if word.startswith('[') or word.endswith('.enc'):
+                word = word.lstrip(b'<')
+                if word.startswith(b'[') or word.endswith(b'.enc'):
                     if encoding is not None:
                         matplotlib.verbose.report(
                             'Multiple encodings for %s = %s'
                             % (texname, psname), 'debug')
-                    if word.startswith('['):
+                    if word.startswith(b'['):
                         encoding = word[1:]
                     else:
                         encoding = word
@@ -884,11 +887,11 @@ def _register(self, words):
         eff = effects.split()
         effects = {}
         try:
-            effects['slant'] = float(eff[eff.index('SlantFont')-1])
+            effects['slant'] = float(eff[eff.index(b'SlantFont')-1])
         except ValueError:
             pass
         try:
-            effects['extend'] = float(eff[eff.index('ExtendFont')-1])
+            effects['extend'] = float(eff[eff.index(b'ExtendFont')-1])
         except ValueError:
             pass
 
@@ -927,26 +930,27 @@ def _parse(self, file):
 
         state = 0
         for line in file:
-            comment_start = line.find('%')
+            line = six.b(line)
+            comment_start = line.find(b'%')
             if comment_start > -1:
                 line = line[:comment_start]
             line = line.strip()
 
             if state == 0:
                 # Expecting something like /FooEncoding [
-                if '[' in line:
+                if b'[' in line:
                     state = 1
-                    line = line[line.index('[')+1:].strip()
+                    line = line[line.index(b'[')+1:].strip()
 
             if state == 1:
-                if ']' in line:  # ] def
-                    line = line[:line.index(']')]
+                if b']' in line:  # ] def
+                    line = line[:line.index(b']')]
                     state = 2
                 words = line.split()
                 for w in words:
-                    if w.startswith('/'):
+                    if w.startswith(b'/'):
                         # Allow for /abc/def/ghi
-                        subwords = w.split('/')
+                        subwords = w.split(b'/')
                         result.extend(subwords[1:])
                     else:
                         raise ValueError("Broken name in encoding file: " + w)

diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py
@@ -18,47 +18,50 @@ def test_PsfontsMap(monkeypatch):
     fontmap = dr.PsfontsMap(filename)
     # Check all properties of a few fonts
     for n in [1, 2, 3, 4, 5]:
-        key = 'TeXfont%d' % n
+        key = b'TeXfont%d' % n
         entry = fontmap[key]
         assert entry.texname == key
-        assert entry.psname == 'PSfont%d' % n
+        assert entry.psname == b'PSfont%d' % n
         if n not in [3, 5]:
-            assert entry.encoding == 'font%d.enc' % n
+            assert entry.encoding == b'font%d.enc' % n
         elif n == 3:
-            assert entry.encoding == 'enc3.foo'
+            assert entry.encoding == b'enc3.foo'
         # We don't care about the encoding of TeXfont5, which specifies
         # multiple encodings.
         if n not in [1, 5]:
-            assert entry.filename == 'font%d.pfa' % n
+            assert entry.filename == b'font%d.pfa' % n
         else:
-            assert entry.filename == 'font%d.pfb' % n
+            assert entry.filename == b'font%d.pfb' % n
         if n == 4:
             assert entry.effects == {'slant': -0.1, 'extend': 2.2}
         else:
             assert entry.effects == {}
     # Some special cases
-    entry = fontmap['TeXfont6']
+    entry = fontmap[b'TeXfont6']
     assert entry.filename is None
     assert entry.encoding is None
-    entry = fontmap['TeXfont7']
+    entry = fontmap[b'TeXfont7']
     assert entry.filename is None
-    assert entry.encoding == 'font7.enc'
-    entry = fontmap['TeXfont8']
-    assert entry.filename == 'font8.pfb'
+    assert entry.encoding == b'font7.enc'
+    entry = fontmap[b'TeXfont8']
+    assert entry.filename == b'font8.pfb'
     assert entry.encoding is None
-    entry = fontmap['TeXfont9']
-    assert entry.filename == '/absolute/font9.pfb'
+    entry = fontmap[b'TeXfont9']
+    assert entry.filename == b'/absolute/font9.pfb'
 
 
 @skip_if_command_unavailable(["kpsewhich", "-version"])
 def test_dviread():
     dir = os.path.join(os.path.dirname(__file__), 'baseline_images', 'dviread')
     with open(os.path.join(dir, 'test.json')) as f:
         correct = json.load(f)
+        for entry in correct:
+            entry['text'] = [[a, b, c, six.b(d), e]
+                             for [a, b, c, d, e] in entry['text']]
     with dr.Dvi(os.path.join(dir, 'test.dvi'), None) as dvi:
         data = [{'text': [[t.x, t.y,
                            six.unichr(t.glyph),
-                           six.text_type(t.font.texname),
+                           t.font.texname,
                            round(t.font.size, 2)]
                           for t in page.text],
                  'boxes': [[b.x, b.y, b.height, b.width] for b in page.boxes]}