Skip to content

Commit f6861ad

Browse files
committed
Type-1 subsetting
With this I can produce smaller pdf files with usetex in some small tests, but this obviously needs more extensive testing, thus marking as draft. Give dviread.DviFont a fake filename attribute for character tracking. On top of #20715. Closes #127.
1 parent e98bb83 commit f6861ad

File tree

3 files changed

+283
-10
lines changed

3 files changed

+283
-10
lines changed

lib/matplotlib/backends/backend_pdf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,8 @@ def _embedTeXFont(self, fontinfo):
981981
t1font = type1font.Type1Font(fontinfo.fontfile)
982982
if fontinfo.effects:
983983
t1font = t1font.transform(fontinfo.effects)
984+
chars = self._character_tracker.used[fontinfo.dvifont.fname]
985+
t1font = t1font.subset(chars)
984986
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
985987

986988
# Font descriptors may be shared between differently encoded
@@ -2255,6 +2257,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
22552257
seq += [['font', pdfname, dvifont.size]]
22562258
oldfont = dvifont
22572259
seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
2260+
self.file._character_tracker.track(dvifont, chr(glyph))
22582261

22592262
# Find consecutive text strings with constant y coordinate and
22602263
# combine into a sequence of strings and kerns, or just one

lib/matplotlib/dviread.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,9 @@ class DviFont:
546546
Attributes
547547
----------
548548
texname : bytes
549+
fname : str
550+
Compatibility shim so that DviFont can be used with
551+
``_backend_pdf_ps.CharacterTracker``; not a real filename.
549552
size : float
550553
Size of the font in Adobe points, converted from the slightly
551554
smaller TeX points.
@@ -570,6 +573,11 @@ def __init__(self, scale, tfm, texname, vf):
570573
self.widths = [(1000*tfm.width.get(char, 0)) >> 20
571574
for char in range(nchars)]
572575

576+
@property
def fname(self):
    """
    Stand-in "filename" for this font.

    Returns ``texname`` decoded as Latin-1, so that the font can be used
    as a key wherever a ``fname`` attribute is expected. It does not name
    a real file.
    """
    return str(self.texname, 'latin-1')
580+
573581
def __eq__(self, other):
574582
return (type(self) == type(other)
575583
and self.texname == other.texname and self.size == other.size)

lib/matplotlib/type1font.py

Lines changed: 272 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121
v1.1, 1993. ISBN 0-201-57044-0.
2222
"""
2323

24+
import base64
2425
import binascii
2526
import functools
27+
import itertools
2628
import logging
2729
import re
2830
import string
@@ -36,6 +38,35 @@
3638
_log = logging.getLogger(__name__)
3739

3840

41+
def _make_tag(set):
42+
"""
43+
Hash set into a six-character tag made of uppercase letters
44+
45+
Useful for adding a tag into subsetted fonts while keeping the code
46+
reproducible. The function always returns the same value for the
47+
same set on the same exact Python version but is not guaranteed to
48+
not have collisions.
49+
50+
Parameters
51+
----------
52+
set : iterable
53+
The set of glyphs present in a font subset
54+
55+
Returns
56+
-------
57+
str
58+
Six uppercase ASCII letters and a plus sign
59+
"""
60+
61+
# freeze the set to make it hashable, interpret the hash as bytes
62+
array = struct.pack("@q", hash(frozenset(set)))
63+
# turn the bytes into characters with b32encode, which uses uppercase
64+
# letters and numbers from 2 to 7 - remap those arbitrarily
65+
trans = str.maketrans('234567', 'MTPLIB', '=')
66+
return (base64.b32encode(array).decode('ascii')
67+
.translate(trans)[:6] + '+')
68+
69+
3970
class _Token:
4071
"""
4172
A token in a PostScript stream
@@ -627,8 +658,7 @@ def _parse_subrs(self, tokens, _data):
627658

628659
return array, next(tokens).endpos()
629660

630-
@staticmethod
631-
def _parse_charstrings(tokens, _data):
661+
def _parse_charstrings(self, tokens, _data):
632662
count_token = next(tokens)
633663
if not count_token.is_number():
634664
raise RuntimeError(
@@ -650,7 +680,12 @@ def _parse_charstrings(tokens, _data):
650680
f"Token following /{glyphname} in CharStrings definition "
651681
f"must be a number, was {nbytes_token}"
652682
)
653-
next(tokens) # usually RD or |-
683+
token = next(tokens)
684+
if not token.is_keyword(self._abbr['RD']):
685+
raise RuntimeError(
686+
"Token preceding charstring must be {self._abbr['RD']}, "
687+
f"was {token}"
688+
)
654689
binary_token = tokens.send(1+nbytes_token.value())
655690
charstrings[glyphname] = binary_token.value()
656691

@@ -681,16 +716,15 @@ def _parse_encoding(tokens, _data):
681716
continue
682717
encoding[index_token.value()] = name_token.value()
683718

684-
@staticmethod
685-
def _parse_othersubrs(tokens, data):
719+
def _parse_othersubrs(self, tokens, data):
686720
init_pos = None
687721
while True:
688722
token = next(tokens)
689723
if init_pos is None:
690724
init_pos = token.pos
691725
if token.is_delim():
692726
_expression(token, tokens, data)
693-
elif token.is_keyword('def', 'ND', '|-'):
727+
elif token.is_keyword('def', self._abbr['ND']):
694728
return data[init_pos:token.endpos()], token.endpos()
695729

696730
def transform(self, effects):
@@ -745,7 +779,7 @@ def transform(self, effects):
745779
fontmatrix = (
746780
'[%s]' % ' '.join(_format_approx(x, 6) for x in array)
747781
)
748-
replacements = (
782+
newparts = self._replace(
749783
[(x, '/FontName/%s def' % fontname)
750784
for x in self._pos['FontName']]
751785
+ [(x, '/ItalicAngle %a def' % italicangle)
@@ -755,11 +789,40 @@ def transform(self, effects):
755789
+ [(x, '') for x in self._pos.get('UniqueID', [])]
756790
)
757791

792+
return Type1Font((
793+
newparts[0],
794+
self._encrypt(newparts[1], 'eexec'),
795+
self.parts[2]
796+
))
797+
798+
def _replace(self, replacements):
799+
"""
800+
Change the font according to `replacements`
801+
802+
Parameters
803+
----------
804+
replacements : list of ((int, int), str)
805+
Each element is ((pos0, pos1), replacement) where pos0 and
806+
pos1 are indices to the original font data (parts[0] and the
807+
decrypted part concatenated). The data in the interval
808+
pos0:pos1 will be replaced by the replacement text. To
809+
accommodate binary data, the replacement is taken to be in
810+
Latin-1 encoding.
811+
812+
The case where pos0 is inside parts[0] and pos1 inside
813+
the decrypted part is not supported.
814+
815+
Returns
816+
-------
817+
(bytes, bytes)
818+
The new parts[0] and decrypted part (which needs to be
819+
encrypted in the transformed font).
820+
"""
758821
data = bytearray(self.parts[0])
759822
data.extend(self.decrypted)
760823
len0 = len(self.parts[0])
761824
for (pos0, pos1), value in sorted(replacements, reverse=True):
762-
data[pos0:pos1] = value.encode('ascii', 'replace')
825+
data[pos0:pos1] = value.encode('latin-1')
763826
if pos0 < len(self.parts[0]):
764827
if pos1 >= len(self.parts[0]):
765828
raise RuntimeError(
@@ -769,12 +832,211 @@ def transform(self, effects):
769832
len0 += len(value) - pos1 + pos0
770833

771834
data = bytes(data)
835+
return data[:len0], data[len0:]
836+
837+
def subset(self, characters):
    """
    Return a new font that only defines the given characters.

    Starting from the glyphs that the requested characters map to in
    ``prop['Encoding']`` (plus ``.notdef``), every charstring is simulated
    to collect the further glyphs and subroutines it refers to. The
    FontName, CharStrings, Subrs and Encoding definitions are then
    replaced and any UniqueID definitions are removed.

    Parameters
    ----------
    characters : iterable
        The character codes to keep. They are matched against the keys of
        ``prop['Encoding']`` (presumably integer codes - confirm against
        the caller).

    Returns
    -------
    `Type1Font`
    """
    wanted = set(characters)
    encoding = {
        code: name
        for code, name in self.prop['Encoding'].items()
        if code in wanted
    }
    encoding[0] = '.notdef'

    # Worklist traversal: `glyphs` holds every glyph name found so far,
    # `pending` the ones whose charstrings have not been simulated yet.
    glyphs = set(encoding.values())
    pending = list(glyphs)
    subrs_used = {0, 1, 2, 3}
    while pending:
        reached, called, _, _ = self._simulate(pending.pop(), [], [])
        subrs_used.update(called)
        for name in reached:
            if name not in glyphs:
                glyphs.add(name)
                pending.append(name)

    fontname = _make_tag(glyphs) + self.prop['FontName']
    charstrings = self._subset_charstrings(glyphs)
    subrs = self._subset_subrs(subrs_used)

    edits = [(span, '/FontName/%s def' % fontname)
             for span in self._pos['FontName']]
    edits += [
        (self._pos['CharStrings'][0], charstrings),
        (self._pos['Subrs'][0], subrs),
        (self._pos['Encoding'][0], self._subset_encoding(encoding)),
    ]
    edits += [(span, '') for span in self._pos.get('UniqueID', [])]
    newparts = self._replace(edits)

    # Only the second part is encrypted; the trailer is reused unchanged.
    return Type1Font((
        newparts[0],
        self._encrypt(newparts[1], 'eexec'),
        self.parts[2]
    ))
777883

884+
@staticmethod
885+
def _charstring_tokens(data):
886+
data = iter(data)
887+
for byte in data:
888+
if 32 <= byte <= 246:
889+
yield byte - 139
890+
elif 247 <= byte <= 250:
891+
byte2 = next(data)
892+
yield (byte-247) * 256 + byte2 + 108
893+
elif 251 <= byte <= 254:
894+
byte2 = next(data)
895+
yield -(byte-251)*256 - byte2 - 108
896+
elif byte == 255:
897+
bs = itertools.islice(data, 4)
898+
yield struct.unpack('>i', bs)[0]
899+
elif byte == 12:
900+
byte1 = next(data)
901+
yield {
902+
0: 'dotsection',
903+
1: 'vstem3',
904+
2: 'hstem3',
905+
6: 'seac',
906+
7: 'sbw',
907+
12: 'div',
908+
16: 'callothersubr',
909+
17: 'pop',
910+
33: 'setcurrentpoint'
911+
}[byte1]
912+
else:
913+
yield {
914+
1: 'hstem',
915+
3: 'vstem',
916+
4: 'vmoveto',
917+
5: 'rlineto',
918+
6: 'hlineto',
919+
7: 'vlineto',
920+
8: 'rrcurveto',
921+
9: 'closepath',
922+
10: 'callsubr',
923+
11: 'return',
924+
13: 'hsbw',
925+
14: 'endchar',
926+
21: 'rmoveto',
927+
22: 'hmoveto',
928+
30: 'vhcurveto',
929+
31: 'hvcurveto'
930+
}[byte]
931+
932+
def _step(self, buildchar_stack, postscript_stack, opcode):
933+
if isinstance(opcode, int):
934+
return set(), set(), buildchar_stack + [opcode], postscript_stack
935+
elif opcode in {
936+
'hsbw', 'sbw', 'closepath', 'hlineto', 'hmoveto', 'hcurveto',
937+
'hvcurveto', 'rlineto', 'rmoveto', 'rrcurveto', 'vhcurveto',
938+
'vlineto', 'vmoveto', 'dotsection', 'hstem', 'hstem3', 'vstem',
939+
'vstem3', 'setcurrentpoint'
940+
}:
941+
return set(), set(), [], postscript_stack
942+
elif opcode == 'seac':
943+
codes = buildchar_stack[3:5]
944+
glyphs = [self.prop['Encoding'][x] for x in codes]
945+
return set(glyphs), set(), [], postscript_stack
946+
elif opcode == 'div':
947+
num1, num2 = buildchar_stack[-2:]
948+
return (
949+
set(),
950+
set(),
951+
buildchar_stack[-2:] + [num1/num2], postscript_stack
952+
)
953+
elif opcode == 'callothersubr':
954+
othersubr = buildchar_stack[-1]
955+
n = buildchar_stack[-2]
956+
args = buildchar_stack[-2-n:-2]
957+
if othersubr == 3: # Section 8.1 in Type-1 spec
958+
postscript_stack.append(args[0])
959+
else:
960+
postscript_stack.extend(args[::-1])
961+
return set(), set(), buildchar_stack[:-n-2], postscript_stack
962+
elif opcode == 'callsubr':
963+
subr = buildchar_stack[-1]
964+
glyphs, subrs, new_bc_stack, new_ps_stack = \
965+
self._simulate(subr, buildchar_stack[:-1], postscript_stack)
966+
return set(), subrs | {subr}, new_bc_stack, new_ps_stack
967+
elif opcode == 'pop':
968+
return (
969+
set(),
970+
set(),
971+
buildchar_stack + [postscript_stack[-1]], postscript_stack[:-1]
972+
)
973+
else:
974+
raise RuntimeError(f'opcode {opcode}')
975+
976+
def _simulate(self, glyph_or_subr, buildchar_stack, postscript_stack):
977+
if isinstance(glyph_or_subr, str):
978+
program = self.prop['CharStrings'][glyph_or_subr]
979+
glyphs = {glyph_or_subr}
980+
subrs = set()
981+
else:
982+
program = self.prop['Subrs'][glyph_or_subr]
983+
glyphs = set()
984+
subrs = {glyph_or_subr}
985+
for opcode in self._charstring_tokens(program):
986+
if opcode in ('return', 'endchar'):
987+
return glyphs, subrs, buildchar_stack, postscript_stack
988+
newglyphs, newsubrs, buildchar_stack, postscript_stack = \
989+
self._step(buildchar_stack, postscript_stack, opcode)
990+
glyphs.update(newglyphs)
991+
subrs.update(newsubrs)
992+
993+
def _subset_encoding(self, encoding):
994+
result = [
995+
'/Encoding 256 array\n0 1 255 { 1 index exch /.notdef put } for'
996+
]
997+
result.extend(
998+
f'dup {i} /{glyph} put'
999+
for i, glyph in sorted(encoding.items())
1000+
if glyph != '.notdef'
1001+
)
1002+
result.extend('readonly def\n')
1003+
return '\n'.join(result)
1004+
1005+
def _subset_charstrings(self, glyphs):
1006+
result = [f'/CharStrings {len(glyphs)} dict dup begin']
1007+
encrypted = [self._encrypt(self.prop['CharStrings'][glyph],
1008+
'charstring',
1009+
self.prop.get('lenIV', 4)
1010+
).decode('latin-1')
1011+
for glyph in glyphs]
1012+
RD, ND = self._abbr['RD'], self._abbr['ND']
1013+
result.extend(
1014+
f'/{glyph} {len(enc)} {RD} {enc} {ND}'
1015+
for glyph, enc in zip(glyphs, encrypted)
1016+
)
1017+
result.append('end\n')
1018+
return '\n'.join(result)
1019+
1020+
def _subset_subrs(self, indices):
1021+
# we can't remove subroutines, we just replace unused ones with a stub
1022+
n_subrs = len(self.prop['Subrs'])
1023+
result = [f'/Subrs {n_subrs} array']
1024+
lenIV = self.prop.get('lenIV', 4)
1025+
stub = self._encrypt(b'\x0b', 'charstring', lenIV).decode('latin-1')
1026+
encrypted = [
1027+
self._encrypt(self.prop['Subrs'][i], 'charstring', lenIV
1028+
).decode('latin-1')
1029+
if i in indices else stub
1030+
for i in range(n_subrs)
1031+
]
1032+
RD, ND, NP = self._abbr['RD'], self._abbr['ND'], self._abbr['NP']
1033+
result.extend(
1034+
f'dup {i} {len(enc)} {RD} {enc} {NP}'
1035+
for i, enc in enumerate(encrypted)
1036+
)
1037+
result.extend((ND, ''))
1038+
return '\n'.join(result)
1039+
7781040

7791041
_StandardEncoding = {
7801042
**{ord(letter): letter for letter in string.ascii_letters},

0 commit comments

Comments
 (0)