13
13
import logging
14
14
import math
15
15
import os
16
- import re
17
16
import string
18
17
import struct
19
18
import sys
@@ -119,25 +118,6 @@ def _fill(strings, linelen=75):
119
118
result .append (b' ' .join (strings [lasti :]))
120
119
return b'\n ' .join (result )
121
120
122
# PDF strings are supposed to be able to include any eight-bit data,
# except that unbalanced parens and backslashes must be escaped by a
# backslash.  However, sf bug #2708559 shows that the carriage return
# character may get read as a newline; these characters (CR and LF)
# correspond to \gamma and \Omega in TeX's math font encoding.  Escaping
# them fixes the bug.
_string_escape_regex = re.compile(br'([\\()\r\n])')


def _string_escape(match):
    """
    Return the PDF escape sequence for one byte matched by
    ``_string_escape_regex``.

    Intended for use as the replacement callable of
    ``_string_escape_regex.sub``.

    Parameters
    ----------
    match : re.Match
        A bytes match whose ``group(0)`` is a backslash, a parenthesis, a
        line feed or a carriage return.

    Returns
    -------
    bytes
        The matched byte preceded by a backslash, or the two-byte sequences
        ``\\n`` / ``\\r`` for line feed / carriage return.

    Raises
    ------
    AssertionError
        If the matched bytes are none of the above, i.e. the regex and this
        function have gone out of sync.
    """
    m = match.group(0)
    # Compare against exact one-byte values; ``in`` on a bytes object would
    # be a subsequence test and would also accept b''.
    if m in (b'\\', b'(', b')'):
        return b'\\' + m
    if m == b'\n':
        return br'\n'
    if m == b'\r':
        return br'\r'
    # Raised explicitly so the check is not stripped under ``python -O``.
    raise AssertionError(f"unexpected match for PDF string escape: {m!r}")
140
-
141
121
142
122
def _create_pdf_info_dict (backend , metadata ):
143
123
"""
@@ -267,6 +247,15 @@ def _get_link_annotation(gc, x, y, width, height):
267
247
return link_annotation
268
248
269
249
250
# PDF strings are supposed to be able to include any eight-bit data, except
# that unbalanced parens and backslashes must be escaped by a backslash.
# However, sf bug #2708559 shows that the carriage return character may get
# read as a newline; these characters (CR and LF) correspond to \gamma and
# \Omega in TeX's math font encoding.  Escaping them fixes the bug.
#
# Translation table for ``str.translate``: every key must be exactly one
# character (``str.maketrans`` raises ValueError otherwise), and each value
# is the backslash-escaped spelling of that character.
_str_escapes = str.maketrans({
    '\\': '\\\\',
    '(': '\\(',
    ')': '\\)',
    '\n': '\\n',
    '\r': '\\r',
})
257
+
258
+
270
259
def pdfRepr (obj ):
271
260
"""Map Python objects to PDF syntax."""
272
261
@@ -292,22 +281,21 @@ def pdfRepr(obj):
292
281
elif isinstance (obj , (int , np .integer )):
293
282
return b"%d" % obj
294
283
295
- # Unicode strings are encoded in UTF-16BE with byte-order mark.
284
+ # Non-ASCII Unicode strings are encoded in UTF-16BE with byte-order mark.
296
285
elif isinstance (obj , str ):
297
- try :
298
- # But maybe it's really ASCII?
299
- s = obj .encode ('ASCII' )
300
- return pdfRepr (s )
301
- except UnicodeEncodeError :
302
- s = codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' )
303
- return pdfRepr (s )
286
+ return pdfRepr (obj .encode ('ascii' ) if obj .isascii ()
287
+ else codecs .BOM_UTF16_BE + obj .encode ('UTF-16BE' ))
304
288
305
289
# Strings are written in parentheses, with backslashes and parens
306
290
# escaped. Actually balanced parens are allowed, but it is
307
291
# simpler to escape them all. TODO: cut long strings into lines;
308
292
# I believe there is some maximum line length in PDF.
293
+ # Despite the extra decode/encode, translate is faster than regex.
309
294
elif isinstance (obj , bytes ):
310
- return b'(' + _string_escape_regex .sub (_string_escape , obj ) + b')'
295
+ return (
296
+ b'(' +
297
+ obj .decode ('latin-1' ).translate (_str_escapes ).encode ('latin-1' )
298
+ + b')' )
311
299
312
300
# Dictionaries. The keys must be PDF names, so if we find strings
313
301
# there, we make Name objects from them. The values may be
0 commit comments