|
3 | 3 |
|
4 | 4 | import re
|
5 | 5 |
|
| 6 | +from codecs import register_error, xmlcharrefreplace_errors |
| 7 | + |
6 | 8 | from ..constants import voidElements, booleanAttributes, spaceCharacters
|
7 | 9 | from ..constants import rcdataElements, entities, xmlEntities
|
8 | 10 | from .. import utils
|
|
21 | 23 | "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
|
22 | 24 | "\u3000]")
|
23 | 25 |
|
24 |
| -try: |
25 |
| - from codecs import register_error, xmlcharrefreplace_errors |
26 |
| -except ImportError: |
27 |
| - unicode_encode_errors = "strict" |
28 |
| -else: |
29 |
| - unicode_encode_errors = "htmlentityreplace" |
30 |
| - |
31 |
| - encode_entity_map = {} |
32 |
| - is_ucs4 = len("\U0010FFFF") == 1 |
33 |
| - for k, v in list(entities.items()): |
34 |
| - # skip multi-character entities |
35 |
| - if ((is_ucs4 and len(v) > 1) or |
36 |
| - (not is_ucs4 and len(v) > 2)): |
37 |
| - continue |
38 |
| - if v != "&": |
39 |
| - if len(v) == 2: |
40 |
| - v = utils.surrogatePairToCodepoint(v) |
41 |
| - else: |
42 |
| - v = ord(v) |
43 |
| - if v not in encode_entity_map or k.islower(): |
44 |
| - # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
45 |
| - encode_entity_map[v] = k |
46 |
| - |
47 |
| - def htmlentityreplace_errors(exc): |
48 |
| - if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
49 |
| - res = [] |
50 |
| - codepoints = [] |
51 |
| - skip = False |
52 |
| - for i, c in enumerate(exc.object[exc.start:exc.end]): |
53 |
| - if skip: |
54 |
| - skip = False |
55 |
| - continue |
56 |
| - index = i + exc.start |
57 |
| - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): |
58 |
| - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) |
59 |
| - skip = True |
60 |
| - else: |
61 |
| - codepoint = ord(c) |
62 |
| - codepoints.append(codepoint) |
63 |
| - for cp in codepoints: |
64 |
| - e = encode_entity_map.get(cp) |
65 |
| - if e: |
66 |
| - res.append("&") |
67 |
| - res.append(e) |
68 |
| - if not e.endswith(";"): |
69 |
| - res.append(";") |
70 |
| - else: |
71 |
| - res.append("&#x%s;" % (hex(cp)[2:])) |
72 |
| - return ("".join(res), exc.end) |
73 |
| - else: |
74 |
| - return xmlcharrefreplace_errors(exc) |
75 | 26 |
|
76 |
| - register_error(unicode_encode_errors, htmlentityreplace_errors) |
| 27 | +encode_entity_map = {} |
| 28 | +is_ucs4 = len("\U0010FFFF") == 1 |
| 29 | +for k, v in list(entities.items()): |
| 30 | + # skip multi-character entities |
| 31 | + if ((is_ucs4 and len(v) > 1) or |
| 32 | + (not is_ucs4 and len(v) > 2)): |
| 33 | + continue |
| 34 | + if v != "&": |
| 35 | + if len(v) == 2: |
| 36 | + v = utils.surrogatePairToCodepoint(v) |
| 37 | + else: |
| 38 | + v = ord(v) |
| 39 | + if v not in encode_entity_map or k.islower(): |
| 40 | + # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
| 41 | + encode_entity_map[v] = k |
| 42 | + |
| 43 | + |
| 44 | +def htmlentityreplace_errors(exc): |
| 45 | + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
| 46 | + res = [] |
| 47 | + codepoints = [] |
| 48 | + skip = False |
| 49 | + for i, c in enumerate(exc.object[exc.start:exc.end]): |
| 50 | + if skip: |
| 51 | + skip = False |
| 52 | + continue |
| 53 | + index = i + exc.start |
| 54 | + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): |
| 55 | + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) |
| 56 | + skip = True |
| 57 | + else: |
| 58 | + codepoint = ord(c) |
| 59 | + codepoints.append(codepoint) |
| 60 | + for cp in codepoints: |
| 61 | + e = encode_entity_map.get(cp) |
| 62 | + if e: |
| 63 | + res.append("&") |
| 64 | + res.append(e) |
| 65 | + if not e.endswith(";"): |
| 66 | + res.append(";") |
| 67 | + else: |
| 68 | + res.append("&#x%s;" % (hex(cp)[2:])) |
| 69 | + return ("".join(res), exc.end) |
| 70 | + else: |
| 71 | + return xmlcharrefreplace_errors(exc) |
77 | 72 |
|
78 |
| - del register_error |
| 73 | +register_error("htmlentityreplace", htmlentityreplace_errors) |
79 | 74 |
|
80 | 75 |
|
81 | 76 | class HTMLSerializer(object):
|
@@ -168,7 +163,7 @@ def __init__(self, **kwargs):
|
168 | 163 | def encode(self, string):
|
169 | 164 | assert(isinstance(string, text_type))
|
170 | 165 | if self.encoding:
|
171 |
| - return string.encode(self.encoding, unicode_encode_errors) |
| 166 | + return string.encode(self.encoding, "htmlentityreplace") |
172 | 167 | else:
|
173 | 168 | return string
|
174 | 169 |
|
|
0 commit comments