Skip to content

Commit 9cc3c2a

Browse files
committed
Fix for issue 143; deal with handling non-BMP codepoints in serialization to non-unicode encodings.
1 parent 4fa9fda commit 9cc3c2a

File tree

4 files changed

+42
-14
lines changed

4 files changed

+42
-14
lines changed

src/html5lib/inputstream.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
77
from constants import encodings, ReparseException
8+
import utils
89

910
#Non-unicode versions of constants for use in the pre-parser
1011
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
@@ -381,14 +382,9 @@ def characterErrorsUCS2(self, data):
381382
codepoint = ord(match.group())
382383
pos = match.start()
383384
#Pretty sure there should be endianness issues here
384-
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
385-
pos < len(data) - 1 and
386-
ord(data[pos + 1]) >= 0xDC00 and
387-
ord(data[pos + 1]) <= 0xDFFF):
385+
if utils.isSurrogatePair(data[pos:pos+2]):
388386
#We have a surrogate pair!
389-
#From a perl manpage
390-
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
391-
(ord(data[pos + 1]) - 0xDC00))
387+
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
392388
if char_val in non_bmp_invalid_codepoints:
393389
self.errors.append("invalid-codepoint")
394390
skip = True

src/html5lib/serializer/htmlserializer.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
1111
from html5lib.constants import rcdataElements, entities, xmlEntities
12-
12+
from html5lib import utils
1313
from xml.sax.saxutils import escape
1414

1515
spaceCharacters = u"".join(spaceCharacters)
@@ -27,20 +27,33 @@
2727
for k, v in entities.items():
2828
if v != "&" and encode_entity_map.get(v) != k.lower():
2929
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
30-
encode_entity_map[v] = k
30+
encode_entity_map[ord(v)] = k
3131

3232
def htmlentityreplace_errors(exc):
3333
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
3434
res = []
35-
for c in exc.object[exc.start:exc.end]:
36-
e = encode_entity_map.get(c)
35+
codepoints = []
36+
skip = False
37+
for i, c in enumerate(exc.object[exc.start:exc.end]):
38+
if skip:
39+
skip = False
40+
continue
41+
index = i + exc.start
42+
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
43+
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
44+
skip = True
45+
else:
46+
codepoint = ord(c)
47+
codepoints.append(codepoint)
48+
for cp in codepoints:
49+
e = encode_entity_map.get(cp)
3750
if e:
3851
res.append("&")
3952
res.append(e)
4053
if not e.endswith(";"):
4154
res.append(";")
4255
else:
43-
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
56+
res.append("&#x%s;"%(hex(cp)[2:]))
4457
return (u"".join(res), exc.end)
4558
else:
4659
return xmlcharrefreplace_errors(exc)

src/html5lib/utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,4 +153,23 @@ def __deepcopy__(self, memo={}):
153153
result = self.__class__()
154154
memo[id(self)] = result
155155
result.__init__(deepcopy(tuple(self), memo))
156-
return result
156+
return result
157+
158+
#Some utility functions to deal with weirdness around UCS2 vs UCS4
159+
#python builds
160+
161+
def encodingType():
    """Report whether the running interpreter is a narrow or wide build.

    Returns the string "UCS2" when the interpreter stores non-BMP
    characters as surrogate pairs (sys.maxunicode == 0xFFFF), and
    "UCS4" otherwise.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import sys
    # The previous version called len() with no argument, which raises
    # TypeError on every call; sys.maxunicode answers the same question
    # without needing a non-BMP literal in the source.
    if sys.maxunicode == 0xFFFF:
        return "UCS2"
    else:
        return "UCS4"
166+
167+
def isSurrogatePair(data):
    """Return True if data is exactly a high surrogate followed by a low one.

    data is a sequence of characters; anything whose length is not 2 is
    rejected. The first item must lie in 0xD800-0xDBFF and the second in
    0xDC00-0xDFFF.
    """
    if len(data) != 2:
        return False
    # Check the leading unit first so the trailing one is only inspected
    # when the lead is a valid high surrogate.
    if not 0xD800 <= ord(data[0]) <= 0xDBFF:
        return False
    return 0xDC00 <= ord(data[1]) <= 0xDFFF
171+
172+
def surrogatePairToCodepoint(data):
    """Combine a surrogate pair into the codepoint it represents.

    data[0] is taken as the high surrogate and data[1] as the low
    surrogate; no validation is performed here. Returns the codepoint
    as an integer.
    """
    # Each surrogate carries 10 bits of the offset above 0x10000.
    high_bits = ord(data[0]) - 0xD800
    low_bits = ord(data[1]) - 0xDC00
    return 0x10000 + high_bits * 0x400 + low_bits

tests/test_serializer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def buildTestSuite():
132132
allTests.append(unittest.TestLoader().loadTestsFromTestCase(LxmlTestCase))
133133

134134
return unittest.TestSuite(allTests)
135-
135+
136136

137137
def main():
138138
buildTestSuite()

0 commit comments

Comments
 (0)