Skip to content

Commit 362c648

Browse files
committed
Implemented and added tests for the new list of illegal numeric character references
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401238
1 parent b067b74 commit 362c648

File tree

1 file changed

+11
-7
lines changed

1 file changed

+11
-7
lines changed

src/html5lib/tokenizer.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,17 @@ def consumeNumberEntity(self, isHex):
173173

174174
charAsInt = entitiesWindows1252[charAsInt - 128]
175175

176-
# 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
177-
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
176+
# Code points that are illegal in a numeric character reference (listed below) get replaced with U+FFFD and queue a parse error
177+
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
178+
or (0x007F <= charAsInt <= 0x009F)
179+
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
180+
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is the hexadecimal plane number 0x0..0x10
181+
or (0x10FFFF < charAsInt)):
182+
char = u"\uFFFD"
183+
self.tokenQueue.append({"type": "ParseError", "data":
184+
"illegal-codepoint-for-numeric-entity",
185+
"datavars": {"charAsInt": charAsInt}})
186+
else:
178187
try:
179188
# XXX We should have a separate function that does "int" to
180189
# "unicodestring" conversion since this doesn't always work
@@ -187,11 +196,6 @@ def consumeNumberEntity(self, isHex):
187196
self.tokenQueue.append({"type": "ParseError", "data":
188197
"cant-convert-numeric-entity",
189198
"datavars": {"charAsInt": charAsInt}})
190-
else:
191-
char = u"\uFFFD"
192-
self.tokenQueue.append({"type": "ParseError", "data":
193-
"illegal-codepoint-for-numeric-entity",
194-
"datavars": {"charAsInt": charAsInt}})
195199

196200
# Discard the ; if present. Otherwise, put it back on the queue and
197201
# invoke parseError on parser.

0 commit comments

Comments
 (0)