@@ -173,8 +173,17 @@ def consumeNumberEntity(self, isHex):
173
173
174
174
charAsInt = entitiesWindows1252 [charAsInt - 128 ]
175
175
176
- # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
177
- if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343 ):
176
+ # Certain characters get replaced with U+FFFD
177
+ if ((charAsInt <= 0x0008 ) or (charAsInt == 0x000B ) or (0x000E <= charAsInt <= 0x001F )
178
+ or (0x007F <= charAsInt <= 0x009F )
179
+ or (0xD800 <= charAsInt <= 0xDFFF ) or (0xFDD0 <= charAsInt <= 0xFDDF )
180
+ or (charAsInt & 0xFFFE == 0xFFFE ) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
181
+ or (0x10FFFF < charAsInt )):
182
+ char = u"\uFFFD "
183
+ self .tokenQueue .append ({"type" : "ParseError" , "data" :
184
+ "illegal-codepoint-for-numeric-entity" ,
185
+ "datavars" : {"charAsInt" : charAsInt }})
186
+ else :
178
187
try :
179
188
# XXX We should have a separate function that does "int" to
180
189
# "unicodestring" conversion since this doesn't always work
@@ -187,11 +196,6 @@ def consumeNumberEntity(self, isHex):
187
196
self .tokenQueue .append ({"type" : "ParseError" , "data" :
188
197
"cant-convert-numeric-entity" ,
189
198
"datavars" : {"charAsInt" : charAsInt }})
190
- else :
191
- char = u"\uFFFD "
192
- self .tokenQueue .append ({"type" : "ParseError" , "data" :
193
- "illegal-codepoint-for-numeric-entity" ,
194
- "datavars" : {"charAsInt" : charAsInt }})
195
199
196
200
# Discard the ; if present. Otherwise, put it back on the queue and
197
201
# invoke parseError on parser.
0 commit comments