Skip to content

Commit f5f28de

Browse files
committed
Fix handling of numeric entity refs
1 parent 002347d commit f5f28de

File tree

2 files changed

+64
-17
lines changed

2 files changed

+64
-17
lines changed

src/html5lib/constants.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,44 @@
878878
"zwnj;": u"\u200C"
879879
}
880880

881+
# Code points that must not be emitted as-is when they appear in a numeric
# character reference (e.g. "&#x80;"), mapped to the character emitted in
# their place:
#   * 0x00 becomes U+FFFD REPLACEMENT CHARACTER,
#   * 0x0D (carriage return) becomes U+000A (line feed),
#   * 0x80-0x9F are read as Windows-1252 bytes; the positions Windows-1252
#     leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to themselves.
# Each key appears exactly once; a repeated key in a dict literal silently
# overrides the earlier entry.
replacementCharacters = {
    0x00: u"\uFFFD",
    0x0D: u"\u000A",
    0x80: u"\u20AC",
    0x81: u"\u0081",
    0x82: u"\u201A",
    0x83: u"\u0192",
    0x84: u"\u201E",
    0x85: u"\u2026",
    0x86: u"\u2020",
    0x87: u"\u2021",
    0x88: u"\u02C6",
    0x89: u"\u2030",
    0x8A: u"\u0160",
    0x8B: u"\u2039",
    0x8C: u"\u0152",
    0x8D: u"\u008D",
    0x8E: u"\u017D",
    0x8F: u"\u008F",
    0x90: u"\u0090",
    0x91: u"\u2018",
    0x92: u"\u2019",
    0x93: u"\u201C",
    0x94: u"\u201D",
    0x95: u"\u2022",
    0x96: u"\u2013",
    0x97: u"\u2014",
    0x98: u"\u02DC",
    0x99: u"\u2122",
    0x9A: u"\u0161",
    0x9B: u"\u203A",
    0x9C: u"\u0153",
    0x9D: u"\u009D",
    0x9E: u"\u017E",
    0x9F: u"\u0178",
}
918+
881919
encodings = {
882920
'437': 'cp437',
883921
'850': 'cp850',

src/html5lib/tokenizer.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
1515
from constants import digits, hexDigits, EOF
1616
from constants import tokenTypes, tagTokenTypes
17+
from constants import replacementCharacters
1718

1819
from inputstream import HTMLInputStream
1920

@@ -96,29 +97,37 @@ def consumeNumberEntity(self, isHex):
9697
# Convert the set of characters consumed to an int.
9798
charAsInt = int("".join(charStack), radix)
9899

99-
if charAsInt == 13:
100+
# Code points listed in replacementCharacters are swapped for their mapped character
101+
if charAsInt in replacementCharacters:
102+
char = replacementCharacters[charAsInt]
100103
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
101-
"incorrect-cr-newline-entity"})
102-
charAsInt = 10
103-
elif 127 < charAsInt < 160:
104-
# If the integer is between 127 and 160 (so 128 and bigger and 159
105-
# and smaller) we need to do the "windows trick".
106-
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
107-
"illegal-windows-1252-entity"})
108-
109-
charAsInt = entitiesWindows1252[charAsInt - 128]
110-
111-
# Certain characters get replaced with U+FFFD
112-
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
113-
or (0x007F <= charAsInt <= 0x009F)
114-
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
115-
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
116-
or (0x10FFFF < charAsInt)):
104+
"illegal-codepoint-for-numeric-entity",
105+
"datavars": {"charAsInt": charAsInt}})
106+
elif ((0xD800 <= charAsInt <= 0xDFFF) or
107+
(charAsInt > 0x10FFFF)):
117108
char = u"\uFFFD"
118109
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
119110
"illegal-codepoint-for-numeric-entity",
120111
"datavars": {"charAsInt": charAsInt}})
121112
else:
113+
# TODO: speed up this check, e.g. by hoisting the frozenset below into a module-level constant instead of rebuilding it on every call
114+
if ((0x0001 <= charAsInt <= 0x0008) or
115+
(0x000E <= charAsInt <= 0x001F) or
116+
(0x007F <= charAsInt <= 0x009F) or
117+
(0xFDD0 <= charAsInt <= 0xFDEF) or
118+
charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
119+
0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
120+
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
121+
0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
122+
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
123+
0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
124+
0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
125+
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
126+
0xFFFFF, 0x10FFFE, 0x10FFFF])):
127+
self.tokenQueue.append({"type": tokenTypes["ParseError"],
128+
"data":
129+
"illegal-codepoint-for-numeric-entity",
130+
"datavars": {"charAsInt": charAsInt}})
122131
try:
123132
# XXX We should have a separate function that does "int" to
124133
# "unicodestring" conversion since this doesn't always work

0 commit comments

Comments
 (0)