|
14 | 14 | from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
|
15 | 15 | from constants import digits, hexDigits, EOF
|
16 | 16 | from constants import tokenTypes, tagTokenTypes
|
| 17 | +from constants import replacementCharacters |
17 | 18 |
|
18 | 19 | from inputstream import HTMLInputStream
|
19 | 20 |
|
@@ -96,29 +97,37 @@ def consumeNumberEntity(self, isHex):
|
96 | 97 | # Convert the set of characters consumed to an int.
|
97 | 98 | charAsInt = int("".join(charStack), radix)
|
98 | 99 |
|
99 |
| - if charAsInt == 13: |
| 100 | + # Certain characters get replaced with others |
| 101 | + if charAsInt in replacementCharacters: |
| 102 | + char = replacementCharacters[charAsInt] |
100 | 103 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
101 |
| - "incorrect-cr-newline-entity"}) |
102 |
| - charAsInt = 10 |
103 |
| - elif 127 < charAsInt < 160: |
104 |
| - # If the integer is between 127 and 160 (so 128 and bigger and 159 |
105 |
| - # and smaller) we need to do the "windows trick". |
106 |
| - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
107 |
| - "illegal-windows-1252-entity"}) |
108 |
| - |
109 |
| - charAsInt = entitiesWindows1252[charAsInt - 128] |
110 |
| - |
111 |
| - # Certain characters get replaced with U+FFFD |
112 |
| - if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F) |
113 |
| - or (0x007F <= charAsInt <= 0x009F) |
114 |
| - or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF) |
115 |
| - or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10 |
116 |
| - or (0x10FFFF < charAsInt)): |
| 104 | + "illegal-codepoint-for-numeric-entity", |
| 105 | + "datavars": {"charAsInt": charAsInt}}) |
| 106 | + elif ((0xD800 <= charAsInt <= 0xDFFF) or |
| 107 | + (charAsInt > 0x10FFFF)): |
117 | 108 | char = u"\uFFFD"
|
118 | 109 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
119 | 110 | "illegal-codepoint-for-numeric-entity",
|
120 | 111 | "datavars": {"charAsInt": charAsInt}})
|
121 | 112 | else:
|
| 113 | + # TODO: speed up this check, e.g. by moving the frozenset below into a module-level constant so it is not rebuilt on every call |
| 114 | + if ((0x0001 <= charAsInt <= 0x0008) or |
| 115 | + (0x000E <= charAsInt <= 0x001F) or |
| 116 | + (0x007F <= charAsInt <= 0x009F) or |
| 117 | + (0xFDD0 <= charAsInt <= 0xFDEF) or |
| 118 | + charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, |
| 119 | + 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, |
| 120 | + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, |
| 121 | + 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, |
| 122 | + 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, |
| 123 | + 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, |
| 124 | + 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, |
| 125 | + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, |
| 126 | + 0xFFFFF, 0x10FFFE, 0x10FFFF])): |
| 127 | + self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| 128 | + "data": |
| 129 | + "illegal-codepoint-for-numeric-entity", |
| 130 | + "datavars": {"charAsInt": charAsInt}}) |
122 | 131 | try:
|
123 | 132 | # XXX We should have a separate function that does "int" to
|
124 | 133 | # "unicodestring" conversion since this doesn't always work
|
|
0 commit comments