Skip to content

Commit 520a96c

Browse files
author
James Graham
committed
Phase 2 of the null handling changes
1 parent ff6fd06 commit 520a96c

File tree

3 files changed

+26
-11
lines changed

3 files changed

+26
-11
lines changed

html5lib/html5parser.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -987,12 +987,15 @@ def processSpaceCharactersDropNewline(self, token):
987987
self.tree.insertText(data)
988988

989989
def processCharacters(self, token):
990+
if token["data"] == u"\u0000":
991+
#The tokenizer should always emit null on its own
992+
return
990993
self.tree.reconstructActiveFormattingElements()
991994
self.tree.insertText(token["data"])
992995
#This must be bad for performance
993996
if (self.parser.framesetOK and
994-
any([char not in set(u"\ufffd") | spaceCharacters
995-
for char in token["data"]])):
997+
any([char not in spaceCharacters
998+
for char in token["data"]])):
996999
self.parser.framesetOK = False
9971000

9981001
def processSpaceCharacters(self, token):
@@ -2195,6 +2198,8 @@ def processEOF(self):
21952198
assert self.parser.innerHTML
21962199

21972200
def processCharacters(self, token):
2201+
if token["data"] == u"\u0000":
2202+
return
21982203
self.tree.insertText(token["data"])
21992204

22002205
def startTagOption(self, token):
@@ -2375,8 +2380,11 @@ def processCharacters(self, token):
23752380
new_token = self.parser.phases["inBody"].processCharacters(token)
23762381
self.parser.resetInsertionMode()
23772382
return new_token
2378-
2379-
self.parser.framesetOK = False
2383+
elif token["data"] == u"\u0000":
2384+
token["data"] = u"\uFFFD"
2385+
elif (not self.parser.framesetOK and
2386+
any(char not in spaceCharacters for char in token["data"])):
2387+
self.parser.framesetOK = False
23802388
Phase.processCharacters(self, token)
23812389

23822390
def processEOF(self):

html5lib/tests/test_tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
107107
if token[0] == "StartTag" or token[0] == "EndTag":
108108
token.pop()
109109

110-
if not ignoreErrorOrder:
110+
if not ignoreErrorOrder and not ignoreErrors:
111111
return expectedTokens == receivedTokens
112112
else:
113113
#Sort the tokens into two groups; non-parse errors and parse errors

html5lib/tokenizer.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ def dataState(self):
268268
self.tokenQueue.append({"type": tokenTypes["ParseError"],
269269
"data":"invalid-codepoint"})
270270
self.tokenQueue.append({"type": tokenTypes["Characters"],
271-
"data": u"\uFFFD"})
271+
"data": u"\u0000"})
272272
elif data is EOF:
273273
# Tokenization ends.
274274
return False
@@ -282,7 +282,7 @@ def dataState(self):
282282
# have already been appended to lastFourChars and will have broken
283283
# any <!-- or --> sequences
284284
else:
285-
chars = self.stream.charsUntil((u"&", u"<"))
285+
chars = self.stream.charsUntil((u"&", u"<", u"\u0000"))
286286
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
287287
data + chars})
288288
return True
@@ -646,7 +646,7 @@ def scriptDataEscapedState(self):
646646
elif data == EOF:
647647
self.state = self.dataState
648648
else:
649-
chars = self.stream.charsUntil((u"<-", u"\u0000"))
649+
chars = self.stream.charsUntil((u"<", u"-", u"\u0000"))
650650
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
651651
data + chars})
652652
return True
@@ -1150,7 +1150,7 @@ def markupDeclarationOpenState(self):
11501150
self.state = self.doctypeState
11511151
return True
11521152
elif (charStack[-1] == "[" and
1153-
self.parser is not None and
1153+
self.parser is not None and
11541154
self.parser.phase == self.parser.phases["inForeignContent"] and
11551155
self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
11561156
matched = True
@@ -1731,8 +1731,15 @@ def cdataSectionState(self):
17311731
if matched:
17321732
break
17331733
data = "".join(data)
1734+
#Deal with null here rather than in the parser
1735+
nullCount = data.count(u"\u0000")
1736+
if nullCount > 0:
1737+
for i in xrange(nullCount):
1738+
self.tokenQueue.append({"type": tokenTypes["ParseError"],
1739+
"data": "invalid-codepoint"})
1740+
data = data.replace(u"\u0000", u"\uFFFD")
17341741
if data:
1735-
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
1736-
data})
1742+
self.tokenQueue.append({"type": tokenTypes["Characters"],
1743+
"data": data})
17371744
self.state = self.dataState
17381745
return True

0 commit comments

Comments (0)