Skip to content

Commit b7c7de7

Browse files
committed
Optimised PCDATA Data State a bit (saves maybe 3%)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401240
1 parent cfb1e85 commit b7c7de7

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

src/html5lib/tokenizer.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -354,15 +354,22 @@ def dataState(self):
354354
self.tokenQueue.append({"type": "SpaceCharacters", "data":
355355
data + self.stream.charsUntil(spaceCharacters, True)})
356356
# No need to update lastFourChars here, since the first space will
357-
# have already broken any <!-- or --> sequences
357+
# have already been appended to lastFourChars and will have broken
358+
# any <!-- or --> sequences
358359
else:
359-
chars = self.stream.charsUntil(("&", "<", ">", "-"))
360-
self.tokenQueue.append({"type": "Characters", "data":
360+
if self.contentModelFlag in\
361+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
362+
chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
363+
self.lastFourChars += chars[-4:]
364+
self.lastFourChars = self.lastFourChars[-4:]
365+
else:
366+
chars = self.stream.charsUntil((u"&", u"<"))
367+
# lastFourChars only needs to be kept up-to-date if we're
368+
# in CDATA or RCDATA, so ignore it here
369+
self.tokenQueue.append({"type": "Characters", "data":
361370
data + chars})
362-
self.lastFourChars += chars[-4:]
363-
self.lastFourChars = self.lastFourChars[-4:]
364371
return True
365-
372+
366373
def entityDataState(self):
367374
entity = self.consumeEntity()
368375
if entity:

0 commit comments

Comments (0)