Skip to content

Commit b7c7de7

Browse files
committed
Optimised PCDATA Data State a bit (saves maybe 3%)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401240
1 parent cfb1e85 commit b7c7de7

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

src/html5lib/tokenizer.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -354,15 +354,22 @@ def dataState(self):
354354
self.tokenQueue.append({"type": "SpaceCharacters", "data":
355355
data + self.stream.charsUntil(spaceCharacters, True)})
356356
# No need to update lastFourChars here, since the first space will
357-
# have already broken any <!-- or --> sequences
357+
# have already been appended to lastFourChars and will have broken
358+
# any <!-- or --> sequences
358359
else:
359-
chars = self.stream.charsUntil(("&", "<", ">", "-"))
360-
self.tokenQueue.append({"type": "Characters", "data":
360+
if self.contentModelFlag in\
361+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
362+
chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
363+
self.lastFourChars += chars[-4:]
364+
self.lastFourChars = self.lastFourChars[-4:]
365+
else:
366+
chars = self.stream.charsUntil((u"&", u"<"))
367+
# lastFourChars only needs to be kept up-to-date if we're
368+
# in CDATA or RCDATA, so ignore it here
369+
self.tokenQueue.append({"type": "Characters", "data":
361370
data + chars})
362-
self.lastFourChars += chars[-4:]
363-
self.lastFourChars = self.lastFourChars[-4:]
364371
return True
365-
372+
366373
def entityDataState(self):
367374
entity = self.consumeEntity()
368375
if entity:

0 commit comments

Comments (0)