
Commit d85d895

Optimised entity lookup a bit. (Reduces tokenisation time by around 10% in some cases.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401157
1 parent bf696f4 commit d85d895

1 file changed: src/html5lib/tokenizer.py (6 additions, 2 deletions)
@@ -16,6 +16,11 @@
 
 from inputstream import HTMLInputStream
 
+# Group entities by their first character, for faster lookups
+entitiesByFirstChar = {}
+for e in entities:
+    entitiesByFirstChar.setdefault(e[0], []).append(e)
+
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
 
@@ -224,8 +229,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
         #
         # Consume characters and compare to these to a substring of the
         # entity names in the list until the substring no longer matches.
-        filteredEntityList = [e for e in entities if \
-            e.startswith(charStack[0])]
+        filteredEntityList = entitiesByFirstChar.get(charStack[0], [])
 
         def entitiesStartingWith(name):
             return [e for e in filteredEntityList if e.startswith(name)]
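
For context, a minimal runnable sketch of the bucketing idea behind this change. The small `entities` dict below is an illustrative stand-in only; the real tokenizer uses the full named-entity table imported elsewhere in the module.

# Sketch of first-character bucketing for entity lookup (stand-in data).
# NOTE: this `entities` dict is a tiny sample, not html5lib's real table.
entities = {
    "amp;": "&",
    "amp": "&",
    "aacute;": "\u00e1",
    "lt;": "<",
    "gt;": ">",
}

# Build the index once, up front (the commit does this at module import time).
entitiesByFirstChar = {}
for e in entities:
    entitiesByFirstChar.setdefault(e[0], []).append(e)

def candidates_scan(first_char):
    # Old approach: filter the whole entity table on every call.
    return [e for e in entities if e.startswith(first_char)]

def candidates_bucketed(first_char):
    # New approach: one dict lookup returns only the matching bucket.
    return entitiesByFirstChar.get(first_char, [])

assert sorted(candidates_scan("a")) == sorted(candidates_bucketed("a"))
print(candidates_bucketed("a"))  # ['amp;', 'amp', 'aacute;']
print(candidates_bucketed("x"))  # []

The effect is that consumeEntity no longer re-filters the entire entity table each time it meets an ampersand; only entitiesStartingWith still scans, and it works over the much smaller per-character bucket, which is where the reported ~10% tokenisation saving comes from.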
