
Commit d85d895

Optimised entity lookup a bit. (Reduces tokenisation time by around 10% in some cases.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401157
1 parent bf696f4 commit d85d895

1 file changed: src/html5lib/tokenizer.py (6 additions, 2 deletions)
@@ -16,6 +16,11 @@
 
 from inputstream import HTMLInputStream
 
+# Group entities by their first character, for faster lookups
+entitiesByFirstChar = {}
+for e in entities:
+    entitiesByFirstChar.setdefault(e[0], []).append(e)
+
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
 
@@ -224,8 +229,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
         #
         # Consume characters and compare to these to a substring of the
         # entity names in the list until the substring no longer matches.
-        filteredEntityList = [e for e in entities if \
-            e.startswith(charStack[0])]
+        filteredEntityList = entitiesByFirstChar.get(charStack[0], [])
 
         def entitiesStartingWith(name):
             return [e for e in filteredEntityList if e.startswith(name)]
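
For context, a minimal runnable sketch of the bucketing idea behind this change. The small `entities` dict below is an illustrative stand-in only; the real tokenizer uses the full named-entity table imported elsewhere in the module.

# Sketch of first-character bucketing for entity lookup (stand-in data).
# NOTE: this `entities` dict is a tiny sample, not html5lib's real table.
entities = {
    "amp;": "&",
    "amp": "&",
    "aacute;": "\u00e1",
    "lt;": "<",
    "gt;": ">",
}

# Build the index once, up front (the commit does this at module import time).
entitiesByFirstChar = {}
for e in entities:
    entitiesByFirstChar.setdefault(e[0], []).append(e)

def candidates_scan(first_char):
    # Old approach: filter the whole entity table on every call.
    return [e for e in entities if e.startswith(first_char)]

def candidates_bucketed(first_char):
    # New approach: one dict lookup returns only the matching bucket.
    return entitiesByFirstChar.get(first_char, [])

assert sorted(candidates_scan("a")) == sorted(candidates_bucketed("a"))
print(candidates_bucketed("a"))  # ['amp;', 'amp', 'aacute;']
print(candidates_bucketed("x"))  # []

The effect is that consumeEntity no longer re-filters the entire entity table each time it meets an ampersand; only entitiesStartingWith still scans, and it works over the much smaller per-character bucket, which is where the reported ~10% tokenisation saving comes from.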
