Skip to content

Commit 8205aae

Browse files
committed
Move entities to using DATrie if possible; if not, use a new custom Trie-like interface.
This provides a significant performance gain, both parse-time and memory-usage, over the previous entitiesStartingWith magic, regardless of which impl is used.
1 parent bfdf9bb commit 8205aae

File tree

5 files changed

+157
-18
lines changed

5 files changed

+157
-18
lines changed

html5lib/tokenizer.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@
1818

1919
from .inputstream import HTMLInputStream
2020

21-
# Group entities by their first character, for faster lookups
22-
entitiesByFirstChar = {}
23-
for e in entities:
24-
entitiesByFirstChar.setdefault(e[0], []).append(e)
21+
from .trie import Trie
22+
23+
entitiesTrie = Trie(entities)
2524

2625
class HTMLTokenizer(object):
2726
""" This class takes care of tokenizing HTML.
@@ -179,28 +178,20 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
179178
#
180179
# Consume characters and compare to these to a substring of the
181180
# entity names in the list until the substring no longer matches.
182-
filteredEntityList = entitiesByFirstChar.get(charStack[0], [])
183-
184-
def entitiesStartingWith(name):
185-
return [e for e in filteredEntityList if e.startswith(name)]
186-
187181
while (charStack[-1] is not EOF):
188-
filteredEntityList = entitiesStartingWith("".join(charStack))
189-
if not filteredEntityList:
182+
if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
190183
break
191184
charStack.append(self.stream.char())
192185

193186
# At this point we have a string that starts with some characters
194187
# that may match an entity
195-
entityName = None
196-
197188
# Try to find the longest entity the string will match to take care
198189
# of &noti for instance.
199-
for entityLength in range(len(charStack)-1, 1, -1):
200-
possibleEntityName = "".join(charStack[:entityLength])
201-
if possibleEntityName in entities:
202-
entityName = possibleEntityName
203-
break
190+
try:
191+
entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
192+
entityLength = len(entityName)
193+
except KeyError:
194+
entityName = None
204195

205196
if entityName is not None:
206197
if entityName[-1] != ";":

html5lib/trie/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from .py import Trie as PyTrie
2+
3+
Trie = PyTrie
4+
5+
try:
6+
from .datrie import Trie as DATrie
7+
except ImportError:
8+
pass
9+
else:
10+
Trie = DATrie

html5lib/trie/_base.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from collections import Mapping
2+
3+
class Trie(Mapping):
4+
"""Abstract base class for tries"""
5+
6+
def keys(self, prefix=None):
7+
keys = super().keys()
8+
9+
if prefix is None:
10+
return set(keys)
11+
12+
return {x for x in keys if x.startswith(prefix)}
13+
14+
def has_keys_with_prefix(self, prefix):
15+
for key in self.keys():
16+
if key.startswith(prefix):
17+
return True
18+
19+
return False
20+
21+
def longest_prefix(self, prefix):
22+
if prefix in self:
23+
return prefix
24+
25+
for i in range(1, len(prefix) + 1):
26+
if prefix[:-i] in self:
27+
return prefix[:-i]
28+
29+
raise KeyError(prefix)
30+
31+
def longest_prefix_item(self, prefix):
32+
lprefix = self.longest_prefix(prefix)
33+
return (lprefix, self[lprefix])

html5lib/trie/datrie.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from itertools import chain
2+
3+
from datrie import Trie as DATrie
4+
5+
from ._base import Trie as ABCTrie
6+
7+
class Trie(ABCTrie):
8+
def __init__(self, data):
9+
chars = set()
10+
for key in data.keys():
11+
if not isinstance(key, str):
12+
raise TypeError("All keys must be strings")
13+
for char in key:
14+
chars.add(char)
15+
16+
self._data = DATrie("".join(chars))
17+
for key, value in data.items():
18+
self._data[key] = value
19+
20+
def __contains__(self, key):
21+
return key in self._data
22+
23+
def __len__(self):
24+
return len(self._data)
25+
26+
def __iter__(self):
27+
raise NotImplementedError()
28+
29+
def __getitem__(self, key):
30+
return self._data[key]
31+
32+
def keys(self, prefix=None):
33+
return self._data.keys(prefix)
34+
35+
def has_keys_with_prefix(self, prefix):
36+
return self._data.has_keys_with_prefix(prefix)
37+
38+
def longest_prefix(self, prefix):
39+
return self._data.longest_prefix(prefix)
40+
41+
def longest_prefix_item(self, prefix):
42+
return self._data.longest_prefix_item(prefix)

html5lib/trie/py.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from bisect import bisect_left
2+
3+
from ._base import Trie as ABCTrie
4+
5+
class Trie(ABCTrie):
6+
def __init__(self, data):
7+
if not all(isinstance(x, str) for x in data.keys()):
8+
raise TypeError("All keys must be strings")
9+
10+
self._data = data
11+
self._keys = sorted(data.keys())
12+
self._cachestr = ""
13+
self._cachepoints = (0, len(data))
14+
15+
def __contains__(self, key):
16+
return key in self._data
17+
18+
def __len__(self):
19+
return len(self._data)
20+
21+
def __iter__(self):
22+
return iter(self._data)
23+
24+
def __getitem__(self, key):
25+
return self._data[key]
26+
27+
def keys(self, prefix=None):
28+
if prefix is None or prefix == "" or not self._keys:
29+
return set(self._keys)
30+
31+
if prefix.startswith(self._cachestr):
32+
lo, hi = self._cachepoints
33+
start = i = bisect_left(self._keys, prefix, lo, hi)
34+
else:
35+
start = i = bisect_left(self._keys, prefix)
36+
37+
keys = set()
38+
if start == len(self._keys):
39+
return keys
40+
41+
while self._keys[i].startswith(prefix):
42+
keys.add(self._keys[i])
43+
i += 1
44+
45+
self._cachestr = prefix
46+
self._cachepoints = (start, i)
47+
48+
return keys
49+
50+
def has_keys_with_prefix(self, prefix):
51+
if prefix in self._data:
52+
return True
53+
54+
if prefix.startswith(self._cachestr):
55+
lo, hi = self._cachepoints
56+
i = bisect_left(self._keys, prefix, lo, hi)
57+
else:
58+
i = bisect_left(self._keys, prefix)
59+
60+
if i == len(self._keys):
61+
return False
62+
63+
return self._keys[i].startswith(prefix)

0 commit comments

Comments
 (0)