Skip to content

Commit 93365ea

Browse files
committed
Re-run 3to2, and fix where something had gone uncommitted in Python3 before.
1 parent 5d925be commit 93365ea

File tree

5 files changed

+181
-19
lines changed

5 files changed

+181
-19
lines changed

html5lib/tokenizer.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,9 @@
1919

2020
from .inputstream import HTMLInputStream
2121

22-
# Group entities by their first character, for faster lookups
23-
entitiesByFirstChar = {}
24-
for e in entities:
25-
entitiesByFirstChar.setdefault(e[0], []).append(e)
22+
from .trie import Trie
23+
24+
entitiesTrie = Trie(entities)
2625

2726
class HTMLTokenizer(object):
2827
u""" This class takes care of tokenizing HTML.
@@ -183,29 +182,20 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
183182
#
184183
# Consume characters and compare to these to a substring of the
185184
# entity names in the list until the substring no longer matches.
186-
filteredEntityList = entitiesByFirstChar.get(charStack[0], [])
187-
188-
def entitiesStartingWith(name):
189-
return [e for e in filteredEntityList if e.startswith(name)]
190-
entitiesStartingWith.func_annotations = {}
191-
192185
while (charStack[-1] is not EOF):
193-
filteredEntityList = entitiesStartingWith(u"".join(charStack))
194-
if not filteredEntityList:
186+
if not entitiesTrie.has_keys_with_prefix(u"".join(charStack)):
195187
break
196188
charStack.append(self.stream.char())
197189

198190
# At this point we have a string that starts with some characters
199191
# that may match an entity
200-
entityName = None
201-
202192
# Try to find the longest entity the string will match to take care
203193
# of &noti for instance.
204-
for entityLength in xrange(len(charStack)-1, 1, -1):
205-
possibleEntityName = u"".join(charStack[:entityLength])
206-
if possibleEntityName in entities:
207-
entityName = possibleEntityName
208-
break
194+
try:
195+
entityName = entitiesTrie.longest_prefix(u"".join(charStack[:-1]))
196+
entityLength = len(entityName)
197+
except KeyError:
198+
entityName = None
209199

210200
if entityName is not None:
211201
if entityName[-1] != u";":

html5lib/trie/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from __future__ import absolute_import
2+
from .py import Trie as PyTrie
3+
4+
Trie = PyTrie
5+
6+
try:
7+
from .datrie import Trie as DATrie
8+
except ImportError:
9+
pass
10+
else:
11+
Trie = DATrie

html5lib/trie/_base.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from __future__ import absolute_import
2+
from collections import Mapping
3+
4+
class Trie(Mapping):
5+
u"""Abstract base class for tries"""
6+
7+
def keys(self, prefix=None):
8+
keys = super(Trie, self).keys()
9+
10+
if prefix is None:
11+
return set(keys)
12+
13+
return set(x for x in keys if x.startswith(prefix))
14+
keys.func_annotations = {}
15+
16+
def has_keys_with_prefix(self, prefix):
17+
for key in self.keys():
18+
if key.startswith(prefix):
19+
return True
20+
21+
return False
22+
has_keys_with_prefix.func_annotations = {}
23+
24+
def longest_prefix(self, prefix):
25+
if prefix in self:
26+
return prefix
27+
28+
for i in xrange(1, len(prefix) + 1):
29+
if prefix[:-i] in self:
30+
return prefix[:-i]
31+
32+
raise KeyError(prefix)
33+
longest_prefix.func_annotations = {}
34+
35+
def longest_prefix_item(self, prefix):
36+
lprefix = self.longest_prefix(prefix)
37+
return (lprefix, self[lprefix])
38+
longest_prefix_item.func_annotations = {}

html5lib/trie/datrie.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from __future__ import absolute_import
2+
from itertools import chain
3+
4+
from datrie import Trie as DATrie
5+
6+
from ._base import Trie as ABCTrie
7+
8+
class Trie(ABCTrie):
9+
def __init__(self, data):
10+
chars = set()
11+
for key in data.keys():
12+
if not isinstance(key, unicode):
13+
raise TypeError(u"All keys must be strings")
14+
for char in key:
15+
chars.add(char)
16+
17+
self._data = DATrie(u"".join(chars))
18+
for key, value in data.items():
19+
self._data[key] = value
20+
__init__.func_annotations = {}
21+
22+
def __contains__(self, key):
23+
return key in self._data
24+
__contains__.func_annotations = {}
25+
26+
def __len__(self):
27+
return len(self._data)
28+
__len__.func_annotations = {}
29+
30+
def __iter__(self):
31+
raise NotImplementedError()
32+
__iter__.func_annotations = {}
33+
34+
def __getitem__(self, key):
35+
return self._data[key]
36+
__getitem__.func_annotations = {}
37+
38+
def keys(self, prefix=None):
39+
return self._data.keys(prefix)
40+
keys.func_annotations = {}
41+
42+
def has_keys_with_prefix(self, prefix):
43+
return self._data.has_keys_with_prefix(prefix)
44+
has_keys_with_prefix.func_annotations = {}
45+
46+
def longest_prefix(self, prefix):
47+
return self._data.longest_prefix(prefix)
48+
longest_prefix.func_annotations = {}
49+
50+
def longest_prefix_item(self, prefix):
51+
return self._data.longest_prefix_item(prefix)
52+
longest_prefix_item.func_annotations = {}

html5lib/trie/py.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from __future__ import absolute_import
2+
from bisect import bisect_left
3+
4+
from ._base import Trie as ABCTrie
5+
6+
class Trie(ABCTrie):
7+
def __init__(self, data):
8+
if not all(isinstance(x, unicode) for x in data.keys()):
9+
raise TypeError(u"All keys must be strings")
10+
11+
self._data = data
12+
self._keys = sorted(data.keys())
13+
self._cachestr = u""
14+
self._cachepoints = (0, len(data))
15+
__init__.func_annotations = {}
16+
17+
def __contains__(self, key):
18+
return key in self._data
19+
__contains__.func_annotations = {}
20+
21+
def __len__(self):
22+
return len(self._data)
23+
__len__.func_annotations = {}
24+
25+
def __iter__(self):
26+
return iter(self._data)
27+
__iter__.func_annotations = {}
28+
29+
def __getitem__(self, key):
30+
return self._data[key]
31+
__getitem__.func_annotations = {}
32+
33+
def keys(self, prefix=None):
34+
if prefix is None or prefix == u"" or not self._keys:
35+
return set(self._keys)
36+
37+
if prefix.startswith(self._cachestr):
38+
lo, hi = self._cachepoints
39+
start = i = bisect_left(self._keys, prefix, lo, hi)
40+
else:
41+
start = i = bisect_left(self._keys, prefix)
42+
43+
keys = set()
44+
if start == len(self._keys):
45+
return keys
46+
47+
while self._keys[i].startswith(prefix):
48+
keys.add(self._keys[i])
49+
i += 1
50+
51+
self._cachestr = prefix
52+
self._cachepoints = (start, i)
53+
54+
return keys
55+
keys.func_annotations = {}
56+
57+
def has_keys_with_prefix(self, prefix):
58+
if prefix in self._data:
59+
return True
60+
61+
if prefix.startswith(self._cachestr):
62+
lo, hi = self._cachepoints
63+
i = bisect_left(self._keys, prefix, lo, hi)
64+
else:
65+
i = bisect_left(self._keys, prefix)
66+
67+
if i == len(self._keys):
68+
return False
69+
70+
return self._keys[i].startswith(prefix)
71+
has_keys_with_prefix.func_annotations = {}

0 commit comments

Comments
 (0)