Skip to content

Commit deb205a

Browse files
committed
Get the lxml treewalker working on Py2 under the joint codebase.
This hard-codes the fact that lxml uses UTF-8 (byte) strings under Py2, and adds asserts to the generic treewalker to ensure we have Unicode strings.
1 parent 82377ec commit deb205a

File tree

2 files changed

+56
-26
lines changed

2 files changed

+56
-26
lines changed

html5lib/treewalkers/_base.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,45 @@ def __iter__(self):
1717
def error(self, msg):
    """Wrap msg in a "SerializeError" token dict and return it."""
    token = {"type": "SerializeError", "data": msg}
    return token
1919

20-
def normalizeAttrs(self, attrs):
21-
newattrs = {}
22-
if attrs:
23-
#TODO: treewalkers should always have attrs
24-
for (namespace,name),value in attrs.items():
25-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
26-
assert isinstance(name, text_type)
27-
assert isinstance(value, text_type)
28-
newattrs[(namespace,name)] = value
29-
return newattrs
30-
3120
def emptyTag(self, namespace, name, attrs, hasChildren=False):
21+
assert namespace is None or isinstance(namespace, text_type), type(namespace)
22+
assert isinstance(name, text_type), type(name)
23+
assert all((namespace is None or isinstance(namespace, text_type)) and
24+
isinstance(name, text_type) and
25+
isinstance(value, text_type)
26+
for (namespace, name), value in attrs.items())
27+
3228
yield {"type": "EmptyTag", "name": name,
3329
"namespace":namespace,
34-
"data": self.normalizeAttrs(attrs)}
30+
"data": attrs}
3531
if hasChildren:
3632
yield self.error(_("Void element has children"))
3733

3834
def startTag(self, namespace, name, attrs):
35+
assert namespace is None or isinstance(namespace, text_type), type(namespace)
36+
assert isinstance(name, text_type), type(name)
37+
assert all((namespace is None or isinstance(namespace, text_type)) and
38+
isinstance(name, text_type) and
39+
isinstance(value, text_type)
40+
for (namespace, name), value in attrs.items())
41+
3942
return {"type": "StartTag",
4043
"name": name,
4144
"namespace":namespace,
42-
"data": self.normalizeAttrs(attrs)}
45+
"data": attrs}
4346

4447
def endTag(self, namespace, name):
    """Build the token for an element's end tag.

    namespace -- the element's namespace, or None; must be text if given
    name -- the element's name; must be text

    Returns a dict with "type" set to "EndTag", the given name and
    namespace, and an empty "data" dict.
    """
    assert namespace is None or isinstance(namespace, text_type), type(namespace)
    # The assertion message must describe the value that failed the check,
    # i.e. name, not namespace.
    assert isinstance(name, text_type), type(name)

    return {"type": "EndTag",
            "name": name,
            "namespace": namespace,
            "data": {}}
4955

5056
def text(self, data):
57+
assert isinstance(data, text_type), type(data)
58+
5159
data = data
5260
middle = data.lstrip(spaceCharacters)
5361
left = data[:len(data)-len(middle)]
@@ -62,16 +70,24 @@ def text(self, data):
6270
yield {"type": "SpaceCharacters", "data": right}
6371

6472
def comment(self, data):
    """Wrap comment text in a "Comment" token dict.

    data must be a text string; this is checked with an assertion whose
    message is the offending type.
    """
    assert isinstance(data, text_type), type(data)

    token = {"type": "Comment"}
    token["data"] = data
    return token
6676

6777
def doctype(self, name, publicId=None, systemId=None, correct=True):
78+
assert name is None or isinstance(name, text_type), type(name)
79+
assert publicId is None or isinstance(publicId, text_type), type(publicId)
80+
assert systemId is None or isinstance(systemId, text_type), type(systemId)
81+
6882
return {"type": "Doctype",
69-
"name": name is not None and name or "",
83+
"name": name if name is not None else "",
7084
"publicId": publicId,
7185
"systemId": systemId,
7286
"correct": correct}
7387

7488
def entity(self, name):
    """Wrap an entity name in an "Entity" token dict.

    name must be a text string; this is checked with an assertion whose
    message is the offending type.
    """
    assert isinstance(name, text_type), type(name)

    token = {"type": "Entity"}
    token["name"] = name
    return token
7692

7793
def unknown(self, nodeType):

html5lib/treewalkers/lxmletree.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
23

34
from lxml import etree
45
from html5lib.treebuilders.etree import tag_regexp
@@ -12,14 +13,23 @@
1213
from html5lib.constants import voidElements
1314
from html5lib import ihatexml
1415

16+
def ensure_str(s):
    """Return s as text, decoding anything else as strict UTF-8.

    None and values that are already text_type are returned unchanged;
    any other value has its decode("utf-8", "strict") result returned.
    """
    if s is None or isinstance(s, text_type):
        return s
    return s.decode("utf-8", "strict")
23+
1524
class Root(object):
1625
def __init__(self, et):
1726
self.elementtree = et
1827
self.children = []
1928
if et.docinfo.internalDTD:
20-
self.children.append(Doctype(self, et.docinfo.root_name,
21-
et.docinfo.public_id,
22-
et.docinfo.system_url))
29+
self.children.append(Doctype(self,
30+
ensure_str(et.docinfo.root_name),
31+
ensure_str(et.docinfo.public_id),
32+
ensure_str(et.docinfo.system_url)))
2333
root = et.getroot()
2434
node = root
2535

@@ -67,15 +77,17 @@ def __init__(self, fragment_root, obj):
6777
self.root_node = fragment_root
6878
self.obj = obj
6979
if hasattr(self.obj, 'text'):
70-
self.text = self.obj.text
80+
self.text = ensure_str(self.obj.text)
7181
else:
7282
self.text = None
7383
if hasattr(self.obj, 'tail'):
74-
self.tail = self.obj.tail
84+
self.tail = ensure_str(self.obj.tail)
7585
else:
7686
self.tail = None
7787
self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
78-
assert not self.isstring or isinstance(obj, str) or sys.version_info.major == 2
88+
# Byte strings are only expected here under Py2; decode them to text.
89+
if self.isstring:
90+
self.obj = ensure_str(self.obj)
7991

8092
def __getattr__(self, name):
8193
return getattr(self.obj, name)
@@ -120,7 +132,7 @@ def getNodeDetails(self, node):
120132
if isinstance(node, tuple): # Text node
121133
node, key = node
122134
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
123-
return _base.TEXT, getattr(node, key)
135+
return _base.TEXT, ensure_str(getattr(node, key))
124136

125137
elif isinstance(node, Root):
126138
return (_base.DOCUMENT,)
@@ -129,24 +141,26 @@ def getNodeDetails(self, node):
129141
return _base.DOCTYPE, node.name, node.public_id, node.system_id
130142

131143
elif isinstance(node, FragmentWrapper) and node.isstring:
132-
return _base.TEXT, node
144+
return _base.TEXT, node.obj
133145

134146
elif node.tag == etree.Comment:
135-
return _base.COMMENT, node.text
147+
return _base.COMMENT, ensure_str(node.text)
136148

137149
elif node.tag == etree.Entity:
138-
return _base.ENTITY, node.text[1:-1] # strip &;
150+
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
139151

140152
else:
141153
#This is assumed to be an ordinary element
142-
match = tag_regexp.match(node.tag)
154+
match = tag_regexp.match(ensure_str(node.tag))
143155
if match:
144156
namespace, tag = match.groups()
145157
else:
146158
namespace = None
147-
tag = node.tag
159+
tag = ensure_str(node.tag)
148160
attrs = {}
149161
for name, value in list(node.attrib.items()):
162+
name = ensure_str(name)
163+
value = ensure_str(value)
150164
match = tag_regexp.match(name)
151165
if match:
152166
attrs[(match.group(1),match.group(2))] = value

0 commit comments

Comments
 (0)