Skip to content

Commit bbbb03f

Browse files
committed
Fix issue html5lib#156.
Also fix some Unicode mix-ups in the serializer, making sure text strings are always Python `unicode` objects.
1 parent c2eecb5 commit bbbb03f

File tree

2 files changed

+57
-67
lines changed

2 files changed

+57
-67
lines changed

html5lib/filters/inject_meta_charset.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,44 +13,44 @@ def __iter__(self):
1313
for token in _base.Filter.__iter__(self):
1414
type = token["type"]
1515
if type == "StartTag":
16-
if token["name"].lower() == "head":
16+
if token["name"].lower() == u"head":
1717
state = "in_head"
1818

1919
elif type == "EmptyTag":
20-
if token["name"].lower() == "meta":
20+
if token["name"].lower() == u"meta":
2121
# replace charset with actual encoding
2222
has_http_equiv_content_type = False
2323
for (namespace,name),value in token["data"].iteritems():
2424
if namespace != None:
2525
continue
26-
elif name.lower() == 'charset':
26+
elif name.lower() == u'charset':
2727
token["data"][(namespace,name)] = self.encoding
2828
meta_found = True
2929
break
30-
elif name == 'http-equiv' and value.lower() == 'content-type':
30+
elif name == u'http-equiv' and value.lower() == u'content-type':
3131
has_http_equiv_content_type = True
3232
else:
33-
if has_http_equiv_content_type and (None, "content") in token["data"]:
34-
token["data"][(None, "content")] = u'text/html; charset=%s' % self.encoding
33+
if has_http_equiv_content_type and (None, u"content") in token["data"]:
34+
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
3535
meta_found = True
3636

37-
elif token["name"].lower() == "head" and not meta_found:
37+
elif token["name"].lower() == u"head" and not meta_found:
3838
# insert meta into empty head
39-
yield {"type": "StartTag", "name": "head",
39+
yield {"type": "StartTag", "name": u"head",
4040
"data": token["data"]}
41-
yield {"type": "EmptyTag", "name": "meta",
42-
"data": {(None, "charset"): self.encoding}}
43-
yield {"type": "EndTag", "name": "head"}
41+
yield {"type": "EmptyTag", "name": u"meta",
42+
"data": {(None, u"charset"): self.encoding}}
43+
yield {"type": "EndTag", "name": u"head"}
4444
meta_found = True
4545
continue
4646

4747
elif type == "EndTag":
48-
if token["name"].lower() == "head" and pending:
48+
if token["name"].lower() == u"head" and pending:
4949
# insert meta into head (if necessary) and flush pending queue
5050
yield pending.pop(0)
5151
if not meta_found:
52-
yield {"type": "EmptyTag", "name": "meta",
53-
"data": {(None, "charset"): self.encoding}}
52+
yield {"type": "EmptyTag", "name": u"meta",
53+
"data": {(None, u"charset"): self.encoding}}
5454
while pending:
5555
yield pending.pop(0)
5656
meta_found = True

html5lib/serializer/htmlserializer.py

Lines changed: 43 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,12 @@ def htmlentityreplace_errors(exc):
7676

7777
del register_error
7878

79-
def encode(text, encoding):
80-
return text.encode(encoding, unicode_encode_errors)
8179

8280
class HTMLSerializer(object):
8381

8482
# attribute quoting options
8583
quote_attr_values = False
86-
quote_char = '"'
84+
quote_char = u'"'
8785
use_best_quote_char = True
8886

8987
# tag syntax options
@@ -159,7 +157,22 @@ def __init__(self, **kwargs):
159157
self.errors = []
160158
self.strict = False
161159

160+
def encode(self, string):
161+
assert(isinstance(string, unicode))
162+
if self.encoding:
163+
return string.encode(self.encoding, unicode_encode_errors)
164+
else:
165+
return string
166+
167+
def encodeStrict(self, string):
168+
assert(isinstance(string, unicode))
169+
if self.encoding:
170+
return string.encode(self.encoding, "strict")
171+
else:
172+
return string
173+
162174
def serialize(self, treewalker, encoding=None):
175+
self.encoding = encoding
163176
in_cdata = False
164177
self.errors = []
165178
if encoding and self.inject_meta_charset:
@@ -195,27 +208,19 @@ def serialize(self, treewalker, encoding=None):
195208
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
196209

197210
doctype += u">"
198-
199-
if encoding:
200-
yield doctype.encode(encoding)
201-
else:
202-
yield doctype
211+
yield self.encodeStrict(doctype)
203212

204213
elif type in ("Characters", "SpaceCharacters"):
205214
if type == "SpaceCharacters" or in_cdata:
206215
if in_cdata and token["data"].find("</") >= 0:
207216
self.serializeError(_("Unexpected </ in CDATA"))
208-
if encoding:
209-
yield token["data"].encode(encoding, "strict")
210-
else:
211-
yield token["data"]
212-
elif encoding:
213-
yield encode(escape(token["data"]), encoding)
217+
yield self.encode(token["data"])
214218
else:
215-
yield escape(token["data"])
219+
yield self.encode(escape(token["data"]))
216220

217221
elif type in ("StartTag", "EmptyTag"):
218222
name = token["name"]
223+
yield self.encodeStrict(u"<%s" % name)
219224
if name in rcdataElements and not self.escape_rcdata:
220225
in_cdata = True
221226
elif in_cdata:
@@ -225,69 +230,56 @@ def serialize(self, treewalker, encoding=None):
225230
#TODO: Add namespace support here
226231
k = attr_name
227232
v = attr_value
228-
if encoding:
229-
k = k.encode(encoding, "strict")
230-
attributes.append(' ')
233+
yield self.encodeStrict(u' ')
231234

232-
attributes.append(k)
235+
yield self.encodeStrict(k)
233236
if not self.minimize_boolean_attributes or \
234237
(k not in booleanAttributes.get(name, tuple()) \
235238
and k not in booleanAttributes.get("", tuple())):
236-
attributes.append("=")
239+
yield self.encodeStrict(u"=")
237240
if self.quote_attr_values or not v:
238241
quote_attr = True
239242
else:
240243
quote_attr = reduce(lambda x,y: x or (y in v),
241-
spaceCharacters + ">\"'=", False)
242-
v = v.replace("&", "&amp;")
243-
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
244-
if encoding:
245-
v = encode(v, encoding)
244+
spaceCharacters + u">\"'=", False)
245+
v = v.replace(u"&", u"&amp;")
246+
if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
246247
if quote_attr:
247248
quote_char = self.quote_char
248249
if self.use_best_quote_char:
249-
if "'" in v and '"' not in v:
250-
quote_char = '"'
251-
elif '"' in v and "'" not in v:
252-
quote_char = "'"
253-
if quote_char == "'":
254-
v = v.replace("'", "&#39;")
250+
if u"'" in v and u'"' not in v:
251+
quote_char = u'"'
252+
elif u'"' in v and u"'" not in v:
253+
quote_char = u"'"
254+
if quote_char == u"'":
255+
v = v.replace(u"'", u"&#39;")
255256
else:
256-
v = v.replace('"', "&quot;")
257-
attributes.append(quote_char)
258-
attributes.append(v)
259-
attributes.append(quote_char)
257+
v = v.replace(u'"', u"&quot;")
258+
yield self.encodeStrict(quote_char)
259+
yield self.encode(v)
260+
yield self.encodeStrict(quote_char)
260261
else:
261-
attributes.append(v)
262+
yield self.encode(v)
262263
if name in voidElements and self.use_trailing_solidus:
263264
if self.space_before_trailing_solidus:
264-
attributes.append(" /")
265+
yield self.encodeStrict(u" /")
265266
else:
266-
attributes.append("/")
267-
if encoding:
268-
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
269-
else:
270-
yield u"<%s%s>" % (name, u"".join(attributes))
267+
yield self.encodeStrict(u"/")
268+
yield self.encode(u">")
271269

272270
elif type == "EndTag":
273271
name = token["name"]
274272
if name in rcdataElements:
275273
in_cdata = False
276274
elif in_cdata:
277275
self.serializeError(_("Unexpected child element of a CDATA element"))
278-
end_tag = u"</%s>" % name
279-
if encoding:
280-
end_tag = end_tag.encode(encoding, "strict")
281-
yield end_tag
276+
yield self.encodeStrict(u"</%s>" % name)
282277

283278
elif type == "Comment":
284279
data = token["data"]
285280
if data.find("--") >= 0:
286281
self.serializeError(_("Comment contains --"))
287-
comment = u"<!--%s-->" % token["data"]
288-
if encoding:
289-
comment = comment.encode(encoding, unicode_encode_errors)
290-
yield comment
282+
yield self.encodeStrict(u"<!--%s-->" % token["data"])
291283

292284
elif type == "Entity":
293285
name = token["name"]
@@ -298,9 +290,7 @@ def serialize(self, treewalker, encoding=None):
298290
data = entities[key]
299291
else:
300292
data = u"&%s;" % name
301-
if encoding:
302-
data = data.encode(encoding, unicode_encode_errors)
303-
yield data
293+
yield self.encodeStrict(data)
304294

305295
else:
306296
self.serializeError(token["data"])

0 commit comments

Comments
 (0)