@@ -76,14 +76,12 @@ def htmlentityreplace_errors(exc):
76
76
77
77
del register_error
78
78
79
- def encode (text , encoding ):
80
- return text .encode (encoding , unicode_encode_errors )
81
79
82
80
class HTMLSerializer (object ):
83
81
84
82
# attribute quoting options
85
83
quote_attr_values = False
86
- quote_char = '"'
84
+ quote_char = u '"'
87
85
use_best_quote_char = True
88
86
89
87
# tag syntax options
@@ -159,7 +157,22 @@ def __init__(self, **kwargs):
159
157
self .errors = []
160
158
self .strict = False
161
159
160
def encode(self, string):
    """Encode ``string`` with the serializer's current output encoding.

    The encoding is read from ``self.encoding`` (assigned by
    ``serialize``).  Characters the encoding cannot represent are handed
    to the error handler named by the module-level
    ``unicode_encode_errors``.

    :arg string: the unicode text to encode
    :returns: the encoded byte string, or ``string`` unchanged when no
        encoding is in effect
    """
    assert isinstance(string, unicode), "encode() expects a unicode string"
    # ``self.encoding`` is only assigned inside serialize(); treat a
    # missing attribute as "no encoding" instead of raising
    # AttributeError when this is called before the first serialize().
    encoding = getattr(self, "encoding", None)
    if encoding:
        return string.encode(encoding, unicode_encode_errors)
    return string
166
+
167
def encodeStrict(self, string):
    """Encode ``string`` with the serializer's current output encoding,
    using the ``"strict"`` error handler.

    The encoding is read from ``self.encoding`` (assigned by
    ``serialize``).

    :arg string: the unicode text to encode
    :returns: the encoded byte string, or ``string`` unchanged when no
        encoding is in effect
    :raises UnicodeEncodeError: if ``string`` contains a character the
        encoding cannot represent
    """
    assert isinstance(string, unicode), \
        "encodeStrict() expects a unicode string"
    # ``self.encoding`` is only assigned inside serialize(); treat a
    # missing attribute as "no encoding" instead of raising
    # AttributeError when this is called before the first serialize().
    encoding = getattr(self, "encoding", None)
    if encoding:
        return string.encode(encoding, "strict")
    return string
173
+
162
174
def serialize (self , treewalker , encoding = None ):
175
+ self .encoding = encoding
163
176
in_cdata = False
164
177
self .errors = []
165
178
if encoding and self .inject_meta_charset :
@@ -195,27 +208,19 @@ def serialize(self, treewalker, encoding=None):
195
208
doctype += u" %s%s%s" % (quote_char , token ["systemId" ], quote_char )
196
209
197
210
doctype += u">"
198
-
199
- if encoding :
200
- yield doctype .encode (encoding )
201
- else :
202
- yield doctype
211
+ yield self .encodeStrict (doctype )
203
212
204
213
elif type in ("Characters" , "SpaceCharacters" ):
205
214
if type == "SpaceCharacters" or in_cdata :
206
215
if in_cdata and token ["data" ].find ("</" ) >= 0 :
207
216
self .serializeError (_ ("Unexpected </ in CDATA" ))
208
- if encoding :
209
- yield token ["data" ].encode (encoding , "strict" )
210
- else :
211
- yield token ["data" ]
212
- elif encoding :
213
- yield encode (escape (token ["data" ]), encoding )
217
+ yield self .encode (token ["data" ])
214
218
else :
215
- yield escape (token ["data" ])
219
+ yield self . encode ( escape (token ["data" ]) )
216
220
217
221
elif type in ("StartTag" , "EmptyTag" ):
218
222
name = token ["name" ]
223
+ yield self .encodeStrict (u"<%s" % name )
219
224
if name in rcdataElements and not self .escape_rcdata :
220
225
in_cdata = True
221
226
elif in_cdata :
@@ -225,69 +230,56 @@ def serialize(self, treewalker, encoding=None):
225
230
#TODO: Add namespace support here
226
231
k = attr_name
227
232
v = attr_value
228
- if encoding :
229
- k = k .encode (encoding , "strict" )
230
- attributes .append (' ' )
233
+ yield self .encodeStrict (u' ' )
231
234
232
- attributes . append (k )
235
+ yield self . encodeStrict (k )
233
236
if not self .minimize_boolean_attributes or \
234
237
(k not in booleanAttributes .get (name , tuple ()) \
235
238
and k not in booleanAttributes .get ("" , tuple ())):
236
- attributes . append ( "=" )
239
+ yield self . encodeStrict ( u "=" )
237
240
if self .quote_attr_values or not v :
238
241
quote_attr = True
239
242
else :
240
243
quote_attr = reduce (lambda x ,y : x or (y in v ),
241
- spaceCharacters + ">\" '=" , False )
242
- v = v .replace ("&" , "&" )
243
- if self .escape_lt_in_attrs : v = v .replace ("<" , "<" )
244
- if encoding :
245
- v = encode (v , encoding )
244
+ spaceCharacters + u">\" '=" , False )
245
+ v = v .replace (u"&" , u"&" )
246
+ if self .escape_lt_in_attrs : v = v .replace (u"<" , u"<" )
246
247
if quote_attr :
247
248
quote_char = self .quote_char
248
249
if self .use_best_quote_char :
249
- if "'" in v and '"' not in v :
250
- quote_char = '"'
251
- elif '"' in v and "'" not in v :
252
- quote_char = "'"
253
- if quote_char == "'" :
254
- v = v .replace ("'" , "'" )
250
+ if u "'" in v and u '"' not in v :
251
+ quote_char = u '"'
252
+ elif u '"' in v and u "'" not in v :
253
+ quote_char = u "'"
254
+ if quote_char == u "'" :
255
+ v = v .replace (u "'" , u "'" )
255
256
else :
256
- v = v .replace ('"' , """ )
257
- attributes . append (quote_char )
258
- attributes . append (v )
259
- attributes . append (quote_char )
257
+ v = v .replace (u '"' , u """ )
258
+ yield self . encodeStrict (quote_char )
259
+ yield self . encode (v )
260
+ yield self . encodeStrict (quote_char )
260
261
else :
261
- attributes . append (v )
262
+ yield self . encode (v )
262
263
if name in voidElements and self .use_trailing_solidus :
263
264
if self .space_before_trailing_solidus :
264
- attributes . append ( " /" )
265
+ yield self . encodeStrict ( u " /" )
265
266
else :
266
- attributes .append ("/" )
267
- if encoding :
268
- yield "<%s%s>" % (name .encode (encoding , "strict" ), "" .join (attributes ))
269
- else :
270
- yield u"<%s%s>" % (name , u"" .join (attributes ))
267
+ yield self .encodeStrict (u"/" )
268
+ yield self .encode (u">" )
271
269
272
270
elif type == "EndTag" :
273
271
name = token ["name" ]
274
272
if name in rcdataElements :
275
273
in_cdata = False
276
274
elif in_cdata :
277
275
self .serializeError (_ ("Unexpected child element of a CDATA element" ))
278
- end_tag = u"</%s>" % name
279
- if encoding :
280
- end_tag = end_tag .encode (encoding , "strict" )
281
- yield end_tag
276
+ yield self .encodeStrict (u"</%s>" % name )
282
277
283
278
elif type == "Comment" :
284
279
data = token ["data" ]
285
280
if data .find ("--" ) >= 0 :
286
281
self .serializeError (_ ("Comment contains --" ))
287
- comment = u"<!--%s-->" % token ["data" ]
288
- if encoding :
289
- comment = comment .encode (encoding , unicode_encode_errors )
290
- yield comment
282
+ yield self .encodeStrict (u"<!--%s-->" % token ["data" ])
291
283
292
284
elif type == "Entity" :
293
285
name = token ["name" ]
@@ -298,9 +290,7 @@ def serialize(self, treewalker, encoding=None):
298
290
data = entities [key ]
299
291
else :
300
292
data = u"&%s;" % name
301
- if encoding :
302
- data = data .encode (encoding , unicode_encode_errors )
303
- yield data
293
+ yield self .encodeStrict (data )
304
294
305
295
else :
306
296
self .serializeError (token ["data" ])
0 commit comments