diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index 45f1d066..260ed7dd 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -12,6 +12,8 @@
from html5lib import utils
from xml.sax.saxutils import escape
+import re
+
spaceCharacters = u"".join(spaceCharacters)
try:
@@ -84,7 +86,9 @@ class HTMLSerializer(object):
resolve_entities = True
# miscellaneous options
+ emit_doctype = 'preserve'
inject_meta_charset = True
+ lang_attr = 'preserve'
strip_whitespace = False
sanitize = False
@@ -92,9 +96,63 @@ class HTMLSerializer(object):
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
- "escape_rcdata", "resolve_entities", "sanitize")
+ "escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr",
+ "sanitize")
def __init__(self, **kwargs):
+ """Initialize HTMLSerializer.
+
+ Keyword options (default given first unless specified) include:
+
+ emit_doctype='preserve'|'html'|'xhtml'|'html5'
+ Whether to output a doctype.
+ * emit_doctype='xhtml' preserves unknown doctypes and valid
+ XHTML doctypes, converts valid HTML doctypes to their XHTML
+ counterparts, and drops <!DOCTYPE html>
+ * emit_doctype='html' preserves unknown doctypes and valid
+ HTML doctypes, converts valid XHTML doctypes to their HTML
+ counterparts, and uses <!DOCTYPE html> for missing doctypes
+ * emit_doctype='html5' uses <!DOCTYPE html> as the doctype
+ * emit_doctype='preserve' preserves the doctype, if any, unchanged
+ inject_meta_charset=True|False
+ ..?
+ lang_attr='preserve'|'xml'|'html'
+ Whether to translate 'lang' attributes.
+ * lang_attr='preserve' does no translation
+ * lang_attr='xml' translates 'lang' to 'xml:lang'
+ * lang_attr='html' translates 'xml:lang' to 'lang'
+ quote_attr_values=True|False
+ Whether to quote attribute values that don't require quoting
+ per HTML5 parsing rules.
+ quote_char=u'"'|u"'"
+ Use given quote character for attribute quoting. Default is to
+ use double quote unless attribute value contains a double quote,
+ in which case single quotes are used instead.
+ escape_lt_in_attrs=False|True
+ Whether to escape < in attribute values.
+ escape_rcdata=False|True
+ ..?
+ resolve_entities=True|False
+ Whether to resolve named character entities that appear in the
+ source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
+ are unaffected by this setting.
+ strip_whitespace=False|True
+ ..?
+ minimize_boolean_attributes=True|False
+ Shortens boolean attributes to give just the attribute name,
+ for example <input disabled="disabled"> becomes <input disabled>.
+ use_trailing_solidus
+ Includes a close-tag slash at the end of the start tag of void
+ elements (empty elements whose end tag is forbidden). E.g.
+ <hr/>.
+ space_before_trailing_solidus
+ Places a space immediately before the closing slash in a tag
+ using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
+ sanitize
+ Strip all unsafe or unknown constructs from output.
+ See `html5lib user documentation`_
+
+ .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
+ """
if kwargs.has_key('quote_char'):
self.use_best_quote_char = False
for attr in self.options:
@@ -102,6 +160,86 @@ def __init__(self, **kwargs):
self.errors = []
self.strict = False
+    def calc_doctype(self, token=None):
+        """Return the doctype declaration string to emit.
+
+        token is a Doctype token (a mapping with "name", "publicId" and
+        "systemId" keys), or None when the caller has seen no doctype.
+
+        The result depends on self.emit_doctype:
+          * 'html5': always u'<!DOCTYPE html>'.
+          * 'html': u'<!DOCTYPE html>' when there is no token; known
+            XHTML 1.x identifiers are rewritten to HTML 4.01 ones.
+          * 'xhtml': known HTML 3.2/4.0x identifiers are rewritten to
+            XHTML 1.0 ones.
+          * any other value (e.g. 'preserve'): the token is emitted as-is.
+        A synthesized doctype (no token) carries a trailing newline.
+        Returns u'' when there is no token and nothing is synthesized.
+        """
+        if self.emit_doctype == 'html5' or \
+           not token and self.emit_doctype == 'html':
+            if token:
+                return u'<!DOCTYPE html>'
+            else:
+                return u'<!DOCTYPE html>\n'
+
+        if not token:
+            # No doctype in the source and this mode does not invent one;
+            # without this guard token["name"] below raises TypeError.
+            return u''
+
+        # Normalise missing identifiers to u"" so the re.match calls
+        # below never receive None.
+        rootElement = token["name"] or u""
+        publicID = token["publicId"] or u""
+        systemID = token["systemId"] or u""
+
+        if re.match(u'html', rootElement, re.IGNORECASE):
+            if self.emit_doctype == u'html':
+                # A system identifier is only rewritten when one was
+                # present, so a public-only doctype stays public-only.
+                # XHTML 1.1
+                if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/strict.dtd"
+                # XHTML 1.0 Strict
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/strict.dtd"
+                # XHTML 1.0 Transitional
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/loose.dtd"
+                # XHTML 1.0 Frameset
+                elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \
+                or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"):
+                    publicID = u"-//W3C//DTD HTML 4.01 Frameset//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/html4/frameset.dtd"
+            elif self.emit_doctype == u'xhtml':
+                # HTML 4.01 Strict
+                if re.match(u"-//W3C//DTD HTML 4.0(1)?//EN", publicID) and \
+                (not systemID or \
+                re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+                # HTML4.01 Transitional
+                elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \
+                (not systemID or \
+                re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+                # HTML 4.01 Frameset
+                elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \
+                (not systemID or \
+                re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)):
+                    publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN"
+                    if systemID:
+                        systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"
+                # HTML 3.2
+                elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID:
+                    publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN"
+
+        doctype = u"<!DOCTYPE %s" % rootElement
+        if publicID:
+            doctype += u' PUBLIC "%s"' % publicID
+        elif systemID:
+            doctype += u" SYSTEM"
+        if systemID:
+            # Quote with whichever quote character the identifier does not
+            # contain; if it contains both, report it and use single quotes.
+            if systemID.find(u'"') >= 0:
+                if systemID.find(u"'") >= 0:
+                    self.serializeError(_("System identifer contains both single and double quote characters"))
+                quote_char = u"'"
+            else:
+                quote_char = u'"'
+            doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
+        doctype += u">"
+        return doctype
+
def serialize(self, treewalker, encoding=None):
in_cdata = False
self.errors = []
@@ -119,26 +257,12 @@ def serialize(self, treewalker, encoding=None):
if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
+ posted_doctype = False
for token in treewalker:
type = token["type"]
if type == "Doctype":
- doctype = u"= 0:
- if token["systemId"].find(u"'") >= 0:
- self.serializeError(_("System identifer contains both single and double quote characters"))
- quote_char = u"'"
- else:
- quote_char = u'"'
- doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
-
- doctype += u">"
-
+ posted_doctype = True
+ doctype = self.calc_doctype(token)
if encoding:
yield doctype.encode(encoding)
else:
@@ -158,6 +282,9 @@ def serialize(self, treewalker, encoding=None):
yield escape(token["data"])
elif type in ("StartTag", "EmptyTag"):
+ if not posted_doctype:
+ posted_doctype = True
+ yield self.calc_doctype()
name = token["name"]
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
@@ -166,9 +293,20 @@ def serialize(self, treewalker, encoding=None):
attrs = token["data"]
if hasattr(attrs, "items"):
attrs = attrs.items()
- attrs.sort()
attributes = []
for k,v in attrs:
+
+ # clean up xml:lang
+ if k == '{http://www.w3.org/XML/1998/namespace}lang':
+ k = 'xml:lang'
+ if self.lang_attr == 'xml':
+ if k == 'lang' and not ('xml:lang' in attrs or
+ '{http://www.w3.org/XML/1998/namespace}lang' in attrs):
+ k = 'xml:lang'
+ elif self.lang_attr == 'html':
+ if k == 'xml:lang' and not ('lang' in attrs):
+ k = 'lang'
+
if encoding:
k = k.encode(encoding, "strict")
attributes.append(' ')