diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 45f1d066..260ed7dd 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -12,6 +12,8 @@ from html5lib import utils from xml.sax.saxutils import escape +import re + spaceCharacters = u"".join(spaceCharacters) try: @@ -84,7 +86,9 @@ class HTMLSerializer(object): resolve_entities = True # miscellaneous options + emit_doctype = 'preserve' inject_meta_charset = True + lang_attr = 'preserve' strip_whitespace = False sanitize = False @@ -92,9 +96,63 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "resolve_entities", "emit_doctype", "lang_attr", + "sanitize") def __init__(self, **kwargs): + """Initialize HTMLSerializer. + + Keyword options (default given first unless specified) include: + + emit_doctype='html'|'xhtml'|'html5'|'preserve' + Whether to output a doctype. + * emit_doctype='xhtml' preserves unknown doctypes and valid + XHTML doctypes, converts valid HTML doctypes to their XHTML + counterparts, and drops + * emit_doctype='html' preserves unknown doctypes and valid + HTML doctypes, converts valid XHTML doctypes to their HTML + counterparts, and uses for missing doctypes + * emit_doctype='html5' Uses as the doctype + * emit_doctype='preserve' preserves the doctype, if any, unchanged + inject_meta_charset=True|False + ..? + lang_attr='preserve'|'xml'|'html' + Whether to translate 'lang' attributes. + * lang_attr='preserve' does no translation + * lang_attr='xml' translates 'lang' to 'xml:lang' + * lang_attr='html' translates 'xml:lang' to 'lang' + quote_attr_values=True|False + Whether to quote attribute values that don't require quoting + per HTML5 parsing rules. 
+ quote_char=u'"'|u"'" + Use given quote character for attribute quoting. Default is to + use double quote unless attribute value contains a double quote, + in which case single quotes are used instead. + escape_lt_in_attrs=False|True + Whether to escape < in attribute values. + escape_rcdata=False|True + ..? + resolve_entities=True|False + Whether to resolve named character entities that appear in the + source tree. The XML predefined entities < > & " ' + are unaffected by this setting. + strip_whitespace=False|True + ..? + minimize_boolean_attributes=True|False + Shortens boolean attributes to give just the attribute value, + for example <input disabled="disabled"> becomes <input disabled>. + use_trailing_solidus + Includes a close-tag slash at the end of the start tag of void + elements (empty elements whose end tag is forbidden). E.g. <hr/>
. + space_before_trailing_solidus + Places a space immediately before the closing slash in a tag + using a trailing solidus. E.g. <hr />
. Requires use_trailing_solidus. + sanitize + Strip all unsafe or unknown constructs from output. + See `html5lib user documentation`_ + + .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """ if kwargs.has_key('quote_char'): self.use_best_quote_char = False for attr in self.options: @@ -102,6 +160,86 @@ def __init__(self, **kwargs): self.errors = [] self.strict = False + def calc_doctype(self, token=None): + if self.emit_doctype == 'html5' or \ + not token and self.emit_doctype == 'html': + if token: + return u'' + else: + return u'\n' + + rootElement = token["name"] + publicID = token["publicId"] + systemID = token["systemId"] + + if re.match(u'html', rootElement, re.IGNORECASE): + if self.emit_doctype == u'html': + # XHTML 1.1 + if publicID == u"-//W3C//DTD XHTML 1.1//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Strict + elif publicID == u"-//W3C//DTD XHTML 1.0 Strict//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"): + publicID = u"-//W3C//DTD HTML 4.01//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/strict.dtd" + # XHTML 1.0 Transitional + elif publicID == u"-//W3C//DTD XHTML 1.0 Transitional//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/loose.dtd" + # XHTML 1.0 Frameset + elif publicID == u"-//W3C//DTD XHTML 1.0 Frameset//EN" and (not systemID \ + or systemID == u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"): + publicID = u"-//W3C//DTD HTML 4.01 Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/html4/frameset.dtd" + elif self.emit_doctype == u'xhtml': + # HTML 4.01 Strict + if re.match(u"-//W3C//DTD 
HTML 4.0(1)?//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/strict.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Strict//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" + # HTML4.01 Transitional + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Transitional//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/loose.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + # HTML 4.01 Frameset + elif re.match(u"-//W3C//DTD HTML 4.0(1)? Frameset//EN", publicID) and \ + (not systemID or \ + re.match(u"http://www.w3.org/TR/(html4|REC-html40)/frameset.dtd", systemID)): + publicID = u"-//W3C//DTD XHTML 1.0 Frameset//EN" + if systemID: + systemID = u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" + # HTML 3.2 + elif re.match(u"-//W3C//DTD HTML 3.2( Final)?//EN", publicID) and not systemID: + publicID = u"-//W3C//DTD XHTML 1.0 Transitional//EN" + + doctype = u"= 0: + if systemID.find(u"'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = u"'" + else: + quote_char = u'"' + doctype += u" %s%s%s" % (quote_char, systemID, quote_char) + doctype += u">" + return doctype + def serialize(self, treewalker, encoding=None): in_cdata = False self.errors = [] @@ -119,26 +257,12 @@ def serialize(self, treewalker, encoding=None): if self.omit_optional_tags: from html5lib.filters.optionaltags import Filter treewalker = Filter(treewalker) + posted_doctype = False for token in treewalker: type = token["type"] if type == "Doctype": - doctype = u"= 0: - if token["systemId"].find(u"'") >= 0: - self.serializeError(_("System identifer contains both single and double quote characters")) - quote_char = u"'" - else: - quote_char = u'"' - doctype += u" %s%s%s" % (quote_char, token["systemId"], 
quote_char) - - doctype += u">" - + posted_doctype = True + doctype = self.calc_doctype(token) if encoding: yield doctype.encode(encoding) else: @@ -158,6 +282,9 @@ def serialize(self, treewalker, encoding=None): yield escape(token["data"]) elif type in ("StartTag", "EmptyTag"): + if not posted_doctype: + posted_doctype = True + yield self.calc_doctype() name = token["name"] if name in rcdataElements and not self.escape_rcdata: in_cdata = True @@ -166,9 +293,20 @@ def serialize(self, treewalker, encoding=None): attrs = token["data"] if hasattr(attrs, "items"): attrs = attrs.items() - attrs.sort() attributes = [] for k,v in attrs: + + # clean up xml:lang + if k == '{http://www.w3.org/XML/1998/namespace}lang': + k = 'xml:lang' + if self.lang_attr == 'xml': + if k == 'lang' and not ('xml:lang' in attrs or + '{http://www.w3.org/XML/1998/namespace}lang' in attrs): + k = 'xml:lang' + elif self.lang_attr == 'html': + if k == 'xml:lang' and not ('lang' in attrs): + k = 'lang' + if encoding: k = k.encode(encoding, "strict") attributes.append(' ')