Emit separate open and close tags for non-void XHTML elements

rubys · rubys · commit a15c219657c3 · 2007-01-15T23:42:18.000Z
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40491
diff --git a/parse.py b/parse.py
@@ -8,7 +8,7 @@
 import os
 from optparse import OptionParser
 
-from src import html5parser
+from src import html5parser, liberalxmlparser
 
 def convertTreeDump(treedump):
     """convert the output of str(document) to something more readable
@@ -57,7 +57,10 @@ def parse():
         import src.treebuilders.simpletree
         treebuilder = src.treebuilders.simpletree.TreeBuilder
 
-    p = html5parser.HTMLParser(tree=treebuilder)
+    if opts.xml:
+        p = liberalxmlparser.XHTMLParser(tree=treebuilder)
+    else:
+        p = html5parser.HTMLParser(tree=treebuilder)
 
     if opts.profile:
         import hotshot
diff --git a/src/liberalxmlparser.py b/src/liberalxmlparser.py
@@ -11,30 +11,25 @@
  * http://wiki.whatwg.org/wiki/HtmlVsXhtml
 
 @@TODO:
- * Build a Treebuilder that produces Python DOM objects:
-     http://docs.python.org/lib/module-xml.dom.html
  * Produce SAX events based on the produced DOM.  This is intended not to
    support streaming, but rather to support application level compatibility. 
  * Optional namespace support
- * Special case the output of XHTML <script> elements so that the empty
-   element syntax is never used, even when the src attribute is provided.
-   Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
+ * Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
    indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
- * Map illegal XML characters to U+FFFD, possibly with additional markup in
-   the case of XHTML
  * Selectively lowercase only XHTML, but not foreign markup
 """
 
 import html5parser
+from constants import voidElements
 import gettext
 _ = gettext.gettext
 
-class XHTMLParser(html5parser.HTMLParser):
-    """ liberal XMTHML parser """
+class XMLParser(html5parser.HTMLParser):
+    """ liberal XML parser """
 
     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
-        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
+        self.phases["initial"] = XmlRootPhase(self, self.tree)
 
     def normalizeToken(self, token):
         if token["type"] == "StartTag" or token["type"] == "EmptyTag":
@@ -57,20 +52,38 @@ def normalizeToken(self, token):
 
         return token
 
+class XHTMLParser(XMLParser):
+    """ liberal XHTML parser """
+
+    def __init__(self, *args, **kwargs):
+        # bypasses XMLParser.__init__, leaving the "initial" phase untouched
+        html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
+
+    def normalizeToken(self, token):
+        token = XMLParser.normalizeToken(self, token)
+        # give empty non-void XHTML elements an empty text child so separate
+        # open and close tags are emitted; openElements may be empty, so it
+        # is tested before openElements[-1] is indexed
+        if token["type"] == "EndTag" and self.tree.openElements and \
+            token["name"] not in voidElements and \
+            token["name"] == self.tree.openElements[-1].name and \
+            not self.tree.openElements[-1].hasContent():
+            for e in self.tree.openElements:
+                if 'xmlns' in e.attributes.keys():
+                    if e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml':
+                        break
+            else:  # no open element declares a non-XHTML xmlns
+                self.tree.insertText('')
+        return token
+
 class XhmlRootPhase(html5parser.RootElementPhase):
     def insertHtmlElement(self):
         element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
         self.tree.openElements.append(element)
         self.tree.document.appendChild(element)
         self.parser.phase = self.parser.phases["beforeHead"]
 
-class XMLParser(XHTMLParser):
-    """ liberal XML parser """
-
-    def __init__(self, *args, **kwargs):
-        XHTMLParser.__init__(self, *args, **kwargs)
-        self.phases["initial"] = XmlRootPhase(self, self.tree)
-
 class XmlRootPhase(html5parser.Phase):
     """ Prime the Xml parser """
     def __getattr__(self, name):
diff --git a/src/treebuilders/dom.py b/src/treebuilders/dom.py
@@ -14,6 +14,10 @@ def __setitem__(self, name, value):
         self.element.setAttribute(name, value)
     def items(self):
         return self.element.attributes.items()
+    def keys(self):  # keys of the wrapped element's attribute map
+        return self.element.attributes.keys()
+    def __getitem__(self, name):  # lookup is delegated to getAttribute
+        return self.element.getAttribute(name)  # NOTE(review): may not raise KeyError for a missing name - confirm
 
 class NodeBuilder(_base.Node):
     def __init__(self, element):
diff --git a/tests/test_lxp.py b/tests/test_lxp.py
@@ -165,6 +165,20 @@ def test_xlink(self):
   </svg>
 </body></html>""")
 
+  def test_br(self):  # input writes <br/> with empty-element syntax
+    self.assertXhtmlEquals("""<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+<br/>
+</body></html>""")
+
+  def test_strong(self):  # input writes an empty <strong> as a tag pair
+    self.assertXhtmlEquals("""<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>XLINK</title></head>
+<body>
+<strong></strong>
+</body></html>""")
+
 def buildTestSuite():
   return unittest.defaultTestLoader.loadTestsFromName(__name__)