Skip to content

Commit a15c219

Browse files
committed
Emit separate open and close tags for non-void XHTML elements
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40491
1 parent 98590a7 commit a15c219

File tree

4 files changed

+53
-19
lines changed

4 files changed

+53
-19
lines changed

parse.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import os
99
from optparse import OptionParser
1010

11-
from src import html5parser
11+
from src import html5parser, liberalxmlparser
1212

1313
def convertTreeDump(treedump):
1414
"""convert the output of str(document) to something more readable
@@ -57,7 +57,10 @@ def parse():
5757
import src.treebuilders.simpletree
5858
treebuilder = src.treebuilders.simpletree.TreeBuilder
5959

60-
p = html5parser.HTMLParser(tree=treebuilder)
60+
if opts.xml:
61+
p = liberalxmlparser.XHTMLParser(tree=treebuilder)
62+
else:
63+
p = html5parser.HTMLParser(tree=treebuilder)
6164

6265
if opts.profile:
6366
import hotshot

src/liberalxmlparser.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,30 +11,25 @@
1111
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
1212
1313
@@TODO:
14-
* Build a Treebuilder that produces Python DOM objects:
15-
http://docs.python.org/lib/module-xml.dom.html
1614
* Produce SAX events based on the produced DOM. This is intended not to
1715
support streaming, but rather to support application level compatibility.
1816
* Optional namespace support
19-
* Special case the output of XHTML <script> elements so that the empty
20-
element syntax is never used, even when the src attribute is provided.
21-
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
17+
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
2218
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
23-
* Map illegal XML characters to U+FFFD, possibly with additional markup in
24-
the case of XHTML
2519
* Selectively lowercase only XHTML, but not foreign markup
2620
"""
2721

2822
import html5parser
23+
from constants import voidElements
2924
import gettext
3025
_ = gettext.gettext
3126

32-
class XHTMLParser(html5parser.HTMLParser):
33-
""" liberal XMTHML parser """
27+
class XMLParser(html5parser.HTMLParser):
28+
""" liberal XML parser """
3429

3530
def __init__(self, *args, **kwargs):
3631
html5parser.HTMLParser.__init__(self, *args, **kwargs)
37-
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
32+
self.phases["initial"] = XmlRootPhase(self, self.tree)
3833

3934
def normalizeToken(self, token):
4035
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
@@ -57,20 +52,38 @@ def normalizeToken(self, token):
5752

5853
return token
5954

55+
class XHTMLParser(XMLParser):
56+
""" liberal XHTML parser """
57+
58+
def __init__(self, *args, **kwargs):
59+
html5parser.HTMLParser.__init__(self, *args, **kwargs)
60+
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
61+
62+
def normalizeToken(self, token):
63+
token = XMLParser.normalizeToken(self, token)
64+
65+
# ensure that non-void XHTML elements have content so that separate
66+
# open and close tags are emitted
67+
if token["type"] == "EndTag" and \
68+
token["name"] not in voidElements and \
69+
token["name"] == self.tree.openElements[-1].name and \
70+
not self.tree.openElements[-1].hasContent():
71+
for e in self.tree.openElements:
72+
if 'xmlns' in e.attributes.keys():
73+
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
74+
break
75+
else:
76+
self.tree.insertText('')
77+
78+
return token
79+
6080
class XhmlRootPhase(html5parser.RootElementPhase):
6181
def insertHtmlElement(self):
6282
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
6383
self.tree.openElements.append(element)
6484
self.tree.document.appendChild(element)
6585
self.parser.phase = self.parser.phases["beforeHead"]
6686

67-
class XMLParser(XHTMLParser):
68-
""" liberal XML parser """
69-
70-
def __init__(self, *args, **kwargs):
71-
XHTMLParser.__init__(self, *args, **kwargs)
72-
self.phases["initial"] = XmlRootPhase(self, self.tree)
73-
7487
class XmlRootPhase(html5parser.Phase):
7588
""" Prime the Xml parser """
7689
def __getattr__(self, name):

src/treebuilders/dom.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ def __setitem__(self, name, value):
1414
self.element.setAttribute(name, value)
1515
def items(self):
1616
return self.element.attributes.items()
17+
def keys(self):
18+
return self.element.attributes.keys()
19+
def __getitem__(self, name):
20+
return self.element.getAttribute(name)
1721

1822
class NodeBuilder(_base.Node):
1923
def __init__(self, element):

tests/test_lxp.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,20 @@ def test_xlink(self):
165165
</svg>
166166
</body></html>""")
167167

168+
def test_br(self):
169+
self.assertXhtmlEquals("""<html xmlns="http://www.w3.org/1999/xhtml">
170+
<head><title>XLINK</title></head>
171+
<body>
172+
<br/>
173+
</body></html>""")
174+
175+
def test_strong(self):
176+
self.assertXhtmlEquals("""<html xmlns="http://www.w3.org/1999/xhtml">
177+
<head><title>XLINK</title></head>
178+
<body>
179+
<strong></strong>
180+
</body></html>""")
181+
168182
def buildTestSuite():
169183
return unittest.defaultTestLoader.loadTestsFromName(__name__)
170184

0 commit comments

Comments
 (0)