11
11
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
12
12
13
13
@@TODO:
14
- * Build a Treebuilder that produces Python DOM objects:
15
- http://docs.python.org/lib/module-xml.dom.html
16
14
* Produce SAX events based on the produced DOM. This is intended not to
17
15
support streaming, but rather to support application level compatibility.
18
16
* Optional namespace support
19
- * Special case the output of XHTML <script> elements so that the empty
20
- element syntax is never used, even when the src attribute is provided.
21
- Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
17
+ * Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
22
18
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
23
- * Map illegal XML characters to U+FFFD, possibly with additional markup in
24
- the case of XHTML
25
19
* Selectively lowercase only XHTML, but not foreign markup
26
20
"""
27
21
28
22
import html5parser
23
+ from constants import voidElements
29
24
import gettext
30
25
_ = gettext .gettext
31
26
32
- class XHTMLParser (html5parser .HTMLParser ):
33
- """ liberal XMTHML parser """
27
+ class XMLParser (html5parser .HTMLParser ):
28
+ """ liberal XML parser """
34
29
35
30
def __init__ (self , * args , ** kwargs ):
36
31
html5parser .HTMLParser .__init__ (self , * args , ** kwargs )
37
- self .phases ["rootElement " ] = XhmlRootPhase (self , self .tree )
32
+ self .phases ["initial " ] = XmlRootPhase (self , self .tree )
38
33
39
34
def normalizeToken (self , token ):
40
35
if token ["type" ] == "StartTag" or token ["type" ] == "EmptyTag" :
@@ -57,20 +52,38 @@ def normalizeToken(self, token):
57
52
58
53
return token
59
54
55
+ class XHTMLParser (XMLParser ):
56
+ """ liberal XMTHML parser """
57
+
58
+ def __init__ (self , * args , ** kwargs ):
59
+ html5parser .HTMLParser .__init__ (self , * args , ** kwargs )
60
+ self .phases ["rootElement" ] = XhmlRootPhase (self , self .tree )
61
+
62
+ def normalizeToken (self , token ):
63
+ token = XMLParser .normalizeToken (self , token )
64
+
65
+ # ensure that non-void XHTML elements have content so that separate
66
+ # open and close tags are emitted
67
+ if token ["type" ] == "EndTag" and \
68
+ token ["name" ] not in voidElements and \
69
+ token ["name" ] == self .tree .openElements [- 1 ].name and \
70
+ not self .tree .openElements [- 1 ].hasContent ():
71
+ for e in self .tree .openElements :
72
+ if 'xmlns' in e .attributes .keys ():
73
+ if e .attributes ['xmlns' ] <> 'http://www.w3.org/1999/xhtml' :
74
+ break
75
+ else :
76
+ self .tree .insertText ('' )
77
+
78
+ return token
79
+
60
80
class XhmlRootPhase(html5parser.RootElementPhase):
    """Root-element phase whose implied <html> element carries an XHTML xmlns."""

    def insertHtmlElement(self):
        """Create the <html> element, register it, and move to "beforeHead".

        The element is created with an ``xmlns`` attribute set to the XHTML
        namespace URI, pushed onto the tree's open-element stack, and
        appended to the document before the parser phase is switched.
        """
        tree = self.tree
        html_attrs = {'xmlns': 'http://www.w3.org/1999/xhtml'}
        html_element = tree.createElement("html", html_attrs)
        tree.openElements.append(html_element)
        tree.document.appendChild(html_element)
        parser = self.parser
        parser.phase = parser.phases["beforeHead"]
66
86
67
- class XMLParser (XHTMLParser ):
68
- """ liberal XML parser """
69
-
70
- def __init__ (self , * args , ** kwargs ):
71
- XHTMLParser .__init__ (self , * args , ** kwargs )
72
- self .phases ["initial" ] = XmlRootPhase (self , self .tree )
73
-
74
87
class XmlRootPhase (html5parser .Phase ):
75
88
""" Prime the Xml parser """
76
89
def __getattr__ (self , name ):
0 commit comments