@@ -37,10 +37,10 @@ class HTMLParser(object):
37
37
38
38
def __init__ (self , strict = False , tree = simpletree .TreeBuilder ):
39
39
"""
40
- strict - raise an exception when a parse error is encountered
41
-
42
- tree - a treebuilder class controlling the type of tree that will be
43
- returned. This class is almost always a subclass of
40
+ strict - raise an exception when a parse error is encountered
41
+
42
+ tree - a treebuilder class controlling the type of tree that will be
43
+ returned. This class is almost always a subclass of
44
44
html5lib.treebuilders._base.TreeBuilder
45
45
"""
46
46
@@ -72,10 +72,10 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
72
72
73
73
def parse (self , stream , encoding = None , innerHTML = False ):
74
74
"""Parse a HTML document into a well-formed tree
75
-
75
+
76
76
stream - a filelike object or string containing the HTML to be parsed
77
-
78
- innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
77
+
78
+ innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
79
79
is not yet supported)
80
80
81
81
The optional encoding parameter must be a string that indicates
@@ -131,7 +131,7 @@ def atheistParseError(self):
131
131
132
132
def normalizeToken (self , token ):
133
133
""" HTML5 specific normalizations to the token stream """
134
-
134
+
135
135
if token ["type" ] == "EmptyTag" :
136
136
# When a solidus (/) is encountered within a tag name what happens
137
137
# depends on whether the current tag name matches that of a void
@@ -249,7 +249,7 @@ def processComment(self, data):
249
249
self .tree .insertComment (data , self .tree .openElements [- 1 ])
250
250
251
251
def processDoctype(self, name, error):
    """Report the DOCTYPE token as a parse error.

    name and error are accepted but not used here; the only effect is a
    call to self.parser.parseError with a translated message.
    """
    message = _("Unexpected DOCTYPE. Ignored.")
    self.parser.parseError(message)
253
253
254
254
def processSpaceCharacters (self , data ):
255
255
self .tree .insertText (data )
@@ -436,7 +436,7 @@ def startTagTitle(self, name, attributes):
436
436
self .appendToHead (element )
437
437
self .tree .openElements .append (element )
438
438
self .parser .tokenizer .contentModelFlag = contentModelFlags ["RCDATA" ]
439
-
439
+
440
440
def startTagStyle (self , name , attributes ):
441
441
element = self .tree .createElement (name , attributes )
442
442
if self .tree .headPointer is not None and \
@@ -596,11 +596,12 @@ def __init__(self, parser, tree):
596
596
(("a" , "b" , "big" , "em" , "font" , "i" , "nobr" , "s" , "small" ,
597
597
"strike" , "strong" , "tt" , "u" ), self .endTagFormatting ),
598
598
(("marquee" , "object" , "button" ), self .endTagButtonMarqueeObject ),
599
- (("caption" , "col" , "colgroup" , "frame" , "frameset" , "head" ,
600
- "option" , "optgroup" , "tbody" , "td" , "tfoot" , "th" , "thead" ,
601
- "tr" , "area" , "basefont" , "bgsound" , "br" , "embed" , "hr" ,
602
- "image" , "img" , "input" , "isindex" , "param" , "select" , "spacer" ,
603
- "table" , "wbr" ),self .endTagMisplacedNone ),
599
+ (("head" , "frameset" , "select" , "optgroup" , "option" , "table" ,
600
+ "caption" , "colgroup" , "col" , "thead" , "tfoot" , "tbody" , "tr" ,
601
+ "td" , "th" ), self .endTagMisplaced ),
602
+ (("area" , "basefont" , "bgsound" , "br" , "embed" , "hr" , "image" ,
603
+ "img" , "input" , "isindex" , "param" , "spacer" , "wbr" , "frame" ),
604
+ self .endTagNone ),
604
605
(("noframes" , "noscript" , "noembed" , "textarea" , "xmp" , "iframe" ),
605
606
self .endTagCdataTextAreaXmp ),
606
607
(("event-source" , "section" , "nav" , "article" , "aside" , "header" ,
@@ -647,7 +648,7 @@ def startTagCloseP(self, name, attributes):
647
648
648
649
def startTagForm (self , name , attributes ):
649
650
if self .tree .formPointer :
650
- self .parser .parseError ()
651
+ self .parser .parseError ("Unexpected start tag (form). Ignored." )
651
652
else :
652
653
if self .tree .elementInScope ("p" ):
653
654
self .endTagP ("p" )
@@ -685,7 +686,8 @@ def startTagHeading(self, name, attributes):
685
686
self .endTagP ("p" )
686
687
for item in headingElements :
687
688
if self .tree .elementInScope (item ):
688
- self .parser .parseError ()
689
+ self .parser .parseError (_ ("Unexpected start tag (" + name + \
690
+ ")." ))
689
691
item = self .tree .openElements .pop ()
690
692
while item .name not in headingElements :
691
693
item = self .tree .openElements .pop ()
@@ -818,7 +820,7 @@ def startTagOther(self, name, attributes):
818
820
def endTagP (self , name ):
819
821
self .tree .generateImpliedEndTags ("p" )
820
822
if self .tree .openElements [- 1 ].name != "p" :
821
- self .parser .parseError ()
823
+ self .parser .parseError ("Unexpected end tag (p)." )
822
824
while self .tree .elementInScope ("p" ):
823
825
self .tree .openElements .pop ()
824
826
@@ -1007,25 +1009,23 @@ def endTagButtonMarqueeObject(self, name):
1007
1009
if self .tree .elementInScope (name ):
1008
1010
self .tree .generateImpliedEndTags ()
1009
1011
if self .tree .openElements [- 1 ].name != name :
1010
- self .parser .parseError ()
1012
+ self .parser .parseError (_ (u"Unexpected end tag (" + name + \
1013
+ "). Expected other end tag first." ))
1011
1014
1012
1015
if self .tree .elementInScope (name ):
1013
1016
element = self .tree .openElements .pop ()
1014
1017
while element .name != name :
1015
1018
element = self .tree .openElements .pop ()
1016
1019
self .tree .clearActiveFormattingElements ()
1017
1020
1018
- def endTagMisplacedNone (self , name ):
1019
- """ Elements that should be children of other elements that have a
1020
- different insertion mode or elements that have no end tag;
1021
- here they are ignored
1022
- "caption", "col", "colgroup", "frame", "frameset", "head",
1023
- "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1024
- "tr", "noscript, "area", "basefont", "bgsound", "br", "embed",
1025
- "hr", "iframe", "image", "img", "input", "isindex", "noembed",
1026
- "noframes", "param", "select", "spacer", "table", "textarea", "wbr""
1027
- """
1028
- self .parser .parseError ()
1021
def endTagMisplaced(self, name):
    """Report an end tag that is handled in another insertion mode.

    name - the tag name of the end tag; it is only used to build the
    message passed to self.parser.parseError.
    """
    prefix = u"Unexpected end tag ("
    suffix = u"). Ignored."
    self.parser.parseError(_(prefix + name + suffix))
1025
+
1026
def endTagNone(self, name):
    """Report an end tag seen for an element that has no end tag.

    name - the tag name of the end tag; it is only used to build the
    message passed to self.parser.parseError.
    """
    # The message previously ran the tag name into the next word
    # ("(br)has"); a space now follows the closing parenthesis.
    self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
1029
1029
1030
1030
def endTagCdataTextAreaXmp (self , name ):
1031
1031
if self .tree .openElements [- 1 ].name == name :
0 commit comments