@@ -25,7 +25,7 @@ def any(iterable):
25
25
from treebuilders import simpletree
26
26
27
27
import utils
28
- from constants import contentModelFlags , spaceCharacters , asciiUpper2Lower
28
+ from constants import spaceCharacters , asciiUpper2Lower
29
29
from constants import scopingElements , formattingElements , specialElements
30
30
from constants import headingElements , tableInsertModeElements
31
31
from constants import cdataElements , rcdataElements , voidElements
@@ -77,7 +77,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
77
77
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
78
78
"afterHead" : AfterHeadPhase (self , self .tree ),
79
79
"inBody" : InBodyPhase (self , self .tree ),
80
- "inRCDataRawtext " : InRCDataRawtextPhase (self , self .tree ),
80
+ "text " : TextPhase (self , self .tree ),
81
81
"inTable" : InTablePhase (self , self .tree ),
82
82
"inTableText" : InTableTextPhase (self , self .tree ),
83
83
"inCaption" : InCaptionPhase (self , self .tree ),
@@ -124,14 +124,14 @@ def reset(self):
124
124
self .innerHTML = self .container .lower ()
125
125
126
126
if self .innerHTML in cdataElements :
127
- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "RCDATA" ]
127
+ self .tokenizer .state = self . tokenizer .rcdataState
128
128
elif self .innerHTML in rcdataElements :
129
- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "RAWTEXT" ]
129
+ self .tokenizer .state = self . tokenizer .rawtextState
130
130
elif self .innerHTML == 'plaintext' :
131
- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "PLAINTEXT" ]
131
+ self .tokenizer .state = self . tokenizer .plaintextState
132
132
else :
133
- # contentModelFlag already is PCDATA
134
- #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
133
+ # state already is data state
134
+ # self.tokenizer.state = self. tokenizer.dataState
135
135
pass
136
136
self .phase = self .phases ["beforeHtml" ]
137
137
self .phase .insertHtmlElement ()
@@ -406,7 +406,7 @@ def parseRCDataRawtext(self, token, contentType):
406
406
407
407
self .originalPhase = self .phase
408
408
409
- self .phase = self .phases ["inRCDataRawtext " ]
409
+ self .phase = self .phases ["text " ]
410
410
411
411
class Phase (object ):
412
412
"""Base class for helper object that implements each phase of processing
@@ -636,8 +636,12 @@ def processStartTag(self, token):
636
636
self .parser .phase .processStartTag (token )
637
637
638
638
def processEndTag (self , token ):
639
- self .insertHtmlElement ()
640
- self .parser .phase .processEndTag (token )
639
+ if token ["name" ] not in ("head" , "body" , "html" , "br" ):
640
+ self .parser .parseError ("unexpected-end-tag-before-html" ,
641
+ {"name" : token ["name" ]})
642
+ else :
643
+ self .insertHtmlElement ()
644
+ self .parser .phase .processEndTag (token )
641
645
642
646
643
647
class BeforeHeadPhase (Phase ):
@@ -651,7 +655,7 @@ def __init__(self, parser, tree):
651
655
self .startTagHandler .default = self .startTagOther
652
656
653
657
self .endTagHandler = utils .MethodDispatcher ([
654
- (("head" , "br" ), self .endTagImplyHead )
658
+ (("head" , "body" , "html" , " br" ), self .endTagImplyHead )
655
659
])
656
660
self .endTagHandler .default = self .endTagOther
657
661
@@ -666,6 +670,9 @@ def processCharacters(self, token):
666
670
self .startTagHead (impliedTagToken ("head" , "StartTag" ))
667
671
self .parser .phase .processCharacters (token )
668
672
673
+ def startTagHtml (self , token ):
674
+ self .parser .phases ["inBody" ].processStartTag (token )
675
+
669
676
def startTagHead (self , token ):
670
677
self .tree .insertElement (token )
671
678
self .tree .headPointer = self .tree .openElements [- 1 ]
@@ -692,8 +699,8 @@ def __init__(self, parser, tree):
692
699
("title" , self .startTagTitle ),
693
700
(("noscript" , "noframes" , "style" ), self .startTagNoScriptNoFramesStyle ),
694
701
("script" , self .startTagScript ),
695
- (("base" , "link" , "command" , "eventsource" ),
696
- self .startTagBaseLinkCommandEventsource ),
702
+ (("base" , "link" , "command" ),
703
+ self .startTagBaseLinkCommand ),
697
704
("meta" , self .startTagMeta ),
698
705
("head" , self .startTagHead )
699
706
])
@@ -728,7 +735,7 @@ def startTagHtml(self, token):
728
735
def startTagHead (self , token ):
729
736
self .parser .parseError ("two-heads-are-not-better-than-one" )
730
737
731
- def startTagBaseLinkCommandEventsource (self , token ):
738
+ def startTagBaseLinkCommand (self , token ):
732
739
self .tree .insertElement (token )
733
740
self .tree .openElements .pop ()
734
741
token ["selfClosingAcknowledged" ] = True
@@ -757,9 +764,10 @@ def startTagNoScriptNoFramesStyle(self, token):
757
764
self .parser .parseRCDataRawtext (token , "RAWTEXT" )
758
765
759
766
def startTagScript (self , token ):
760
- #I think this is equivalent to the RAWTEXT stuff since we don't execute script
761
- #self.tree.insertElement(token)
762
- self .parser .parseRCDataRawtext (token , "RAWTEXT" )
767
+ self .tree .insertElement (token )
768
+ self .parser .tokenizer .state = self .parser .tokenizer .scriptDataState
769
+ self .parser .originalPhase = self .parser .phase
770
+ self .parser .phase = self .parser .phases ["text" ]
763
771
764
772
def startTagOther (self , token ):
765
773
self .anythingElse ()
@@ -838,7 +846,6 @@ def startTagOther(self, token):
838
846
self .parser .phase .processStartTag (token )
839
847
840
848
def endTagHtmlBodyBr (self , token ):
841
- #This is not currently in the spec
842
849
self .anythingElse ()
843
850
self .parser .phase .processEndTag (token )
844
851
@@ -852,8 +859,8 @@ def anythingElse(self):
852
859
853
860
854
861
class InBodyPhase (Phase ):
855
- # http://www.whatwg.org/specs/web-apps/current-work/#in-body
856
- # the crazy mode
862
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
863
+ # the really-really-really-very crazy mode
857
864
def __init__ (self , parser , tree ):
858
865
Phase .__init__ (self , parser , tree )
859
866
@@ -862,15 +869,16 @@ def __init__(self, parser, tree):
862
869
863
870
self .startTagHandler = utils .MethodDispatcher ([
864
871
("html" , self .startTagHtml ),
865
- (("base" , "link" , "meta" , "script " , "style " , "title" ),
866
- self .startTagProcessInHead ),
872
+ (("base" , "command" , " link" , "meta" , "noframes " , "script " , "style" ,
873
+ "title" ), self .startTagProcessInHead ),
867
874
("body" , self .startTagBody ),
868
875
("frameset" , self .startTagFrameset ),
869
876
(("address" , "article" , "aside" , "blockquote" , "center" , "datagrid" ,
870
- "details" , "dialog" , " dir" , "div" , "dl" , "fieldset" , "figure" ,
871
- "footer" , "h1 " , "h2 " , "h3 " , "h4 " , "h5 " , "h6" , "header" , "listing " ,
872
- "menu" , "nav" , "ol" , "p" , "pre" , " section" , "ul" ),
877
+ "details" , "dir" , "div" , "dl" , "fieldset" , "figure" ,
878
+ "footer" , "header " , "hgroup " , "menu " , "nav " , "ol " , "p " ,
879
+ "section" , "ul" ),
873
880
self .startTagCloseP ),
881
+ (("pre" , "listing" ), self .startTagPreListing ),
874
882
("form" , self .startTagForm ),
875
883
(("li" , "dd" , "dt" ), self .startTagListItem ),
876
884
("plaintext" ,self .startTagPlaintext ),
@@ -899,18 +907,17 @@ def __init__(self, parser, tree):
899
907
(("svg" ), self .startTagSvg ),
900
908
(("caption" , "col" , "colgroup" , "frame" , "head" ,
901
909
"tbody" , "td" , "tfoot" , "th" , "thead" ,
902
- "tr" ), self .startTagMisplaced ),
903
- (("event-source" , "command" ), self .startTagNew )
910
+ "tr" ), self .startTagMisplaced )
904
911
])
905
912
self .startTagHandler .default = self .startTagOther
906
913
907
914
self .endTagHandler = utils .MethodDispatcher ([
908
915
("body" ,self .endTagBody ),
909
916
("html" ,self .endTagHtml ),
910
917
(("address" , "article" , "aside" , "blockquote" , "center" , "datagrid" ,
911
- "details" , "dialog" , " dir" , "div" , "dl" , "fieldset" , "figure" ,
912
- "footer" , "header" , "listing" , "menu" , "nav" , "ol" , "pre" , "section" ,
913
- "ul" ), self .endTagBlock ),
918
+ "details" , "dir" , "div" , "dl" , "fieldset" , "figure" ,
919
+ "footer" , "header" , "hgroup" , " listing" , "menu" , "nav" , "ol" , "pre" ,
920
+ "section" , " ul" ), self .endTagBlock ),
914
921
("form" , self .endTagForm ),
915
922
("p" ,self .endTagP ),
916
923
(("dd" , "dt" , "li" ), self .endTagListItem ),
@@ -953,14 +960,10 @@ def processSpaceCharactersDropNewline(self, token):
953
960
self .tree .insertText (data )
954
961
955
962
def processCharacters (self , token ):
956
- # XXX The specification says to do this for every character at the
957
- # moment, but apparently that doesn't match the real world so we don't
958
- # do it for space characters.
959
963
self .tree .reconstructActiveFormattingElements ()
960
964
self .tree .insertText (token ["data" ])
961
965
self .parser .framesetOK = False
962
966
963
- #This matches the current spec but may not match the real world
964
967
def processSpaceCharacters (self , token ):
965
968
self .tree .reconstructActiveFormattingElements ()
966
969
self .tree .insertText (token ["data" ])
@@ -996,9 +999,13 @@ def startTagCloseP(self, token):
996
999
if self .tree .elementInScope ("p" ):
997
1000
self .endTagP (impliedTagToken ("p" ))
998
1001
self .tree .insertElement (token )
999
- if token ["name" ] in ("pre" , "listing" ):
1000
- self .parser .framesetOK = False
1001
- self .processSpaceCharacters = self .processSpaceCharactersDropNewline
1002
+
1003
+ def startTagPreListing (self , token ):
1004
+ if self .tree .elementInScope ("p" ):
1005
+ self .endTagP (impliedTagToken ("p" ))
1006
+ self .tree .insertElement (token )
1007
+ self .parser .framesetOK = False
1008
+ self .processSpaceCharacters = self .processSpaceCharactersDropNewline
1002
1009
1003
1010
def startTagForm (self , token ):
1004
1011
if self .tree .formPointer :
@@ -1035,23 +1042,14 @@ def startTagPlaintext(self, token):
1035
1042
if self .tree .elementInScope ("p" ):
1036
1043
self .endTagP (impliedTagToken ("p" ))
1037
1044
self .tree .insertElement (token )
1038
- self .parser .tokenizer .contentModelFlag = contentModelFlags [ "PLAINTEXT" ]
1045
+ self .parser .tokenizer .state = self . parser . tokenizer . plaintextState
1039
1046
1040
1047
def startTagHeading (self , token ):
1041
1048
if self .tree .elementInScope ("p" ):
1042
1049
self .endTagP (impliedTagToken ("p" ))
1043
1050
if self .tree .openElements [- 1 ].name in headingElements :
1044
1051
self .parser .parseError ("unexpected-start-tag" , {"name" : token ["name" ]})
1045
1052
self .tree .openElements .pop ()
1046
- # Uncomment the following for IE7 behavior:
1047
- #
1048
- #for item in headingElements:
1049
- # if self.tree.elementInScope(item):
1050
- # self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1051
- # item = self.tree.openElements.pop()
1052
- # while item.name not in headingElements:
1053
- # item = self.tree.openElements.pop()
1054
- # break
1055
1053
self .tree .insertElement (token )
1056
1054
1057
1055
def startTagA (self , token ):
@@ -1175,9 +1173,8 @@ def startTagIsIndex(self, token):
1175
1173
self .processEndTag (impliedTagToken ("form" ))
1176
1174
1177
1175
def startTagTextarea (self , token ):
1178
- # XXX Form element pointer checking here as well...
1179
1176
self .tree .insertElement (token )
1180
- self .parser .tokenizer .contentModelFlag = contentModelFlags [ "RCDATA" ]
1177
+ self .parser .tokenizer .state = self . parser . tokenizer . rcdataState
1181
1178
self .processSpaceCharacters = self .processSpaceCharactersDropNewline
1182
1179
self .parser .framesetOK = False
1183
1180
@@ -1257,16 +1254,6 @@ def startTagMisplaced(self, token):
1257
1254
"""
1258
1255
self .parser .parseError ("unexpected-start-tag-ignored" , {"name" : token ["name" ]})
1259
1256
1260
- def startTagNew (self , token ):
1261
- """New HTML5 elements, "event-source", "section", "nav",
1262
- "article", "aside", "header", "footer", "datagrid", "command"
1263
- """
1264
- #2007-08-30 - MAP - commenting out this write to sys.stderr because
1265
- # it's really annoying me when I run the validator tests
1266
- #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
1267
- self .startTagOther (token )
1268
- #raise NotImplementedError
1269
-
1270
1257
def startTagOther (self , token ):
1271
1258
self .tree .reconstructActiveFormattingElements ()
1272
1259
self .tree .insertElement (token )
@@ -1285,19 +1272,16 @@ def endTagP(self, token):
1285
1272
node = self .tree .openElements .pop ()
1286
1273
1287
1274
def endTagBody (self , token ):
1288
- # XXX Need to take open <p> tags into account here. We shouldn't imply
1289
- # </p> but we should not throw a parse error either. Specification is
1290
- # likely to be updated.
1291
- if (len (self .tree .openElements ) == 1 or
1292
- self .tree .openElements [1 ].name != "body" ):
1293
- # innerHTML case
1275
+ if not self .tree .elementInScope ("body" ):
1294
1276
self .parser .parseError ()
1295
1277
return
1296
1278
elif self .tree .openElements [- 1 ].name != "body" :
1297
1279
for node in self .tree .openElements [2 :]:
1298
- if node .name not in frozenset (("dd" , "dt" , "li" , "p" ,
1280
+ if node .name not in frozenset (("dd" , "dt" , "li" , "optgroup" ,
1281
+ "option" , "p" , "rp" , "rt" ,
1299
1282
"tbody" , "td" , "tfoot" ,
1300
- "th" , "thead" , "tr" )):
1283
+ "th" , "thead" , "tr" , "body" ,
1284
+ "html" )):
1301
1285
#Not sure this is the correct name for the parse error
1302
1286
self .parser .parseError (
1303
1287
"expected-one-end-tag-but-got-another" ,
@@ -1524,7 +1508,7 @@ def endTagOther(self, token):
1524
1508
self .parser .parseError ("unexpected-end-tag" , {"name" : token ["name" ]})
1525
1509
break
1526
1510
1527
- class InRCDataRawtextPhase (Phase ):
1511
+ class TextPhase (Phase ):
1528
1512
def __init__ (self , parser , tree ):
1529
1513
Phase .__init__ (self , parser , tree )
1530
1514
self .startTagHandler = utils .MethodDispatcher ([])
0 commit comments