Skip to content

Commit c5de2df

Browse files
committed
Update parser (and tests) to latest spec. Fix a couple of bugs in the tokenizer that only turned up in the tree construction tests.
1 parent 981cbd1 commit c5de2df

File tree

3 files changed

+91
-93
lines changed

3 files changed

+91
-93
lines changed

src/html5lib/constants.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -256,21 +256,18 @@
256256
_(u"Unexpected end of file. Expected select content."),
257257
"eof-in-frameset":
258258
_(u"Unexpected end of file. Expected frameset content."),
259+
"eof-in-script-in-script":
260+
_(u"Unexpected end of file. Expected script content."),
259261
"non-void-element-with-trailing-solidus":
260262
_(u"Trailing solidus not allowed on element %(name)s"),
261263
"unexpected-html-element-in-foreign-content":
262264
_(u"Element %(name)s not allowed in a non-html context"),
265+
"unexpected-end-tag-before-html":
266+
_(u"Unexpected end tag (%(name)s) before html."),
263267
"XXX-undefined-error":
264268
(u"Undefined error (this sucks and should be fixed)"),
265269
}
266270

267-
contentModelFlags = {
268-
"PCDATA":0,
269-
"RCDATA":1,
270-
"RAWTEXT":2,
271-
"PLAINTEXT":3
272-
}
273-
274271
namespaces = {
275272
"html":"http://www.w3.org/1999/xhtml",
276273
"mathml":"http://www.w3.org/1998/Math/MathML",

src/html5lib/html5parser.py

Lines changed: 52 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def any(iterable):
2525
from treebuilders import simpletree
2626

2727
import utils
28-
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
28+
from constants import spaceCharacters, asciiUpper2Lower
2929
from constants import scopingElements, formattingElements, specialElements
3030
from constants import headingElements, tableInsertModeElements
3131
from constants import cdataElements, rcdataElements, voidElements
@@ -77,7 +77,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
7777
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
7878
"afterHead": AfterHeadPhase(self, self.tree),
7979
"inBody": InBodyPhase(self, self.tree),
80-
"inRCDataRawtext": InRCDataRawtextPhase(self, self.tree),
80+
"text": TextPhase(self, self.tree),
8181
"inTable": InTablePhase(self, self.tree),
8282
"inTableText": InTableTextPhase(self, self.tree),
8383
"inCaption": InCaptionPhase(self, self.tree),
@@ -124,14 +124,14 @@ def reset(self):
124124
self.innerHTML = self.container.lower()
125125

126126
if self.innerHTML in cdataElements:
127-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
127+
self.tokenizer.state = self.tokenizer.rcdataState
128128
elif self.innerHTML in rcdataElements:
129-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RAWTEXT"]
129+
self.tokenizer.state = self.tokenizer.rawtextState
130130
elif self.innerHTML == 'plaintext':
131-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
131+
self.tokenizer.state = self.tokenizer.plaintextState
132132
else:
133-
# contentModelFlag already is PCDATA
134-
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
133+
# the tokenizer is already in the data state
134+
# self.tokenizer.state = self.tokenizer.dataState
135135
pass
136136
self.phase = self.phases["beforeHtml"]
137137
self.phase.insertHtmlElement()
@@ -406,7 +406,7 @@ def parseRCDataRawtext(self, token, contentType):
406406

407407
self.originalPhase = self.phase
408408

409-
self.phase = self.phases["inRCDataRawtext"]
409+
self.phase = self.phases["text"]
410410

411411
class Phase(object):
412412
"""Base class for helper object that implements each phase of processing
@@ -636,8 +636,12 @@ def processStartTag(self, token):
636636
self.parser.phase.processStartTag(token)
637637

638638
def processEndTag(self, token):
639-
self.insertHtmlElement()
640-
self.parser.phase.processEndTag(token)
639+
if token["name"] not in ("head", "body", "html", "br"):
640+
self.parser.parseError("unexpected-end-tag-before-html",
641+
{"name": token["name"]})
642+
else:
643+
self.insertHtmlElement()
644+
self.parser.phase.processEndTag(token)
641645

642646

643647
class BeforeHeadPhase(Phase):
@@ -651,7 +655,7 @@ def __init__(self, parser, tree):
651655
self.startTagHandler.default = self.startTagOther
652656

653657
self.endTagHandler = utils.MethodDispatcher([
654-
(("head", "br"), self.endTagImplyHead)
658+
(("head", "body", "html", "br"), self.endTagImplyHead)
655659
])
656660
self.endTagHandler.default = self.endTagOther
657661

@@ -666,6 +670,9 @@ def processCharacters(self, token):
666670
self.startTagHead(impliedTagToken("head", "StartTag"))
667671
self.parser.phase.processCharacters(token)
668672

673+
def startTagHtml(self, token):
674+
self.parser.phases["inBody"].processStartTag(token)
675+
669676
def startTagHead(self, token):
670677
self.tree.insertElement(token)
671678
self.tree.headPointer = self.tree.openElements[-1]
@@ -692,8 +699,8 @@ def __init__(self, parser, tree):
692699
("title", self.startTagTitle),
693700
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
694701
("script", self.startTagScript),
695-
(("base", "link", "command", "eventsource"),
696-
self.startTagBaseLinkCommandEventsource),
702+
(("base", "link", "command"),
703+
self.startTagBaseLinkCommand),
697704
("meta", self.startTagMeta),
698705
("head", self.startTagHead)
699706
])
@@ -728,7 +735,7 @@ def startTagHtml(self, token):
728735
def startTagHead(self, token):
729736
self.parser.parseError("two-heads-are-not-better-than-one")
730737

731-
def startTagBaseLinkCommandEventsource(self, token):
738+
def startTagBaseLinkCommand(self, token):
732739
self.tree.insertElement(token)
733740
self.tree.openElements.pop()
734741
token["selfClosingAcknowledged"] = True
@@ -757,9 +764,10 @@ def startTagNoScriptNoFramesStyle(self, token):
757764
self.parser.parseRCDataRawtext(token, "RAWTEXT")
758765

759766
def startTagScript(self, token):
760-
#I think this is equivalent to the RAWTEXT stuff since we don't execute script
761-
#self.tree.insertElement(token)
762-
self.parser.parseRCDataRawtext(token, "RAWTEXT")
767+
self.tree.insertElement(token)
768+
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
769+
self.parser.originalPhase = self.parser.phase
770+
self.parser.phase = self.parser.phases["text"]
763771

764772
def startTagOther(self, token):
765773
self.anythingElse()
@@ -838,7 +846,6 @@ def startTagOther(self, token):
838846
self.parser.phase.processStartTag(token)
839847

840848
def endTagHtmlBodyBr(self, token):
841-
#This is not currently in the spec
842849
self.anythingElse()
843850
self.parser.phase.processEndTag(token)
844851

@@ -852,8 +859,8 @@ def anythingElse(self):
852859

853860

854861
class InBodyPhase(Phase):
855-
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
856-
# the crazy mode
862+
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
863+
# the really-really-really-very crazy mode
857864
def __init__(self, parser, tree):
858865
Phase.__init__(self, parser, tree)
859866

@@ -862,15 +869,16 @@ def __init__(self, parser, tree):
862869

863870
self.startTagHandler = utils.MethodDispatcher([
864871
("html", self.startTagHtml),
865-
(("base", "link", "meta", "script", "style", "title"),
866-
self.startTagProcessInHead),
872+
(("base", "command", "link", "meta", "noframes", "script", "style",
873+
"title"), self.startTagProcessInHead),
867874
("body", self.startTagBody),
868875
("frameset", self.startTagFrameset),
869876
(("address", "article", "aside", "blockquote", "center", "datagrid",
870-
"details", "dialog", "dir", "div", "dl", "fieldset", "figure",
871-
"footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "listing",
872-
"menu", "nav", "ol", "p", "pre", "section", "ul"),
877+
"details", "dir", "div", "dl", "fieldset", "figure",
878+
"footer", "header", "hgroup", "menu", "nav", "ol", "p",
879+
"section", "ul"),
873880
self.startTagCloseP),
881+
(("pre", "listing"), self.startTagPreListing),
874882
("form", self.startTagForm),
875883
(("li", "dd", "dt"), self.startTagListItem),
876884
("plaintext",self.startTagPlaintext),
@@ -899,18 +907,17 @@ def __init__(self, parser, tree):
899907
(("svg"), self.startTagSvg),
900908
(("caption", "col", "colgroup", "frame", "head",
901909
"tbody", "td", "tfoot", "th", "thead",
902-
"tr"), self.startTagMisplaced),
903-
(("event-source", "command"), self.startTagNew)
910+
"tr"), self.startTagMisplaced)
904911
])
905912
self.startTagHandler.default = self.startTagOther
906913

907914
self.endTagHandler = utils.MethodDispatcher([
908915
("body",self.endTagBody),
909916
("html",self.endTagHtml),
910917
(("address", "article", "aside", "blockquote", "center", "datagrid",
911-
"details", "dialog", "dir", "div", "dl", "fieldset", "figure",
912-
"footer", "header", "listing", "menu", "nav", "ol", "pre", "section",
913-
"ul"), self.endTagBlock),
918+
"details", "dir", "div", "dl", "fieldset", "figure",
919+
"footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre",
920+
"section", "ul"), self.endTagBlock),
914921
("form", self.endTagForm),
915922
("p",self.endTagP),
916923
(("dd", "dt", "li"), self.endTagListItem),
@@ -953,14 +960,10 @@ def processSpaceCharactersDropNewline(self, token):
953960
self.tree.insertText(data)
954961

955962
def processCharacters(self, token):
956-
# XXX The specification says to do this for every character at the
957-
# moment, but apparently that doesn't match the real world so we don't
958-
# do it for space characters.
959963
self.tree.reconstructActiveFormattingElements()
960964
self.tree.insertText(token["data"])
961965
self.parser.framesetOK = False
962966

963-
#This matches the current spec but may not match the real world
964967
def processSpaceCharacters(self, token):
965968
self.tree.reconstructActiveFormattingElements()
966969
self.tree.insertText(token["data"])
@@ -996,9 +999,13 @@ def startTagCloseP(self, token):
996999
if self.tree.elementInScope("p"):
9971000
self.endTagP(impliedTagToken("p"))
9981001
self.tree.insertElement(token)
999-
if token["name"] in ("pre", "listing"):
1000-
self.parser.framesetOK = False
1001-
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1002+
1003+
def startTagPreListing(self, token):
1004+
if self.tree.elementInScope("p"):
1005+
self.endTagP(impliedTagToken("p"))
1006+
self.tree.insertElement(token)
1007+
self.parser.framesetOK = False
1008+
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
10021009

10031010
def startTagForm(self, token):
10041011
if self.tree.formPointer:
@@ -1035,23 +1042,14 @@ def startTagPlaintext(self, token):
10351042
if self.tree.elementInScope("p"):
10361043
self.endTagP(impliedTagToken("p"))
10371044
self.tree.insertElement(token)
1038-
self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
1045+
self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
10391046

10401047
def startTagHeading(self, token):
10411048
if self.tree.elementInScope("p"):
10421049
self.endTagP(impliedTagToken("p"))
10431050
if self.tree.openElements[-1].name in headingElements:
10441051
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
10451052
self.tree.openElements.pop()
1046-
# Uncomment the following for IE7 behavior:
1047-
#
1048-
#for item in headingElements:
1049-
# if self.tree.elementInScope(item):
1050-
# self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1051-
# item = self.tree.openElements.pop()
1052-
# while item.name not in headingElements:
1053-
# item = self.tree.openElements.pop()
1054-
# break
10551053
self.tree.insertElement(token)
10561054

10571055
def startTagA(self, token):
@@ -1175,9 +1173,8 @@ def startTagIsIndex(self, token):
11751173
self.processEndTag(impliedTagToken("form"))
11761174

11771175
def startTagTextarea(self, token):
1178-
# XXX Form element pointer checking here as well...
11791176
self.tree.insertElement(token)
1180-
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
1177+
self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
11811178
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
11821179
self.parser.framesetOK = False
11831180

@@ -1257,16 +1254,6 @@ def startTagMisplaced(self, token):
12571254
"""
12581255
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
12591256

1260-
def startTagNew(self, token):
1261-
"""New HTML5 elements, "event-source", "section", "nav",
1262-
"article", "aside", "header", "footer", "datagrid", "command"
1263-
"""
1264-
#2007-08-30 - MAP - commenting out this write to sys.stderr because
1265-
# it's really annoying me when I run the validator tests
1266-
#sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
1267-
self.startTagOther(token)
1268-
#raise NotImplementedError
1269-
12701257
def startTagOther(self, token):
12711258
self.tree.reconstructActiveFormattingElements()
12721259
self.tree.insertElement(token)
@@ -1285,19 +1272,16 @@ def endTagP(self, token):
12851272
node = self.tree.openElements.pop()
12861273

12871274
def endTagBody(self, token):
1288-
# XXX Need to take open <p> tags into account here. We shouldn't imply
1289-
# </p> but we should not throw a parse error either. Specification is
1290-
# likely to be updated.
1291-
if (len(self.tree.openElements) == 1 or
1292-
self.tree.openElements[1].name != "body"):
1293-
# innerHTML case
1275+
if not self.tree.elementInScope("body"):
12941276
self.parser.parseError()
12951277
return
12961278
elif self.tree.openElements[-1].name != "body":
12971279
for node in self.tree.openElements[2:]:
1298-
if node.name not in frozenset(("dd", "dt", "li", "p",
1280+
if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1281+
"option", "p", "rp", "rt",
12991282
"tbody", "td", "tfoot",
1300-
"th", "thead", "tr")):
1283+
"th", "thead", "tr", "body",
1284+
"html")):
13011285
#Not sure this is the correct name for the parse error
13021286
self.parser.parseError(
13031287
"expected-one-end-tag-but-got-another",
@@ -1524,7 +1508,7 @@ def endTagOther(self, token):
15241508
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
15251509
break
15261510

1527-
class InRCDataRawtextPhase(Phase):
1511+
class TextPhase(Phase):
15281512
def __init__(self, parser, tree):
15291513
Phase.__init__(self, parser, tree)
15301514
self.startTagHandler = utils.MethodDispatcher([])

0 commit comments

Comments
 (0)