Skip to content

Commit 3385106

Browse files
committed
Python parser: support <listing> followed by newline; <style> & <script> in table; tainted tables; <title> everywhere; throw fewer voodoo parse errors; support <input type=hidden> in table; support <input> in select; applet is scoped; add tests
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401107
1 parent 0e4a080 commit 3385106

File tree

4 files changed

+117
-63
lines changed

4 files changed

+117
-63
lines changed

src/html5lib/constants.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@
170170
"unexpected-char-implies-table-voodoo":
171171
_(u"Unexpected non-space characters in "
172172
u"table context caused voodoo mode."),
173+
"unpexted-hidden-input-in-table":
174+
_(u"Unexpected input with type hidden in table context."),
173175
"unexpected-start-tag-implies-table-voodoo":
174176
_(u"Unexpected start tag (%(name)s) in "
175177
u"table context caused voodoo mode."),
@@ -190,7 +192,9 @@
190192
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
191193
"unexpected-select-in-select":
192194
_(u"Unexpected select start tag in the select phase "
193-
u"implies select start tag."),
195+
u"treated as select end tag."),
196+
"unexpected-input-in-select":
197+
_(u"Unexpected input start tag in the select phase."),
194198
"unexpected-start-tag-in-select":
195199
_(u"Unexpected start tag token (%(name)s in the select phase. "
196200
u"Ignored."),
@@ -244,6 +248,7 @@
244248
}
245249

246250
scopingElements = frozenset((
251+
"applet",
247252
"button",
248253
"caption",
249254
"html",

src/html5lib/html5parser.py

Lines changed: 100 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
5858

5959
self.phases = {
6060
"initial": InitialPhase(self, self.tree),
61-
"rootElement": RootElementPhase(self, self.tree),
61+
"beforeHtml": BeforeHtmlPhase(self, self.tree),
6262
"beforeHead": BeforeHeadPhase(self, self.tree),
6363
"inHead": InHeadPhase(self, self.tree),
6464
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
@@ -71,10 +71,14 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7171
"inRow": InRowPhase(self, self.tree),
7272
"inCell": InCellPhase(self, self.tree),
7373
"inSelect": InSelectPhase(self, self.tree),
74+
# XXX inSelectInTable
7475
"afterBody": AfterBodyPhase(self, self.tree),
7576
"inFrameset": InFramesetPhase(self, self.tree),
7677
"afterFrameset": AfterFramesetPhase(self, self.tree),
7778
"trailingEnd": TrailingEndPhase(self, self.tree)
79+
# XXX after after body
80+
# XXX after after frameset
81+
# XXX trailingEnd is gone
7882
}
7983

8084
def _parse(self, stream, innerHTML=False, container="div",
@@ -101,7 +105,7 @@ def _parse(self, stream, innerHTML=False, container="div",
101105
# contentModelFlag already is PCDATA
102106
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
103107
pass
104-
self.phase = self.phases["rootElement"]
108+
self.phase = self.phases["beforeHtml"]
105109
self.phase.insertHtmlElement()
106110
self.resetInsertionMode()
107111
else:
@@ -300,7 +304,7 @@ class InitialPhase(Phase):
300304
# this.
301305
def processEOF(self):
302306
self.parser.parseError("expected-doctype-but-got-eof")
303-
self.parser.phase = self.parser.phases["rootElement"]
307+
self.parser.phase = self.parser.phases["beforeHtml"]
304308
self.parser.phase.processEOF()
305309

306310
def processComment(self, data):
@@ -401,30 +405,30 @@ def processDoctype(self, name, publicId, systemId, correct):
401405
#XXX quirks mode
402406
pass
403407

404-
self.parser.phase = self.parser.phases["rootElement"]
408+
self.parser.phase = self.parser.phases["beforeHtml"]
405409

406410
def processSpaceCharacters(self, data):
407411
pass
408412

409413
def processCharacters(self, data):
410414
self.parser.parseError("expected-doctype-but-got-chars")
411-
self.parser.phase = self.parser.phases["rootElement"]
415+
self.parser.phase = self.parser.phases["beforeHtml"]
412416
self.parser.phase.processCharacters(data)
413417

414418
def processStartTag(self, name, attributes):
415419
self.parser.parseError("expected-doctype-but-got-start-tag",
416420
{"name": name})
417-
self.parser.phase = self.parser.phases["rootElement"]
421+
self.parser.phase = self.parser.phases["beforeHtml"]
418422
self.parser.phase.processStartTag(name, attributes)
419423

420424
def processEndTag(self, name):
421425
self.parser.parseError("expected-doctype-but-got-end-tag",
422426
{"name": name})
423-
self.parser.phase = self.parser.phases["rootElement"]
427+
self.parser.phase = self.parser.phases["beforeHtml"]
424428
self.parser.phase.processEndTag(name)
425429

426430

427-
class RootElementPhase(Phase):
431+
class BeforeHtmlPhase(Phase):
428432
# helper methods
429433
def insertHtmlElement(self):
430434
self.tree.insertRoot("html")
@@ -475,6 +479,9 @@ def processEOF(self):
475479
self.startTagHead("head", {})
476480
self.parser.phase.processEOF()
477481

482+
def processSpaceCharacters(self, data):
483+
pass
484+
478485
def processCharacters(self, data):
479486
self.startTagHead("head", {})
480487
self.parser.phase.processCharacters(data)
@@ -548,33 +555,36 @@ def startTagHead(self, name, attributes):
548555
self.parser.parseError("two-heads-are-not-better-than-one")
549556

550557
def startTagTitle(self, name, attributes):
551-
element = self.tree.createElement(name, attributes)
552-
self.appendToHead(element)
553-
self.tree.openElements.append(element)
558+
if self.tree.headPointer is not None and\
559+
self.parser.phase == self.parser.phases["inHead"]:
560+
element = self.tree.createElement(name, attributes)
561+
self.appendToHead(element)
562+
self.tree.openElements.append(element)
563+
else:
564+
self.tree.insertElement(name, attributes)
554565
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
555566

556567
def startTagStyle(self, name, attributes):
557-
element = self.tree.createElement(name, attributes)
558568
if self.tree.headPointer is not None and\
559569
self.parser.phase == self.parser.phases["inHead"]:
570+
element = self.tree.createElement(name, attributes)
560571
self.appendToHead(element)
572+
self.tree.openElements.append(element)
561573
else:
562-
self.tree.openElements[-1].appendChild(element)
563-
self.tree.openElements.append(element)
574+
self.tree.insertElement(name, attributes)
564575
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
565576

566577
def startTagNoScript(self, name, attributes):
567578
# XXX Need to decide whether to implement the scripting disabled case.
568-
element = self.tree.createElement(name, attributes)
569579
if self.tree.headPointer is not None and\
570580
self.parser.phase == self.parser.phases["inHead"]:
581+
element = self.tree.createElement(name, attributes)
571582
self.appendToHead(element)
583+
self.tree.openElements.append(element)
572584
else:
573-
self.tree.openElements[-1].appendChild(element)
574-
self.tree.openElements.append(element)
585+
self.tree.insertElement(name, attributes)
575586
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
576-
577-
587+
578588
def startTagScript(self, name, attributes):
579589
#XXX Inner HTML case may be wrong
580590
element = self.tree.createElement(name, attributes)
@@ -689,9 +699,8 @@ def __init__(self, parser, tree):
689699

690700
self.startTagHandler = utils.MethodDispatcher([
691701
("html", self.startTagHtml),
692-
(("base", "link", "meta", "script", "style"),
702+
(("base", "link", "meta", "script", "style", "title"),
693703
self.startTagProcessInHead),
694-
("title", self.startTagTitle),
695704
("body", self.startTagBody),
696705
(("address", "blockquote", "center", "dir", "div", "dl",
697706
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@@ -705,7 +714,7 @@ def __init__(self, parser, tree):
705714
"tt", "u"),self.startTagFormatting),
706715
("nobr", self.startTagNobr),
707716
("button", self.startTagButton),
708-
(("marquee", "object"), self.startTagMarqueeObject),
717+
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
709718
("xmp", self.startTagXmp),
710719
("table", self.startTagTable),
711720
(("area", "basefont", "bgsound", "br", "embed", "img", "param",
@@ -736,7 +745,7 @@ def __init__(self, parser, tree):
736745
(headingElements, self.endTagHeading),
737746
(("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
738747
"strike", "strong", "tt", "u"), self.endTagFormatting),
739-
(("marquee", "object", "button"), self.endTagButtonMarqueeObject),
748+
(("applet", "marquee", "object", "button"), self.endTagAppletButtonMarqueeObject),
740749
(("head", "frameset", "select", "optgroup", "option", "table",
741750
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
742751
"td", "th"), self.endTagMisplaced),
@@ -759,11 +768,11 @@ def addFormattingElement(self, name, attributes):
759768

760769
# the real deal
761770
def processSpaceCharactersDropNewline(self, data):
762-
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
763-
# leading newlines
771+
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
772+
# want to drop leading newlines
764773
self.processSpaceCharacters = self.processSpaceCharactersNonPre
765774
if (data.startswith("\n") and
766-
self.tree.openElements[-1].name in ("pre", "textarea") and
775+
self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
767776
not self.tree.openElements[-1].hasContent()):
768777
data = data[1:]
769778
if data:
@@ -785,11 +794,6 @@ def processSpaceCharacters(self, data):
785794
def startTagProcessInHead(self, name, attributes):
786795
self.parser.phases["inHead"].processStartTag(name, attributes)
787796

788-
def startTagTitle(self, name, attributes):
789-
self.parser.parseError("unexpected-start-tag-out-of-my-head",
790-
{"name": name})
791-
self.parser.phases["inHead"].processStartTag(name, attributes)
792-
793797
def startTagBody(self, name, attributes):
794798
self.parser.parseError("unexpected-start-tag", {"name": "body"})
795799
if (len(self.tree.openElements) == 1
@@ -804,7 +808,7 @@ def startTagCloseP(self, name, attributes):
804808
if self.tree.elementInScope("p"):
805809
self.endTagP("p")
806810
self.tree.insertElement(name, attributes)
807-
if name == "pre":
811+
if name in ("pre", "listing"):
808812
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
809813

810814
def startTagForm(self, name, attributes):
@@ -902,7 +906,7 @@ def startTagButton(self, name, attributes):
902906
self.tree.insertElement(name, attributes)
903907
self.tree.activeFormattingElements.append(Marker)
904908

905-
def startTagMarqueeObject(self, name, attributes):
909+
def startTagAppletMarqueeObject(self, name, attributes):
906910
self.tree.reconstructActiveFormattingElements()
907911
self.tree.insertElement(name, attributes)
908912
self.tree.activeFormattingElements.append(Marker)
@@ -1201,7 +1205,7 @@ def endTagFormatting(self, name):
12011205
self.tree.openElements.insert(
12021206
self.tree.openElements.index(furthestBlock) + 1, clone)
12031207

1204-
def endTagButtonMarqueeObject(self, name):
1208+
def endTagAppletButtonMarqueeObject(self, name):
12051209
if self.tree.elementInScope(name):
12061210
self.tree.generateImpliedEndTags()
12071211
if self.tree.openElements[-1].name != name:
@@ -1269,12 +1273,15 @@ def __init__(self, parser, tree):
12691273
("col", self.startTagCol),
12701274
(("tbody", "tfoot", "thead"), self.startTagRowGroup),
12711275
(("td", "th", "tr"), self.startTagImplyTbody),
1272-
("table", self.startTagTable)
1276+
("table", self.startTagTable),
1277+
(("style", "script"), self.startTagStyleScript),
1278+
("input", self.startTagInput)
12731279
])
12741280
self.startTagHandler.default = self.startTagOther
12751281

12761282
self.endTagHandler = utils.MethodDispatcher([
12771283
("table", self.endTagTable),
1284+
(("style", "script"), self.endTagStyleScript),
12781285
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
12791286
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
12801287
])
@@ -1289,14 +1296,30 @@ def clearStackToTableContext(self):
12891296
self.tree.openElements.pop()
12901297
# When the current node is <html> it's an innerHTML case
12911298

1299+
def getCurrentTable(self):
1300+
i = -1
1301+
while self.tree.openElements[i].name != "table":
1302+
i -= 1
1303+
return self.tree.openElements[i]
1304+
12921305
# processing methods
1306+
def processSpaceCharacters(self, data):
1307+
if "tainted" not in self.getCurrentTable()._flags:
1308+
self.tree.insertText(data)
1309+
else:
1310+
self.processCharacters(data)
1311+
12931312
def processCharacters(self, data):
1294-
self.parser.parseError("unexpected-char-implies-table-voodoo")
1295-
# Make all the special element rearranging voodoo kick in
1296-
self.tree.insertFromTable = True
1297-
# Process the character in the "in body" mode
1298-
self.parser.phases["inBody"].processCharacters(data)
1299-
self.tree.insertFromTable = False
1313+
if self.tree.openElements[-1].name in ("style", "script"):
1314+
self.tree.insertText(data)
1315+
else:
1316+
if "tainted" not in self.getCurrentTable()._flags:
1317+
self.parser.parseError("unexpected-char-implies-table-voodoo")
1318+
self.getCurrentTable()._flags.append("tainted")
1319+
# Do the table magic!
1320+
self.tree.insertFromTable = True
1321+
self.parser.phases["inBody"].processCharacters(data)
1322+
self.tree.insertFromTable = False
13001323

13011324
def startTagCaption(self, name, attributes):
13021325
self.clearStackToTableContext()
@@ -1329,12 +1352,27 @@ def startTagTable(self, name, attributes):
13291352
if not self.parser.innerHTML:
13301353
self.parser.phase.processStartTag(name, attributes)
13311354

1355+
def startTagStyleScript(self, name, attributes):
1356+
if "tainted" not in self.getCurrentTable()._flags:
1357+
self.parser.phases["inHead"].processStartTag(name, attributes)
1358+
else:
1359+
self.startTagOther(name, attributes)
1360+
1361+
def startTagInput(self, name, attributes):
1362+
if "type" in attributes and attributes["type"].translate(asciiUpper2Lower) == "hidden" and "tainted" not in self.getCurrentTable()._flags:
1363+
self.parser.parseError("unpexted-hidden-input-in-table")
1364+
self.tree.insertElement(name, attributes)
1365+
# XXX associate with form
1366+
self.tree.openElements.pop()
1367+
else:
1368+
self.startTagOther(name, attributes)
1369+
13321370
def startTagOther(self, name, attributes):
1333-
self.parser.parseError("unexpected-start-tag-implies-table-voodoo",
1334-
{"name": name})
1335-
# Make all the special element rearranging voodoo kick in
1371+
if "tainted" not in self.getCurrentTable()._flags:
1372+
self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": name})
1373+
self.getCurrentTable()._flags.append("tainted")
1374+
# Do the table magic!
13361375
self.tree.insertFromTable = True
1337-
# Process the start tag in the "in body" mode
13381376
self.parser.phases["inBody"].processStartTag(name, attributes)
13391377
self.tree.insertFromTable = False
13401378

@@ -1354,15 +1392,21 @@ def endTagTable(self, name):
13541392
assert self.parser.innerHTML
13551393
self.parser.parseError()
13561394

1395+
def endTagStyleScript(self, name):
1396+
if "tainted" not in self.getCurrentTable()._flags:
1397+
self.parser.phases["inHead"].processEndTag(name)
1398+
else:
1399+
self.endTagOther(name)
1400+
13571401
def endTagIgnore(self, name):
13581402
self.parser.parseError("unexpected-end-tag", {"name": name})
13591403

13601404
def endTagOther(self, name):
1361-
self.parser.parseError("unexpected-end-tag-implies-table-voodoo",
1362-
{"name": name})
1363-
# Make all the special element rearranging voodoo kick in
1405+
if "tainted" not in self.getCurrentTable()._flags:
1406+
self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": name})
1407+
self.getCurrentTable()._flags.append("tainted")
1408+
# Do the table magic!
13641409
self.tree.insertFromTable = True
1365-
# Process the end tag in the "in body" mode
13661410
self.parser.phases["inBody"].processEndTag(name)
13671411
self.tree.insertFromTable = False
13681412

@@ -1749,7 +1793,8 @@ def __init__(self, parser, tree):
17491793
("html", self.startTagHtml),
17501794
("option", self.startTagOption),
17511795
("optgroup", self.startTagOptgroup),
1752-
("select", self.startTagSelect)
1796+
("select", self.startTagSelect),
1797+
("input", self.startTagInput)
17531798
])
17541799
self.startTagHandler.default = self.startTagOther
17551800

@@ -1783,6 +1828,11 @@ def startTagSelect(self, name, attributes):
17831828
self.parser.parseError("unexpected-select-in-select")
17841829
self.endTagSelect("select")
17851830

1831+
def startTagInput(self, name, attributes):
1832+
self.parser.parseError("unexpected-input-in-select")
1833+
self.endTagSelect("select")
1834+
self.parser.phase.processStartTag(name, attributes)
1835+
17861836
def startTagOther(self, name, attributes):
17871837
self.parser.parseError("unexpected-start-tag-in-select",
17881838
{"name": name})

src/html5lib/liberalxmlparser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def normalizeToken(self, token):
9393

9494
return token
9595

96-
class XhmlRootPhase(html5parser.RootElementPhase):
96+
class XhmlRootPhase(html5parser.BeforeHtmlPhase):
9797
def insertHtmlElement(self):
9898
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
9999
self.tree.openElements.append(element)

0 commit comments

Comments
 (0)