Skip to content

Commit 454a8ca

Browse files
committed
Correctness fixes for EOF handling and parse errors
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401143
1 parent 5999365 commit 454a8ca

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed

src/html5lib/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@
234234
"unexpected-end-tag-after-frameset":
235235
_(u"Unexpected end tag (%(name)s)"
236236
u" in the after frameset phase. Ignored."),
237+
"unexpected-end-tag-after-body-innerhtml":
238+
_(u"Unexpected end tag after body(innerHtml)"),
237239
"expected-eof-but-got-char":
238240
_(u"Unexpected non-space characters. Expected end of file."),
239241
"expected-eof-but-got-start-tag":
@@ -242,6 +244,12 @@
242244
"expected-eof-but-got-end-tag":
243245
_(u"Unexpected end tag (%(name)s)"
244246
u". Expected end of file."),
247+
"eof-in-table":
248+
_(u"Unexpected end of file. Expected table content."),
249+
"eof-in-select":
250+
_(u"Unexpected end of file. Expected select content."),
251+
"eof-in-frameset":
252+
_(u"Unexpected end of file. Expected frameset content."),
245253
"XXX-undefined-error":
246254
(u"Undefined error (this sucks and should be fixed)"),
247255
}

src/html5lib/html5parser.py

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ def __init__(self, parser, tree):
257257
self.tree = tree
258258

259259
def processEOF(self):
260+
raise NotImplementedError
260261
self.tree.generateImpliedEndTags()
261262
if len(self.tree.openElements) > 2:
262263
self.parser.parseError("expected-closing-tag-but-got-eof")
@@ -547,11 +548,11 @@ def appendToHead(self, element):
547548
self.tree.openElements[-1].appendChild(element)
548549

549550
# the real thing
550-
def processEOF(self):
551+
def processEOF (self):
551552
if self.tree.openElements[-1].name in ("title", "style", "script", "noscript"):
552553
self.parser.parseError("expected-named-closing-tag-but-got-eof",
553554
{"name": self.tree.openElements[-1].name})
554-
self.tree.openElements.pop()
555+
self.tree.openElements.pop()
555556
self.anythingElse()
556557
self.parser.phase.processEOF()
557558

@@ -776,6 +777,15 @@ def addFormattingElement(self, name, attributes):
776777
self.tree.openElements[-1])
777778

778779
# the real deal
780+
def processEOF(self):
781+
allowed_elements = set(("dd", "dt", "li", "p", "tbody", "td", "tfoot",
782+
"th", "thead", "tr", "body", "html"))
783+
for node in self.tree.openElements[::-1]:
784+
if node.name not in allowed_elements:
785+
self.parser.parseError("expected-closing-tag-but-got-eof")
786+
break
787+
#Stop parsing
788+
779789
def processSpaceCharactersDropNewline(self, data):
780790
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
781791
# want to drop leading newlines
@@ -1311,8 +1321,8 @@ def __init__(self, parser, tree):
13111321
def clearStackToTableContext(self):
13121322
# "clear the stack back to a table context"
13131323
while self.tree.openElements[-1].name not in ("table", "html"):
1314-
self.parser.parseError("unexpected-implied-end-tag-in-table",
1315-
{"name": self.tree.openElements[-1].name})
1324+
#self.parser.parseError("unexpected-implied-end-tag-in-table",
1325+
# {"name": self.tree.openElements[-1].name})
13161326
self.tree.openElements.pop()
13171327
# When the current node is <html> it's an innerHTML case
13181328

@@ -1323,6 +1333,13 @@ def getCurrentTable(self):
13231333
return self.tree.openElements[i]
13241334

13251335
# processing methods
1336+
def processEOF(self):
1337+
if self.tree.openElements[-1].name != "html":
1338+
self.parser.parseError("eof-in-table")
1339+
else:
1340+
assert self.parser.innerHTML
1341+
#Stop parsing
1342+
13261343
def processSpaceCharacters(self, data):
13271344
if "tainted" not in self.getCurrentTable()._flags:
13281345
self.tree.insertText(data)
@@ -1454,6 +1471,9 @@ def __init__(self, parser, tree):
14541471
def ignoreEndTagCaption(self):
14551472
return not self.tree.elementInScope("caption", True)
14561473

1474+
def processEOF(self):
1475+
self.parser.phases["inBody"].processEOF()
1476+
14571477
def processCharacters(self, data):
14581478
self.parser.phases["inBody"].processCharacters(data)
14591479

@@ -1521,6 +1541,16 @@ def __init__(self, parser, tree):
15211541
def ignoreEndTagColgroup(self):
15221542
return self.tree.openElements[-1].name == "html"
15231543

1544+
def processEOF(self):
1545+
if self.tree.openElements[-1].name == "html":
1546+
assert self.parser.innerHTML
1547+
return
1548+
else:
1549+
ignoreEndTag = self.ignoreEndTagColgroup()
1550+
self.endTagColgroup("colgroup")
1551+
if not ignoreEndTag:
1552+
self.parser.phase.processEOF()
1553+
15241554
def processCharacters(self, data):
15251555
ignoreEndTag = self.ignoreEndTagColgroup()
15261556
self.endTagColgroup("colgroup")
@@ -1564,7 +1594,8 @@ def __init__(self, parser, tree):
15641594
("html", self.startTagHtml),
15651595
("tr", self.startTagTr),
15661596
(("td", "th"), self.startTagTableCell),
1567-
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther)
1597+
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
1598+
self.startTagTableOther)
15681599
])
15691600
self.startTagHandler.default = self.startTagOther
15701601

@@ -1580,13 +1611,18 @@ def __init__(self, parser, tree):
15801611
def clearStackToTableBodyContext(self):
15811612
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
15821613
"thead", "html"):
1583-
self.parser.parseError("unexpected-implied-end-tag-in-table",
1584-
{"name": self.tree.openElements[-1].name})
1614+
#self.parser.parseError("unexpected-implied-end-tag-in-table",
1615+
# {"name": self.tree.openElements[-1].name})
15851616
self.tree.openElements.pop()
1617+
if self.tree.openElements[-1].name == "html":
1618+
assert self.parser.innerHTML
15861619

15871620
# the rest
1621+
def processEOF(self):
1622+
self.parser.phases["inTable"].processEOF()
1623+
15881624
def processSpaceCharacters(self,data):
1589-
self.parser.phases["inTable"].processSpaceCharacters(data)
1625+
self.parser.phases["inTable"].processSpaceCharacters(data)
15901626

15911627
def processCharacters(self,data):
15921628
self.parser.phases["inTable"].processCharacters(data)
@@ -1676,6 +1712,9 @@ def ignoreEndTagTr(self):
16761712
return not self.tree.elementInScope("tr", tableVariant=True)
16771713

16781714
# the rest
1715+
def processEOF(self):
1716+
self.parser.phases["inTable"].processEOF()
1717+
16791718
def processSpaceCharacters(self, data):
16801719
self.parser.phases["inTable"].processSpaceCharacters(data)
16811720

@@ -1757,6 +1796,9 @@ def closeCell(self):
17571796
self.endTagTableCell("th")
17581797

17591798
# the rest
1799+
def processEOF(self):
1800+
self.parser.phases["inBody"].processEOF()
1801+
17601802
def processCharacters(self, data):
17611803
self.parser.phases["inBody"].processCharacters(data)
17621804

@@ -1834,6 +1876,12 @@ def __init__(self, parser, tree):
18341876
self.endTagHandler.default = self.endTagOther
18351877

18361878
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
1879+
def processEOF(self):
    """Handle end-of-file while this phase is active.

    If the current node (the last entry of ``self.tree.openElements``)
    is anything other than ``html``, report the ``"eof-in-select"`` parse
    error.  Otherwise only the fragment (innerHTML) case is expected, so
    that invariant is asserted instead.
    """
    current_node = self.tree.openElements[-1]
    if current_node.name != "html":
        self.parser.parseError("eof-in-select")
    else:
        # The parser attribute is spelled ``innerHTML`` everywhere else in
        # this file; ``innerHtml`` raised AttributeError instead of
        # checking the invariant.
        assert self.parser.innerHTML
1884+
18371885
def processCharacters(self, data):
18381886
self.tree.insertText(data)
18391887

@@ -1919,6 +1967,9 @@ def __init__(self, parser, tree):
19191967
])
19201968
self.endTagHandler.default = self.endTagOther
19211969

1970+
def processEOF(self):
1971+
self.parser.phases["inSelect"].processEOF()
1972+
19221973
def processCharacters(self, data):
19231974
self.parser.phases["inSelect"].processCharacters(data)
19241975

@@ -1948,6 +1999,10 @@ def __init__(self, parser, tree):
19481999
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
19492000
self.endTagHandler.default = self.endTagOther
19502001

2002+
def processEOF(self):
2003+
#Stop parsing
2004+
pass
2005+
19512006
def processComment(self, data):
19522007
# This is needed because data is to be appended to the <html> element
19532008
# here and not to whatever is currently open.
@@ -1966,7 +2021,7 @@ def processStartTag(self, name, attributes):
19662021

19672022
def endTagHtml(self,name):
19682023
if self.parser.innerHTML:
1969-
self.parser.parseError()
2024+
self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
19702025
else:
19712026
# XXX: This may need to be done, not sure:
19722027
# Don't set lastPhase to the current phase but to the inBody phase
@@ -2001,6 +2056,12 @@ def __init__(self, parser, tree):
20012056
])
20022057
self.endTagHandler.default = self.endTagOther
20032058

2059+
def processEOF(self):
2060+
if self.tree.openElements[-1].name != "html":
2061+
self.parser.parseError("eof-in-frameset")
2062+
else:
2063+
assert self.parser.innerHTML
2064+
20042065
def processCharacters(self, data):
20052066
self.parser.parseError("unexpected-char-in-frameset")
20062067

@@ -2054,6 +2115,10 @@ def __init__(self, parser, tree):
20542115
])
20552116
self.endTagHandler.default = self.endTagOther
20562117

2118+
def processEOF(self):
2119+
#Stop parsing
2120+
pass
2121+
20572122
def processCharacters(self, data):
20582123
self.parser.parseError("unexpected-char-after-frameset")
20592124

0 commit comments

Comments
 (0)