Skip to content

Commit d2ba0c0

Browse files
committed
Implement generic (R)CDATA parsing algorithm
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401146
1 parent 55be4f1 commit d2ba0c0

File tree

1 file changed

+37
-50
lines changed

1 file changed

+37
-50
lines changed

src/html5lib/html5parser.py

Lines changed: 37 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,27 @@ def startTagHtml(self, name, attributes):
298298
def processEndTag(self, name):
299299
self.endTagHandler[name](name)
300300

301+
def parseRCDataCData(self, name, attributes, contentType):
    """Generic (R)CDATA parsing algorithm.

    Inserts an element for the start tag, sets the tokenizer's content
    model flag to ``contentType`` and then pulls tokens straight from the
    tokenizer. Character tokens are inserted as text and parse errors are
    forwarded to the parser. The first token of any other type ends the
    loop; it is asserted to be the matching end tag. If the tokenizer is
    exhausted first, a parse error is reported instead.

    name - tag name of the element to insert
    attributes - attributes passed through to ``tree.insertElement``
    contentType - "RCDATA" or "CDATA"
    """
    assert contentType in ("CDATA", "RCDATA")
    element = self.tree.insertElement(name, attributes)
    self.parser.tokenizer.contentModelFlag = contentModelFlags[contentType]
    for token in self.parser.tokenizer:
        if token["type"] in ("Characters", "SpaceCharacters"):
            self.tree.insertText(token["data"])
        elif token["type"] == "ParseError":
            self.parser.parseError(token["data"], token.get("datavars", {}))
        else:
            assert self.parser.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
            assert token["type"] == "EndTag" and token["name"] == name, repr(token)
            # Pop outside the assert: asserts are stripped under ``python -O``,
            # which would otherwise leave the element on the stack.
            node = self.tree.openElements.pop()
            assert node == element
            return
    # Otherwise we hit EOF
    # Same as above: the pop must happen even when asserts are disabled.
    node = self.tree.openElements.pop()
    assert node == element
    self.parser.parseError("expected-closing-tag-but-got-eof")
321+
301322

302323
class InitialPhase(Phase):
303324
# This phase deals with error handling as well which is currently not
@@ -549,10 +570,6 @@ def appendToHead(self, element):
549570

550571
# the real thing
551572
def processEOF (self):
552-
if self.tree.openElements[-1].name in ("title", "style", "script", "noscript"):
553-
self.parser.parseError("expected-named-closing-tag-but-got-eof",
554-
{"name": self.tree.openElements[-1].name})
555-
self.tree.openElements.pop()
556573
self.anythingElse()
557574
self.parser.phase.processEOF()
558575

@@ -568,44 +585,18 @@ def startTagHead(self, name, attributes):
568585
self.parser.parseError("two-heads-are-not-better-than-one")
569586

570587
def startTagTitle(self, name, attributes):
571-
if self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]:
572-
element = self.tree.createElement(name, attributes)
573-
self.appendToHead(element)
574-
self.tree.openElements.append(element)
575-
else:
576-
self.tree.insertElement(name, attributes)
577-
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
588+
self.parseRCDataCData(name, attributes, "RCDATA")
578589

579590
def startTagStyle(self, name, attributes):
580-
if self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]:
581-
element = self.tree.createElement(name, attributes)
582-
self.appendToHead(element)
583-
self.tree.openElements.append(element)
584-
else:
585-
self.tree.insertElement(name, attributes)
586-
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
591+
self.parseRCDataCData(name, attributes, "CDATA")
587592

588593
def startTagNoScript(self, name, attributes):
589-
# XXX Need to decide whether to implement the scripting disabled case.
590-
if self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]:
591-
element = self.tree.createElement(name, attributes)
592-
self.appendToHead(element)
593-
self.tree.openElements.append(element)
594-
else:
595-
self.tree.insertElement(name, attributes)
596-
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
594+
#Need to decide whether to implement the scripting-disabled case
595+
self.parseRCDataCData(name, attributes, "CDATA")
597596

598597
def startTagScript(self, name, attributes):
599-
#XXX Inner HTML case may be wrong
600-
element = None
601-
if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]):
602-
element = self.tree.createElement(name, attributes)
603-
self.appendToHead(element)
604-
self.tree.openElements.append(element)
605-
else:
606-
element = self.tree.insertElement(name, attributes)
607-
element._flags.append("parser-inserted")
608-
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
598+
#I think this is equivalent to the CDATA stuff since we don't execute script
599+
self.parseRCDataCData(name, attributes, "CDATA")
609600

610601
def startTagBaseLinkMeta(self, name, attributes):
611602
if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]):
@@ -620,10 +611,8 @@ def startTagOther(self, name, attributes):
620611
self.parser.phase.processStartTag(name, attributes)
621612

622613
def endTagHead(self, name):
623-
if self.tree.openElements[-1].name == "head":
624-
self.tree.openElements.pop()
625-
else:
626-
self.parser.parseError("unexpected-end-tag", {"name": "head"})
614+
assert self.tree.openElements[-1].name == "head"
615+
self.tree.openElements.pop()
627616
self.parser.phase = self.parser.phases["afterHead"]
628617

629618
def endTagImplyAfterHead(self, name):
@@ -640,10 +629,8 @@ def endTagOther(self, name):
640629
self.parser.parseError("unexpected-end-tag", {"name": name})
641630

642631
def anythingElse(self):
643-
if self.tree.openElements[-1].name == "head":
644-
self.endTagHead("head")
645-
else:
646-
self.parser.phase = self.parser.phases["afterHead"]
632+
self.endTagHead("head")
633+
647634

648635
# XXX If we implement a parser for which scripting is disabled we need to
649636
# implement this phase.
@@ -682,8 +669,10 @@ def startTagFrameset(self, name, attributes):
682669
def startTagFromHead(self, name, attributes):
683670
self.parser.parseError("unexpected-start-tag-out-of-my-head",
684671
{"name": name})
685-
self.parser.phase = self.parser.phases["inHead"]
686-
self.parser.phase.processStartTag(name, attributes)
672+
self.tree.openElements.append(self.tree.headPointer)
673+
self.parser.phases["inHead"].processStartTag(name, attributes)
674+
node = self.tree.openElements.pop()
675+
assert node is self.tree.headPointer, "Node name is %s, expected head"%node.name
687676

688677
def startTagOther(self, name, attributes):
689678
self.anythingElse()
@@ -932,8 +921,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
932921

933922
def startTagXmp(self, name, attributes):
934923
self.tree.reconstructActiveFormattingElements()
935-
self.tree.insertElement(name, attributes)
936-
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
924+
self.parseRCDataCData(name, attributes, "CDATA")
937925

938926
def startTagTable(self, name, attributes):
939927
if self.tree.elementInScope("p"):
@@ -993,8 +981,7 @@ def startTagTextarea(self, name, attributes):
993981

994982
def startTagCdata(self, name, attributes):
995983
"""iframe, noembed noframes, noscript(if scripting enabled)"""
996-
self.tree.insertElement(name, attributes)
997-
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
984+
self.parseRCDataCData(name, attributes, "CDATA")
998985

999986
def startTagSelect(self, name, attributes):
1000987
self.tree.reconstructActiveFormattingElements()

0 commit comments

Comments
 (0)