Skip to content

Commit 566ca68

Browse files
committed
Move RCData parser to where it should have been to begin with in order to fix last lxml unit test
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401151
1 parent abfca04 commit 566ca68

File tree

3 files changed

+47
-38
lines changed

3 files changed

+47
-38
lines changed

src/html5lib/html5parser.py

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7575
"afterBody": AfterBodyPhase(self, self.tree),
7676
"inFrameset": InFramesetPhase(self, self.tree),
7777
"afterFrameset": AfterFramesetPhase(self, self.tree),
78-
"trailingEnd": TrailingEndPhase(self, self.tree)
78+
"trailingEnd": TrailingEndPhase(self, self.tree),
7979
# XXX after after body
8080
# XXX after after frameset
8181
# XXX trailingEnd is gone
@@ -117,10 +117,11 @@ def _parse(self, stream, innerHTML=False, container="div",
117117
# relevant ... need others too
118118
self.lastPhase = None
119119

120+
self.beforeRCDataPhase = None
121+
120122
# XXX This is temporary for the moment so there isn't any other
121123
# changes needed for the parser to work with the iterable tokenizer
122-
for token in self.tokenizer:
123-
token = self.normalizeToken(token)
124+
for token in self.normalizedTokens():
124125
type = token["type"]
125126
method = getattr(self.phase, "process%s" % type, None)
126127
if type in ("Characters", "SpaceCharacters", "Comment"):
@@ -137,6 +138,10 @@ def _parse(self, stream, innerHTML=False, container="div",
137138
# When the loop finishes it's EOF
138139
self.phase.processEOF()
139140

141+
def normalizedTokens(self):
142+
for token in self.tokenizer:
143+
yield self.normalizeToken(token)
144+
140145
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
141146
"""Parse a HTML document into a well-formed tree
142147
@@ -238,6 +243,29 @@ def resetInsertionMode(self):
238243
self.phase = self.phases["inBody"]
239244
break
240245

246+
def parseRCDataCData(self, name, attributes, contentType):
247+
"""Generic (R)CDATA Parsing algorithm
248+
contentType - RCDATA or CDATA
249+
"""
250+
assert contentType in ("CDATA", "RCDATA")
251+
252+
element = self.tree.insertElement(name, attributes)
253+
self.tokenizer.contentModelFlag = contentModelFlags[contentType]
254+
255+
for token in self.normalizedTokens():
256+
if token["type"] in ("Characters", "SpaceCharacters"):
257+
self.tree.insertText(token["data"])
258+
elif token["type"] == "ParseError":
259+
self.parseError(token["data"], token.get("datavars", {}))
260+
else:
261+
assert self.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
262+
assert token["type"] == "EndTag" and token["name"] == name, repr(token)
263+
assert self.tree.openElements.pop() == element
264+
return
265+
#Otherwise we hit EOF
266+
assert self.tree.openElements.pop() == element
267+
self.parseError("expected-closing-tag-but-got-eof")
268+
241269
class Phase(object):
242270
"""Base class for helper object that implements each phase of processing
243271
"""
@@ -298,29 +326,6 @@ def startTagHtml(self, name, attributes):
298326
def processEndTag(self, name):
299327
self.endTagHandler[name](name)
300328

301-
def parseRCDataCData(self, name, attributes, contentType):
302-
"""Generic (R)CDATA Parsing algorithm
303-
contentType - RCDATA or CDATA
304-
"""
305-
assert contentType in ("CDATA", "RCDATA")
306-
element = self.tree.insertElement(name, attributes)
307-
self.parser.tokenizer.contentModelFlag = contentModelFlags[contentType]
308-
for token in self.parser.tokenizer:
309-
token = self.parser.normalizeToken(token)
310-
if token["type"] in ("Characters", "SpaceCharacters"):
311-
self.tree.insertText(token["data"])
312-
elif token["type"] == "ParseError":
313-
self.parser.parseError(token["data"], token.get("datavars", {}))
314-
else:
315-
assert self.parser.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
316-
assert token["type"] == "EndTag" and token["name"] == name, repr(token)
317-
assert self.tree.openElements.pop() == element
318-
return
319-
#Otherwise we hit EOF
320-
assert self.tree.openElements.pop() == element
321-
self.parser.parseError("expected-closing-tag-but-got-eof")
322-
323-
324329
class InitialPhase(Phase):
325330
# This phase deals with error handling as well which is currently not
326331
# covered in the specification. The error handling is typically known as
@@ -586,18 +591,18 @@ def startTagHead(self, name, attributes):
586591
self.parser.parseError("two-heads-are-not-better-than-one")
587592

588593
def startTagTitle(self, name, attributes):
589-
self.parseRCDataCData(name, attributes, "RCDATA")
594+
self.parser.parseRCDataCData(name, attributes, "RCDATA")
590595

591596
def startTagStyle(self, name, attributes):
592-
self.parseRCDataCData(name, attributes, "CDATA")
597+
self.parser.parseRCDataCData(name, attributes, "CDATA")
593598

594599
def startTagNoScript(self, name, attributes):
595600
#Need to decide whether to implement the scripting-disabled case
596-
self.parseRCDataCData(name, attributes, "CDATA")
601+
self.parser.parseRCDataCData(name, attributes, "CDATA")
597602

598603
def startTagScript(self, name, attributes):
599604
#I think this is equivalent to the CDATA stuff since we don't execute script
600-
self.parseRCDataCData(name, attributes, "CDATA")
605+
self.parser.parseRCDataCData(name, attributes, "CDATA")
601606

602607
def startTagBaseLinkMeta(self, name, attributes):
603608
if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]):
@@ -612,7 +617,7 @@ def startTagOther(self, name, attributes):
612617
self.parser.phase.processStartTag(name, attributes)
613618

614619
def endTagHead(self, name):
615-
assert self.tree.openElements[-1].name == "head"
620+
assert self.tree.openElements[-1].name == "head", "Expected head got %s"%self.tree.openElements[-1].name
616621
self.tree.openElements.pop()
617622
self.parser.phase = self.parser.phases["afterHead"]
618623

@@ -922,7 +927,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
922927

923928
def startTagXmp(self, name, attributes):
924929
self.tree.reconstructActiveFormattingElements()
925-
self.parseRCDataCData(name, attributes, "CDATA")
930+
self.parser.parseRCDataCData(name, attributes, "CDATA")
926931

927932
def startTagTable(self, name, attributes):
928933
if self.tree.elementInScope("p"):
@@ -982,7 +987,7 @@ def startTagTextarea(self, name, attributes):
982987

983988
def startTagCdata(self, name, attributes):
984989
"""iframe, noembed noframes, noscript(if scripting enabled)"""
985-
self.parseRCDataCData(name, attributes, "CDATA")
990+
self.parser.parseRCDataCData(name, attributes, "CDATA")
986991

987992
def startTagSelect(self, name, attributes):
988993
self.tree.reconstructActiveFormattingElements()

src/html5lib/liberalxmlparser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ def _parse(self, stream, innerHTML=False, container="div", encoding=None,
6363
encoding, lowercaseElementName=False,
6464
lowercaseAttrName=False)
6565

66+
def parseRCDataCData(self, name, attributes, contentType):
67+
self.tree.insertElement(name, attributes)
68+
6669
class XHTMLParser(XMLParser):
6770
""" liberal XHTML parser """
6871

src/html5lib/tokenizer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -315,10 +315,11 @@ def dataState(self):
315315
self.escapeFlag and "".join(self.lastFourChars) == "<!--":
316316
self.escapeFlag = True
317317
self.tokenQueue.append({"type": "Characters", "data":data})
318-
elif data == "<" and (self.contentModelFlag ==\
319-
contentModelFlags["PCDATA"] or (self.contentModelFlag in
320-
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
321-
self.escapeFlag == False)):
318+
elif (data == "<" and (self.contentModelFlag == contentModelFlags["PCDATA"]
319+
or (self.contentModelFlag in
320+
(contentModelFlags["CDATA"],
321+
contentModelFlags["RCDATA"]) and
322+
self.escapeFlag == False))):
322323
self.state = self.states["tagOpen"]
323324
elif data == ">" and self.contentModelFlag in\
324325
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
@@ -343,7 +344,7 @@ def dataState(self):
343344
self.lastFourChars += chars[-4:]
344345
self.lastFourChars = self.lastFourChars[-4:]
345346
return True
346-
347+
347348
def entityDataState(self):
348349
entity = self.consumeEntity()
349350
if entity:

0 commit comments

Comments
 (0)