Skip to content

Commit 3b0ed45

Browse files
author
James Graham
committed
Tokenizer part of the changes to support CDATA sections in foreign content
--HG-- extra : transplant_source : %7C9%04%C8%F3%ED%C6%12%A9%C8%5E1%C2J%A2%1Ak%23T%CC
1 parent 97ed427 commit 3b0ed45

File tree

1 file changed

+42
-2
lines changed

1 file changed

+42
-2
lines changed

html5lib/tokenizer.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,11 @@ class HTMLTokenizer:
3939
# XXX need to fix documentation
4040

4141
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
42-
lowercaseElementName=True, lowercaseAttrName=True):
42+
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
4343

4444
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
45-
45+
self.parser = parser
46+
4647
#Perform case conversions?
4748
self.lowercaseElementName = lowercaseElementName
4849
self.lowercaseAttrName = lowercaseAttrName
@@ -1062,6 +1063,19 @@ def markupDeclarationOpenState(self):
10621063
"correct": True}
10631064
self.state = self.doctypeState
10641065
return True
1066+
elif (charStack[-1] == "[" and
1067+
self.parser is not None and
1068+
self.parser.phase == self.parser.phases["inForeignContent"] and
1069+
self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
1070+
matched = True
1071+
for expected in ["C", "D", "A", "T", "A", "["]:
1072+
charStack.append(self.stream.char())
1073+
if charStack[-1] != expected:
1074+
matched = False
1075+
break
1076+
if matched:
1077+
self.state = self.cdataSectionState
1078+
return True
10651079

10661080
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10671081
"expected-dashes-or-doctype"})
@@ -1563,3 +1577,29 @@ def bogusDoctypeState(self):
15631577
else:
15641578
pass
15651579
return True
1580+
1581+
def cdataSectionState(self):
    """Consume a CDATA section body up to the first "]]>" or EOF.

    Everything read before the terminator is emitted as a single
    Characters token (no token is queued when the section is empty).
    The "]]>" terminator itself is consumed but not emitted.  On EOF
    all text read so far, including any trailing "]" characters, is
    emitted.  The tokenizer then switches back to the data state.

    Returns True, like the other state methods visible in this file.
    """
    data = []
    while True:
        # Text before the next "]" can never belong to the terminator.
        data.append(self.stream.charsUntil(u"]"))
        # Read the run up to the next ">".  The section ends only when
        # that run ends with "]]".  Checking the tail of the run (rather
        # than matching exactly three characters) handles inputs such as
        # "]]]>" correctly: the extra leading "]" is data.
        candidate = self.stream.charsUntil(u">")
        char = self.stream.char()
        if char == EOF:
            # Unterminated section: keep everything that was read.
            data.append(candidate)
            break
        # charsUntil(">") stopped before a ">" so char is that ">" here
        # (assumes charsUntil stops at the first listed character or EOF,
        # which the original "]" matching logic also relied on).
        if candidate[-2:] == u"]]":
            data.append(candidate[:-2])
            break
        # A ">" not preceded by "]]" is ordinary data; keep scanning.
        data.append(candidate)
        data.append(char)
    data = "".join(data)
    if data:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data})
    self.state = self.dataState
    return True

0 commit comments

Comments
 (0)