Skip to content

Commit abd0b0b

Browse files
author
James Graham
committed
Make foreign content switch on namespace of current node rather than insertion mode
--HG-- extra : transplant_source : %E4%FE%9F%C8T%9Cd%8F/%0C%2Cm%95%1B6%A8%BF%A0%5Bd
1 parent 12dd376 commit abd0b0b

File tree

4 files changed, with 132 additions and 164 deletions.

html5lib/constants.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -483,7 +483,8 @@
483483
"area",
484484
"col",
485485
"input",
486-
"source"
486+
"source",
487+
"track"
487488
))
488489

489490
cdataElements = frozenset(('title', 'textarea'))

html5lib/html5parser.py

Lines changed: 81 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,19 @@ def reset(self):
150150
self.beforeRCDataPhase = None
151151

152152
self.framesetOK = True
153+
154+
def isHTMLIntegrationPoint(self, element):
155+
if (element.name == "annotation-xml" and
156+
element.namespace == namespaces["mathml"]):
157+
return ("encoding" in element.attributes and
158+
element.attributes["encoding"].translate(
159+
asciiUpper2Lower) in
160+
("text/html", "application/xhtml+xml"))
161+
else:
162+
return (element.namespace, element.name) in htmlIntegrationPointElements
163+
164+
def isMathMLTextIntegrationPoint(self, element):
165+
return (element.namespace, element.name) in mathmlTextIntegrationPointElements
153166

154167
def mainLoop(self):
155168
CharactersToken = tokenTypes["Characters"]
@@ -158,27 +171,48 @@ def mainLoop(self):
158171
EndTagToken = tokenTypes["EndTag"]
159172
CommentToken = tokenTypes["Comment"]
160173
DoctypeToken = tokenTypes["Doctype"]
161-
174+
ParseErrorToken = tokenTypes["ParseError"]
162175

163176
for token in self.normalizedTokens():
164177
new_token = token
165178
while new_token is not None:
179+
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
180+
currentNodeNamespace = currentNode.namespace if currentNode else None
181+
currentNodeName = currentNode.name if currentNode else None
182+
166183
type = new_token["type"]
167-
if type == CharactersToken:
168-
new_token = self.phase.processCharacters(new_token)
169-
elif type == SpaceCharactersToken:
170-
new_token= self.phase.processSpaceCharacters(new_token)
171-
elif type == StartTagToken:
172-
new_token = self.phase.processStartTag(new_token)
173-
elif type == EndTagToken:
174-
new_token = self.phase.processEndTag(new_token)
175-
elif type == CommentToken:
176-
new_token = self.phase.processComment(new_token)
177-
elif type == DoctypeToken:
178-
new_token = self.phase.processDoctype(new_token)
179-
else:
184+
185+
if type == ParseErrorToken:
180186
self.parseError(new_token["data"], new_token.get("datavars", {}))
181187
new_token = None
188+
else:
189+
if (len(self.tree.openElements) == 0 or
190+
currentNodeNamespace == self.tree.defaultNamespace or
191+
(self.isMathMLTextIntegrationPoint(currentNode) and
192+
((type == StartTagToken and
193+
token["name"] not in frozenset(["mglyph", "malignmark"])) or
194+
type in (CharactersToken, SpaceCharactersToken))) or
195+
(currentNodeNamespace == namespaces["mathml"] and
196+
currentNodeName == "annotation-xml" and
197+
token["name"] == "svg") or
198+
(self.isHTMLIntegrationPoint(currentNode) and
199+
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
200+
phase = self.phase
201+
else:
202+
phase = self.phases["inForeignContent"]
203+
204+
if type == CharactersToken:
205+
new_token = phase.processCharacters(new_token)
206+
elif type == SpaceCharactersToken:
207+
new_token= phase.processSpaceCharacters(new_token)
208+
elif type == StartTagToken:
209+
new_token = phase.processStartTag(new_token)
210+
elif type == EndTagToken:
211+
new_token = phase.processEndTag(new_token)
212+
elif type == CommentToken:
213+
new_token = phase.processComment(new_token)
214+
elif type == DoctypeToken:
215+
new_token = phase.processDoctype(new_token)
182216

183217
if (type == StartTagToken and token["selfClosing"]
184218
and not token["selfClosingAcknowledged"]):
@@ -379,12 +413,12 @@ def resetInsertionMode(self):
379413
if nodeName in ("select", "colgroup", "head", "html"):
380414
assert self.innerHTML
381415

416+
if not last and node.namespace != self.tree.defaultNamespace:
417+
continue
418+
382419
if nodeName in newModes:
383420
new_phase = self.phases[newModes[nodeName]]
384421
break
385-
elif node.namespace in (namespaces["mathml"], namespaces["svg"]):
386-
new_phase = self.phases["inForeignContent"]
387-
break
388422
elif last:
389423
new_phase = self.phases["inBody"]
390424
break
@@ -419,7 +453,6 @@ def wrapped(self, *args, **kwargs):
419453
try:
420454
info = {"type":type_names[token['type']]}
421455
except:
422-
print token
423456
raise
424457
if token['type'] in constants.tagTokenTypes:
425458
info["name"] = token['name']
@@ -1243,7 +1276,6 @@ def startTagMath(self, token):
12431276
self.tree.insertElement(token)
12441277
#Need to get the parse error right for the case where the token
12451278
#has a namespace not equal to the xmlns attribute
1246-
self.parser.phase = self.parser.phases["inForeignContent"]
12471279
if token["selfClosing"]:
12481280
self.tree.openElements.pop()
12491281
token["selfClosingAcknowledged"] = True
@@ -1256,7 +1288,6 @@ def startTagSvg(self, token):
12561288
self.tree.insertElement(token)
12571289
#Need to get the parse error right for the case where the token
12581290
#has a namespace not equal to the xmlns attribute
1259-
self.parser.phase = self.parser.phases["inForeignContent"]
12601291
if token["selfClosing"]:
12611292
self.tree.openElements.pop()
12621293
token["selfClosingAcknowledged"] = True
@@ -1741,7 +1772,7 @@ def processSpaceCharacters(self, token):
17411772
self.characterTokens.append(token)
17421773
# assert False
17431774

1744-
def processStartTag(self, token):
1775+
def processStartTag(self, token):
17451776
self.flushCharacters()
17461777
self.parser.phase = self.originalPhase
17471778
return token
@@ -2298,7 +2329,7 @@ def endTagOther(self, token):
22982329
class InForeignContentPhase(Phase):
22992330
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
23002331
"center", "code", "dd", "div", "dl", "dt",
2301-
"em", "embed", "font", "h1", "h2", "h3",
2332+
"em", "embed", "h1", "h2", "h3",
23022333
"h4", "h5", "h6", "head", "hr", "i", "img",
23032334
"li", "listing", "menu", "meta", "nobr",
23042335
"ol", "p", "pre", "ruby", "s", "small",
@@ -2307,19 +2338,6 @@ class InForeignContentPhase(Phase):
23072338
def __init__(self, parser, tree):
23082339
Phase.__init__(self, parser, tree)
23092340

2310-
def isHTMLIntegrationPoint(self, element):
2311-
if (element.name == "annotation-xml" and
2312-
element.namespace == namespaces["mathml"]):
2313-
return ("encoding" in element.attributes and
2314-
element.attributes["encoding"].translate(
2315-
asciiUpper2Lower) in
2316-
("text/html", "application/xhtml+xml"))
2317-
else:
2318-
return (element.namespace, element.name) in htmlIntegrationPointElements
2319-
2320-
def isMathMLTextIntegrationPoint(self, element):
2321-
return (element.namespace, element.name) in mathmlTextIntegrationPointElements
2322-
23232341
def adjustSVGTagNames(self, token):
23242342
replacements = {u"altglyph":u"altGlyph",
23252343
u"altglyphdef":u"altGlyphDef",
@@ -2362,48 +2380,25 @@ def adjustSVGTagNames(self, token):
23622380
token["name"] = replacements[token["name"]]
23632381

23642382
def processCharacters(self, token):
2365-
if (self.tree.openElements[-1].namespace == self.tree.defaultNamespace or
2366-
self.isHTMLIntegrationPoint(self.tree.openElements[-1])):
2367-
new_token = self.parser.phases["inBody"].processCharacters(token)
2368-
self.parser.resetInsertionMode()
2369-
return new_token
2370-
elif token["data"] == u"\u0000":
2383+
if token["data"] == u"\u0000":
23712384
token["data"] = u"\uFFFD"
23722385
elif (self.parser.framesetOK and
23732386
any(char not in spaceCharacters for char in token["data"])):
23742387
self.parser.framesetOK = False
23752388
Phase.processCharacters(self, token)
23762389

2377-
def processEOF(self):
2378-
reprocess = self.parser.phases["inBody"].processEOF()
2379-
self.parser.resetInsertionMode()
2380-
return reprocess
2381-
23822390
def processStartTag(self, token):
23832391
currentNode = self.tree.openElements[-1]
2384-
currentNodeNamespace = currentNode.namespace
2385-
currentNodeName = currentNode.name
2386-
if (currentNodeNamespace == self.tree.defaultNamespace or
2387-
(self.isMathMLTextIntegrationPoint(currentNode) and
2388-
token["name"] not in frozenset(["mglyph", "malignmark"])) or
2389-
(currentNodeNamespace == namespaces["mathml"] and
2390-
currentNodeName == "annotation-xml" and
2391-
token["name"] == "svg") or
2392-
self.isHTMLIntegrationPoint(currentNode)):
2393-
2394-
new_token = self.parser.phases["inBody"].processStartTag(token)
2395-
self.parser.resetInsertionMode()
2396-
return new_token
2397-
2398-
elif token["name"] in self.breakoutElements:
2392+
if (token["name"] in self.breakoutElements or
2393+
(token["name"] == "font" and
2394+
set(token["data"].keys()) | set("color", "face", "size"))):
23992395
self.parser.parseError("unexpected-html-element-in-foreign-content",
24002396
token["name"])
24012397
while (self.tree.openElements[-1].namespace !=
24022398
self.tree.defaultNamespace and
2403-
not self.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2404-
not self.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
2399+
not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2400+
not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
24052401
self.tree.openElements.pop()
2406-
self.parser.resetInsertionMode()
24072402
return token
24082403

24092404
else:
@@ -2420,33 +2415,29 @@ def processStartTag(self, token):
24202415
token["selfClosingAcknowledged"] = True
24212416

24222417
def processEndTag(self, token):
2423-
if self.tree.openElements[-1].namespace == self.tree.defaultNamespace:
2424-
new_token = self.parser.phases["inBody"].processEndTag(token)
2425-
self.parser.resetInsertionMode()
2426-
return new_token
2427-
else:
2428-
nodeIndex = len(self.tree.openElements) - 1
2429-
node = self.tree.openElements[-1]
2430-
if node.name != token["name"]:
2431-
self.parser.parseError("unexpected-end-tag", token["name"])
2432-
2433-
while True:
2434-
if node.name.translate(asciiUpper2Lower) == token["name"]:
2435-
while self.tree.openElements.pop() != node:
2436-
assert self.tree.openElements
2437-
new_token = None
2438-
break
2439-
nodeIndex -= 1
2418+
nodeIndex = len(self.tree.openElements) - 1
2419+
node = self.tree.openElements[-1]
2420+
if node.name != token["name"]:
2421+
self.parser.parseError("unexpected-end-tag", token["name"])
2422+
2423+
while True:
2424+
if node.name.translate(asciiUpper2Lower) == token["name"]:
2425+
if self.parser.phase == self.parser.phases["inTableText"]:
2426+
self.parser.phase.flushCharacters()
2427+
self.parser.phase = self.parser.phase.originalPhase
2428+
while self.tree.openElements.pop() != node:
2429+
assert self.tree.openElements
2430+
new_token = None
2431+
break
2432+
nodeIndex -= 1
24402433

2441-
node = self.tree.openElements[nodeIndex]
2442-
if node.namespace != self.tree.defaultNamespace:
2443-
continue
2444-
else:
2445-
new_token = self.parser.phases["inBody"].processEndTag(token)
2446-
break
2447-
if self.parser.phase == self:
2448-
self.parser.resetInsertionMode()
2449-
return new_token
2434+
node = self.tree.openElements[nodeIndex]
2435+
if node.namespace != self.tree.defaultNamespace:
2436+
continue
2437+
else:
2438+
new_token = self.parser.phase.processEndTag(token)
2439+
break
2440+
return new_token
24502441

24512442

24522443
class AfterBodyPhase(Phase):

0 commit comments

Comments
 (0)