Skip to content

Commit 5470504

Browse files
committed
Fix table whitespace stuff
1 parent 1ad89ed commit 5470504

File tree

3 files changed

+54
-14
lines changed

3 files changed

+54
-14
lines changed

src/html5lib/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
_(u"Unexpected end of file. Expected attribute name instead."),
5959
"eof-in-attribute-name":
6060
_(u"Unexpected end of file in attribute name."),
61+
"invalid-character-in-attribute-name":
62+
_(u"Invalid character in attribute name"),
6163
"duplicate-attribute":
6264
_(u"Dropped duplicate attribute on tag."),
6365
"expected-end-of-tag-name-but-got-eof":

src/html5lib/html5parser.py

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
6161
"inBody": InBodyPhase(self, self.tree),
6262
"inCDataRCData": InCDataRCDataPhase(self, self.tree),
6363
"inTable": InTablePhase(self, self.tree),
64+
"inTableText": InTableTextPhase(self, self.tree),
6465
"inCaption": InCaptionPhase(self, self.tree),
6566
"inColumnGroup": InColumnGroupPhase(self, self.tree),
6667
"inTableBody": InTableBodyPhase(self, self.tree),
@@ -1568,22 +1569,16 @@ def processEOF(self):
15681569
#Stop parsing
15691570

15701571
def processSpaceCharacters(self, token):
1571-
if "tainted" not in self.getCurrentTable()._flags:
1572-
self.tree.insertText(token["data"])
1573-
else:
1574-
self.processCharacters(token)
1572+
self.parser.phase = self.parser.phases["inTableText"]
1573+
self.parser.phase.originalPhase = self
1574+
self.parser.phase.characterTokens.append(token)
15751575

15761576
def processCharacters(self, token):
1577-
if self.tree.openElements[-1].name in ("style", "script"):
1578-
self.tree.insertText(token)
1579-
else:
1580-
if "tainted" not in self.getCurrentTable()._flags:
1581-
self.parser.parseError("unexpected-char-implies-table-voodoo")
1582-
self.getCurrentTable()._flags.append("tainted")
1583-
# Do the table magic!
1584-
self.tree.insertFromTable = True
1585-
self.parser.phases["inBody"].processCharacters(token)
1586-
self.tree.insertFromTable = False
1577+
#If we get here there must be at least one non-whitespace character
1578+
# Do the table magic!
1579+
self.tree.insertFromTable = True
1580+
self.parser.phases["inBody"].processCharacters(token)
1581+
self.tree.insertFromTable = False
15871582

15881583
def startTagCaption(self, token):
15891584
self.clearStackToTableContext()
@@ -1669,6 +1664,48 @@ def endTagOther(self, token):
16691664
self.parser.phases["inBody"].processEndTag(token)
16701665
self.tree.insertFromTable = False
16711666

1667+
class InTableTextPhase(Phase):
    """Temporary phase that buffers character tokens seen inside a table.

    The phase that hands control over (see ``originalPhase``) appends the
    first token to ``characterTokens`` and makes this phase current.  Further
    character and whitespace tokens are accumulated.  As soon as any other
    kind of token (comment, start tag, end tag, EOF) arrives, the buffered
    text is flushed, the parser is switched back to the original phase and
    the token is re-dispatched there.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        # Phase to return to once the pending text has been flushed; set by
        # the phase that switches the parser into this one.
        self.originalPhase = None
        # Character / space-character tokens collected so far.
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        If the text contains at least one non-whitespace character it is
        passed, as a single Characters token, to the original phase's
        ``processCharacters``.  Whitespace-only text is inserted directly
        into the tree.  An empty buffer is a no-op.
        """
        data = "".join([item["data"] for item in self.characterTokens])
        if any(char not in spaceCharacters for char in data):
            token = {"type": tokenTypes["Characters"], "data": data}
            self.originalPhase.processCharacters(token)
        elif data:
            self.tree.insertText(data)
        self.characterTokens = []

    def _restoreOriginalPhase(self):
        """Flush pending text, reinstate the original phase and return it.

        The phase must be restored on the *parser* (``self.parser.phase``);
        assigning to ``self.phase`` would leave the parser stuck in this
        buffering phase for every subsequent token.
        """
        self.flushCharacters()
        original = self.originalPhase
        self.parser.phase = original
        return original

    def processComment(self, token):
        self._restoreOriginalPhase().processComment(token)

    def processEOF(self, token=None):
        # ``token`` is accepted only for backward compatibility with the
        # earlier signature; phases implement ``processEOF(self)`` and the
        # original phase is therefore called without arguments.
        self._restoreOriginalPhase().processEOF()

    def processCharacters(self, token):
        self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        # Whitespace can follow buffered text while this phase is active
        # (e.g. after a character token produced by an entity), so it has to
        # be buffered as well rather than treated as unreachable.
        self.characterTokens.append(token)

    def processStartTag(self, token):
        self._restoreOriginalPhase().processStartTag(token)

    def processEndTag(self, token):
        self._restoreOriginalPhase().processEndTag(token)
1708+
16721709

16731710
class InCaptionPhase(Phase):
16741711
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption

src/html5lib/tokenizer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ def dataState(self):
287287
elif data is EOF:
288288
# Tokenization ends.
289289
return False
290+
290291
elif data in spaceCharacters:
291292
# Directly after emitting a token you switch back to the "data
292293
# state". At that point spaceCharacters are important so they are

0 commit comments

Comments
 (0)