Skip to content

Commit 18e5d32

Browse files
committed
Updated Python tokeniser to match new tests
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401104
1 parent: 13725e0 · commit: 18e5d32

File tree

2 files changed: 50 additions (+50) and 12 deletions (-12).

2 files changed: 50 additions (+50) and 12 deletions (-12).

src/html5lib/constants.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -667,7 +667,7 @@
667667
"kappa;": u"\u03BA",
668668
"lArr;": u"\u21D0",
669669
"lambda;": u"\u03BB",
670-
"lang;": u"\u3008",
670+
"lang;": u"\u27E8",
671671
"laquo;": u"\u00AB",
672672
"laquo": u"\u00AB",
673673
"larr;": u"\u2190",
@@ -747,7 +747,7 @@
747747
"quot": u"\u0022",
748748
"rArr;": u"\u21D2",
749749
"radic;": u"\u221A",
750-
"rang;": u"\u3009",
750+
"rang;": u"\u27E9",
751751
"raquo;": u"\u00BB",
752752
"raquo": u"\u00BB",
753753
"rarr;": u"\u2192",

src/html5lib/tokenizer.py

Lines changed: 48 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
5151
"attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
5252
"attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
5353
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
54+
"afterAttributeValue":self.afterAttributeValueState,
5455
"bogusComment":self.bogusCommentState,
5556
"markupDeclarationOpen":self.markupDeclarationOpenState,
5657
"commentStart":self.commentStartState,
@@ -185,10 +186,11 @@ def consumeNumberEntity(self, isHex):
185186

186187
return char
187188

188-
def consumeEntity(self, fromAttribute=False):
189+
def consumeEntity(self, allowedChar=None, fromAttribute=False):
189190
char = None
190191
charStack = [self.stream.char()]
191-
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
192+
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")\
193+
or (allowedChar is not None and allowedChar == charStack[0]):
192194
self.stream.unget(charStack)
193195
elif charStack[0] == u"#":
194196
# We might have a number entity here.
@@ -260,10 +262,10 @@ def entitiesStartingWith(name):
260262
self.stream.unget(charStack)
261263
return char
262264

263-
def processEntityInAttribute(self):
265+
def processEntityInAttribute(self, allowedChar):
264266
"""This method replaces the need for "entityInAttributeValueState".
265267
"""
266-
entity = self.consumeEntity(True)
268+
entity = self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
267269
if entity:
268270
self.currentToken["data"][-1][1] += entity
269271
else:
@@ -479,6 +481,11 @@ def beforeAttributeNameState(self):
479481
self.emitCurrentToken()
480482
elif data == u"/":
481483
self.processSolidusInTag()
484+
elif data == u"'" or data == u'"' or data == u"=":
485+
self.tokenQueue.append({"type": "ParseError", "data":
486+
"invalid-character-in-attribute-name"})
487+
self.currentToken["data"].append([data, ""])
488+
self.state = self.states["attributeName"]
482489
elif data == EOF:
483490
self.tokenQueue.append({"type": "ParseError", "data":
484491
"expected-attribute-name-but-got-eof"})
@@ -508,6 +515,11 @@ def attributeNameState(self):
508515
elif data == u"/":
509516
self.processSolidusInTag()
510517
self.state = self.states["beforeAttributeName"]
518+
elif data == u"'" or data == u'"':
519+
self.tokenQueue.append({"type": "ParseError", "data":
520+
"invalid-character-in-attribute-name"})
521+
self.currentToken["data"][-1][0] += data
522+
leavingThisState = False
511523
elif data == EOF:
512524
self.tokenQueue.append({"type": "ParseError", "data":
513525
"eof-in-attribute-name"})
@@ -570,6 +582,11 @@ def beforeAttributeValueState(self):
570582
self.state = self.states["attributeValueSingleQuoted"]
571583
elif data == u">":
572584
self.emitCurrentToken()
585+
elif data == u"=":
586+
self.tokenQueue.append({"type": "ParseError", "data":
587+
"equals-in-unquoted-attribute-value"})
588+
self.currentToken["data"][-1][1] += data
589+
self.state = self.states["attributeValueUnQuoted"]
573590
elif data == EOF:
574591
self.tokenQueue.append({"type": "ParseError", "data":
575592
"expected-attribute-value-but-got-eof"})
@@ -582,9 +599,9 @@ def beforeAttributeValueState(self):
582599
def attributeValueDoubleQuotedState(self):
583600
data = self.stream.char()
584601
if data == "\"":
585-
self.state = self.states["beforeAttributeName"]
602+
self.state = self.states["afterAttributeValue"]
586603
elif data == u"&":
587-
self.processEntityInAttribute()
604+
self.processEntityInAttribute(u'"')
588605
elif data == EOF:
589606
self.tokenQueue.append({"type": "ParseError", "data":
590607
"eof-in-attribute-value-double-quote"})
@@ -597,9 +614,9 @@ def attributeValueDoubleQuotedState(self):
597614
def attributeValueSingleQuotedState(self):
598615
data = self.stream.char()
599616
if data == "'":
600-
self.state = self.states["beforeAttributeName"]
617+
self.state = self.states["afterAttributeValue"]
601618
elif data == u"&":
602-
self.processEntityInAttribute()
619+
self.processEntityInAttribute(u"'")
603620
elif data == EOF:
604621
self.tokenQueue.append({"type": "ParseError", "data":
605622
"eof-in-attribute-value-single-quote"})
@@ -614,16 +631,37 @@ def attributeValueUnQuotedState(self):
614631
if data in spaceCharacters:
615632
self.state = self.states["beforeAttributeName"]
616633
elif data == u"&":
617-
self.processEntityInAttribute()
634+
self.processEntityInAttribute(None)
618635
elif data == u">":
619636
self.emitCurrentToken()
637+
elif data == u'"' or data == u"'" or data == u"=":
638+
self.tokenQueue.append({"type": "ParseError", "data":
639+
"unexpected-character-in-unquoted-attribute-value"})
640+
self.currentToken["data"][-1][1] += data
620641
elif data == EOF:
621642
self.tokenQueue.append({"type": "ParseError", "data":
622643
"eof-in-attribute-value-no-quotes"})
623644
self.emitCurrentToken()
624645
else:
625646
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
626-
frozenset(("&", ">","<")) | spaceCharacters)
647+
frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
648+
return True
649+
650+
def afterAttributeValueState(self):
651+
data = self.stream.char()
652+
if data in spaceCharacters:
653+
self.state = self.states["beforeAttributeName"]
654+
elif data == u">":
655+
self.emitCurrentToken()
656+
self.state = self.states["data"]
657+
elif data == u"/":
658+
self.processSolidusInTag()
659+
self.state = self.states["beforeAttributeName"]
660+
else:
661+
self.tokenQueue.append({"type": "ParseError", "data":
662+
"unexpected-character-after-attribute-value"})
663+
self.stream.unget(data)
664+
self.state = self.states["beforeAttributeName"]
627665
return True
628666

629667
def bogusCommentState(self):

0 commit comments

Comments
 (0)