Skip to content

Commit 8b89668

Browse files
committed
Change attributes to be created as dicts from day one
This makes duplicate checking much quicker, and avoids the conversion to a dict at the end
1 parent 84cbc20 commit 8b89668

File tree

1 file changed

+48
-50
lines changed

1 file changed

+48
-50
lines changed

html5lib/_tokenizer.py

Lines changed: 48 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ def __init__(self, stream, parser=None, **kwargs):
4949

5050
# The current token being created
5151
self.currentToken = None
52+
self.currentAttribute = None
5253
super(HTMLTokenizer, self).__init__()
5354

5455
def __iter__(self):
@@ -226,7 +227,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
226227
output = "&" + "".join(charStack)
227228

228229
if fromAttribute:
229-
self.currentToken["data"][-1][1] += output
230+
self.currentToken["data"][self.currentAttribute][-1] += output
230231
else:
231232
if output in spaceCharacters:
232233
tokenType = "SpaceCharacters"
@@ -249,12 +250,9 @@ def emitCurrentToken(self):
249250
if (token["type"] in tagTokenTypes):
250251
token["name"] = ascii_lower(token["name"])
251252
if token["type"] == tokenTypes["StartTag"]:
252-
raw = token["data"]
253-
data = attributeMap(raw)
254-
if len(raw) > len(data):
255-
# we had some duplicated attribute, fix so first wins
256-
data.update(raw[::-1])
257-
token["data"] = data
253+
data = token["data"]
254+
for k, v in data.items():
255+
data[k] = v[0]
258256

259257
if token["type"] == tokenTypes["EndTag"]:
260258
if token["data"]:
@@ -394,7 +392,7 @@ def tagOpenState(self):
394392
self._state = self.closeTagOpenState
395393
elif data in asciiLetters:
396394
self.currentToken = {"type": tokenTypes["StartTag"],
397-
"name": data, "data": [],
395+
"name": data, "data": attributeMap(),
398396
"selfClosing": False,
399397
"selfClosingAcknowledged": False}
400398
self._state = self.tagNameState
@@ -425,7 +423,7 @@ def closeTagOpenState(self):
425423
data = self.stream.char()
426424
if data in asciiLetters:
427425
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
428-
"data": [], "selfClosing": False}
426+
"data": attributeMap(), "selfClosing": False}
429427
self._state = self.tagNameState
430428
elif data == ">":
431429
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
@@ -495,17 +493,17 @@ def rcdataEndTagNameState(self):
495493
if data in spaceCharacters and appropriate:
496494
self.currentToken = {"type": tokenTypes["EndTag"],
497495
"name": self.temporaryBuffer,
498-
"data": [], "selfClosing": False}
496+
"data": attributeMap(), "selfClosing": False}
499497
self._state = self.beforeAttributeNameState
500498
elif data == "/" and appropriate:
501499
self.currentToken = {"type": tokenTypes["EndTag"],
502500
"name": self.temporaryBuffer,
503-
"data": [], "selfClosing": False}
501+
"data": attributeMap(), "selfClosing": False}
504502
self._state = self.selfClosingStartTagState
505503
elif data == ">" and appropriate:
506504
self.currentToken = {"type": tokenTypes["EndTag"],
507505
"name": self.temporaryBuffer,
508-
"data": [], "selfClosing": False}
506+
"data": attributeMap(), "selfClosing": False}
509507
self.emitCurrentToken()
510508
self._state = self.dataState
511509
elif data in asciiLetters:
@@ -545,17 +543,17 @@ def rawtextEndTagNameState(self):
545543
if data in spaceCharacters and appropriate:
546544
self.currentToken = {"type": tokenTypes["EndTag"],
547545
"name": self.temporaryBuffer,
548-
"data": [], "selfClosing": False}
546+
"data": attributeMap(), "selfClosing": False}
549547
self._state = self.beforeAttributeNameState
550548
elif data == "/" and appropriate:
551549
self.currentToken = {"type": tokenTypes["EndTag"],
552550
"name": self.temporaryBuffer,
553-
"data": [], "selfClosing": False}
551+
"data": attributeMap(), "selfClosing": False}
554552
self._state = self.selfClosingStartTagState
555553
elif data == ">" and appropriate:
556554
self.currentToken = {"type": tokenTypes["EndTag"],
557555
"name": self.temporaryBuffer,
558-
"data": [], "selfClosing": False}
556+
"data": attributeMap(), "selfClosing": False}
559557
self.emitCurrentToken()
560558
self._state = self.dataState
561559
elif data in asciiLetters:
@@ -598,17 +596,17 @@ def scriptDataEndTagNameState(self):
598596
if data in spaceCharacters and appropriate:
599597
self.currentToken = {"type": tokenTypes["EndTag"],
600598
"name": self.temporaryBuffer,
601-
"data": [], "selfClosing": False}
599+
"data": attributeMap(), "selfClosing": False}
602600
self._state = self.beforeAttributeNameState
603601
elif data == "/" and appropriate:
604602
self.currentToken = {"type": tokenTypes["EndTag"],
605603
"name": self.temporaryBuffer,
606-
"data": [], "selfClosing": False}
604+
"data": attributeMap(), "selfClosing": False}
607605
self._state = self.selfClosingStartTagState
608606
elif data == ">" and appropriate:
609607
self.currentToken = {"type": tokenTypes["EndTag"],
610608
"name": self.temporaryBuffer,
611-
"data": [], "selfClosing": False}
609+
"data": attributeMap(), "selfClosing": False}
612610
self.emitCurrentToken()
613611
self._state = self.dataState
614612
elif data in asciiLetters:
@@ -734,17 +732,17 @@ def scriptDataEscapedEndTagNameState(self):
734732
if data in spaceCharacters and appropriate:
735733
self.currentToken = {"type": tokenTypes["EndTag"],
736734
"name": self.temporaryBuffer,
737-
"data": [], "selfClosing": False}
735+
"data": attributeMap(), "selfClosing": False}
738736
self._state = self.beforeAttributeNameState
739737
elif data == "/" and appropriate:
740738
self.currentToken = {"type": tokenTypes["EndTag"],
741739
"name": self.temporaryBuffer,
742-
"data": [], "selfClosing": False}
740+
"data": attributeMap(), "selfClosing": False}
743741
self._state = self.selfClosingStartTagState
744742
elif data == ">" and appropriate:
745743
self.currentToken = {"type": tokenTypes["EndTag"],
746744
"name": self.temporaryBuffer,
747-
"data": [], "selfClosing": False}
745+
"data": attributeMap(), "selfClosing": False}
748746
self.emitCurrentToken()
749747
self._state = self.dataState
750748
elif data in asciiLetters:
@@ -873,7 +871,7 @@ def beforeAttributeNameState(self):
873871
if data in spaceCharacters:
874872
self.stream.charsUntil(spaceCharacters, True)
875873
elif data in asciiLetters:
876-
self.currentToken["data"].append([data, ""])
874+
self.currentAttribute = data
877875
self._state = self.attributeNameState
878876
elif data == ">":
879877
self.emitCurrentToken()
@@ -882,19 +880,19 @@ def beforeAttributeNameState(self):
882880
elif data in ("'", '"', "=", "<"):
883881
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
884882
"invalid-character-in-attribute-name"})
885-
self.currentToken["data"].append([data, ""])
883+
self.currentAttribute = data
886884
self._state = self.attributeNameState
887885
elif data == "\u0000":
888886
self.tokenQueue.append({"type": tokenTypes["ParseError"],
889887
"data": "invalid-codepoint"})
890-
self.currentToken["data"].append(["\uFFFD", ""])
888+
self.currentAttribute = "\uFFFD"
891889
self._state = self.attributeNameState
892890
elif data is EOF:
893891
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
894892
"expected-attribute-name-but-got-eof"})
895893
self._state = self.dataState
896894
else:
897-
self.currentToken["data"].append([data, ""])
895+
self.currentAttribute = data
898896
self._state = self.attributeNameState
899897
return True
900898

@@ -905,7 +903,7 @@ def attributeNameState(self):
905903
if data == "=":
906904
self._state = self.beforeAttributeValueState
907905
elif data in asciiLetters:
908-
self.currentToken["data"][-1][0] += data +\
906+
self.currentAttribute += data +\
909907
self.stream.charsUntil(asciiLetters, True)
910908
leavingThisState = False
911909
elif data == ">":
@@ -920,34 +918,34 @@ def attributeNameState(self):
920918
elif data == "\u0000":
921919
self.tokenQueue.append({"type": tokenTypes["ParseError"],
922920
"data": "invalid-codepoint"})
923-
self.currentToken["data"][-1][0] += "\uFFFD"
921+
self.currentAttribute += "\uFFFD"
924922
leavingThisState = False
925923
elif data in ("'", '"', "<"):
926924
self.tokenQueue.append({"type": tokenTypes["ParseError"],
927925
"data":
928926
"invalid-character-in-attribute-name"})
929-
self.currentToken["data"][-1][0] += data
927+
self.currentAttribute += data
930928
leavingThisState = False
931929
elif data is EOF:
932930
self.tokenQueue.append({"type": tokenTypes["ParseError"],
933931
"data": "eof-in-attribute-name"})
934932
self._state = self.dataState
935933
else:
936-
self.currentToken["data"][-1][0] += data
934+
self.currentAttribute += data
937935
leavingThisState = False
938936

939937
assert leavingThisState == ((self._state != self.attributeNameState) or emitToken)
940938
if leavingThisState:
941939
# Attributes are not dropped at this stage. That happens when the
942940
# start tag token is emitted so values can still be safely appended
943941
# to attributes, but we do want to report the parse error in time.
944-
self.currentToken["data"][-1][0] = (
945-
ascii_lower(self.currentToken["data"][-1][0]))
946-
for name, _ in self.currentToken["data"][:-1]:
947-
if self.currentToken["data"][-1][0] == name:
948-
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
949-
"duplicate-attribute"})
950-
break
942+
self.currentAttribute = ascii_lower(self.currentAttribute)
943+
if self.currentAttribute in self.currentToken["data"]:
944+
self.tokenQueue.append({"type": tokenTypes["ParseError"],
945+
"data": "duplicate-attribute"})
946+
self.currentToken["data"][self.currentAttribute].append("")
947+
else:
948+
self.currentToken["data"][self.currentAttribute] = [""]
951949
# XXX Fix for above XXX
952950
if emitToken:
953951
self.emitCurrentToken()
@@ -962,26 +960,26 @@ def afterAttributeNameState(self):
962960
elif data == ">":
963961
self.emitCurrentToken()
964962
elif data in asciiLetters:
965-
self.currentToken["data"].append([data, ""])
963+
self.currentAttribute = data
966964
self._state = self.attributeNameState
967965
elif data == "/":
968966
self._state = self.selfClosingStartTagState
969967
elif data == "\u0000":
970968
self.tokenQueue.append({"type": tokenTypes["ParseError"],
971969
"data": "invalid-codepoint"})
972-
self.currentToken["data"].append(["\uFFFD", ""])
970+
self.currentAttribute = "\uFFFD"
973971
self._state = self.attributeNameState
974972
elif data in ("'", '"', "<"):
975973
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
976974
"invalid-character-after-attribute-name"})
977-
self.currentToken["data"].append([data, ""])
975+
self.currentAttribute = data
978976
self._state = self.attributeNameState
979977
elif data is EOF:
980978
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
981979
"expected-end-of-tag-but-got-eof"})
982980
self._state = self.dataState
983981
else:
984-
self.currentToken["data"].append([data, ""])
982+
self.currentAttribute = data
985983
self._state = self.attributeNameState
986984
return True
987985

@@ -1003,19 +1001,19 @@ def beforeAttributeValueState(self):
10031001
elif data == "\u0000":
10041002
self.tokenQueue.append({"type": tokenTypes["ParseError"],
10051003
"data": "invalid-codepoint"})
1006-
self.currentToken["data"][-1][1] += "\uFFFD"
1004+
self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
10071005
self._state = self.attributeValueUnQuotedState
10081006
elif data in ("=", "<", "`"):
10091007
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10101008
"equals-in-unquoted-attribute-value"})
1011-
self.currentToken["data"][-1][1] += data
1009+
self.currentToken["data"][self.currentAttribute][-1] += data
10121010
self._state = self.attributeValueUnQuotedState
10131011
elif data is EOF:
10141012
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10151013
"expected-attribute-value-but-got-eof"})
10161014
self._state = self.dataState
10171015
else:
1018-
self.currentToken["data"][-1][1] += data
1016+
self.currentToken["data"][self.currentAttribute][-1] += data
10191017
self._state = self.attributeValueUnQuotedState
10201018
return True
10211019

@@ -1028,13 +1026,13 @@ def attributeValueDoubleQuotedState(self):
10281026
elif data == "\u0000":
10291027
self.tokenQueue.append({"type": tokenTypes["ParseError"],
10301028
"data": "invalid-codepoint"})
1031-
self.currentToken["data"][-1][1] += "\uFFFD"
1029+
self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
10321030
elif data is EOF:
10331031
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10341032
"eof-in-attribute-value-double-quote"})
10351033
self._state = self.dataState
10361034
else:
1037-
self.currentToken["data"][-1][1] += data +\
1035+
self.currentToken["data"][self.currentAttribute][-1] += data +\
10381036
self.stream.charsUntil(("\"", "&", "\u0000"))
10391037
return True
10401038

@@ -1047,13 +1045,13 @@ def attributeValueSingleQuotedState(self):
10471045
elif data == "\u0000":
10481046
self.tokenQueue.append({"type": tokenTypes["ParseError"],
10491047
"data": "invalid-codepoint"})
1050-
self.currentToken["data"][-1][1] += "\uFFFD"
1048+
self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
10511049
elif data is EOF:
10521050
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10531051
"eof-in-attribute-value-single-quote"})
10541052
self._state = self.dataState
10551053
else:
1056-
self.currentToken["data"][-1][1] += data +\
1054+
self.currentToken["data"][self.currentAttribute][-1] += data +\
10571055
self.stream.charsUntil(("'", "&", "\u0000"))
10581056
return True
10591057

@@ -1068,17 +1066,17 @@ def attributeValueUnQuotedState(self):
10681066
elif data in ('"', "'", "=", "<", "`"):
10691067
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10701068
"unexpected-character-in-unquoted-attribute-value"})
1071-
self.currentToken["data"][-1][1] += data
1069+
self.currentToken["data"][self.currentAttribute][-1] += data
10721070
elif data == "\u0000":
10731071
self.tokenQueue.append({"type": tokenTypes["ParseError"],
10741072
"data": "invalid-codepoint"})
1075-
self.currentToken["data"][-1][1] += "\uFFFD"
1073+
self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
10761074
elif data is EOF:
10771075
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10781076
"eof-in-attribute-value-no-quotes"})
10791077
self._state = self.dataState
10801078
else:
1081-
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
1079+
self.currentToken["data"][self.currentAttribute][-1] += data + self.stream.charsUntil(
10821080
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
10831081
return True
10841082

0 commit comments

Comments (0)