Skip to content

Commit 1f6cae9

Browse files
committed
Refactor token attribute name/value accumulation
1 parent 900bdaf commit 1f6cae9

File tree

5 files changed

+77
-76
lines changed

5 files changed

+77
-76
lines changed

html5lib/_tokenizer.py

Lines changed: 52 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,24 @@ class SpaceCharacters(Token):
4343

4444

4545
class Tag(Token):
46-
def __init__(self, name, data):
46+
def __init__(self, name, attributes):
4747
self.name = name
48-
self.data = data or []
48+
self.attributes = attributeMap(attributes or {})
4949
self.self_closing = False
50+
self.attribute_name = ""
51+
self.attribute_value = ""
52+
53+
def clearAttribute(self):
54+
if self.attribute_name and self.attribute_name not in self.attributes:
55+
self.attributes[self.attribute_name] = self.attribute_value
56+
self.attribute_name = ""
57+
self.attribute_value = ""
58+
59+
def accumulateAttributeName(self, text):
60+
self.attribute_name += text.translate(asciiUpper2Lower)
61+
62+
def accumulateAttributeValue(self, text):
63+
self.attribute_value += text
5064

5165
class StartTag(Tag):
5266
def __init__(self, name, data=None):
@@ -248,7 +262,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
248262
output = "&" + "".join(charStack)
249263

250264
if fromAttribute:
251-
self.currentToken.data[-1][1] += output
265+
self.currentToken.accumulateAttributeValue(output)
252266
else:
253267
if output in spaceCharacters:
254268
token = SpaceCharacters(output)
@@ -270,17 +284,9 @@ def emitCurrentToken(self):
270284
# Add token to the queue to be yielded
271285
if isinstance(token, Tag):
272286
token.name = token.name.translate(asciiUpper2Lower)
273-
if isinstance(token, StartTag):
274-
raw = token.data
275-
data = attributeMap(raw)
276-
if len(raw) > len(data):
277-
# we had some duplicated attribute, fix so first wins
278-
was = dict(data)
279-
data.update(raw[::-1])
280-
token.data = data
281-
287+
token.clearAttribute()
282288
if isinstance(token, EndTag):
283-
if token.data:
289+
if token.attributes:
284290
self.tokenQueue.append(ParseError("attributes-in-end-tag"))
285291
if token.self_closing:
286292
self.tokenQueue.append(ParseError("self-closing-flag-on-end-tag"))
@@ -820,25 +826,29 @@ def beforeAttributeNameState(self):
820826
if data in spaceCharacters:
821827
self.stream.charsUntil(spaceCharacters, True)
822828
elif data in asciiLetters:
823-
self.currentToken.data.append([data, ""])
829+
self.currentToken.clearAttribute()
830+
self.currentToken.accumulateAttributeName(data)
824831
self.state = self.attributeNameState
825832
elif data == ">":
826833
self.emitCurrentToken()
827834
elif data == "/":
828835
self.state = self.selfClosingStartTagState
829836
elif data in ("'", '"', "=", "<"):
830837
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
831-
self.currentToken.data.append([data, ""])
838+
self.currentToken.clearAttribute()
839+
self.currentToken.accumulateAttributeName(data)
832840
self.state = self.attributeNameState
833841
elif data == "\u0000":
834842
self.tokenQueue.append(ParseError("invalid-codepoint"))
835-
self.currentToken.data.append(["\uFFFD", ""])
843+
self.currentToken.clearAttribute()
844+
self.currentToken.accumulateAttributeName("\uFFFD")
836845
self.state = self.attributeNameState
837846
elif data is EOF:
838847
self.tokenQueue.append(ParseError("expected-attribute-name-but-got-eof"))
839848
self.state = self.dataState
840849
else:
841-
self.currentToken.data.append([data, ""])
850+
self.currentToken.clearAttribute()
851+
self.currentToken.accumulateAttributeName(data)
842852
self.state = self.attributeNameState
843853
return True
844854

@@ -849,8 +859,7 @@ def attributeNameState(self):
849859
if data == "=":
850860
self.state = self.beforeAttributeValueState
851861
elif data in asciiLetters:
852-
self.currentToken.data[-1][0] += data +\
853-
self.stream.charsUntil(asciiLetters, True)
862+
self.currentToken.accumulateAttributeName(data + self.stream.charsUntil(asciiLetters, True))
854863
leavingThisState = False
855864
elif data == ">":
856865
# XXX If we emit here the attributes are converted to a dict
@@ -863,29 +872,25 @@ def attributeNameState(self):
863872
self.state = self.selfClosingStartTagState
864873
elif data == "\u0000":
865874
self.tokenQueue.append(ParseError("invalid-codepoint"))
866-
self.currentToken.data[-1][0] += "\uFFFD"
875+
self.currentToken.accumulateAttributeName("\uFFFD")
867876
leavingThisState = False
868877
elif data in ("'", '"', "<"):
869878
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
870-
self.currentToken.data[-1][0] += data
879+
self.currentToken.accumulateAttributeName(data)
871880
leavingThisState = False
872881
elif data is EOF:
873882
self.tokenQueue.append(ParseError("eof-in-attribute-name"))
874883
self.state = self.dataState
875884
else:
876-
self.currentToken.data[-1][0] += data
885+
self.currentToken.accumulateAttributeName(data)
877886
leavingThisState = False
878887

879888
if leavingThisState:
880889
# Attributes are not dropped at this stage. That happens when the
881890
# start tag token is emitted so values can still be safely appended
882891
# to attributes, but we do want to report the parse error in time.
883-
self.currentToken.data[-1][0] = (
884-
self.currentToken.data[-1][0].translate(asciiUpper2Lower))
885-
for name, _ in self.currentToken.data[:-1]:
886-
if self.currentToken.data[-1][0] == name:
887-
self.tokenQueue.append(ParseError("duplicate-attribute"))
888-
break
892+
if self.currentToken.attribute_name in self.currentToken.attributes:
893+
self.tokenQueue.append(ParseError("duplicate-attribute"))
889894
# XXX Fix for above XXX
890895
if emitToken:
891896
self.emitCurrentToken()
@@ -900,23 +905,27 @@ def afterAttributeNameState(self):
900905
elif data == ">":
901906
self.emitCurrentToken()
902907
elif data in asciiLetters:
903-
self.currentToken.data.append([data, ""])
908+
self.currentToken.clearAttribute()
909+
self.currentToken.accumulateAttributeName(data)
904910
self.state = self.attributeNameState
905911
elif data == "/":
906912
self.state = self.selfClosingStartTagState
907913
elif data == "\u0000":
908914
self.tokenQueue.append(ParseError("invalid-codepoint"))
909-
self.currentToken.data.append(["\uFFFD", ""])
915+
self.currentToken.clearAttribute()
916+
self.currentToken.accumulateAttributeName("\uFFFD")
910917
self.state = self.attributeNameState
911918
elif data in ("'", '"', "<"):
912919
self.tokenQueue.append(ParseError("invalid-character-after-attribute-name"))
913-
self.currentToken.data.append([data, ""])
920+
self.currentToken.clearAttribute()
921+
self.currentToken.accumulateAttributeName(data)
914922
self.state = self.attributeNameState
915923
elif data is EOF:
916924
self.tokenQueue.append(ParseError("expected-end-of-tag-but-got-eof"))
917925
self.state = self.dataState
918926
else:
919-
self.currentToken.data.append([data, ""])
927+
self.currentToken.clearAttribute()
928+
self.currentToken.accumulateAttributeName(data)
920929
self.state = self.attributeNameState
921930
return True
922931

@@ -936,17 +945,17 @@ def beforeAttributeValueState(self):
936945
self.emitCurrentToken()
937946
elif data == "\u0000":
938947
self.tokenQueue.append(ParseError("invalid-codepoint"))
939-
self.currentToken.data[-1][1] += "\uFFFD"
948+
self.currentToken.accumulateAttributeValue("\uFFFD")
940949
self.state = self.attributeValueUnQuotedState
941950
elif data in ("=", "<", "`"):
942951
self.tokenQueue.append(ParseError("equals-in-unquoted-attribute-value"))
943-
self.currentToken.data[-1][1] += data
952+
self.currentToken.accumulateAttributeValue(data)
944953
self.state = self.attributeValueUnQuotedState
945954
elif data is EOF:
946955
self.tokenQueue.append(ParseError("expected-attribute-value-but-got-eof"))
947956
self.state = self.dataState
948957
else:
949-
self.currentToken.data[-1][1] += data
958+
self.currentToken.accumulateAttributeValue(data)
950959
self.state = self.attributeValueUnQuotedState
951960
return True
952961

@@ -958,13 +967,12 @@ def attributeValueDoubleQuotedState(self):
958967
self.processEntityInAttribute('"')
959968
elif data == "\u0000":
960969
self.tokenQueue.append(ParseError("invalid-codepoint"))
961-
self.currentToken.data[-1][1] += "\uFFFD"
970+
self.currentToken.accumulateAttributeValue("\uFFFD")
962971
elif data is EOF:
963972
self.tokenQueue.append(ParseError("eof-in-attribute-value-double-quote"))
964973
self.state = self.dataState
965974
else:
966-
self.currentToken.data[-1][1] += data +\
967-
self.stream.charsUntil(("\"", "&", "\u0000"))
975+
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("\"", "&", "\u0000")))
968976
return True
969977

970978
def attributeValueSingleQuotedState(self):
@@ -975,13 +983,12 @@ def attributeValueSingleQuotedState(self):
975983
self.processEntityInAttribute("'")
976984
elif data == "\u0000":
977985
self.tokenQueue.append(ParseError("invalid-codepoint"))
978-
self.currentToken.data[-1][1] += "\uFFFD"
986+
self.currentToken.accumulateAttributeValue("\uFFFD")
979987
elif data is EOF:
980988
self.tokenQueue.append(ParseError("eof-in-attribute-value-single-quote"))
981989
self.state = self.dataState
982990
else:
983-
self.currentToken.data[-1][1] += data +\
984-
self.stream.charsUntil(("'", "&", "\u0000"))
991+
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("'", "&", "\u0000")))
985992
return True
986993

987994
def attributeValueUnQuotedState(self):
@@ -994,16 +1001,16 @@ def attributeValueUnQuotedState(self):
9941001
self.emitCurrentToken()
9951002
elif data in ('"', "'", "=", "<", "`"):
9961003
self.tokenQueue.append(ParseError("unexpected-character-in-unquoted-attribute-value"))
997-
self.currentToken.data[-1][1] += data
1004+
self.currentToken.accumulateAttributeValue(data)
9981005
elif data == "\u0000":
9991006
self.tokenQueue.append(ParseError("invalid-codepoint"))
1000-
self.currentToken.data[-1][1] += "\uFFFD"
1007+
self.currentToken.accumulateAttributeValue("\uFFFD")
10011008
elif data is EOF:
10021009
self.tokenQueue.append(ParseError("eof-in-attribute-value-no-quotes"))
10031010
self.state = self.dataState
10041011
else:
1005-
self.currentToken.data[-1][1] += data + self.stream.charsUntil(
1006-
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
1012+
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(
1013+
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters))
10071014
return True
10081015

10091016
def afterAttributeValueState(self):

html5lib/html5parser.py

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from . import _inputstream
77
from ._tokenizer import (
8+
attributeMap,
89
HTMLTokenizer,
910
Characters,
1011
SpaceCharacters,
@@ -471,7 +472,7 @@ def startTagHtml(self, token):
471472
self.parser.parseError("non-html-root")
472473
# XXX Need a check here to see if the first start tag token emitted is
473474
# this token... If it's not, invoke self.parser.parseError().
474-
for attr, value in token.data.items():
475+
for attr, value in token.attributes.items():
475476
if attr not in self.tree.openElements[0].attributes:
476477
self.tree.openElements[0].attributes[attr] = value
477478
self.parser.firstStartTag = False
@@ -733,7 +734,7 @@ def startTagMeta(self, token):
733734
self.tree.openElements.pop()
734735
token.self_closing_acknowledged = True
735736

736-
attributes = token.data
737+
attributes = token.attributes
737738
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
738739
if "charset" in attributes:
739740
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
@@ -1018,7 +1019,7 @@ def startTagBody(self, token):
10181019
assert self.parser.innerHTML
10191020
else:
10201021
self.parser.framesetOK = False
1021-
for attr, value in token.data.items():
1022+
for attr, value in token.attributes.items():
10221023
if attr not in self.tree.openElements[1].attributes:
10231024
self.tree.openElements[1].attributes[attr] = value
10241025

@@ -1162,8 +1163,8 @@ def startTagVoidFormatting(self, token):
11621163
def startTagInput(self, token):
11631164
framesetOK = self.parser.framesetOK
11641165
self.startTagVoidFormatting(token)
1165-
if ("type" in token.data and
1166-
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
1166+
token_type = token.attributes.get('type', '')
1167+
if token_type.translate(asciiUpper2Lower) == "hidden":
11671168
# input type=hidden doesn't change framesetOK
11681169
self.parser.framesetOK = framesetOK
11691170

@@ -1184,28 +1185,23 @@ def startTagImage(self, token):
11841185
# No really...
11851186
self.parser.parseError("unexpected-start-tag-treated-as",
11861187
{"originalName": "image", "newName": "img"})
1187-
self.processStartTag(impliedTagToken("img", StartTag,
1188-
attributes=token.data,
1189-
selfClosing=token.self_closing))
1188+
self.processStartTag(impliedTagToken("img", StartTag, attributes=token.attributes))
11901189

11911190
def startTagIsIndex(self, token):
11921191
self.parser.parseError("deprecated-tag", {"name": "isindex"})
11931192
if self.tree.formPointer:
11941193
return
11951194
form_attrs = {}
1196-
if "action" in token.data:
1197-
form_attrs["action"] = token.data["action"]
1195+
if "action" in token.attributes:
1196+
form_attrs["action"] = token.attributes["action"]
11981197
self.processStartTag(impliedTagToken("form", StartTag,
11991198
attributes=form_attrs))
12001199
self.processStartTag(impliedTagToken("hr", StartTag))
12011200
self.processStartTag(impliedTagToken("label", StartTag))
12021201
# XXX Localization ...
1203-
if "prompt" in token.data:
1204-
prompt = token.data["prompt"]
1205-
else:
1206-
prompt = "This is a searchable index. Enter search keywords: "
1202+
prompt = token.attributes.get("prompt", "This is a searchable index. Enter search keywords: ")
12071203
self.processCharacters(Characters(prompt))
1208-
attributes = token.data.copy()
1204+
attributes = token.attributes.copy()
12091205
if "action" in attributes:
12101206
del attributes["action"]
12111207
if "prompt" in attributes:
@@ -1767,8 +1763,8 @@ def startTagStyleScript(self, token):
17671763
return self.parser.phases["inHead"].processStartTag(token)
17681764

17691765
def startTagInput(self, token):
1770-
if ("type" in token.data and
1771-
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
1766+
token_type = token.attributes.get('type', '')
1767+
if token_type.translate(asciiUpper2Lower) == "hidden":
17721768
self.parser.parseError("unexpected-hidden-input-in-table")
17731769
self.tree.insertElement(token)
17741770
# XXX associate with form
@@ -2483,7 +2479,7 @@ def processStartTag(self, token):
24832479
currentNode = self.tree.openElements[-1]
24842480
if (token.name in self.breakoutElements or
24852481
(token.name == "font" and
2486-
set(token.data.keys()) & {"color", "face", "size"})):
2482+
set(token.attributes.keys()) & {"color", "face", "size"})):
24872483
self.parser.parseError("unexpected-html-element-in-foreign-content",
24882484
{"name": token.name})
24892485
while (self.tree.openElements[-1].namespace !=
@@ -2773,10 +2769,8 @@ def processEndTag(self, token):
27732769

27742770

27752771
def adjust_attributes(token, replacements):
2776-
needs_adjustment = viewkeys(token.data) & viewkeys(replacements)
2777-
if needs_adjustment:
2778-
token.data = type(token.data)((replacements.get(k, k), v)
2779-
for k, v in token.data.items())
2772+
if viewkeys(token.attributes) & viewkeys(replacements):
2773+
token.attributes = attributeMap((replacements.get(k, k), v) for k, v in token.attributes.items())
27802774

27812775

27822776
def impliedTagToken(name, type=EndTag, attributes=None,

html5lib/tests/test_tokenizer2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def test_maintain_attribute_order():
2424
assert len(out) == 1
2525
assert isinstance(out[0], StartTag)
2626

27-
attrs_tok = out[0].data
27+
attrs_tok = out[0].attributes
2828
assert len(attrs_tok) == len(attrs)
2929

3030
for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
@@ -41,7 +41,7 @@ def test_duplicate_attribute():
4141
assert len(out) == 1
4242
assert isinstance(out[0], StartTag)
4343

44-
attrs_tok = out[0].data
44+
attrs_tok = out[0].attributes
4545
assert len(attrs_tok) == 1
4646
assert list(attrs_tok.items()) == [('a', '1')]
4747

@@ -57,7 +57,7 @@ def test_maintain_duplicate_attribute_order():
5757
assert len(out) == 1
5858
assert isinstance(out[0], StartTag)
5959

60-
attrs_tok = out[0].data
60+
attrs_tok = out[0].attributes
6161
assert len(attrs_tok) == len(attrs)
6262

6363
for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):

0 commit comments

Comments
 (0)