@@ -49,6 +49,7 @@ def __init__(self, stream, parser=None, **kwargs):
 
         # The current token being created
         self.currentToken = None
+        self.currentAttribute = None
         super(HTMLTokenizer, self).__init__()
 
     def __iter__(self):
@@ -226,7 +227,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
                 output = "&" + "".join(charStack)
 
         if fromAttribute:
-            self.currentToken["data"][-1][1] += output
+            self.currentToken["data"][self.currentAttribute][-1] += output
         else:
             if output in spaceCharacters:
                 tokenType = "SpaceCharacters"
@@ -249,12 +250,9 @@ def emitCurrentToken(self):
         if (token["type"] in tagTokenTypes):
             token["name"] = ascii_lower(token["name"])
             if token["type"] == tokenTypes["StartTag"]:
-                raw = token["data"]
-                data = attributeMap(raw)
-                if len(raw) > len(data):
-                    # we had some duplicated attribute, fix so first wins
-                    data.update(raw[::-1])
-                token["data"] = data
+                data = token["data"]
+                for k, v in data.items():
+                    data[k] = v[0]
 
             if token["type"] == tokenTypes["EndTag"]:
                 if token["data"]:
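The emitCurrentToken hunk above collapses each attribute's list of collected values down to its first entry, which preserves the "first occurrence wins" behaviour the removed code achieved by rebuilding the map in reverse. A minimal sketch of that collapse, assuming attributeMap behaves like an ordered dict of name -> list of values (a plain dict and hypothetical sample data stand in for it here):

# Hypothetical stand-in for token["data"] after tokenizing <div class=a class=b id=x>;
# attributeMap is assumed to be dict-like, so a plain dict models it.
data = {"class": ["a", "b"], "id": ["x"]}

# The collapse performed in emitCurrentToken: keep the first recorded value.
for k, v in data.items():
    data[k] = v[0]

assert data == {"class": "a", "id": "x"}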
@@ -394,7 +392,7 @@ def tagOpenState(self):
             self._state = self.closeTagOpenState
         elif data in asciiLetters:
             self.currentToken = {"type": tokenTypes["StartTag"],
-                                 "name": data, "data": [],
+                                 "name": data, "data": attributeMap(),
                                  "selfClosing": False,
                                  "selfClosingAcknowledged": False}
             self._state = self.tagNameState
@@ -425,7 +423,7 @@ def closeTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
             self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.tagNameState
         elif data == ">":
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
@@ -495,17 +493,17 @@ def rcdataEndTagNameState(self):
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self.emitCurrentToken()
             self._state = self.dataState
         elif data in asciiLetters:
@@ -545,17 +543,17 @@ def rawtextEndTagNameState(self):
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self.emitCurrentToken()
             self._state = self.dataState
         elif data in asciiLetters:
@@ -598,17 +596,17 @@ def scriptDataEndTagNameState(self):
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self.emitCurrentToken()
             self._state = self.dataState
         elif data in asciiLetters:
@@ -734,17 +732,17 @@ def scriptDataEscapedEndTagNameState(self):
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self._state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
                                  "name": self.temporaryBuffer,
-                                 "data": [], "selfClosing": False}
+                                 "data": attributeMap(), "selfClosing": False}
             self.emitCurrentToken()
             self._state = self.dataState
         elif data in asciiLetters:
@@ -873,7 +871,7 @@ def beforeAttributeNameState(self):
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
         elif data in asciiLetters:
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         elif data == ">":
             self.emitCurrentToken()
@@ -882,19 +880,19 @@ def beforeAttributeNameState(self):
         elif data in ("'", '"', "=", "<"):
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "invalid-character-in-attribute-name"})
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"].append(["\uFFFD", ""])
+            self.currentAttribute = "\uFFFD"
             self._state = self.attributeNameState
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "expected-attribute-name-but-got-eof"})
             self._state = self.dataState
         else:
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         return True
 
@@ -905,7 +903,7 @@ def attributeNameState(self):
         if data == "=":
             self._state = self.beforeAttributeValueState
         elif data in asciiLetters:
-            self.currentToken["data"][-1][0] += data + \
+            self.currentAttribute += data + \
                 self.stream.charsUntil(asciiLetters, True)
             leavingThisState = False
         elif data == ">":
@@ -920,34 +918,34 @@ def attributeNameState(self):
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"][-1][0] += "\uFFFD"
+            self.currentAttribute += "\uFFFD"
             leavingThisState = False
         elif data in ("'", '"', "<"):
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data":
                                     "invalid-character-in-attribute-name"})
-            self.currentToken["data"][-1][0] += data
+            self.currentAttribute += data
             leavingThisState = False
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "eof-in-attribute-name"})
             self._state = self.dataState
         else:
-            self.currentToken["data"][-1][0] += data
+            self.currentAttribute += data
             leavingThisState = False
 
         assert leavingThisState == ((self._state != self.attributeNameState) or emitToken)
         if leavingThisState:
             # Attributes are not dropped at this stage. That happens when the
             # start tag token is emitted so values can still be safely appended
             # to attributes, but we do want to report the parse error in time.
-            self.currentToken["data"][-1][0] = (
-                ascii_lower(self.currentToken["data"][-1][0]))
-            for name, _ in self.currentToken["data"][:-1]:
-                if self.currentToken["data"][-1][0] == name:
-                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
-                                            "duplicate-attribute"})
-                    break
+            self.currentAttribute = ascii_lower(self.currentAttribute)
+            if self.currentAttribute in self.currentToken["data"]:
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "duplicate-attribute"})
+                self.currentToken["data"][self.currentAttribute].append("")
+            else:
+                self.currentToken["data"][self.currentAttribute] = [""]
             # XXX Fix for above XXX
             if emitToken:
                 self.emitCurrentToken()
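When the tokenizer leaves attributeNameState, the rewritten code above lower-cases the finished name and either starts a fresh one-element value list or, on a duplicate, reports the parse error and appends an empty slot so later value characters still have somewhere to accumulate. A self-contained sketch of that bookkeeping, assuming a plain dict in place of attributeMap and a list in place of the token queue (finish_attribute_name is a hypothetical helper, not part of the patch):

token_data = {}     # stands in for self.currentToken["data"] (attributeMap)
parse_errors = []   # stands in for self.tokenQueue

def finish_attribute_name(name):
    # Mirrors the duplicate check performed when leaving attributeNameState.
    name = name.lower()  # ascii_lower in the real tokenizer
    if name in token_data:
        parse_errors.append("duplicate-attribute")
        token_data[name].append("")   # later value characters append here
    else:
        token_data[name] = [""]

finish_attribute_name("CLASS")
finish_attribute_name("class")        # duplicate: error recorded, extra slot added
assert token_data == {"class": ["", ""]}
assert parse_errors == ["duplicate-attribute"]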
@@ -962,26 +960,26 @@ def afterAttributeNameState(self):
         elif data == ">":
             self.emitCurrentToken()
         elif data in asciiLetters:
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         elif data == "/":
             self._state = self.selfClosingStartTagState
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"].append(["\uFFFD", ""])
+            self.currentAttribute = "\uFFFD"
             self._state = self.attributeNameState
         elif data in ("'", '"', "<"):
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "invalid-character-after-attribute-name"})
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "expected-end-of-tag-but-got-eof"})
             self._state = self.dataState
         else:
-            self.currentToken["data"].append([data, ""])
+            self.currentAttribute = data
             self._state = self.attributeNameState
         return True
 
@@ -1003,19 +1001,19 @@ def beforeAttributeValueState(self):
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"][-1][1] += "\uFFFD"
+            self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
             self._state = self.attributeValueUnQuotedState
         elif data in ("=", "<", "`"):
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "equals-in-unquoted-attribute-value"})
-            self.currentToken["data"][-1][1] += data
+            self.currentToken["data"][self.currentAttribute][-1] += data
             self._state = self.attributeValueUnQuotedState
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "expected-attribute-value-but-got-eof"})
             self._state = self.dataState
         else:
-            self.currentToken["data"][-1][1] += data
+            self.currentToken["data"][self.currentAttribute][-1] += data
             self._state = self.attributeValueUnQuotedState
         return True
 
@@ -1028,13 +1026,13 @@ def attributeValueDoubleQuotedState(self):
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"][-1][1] += "\uFFFD"
+            self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "eof-in-attribute-value-double-quote"})
             self._state = self.dataState
         else:
-            self.currentToken["data"][-1][1] += data + \
+            self.currentToken["data"][self.currentAttribute][-1] += data + \
                 self.stream.charsUntil(("\"", "&", "\u0000"))
         return True
 
@@ -1047,13 +1045,13 @@ def attributeValueSingleQuotedState(self):
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"][-1][1] += "\uFFFD"
+            self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "eof-in-attribute-value-single-quote"})
             self._state = self.dataState
         else:
-            self.currentToken["data"][-1][1] += data + \
+            self.currentToken["data"][self.currentAttribute][-1] += data + \
                 self.stream.charsUntil(("'", "&", "\u0000"))
         return True
 
@@ -1068,17 +1066,17 @@ def attributeValueUnQuotedState(self):
         elif data in ('"', "'", "=", "<", "`"):
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "unexpected-character-in-unquoted-attribute-value"})
-            self.currentToken["data"][-1][1] += data
+            self.currentToken["data"][self.currentAttribute][-1] += data
         elif data == "\u0000":
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data": "invalid-codepoint"})
-            self.currentToken["data"][-1][1] += "\uFFFD"
+            self.currentToken["data"][self.currentAttribute][-1] += "\uFFFD"
         elif data is EOF:
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "eof-in-attribute-value-no-quotes"})
             self._state = self.dataState
         else:
-            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
+            self.currentToken["data"][self.currentAttribute][-1] += data + self.stream.charsUntil(
                 frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
         return True
 
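Across the attribute value states, characters are now appended to the last element of the current attribute's value list rather than to the last [name, value] pair, so a repeated attribute can never overwrite the value recorded for its first occurrence; emitCurrentToken later drops everything but that first value. A rough end-to-end illustration for an input like <div class=a class=b>, again using a plain dict and hypothetical helpers as stand-ins for the tokenizer's state:

data = {}                 # stands in for self.currentToken["data"] (attributeMap)
current_attribute = None  # stands in for self.currentAttribute

def start_attribute(name):
    # Simplified: the real tokenizer creates the slot when leaving attributeNameState.
    global current_attribute
    current_attribute = name
    data.setdefault(name, []).append("")   # duplicate names get an extra, separate slot

def append_value(chars):
    # Mirrors: self.currentToken["data"][self.currentAttribute][-1] += chars
    data[current_attribute][-1] += chars

start_attribute("class"); append_value("a")
start_attribute("class"); append_value("b")   # duplicate occurrence

# First-wins collapse performed when the start tag is emitted.
emitted = {k: v[0] for k, v in data.items()}
assert emitted == {"class": "a"}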