@@ -43,10 +43,24 @@ class SpaceCharacters(Token):
43
43
44
44
45
45
class Tag(Token):
    """Common base for tag tokens.

    Besides the tag name and the self-closing flag, a tag keeps two pieces
    of attribute state:

    * ``attributes`` -- the attributes committed so far, held in an
      ``attributeMap``.
    * ``attribute_name`` / ``attribute_value`` -- the attribute currently
      being assembled piece by piece; it is not part of ``attributes``
      until ``clearAttribute`` is called.
    """

    def __init__(self, name, attributes):
        """Create a tag called *name*.

        *attributes* seeds the committed attribute map; any falsy value
        (``None``, empty container) is replaced by an empty dict before
        being handed to ``attributeMap``.
        """
        self.name = name
        self.self_closing = False
        self.attributes = attributeMap(attributes if attributes else {})
        # No attribute is pending on a freshly created tag.
        self.attribute_name = ""
        self.attribute_value = ""

    def clearAttribute(self):
        """Commit the pending attribute (if any) and reset the pending state.

        The pending name/value pair is stored only when the name is
        non-empty and not already present in ``attributes``, so the first
        occurrence of a repeated attribute name is the one that is kept.
        The pending name and value are emptied in every case.
        """
        pending_name = self.attribute_name
        if pending_name and pending_name not in self.attributes:
            self.attributes[pending_name] = self.attribute_value
        self.attribute_name = self.attribute_value = ""

    def accumulateAttributeName(self, text):
        """Append *text* to the pending attribute name.

        The text is passed through ``str.translate`` with the
        ``asciiUpper2Lower`` table first (presumably an ASCII
        upper-to-lower-case mapping -- confirm against its definition).
        """
        translated = text.translate(asciiUpper2Lower)
        self.attribute_name = self.attribute_name + translated

    def accumulateAttributeValue(self, text):
        """Append *text*, unchanged, to the pending attribute value."""
        self.attribute_value = self.attribute_value + text
50
64
51
65
class StartTag (Tag ):
52
66
def __init__ (self , name , data = None ):
@@ -248,7 +262,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
248
262
output = "&" + "" .join (charStack )
249
263
250
264
if fromAttribute :
251
- self .currentToken .data [ - 1 ][ 1 ] += output
265
+ self .currentToken .accumulateAttributeValue ( output )
252
266
else :
253
267
if output in spaceCharacters :
254
268
token = SpaceCharacters (output )
@@ -270,17 +284,9 @@ def emitCurrentToken(self):
270
284
# Add token to the queue to be yielded
271
285
if isinstance (token , Tag ):
272
286
token .name = token .name .translate (asciiUpper2Lower )
273
- if isinstance (token , StartTag ):
274
- raw = token .data
275
- data = attributeMap (raw )
276
- if len (raw ) > len (data ):
277
- # we had some duplicated attribute, fix so first wins
278
- was = dict (data )
279
- data .update (raw [::- 1 ])
280
- token .data = data
281
-
287
+ token .clearAttribute ()
282
288
if isinstance (token , EndTag ):
283
- if token .data :
289
+ if token .attributes :
284
290
self .tokenQueue .append (ParseError ("attributes-in-end-tag" ))
285
291
if token .self_closing :
286
292
self .tokenQueue .append (ParseError ("self-closing-flag-on-end-tag" ))
@@ -820,25 +826,29 @@ def beforeAttributeNameState(self):
820
826
if data in spaceCharacters :
821
827
self .stream .charsUntil (spaceCharacters , True )
822
828
elif data in asciiLetters :
823
- self .currentToken .data .append ([data , "" ])
829
+ self .currentToken .clearAttribute ()
830
+ self .currentToken .accumulateAttributeName (data )
824
831
self .state = self .attributeNameState
825
832
elif data == ">" :
826
833
self .emitCurrentToken ()
827
834
elif data == "/" :
828
835
self .state = self .selfClosingStartTagState
829
836
elif data in ("'" , '"' , "=" , "<" ):
830
837
self .tokenQueue .append (ParseError ("invalid-character-in-attribute-name" ))
831
- self .currentToken .data .append ([data , "" ])
838
+ self .currentToken .clearAttribute ()
839
+ self .currentToken .accumulateAttributeName (data )
832
840
self .state = self .attributeNameState
833
841
elif data == "\u0000 " :
834
842
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
835
- self .currentToken .data .append (["\uFFFD " , "" ])
843
+ self .currentToken .clearAttribute ()
844
+ self .currentToken .accumulateAttributeName ("\uFFFD " )
836
845
self .state = self .attributeNameState
837
846
elif data is EOF :
838
847
self .tokenQueue .append (ParseError ("expected-attribute-name-but-got-eof" ))
839
848
self .state = self .dataState
840
849
else :
841
- self .currentToken .data .append ([data , "" ])
850
+ self .currentToken .clearAttribute ()
851
+ self .currentToken .accumulateAttributeName (data )
842
852
self .state = self .attributeNameState
843
853
return True
844
854
@@ -849,8 +859,7 @@ def attributeNameState(self):
849
859
if data == "=" :
850
860
self .state = self .beforeAttributeValueState
851
861
elif data in asciiLetters :
852
- self .currentToken .data [- 1 ][0 ] += data + \
853
- self .stream .charsUntil (asciiLetters , True )
862
+ self .currentToken .accumulateAttributeName (data + self .stream .charsUntil (asciiLetters , True ))
854
863
leavingThisState = False
855
864
elif data == ">" :
856
865
# XXX If we emit here the attributes are converted to a dict
@@ -863,29 +872,25 @@ def attributeNameState(self):
863
872
self .state = self .selfClosingStartTagState
864
873
elif data == "\u0000 " :
865
874
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
866
- self .currentToken .data [ - 1 ][ 0 ] += "\uFFFD "
875
+ self .currentToken .accumulateAttributeName ( "\uFFFD " )
867
876
leavingThisState = False
868
877
elif data in ("'" , '"' , "<" ):
869
878
self .tokenQueue .append (ParseError ("invalid-character-in-attribute-name" ))
870
- self .currentToken .data [ - 1 ][ 0 ] += data
879
+ self .currentToken .accumulateAttributeName ( data )
871
880
leavingThisState = False
872
881
elif data is EOF :
873
882
self .tokenQueue .append (ParseError ("eof-in-attribute-name" ))
874
883
self .state = self .dataState
875
884
else :
876
- self .currentToken .data [ - 1 ][ 0 ] += data
885
+ self .currentToken .accumulateAttributeName ( data )
877
886
leavingThisState = False
878
887
879
888
if leavingThisState :
880
889
# Attributes are not dropped at this stage. That happens when the
881
890
# start tag token is emitted so values can still be safely appended
882
891
# to attributes, but we do want to report the parse error in time.
883
- self .currentToken .data [- 1 ][0 ] = (
884
- self .currentToken .data [- 1 ][0 ].translate (asciiUpper2Lower ))
885
- for name , _ in self .currentToken .data [:- 1 ]:
886
- if self .currentToken .data [- 1 ][0 ] == name :
887
- self .tokenQueue .append (ParseError ("duplicate-attribute" ))
888
- break
892
+ if self .currentToken .attribute_name in self .currentToken .attributes :
893
+ self .tokenQueue .append (ParseError ("duplicate-attribute" ))
889
894
# XXX Fix for above XXX
890
895
if emitToken :
891
896
self .emitCurrentToken ()
@@ -900,23 +905,27 @@ def afterAttributeNameState(self):
900
905
elif data == ">" :
901
906
self .emitCurrentToken ()
902
907
elif data in asciiLetters :
903
- self .currentToken .data .append ([data , "" ])
908
+ self .currentToken .clearAttribute ()
909
+ self .currentToken .accumulateAttributeName (data )
904
910
self .state = self .attributeNameState
905
911
elif data == "/" :
906
912
self .state = self .selfClosingStartTagState
907
913
elif data == "\u0000 " :
908
914
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
909
- self .currentToken .data .append (["\uFFFD " , "" ])
915
+ self .currentToken .clearAttribute ()
916
+ self .currentToken .accumulateAttributeName ("\uFFFD " )
910
917
self .state = self .attributeNameState
911
918
elif data in ("'" , '"' , "<" ):
912
919
self .tokenQueue .append (ParseError ("invalid-character-after-attribute-name" ))
913
- self .currentToken .data .append ([data , "" ])
920
+ self .currentToken .clearAttribute ()
921
+ self .currentToken .accumulateAttributeName (data )
914
922
self .state = self .attributeNameState
915
923
elif data is EOF :
916
924
self .tokenQueue .append (ParseError ("expected-end-of-tag-but-got-eof" ))
917
925
self .state = self .dataState
918
926
else :
919
- self .currentToken .data .append ([data , "" ])
927
+ self .currentToken .clearAttribute ()
928
+ self .currentToken .accumulateAttributeName (data )
920
929
self .state = self .attributeNameState
921
930
return True
922
931
@@ -936,17 +945,17 @@ def beforeAttributeValueState(self):
936
945
self .emitCurrentToken ()
937
946
elif data == "\u0000 " :
938
947
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
939
- self .currentToken .data [ - 1 ][ 1 ] += "\uFFFD "
948
+ self .currentToken .accumulateAttributeValue ( "\uFFFD " )
940
949
self .state = self .attributeValueUnQuotedState
941
950
elif data in ("=" , "<" , "`" ):
942
951
self .tokenQueue .append (ParseError ("equals-in-unquoted-attribute-value" ))
943
- self .currentToken .data [ - 1 ][ 1 ] += data
952
+ self .currentToken .accumulateAttributeValue ( data )
944
953
self .state = self .attributeValueUnQuotedState
945
954
elif data is EOF :
946
955
self .tokenQueue .append (ParseError ("expected-attribute-value-but-got-eof" ))
947
956
self .state = self .dataState
948
957
else :
949
- self .currentToken .data [ - 1 ][ 1 ] += data
958
+ self .currentToken .accumulateAttributeValue ( data )
950
959
self .state = self .attributeValueUnQuotedState
951
960
return True
952
961
@@ -958,13 +967,12 @@ def attributeValueDoubleQuotedState(self):
958
967
self .processEntityInAttribute ('"' )
959
968
elif data == "\u0000 " :
960
969
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
961
- self .currentToken .data [ - 1 ][ 1 ] += "\uFFFD "
970
+ self .currentToken .accumulateAttributeValue ( "\uFFFD " )
962
971
elif data is EOF :
963
972
self .tokenQueue .append (ParseError ("eof-in-attribute-value-double-quote" ))
964
973
self .state = self .dataState
965
974
else :
966
- self .currentToken .data [- 1 ][1 ] += data + \
967
- self .stream .charsUntil (("\" " , "&" , "\u0000 " ))
975
+ self .currentToken .accumulateAttributeValue (data + self .stream .charsUntil (("\" " , "&" , "\u0000 " )))
968
976
return True
969
977
970
978
def attributeValueSingleQuotedState (self ):
@@ -975,13 +983,12 @@ def attributeValueSingleQuotedState(self):
975
983
self .processEntityInAttribute ("'" )
976
984
elif data == "\u0000 " :
977
985
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
978
- self .currentToken .data [ - 1 ][ 1 ] += "\uFFFD "
986
+ self .currentToken .accumulateAttributeValue ( "\uFFFD " )
979
987
elif data is EOF :
980
988
self .tokenQueue .append (ParseError ("eof-in-attribute-value-single-quote" ))
981
989
self .state = self .dataState
982
990
else :
983
- self .currentToken .data [- 1 ][1 ] += data + \
984
- self .stream .charsUntil (("'" , "&" , "\u0000 " ))
991
+ self .currentToken .accumulateAttributeValue (data + self .stream .charsUntil (("'" , "&" , "\u0000 " )))
985
992
return True
986
993
987
994
def attributeValueUnQuotedState (self ):
@@ -994,16 +1001,16 @@ def attributeValueUnQuotedState(self):
994
1001
self .emitCurrentToken ()
995
1002
elif data in ('"' , "'" , "=" , "<" , "`" ):
996
1003
self .tokenQueue .append (ParseError ("unexpected-character-in-unquoted-attribute-value" ))
997
- self .currentToken .data [ - 1 ][ 1 ] += data
1004
+ self .currentToken .accumulateAttributeValue ( data )
998
1005
elif data == "\u0000 " :
999
1006
self .tokenQueue .append (ParseError ("invalid-codepoint" ))
1000
- self .currentToken .data [ - 1 ][ 1 ] += "\uFFFD "
1007
+ self .currentToken .accumulateAttributeValue ( "\uFFFD " )
1001
1008
elif data is EOF :
1002
1009
self .tokenQueue .append (ParseError ("eof-in-attribute-value-no-quotes" ))
1003
1010
self .state = self .dataState
1004
1011
else :
1005
- self .currentToken .data [ - 1 ][ 1 ] += data + self .stream .charsUntil (
1006
- frozenset (("&" , ">" , '"' , "'" , "=" , "<" , "`" , "\u0000 " )) | spaceCharacters )
1012
+ self .currentToken .accumulateAttributeValue ( data + self .stream .charsUntil (
1013
+ frozenset (("&" , ">" , '"' , "'" , "=" , "<" , "`" , "\u0000 " )) | spaceCharacters ))
1007
1014
return True
1008
1015
1009
1016
def afterAttributeValueState (self ):
0 commit comments