Skip to content

Commit 2e86373

Browse files
committed
Refactor: pre-translate element and attribute names to lowercase
1 parent 183d8a0 commit 2e86373

File tree

1 file changed

+30
-32
lines changed

1 file changed

+30
-32
lines changed

html5lib/_tokenizer.py

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,6 @@ def emitCurrentToken(self):
233233
token = self.currentToken
234234
# Add token to the queue to be yielded
235235
if (token["type"] in tagTokenTypes):
236-
token["name"] = token["name"].translate(asciiUpper2Lower)
237236
if token["type"] == tokenTypes["StartTag"]:
238237
raw = token["data"]
239238
data = attributeMap(raw)
@@ -380,7 +379,8 @@ def tagOpenState(self):
380379
self.state = self.closeTagOpenState
381380
elif data in asciiLetters:
382381
self.currentToken = {"type": tokenTypes["StartTag"],
383-
"name": data, "data": [],
382+
"name": data.translate(asciiUpper2Lower),
383+
"data": [],
384384
"selfClosing": False,
385385
"selfClosingAcknowledged": False}
386386
self.state = self.tagNameState
@@ -410,7 +410,8 @@ def tagOpenState(self):
410410
def closeTagOpenState(self):
411411
data = self.stream.char()
412412
if data in asciiLetters:
413-
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
413+
self.currentToken = {"type": tokenTypes["EndTag"],
414+
"name": data.translate(asciiUpper2Lower),
414415
"data": [], "selfClosing": False}
415416
self.state = self.tagNameState
416417
elif data == ">":
@@ -448,7 +449,7 @@ def tagNameState(self):
448449
"data": "invalid-codepoint"})
449450
self.currentToken["name"] += "\uFFFD"
450451
else:
451-
self.currentToken["name"] += data
452+
self.currentToken["name"] += data.translate(asciiUpper2Lower)
452453
# (Don't use charsUntil here, because tag names are
453454
# very short and it's faster to not do anything fancy)
454455
return True
@@ -467,7 +468,7 @@ def rcdataLessThanSignState(self):
467468
def rcdataEndTagOpenState(self):
468469
data = self.stream.char()
469470
if data in asciiLetters:
470-
self.temporaryBuffer += data
471+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
471472
self.state = self.rcdataEndTagNameState
472473
else:
473474
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -476,7 +477,7 @@ def rcdataEndTagOpenState(self):
476477
return True
477478

478479
def rcdataEndTagNameState(self):
479-
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
480+
appropriate = self.currentToken["name"] == self.temporaryBuffer
480481
data = self.stream.char()
481482
if data in spaceCharacters and appropriate:
482483
self.currentToken = {"type": tokenTypes["EndTag"],
@@ -495,7 +496,7 @@ def rcdataEndTagNameState(self):
495496
self.emitCurrentToken()
496497
self.state = self.dataState
497498
elif data in asciiLetters:
498-
self.temporaryBuffer += data
499+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
499500
else:
500501
self.tokenQueue.append({"type": tokenTypes["Characters"],
501502
"data": "</" + self.temporaryBuffer})
@@ -517,7 +518,7 @@ def rawtextLessThanSignState(self):
517518
def rawtextEndTagOpenState(self):
518519
data = self.stream.char()
519520
if data in asciiLetters:
520-
self.temporaryBuffer += data
521+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
521522
self.state = self.rawtextEndTagNameState
522523
else:
523524
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -526,7 +527,7 @@ def rawtextEndTagOpenState(self):
526527
return True
527528

528529
def rawtextEndTagNameState(self):
529-
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
530+
appropriate = self.currentToken["name"] == self.temporaryBuffer
530531
data = self.stream.char()
531532
if data in spaceCharacters and appropriate:
532533
self.currentToken = {"type": tokenTypes["EndTag"],
@@ -545,7 +546,7 @@ def rawtextEndTagNameState(self):
545546
self.emitCurrentToken()
546547
self.state = self.dataState
547548
elif data in asciiLetters:
548-
self.temporaryBuffer += data
549+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
549550
else:
550551
self.tokenQueue.append({"type": tokenTypes["Characters"],
551552
"data": "</" + self.temporaryBuffer})
@@ -570,7 +571,7 @@ def scriptDataLessThanSignState(self):
570571
def scriptDataEndTagOpenState(self):
571572
data = self.stream.char()
572573
if data in asciiLetters:
573-
self.temporaryBuffer += data
574+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
574575
self.state = self.scriptDataEndTagNameState
575576
else:
576577
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -579,7 +580,7 @@ def scriptDataEndTagOpenState(self):
579580
return True
580581

581582
def scriptDataEndTagNameState(self):
582-
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
583+
appropriate = self.currentToken["name"] == self.temporaryBuffer
583584
data = self.stream.char()
584585
if data in spaceCharacters and appropriate:
585586
self.currentToken = {"type": tokenTypes["EndTag"],
@@ -598,7 +599,7 @@ def scriptDataEndTagNameState(self):
598599
self.emitCurrentToken()
599600
self.state = self.dataState
600601
elif data in asciiLetters:
601-
self.temporaryBuffer += data
602+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
602603
else:
603604
self.tokenQueue.append({"type": tokenTypes["Characters"],
604605
"data": "</" + self.temporaryBuffer})
@@ -695,7 +696,7 @@ def scriptDataEscapedLessThanSignState(self):
695696
self.state = self.scriptDataEscapedEndTagOpenState
696697
elif data in asciiLetters:
697698
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
698-
self.temporaryBuffer = data
699+
self.temporaryBuffer = data.translate(asciiUpper2Lower)
699700
self.state = self.scriptDataDoubleEscapeStartState
700701
else:
701702
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
@@ -706,7 +707,7 @@ def scriptDataEscapedLessThanSignState(self):
706707
def scriptDataEscapedEndTagOpenState(self):
707708
data = self.stream.char()
708709
if data in asciiLetters:
709-
self.temporaryBuffer = data
710+
self.temporaryBuffer = data.translate(asciiUpper2Lower)
710711
self.state = self.scriptDataEscapedEndTagNameState
711712
else:
712713
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -715,7 +716,7 @@ def scriptDataEscapedEndTagOpenState(self):
715716
return True
716717

717718
def scriptDataEscapedEndTagNameState(self):
718-
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
719+
appropriate = self.currentToken["name"] == self.temporaryBuffer
719720
data = self.stream.char()
720721
if data in spaceCharacters and appropriate:
721722
self.currentToken = {"type": tokenTypes["EndTag"],
@@ -734,7 +735,7 @@ def scriptDataEscapedEndTagNameState(self):
734735
self.emitCurrentToken()
735736
self.state = self.dataState
736737
elif data in asciiLetters:
737-
self.temporaryBuffer += data
738+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
738739
else:
739740
self.tokenQueue.append({"type": tokenTypes["Characters"],
740741
"data": "</" + self.temporaryBuffer})
@@ -746,13 +747,13 @@ def scriptDataDoubleEscapeStartState(self):
746747
data = self.stream.char()
747748
if data in (spaceCharacters | frozenset(("/", ">"))):
748749
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
749-
if self.temporaryBuffer.lower() == "script":
750+
if self.temporaryBuffer == "script":
750751
self.state = self.scriptDataDoubleEscapedState
751752
else:
752753
self.state = self.scriptDataEscapedState
753754
elif data in asciiLetters:
754755
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
755-
self.temporaryBuffer += data
756+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
756757
else:
757758
self.stream.unget(data)
758759
self.state = self.scriptDataEscapedState
@@ -842,13 +843,13 @@ def scriptDataDoubleEscapeEndState(self):
842843
data = self.stream.char()
843844
if data in (spaceCharacters | frozenset(("/", ">"))):
844845
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
845-
if self.temporaryBuffer.lower() == "script":
846+
if self.temporaryBuffer == "script":
846847
self.state = self.scriptDataEscapedState
847848
else:
848849
self.state = self.scriptDataDoubleEscapedState
849850
elif data in asciiLetters:
850851
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
851-
self.temporaryBuffer += data
852+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
852853
else:
853854
self.stream.unget(data)
854855
self.state = self.scriptDataDoubleEscapedState
@@ -859,7 +860,8 @@ def beforeAttributeNameState(self):
859860
if data in spaceCharacters:
860861
self.stream.charsUntil(spaceCharacters, True)
861862
elif data in asciiLetters:
862-
self.currentToken["data"].append([data, ""])
863+
attr_name = data.translate(asciiUpper2Lower)
864+
self.currentToken["data"].append([attr_name, ""])
863865
self.state = self.attributeNameState
864866
elif data == ">":
865867
self.emitCurrentToken()
@@ -891,7 +893,7 @@ def attributeNameState(self):
891893
if data == "=":
892894
self.state = self.beforeAttributeValueState
893895
elif data in asciiLetters:
894-
self.currentToken["data"][-1][0] += data
896+
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
895897
leavingThisState = False
896898
elif data == ">":
897899
# XXX If we emit here the attributes are converted to a dict
@@ -918,15 +920,13 @@ def attributeNameState(self):
918920
"data": "eof-in-attribute-name"})
919921
self.state = self.dataState
920922
else:
921-
self.currentToken["data"][-1][0] += data
923+
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
922924
leavingThisState = False
923925

924926
if leavingThisState:
925927
# Attributes are not dropped at this stage. That happens when the
926928
# start tag token is emitted so values can still be safely appended
927929
# to attributes, but we do want to report the parse error in time.
928-
self.currentToken["data"][-1][0] = (
929-
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
930930
for name, _ in self.currentToken["data"][:-1]:
931931
if self.currentToken["data"][-1][0] == name:
932932
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
@@ -946,7 +946,8 @@ def afterAttributeNameState(self):
946946
elif data == ">":
947947
self.emitCurrentToken()
948948
elif data in asciiLetters:
949-
self.currentToken["data"].append([data, ""])
949+
attr_name = data.translate(asciiUpper2Lower)
950+
self.currentToken["data"].append([attr_name, ""])
950951
self.state = self.attributeNameState
951952
elif data == "/":
952953
self.state = self.selfClosingStartTagState
@@ -1340,17 +1341,15 @@ def beforeDoctypeNameState(self):
13401341
self.tokenQueue.append(self.currentToken)
13411342
self.state = self.dataState
13421343
else:
1343-
self.currentToken["name"] = data
1344+
self.currentToken["name"] = data.translate(asciiUpper2Lower)
13441345
self.state = self.doctypeNameState
13451346
return True
13461347

13471348
def doctypeNameState(self):
13481349
data = self.stream.char()
13491350
if data in spaceCharacters:
1350-
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
13511351
self.state = self.afterDoctypeNameState
13521352
elif data == ">":
1353-
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
13541353
self.tokenQueue.append(self.currentToken)
13551354
self.state = self.dataState
13561355
elif data == "\u0000":
@@ -1362,11 +1361,10 @@ def doctypeNameState(self):
13621361
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
13631362
"eof-in-doctype-name"})
13641363
self.currentToken["correct"] = False
1365-
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
13661364
self.tokenQueue.append(self.currentToken)
13671365
self.state = self.dataState
13681366
else:
1369-
self.currentToken["name"] += data
1367+
self.currentToken["name"] += data.translate(asciiUpper2Lower)
13701368
return True
13711369

13721370
def afterDoctypeNameState(self):

0 commit comments

Comments (0)