Skip to content

Commit bb7fabc

Browse files
committed
Refactor: pre-translate strings that are only used in lowercase context
1 parent b1a444b commit bb7fabc

File tree

1 file changed

+21 -25 lines changed

1 file changed

+21 -25 lines changed

html5lib/_tokenizer.py

Lines changed: 21 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def __init__(self, data=None):
3030

3131
class Doctype(Token):
3232
def __init__(self, name, public_id, system_id, correct):
33-
self.name = name
33+
self.name = name.translate(asciiUpper2Lower)
3434
self.public_id = public_id
3535
self.system_id = system_id
3636
self.correct = correct
@@ -44,7 +44,7 @@ class SpaceCharacters(Token):
4444

4545
class Tag(Token):
4646
def __init__(self, name, attributes):
47-
self.name = name
47+
self.name = name.translate(asciiUpper2Lower)
4848
self.attributes = attributeMap(attributes or {})
4949
self.self_closing = False
5050
self.attribute_name = ""
@@ -278,7 +278,6 @@ def emitCurrentToken(self):
278278
token = self.currentToken
279279
# Add token to the queue to be yielded
280280
if isinstance(token, Tag):
281-
token.name = token.name.translate(asciiUpper2Lower)
282281
if self.currentToken.attribute_name in self.currentToken.attributes:
283282
self.tokenQueue.append(ParseError("duplicate-attribute"))
284283
token.clearAttribute()
@@ -456,7 +455,7 @@ def tagNameState(self):
456455
self.tokenQueue.append(ParseError("invalid-codepoint"))
457456
self.currentToken.name += "\uFFFD"
458457
else:
459-
self.currentToken.name += data
458+
self.currentToken.name += data.translate(asciiUpper2Lower)
460459
# (Don't use charsUntil here, because tag names are
461460
# very short and it's faster to not do anything fancy)
462461
return True
@@ -475,7 +474,7 @@ def rcdataLessThanSignState(self):
475474
def rcdataEndTagOpenState(self):
476475
data = self.stream.char()
477476
if data in asciiLetters:
478-
self.temporaryBuffer += data
477+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
479478
self.state = self.rcdataEndTagNameState
480479
else:
481480
self.tokenQueue.append(Characters("</"))
@@ -484,7 +483,7 @@ def rcdataEndTagOpenState(self):
484483
return True
485484

486485
def rcdataEndTagNameState(self):
487-
appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
486+
appropriate = self.currentToken.name == self.temporaryBuffer
488487
data = self.stream.char()
489488
if data in spaceCharacters and appropriate:
490489
self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -497,7 +496,7 @@ def rcdataEndTagNameState(self):
497496
self.emitCurrentToken()
498497
self.state = self.dataState
499498
elif data in asciiLetters:
500-
self.temporaryBuffer += data
499+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
501500
else:
502501
self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
503502
self.stream.unget(data)
@@ -518,7 +517,7 @@ def rawtextLessThanSignState(self):
518517
def rawtextEndTagOpenState(self):
519518
data = self.stream.char()
520519
if data in asciiLetters:
521-
self.temporaryBuffer += data
520+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
522521
self.state = self.rawtextEndTagNameState
523522
else:
524523
self.tokenQueue.append(Characters("</"))
@@ -527,7 +526,7 @@ def rawtextEndTagOpenState(self):
527526
return True
528527

529528
def rawtextEndTagNameState(self):
530-
appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
529+
appropriate = self.currentToken.name == self.temporaryBuffer
531530
data = self.stream.char()
532531
if data in spaceCharacters and appropriate:
533532
self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -540,7 +539,7 @@ def rawtextEndTagNameState(self):
540539
self.emitCurrentToken()
541540
self.state = self.dataState
542541
elif data in asciiLetters:
543-
self.temporaryBuffer += data
542+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
544543
else:
545544
self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
546545
self.stream.unget(data)
@@ -564,7 +563,7 @@ def scriptDataLessThanSignState(self):
564563
def scriptDataEndTagOpenState(self):
565564
data = self.stream.char()
566565
if data in asciiLetters:
567-
self.temporaryBuffer += data
566+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
568567
self.state = self.scriptDataEndTagNameState
569568
else:
570569
self.tokenQueue.append(Characters("</"))
@@ -573,7 +572,7 @@ def scriptDataEndTagOpenState(self):
573572
return True
574573

575574
def scriptDataEndTagNameState(self):
576-
appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
575+
appropriate = self.currentToken.name == self.temporaryBuffer
577576
data = self.stream.char()
578577
if data in spaceCharacters and appropriate:
579578
self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -586,7 +585,7 @@ def scriptDataEndTagNameState(self):
586585
self.emitCurrentToken()
587586
self.state = self.dataState
588587
elif data in asciiLetters:
589-
self.temporaryBuffer += data
588+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
590589
else:
591590
self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
592591
self.stream.unget(data)
@@ -675,7 +674,7 @@ def scriptDataEscapedLessThanSignState(self):
675674
self.state = self.scriptDataEscapedEndTagOpenState
676675
elif data in asciiLetters:
677676
self.tokenQueue.append(Characters("<" + data))
678-
self.temporaryBuffer = data
677+
self.temporaryBuffer = data.translate(asciiUpper2Lower)
679678
self.state = self.scriptDataDoubleEscapeStartState
680679
else:
681680
self.tokenQueue.append(Characters("<"))
@@ -686,7 +685,7 @@ def scriptDataEscapedLessThanSignState(self):
686685
def scriptDataEscapedEndTagOpenState(self):
687686
data = self.stream.char()
688687
if data in asciiLetters:
689-
self.temporaryBuffer = data
688+
self.temporaryBuffer = data.translate(asciiUpper2Lower)
690689
self.state = self.scriptDataEscapedEndTagNameState
691690
else:
692691
self.tokenQueue.append(Characters("</"))
@@ -695,7 +694,7 @@ def scriptDataEscapedEndTagOpenState(self):
695694
return True
696695

697696
def scriptDataEscapedEndTagNameState(self):
698-
appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
697+
appropriate = self.currentToken.name == self.temporaryBuffer
699698
data = self.stream.char()
700699
if data in spaceCharacters and appropriate:
701700
self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -708,7 +707,7 @@ def scriptDataEscapedEndTagNameState(self):
708707
self.emitCurrentToken()
709708
self.state = self.dataState
710709
elif data in asciiLetters:
711-
self.temporaryBuffer += data
710+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
712711
else:
713712
self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
714713
self.stream.unget(data)
@@ -719,13 +718,13 @@ def scriptDataDoubleEscapeStartState(self):
719718
data = self.stream.char()
720719
if data in (spaceCharacters | frozenset(("/", ">"))):
721720
self.tokenQueue.append(Characters(data))
722-
if self.temporaryBuffer.lower() == "script":
721+
if self.temporaryBuffer == "script":
723722
self.state = self.scriptDataDoubleEscapedState
724723
else:
725724
self.state = self.scriptDataEscapedState
726725
elif data in asciiLetters:
727726
self.tokenQueue.append(Characters(data))
728-
self.temporaryBuffer += data
727+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
729728
else:
730729
self.stream.unget(data)
731730
self.state = self.scriptDataEscapedState
@@ -806,13 +805,13 @@ def scriptDataDoubleEscapeEndState(self):
806805
data = self.stream.char()
807806
if data in (spaceCharacters | frozenset(("/", ">"))):
808807
self.tokenQueue.append(Characters(data))
809-
if self.temporaryBuffer.lower() == "script":
808+
if self.temporaryBuffer == "script":
810809
self.state = self.scriptDataEscapedState
811810
else:
812811
self.state = self.scriptDataDoubleEscapedState
813812
elif data in asciiLetters:
814813
self.tokenQueue.append(Characters(data))
815-
self.temporaryBuffer += data
814+
self.temporaryBuffer += data.translate(asciiUpper2Lower)
816815
else:
817816
self.stream.unget(data)
818817
self.state = self.scriptDataDoubleEscapedState
@@ -1240,10 +1239,8 @@ def beforeDoctypeNameState(self):
12401239
def doctypeNameState(self):
12411240
data = self.stream.char()
12421241
if data in spaceCharacters:
1243-
self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
12441242
self.state = self.afterDoctypeNameState
12451243
elif data == ">":
1246-
self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
12471244
self.tokenQueue.append(self.currentToken)
12481245
self.state = self.dataState
12491246
elif data == "\u0000":
@@ -1253,11 +1250,10 @@ def doctypeNameState(self):
12531250
elif data is EOF:
12541251
self.tokenQueue.append(ParseError("eof-in-doctype-name"))
12551252
self.currentToken.correct = False
1256-
self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
12571253
self.tokenQueue.append(self.currentToken)
12581254
self.state = self.dataState
12591255
else:
1260-
self.currentToken.name += data
1256+
self.currentToken.name += data.translate(asciiUpper2Lower)
12611257
return True
12621258

12631259
def afterDoctypeNameState(self):

0 commit comments

Comments (0)