Skip to content

Commit a912842

Browse files
committed
Alternate approach: do not pre-translate temporary buffered data (keep the buffer in its original case and lowercase it only when comparing against the tag name)
1 parent 8f96b17 commit a912842

File tree

1 file changed

+33
-29
lines changed

1 file changed

+33
-29
lines changed

html5lib/_tokenizer.py

Lines changed: 33 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -468,7 +468,7 @@ def rcdataLessThanSignState(self):
468468
def rcdataEndTagOpenState(self):
469469
data = self.stream.char()
470470
if data in asciiLetters:
471-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
471+
self.temporaryBuffer += data
472472
self.state = self.rcdataEndTagNameState
473473
else:
474474
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -477,26 +477,27 @@ def rcdataEndTagOpenState(self):
477477
return True
478478

479479
def rcdataEndTagNameState(self):
480-
appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer
480+
name = self.temporaryBuffer.translate(asciiUpper2Lower)
481+
appropriate = self.currentToken and self.currentToken["name"] == name
481482
data = self.stream.char()
482483
if data in spaceCharacters and appropriate:
483484
self.currentToken = {"type": tokenTypes["EndTag"],
484-
"name": self.temporaryBuffer,
485+
"name": name,
485486
"data": [], "selfClosing": False}
486487
self.state = self.beforeAttributeNameState
487488
elif data == "/" and appropriate:
488489
self.currentToken = {"type": tokenTypes["EndTag"],
489-
"name": self.temporaryBuffer,
490+
"name": name,
490491
"data": [], "selfClosing": False}
491492
self.state = self.selfClosingStartTagState
492493
elif data == ">" and appropriate:
493494
self.currentToken = {"type": tokenTypes["EndTag"],
494-
"name": self.temporaryBuffer,
495+
"name": name,
495496
"data": [], "selfClosing": False}
496497
self.emitCurrentToken()
497498
self.state = self.dataState
498499
elif data in asciiLetters:
499-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
500+
self.temporaryBuffer += data
500501
else:
501502
self.tokenQueue.append({"type": tokenTypes["Characters"],
502503
"data": "</" + self.temporaryBuffer})
@@ -518,7 +519,7 @@ def rawtextLessThanSignState(self):
518519
def rawtextEndTagOpenState(self):
519520
data = self.stream.char()
520521
if data in asciiLetters:
521-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
522+
self.temporaryBuffer += data
522523
self.state = self.rawtextEndTagNameState
523524
else:
524525
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -527,26 +528,27 @@ def rawtextEndTagOpenState(self):
527528
return True
528529

529530
def rawtextEndTagNameState(self):
530-
appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer
531+
name = self.temporaryBuffer.translate(asciiUpper2Lower)
532+
appropriate = self.currentToken and self.currentToken["name"] == name
531533
data = self.stream.char()
532534
if data in spaceCharacters and appropriate:
533535
self.currentToken = {"type": tokenTypes["EndTag"],
534-
"name": self.temporaryBuffer,
536+
"name": name,
535537
"data": [], "selfClosing": False}
536538
self.state = self.beforeAttributeNameState
537539
elif data == "/" and appropriate:
538540
self.currentToken = {"type": tokenTypes["EndTag"],
539-
"name": self.temporaryBuffer,
541+
"name": name,
540542
"data": [], "selfClosing": False}
541543
self.state = self.selfClosingStartTagState
542544
elif data == ">" and appropriate:
543545
self.currentToken = {"type": tokenTypes["EndTag"],
544-
"name": self.temporaryBuffer,
546+
"name": name,
545547
"data": [], "selfClosing": False}
546548
self.emitCurrentToken()
547549
self.state = self.dataState
548550
elif data in asciiLetters:
549-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
551+
self.temporaryBuffer += data
550552
else:
551553
self.tokenQueue.append({"type": tokenTypes["Characters"],
552554
"data": "</" + self.temporaryBuffer})
@@ -571,7 +573,7 @@ def scriptDataLessThanSignState(self):
571573
def scriptDataEndTagOpenState(self):
572574
data = self.stream.char()
573575
if data in asciiLetters:
574-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
576+
self.temporaryBuffer += data
575577
self.state = self.scriptDataEndTagNameState
576578
else:
577579
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -580,26 +582,27 @@ def scriptDataEndTagOpenState(self):
580582
return True
581583

582584
def scriptDataEndTagNameState(self):
583-
appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer
585+
name = self.temporaryBuffer.translate(asciiUpper2Lower)
586+
appropriate = self.currentToken and self.currentToken["name"] == name
584587
data = self.stream.char()
585588
if data in spaceCharacters and appropriate:
586589
self.currentToken = {"type": tokenTypes["EndTag"],
587-
"name": self.temporaryBuffer,
590+
"name": name,
588591
"data": [], "selfClosing": False}
589592
self.state = self.beforeAttributeNameState
590593
elif data == "/" and appropriate:
591594
self.currentToken = {"type": tokenTypes["EndTag"],
592-
"name": self.temporaryBuffer,
595+
"name": name,
593596
"data": [], "selfClosing": False}
594597
self.state = self.selfClosingStartTagState
595598
elif data == ">" and appropriate:
596599
self.currentToken = {"type": tokenTypes["EndTag"],
597-
"name": self.temporaryBuffer,
600+
"name": name,
598601
"data": [], "selfClosing": False}
599602
self.emitCurrentToken()
600603
self.state = self.dataState
601604
elif data in asciiLetters:
602-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
605+
self.temporaryBuffer += data
603606
else:
604607
self.tokenQueue.append({"type": tokenTypes["Characters"],
605608
"data": "</" + self.temporaryBuffer})
@@ -696,7 +699,7 @@ def scriptDataEscapedLessThanSignState(self):
696699
self.state = self.scriptDataEscapedEndTagOpenState
697700
elif data in asciiLetters:
698701
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
699-
self.temporaryBuffer = data.translate(asciiUpper2Lower)
702+
self.temporaryBuffer = data
700703
self.state = self.scriptDataDoubleEscapeStartState
701704
else:
702705
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
@@ -707,7 +710,7 @@ def scriptDataEscapedLessThanSignState(self):
707710
def scriptDataEscapedEndTagOpenState(self):
708711
data = self.stream.char()
709712
if data in asciiLetters:
710-
self.temporaryBuffer = data.translate(asciiUpper2Lower)
713+
self.temporaryBuffer = data
711714
self.state = self.scriptDataEscapedEndTagNameState
712715
else:
713716
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
@@ -716,26 +719,27 @@ def scriptDataEscapedEndTagOpenState(self):
716719
return True
717720

718721
def scriptDataEscapedEndTagNameState(self):
719-
appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer
722+
name = self.temporaryBuffer.translate(asciiUpper2Lower)
723+
appropriate = self.currentToken and self.currentToken["name"] == name
720724
data = self.stream.char()
721725
if data in spaceCharacters and appropriate:
722726
self.currentToken = {"type": tokenTypes["EndTag"],
723-
"name": self.temporaryBuffer,
727+
"name": name,
724728
"data": [], "selfClosing": False}
725729
self.state = self.beforeAttributeNameState
726730
elif data == "/" and appropriate:
727731
self.currentToken = {"type": tokenTypes["EndTag"],
728-
"name": self.temporaryBuffer,
732+
"name": name,
729733
"data": [], "selfClosing": False}
730734
self.state = self.selfClosingStartTagState
731735
elif data == ">" and appropriate:
732736
self.currentToken = {"type": tokenTypes["EndTag"],
733-
"name": self.temporaryBuffer,
737+
"name": name,
734738
"data": [], "selfClosing": False}
735739
self.emitCurrentToken()
736740
self.state = self.dataState
737741
elif data in asciiLetters:
738-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
742+
self.temporaryBuffer += data
739743
else:
740744
self.tokenQueue.append({"type": tokenTypes["Characters"],
741745
"data": "</" + self.temporaryBuffer})
@@ -747,13 +751,13 @@ def scriptDataDoubleEscapeStartState(self):
747751
data = self.stream.char()
748752
if data in (spaceCharacters | frozenset(("/", ">"))):
749753
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
750-
if self.temporaryBuffer == "script":
754+
if self.temporaryBuffer.lower() == "script":
751755
self.state = self.scriptDataDoubleEscapedState
752756
else:
753757
self.state = self.scriptDataEscapedState
754758
elif data in asciiLetters:
755759
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
756-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
760+
self.temporaryBuffer += data
757761
else:
758762
self.stream.unget(data)
759763
self.state = self.scriptDataEscapedState
@@ -843,13 +847,13 @@ def scriptDataDoubleEscapeEndState(self):
843847
data = self.stream.char()
844848
if data in (spaceCharacters | frozenset(("/", ">"))):
845849
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
846-
if self.temporaryBuffer == "script":
850+
if self.temporaryBuffer.lower() == "script":
847851
self.state = self.scriptDataEscapedState
848852
else:
849853
self.state = self.scriptDataDoubleEscapedState
850854
elif data in asciiLetters:
851855
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
852-
self.temporaryBuffer += data.translate(asciiUpper2Lower)
856+
self.temporaryBuffer += data
853857
else:
854858
self.stream.unget(data)
855859
self.state = self.scriptDataDoubleEscapedState

0 commit comments

Comments
 (0)