@@ -21,10 +21,10 @@ class BufferedIOBase(object):
21
21
pass
22
22
23
23
# Byte-string (non-unicode) versions of constants for use in the pre-parser.
# Every member is encoded to ASCII bytes so it can be matched against slices
# of the raw (undecoded) input rather than against text characters.
spaceCharactersBytes = frozenset(
    item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(
    item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(
    item.encode("ascii") for item in asciiUppercase)
# Whitespace plus the two angle brackets.
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
28
28
29
29
invalid_unicode_re = re .compile ("[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
30
30
@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391
391
parseMeta - Look for a <meta> element containing encoding information
392
392
393
393
"""
394
- self .charEncoding = (codecName (encoding ), "certain" )
395
-
396
394
# Raw Stream - for unicode objects this will encode to utf-8 and set
397
395
# self.charEncoding as appropriate
398
396
self .rawStream = self .openStream (source )
399
397
398
+ HTMLUnicodeInputStream .__init__ (self , self .rawStream )
399
+
400
+ self .charEncoding = (codecName (encoding ), "certain" )
401
+
400
402
# Encoding Information
401
403
#Number of bytes to use when looking for a meta element with
402
404
#encoding information
@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
411
413
self .charEncoding = self .detectEncoding (parseMeta , chardet )
412
414
413
415
#Call superclass
414
- HTMLUnicodeInputStream . __init__ ( self , self . rawStream )
416
+ self . reset ( )
415
417
416
418
def reset (self ):
417
419
self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
@@ -538,12 +540,13 @@ def detectEncodingMeta(self):
538
540
539
541
return encoding
540
542
541
- class EncodingBytes (str ):
543
+ class EncodingBytes (bytes ):
542
544
"""String-like object with an associated position and various extra methods
543
545
If the position is ever greater than the string length then an exception is
544
546
raised"""
545
547
def __new__ (self , value ):
546
- return str .__new__ (self , value .lower ())
548
+ assert isinstance (value , bytes )
549
+ return bytes .__new__ (self , value .lower ())
547
550
548
551
def __init__ (self , value ):
549
552
self ._position = - 1
@@ -557,7 +560,7 @@ def __next__(self):
557
560
raise StopIteration
558
561
elif p < 0 :
559
562
raise TypeError
560
- return self [p ]
563
+ return self [p : p + 1 ]
561
564
562
565
def previous (self ):
563
566
p = self ._position
@@ -566,7 +569,7 @@ def previous(self):
566
569
elif p < 0 :
567
570
raise TypeError
568
571
self ._position = p = p - 1
569
- return self [p ]
572
+ return self [p : p + 1 ]
570
573
571
574
def setPosition (self , position ):
572
575
if self ._position >= len (self ):
@@ -584,15 +587,15 @@ def getPosition(self):
584
587
position = property (getPosition , setPosition )
585
588
586
589
def getCurrentByte(self):
    """Return the byte at the current position as a ``bytes`` object.

    A one-element slice is used instead of plain indexing because indexing
    a ``bytes`` object on Python 3 yields an ``int``; the slice keeps the
    result comparable with byte-string constants such as ``b"="``.  The
    position is read through ``self.position``.
    """
    return self[self.position:self.position + 1]
588
591
589
592
currentByte = property (getCurrentByte )
590
593
591
594
def skip (self , chars = spaceCharactersBytes ):
592
595
"""Skip past a list of characters"""
593
596
p = self .position # use property for the error-checking
594
597
while p < len (self ):
595
- c = self [p ]
598
+ c = self [p : p + 1 ]
596
599
if c not in chars :
597
600
self ._position = p
598
601
return c
@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):
603
606
def skipUntil (self , chars ):
604
607
p = self .position
605
608
while p < len (self ):
606
- c = self [p ]
609
+ c = self [p : p + 1 ]
607
610
if c in chars :
608
611
self ._position = p
609
612
return c
@@ -645,12 +648,12 @@ def __init__(self, data):
645
648
646
649
def getEncoding (self ):
647
650
methodDispatch = (
648
- ("<!--" ,self .handleComment ),
649
- ("<meta" ,self .handleMeta ),
650
- ("</" ,self .handlePossibleEndTag ),
651
- ("<!" ,self .handleOther ),
652
- ("<?" ,self .handleOther ),
653
- ("<" ,self .handlePossibleStartTag ))
651
+ (b "<!--" ,self .handleComment ),
652
+ (b "<meta" ,self .handleMeta ),
653
+ (b "</" ,self .handlePossibleEndTag ),
654
+ (b "<!" ,self .handleOther ),
655
+ (b "<?" ,self .handleOther ),
656
+ (b "<" ,self .handlePossibleStartTag ))
654
657
for byte in self .data :
655
658
keepParsing = True
656
659
for key , method in methodDispatch :
@@ -663,37 +666,48 @@ def getEncoding(self):
663
666
break
664
667
if not keepParsing :
665
668
break
666
-
669
+
667
670
return self .encoding
668
671
669
672
def handleComment(self):
    """Skip over a comment.

    Delegates to ``self.data.jumpTo`` with the comment terminator
    ``b"-->"`` and returns whatever that call returns.
    """
    return self.data.jumpTo(b"-->")
672
675
673
676
def handleMeta(self):
    """Look for an encoding declaration in the attributes of a meta element.

    Returns ``True`` when no usable encoding was found, and ``False`` once
    ``self.encoding`` has been set.

    A ``charset`` attribute naming a known codec is accepted immediately.
    An encoding taken from a ``content`` attribute is only accepted when
    the element also carries ``http-equiv="content-type"``; if the
    ``content`` attribute comes first, its codec is held as pending until
    that pragma is seen.
    """
    if self.data.currentByte not in spaceCharactersBytes:
        # "<meta" not followed by a space: not a meta element, keep going.
        return True

    # We have a valid meta element; search its attributes.
    hasPragma = False
    pendingEncoding = None
    while True:
        # Try to find the next attribute after the current position.
        attr = self.getAttribute()
        if attr is None:
            return True

        name, value = attr
        if name == b"http-equiv":
            hasPragma = value == b"content-type"
            if hasPragma and pendingEncoding is not None:
                # A content attribute seen earlier already gave a codec.
                self.encoding = pendingEncoding
                return False
        elif name == b"charset":
            codec = codecName(value)
            if codec is not None:
                self.encoding = codec
                return False
        elif name == b"content":
            contentParser = ContentAttrParser(EncodingBytes(value))
            tentativeEncoding = contentParser.parse()
            if tentativeEncoding is not None:
                codec = codecName(tentativeEncoding)
                if codec is not None:
                    if hasPragma:
                        self.encoding = codec
                        return False
                    # No pragma yet: remember the codec in case
                    # http-equiv="content-type" follows.
                    pendingEncoding = codec
697
711
698
712
def handlePossibleStartTag(self):
    """Handle a possible start tag.

    Delegates to ``self.handlePossibleTag`` with ``endTag`` set to
    ``False`` and returns its result.
    """
    return self.handlePossibleTag(False)
@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):
714
728
return True
715
729
716
730
c = data .skipUntil (spacesAngleBrackets )
717
- if c == "<" :
731
+ if c == b "<" :
718
732
#return to the first step in the overall "two step" algorithm
719
733
#reprocessing the < byte
720
734
data .previous ()
@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):
726
740
return True
727
741
728
742
def handleOther(self):
    """Skip to the next ``>`` byte.

    Delegates to ``self.data.jumpTo`` with ``b">"`` and returns whatever
    that call returns.
    """
    return self.data.jumpTo(b">")
730
744
731
745
def getAttribute (self ):
732
746
"""Return a name,value pair for the next attribute in the stream,
733
747
if one is found, or None"""
734
748
data = self .data
735
749
# Step 1 (skip chars)
736
- c = data .skip (spaceCharactersBytes | frozenset ("/" ))
750
+ c = data .skip (spaceCharactersBytes | frozenset ([b"/" ]))
751
+ assert c is None or len (c ) == 1
737
752
# Step 2
738
- if c in (">" , None ):
753
+ if c in (b ">" , None ):
739
754
return None
740
755
# Step 3
741
756
attrName = []
742
757
attrValue = []
743
758
#Step 4 attribute name
744
759
while True :
745
- if c == "=" and attrName :
760
+ if c == b "=" and attrName :
746
761
break
747
762
elif c in spaceCharactersBytes :
748
763
#Step 6!
749
764
c = data .skip ()
750
- c = next (data )
751
765
break
752
- elif c in ("/" , ">" ):
753
- return "" .join (attrName ), ""
766
+ elif c in (b "/" , b ">" ):
767
+ return b "" .join (attrName ), b ""
754
768
elif c in asciiUppercaseBytes :
755
769
attrName .append (c .lower ())
756
770
elif c == None :
@@ -760,15 +774,15 @@ def getAttribute(self):
760
774
#Step 5
761
775
c = next (data )
762
776
#Step 7
763
- if c != "=" :
777
+ if c != b "=" :
764
778
data .previous ()
765
- return "" .join (attrName ), ""
779
+ return b "" .join (attrName ), b ""
766
780
#Step 8
767
781
next (data )
768
782
#Step 9
769
783
c = data .skip ()
770
784
#Step 10
771
- if c in ("'" , '"' ):
785
+ if c in (b "'" , b '"' ):
772
786
#10.1
773
787
quoteChar = c
774
788
while True :
@@ -777,15 +791,15 @@ def getAttribute(self):
777
791
#10.3
778
792
if c == quoteChar :
779
793
next (data )
780
- return "" .join (attrName ), "" .join (attrValue )
794
+ return b "" .join (attrName ), b "" .join (attrValue )
781
795
#10.4
782
796
elif c in asciiUppercaseBytes :
783
797
attrValue .append (c .lower ())
784
798
#10.5
785
799
else :
786
800
attrValue .append (c )
787
- elif c == ">" :
788
- return "" .join (attrName ), ""
801
+ elif c == b ">" :
802
+ return b "" .join (attrName ), b ""
789
803
elif c in asciiUppercaseBytes :
790
804
attrValue .append (c .lower ())
791
805
elif c is None :
@@ -796,7 +810,7 @@ def getAttribute(self):
796
810
while True :
797
811
c = next (data )
798
812
if c in spacesAngleBrackets :
799
- return "" .join (attrName ), "" .join (attrValue )
813
+ return b "" .join (attrName ), b "" .join (attrValue )
800
814
elif c in asciiUppercaseBytes :
801
815
attrValue .append (c .lower ())
802
816
elif c is None :
@@ -807,21 +821,22 @@ def getAttribute(self):
807
821
808
822
class ContentAttrParser (object ):
809
823
def __init__ (self , data ):
824
+ assert isinstance (data , bytes )
810
825
self .data = data
811
826
def parse (self ):
812
827
try :
813
828
#Check if the attr name is charset
814
829
#otherwise return
815
- self .data .jumpTo ("charset" )
830
+ self .data .jumpTo (b "charset" )
816
831
self .data .position += 1
817
832
self .data .skip ()
818
- if not self .data .currentByte == "=" :
833
+ if not self .data .currentByte == b "=" :
819
834
#If there is no = sign keep looking for attrs
820
835
return None
821
836
self .data .position += 1
822
837
self .data .skip ()
823
838
#Look for an encoding between matching quote marks
824
- if self .data .currentByte in ('"' , "'" ):
839
+ if self .data .currentByte in (b '"' , b "'" ):
825
840
quoteMark = self .data .currentByte
826
841
self .data .position += 1
827
842
oldPosition = self .data .position
@@ -845,6 +860,11 @@ def parse(self):
845
860
def codecName (encoding ):
846
861
"""Return the python codec name corresponding to an encoding or None if the
847
862
string doesn't correspond to a valid encoding."""
863
+ if isinstance (encoding , bytes ):
864
+ try :
865
+ encoding = encoding .decode ("ascii" )
866
+ except UnicodeDecodeError :
867
+ return None
848
868
if encoding :
849
869
canonicalName = ascii_punctuation_re .sub ("" , encoding ).lower ()
850
870
return encodings .get (canonicalName , None )
0 commit comments