1
1
import codecs
2
2
import re
3
+ import types
3
4
4
5
from constants import EOF , spaceCharacters , asciiLetters , asciiUppercase
5
6
from constants import encodings
@@ -39,8 +40,9 @@ def __init__(self, source, encoding=None):
39
40
self .numBytesMeta = 512
40
41
#Encoding to use if no other information can be found
41
42
self .defaultEncoding = "windows-1252"
43
+
42
44
#Detect encoding iff no explicit "transport level" encoding is supplied
43
- if encoding is None :
45
+ if encoding is None or not isValidEncoding ( encoding ) :
44
46
encoding = self .detectEncoding ()
45
47
self .charEncoding = encoding
46
48
@@ -79,20 +81,17 @@ def detectEncoding(self):
79
81
#First look for a BOM
80
82
#This will also read past the BOM if present
81
83
encoding = self .detectBOM ()
82
-
83
84
#If there is no BOM need to look for meta elements with encoding
84
85
#information
85
86
if encoding is None :
86
87
encoding = self .detectEncodingMeta ()
87
-
88
88
#Guess with chardet, if available
89
89
if encoding is None :
90
90
try :
91
91
import chardet
92
92
encoding = chardet .detect (self .rawStream )['encoding' ]
93
93
except ImportError :
94
94
pass
95
-
96
95
# If all else fails use the default encoding
97
96
if encoding is None :
98
97
encoding = self .defaultEncoding
@@ -102,7 +101,7 @@ def detectEncoding(self):
102
101
103
102
if encoding .lower () in encodingSub :
104
103
encoding = encodingSub [encoding .lower ()]
105
-
104
+
106
105
return encoding
107
106
108
107
def detectBOM (self ):
@@ -301,13 +300,13 @@ def handleMeta(self):
301
300
else :
302
301
if attr [0 ] == "charset" :
303
302
tentativeEncoding = attr [1 ]
304
- if self . isValidEncoding (tentativeEncoding ):
303
+ if isValidEncoding (tentativeEncoding ):
305
304
self .encoding = tentativeEncoding
306
305
return False
307
306
elif attr [0 ] == "content" :
308
307
contentParser = ContentAttrParser (attr [1 ])
309
308
tentativeEncoding = contentParser .parse ()
310
- if self . isValidEncoding (tentativeEncoding ):
309
+ if isValidEncoding (tentativeEncoding ):
311
310
self .encoding = tentativeEncoding
312
311
return False
313
312
@@ -358,10 +357,6 @@ def getAttribute(self):
358
357
#print attr, attrParser.position, self.data[self.position]
359
358
return attr
360
359
361
- def isValidEncoding (self , encoding ):
362
- """Determine if a string is a supported encoding"""
363
- return encoding is not None and encoding .lower ().strip () in encodings
364
-
365
360
class FragmentParser (object ):
366
361
"""Helper object for parsing document fragments e.g. attributes and content
367
362
attribute values"""
@@ -517,3 +512,8 @@ def parse(self):
517
512
attrValue .extend (self .fragment [self .position ].lower ())
518
513
else :
519
514
attrValue .extend (self .fragment [self .position ])
515
+
516
def isValidEncoding(encoding):
    """Determine if a string is a supported encoding.

    Returns True only when `encoding` is a string whose lower-cased,
    whitespace-stripped form is a member of `encodings`. None and any
    non-string value yield False instead of raising.
    """
    # isinstance (rather than an exact type() comparison) also admits str
    # subclasses and makes a separate None check unnecessary. On Python 2
    # `str` is the same type as types.StringType, so the set of accepted
    # plain strings is unchanged.
    if not isinstance(encoding, str):
        return False
    return encoding.lower().strip() in encodings
0 commit comments