Skip to content

Commit 165ea70

Browse files
committed
Improve check for validity of encoding
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40512
1 parent 8abc31b commit 165ea70

File tree

1 file changed

+11
-11
lines changed

1 file changed

+11
-11
lines changed

src/inputstream.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import codecs
22
import re
3+
import types
34

45
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
56
from constants import encodings
@@ -39,8 +40,9 @@ def __init__(self, source, encoding=None):
3940
self.numBytesMeta = 512
4041
#Encoding to use if no other information can be found
4142
self.defaultEncoding = "windows-1252"
43+
4244
#Detect encoding if no valid explicit "transport level" encoding is supplied
43-
if encoding is None:
45+
if encoding is None or not isValidEncoding(encoding):
4446
encoding = self.detectEncoding()
4547
self.charEncoding = encoding
4648

@@ -79,20 +81,17 @@ def detectEncoding(self):
7981
#First look for a BOM
8082
#This will also read past the BOM if present
8183
encoding = self.detectBOM()
82-
8384
#If there is no BOM need to look for meta elements with encoding
8485
#information
8586
if encoding is None:
8687
encoding = self.detectEncodingMeta()
87-
8888
#Guess with chardet, if available
8989
if encoding is None:
9090
try:
9191
import chardet
9292
encoding = chardet.detect(self.rawStream)['encoding']
9393
except ImportError:
9494
pass
95-
9695
# If all else fails use the default encoding
9796
if encoding is None:
9897
encoding = self.defaultEncoding
@@ -102,7 +101,7 @@ def detectEncoding(self):
102101

103102
if encoding.lower() in encodingSub:
104103
encoding = encodingSub[encoding.lower()]
105-
104+
106105
return encoding
107106

108107
def detectBOM(self):
@@ -301,13 +300,13 @@ def handleMeta(self):
301300
else:
302301
if attr[0] == "charset":
303302
tentativeEncoding = attr[1]
304-
if self.isValidEncoding(tentativeEncoding):
303+
if isValidEncoding(tentativeEncoding):
305304
self.encoding = tentativeEncoding
306305
return False
307306
elif attr[0] == "content":
308307
contentParser = ContentAttrParser(attr[1])
309308
tentativeEncoding = contentParser.parse()
310-
if self.isValidEncoding(tentativeEncoding):
309+
if isValidEncoding(tentativeEncoding):
311310
self.encoding = tentativeEncoding
312311
return False
313312

@@ -358,10 +357,6 @@ def getAttribute(self):
358357
#print attr, attrParser.position, self.data[self.position]
359358
return attr
360359

361-
def isValidEncoding(self, encoding):
362-
"""Determine if a string is a supported encoding"""
363-
return encoding is not None and encoding.lower().strip() in encodings
364-
365360
class FragmentParser(object):
366361
"""Helper object for parsing document fragments e.g. attributes and content
367362
attribute values"""
@@ -517,3 +512,8 @@ def parse(self):
517512
attrValue.extend(self.fragment[self.position].lower())
518513
else:
519514
attrValue.extend(self.fragment[self.position])
515+
516+
def isValidEncoding(encoding):
    """Determine if a string is a supported encoding

    encoding - candidate encoding name; may be None or a non-string value,
               in which case it is reported as invalid rather than raising.

    Returns True only when encoding is a string whose lower-cased,
    whitespace-stripped form is found in the module-level `encodings`
    collection imported from constants.
    """
    if encoding is None:
        return False
    # Use isinstance with basestring rather than an exact type comparison
    # against types.StringType: the exact comparison rejected unicode
    # objects and str subclasses, so a perfectly valid name such as
    # u"utf-8" was treated as unsupported.
    if not isinstance(encoding, basestring):
        return False
    return encoding.lower().strip() in encodings

0 commit comments

Comments
 (0)