11
11
# str() form of every entry in asciiLetters, in the same order.
asciiLettersBytes = list(map(str, asciiLetters))
12
12
# str() form of every entry in asciiUppercase, in the same order.
asciiUppercaseBytes = list(map(str, asciiUppercase))
13
13
14
# Matches, one code point per match, the characters that readChunk counts as
# "invalid-codepoint" errors: U+0001-U+0008, U+000E-U+001F, U+007F-U+009F,
# the surrogate range U+D800-U+DFFF, U+FDD0-U+FDDF, and the U+xxFFFE /
# U+xxFFFF pair at the end of each of the 17 planes.
#
# Every supplementary-plane code point is its own alternative.  The
# separator between U+BFFFE and U+BFFFF used to be missing, which made the
# pattern match only the two-character sequence and never either code point
# on its own.
#
# NOTE(review): the Unicode noncharacter block is U+FDD0-U+FDEF, but this
# pattern stops at U+FDDF -- confirm whether the narrower range is intended.
invalid_unicode_re = re.compile(
    u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|"
    u"[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|"
    u"\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|"
    u"\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|"
    u"\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|"
    u"\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|"
    u"\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|"
    u"\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|"
    u"\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|"
    u"\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
15
+
14
16
try :
15
17
from collections import deque
16
18
except ImportError :
@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
28
30
"""Initialises the HTMLInputStream.
29
31
30
32
HTMLInputStream(source, [encoding]) -> Normalized stream from source
31
- for use by the HTML5Lib .
33
+ for use by html5lib .
32
34
33
35
source can be either a file-object, local filename or a string.
34
36
@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
59
61
self .defaultEncoding = "windows-1252"
60
62
61
63
#Detect encoding iff no explicit "transport level" encoding is supplied
62
- if self .charEncoding [0 ] is None or not isValidEncoding (self .charEncoding [0 ]):
64
+ if (self .charEncoding [0 ] is None or
65
+ not isValidEncoding (self .charEncoding [0 ])):
63
66
self .charEncoding = self .detectEncoding (parseMeta , chardet )
64
67
65
68
self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
@@ -87,7 +90,7 @@ def openStream(self, source):
87
90
# Otherwise treat source as a string and convert to a file object
88
91
if isinstance (source , unicode ):
89
92
source = source .encode ('utf-8' )
90
- self .charEncoding = "utf-8"
93
+ self .charEncoding = ( "utf-8" , "certian" )
91
94
import cStringIO
92
95
stream = cStringIO .StringIO (str (source ))
93
96
return stream
@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):
262
265
#Replace null characters
263
266
for i in xrange (data .count (u"\u0000 " )):
264
267
self .errors .append ("null-character" )
268
+ for i in xrange (len (invalid_unicode_re .findall (data ))):
269
+ self .errors .append ("invalid-codepoint" )
270
+
265
271
data = data .replace (u"\u0000 " , u"\ufffd " )
266
272
#Check for CR LF broken across chunks
267
273
if (self ._lastChunkEndsWithCR and data [0 ] == "\n " ):
@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):
271
277
data = data .replace ("\r " , "\n " )
272
278
273
279
data = unicode (data )
274
- self .queue .extend ([ char for char in data ] )
280
+ self .queue .extend (list ( data ) )
275
281
276
282
self .updatePosition ()
277
283
0 commit comments