3
3
import types
4
4
5
5
from constants import EOF , spaceCharacters , asciiLetters , asciiUppercase
6
- from constants import encodings
6
+ from constants import encodings , ReparseException
7
7
8
8
#Non-unicode versions of constants for use in the pre-parser
9
9
spaceCharactersBytes = [str (item ) for item in spaceCharacters ]
16
16
17
17
# Cache for charsUntil()
18
18
charsUntilRegEx = {}
19
+
20
+ class BufferedStream :
21
+ """Buffering for streams that do not have buffering of their own
22
+
23
+ The buffer is implemented as a list of chunks on the assumption that
24
+ joining many strings will be slow since it is O(n**2)
25
+ """
26
+
27
+ def __init__ (self , stream ):
28
+ self .stream = stream
29
+ self .buffer = []
30
+ self .position = [- 1 ,0 ] #chunk number, offset
31
+
32
+ def tell (self ):
33
+ pos = 0
34
+ for chunk in self .buffer [:self .position [0 ]]:
35
+ pos += len (chunk )
36
+ pos += self .position [1 ]
37
+ return pos
38
+
39
+ def seek (self , pos ):
40
+ assert pos < self ._bufferedBytes ()
41
+ offset = pos
42
+ i = 0
43
+ while len (self .buffer [i ]) < offset :
44
+ offset -= pos
45
+ i += 1
46
+ self .position = [i , offset ]
47
+
48
+ def read (self , bytes ):
49
+ if not self .buffer :
50
+ return self ._readStream (bytes )
51
+ elif (self .position [0 ] == len (self .buffer ) and
52
+ self .position [1 ] == len (self .buffer [- 1 ])):
53
+ return self ._readStream (bytes )
54
+ else :
55
+ return self ._readFromBuffer (bytes )
56
+
57
+ def _bufferedBytes (self ):
58
+ return sum ([len (item ) for item in self .buffer ])
59
+
60
+ def _readStream (self , bytes ):
61
+ data = self .stream .read (bytes )
62
+ self .buffer .append (data )
63
+ self .position [0 ] += 1
64
+ self .position [1 ] = len (data )
65
+ return data
66
+
67
+ def _readFromBuffer (self , bytes ):
68
+ remainingBytes = bytes
69
+ rv = []
70
+ bufferIndex = self .position [0 ]
71
+ bufferOffset = self .position [1 ]
72
+ while bufferIndex < len (self .buffer ) and remainingBytes != 0 :
73
+ assert remainingBytes > 0
74
+ bufferedData = self .buffer [bufferIndex ]
75
+
76
+ if remainingBytes <= len (bufferedData ) - bufferOffset :
77
+ bytesToRead = remainingBytes
78
+ self .position = [bufferIndex , bufferOffset + bytesToRead ]
79
+ else :
80
+ bytesToRead = len (bufferedData ) - bufferOffset
81
+ self .position = [bufferIndex , len (bufferedData )]
82
+ bufferIndex += 1
83
+ data = rv .append (bufferedData [bufferOffset :
84
+ bufferOffset + bytesToRead ])
85
+ remainingBytes -= bytesToRead
86
+
87
+ bufferOffset = 0
88
+
89
+ if remainingBytes :
90
+ rv .append (self ._readStream (remainingBytes ))
91
+
92
+ return "" .join (rv )
93
+
94
+
19
95
20
96
class HTMLInputStream :
21
97
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
65
141
if (self .charEncoding [0 ] is None ):
66
142
self .charEncoding = self .detectEncoding (parseMeta , chardet )
67
143
144
+ self .reset ()
145
+
146
+ def reset (self ):
68
147
self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
69
148
'replace' )
70
149
@@ -100,6 +179,10 @@ def openStream(self, source):
100
179
self .charEncoding = ("utf-8" , "certain" )
101
180
import cStringIO
102
181
stream = cStringIO .StringIO (str (source ))
182
+
183
+ if not (hasattr (stream , "tell" ) and hasattr (stream , "seek" )):
184
+ stream = BufferedStream (stream )
185
+
103
186
return stream
104
187
105
188
def detectEncoding (self , parseMeta = True , chardet = True ):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
128
211
detector .feed (buffer )
129
212
detector .close ()
130
213
encoding = detector .result ['encoding' ]
131
- self .seek ("" . join ( buffers ), 0 )
214
+ self .rawStream . seek (0 )
132
215
except ImportError :
133
216
pass
134
217
# If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
146
229
147
230
def changeEncoding(self, newEncoding):
    """Switch the stream to `newEncoding` if it differs from the current one.

    `newEncoding` is normalised with codecName(); names that do not map
    to a known codec are ignored. UTF-16 variants are treated as UTF-8.
    If the normalised name equals the current encoding, the encoding is
    simply marked as certain. Otherwise the raw stream is rewound, the
    decoding reader is rebuilt for the new encoding, and ReparseException
    is raised.
    """
    newEncoding = codecName(newEncoding)
    if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
        newEncoding = "utf-8"
    if newEncoding is None:
        return
    elif newEncoding == self.charEncoding[0]:
        self.charEncoding = (self.charEncoding[0], "certain")
    else:
        # Remember the old name before overwriting it so the message
        # reports the real transition.
        oldEncoding = self.charEncoding[0]
        self.rawStream.seek(0)
        # reset() builds the reader from self.charEncoding[0], so the new
        # encoding has to be stored before it is called.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))
243
+
159
244
def detectBOM (self ):
160
245
"""Attempts to detect at BOM at the start of the stream. If
161
246
an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
182
267
183
268
# Set the read position past the BOM if one was found, otherwise
184
269
# set it to the start of the stream
185
- self .seek (string , encoding and seek or 0 )
270
+ self .rawStream . seek (encoding and seek or 0 )
186
271
187
272
return encoding
188
273
189
- def seek (self , buffer , n ):
190
- """Unget buffer[n:]"""
191
- if hasattr (self .rawStream , 'unget' ):
192
- self .rawStream .unget (buffer [n :])
193
- return
194
-
195
- if hasattr (self .rawStream , 'seek' ):
196
- try :
197
- self .rawStream .seek (n )
198
- return
199
- except IOError :
200
- pass
201
-
202
- class BufferedStream :
203
- def __init__ (self , data , stream ):
204
- self .data = data
205
- self .stream = stream
206
- def read (self , chars = - 1 ):
207
- if chars == - 1 or chars > len (self .data ):
208
- result = self .data
209
- self .data = ''
210
- if chars == - 1 :
211
- return result + self .stream .read ()
212
- else :
213
- return result + self .stream .read (chars - len (result ))
214
- elif not self .data :
215
- return self .stream .read (chars )
216
- else :
217
- result = self .data [:chars ]
218
- self .data = self .data [chars :]
219
- return result
220
- def unget (self , data ):
221
- if self .data :
222
- self .data += data
223
- else :
224
- self .data = data
225
-
226
- self .rawStream = BufferedStream (buffer [n :], self .rawStream )
227
-
228
274
def detectEncodingMeta(self):
    """Report the encoding declared by the meta element
    """
    # Sniff the first numBytesMeta bytes of the raw stream, then rewind so
    # later reads start from the beginning again.
    head = self.rawStream.read(self.numBytesMeta)
    sniffer = EncodingParser(head)
    self.rawStream.seek(0)
    declared = sniffer.getEncoding()

    # A UTF-16 declaration is reported as UTF-8 instead.
    if declared in ("utf-16", "utf-16-be", "utf-16-le"):
        return "utf-8"
    return declared
236
286
237
287
def updatePosition (self , chars ):
@@ -485,13 +535,6 @@ def getEncoding(self):
485
535
break
486
536
if not keepParsing :
487
537
break
488
- if self .encoding is not None :
489
- self .encoding = self .encoding .strip ()
490
- #Spec violation that complies with hsivonen + mjs
491
- if (ascii_punctuation_re .sub ("" , self .encoding ) in
492
- ("utf16" , "utf16be" , "utf16le" ,
493
- "utf32" , "utf32be" , "utf32le" )):
494
- self .encoding = "utf-8"
495
538
496
539
return self .encoding
497
540
@@ -666,11 +709,12 @@ def parse(self):
666
709
except StopIteration :
667
710
return None
668
711
712
+
669
713
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    # Anything that is not exactly one of the string types has no codec name.
    if encoding is None or type(encoding) not in types.StringTypes:
        return None
    # Look the label up with ASCII punctuation stripped and case folded.
    normalised = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(normalised, None)
0 commit comments