7
7
from constants import encodings , ReparseException
8
8
import utils
9
9
10
+ from StringIO import StringIO
11
+
12
+ try :
13
+ from io import BytesIO
14
+ except ImportError :
15
+ BytesIO = StringIO
16
+
17
+ try :
18
+ from io import BufferedIOBase
19
+ except ImportError :
20
+ class BufferedIOBase (object ):
21
+ pass
22
+
10
23
#Non-unicode versions of constants for use in the pre-parser
11
24
spaceCharactersBytes = frozenset ([str (item ) for item in spaceCharacters ])
12
25
asciiLettersBytes = frozenset ([str (item ) for item in asciiLetters ])
@@ -101,10 +114,21 @@ def _readFromBuffer(self, bytes):
101
114
rv .append (self ._readStream (remainingBytes ))
102
115
103
116
return "" .join (rv )
104
-
105
117
106
118
107
- class HTMLInputStream :
119
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Return an input stream object suited to the type of ``source``.

    source -- either an object with a ``read`` method or a string.
    encoding, parseMeta, chardet -- forwarded to HTMLBinaryInputStream;
        they are not used when ``source`` turns out to be unicode.

    Unicode input yields an HTMLUnicodeInputStream; anything else yields
    an HTMLBinaryInputStream.
    """
    # For file-like objects a zero-length read is used to learn which
    # string type the object produces; plain strings are inspected directly.
    if hasattr(source, "read"):
        probe = source.read(0)
    else:
        probe = source

    if isinstance(probe, unicode):
        return HTMLUnicodeInputStream(source)
    return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
129
+
130
+
131
+ class HTMLUnicodeInputStream :
108
132
"""Provides a unicode stream of characters to the HTMLTokenizer.
109
133
110
134
This class takes care of character encoding and removing or replacing
@@ -114,7 +138,7 @@ class HTMLInputStream:
114
138
115
139
_defaultChunkSize = 10240
116
140
117
- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
141
+ def __init__ (self , source ):
118
142
"""Initialises the HTMLInputStream.
119
143
120
144
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -142,32 +166,12 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
142
166
# List of where new lines occur
143
167
self .newLines = [0 ]
144
168
145
- self .charEncoding = (codecName (encoding ), "certain" )
146
-
147
- # Raw Stream - for unicode objects this will encode to utf-8 and set
148
- # self.charEncoding as appropriate
149
- self .rawStream = self .openStream (source )
150
-
151
- # Encoding Information
152
- #Number of bytes to use when looking for a meta element with
153
- #encoding information
154
- self .numBytesMeta = 512
155
- #Number of bytes to use when using detecting encoding using chardet
156
- self .numBytesChardet = 100
157
- #Encoding to use if no other information can be found
158
- self .defaultEncoding = "windows-1252"
159
-
160
- #Detect encoding iff no explicit "transport level" encoding is supplied
161
- if (self .charEncoding [0 ] is None ):
162
- self .charEncoding = self .detectEncoding (parseMeta , chardet )
163
-
169
+ self .charEncoding = ("utf-8" , "certain" )
170
+ self .dataStream = self .openStream (source )
164
171
165
172
self .reset ()
166
173
167
174
def reset (self ):
168
- self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
169
- 'replace' )
170
-
171
175
self .chunk = u""
172
176
self .chunkSize = 0
173
177
self .chunkOffset = 0
@@ -191,128 +195,16 @@ def openStream(self, source):
191
195
if hasattr (source , 'read' ):
192
196
stream = source
193
197
else :
194
- # Otherwise treat source as a string and convert to a file object
195
- if isinstance (source , unicode ):
196
- # XXX: we should handle lone surrogates here
197
- source = source .encode ('utf-8' , errors = "replace" )
198
- self .charEncoding = ("utf-8" , "certain" )
199
- try :
200
- from io import BytesIO
201
- except :
202
- try :
203
- # 2to3 converts this line to: from io import StringIO
204
- from cStringIO import StringIO as BytesIO
205
- except :
206
- from StringIO import StringIO as BytesIO
207
- stream = BytesIO (source )
198
+ stream = StringIO (source )
208
199
209
- if (not (hasattr (stream , "tell" ) and hasattr (stream , "seek" )) or
200
+ if (#not isinstance(stream, BufferedIOBase) and
201
+ not (hasattr (stream , "tell" ) and
202
+ hasattr (stream , "seek" )) or
210
203
stream is sys .stdin ):
211
204
stream = BufferedStream (stream )
212
205
213
206
return stream
214
207
215
- def detectEncoding (self , parseMeta = True , chardet = True ):
216
- #First look for a BOM
217
- #This will also read past the BOM if present
218
- encoding = self .detectBOM ()
219
- confidence = "certain"
220
- #If there is no BOM need to look for meta elements with encoding
221
- #information
222
- if encoding is None and parseMeta :
223
- encoding = self .detectEncodingMeta ()
224
- confidence = "tentative"
225
- #Guess with chardet, if avaliable
226
- if encoding is None and chardet :
227
- confidence = "tentative"
228
- try :
229
- from chardet .universaldetector import UniversalDetector
230
- buffers = []
231
- detector = UniversalDetector ()
232
- while not detector .done :
233
- buffer = self .rawStream .read (self .numBytesChardet )
234
- assert isinstance (buffer , bytes )
235
- if not buffer :
236
- break
237
- buffers .append (buffer )
238
- detector .feed (buffer )
239
- detector .close ()
240
- encoding = detector .result ['encoding' ]
241
- self .rawStream .seek (0 )
242
- except ImportError :
243
- pass
244
- # If all else fails use the default encoding
245
- if encoding is None :
246
- confidence = "tentative"
247
- encoding = self .defaultEncoding
248
-
249
- #Substitute for equivalent encodings:
250
- encodingSub = {"iso-8859-1" :"windows-1252" }
251
-
252
- if encoding .lower () in encodingSub :
253
- encoding = encodingSub [encoding .lower ()]
254
-
255
- return encoding , confidence
256
-
257
- def changeEncoding (self , newEncoding ):
258
- newEncoding = codecName (newEncoding )
259
- if newEncoding in ("utf-16" , "utf-16-be" , "utf-16-le" ):
260
- newEncoding = "utf-8"
261
- if newEncoding is None :
262
- return
263
- elif newEncoding == self .charEncoding [0 ]:
264
- self .charEncoding = (self .charEncoding [0 ], "certain" )
265
- else :
266
- self .rawStream .seek (0 )
267
- self .reset ()
268
- self .charEncoding = (newEncoding , "certain" )
269
- raise ReparseException , "Encoding changed from %s to %s" % (self .charEncoding [0 ], newEncoding )
270
-
271
- def detectBOM (self ):
272
- """Attempts to detect at BOM at the start of the stream. If
273
- an encoding can be determined from the BOM return the name of the
274
- encoding otherwise return None"""
275
- bomDict = {
276
- codecs .BOM_UTF8 : 'utf-8' ,
277
- codecs .BOM_UTF16_LE : 'utf-16-le' , codecs .BOM_UTF16_BE : 'utf-16-be' ,
278
- codecs .BOM_UTF32_LE : 'utf-32-le' , codecs .BOM_UTF32_BE : 'utf-32-be'
279
- }
280
-
281
- # Go to beginning of file and read in 4 bytes
282
- string = self .rawStream .read (4 )
283
- assert isinstance (string , bytes )
284
-
285
- # Try detecting the BOM using bytes from the string
286
- encoding = bomDict .get (string [:3 ]) # UTF-8
287
- seek = 3
288
- if not encoding :
289
- # Need to detect UTF-32 before UTF-16
290
- encoding = bomDict .get (string ) # UTF-32
291
- seek = 4
292
- if not encoding :
293
- encoding = bomDict .get (string [:2 ]) # UTF-16
294
- seek = 2
295
-
296
- # Set the read position past the BOM if one was found, otherwise
297
- # set it to the start of the stream
298
- self .rawStream .seek (encoding and seek or 0 )
299
-
300
- return encoding
301
-
302
- def detectEncodingMeta (self ):
303
- """Report the encoding declared by the meta element
304
- """
305
- buffer = self .rawStream .read (self .numBytesMeta )
306
- assert isinstance (buffer , bytes )
307
- parser = EncodingParser (buffer )
308
- self .rawStream .seek (0 )
309
- encoding = parser .getEncoding ()
310
-
311
- if encoding in ("utf-16" , "utf-16-be" , "utf-16-le" ):
312
- encoding = "utf-8"
313
-
314
- return encoding
315
-
316
208
def _position (self , offset ):
317
209
chunk = self .chunk
318
210
nLines = chunk .count (u'\n ' , 0 , offset )
@@ -475,6 +367,177 @@ def unget(self, char):
475
367
self .chunkOffset -= 1
476
368
assert self .chunk [self .chunkOffset ] == char
477
369
370
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLBinaryInputStream.

        HTMLBinaryInputStream(source, [encoding]) -> Normalized stream from
        source for use by html5lib.

        source can be either a file-like object (anything with a ``read``
        method) or a byte string; a byte string is wrapped in a BytesIO.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Fall back to the chardet library (when it is installed)
        if neither a BOM nor a <meta> declaration gives an encoding

        """
        # Raw Stream - a seekable file-like object that yields bytes
        self.rawStream = self.openStream(source)

        # Initialise the shared state first.  The superclass constructor
        # unconditionally sets self.charEncoding to ("utf-8", "certain"),
        # so the real encoding must be assigned *after* it has run or the
        # supplied/detected encoding would be thrown away.
        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to feed to chardet per read when detecting the
        # encoding
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is
        # supplied
        if self.charEncoding[0] is None:
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Rebuild the decoding reader now that the final encoding is known
        self.reset()

    def reset(self):
        """Recreate the decoding reader for the current encoding and reset
        the superclass state.

        Undecodable byte sequences are handled with the 'replace' error
        handler.
        """
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file-like object or a byte string.  The
        result is wrapped in a BufferedStream when it lacks ``tell``/``seek``
        or is ``sys.stdin``.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        if (not (hasattr(stream, "tell") and hasattr(stream, "seek")) or
                stream is sys.stdin):
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream.

        Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
        chardet library (if chardet is true and the library is importable)
        and finally self.defaultEncoding.

        Returns an (encoding, confidence) tuple where confidence is
        "certain" for a BOM and "tentative" otherwise.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                # chardet is optional; fall through to the default encoding
                pass
            else:
                detector = UniversalDetector()
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a later declaration was found.

        Must only be called while the current encoding is not "certain".
        UTF-16 variants are mapped to utf-8.  If the name is not recognised
        by codecName nothing happens; if it equals the current encoding the
        confidence is upgraded to "certain"; otherwise the raw stream is
        rewound, the decoder rebuilt and ReparseException is raised so the
        caller can start again.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return

        oldEncoding = self.charEncoding[0]
        if newEncoding == oldEncoding:
            self.charEncoding = (oldEncoding, "certain")
        else:
            self.rawStream.seek(0)
            # Record the new encoding before reset() so that the decoder
            # is rebuilt with it rather than with the old one.
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" %
                                   (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read in the first 4 bytes of the stream
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try UTF-8 (3 bytes) first, then UTF-32 (4 bytes) before UTF-16
        # (2 bytes): the UTF-32-LE BOM starts with the UTF-16-LE BOM.
        for bomLength in (3, 4, 2):
            encoding = bomDict.get(string[:bomLength])
            if encoding:
                # Leave the read position just past the BOM
                self.rawStream.seek(bomLength)
                return encoding

        # No BOM found: go back to the start of the stream
        self.rawStream.seek(0)
        return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Only the first self.numBytesMeta bytes are examined; the stream is
        rewound afterwards.  UTF-16 variants are reported as utf-8.
        """
        head = self.rawStream.read(self.numBytesMeta)
        assert isinstance(head, bytes)
        parser = EncodingParser(head)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
540
+
478
541
class EncodingBytes (str ):
479
542
"""String-like object with an associated position and various extra methods
480
543
If the position is ever greater than the string length then an exception is
0 commit comments