Skip to content

Commit c1029a4

Browse files
committed
First hack at splitting InputStream up into a binary and Unicode class pair.
1 parent 67bc17b commit c1029a4

File tree

1 file changed

+204
-141
lines changed

1 file changed

+204
-141
lines changed

html5lib/inputstream.py

Lines changed: 204 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@
77
from constants import encodings, ReparseException
88
import utils
99

10+
from StringIO import StringIO
11+
12+
try:
13+
from io import BytesIO
14+
except ImportError:
15+
BytesIO = StringIO
16+
17+
try:
18+
from io import BufferedIOBase
19+
except ImportError:
20+
class BufferedIOBase(object):
21+
pass
22+
1023
#Non-unicode versions of constants for use in the pre-parser
1124
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
1225
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
@@ -101,10 +114,21 @@ def _readFromBuffer(self, bytes):
101114
rv.append(self._readStream(remainingBytes))
102115

103116
return "".join(rv)
104-
105117

106118

107-
class HTMLInputStream:
119+
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Build the input stream object appropriate for ``source``.

    If ``source`` has a ``read`` attribute, the result of a zero-length
    read decides its type; otherwise ``source`` itself is inspected.
    Unicode input yields an HTMLUnicodeInputStream. Anything else yields an
    HTMLBinaryInputStream, which is the only one to receive ``encoding``,
    ``parseMeta`` and ``chardet``.
    """
    # read(0) exposes the type a file-like object produces.
    probe = source.read(0) if hasattr(source, "read") else source

    if not isinstance(probe, unicode):
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
    return HTMLUnicodeInputStream(source)
129+
130+
131+
class HTMLUnicodeInputStream:
108132
"""Provides a unicode stream of characters to the HTMLTokenizer.
109133
110134
This class takes care of character encoding and removing or replacing
@@ -114,7 +138,7 @@ class HTMLInputStream:
114138

115139
_defaultChunkSize = 10240
116140

117-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
141+
def __init__(self, source):
118142
"""Initialises the HTMLInputStream.
119143
120144
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -142,32 +166,12 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
142166
# List of where new lines occur
143167
self.newLines = [0]
144168

145-
self.charEncoding = (codecName(encoding), "certain")
146-
147-
# Raw Stream - for unicode objects this will encode to utf-8 and set
148-
# self.charEncoding as appropriate
149-
self.rawStream = self.openStream(source)
150-
151-
# Encoding Information
152-
#Number of bytes to use when looking for a meta element with
153-
#encoding information
154-
self.numBytesMeta = 512
155-
#Number of bytes to use when using detecting encoding using chardet
156-
self.numBytesChardet = 100
157-
#Encoding to use if no other information can be found
158-
self.defaultEncoding = "windows-1252"
159-
160-
#Detect encoding iff no explicit "transport level" encoding is supplied
161-
if (self.charEncoding[0] is None):
162-
self.charEncoding = self.detectEncoding(parseMeta, chardet)
163-
169+
self.charEncoding = ("utf-8", "certain")
170+
self.dataStream = self.openStream(source)
164171

165172
self.reset()
166173

167174
def reset(self):
168-
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
169-
'replace')
170-
171175
self.chunk = u""
172176
self.chunkSize = 0
173177
self.chunkOffset = 0
@@ -191,128 +195,16 @@ def openStream(self, source):
191195
if hasattr(source, 'read'):
192196
stream = source
193197
else:
194-
# Otherwise treat source as a string and convert to a file object
195-
if isinstance(source, unicode):
196-
# XXX: we should handle lone surrogates here
197-
source = source.encode('utf-8', errors="replace")
198-
self.charEncoding = ("utf-8", "certain")
199-
try:
200-
from io import BytesIO
201-
except:
202-
try:
203-
# 2to3 converts this line to: from io import StringIO
204-
from cStringIO import StringIO as BytesIO
205-
except:
206-
from StringIO import StringIO as BytesIO
207-
stream = BytesIO(source)
198+
stream = StringIO(source)
208199

209-
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
200+
if (#not isinstance(stream, BufferedIOBase) and
201+
not(hasattr(stream, "tell") and
202+
hasattr(stream, "seek")) or
210203
stream is sys.stdin):
211204
stream = BufferedStream(stream)
212205

213206
return stream
214207

215-
def detectEncoding(self, parseMeta=True, chardet=True):
216-
#First look for a BOM
217-
#This will also read past the BOM if present
218-
encoding = self.detectBOM()
219-
confidence = "certain"
220-
#If there is no BOM need to look for meta elements with encoding
221-
#information
222-
if encoding is None and parseMeta:
223-
encoding = self.detectEncodingMeta()
224-
confidence = "tentative"
225-
#Guess with chardet, if avaliable
226-
if encoding is None and chardet:
227-
confidence = "tentative"
228-
try:
229-
from chardet.universaldetector import UniversalDetector
230-
buffers = []
231-
detector = UniversalDetector()
232-
while not detector.done:
233-
buffer = self.rawStream.read(self.numBytesChardet)
234-
assert isinstance(buffer, bytes)
235-
if not buffer:
236-
break
237-
buffers.append(buffer)
238-
detector.feed(buffer)
239-
detector.close()
240-
encoding = detector.result['encoding']
241-
self.rawStream.seek(0)
242-
except ImportError:
243-
pass
244-
# If all else fails use the default encoding
245-
if encoding is None:
246-
confidence="tentative"
247-
encoding = self.defaultEncoding
248-
249-
#Substitute for equivalent encodings:
250-
encodingSub = {"iso-8859-1":"windows-1252"}
251-
252-
if encoding.lower() in encodingSub:
253-
encoding = encodingSub[encoding.lower()]
254-
255-
return encoding, confidence
256-
257-
def changeEncoding(self, newEncoding):
258-
newEncoding = codecName(newEncoding)
259-
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
260-
newEncoding = "utf-8"
261-
if newEncoding is None:
262-
return
263-
elif newEncoding == self.charEncoding[0]:
264-
self.charEncoding = (self.charEncoding[0], "certain")
265-
else:
266-
self.rawStream.seek(0)
267-
self.reset()
268-
self.charEncoding = (newEncoding, "certain")
269-
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
270-
271-
def detectBOM(self):
272-
"""Attempts to detect at BOM at the start of the stream. If
273-
an encoding can be determined from the BOM return the name of the
274-
encoding otherwise return None"""
275-
bomDict = {
276-
codecs.BOM_UTF8: 'utf-8',
277-
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
278-
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
279-
}
280-
281-
# Go to beginning of file and read in 4 bytes
282-
string = self.rawStream.read(4)
283-
assert isinstance(string, bytes)
284-
285-
# Try detecting the BOM using bytes from the string
286-
encoding = bomDict.get(string[:3]) # UTF-8
287-
seek = 3
288-
if not encoding:
289-
# Need to detect UTF-32 before UTF-16
290-
encoding = bomDict.get(string) # UTF-32
291-
seek = 4
292-
if not encoding:
293-
encoding = bomDict.get(string[:2]) # UTF-16
294-
seek = 2
295-
296-
# Set the read position past the BOM if one was found, otherwise
297-
# set it to the start of the stream
298-
self.rawStream.seek(encoding and seek or 0)
299-
300-
return encoding
301-
302-
def detectEncodingMeta(self):
303-
"""Report the encoding declared by the meta element
304-
"""
305-
buffer = self.rawStream.read(self.numBytesMeta)
306-
assert isinstance(buffer, bytes)
307-
parser = EncodingParser(buffer)
308-
self.rawStream.seek(0)
309-
encoding = parser.getEncoding()
310-
311-
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
312-
encoding = "utf-8"
313-
314-
return encoding
315-
316208
def _position(self, offset):
317209
chunk = self.chunk
318210
nLines = chunk.count(u'\n', 0, offset)
@@ -475,6 +367,177 @@ def unget(self, char):
475367
self.chunkOffset -= 1
476368
assert self.chunk[self.chunkOffset] == char
477369

370+
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes a byte-oriented source, determines its character
    encoding, and decodes it through a codecs stream reader that replaces
    incorrect byte-sequences. Everything else is inherited from
    HTMLUnicodeInputStream.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLBinaryInputStream.

        HTMLBinaryInputStream(source, [encoding]) -> Normalized stream from
        source for use by html5lib.

        source can be either a file-like object (anything with a ``read``
        attribute) or a byte string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Fall back to the chardet library (if it can be imported)
        when neither a BOM nor a <meta> declaration gives an encoding

        """
        self.charEncoding = (codecName(encoding), "certain")

        # Raw byte stream; the decoding reader built in reset() wraps it.
        self.rawStream = self.openStream(source)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to read per step when detecting the encoding
        # using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is
        # supplied
        if self.charEncoding[0] is None:
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # The superclass constructor unconditionally sets charEncoding to
        # ("utf-8", "certain") and builds the reader from it, so remember
        # the real value, restore it afterwards and rebuild the reader.
        resolvedEncoding = self.charEncoding
        HTMLUnicodeInputStream.__init__(self, self.rawStream)
        self.charEncoding = resolvedEncoding
        self.reset()

    def reset(self):
        """Rebuild the decoding reader for the current encoding, then reset
        the state managed by HTMLUnicodeInputStream."""
        readerFactory = codecs.getreader(self.charEncoding[0])
        # 'replace' substitutes undecodable bytes instead of raising.
        self.dataStream = readerFactory(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file-like object or a byte string; a string
        is wrapped in a BytesIO.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        # Encoding detection needs tell()/seek(); wrap streams that lack
        # them (and sys.stdin) in a BufferedStream.
        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
            stream is sys.stdin):
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream.

        Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
        chardet library (if chardet and it can be imported), and finally
        self.defaultEncoding.

        Returns an (encoding, confidence) tuple; confidence is "certain"
        when a BOM was found and "tentative" otherwise.

        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                while not detector.done:
                    data = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(data, bytes)
                    if not data:
                        break
                    detector.feed(data)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so decoding starts from the beginning again.
                self.rawStream.seek(0)
            except ImportError:
                # chardet is optional; fall through to the default.
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a later declaration was found.

        Does nothing if newEncoding is not recognised by codecName.  If it
        matches the current encoding the confidence is upgraded to
        "certain".  Otherwise the raw stream is rewound, the reader is
        rebuilt for the new encoding and ReparseException is raised.

        """
        # Only a tentative encoding may be replaced.
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Keep the old name for the message before it is overwritten.
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # Store the new encoding *before* reset() so that the reader
            # is rebuilt with the new codec rather than the old one.
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" %
                                   (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read in 4 bytes (the longest BOM) from the current position
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
        else:
            self.rawStream.seek(0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        data = self.rawStream.read(self.numBytesMeta)
        assert isinstance(data, bytes)
        parser = EncodingParser(data)
        # Rewind so the sniffed bytes are decoded again later.
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # The UTF-16 family is replaced by utf-8 here.
        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
540+
478541
class EncodingBytes(str):
479542
"""String-like object with an associated position and various extra methods
480543
If the position is ever greater than the string length then an exception is

0 commit comments

Comments
 (0)