Skip to content

Commit d214d0d

Browse files
committed
WIP of new inputstream stuff
1 parent da9459f commit d214d0d

File tree

1 file changed

+137
-0
lines changed

1 file changed

+137
-0
lines changed

html5lib/inputstream2.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from codecs import getincrementaldecoder
2+
from io import TextIOBase, UnsupportedOperation
3+
4+
class ChangeableEncodingStream(TextIOBase):
    """A text IO type that supports changing encoding.

    Bytes are pulled from ``buffer`` in blocks of ``chunkSize`` and decoded
    with an incremental decoder.  For every block the raw bytes, the decoder
    state *before* the block and the number of characters it decoded to are
    recorded, so that when the encoding is changed the part of the current
    block that has not been handed out yet can be decoded again with the new
    encoding.
    """

    def __init__(self, buffer, encoding=None, detectEncoding=None,
                 errors=None):
        """Create a stream reading from ``buffer``.

        :arg buffer: binary file-like object providing ``read(size)``.
        :arg encoding: name of the initial encoding; may be None, in which
            case an encoding must be assigned before the first ``read``.
        :arg detectEncoding: optional callable used only when ``encoding`` is
            None; called as ``detectEncoding(buffer)`` and expected to return
            an ``(encoding, remaining)`` pair.
        :arg errors: decoder error handling scheme, "strict" when None.
        """
        super(ChangeableEncodingStream, self).__init__()

        # Detect encoding if we have to
        if encoding is None and detectEncoding is not None:
            encoding, remaining = detectEncoding(buffer)
        else:
            remaining = None

        # Set properties on obj
        self.buffer = buffer
        self.decoder = None
        self._encoding = encoding
        self._errors = errors if errors is not None else "strict"
        # NOTE(review): presumably the bytes detectEncoding() consumed from
        # buffer while sniffing; nothing in this class reads them back yet.
        # Confirm the contract of detectEncoding() before wiring this in.
        self.remaining = remaining
        # Raw bytes of every block read from buffer so far
        self.rawChunks = []
        # chunkStates[i] is the decoder state before rawChunks[i] was
        # decoded; the last entry is the state after the newest block
        self.chunkStates = [(b"", 0)]
        # Number of characters each block decoded to
        self.chunkLengths = []
        # Index of the block currently being handed out (-1: none read yet)
        self.currentChunk = -1
        # Number of bytes requested from buffer per read
        self.chunkSize = 512
        # Decoded text of the current block and how much of it was consumed
        self.decodedChunk = None
        self.decodedChunkOffset = 0

        # Create initial decoder
        if encoding:
            self.decoder = getincrementaldecoder(encoding)(self._errors)
            # Not every codec starts out in the (b"", 0) state, so record
            # the real one
            self.chunkStates = [self.decoder.getstate()]

    @property
    def encoding(self):
        """Name of the encoding currently used for decoding (may be None)."""
        return self._encoding

    @encoding.setter
    def encoding(self, v):
        """Switch to encoding ``v`` for everything not yet returned by read.

        Raises LookupError (from the codecs module) for an unknown encoding,
        in which case the stream is left untouched.
        """
        newDecoder = getincrementaldecoder(v)(self._errors)
        initialState = newDecoder.getstate()

        if self.rawChunks:
            index = self.currentChunk
            rawChunk = self.rawChunks[index]
            consumedChars = self.decodedChunkOffset

            # Replay the current block through the old decoder one byte at a
            # time to find how many bytes produced the characters that were
            # already handed out.  Slicing (rather than iterating) keeps
            # every item a bytes object on Python 3 as well.
            self.decoder.setstate(self.chunkStates[index])
            charCount = 0
            byteCount = 0
            while charCount < consumedChars and byteCount < len(rawChunk):
                charCount += len(
                    self.decoder.decode(rawChunk[byteCount:byteCount + 1]))
                byteCount += 1

            unconsumed = rawChunk[byteCount:]
            decoded = newDecoder.decode(unconsumed, False)

            # Split the block at the switch point so the bookkeeping (and
            # hence tell() and any later encoding change) stays consistent:
            # the consumed part keeps the old decoder's state, the rest
            # becomes a new current block owned by the new decoder.
            self.rawChunks[index:] = [rawChunk[:byteCount], unconsumed]
            self.chunkStates[index + 1:] = [initialState,
                                            newDecoder.getstate()]
            self.chunkLengths[index:] = [consumedChars, len(decoded)]
            self.currentChunk = index + 1
            self.decodedChunk = decoded
        else:
            # Nothing read yet: just start from the new decoder's state
            self.chunkStates = [initialState]
            self.decodedChunk = None

        self.decodedChunkOffset = 0
        self.decoder = newDecoder
        self._encoding = v

    @property
    def errors(self):
        """The decoder error handling scheme."""
        return self._errors

    @errors.setter
    def errors(self, v):
        """Set the error handling scheme, even before an encoding is known."""
        # Remember the value so a decoder created later picks it up
        self._errors = v
        if self.decoder is not None:
            self.decoder.errors = v

    @property
    def newlines(self):
        """Not supported; always raises UnsupportedOperation."""
        raise UnsupportedOperation()

    def detach(self):
        """Separate and return the underlying buffer.

        ``self.buffer`` is None afterwards, so the stream can no longer be
        read from.
        """
        buffer = self.buffer
        self.buffer = None
        return buffer

    def read(self, n=-1):
        """Read and return at most ``n`` characters as a single string.

        When ``n`` is None or negative everything up to EOF is read.  Fewer
        than ``n`` characters (possibly none) are returned once the
        underlying buffer is exhausted.

        Raises ValueError if the buffer has been detached or no encoding has
        been set yet.
        """
        if self.buffer is None:
            raise ValueError("underlying buffer has been detached")
        if self.decoder is None:
            raise ValueError("cannot read before an encoding has been set")

        if n is None or n < 0:
            n = float("Infinity")

        data = []
        remaining = n

        # Read what we can from the current chunk
        chunk = self.decodedChunk
        if chunk:
            offset = self.decodedChunkOffset
            remainderOfChunk = len(chunk) - offset
            if remaining < remainderOfChunk:
                data.append(chunk[offset:offset + remaining])
                self.decodedChunkOffset += remaining
                remaining = 0
            else:
                data.append(chunk[offset:])
                self.decodedChunkOffset += remainderOfChunk
                remaining -= remainderOfChunk

        # Read more data
        chunkSize = self.chunkSize
        while remaining > 0:
            readBytes = self.buffer.read(chunkSize)
            if not readBytes:
                # EOF: flush whatever the decoder is still holding on to
                readBytes = b""
                decoded = self.decoder.decode(readBytes, True)
                if not decoded:
                    break
            else:
                decoded = self.decoder.decode(readBytes, False)
            self.rawChunks.append(readBytes)
            self.chunkStates.append(self.decoder.getstate())
            self.chunkLengths.append(len(decoded))
            self.currentChunk += 1
            self.decodedChunk = decoded
            if remaining < len(decoded):
                data.append(decoded[:remaining])
                # Record the offset before zeroing the counter
                self.decodedChunkOffset = remaining
                remaining = 0
            else:
                data.append(decoded)
                remaining -= len(decoded)
                self.decodedChunkOffset = len(decoded)

        return "".join(data)

    def readline(self, limit=-1):
        """Not implemented yet; raises UnsupportedOperation."""
        # TODO: this
        raise UnsupportedOperation("readline")

    def seek(self, offset, whence=0):
        """Not implemented yet; raises UnsupportedOperation."""
        # TODO: this
        raise UnsupportedOperation("seek")

    def tell(self):
        """Return the number of characters handed out by read so far."""
        # max() keeps the "nothing read yet" marker (-1) from being used as
        # a negative slice bound
        finishedChunks = self.chunkLengths[:max(self.currentChunk, 0)]
        return sum(finishedChunks) + self.decodedChunkOffset

0 commit comments

Comments
 (0)