Skip to content

Commit 980e9ad

Browse files
committed
Fixed error when a \r\n pair at EOF falls across a chunk boundary
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401159
1 parent 66d242f commit 980e9ad

File tree

2 files changed

+14
-4
lines changed

2 files changed

+14
-4
lines changed

src/html5lib/inputstream.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ class HTMLInputStream(object):
2424
2525
"""
2626

27+
_defaultChunkSize = 10240
28+
2729
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
2830
"""Initialises the HTMLInputStream.
2931
@@ -260,11 +262,12 @@ def char(self):
260262
self.readChars.append(char)
261263
return char
262264

263-
def readChunk(self, chunkSize=10240):
265+
def readChunk(self, chunkSize=_defaultChunkSize):
266+
self.chunk = u""
267+
self.chunkOffset = 0
268+
264269
data = self.dataStream.read(chunkSize)
265270
if not data:
266-
self.chunk = u""
267-
self.chunkOffset = 0
268271
return False
269272
#Replace null characters
270273
for i in xrange(data.count(u"\u0000")):
@@ -276,13 +279,15 @@ def readChunk(self, chunkSize=10240):
276279
#Check for CR LF broken across chunks
277280
if (self._lastChunkEndsWithCR and data[0] == "\n"):
278281
data = data[1:]
282+
# Stop if the chunk is now empty
283+
if not data:
284+
return False
279285
self._lastChunkEndsWithCR = data[-1] == "\r"
280286
data = data.replace("\r\n", "\n")
281287
data = data.replace("\r", "\n")
282288

283289
data = unicode(data)
284290
self.chunk = data
285-
self.chunkOffset = 0
286291

287292
self.updatePosition()
288293
return True

tests/test_stream.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ def test_newlines(self):
4646
self.assertEquals(stream.charsUntil('e'),u"x")
4747
self.assertEquals(stream.position(), (4,5))
4848

49+
def test_newlines2(self):
50+
size = HTMLInputStream._defaultChunkSize
51+
stream = HTMLInputStream("\r" * size + "\n")
52+
self.assertEquals(stream.charsUntil('x'), "\n" * size)
53+
4954
def test_position(self):
5055
stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\nccc\nddd")
5156
self.assertEquals(stream.position(), (1, 0))

0 commit comments

Comments
 (0)