Skip to content

Commit ff6fd06

Browse files
author
James Graham
committed
Phase one of the null character handling changes
1 parent 9ab28eb commit ff6fd06

File tree

2 files changed

+177
-63
lines changed

2 files changed

+177
-63
lines changed

html5lib/inputstream.py

Lines changed: 1 addition & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -362,7 +362,7 @@ def readChunk(self, chunkSize=None):
362362
self.reportCharacterErrors(data)
363363

364364
# Replace invalid characters
365-
data = data.replace(u"\u0000", u"\ufffd")
365+
# Note U+0000 is dealt with in the tokenizer
366366
data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
367367

368368
data = data.replace(u"\r\n", u"\n")
@@ -374,16 +374,12 @@ def readChunk(self, chunkSize=None):
374374
return True
375375

376376
def characterErrorsUCS4(self, data):
377-
for i in xrange(data.count(u"\u0000")):
378-
self.errors.append("null-character")
379377
for i in xrange(len(invalid_unicode_re.findall(data))):
380378
self.errors.append("invalid-codepoint")
381379

382380
def characterErrorsUCS2(self, data):
383381
#Someone picked the wrong compile option
384382
#You lose
385-
for i in xrange(data.count(u"\u0000")):
386-
self.errors.append("null-character")
387383
skip = False
388384
import sys
389385
for match in invalid_unicode_re.finditer(data):
@@ -452,24 +448,9 @@ def charsUntil(self, characters, opposite = False):
452448
r = u"".join(rv)
453449
return r
454450

455-
def charsUntilEOF(self):
456-
""" Returns a string of characters from the stream up to EOF."""
457-
458-
rv = []
459-
460-
while True:
461-
rv.append(self.chunk[self.chunkOffset:])
462-
if not self.readChunk():
463-
# Reached EOF
464-
break
465-
466-
r = u"".join(rv)
467-
return r
468-
469451
def unget(self, char):
470452
# Only one character is allowed to be ungotten at once - it must
471453
# be consumed again before any further call to unget
472-
473454
if char is not None:
474455
if self.chunkOffset == 0:
475456
# unget is called quite rarely, so it's a good idea to do

0 commit comments

Comments
 (0)