@@ -362,7 +362,7 @@ def readChunk(self, chunkSize=None):
362
362
self .reportCharacterErrors (data )
363
363
364
364
# Replace invalid characters
365
- data = data . replace ( u" \u0000 " , u" \ufffd " )
365
+ # Note U+0000 is dealt with in the tokenizer
366
366
data = self .replaceCharactersRegexp .sub (u"\ufffd " , data )
367
367
368
368
data = data .replace (u"\r \n " , u"\n " )
@@ -374,16 +374,12 @@ def readChunk(self, chunkSize=None):
374
374
return True
375
375
376
376
def characterErrorsUCS4 (self , data ):
377
- for i in xrange (data .count (u"\u0000 " )):
378
- self .errors .append ("null-character" )
379
377
for i in xrange (len (invalid_unicode_re .findall (data ))):
380
378
self .errors .append ("invalid-codepoint" )
381
379
382
380
def characterErrorsUCS2 (self , data ):
383
381
#Someone picked the wrong compile option
384
382
#You lose
385
- for i in xrange (data .count (u"\u0000 " )):
386
- self .errors .append ("null-character" )
387
383
skip = False
388
384
import sys
389
385
for match in invalid_unicode_re .finditer (data ):
@@ -452,24 +448,9 @@ def charsUntil(self, characters, opposite = False):
452
448
r = u"" .join (rv )
453
449
return r
454
450
455
- def charsUntilEOF (self ):
456
- """ Returns a string of characters from the stream up to EOF."""
457
-
458
- rv = []
459
-
460
- while True :
461
- rv .append (self .chunk [self .chunkOffset :])
462
- if not self .readChunk ():
463
- # Reached EOF
464
- break
465
-
466
- r = u"" .join (rv )
467
- return r
468
-
469
451
def unget (self , char ):
470
452
# Only one character is allowed to be ungotten at once - it must
471
453
# be consumed again before any further call to unget
472
-
473
454
if char is not None :
474
455
if self .chunkOffset == 0 :
475
456
# unget is called quite rarely, so it's a good idea to do
0 commit comments