Skip to content

Commit 02852bd

Browse files
committed
Add a variant of charsUntil not using RE
1 parent 41c90ae commit 02852bd

File tree

1 file changed

+38
-1
lines changed

1 file changed

+38
-1
lines changed

html5lib/inputstream.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def characterErrorsUCS2(self, data):
306306
skip = False
307307
self.errors.append("invalid-codepoint")
308308

309-
def charsUntil(self, characters, opposite=False):
309+
def charsUntilRe(self, characters, opposite=False):
310310
""" Returns a string of characters from the stream up to but not
311311
including any character in 'characters' or EOF. 'characters' must be
312312
a container that supports the 'in' method and iteration over its
@@ -353,6 +353,43 @@ def charsUntil(self, characters, opposite=False):
353353
r = "".join(rv)
354354
return r
355355

356+
def charsUntilNoRe(self, characters, opposite=False):
    """Return a string of characters from the stream up to but not
    including any character in 'characters' or EOF, without using re.

    This is the non-regex counterpart of charsUntilRe and takes the same
    arguments.

    characters -- an iterable of single characters; it is copied into a
        frozenset, so membership tests are cheap.
    opposite -- if true, the test is inverted: characters are consumed
        for as long as they ARE in 'characters'.

    Returns the consumed text (possibly the empty string). As a side
    effect self.chunkOffset is advanced past the consumed text, and
    self.readChunk() is called whenever the current chunk is exhausted.
    """
    chars = frozenset(characters)
    # Normalise once so that any truthy 'opposite' behaves like True.
    consume_members = bool(opposite)

    # Nothing left in the current chunk: try to fetch another one first.
    if self.chunkOffset >= self.chunkSize:
        if not self.readChunk():
            # A false result from readChunk() is treated as EOF.
            return ""

    matching = []
    while True:
        chunk = self.chunk
        size = self.chunkSize
        start = self.chunkOffset

        # Advance 'end' over the longest run of wanted characters.
        end = start
        while end < size and (chunk[end] in chars) == consume_members:
            end += 1

        matching.append(chunk[start:end])
        self.chunkOffset = end

        if end < size:
            # Stopped on a terminating character inside this chunk.
            break
        # The whole remainder of the chunk matched; continue in the next
        # chunk. NOTE(review): this relies on readChunk() refreshing
        # self.chunk, self.chunkSize and self.chunkOffset -- confirm
        # against its definition.
        if not self.readChunk():
            break

    return "".join(matching)
392+
356393
def unget(self, char):
357394
# Only one character is allowed to be ungotten at once - it must
358395
# be consumed again before any further call to unget

0 commit comments

Comments
 (0)