Skip to content

[3.11] gh-109747: Improve errors for unsupported look-behind patterns (GH-109859) #110860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def _compile(code, pattern, flags):
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo > MAXCODE:
raise error("looks too much behind")
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
Expand Down Expand Up @@ -549,7 +551,7 @@ def _compile_info(code, pattern, flags):
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
emit(hi)
# add literal prefix
if prefix:
emit(len(prefix)) # length
Expand Down
13 changes: 10 additions & 3 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE

# Maximal value returned by SubPattern.getwidth().
# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
MAXWIDTH = 1 << 64

class State:
# keeps track of state for parsing
def __init__(self):
Expand Down Expand Up @@ -178,7 +182,7 @@ def getwidth(self):
lo = hi = 0
for op, av in self.data:
if op is BRANCH:
i = MAXREPEAT - 1
i = MAXWIDTH
j = 0
for av in av[1]:
l, h = av.getwidth()
Expand All @@ -197,7 +201,10 @@ def getwidth(self):
elif op in _REPEATCODES:
i, j = av[2].getwidth()
lo = lo + i * av[0]
hi = hi + j * av[1]
if av[1] == MAXREPEAT and j:
hi = MAXWIDTH
else:
hi = hi + j * av[1]
elif op in _UNITCODES:
lo = lo + 1
hi = hi + 1
Expand All @@ -217,7 +224,7 @@ def getwidth(self):
hi = hi + j
elif op is SUCCESS:
break
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
return self.width

class Tokenizer:
Expand Down
23 changes: 23 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1830,6 +1830,29 @@ def test_repeat_minmax_overflow(self):
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))

def test_look_behind_overflow(self):
string = "x" * 2_500_000
p1 = r"(?<=((.{%d}){%d}){%d})"
p2 = r"(?<!((.{%d}){%d}){%d})"
# Test that the templates are valid and look-behind with width 2**21
# (larger than sys.maxunicode) are supported.
self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
(2**21, 2**21))
self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
(0, 0))
# Test that 2**22 is accepted as a repetition number and look-behind
# width.
re.compile(p1 % (2**22, 1, 1))
re.compile(p1 % (1, 2**22, 1))
re.compile(p1 % (1, 1, 2**22))
re.compile(p2 % (2**22, 1, 1))
re.compile(p2 % (1, 2**22, 1))
re.compile(p2 % (1, 1, 2**22))
# But 2**66 is too large for look-behind width.
errmsg = "looks too much behind"
self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))

def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
self.checkPatternError('(?P=<foo>)',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Improve errors for unsupported look-behind patterns. Now re.error is raised
instead of OverflowError or RuntimeError for too large width of look-behind
pattern.
2 changes: 0 additions & 2 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -1939,8 +1939,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
GET_SKIP;
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
if (arg & 0x80000000)
FAIL; /* Width too large */
/* Stop 1 before the end; we check the SUCCESS below */
if (_validate_inner(code+1, code+skip-2, groups))
FAIL;
Expand Down
14 changes: 7 additions & 7 deletions Modules/_sre/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
TRACE(("reject (got %zd chars, need %zd)\n",
end - ptr, (Py_ssize_t) pattern[3]));
TRACE(("reject (got %tu chars, need %zu)\n",
end - ptr, (size_t) pattern[3]));
RETURN_FAILURE;
}
pattern += pattern[1] + 1;
Expand Down Expand Up @@ -1507,7 +1507,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", pattern,
ptr, pattern[1]));
if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
RETURN_FAILURE;
state->ptr = ptr - pattern[1];
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
Expand All @@ -1520,7 +1520,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
ptr, pattern[1]));
if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
state->ptr = ptr - pattern[1];
LASTMARK_SAVE();
if (state->repeat)
Expand Down Expand Up @@ -1655,9 +1655,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)

flags = pattern[2];

if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
TRACE(("reject (got %u chars, need %u)\n",
(unsigned int)(end - ptr), pattern[3]));
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
TRACE(("reject (got %tu chars, need %zu)\n",
end - ptr, (size_t) pattern[3]));
return 0;
}
if (pattern[3] > 1) {
Expand Down