Skip to content

gh-67877: Fix memory leaks in terminated RE matching #126840

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2681,6 +2681,29 @@ def test_character_set_none(self):
self.assertIsNone(re.search(p, s))
self.assertIsNone(re.search('(?s:.)' + p, s))

def check_interrupt(self, pattern, string, maxcount):
class Interrupt(Exception):
pass
p = re.compile(pattern)
for n in range(maxcount):
try:
p._fail_after(n, Interrupt)
p.match(string)
return n
except Interrupt:
pass
finally:
p._fail_after(-1, None)

@unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build')
def test_memory_leaks(self):
self.check_interrupt(r'(.)*:', 'abc:', 100)
self.check_interrupt(r'([^:])*?:', 'abc:', 100)
self.check_interrupt(r'([^:])*+:', 'abc:', 100)
self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)


def get_debug_out(pat):
with captured_stdout() as out:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix memory leaks when :mod:`regular expression <re>` matching terminates
abruptly, either because of a signal or because memory allocation fails.
44 changes: 43 additions & 1 deletion Modules/_sre/clinic/sre.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

132 changes: 127 additions & 5 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
return 0;
}

/* memory pool functions for SRE_REPEAT, this can avoid memory
leak when SRE(match) function terminates abruptly.
state->repeat_pool_used is a doubly-linked list, so that we
can remove a SRE_REPEAT node from it.
state->repeat_pool_unused is a singly-linked list, we put/get
node at the head. */
static SRE_REPEAT *
repeat_pool_malloc(SRE_STATE *state)
{
SRE_REPEAT *repeat;

if (state->repeat_pool_unused) {
/* remove from unused pool (singly-linked list) */
repeat = state->repeat_pool_unused;
state->repeat_pool_unused = repeat->pool_next;
}
else {
repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
if (!repeat) {
return NULL;
}
}

/* add to used pool (doubly-linked list) */
SRE_REPEAT *temp = state->repeat_pool_used;
if (temp) {
temp->pool_prev = repeat;
}
repeat->pool_prev = NULL;
repeat->pool_next = temp;
state->repeat_pool_used = repeat;

return repeat;
}

static void
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
{
SRE_REPEAT *prev = repeat->pool_prev;
SRE_REPEAT *next = repeat->pool_next;

/* remove from used pool (doubly-linked list) */
if (prev) {
prev->pool_next = next;
}
else {
state->repeat_pool_used = next;
}
if (next) {
next->pool_prev = prev;
}

/* add to unused pool (singly-linked list) */
repeat->pool_next = state->repeat_pool_unused;
state->repeat_pool_unused = repeat;
}

static void
repeat_pool_clear(SRE_STATE *state)
{
/* clear used pool */
SRE_REPEAT *next = state->repeat_pool_used;
state->repeat_pool_used = NULL;
while (next) {
SRE_REPEAT *temp = next;
next = temp->pool_next;
PyMem_Free(temp);
}

/* clear unused pool */
next = state->repeat_pool_unused;
state->repeat_pool_unused = NULL;
while (next) {
SRE_REPEAT *temp = next;
next = temp->pool_next;
PyMem_Free(temp);
}
}

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
Expand Down Expand Up @@ -511,6 +590,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start;
state->endpos = end;

#ifdef Py_DEBUG
state->fail_after_count = pattern->fail_after_count;
state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

return string;
err:
/* We add an explicit cast here because MSVC has a bug when
Expand All @@ -533,6 +617,8 @@ state_fini(SRE_STATE* state)
/* See above PyMem_Free() for why we explicitly cast here. */
PyMem_Free((void*) state->mark);
state->mark = NULL;
/* SRE_REPEAT pool */
repeat_pool_clear(state);
}

/* calculate offset from start of string */
Expand Down Expand Up @@ -619,6 +705,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg)
Py_VISIT(self->groupindex);
Py_VISIT(self->indexgroup);
Py_VISIT(self->pattern);
#ifdef Py_DEBUG
Py_VISIT(self->fail_after_exc);
#endif
return 0;
}

Expand All @@ -628,6 +717,9 @@ pattern_clear(PatternObject *self)
Py_CLEAR(self->groupindex);
Py_CLEAR(self->indexgroup);
Py_CLEAR(self->pattern);
#ifdef Py_DEBUG
Py_CLEAR(self->fail_after_exc);
#endif
return 0;
}

Expand Down Expand Up @@ -690,7 +782,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
Py_ssize_t status;
PyObject *match;

if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
if (!state_init(&state, self, string, pos, endpos))
return NULL;

INIT_TRACE(&state);
Expand Down Expand Up @@ -1381,6 +1473,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
return Py_NewRef(self);
}

#ifdef Py_DEBUG
/*[clinic input]
_sre.SRE_Pattern._fail_after

count: int
exception: object
/

For debugging.
[clinic start generated code]*/

static PyObject *
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
PyObject *exception)
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
{
self->fail_after_count = count;
Py_INCREF(exception);
Py_XSETREF(self->fail_after_exc, exception);
Py_RETURN_NONE;
}
#endif /* Py_DEBUG */

static PyObject *
pattern_repr(PatternObject *obj)
{
Expand Down Expand Up @@ -1506,6 +1621,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
self->pattern = NULL;
self->groupindex = NULL;
self->indexgroup = NULL;
#ifdef Py_DEBUG
self->fail_after_count = -1;
self->fail_after_exc = NULL;
#endif

self->codesize = n;

Expand Down Expand Up @@ -2604,7 +2723,8 @@ pattern_new_match(_sremodulestate* module_state,
if (!match)
return NULL;

match->pattern = (PatternObject*)Py_NewRef(pattern);
Py_INCREF(pattern);
match->pattern = pattern;

match->string = Py_NewRef(state->string);

Expand Down Expand Up @@ -2740,7 +2860,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
return NULL;
}

match = pattern_new_match(module_state, (PatternObject*) self->pattern,
match = pattern_new_match(module_state, self->pattern,
state, status);

if (status == 0)
Expand Down Expand Up @@ -2790,7 +2910,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
return NULL;
}

match = pattern_new_match(module_state, (PatternObject*) self->pattern,
match = pattern_new_match(module_state, self->pattern,
state, status);

if (status == 0)
Expand Down Expand Up @@ -2826,7 +2946,8 @@ pattern_scanner(_sremodulestate *module_state,
return NULL;
}

scanner->pattern = Py_NewRef(self);
Py_INCREF(self);
scanner->pattern = self;

PyObject_GC_Track(scanner);
return (PyObject*) scanner;
Expand Down Expand Up @@ -3020,6 +3141,7 @@ static PyMethodDef pattern_methods[] = {
_SRE_SRE_PATTERN_SCANNER_METHODDEF
_SRE_SRE_PATTERN___COPY___METHODDEF
_SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
{"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
PyDoc_STR("See PEP 585")},
{NULL, NULL}
Expand Down
17 changes: 16 additions & 1 deletion Modules/_sre/sre.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ typedef struct {
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
#ifdef Py_DEBUG
/* for simulation of user interruption */
int fail_after_count;
PyObject *fail_after_exc;
#endif
/* pattern code */
Py_ssize_t codesize;
SRE_CODE code[1];
Expand Down Expand Up @@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
const SRE_CODE* pattern; /* points to REPEAT operator arguments */
const void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
/* for SRE_REPEAT pool */
struct SRE_REPEAT_T *pool_prev;
struct SRE_REPEAT_T *pool_next;
} SRE_REPEAT;

typedef struct {
Expand Down Expand Up @@ -95,12 +103,19 @@ typedef struct {
size_t data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
/* SRE_REPEAT pool */
SRE_REPEAT *repeat_pool_used;
SRE_REPEAT *repeat_pool_unused;
unsigned int sigcount;
#ifdef Py_DEBUG
int fail_after_count;
PyObject *fail_after_exc;
#endif
} SRE_STATE;

typedef struct {
PyObject_HEAD
PyObject* pattern;
PatternObject* pattern;
SRE_STATE state;
int executing;
} ScannerObject;
Expand Down
Loading
Loading