Skip to content

[3.9] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) #134346

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Include/cpython/bytesobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
int use_bytearray);

/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
const char *,
int *, const char **);
// Export for binary compatibility.
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, const char **);

Expand Down
13 changes: 13 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
);
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
chars. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed, /* bytes consumed */
int *first_invalid_escape_char, /* on return, if not -1, contain the first
invalid escaped char (<= 0xff) or invalid
octal escape (> 0xff) in string. */
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
point to the first invalid escaped
char in string.
May be NULL if errors is not NULL. */
// Export for binary compatibility.
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
Expand Down
36 changes: 35 additions & 1 deletion Lib/test/test_codeccallbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1124,7 +1124,7 @@ def test_bug828737(self):
text = 'abc<def>ghi'*n
text.translate(charmap)

def test_mutatingdecodehandler(self):
def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
Expand Down Expand Up @@ -1159,6 +1159,40 @@ def mutating(exc):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")

def test_mutating_decode_handler_unicode_escape(self):
decode = codecs.unicode_escape_decode
def mutating(exc):
if isinstance(exc, UnicodeDecodeError):
r = data.get(exc.object[:exc.end])
if r is not None:
exc.object = r[0] + exc.object[exc.end:]
return ('\u0404', r[1])
raise AssertionError("don't know how to handle %r" % exc)

codecs.register_error('test.mutating2', mutating)
data = {
br'\x0': (b'\\', 0),
br'\x3': (b'xxx\\', 3),
br'\x5': (b'x\\', 1),
}
def check(input, expected, msg):
with self.assertWarns(DeprecationWarning) as cm:
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
self.assertIn(msg, str(cm.warning))

check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")

check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")

check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")

# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot
Expand Down
39 changes: 31 additions & 8 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1178,20 +1178,32 @@ def test_escape(self):
check(br"[\501]", b"[A]")
check(br"[\x41]", b"[A]")
check(br"[\x410]", b"[A0]")

def test_warnings(self):
decode = codecs.escape_decode
check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, b"\\" + b)
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), b"\\" + b.upper())
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\8'"):
check(br"\8", b"\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", b"\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", b"\\\xfa")

with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\z'"):
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))

def test_errors(self):
decode = codecs.escape_decode
self.assertRaises(ValueError, decode, br"\x")
Expand Down Expand Up @@ -2393,20 +2405,31 @@ def test_escape_decode(self):
check(br"[\x410]", "[A0]")
check(br"\u20ac", "\u20ac")
check(br"\U0001d120", "\U0001d120")

def test_decode_warnings(self):
decode = codecs.unicode_escape_decode
check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtuvx':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, "\\" + chr(i))
if b.upper() not in b'UN':
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), "\\" + chr(i-32))
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\8'"):
check(br"\8", "\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", "\\9")
with self.assertWarns(DeprecationWarning):
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", "\\\xfa")
with self.assertWarnsRegex(DeprecationWarning,
r"invalid escape sequence '\\z'"):
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))

def test_decode_errors(self):
decode = codecs.unicode_escape_decode
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
handler.
40 changes: 29 additions & 11 deletions Objects/bytesobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1060,10 +1060,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
}

/* Unescape a backslash-escaped string. */
PyObject *_PyBytes_DecodeEscape(const char *s,
PyObject *_PyBytes_DecodeEscape2(const char *s,
Py_ssize_t len,
const char *errors,
const char **first_invalid_escape)
int *first_invalid_escape_char,
const char **first_invalid_escape_ptr)
{
int c;
char *p;
Expand All @@ -1077,7 +1078,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
return NULL;
writer.overallocate = 1;

*first_invalid_escape = NULL;
*first_invalid_escape_char = -1;
*first_invalid_escape_ptr = NULL;

end = s + len;
while (s < end) {
Expand Down Expand Up @@ -1152,9 +1154,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
break;

default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape_char == -1) {
*first_invalid_escape_char = (unsigned char)s[-1];
/* Back up one char, since we've already incremented s. */
*first_invalid_escape_ptr = s - 1;
}
*p++ = '\\';
s--;
Expand All @@ -1168,21 +1171,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
return NULL;
}

// Export for binary compatibility.
PyObject *_PyBytes_DecodeEscape(const char *s,
Py_ssize_t len,
const char *errors,
const char **first_invalid_escape)
{
int first_invalid_escape_char;
return _PyBytes_DecodeEscape2(
s, len, errors,
&first_invalid_escape_char,
first_invalid_escape);
}

PyObject *PyBytes_DecodeEscape(const char *s,
Py_ssize_t len,
const char *errors,
Py_ssize_t Py_UNUSED(unicode),
const char *Py_UNUSED(recode_encoding))
{
const char* first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
&first_invalid_escape);
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
if (first_invalid_escape_char != -1) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
(unsigned char)*first_invalid_escape) < 0) {
first_invalid_escape_char) < 0) {
Py_DECREF(result);
return NULL;
}
Expand Down
45 changes: 34 additions & 11 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -6278,20 +6278,23 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;

PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
int *first_invalid_escape_char,
const char **first_invalid_escape_ptr)
{
const char *starts = s;
const char *initial_starts = starts;
_PyUnicodeWriter writer;
const char *end;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;

// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;
*first_invalid_escape_char = -1;
*first_invalid_escape_ptr = NULL;

if (size == 0) {
if (consumed) {
Expand Down Expand Up @@ -6474,9 +6477,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
goto error;

default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape_char == -1) {
*first_invalid_escape_char = c;
if (starts == initial_starts) {
/* Back up one char, since we've already incremented s. */
*first_invalid_escape_ptr = s - 1;
}
}
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
Expand Down Expand Up @@ -6515,22 +6521,39 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
return NULL;
}

// Export for binary compatibility.
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
{
int first_invalid_escape_char;
return _PyUnicode_DecodeUnicodeEscapeInternal2(
s, size, errors, consumed,
&first_invalid_escape_char,
first_invalid_escape);
}

PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
consumed,
&first_invalid_escape);
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
if (first_invalid_escape_char != -1) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
(unsigned char)*first_invalid_escape) < 0) {
first_invalid_escape_char) < 0) {
Py_DECREF(result);
return NULL;
}
Expand Down
26 changes: 16 additions & 10 deletions Parser/pegen/parse_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,15 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
len = p - buf;
s = buf;

const char *first_invalid_escape;
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);

if (v != NULL && first_invalid_escape != NULL) {
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
/* We have not decref u before because first_invalid_escape points
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
&first_invalid_escape_char,
&first_invalid_escape_ptr);

if (v != NULL && first_invalid_escape_ptr != NULL) {
if (warn_invalid_escape_sequence(parser, *first_invalid_escape_ptr, t) < 0) {
/* We have not decref u before because first_invalid_escape_ptr points
inside u. */
Py_XDECREF(u);
Py_DECREF(v);
Expand All @@ -138,14 +141,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
static PyObject *
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
{
const char *first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL) {
return NULL;
}

if (first_invalid_escape != NULL) {
if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
if (first_invalid_escape_ptr != NULL) {
if (warn_invalid_escape_sequence(p, *first_invalid_escape_ptr, t) < 0) {
Py_DECREF(result);
return NULL;
}
Expand Down
Loading