Skip to content

Commit 8d35fd1

Browse files
[3.9] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (#134346)
* [3.9] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after that temporary bytes object is destroyed. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have this issue, because it does not use the error handler registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58) (cherry picked from commit 6279eb8) (cherry picked from commit a75953b) (cherry picked from commit 0c33e5b) (cherry picked from commit 8b528ca) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent d4df3c5 commit 8d35fd1

File tree

8 files changed

+164
-41
lines changed

8 files changed

+164
-41
lines changed

Include/cpython/bytesobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
2525
int use_bytearray);
2626

2727
/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
28+
PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
29+
const char *,
30+
int *, const char **);
31+
// Export for binary compatibility.
2832
PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
2933
const char *, const char **);
3034

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
866866
);
867867
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
868868
chars. */
869+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
870+
const char *string, /* Unicode-Escape encoded string */
871+
Py_ssize_t length, /* size of string */
872+
const char *errors, /* error handling */
873+
Py_ssize_t *consumed, /* bytes consumed */
874+
int *first_invalid_escape_char, /* on return, if not -1, contain the first
875+
invalid escaped char (<= 0xff) or invalid
876+
octal escape (> 0xff) in string. */
877+
const char **first_invalid_escape_ptr); /* on return, if not NULL, may
878+
point to the first invalid escaped
879+
char in string.
880+
May be NULL if errors is not NULL. */
881+
// Export for binary compatibility.
869882
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
870883
const char *string, /* Unicode-Escape encoded string */
871884
Py_ssize_t length, /* size of string */

Lib/test/test_codeccallbacks.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1124,7 +1124,7 @@ def test_bug828737(self):
11241124
text = 'abc<def>ghi'*n
11251125
text.translate(charmap)
11261126

1127-
def test_mutatingdecodehandler(self):
1127+
def test_mutating_decode_handler(self):
11281128
baddata = [
11291129
("ascii", b"\xff"),
11301130
("utf-7", b"++"),
@@ -1159,6 +1159,40 @@ def mutating(exc):
11591159
for (encoding, data) in baddata:
11601160
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
11611161

1162+
def test_mutating_decode_handler_unicode_escape(self):
1163+
decode = codecs.unicode_escape_decode
1164+
def mutating(exc):
1165+
if isinstance(exc, UnicodeDecodeError):
1166+
r = data.get(exc.object[:exc.end])
1167+
if r is not None:
1168+
exc.object = r[0] + exc.object[exc.end:]
1169+
return ('\u0404', r[1])
1170+
raise AssertionError("don't know how to handle %r" % exc)
1171+
1172+
codecs.register_error('test.mutating2', mutating)
1173+
data = {
1174+
br'\x0': (b'\\', 0),
1175+
br'\x3': (b'xxx\\', 3),
1176+
br'\x5': (b'x\\', 1),
1177+
}
1178+
def check(input, expected, msg):
1179+
with self.assertWarns(DeprecationWarning) as cm:
1180+
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
1181+
self.assertIn(msg, str(cm.warning))
1182+
1183+
check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1184+
check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
1185+
1186+
check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
1187+
check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
1188+
check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
1189+
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
1190+
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
1191+
1192+
check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
1193+
check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
1194+
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
1195+
11621196
# issue32583
11631197
def test_crashing_decode_handler(self):
11641198
# better generating one more character to fill the extra space slot

Lib/test/test_codecs.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,20 +1178,32 @@ def test_escape(self):
11781178
check(br"[\501]", b"[A]")
11791179
check(br"[\x41]", b"[A]")
11801180
check(br"[\x410]", b"[A0]")
1181+
1182+
def test_warnings(self):
1183+
decode = codecs.escape_decode
1184+
check = coding_checker(self, decode)
11811185
for i in range(97, 123):
11821186
b = bytes([i])
11831187
if b not in b'abfnrtvx':
1184-
with self.assertWarns(DeprecationWarning):
1188+
with self.assertWarnsRegex(DeprecationWarning,
1189+
r"invalid escape sequence '\\%c'" % i):
11851190
check(b"\\" + b, b"\\" + b)
1186-
with self.assertWarns(DeprecationWarning):
1191+
with self.assertWarnsRegex(DeprecationWarning,
1192+
r"invalid escape sequence '\\%c'" % (i-32)):
11871193
check(b"\\" + b.upper(), b"\\" + b.upper())
1188-
with self.assertWarns(DeprecationWarning):
1194+
with self.assertWarnsRegex(DeprecationWarning,
1195+
r"invalid escape sequence '\\8'"):
11891196
check(br"\8", b"\\8")
11901197
with self.assertWarns(DeprecationWarning):
11911198
check(br"\9", b"\\9")
1192-
with self.assertWarns(DeprecationWarning):
1199+
with self.assertWarnsRegex(DeprecationWarning,
1200+
r"invalid escape sequence '\\\xfa'") as cm:
11931201
check(b"\\\xfa", b"\\\xfa")
11941202

1203+
with self.assertWarnsRegex(DeprecationWarning,
1204+
r"invalid escape sequence '\\z'"):
1205+
self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
1206+
11951207
def test_errors(self):
11961208
decode = codecs.escape_decode
11971209
self.assertRaises(ValueError, decode, br"\x")
@@ -2393,20 +2405,31 @@ def test_escape_decode(self):
23932405
check(br"[\x410]", "[A0]")
23942406
check(br"\u20ac", "\u20ac")
23952407
check(br"\U0001d120", "\U0001d120")
2408+
2409+
def test_decode_warnings(self):
2410+
decode = codecs.unicode_escape_decode
2411+
check = coding_checker(self, decode)
23962412
for i in range(97, 123):
23972413
b = bytes([i])
23982414
if b not in b'abfnrtuvx':
2399-
with self.assertWarns(DeprecationWarning):
2415+
with self.assertWarnsRegex(DeprecationWarning,
2416+
r"invalid escape sequence '\\%c'" % i):
24002417
check(b"\\" + b, "\\" + chr(i))
24012418
if b.upper() not in b'UN':
2402-
with self.assertWarns(DeprecationWarning):
2419+
with self.assertWarnsRegex(DeprecationWarning,
2420+
r"invalid escape sequence '\\%c'" % (i-32)):
24032421
check(b"\\" + b.upper(), "\\" + chr(i-32))
2404-
with self.assertWarns(DeprecationWarning):
2422+
with self.assertWarnsRegex(DeprecationWarning,
2423+
r"invalid escape sequence '\\8'"):
24052424
check(br"\8", "\\8")
24062425
with self.assertWarns(DeprecationWarning):
24072426
check(br"\9", "\\9")
2408-
with self.assertWarns(DeprecationWarning):
2427+
with self.assertWarnsRegex(DeprecationWarning,
2428+
r"invalid escape sequence '\\\xfa'") as cm:
24092429
check(b"\\\xfa", "\\\xfa")
2430+
with self.assertWarnsRegex(DeprecationWarning,
2431+
r"invalid escape sequence '\\z'"):
2432+
self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
24102433

24112434
def test_decode_errors(self):
24122435
decode = codecs.unicode_escape_decode
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
2+
handler.

Objects/bytesobject.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,10 +1060,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
10601060
}
10611061

10621062
/* Unescape a backslash-escaped string. */
1063-
PyObject *_PyBytes_DecodeEscape(const char *s,
1063+
PyObject *_PyBytes_DecodeEscape2(const char *s,
10641064
Py_ssize_t len,
10651065
const char *errors,
1066-
const char **first_invalid_escape)
1066+
int *first_invalid_escape_char,
1067+
const char **first_invalid_escape_ptr)
10671068
{
10681069
int c;
10691070
char *p;
@@ -1077,7 +1078,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
10771078
return NULL;
10781079
writer.overallocate = 1;
10791080

1080-
*first_invalid_escape = NULL;
1081+
*first_invalid_escape_char = -1;
1082+
*first_invalid_escape_ptr = NULL;
10811083

10821084
end = s + len;
10831085
while (s < end) {
@@ -1152,9 +1154,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11521154
break;
11531155

11541156
default:
1155-
if (*first_invalid_escape == NULL) {
1156-
*first_invalid_escape = s-1; /* Back up one char, since we've
1157-
already incremented s. */
1157+
if (*first_invalid_escape_char == -1) {
1158+
*first_invalid_escape_char = (unsigned char)s[-1];
1159+
/* Back up one char, since we've already incremented s. */
1160+
*first_invalid_escape_ptr = s - 1;
11581161
}
11591162
*p++ = '\\';
11601163
s--;
@@ -1168,21 +1171,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
11681171
return NULL;
11691172
}
11701173

1174+
// Export for binary compatibility.
1175+
PyObject *_PyBytes_DecodeEscape(const char *s,
1176+
Py_ssize_t len,
1177+
const char *errors,
1178+
const char **first_invalid_escape)
1179+
{
1180+
int first_invalid_escape_char;
1181+
return _PyBytes_DecodeEscape2(
1182+
s, len, errors,
1183+
&first_invalid_escape_char,
1184+
first_invalid_escape);
1185+
}
1186+
11711187
PyObject *PyBytes_DecodeEscape(const char *s,
11721188
Py_ssize_t len,
11731189
const char *errors,
11741190
Py_ssize_t Py_UNUSED(unicode),
11751191
const char *Py_UNUSED(recode_encoding))
11761192
{
1177-
const char* first_invalid_escape;
1178-
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
1179-
&first_invalid_escape);
1193+
int first_invalid_escape_char;
1194+
const char *first_invalid_escape_ptr;
1195+
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
1196+
&first_invalid_escape_char,
1197+
&first_invalid_escape_ptr);
11801198
if (result == NULL)
11811199
return NULL;
1182-
if (first_invalid_escape != NULL) {
1200+
if (first_invalid_escape_char != -1) {
11831201
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
11841202
"invalid escape sequence '\\%c'",
1185-
(unsigned char)*first_invalid_escape) < 0) {
1203+
first_invalid_escape_char) < 0) {
11861204
Py_DECREF(result);
11871205
return NULL;
11881206
}

Objects/unicodeobject.c

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6278,20 +6278,23 @@ PyUnicode_AsUTF16String(PyObject *unicode)
62786278
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
62796279

62806280
PyObject *
6281-
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6281+
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
62826282
Py_ssize_t size,
62836283
const char *errors,
62846284
Py_ssize_t *consumed,
6285-
const char **first_invalid_escape)
6285+
int *first_invalid_escape_char,
6286+
const char **first_invalid_escape_ptr)
62866287
{
62876288
const char *starts = s;
6289+
const char *initial_starts = starts;
62886290
_PyUnicodeWriter writer;
62896291
const char *end;
62906292
PyObject *errorHandler = NULL;
62916293
PyObject *exc = NULL;
62926294

62936295
// so we can remember if we've seen an invalid escape char or not
6294-
*first_invalid_escape = NULL;
6296+
*first_invalid_escape_char = -1;
6297+
*first_invalid_escape_ptr = NULL;
62956298

62966299
if (size == 0) {
62976300
if (consumed) {
@@ -6474,9 +6477,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
64746477
goto error;
64756478

64766479
default:
6477-
if (*first_invalid_escape == NULL) {
6478-
*first_invalid_escape = s-1; /* Back up one char, since we've
6479-
already incremented s. */
6480+
if (*first_invalid_escape_char == -1) {
6481+
*first_invalid_escape_char = c;
6482+
if (starts == initial_starts) {
6483+
/* Back up one char, since we've already incremented s. */
6484+
*first_invalid_escape_ptr = s - 1;
6485+
}
64806486
}
64816487
WRITE_ASCII_CHAR('\\');
64826488
WRITE_CHAR(c);
@@ -6515,22 +6521,39 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
65156521
return NULL;
65166522
}
65176523

6524+
// Export for binary compatibility.
6525+
PyObject *
6526+
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6527+
Py_ssize_t size,
6528+
const char *errors,
6529+
Py_ssize_t *consumed,
6530+
const char **first_invalid_escape)
6531+
{
6532+
int first_invalid_escape_char;
6533+
return _PyUnicode_DecodeUnicodeEscapeInternal2(
6534+
s, size, errors, consumed,
6535+
&first_invalid_escape_char,
6536+
first_invalid_escape);
6537+
}
6538+
65186539
PyObject *
65196540
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
65206541
Py_ssize_t size,
65216542
const char *errors,
65226543
Py_ssize_t *consumed)
65236544
{
6524-
const char *first_invalid_escape;
6525-
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6545+
int first_invalid_escape_char;
6546+
const char *first_invalid_escape_ptr;
6547+
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
65266548
consumed,
6527-
&first_invalid_escape);
6549+
&first_invalid_escape_char,
6550+
&first_invalid_escape_ptr);
65286551
if (result == NULL)
65296552
return NULL;
6530-
if (first_invalid_escape != NULL) {
6553+
if (first_invalid_escape_char != -1) {
65316554
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
65326555
"invalid escape sequence '\\%c'",
6533-
(unsigned char)*first_invalid_escape) < 0) {
6556+
first_invalid_escape_char) < 0) {
65346557
Py_DECREF(result);
65356558
return NULL;
65366559
}

Parser/pegen/parse_string.c

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,15 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
119119
len = p - buf;
120120
s = buf;
121121

122-
const char *first_invalid_escape;
123-
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
124-
125-
if (v != NULL && first_invalid_escape != NULL) {
126-
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
127-
/* We have not decref u before because first_invalid_escape points
122+
int first_invalid_escape_char;
123+
const char *first_invalid_escape_ptr;
124+
v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
125+
&first_invalid_escape_char,
126+
&first_invalid_escape_ptr);
127+
128+
if (v != NULL && first_invalid_escape_ptr != NULL) {
129+
if (warn_invalid_escape_sequence(parser, *first_invalid_escape_ptr, t) < 0) {
130+
/* We have not decref u before because first_invalid_escape_ptr points
128131
inside u. */
129132
Py_XDECREF(u);
130133
Py_DECREF(v);
@@ -138,14 +141,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
138141
static PyObject *
139142
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
140143
{
141-
const char *first_invalid_escape;
142-
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
144+
int first_invalid_escape_char;
145+
const char *first_invalid_escape_ptr;
146+
PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
147+
&first_invalid_escape_char,
148+
&first_invalid_escape_ptr);
143149
if (result == NULL) {
144150
return NULL;
145151
}
146152

147-
if (first_invalid_escape != NULL) {
148-
if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
153+
if (first_invalid_escape_ptr != NULL) {
154+
if (warn_invalid_escape_sequence(p, *first_invalid_escape_ptr, t) < 0) {
149155
Py_DECREF(result);
150156
return NULL;
151157
}

0 commit comments

Comments
 (0)