Skip to content

Commit cbadeac

Browse files
committed
With GB18030, prevent SIGSEGV from reading past end of allocation.
With GB18030 as source encoding, applications could crash the server via SQL functions convert() or convert_from(). Applications themselves could crash after passing unterminated GB18030 input to libpq functions PQescapeLiteral(), PQescapeIdentifier(), PQescapeStringConn(), or PQescapeString(). Extension code could crash by passing unterminated GB18030 input to jsonapi.h functions. All those functions have been intended to handle untrusted, unterminated input safely.

A crash required allocating the input such that the last byte of the allocation was the last byte of a virtual memory page. Some malloc() implementations take measures against that, making the SIGSEGV hard to reach.

Back-patch to v13 (all supported versions).

Author: Noah Misch <noah@leadboat.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com>
Backpatch-through: 13
Security: CVE-2025-4207
1 parent 7279e58 commit cbadeac

File tree

7 files changed

+171
-17
lines changed

7 files changed

+171
-17
lines changed

src/backend/utils/mb/mbutils.c

+13-5
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,7 @@ pg_mbcliplen(const char *mbstr, int len, int limit)
971971
}
972972

973973
/*
974-
* pg_mbcliplen with specified encoding
974+
* pg_mbcliplen with specified encoding; string must be valid in encoding
975975
*/
976976
int
977977
pg_encoding_mbcliplen(int encoding, const char *mbstr,
@@ -1569,12 +1569,12 @@ check_encoding_conversion_args(int src_encoding,
15691569
* report_invalid_encoding: complain about invalid multibyte character
15701570
*
15711571
* note: len is remaining length of string, not length of character;
1572-
* len must be greater than zero, as we always examine the first byte.
1572+
* len must be greater than zero (or we'd neglect initializing "buf").
15731573
*/
15741574
void
15751575
report_invalid_encoding(int encoding, const char *mbstr, int len)
15761576
{
1577-
int l = pg_encoding_mblen(encoding, mbstr);
1577+
int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
15781578
char buf[8 * 5 + 1];
15791579
char *p = buf;
15801580
int j,
@@ -1601,18 +1601,26 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
16011601
* report_untranslatable_char: complain about untranslatable character
16021602
*
16031603
* note: len is remaining length of string, not length of character;
1604-
* len must be greater than zero, as we always examine the first byte.
1604+
* len must be greater than zero (or we'd neglect initializing "buf").
16051605
*/
16061606
void
16071607
report_untranslatable_char(int src_encoding, int dest_encoding,
16081608
const char *mbstr, int len)
16091609
{
1610-
int l = pg_encoding_mblen(src_encoding, mbstr);
1610+
int l;
16111611
char buf[8 * 5 + 1];
16121612
char *p = buf;
16131613
int j,
16141614
jlimit;
16151615

1616+
/*
1617+
* We probably could use plain pg_encoding_mblen(), because
1618+
* gb18030_to_utf8() verifies before it converts. All conversions should.
1619+
* For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
1620+
* so, be defensive, since a buggy conversion might pass invalid data.
1621+
* This is not a performance-critical path.
1622+
*/
1623+
l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
16161624
jlimit = Min(l, len);
16171625
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
16181626

src/common/jsonapi.c

+5-2
Original file line numberDiff line numberDiff line change
@@ -698,8 +698,11 @@ json_lex_string(JsonLexContext *lex)
698698
} while (0)
699699
#define FAIL_AT_CHAR_END(code) \
700700
do { \
701-
char *term = s + pg_encoding_mblen(lex->input_encoding, s); \
702-
lex->token_terminator = (term <= end) ? term : end; \
701+
ptrdiff_t remaining = end - s; \
702+
int charlen; \
703+
charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \
704+
s, remaining); \
705+
lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \
703706
return code; \
704707
} while (0)
705708

src/common/wchar.c

+45-6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
*/
1313
#include "c.h"
1414

15+
#include <limits.h>
16+
1517
#include "mb/pg_wchar.h"
1618

1719

@@ -1597,10 +1599,27 @@ const pg_wchar_tbl pg_wchar_table[] = {
15971599
/*
15981600
* Returns the byte length of a multibyte character.
15991601
*
1600-
* Caution: when dealing with text that is not certainly valid in the
1601-
* specified encoding, the result may exceed the actual remaining
1602-
* string length. Callers that are not prepared to deal with that
1603-
* should use pg_encoding_mblen_bounded() instead.
1602+
* Choose "mblen" functions based on the input string characteristics.
1603+
* pg_encoding_mblen() can be used when ANY of these conditions are met:
1604+
*
1605+
* - The input string is zero-terminated
1606+
*
1607+
* - The input string is known to be valid in the encoding (e.g., string
1608+
* converted from database encoding)
1609+
*
1610+
* - The encoding is not GB18030 (e.g., when only database encodings are
1611+
* passed to 'encoding' parameter)
1612+
*
1613+
* encoding==GB18030 requires examining up to two bytes to determine character
1614+
* length. Therefore, callers satisfying none of those conditions must use
1615+
* pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
1616+
* guaranteed to be within allocation bounds.
1617+
*
1618+
* When dealing with text that is not certainly valid in the specified
1619+
* encoding, the result may exceed the actual remaining string length.
1620+
* Callers that are not prepared to deal with that should use Min(remaining,
1621+
* pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
1622+
* pg_encoding_mblen_bounded() are interchangeable.
16041623
*/
16051624
int
16061625
pg_encoding_mblen(int encoding, const char *mbstr)
@@ -1611,8 +1630,28 @@ pg_encoding_mblen(int encoding, const char *mbstr)
16111630
}
16121631

16131632
/*
1614-
* Returns the byte length of a multibyte character; but not more than
1615-
* the distance to end of string.
1633+
* Returns the byte length of a multibyte character (possibly not
1634+
* zero-terminated), or INT_MAX if too few bytes remain to determine a length.
1635+
*/
1636+
int
1637+
pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
1638+
size_t remaining)
1639+
{
1640+
/*
1641+
* Define zero remaining as too few, even for single-byte encodings.
1642+
* pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
1643+
* zero; others read one.
1644+
*/
1645+
if (remaining < 1 ||
1646+
(encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
1647+
return INT_MAX;
1648+
return pg_encoding_mblen(encoding, mbstr);
1649+
}
1650+
1651+
/*
1652+
* Returns the byte length of a multibyte character; but not more than the
1653+
* distance to the terminating zero byte. For input that might lack a
1654+
* terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
16161655
*/
16171656
int
16181657
pg_encoding_mblen_bounded(int encoding, const char *mbstr)

src/include/mb/pg_wchar.h

+2
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,8 @@ extern int pg_valid_server_encoding_id(int encoding);
554554
*/
555555
extern void pg_encoding_set_invalid(int encoding, char *dst);
556556
extern int pg_encoding_mblen(int encoding, const char *mbstr);
557+
extern int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
558+
size_t remaining);
557559
extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
558560
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
559561
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);

src/interfaces/libpq/fe-exec.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -3373,7 +3373,8 @@ PQescapeStringInternal(PGconn *conn,
33733373
}
33743374

33753375
/* Slow path for possible multibyte characters */
3376-
charlen = pg_encoding_mblen(encoding, source);
3376+
charlen = pg_encoding_mblen_or_incomplete(encoding,
3377+
source, remaining);
33773378

33783379
if (remaining < charlen ||
33793380
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
@@ -3513,7 +3514,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
35133514
int charlen;
35143515

35153516
/* Slow path for possible multibyte characters */
3516-
charlen = pg_encoding_mblen(conn->client_encoding, s);
3517+
charlen = pg_encoding_mblen_or_incomplete(conn->client_encoding,
3518+
s, remaining);
35173519

35183520
if (charlen > remaining)
35193521
{

src/interfaces/libpq/fe-misc.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -1227,8 +1227,9 @@ pqSocketPoll(int sock, int forRead, int forWrite, time_t end_time)
12271227
*/
12281228

12291229
/*
1230-
* returns the byte length of the character beginning at s, using the
1231-
* specified encoding.
1230+
* Like pg_encoding_mblen(). Use this in callers that want the
1231+
* dynamically-linked libpq's stance on encodings, even if that means
1232+
* different behavior in different startups of the executable.
12321233
*/
12331234
int
12341235
PQmblen(const char *s, int encoding)

src/test/modules/test_escape/test_escape.c

+99
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <string.h>
1313
#include <stdio.h>
1414

15+
#include "common/jsonapi.h"
1516
#include "fe_utils/psqlscan.h"
1617
#include "fe_utils/string_utils.h"
1718
#include "getopt_long.h"
@@ -164,6 +165,91 @@ encoding_conflicts_ascii(int encoding)
164165
}
165166

166167

168+
/*
169+
* Confirm escaping doesn't read past the end of an allocation. Consider the
170+
* result of malloc(4096), in the absence of freelist entries satisfying the
171+
* allocation. On OpenBSD, reading one byte past the end of that object
172+
* yields SIGSEGV.
173+
*
174+
* Run this test before the program's other tests, so freelists are minimal.
175+
* len=4096 didn't SIGSEGV, likely due to free() calls in libpq. len=8192
176+
* did. Use 128 KiB, to somewhat insulate the outcome from distant new free()
177+
* calls and libc changes.
178+
*/
179+
static void
180+
test_gb18030_page_multiple(pe_test_config *tc)
181+
{
182+
PQExpBuffer testname;
183+
size_t input_len = 0x20000;
184+
char *input;
185+
186+
/* prepare input */
187+
input = pg_malloc(input_len);
188+
memset(input, '-', input_len - 1);
189+
input[input_len - 1] = 0xfe;
190+
191+
/* name to describe the test */
192+
testname = createPQExpBuffer();
193+
appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1);
194+
escapify(testname, input + input_len - 1, 1);
195+
appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral");
196+
197+
/* test itself */
198+
PQsetClientEncoding(tc->conn, "GB18030");
199+
report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL,
200+
testname->data, "",
201+
"input validity vs escape success", "ok");
202+
203+
destroyPQExpBuffer(testname);
204+
pg_free(input);
205+
}
206+
207+
/*
208+
* Confirm json parsing doesn't read past the end of an allocation. This
209+
* exercises wchar.c infrastructure like the true "escape" tests do, but this
210+
* isn't an "escape" test.
211+
*/
212+
static void
213+
test_gb18030_json(pe_test_config *tc)
214+
{
215+
PQExpBuffer raw_buf;
216+
PQExpBuffer testname;
217+
const char input[] = "{\"\\u\xFE";
218+
size_t input_len = sizeof(input) - 1;
219+
JsonLexContext *lex;
220+
JsonSemAction sem = {0}; /* no callbacks */
221+
JsonParseErrorType json_error;
222+
char *error_str;
223+
224+
/* prepare input like test_one_vector_escape() does */
225+
raw_buf = createPQExpBuffer();
226+
appendBinaryPQExpBuffer(raw_buf, input, input_len);
227+
appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR);
228+
VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len],
229+
raw_buf->len - input_len);
230+
231+
/* name to describe the test */
232+
testname = createPQExpBuffer();
233+
appendPQExpBuffer(testname, ">");
234+
escapify(testname, input, input_len);
235+
appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json");
236+
237+
/* test itself */
238+
lex = makeJsonLexContextCstringLen(raw_buf->data, input_len,
239+
PG_GB18030, false);
240+
json_error = pg_parse_json(lex, &sem);
241+
error_str = psprintf("JsonParseErrorType %d", json_error);
242+
report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT,
243+
testname->data, "",
244+
"diagnosed", error_str);
245+
246+
pfree(error_str);
247+
pfree(lex);
248+
destroyPQExpBuffer(testname);
249+
destroyPQExpBuffer(raw_buf);
250+
}
251+
252+
167253
static bool
168254
escape_literal(PGconn *conn, PQExpBuffer target,
169255
const char *unescaped, size_t unescaped_len,
@@ -454,8 +540,18 @@ static pe_test_vector pe_test_vectors[] =
454540
* Testcases that are not null terminated for the specified input length.
455541
* That's interesting to verify that escape functions don't read beyond
456542
* the intended input length.
543+
*
544+
* One interesting special case is GB18030, which has the odd behaviour
545+
* of needing to read beyond the first byte to determine the length of a
546+
* multi-byte character.
457547
*/
458548
TV_LEN("gbk", "\x80", 1),
549+
TV_LEN("GB18030", "\x80", 1),
550+
TV_LEN("GB18030", "\x80\0", 2),
551+
TV_LEN("GB18030", "\x80\x30", 2),
552+
TV_LEN("GB18030", "\x80\x30\0", 3),
553+
TV_LEN("GB18030", "\x80\x30\x30", 3),
554+
TV_LEN("GB18030", "\x80\x30\x30\0", 4),
459555
TV_LEN("UTF-8", "\xC3\xb6 ", 1),
460556
TV_LEN("UTF-8", "\xC3\xb6 ", 2),
461557
};
@@ -864,6 +960,9 @@ main(int argc, char *argv[])
864960
exit(1);
865961
}
866962

963+
test_gb18030_page_multiple(&tc);
964+
test_gb18030_json(&tc);
965+
867966
for (int i = 0; i < lengthof(pe_test_vectors); i++)
868967
{
869968
test_one_vector(&tc, &pe_test_vectors[i]);

0 commit comments

Comments
 (0)