Skip to content

Commit 991a60a

Browse files
committed
Make escaping functions retain trailing bytes of an invalid character.
Instead of dropping the trailing byte(s) of an invalid or incomplete multibyte character, replace only the first byte with a known-invalid sequence, and process the rest normally. This seems less likely to confuse incautious callers than the behavior adopted in 5dc1e42. While we're at it, adjust PQescapeStringInternal to produce at most one bleat about invalid multibyte characters per string. This matches the behavior of PQescapeInternal, and avoids the risk of producing tons of repetitive junk if a long string is simply given in the wrong encoding. This is a followup to the fixes for CVE-2025-1094, and should be included if cherry-picking those fixes. Author: Andres Freund <andres@anarazel.de> Co-authored-by: Tom Lane <tgl@sss.pgh.pa.us> Reported-by: Jeff Davis <pgsql@j-davis.com> Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com Backpatch-through: 13
1 parent 111f4dd commit 991a60a

File tree

2 files changed

+65
-95
lines changed

2 files changed

+65
-95
lines changed

src/fe_utils/string_utils.c

Lines changed: 34 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -180,40 +180,25 @@ fmtIdEnc(const char *rawid, int encoding)
180180
/* Slow path for possible multibyte characters */
181181
charlen = pg_encoding_mblen(encoding, cp);
182182

183-
if (remaining < charlen)
184-
{
185-
/*
186-
* If the character is longer than the available input,
187-
* replace the string with an invalid sequence. The invalid
188-
* sequence ensures that the escaped string will trigger an
189-
* error on the server-side, even if we can't directly report
190-
* an error here.
191-
*/
192-
enlargePQExpBuffer(id_return, 2);
193-
pg_encoding_set_invalid(encoding,
194-
id_return->data + id_return->len);
195-
id_return->len += 2;
196-
id_return->data[id_return->len] = '\0';
197-
198-
/* there's no more input data, so we can stop */
199-
break;
200-
}
201-
else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
183+
if (remaining < charlen ||
184+
pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
202185
{
203186
/*
204187
* Multibyte character is invalid. It's important to verify
205-
* that as invalid multi-byte characters could e.g. be used to
188+
* that as invalid multibyte characters could e.g. be used to
206189
* "skip" over quote characters, e.g. when parsing
207190
* character-by-character.
208191
*
209-
* Replace the bytes corresponding to the invalid character
210-
* with an invalid sequence, for the same reason as above.
192+
* Replace the character's first byte with an invalid
193+
* sequence. The invalid sequence ensures that the escaped
194+
* string will trigger an error on the server-side, even if we
195+
* can't directly report an error here.
211196
*
212197
* It would be a bit faster to verify the whole string the
213198
* first time we encounter a set highbit, but this way we can
214-
* replace just the invalid characters, which probably makes
215-
* it easier for users to find the invalidly encoded portion
216-
* of a larger string.
199+
* replace just the invalid data, which probably makes it
200+
* easier for users to find the invalidly encoded portion of a
201+
* larger string.
217202
*/
218203
enlargePQExpBuffer(id_return, 2);
219204
pg_encoding_set_invalid(encoding,
@@ -222,11 +207,13 @@ fmtIdEnc(const char *rawid, int encoding)
222207
id_return->data[id_return->len] = '\0';
223208

224209
/*
225-
* Copy the rest of the string after the invalid multi-byte
226-
* character.
210+
* Handle the following bytes as if this byte didn't exist.
211+
* That's safer in case the subsequent bytes contain
212+
* characters that are significant for the caller (e.g. '>' in
213+
* html).
227214
*/
228-
remaining -= charlen;
229-
cp += charlen;
215+
remaining--;
216+
cp++;
230217
}
231218
else
232219
{
@@ -395,49 +382,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
395382
/* Slow path for possible multibyte characters */
396383
charlen = PQmblen(source, encoding);
397384

398-
if (remaining < charlen)
399-
{
400-
/*
401-
* If the character is longer than the available input, replace
402-
* the string with an invalid sequence. The invalid sequence
403-
* ensures that the escaped string will trigger an error on the
404-
* server-side, even if we can't directly report an error here.
405-
*
406-
* We know there's enough space for the invalid sequence because
407-
* the "target" buffer is 2 * length + 2 long, and at worst we're
408-
* replacing a single input byte with two invalid bytes.
409-
*/
410-
pg_encoding_set_invalid(encoding, target);
411-
target += 2;
412-
413-
/* there's no more valid input data, so we can stop */
414-
break;
415-
}
416-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
385+
if (remaining < charlen ||
386+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
417387
{
418388
/*
419389
* Multibyte character is invalid. It's important to verify that
420-
* as invalid multi-byte characters could e.g. be used to "skip"
390+
* as invalid multibyte characters could e.g. be used to "skip"
421391
* over quote characters, e.g. when parsing
422392
* character-by-character.
423393
*
424-
* Replace the bytes corresponding to the invalid character with
425-
* an invalid sequence, for the same reason as above.
394+
* Replace the character's first byte with an invalid sequence.
395+
* The invalid sequence ensures that the escaped string will
396+
* trigger an error on the server-side, even if we can't directly
397+
* report an error here.
398+
*
399+
* We know there's enough space for the invalid sequence because
400+
* the "target" buffer is 2 * length + 2 long, and at worst we're
401+
* replacing a single input byte with two invalid bytes.
426402
*
427403
* It would be a bit faster to verify the whole string the first
428404
* time we encounter a set highbit, but this way we can replace
429-
* just the invalid characters, which probably makes it easier for
430-
* users to find the invalidly encoded portion of a larger string.
405+
* just the invalid data, which probably makes it easier for users
406+
* to find the invalidly encoded portion of a larger string.
431407
*/
432408
pg_encoding_set_invalid(encoding, target);
433409
target += 2;
434-
remaining -= charlen;
435410

436411
/*
437-
* Copy the rest of the string after the invalid multi-byte
438-
* character.
412+
* Handle the following bytes as if this byte didn't exist. That's
413+
* safer in case the subsequent bytes contain important characters
414+
* for the caller (e.g. '>' in html).
439415
*/
440-
source += charlen;
416+
source++;
417+
remaining--;
441418
}
442419
else
443420
{

src/interfaces/libpq/fe-exec.c

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3942,6 +3942,7 @@ PQescapeStringInternal(PGconn *conn,
39423942
const char *source = from;
39433943
char *target = to;
39443944
size_t remaining = strnlen(from, length);
3945+
bool already_complained = false;
39453946

39463947
if (error)
39473948
*error = 0;
@@ -3968,65 +3969,57 @@ PQescapeStringInternal(PGconn *conn,
39683969
/* Slow path for possible multibyte characters */
39693970
charlen = pg_encoding_mblen(encoding, source);
39703971

3971-
if (remaining < charlen)
3972+
if (remaining < charlen ||
3973+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
39723974
{
39733975
/*
3974-
* If the character is longer than the available input, report an
3975-
* error if possible, and replace the string with an invalid
3976-
* sequence. The invalid sequence ensures that the escaped string
3977-
* will trigger an error on the server-side, even if we can't
3978-
* directly report an error here.
3976+
* Multibyte character is invalid. It's important to verify that
3977+
* as invalid multibyte characters could e.g. be used to "skip"
3978+
* over quote characters, e.g. when parsing
3979+
* character-by-character.
3980+
*
3981+
* Report an error if possible, and replace the character's first
3982+
* byte with an invalid sequence. The invalid sequence ensures
3983+
* that the escaped string will trigger an error on the
3984+
* server-side, even if we can't directly report an error here.
39793985
*
39803986
* This isn't *that* crucial when we can report an error to the
3981-
* caller, but if we can't, the caller will use this string
3982-
* unmodified and it needs to be safe for parsing.
3987+
* caller; but if we can't or the caller ignores it, the caller
3988+
* will use this string unmodified and it needs to be safe for
3989+
* parsing.
39833990
*
39843991
* We know there's enough space for the invalid sequence because
39853992
* the "to" buffer needs to be at least 2 * length + 1 long, and
39863993
* at worst we're replacing a single input byte with two invalid
39873994
* bytes.
3988-
*/
3989-
if (error)
3990-
*error = 1;
3991-
if (conn)
3992-
libpq_append_conn_error(conn, "incomplete multibyte character");
3993-
3994-
pg_encoding_set_invalid(encoding, target);
3995-
target += 2;
3996-
3997-
/* there's no more input data, so we can stop */
3998-
break;
3999-
}
4000-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
4001-
{
4002-
/*
4003-
* Multibyte character is invalid. It's important to verify that
4004-
* as invalid multi-byte characters could e.g. be used to "skip"
4005-
* over quote characters, e.g. when parsing
4006-
* character-by-character.
4007-
*
4008-
* Replace the bytes corresponding to the invalid character with
4009-
* an invalid sequence, for the same reason as above.
40103995
*
40113996
* It would be a bit faster to verify the whole string the first
40123997
* time we encounter a set highbit, but this way we can replace
4013-
* just the invalid characters, which probably makes it easier for
4014-
* users to find the invalidly encoded portion of a larger string.
3998+
* just the invalid data, which probably makes it easier for users
3999+
* to find the invalidly encoded portion of a larger string.
40154000
*/
40164001
if (error)
40174002
*error = 1;
4018-
if (conn)
4019-
libpq_append_conn_error(conn, "invalid multibyte character");
4003+
if (conn && !already_complained)
4004+
{
4005+
if (remaining < charlen)
4006+
libpq_append_conn_error(conn, "incomplete multibyte character");
4007+
else
4008+
libpq_append_conn_error(conn, "invalid multibyte character");
4009+
/* Issue a complaint only once per string */
4010+
already_complained = true;
4011+
}
40204012

40214013
pg_encoding_set_invalid(encoding, target);
40224014
target += 2;
4023-
remaining -= charlen;
40244015

40254016
/*
4026-
* Copy the rest of the string after the invalid multi-byte
4027-
* character.
4017+
* Handle the following bytes as if this byte didn't exist. That's
4018+
* safer in case the subsequent bytes contain important characters
4019+
* for the caller (e.g. '>' in html).
40284020
*/
4029-
source += charlen;
4021+
source++;
4022+
remaining--;
40304023
}
40314024
else
40324025
{

0 commit comments

Comments
 (0)