Skip to content

Commit d6d29b2

Browse files
committed
Make escaping functions retain trailing bytes of an invalid character.
Instead of dropping the trailing byte(s) of an invalid or incomplete multibyte character, replace only the first byte with a known-invalid sequence, and process the rest normally. This seems less likely to confuse incautious callers than the behavior adopted in 5dc1e42. While we're at it, adjust PQescapeStringInternal to produce at most one bleat about invalid multibyte characters per string. This matches the behavior of PQescapeInternal, and avoids the risk of producing tons of repetitive junk if a long string is simply given in the wrong encoding. This is a followup to the fixes for CVE-2025-1094, and should be included if cherry-picking those fixes. Author: Andres Freund <andres@anarazel.de> Co-authored-by: Tom Lane <tgl@sss.pgh.pa.us> Reported-by: Jeff Davis <pgsql@j-davis.com> Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com Backpatch-through: 13
1 parent 1f7a053 commit d6d29b2

File tree

2 files changed

+67
-97
lines changed

2 files changed

+67
-97
lines changed

src/fe_utils/string_utils.c

+34-57
Original file line numberDiff line numberDiff line change
@@ -180,40 +180,25 @@ fmtIdEnc(const char *rawid, int encoding)
180180
/* Slow path for possible multibyte characters */
181181
charlen = pg_encoding_mblen(encoding, cp);
182182

183-
if (remaining < charlen)
184-
{
185-
/*
186-
* If the character is longer than the available input,
187-
* replace the string with an invalid sequence. The invalid
188-
* sequence ensures that the escaped string will trigger an
189-
* error on the server-side, even if we can't directly report
190-
* an error here.
191-
*/
192-
enlargePQExpBuffer(id_return, 2);
193-
pg_encoding_set_invalid(encoding,
194-
id_return->data + id_return->len);
195-
id_return->len += 2;
196-
id_return->data[id_return->len] = '\0';
197-
198-
/* there's no more input data, so we can stop */
199-
break;
200-
}
201-
else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
183+
if (remaining < charlen ||
184+
pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
202185
{
203186
/*
204187
* Multibyte character is invalid. It's important to verify
205-
* that as invalid multi-byte characters could e.g. be used to
188+
* that as invalid multibyte characters could e.g. be used to
206189
* "skip" over quote characters, e.g. when parsing
207190
* character-by-character.
208191
*
209-
* Replace the bytes corresponding to the invalid character
210-
* with an invalid sequence, for the same reason as above.
192+
* Replace the character's first byte with an invalid
193+
* sequence. The invalid sequence ensures that the escaped
194+
* string will trigger an error on the server-side, even if we
195+
* can't directly report an error here.
211196
*
212197
* It would be a bit faster to verify the whole string the
213198
* first time we encounter a set highbit, but this way we can
214-
* replace just the invalid characters, which probably makes
215-
* it easier for users to find the invalidly encoded portion
216-
* of a larger string.
199+
* replace just the invalid data, which probably makes it
200+
* easier for users to find the invalidly encoded portion of a
201+
* larger string.
217202
*/
218203
enlargePQExpBuffer(id_return, 2);
219204
pg_encoding_set_invalid(encoding,
@@ -222,11 +207,13 @@ fmtIdEnc(const char *rawid, int encoding)
222207
id_return->data[id_return->len] = '\0';
223208

224209
/*
225-
* Copy the rest of the string after the invalid multi-byte
226-
* character.
210+
* Handle the following bytes as if this byte didn't exist.
211+
* That's safer in case the subsequent bytes contain
212+
* characters that are significant for the caller (e.g. '>' in
213+
* html).
227214
*/
228-
remaining -= charlen;
229-
cp += charlen;
215+
remaining--;
216+
cp++;
230217
}
231218
else
232219
{
@@ -395,49 +382,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
395382
/* Slow path for possible multibyte characters */
396383
charlen = PQmblen(source, encoding);
397384

398-
if (remaining < charlen)
399-
{
400-
/*
401-
* If the character is longer than the available input, replace
402-
* the string with an invalid sequence. The invalid sequence
403-
* ensures that the escaped string will trigger an error on the
404-
* server-side, even if we can't directly report an error here.
405-
*
406-
* We know there's enough space for the invalid sequence because
407-
* the "target" buffer is 2 * length + 2 long, and at worst we're
408-
* replacing a single input byte with two invalid bytes.
409-
*/
410-
pg_encoding_set_invalid(encoding, target);
411-
target += 2;
412-
413-
/* there's no more valid input data, so we can stop */
414-
break;
415-
}
416-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
385+
if (remaining < charlen ||
386+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
417387
{
418388
/*
419389
* Multibyte character is invalid. It's important to verify that
420-
* as invalid multi-byte characters could e.g. be used to "skip"
390+
* as invalid multibyte characters could e.g. be used to "skip"
421391
* over quote characters, e.g. when parsing
422392
* character-by-character.
423393
*
424-
* Replace the bytes corresponding to the invalid character with
425-
* an invalid sequence, for the same reason as above.
394+
* Replace the character's first byte with an invalid sequence.
395+
* The invalid sequence ensures that the escaped string will
396+
* trigger an error on the server-side, even if we can't directly
397+
* report an error here.
398+
*
399+
* We know there's enough space for the invalid sequence because
400+
* the "target" buffer is 2 * length + 2 long, and at worst we're
401+
* replacing a single input byte with two invalid bytes.
426402
*
427403
* It would be a bit faster to verify the whole string the first
428404
* time we encounter a set highbit, but this way we can replace
429-
* just the invalid characters, which probably makes it easier for
430-
* users to find the invalidly encoded portion of a larger string.
405+
* just the invalid data, which probably makes it easier for users
406+
* to find the invalidly encoded portion of a larger string.
431407
*/
432408
pg_encoding_set_invalid(encoding, target);
433409
target += 2;
434-
remaining -= charlen;
435410

436411
/*
437-
* Copy the rest of the string after the invalid multi-byte
438-
* character.
412+
* Handle the following bytes as if this byte didn't exist. That's
413+
* safer in case the subsequent bytes contain important characters
414+
* for the caller (e.g. '>' in html).
439415
*/
440-
source += charlen;
416+
source++;
417+
remaining--;
441418
}
442419
else
443420
{

src/interfaces/libpq/fe-exec.c

+33-40
Original file line numberDiff line numberDiff line change
@@ -3348,6 +3348,7 @@ PQescapeStringInternal(PGconn *conn,
33483348
const char *source = from;
33493349
char *target = to;
33503350
size_t remaining = strnlen(from, length);
3351+
bool already_complained = false;
33513352

33523353
if (error)
33533354
*error = 0;
@@ -3374,67 +3375,59 @@ PQescapeStringInternal(PGconn *conn,
33743375
/* Slow path for possible multibyte characters */
33753376
charlen = pg_encoding_mblen(encoding, source);
33763377

3377-
if (remaining < charlen)
3378+
if (remaining < charlen ||
3379+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
33783380
{
33793381
/*
3380-
* If the character is longer than the available input, report an
3381-
* error if possible, and replace the string with an invalid
3382-
* sequence. The invalid sequence ensures that the escaped string
3383-
* will trigger an error on the server-side, even if we can't
3384-
* directly report an error here.
3382+
* Multibyte character is invalid. It's important to verify that
3383+
* as invalid multibyte characters could e.g. be used to "skip"
3384+
* over quote characters, e.g. when parsing
3385+
* character-by-character.
3386+
*
3387+
* Report an error if possible, and replace the character's first
3388+
* byte with an invalid sequence. The invalid sequence ensures
3389+
* that the escaped string will trigger an error on the
3390+
* server-side, even if we can't directly report an error here.
33853391
*
33863392
* This isn't *that* crucial when we can report an error to the
3387-
* caller, but if we can't, the caller will use this string
3388-
* unmodified and it needs to be safe for parsing.
3393+
* caller; but if we can't or the caller ignores it, the caller
3394+
* will use this string unmodified and it needs to be safe for
3395+
* parsing.
33893396
*
33903397
* We know there's enough space for the invalid sequence because
33913398
* the "to" buffer needs to be at least 2 * length + 1 long, and
33923399
* at worst we're replacing a single input byte with two invalid
33933400
* bytes.
3394-
*/
3395-
if (error)
3396-
*error = 1;
3397-
if (conn)
3398-
printfPQExpBuffer(&conn->errorMessage,
3399-
libpq_gettext("incomplete multibyte character\n"));
3400-
3401-
pg_encoding_set_invalid(encoding, target);
3402-
target += 2;
3403-
3404-
/* there's no more input data, so we can stop */
3405-
break;
3406-
}
3407-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
3408-
{
3409-
/*
3410-
* Multibyte character is invalid. It's important to verify that
3411-
* as invalid multi-byte characters could e.g. be used to "skip"
3412-
* over quote characters, e.g. when parsing
3413-
* character-by-character.
3414-
*
3415-
* Replace the bytes corresponding to the invalid character with
3416-
* an invalid sequence, for the same reason as above.
34173401
*
34183402
* It would be a bit faster to verify the whole string the first
34193403
* time we encounter a set highbit, but this way we can replace
3420-
* just the invalid characters, which probably makes it easier for
3421-
* users to find the invalidly encoded portion of a larger string.
3404+
* just the invalid data, which probably makes it easier for users
3405+
* to find the invalidly encoded portion of a larger string.
34223406
*/
34233407
if (error)
34243408
*error = 1;
3425-
if (conn)
3426-
printfPQExpBuffer(&conn->errorMessage,
3427-
libpq_gettext("invalid multibyte character\n"));
3409+
if (conn && !already_complained)
3410+
{
3411+
if (remaining < charlen)
3412+
printfPQExpBuffer(&conn->errorMessage,
3413+
libpq_gettext("incomplete multibyte character"));
3414+
else
3415+
printfPQExpBuffer(&conn->errorMessage,
3416+
libpq_gettext("invalid multibyte character"));
3417+
/* Issue a complaint only once per string */
3418+
already_complained = true;
3419+
}
34283420

34293421
pg_encoding_set_invalid(encoding, target);
34303422
target += 2;
3431-
remaining -= charlen;
34323423

34333424
/*
3434-
* Copy the rest of the string after the invalid multi-byte
3435-
* character.
3425+
* Handle the following bytes as if this byte didn't exist. That's
3426+
* safer in case the subsequent bytes contain important characters
3427+
* for the caller (e.g. '>' in html).
34363428
*/
3437-
source += charlen;
3429+
source++;
3430+
remaining--;
34383431
}
34393432
else
34403433
{

0 commit comments

Comments
 (0)