Skip to content

Commit 3abe6e0

Browse files
committed
Make escaping functions retain trailing bytes of an invalid character.
Instead of dropping the trailing byte(s) of an invalid or incomplete multibyte character, replace only the first byte with a known-invalid sequence, and process the rest normally. This seems less likely to confuse incautious callers than the behavior adopted in 5dc1e42. While we're at it, adjust PQescapeStringInternal to produce at most one bleat about invalid multibyte characters per string. This matches the behavior of PQescapeInternal, and avoids the risk of producing tons of repetitive junk if a long string is simply given in the wrong encoding. This is a followup to the fixes for CVE-2025-1094, and should be included if cherry-picking those fixes. Author: Andres Freund <andres@anarazel.de> Co-authored-by: Tom Lane <tgl@sss.pgh.pa.us> Reported-by: Jeff Davis <pgsql@j-davis.com> Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com Backpatch-through: 13
1 parent a92db3d commit 3abe6e0

File tree

2 files changed

+65
-95
lines changed

2 files changed

+65
-95
lines changed

src/fe_utils/string_utils.c

+34-57
Original file line numberDiff line numberDiff line change
@@ -180,40 +180,25 @@ fmtIdEnc(const char *rawid, int encoding)
180180
/* Slow path for possible multibyte characters */
181181
charlen = pg_encoding_mblen(encoding, cp);
182182

183-
if (remaining < charlen)
184-
{
185-
/*
186-
* If the character is longer than the available input,
187-
* replace the string with an invalid sequence. The invalid
188-
* sequence ensures that the escaped string will trigger an
189-
* error on the server-side, even if we can't directly report
190-
* an error here.
191-
*/
192-
enlargePQExpBuffer(id_return, 2);
193-
pg_encoding_set_invalid(encoding,
194-
id_return->data + id_return->len);
195-
id_return->len += 2;
196-
id_return->data[id_return->len] = '\0';
197-
198-
/* there's no more input data, so we can stop */
199-
break;
200-
}
201-
else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
183+
if (remaining < charlen ||
184+
pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
202185
{
203186
/*
204187
* Multibyte character is invalid. It's important to verify
205-
* that as invalid multi-byte characters could e.g. be used to
188+
* that as invalid multibyte characters could e.g. be used to
206189
* "skip" over quote characters, e.g. when parsing
207190
* character-by-character.
208191
*
209-
* Replace the bytes corresponding to the invalid character
210-
* with an invalid sequence, for the same reason as above.
192+
* Replace the character's first byte with an invalid
193+
* sequence. The invalid sequence ensures that the escaped
194+
* string will trigger an error on the server-side, even if we
195+
* can't directly report an error here.
211196
*
212197
* It would be a bit faster to verify the whole string the
213198
* first time we encounter a set highbit, but this way we can
214-
* replace just the invalid characters, which probably makes
215-
* it easier for users to find the invalidly encoded portion
216-
* of a larger string.
199+
* replace just the invalid data, which probably makes it
200+
* easier for users to find the invalidly encoded portion of a
201+
* larger string.
217202
*/
218203
enlargePQExpBuffer(id_return, 2);
219204
pg_encoding_set_invalid(encoding,
@@ -222,11 +207,13 @@ fmtIdEnc(const char *rawid, int encoding)
222207
id_return->data[id_return->len] = '\0';
223208

224209
/*
225-
* Copy the rest of the string after the invalid multi-byte
226-
* character.
210+
* Handle the following bytes as if this byte didn't exist.
211+
* That's safer in case the subsequent bytes contain
212+
* characters that are significant for the caller (e.g. '>' in
213+
* html).
227214
*/
228-
remaining -= charlen;
229-
cp += charlen;
215+
remaining--;
216+
cp++;
230217
}
231218
else
232219
{
@@ -395,49 +382,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
395382
/* Slow path for possible multibyte characters */
396383
charlen = PQmblen(source, encoding);
397384

398-
if (remaining < charlen)
399-
{
400-
/*
401-
* If the character is longer than the available input, replace
402-
* the string with an invalid sequence. The invalid sequence
403-
* ensures that the escaped string will trigger an error on the
404-
* server-side, even if we can't directly report an error here.
405-
*
406-
* We know there's enough space for the invalid sequence because
407-
* the "target" buffer is 2 * length + 2 long, and at worst we're
408-
* replacing a single input byte with two invalid bytes.
409-
*/
410-
pg_encoding_set_invalid(encoding, target);
411-
target += 2;
412-
413-
/* there's no more valid input data, so we can stop */
414-
break;
415-
}
416-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
385+
if (remaining < charlen ||
386+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
417387
{
418388
/*
419389
* Multibyte character is invalid. It's important to verify that
420-
* as invalid multi-byte characters could e.g. be used to "skip"
390+
* as invalid multibyte characters could e.g. be used to "skip"
421391
* over quote characters, e.g. when parsing
422392
* character-by-character.
423393
*
424-
* Replace the bytes corresponding to the invalid character with
425-
* an invalid sequence, for the same reason as above.
394+
* Replace the character's first byte with an invalid sequence.
395+
* The invalid sequence ensures that the escaped string will
396+
* trigger an error on the server-side, even if we can't directly
397+
* report an error here.
398+
*
399+
* We know there's enough space for the invalid sequence because
400+
* the "target" buffer is 2 * length + 2 long, and at worst we're
401+
* replacing a single input byte with two invalid bytes.
426402
*
427403
* It would be a bit faster to verify the whole string the first
428404
* time we encounter a set highbit, but this way we can replace
429-
* just the invalid characters, which probably makes it easier for
430-
* users to find the invalidly encoded portion of a larger string.
405+
* just the invalid data, which probably makes it easier for users
406+
* to find the invalidly encoded portion of a larger string.
431407
*/
432408
pg_encoding_set_invalid(encoding, target);
433409
target += 2;
434-
remaining -= charlen;
435410

436411
/*
437-
* Copy the rest of the string after the invalid multi-byte
438-
* character.
412+
* Handle the following bytes as if this byte didn't exist. That's
413+
* safer in case the subsequent bytes contain important characters
414+
* for the caller (e.g. '>' in html).
439415
*/
440-
source += charlen;
416+
source++;
417+
remaining--;
441418
}
442419
else
443420
{

src/interfaces/libpq/fe-exec.c

+31-38
Original file line numberDiff line numberDiff line change
@@ -4076,6 +4076,7 @@ PQescapeStringInternal(PGconn *conn,
40764076
const char *source = from;
40774077
char *target = to;
40784078
size_t remaining = strnlen(from, length);
4079+
bool already_complained = false;
40794080

40804081
if (error)
40814082
*error = 0;
@@ -4102,65 +4103,57 @@ PQescapeStringInternal(PGconn *conn,
41024103
/* Slow path for possible multibyte characters */
41034104
charlen = pg_encoding_mblen(encoding, source);
41044105

4105-
if (remaining < charlen)
4106+
if (remaining < charlen ||
4107+
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
41064108
{
41074109
/*
4108-
* If the character is longer than the available input, report an
4109-
* error if possible, and replace the string with an invalid
4110-
* sequence. The invalid sequence ensures that the escaped string
4111-
* will trigger an error on the server-side, even if we can't
4112-
* directly report an error here.
4110+
* Multibyte character is invalid. It's important to verify that
4111+
* as invalid multibyte characters could e.g. be used to "skip"
4112+
* over quote characters, e.g. when parsing
4113+
* character-by-character.
4114+
*
4115+
* Report an error if possible, and replace the character's first
4116+
* byte with an invalid sequence. The invalid sequence ensures
4117+
* that the escaped string will trigger an error on the
4118+
* server-side, even if we can't directly report an error here.
41134119
*
41144120
* This isn't *that* crucial when we can report an error to the
4115-
* caller, but if we can't, the caller will use this string
4116-
* unmodified and it needs to be safe for parsing.
4121+
* caller; but if we can't or the caller ignores it, the caller
4122+
* will use this string unmodified and it needs to be safe for
4123+
* parsing.
41174124
*
41184125
* We know there's enough space for the invalid sequence because
41194126
* the "to" buffer needs to be at least 2 * length + 1 long, and
41204127
* at worst we're replacing a single input byte with two invalid
41214128
* bytes.
4122-
*/
4123-
if (error)
4124-
*error = 1;
4125-
if (conn)
4126-
libpq_append_conn_error(conn, "incomplete multibyte character");
4127-
4128-
pg_encoding_set_invalid(encoding, target);
4129-
target += 2;
4130-
4131-
/* there's no more input data, so we can stop */
4132-
break;
4133-
}
4134-
else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
4135-
{
4136-
/*
4137-
* Multibyte character is invalid. It's important to verify that
4138-
* as invalid multi-byte characters could e.g. be used to "skip"
4139-
* over quote characters, e.g. when parsing
4140-
* character-by-character.
4141-
*
4142-
* Replace the bytes corresponding to the invalid character with
4143-
* an invalid sequence, for the same reason as above.
41444129
*
41454130
* It would be a bit faster to verify the whole string the first
41464131
* time we encounter a set highbit, but this way we can replace
4147-
* just the invalid characters, which probably makes it easier for
4148-
* users to find the invalidly encoded portion of a larger string.
4132+
* just the invalid data, which probably makes it easier for users
4133+
* to find the invalidly encoded portion of a larger string.
41494134
*/
41504135
if (error)
41514136
*error = 1;
4152-
if (conn)
4153-
libpq_append_conn_error(conn, "invalid multibyte character");
4137+
if (conn && !already_complained)
4138+
{
4139+
if (remaining < charlen)
4140+
libpq_append_conn_error(conn, "incomplete multibyte character");
4141+
else
4142+
libpq_append_conn_error(conn, "invalid multibyte character");
4143+
/* Issue a complaint only once per string */
4144+
already_complained = true;
4145+
}
41544146

41554147
pg_encoding_set_invalid(encoding, target);
41564148
target += 2;
4157-
remaining -= charlen;
41584149

41594150
/*
4160-
* Copy the rest of the string after the invalid multi-byte
4161-
* character.
4151+
* Handle the following bytes as if this byte didn't exist. That's
4152+
* safer in case the subsequent bytes contain important characters
4153+
* for the caller (e.g. '>' in html).
41624154
*/
4163-
source += charlen;
4155+
source++;
4156+
remaining--;
41644157
}
41654158
else
41664159
{

0 commit comments

Comments
 (0)