Skip to content

Commit a868931

Browse files
committed
Fix insufficiently-paranoid GB18030 encoding verifier.
The previous coding effectively only verified that the second byte of a multibyte character was in the expected range; moreover, it wasn't careful to make sure that the second byte even existed in the buffer before touching it. The latter seems unlikely to cause any real problems in the field (in particular, it could never be a problem with null-terminated input), but it's still a bug. Since GB18030 is not a supported backend encoding, the only thing we'd really be doing with GB18030 text is converting it to UTF-8 in LocalToUtf(), which would fail anyway on any invalid character for lack of a match in its lookup table. So the only user-visible consequence of this change should be that you'll get an "invalid byte sequence for encoding" error rather than a "character has no equivalent" error for malformed GB18030 input. However, impending changes to the GB18030 conversion code will require these tighter up-front checks to avoid producing bogus results.
1 parent aff27e3 commit a868931

File tree

1 file changed

+29
-23
lines changed

1 file changed

+29
-23
lines changed

src/backend/utils/mb/wchar.c

+29-23
Original file line numberDiff line numberDiff line change
@@ -1070,25 +1070,20 @@ pg_uhc_dsplen(const unsigned char *s)
10701070
}
10711071

10721072
/*
1073-
* * GB18030
1074-
* * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1075-
* */
1073+
* GB18030
1074+
* Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1075+
*/
10761076
static int
10771077
pg_gb18030_mblen(const unsigned char *s)
10781078
{
10791079
int len;
10801080

10811081
if (!IS_HIGHBIT_SET(*s))
10821082
len = 1; /* ASCII */
1083+
else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
1084+
len = 4;
10831085
else
1084-
{
1085-
if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) || (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
1086-
len = 2;
1087-
else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
1088-
len = 4;
1089-
else
1090-
len = 2;
1091-
}
1086+
len = 2;
10921087
return len;
10931088
}
10941089

@@ -1403,21 +1398,32 @@ pg_uhc_verifier(const unsigned char *s, int len)
14031398
static int
14041399
pg_gb18030_verifier(const unsigned char *s, int len)
14051400
{
1406-
int l,
1407-
mbl;
1408-
1409-
l = mbl = pg_gb18030_mblen(s);
1410-
1411-
if (len < l)
1412-
return -1;
1401+
int l;
14131402

1414-
while (--l > 0)
1403+
if (!IS_HIGHBIT_SET(*s))
1404+
l = 1; /* ASCII */
1405+
else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
14151406
{
1416-
if (*++s == '\0')
1417-
return -1;
1407+
/* Should be 4-byte, validate remaining bytes */
1408+
if (*s >= 0x81 && *s <= 0xfe &&
1409+
*(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
1410+
*(s + 3) >= 0x30 && *(s + 3) <= 0x39)
1411+
l = 4;
1412+
else
1413+
l = -1;
14181414
}
1419-
1420-
return mbl;
1415+
else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
1416+
{
1417+
/* Should be 2-byte, validate */
1418+
if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
1419+
(*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
1420+
l = 2;
1421+
else
1422+
l = -1;
1423+
}
1424+
else
1425+
l = -1;
1426+
return l;
14211427
}
14221428

14231429
static int

0 commit comments

Comments
 (0)