Skip to content

Commit d6e37b3

Browse files
committed
Cope with more than 64K phrases in a thesaurus dictionary.
dict_thesaurus stored phrase IDs in uint16 fields, so it would get confused and even crash if there were more than 64K entries in the configuration file. It turns out to be basically free to widen the phrase IDs to uint32, so let's just do so.

This was complained of some time ago by David Boutin (in bug #7793); he later submitted an informal patch but it was never acted on. We now have another complaint (bug #11901 from Luc Ouellette) so it's time to make something happen. This is basically Boutin's patch, but for future-proofing I also added a defense against too many words per phrase. Note that we don't need any explicit defense against overflow of the uint32 counters, since before that happens we'd hit array allocation sizes that repalloc rejects.

Back-patch to all supported branches because of the crash risk.
1 parent 4875931 commit d6e37b3

File tree

1 file changed

+17
-8
lines changed

1 file changed

+17
-8
lines changed

src/backend/tsearch/dict_thesaurus.c

+17-8
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828

2929
typedef struct LexemeInfo
3030
{
31-
uint16 idsubst; /* entry's number in DictThesaurus->subst */
31+
uint32 idsubst; /* entry's number in DictThesaurus->subst */
3232
uint16 posinsubst; /* pos info in entry */
3333
uint16 tnvariant; /* total num lexemes in one variant */
3434
struct LexemeInfo *nextentry;
@@ -68,7 +68,7 @@ typedef struct
6868

6969

7070
static void
71-
newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
71+
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
7272
{
7373
TheLexeme *ptr;
7474

@@ -102,7 +102,7 @@ newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
102102
}
103103

104104
static void
105-
addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
105+
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
106106
{
107107
static int nres = 0;
108108
static int ntres = 0;
@@ -143,7 +143,6 @@ addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 p
143143
ntres *= 2;
144144
ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
145145
}
146-
147146
}
148147

149148
ptr->res[nres].lexeme = palloc(e - b + 1);
@@ -168,7 +167,7 @@ static void
168167
thesaurusRead(char *filename, DictThesaurus *d)
169168
{
170169
tsearch_readline_state trst;
171-
uint16 idsubst = 0;
170+
uint32 idsubst = 0;
172171
bool useasis = false;
173172
char *line;
174173

@@ -184,8 +183,8 @@ thesaurusRead(char *filename, DictThesaurus *d)
184183
char *ptr;
185184
int state = TR_WAITLEX;
186185
char *beginwrd = NULL;
187-
uint16 posinsubst = 0;
188-
uint16 nwrd = 0;
186+
uint32 posinsubst = 0;
187+
uint32 nwrd = 0;
189188

190189
ptr = line;
191190

@@ -286,6 +285,16 @@ thesaurusRead(char *filename, DictThesaurus *d)
286285
(errcode(ERRCODE_CONFIG_FILE_ERROR),
287286
errmsg("unexpected end of line")));
288287

288+
/*
289+
* Note: currently, tsearch_readline can't return lines exceeding 4KB,
290+
* so overflow of the word counts is impossible. But that may not
291+
* always be true, so let's check.
292+
*/
293+
if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
294+
ereport(ERROR,
295+
(errcode(ERRCODE_CONFIG_FILE_ERROR),
296+
errmsg("too many lexemes in thesaurus entry")));
297+
289298
pfree(line);
290299
}
291300

@@ -670,7 +679,7 @@ findTheLexeme(DictThesaurus *d, char *lexeme)
670679
}
671680

672681
static bool
673-
matchIdSubst(LexemeInfo *stored, uint16 idsubst)
682+
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
674683
{
675684
bool res = true;
676685

0 commit comments

Comments
 (0)