Skip to content

Commit 5f7247b

Browse files
committed
Fix default text search parser's ts_headline code for phrase queries.
This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
1 parent afab399 commit 5f7247b

File tree

3 files changed

+141
-86
lines changed

3 files changed

+141
-86
lines changed

src/backend/tsearch/wparser_def.c

Lines changed: 99 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,9 +2039,10 @@ prsd_end(PG_FUNCTION_ARGS)
20392039
#define INTERESTINGWORD(j) \
20402040
(prs->words[j].item && !prs->words[j].repeated)
20412041

2042-
/* Don't want to end at a non-word or a short word */
2042+
/* Don't want to end at a non-word or a short word, unless interesting */
20432043
#define BADENDPOINT(j) \
2044-
(NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
2044+
((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
2045+
!INTERESTINGWORD(j))
20452046

20462047
typedef struct
20472048
{
@@ -2100,75 +2101,97 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
21002101
return false;
21012102
}
21022103

2103-
2104-
static bool
2105-
hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
2104+
/*
2105+
* hlFirstIndex: find first index >= pos containing any word used in query
2106+
*
2107+
* Returns -1 if no such index
2108+
*/
2109+
static int
2110+
hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
21062111
{
2107-
int i,
2108-
j;
2109-
QueryItem *item = GETQUERY(query);
2110-
int pos = *p;
2111-
2112-
*q = -1;
2113-
*p = INT_MAX;
2112+
int i;
21142113

2115-
for (j = 0; j < query->size; j++)
2114+
/* For each word ... */
2115+
for (i = pos; i < prs->curwords; i++)
21162116
{
2117-
if (item->type != QI_VAL)
2117+
/* ... scan the query to see if this word matches any operand */
2118+
QueryItem *item = GETQUERY(query);
2119+
int j;
2120+
2121+
for (j = 0; j < query->size; j++)
21182122
{
2123+
if (item->type == QI_VAL &&
2124+
prs->words[i].item == &item->qoperand)
2125+
return i;
21192126
item++;
2120-
continue;
2121-
}
2122-
for (i = pos; i < prs->curwords; i++)
2123-
{
2124-
if (prs->words[i].item == &item->qoperand)
2125-
{
2126-
if (i > *q)
2127-
*q = i;
2128-
break;
2129-
}
21302127
}
2131-
item++;
21322128
}
2129+
return -1;
2130+
}
21332131

2134-
if (*q < 0)
2135-
return false;
2132+
/*
2133+
* hlCover: try to find a substring of prs' word list that satisfies query
2134+
*
2135+
* At entry, *p must be the first word index to consider (initialize this to
2136+
* zero, or to the next index after a previous successful search).
2137+
*
2138+
* On success, sets *p to first word index and *q to last word index of the
2139+
* cover substring, and returns true.
2140+
*
2141+
* The result is a minimal cover, in the sense that both *p and *q will be
2142+
* words used in the query.
2143+
*/
2144+
static bool
2145+
hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
2146+
{
2147+
int pmin,
2148+
pmax,
2149+
nextpmin,
2150+
nextpmax;
2151+
hlCheck ch;
21362152

2137-
item = GETQUERY(query);
2138-
for (j = 0; j < query->size; j++)
2153+
/*
2154+
* We look for the earliest, shortest substring of prs->words that
2155+
* satisfies the query. Both the pmin and pmax indices must be words
2156+
* appearing in the query; there's no point in trying endpoints in between
2157+
* such points.
2158+
*/
2159+
pmin = hlFirstIndex(prs, query, *p);
2160+
while (pmin >= 0)
21392161
{
2140-
if (item->type != QI_VAL)
2162+
/* This useless assignment just keeps stupider compilers quiet */
2163+
nextpmin = -1;
2164+
/* Consider substrings starting at pmin */
2165+
ch.words = &(prs->words[pmin]);
2166+
/* Consider the length-one substring first, then longer substrings */
2167+
pmax = pmin;
2168+
do
21412169
{
2142-
item++;
2143-
continue;
2144-
}
2145-
for (i = *q; i >= pos; i--)
2146-
{
2147-
if (prs->words[i].item == &item->qoperand)
2170+
/* Try to match query against pmin .. pmax substring */
2171+
ch.len = pmax - pmin + 1;
2172+
if (TS_execute(GETQUERY(query), &ch,
2173+
TS_EXEC_EMPTY, checkcondition_HL))
21482174
{
2149-
if (i < *p)
2150-
*p = i;
2151-
break;
2175+
*p = pmin;
2176+
*q = pmax;
2177+
return true;
21522178
}
2153-
}
2154-
item++;
2155-
}
2156-
2157-
if (*p <= *q)
2158-
{
2159-
hlCheck ch;
2179+
/* Nope, so advance pmax to next feasible endpoint */
2180+
nextpmax = hlFirstIndex(prs, query, pmax + 1);
21602181

2161-
ch.words = &(prs->words[*p]);
2162-
ch.len = *q - *p + 1;
2163-
if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
2164-
return true;
2165-
else
2166-
{
2167-
(*p)++;
2168-
return hlCover(prs, query, p, q);
2182+
/*
2183+
* If this is our first advance past pmin, then the result is also
2184+
* the next feasible value of pmin; remember it to save a
2185+
* redundant search.
2186+
*/
2187+
if (pmax == pmin)
2188+
nextpmin = nextpmax;
2189+
pmax = nextpmax;
21692190
}
2191+
while (pmax >= 0);
2192+
/* No luck here, so try next feasible startpoint */
2193+
pmin = nextpmin;
21702194
}
2171-
21722195
return false;
21732196
}
21742197

@@ -2454,11 +2477,12 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
24542477
int bestb = -1,
24552478
beste = -1;
24562479
int bestlen = -1;
2480+
bool bestcover = false;
24572481
int pose,
24582482
posb,
24592483
poslen,
24602484
curlen;
2461-
2485+
bool poscover;
24622486
int i;
24632487

24642488
if (!highlightall)
@@ -2484,14 +2508,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
24842508
pose = i;
24852509
}
24862510

2487-
/* XXX this optimization seems unnecessary and wrong */
2488-
if (poslen < bestlen && !BADENDPOINT(beste))
2489-
{
2490-
/* better cover already found, so try next cover */
2491-
p++;
2492-
continue;
2493-
}
2494-
24952511
if (curlen < max_words)
24962512
{
24972513
/*
@@ -2546,29 +2562,38 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
25462562
i = q;
25472563
for (; curlen > min_words; i--)
25482564
{
2565+
if (!BADENDPOINT(i))
2566+
break;
25492567
if (!NONWORDTOKEN(prs->words[i].type))
25502568
curlen--;
25512569
if (INTERESTINGWORD(i))
25522570
poslen--;
2553-
pose = i;
2554-
if (!BADENDPOINT(i))
2555-
break;
2571+
pose = i - 1;
25562572
}
25572573
}
25582574

25592575
/*
2560-
* Adopt this headline if it's the first, or if it has more
2561-
* interesting words and isn't ending at a bad endpoint, or if it
2562-
* replaces a bad endpoint with a good one (XXX even if it has
2563-
* fewer interesting words? Really?)
2576+
* Check whether the proposed headline includes the original
2577+
* cover; it might not if we trimmed it due to max_words.
2578+
*/
2579+
poscover = (posb <= p && pose >= q);
2580+
2581+
/*
2582+
* Adopt this headline if it's better than the last one, giving
2583+
* highest priority to headlines including the cover, then to
2584+
* headlines with more interesting words, then to headlines with
2585+
* good stopping points. (Since bestlen is initially -1, we will
2586+
* certainly adopt the first headline.)
25642587
*/
2565-
if (bestlen < 0 ||
2566-
(poslen > bestlen && !BADENDPOINT(pose)) ||
2567-
(!BADENDPOINT(pose) && BADENDPOINT(beste)))
2588+
if (poscover > bestcover ||
2589+
(poscover == bestcover && poslen > bestlen) ||
2590+
(poscover == bestcover && poslen == bestlen &&
2591+
!BADENDPOINT(pose) && BADENDPOINT(beste)))
25682592
{
25692593
bestb = posb;
25702594
beste = pose;
25712595
bestlen = poslen;
2596+
bestcover = poscover;
25722597
}
25732598

25742599
/* move p to generate the next cover */

src/test/regress/expected/tsearch.out

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,12 +1079,12 @@ Water, water, every where,
10791079
Nor any drop to drink.
10801080
S. T. Coleridge (1772-1834)
10811081
', phraseto_tsquery('english', 'painted Ocean'));
1082-
ts_headline
1083-
----------------------------------
1084-
<b>painted</b> <b>Ocean</b>. +
1085-
Water, water, every where +
1086-
And all the boards did shrink;+
1087-
Water, water, every
1082+
ts_headline
1083+
---------------------------------------
1084+
<b>painted</b> Ship +
1085+
Upon a <b>painted</b> <b>Ocean</b>.+
1086+
Water, water, every where +
1087+
And all the boards did shrink
10881088
(1 row)
10891089

10901090
SELECT ts_headline('english', '
@@ -1106,6 +1106,15 @@ S. T. Coleridge (1772-1834)
11061106
And all the boards
11071107
(1 row)
11081108

1109+
SELECT ts_headline('english',
1110+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
1111+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
1112+
'MaxWords=100, MinWords=1');
1113+
ts_headline
1114+
-------------------------------------------------------------------------------
1115+
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
1116+
(1 row)
1117+
11091118
SELECT ts_headline('english', '
11101119
<html>
11111120
<!-- some comment -->
@@ -1142,15 +1151,15 @@ SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords
11421151
(1 row)
11431152

11441153
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
1145-
ts_headline
1146-
------------------------------
1147-
<b>1</b> 2 <b>3</b> <b>1</b>
1154+
ts_headline
1155+
---------------------
1156+
<b>1</b> 2 <b>3</b>
11481157
(1 row)
11491158

11501159
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
1151-
ts_headline
1152-
-------------------
1153-
<b>1</b> <b>3</b>
1160+
ts_headline
1161+
----------------------------
1162+
<b>3</b> <b>1</b> <b>3</b>
11541163
(1 row)
11551164

11561165
--Check if headline fragments work
@@ -1245,6 +1254,16 @@ S. T. Coleridge (1772-1834)
12451254
S. T. <b>Coleridge</b>
12461255
(1 row)
12471256

1257+
--Fragments with phrase search
1258+
SELECT ts_headline('english',
1259+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
1260+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
1261+
'MaxFragments=100, MaxWords=100, MinWords=1');
1262+
ts_headline
1263+
-------------------------------------------------------------------------------
1264+
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
1265+
(1 row)
1266+
12481267
--Rewrite sub system
12491268
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
12501269
\set ECHO none

src/test/regress/sql/tsearch.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,11 @@ Water, water, every where,
326326
S. T. Coleridge (1772-1834)
327327
', phraseto_tsquery('english', 'idle as a painted Ship'));
328328

329+
SELECT ts_headline('english',
330+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
331+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
332+
'MaxWords=100, MinWords=1');
333+
329334
SELECT ts_headline('english', '
330335
<html>
331336
<!-- some comment -->
@@ -396,6 +401,12 @@ Water, water, every where,
396401
S. T. Coleridge (1772-1834)
397402
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
398403

404+
--Fragments with phrase search
405+
SELECT ts_headline('english',
406+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
407+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
408+
'MaxFragments=100, MaxWords=100, MinWords=1');
409+
399410
--Rewrite sub system
400411

401412
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);

0 commit comments

Comments
 (0)