Skip to content

Commit 5a617d7

Browse files
committed
Fix ts_headline() to handle ORs and phrase queries more honestly.
This patch largely reverts what I did in commits c9b0c67 and 78e73e8. The maximum cover length limit that I added in 78e73e8 (to band-aid over c9b0c67's performance issues) creates too many user-visible behavior discrepancies, as complained of for example in bug #17691. The real problem with hlCover() is not what I thought at the time, but more that it seems to have been designed with only AND tsquery semantics in mind. It doesn't work quite right for OR, and even less so for NOT or phrase queries. However, we can improve that situation by building a variant of TS_execute() that returns a list of match locations. We already get an ExecPhraseData struct representing match locations for the primitive case of a simple match, as well as one for a phrase match; we just need to add some logic to combine these for AND and OR operators. The result is a list of ExecPhraseDatas, which hlCover can regard as having simple AND semantics, so that its old algorithm works correctly. There's still a lot not to like about ts_headline's behavior, but I think the remaining issues have to do with the heuristics used in mark_hl_words and mark_hl_fragments (which, likewise, were not revisited when phrase search was added). Improving those is a task for another day. Patch by me; thanks to Alvaro Herrera for review. Discussion: https://postgr.es/m/840.1669405935@sss.pgh.pa.us
1 parent 44e9e34 commit 5a617d7

File tree

5 files changed

+560
-106
lines changed

5 files changed

+560
-106
lines changed

src/backend/tsearch/wparser_def.c

Lines changed: 166 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1957,14 +1957,18 @@ typedef struct
19571957

19581958
/*
19591959
* TS_execute callback for matching a tsquery operand to headline words
1960+
*
1961+
* Note: it's tempting to report words[] indexes as pos values to save
1962+
* searching in hlCover; but that would screw up phrase matching, which
1963+
* expects to measure distances in lexemes not tokens.
19601964
*/
19611965
static TSTernaryValue
19621966
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
19631967
{
19641968
hlCheck *checkval = (hlCheck *) opaque;
19651969
int i;
19661970

1967-
/* scan words array for marching items */
1971+
/* scan words array for matching items */
19681972
for (i = 0; i < checkval->len; i++)
19691973
{
19701974
if (checkval->words[i].item == val)
@@ -1993,35 +1997,15 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
19931997
return TS_NO;
19941998
}
19951999

1996-
/*
1997-
* hlFirstIndex: find first index >= pos containing any word used in query
1998-
*
1999-
* Returns -1 if no such index
2000-
*/
2001-
static int
2002-
hlFirstIndex(HeadlineParsedText *prs, int pos)
2003-
{
2004-
int i;
2005-
2006-
for (i = pos; i < prs->curwords; i++)
2007-
{
2008-
if (prs->words[i].item != NULL)
2009-
return i;
2010-
}
2011-
return -1;
2012-
}
2013-
20142000
/*
20152001
* hlCover: try to find a substring of prs' word list that satisfies query
20162002
*
2017-
* At entry, *p must be the first word index to consider (initialize this
2018-
* to zero, or to the next index after a previous successful search).
2019-
* We will consider all substrings starting at or after that word, and
2020-
* containing no more than max_cover words. (We need a length limit to
2021-
* keep this from taking O(N^2) time for a long document with many query
2022-
* words but few complete matches. Actually, since checkcondition_HL is
2023-
* roughly O(N) in the length of the substring being checked, it's even
2024-
* worse than that.)
2003+
* locations is the result of TS_execute_locations() for the query.
2004+
* We use this to identify plausible subranges of the query.
2005+
*
2006+
* *nextpos is the lexeme position (NOT word index) to start the search
2007+
* at. Caller should initialize this to zero. If successful, we'll
2008+
* advance it to the next place to search at.
20252009
*
20262010
* On success, sets *p to first word index and *q to last word index of the
20272011
* cover substring, and returns true.
@@ -2030,57 +2014,149 @@ hlFirstIndex(HeadlineParsedText *prs, int pos)
20302014
* words used in the query.
20312015
*/
20322016
static bool
2033-
hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
2034-
int *p, int *q)
2017+
hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2018+
int *nextpos, int *p, int *q)
20352019
{
2036-
int pmin,
2037-
pmax,
2038-
nextpmin,
2039-
nextpmax;
2040-
hlCheck ch;
2020+
int pos = *nextpos;
20412021

2042-
/*
2043-
* We look for the earliest, shortest substring of prs->words that
2044-
* satisfies the query. Both the pmin and pmax indices must be words
2045-
* appearing in the query; there's no point in trying endpoints in between
2046-
* such points.
2047-
*/
2048-
pmin = hlFirstIndex(prs, *p);
2049-
while (pmin >= 0)
2022+
/* This loop repeats when our selected word-range fails the query */
2023+
for (;;)
20502024
{
2051-
/* This useless assignment just keeps stupider compilers quiet */
2052-
nextpmin = -1;
2053-
/* Consider substrings starting at pmin */
2054-
ch.words = &(prs->words[pmin]);
2055-
/* Consider the length-one substring first, then longer substrings */
2056-
pmax = pmin;
2057-
do
2025+
int posb,
2026+
pose;
2027+
ListCell *lc;
2028+
2029+
/*
2030+
* For each AND'ed query term or phrase, find its first occurrence at
2031+
* or after pos; set pose to the maximum of those positions.
2032+
*
2033+
* We need not consider ORs or NOTs here; see the comments for
2034+
* TS_execute_locations(). Rechecking the match with TS_execute(),
2035+
* below, will deal with any ensuing imprecision.
2036+
*/
2037+
pose = -1;
2038+
foreach(lc, locations)
2039+
{
2040+
ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2041+
int first = -1;
2042+
2043+
for (int i = 0; i < pdata->npos; i++)
2044+
{
2045+
/* For phrase matches, use the ending lexeme */
2046+
int endp = pdata->pos[i];
2047+
2048+
if (endp >= pos)
2049+
{
2050+
first = endp;
2051+
break;
2052+
}
2053+
}
2054+
if (first < 0)
2055+
return false; /* no more matches for this term */
2056+
if (first > pose)
2057+
pose = first;
2058+
}
2059+
2060+
if (pose < 0)
2061+
return false; /* we only get here if empty list */
2062+
2063+
/*
2064+
* Now, for each AND'ed query term or phrase, find its last occurrence
2065+
* at or before pose; set posb to the minimum of those positions.
2066+
*
2067+
* We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2068+
* posb + 1 below.
2069+
*/
2070+
posb = INT_MAX - 1;
2071+
foreach(lc, locations)
20582072
{
2059-
/* Try to match query against pmin .. pmax substring */
2060-
ch.len = pmax - pmin + 1;
2061-
if (TS_execute(GETQUERY(query), &ch,
2062-
TS_EXEC_EMPTY, checkcondition_HL))
2073+
ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2074+
int last = -1;
2075+
2076+
for (int i = pdata->npos - 1; i >= 0; i--)
20632077
{
2064-
*p = pmin;
2065-
*q = pmax;
2066-
return true;
2078+
/* For phrase matches, use the starting lexeme */
2079+
int startp = pdata->pos[i] - pdata->width;
2080+
2081+
if (startp <= pose)
2082+
{
2083+
last = startp;
2084+
break;
2085+
}
20672086
}
2068-
/* Nope, so advance pmax to next feasible endpoint */
2069-
nextpmax = hlFirstIndex(prs, pmax + 1);
2087+
if (last < posb)
2088+
posb = last;
2089+
}
2090+
2091+
/*
2092+
* We could end up with posb to the left of pos, in case some phrase
2093+
* match crosses pos. Try the match starting at pos anyway, since the
2094+
* result of TS_execute_locations is imprecise for phrase matches OR'd
2095+
* with plain matches; that is, if the query is "(A <-> B) | C" then C
2096+
* could match at pos even though the phrase match would have to
2097+
* extend to the left of pos.
2098+
*/
2099+
posb = Max(posb, pos);
20702100

2101+
/* This test probably always succeeds, but be paranoid */
2102+
if (posb <= pose)
2103+
{
20712104
/*
2072-
* If this is our first advance past pmin, then the result is also
2073-
* the next feasible value of pmin; remember it to save a
2074-
* redundant search.
2105+
* posb .. pose is now the shortest, earliest-after-pos range of
2106+
* lexeme positions containing all the query terms. It will
2107+
* contain all phrase matches, too, except in the corner case
2108+
* described just above.
2109+
*
2110+
* Now convert these lexeme positions to indexes in prs->words[].
20752111
*/
2076-
if (pmax == pmin)
2077-
nextpmin = nextpmax;
2078-
pmax = nextpmax;
2112+
int idxb = -1;
2113+
int idxe = -1;
2114+
2115+
for (int i = 0; i < prs->curwords; i++)
2116+
{
2117+
if (prs->words[i].item == NULL)
2118+
continue;
2119+
if (idxb < 0 && prs->words[i].pos >= posb)
2120+
idxb = i;
2121+
if (prs->words[i].pos <= pose)
2122+
idxe = i;
2123+
else
2124+
break;
2125+
}
2126+
2127+
/* This test probably always succeeds, but be paranoid */
2128+
if (idxb >= 0 && idxe >= idxb)
2129+
{
2130+
/*
2131+
* Finally, check that the selected range satisfies the query.
2132+
* This should succeed in all simple cases; but odd cases
2133+
* involving non-top-level NOT conditions or phrase matches
2134+
* OR'd with other things could fail, since the result of
2135+
* TS_execute_locations doesn't fully represent such things.
2136+
*/
2137+
hlCheck ch;
2138+
2139+
ch.words = &(prs->words[idxb]);
2140+
ch.len = idxe - idxb + 1;
2141+
if (TS_execute(GETQUERY(query), &ch,
2142+
TS_EXEC_EMPTY, checkcondition_HL))
2143+
{
2144+
/* Match! Advance *nextpos and return the word range. */
2145+
*nextpos = posb + 1;
2146+
*p = idxb;
2147+
*q = idxe;
2148+
return true;
2149+
}
2150+
}
20792151
}
2080-
while (pmax >= 0 && pmax - pmin < max_cover);
2081-
/* No luck here, so try next feasible startpoint */
2082-
pmin = nextpmin;
2152+
2153+
/*
2154+
* Advance pos and try again. Any later workable match must start
2155+
* beyond posb.
2156+
*/
2157+
pos = posb + 1;
20832158
}
2159+
/* Can't get here, but stupider compilers complain if we leave it off */
20842160
return false;
20852161
}
20862162

@@ -2177,9 +2253,10 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
21772253
* it only controls presentation details.
21782254
*/
21792255
static void
2180-
mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2256+
mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
2257+
bool highlightall,
21812258
int shortword, int min_words,
2182-
int max_words, int max_fragments, int max_cover)
2259+
int max_words, int max_fragments)
21832260
{
21842261
int32 poslen,
21852262
curlen,
@@ -2192,6 +2269,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
21922269

21932270
int32 startpos = 0,
21942271
endpos = 0,
2272+
nextpos = 0,
21952273
p = 0,
21962274
q = 0;
21972275

@@ -2206,7 +2284,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
22062284
covers = palloc(maxcovers * sizeof(CoverPos));
22072285

22082286
/* get all covers */
2209-
while (hlCover(prs, query, max_cover, &p, &q))
2287+
while (hlCover(prs, query, locations, &nextpos, &p, &q))
22102288
{
22112289
startpos = p;
22122290
endpos = q;
@@ -2236,9 +2314,6 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
22362314
startpos = endpos + 1;
22372315
endpos = q;
22382316
}
2239-
2240-
/* move p to generate the next cover */
2241-
p++;
22422317
}
22432318

22442319
/* choose best covers */
@@ -2360,10 +2435,12 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23602435
* Headline selector used when MaxFragments == 0
23612436
*/
23622437
static void
2363-
mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2364-
int shortword, int min_words, int max_words, int max_cover)
2438+
mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
2439+
bool highlightall,
2440+
int shortword, int min_words, int max_words)
23652441
{
2366-
int p = 0,
2442+
int nextpos = 0,
2443+
p = 0,
23672444
q = 0;
23682445
int bestb = -1,
23692446
beste = -1;
@@ -2379,7 +2456,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23792456
if (!highlightall)
23802457
{
23812458
/* examine all covers, select a headline using the best one */
2382-
while (hlCover(prs, query, max_cover, &p, &q))
2459+
while (hlCover(prs, query, locations, &nextpos, &p, &q))
23832460
{
23842461
/*
23852462
* Count words (curlen) and interesting words (poslen) within
@@ -2486,9 +2563,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
24862563
bestlen = poslen;
24872564
bestcover = poscover;
24882565
}
2489-
2490-
/* move p to generate the next cover */
2491-
p++;
24922566
}
24932567

24942568
/*
@@ -2528,14 +2602,15 @@ prsd_headline(PG_FUNCTION_ARGS)
25282602
HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
25292603
List *prsoptions = (List *) PG_GETARG_POINTER(1);
25302604
TSQuery query = PG_GETARG_TSQUERY(2);
2605+
hlCheck ch;
2606+
List *locations;
25312607

25322608
/* default option values: */
25332609
int min_words = 15;
25342610
int max_words = 35;
25352611
int shortword = 3;
25362612
int max_fragments = 0;
25372613
bool highlightall = false;
2538-
int max_cover;
25392614
ListCell *l;
25402615

25412616
/* Extract configuration option values */
@@ -2575,15 +2650,6 @@ prsd_headline(PG_FUNCTION_ARGS)
25752650
defel->defname)));
25762651
}
25772652

2578-
/*
2579-
* We might eventually make max_cover a user-settable parameter, but for
2580-
* now, just compute a reasonable value based on max_words and
2581-
* max_fragments.
2582-
*/
2583-
max_cover = Max(max_words * 10, 100);
2584-
if (max_fragments > 0)
2585-
max_cover *= max_fragments;
2586-
25872653
/* in HighlightAll mode these parameters are ignored */
25882654
if (!highlightall)
25892655
{
@@ -2605,13 +2671,19 @@ prsd_headline(PG_FUNCTION_ARGS)
26052671
errmsg("MaxFragments should be >= 0")));
26062672
}
26072673

2674+
/* Locate words and phrases matching the query */
2675+
ch.words = prs->words;
2676+
ch.len = prs->curwords;
2677+
locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2678+
checkcondition_HL);
2679+
26082680
/* Apply appropriate headline selector */
26092681
if (max_fragments == 0)
2610-
mark_hl_words(prs, query, highlightall, shortword,
2611-
min_words, max_words, max_cover);
2682+
mark_hl_words(prs, query, locations, highlightall, shortword,
2683+
min_words, max_words);
26122684
else
2613-
mark_hl_fragments(prs, query, highlightall, shortword,
2614-
min_words, max_words, max_fragments, max_cover);
2685+
mark_hl_fragments(prs, query, locations, highlightall, shortword,
2686+
min_words, max_words, max_fragments);
26152687

26162688
/* Fill in default values for string options */
26172689
if (!prs->startsel)

0 commit comments

Comments
 (0)