@@ -1957,14 +1957,18 @@ typedef struct
1957
1957
1958
1958
/*
1959
1959
* TS_execute callback for matching a tsquery operand to headline words
1960
+ *
1961
+ * Note: it's tempting to report words[] indexes as pos values to save
1962
+ * searching in hlCover; but that would screw up phrase matching, which
1963
+ * expects to measure distances in lexemes not tokens.
1960
1964
*/
1961
1965
static TSTernaryValue
1962
1966
checkcondition_HL (void * opaque , QueryOperand * val , ExecPhraseData * data )
1963
1967
{
1964
1968
hlCheck * checkval = (hlCheck * ) opaque ;
1965
1969
int i ;
1966
1970
1967
- /* scan words array for marching items */
1971
+ /* scan words array for matching items */
1968
1972
for (i = 0 ; i < checkval -> len ; i ++ )
1969
1973
{
1970
1974
if (checkval -> words [i ].item == val )
@@ -1993,35 +1997,15 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1993
1997
return TS_NO ;
1994
1998
}
1995
1999
1996
- /*
1997
- * hlFirstIndex: find first index >= pos containing any word used in query
1998
- *
1999
- * Returns -1 if no such index
2000
- */
2001
- static int
2002
- hlFirstIndex (HeadlineParsedText * prs , int pos )
2003
- {
2004
- int i ;
2005
-
2006
- for (i = pos ; i < prs -> curwords ; i ++ )
2007
- {
2008
- if (prs -> words [i ].item != NULL )
2009
- return i ;
2010
- }
2011
- return -1 ;
2012
- }
2013
-
2014
2000
/*
2015
2001
* hlCover: try to find a substring of prs' word list that satisfies query
2016
2002
*
2017
- * At entry, *p must be the first word index to consider (initialize this
2018
- * to zero, or to the next index after a previous successful search).
2019
- * We will consider all substrings starting at or after that word, and
2020
- * containing no more than max_cover words. (We need a length limit to
2021
- * keep this from taking O(N^2) time for a long document with many query
2022
- * words but few complete matches. Actually, since checkcondition_HL is
2023
- * roughly O(N) in the length of the substring being checked, it's even
2024
- * worse than that.)
2003
+ * locations is the result of TS_execute_locations() for the query.
2004
+ * We use this to identify plausible subranges of the query.
2005
+ *
2006
+ * *nextpos is the lexeme position (NOT word index) to start the search
2007
+ * at. Caller should initialize this to zero. If successful, we'll
2008
+ * advance it to the next place to search at.
2025
2009
*
2026
2010
* On success, sets *p to first word index and *q to last word index of the
2027
2011
* cover substring, and returns true.
@@ -2030,57 +2014,149 @@ hlFirstIndex(HeadlineParsedText *prs, int pos)
2030
2014
* words used in the query.
2031
2015
*/
2032
2016
static bool
2033
- hlCover (HeadlineParsedText * prs , TSQuery query , int max_cover ,
2034
- int * p , int * q )
2017
+ hlCover (HeadlineParsedText * prs , TSQuery query , List * locations ,
2018
+ int * nextpos , int * p , int * q )
2035
2019
{
2036
- int pmin ,
2037
- pmax ,
2038
- nextpmin ,
2039
- nextpmax ;
2040
- hlCheck ch ;
2020
+ int pos = * nextpos ;
2041
2021
2042
- /*
2043
- * We look for the earliest, shortest substring of prs->words that
2044
- * satisfies the query. Both the pmin and pmax indices must be words
2045
- * appearing in the query; there's no point in trying endpoints in between
2046
- * such points.
2047
- */
2048
- pmin = hlFirstIndex (prs , * p );
2049
- while (pmin >= 0 )
2022
+ /* This loop repeats when our selected word-range fails the query */
2023
+ for (;;)
2050
2024
{
2051
- /* This useless assignment just keeps stupider compilers quiet */
2052
- nextpmin = -1 ;
2053
- /* Consider substrings starting at pmin */
2054
- ch .words = & (prs -> words [pmin ]);
2055
- /* Consider the length-one substring first, then longer substrings */
2056
- pmax = pmin ;
2057
- do
2025
+ int posb ,
2026
+ pose ;
2027
+ ListCell * lc ;
2028
+
2029
+ /*
2030
+ * For each AND'ed query term or phrase, find its first occurrence at
2031
+ * or after pos; set pose to the maximum of those positions.
2032
+ *
2033
+ * We need not consider ORs or NOTs here; see the comments for
2034
+ * TS_execute_locations(). Rechecking the match with TS_execute(),
2035
+ * below, will deal with any ensuing imprecision.
2036
+ */
2037
+ pose = -1 ;
2038
+ foreach (lc , locations )
2039
+ {
2040
+ ExecPhraseData * pdata = (ExecPhraseData * ) lfirst (lc );
2041
+ int first = -1 ;
2042
+
2043
+ for (int i = 0 ; i < pdata -> npos ; i ++ )
2044
+ {
2045
+ /* For phrase matches, use the ending lexeme */
2046
+ int endp = pdata -> pos [i ];
2047
+
2048
+ if (endp >= pos )
2049
+ {
2050
+ first = endp ;
2051
+ break ;
2052
+ }
2053
+ }
2054
+ if (first < 0 )
2055
+ return false; /* no more matches for this term */
2056
+ if (first > pose )
2057
+ pose = first ;
2058
+ }
2059
+
2060
+ if (pose < 0 )
2061
+ return false; /* we only get here if empty list */
2062
+
2063
+ /*
2064
+ * Now, for each AND'ed query term or phrase, find its last occurrence
2065
+ * at or before pose; set posb to the minimum of those positions.
2066
+ *
2067
+ * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2068
+ * posb + 1 below.
2069
+ */
2070
+ posb = INT_MAX - 1 ;
2071
+ foreach (lc , locations )
2058
2072
{
2059
- /* Try to match query against pmin .. pmax substring */
2060
- ch . len = pmax - pmin + 1 ;
2061
- if ( TS_execute ( GETQUERY ( query ), & ch ,
2062
- TS_EXEC_EMPTY , checkcondition_HL ) )
2073
+ ExecPhraseData * pdata = ( ExecPhraseData * ) lfirst ( lc );
2074
+ int last = - 1 ;
2075
+
2076
+ for ( int i = pdata -> npos - 1 ; i >= 0 ; i -- )
2063
2077
{
2064
- * p = pmin ;
2065
- * q = pmax ;
2066
- return true;
2078
+ /* For phrase matches, use the starting lexeme */
2079
+ int startp = pdata -> pos [i ] - pdata -> width ;
2080
+
2081
+ if (startp <= pose )
2082
+ {
2083
+ last = startp ;
2084
+ break ;
2085
+ }
2067
2086
}
2068
- /* Nope, so advance pmax to next feasible endpoint */
2069
- nextpmax = hlFirstIndex (prs , pmax + 1 );
2087
+ if (last < posb )
2088
+ posb = last ;
2089
+ }
2090
+
2091
+ /*
2092
+ * We could end up with posb to the left of pos, in case some phrase
2093
+ * match crosses pos. Try the match starting at pos anyway, since the
2094
+ * result of TS_execute_locations is imprecise for phrase matches OR'd
2095
+ * with plain matches; that is, if the query is "(A <-> B) | C" then C
2096
+ * could match at pos even though the phrase match would have to
2097
+ * extend to the left of pos.
2098
+ */
2099
+ posb = Max (posb , pos );
2070
2100
2101
+ /* This test probably always succeeds, but be paranoid */
2102
+ if (posb <= pose )
2103
+ {
2071
2104
/*
2072
- * If this is our first advance past pmin, then the result is also
2073
- * the next feasible value of pmin; remember it to save a
2074
- * redundant search.
2105
+ * posb .. pose is now the shortest, earliest-after-pos range of
2106
+ * lexeme positions containing all the query terms. It will
2107
+ * contain all phrase matches, too, except in the corner case
2108
+ * described just above.
2109
+ *
2110
+ * Now convert these lexeme positions to indexes in prs->words[].
2075
2111
*/
2076
- if (pmax == pmin )
2077
- nextpmin = nextpmax ;
2078
- pmax = nextpmax ;
2112
+ int idxb = -1 ;
2113
+ int idxe = -1 ;
2114
+
2115
+ for (int i = 0 ; i < prs -> curwords ; i ++ )
2116
+ {
2117
+ if (prs -> words [i ].item == NULL )
2118
+ continue ;
2119
+ if (idxb < 0 && prs -> words [i ].pos >= posb )
2120
+ idxb = i ;
2121
+ if (prs -> words [i ].pos <= pose )
2122
+ idxe = i ;
2123
+ else
2124
+ break ;
2125
+ }
2126
+
2127
+ /* This test probably always succeeds, but be paranoid */
2128
+ if (idxb >= 0 && idxe >= idxb )
2129
+ {
2130
+ /*
2131
+ * Finally, check that the selected range satisfies the query.
2132
+ * This should succeed in all simple cases; but odd cases
2133
+ * involving non-top-level NOT conditions or phrase matches
2134
+ * OR'd with other things could fail, since the result of
2135
+ * TS_execute_locations doesn't fully represent such things.
2136
+ */
2137
+ hlCheck ch ;
2138
+
2139
+ ch .words = & (prs -> words [idxb ]);
2140
+ ch .len = idxe - idxb + 1 ;
2141
+ if (TS_execute (GETQUERY (query ), & ch ,
2142
+ TS_EXEC_EMPTY , checkcondition_HL ))
2143
+ {
2144
+ /* Match! Advance *nextpos and return the word range. */
2145
+ * nextpos = posb + 1 ;
2146
+ * p = idxb ;
2147
+ * q = idxe ;
2148
+ return true;
2149
+ }
2150
+ }
2079
2151
}
2080
- while (pmax >= 0 && pmax - pmin < max_cover );
2081
- /* No luck here, so try next feasible startpoint */
2082
- pmin = nextpmin ;
2152
+
2153
+ /*
2154
+ * Advance pos and try again. Any later workable match must start
2155
+ * beyond posb.
2156
+ */
2157
+ pos = posb + 1 ;
2083
2158
}
2159
+ /* Can't get here, but stupider compilers complain if we leave it off */
2084
2160
return false;
2085
2161
}
2086
2162
@@ -2177,9 +2253,10 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2177
2253
* it only controls presentation details.
2178
2254
*/
2179
2255
static void
2180
- mark_hl_fragments (HeadlineParsedText * prs , TSQuery query , bool highlightall ,
2256
+ mark_hl_fragments (HeadlineParsedText * prs , TSQuery query , List * locations ,
2257
+ bool highlightall ,
2181
2258
int shortword , int min_words ,
2182
- int max_words , int max_fragments , int max_cover )
2259
+ int max_words , int max_fragments )
2183
2260
{
2184
2261
int32 poslen ,
2185
2262
curlen ,
@@ -2192,6 +2269,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2192
2269
2193
2270
int32 startpos = 0 ,
2194
2271
endpos = 0 ,
2272
+ nextpos = 0 ,
2195
2273
p = 0 ,
2196
2274
q = 0 ;
2197
2275
@@ -2206,7 +2284,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2206
2284
covers = palloc (maxcovers * sizeof (CoverPos ));
2207
2285
2208
2286
/* get all covers */
2209
- while (hlCover (prs , query , max_cover , & p , & q ))
2287
+ while (hlCover (prs , query , locations , & nextpos , & p , & q ))
2210
2288
{
2211
2289
startpos = p ;
2212
2290
endpos = q ;
@@ -2236,9 +2314,6 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2236
2314
startpos = endpos + 1 ;
2237
2315
endpos = q ;
2238
2316
}
2239
-
2240
- /* move p to generate the next cover */
2241
- p ++ ;
2242
2317
}
2243
2318
2244
2319
/* choose best covers */
@@ -2360,10 +2435,12 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2360
2435
* Headline selector used when MaxFragments == 0
2361
2436
*/
2362
2437
static void
2363
- mark_hl_words (HeadlineParsedText * prs , TSQuery query , bool highlightall ,
2364
- int shortword , int min_words , int max_words , int max_cover )
2438
+ mark_hl_words (HeadlineParsedText * prs , TSQuery query , List * locations ,
2439
+ bool highlightall ,
2440
+ int shortword , int min_words , int max_words )
2365
2441
{
2366
- int p = 0 ,
2442
+ int nextpos = 0 ,
2443
+ p = 0 ,
2367
2444
q = 0 ;
2368
2445
int bestb = -1 ,
2369
2446
beste = -1 ;
@@ -2379,7 +2456,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2379
2456
if (!highlightall )
2380
2457
{
2381
2458
/* examine all covers, select a headline using the best one */
2382
- while (hlCover (prs , query , max_cover , & p , & q ))
2459
+ while (hlCover (prs , query , locations , & nextpos , & p , & q ))
2383
2460
{
2384
2461
/*
2385
2462
* Count words (curlen) and interesting words (poslen) within
@@ -2486,9 +2563,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2486
2563
bestlen = poslen ;
2487
2564
bestcover = poscover ;
2488
2565
}
2489
-
2490
- /* move p to generate the next cover */
2491
- p ++ ;
2492
2566
}
2493
2567
2494
2568
/*
@@ -2528,14 +2602,15 @@ prsd_headline(PG_FUNCTION_ARGS)
2528
2602
HeadlineParsedText * prs = (HeadlineParsedText * ) PG_GETARG_POINTER (0 );
2529
2603
List * prsoptions = (List * ) PG_GETARG_POINTER (1 );
2530
2604
TSQuery query = PG_GETARG_TSQUERY (2 );
2605
+ hlCheck ch ;
2606
+ List * locations ;
2531
2607
2532
2608
/* default option values: */
2533
2609
int min_words = 15 ;
2534
2610
int max_words = 35 ;
2535
2611
int shortword = 3 ;
2536
2612
int max_fragments = 0 ;
2537
2613
bool highlightall = false;
2538
- int max_cover ;
2539
2614
ListCell * l ;
2540
2615
2541
2616
/* Extract configuration option values */
@@ -2575,15 +2650,6 @@ prsd_headline(PG_FUNCTION_ARGS)
2575
2650
defel -> defname )));
2576
2651
}
2577
2652
2578
- /*
2579
- * We might eventually make max_cover a user-settable parameter, but for
2580
- * now, just compute a reasonable value based on max_words and
2581
- * max_fragments.
2582
- */
2583
- max_cover = Max (max_words * 10 , 100 );
2584
- if (max_fragments > 0 )
2585
- max_cover *= max_fragments ;
2586
-
2587
2653
/* in HighlightAll mode these parameters are ignored */
2588
2654
if (!highlightall )
2589
2655
{
@@ -2605,13 +2671,19 @@ prsd_headline(PG_FUNCTION_ARGS)
2605
2671
errmsg ("MaxFragments should be >= 0" )));
2606
2672
}
2607
2673
2674
+ /* Locate words and phrases matching the query */
2675
+ ch .words = prs -> words ;
2676
+ ch .len = prs -> curwords ;
2677
+ locations = TS_execute_locations (GETQUERY (query ), & ch , TS_EXEC_EMPTY ,
2678
+ checkcondition_HL );
2679
+
2608
2680
/* Apply appropriate headline selector */
2609
2681
if (max_fragments == 0 )
2610
- mark_hl_words (prs , query , highlightall , shortword ,
2611
- min_words , max_words , max_cover );
2682
+ mark_hl_words (prs , query , locations , highlightall , shortword ,
2683
+ min_words , max_words );
2612
2684
else
2613
- mark_hl_fragments (prs , query , highlightall , shortword ,
2614
- min_words , max_words , max_fragments , max_cover );
2685
+ mark_hl_fragments (prs , query , locations , highlightall , shortword ,
2686
+ min_words , max_words , max_fragments );
2615
2687
2616
2688
/* Fill in default values for string options */
2617
2689
if (!prs -> startsel )
0 commit comments