Skip to content

Commit 20de682

Browse files
committed
Phrase search ported from 9.6
1 parent 5444f06 commit 20de682

File tree

26 files changed

+910
-787
lines changed

26 files changed

+910
-787
lines changed

contrib/tsearch2/expected/tsearch2.out

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ select 'a' > 'b & c'::tsquery;
340340
select 'a | f' < 'b & c'::tsquery;
341341
?column?
342342
----------
343-
f
343+
t
344344
(1 row)
345345

346346
select 'a | ff' < 'b & c'::tsquery;
@@ -445,7 +445,7 @@ set enable_seqscan=on;
445445
select rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city');
446446
rewrite
447447
------------------------------------------------------------------------------
448-
'foo' & 'bar' & 'qq' & ( 'nyc' | 'big' & 'apple' | 'city' & 'new' & 'york' )
448+
'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | 'nyc' | 'big' & 'apple' )
449449
(1 row)
450450

451451
select rewrite('moscow', 'select keyword, sample from test_tsquery'::text );
@@ -463,7 +463,7 @@ select rewrite('moscow & hotel', 'select keyword, sample from test_tsquery'::tex
463463
select rewrite('bar & new & qq & foo & york', 'select keyword, sample from test_tsquery'::text );
464464
rewrite
465465
---------------------------------------------------------------------------------
466-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
466+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
467467
(1 row)
468468

469469
select rewrite( ARRAY['moscow', keyword, sample] ) from test_tsquery;
@@ -481,7 +481,7 @@ select rewrite( ARRAY['moscow & hotel', keyword, sample] ) from test_tsquery;
481481
select rewrite( ARRAY['bar & new & qq & foo & york', keyword, sample] ) from test_tsquery;
482482
rewrite
483483
---------------------------------------------------------------------------------
484-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
484+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
485485
(1 row)
486486

487487
select keyword from test_tsquery where keyword @> 'new';
@@ -522,7 +522,7 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
522522
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query;
523523
rewrite
524524
---------------------------------------------------------------------------------
525-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
525+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
526526
(1 row)
527527

528528
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword;
@@ -540,7 +540,7 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
540540
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword;
541541
rewrite
542542
---------------------------------------------------------------------------------
543-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
543+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
544544
(1 row)
545545

546546
create index qq on test_tsquery using gist (keyword gist_tp_tsquery_ops);
@@ -583,7 +583,7 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
583583
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query;
584584
rewrite
585585
---------------------------------------------------------------------------------
586-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
586+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
587587
(1 row)
588588

589589
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword;
@@ -601,7 +601,7 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
601601
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword;
602602
rewrite
603603
---------------------------------------------------------------------------------
604-
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
604+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
605605
(1 row)
606606

607607
set enable_seqscan='on';

doc/src/sgml/datatype.sgml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3901,7 +3901,7 @@ SELECT to_tsvector('english', 'The Fat Rats');
39013901
A <type>tsquery</type> value stores lexemes that are to be
39023902
searched for, and combines them honoring the Boolean operators
39033903
<literal>&amp;</literal> (AND), <literal>|</literal> (OR),
3904-
<literal>!</> (NOT) and <literal>?</> (FOLLOWED BY) phrase search
3904+
<literal>!</> (NOT) and <literal>&lt;-&gt;</> (FOLLOWED BY) phrase search
39053905
operator. Parentheses can be used to enforce grouping
39063906
of the operators:
39073907

@@ -3923,7 +3923,7 @@ SELECT 'fat &amp; rat &amp; ! cat'::tsquery;
39233923
</programlisting>
39243924

39253925
In the absence of parentheses, <literal>!</> (NOT) binds most tightly,
3926-
and <literal>&amp;</literal> (AND) and <literal>?</literal> (FOLLOWED BY)
3926+
and <literal>&amp;</literal> (AND) and <literal>&lt;-&gt;</literal> (FOLLOWED BY)
39273927
both bind more tightly than <literal>|</literal> (OR).
39283928
</para>
39293929

doc/src/sgml/func.sgml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8962,10 +8962,10 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
89628962
<entry><literal>!'cat'</literal></entry>
89638963
</row>
89648964
<row>
8965-
<entry> <literal>??</literal> </entry>
8965+
<entry> <literal>&lt;-&gt;</literal> </entry>
89668966
<entry><type>tsquery</> followed by <type>tsquery</></entry>
8967-
<entry><literal>to_tsquery('fat') ?? to_tsquery('rat')</literal></entry>
8968-
<entry><literal>'fat' ? 'rat'</literal></entry>
8967+
<entry><literal>to_tsquery('fat') &lt;-&gt; to_tsquery('rat')</literal></entry>
8968+
<entry><literal>'fat' &lt;-&gt; 'rat'</literal></entry>
89698969
</row>
89708970
<row>
89718971
<entry> <literal>@&gt;</literal> </entry>
@@ -9069,7 +9069,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
90699069
<entry><type>tsquery</type></entry>
90709070
<entry>produce <type>tsquery</> ignoring punctuation</entry>
90719071
<entry><literal>phraseto_tsquery('english', 'The Fat Rats')</literal></entry>
9072-
<entry><literal>'fat' ? 'rat'</literal></entry>
9072+
<entry><literal>'fat' &lt;-&gt; 'rat'</literal></entry>
90739073
</row>
90749074
<row>
90759075
<entry>
@@ -9203,9 +9203,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
92039203
<literal><function>tsquery_phrase(<replaceable class="PARAMETER">query1</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">query2</replaceable> <type>tsquery</>)</function></literal>
92049204
</entry>
92059205
<entry><type>tsquery</type></entry>
9206-
<entry>implementation of <literal>??</> (FOLLOWED BY) operator</entry>
9206+
<entry>implementation of <literal>&lt;-&gt;</> (FOLLOWED BY) operator</entry>
92079207
<entry><literal>tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'))</literal></entry>
9208-
<entry><literal>'fat' ? 'cat'</literal></entry>
9208+
<entry><literal>'fat' &lt;-&gt; 'cat'</literal></entry>
92099209
</row>
92109210
<row>
92119211
<entry>
@@ -9214,7 +9214,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
92149214
<entry><type>tsquery</type></entry>
92159215
<entry>phrase-concatenate with distance</entry>
92169216
<entry><literal>tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10)</literal></entry>
9217-
<entry><literal>'fat' ?[10] 'cat'</literal></entry>
9217+
<entry><literal>'fat' &lt;10&gt; 'cat'</literal></entry>
92189218
</row>
92199219
<row>
92209220
<entry>

doc/src/sgml/textsearch.sgml

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -263,9 +263,10 @@ SELECT 'fat &amp; cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t
263263
As the above example suggests, a <type>tsquery</type> is not just raw
264264
text, any more than a <type>tsvector</type> is. A <type>tsquery</type>
265265
contains search terms, which must be already-normalized lexemes, and
266-
may combine multiple terms using AND, OR, and NOT operators.
266+
may combine multiple terms using AND, OR, NOT and FOLLOWED BY operators.
267267
(For details see <xref linkend="datatype-textsearch">.) There are
268-
functions <function>to_tsquery</> and <function>plainto_tsquery</>
268+
functions <function>to_tsquery</>, <function>plainto_tsquery</>
269+
and <function>phraseto_tsquery</>
269270
that are helpful in converting user-written text into a proper
270271
<type>tsquery</type>, for example by normalizing words appearing in
271272
the text. Similarly, <function>to_tsvector</> is used to parse and
@@ -293,6 +294,35 @@ SELECT 'fat cats ate fat rats'::tsvector @@ to_tsquery('fat &amp; rat');
293294
already normalized, so <literal>rats</> does not match <literal>rat</>.
294295
</para>
295296

297+
<para>
298+
Phrase search is made possible with the help of the <literal>&lt;-&gt;</>
299+
(FOLLOWED BY) operator, which enforces lexeme order. This allows you
300+
to discard strings not containing the desired phrase, for example:
301+
302+
<programlisting>
303+
SELECT q @@ to_tsquery('fatal &lt;-&gt; error')
304+
FROM unnest(array[to_tsvector('fatal error'),
305+
to_tsvector('error is not fatal')]) AS q;
306+
?column?
307+
----------
308+
t
309+
f
310+
</programlisting>
311+
312+
A more generic version of the FOLLOWED BY operator takes the form of
313+
<literal>&lt;N&gt;</>, where N stands for the greatest allowed distance
314+
between the specified lexemes. The <literal>phraseto_tsquery</>
315+
function makes use of this behavior in order to construct a
316+
<literal>tsquery</> capable of matching the provided phrase:
317+
318+
<programlisting>
319+
SELECT phraseto_tsquery('cat ate some rats');
320+
phraseto_tsquery
321+
-------------------------------
322+
( 'cat' &lt;-&gt; 'ate' ) &lt;2&gt; 'rat'
323+
</programlisting>
324+
</para>
325+
296326
<para>
297327
The <literal>@@</literal> operator also
298328
supports <type>text</type> input, allowing explicit conversion of a text
@@ -732,7 +762,7 @@ to_tsquery(<optional> <replaceable class="PARAMETER">config</replaceable> <type>
732762
<replaceable>querytext</replaceable>, which must consist of single tokens
733763
separated by the Boolean operators <literal>&amp;</literal> (AND),
734764
<literal>|</literal> (OR), <literal>!</literal> (NOT), and also the
735-
<literal>?</literal> (FOLLOWED BY) phrase search operator. These operators
765+
<literal>&lt;-&gt;</literal> (FOLLOWED BY) phrase search operator. These operators
736766
can be grouped using parentheses. In other words, the input to
737767
<function>to_tsquery</function> must already follow the general rules for
738768
<type>tsquery</> input, as described in <xref
@@ -842,7 +872,7 @@ phraseto_tsquery(<optional> <replaceable class="PARAMETER">config</replaceable>
842872
<para>
843873
<function>phraseto_tsquery</> behaves much like
844874
<function>plainto_tsquery</>, with the exception
845-
that it utilizes the <literal>?</literal> (FOLLOWED BY) phrase search
875+
that it utilizes the <literal>&lt;-&gt;</literal> (FOLLOWED BY) phrase search
846876
operator instead of the <literal>&amp;</literal> (AND) Boolean operator.
847877
This is particularly useful when searching for exact lexeme sequences,
848878
since the phrase search operator helps to maintain lexeme order.
@@ -853,9 +883,9 @@ phraseto_tsquery(<optional> <replaceable class="PARAMETER">config</replaceable>
853883

854884
<screen>
855885
SELECT phraseto_tsquery('english', 'The Fat Rats');
856-
phraseto_tsquery
886+
phraseto_tsquery
857887
------------------
858-
'fat' ? 'rat'
888+
'fat' &lt;-&gt; 'rat'
859889
</screen>
860890

861891
Just like the <function>plainto_tsquery</>, the
@@ -865,9 +895,20 @@ SELECT phraseto_tsquery('english', 'The Fat Rats');
865895

866896
<screen>
867897
SELECT phraseto_tsquery('english', 'The Fat &amp; Rats:C');
868-
phraseto_tsquery
869-
-------------------------
870-
( 'fat' ? 'rat' ) ? 'c'
898+
phraseto_tsquery
899+
-----------------------------
900+
( 'fat' &lt;-&gt; 'rat' ) &lt;-&gt; 'c'
901+
</screen>
902+
903+
It is possible to specify the configuration to be used to parse the document;
904+
for example, we could create a new one using the hunspell dictionary
905+
(namely 'eng_hunspell') in order to match phrases with different word forms:
906+
907+
<screen>
908+
SELECT phraseto_tsquery('eng_hunspell', 'developer of the building which collapsed');
909+
phraseto_tsquery
910+
--------------------------------------------------------------------------------------------
911+
( 'developer' &lt;3&gt; 'building' ) &lt;2&gt; 'collapse' | ( 'developer' &lt;3&gt; 'build' ) &lt;2&gt; 'collapse'
871912
</screen>
872913
</para>
873914

@@ -1430,18 +1471,18 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
14301471
<varlistentry>
14311472

14321473
<term>
1433-
<literal><type>tsquery</> ?? <type>tsquery</></literal>
1474+
<literal><type>tsquery</> &lt;-&gt; <type>tsquery</></literal>
14341475
</term>
14351476

14361477
<listitem>
14371478
<para>
14381479
Returns the phrase-concatenation of the two given queries.
14391480

14401481
<screen>
1441-
SELECT to_tsquery('fat') ?? to_tsquery('cat | rat');
1442-
?column?
1443-
-------------------------------
1444-
'fat' ? 'cat' | 'fat' ? 'rat'
1482+
SELECT to_tsquery('fat') &lt;-&gt; to_tsquery('cat | rat');
1483+
?column?
1484+
-----------------------------------
1485+
'fat' &lt;-&gt; 'cat' | 'fat' &lt;-&gt; 'rat'
14451486
</screen>
14461487
</para>
14471488
</listitem>
@@ -1461,13 +1502,13 @@ SELECT to_tsquery('fat') ?? to_tsquery('cat | rat');
14611502
<listitem>
14621503
<para>
14631504
Returns the distanced phrase-concatenation of the two given queries.
1464-
This function lies in the implementation of the <literal>??</> operator.
1505+
This function underlies the implementation of the <literal>&lt;-&gt;</> operator.
14651506

14661507
<screen>
14671508
SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10);
1468-
tsquery_phrase
1469-
-------------------
1470-
'fat' ?[10] 'cat'
1509+
tsquery_phrase
1510+
------------------
1511+
'fat' &lt;10&gt; 'cat'
14711512
</screen>
14721513
</para>
14731514
</listitem>
@@ -1487,10 +1528,10 @@ SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10);
14871528
<listitem>
14881529
<para>
14891530
<function>setweight</> returns a copy of the input query in which every
1490-
position has been labeled with the given <replaceable>weight</>, either
1491-
<literal>A</literal>, <literal>B</literal>, <literal>C</literal>, or
1492-
<literal>D</literal>. These labels are retained when queries are
1493-
concatenated, allowing words from different parts of a document
1531+
position has been labeled with the given <replaceable>weight</>(s), either
1532+
<literal>A</literal>, <literal>B</literal>, <literal>C</literal>,
1533+
<literal>D</literal> or their combination. These labels are retained when
1534+
queries are concatenated, allowing words from different parts of a document
14941535
to be weighted differently by ranking functions.
14951536
</para>
14961537

src/backend/tsearch/to_tsany.c

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
#include "utils/builtins.h"
2020

2121

22+
typedef struct MorphOpaque
23+
{
24+
Oid cfg_id;
25+
int qoperator; /* query operator */
26+
} MorphOpaque;
27+
28+
2229
Datum
2330
get_current_ts_config(PG_FUNCTION_ARGS)
2431
{
@@ -254,11 +261,6 @@ to_tsvector(PG_FUNCTION_ARGS)
254261
* to_tsquery
255262
*/
256263

257-
typedef struct MorphOpaque
258-
{
259-
Oid cfg_id;
260-
int qoperator; /* query operator */
261-
} MorphOpaque;
262264

263265
/*
264266
* This function is used for morph parsing.
@@ -268,7 +270,7 @@ typedef struct MorphOpaque
268270
* to the stack.
269271
*
270272
* All words belonging to the same variant are pushed as an ANDed list,
271-
* and different variants are ORred together.
273+
* and different variants are ORed together.
272274
*/
273275
static void
274276
pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
@@ -293,11 +295,15 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval,
293295
{
294296
while (count < prs.curwords)
295297
{
298+
/*
299+
* Were any stop words removed? If so, fill empty positions
300+
* with placeholders linked by an appropriate operator.
301+
*/
296302
if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
297303
{
298304
while (pos + 1 < prs.words[count].pos.pos)
299305
{
300-
/* put placeholders for each stop word */
306+
/* put placeholders for each missing stop word */
301307
pushStop(state);
302308
if (cntpos)
303309
pushOperator(state, data->qoperator, 1);
@@ -306,25 +312,25 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval,
306312
}
307313
}
308314

309-
pos = prs.words[count].pos.pos;
315+
pos = prs.words[count].pos.pos; /* save current word's position */
316+
317+
/* Go through all variants obtained from this token */
310318
cntvar = 0;
311319
while (count < prs.curwords && pos == prs.words[count].pos.pos)
312320
{
313321
variant = prs.words[count].nvariant;
314322

323+
/* Push all words belonging to the same variant */
315324
cnt = 0;
316325
while (count < prs.curwords &&
317326
pos == prs.words[count].pos.pos &&
318327
variant == prs.words[count].nvariant)
319328
{
320-
321329
pushValue(state,
322330
prs.words[count].word,
323331
prs.words[count].len,
324332
weight,
325-
((prs.words[count].flags & TSL_PREFIX) || prefix) ?
326-
true :
327-
false);
333+
((prs.words[count].flags & TSL_PREFIX) || prefix));
328334
pfree(prs.words[count].word);
329335
if (cnt)
330336
pushOperator(state, OP_AND, 0);
@@ -338,11 +344,12 @@ pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval,
338344
}
339345

340346
if (cntpos)
341-
pushOperator(state, data->qoperator, 1);
347+
pushOperator(state, data->qoperator, 1); /* distance may be useful */
342348
cntpos++;
343349
}
344350

345351
pfree(prs.words);
352+
346353
}
347354
else
348355
pushStop(state);

src/backend/tsearch/ts_parse.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,6 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
498498

499499
while (lexs)
500500
{
501-
502501
if (lexs->type > 0)
503502
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
504503

0 commit comments

Comments
 (0)