Skip to content

Commit a15127b

Browse files
committed
Merge branch 'PGPRO9_5_phrase_search' into PGPRO9_5
Merge phrase search functionality.
2 parents dddbbc4 + 249bfc7 commit a15127b

29 files changed

+2412
-386
lines changed

contrib/tsearch2/expected/tsearch2.out

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -278,15 +278,15 @@ SELECT '(!1|2)&3'::tsquery;
278278
(1 row)
279279

280280
SELECT '1|(2|(4|(5|6)))'::tsquery;
281-
tsquery
282-
-----------------------------------------
283-
'1' | ( '2' | ( '4' | ( '5' | '6' ) ) )
281+
tsquery
282+
-----------------------------
283+
'1' | '2' | '4' | '5' | '6'
284284
(1 row)
285285

286286
SELECT '1|2|4|5|6'::tsquery;
287-
tsquery
288-
-----------------------------------------
289-
( ( ( '1' | '2' ) | '4' ) | '5' ) | '6'
287+
tsquery
288+
-----------------------------
289+
'1' | '2' | '4' | '5' | '6'
290290
(1 row)
291291

292292
SELECT '1&(2&(4&(5&6)))'::tsquery;
@@ -340,7 +340,7 @@ select 'a' > 'b & c'::tsquery;
340340
select 'a | f' < 'b & c'::tsquery;
341341
?column?
342342
----------
343-
t
343+
f
344344
(1 row)
345345

346346
select 'a | ff' < 'b & c'::tsquery;
@@ -443,9 +443,9 @@ select count(*) from test_tsquery where keyword > 'new & york';
443443

444444
set enable_seqscan=on;
445445
select rewrite('foo & bar & qq & new & york', 'new & york'::tsquery, 'big & apple | nyc | new & york & city');
446-
rewrite
447-
----------------------------------------------------------------------------------
448-
'foo' & 'bar' & 'qq' & ( 'city' & 'new' & 'york' | ( 'nyc' | 'big' & 'apple' ) )
446+
rewrite
447+
------------------------------------------------------------------------------
448+
'foo' & 'bar' & 'qq' & ( 'nyc' | 'big' & 'apple' | 'city' & 'new' & 'york' )
449449
(1 row)
450450

451451
select rewrite('moscow', 'select keyword, sample from test_tsquery'::text );
@@ -461,9 +461,9 @@ select rewrite('moscow & hotel', 'select keyword, sample from test_tsquery'::tex
461461
(1 row)
462462

463463
select rewrite('bar & new & qq & foo & york', 'select keyword, sample from test_tsquery'::text );
464-
rewrite
465-
-------------------------------------------------------------------------------------
466-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
464+
rewrite
465+
---------------------------------------------------------------------------------
466+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
467467
(1 row)
468468

469469
select rewrite( ARRAY['moscow', keyword, sample] ) from test_tsquery;
@@ -479,9 +479,9 @@ select rewrite( ARRAY['moscow & hotel', keyword, sample] ) from test_tsquery;
479479
(1 row)
480480

481481
select rewrite( ARRAY['bar & new & qq & foo & york', keyword, sample] ) from test_tsquery;
482-
rewrite
483-
-------------------------------------------------------------------------------------
484-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
482+
rewrite
483+
---------------------------------------------------------------------------------
484+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
485485
(1 row)
486486

487487
select keyword from test_tsquery where keyword @> 'new';
@@ -520,9 +520,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
520520
(1 row)
521521

522522
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query;
523-
rewrite
524-
-------------------------------------------------------------------------------------
525-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
523+
rewrite
524+
---------------------------------------------------------------------------------
525+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
526526
(1 row)
527527

528528
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword;
@@ -538,9 +538,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
538538
(1 row)
539539

540540
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword;
541-
rewrite
542-
-------------------------------------------------------------------------------------
543-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
541+
rewrite
542+
---------------------------------------------------------------------------------
543+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
544544
(1 row)
545545

546546
create index qq on test_tsquery using gist (keyword gist_tp_tsquery_ops);
@@ -581,9 +581,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
581581
(1 row)
582582

583583
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where keyword <@ query;
584-
rewrite
585-
-------------------------------------------------------------------------------------
586-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
584+
rewrite
585+
---------------------------------------------------------------------------------
586+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
587587
(1 row)
588588

589589
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'moscow') as query where query @> keyword;
@@ -599,9 +599,9 @@ select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('e
599599
(1 row)
600600

601601
select rewrite( ARRAY[query, keyword, sample] ) from test_tsquery, to_tsquery('english', 'bar & new & qq & foo & york') as query where query @> keyword;
602-
rewrite
603-
-------------------------------------------------------------------------------------
604-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | ( 'big' & 'appl' | 'new' & 'york' ) )
602+
rewrite
603+
---------------------------------------------------------------------------------
604+
( 'nyc' | 'big' & 'appl' | 'new' & 'york' ) & 'citi' & 'foo' & ( 'bar' | 'qq' )
605605
(1 row)
606606

607607
set enable_seqscan='on';

doc/src/sgml/datatype.sgml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3900,8 +3900,9 @@ SELECT to_tsvector('english', 'The Fat Rats');
39003900
<para>
39013901
A <type>tsquery</type> value stores lexemes that are to be
39023902
searched for, and combines them honoring the Boolean operators
3903-
<literal>&amp;</literal> (AND), <literal>|</literal> (OR), and
3904-
<literal>!</> (NOT). Parentheses can be used to enforce grouping
3903+
<literal>&amp;</literal> (AND), <literal>|</literal> (OR),
3904+
<literal>!</> (NOT), and the <literal>?</> (FOLLOWED BY) phrase search
3905+
operator. Parentheses can be used to enforce grouping
39053906
of the operators:
39063907

39073908
<programlisting>
@@ -3922,8 +3923,8 @@ SELECT 'fat &amp; rat &amp; ! cat'::tsquery;
39223923
</programlisting>
39233924

39243925
In the absence of parentheses, <literal>!</> (NOT) binds most tightly,
3925-
and <literal>&amp;</literal> (AND) binds more tightly than
3926-
<literal>|</literal> (OR).
3926+
and <literal>&amp;</literal> (AND) and <literal>?</literal> (FOLLOWED BY)
3927+
both bind more tightly than <literal>|</literal> (OR).
39273928
</para>
39283929

39293930
<para>

doc/src/sgml/func.sgml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8961,6 +8961,12 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
89618961
<entry><literal>!! 'cat'::tsquery</literal></entry>
89628962
<entry><literal>!'cat'</literal></entry>
89638963
</row>
8964+
<row>
8965+
<entry> <literal>??</literal> </entry>
8966+
<entry><type>tsquery</> followed by <type>tsquery</></entry>
8967+
<entry><literal>to_tsquery('fat') ?? to_tsquery('rat')</literal></entry>
8968+
<entry><literal>'fat' ? 'rat'</literal></entry>
8969+
</row>
89648970
<row>
89658971
<entry> <literal>@&gt;</literal> </entry>
89668972
<entry><type>tsquery</> contains another ?</entry>
@@ -9053,6 +9059,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
90539059
<entry><literal>plainto_tsquery('english', 'The Fat Rats')</literal></entry>
90549060
<entry><literal>'fat' &amp; 'rat'</literal></entry>
90559061
</row>
9062+
<row>
9063+
<entry>
9064+
<indexterm>
9065+
<primary>phraseto_tsquery</primary>
9066+
</indexterm>
9067+
<literal><function>phraseto_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
9068+
</entry>
9069+
<entry><type>tsquery</type></entry>
9070+
<entry>produce <type>tsquery</> that searches for a phrase, ignoring punctuation</entry>
9071+
<entry><literal>phraseto_tsquery('english', 'The Fat Rats')</literal></entry>
9072+
<entry><literal>'fat' ? 'rat'</literal></entry>
9073+
</row>
90569074
<row>
90579075
<entry>
90589076
<indexterm>
@@ -9077,6 +9095,15 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
90779095
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
90789096
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
90799097
</row>
9098+
<row>
9099+
<entry>
9100+
<literal><function>setweight(<type>tsquery</>, <type>"char"</>)</function></literal>
9101+
</entry>
9102+
<entry><type>tsquery</type></entry>
9103+
<entry>add weight to each element of <type>tsquery</></entry>
9104+
<entry><literal>setweight('fat ? cat &amp; rat:B'::tsquery, 'A')</literal></entry>
9105+
<entry><literal>( 'fat':A ? 'cat':A ) &amp; 'rat':AB</literal></entry>
9106+
</row>
90809107
<row>
90819108
<entry>
90829109
<indexterm>
@@ -9168,6 +9195,27 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
91689195
<entry><literal>SELECT ts_rewrite('a &amp; b'::tsquery, 'SELECT t,s FROM aliases')</literal></entry>
91699196
<entry><literal>'b' &amp; ( 'foo' | 'bar' )</literal></entry>
91709197
</row>
9198+
<row>
9199+
<entry>
9200+
<indexterm>
9201+
<primary>tsquery_phrase</primary>
9202+
</indexterm>
9203+
<literal><function>tsquery_phrase(<replaceable class="PARAMETER">query1</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">query2</replaceable> <type>tsquery</>)</function></literal>
9204+
</entry>
9205+
<entry><type>tsquery</type></entry>
9206+
<entry>implementation of <literal>??</> (FOLLOWED BY) operator</entry>
9207+
<entry><literal>tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'))</literal></entry>
9208+
<entry><literal>'fat' ? 'cat'</literal></entry>
9209+
</row>
9210+
<row>
9211+
<entry>
9212+
<literal><function>tsquery_phrase(<replaceable class="PARAMETER">query1</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">query2</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">distance</replaceable> <type>integer</>)</function></literal>
9213+
</entry>
9214+
<entry><type>tsquery</type></entry>
9215+
<entry>phrase-concatenate with distance</entry>
9216+
<entry><literal>tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10)</literal></entry>
9217+
<entry><literal>'fat' ?[10] 'cat'</literal></entry>
9218+
</row>
91719219
<row>
91729220
<entry>
91739221
<indexterm>

doc/src/sgml/textsearch.sgml

Lines changed: 127 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -709,11 +709,14 @@ UPDATE tt SET ti =
709709

710710
<para>
711711
<productname>&productname;</productname> provides the
712-
functions <function>to_tsquery</function> and
713-
<function>plainto_tsquery</function> for converting a query to
714-
the <type>tsquery</type> data type. <function>to_tsquery</function>
715-
offers access to more features than <function>plainto_tsquery</function>,
716-
but is less forgiving about its input.
712+
functions <function>to_tsquery</function>,
713+
<function>plainto_tsquery</function> and
714+
<function>phraseto_tsquery</function>
715+
for converting a query to the <type>tsquery</type> data type.
716+
<function>to_tsquery</function> offers access to more features
717+
than either <function>plainto_tsquery</function> or
718+
<function>phraseto_tsquery</function>, but is less forgiving
719+
about its input.
717720
</para>
718721

719722
<indexterm>
@@ -728,7 +731,8 @@ to_tsquery(<optional> <replaceable class="PARAMETER">config</replaceable> <type>
728731
<function>to_tsquery</function> creates a <type>tsquery</> value from
729732
<replaceable>querytext</replaceable>, which must consist of single tokens
730733
separated by the Boolean operators <literal>&amp;</literal> (AND),
731-
<literal>|</literal> (OR) and <literal>!</literal> (NOT). These operators
734+
<literal>|</literal> (OR), <literal>!</literal> (NOT), and also the
735+
<literal>?</literal> (FOLLOWED BY) phrase search operator. These operators
732736
can be grouped using parentheses. In other words, the input to
733737
<function>to_tsquery</function> must already follow the general rules for
734738
<type>tsquery</> input, as described in <xref
@@ -814,8 +818,8 @@ SELECT plainto_tsquery('english', 'The Fat Rats');
814818
</screen>
815819

816820
Note that <function>plainto_tsquery</> cannot
817-
recognize Boolean operators, weight labels, or prefix-match labels
818-
in its input:
821+
recognize Boolean and phrase search operators, weight labels,
822+
or prefix-match labels in its input:
819823

820824
<screen>
821825
SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
@@ -827,6 +831,46 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
827831
Here, all the input punctuation was discarded as being space symbols.
828832
</para>
829833

834+
<indexterm>
835+
<primary>phraseto_tsquery</primary>
836+
</indexterm>
837+
838+
<synopsis>
839+
phraseto_tsquery(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">querytext</replaceable> <type>text</>) returns <type>tsquery</>
840+
</synopsis>
841+
842+
<para>
843+
<function>phraseto_tsquery</> behaves much like
844+
<function>plainto_tsquery</>, with the exception
845+
that it utilizes the <literal>?</literal> (FOLLOWED BY) phrase search
846+
operator instead of the <literal>&amp;</literal> (AND) Boolean operator.
847+
This is particularly useful when searching for exact lexeme sequences,
848+
since the phrase search operator helps to maintain lexeme order.
849+
</para>
850+
851+
<para>
852+
Example:
853+
854+
<screen>
855+
SELECT phraseto_tsquery('english', 'The Fat Rats');
856+
phraseto_tsquery
857+
------------------
858+
'fat' ? 'rat'
859+
</screen>
860+
861+
Just like <function>plainto_tsquery</>, the
862+
<function>phraseto_tsquery</> function cannot
863+
recognize Boolean and phrase search operators, weight labels,
864+
or prefix-match labels in its input:
865+
866+
<screen>
867+
SELECT phraseto_tsquery('english', 'The Fat &amp; Rats:C');
868+
phraseto_tsquery
869+
-------------------------
870+
( 'fat' ? 'rat' ) ? 'c'
871+
</screen>
872+
</para>
873+
830874
</sect2>
831875

832876
<sect2 id="textsearch-ranking">
@@ -1383,6 +1427,81 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
13831427

13841428
</varlistentry>
13851429

1430+
<varlistentry>
1431+
1432+
<term>
1433+
<literal><type>tsquery</> ?? <type>tsquery</></literal>
1434+
</term>
1435+
1436+
<listitem>
1437+
<para>
1438+
Returns the phrase-concatenation of the two given queries.
1439+
1440+
<screen>
1441+
SELECT to_tsquery('fat') ?? to_tsquery('cat | rat');
1442+
?column?
1443+
-------------------------------
1444+
'fat' ? 'cat' | 'fat' ? 'rat'
1445+
</screen>
1446+
</para>
1447+
</listitem>
1448+
1449+
</varlistentry>
1450+
1451+
<varlistentry>
1452+
1453+
<term>
1454+
<indexterm>
1455+
<primary>tsquery_phrase</primary>
1456+
</indexterm>
1457+
1458+
<literal>tsquery_phrase(<replaceable class="PARAMETER">query1</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">query2</replaceable> <type>tsquery</> [, <replaceable class="PARAMETER">distance</replaceable> <type>integer</> ]) returns <type>tsquery</></literal>
1459+
</term>
1460+
1461+
<listitem>
1462+
<para>
1463+
Returns the distanced phrase-concatenation of the two given queries.
1464+
This function underlies the implementation of the <literal>??</> operator.
1465+
1466+
<screen>
1467+
SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10);
1468+
tsquery_phrase
1469+
-------------------
1470+
'fat' ?[10] 'cat'
1471+
</screen>
1472+
</para>
1473+
</listitem>
1474+
1475+
</varlistentry>
1476+
1477+
<varlistentry>
1478+
1479+
<term>
1480+
<indexterm>
1481+
<primary>setweight</primary>
1482+
</indexterm>
1483+
1484+
<literal>setweight(<replaceable class="PARAMETER">query</replaceable> <type>tsquery</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>) returns <type>tsquery</></literal>
1485+
</term>
1486+
1487+
<listitem>
1488+
<para>
1489+
<function>setweight</> returns a copy of the input query in which every
1490+
position has been labeled with the given <replaceable>weight</>, either
1491+
<literal>A</literal>, <literal>B</literal>, <literal>C</literal>, or
1492+
<literal>D</literal>. These labels are retained when queries are
1493+
concatenated, allowing words from different parts of a document
1494+
to be weighted differently by ranking functions.
1495+
</para>
1496+
1497+
<para>
1498+
Note that weight labels apply to <emphasis>positions</>, not
1499+
<emphasis>lexemes</>. If the input query has been stripped of
1500+
positions then <function>setweight</> does nothing.
1501+
</para>
1502+
</listitem>
1503+
</varlistentry>
1504+
13861505
<varlistentry>
13871506

13881507
<term>

0 commit comments

Comments
 (0)