Skip to content

Commit 0c4f355

Browse files
committed
Fix parsing of complex morphs to tsquery
When to_tsquery() or websearch_to_tsquery() meet a complex morph containing multiple words residing in adjacent positions, these words are connected with the OP_AND operator. That leads to surprising results. For instance, both websearch_to_tsquery('"pg_class pg"') and to_tsquery('pg_class <-> pg') produce the '( pg & class ) <-> pg' tsquery. This tsquery requires the 'pg' and 'class' words to reside at the same position and doesn't match to_tsvector('pg_class pg'). It appears to be ridiculous behavior, which needs to be fixed. This commit makes to_tsquery() and websearch_to_tsquery() connect words residing in adjacent positions with OP_PHRASE. Therefore, those words are now normally chained with other OP_PHRASE operators. The examples above now produce the 'pg <-> class <-> pg' tsquery, which matches to_tsvector('pg_class pg'). Another effect of this commit is that complex morph word positions now need to match the tsvector even if there is no surrounding OP_PHRASE. This behavior change generally looks like an improvement, but it makes this commit not backpatchable. Reported-by: Barry Pederson Bug: #16592 Discussion: https://postgr.es/m/16592-70b110ff9731c07d@postgresql.org Discussion: https://postgr.es/m/CAPpHfdv0EzVhf6CWfB1_TTZqXV_2Sn-jSY3zSd7ePH%3D-%2B1V2DQ%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Tom Lane, Neil Chen
1 parent dfb75e4 commit 0c4f355

File tree

3 files changed

+132
-97
lines changed

3 files changed

+132
-97
lines changed

src/backend/tsearch/to_tsany.c

+38-3
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,20 @@
2020
#include "utils/jsonfuncs.h"
2121

2222

23+
/*
24+
* Opaque data structure, which is passed by parse_tsquery() to pushval_morph().
25+
*/
2326
typedef struct MorphOpaque
2427
{
2528
Oid cfg_id;
26-
int qoperator; /* query operator */
29+
30+
/*
31+
* A single tsquery morph could be parsed into multiple words. When these
32+
* words reside in adjacent positions, they are connected using this
33+
* operator. Usually, that is OP_PHRASE, which requires word positions of
34+
* a complex morph to exactly match the tsvector.
35+
*/
36+
int qoperator;
2737
} MorphOpaque;
2838

2939
typedef struct TSVectorBuildState
@@ -573,7 +583,14 @@ to_tsquery_byid(PG_FUNCTION_ARGS)
573583
MorphOpaque data;
574584

575585
data.cfg_id = PG_GETARG_OID(0);
576-
data.qoperator = OP_AND;
586+
587+
/*
588+
* Passing OP_PHRASE as a qoperator makes tsquery require that the word
589+
* positions of a complex morph exactly match the tsvector. Also, when
590+
* the complex morphs are connected with OP_PHRASE operator, we connect
591+
* all their words into the OP_PHRASE sequence.
592+
*/
593+
data.qoperator = OP_PHRASE;
577594

578595
query = parse_tsquery(text_to_cstring(in),
579596
pushval_morph,
@@ -603,6 +620,12 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS)
603620
MorphOpaque data;
604621

605622
data.cfg_id = PG_GETARG_OID(0);
623+
624+
/*
625+
* parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
626+
* single morph. Passing OP_AND as a qoperator makes tsquery require
627+
* matching of all words independently of their positions.
628+
*/
606629
data.qoperator = OP_AND;
607630

608631
query = parse_tsquery(text_to_cstring(in),
@@ -634,6 +657,12 @@ phraseto_tsquery_byid(PG_FUNCTION_ARGS)
634657
MorphOpaque data;
635658

636659
data.cfg_id = PG_GETARG_OID(0);
660+
661+
/*
662+
* parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
663+
* single morph. Passing OP_PHRASE as a qoperator makes tsquery require
664+
* matching of word positions.
665+
*/
637666
data.qoperator = OP_PHRASE;
638667

639668
query = parse_tsquery(text_to_cstring(in),
@@ -665,7 +694,13 @@ websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
665694

666695
data.cfg_id = PG_GETARG_OID(0);
667696

668-
data.qoperator = OP_AND;
697+
/*
698+
* Passing OP_PHRASE as a qoperator makes tsquery require that the word
699+
* positions of a complex morph exactly match the tsvector. Also, when
700+
* the complex morphs are given in quotes, we connect all their words into
701+
* the OP_PHRASE sequence.
702+
*/
703+
data.qoperator = OP_PHRASE;
669704

670705
query = parse_tsquery(text_to_cstring(in),
671706
pushval_morph,

src/test/regress/expected/tsearch.out

+76-76
Original file line numberDiff line numberDiff line change
@@ -1997,63 +1997,63 @@ ALTER TABLE test_tsquery ADD COLUMN keyword tsquery;
19971997
UPDATE test_tsquery SET keyword = to_tsquery('english', txtkeyword);
19981998
ALTER TABLE test_tsquery ADD COLUMN sample tsquery;
19991999
UPDATE test_tsquery SET sample = to_tsquery('english', txtsample::text);
2000-
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york';
2000+
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new <-> york';
20012001
count
20022002
-------
20032003
2
20042004
(1 row)
20052005

2006-
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york';
2006+
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new <-> york';
20072007
count
20082008
-------
20092009
3
20102010
(1 row)
20112011

2012-
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york';
2012+
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new <-> york';
20132013
count
20142014
-------
20152015
1
20162016
(1 row)
20172017

2018-
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york';
2018+
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new <-> york';
20192019
count
20202020
-------
20212021
4
20222022
(1 row)
20232023

2024-
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york';
2024+
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new <-> york';
20252025
count
20262026
-------
20272027
3
20282028
(1 row)
20292029

20302030
CREATE UNIQUE INDEX bt_tsq ON test_tsquery (keyword);
20312031
SET enable_seqscan=OFF;
2032-
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new & york';
2032+
SELECT COUNT(*) FROM test_tsquery WHERE keyword < 'new <-> york';
20332033
count
20342034
-------
20352035
2
20362036
(1 row)
20372037

2038-
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new & york';
2038+
SELECT COUNT(*) FROM test_tsquery WHERE keyword <= 'new <-> york';
20392039
count
20402040
-------
20412041
3
20422042
(1 row)
20432043

2044-
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new & york';
2044+
SELECT COUNT(*) FROM test_tsquery WHERE keyword = 'new <-> york';
20452045
count
20462046
-------
20472047
1
20482048
(1 row)
20492049

2050-
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new & york';
2050+
SELECT COUNT(*) FROM test_tsquery WHERE keyword >= 'new <-> york';
20512051
count
20522052
-------
20532053
4
20542054
(1 row)
20552055

2056-
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new & york';
2056+
SELECT COUNT(*) FROM test_tsquery WHERE keyword > 'new <-> york';
20572057
count
20582058
-------
20592059
3
@@ -2085,10 +2085,10 @@ SELECT ts_rewrite('moscow & hotel', 'SELECT keyword, sample FROM test_tsquery'::
20852085
'hotel' & ( 'moskva' | 'moscow' )
20862086
(1 row)
20872087

2088-
SELECT ts_rewrite('bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery'::text );
2089-
ts_rewrite
2090-
---------------------------------------------------------------------------------
2091-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2088+
SELECT ts_rewrite('bar & qq & foo & (new <-> york)', 'SELECT keyword, sample FROM test_tsquery'::text );
2089+
ts_rewrite
2090+
-------------------------------------------------------------------------------------
2091+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
20922092
(1 row)
20932093

20942094
SELECT ts_rewrite( 'moscow', 'SELECT keyword, sample FROM test_tsquery');
@@ -2103,10 +2103,10 @@ SELECT ts_rewrite( 'moscow & hotel', 'SELECT keyword, sample FROM test_tsquery')
21032103
'hotel' & ( 'moskva' | 'moscow' )
21042104
(1 row)
21052105

2106-
SELECT ts_rewrite( 'bar & new & qq & foo & york', 'SELECT keyword, sample FROM test_tsquery');
2107-
ts_rewrite
2108-
---------------------------------------------------------------------------------
2109-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2106+
SELECT ts_rewrite( 'bar & qq & foo & (new <-> york)', 'SELECT keyword, sample FROM test_tsquery');
2107+
ts_rewrite
2108+
-------------------------------------------------------------------------------------
2109+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
21102110
(1 row)
21112111

21122112
SELECT ts_rewrite('1 & (2 <-> 3)', 'SELECT keyword, sample FROM test_tsquery'::text );
@@ -2149,9 +2149,9 @@ NOTICE: text-search query doesn't contain lexemes: ""
21492149
(1 row)
21502150

21512151
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
2152-
keyword
2153-
----------------
2154-
'new' & 'york'
2152+
keyword
2153+
------------------
2154+
'new' <-> 'york'
21552155
(1 row)
21562156

21572157
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
@@ -2183,10 +2183,10 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
21832183
'hotel' & ( 'moskva' | 'moscow' )
21842184
(1 row)
21852185

2186-
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
2187-
ts_rewrite
2188-
---------------------------------------------------------------------------------
2189-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2186+
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
2187+
ts_rewrite
2188+
-------------------------------------------------------------------------------------
2189+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
21902190
(1 row)
21912191

21922192
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
@@ -2201,18 +2201,18 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
22012201
'hotel' & ( 'moskva' | 'moscow' )
22022202
(1 row)
22032203

2204-
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
2205-
ts_rewrite
2206-
---------------------------------------------------------------------------------
2207-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2204+
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
2205+
ts_rewrite
2206+
-------------------------------------------------------------------------------------
2207+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
22082208
(1 row)
22092209

22102210
CREATE INDEX qq ON test_tsquery USING gist (keyword tsquery_ops);
22112211
SET enable_seqscan=OFF;
22122212
SELECT keyword FROM test_tsquery WHERE keyword @> 'new';
2213-
keyword
2214-
----------------
2215-
'new' & 'york'
2213+
keyword
2214+
------------------
2215+
'new' <-> 'york'
22162216
(1 row)
22172217

22182218
SELECT keyword FROM test_tsquery WHERE keyword @> 'moscow';
@@ -2244,10 +2244,10 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
22442244
'hotel' & ( 'moskva' | 'moscow' )
22452245
(1 row)
22462246

2247-
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
2248-
ts_rewrite
2249-
---------------------------------------------------------------------------------
2250-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2247+
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
2248+
ts_rewrite
2249+
-------------------------------------------------------------------------------------
2250+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
22512251
(1 row)
22522252

22532253
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'moscow') AS query;
@@ -2262,10 +2262,10 @@ SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_t
22622262
'hotel' & ( 'moskva' | 'moscow' )
22632263
(1 row)
22642264

2265-
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & new & qq & foo & york') AS query;
2266-
ts_rewrite
2267-
---------------------------------------------------------------------------------
2268-
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' & 'appl' | 'new' & 'york' )
2265+
SELECT ts_rewrite( query, 'SELECT keyword, sample FROM test_tsquery' ) FROM to_tsquery('english', 'bar & qq & foo & (new <-> york)') AS query;
2266+
ts_rewrite
2267+
-------------------------------------------------------------------------------------
2268+
'citi' & 'foo' & ( 'bar' | 'qq' ) & ( 'nyc' | 'big' <-> 'appl' | 'new' <-> 'york' )
22692269
(1 row)
22702270

22712271
SELECT ts_rewrite(tsquery_phrase('foo', 'foo'), 'foo', 'bar | baz');
@@ -2456,19 +2456,19 @@ select websearch_to_tsquery('simple', 'fat:A : cat:B');
24562456
select websearch_to_tsquery('simple', 'fat*rat');
24572457
websearch_to_tsquery
24582458
----------------------
2459-
'fat' & 'rat'
2459+
'fat' <-> 'rat'
24602460
(1 row)
24612461

24622462
select websearch_to_tsquery('simple', 'fat-rat');
2463-
websearch_to_tsquery
2464-
---------------------------
2465-
'fat-rat' & 'fat' & 'rat'
2463+
websearch_to_tsquery
2464+
-------------------------------
2465+
'fat-rat' <-> 'fat' <-> 'rat'
24662466
(1 row)
24672467

24682468
select websearch_to_tsquery('simple', 'fat_rat');
24692469
websearch_to_tsquery
24702470
----------------------
2471-
'fat' & 'rat'
2471+
'fat' <-> 'rat'
24722472
(1 row)
24732473

24742474
-- weights are completely ignored
@@ -2665,64 +2665,64 @@ select websearch_to_tsquery('simple', 'abc OR1234');
26652665
(1 row)
26662666

26672667
select websearch_to_tsquery('simple', 'abc or-abc');
2668-
websearch_to_tsquery
2669-
---------------------------------
2670-
'abc' & 'or-abc' & 'or' & 'abc'
2668+
websearch_to_tsquery
2669+
-------------------------------------
2670+
'abc' & 'or-abc' <-> 'or' <-> 'abc'
26712671
(1 row)
26722672

26732673
select websearch_to_tsquery('simple', 'abc OR_abc');
2674-
websearch_to_tsquery
2675-
----------------------
2676-
'abc' & 'or' & 'abc'
2674+
websearch_to_tsquery
2675+
------------------------
2676+
'abc' & 'or' <-> 'abc'
26772677
(1 row)
26782678

26792679
-- test quotes
26802680
select websearch_to_tsquery('english', '"pg_class pg');
2681-
websearch_to_tsquery
2682-
-----------------------
2683-
'pg' & 'class' & 'pg'
2681+
websearch_to_tsquery
2682+
-------------------------
2683+
'pg' <-> 'class' & 'pg'
26842684
(1 row)
26852685

26862686
select websearch_to_tsquery('english', 'pg_class pg"');
2687-
websearch_to_tsquery
2688-
-----------------------
2689-
'pg' & 'class' & 'pg'
2687+
websearch_to_tsquery
2688+
-------------------------
2689+
'pg' <-> 'class' & 'pg'
26902690
(1 row)
26912691

26922692
select websearch_to_tsquery('english', '"pg_class pg"');
2693-
websearch_to_tsquery
2694-
-----------------------------
2695-
( 'pg' & 'class' ) <-> 'pg'
2693+
websearch_to_tsquery
2694+
---------------------------
2695+
'pg' <-> 'class' <-> 'pg'
26962696
(1 row)
26972697

26982698
select websearch_to_tsquery('english', 'abc "pg_class pg"');
2699-
websearch_to_tsquery
2700-
-------------------------------------
2701-
'abc' & ( 'pg' & 'class' ) <-> 'pg'
2699+
websearch_to_tsquery
2700+
-----------------------------------
2701+
'abc' & 'pg' <-> 'class' <-> 'pg'
27022702
(1 row)
27032703

27042704
select websearch_to_tsquery('english', '"pg_class pg" def');
2705-
websearch_to_tsquery
2706-
-------------------------------------
2707-
( 'pg' & 'class' ) <-> 'pg' & 'def'
2705+
websearch_to_tsquery
2706+
-----------------------------------
2707+
'pg' <-> 'class' <-> 'pg' & 'def'
27082708
(1 row)
27092709

27102710
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
2711-
websearch_to_tsquery
2712-
------------------------------------------------------
2713-
'abc' & 'pg' <-> ( 'pg' & 'class' ) <-> 'pg' & 'def'
2711+
websearch_to_tsquery
2712+
--------------------------------------------------------
2713+
'abc' & 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' & 'def'
27142714
(1 row)
27152715

27162716
select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
2717-
websearch_to_tsquery
2718-
--------------------------------------
2719-
'pg' <-> ( 'pg' & 'class' ) <-> 'pg'
2717+
websearch_to_tsquery
2718+
----------------------------------------
2719+
'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg'
27202720
(1 row)
27212721

27222722
select websearch_to_tsquery('english', '""pg pg_class pg""');
2723-
websearch_to_tsquery
2724-
------------------------------
2725-
'pg' & 'pg' & 'class' & 'pg'
2723+
websearch_to_tsquery
2724+
--------------------------------
2725+
'pg' & 'pg' <-> 'class' & 'pg'
27262726
(1 row)
27272727

27282728
select websearch_to_tsquery('english', 'abc """"" def');
@@ -2829,7 +2829,7 @@ NOTICE: text-search query contains only stop words or doesn't contain lexemes,
28292829
select websearch_to_tsquery('''abc''''def''');
28302830
websearch_to_tsquery
28312831
----------------------
2832-
'abc' & 'def'
2832+
'abc' <-> 'def'
28332833
(1 row)
28342834

28352835
select websearch_to_tsquery('\abc');

0 commit comments

Comments
 (0)