Skip to content

Commit cbe25dc

Browse files
committed
Disallow making an empty lexeme via array_to_tsvector().
The tsvector data type has always forbidden lexemes to be empty. However, array_to_tsvector() didn't get that memo, and would allow an empty-string array element to become an empty lexeme. This could result in dump/restore failures later, not to mention whatever semantic issues might be behind the original prohibition.

However, other functions that take a plain text input directly as a lexeme value do not need a similar restriction, because they only match the string against existing tsvector entries. In particular it'd be a bad idea to make ts_delete() reject empty strings, since that is the most convenient way to clean up any bad data that might have gotten into a tsvector column via this bug.

Reflecting on that, let's also remove the prohibition against NULL array elements in tsvector_delete_arr and tsvector_setweight_by_filter. It seems more consistent to ignore them, as an empty-string element would be ignored.

There's a case for back-patching this, since it's clearly a bug fix. On balance though, it doesn't seem like something to change in a minor release.

Jean-Christophe Arnu

Discussion: https://postgr.es/m/CAHZmTm1YVndPgUVRoag2WL0w900XcoiivDDj-gTTYBsG25c65A@mail.gmail.com
1 parent 1241fcb commit cbe25dc

File tree

4 files changed

+44
-17
lines changed

4 files changed

+44
-17
lines changed

doc/src/sgml/func.sgml

+12-2
Original file line numberDiff line numberDiff line change
@@ -12920,8 +12920,10 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
1292012920
<returnvalue>tsvector</returnvalue>
1292112921
</para>
1292212922
<para>
12923-
Converts an array of lexemes to a <type>tsvector</type>.
12924-
The given strings are used as-is without further processing.
12923+
Converts an array of text strings to a <type>tsvector</type>.
12924+
The given strings are used as lexemes as-is, without further
12925+
processing. Array elements must not be empty strings
12926+
or <literal>NULL</literal>.
1292512927
</para>
1292612928
<para>
1292712929
<literal>array_to_tsvector('{fat,cat,rat}'::text[])</literal>
@@ -13104,6 +13106,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
1310413106
Assigns the specified <parameter>weight</parameter> to elements
1310513107
of the <parameter>vector</parameter> that are listed
1310613108
in <parameter>lexemes</parameter>.
13109+
The strings in <parameter>lexemes</parameter> are taken as lexemes
13110+
as-is, without further processing. Strings that do not match any
13111+
lexeme in <parameter>vector</parameter> are ignored.
1310713112
</para>
1310813113
<para>
1310913114
<literal>setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}')</literal>
@@ -13265,6 +13270,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
1326513270
<para>
1326613271
Removes any occurrence of the given <parameter>lexeme</parameter>
1326713272
from the <parameter>vector</parameter>.
13273+
The <parameter>lexeme</parameter> string is treated as a lexeme as-is,
13274+
without further processing.
1326813275
</para>
1326913276
<para>
1327013277
<literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal>
@@ -13281,6 +13288,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
1328113288
Removes any occurrences of the lexemes
1328213289
in <parameter>lexemes</parameter>
1328313290
from the <parameter>vector</parameter>.
13291+
The strings in <parameter>lexemes</parameter> are taken as lexemes
13292+
as-is, without further processing. Strings that do not match any
13293+
lexeme in <parameter>vector</parameter> are ignored.
1328413294
</para>
1328513295
<para>
1328613296
<literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal>

src/backend/utils/adt/tsvector_op.c

+13-7
Original file line numberDiff line numberDiff line change
@@ -322,10 +322,9 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
322322
int lex_len,
323323
lex_pos;
324324

325+
/* Ignore null array elements, they surely don't match */
325326
if (nulls[i])
326-
ereport(ERROR,
327-
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
328-
errmsg("lexeme array may not contain nulls")));
327+
continue;
329328

330329
lex = VARDATA(dlexemes[i]);
331330
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
@@ -602,10 +601,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
602601
int lex_len,
603602
lex_pos;
604603

604+
/* Ignore null array elements, they surely don't match */
605605
if (nulls[i])
606-
ereport(ERROR,
607-
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
608-
errmsg("lexeme array may not contain nulls")));
606+
continue;
609607

610608
lex = VARDATA(dlexemes[i]);
611609
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
@@ -761,13 +759,21 @@ array_to_tsvector(PG_FUNCTION_ARGS)
761759

762760
deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems);
763761

764-
/* Reject nulls (maybe we should just ignore them, instead?) */
762+
/*
763+
* Reject nulls and zero length strings (maybe we should just ignore them,
764+
* instead?)
765+
*/
765766
for (i = 0; i < nitems; i++)
766767
{
767768
if (nulls[i])
768769
ereport(ERROR,
769770
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
770771
errmsg("lexeme array may not contain nulls")));
772+
773+
if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
774+
ereport(ERROR,
775+
(errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
776+
errmsg("lexeme array may not contain empty strings")));
771777
}
772778

773779
/* Sort and de-dup, because this is required for a valid tsvector. */

src/test/regress/expected/tstypes.out

+14-5
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
8585
'a':3A,4B 'b':2A 'ba':1237
8686
(1 row)
8787

88+
SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed
89+
ERROR: syntax error in tsvector: "'' '1' '2'"
90+
LINE 1: SELECT $$'' '1' '2'$$::tsvector;
91+
^
8892
--Base tsquery test
8993
SELECT '1'::tsquery;
9094
tsquery
@@ -1258,8 +1262,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
12581262
'base' 'hidden' 'strike'
12591263
(1 row)
12601264

1261-
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
1262-
ERROR: lexeme array may not contain nulls
1265+
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);
1266+
ts_delete
1267+
--------------------------
1268+
'base' 'hidden' 'strike'
1269+
(1 row)
1270+
12631271
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
12641272
unnest
12651273
---------------------------------------------
@@ -1328,8 +1336,11 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
13281336
'base' 'hidden' 'rebel' 'spaceship' 'strike'
13291337
(1 row)
13301338

1339+
-- null and empty string are disallowed, since we mustn't make an empty lexeme
13311340
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
13321341
ERROR: lexeme array may not contain nulls
1342+
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
1343+
ERROR: lexeme array may not contain empty strings
13331344
-- array_to_tsvector must sort and de-dup
13341345
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
13351346
array_to_tsvector
@@ -1367,14 +1378,12 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '
13671378
'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
13681379
(1 row)
13691380

1370-
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
1381+
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);
13711382
setweight
13721383
---------------------------------
13731384
'a' 'asd' 'w':5,6,12B,13A 'zxc'
13741385
(1 row)
13751386

1376-
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
1377-
ERROR: lexeme array may not contain nulls
13781387
SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
13791388
ts_filter
13801389
-------------------------------------------------------------

src/test/regress/sql/tstypes.sql

+5-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
1717
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
1818
SELECT '''w'':4A,3B,2C,1D,5 a:8';
1919
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
20+
SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed
2021

2122
--Base tsquery test
2223
SELECT '1'::tsquery;
@@ -239,7 +240,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
239240
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
240241
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
241242
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
242-
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
243+
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);
243244

244245
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
245246
SELECT unnest('base hidden rebel spaceship strike'::tsvector);
@@ -251,7 +252,9 @@ SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D st
251252
SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);
252253

253254
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
255+
-- null and empty string are disallowed, since we mustn't make an empty lexeme
254256
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
257+
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
255258
-- array_to_tsvector must sort and de-dup
256259
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
257260

@@ -260,8 +263,7 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
260263
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
261264
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
262265
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
263-
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
264-
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
266+
SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);
265267

266268
SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
267269
SELECT ts_filter('base hidden rebel spaceship strike'::tsvector, '{a}');

0 commit comments

Comments
 (0)