Skip to content

Commit c0e1500

Browse files
author
Maksim Milyutin
committed
Make a stable version of index search over hash values of lexemes in
tsvector and tsquery
1 parent 27f78d4 commit c0e1500

File tree

6 files changed

+38
-121
lines changed

6 files changed

+38
-121
lines changed

expected/rum.out

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -304,49 +304,3 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
304304
FROM test_rum
305305
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
306306
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
307-
?column? | t | a
308-
----------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------
309-
8.22467 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2
310-
8.22467 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3
311-
8.22467 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5
312-
16.4493 | little series of pictures. Have you ever been here, I wonder? You did | 'ever':7 'littl':1 'pictur':4 'seri':2 'wonder':11
313-
16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12
314-
16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5
315-
16.4493 | thickness of the walls, twenty-one feet, and the solid masonry, held it | 'feet':8 'held':13 'masonri':12 'one':7 'solid':11 'thick':1 'twenti':6 'twenty-on':5 'wall':4
316-
16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5
317-
16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14
318-
16.4493 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6
319-
16.4493 | As a reward for your reformation I write to you on this precious sheet. | 'precious':13 'reform':6 'reward':3 'sheet':14 'write':8
320-
16.4493 | entrance of the Black Forest, among picturesque, thickly-wooded hills, | 'among':6 'black':4 'entranc':1 'forest':5 'hill':11 'picturesqu':7 'thick':9 'thickly-wood':8 'wood':10
321-
16.4493 | You see I have come to be wonderfully attached to Heidelberg, the | 'attach':9 'come':5 'heidelberg':11 'see':2 'wonder':8
322-
16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9
323-
(14 rows)
324-
325-
SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
326-
FROM test_rum
327-
WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
328-
ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');
329-
?column? | t | a
330-
----------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------
331-
8.22467 | been trying my best to get all those "passes" into my brain. Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2
332-
8.22467 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13
333-
8.22467 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6
334-
8.22467 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11
335-
13.1595 | foo bar foo the over foo qq bar | 'bar':2,8 'foo':1,3,6 'qq':7
336-
16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5
337-
16.4493 | the--nearest guide-book! | 'book':5 'guid':4 'guide-book':3 'nearest':2
338-
16.4493 | to your letter, I have them all in the handiest kind of a bunch. Ariel | 'ariel':15 'bunch':14 'handiest':10 'kind':11 'letter':3
339-
16.4493 | beautiful, the quaint, the historically poetic, learned and picturesque | 'beauti':1 'histor':5 'learn':7 'picturesqu':9 'poetic':6 'quaint':3
340-
16.4493 | there are dreadful reports of floods and roads caved in and bridges | 'bridg':12 'cave':9 'dread':3 'flood':6 'report':4 'road':8
341-
16.4493 | the Conversationhaus, the bazaar, mingling with the throng, listening to | 'bazaar':4 'conversationhaus':2 'listen':9 'mingl':5 'throng':8
342-
16.4493 | the band, and comparing what it is with what it was. It was a gay and | 'band':2 'compar':4 'gay':15
343-
16.4493 | look. The situation is most beautiful. It lies, you know, at the | 'beauti':6 'know':10 'lie':8 'look':1 'situat':3
344-
16.4493 | entrance of the Black Forest, among picturesque, thickly-wooded hills, | 'among':6 'black':4 'entranc':1 'forest':5 'hill':11 'picturesqu':7 'thick':9 'thickly-wood':8 'wood':10
345-
16.4493 | town with angry, headlong speed. There is an avenue along its bank of | 'along':10 'angri':3 'avenu':9 'bank':12 'headlong':4 'speed':5 'town':1
346-
16.4493 | like, "I'll do my bidding gently," and as surely, if I get there. But | 'bid':6 'gentl':7 'get':13 'like':1 'll':3 'sure':10
347-
16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5
348-
16.4493 | Gesprente Thurm is the one that was blown up by the French. The | 'blown':8 'french':12 'gesprent':1 'one':5 'thurm':2
349-
16.4493 | portico that shows in the Schlosshof are the four brought from | 'brought':10 'four':9 'portico':1 'schlosshof':6 'show':3
350-
16.4493 | the few that escaped destruction in 1693. It is a beautiful, highly | '1693':7 'beauti':11 'destruct':5 'escap':4 'high':12
351-
(20 rows)
352-

rum--1.0.sql

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,12 @@ RETURNS bytea
8080
AS 'MODULE_PATHNAME'
8181
LANGUAGE C IMMUTABLE STRICT;
8282

83-
CREATE FUNCTION rum_cmp_tslexeme(bytea, bytea)
84-
RETURNS integer
85-
AS 'MODULE_PATHNAME'
86-
LANGUAGE C IMMUTABLE STRICT;
87-
8883
CREATE OPERATOR CLASS rum_tsvector_ops
8984
FOR TYPE tsvector USING rum
9085
AS
9186
OPERATOR 1 @@ (tsvector, tsquery),
9287
OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops,
93-
FUNCTION 1 rum_cmp_tslexeme(bytea, bytea),
88+
FUNCTION 1 btint4cmp(integer, integer),
9489
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
9590
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
9691
FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
@@ -99,7 +94,7 @@ AS
9994
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
10095
FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
10196
FUNCTION 10 rum_ts_join_pos(internal, internal),
102-
STORAGE bytea;
97+
STORAGE integer;
10398
-- timestamp ops
10499

105100
CREATE FUNCTION timestamp_distance(timestamp, timestamp)
@@ -205,13 +200,13 @@ FOR TYPE tsvector USING rum
205200
AS
206201
OPERATOR 1 @@ (tsvector, tsquery),
207202
--support function
208-
FUNCTION 1 gin_cmp_tslexeme(text, text),
203+
FUNCTION 1 btint4cmp(integer, integer),
209204
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
210205
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
211206
FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
212207
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
213208
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
214-
STORAGE text;
209+
STORAGE integer;
215210

216211
-- timestamptz ops
217212

@@ -281,13 +276,13 @@ FOR TYPE tsvector USING rum
281276
AS
282277
OPERATOR 1 @@ (tsvector, tsquery),
283278
--support function
284-
FUNCTION 1 gin_cmp_tslexeme(text, text),
279+
FUNCTION 1 btint4cmp(integer, integer),
285280
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
286281
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
287282
FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
288283
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
289284
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
290-
STORAGE text;
285+
STORAGE integer;
291286

292287
-- inversed
293288

@@ -315,10 +310,10 @@ CREATE OPERATOR CLASS rum_tsquery_ops
315310
DEFAULT FOR TYPE tsquery USING rum
316311
AS
317312
OPERATOR 1 @@ (tsquery, tsvector),
318-
FUNCTION 1 gin_cmp_tslexeme(text, text),
313+
FUNCTION 1 btint4cmp(integer, integer),
319314
FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal),
320315
FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal),
321316
FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
322317
FUNCTION 6 ruminv_tsquery_config(internal),
323-
STORAGE text;
318+
STORAGE integer;
324319

rum.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ typedef signed char RumNullCategory;
243243
*/
244244
#define RumGetDownlink(itup) RumItemPointerGetBlockNumber(&(itup)->t_tid)
245245
#define RumSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber)
246-
CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_ops);
246+
247247

248248
/*
249249
* Data (posting tree) pages

rum_ts_utils.c

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727

2828
#include <math.h>
2929

30-
PG_FUNCTION_INFO_V1(rum_cmp_tslexeme);
3130
PG_FUNCTION_INFO_V1(rum_extract_tsvector);
3231
PG_FUNCTION_INFO_V1(rum_extract_tsquery);
3332
PG_FUNCTION_INFO_V1(rum_tsvector_config);
@@ -505,16 +504,10 @@ rum_extract_tsvector(PG_FUNCTION_ARGS)
505504

506505
for (i = 0; i < vector->size; i++)
507506
{
508-
text *txt;
509-
bytea *hash_value;
510507
bytea *posData;
511508
int posDataSize;
512509

513-
txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
514-
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
515-
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
516-
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
517-
entries[i] = PointerGetDatum(hash_value);
510+
entries[i] = hash_any((const unsigned char *) (STRPTR(vector) + we->pos), we->len);
518511

519512
if (we->haspos)
520513
{
@@ -592,15 +585,9 @@ rum_extract_tsquery(PG_FUNCTION_ARGS)
592585

593586
for (i = 0; i < (*nentries); i++)
594587
{
595-
text *txt;
596-
bytea *hash_value;
597-
598-
txt = cstring_to_text_with_len(GETOPERAND(query) + operands[i]->distance,
599-
operands[i]->length);
600-
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
601-
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
602-
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
603-
entries[i] = PointerGetDatum(hash_value);
588+
entries[i] = hash_any(
589+
(const unsigned char *) (GETOPERAND(query) + operands[i]->distance),
590+
operands[i]->length);
604591
partialmatch[i] = operands[i]->prefix;
605592
(*extra_data)[i] = (Pointer) map_item_operand;
606593
}
@@ -1400,17 +1387,3 @@ rum_ts_join_pos(PG_FUNCTION_ARGS)
14001387

14011388
PG_RETURN_BYTEA_P(result);
14021389
}
1403-
1404-
Datum
1405-
rum_cmp_tslexeme(PG_FUNCTION_ARGS)
1406-
{
1407-
bytea *arg1 = PG_GETARG_BYTEA_P(0);
1408-
bytea *arg2 = PG_GETARG_BYTEA_P(1);
1409-
int32 a = *VARDATA(arg1);
1410-
int32 b = *VARDATA(arg2);
1411-
int cmp;
1412-
1413-
cmp = (a > b) ? 1 : ((a == b) ? 0 : -1);
1414-
1415-
PG_RETURN_INT32(cmp);
1416-
}

rumtsquery.c

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "postgres.h"
1313

14+
#include "access/hash.h"
1415
#include "catalog/pg_type.h"
1516
#include "tsearch/ts_type.h"
1617
#include "tsearch/ts_utils.h"
@@ -274,11 +275,14 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level)
274275

275276
for (index = 0; index < context->index; index++)
276277
{
277-
text *entry;
278-
279-
entry = DatumGetByteaP(context->entries[index]);
280-
if (VARSIZE_ANY_EXHDR(entry) == wrap->length &&
281-
!memcmp(context->operand + wrap->distance, VARDATA_ANY(entry), wrap->length))
278+
int32 entry;
279+
int32 operand_hash;
280+
281+
entry = DatumGetInt32(context->entries[index]);
282+
operand_hash = hash_any(
283+
(const unsigned char *) (context->operand + wrap->distance),
284+
wrap->length);
285+
if (entry == operand_hash)
282286
break;
283287
}
284288

@@ -287,7 +291,9 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level)
287291
index = context->index;
288292
addinfo = (bytea *) palloc(VARHDRSZ + 2 * Max(level, 1) * MAX_ENCODED_LEN);
289293
ptr = (unsigned char *) VARDATA(addinfo);
290-
context->entries[index] = PointerGetDatum(cstring_to_text_with_len(context->operand + wrap->distance, wrap->length));
294+
context->entries[index] = hash_any(
295+
(const unsigned char *) (context->operand + wrap->distance),
296+
wrap->length);
291297
context->addInfo[index] = PointerGetDatum(addinfo);
292298
context->addInfoIsNull[index] = false;
293299
context->index++;
@@ -419,12 +425,6 @@ ruminv_extract_tsquery(PG_FUNCTION_ARGS)
419425
}
420426
*nentries = count;
421427

422-
/* elog(NOTICE, "%d", *nentries);
423-
for (i = 0; i < *nentries; i++)
424-
{
425-
elog(NOTICE, "%s", text_to_cstring(DatumGetPointer((entries)[i])));
426-
}*/
427-
428428
PG_FREE_IF_COPY(query, 0);
429429
PG_RETURN_POINTER(entries);
430430
}
@@ -460,10 +460,9 @@ ruminv_extract_tsvector(PG_FUNCTION_ARGS)
460460

461461
for (i = 0; i < vector->size; i++)
462462
{
463-
text *txt;
464-
465-
txt = cstring_to_text_with_len(STRPTR(vector) + we[i].pos, we[i].len);
466-
entries[i] = PointerGetDatum(txt);
463+
entries[i] = hash_any(
464+
(const unsigned char *) (STRPTR(vector) + we[i].pos),
465+
we[i].len);
467466
(*nullFlags)[i] = false;
468467
}
469468
(*nullFlags)[*nentries - 1] = true;

sql/rum.sql

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,13 @@ DELETE FROM tst WHERE i = 5;
100100
VACUUM tst;
101101
INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i;
102102

103-
-- set enable_bitmapscan=off;
104-
-- explain (costs off)
105-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106-
-- FROM test_rum
107-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110-
-- FROM test_rum
111-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
113-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
114-
-- FROM test_rum
115-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
116-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');
103+
set enable_bitmapscan=off;
104+
explain (costs off)
105+
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106+
FROM test_rum
107+
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108+
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109+
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110+
FROM test_rum
111+
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112+
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');

0 commit comments

Comments
 (0)