Skip to content

Commit 27f78d4

Browse files
author
Maksim Milyutin
committed
First incomplete version of lexeme hashing in index
1 parent 2f57c4b commit 27f78d4

File tree

4 files changed

+49
-19
lines changed

4 files changed

+49
-19
lines changed

rum--1.0.sql

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,17 @@ RETURNS bytea
8080
AS 'MODULE_PATHNAME'
8181
LANGUAGE C IMMUTABLE STRICT;
8282

83+
CREATE FUNCTION rum_cmp_tslexeme(bytea, bytea)
84+
RETURNS integer
85+
AS 'MODULE_PATHNAME'
86+
LANGUAGE C IMMUTABLE STRICT;
87+
8388
CREATE OPERATOR CLASS rum_tsvector_ops
8489
FOR TYPE tsvector USING rum
8590
AS
8691
OPERATOR 1 @@ (tsvector, tsquery),
8792
OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops,
88-
FUNCTION 1 gin_cmp_tslexeme(text, text),
93+
FUNCTION 1 rum_cmp_tslexeme(bytea, bytea),
8994
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
9095
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
9196
FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
@@ -94,7 +99,7 @@ AS
9499
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
95100
FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
96101
FUNCTION 10 rum_ts_join_pos(internal, internal),
97-
STORAGE text;
102+
STORAGE bytea;
98103
-- timestamp ops
99104

100105
CREATE FUNCTION timestamp_distance(timestamp, timestamp)

rum.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ typedef signed char RumNullCategory;
243243
*/
244244
#define RumGetDownlink(itup) RumItemPointerGetBlockNumber(&(itup)->t_tid)
245245
#define RumSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber)
246-
246+
CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_ops);
247247

248248
/*
249249
* Data (posting tree) pages

rum_ts_utils.c

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
#include "postgres.h"
1313

14+
#include "access/hash.h"
1415
#include "access/htup_details.h"
16+
#include "catalog/pg_collation.h"
1517
#include "catalog/pg_type.h"
1618
#include "funcapi.h"
1719
#include "miscadmin.h"
@@ -25,6 +27,7 @@
2527

2628
#include <math.h>
2729

30+
PG_FUNCTION_INFO_V1(rum_cmp_tslexeme);
2831
PG_FUNCTION_INFO_V1(rum_extract_tsvector);
2932
PG_FUNCTION_INFO_V1(rum_extract_tsquery);
3033
PG_FUNCTION_INFO_V1(rum_tsvector_config);
@@ -503,11 +506,15 @@ rum_extract_tsvector(PG_FUNCTION_ARGS)
503506
for (i = 0; i < vector->size; i++)
504507
{
505508
text *txt;
509+
bytea *hash_value;
506510
bytea *posData;
507511
int posDataSize;
508512

509513
txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
510-
entries[i] = PointerGetDatum(txt);
514+
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
515+
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
516+
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
517+
entries[i] = PointerGetDatum(hash_value);
511518

512519
if (we->haspos)
513520
{
@@ -586,10 +593,14 @@ rum_extract_tsquery(PG_FUNCTION_ARGS)
586593
for (i = 0; i < (*nentries); i++)
587594
{
588595
text *txt;
596+
bytea *hash_value;
589597

590598
txt = cstring_to_text_with_len(GETOPERAND(query) + operands[i]->distance,
591599
operands[i]->length);
592-
entries[i] = PointerGetDatum(txt);
600+
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
601+
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
602+
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
603+
entries[i] = PointerGetDatum(hash_value);
593604
partialmatch[i] = operands[i]->prefix;
594605
(*extra_data)[i] = (Pointer) map_item_operand;
595606
}
@@ -1389,3 +1400,17 @@ rum_ts_join_pos(PG_FUNCTION_ARGS)
13891400

13901401
PG_RETURN_BYTEA_P(result);
13911402
}
1403+
1404+
Datum
1405+
rum_cmp_tslexeme(PG_FUNCTION_ARGS)
1406+
{
1407+
bytea *arg1 = PG_GETARG_BYTEA_P(0);
1408+
bytea *arg2 = PG_GETARG_BYTEA_P(1);
1409+
int32 a = *VARDATA(arg1);
1410+
int32 b = *VARDATA(arg2);
1411+
int cmp;
1412+
1413+
cmp = (a > b) ? 1 : ((a == b) ? 0 : -1);
1414+
1415+
PG_RETURN_INT32(cmp);
1416+
}

sql/rum.sql

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,17 @@ DELETE FROM tst WHERE i = 5;
100100
VACUUM tst;
101101
INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i;
102102

103-
set enable_bitmapscan=off;
104-
explain (costs off)
105-
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106-
FROM test_rum
107-
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108-
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109-
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110-
FROM test_rum
111-
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112-
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
113-
SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
114-
FROM test_rum
115-
WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
116-
ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');
103+
-- set enable_bitmapscan=off;
104+
-- explain (costs off)
105+
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106+
-- FROM test_rum
107+
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108+
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109+
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110+
-- FROM test_rum
111+
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112+
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
113+
-- SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
114+
-- FROM test_rum
115+
-- WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
116+
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');

0 commit comments

Comments
 (0)