Skip to content

Commit a6b6010

Browse files
author
Artur Zakirov
committed
Added opclass functions rum_extract_tsvector, rum_extract_tsquery
1 parent 38f455b commit a6b6010

File tree

3 files changed

+278
-17
lines changed

3 files changed

+278
-17
lines changed

rum--1.0.sql

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,27 @@ LANGUAGE C;
77
CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler;
88

99
-- Opclasses
10-
CREATE FUNCTION gin_tsvector_config(internal)
10+
-- Support function that extracts index entries from a tsvector value.
-- It is registered as FUNCTION 2 of the operator class defined below and is
-- implemented in C by the extension's shared library (MODULE_PATHNAME).
CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal)
11+
RETURNS internal
12+
AS 'MODULE_PATHNAME'
13+
LANGUAGE C IMMUTABLE STRICT;
14+
15+
-- Support function that extracts search entries from a query.
-- It is registered as FUNCTION 3 of the operator class defined below.
-- NOTE(review): the first argument is declared as tsvector here, while the C
-- implementation reads argument 0 with PG_GETARG_TSQUERY -- confirm that the
-- declared type is intentional.
CREATE FUNCTION rum_extract_tsquery(tsvector,internal,smallint,internal,internal,internal,internal)
16+
RETURNS internal
17+
AS 'MODULE_PATHNAME'
18+
LANGUAGE C IMMUTABLE STRICT;
19+
20+
-- Opclass configuration function, registered as FUNCTION 7 of the operator
-- class defined below.  The C implementation receives a GinConfig pointer as
-- its single argument and sets its addInfoTypeOid field to BYTEAOID.
CREATE FUNCTION rum_tsvector_config(internal)
1121
RETURNS void
1222
AS 'MODULE_PATHNAME'
1323
LANGUAGE C IMMUTABLE STRICT;
1424

15-
CREATE FUNCTION gin_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal)
25+
-- Pre-consistent check for tsquery searches, registered as FUNCTION 8 of the
-- operator class defined below (slot 8 is GIN_PRE_CONSISTENT_PROC in rum.h).
CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal)
1626
RETURNS bool
1727
AS 'MODULE_PATHNAME'
1828
LANGUAGE C IMMUTABLE STRICT;
1929

20-
CREATE FUNCTION gin_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal)
30+
-- Distance function for tsquery searches, returning float8.  Registered as
-- FUNCTION 9 of the operator class defined below (slot 9 is GIN_ORDERING_PROC
-- in rum.h).
CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal)
2131
RETURNS float8
2232
AS 'MODULE_PATHNAME'
2333
LANGUAGE C IMMUTABLE STRICT;
@@ -28,12 +38,12 @@ AS
2838
OPERATOR 1 @@ (tsvector, tsquery),
2939
OPERATOR 2 @@@ (tsvector, tsquery),
3040
FUNCTION 1 bttextcmp(text, text),
31-
FUNCTION 2 gin_extract_tsvector(tsvector,internal,internal),
32-
FUNCTION 3 gin_extract_tsquery(tsvector,internal,smallint,internal,internal,internal,internal),
41+
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
42+
FUNCTION 3 rum_extract_tsquery(tsvector,internal,smallint,internal,internal,internal,internal),
3343
FUNCTION 4 gin_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
3444
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
3545
FUNCTION 6 gin_tsquery_triconsistent(internal,smallint,tsvector,int,internal,internal,internal),
36-
FUNCTION 7 gin_tsvector_config(internal),
37-
FUNCTION 8 gin_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
38-
FUNCTION 9 gin_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
46+
FUNCTION 7 rum_tsvector_config(internal),
47+
FUNCTION 8 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
48+
FUNCTION 9 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
3949
STORAGE text;

rum.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -862,9 +862,11 @@ extern void ginInsertCleanup(GinState *ginstate,
862862
#define GIN_PRE_CONSISTENT_PROC 8
863863
#define GIN_ORDERING_PROC 9
864864

865-
extern Datum gin_tsvector_config(PG_FUNCTION_ARGS);
866-
extern Datum gin_tsquery_pre_consistent(PG_FUNCTION_ARGS);
867-
extern Datum gin_tsquery_distance(PG_FUNCTION_ARGS);
865+
/* SQL-callable tsvector opclass support functions, defined in rum_ts_utils.c */
extern Datum rum_extract_tsvector(PG_FUNCTION_ARGS);
866+
extern Datum rum_extract_tsquery(PG_FUNCTION_ARGS);
867+
extern Datum rum_tsvector_config(PG_FUNCTION_ARGS);
868+
extern Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS);
869+
extern Datum rum_tsquery_distance(PG_FUNCTION_ARGS);
868870

869871
/*
870872
* Functions for reading ItemPointers with additional information. Used in

rum_ts_utils.c

Lines changed: 255 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,17 @@
1313
#include "catalog/pg_type.h"
1414
#include "tsearch/ts_type.h"
1515
#include "tsearch/ts_utils.h"
16+
#include "utils/builtins.h"
1617

1718
#include "rum.h"
1819

1920
#include <math.h>
2021

21-
PG_FUNCTION_INFO_V1(gin_tsvector_config);
22-
PG_FUNCTION_INFO_V1(gin_tsquery_pre_consistent);
23-
PG_FUNCTION_INFO_V1(gin_tsquery_distance);
22+
/* Function-manager info records for the SQL-callable functions in this file */
PG_FUNCTION_INFO_V1(rum_extract_tsvector);
23+
PG_FUNCTION_INFO_V1(rum_extract_tsquery);
24+
PG_FUNCTION_INFO_V1(rum_tsvector_config);
25+
PG_FUNCTION_INFO_V1(rum_tsquery_pre_consistent);
26+
PG_FUNCTION_INFO_V1(rum_tsquery_distance);
2427

2528
static float calc_rank_and(float *w, Datum *addInfo, bool *addInfoIsNull,
2629
int size);
@@ -53,7 +56,7 @@ checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
5356
}
5457

5558
Datum
56-
gin_tsquery_pre_consistent(PG_FUNCTION_ARGS)
59+
rum_tsquery_pre_consistent(PG_FUNCTION_ARGS)
5760
{
5861
bool *check = (bool *) PG_GETARG_POINTER(0);
5962

@@ -95,6 +98,7 @@ static WordEntryPosVector POSNULL = {
9598
{0}
9699
};
97100

101+
/*
 * compress_pos() emits a continuation byte for a position delta whenever the
 * remaining delta is SIXTHBIT or larger; smaller values go into the final
 * byte of the entry together with the weight.
 */
#define SIXTHBIT 0x20
98102
#define LOWERMASK 0x1F
99103

100104
/*
@@ -109,6 +113,38 @@ word_distance(int32 w)
109113
return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
110114
}
111115

116+
static int
117+
compress_pos(char *target, uint16 *pos, int npos)
118+
{
119+
int i;
120+
uint16 prev = 0, delta;
121+
char *ptr;
122+
123+
ptr = target;
124+
for (i = 0; i < npos; i++)
125+
{
126+
delta = WEP_GETPOS(pos[i]) - WEP_GETPOS(prev);
127+
128+
while (true)
129+
{
130+
if (delta >= SIXTHBIT)
131+
{
132+
*ptr = (delta & (~HIGHBIT)) | HIGHBIT;
133+
ptr++;
134+
delta >>= 7;
135+
}
136+
else
137+
{
138+
*ptr = delta | (WEP_GETWEIGHT(pos[i]) << 5);
139+
ptr++;
140+
break;
141+
}
142+
}
143+
prev = pos[i];
144+
}
145+
return ptr - target;
146+
}
147+
112148
static char *
113149
decompress_pos(char *ptr, uint16 *pos)
114150
{
@@ -293,7 +329,220 @@ calc_rank(float *w, TSQuery q, Datum *addInfo, bool *addInfoIsNull, int size)
293329
}
294330

295331
Datum
296-
gin_tsquery_distance(PG_FUNCTION_ARGS)
332+
rum_extract_tsvector(PG_FUNCTION_ARGS)
333+
{
334+
TSVector vector = PG_GETARG_TSVECTOR(0);
335+
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
336+
Datum **addInfo = (Datum **) PG_GETARG_POINTER(3);
337+
bool **addInfoIsNull = (bool **) PG_GETARG_POINTER(4);
338+
Datum *entries = NULL;
339+
340+
*nentries = vector->size;
341+
if (vector->size > 0)
342+
{
343+
int i;
344+
WordEntry *we = ARRPTR(vector);
345+
WordEntryPosVector *posVec;
346+
347+
entries = (Datum *) palloc(sizeof(Datum) * vector->size);
348+
*addInfo = (Datum *) palloc(sizeof(Datum) * vector->size);
349+
*addInfoIsNull = (bool *) palloc(sizeof(bool) * vector->size);
350+
351+
for (i = 0; i < vector->size; i++)
352+
{
353+
text *txt;
354+
bytea *posData;
355+
int posDataSize;
356+
357+
txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
358+
entries[i] = PointerGetDatum(txt);
359+
360+
if (we->haspos)
361+
{
362+
posVec = _POSVECPTR(vector, we);
363+
posDataSize = VARHDRSZ + 2 * posVec->npos * sizeof(WordEntryPos);
364+
posData = (bytea *)palloc(posDataSize);
365+
posDataSize = compress_pos(posData->vl_dat, posVec->pos, posVec->npos) + VARHDRSZ;
366+
SET_VARSIZE(posData, posDataSize);
367+
368+
(*addInfo)[i] = PointerGetDatum(posData);
369+
(*addInfoIsNull)[i] = false;
370+
}
371+
else
372+
{
373+
(*addInfo)[i] = (Datum)0;
374+
(*addInfoIsNull)[i] = true;
375+
}
376+
we++;
377+
}
378+
}
379+
380+
PG_FREE_IF_COPY(vector, 0);
381+
PG_RETURN_POINTER(entries);
382+
}
383+
384+
/*
385+
* sort QueryOperands by (length, word)
386+
*/
387+
static int
388+
compareQueryOperand(const void *a, const void *b, void *arg)
389+
{
390+
char *operand = (char *) arg;
391+
QueryOperand *qa = (*(QueryOperand *const *) a);
392+
QueryOperand *qb = (*(QueryOperand *const *) b);
393+
394+
return tsCompareString(operand + qa->distance, qa->length,
395+
operand + qb->distance, qb->length,
396+
false);
397+
}
398+
399+
/*
400+
* Returns a sorted, de-duplicated array of QueryOperands in a query.
401+
* The returned QueryOperands are pointers to the original QueryOperands
402+
* in the query.
403+
*
404+
* Length of the returned array is stored in *size
405+
*/
406+
static QueryOperand **
407+
SortAndUniqItems(TSQuery q, int *size)
408+
{
409+
char *operand = GETOPERAND(q);
410+
QueryItem *item = GETQUERY(q);
411+
QueryOperand **res,
412+
**ptr,
413+
**prevptr;
414+
415+
ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
416+
417+
/* Collect all operands from the tree to res */
418+
while ((*size)--)
419+
{
420+
if (item->type == QI_VAL)
421+
{
422+
*ptr = (QueryOperand *) item;
423+
ptr++;
424+
}
425+
item++;
426+
}
427+
428+
*size = ptr - res;
429+
if (*size < 2)
430+
return res;
431+
432+
qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
433+
434+
ptr = res + 1;
435+
prevptr = res;
436+
437+
/* remove duplicates */
438+
while (ptr - res < *size)
439+
{
440+
if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
441+
{
442+
prevptr++;
443+
*prevptr = *ptr;
444+
}
445+
ptr++;
446+
}
447+
448+
*size = prevptr + 1 - res;
449+
return res;
450+
}
451+
452+
Datum
453+
rum_extract_tsquery(PG_FUNCTION_ARGS)
454+
{
455+
TSQuery query = PG_GETARG_TSQUERY(0);
456+
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
457+
458+
/* StrategyNumber strategy = PG_GETARG_UINT16(2); */
459+
bool **ptr_partialmatch = (bool **) PG_GETARG_POINTER(3);
460+
Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
461+
462+
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
463+
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
464+
Datum *entries = NULL;
465+
466+
*nentries = 0;
467+
468+
if (query->size > 0)
469+
{
470+
QueryItem *item = GETQUERY(query);
471+
int32 i,
472+
j;
473+
bool *partialmatch;
474+
int *map_item_operand;
475+
char *operand = GETOPERAND(query);
476+
QueryOperand **operands;
477+
478+
/*
479+
* If the query doesn't have any required positive matches (for
480+
* instance, it's something like '! foo'), we have to do a full index
481+
* scan.
482+
*/
483+
if (tsquery_requires_match(item))
484+
*searchMode = GIN_SEARCH_MODE_DEFAULT;
485+
else
486+
*searchMode = GIN_SEARCH_MODE_ALL;
487+
488+
*nentries = query->size;
489+
operands = SortAndUniqItems(query, nentries);
490+
491+
entries = (Datum *) palloc(sizeof(Datum) * (*nentries));
492+
partialmatch = *ptr_partialmatch = (bool *) palloc(sizeof(bool) * (*nentries));
493+
494+
/*
495+
* Make map to convert item's number to corresponding operand's (the
496+
* same, entry's) number. Entry's number is used in check array in
497+
* consistent method. We use the same map for each entry.
498+
*/
499+
*extra_data = (Pointer *) palloc(sizeof(Pointer) * (*nentries));
500+
map_item_operand = (int *) palloc0(sizeof(int) * query->size);
501+
502+
for (i = 0; i < (*nentries); i++)
503+
{
504+
text *txt;
505+
506+
txt = cstring_to_text_with_len(GETOPERAND(query) + operands[i]->distance,
507+
operands[i]->length);
508+
entries[i] = PointerGetDatum(txt);
509+
partialmatch[i] = operands[i]->prefix;
510+
(*extra_data)[i] = (Pointer) map_item_operand;
511+
}
512+
513+
/* Now rescan the VAL items and fill in the arrays */
514+
for (j = 0; j < query->size; j++)
515+
{
516+
if (item[j].type == QI_VAL)
517+
{
518+
QueryOperand *val = &item[j].qoperand;
519+
bool found = false;
520+
521+
for (i = 0; i < (*nentries); i++)
522+
{
523+
if (!tsCompareString(operand + operands[i]->distance, operands[i]->length,
524+
operand + val->distance, val->length,
525+
false))
526+
{
527+
map_item_operand[j] = i;
528+
found = true;
529+
break;
530+
}
531+
}
532+
533+
if (!found)
534+
elog(ERROR, "Operand not found!");
535+
}
536+
}
537+
}
538+
539+
PG_FREE_IF_COPY(query, 0);
540+
541+
PG_RETURN_POINTER(entries);
542+
}
543+
544+
Datum
545+
rum_tsquery_distance(PG_FUNCTION_ARGS)
297546
{
298547
/* bool *check = (bool *) PG_GETARG_POINTER(0); */
299548

@@ -312,7 +561,7 @@ gin_tsquery_distance(PG_FUNCTION_ARGS)
312561
}
313562

314563
Datum
315-
gin_tsvector_config(PG_FUNCTION_ARGS)
564+
rum_tsvector_config(PG_FUNCTION_ARGS)
316565
{
317566
GinConfig *config = (GinConfig *)PG_GETARG_POINTER(0);
318567
config->addInfoTypeOid = BYTEAOID;

0 commit comments

Comments
 (0)