Skip to content

Commit 38f455b

Browse files
author
Artur Zakirov
committed
Added opclass functions
1 parent 4df8c7b commit 38f455b

File tree

5 files changed

+350
-6
lines changed

5 files changed

+350
-6
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# contrib/rum/Makefile
22

33
MODULE_big = rum
4-
OBJS = ginsort.o \
4+
OBJS = ginsort.o rum_ts_utils.o \
55
ginarrayproc.o ginbtree.o ginbulk.o gindatapage.o \
66
ginentrypage.o ginfast.o ginget.o gininsert.o \
77
ginscan.o ginutil.o ginvacuum.o $(WIN32RES)

ginutil.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ rumhandler(PG_FUNCTION_ARGS)
3838
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
3939

4040
amroutine->amstrategies = 0;
41-
amroutine->amsupport = 6;
41+
amroutine->amsupport = 9;
4242
amroutine->amcanorder = false;
4343
amroutine->amcanorderbyop = false;
4444
amroutine->amcanbackward = false;

rum--1.0.sql

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,21 @@ LANGUAGE C;
77
CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler;
88

99
-- Opclasses
10+
CREATE FUNCTION gin_tsvector_config(internal)
11+
RETURNS void
12+
AS 'MODULE_PATHNAME'
13+
LANGUAGE C IMMUTABLE STRICT;
14+
15+
CREATE FUNCTION gin_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal)
16+
RETURNS bool
17+
AS 'MODULE_PATHNAME'
18+
LANGUAGE C IMMUTABLE STRICT;
19+
20+
CREATE FUNCTION gin_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal)
21+
RETURNS float8
22+
AS 'MODULE_PATHNAME'
23+
LANGUAGE C IMMUTABLE STRICT;
24+
1025
CREATE OPERATOR CLASS rum_tsvector_ops
1126
FOR TYPE tsvector USING rum
1227
AS
@@ -17,4 +32,8 @@ AS
1732
FUNCTION 3 gin_extract_tsquery(tsvector,internal,smallint,internal,internal,internal,internal),
1833
FUNCTION 4 gin_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
1934
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
35+
FUNCTION 6 gin_tsquery_triconsistent(internal,smallint,tsvector,int,internal,internal,internal),
36+
FUNCTION 7 gin_tsvector_config(internal),
37+
FUNCTION 8 gin_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
38+
FUNCTION 9 gin_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
2039
STORAGE text;

rum.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,10 +362,6 @@ typedef struct GinState
362362
Oid supportCollation[INDEX_MAX_KEYS];
363363
} GinState;
364364

365-
#define GIN_CONFIG_PROC 7
366-
#define GIN_PRE_CONSISTENT_PROC 8
367-
#define GIN_ORDERING_PROC 9
368-
369365
typedef struct GinConfig
370366
{
371367
Oid addInfoTypeOid;
@@ -861,6 +857,15 @@ extern void ginHeapTupleFastCollect(GinState *ginstate,
861857
extern void ginInsertCleanup(GinState *ginstate,
862858
bool vac_delay, IndexBulkDeleteResult *stats);
863859

860+
/* rum_ts_utils.c */
861+
#define GIN_CONFIG_PROC 7
862+
#define GIN_PRE_CONSISTENT_PROC 8
863+
#define GIN_ORDERING_PROC 9
864+
865+
extern Datum gin_tsvector_config(PG_FUNCTION_ARGS);
866+
extern Datum gin_tsquery_pre_consistent(PG_FUNCTION_ARGS);
867+
extern Datum gin_tsquery_distance(PG_FUNCTION_ARGS);
868+
864869
/*
865870
* Functions for reading ItemPointers with additional information. Used in
866871
* various .c files and have to be inline for being fast.

rum_ts_utils.c

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* rum_ts_utils.c
4+
* various support functions
5+
*
6+
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7+
*
8+
*-------------------------------------------------------------------------
9+
*/
10+
11+
#include "postgres.h"
12+
13+
#include "catalog/pg_type.h"
14+
#include "tsearch/ts_type.h"
15+
#include "tsearch/ts_utils.h"
16+
17+
#include "rum.h"
18+
19+
#include <math.h>
20+
21+
PG_FUNCTION_INFO_V1(gin_tsvector_config);
22+
PG_FUNCTION_INFO_V1(gin_tsquery_pre_consistent);
23+
PG_FUNCTION_INFO_V1(gin_tsquery_distance);
24+
25+
static float calc_rank_and(float *w, Datum *addInfo, bool *addInfoIsNull,
26+
int size);
27+
static float calc_rank_or(float *w, Datum *addInfo, bool *addInfoIsNull,
28+
int size);
29+
30+
typedef struct
31+
{
32+
QueryItem *first_item;
33+
bool *check;
34+
int *map_item_operand;
35+
bool *need_recheck;
36+
} GinChkVal;
37+
38+
static bool
39+
checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
40+
{
41+
GinChkVal *gcv = (GinChkVal *) checkval;
42+
int j;
43+
44+
/* if any val requiring a weight is used, set recheck flag */
45+
if (val->weight != 0)
46+
*(gcv->need_recheck) = true;
47+
48+
/* convert item's number to corresponding entry's (operand's) number */
49+
j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item];
50+
51+
/* return presence of current entry in indexed value */
52+
return gcv->check[j];
53+
}
54+
55+
Datum
56+
gin_tsquery_pre_consistent(PG_FUNCTION_ARGS)
57+
{
58+
bool *check = (bool *) PG_GETARG_POINTER(0);
59+
60+
TSQuery query = PG_GETARG_TSQUERY(2);
61+
62+
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
63+
bool recheck;
64+
bool res = FALSE;
65+
66+
if (query->size > 0)
67+
{
68+
QueryItem *item;
69+
GinChkVal gcv;
70+
71+
/*
72+
* check-parameter array has one entry for each value (operand) in the
73+
* query.
74+
*/
75+
gcv.first_item = item = GETQUERY(query);
76+
gcv.check = check;
77+
gcv.map_item_operand = (int *) (extra_data[0]);
78+
gcv.need_recheck = &recheck;
79+
80+
res = TS_execute(GETQUERY(query),
81+
&gcv,
82+
false,
83+
checkcondition_gin);
84+
}
85+
86+
PG_RETURN_BOOL(res);
87+
}
88+
89+
static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
90+
91+
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
92+
/* A dummy WordEntryPos array to use when haspos is false */
93+
static WordEntryPosVector POSNULL = {
94+
1, /* Number of elements that follow */
95+
{0}
96+
};
97+
98+
#define LOWERMASK 0x1F
99+
100+
/*
101+
* Returns a weight of a word collocation
102+
*/
103+
static float4
104+
word_distance(int32 w)
105+
{
106+
if (w > 100)
107+
return 1e-30f;
108+
109+
return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
110+
}
111+
112+
static char *
113+
decompress_pos(char *ptr, uint16 *pos)
114+
{
115+
int i;
116+
uint8 v;
117+
uint16 delta = 0;
118+
119+
i = 0;
120+
while (true)
121+
{
122+
v = *ptr;
123+
ptr++;
124+
if (v & HIGHBIT)
125+
{
126+
delta |= (v & (~HIGHBIT)) << i;
127+
}
128+
else
129+
{
130+
delta |= (v & LOWERMASK) << i;
131+
*pos += delta;
132+
WEP_SETWEIGHT(*pos, v >> 5);
133+
return ptr;
134+
}
135+
i += 7;
136+
}
137+
}
138+
139+
static int
140+
count_pos(char *ptr, int len)
141+
{
142+
int count = 0, i;
143+
for (i = 0; i < len; i++)
144+
{
145+
if (!(ptr[i] & HIGHBIT))
146+
count++;
147+
}
148+
return count;
149+
}
150+
151+
static float
152+
calc_rank_and(float *w, Datum *addInfo, bool *addInfoIsNull, int size)
153+
{
154+
int i,
155+
k,
156+
l,
157+
p;
158+
WordEntryPos post,
159+
ct;
160+
int32 dimt,
161+
lenct,
162+
dist;
163+
float res = -1.0;
164+
char *ptrt, *ptrc;
165+
166+
if (size < 2)
167+
{
168+
return calc_rank_or(w, addInfo, addInfoIsNull, size);
169+
}
170+
WEP_SETPOS(POSNULL.pos[0], MAXENTRYPOS - 1);
171+
172+
for (i = 0; i < size; i++)
173+
{
174+
if (!addInfoIsNull[i])
175+
{
176+
dimt = count_pos(VARDATA_ANY(addInfo[i]), VARSIZE_ANY_EXHDR(addInfo[i]));
177+
ptrt = (char *)VARDATA_ANY(addInfo[i]);
178+
}
179+
else
180+
{
181+
dimt = POSNULL.npos;
182+
ptrt = (char *)POSNULL.pos;
183+
}
184+
for (k = 0; k < i; k++)
185+
{
186+
if (!addInfoIsNull[k])
187+
lenct = count_pos(VARDATA_ANY(addInfo[k]), VARSIZE_ANY_EXHDR(addInfo[k]));
188+
else
189+
lenct = POSNULL.npos;
190+
post = 0;
191+
for (l = 0; l < dimt; l++)
192+
{
193+
ptrt = decompress_pos(ptrt, &post);
194+
ct = 0;
195+
if (!addInfoIsNull[k])
196+
ptrc = (char *)VARDATA_ANY(addInfo[k]);
197+
else
198+
ptrc = (char *)POSNULL.pos;
199+
for (p = 0; p < lenct; p++)
200+
{
201+
ptrc = decompress_pos(ptrc, &ct);
202+
dist = Abs((int) WEP_GETPOS(post) - (int) WEP_GETPOS(ct));
203+
if (dist || (dist == 0 && (ptrt == (char *)POSNULL.pos || ptrc == (char *)POSNULL.pos)))
204+
{
205+
float curw;
206+
207+
if (!dist)
208+
dist = MAXENTRYPOS;
209+
curw = sqrt(wpos(post) * wpos(ct) * word_distance(dist));
210+
res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
211+
}
212+
}
213+
}
214+
}
215+
216+
}
217+
return res;
218+
}
219+
220+
static float
221+
calc_rank_or(float *w, Datum *addInfo, bool *addInfoIsNull, int size)
222+
{
223+
WordEntryPos post;
224+
int32 dimt,
225+
j,
226+
i;
227+
float res = 0.0;
228+
char *ptrt;
229+
230+
for (i = 0; i < size; i++)
231+
{
232+
float resj,
233+
wjm;
234+
int32 jm;
235+
236+
if (!addInfoIsNull[i])
237+
{
238+
dimt = count_pos(VARDATA_ANY(addInfo[i]), VARSIZE_ANY_EXHDR(addInfo[i]));
239+
ptrt = (char *)VARDATA_ANY(addInfo[i]);
240+
}
241+
else
242+
{
243+
dimt = POSNULL.npos;
244+
ptrt = (char *)POSNULL.pos;
245+
}
246+
247+
resj = 0.0;
248+
wjm = -1.0;
249+
jm = 0;
250+
post = 0;
251+
for (j = 0; j < dimt; j++)
252+
{
253+
ptrt = decompress_pos(ptrt, &post);
254+
resj = resj + wpos(post) / ((j + 1) * (j + 1));
255+
if (wpos(post) > wjm)
256+
{
257+
wjm = wpos(post);
258+
jm = j;
259+
}
260+
}
261+
/*
262+
limit (sum(i/i^2),i->inf) = pi^2/6
263+
resj = sum(wi/i^2),i=1,noccurence,
264+
wi - should be sorted desc,
265+
don't sort for now, just choose maximum weight. This should be corrected
266+
Oleg Bartunov
267+
*/
268+
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
269+
270+
}
271+
if (size > 0)
272+
res = res / size;
273+
return res;
274+
}
275+
276+
static float
277+
calc_rank(float *w, TSQuery q, Datum *addInfo, bool *addInfoIsNull, int size)
278+
{
279+
QueryItem *item = GETQUERY(q);
280+
float res = 0.0;
281+
282+
if (!size || !q->size)
283+
return 0.0;
284+
285+
/* XXX: What about NOT? */
286+
res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ?
287+
calc_rank_and(w, addInfo, addInfoIsNull, size) : calc_rank_or(w, addInfo, addInfoIsNull, size);
288+
289+
if (res < 0)
290+
res = 1e-20f;
291+
292+
return res;
293+
}
294+
295+
Datum
296+
gin_tsquery_distance(PG_FUNCTION_ARGS)
297+
{
298+
/* bool *check = (bool *) PG_GETARG_POINTER(0); */
299+
300+
/* StrategyNumber strategy = PG_GETARG_UINT16(1); */
301+
TSQuery query = PG_GETARG_TSQUERY(2);
302+
303+
int32 nkeys = PG_GETARG_INT32(3);
304+
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
305+
Datum *addInfo = (Datum *) PG_GETARG_POINTER(8);
306+
bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9);
307+
float8 res;
308+
309+
res = 1.0 / (float8)calc_rank(weights, query, addInfo, addInfoIsNull, nkeys);
310+
311+
PG_RETURN_FLOAT8(res);
312+
}
313+
314+
Datum
315+
gin_tsvector_config(PG_FUNCTION_ARGS)
316+
{
317+
GinConfig *config = (GinConfig *)PG_GETARG_POINTER(0);
318+
config->addInfoTypeOid = BYTEAOID;
319+
PG_RETURN_VOID();
320+
}

0 commit comments

Comments
 (0)