Skip to content

Commit 38c4fe8

Browse files
committed
Significantly improve ranking:
1) rank_cd now use weight of lexemes 2) rank_cd and rank can use any combination of normalization methods: no normalization normalization by log(length of document) -----/------- by length of document -----/------- by number of unique word in document -----/------- by log(number of unique word in document) -----/------- by number of covers (only rank_cd) Improve cover's search. TODO: changes in documentation
1 parent 85fa9f5 commit 38c4fe8

File tree

3 files changed

+161
-92
lines changed

3 files changed

+161
-92
lines changed

contrib/tsearch2/expected/tsearch2.out

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2315,9 +2315,9 @@ An hour of storm to place
23152315
The sculpture of these granite seams,
23162316
Upon a woman s face. E. J. Pratt (1882 1964)
23172317
'), to_tsquery('sea&thousand&years'));
2318-
rank_cd
2319-
---------
2320-
1.2
2318+
rank_cd
2319+
-----------
2320+
0.0555556
23212321
(1 row)
23222322

23232323
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2329,9 +2329,9 @@ An hour of storm to place
23292329
The sculpture of these granite seams,
23302330
Upon a woman s face. E. J. Pratt (1882 1964)
23312331
'), to_tsquery('granite&sea'));
2332-
rank_cd
2333-
----------
2334-
0.880303
2332+
rank_cd
2333+
-----------
2334+
0.0238095
23352335
(1 row)
23362336

23372337
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2345,7 +2345,7 @@ Upon a woman s face. E. J. Pratt (1882 1964)
23452345
'), to_tsquery('sea'));
23462346
rank_cd
23472347
---------
2348-
2
2348+
0.2
23492349
(1 row)
23502350

23512351
select get_covers(to_tsvector('Erosion It took the sea a thousand years,

contrib/tsearch2/rank.c

Lines changed: 152 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
4141

4242
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
4343

44-
#define DEF_NORM_METHOD 0
44+
#define RANK_NO_NORM 0x00
45+
#define RANK_NORM_LOGLENGTH 0x01
46+
#define RANK_NORM_LENGTH 0x02
47+
#define RANK_NORM_EXTDIST 0x04
48+
#define RANK_NORM_UNIQ 0x08
49+
#define RANK_NORM_LOGUNIQ 0x10
50+
#define DEF_NORM_METHOD RANK_NO_NORM
4551

4652
static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
4753
static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
328334
if (res < 0)
329335
res = 1e-20;
330336

331-
switch (method)
332-
{
333-
case 0:
334-
break;
335-
case 1:
336-
res /= log((float) (cnt_length(t) + 1)) / log(2.0);
337-
break;
338-
case 2:
339-
len = cnt_length(t);
340-
if (len > 0)
341-
res /= (float) len;
342-
break;
343-
default:
344-
/* internal error */
345-
elog(ERROR, "unrecognized normalization method: %d", method);
337+
if ( (method & RANK_NORM_LOGLENGTH) && t->size>0 )
338+
res /= log((double) (cnt_length(t) + 1)) / log(2.0);
339+
340+
if ( method & RANK_NORM_LENGTH ) {
341+
len = cnt_length(t);
342+
if ( len>0 )
343+
res /= (float) len;
346344
}
347345

346+
if ( (method & RANK_NORM_UNIQ) && t->size > 0 )
347+
res /= (float)( t->size );
348+
349+
if ( (method & RANK_NORM_LOGUNIQ) && t->size > 0 )
350+
res /= log((double) (t->size + 1)) / log(2.0);
351+
348352
return res;
349353
}
350354

@@ -420,6 +424,7 @@ typedef struct
420424
ITEM **item;
421425
int16 nitem;
422426
bool needfree;
427+
uint8 wclass;
423428
int32 pos;
424429
} DocRepresentation;
425430

@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
452457
}
453458
}
454459

460+
typedef struct {
461+
int pos;
462+
int p;
463+
int q;
464+
DocRepresentation *begin;
465+
DocRepresentation *end;
466+
} Extention;
467+
468+
455469
static bool
456-
Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
470+
Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention *ext)
457471
{
458472
DocRepresentation *ptr;
459-
int lastpos = *pos;
473+
int lastpos = ext->pos;
460474
int i;
461475
bool found = false;
462476

463477
reset_istrue_flag(query);
464478

465-
*p = 0x7fffffff;
466-
*q = 0;
467-
ptr = doc + *pos;
479+
ext->p = 0x7fffffff;
480+
ext->q = 0;
481+
ptr = doc + ext->pos;
468482

469483
/* find upper bound of cover from current position, move up */
470484
while (ptr - doc < len)
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
473487
ptr->item[i]->istrue = 1;
474488
if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM))
475489
{
476-
if (ptr->pos > *q)
490+
if (ptr->pos > ext->q)
477491
{
478-
*q = ptr->pos;
492+
ext->q = ptr->pos;
493+
ext->end = ptr;
479494
lastpos = ptr - doc;
480495
found = true;
481496
}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
498513
ptr->item[i]->istrue = 1;
499514
if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM))
500515
{
501-
if (ptr->pos < *p)
502-
*p = ptr->pos;
516+
if (ptr->pos < ext->p) {
517+
ext->begin = ptr;
518+
ext->p = ptr->pos;
519+
}
503520
break;
504521
}
505522
ptr--;
506523
}
507524

508-
if (*p <= *q)
525+
if (ext->p <= ext->q)
509526
{
510527
/*
511528
* set position for next try to next lexeme after begining of founded
512529
* cover
513530
*/
514-
*pos = (ptr - doc) + 1;
531+
ext->pos = (ptr - doc) + 1;
515532
return true;
516533
}
517534

518-
(*pos)++;
519-
return Cover(doc, len, query, pos, p, q);
535+
ext->pos++;
536+
return Cover(doc, len, query, ext);
520537
}
521538

522539
static DocRepresentation *
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
593610
doc[cur].item = doc[cur - 1].item;
594611
}
595612
doc[cur].pos = WEP_GETPOS(post[j]);
613+
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
596614
cur++;
597615
}
598616
}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
610628
return NULL;
611629
}
612630

613-
614-
Datum
615-
rank_cd(PG_FUNCTION_ARGS)
616-
{
617-
int K = PG_GETARG_INT32(0);
618-
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
619-
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
620-
int method = DEF_NORM_METHOD;
631+
static float4
632+
calc_rank_cd(float4 *arrdata, tsvector *txt, QUERYTYPE *query, int method) {
621633
DocRepresentation *doc;
622-
float res = 0.0;
623-
int p = 0,
624-
q = 0,
625-
len,
626-
cur,
634+
int len,
627635
i,
628636
doclen = 0;
637+
Extention ext;
638+
double Wdoc = 0.0;
639+
double invws[lengthof(weights)];
640+
double SumDist=0.0, PrevExtPos=0.0, CurExtPos=0.0;
641+
int NExtent=0;
629642

630-
doc = get_docrep(txt, query, &doclen);
631-
if (!doc)
643+
for (i = 0; i < lengthof(weights); i++)
632644
{
633-
PG_FREE_IF_COPY(txt, 1);
634-
PG_FREE_IF_COPY(query, 2);
635-
PG_RETURN_FLOAT4(0.0);
645+
invws[i] = ((double)((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
646+
if (invws[i] > 1.0)
647+
ereport(ERROR,
648+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
649+
errmsg("weight out of range")));
650+
invws[i] = 1.0/invws[i];
636651
}
637652

638-
cur = 0;
639-
if (K <= 0)
640-
K = 4;
641-
while (Cover(doc, doclen, query, &cur, &p, &q))
642-
res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
653+
doc = get_docrep(txt, query, &doclen);
654+
if (!doc)
655+
return 0.0;
643656

644-
if (PG_NARGS() == 4)
645-
method = PG_GETARG_INT32(3);
657+
MemSet( &ext, 0, sizeof(Extention) );
658+
while (Cover(doc, doclen, query, &ext)) {
659+
double Cpos = 0.0;
660+
double InvSum = 0.0;
661+
DocRepresentation *ptr = ext.begin;
646662

647-
switch (method)
648-
{
649-
case 0:
650-
break;
651-
case 1:
652-
res /= log((float) (cnt_length(txt) + 1));
653-
break;
654-
case 2:
655-
len = cnt_length(txt);
656-
if (len > 0)
657-
res /= (float) len;
658-
break;
659-
default:
660-
/* internal error */
661-
elog(ERROR, "unrecognized normalization method: %d", method);
663+
while ( ptr<=ext.end ) {
664+
InvSum += invws[ ptr->wclass ];
665+
ptr++;
666+
}
667+
668+
Cpos = ((double)( ext.end-ext.begin+1 )) / InvSum;
669+
Wdoc += Cpos / ( (double)(( 1 + (ext.q - ext.p) - (ext.end - ext.begin) )) );
670+
671+
CurExtPos = ((double)(ext.q + ext.p))/2.0;
672+
if ( NExtent>0 && CurExtPos > PrevExtPos /* prevent devision by zero in a case of multiple lexize */ )
673+
SumDist += 1.0/( CurExtPos - PrevExtPos );
674+
675+
PrevExtPos = CurExtPos;
676+
NExtent++;
677+
}
678+
679+
if ( (method & RANK_NORM_LOGLENGTH) && txt->size > 0 )
680+
Wdoc /= log((double) (cnt_length(txt) + 1));
681+
682+
if ( method & RANK_NORM_LENGTH ) {
683+
len = cnt_length(txt);
684+
if ( len>0 )
685+
Wdoc /= (double) len;
662686
}
663687

688+
if ( (method & RANK_NORM_EXTDIST) && SumDist > 0 )
689+
Wdoc /= ((double)NExtent) / SumDist;
690+
691+
if ( (method & RANK_NORM_UNIQ) && txt->size > 0 )
692+
Wdoc /= (double)( txt->size );
693+
694+
if ( (method & RANK_NORM_LOGUNIQ) && txt->size > 0 )
695+
Wdoc /= log((double) (txt->size + 1)) / log(2.0);
696+
664697
for (i = 0; i < doclen; i++)
665698
if (doc[i].needfree)
666699
pfree(doc[i].item);
667700
pfree(doc);
701+
702+
return (float4)Wdoc;
703+
}
704+
705+
Datum
706+
rank_cd(PG_FUNCTION_ARGS)
707+
{
708+
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
709+
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
710+
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
711+
int method = DEF_NORM_METHOD;
712+
float4 res;
713+
714+
if (ARR_NDIM(win) != 1)
715+
ereport(ERROR,
716+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
717+
errmsg("array of weight must be one-dimensional")));
718+
719+
if (ARRNELEMS(win) < lengthof(weights))
720+
ereport(ERROR,
721+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
722+
errmsg("array of weight is too short")));
723+
724+
if (ARR_HASNULL(win))
725+
ereport(ERROR,
726+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
727+
errmsg("array of weight must not contain nulls")));
728+
729+
if (PG_NARGS() == 4)
730+
method = PG_GETARG_INT32(3);
731+
732+
res = calc_rank_cd( (float4 *) ARR_DATA_PTR(win), txt, query, method);
733+
734+
PG_FREE_IF_COPY(win, 0);
668735
PG_FREE_IF_COPY(txt, 1);
669736
PG_FREE_IF_COPY(query, 2);
670737

@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
675742
Datum
676743
rank_cd_def(PG_FUNCTION_ARGS)
677744
{
678-
PG_RETURN_DATUM(DirectFunctionCall4(
679-
rank_cd,
680-
Int32GetDatum(-1),
681-
PG_GETARG_DATUM(0),
682-
PG_GETARG_DATUM(1),
683-
(PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
684-
));
745+
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
746+
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
747+
float4 res;
748+
749+
res = calc_rank_cd( weights, txt, query, (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : DEF_NORM_METHOD);
750+
751+
PG_FREE_IF_COPY(txt, 1);
752+
PG_FREE_IF_COPY(query, 2);
753+
754+
PG_RETURN_FLOAT4(res);
685755
}
686756

687757
/**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
721791
text *out;
722792
char *cptr;
723793
DocRepresentation *doc;
724-
int pos = 0,
725-
p,
726-
q,
727-
olddwpos = 0;
794+
int olddwpos = 0;
728795
int ncover = 1;
796+
Extention ext;
729797

730798
doc = get_docrep(txt, query, &rlen);
731799

@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
765833
}
766834
qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
767835

768-
while (Cover(doc, rlen, query, &pos, &p, &q))
836+
MemSet( &ext, 0, sizeof(Extention) );
837+
while (Cover(doc, rlen, query, &ext))
769838
{
770839
dwptr = dw + olddwpos;
771-
while (dwptr->pos < p && dwptr - dw < dlen)
840+
while (dwptr->pos < ext.p && dwptr - dw < dlen)
772841
dwptr++;
773842
olddwpos = dwptr - dw;
774843
dwptr->start = ncover;
775-
while (dwptr->pos < q + 1 && dwptr - dw < dlen)
844+
while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen)
776845
dwptr++;
777846
(dwptr - 1)->finish = ncover;
778847
len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;

0 commit comments

Comments
 (0)