@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
41
41
42
42
#define wpos (wep ) ( w[ WEP_GETWEIGHT(wep) ] )
43
43
44
/*
 * Rank normalization flags.  The "method" argument of the ranking functions
 * is a bitmask: each flag is tested independently with "method & FLAG", so
 * several normalizations can be combined in a single call.
 */
- #define DEF_NORM_METHOD 0
44
+ #define RANK_NO_NORM 0x00 /* no bit set: the rank is returned unnormalized */
45
+ #define RANK_NORM_LOGLENGTH 0x01 /* divide by a logarithm of cnt_length() + 1 */
46
+ #define RANK_NORM_LENGTH 0x02 /* divide by cnt_length() when it is positive */
47
+ #define RANK_NORM_EXTDIST 0x04 /* divide by NExtent / SumDist; in the visible code only calc_rank_cd tests it */
48
+ #define RANK_NORM_UNIQ 0x08 /* divide by the tsvector's size field (presumably the unique-lexeme count -- confirm) */
49
+ #define RANK_NORM_LOGUNIQ 0x10 /* divide by log2(size + 1) */
50
+ #define DEF_NORM_METHOD RANK_NO_NORM /* used when the caller supplies no method argument */
45
51
46
52
static float calc_rank_or (float * w , tsvector * t , QUERYTYPE * q );
47
53
static float calc_rank_and (float * w , tsvector * t , QUERYTYPE * q );
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
328
334
if (res < 0 )
329
335
res = 1e-20 ;
330
336
331
- switch (method )
332
- {
333
- case 0 :
334
- break ;
335
- case 1 :
336
- res /= log ((float ) (cnt_length (t ) + 1 )) / log (2.0 );
337
- break ;
338
- case 2 :
339
- len = cnt_length (t );
340
- if (len > 0 )
341
- res /= (float ) len ;
342
- break ;
343
- default :
344
- /* internal error */
345
- elog (ERROR , "unrecognized normalization method: %d" , method );
337
+ if ( (method & RANK_NORM_LOGLENGTH ) && t -> size > 0 )
338
+ res /= log ((double ) (cnt_length (t ) + 1 )) / log (2.0 );
339
+
340
+ if ( method & RANK_NORM_LENGTH ) {
341
+ len = cnt_length (t );
342
+ if ( len > 0 )
343
+ res /= (float ) len ;
346
344
}
347
345
346
+ if ( (method & RANK_NORM_UNIQ ) && t -> size > 0 )
347
+ res /= (float )( t -> size );
348
+
349
+ if ( (method & RANK_NORM_LOGUNIQ ) && t -> size > 0 )
350
+ res /= log ((double ) (t -> size + 1 )) / log (2.0 );
351
+
348
352
return res ;
349
353
}
350
354
@@ -420,6 +424,7 @@ typedef struct
420
424
ITEM * * item ;
421
425
int16 nitem ;
422
426
bool needfree ;
427
+ uint8 wclass ;
423
428
int32 pos ;
424
429
} DocRepresentation ;
425
430
@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
452
457
}
453
458
}
454
459
460
/*
 * Search state and result of Cover().  The caller zeroes the structure once
 * and then passes it to Cover() repeatedly; each successful call describes
 * one cover and leaves pos ready for the next call.
 */
+ typedef struct {
461
+ int pos ; /* index into the doc array at which the next Cover() scan starts */
462
+ int p ; /* word position of the cover's lower bound (reset to 0x7fffffff on entry) */
463
+ int q ; /* word position of the cover's upper bound (reset to 0 on entry) */
464
+ DocRepresentation * begin ; /* doc entry at which p was found */
465
+ DocRepresentation * end ; /* doc entry at which q was found */
466
+ } Extention ;
467
+
468
+
455
469
static bool
456
- Cover (DocRepresentation * doc , int len , QUERYTYPE * query , int * pos , int * p , int * q )
470
+ Cover (DocRepresentation * doc , int len , QUERYTYPE * query , Extention * ext )
457
471
{
458
472
DocRepresentation * ptr ;
459
- int lastpos = * pos ;
473
+ int lastpos = ext -> pos ;
460
474
int i ;
461
475
bool found = false;
462
476
463
477
reset_istrue_flag (query );
464
478
465
- * p = 0x7fffffff ;
466
- * q = 0 ;
467
- ptr = doc + * pos ;
479
+ ext -> p = 0x7fffffff ;
480
+ ext -> q = 0 ;
481
+ ptr = doc + ext -> pos ;
468
482
469
483
/* find upper bound of cover from current position, move up */
470
484
while (ptr - doc < len )
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
473
487
ptr -> item [i ]-> istrue = 1 ;
474
488
if (TS_execute (GETQUERY (query ), NULL , false, checkcondition_ITEM ))
475
489
{
476
- if (ptr -> pos > * q )
490
+ if (ptr -> pos > ext -> q )
477
491
{
478
- * q = ptr -> pos ;
492
+ ext -> q = ptr -> pos ;
493
+ ext -> end = ptr ;
479
494
lastpos = ptr - doc ;
480
495
found = true;
481
496
}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
498
513
ptr -> item [i ]-> istrue = 1 ;
499
514
if (TS_execute (GETQUERY (query ), NULL , true, checkcondition_ITEM ))
500
515
{
501
- if (ptr -> pos < * p )
502
- * p = ptr -> pos ;
516
+ if (ptr -> pos < ext -> p ) {
517
+ ext -> begin = ptr ;
518
+ ext -> p = ptr -> pos ;
519
+ }
503
520
break ;
504
521
}
505
522
ptr -- ;
506
523
}
507
524
508
- if (* p <= * q )
525
+ if (ext -> p <= ext -> q )
509
526
{
510
527
/*
511
528
* set position for the next try to the lexeme right after the beginning
512
529
* of the cover just found
513
530
*/
514
- * pos = (ptr - doc ) + 1 ;
531
+ ext -> pos = (ptr - doc ) + 1 ;
515
532
return true;
516
533
}
517
534
518
- ( * pos ) ++ ;
519
- return Cover (doc , len , query , pos , p , q );
535
+ ext -> pos ++ ;
536
+ return Cover (doc , len , query , ext );
520
537
}
521
538
522
539
static DocRepresentation *
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
593
610
doc [cur ].item = doc [cur - 1 ].item ;
594
611
}
595
612
doc [cur ].pos = WEP_GETPOS (post [j ]);
613
+ doc [cur ].wclass = WEP_GETWEIGHT (post [j ]);
596
614
cur ++ ;
597
615
}
598
616
}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
610
628
return NULL ;
611
629
}
612
630
613
-
614
- Datum
615
- rank_cd (PG_FUNCTION_ARGS )
616
- {
617
- int K = PG_GETARG_INT32 (0 );
618
- tsvector * txt = (tsvector * ) PG_DETOAST_DATUM (PG_GETARG_DATUM (1 ));
619
- QUERYTYPE * query = (QUERYTYPE * ) PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (2 ));
620
- int method = DEF_NORM_METHOD ;
631
+ static float4
632
+ calc_rank_cd (float4 * arrdata , tsvector * txt , QUERYTYPE * query , int method ) {
621
633
DocRepresentation * doc ;
622
- float res = 0.0 ;
623
- int p = 0 ,
624
- q = 0 ,
625
- len ,
626
- cur ,
634
+ int len ,
627
635
i ,
628
636
doclen = 0 ;
637
+ Extention ext ;
638
+ double Wdoc = 0.0 ;
639
+ double invws [lengthof (weights )];
640
+ double SumDist = 0.0 , PrevExtPos = 0.0 , CurExtPos = 0.0 ;
641
+ int NExtent = 0 ;
629
642
630
- doc = get_docrep (txt , query , & doclen );
631
- if (!doc )
643
+ for (i = 0 ; i < lengthof (weights ); i ++ )
632
644
{
633
- PG_FREE_IF_COPY (txt , 1 );
634
- PG_FREE_IF_COPY (query , 2 );
635
- PG_RETURN_FLOAT4 (0.0 );
645
+ invws [i ] = ((double )((arrdata [i ] >= 0 ) ? arrdata [i ] : weights [i ]));
646
+ if (invws [i ] > 1.0 )
647
+ ereport (ERROR ,
648
+ (errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
649
+ errmsg ("weight out of range" )));
650
+ invws [i ] = 1.0 /invws [i ];
636
651
}
637
652
638
- cur = 0 ;
639
- if (K <= 0 )
640
- K = 4 ;
641
- while (Cover (doc , doclen , query , & cur , & p , & q ))
642
- res += (q - p + 1 > K ) ? ((float ) K ) / ((float ) (q - p + 1 )) : 1.0 ;
653
+ doc = get_docrep (txt , query , & doclen );
654
+ if (!doc )
655
+ return 0.0 ;
643
656
644
- if (PG_NARGS () == 4 )
645
- method = PG_GETARG_INT32 (3 );
657
+ MemSet ( & ext , 0 , sizeof (Extention ) );
658
+ while (Cover (doc , doclen , query , & ext )) {
659
+ double Cpos = 0.0 ;
660
+ double InvSum = 0.0 ;
661
+ DocRepresentation * ptr = ext .begin ;
646
662
647
- switch (method )
648
- {
649
- case 0 :
650
- break ;
651
- case 1 :
652
- res /= log ((float ) (cnt_length (txt ) + 1 ));
653
- break ;
654
- case 2 :
655
- len = cnt_length (txt );
656
- if (len > 0 )
657
- res /= (float ) len ;
658
- break ;
659
- default :
660
- /* internal error */
661
- elog (ERROR , "unrecognized normalization method: %d" , method );
663
+ while ( ptr <=ext .end ) {
664
+ InvSum += invws [ ptr -> wclass ];
665
+ ptr ++ ;
666
+ }
667
+
668
+ Cpos = ((double )( ext .end - ext .begin + 1 )) / InvSum ;
669
+ Wdoc += Cpos / ( (double )(( 1 + (ext .q - ext .p ) - (ext .end - ext .begin ) )) );
670
+
671
+ CurExtPos = ((double )(ext .q + ext .p ))/2.0 ;
672
+ if ( NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by zero in a case of multiple lexize */ )
673
+ SumDist += 1.0 /( CurExtPos - PrevExtPos );
674
+
675
+ PrevExtPos = CurExtPos ;
676
+ NExtent ++ ;
677
+ }
678
+
679
+ if ( (method & RANK_NORM_LOGLENGTH ) && txt -> size > 0 )
680
+ Wdoc /= log ((double ) (cnt_length (txt ) + 1 ));
681
+
682
+ if ( method & RANK_NORM_LENGTH ) {
683
+ len = cnt_length (txt );
684
+ if ( len > 0 )
685
+ Wdoc /= (double ) len ;
662
686
}
663
687
688
+ if ( (method & RANK_NORM_EXTDIST ) && SumDist > 0 )
689
+ Wdoc /= ((double )NExtent ) / SumDist ;
690
+
691
+ if ( (method & RANK_NORM_UNIQ ) && txt -> size > 0 )
692
+ Wdoc /= (double )( txt -> size );
693
+
694
+ if ( (method & RANK_NORM_LOGUNIQ ) && txt -> size > 0 )
695
+ Wdoc /= log ((double ) (txt -> size + 1 )) / log (2.0 );
696
+
664
697
for (i = 0 ; i < doclen ; i ++ )
665
698
if (doc [i ].needfree )
666
699
pfree (doc [i ].item );
667
700
pfree (doc );
701
+
702
+ return (float4 )Wdoc ;
703
+ }
704
+
705
+ Datum
706
+ rank_cd (PG_FUNCTION_ARGS )
707
+ {
708
+ ArrayType * win = (ArrayType * ) PG_DETOAST_DATUM (PG_GETARG_DATUM (0 ));
709
+ tsvector * txt = (tsvector * ) PG_DETOAST_DATUM (PG_GETARG_DATUM (1 ));
710
+ QUERYTYPE * query = (QUERYTYPE * ) PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (2 ));
711
+ int method = DEF_NORM_METHOD ;
712
+ float4 res ;
713
+
714
+ if (ARR_NDIM (win ) != 1 )
715
+ ereport (ERROR ,
716
+ (errcode (ERRCODE_ARRAY_SUBSCRIPT_ERROR ),
717
+ errmsg ("array of weight must be one-dimensional" )));
718
+
719
+ if (ARRNELEMS (win ) < lengthof (weights ))
720
+ ereport (ERROR ,
721
+ (errcode (ERRCODE_ARRAY_SUBSCRIPT_ERROR ),
722
+ errmsg ("array of weight is too short" )));
723
+
724
+ if (ARR_HASNULL (win ))
725
+ ereport (ERROR ,
726
+ (errcode (ERRCODE_NULL_VALUE_NOT_ALLOWED ),
727
+ errmsg ("array of weight must not contain nulls" )));
728
+
729
+ if (PG_NARGS () == 4 )
730
+ method = PG_GETARG_INT32 (3 );
731
+
732
+ res = calc_rank_cd ( (float4 * ) ARR_DATA_PTR (win ), txt , query , method );
733
+
734
+ PG_FREE_IF_COPY (win , 0 );
668
735
PG_FREE_IF_COPY (txt , 1 );
669
736
PG_FREE_IF_COPY (query , 2 );
670
737
@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
675
742
Datum
676
743
rank_cd_def (PG_FUNCTION_ARGS )
677
744
{
678
- PG_RETURN_DATUM (DirectFunctionCall4 (
679
- rank_cd ,
680
- Int32GetDatum (-1 ),
681
- PG_GETARG_DATUM (0 ),
682
- PG_GETARG_DATUM (1 ),
683
- (PG_NARGS () == 3 ) ? PG_GETARG_DATUM (2 ) : Int32GetDatum (DEF_NORM_METHOD )
684
- ));
745
+ tsvector * txt = (tsvector * ) PG_DETOAST_DATUM (PG_GETARG_DATUM (0 ));
746
+ QUERYTYPE * query = (QUERYTYPE * ) PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (1 ));
747
+ float4 res ;
748
+
749
+ res = calc_rank_cd ( weights , txt , query , (PG_NARGS () == 3 ) ? PG_GETARG_DATUM (2 ) : DEF_NORM_METHOD );
750
+
751
+ PG_FREE_IF_COPY (txt , 1 );
752
+ PG_FREE_IF_COPY (query , 2 );
753
+
754
+ PG_RETURN_FLOAT4 (res );
685
755
}
686
756
687
757
/**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
721
791
text * out ;
722
792
char * cptr ;
723
793
DocRepresentation * doc ;
724
- int pos = 0 ,
725
- p ,
726
- q ,
727
- olddwpos = 0 ;
794
+ int olddwpos = 0 ;
728
795
int ncover = 1 ;
796
+ Extention ext ;
729
797
730
798
doc = get_docrep (txt , query , & rlen );
731
799
@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
765
833
}
766
834
qsort ((void * ) dw , dlen , sizeof (DocWord ), compareDocWord );
767
835
768
- while (Cover (doc , rlen , query , & pos , & p , & q ))
836
+ MemSet ( & ext , 0 , sizeof (Extention ) );
837
+ while (Cover (doc , rlen , query , & ext ))
769
838
{
770
839
dwptr = dw + olddwpos ;
771
- while (dwptr -> pos < p && dwptr - dw < dlen )
840
+ while (dwptr -> pos < ext . p && dwptr - dw < dlen )
772
841
dwptr ++ ;
773
842
olddwpos = dwptr - dw ;
774
843
dwptr -> start = ncover ;
775
- while (dwptr -> pos < q + 1 && dwptr - dw < dlen )
844
+ while (dwptr -> pos < ext . q + 1 && dwptr - dw < dlen )
776
845
dwptr ++ ;
777
846
(dwptr - 1 )-> finish = ncover ;
778
847
len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
0 commit comments