Skip to content

Commit 2a0083e

Browse files
committed
Improve headline generation. Now headline can contain
several fragments a-la Google. Sushant Sinha <sushant354@gmail.com>
1 parent 906b7e5 commit 2a0083e

File tree

6 files changed

+518
-63
lines changed

6 files changed

+518
-63
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.45 2008/09/23 09:20:34 heikki Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.46 2008/10/17 18:05:19 teodor Exp $ -->
22

33
<chapter id="textsearch">
44
<title id="textsearch-title">Full Text Search</title>
@@ -1098,6 +1098,29 @@ ORDER BY rank DESC LIMIT 10;
10981098
value of three eliminates the English articles.
10991099
</para>
11001100
</listitem>
1101+
<listitem>
1102+
<para>
1103+
<literal>MaxFragments</literal>: maximum number of text excerpts
1104+
or fragments that match the query words. It also triggers a
1105+
different headline generation function than the default one. This
1106+
function finds text fragments with as many query words as possible and
1107+
stretches those fragments around the query words. As a result
1108+
query words are close to the middle of each fragment and have words on
1109+
each side. Each fragment will be of at most MaxWords and will not
1110+
have words of size less than or equal to ShortWord at the start or
1111+
end of a fragment. If not all query words are found in the document,
1112+
then a single fragment of MinWords will be displayed.
1113+
</para>
1114+
</listitem>
1115+
<listitem>
1116+
<para>
1117+
<literal>FragmentDelimiter</literal>: When more than one fragment is
1118+
displayed, then the fragments will be separated by this delimiter. This
1119+
option is effective only if MaxFragments is greater than 1 and there is
1119+
more than one fragment to be displayed. This option has no effect on the
1121+
default headline generation function.
1122+
</para>
1123+
</listitem>
11011124
<listitem>
11021125
<para>
11031126
<literal>HighlightAll</literal>: Boolean flag; if
@@ -1109,7 +1132,7 @@ ORDER BY rank DESC LIMIT 10;
11091132
Any unspecified options receive these defaults:
11101133

11111134
<programlisting>
1112-
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
1135+
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
11131136
</programlisting>
11141137
</para>
11151138

src/backend/tsearch/ts_parse.c

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.9 2008/10/17 18:05:19 teodor Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -583,16 +583,19 @@ text *
583583
generateHeadline(HeadlineParsedText *prs)
584584
{
585585
text *out;
586-
int len = 128;
587586
char *ptr;
587+
int len = 128;
588+
int numfragments = 0;
589+
int2 infrag = 0;
590+
588591
HeadlineWordEntry *wrd = prs->words;
589592

590593
out = (text *) palloc(len);
591594
ptr = ((char *) out) + VARHDRSZ;
592595

593596
while (wrd - prs->words < prs->curwords)
594597
{
595-
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
598+
while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
596599
{
597600
int dist = ptr - ((char *) out);
598601

@@ -603,6 +606,20 @@ generateHeadline(HeadlineParsedText *prs)
603606

604607
if (wrd->in && !wrd->repeated)
605608
{
609+
if (!infrag)
610+
{
611+
612+
/* start of a new fragment */
613+
infrag = 1;
614+
numfragments ++;
615+
/* add a fragment delimiter if this is after the first one */
616+
if (numfragments > 1)
617+
{
618+
memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
619+
ptr += prs->fragdelimlen;
620+
}
621+
622+
}
606623
if (wrd->replace)
607624
{
608625
*ptr = ' ';
@@ -625,7 +642,11 @@ generateHeadline(HeadlineParsedText *prs)
625642
}
626643
}
627644
else if (!wrd->repeated)
645+
{
646+
if (infrag)
647+
infrag = 0;
628648
pfree(wrd->word);
649+
}
629650

630651
wrd++;
631652
}

0 commit comments

Comments
 (0)