Skip to content

Commit b886053

Browse files
committed
tsearch2 module
1 parent a605382 commit b886053

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+16908
-0
lines changed

contrib/tsearch2/Makefile

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Build rules for the tsearch2 contrib module.
subdir = contrib/tsearch2
2+
top_builddir = ../..
3+
include $(top_builddir)/src/Makefile.global
4+
5+
6+
# Put the module directory and its snowball, ispell and wordparser
# subdirectories on the include path, ahead of the inherited CPPFLAGS.
override CPPFLAGS := -I. -I./snowball -I./ispell -I./wordparser $(CPPFLAGS)
7+
8+
MODULE_big = tsearch2
9+
# Object files listed for the tsearch2 module.
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
10+
snowball/english_stem.o snowball/api.o snowball/russian_stem.o snowball/utilities.o \
11+
dict_snowball.o ispell/spell.o dict_ispell.o dict_syn.o \
12+
wparser.o wordparser/parser.o wordparser/deflex.o wparser_def.o \
13+
ts_cfg.o tsvector.o rewrite.o crc32.o query.o gistidx.o \
14+
tsvector_op.o rank.o ts_stat.o
15+
16+
DATA_built = tsearch2.sql untsearch2.sql
17+
DOCS = README.tsearch2
18+
REGRESS = tsearch2
19+
20+
# Generate the word parser from its flex source. When FLEX is not defined,
# the $(missing) helper is invoked instead.
wordparser/parser.c: wordparser/parser.l
21+
ifdef FLEX
22+
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
23+
else
24+
@$(missing) flex $< $@
25+
endif
26+
27+
# Generated files to remove in addition to the default clean targets.
EXTRA_CLEAN = wordparser/parser.c tsearch2.sql.in
28+
29+
SHLIB_LINK := -lm
30+
include $(top_srcdir)/contrib/contrib-global.mk
31+
# DO NOT DELETE
32+
33+
# Installing the module also installs the stop word lists.
install: installstop
34+
35+
# Copy every stopword/*.stop file into $(datadir).
installstop:
36+
cp stopword/*.stop $(datadir)
37+
38+
39+
# Replace each DATA_PATH placeholder with $(datadir) while producing
# tsearch2.sql.in from its prerequisite.
tsearch2.sql.in: tsearch.sql._in
40+
sed 's,DATA_PATH,$(datadir),g' < $< > $@
41+
42+
# The uninstall script is copied from its prerequisite unchanged.
untsearch2.sql: untsearch.sql.in
43+
cp $< $@
44+

contrib/tsearch2/README.tsearch2

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
Tsearch2 - full text search extension for PostgreSQL
2+
3+
[10][Online version] of this document is available
4+
5+
This module is sponsored by Delta-Soft Ltd., Moscow, Russia.
6+
7+
Notice: This version is fully incompatible with old tsearch (V1),
8+
which is considered as deprecated in upcoming 7.4 release and
9+
obsoleted in 7.5.
10+
11+
The Tsearch2 contrib module contains an implementation of a new data
12+
type tsvector - a searchable data type with indexed access. In a
13+
nutshell, tsvector is a set of unique words along with their
14+
positional information in the document, organized in a special
15+
structure optimized for fast access and lookup. Actually, each word
16+
entry, besides its position in the document, could have a weight
17+
attribute, describing importance of this word (at a specific position)
18+
in document. A set of bit-signatures of a fixed length, representing
19+
tsvectors, are stored in a search tree (developed using PostgreSQL
20+
GiST), which provides online update of full text index and fast query
21+
lookup. The module provides indexed access methods, queries,
22+
operations and supporting routines for the tsvector data type and easy
23+
conversion of text data to tsvector. Table driven configuration allows
24+
creation of custom configuration optimized for specific searches using
25+
standard SQL commands.
26+
27+
Configuration allows you to:
28+
* specify the type of lexemes to be indexed and the way they are
29+
processed.
30+
* specify dictionaries to be used along with stop words recognition.
31+
* specify the parser used to process a document.
32+
33+
See [11]Documentation Roadmap for links to documentation.
34+
35+
Authors
36+
37+
* Oleg Bartunov <oleg@sai.msu.su>, Moscow, Moscow University, Russia
38+
* Teodor Sigaev <teodor@sigaev.ru>, Moscow, Delta-Soft Ltd.,Russia
39+
40+
Contributors
41+
42+
* Robert John Shepherd and Andrew J. Kopciuch submitted
43+
"Introduction to tsearch" (Robert - tsearch v1, Andrew - tsearch
44+
v2)
45+
* Brandon Craig Rhodes wrote "Tsearch2 Guide" and "Tsearch2
46+
Reference" and proposed new naming convention for tsearch V2
47+
48+
New features
49+
50+
* Relevance ranking of search results
51+
* Table driven configuration
52+
* Morphology support (ispell dictionaries, snowball stemmers)
53+
* Headline support (text fragments with highlighted search terms)
54+
* Ability to plug-in custom dictionaries and parsers
55+
* Synonym dictionary
56+
* Generator of templates for dictionaries (built-in snowball stemmer
57+
support)
58+
* Statistics of indexed words is available
59+
60+
Limitations
61+
62+
* Lexeme should be not longer than 2048 bytes
63+
* The number of lexemes is limited by 2^32. Note, that actual
64+
capacity of tsvector depends on whether positional information
65+
is stored or not.
66+
* tsvector - the size is limited by approximately 2^20 bytes.
67+
* tsquery - the number of entries (lexemes and operations) < 32768
68+
* Positional information
69+
+ maximal position of lexeme < 2^14 (16384)
70+
+ lexeme could have maximum 256 positions
71+
72+
References
73+
74+
* GiST development site -
75+
[12]http://www.sai.msu.su/~megera/postgres/gist
76+
* OpenFTS home page - [13]http://openfts.sourceforge.net/
77+
* Mailing list -
78+
[14]http://sourceforge.net/mailarchive/forum.php?forum=openfts-gen
79+
eral
80+
81+
[15]Documentation Roadmap
82+
83+
Documentation Roadmap
84+
85+
* Several docs are available from docs/ subdirectory
86+
+ "Tsearch V2 Introduction" by Andrew Kopciuch
87+
+ "Tsearch2 Guide" by Brandon Rhodes
88+
+ "Tsearch2 Reference" by Brandon Rhodes
89+
* Readme.gendict in gendict/ subdirectory
90+
+ [16][Gendict tutorial]
91+
92+
Online version of documentation is always available from Tsearch V2
93+
home page -
94+
[17]http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/
95+
96+
Support
97+
98+
Authors urgently recommend people to use [18][openfts-general] or
99+
[19][pgsql-general] mailing lists for questions and discussions.
100+
101+
Caution
102+
103+
In spite of apparent easy full text searching with our tsearch module
104+
(authors hope it's so), any serious search engine requires profound
105+
study of various aspects, such as stop words, dictionaries, special
106+
parsers. Tsearch module was designed to facilitate both those cases.
107+
108+
Development History
109+
110+
Pre-tsearch era
111+
Development of OpenFTS began in 2000 after realizing that we
112+
needed a search engine optimized for online updates and able to
113+
access metadata from the database. This is essential for online
114+
news agencies, web portals, digital libraries, etc. Most search
115+
engines available utilize an inverted index which is very fast
116+
for searching but very slow for online updates. Incremental
117+
updates of an inverted index is a complex engineering task
118+
while we needed something light, free and with the ability to
119+
access metadata from the database. The last requirement is very
120+
important because in a real life application a search engine
121+
should always consult metadata ( topic, permissions, date
122+
range, version, etc.). We extensively use PostgreSQL as a
123+
database backend and have no intention to move from it, so the
124+
problem was to find a data structure and a fast way to access
125+
it. PostgreSQL has rather unique data type for storing sets
126+
(think about words) - arrays, but lacks index access to them. A
127+
document is parsed into lexemes, which are identified in
128+
various ways (e.g. stemming, morphology, dictionary), and as a
129+
result is reduced to an array of integer numbers. During our
130+
research we found a paper of Joseph Hellerstein which
131+
introduced an interesting data structure suitable for sets -
132+
RD-tree (Russian Doll tree). It looked very attractive, but
133+
implementing it in PostgreSQL seemed difficult because of our
134+
ignorance of database internals. Further research led us to
135+
the idea to use GiST for implementing RD-tree, but at that time
136+
the GiST code had for a long while remained untouched and
137+
contained several bugs. After work on improving GiST for
138+
version 7.0.3 of PostgreSQL was done, we were able to implement
139+
RD-Tree and use it for index access to arrays of integers. This
140+
implementation was ideally suited for small arrays and
141+
eliminated complex joins, but was practically useless for
142+
indexing large arrays. The next improvement came from an idea
143+
to represent a document by a single bit-signature, a so-called
144+
superimposed signature (see "Index Structures for Databases
145+
Containing Data Items with Set-valued Attributes", 1997, Sven
146+
Helmer for details). We developed the contrib/intarray module
147+
and used it for full text indexing.
148+
149+
tsearch v1
150+
It was inconvenient to use integer id's instead of words, so we
151+
introduced a new data type called 'txtidx' - a searchable data
152+
type (textual) with indexed access. This was a first step of
153+
our work on an implementation of a built-in PostgreSQL full
154+
text search engine. Even though tsearch v1 had many features of
155+
a search engine it lacked configuration support and relevance
156+
ranking. People were encouraged to use OpenFTS, which provided
157+
relevance ranking based on coordinate information and flexible
158+
configuration. OpenFTS v.0.34 is the last version based on
159+
tsearch v1.
160+
161+
tsearch V2
162+
People recognized tsearch as a powerful tool for full text
163+
searching and insisted on adding ranking support, better
164+
configurability, etc. We already thought about moving most of
165+
the features of OpenFTS to tsearch, and in the early 2003 we
166+
decided to work on a new version of tsearch - tsearch v2. We've
167+
abandoned auxiliary index tables which were used by OpenFTS to
168+
store coordinate information and modified the txtidx type to
169+
store them internally. Also, we've added table-driven
170+
configuration, support of ispell dictionaries, snowball
171+
stemmers and the ability to specify which types of lexemes to
172+
index. Also, it's now possible to generate headlines of
173+
documents with highlighted search terms. These changes make
174+
tsearch more user friendly and turn it into a really powerful
175+
full text search engine. After announcing the alpha version, we
176+
received a proposal from Brandon Rhodes to rename tsearch
177+
functions to be more consistent. So, we have renamed txtidx
178+
type to tsvector and other things as well.
179+
180+
To allow users of tsearch v1 smooth upgrade, we named the module as
181+
tsearch2.
182+
183+
Future release of OpenFTS (v.0.35) will be based on tsearch2. Brave
184+
people could download it from OpenFTS CVS (see link from [20][OpenFTS
185+
page]).
186+
187+
References
188+
189+
10. http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/Tsearch_V2_Readme.html
190+
11. http://www.sai.msu.su/~megera/oddmuse/index.cgi/Tsearch_V2_Readme#Documentation_Roadmap
191+
12. http://www.sai.msu.su/~megera/postgres/gist
192+
13. http://openfts.sourceforge.net/
193+
14. http://sourceforge.net/mailarchive/forum.php?forum=openfts-general
194+
15. http://www.sai.msu.su/~megera/oddmuse/index.cgi?action=anchor&id=Documentation_Roadmap#Documentation_Roadmap
195+
16. http://www.sai.msu.su/~megera/oddmuse/index.cgi?Gendict
196+
17. http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/
197+
18. http://sourceforge.net/mailarchive/forum.php?forum=openfts-general
198+
19. http://archives.postgresql.org/pgsql-general/
199+
20. http://openfts.sourceforge.net/

contrib/tsearch2/common.c

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#include "postgres.h"
2+
#include "common.h"
3+
#include "wparser.h"
4+
#include "ts_cfg.h"
5+
#include "dict.h"
6+
7+
/*
 * char2text
 *		Convert a NUL-terminated C string into a text value.
 *
 * The string length is measured with strlen() and the actual copy is
 * delegated to charl2text().
 */
text *
char2text(char* in)
{
    int nbytes = strlen(in);

    return charl2text(in, nbytes);
}
11+
12+
/*
 * charl2text
 *		Build a palloc'd text value holding the first len bytes of in.
 *
 * The allocation covers the payload plus the VARHDRSZ-byte header, and the
 * stored size is that same total.
 */
text *
charl2text(char* in, int len)
{
    text *result = (text *) palloc(len + VARHDRSZ);

    VARATT_SIZEP(result) = len + VARHDRSZ;
    memcpy(VARDATA(result), in, len);

    return result;
}
18+
19+
char
20+
*text2char(text* in) {
21+
char *out=palloc( VARSIZE(in) );
22+
memcpy(out, VARDATA(in), VARSIZE(in)-VARHDRSZ);
23+
out[ VARSIZE(in)-VARHDRSZ ] ='\0';
24+
return out;
25+
}
26+
27+
/*
 * pnstrdup
 *		Copy exactly len bytes from in into a palloc'd buffer of len + 1
 *		bytes and append a terminating NUL.
 */
char *
pnstrdup(char* in, int len)
{
    char *copy = palloc(len + 1);

    copy[len] = '\0';
    memcpy(copy, in, len);

    return copy;
}
34+
35+
text
36+
*ptextdup(text* in) {
37+
text *out=(text*)palloc( VARSIZE(in) );
38+
memcpy(out,in,VARSIZE(in));
39+
return out;
40+
}
41+
42+
text
43+
*mtextdup(text* in) {
44+
text *out=(text*)malloc( VARSIZE(in) );
45+
if ( !out )
46+
ts_error(ERROR, "No memory");
47+
memcpy(out,in,VARSIZE(in));
48+
return out;
49+
}
50+
51+
/*
 * ts_error
 *		Reset the tsearch2 configuration, dictionary and parser state, then
 *		format a printf-style message and hand it to elog().
 *
 * state	elog level passed through unchanged.
 * format	printf-style format string, followed by its arguments.
 *
 * The message is first formatted into a 128-byte buffer. If vsnprintf()
 * reports that more room is needed, the buffer is grown to exactly the
 * required size and the message is formatted again.
 */
void
ts_error(int state, const char *format, ...)
{
    va_list     args;
    int         tlen = 128;
    int         len;
    char       *buf;

    reset_cfg();
    reset_dict();
    reset_prs();

    buf = palloc(tlen);
    buf[0] = '\0';

    /* Pass the full buffer size: vsnprintf() accounts for the NUL itself. */
    va_start(args, format);
    len = vsnprintf(buf, tlen, format, args);
    va_end(args);

    if (len >= tlen)
    {
        /* Output was truncated; len is the length actually required. */
        tlen = len + 1;
        buf = repalloc(buf, tlen);

        /* A va_list must be restarted before it is traversed again. */
        va_start(args, format);
        vsnprintf(buf, tlen, format, args);
        va_end(args);
    }

    /* Guarantee termination even if vsnprintf() reported an error. */
    buf[tlen - 1] = '\0';

    /*
     * The message is already fully formatted, so pass it as an argument and
     * not as the format string; otherwise any '%' in it would be
     * interpreted a second time.
     */
    elog(state, "%s", buf);

    /* Only reached when elog() returns to the caller. */
    pfree(buf);
}
74+
75+
int
76+
text_cmp(text *a, text *b) {
77+
if ( VARSIZE(a) == VARSIZE(b) )
78+
return strncmp( VARDATA(a), VARDATA(b), VARSIZE(a)-VARHDRSZ );
79+
return (int)VARSIZE(a) - (int)VARSIZE(b);
80+
81+
}
82+

contrib/tsearch2/common.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
 * Declarations shared across the tsearch2 module: conversions between C
 * strings and text values, a text comparator, and error reporting.
 */
#ifndef __TS_COMMON_H__
2+
#define __TS_COMMON_H__
3+
#include "postgres.h"
4+
#include "fmgr.h"
5+
6+
/* Fallback definition, used only when PG_NARGS is not already defined. */
#ifndef PG_NARGS
7+
#define PG_NARGS() (fcinfo->nargs)
8+
#endif
9+
10+
/* Build a text value from a NUL-terminated C string. */
text* char2text(char* in);
11+
/* Build a text value from the first len bytes of in. */
text* charl2text(char* in, int len);
12+
/* Return a NUL-terminated copy of the payload of a text value. */
char *text2char(text* in);
13+
/* Return a NUL-terminated copy of the first len bytes of in. */
char *pnstrdup(char* in, int len);
14+
/* Duplicate a text value into palloc'd memory. */
text *ptextdup(text* in);
15+
/* Duplicate a text value into malloc'd memory. */
text *mtextdup(text* in);
16+
17+
/* Compare two text values: by size first, then by payload bytes. */
int text_cmp(text *a, text *b);
18+
19+
/* Pointer to the text value located INTALIGN(VARSIZE(x)) bytes after x. */
#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
20+
/* Item count of array x, computed by ArrayGetNItems from its dimensions. */
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
21+
22+
/* Report a printf-style message through elog() at level state. */
void ts_error(int state, const char *format, ...);
23+
24+
#endif

0 commit comments

Comments
 (0)