postgrespro
diff --git a/‎contrib/tsearch2/Makefile
Lines changed: 44 additions & 0 deletions b/‎contrib/tsearch2/Makefile
Lines changed: 44 additions & 0 deletions
diff --git a/‎contrib/tsearch2/README.tsearch2
Lines changed: 199 additions & 0 deletions b/‎contrib/tsearch2/README.tsearch2
Lines changed: 199 additions & 0 deletions
diff --git a/‎contrib/tsearch2/common.c
Lines changed: 82 additions & 0 deletions b/‎contrib/tsearch2/common.c
Lines changed: 82 additions & 0 deletions
diff --git a/‎contrib/tsearch2/common.h
Lines changed: 24 additions & 0 deletions b/‎contrib/tsearch2/common.h
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,44 @@
+# Build configuration for the tsearch2 contrib module.
+# The variables below are consumed by the makefiles included from this
+# file (src/Makefile.global here, contrib-global.mk further down).
+subdir = contrib/tsearch2
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+
+# Put the module's own directories first on the include search path,
+# ahead of whatever CPPFLAGS already contains.
+override CPPFLAGS := -I. -I./snowball -I./ispell -I./wordparser $(CPPFLAGS)
+
+# Name of the module and the object files that make it up.
+MODULE_big = tsearch2
+OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
+       snowball/english_stem.o snowball/api.o snowball/russian_stem.o snowball/utilities.o \
+       dict_snowball.o ispell/spell.o dict_ispell.o dict_syn.o \
+       wparser.o wordparser/parser.o wordparser/deflex.o wparser_def.o \
+       ts_cfg.o tsvector.o rewrite.o crc32.o query.o gistidx.o \
+       tsvector_op.o rank.o ts_stat.o
+
+# Generated SQL scripts, documentation file and regression test name.
+DATA_built = tsearch2.sql untsearch2.sql
+DOCS = README.tsearch2
+REGRESS = tsearch2
+
+# Generate the word parser's C source from its flex specification.
+# The scanner is built with -8 and the symbol prefix "tsearch2_yy".
+# When FLEX is not defined, the $(missing) helper is invoked instead of flex.
+wordparser/parser.c: wordparser/parser.l
+ifdef FLEX
+	$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
+else
+	@$(missing) flex $< $@
+endif
+
+# Generated files that should be removed in addition to the default set:
+# the flex output and the intermediate SQL template built below.
+EXTRA_CLEAN = wordparser/parser.c tsearch2.sql.in
+
+# Link the shared library against libm.
+SHLIB_LINK := -lm
+# Shared contrib build rules (top_srcdir is presumably set by
+# Makefile.global -- it is not defined in this file).
+include $(top_srcdir)/contrib/contrib-global.mk
+# DO NOT DELETE
+
+# Install the stop-word lists into the data directory as part of
+# "make install".  $(DESTDIR) is honoured so that staged installs
+# (make install DESTDIR=/some/root) do not write into the live tree;
+# it is empty by default, which gives the plain $(datadir) destination.
+.PHONY: installstop
+
+install: installstop
+
+installstop:
+	cp stopword/*.stop $(DESTDIR)$(datadir)
+
+
+# Build the intermediate SQL template by replacing every DATA_PATH
+# placeholder in the source template with the data directory.
+tsearch2.sql.in: tsearch.sql.in
+	sed 's,DATA_PATH,$(datadir),g' < $< > $@
+
+# The uninstall script needs no substitution: copy the template as is.
+untsearch2.sql: untsearch.sql.in
+	cp $< $@
+
@@ -0,0 +1,199 @@
+Tsearch2 - full text search extension for PostgreSQL
+
+   [10][Online version] of this document is available
+   
+   This module is sponsored by Delta-Soft Ltd., Moscow, Russia.
+   
+   Notice: This version is fully incompatible with the old tsearch (V1),
+   which is considered deprecated in the upcoming 7.4 release and will
+   be obsolete in 7.5.
+   
+   The Tsearch2 contrib module contains an implementation of a new data
+   type tsvector - a searchable data type with indexed access. In a
+   nutshell, tsvector is a set of unique words along with their
+   positional information in the document, organized in a special
+   structure optimized for fast access and lookup. In addition, each word
+   entry, besides its position in the document, can have a weight
+   attribute describing the importance of this word (at a specific
+   position) in the document. A set of fixed-length bit-signatures, representing
+   tsvectors, are stored in a search tree (developed using PostgreSQL
+   GiST), which provides online update of full text index and fast query
+   lookup. The module provides indexed access methods, queries,
+   operations and supporting routines for the tsvector data type and easy
+   conversion of text data to tsvector. Table driven configuration allows
+   creation of custom configuration optimized for specific searches using
+   standard SQL commands.
+   
+   Configuration allows you to:
+     * specify the type of lexemes to be indexed and the way they are
+       processed.
+     * specify dictionaries to be used along with stop words recognition.
+     * specify the parser used to process a document.
+       
+   See [11]Documentation Roadmap for links to documentation.
+   
+Authors
+
+     * Oleg Bartunov <oleg@sai.msu.su>, Moscow, Moscow University, Russia
+     * Teodor Sigaev <teodor@sigaev.ru>, Moscow, Delta-Soft Ltd.,Russia
+       
+Contributors
+
+     * Robert John Shepherd and Andrew J. Kopciuch submitted
+       "Introduction to tsearch" (Robert - tsearch v1, Andrew - tsearch
+       v2)
+     * Brandon Craig Rhodes wrote "Tsearch2 Guide" and "Tsearch2
+       Reference" and proposed new naming convention for tsearch V2
+       
+New features
+
+     * Relevance ranking of search results
+     * Table driven configuration
+     * Morphology support (ispell dictionaries, snowball stemmers)
+     * Headline support (text fragments with highlighted search terms)
+     * Ability to plug-in custom dictionaries and parsers
+     * Synonym dictionary
+     * Generator of templates for dictionaries (built-in snowball stemmer
+       support)
+     * Statistics of indexed words is available
+       
+Limitations
+
+     * A lexeme must not be longer than 2048 bytes
+     * The number of lexemes is limited by 2^32. Note that the actual
+       capacity of a tsvector depends on whether positional information
+       is stored or not.
+     * tsvector - the size is limited by approximately 2^20 bytes.
+     * tsquery - the number of entries (lexemes and operations) < 32768
+     * Positional information
+          + maximal position of lexeme < 2^14 (16384)
+          + a lexeme can have at most 256 positions
+       
+References
+
+     * GiST development site -
+       [12]http://www.sai.msu.su/~megera/postgres/gist
+     * OpenFTS home page - [13]http://openfts.sourceforge.net/
+     * Mailing list -
+       [14]http://sourceforge.net/mailarchive/forum.php?forum=openfts-general
+       
+   [15]Documentation Roadmap
+   
+Documentation Roadmap
+
+     * Several docs are available from docs/ subdirectory
+          + "Tsearch V2 Introduction" by Andrew Kopciuch
+          + "Tsearch2 Guide" by Brandon Rhodes
+          + "Tsearch2 Reference" by Brandon Rhodes
+     * Readme.gendict in gendict/ subdirectory
+          + [16][Gendict tutorial]
+       
+   Online version of documentation is always available from Tsearch V2
+   home page -
+   [17]http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/
+   
+Support
+
+   The authors strongly recommend using the [18][openfts-general] or
+   [19][pgsql-general] mailing lists for questions and discussions.
+   
+Caution
+
+   Although full text searching with our tsearch module appears easy
+   (the authors hope it is), any serious search engine requires a thorough
+   study of various aspects, such as stop words, dictionaries, and special
+   parsers. The tsearch module was designed to facilitate both cases.
+   
+Development History
+
+   Pre-tsearch era
+          Development of OpenFTS began in 2000 after realizing that we
+          needed a search engine optimized for online updates and able to
+          access metadata from the database. This is essential for online
+          news agencies, web portals, digital libraries, etc. Most search
+          engines available utilize an inverted index which is very fast
+          for searching but very slow for online updates. Incremental
+          updates of an inverted index are a complex engineering task
+          while we needed something light, free and with the ability to
+          access metadata from the database. The last requirement is very
+          important because in a real life application a search engine
+          should always consult metadata ( topic, permissions, date
+          range, version, etc.). We extensively use PostgreSQL as a
+          database backend and have no intention to move from it, so the
+          problem was to find a data structure and a fast way to access
+          it. PostgreSQL has rather unique data type for storing sets
+          (think about words) - arrays, but lacks index access to them. A
+          document is parsed into lexemes, which are identified in
+          various ways (e.g. stemming, morphology, dictionary), and as a
+          result is reduced to an array of integer numbers. During our
+          research we found a paper of Joseph Hellerstein which
+          introduced an interesting data structure suitable for sets -
+          RD-tree (Russian Doll tree). It looked very attractive, but
+          implementing it in PostgreSQL seemed difficult because of our
+          ignorance of database internals. Further research led us to
+          the idea to use GiST for implementing RD-tree, but at that time
+          the GiST code had for a long while remained untouched and
+          contained several bugs. After work on improving GiST for
+          version 7.0.3 of PostgreSQL was done, we were able to implement
+          RD-Tree and use it for index access to arrays of integers. This
+          implementation was ideally suited for small arrays and
+          eliminated complex joins, but was practically useless for
+          indexing large arrays. The next improvement came from an idea
+          to represent a document by a single bit-signature, a so-called
+          superimposed signature (see "Index Structures for Databases
+          Containing Data Items with Set-valued Attributes", 1997, Sven
+          Helmer for details). We developed the contrib/intarray module
+          and used it for full text indexing.
+          
+   tsearch v1
+          It was inconvenient to use integer id's instead of words, so we
+          introduced a new data type called 'txtidx' - a searchable data
+          type (textual) with indexed access. This was a first step of
+          our work on an implementation of a built-in PostgreSQL full
+          text search engine. Even though tsearch v1 had many features of
+          a search engine it lacked configuration support and relevance
+          ranking. People were encouraged to use OpenFTS, which provided
+          relevance ranking based on coordinate information and flexible
+          configuration. OpenFTS v.0.34 is the last version based on
+          tsearch v1.
+          
+   tsearch V2
+          People recognized tsearch as a powerful tool for full text
+          searching and insisted on adding ranking support, better
+          configurability, etc. We already thought about moving most of
+          the features of OpenFTS to tsearch, and in early 2003 we
+          decided to work on a new version of tsearch - tsearch v2. We've
+          abandoned auxiliary index tables which were used by OpenFTS to
+          store coordinate information and modified the txtidx type to
+          store them internally. Also, we've added table-driven
+          configuration, support of ispell dictionaries, snowball
+          stemmers and the ability to specify which types of lexemes to
+          index. Also, it's now possible to generate headlines of
+          documents with highlighted search terms. These changes make
+          tsearch more user friendly and turn it into a really powerful
+          full text search engine. After announcing the alpha version, we
+          received a proposal from Brandon Rhodes to rename tsearch
+          functions to be more consistent. So, we have renamed txtidx
+          type to tsvector and other things as well.
+          
+   To allow users of tsearch v1 a smooth upgrade, we named the module
+   tsearch2.
+   
+   A future release of OpenFTS (v.0.35) will be based on tsearch2. Brave
+   people can download it from OpenFTS CVS (see the link from the
+   [20][OpenFTS page]).
+
+References
+
+  10. http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/Tsearch_V2_Readme.html
+  11. http://www.sai.msu.su/~megera/oddmuse/index.cgi/Tsearch_V2_Readme#Documentation_Roadmap
+  12. http://www.sai.msu.su/~megera/postgres/gist
+  13. http://openfts.sourceforge.net/
+  14. http://sourceforge.net/mailarchive/forum.php?forum=openfts-general
+  15. http://www.sai.msu.su/~megera/oddmuse/index.cgi?action=anchor&id=Documentation_Roadmap#Documentation_Roadmap
+  16. http://www.sai.msu.su/~megera/oddmuse/index.cgi?Gendict
+  17. http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/
+  18. http://sourceforge.net/mailarchive/forum.php?forum=openfts-general
+  19. http://archives.postgresql.org/pgsql-general/
+  20. http://openfts.sourceforge.net/
@@ -0,0 +1,82 @@
+#include "postgres.h"
+#include "common.h"
+#include "wparser.h"
+#include "ts_cfg.h"
+#include "dict.h"
+
+/*
+ * char2text
+ *		Convert a NUL-terminated C string into a text value.
+ *
+ * The length is measured with strlen() and the work is delegated to
+ * charl2text(), whose result is returned unchanged.
+ */
+text *
+char2text(char *in)
+{
+	int			nbytes = strlen(in);
+
+	return charl2text(in, nbytes);
+}
+
+/*
+ * charl2text
+ *		Build a text value from the first "len" bytes at "in".
+ *
+ * The result is allocated with palloc(); its size field covers the
+ * header plus the copied payload.  The input need not be NUL-terminated.
+ */
+text *
+charl2text(char *in, int len)
+{
+	int			total = len + VARHDRSZ;
+	text	   *result = (text *) palloc(total);
+
+	VARATT_SIZEP(result) = total;
+	memcpy(VARDATA(result), in, len);
+
+	return result;
+}
+
+/*
+ * text2char
+ *		Return a palloc'd, NUL-terminated copy of the payload of "in".
+ *
+ * The buffer is allocated with the full VARSIZE of the input, i.e. the
+ * payload length plus VARHDRSZ, which leaves room for the terminator.
+ */
+char *
+text2char(text *in)
+{
+	int			datalen = VARSIZE(in) - VARHDRSZ;
+	char	   *result = palloc(VARSIZE(in));
+
+	memcpy(result, VARDATA(in), datalen);
+	result[datalen] = '\0';
+
+	return result;
+}
+
+/*
+ * pnstrdup
+ *		Copy exactly "len" bytes from "in" into a fresh palloc'd buffer
+ *		and append a terminating NUL.
+ */
+char *
+pnstrdup(char *in, int len)
+{
+	char	   *copy = palloc(len + 1);
+
+	copy[len] = '\0';
+	memcpy(copy, in, len);
+
+	return copy;
+}
+
+/*
+ * ptextdup
+ *		Duplicate a text value (header and payload) into palloc'd memory.
+ */
+text *
+ptextdup(text *in)
+{
+	text	   *copy;
+
+	copy = (text *) palloc(VARSIZE(in));
+	memcpy(copy, in, VARSIZE(in));
+
+	return copy;
+}
+
+/*
+ * mtextdup
+ *		Duplicate a text value (header and payload) into malloc'd memory.
+ *
+ * Unlike ptextdup() the copy comes from malloc(), so it is not released
+ * by pfree().  Allocation failure is reported through ts_error().
+ */
+text *
+mtextdup(text *in)
+{
+	text	   *copy = (text *) malloc(VARSIZE(in));
+
+	if (copy == NULL)
+		ts_error(ERROR, "No memory");
+
+	memcpy(copy, in, VARSIZE(in));
+
+	return copy;
+}
+
+/*
+ * ts_error
+ *		Reset the module's cached state and report a printf-style message
+ *		through elog().
+ *
+ * state	elog level passed through unchanged (e.g. ERROR).
+ * format	printf-style format string, followed by its arguments.
+ *
+ * reset_cfg(), reset_dict() and reset_prs() are called before the message
+ * is built, so they run even when elog() does not return.
+ *
+ * The message is formatted into a palloc'd buffer that is enlarged once
+ * when the initial 128 bytes are not enough.
+ */
+void
+ts_error(int state, const char *format, ...)
+{
+	va_list		args;
+	int			tlen = 128;
+	int			len;
+	char	   *buf;
+
+	reset_cfg();
+	reset_dict();
+	reset_prs();
+
+	buf = palloc(tlen);
+
+	/* Pass the real buffer size: vsnprintf() accounts for the NUL itself. */
+	va_start(args, format);
+	len = vsnprintf(buf, tlen, format, args);
+	va_end(args);
+
+	if (len >= tlen)
+	{
+		/*
+		 * Output was truncated; "len" is the length actually required.
+		 * A va_list may not be reused after vsnprintf() has consumed it,
+		 * so the argument list is restarted for the second pass.
+		 */
+		tlen = len + 1;
+		buf = repalloc(buf, tlen);
+
+		va_start(args, format);
+		len = vsnprintf(buf, tlen, format, args);
+		va_end(args);
+	}
+
+	/*
+	 * Never hand the formatted text to elog() as a format string: any '%'
+	 * contained in the expanded arguments would be interpreted again.
+	 * If formatting failed altogether, fall back to the raw format text.
+	 */
+	if (len < 0)
+		elog(state, "%s", format);
+	else
+		elog(state, "%s", buf);
+
+	pfree(buf);
+}
+
+/*
+ * text_cmp
+ *		Order two text values: first by total size, then, for values of
+ *		equal size, by strncmp() over the payload bytes.
+ *
+ * Returns a negative, zero or positive value in the manner of strcmp().
+ */
+int
+text_cmp(text *a, text *b)
+{
+	if (VARSIZE(a) != VARSIZE(b))
+		return (int) VARSIZE(a) - (int) VARSIZE(b);
+
+	return strncmp(VARDATA(a), VARDATA(b), VARSIZE(a) - VARHDRSZ);
+}
+
@@ -0,0 +1,24 @@
+/*
+ * common.h
+ *		Small helpers shared by the tsearch2 sources: conversions between
+ *		C strings and text values, duplication routines, a text comparator
+ *		and the module's error reporting function.
+ */
+#ifndef __TS_COMMON_H__
+#define __TS_COMMON_H__
+#include "postgres.h"
+#include "fmgr.h"
+
+/* Provide PG_NARGS() when the included headers do not define it. */
+#ifndef PG_NARGS
+#define PG_NARGS() (fcinfo->nargs)
+#endif
+
+/* C string <-> text conversions; every result is palloc'd. */
+text *char2text(char *in);				/* from a NUL-terminated string */
+text *charl2text(char *in, int len);	/* from the first len bytes */
+char *text2char(text *in);				/* to a NUL-terminated string */
+
+/* Duplication helpers. */
+char *pnstrdup(char *in, int len);		/* palloc'd copy of len bytes + NUL */
+text *ptextdup(text *in);				/* palloc'd copy of a text value */
+text *mtextdup(text *in);				/* malloc'd copy of a text value */
+
+/* Compare by total size first, then by payload bytes. */
+int text_cmp(text *a, text *b);
+
+/*
+ * NEXTVAL(x): address of the text value that follows x, assuming values
+ * are laid out back to back at INTALIGN'ed offsets.  Evaluates x twice.
+ * ARRNELEMS(x): total number of elements of array x.
+ */
+#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
+#define ARRNELEMS(x)  ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
+
+/* Reset module state and report a printf-style message via elog(). */
+void ts_error(int state, const char *format, ...);
+
+#endif