Skip to content

Commit 1c1791e

Browse files
committed
Add json(b)_to_tsvector function
Jsonb has a complex nature so there isn't best-for-everything way to convert it to tsvector for full text search. Current to_tsvector(json(b)) suggests to convert only string values, but it's possible to index keys, numerics and even booleans value. To solve that json(b)_to_tsvector has a second required argument contained a list of desired types of json fields. Second argument is a jsonb scalar or array right now with possibility to add new options in a future. Bump catalog version Author: Dmitry Dolgov with some editorization by me Reviewed by: Teodor Sigaev Discussion: https://www.postgresql.org/message-id/CA+q6zcXJQbS1b4kJ_HeAOoOc=unfnOrUEL=KGgE32QKDww7d8g@mail.gmail.com
1 parent 529ab7b commit 1c1791e

File tree

10 files changed

+629
-49
lines changed

10 files changed

+629
-49
lines changed

doc/src/sgml/func.sgml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9727,6 +9727,26 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
97279727
<entry><literal>to_tsvector('english', '{"a": "The Fat Rats"}'::json)</literal></entry>
97289728
<entry><literal>'fat':2 'rat':3</literal></entry>
97299729
</row>
9730+
<row>
9731+
<entry>
9732+
<literal><function>json(b)_to_tsvector(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type>,
9733+
</optional> <replaceable class="parameter">document</replaceable> <type>json(b)</type>,
9734+
<replaceable class="parameter">filter</replaceable> <type>json(b)</type>)</function></literal>
9735+
</entry>
9736+
<entry><type>tsvector</type></entry>
9737+
<entry>
9738+
reduce each value in the document, specified by <replaceable class="parameter">filter</replaceable> to a <type>tsvector</type>,
9739+
and then concatenate those in document order to produce a single <type>tsvector</type>.
9740+
<replaceable class="parameter">filter</replaceable> is a jsonb array, that enumerates what kind of elements need to be included
9741+
into the resulting <type>tsvector</type>. Possible values for <replaceable class="parameter">filter</replaceable> are
9742+
<literal>"string"</literal> (to include all string values), <literal>"numeric"</literal> (to include all numeric values in the string format),
9743+
<literal>"boolean"</literal> (to include all boolean values in the string format "true"/"false"),
9744+
<literal>"key"</literal> (to include all keys) or <literal>"all"</literal> (to include all above). These values
9745+
can be combined together to include, e.g. all string and numeric values.
9746+
</entry>
9747+
<entry><literal>json_to_tsvector('english', '{"a": "The Fat Rats", "b": 123}'::json, '["string", "numeric"]')</literal></entry>
9748+
<entry><literal>'123':5 'fat':2 'rat':3</literal></entry>
9749+
</row>
97309750
<row>
97319751
<entry>
97329752
<indexterm>

src/backend/tsearch/to_tsany.c

Lines changed: 108 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,12 @@ to_tsvector(PG_FUNCTION_ARGS)
267267
PointerGetDatum(in)));
268268
}
269269

270-
Datum
271-
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
270+
/*
271+
* Worker function for jsonb(_string)_to_tsvector(_byid)
272+
*/
273+
static TSVector
274+
jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
272275
{
273-
Oid cfgId = PG_GETARG_OID(0);
274-
Jsonb *jb = PG_GETARG_JSONB_P(1);
275-
TSVector result;
276276
TSVectorBuildState state;
277277
ParsedText prs;
278278

@@ -281,33 +281,77 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
281281
state.prs = &prs;
282282
state.cfgId = cfgId;
283283

284-
iterate_jsonb_string_values(jb, &state, add_to_tsvector);
284+
iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
285285

286-
PG_FREE_IF_COPY(jb, 1);
286+
return make_tsvector(&prs);
287+
}
288+
289+
Datum
290+
jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
291+
{
292+
Oid cfgId = PG_GETARG_OID(0);
293+
Jsonb *jb = PG_GETARG_JSONB_P(1);
294+
TSVector result;
287295

288-
result = make_tsvector(&prs);
296+
result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
297+
PG_FREE_IF_COPY(jb, 1);
289298

290299
PG_RETURN_TSVECTOR(result);
291300
}
292301

293302
Datum
294-
jsonb_to_tsvector(PG_FUNCTION_ARGS)
303+
jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
295304
{
296305
Jsonb *jb = PG_GETARG_JSONB_P(0);
297306
Oid cfgId;
307+
TSVector result;
298308

299309
cfgId = getTSCurrentConfig(true);
300-
PG_RETURN_DATUM(DirectFunctionCall2(jsonb_to_tsvector_byid,
301-
ObjectIdGetDatum(cfgId),
302-
JsonbPGetDatum(jb)));
310+
result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
311+
PG_FREE_IF_COPY(jb, 0);
312+
313+
PG_RETURN_TSVECTOR(result);
303314
}
304315

305316
Datum
306-
json_to_tsvector_byid(PG_FUNCTION_ARGS)
317+
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
307318
{
308319
Oid cfgId = PG_GETARG_OID(0);
309-
text *json = PG_GETARG_TEXT_P(1);
320+
Jsonb *jb = PG_GETARG_JSONB_P(1);
321+
Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
322+
TSVector result;
323+
uint32 flags = parse_jsonb_index_flags(jbFlags);
324+
325+
result = jsonb_to_tsvector_worker(cfgId, jb, flags);
326+
PG_FREE_IF_COPY(jb, 1);
327+
PG_FREE_IF_COPY(jbFlags, 2);
328+
329+
PG_RETURN_TSVECTOR(result);
330+
}
331+
332+
Datum
333+
jsonb_to_tsvector(PG_FUNCTION_ARGS)
334+
{
335+
Jsonb *jb = PG_GETARG_JSONB_P(0);
336+
Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
337+
Oid cfgId;
310338
TSVector result;
339+
uint32 flags = parse_jsonb_index_flags(jbFlags);
340+
341+
cfgId = getTSCurrentConfig(true);
342+
result = jsonb_to_tsvector_worker(cfgId, jb, flags);
343+
PG_FREE_IF_COPY(jb, 0);
344+
PG_FREE_IF_COPY(jbFlags, 1);
345+
346+
PG_RETURN_TSVECTOR(result);
347+
}
348+
349+
/*
350+
* Worker function for json(_string)_to_tsvector(_byid)
351+
*/
352+
static TSVector
353+
json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
354+
{
311355
TSVectorBuildState state;
312356
ParsedText prs;
313357

@@ -316,11 +360,50 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
316360
state.prs = &prs;
317361
state.cfgId = cfgId;
318362

319-
iterate_json_string_values(json, &state, add_to_tsvector);
363+
iterate_json_values(json, flags, &state, add_to_tsvector);
364+
365+
return make_tsvector(&prs);
366+
}
367+
368+
Datum
369+
json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
370+
{
371+
Oid cfgId = PG_GETARG_OID(0);
372+
text *json = PG_GETARG_TEXT_P(1);
373+
TSVector result;
320374

375+
result = json_to_tsvector_worker(cfgId, json, jtiString);
321376
PG_FREE_IF_COPY(json, 1);
322377

323-
result = make_tsvector(&prs);
378+
PG_RETURN_TSVECTOR(result);
379+
}
380+
381+
Datum
382+
json_string_to_tsvector(PG_FUNCTION_ARGS)
383+
{
384+
text *json = PG_GETARG_TEXT_P(0);
385+
Oid cfgId;
386+
TSVector result;
387+
388+
cfgId = getTSCurrentConfig(true);
389+
result = json_to_tsvector_worker(cfgId, json, jtiString);
390+
PG_FREE_IF_COPY(json, 0);
391+
392+
PG_RETURN_TSVECTOR(result);
393+
}
394+
395+
Datum
396+
json_to_tsvector_byid(PG_FUNCTION_ARGS)
397+
{
398+
Oid cfgId = PG_GETARG_OID(0);
399+
text *json = PG_GETARG_TEXT_P(1);
400+
Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
401+
TSVector result;
402+
uint32 flags = parse_jsonb_index_flags(jbFlags);
403+
404+
result = json_to_tsvector_worker(cfgId, json, flags);
405+
PG_FREE_IF_COPY(json, 1);
406+
PG_FREE_IF_COPY(jbFlags, 2);
324407

325408
PG_RETURN_TSVECTOR(result);
326409
}
@@ -329,12 +412,17 @@ Datum
329412
json_to_tsvector(PG_FUNCTION_ARGS)
330413
{
331414
text *json = PG_GETARG_TEXT_P(0);
415+
Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
332416
Oid cfgId;
417+
TSVector result;
418+
uint32 flags = parse_jsonb_index_flags(jbFlags);
333419

334420
cfgId = getTSCurrentConfig(true);
335-
PG_RETURN_DATUM(DirectFunctionCall2(json_to_tsvector_byid,
336-
ObjectIdGetDatum(cfgId),
337-
PointerGetDatum(json)));
421+
result = json_to_tsvector_worker(cfgId, json, flags);
422+
PG_FREE_IF_COPY(json, 0);
423+
PG_FREE_IF_COPY(jbFlags, 1);
424+
425+
PG_RETURN_TSVECTOR(result);
338426
}
339427

340428
/*
@@ -353,7 +441,7 @@ add_to_tsvector(void *_state, char *elem_value, int elem_len)
353441
* First time through: initialize words array to a reasonable size.
354442
* (parsetext() will realloc it bigger as needed.)
355443
*/
356-
prs->lenwords = Max(elem_len / 6, 64);
444+
prs->lenwords = 16;
357445
prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
358446
prs->curwords = 0;
359447
prs->pos = 0;

0 commit comments

Comments
 (0)