Update copyright for 2022
[pgsql.git] / src / backend / tsearch / dict_thesaurus.c
blobb8c08bcf7ba500926d86c0a81cc92ba48c71885d
1 /*-------------------------------------------------------------------------
3 * dict_thesaurus.c
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
9 * IDENTIFICATION
10 * src/backend/tsearch/dict_thesaurus.c
12 *-------------------------------------------------------------------------
14 #include "postgres.h"
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/builtins.h"
22 #include "utils/regproc.h"
26 * Temporary we use TSLexeme.flags for inner use...
28 #define DT_USEASIS 0x1000
30 typedef struct LexemeInfo
32 uint32 idsubst; /* entry's number in DictThesaurus->subst */
33 uint16 posinsubst; /* pos info in entry */
34 uint16 tnvariant; /* total num lexemes in one variant */
35 struct LexemeInfo *nextentry;
36 struct LexemeInfo *nextvariant;
37 } LexemeInfo;
39 typedef struct
41 char *lexeme;
42 LexemeInfo *entries;
43 } TheLexeme;
45 typedef struct
47 uint16 lastlexeme; /* number lexemes to substitute */
48 uint16 reslen;
49 TSLexeme *res; /* prepared substituted result */
50 } TheSubstitute;
52 typedef struct
54 /* subdictionary to normalize lexemes */
55 Oid subdictOid;
56 TSDictionaryCacheEntry *subdict;
58 /* Array to search lexeme by exact match */
59 TheLexeme *wrds;
60 int nwrds; /* current number of words */
61 int ntwrds; /* allocated array length */
64 * Storage of substituted result, n-th element is for n-th expression
66 TheSubstitute *subst;
67 int nsubst;
68 } DictThesaurus;
71 static void
72 newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
74 TheLexeme *ptr;
76 if (d->nwrds >= d->ntwrds)
78 if (d->ntwrds == 0)
80 d->ntwrds = 16;
81 d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
83 else
85 d->ntwrds *= 2;
86 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
90 ptr = d->wrds + d->nwrds;
91 d->nwrds++;
93 ptr->lexeme = palloc(e - b + 1);
95 memcpy(ptr->lexeme, b, e - b);
96 ptr->lexeme[e - b] = '\0';
98 ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
100 ptr->entries->nextentry = NULL;
101 ptr->entries->idsubst = idsubst;
102 ptr->entries->posinsubst = posinsubst;
105 static void
106 addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
108 static int nres = 0;
109 static int ntres = 0;
110 TheSubstitute *ptr;
112 if (nwrd == 0)
114 nres = ntres = 0;
116 if (idsubst >= d->nsubst)
118 if (d->nsubst == 0)
120 d->nsubst = 16;
121 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
123 else
125 d->nsubst *= 2;
126 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
131 ptr = d->subst + idsubst;
133 ptr->lastlexeme = posinsubst - 1;
135 if (nres + 1 >= ntres)
137 if (ntres == 0)
139 ntres = 2;
140 ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
142 else
144 ntres *= 2;
145 ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
149 ptr->res[nres].lexeme = palloc(e - b + 1);
150 memcpy(ptr->res[nres].lexeme, b, e - b);
151 ptr->res[nres].lexeme[e - b] = '\0';
153 ptr->res[nres].nvariant = nwrd;
154 if (useasis)
155 ptr->res[nres].flags = DT_USEASIS;
156 else
157 ptr->res[nres].flags = 0;
159 ptr->res[++nres].lexeme = NULL;
162 #define TR_WAITLEX 1
163 #define TR_INLEX 2
164 #define TR_WAITSUBS 3
165 #define TR_INSUBS 4
167 static void
168 thesaurusRead(const char *filename, DictThesaurus *d)
170 tsearch_readline_state trst;
171 uint32 idsubst = 0;
172 bool useasis = false;
173 char *line;
175 filename = get_tsearch_config_filename(filename, "ths");
176 if (!tsearch_readline_begin(&trst, filename))
177 ereport(ERROR,
178 (errcode(ERRCODE_CONFIG_FILE_ERROR),
179 errmsg("could not open thesaurus file \"%s\": %m",
180 filename)));
182 while ((line = tsearch_readline(&trst)) != NULL)
184 char *ptr;
185 int state = TR_WAITLEX;
186 char *beginwrd = NULL;
187 uint32 posinsubst = 0;
188 uint32 nwrd = 0;
190 ptr = line;
192 /* is it a comment? */
193 while (*ptr && t_isspace(ptr))
194 ptr += pg_mblen(ptr);
196 if (t_iseq(ptr, '#') || *ptr == '\0' ||
197 t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
199 pfree(line);
200 continue;
203 while (*ptr)
205 if (state == TR_WAITLEX)
207 if (t_iseq(ptr, ':'))
209 if (posinsubst == 0)
210 ereport(ERROR,
211 (errcode(ERRCODE_CONFIG_FILE_ERROR),
212 errmsg("unexpected delimiter")));
213 state = TR_WAITSUBS;
215 else if (!t_isspace(ptr))
217 beginwrd = ptr;
218 state = TR_INLEX;
221 else if (state == TR_INLEX)
223 if (t_iseq(ptr, ':'))
225 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
226 state = TR_WAITSUBS;
228 else if (t_isspace(ptr))
230 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
231 state = TR_WAITLEX;
234 else if (state == TR_WAITSUBS)
236 if (t_iseq(ptr, '*'))
238 useasis = true;
239 state = TR_INSUBS;
240 beginwrd = ptr + pg_mblen(ptr);
242 else if (t_iseq(ptr, '\\'))
244 useasis = false;
245 state = TR_INSUBS;
246 beginwrd = ptr + pg_mblen(ptr);
248 else if (!t_isspace(ptr))
250 useasis = false;
251 beginwrd = ptr;
252 state = TR_INSUBS;
255 else if (state == TR_INSUBS)
257 if (t_isspace(ptr))
259 if (ptr == beginwrd)
260 ereport(ERROR,
261 (errcode(ERRCODE_CONFIG_FILE_ERROR),
262 errmsg("unexpected end of line or lexeme")));
263 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
264 state = TR_WAITSUBS;
267 else
268 elog(ERROR, "unrecognized thesaurus state: %d", state);
270 ptr += pg_mblen(ptr);
273 if (state == TR_INSUBS)
275 if (ptr == beginwrd)
276 ereport(ERROR,
277 (errcode(ERRCODE_CONFIG_FILE_ERROR),
278 errmsg("unexpected end of line or lexeme")));
279 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
282 idsubst++;
284 if (!(nwrd && posinsubst))
285 ereport(ERROR,
286 (errcode(ERRCODE_CONFIG_FILE_ERROR),
287 errmsg("unexpected end of line")));
289 if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
290 ereport(ERROR,
291 (errcode(ERRCODE_CONFIG_FILE_ERROR),
292 errmsg("too many lexemes in thesaurus entry")));
294 pfree(line);
297 d->nsubst = idsubst;
299 tsearch_readline_end(&trst);
302 static TheLexeme *
303 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
305 if (*nnw >= *tnm)
307 *tnm *= 2;
308 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
311 newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
313 if (lexeme && lexeme->lexeme)
315 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
316 newwrds[*nnw].entries->tnvariant = tnvariant;
318 else
320 newwrds[*nnw].lexeme = NULL;
321 newwrds[*nnw].entries->tnvariant = 1;
324 newwrds[*nnw].entries->idsubst = src->idsubst;
325 newwrds[*nnw].entries->posinsubst = src->posinsubst;
327 newwrds[*nnw].entries->nextentry = NULL;
329 (*nnw)++;
330 return newwrds;
333 static int
334 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
336 if (a == NULL || b == NULL)
337 return 0;
339 if (a->idsubst == b->idsubst)
341 if (a->posinsubst == b->posinsubst)
343 if (a->tnvariant == b->tnvariant)
344 return 0;
346 return (a->tnvariant > b->tnvariant) ? 1 : -1;
349 return (a->posinsubst > b->posinsubst) ? 1 : -1;
352 return (a->idsubst > b->idsubst) ? 1 : -1;
355 static int
356 cmpLexeme(const TheLexeme *a, const TheLexeme *b)
358 if (a->lexeme == NULL)
360 if (b->lexeme == NULL)
361 return 0;
362 else
363 return 1;
365 else if (b->lexeme == NULL)
366 return -1;
368 return strcmp(a->lexeme, b->lexeme);
371 static int
372 cmpLexemeQ(const void *a, const void *b)
374 return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
377 static int
378 cmpTheLexeme(const void *a, const void *b)
380 const TheLexeme *la = (const TheLexeme *) a;
381 const TheLexeme *lb = (const TheLexeme *) b;
382 int res;
384 if ((res = cmpLexeme(la, lb)) != 0)
385 return res;
387 return -cmpLexemeInfo(la->entries, lb->entries);
390 static void
391 compileTheLexeme(DictThesaurus *d)
393 int i,
394 nnw = 0,
395 tnm = 16;
396 TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
397 *ptrwrds;
399 for (i = 0; i < d->nwrds; i++)
401 TSLexeme *ptr;
403 if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
404 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
405 else
407 ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
408 PointerGetDatum(d->subdict->dictData),
409 PointerGetDatum(d->wrds[i].lexeme),
410 Int32GetDatum(strlen(d->wrds[i].lexeme)),
411 PointerGetDatum(NULL)));
413 if (!ptr)
414 ereport(ERROR,
415 (errcode(ERRCODE_CONFIG_FILE_ERROR),
416 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
417 d->wrds[i].lexeme,
418 d->wrds[i].entries->idsubst + 1)));
419 else if (!(ptr->lexeme))
420 ereport(ERROR,
421 (errcode(ERRCODE_CONFIG_FILE_ERROR),
422 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
423 d->wrds[i].lexeme,
424 d->wrds[i].entries->idsubst + 1),
425 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
426 else
428 while (ptr->lexeme)
430 TSLexeme *remptr = ptr + 1;
431 int tnvar = 1;
432 int curvar = ptr->nvariant;
434 /* compute n words in one variant */
435 while (remptr->lexeme)
437 if (remptr->nvariant != (remptr - 1)->nvariant)
438 break;
439 tnvar++;
440 remptr++;
443 remptr = ptr;
444 while (remptr->lexeme && remptr->nvariant == curvar)
446 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
447 remptr++;
450 ptr = remptr;
455 pfree(d->wrds[i].lexeme);
456 pfree(d->wrds[i].entries);
459 if (d->wrds)
460 pfree(d->wrds);
461 d->wrds = newwrds;
462 d->nwrds = nnw;
463 d->ntwrds = tnm;
465 if (d->nwrds > 1)
467 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
469 /* uniq */
470 newwrds = d->wrds;
471 ptrwrds = d->wrds + 1;
472 while (ptrwrds - d->wrds < d->nwrds)
474 if (cmpLexeme(ptrwrds, newwrds) == 0)
476 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
478 ptrwrds->entries->nextentry = newwrds->entries;
479 newwrds->entries = ptrwrds->entries;
481 else
482 pfree(ptrwrds->entries);
484 if (ptrwrds->lexeme)
485 pfree(ptrwrds->lexeme);
487 else
489 newwrds++;
490 *newwrds = *ptrwrds;
493 ptrwrds++;
496 d->nwrds = newwrds - d->wrds + 1;
497 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
501 static void
502 compileTheSubstitute(DictThesaurus *d)
504 int i;
506 for (i = 0; i < d->nsubst; i++)
508 TSLexeme *rem = d->subst[i].res,
509 *outptr,
510 *inptr;
511 int n = 2;
513 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
514 outptr->lexeme = NULL;
515 inptr = rem;
517 while (inptr && inptr->lexeme)
519 TSLexeme *lexized,
520 tmplex[2];
522 if (inptr->flags & DT_USEASIS)
523 { /* do not lexize */
524 tmplex[0] = *inptr;
525 tmplex[0].flags = 0;
526 tmplex[1].lexeme = NULL;
527 lexized = tmplex;
529 else
531 lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
532 PointerGetDatum(d->subdict->dictData),
533 PointerGetDatum(inptr->lexeme),
534 Int32GetDatum(strlen(inptr->lexeme)),
535 PointerGetDatum(NULL)));
538 if (lexized && lexized->lexeme)
540 int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
542 while (lexized->lexeme)
544 if (outptr - d->subst[i].res + 1 >= n)
546 int diff = outptr - d->subst[i].res;
548 n *= 2;
549 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
550 outptr = d->subst[i].res + diff;
553 *outptr = *lexized;
554 outptr->lexeme = pstrdup(lexized->lexeme);
556 outptr++;
557 lexized++;
560 if (toset > 0)
561 d->subst[i].res[toset].flags |= TSL_ADDPOS;
563 else if (lexized)
565 ereport(ERROR,
566 (errcode(ERRCODE_CONFIG_FILE_ERROR),
567 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568 inptr->lexeme, i + 1)));
570 else
572 ereport(ERROR,
573 (errcode(ERRCODE_CONFIG_FILE_ERROR),
574 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575 inptr->lexeme, i + 1)));
578 if (inptr->lexeme)
579 pfree(inptr->lexeme);
580 inptr++;
583 if (outptr == d->subst[i].res)
584 ereport(ERROR,
585 (errcode(ERRCODE_CONFIG_FILE_ERROR),
586 errmsg("thesaurus substitute phrase is empty (rule %d)",
587 i + 1)));
589 d->subst[i].reslen = outptr - d->subst[i].res;
591 pfree(rem);
595 Datum
596 thesaurus_init(PG_FUNCTION_ARGS)
598 List *dictoptions = (List *) PG_GETARG_POINTER(0);
599 DictThesaurus *d;
600 char *subdictname = NULL;
601 bool fileloaded = false;
602 ListCell *l;
604 d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
606 foreach(l, dictoptions)
608 DefElem *defel = (DefElem *) lfirst(l);
610 if (strcmp(defel->defname, "dictfile") == 0)
612 if (fileloaded)
613 ereport(ERROR,
614 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
615 errmsg("multiple DictFile parameters")));
616 thesaurusRead(defGetString(defel), d);
617 fileloaded = true;
619 else if (strcmp(defel->defname, "dictionary") == 0)
621 if (subdictname)
622 ereport(ERROR,
623 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
624 errmsg("multiple Dictionary parameters")));
625 subdictname = pstrdup(defGetString(defel));
627 else
629 ereport(ERROR,
630 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
631 errmsg("unrecognized Thesaurus parameter: \"%s\"",
632 defel->defname)));
636 if (!fileloaded)
637 ereport(ERROR,
638 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
639 errmsg("missing DictFile parameter")));
640 if (!subdictname)
641 ereport(ERROR,
642 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
643 errmsg("missing Dictionary parameter")));
645 d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
646 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
648 compileTheLexeme(d);
649 compileTheSubstitute(d);
651 PG_RETURN_POINTER(d);
654 static LexemeInfo *
655 findTheLexeme(DictThesaurus *d, char *lexeme)
657 TheLexeme key,
658 *res;
660 if (d->nwrds == 0)
661 return NULL;
663 key.lexeme = lexeme;
664 key.entries = NULL;
666 res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
668 if (res == NULL)
669 return NULL;
670 return res->entries;
673 static bool
674 matchIdSubst(LexemeInfo *stored, uint32 idsubst)
676 bool res = true;
678 if (stored)
680 res = false;
682 for (; stored; stored = stored->nextvariant)
683 if (stored->idsubst == idsubst)
685 res = true;
686 break;
690 return res;
693 static LexemeInfo *
694 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
696 for (;;)
698 int i;
699 LexemeInfo *ptr = newin[0];
701 for (i = 0; i < newn; i++)
703 while (newin[i] && newin[i]->idsubst < ptr->idsubst)
704 newin[i] = newin[i]->nextentry;
706 if (newin[i] == NULL)
707 return in;
709 if (newin[i]->idsubst > ptr->idsubst)
711 ptr = newin[i];
712 i = -1;
713 continue;
716 while (newin[i]->idsubst == ptr->idsubst)
718 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
720 ptr = newin[i];
721 break;
724 newin[i] = newin[i]->nextentry;
725 if (newin[i] == NULL)
726 return in;
729 if (newin[i]->idsubst != ptr->idsubst)
731 ptr = newin[i];
732 i = -1;
733 continue;
737 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
738 { /* found */
740 ptr->nextvariant = in;
741 in = ptr;
744 /* step forward */
745 for (i = 0; i < newn; i++)
746 newin[i] = newin[i]->nextentry;
750 static TSLexeme *
751 copyTSLexeme(TheSubstitute *ts)
753 TSLexeme *res;
754 uint16 i;
756 res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
757 for (i = 0; i < ts->reslen; i++)
759 res[i] = ts->res[i];
760 res[i].lexeme = pstrdup(ts->res[i].lexeme);
763 res[ts->reslen].lexeme = NULL;
765 return res;
768 static TSLexeme *
769 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
771 *moreres = false;
772 while (info)
774 Assert(info->idsubst < d->nsubst);
775 if (info->nextvariant)
776 *moreres = true;
777 if (d->subst[info->idsubst].lastlexeme == curpos)
778 return copyTSLexeme(d->subst + info->idsubst);
779 info = info->nextvariant;
782 return NULL;
785 Datum
786 thesaurus_lexize(PG_FUNCTION_ARGS)
788 DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
789 DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
790 TSLexeme *res = NULL;
791 LexemeInfo *stored,
792 *info = NULL;
793 uint16 curpos = 0;
794 bool moreres = false;
796 if (PG_NARGS() != 4 || dstate == NULL)
797 elog(ERROR, "forbidden call of thesaurus or nested call");
799 if (dstate->isend)
800 PG_RETURN_POINTER(NULL);
801 stored = (LexemeInfo *) dstate->private_state;
803 if (stored)
804 curpos = stored->posinsubst + 1;
806 if (!d->subdict->isvalid)
807 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
809 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
810 PointerGetDatum(d->subdict->dictData),
811 PG_GETARG_DATUM(1),
812 PG_GETARG_DATUM(2),
813 PointerGetDatum(NULL)));
815 if (res && res->lexeme)
817 TSLexeme *ptr = res,
818 *basevar;
820 while (ptr->lexeme)
822 uint16 nv = ptr->nvariant;
823 uint16 i,
824 nlex = 0;
825 LexemeInfo **infos;
827 basevar = ptr;
828 while (ptr->lexeme && nv == ptr->nvariant)
830 nlex++;
831 ptr++;
834 infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
835 for (i = 0; i < nlex; i++)
836 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
837 break;
839 if (i < nlex)
841 /* no chance to find */
842 pfree(infos);
843 continue;
846 info = findVariant(info, stored, curpos, infos, nlex);
849 else if (res)
850 { /* stop-word */
851 LexemeInfo *infos = findTheLexeme(d, NULL);
853 info = findVariant(NULL, stored, curpos, &infos, 1);
855 else
857 info = NULL; /* word isn't recognized */
860 dstate->private_state = (void *) info;
862 if (!info)
864 dstate->getnext = false;
865 PG_RETURN_POINTER(NULL);
868 if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
870 dstate->getnext = moreres;
871 PG_RETURN_POINTER(res);
874 dstate->getnext = true;
876 PG_RETURN_POINTER(NULL);