src/backend/tsearch/dict_thesaurus.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * dict_thesaurus.c
   4  *              Thesaurus dictionary: phrase to phrase substitution
   5  *
   6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        src/backend/tsearch/dict_thesaurus.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include "catalog/namespace.h"
  17 #include "commands/defrem.h"
  18 #include "tsearch/ts_cache.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "tsearch/ts_utils.h"
  21 #include "utils/builtins.h"
  22 #include "utils/regproc.h"
  23
  24
  25 /*
  26  * Temporary we use TSLexeme.flags for inner use...
  27  */
  28 #define DT_USEASIS              0x1000
  29
  30 typedef struct LexemeInfo
  31 {
  32         uint32          idsubst;                /* entry's number in DictThesaurus->subst */
  33         uint16          posinsubst;             /* pos info in entry */
  34         uint16          tnvariant;              /* total num lexemes in one variant */
  35         struct LexemeInfo *nextentry;
  36         struct LexemeInfo *nextvariant;
  37 } LexemeInfo;
  38
  39 typedef struct
  40 {
  41         char       *lexeme;
  42         LexemeInfo *entries;
  43 } TheLexeme;
  44
  45 typedef struct
  46 {
  47         uint16          lastlexeme;             /* number lexemes to substitute */
  48         uint16          reslen;
  49         TSLexeme   *res;                        /* prepared substituted result */
  50 } TheSubstitute;
  51
  52 typedef struct
  53 {
  54         /* subdictionary to normalize lexemes */
  55         Oid                     subdictOid;
  56         TSDictionaryCacheEntry *subdict;
  57
  58         /* Array to search lexeme by exact match */
  59         TheLexeme  *wrds;
  60         int                     nwrds;                  /* current number of words */
  61         int                     ntwrds;                 /* allocated array length */
  62
  63         /*
  64          * Storage of substituted result, n-th element is for n-th expression
  65          */
  66         TheSubstitute *subst;
  67         int                     nsubst;
  68 } DictThesaurus;
  69
  70
  71 static void
  72 newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
  73 {
  74         TheLexeme  *ptr;
  75
  76         if (d->nwrds >= d->ntwrds)
  77         {
  78                 if (d->ntwrds == 0)
  79                 {
  80                         d->ntwrds = 16;
  81                         d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
  82                 }
  83                 else
  84                 {
  85                         d->ntwrds *= 2;
  86                         d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
  87                 }
  88         }
  89
  90         ptr = d->wrds + d->nwrds;
  91         d->nwrds++;
  92
  93         ptr->lexeme = palloc(e - b + 1);
  94
  95         memcpy(ptr->lexeme, b, e - b);
  96         ptr->lexeme[e - b] = '\0';
  97
  98         ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
  99
 100         ptr->entries->nextentry = NULL;
 101         ptr->entries->idsubst = idsubst;
 102         ptr->entries->posinsubst = posinsubst;
 103 }
 104
 105 static void
 106 addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
 107 {
 108         static int      nres = 0;
 109         static int      ntres = 0;
 110         TheSubstitute *ptr;
 111
 112         if (nwrd == 0)
 113         {
 114                 nres = ntres = 0;
 115
 116                 if (idsubst >= d->nsubst)
 117                 {
 118                         if (d->nsubst == 0)
 119                         {
 120                                 d->nsubst = 16;
 121                                 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
 122                         }
 123                         else
 124                         {
 125                                 d->nsubst *= 2;
 126                                 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
 127                         }
 128                 }
 129         }
 130
 131         ptr = d->subst + idsubst;
 132
 133         ptr->lastlexeme = posinsubst - 1;
 134
 135         if (nres + 1 >= ntres)
 136         {
 137                 if (ntres == 0)
 138                 {
 139                         ntres = 2;
 140                         ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
 141                 }
 142                 else
 143                 {
 144                         ntres *= 2;
 145                         ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
 146                 }
 147         }
 148
 149         ptr->res[nres].lexeme = palloc(e - b + 1);
 150         memcpy(ptr->res[nres].lexeme, b, e - b);
 151         ptr->res[nres].lexeme[e - b] = '\0';
 152
 153         ptr->res[nres].nvariant = nwrd;
 154         if (useasis)
 155                 ptr->res[nres].flags = DT_USEASIS;
 156         else
 157                 ptr->res[nres].flags = 0;
 158
 159         ptr->res[++nres].lexeme = NULL;
 160 }
 161
 162 #define TR_WAITLEX      1
 163 #define TR_INLEX        2
 164 #define TR_WAITSUBS 3
 165 #define TR_INSUBS       4
 166
 167 static void
 168 thesaurusRead(const char *filename, DictThesaurus *d)
 169 {
 170         tsearch_readline_state trst;
 171         uint32          idsubst = 0;
 172         bool            useasis = false;
 173         char       *line;
 174
 175         filename = get_tsearch_config_filename(filename, "ths");
 176         if (!tsearch_readline_begin(&trst, filename))
 177                 ereport(ERROR,
 178                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 179                                  errmsg("could not open thesaurus file \"%s\": %m",
 180                                                 filename)));
 181
 182         while ((line = tsearch_readline(&trst)) != NULL)
 183         {
 184                 char       *ptr;
 185                 int                     state = TR_WAITLEX;
 186                 char       *beginwrd = NULL;
 187                 uint32          posinsubst = 0;
 188                 uint32          nwrd = 0;
 189
 190                 ptr = line;
 191
 192                 /* is it a comment? */
 193                 while (*ptr && t_isspace(ptr))
 194                         ptr += pg_mblen(ptr);
 195
 196                 if (t_iseq(ptr, '#') || *ptr == '\0' ||
 197                         t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
 198                 {
 199                         pfree(line);
 200                         continue;
 201                 }
 202
 203                 while (*ptr)
 204                 {
 205                         if (state == TR_WAITLEX)
 206                         {
 207                                 if (t_iseq(ptr, ':'))
 208                                 {
 209                                         if (posinsubst == 0)
 210                                                 ereport(ERROR,
 211                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 212                                                                  errmsg("unexpected delimiter")));
 213                                         state = TR_WAITSUBS;
 214                                 }
 215                                 else if (!t_isspace(ptr))
 216                                 {
 217                                         beginwrd = ptr;
 218                                         state = TR_INLEX;
 219                                 }
 220                         }
 221                         else if (state == TR_INLEX)
 222                         {
 223                                 if (t_iseq(ptr, ':'))
 224                                 {
 225                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
 226                                         state = TR_WAITSUBS;
 227                                 }
 228                                 else if (t_isspace(ptr))
 229                                 {
 230                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
 231                                         state = TR_WAITLEX;
 232                                 }
 233                         }
 234                         else if (state == TR_WAITSUBS)
 235                         {
 236                                 if (t_iseq(ptr, '*'))
 237                                 {
 238                                         useasis = true;
 239                                         state = TR_INSUBS;
 240                                         beginwrd = ptr + pg_mblen(ptr);
 241                                 }
 242                                 else if (t_iseq(ptr, '\\'))
 243                                 {
 244                                         useasis = false;
 245                                         state = TR_INSUBS;
 246                                         beginwrd = ptr + pg_mblen(ptr);
 247                                 }
 248                                 else if (!t_isspace(ptr))
 249                                 {
 250                                         useasis = false;
 251                                         beginwrd = ptr;
 252                                         state = TR_INSUBS;
 253                                 }
 254                         }
 255                         else if (state == TR_INSUBS)
 256                         {
 257                                 if (t_isspace(ptr))
 258                                 {
 259                                         if (ptr == beginwrd)
 260                                                 ereport(ERROR,
 261                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 262                                                                  errmsg("unexpected end of line or lexeme")));
 263                                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
 264                                         state = TR_WAITSUBS;
 265                                 }
 266                         }
 267                         else
 268                                 elog(ERROR, "unrecognized thesaurus state: %d", state);
 269
 270                         ptr += pg_mblen(ptr);
 271                 }
 272
 273                 if (state == TR_INSUBS)
 274                 {
 275                         if (ptr == beginwrd)
 276                                 ereport(ERROR,
 277                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 278                                                  errmsg("unexpected end of line or lexeme")));
 279                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
 280                 }
 281
 282                 idsubst++;
 283
 284                 if (!(nwrd && posinsubst))
 285                         ereport(ERROR,
 286                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
 287                                          errmsg("unexpected end of line")));
 288
 289                 if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
 290                         ereport(ERROR,
 291                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
 292                                          errmsg("too many lexemes in thesaurus entry")));
 293
 294                 pfree(line);
 295         }
 296
 297         d->nsubst = idsubst;
 298
 299         tsearch_readline_end(&trst);
 300 }
 301
 302 static TheLexeme *
 303 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
 304 {
 305         if (*nnw >= *tnm)
 306         {
 307                 *tnm *= 2;
 308                 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
 309         }
 310
 311         newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
 312
 313         if (lexeme && lexeme->lexeme)
 314         {
 315                 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
 316                 newwrds[*nnw].entries->tnvariant = tnvariant;
 317         }
 318         else
 319         {
 320                 newwrds[*nnw].lexeme = NULL;
 321                 newwrds[*nnw].entries->tnvariant = 1;
 322         }
 323
 324         newwrds[*nnw].entries->idsubst = src->idsubst;
 325         newwrds[*nnw].entries->posinsubst = src->posinsubst;
 326
 327         newwrds[*nnw].entries->nextentry = NULL;
 328
 329         (*nnw)++;
 330         return newwrds;
 331 }
 332
 333 static int
 334 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
 335 {
 336         if (a == NULL || b == NULL)
 337                 return 0;
 338
 339         if (a->idsubst == b->idsubst)
 340         {
 341                 if (a->posinsubst == b->posinsubst)
 342                 {
 343                         if (a->tnvariant == b->tnvariant)
 344                                 return 0;
 345
 346                         return (a->tnvariant > b->tnvariant) ? 1 : -1;
 347                 }
 348
 349                 return (a->posinsubst > b->posinsubst) ? 1 : -1;
 350         }
 351
 352         return (a->idsubst > b->idsubst) ? 1 : -1;
 353 }
 354
 355 static int
 356 cmpLexeme(const TheLexeme *a, const TheLexeme *b)
 357 {
 358         if (a->lexeme == NULL)
 359         {
 360                 if (b->lexeme == NULL)
 361                         return 0;
 362                 else
 363                         return 1;
 364         }
 365         else if (b->lexeme == NULL)
 366                 return -1;
 367
 368         return strcmp(a->lexeme, b->lexeme);
 369 }
 370
 371 static int
 372 cmpLexemeQ(const void *a, const void *b)
 373 {
 374         return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
 375 }
 376
 377 static int
 378 cmpTheLexeme(const void *a, const void *b)
 379 {
 380         const TheLexeme *la = (const TheLexeme *) a;
 381         const TheLexeme *lb = (const TheLexeme *) b;
 382         int                     res;
 383
 384         if ((res = cmpLexeme(la, lb)) != 0)
 385                 return res;
 386
 387         return -cmpLexemeInfo(la->entries, lb->entries);
 388 }
 389
 390 static void
 391 compileTheLexeme(DictThesaurus *d)
 392 {
 393         int                     i,
 394                                 nnw = 0,
 395                                 tnm = 16;
 396         TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
 397                            *ptrwrds;
 398
 399         for (i = 0; i < d->nwrds; i++)
 400         {
 401                 TSLexeme   *ptr;
 402
 403                 if (strcmp(d->wrds[i].lexeme, "?") == 0)        /* Is stop word marker? */
 404                         newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
 405                 else
 406                 {
 407                         ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
 408                                                                                                                          PointerGetDatum(d->subdict->dictData),
 409                                                                                                                          PointerGetDatum(d->wrds[i].lexeme),
 410                                                                                                                          Int32GetDatum(strlen(d->wrds[i].lexeme)),
 411                                                                                                                          PointerGetDatum(NULL)));
 412
 413                         if (!ptr)
 414                                 ereport(ERROR,
 415                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 416                                                  errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
 417                                                                 d->wrds[i].lexeme,
 418                                                                 d->wrds[i].entries->idsubst + 1)));
 419                         else if (!(ptr->lexeme))
 420                                 ereport(ERROR,
 421                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 422                                                  errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
 423                                                                 d->wrds[i].lexeme,
 424                                                                 d->wrds[i].entries->idsubst + 1),
 425                                                  errhint("Use \"?\" to represent a stop word within a sample phrase.")));
 426                         else
 427                         {
 428                                 while (ptr->lexeme)
 429                                 {
 430                                         TSLexeme   *remptr = ptr + 1;
 431                                         int                     tnvar = 1;
 432                                         int                     curvar = ptr->nvariant;
 433
 434                                         /* compute n words in one variant */
 435                                         while (remptr->lexeme)
 436                                         {
 437                                                 if (remptr->nvariant != (remptr - 1)->nvariant)
 438                                                         break;
 439                                                 tnvar++;
 440                                                 remptr++;
 441                                         }
 442
 443                                         remptr = ptr;
 444                                         while (remptr->lexeme && remptr->nvariant == curvar)
 445                                         {
 446                                                 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
 447                                                 remptr++;
 448                                         }
 449
 450                                         ptr = remptr;
 451                                 }
 452                         }
 453                 }
 454
 455                 pfree(d->wrds[i].lexeme);
 456                 pfree(d->wrds[i].entries);
 457         }
 458
 459         if (d->wrds)
 460                 pfree(d->wrds);
 461         d->wrds = newwrds;
 462         d->nwrds = nnw;
 463         d->ntwrds = tnm;
 464
 465         if (d->nwrds > 1)
 466         {
 467                 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
 468
 469                 /* uniq */
 470                 newwrds = d->wrds;
 471                 ptrwrds = d->wrds + 1;
 472                 while (ptrwrds - d->wrds < d->nwrds)
 473                 {
 474                         if (cmpLexeme(ptrwrds, newwrds) == 0)
 475                         {
 476                                 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
 477                                 {
 478                                         ptrwrds->entries->nextentry = newwrds->entries;
 479                                         newwrds->entries = ptrwrds->entries;
 480                                 }
 481                                 else
 482                                         pfree(ptrwrds->entries);
 483
 484                                 if (ptrwrds->lexeme)
 485                                         pfree(ptrwrds->lexeme);
 486                         }
 487                         else
 488                         {
 489                                 newwrds++;
 490                                 *newwrds = *ptrwrds;
 491                         }
 492
 493                         ptrwrds++;
 494                 }
 495
 496                 d->nwrds = newwrds - d->wrds + 1;
 497                 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
 498         }
 499 }
 500
 501 static void
 502 compileTheSubstitute(DictThesaurus *d)
 503 {
 504         int                     i;
 505
 506         for (i = 0; i < d->nsubst; i++)
 507         {
 508                 TSLexeme   *rem = d->subst[i].res,
 509                                    *outptr,
 510                                    *inptr;
 511                 int                     n = 2;
 512
 513                 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
 514                 outptr->lexeme = NULL;
 515                 inptr = rem;
 516
 517                 while (inptr && inptr->lexeme)
 518                 {
 519                         TSLexeme   *lexized,
 520                                                 tmplex[2];
 521
 522                         if (inptr->flags & DT_USEASIS)
 523                         {                                       /* do not lexize */
 524                                 tmplex[0] = *inptr;
 525                                 tmplex[0].flags = 0;
 526                                 tmplex[1].lexeme = NULL;
 527                                 lexized = tmplex;
 528                         }
 529                         else
 530                         {
 531                                 lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
 532                                                                                                                                          PointerGetDatum(d->subdict->dictData),
 533                                                                                                                                          PointerGetDatum(inptr->lexeme),
 534                                                                                                                                          Int32GetDatum(strlen(inptr->lexeme)),
 535                                                                                                                                          PointerGetDatum(NULL)));
 536                         }
 537
 538                         if (lexized && lexized->lexeme)
 539                         {
 540                                 int                     toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
 541
 542                                 while (lexized->lexeme)
 543                                 {
 544                                         if (outptr - d->subst[i].res + 1 >= n)
 545                                         {
 546                                                 int                     diff = outptr - d->subst[i].res;
 547
 548                                                 n *= 2;
 549                                                 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
 550                                                 outptr = d->subst[i].res + diff;
 551                                         }
 552
 553                                         *outptr = *lexized;
 554                                         outptr->lexeme = pstrdup(lexized->lexeme);
 555
 556                                         outptr++;
 557                                         lexized++;
 558                                 }
 559
 560                                 if (toset > 0)
 561                                         d->subst[i].res[toset].flags |= TSL_ADDPOS;
 562                         }
 563                         else if (lexized)
 564                         {
 565                                 ereport(ERROR,
 566                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 567                                                  errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
 568                                                                 inptr->lexeme, i + 1)));
 569                         }
 570                         else
 571                         {
 572                                 ereport(ERROR,
 573                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
 574                                                  errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
 575                                                                 inptr->lexeme, i + 1)));
 576                         }
 577
 578                         if (inptr->lexeme)
 579                                 pfree(inptr->lexeme);
 580                         inptr++;
 581                 }
 582
 583                 if (outptr == d->subst[i].res)
 584                         ereport(ERROR,
 585                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
 586                                          errmsg("thesaurus substitute phrase is empty (rule %d)",
 587                                                         i + 1)));
 588
 589                 d->subst[i].reslen = outptr - d->subst[i].res;
 590
 591                 pfree(rem);
 592         }
 593 }
 594
 595 Datum
 596 thesaurus_init(PG_FUNCTION_ARGS)
 597 {
 598         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 599         DictThesaurus *d;
 600         char       *subdictname = NULL;
 601         bool            fileloaded = false;
 602         ListCell   *l;
 603
 604         d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
 605
 606         foreach(l, dictoptions)
 607         {
 608                 DefElem    *defel = (DefElem *) lfirst(l);
 609
 610                 if (strcmp(defel->defname, "dictfile") == 0)
 611                 {
 612                         if (fileloaded)
 613                                 ereport(ERROR,
 614                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 615                                                  errmsg("multiple DictFile parameters")));
 616                         thesaurusRead(defGetString(defel), d);
 617                         fileloaded = true;
 618                 }
 619                 else if (strcmp(defel->defname, "dictionary") == 0)
 620                 {
 621                         if (subdictname)
 622                                 ereport(ERROR,
 623                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 624                                                  errmsg("multiple Dictionary parameters")));
 625                         subdictname = pstrdup(defGetString(defel));
 626                 }
 627                 else
 628                 {
 629                         ereport(ERROR,
 630                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 631                                          errmsg("unrecognized Thesaurus parameter: \"%s\"",
 632                                                         defel->defname)));
 633                 }
 634         }
 635
 636         if (!fileloaded)
 637                 ereport(ERROR,
 638                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 639                                  errmsg("missing DictFile parameter")));
 640         if (!subdictname)
 641                 ereport(ERROR,
 642                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 643                                  errmsg("missing Dictionary parameter")));
 644
 645         d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
 646         d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
 647
 648         compileTheLexeme(d);
 649         compileTheSubstitute(d);
 650
 651         PG_RETURN_POINTER(d);
 652 }
 653
 654 static LexemeInfo *
 655 findTheLexeme(DictThesaurus *d, char *lexeme)
 656 {
 657         TheLexeme       key,
 658                            *res;
 659
 660         if (d->nwrds == 0)
 661                 return NULL;
 662
 663         key.lexeme = lexeme;
 664         key.entries = NULL;
 665
 666         res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
 667
 668         if (res == NULL)
 669                 return NULL;
 670         return res->entries;
 671 }
 672
 673 static bool
 674 matchIdSubst(LexemeInfo *stored, uint32 idsubst)
 675 {
 676         bool            res = true;
 677
 678         if (stored)
 679         {
 680                 res = false;
 681
 682                 for (; stored; stored = stored->nextvariant)
 683                         if (stored->idsubst == idsubst)
 684                         {
 685                                 res = true;
 686                                 break;
 687                         }
 688         }
 689
 690         return res;
 691 }
 692
 693 static LexemeInfo *
 694 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
 695 {
 696         for (;;)
 697         {
 698                 int                     i;
 699                 LexemeInfo *ptr = newin[0];
 700
 701                 for (i = 0; i < newn; i++)
 702                 {
 703                         while (newin[i] && newin[i]->idsubst < ptr->idsubst)
 704                                 newin[i] = newin[i]->nextentry;
 705
 706                         if (newin[i] == NULL)
 707                                 return in;
 708
 709                         if (newin[i]->idsubst > ptr->idsubst)
 710                         {
 711                                 ptr = newin[i];
 712                                 i = -1;
 713                                 continue;
 714                         }
 715
 716                         while (newin[i]->idsubst == ptr->idsubst)
 717                         {
 718                                 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
 719                                 {
 720                                         ptr = newin[i];
 721                                         break;
 722                                 }
 723
 724                                 newin[i] = newin[i]->nextentry;
 725                                 if (newin[i] == NULL)
 726                                         return in;
 727                         }
 728
 729                         if (newin[i]->idsubst != ptr->idsubst)
 730                         {
 731                                 ptr = newin[i];
 732                                 i = -1;
 733                                 continue;
 734                         }
 735                 }
 736
 737                 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
 738                 {                                               /* found */
 739
 740                         ptr->nextvariant = in;
 741                         in = ptr;
 742                 }
 743
 744                 /* step forward */
 745                 for (i = 0; i < newn; i++)
 746                         newin[i] = newin[i]->nextentry;
 747         }
 748 }
 749
 750 static TSLexeme *
 751 copyTSLexeme(TheSubstitute *ts)
 752 {
 753         TSLexeme   *res;
 754         uint16          i;
 755
 756         res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
 757         for (i = 0; i < ts->reslen; i++)
 758         {
 759                 res[i] = ts->res[i];
 760                 res[i].lexeme = pstrdup(ts->res[i].lexeme);
 761         }
 762
 763         res[ts->reslen].lexeme = NULL;
 764
 765         return res;
 766 }
 767
 768 static TSLexeme *
 769 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
 770 {
 771         *moreres = false;
 772         while (info)
 773         {
 774                 Assert(info->idsubst < d->nsubst);
 775                 if (info->nextvariant)
 776                         *moreres = true;
 777                 if (d->subst[info->idsubst].lastlexeme == curpos)
 778                         return copyTSLexeme(d->subst + info->idsubst);
 779                 info = info->nextvariant;
 780         }
 781
 782         return NULL;
 783 }
 784
 785 Datum
 786 thesaurus_lexize(PG_FUNCTION_ARGS)
 787 {
 788         DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
 789         DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
 790         TSLexeme   *res = NULL;
 791         LexemeInfo *stored,
 792                            *info = NULL;
 793         uint16          curpos = 0;
 794         bool            moreres = false;
 795
 796         if (PG_NARGS() != 4 || dstate == NULL)
 797                 elog(ERROR, "forbidden call of thesaurus or nested call");
 798
 799         if (dstate->isend)
 800                 PG_RETURN_POINTER(NULL);
 801         stored = (LexemeInfo *) dstate->private_state;
 802
 803         if (stored)
 804                 curpos = stored->posinsubst + 1;
 805
 806         if (!d->subdict->isvalid)
 807                 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
 808
 809         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
 810                                                                                                          PointerGetDatum(d->subdict->dictData),
 811                                                                                                          PG_GETARG_DATUM(1),
 812                                                                                                          PG_GETARG_DATUM(2),
 813                                                                                                          PointerGetDatum(NULL)));
 814
 815         if (res && res->lexeme)
 816         {
 817                 TSLexeme   *ptr = res,
 818                                    *basevar;
 819
 820                 while (ptr->lexeme)
 821                 {
 822                         uint16          nv = ptr->nvariant;
 823                         uint16          i,
 824                                                 nlex = 0;
 825                         LexemeInfo **infos;
 826
 827                         basevar = ptr;
 828                         while (ptr->lexeme && nv == ptr->nvariant)
 829                         {
 830                                 nlex++;
 831                                 ptr++;
 832                         }
 833
 834                         infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
 835                         for (i = 0; i < nlex; i++)
 836                                 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
 837                                         break;
 838
 839                         if (i < nlex)
 840                         {
 841                                 /* no chance to find */
 842                                 pfree(infos);
 843                                 continue;
 844                         }
 845
 846                         info = findVariant(info, stored, curpos, infos, nlex);
 847                 }
 848         }
 849         else if (res)
 850         {                                                       /* stop-word */
 851                 LexemeInfo *infos = findTheLexeme(d, NULL);
 852
 853                 info = findVariant(NULL, stored, curpos, &infos, 1);
 854         }
 855         else
 856         {
 857                 info = NULL;                    /* word isn't recognized */
 858         }
 859
 860         dstate->private_state = (void *) info;
 861
 862         if (!info)
 863         {
 864                 dstate->getnext = false;
 865                 PG_RETURN_POINTER(NULL);
 866         }
 867
 868         if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
 869         {
 870                 dstate->getnext = moreres;
 871                 PG_RETURN_POINTER(res);
 872         }
 873
 874         dstate->getnext = true;
 875
 876         PG_RETURN_POINTER(NULL);
 877 }