ext/hunspell/affentry.cxx

   1 #include "license.hunspell"
   2 #include "license.myspell"
   3
   4 #include <stdlib.h>
   5 #include <string.h>
   6 #include <stdio.h>
   7 #include <ctype.h>
   8
   9 #include "affentry.hxx"
  10 #include "csutil.hxx"
  11
  12 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
  13 {
  14   // register affix manager
  15   pmyMgr = pmgr;
  16
  17   // set up its initial values
  18
  19   aflag = dp->aflag;         // flag
  20   strip = dp->strip;         // string to strip
  21   appnd = dp->appnd;         // string to append
  22   stripl = dp->stripl;       // length of strip string
  23   appndl = dp->appndl;       // length of append string
  24   numconds = dp->numconds;   // length of the condition
  25   opts = dp->opts;           // cross product flag
  26   // then copy over all of the conditions
  27   if (opts & aeLONGCOND) {
  28     memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
  29     c.l.conds2 = dp->c.l.conds2;
  30   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
  31   next = NULL;
  32   nextne = NULL;
  33   nexteq = NULL;
  34   morphcode = dp->morphcode;
  35   contclass = dp->contclass;
  36   contclasslen = dp->contclasslen;
  37 }
  38
  39
  40 PfxEntry::~PfxEntry()
  41 {
  42     aflag = 0;
  43     if (appnd) free(appnd);
  44     if (strip) free(strip);
  45     pmyMgr = NULL;
  46     appnd = NULL;
  47     strip = NULL;
  48     if (opts & aeLONGCOND) free(c.l.conds2);
  49     if (morphcode && !(opts & aeALIASM)) free(morphcode);
  50     if (contclass && !(opts & aeALIASF)) free(contclass);
  51 }
  52
  53 // add prefix to this word assuming conditions hold
  54 char * PfxEntry::add(const char * word, int len)
  55 {
  56     char tword[MAXWORDUTF8LEN + 4];
  57
  58     if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
  59        (len >= numconds) && test_condition(word) &&
  60        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
  61        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
  62     /* we have a match so add prefix */
  63               char * pp = tword;
  64               if (appndl) {
  65                   strcpy(tword,appnd);
  66                   pp += appndl;
  67                }
  68                strcpy(pp, (word + stripl));
  69                return mystrdup(tword);
  70      }
  71      return NULL;
  72 }
  73
  74 inline char * PfxEntry::nextchar(char * p) {
  75     if (p) {
  76         p++;
  77         if (opts & aeLONGCOND) {
  78             // jump to the 2nd part of the condition
  79             if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
  80         // end of the MAXCONDLEN length condition
  81         } else if (p == c.conds + MAXCONDLEN) return NULL;
  82         return *p ? p : NULL;
  83     }
  84     return NULL;
  85 }
  86
  87 inline int PfxEntry::test_condition(const char * st)
  88 {
         // Matches the beginning of the string st (read up to its NUL byte)
         // against this entry's condition pattern, which is walked with
         // nextchar().  Returns 1 when the pattern is satisfied or numconds
         // is 0, and 0 on a mismatch.
         // Pattern bytes handled by the switch below:
         //   '['    opens a character group and records the input position
         //   '^'    sets the negation flag
         //   ']'    closes the group and evaluates it
         //   '.'    outside a group: skips one input character
         //   other  compared with the current input byte
  89     const char * pos = NULL; // input position where the open group started; NULL outside groups
  90     bool neg = false;        // negation flag, set by '^'
  91     bool ingroup = false;    // a member of the open group matched the input
  92     if (numconds == 0) return 1; // no condition: always a match
  93     char * p = c.conds;
  94     while (1) {
  95       switch (*p) {
  96         case '\0': return 1; // pattern finished without a mismatch
  97         case '[': {
  98                 neg = false;
  99                 ingroup = false;
 100                 p = nextchar(p);
 101                 pos = st; break;
 102             }
 103         case '^': { p = nextchar(p); neg = true; break; }
 104         case ']': {
                 // fail when a negated group matched, or a plain group did not
 105                 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
 106                 pos = NULL;
 107                 p = nextchar(p);
 108                 // skip the next character (a matched member already advanced st
                 // in the default branch, so this is only done when nothing matched);
                 // with aeUTF8 set, bytes of the form 10xxxxxx are skipped too
 109                 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
 110                 if (*st == '\0' && p) return 0; // word <= condition
 111                 break;
 112             }
 113          case '.': if (!pos) { // dots are not metacharacters in groups: [.]
 114                 p = nextchar(p);
 115                 // skip the next character
 116                 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
 117                 if (*st == '\0' && p) return 0; // word <= condition
 118                 break;
 119             }
             // no break here: inside a group (pos set) a '.' is handled by default
 120     default: {
 121                 if (*st == *p) {
 122                     st++;
 123                     p = nextchar(p);
 124                     if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
 125                         while (p && (*p & 0xc0) == 0x80) {       // character
 126                             if (*p != *st) {
                                 // mismatch in a following byte: fail outside a group,
                                 // otherwise rewind the input to the start of the group
 127                                 if (!pos) return 0;
 128                                 st = pos;
 129                                 break;
 130                             }
 131                             p = nextchar(p);
 132                             st++;
 133                         }
                         // st moved past pos: a group member matched, skip to ']'
 134                         if (pos && st != pos) {
 135                             ingroup = true;
 136                             while (p && *p != ']' && (p = nextchar(p)));
 137                         }
 138                     } else if (pos) {
                         // byte matched a group member: skip to ']'
 139                         ingroup = true;
 140                         while (p && *p != ']' && (p = nextchar(p)));
 141                     }
 142                 } else if (pos) { // group
                     // this member did not match, try the next one
 143                     p = nextchar(p);
 144                 } else return 0;
 145             }
 146       }
       // nextchar() returned NULL: the pattern is exhausted, so it matched
 147       if (!p) return 1;
 148     }
 149 }
 150
 151 // check if this prefix entry matches
 152 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
 153 {
 154     int                 tmpl;   // length of tmpword
 155     struct hentry *     he;     // hash entry of root word or NULL
 156     char                tmpword[MAXWORDUTF8LEN + 4];
 157
 158     // on entry prefix is 0 length or already matches the beginning of the word.
 159     // So if the remaining root word has positive length
 160     // and if there are enough chars in root word and added back strip chars
 161     // to meet the number of characters conditions, then test it
 162
 163      tmpl = len - appndl;
 164
 165      if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
 166
 167             // generate new root word by removing prefix and adding
 168             // back any characters that would have been stripped
 169
 170             if (stripl) strcpy (tmpword, strip);
 171             strcpy ((tmpword + stripl), (word + appndl));
 172
 173             // now make sure all of the conditions on characters
 174             // are met.  Please see the appendix at the end of
 175             // this file for more info on exactly what is being
 176             // tested
 177
 178             // if all conditions are met then check if resulting
 179             // root word in the dictionary
 180
 181             if (test_condition(tmpword)) {
 182                 tmpl += stripl;
 183                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 184                    do {
 185                       if (TESTAFF(he->astr, aflag, he->alen) &&
 186                         // forbid single prefixes with needaffix flag
 187                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
 188                         // needflag
 189                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 190                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
 191                             return he;
 192                       he = he->next_homonym; // check homonyms
 193                    } while (he);
 194                 }
 195
 196                 // prefix matched but no root word was found
 197                 // if aeXPRODUCT is allowed, try again but now
 198                 // ross checked combined with a suffix
 199
 200                 //if ((opts & aeXPRODUCT) && in_compound) {
 201                 if ((opts & aeXPRODUCT)) {
 202                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
 203                         0, NULL, FLAG_NULL, needflag, in_compound);
 204                    if (he) return he;
 205                 }
 206             }
 207      }
 208     return NULL;
 209 }
 210
 211 // check if this prefix entry matches
 212 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
 213     char in_compound, const FLAG needflag)
 214 {
 215     int                 tmpl;   // length of tmpword
 216     struct hentry *     he;     // hash entry of root word or NULL
 217     char                tmpword[MAXWORDUTF8LEN + 4];
 218
 219     // on entry prefix is 0 length or already matches the beginning of the word.
 220     // So if the remaining root word has positive length
 221     // and if there are enough chars in root word and added back strip chars
 222     // to meet the number of characters conditions, then test it
 223
 224      tmpl = len - appndl;
 225
 226      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 227         (tmpl + stripl >= numconds)) {
 228
 229             // generate new root word by removing prefix and adding
 230             // back any characters that would have been stripped
 231
 232             if (stripl) strcpy (tmpword, strip);
 233             strcpy ((tmpword + stripl), (word + appndl));
 234
 235             // now make sure all of the conditions on characters
 236             // are met.  Please see the appendix at the end of
 237             // this file for more info on exactly what is being
 238             // tested
 239
 240             // if all conditions are met then check if resulting
 241             // root word in the dictionary
 242
 243             if (test_condition(tmpword)) {
 244                 tmpl += stripl;
 245
 246                 // prefix matched but no root word was found
 247                 // if aeXPRODUCT is allowed, try again but now
 248                 // cross checked combined with a suffix
 249
 250                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 251                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
 252                    if (he) return he;
 253                 }
 254             }
 255      }
 256     return NULL;
 257 }
 258
 259 // check if this prefix entry matches
 260 char * PfxEntry::check_twosfx_morph(const char * word, int len,
 261          char in_compound, const FLAG needflag)
 262 {
 263     int                 tmpl;   // length of tmpword
 264     char                tmpword[MAXWORDUTF8LEN + 4];
 265
 266     // on entry prefix is 0 length or already matches the beginning of the word.
 267     // So if the remaining root word has positive length
 268     // and if there are enough chars in root word and added back strip chars
 269     // to meet the number of characters conditions, then test it
 270
 271      tmpl = len - appndl;
 272
 273      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 274         (tmpl + stripl >= numconds)) {
 275
 276             // generate new root word by removing prefix and adding
 277             // back any characters that would have been stripped
 278
 279             if (stripl) strcpy (tmpword, strip);
 280             strcpy ((tmpword + stripl), (word + appndl));
 281
 282             // now make sure all of the conditions on characters
 283             // are met.  Please see the appendix at the end of
 284             // this file for more info on exactly what is being
 285             // tested
 286
 287             // if all conditions are met then check if resulting
 288             // root word in the dictionary
 289
 290             if (test_condition(tmpword)) {
 291                 tmpl += stripl;
 292
 293                 // prefix matched but no root word was found
 294                 // if aeXPRODUCT is allowed, try again but now
 295                 // ross checked combined with a suffix
 296
 297                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 298                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
 299                              aeXPRODUCT, this, needflag);
 300                 }
 301             }
 302      }
 303     return NULL;
 304 }
 305
 306 // check if this prefix entry matches
 307 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
 308 {
 309     int                 tmpl;   // length of tmpword
 310     struct hentry *     he;     // hash entry of root word or NULL
 311     char                tmpword[MAXWORDUTF8LEN + 4];
 312     char                result[MAXLNLEN];
 313     char * st;
 314
 315     *result = '\0';
 316
 317     // on entry prefix is 0 length or already matches the beginning of the word.
 318     // So if the remaining root word has positive length
 319     // and if there are enough chars in root word and added back strip chars
 320     // to meet the number of characters conditions, then test it
 321
 322      tmpl = len - appndl;
 323
 324      if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 325         (tmpl + stripl >= numconds)) {
 326
 327             // generate new root word by removing prefix and adding
 328             // back any characters that would have been stripped
 329
 330             if (stripl) strcpy (tmpword, strip);
 331             strcpy ((tmpword + stripl), (word + appndl));
 332
 333             // now make sure all of the conditions on characters
 334             // are met.  Please see the appendix at the end of
 335             // this file for more info on exactly what is being
 336             // tested
 337
 338             // if all conditions are met then check if resulting
 339             // root word in the dictionary
 340
 341             if (test_condition(tmpword)) {
 342                 tmpl += stripl;
 343                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 344                     do {
 345                       if (TESTAFF(he->astr, aflag, he->alen) &&
 346                         // forbid single prefixes with needaffix flag
 347                         ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
 348                         // needflag
 349                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 350                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
 351                             if (morphcode) {
 352                                 mystrcat(result, " ", MAXLNLEN);
 353                                 mystrcat(result, morphcode, MAXLNLEN);
 354                             } else mystrcat(result,getKey(), MAXLNLEN);
 355                             if (!HENTRY_FIND(he, MORPH_STEM)) {
 356                                 mystrcat(result, " ", MAXLNLEN);
 357                                 mystrcat(result, MORPH_STEM, MAXLNLEN);
 358                                 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
 359                             }
 360                             // store the pointer of the hash entry
 361                             if (HENTRY_DATA(he)) {
 362                                 mystrcat(result, " ", MAXLNLEN);
 363                                 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
 364                             } else {
 365                                 // return with debug information
 366                                 char * flag = pmyMgr->encode_flag(getFlag());
 367                                 mystrcat(result, " ", MAXLNLEN);
 368                                 mystrcat(result, MORPH_FLAG, MAXLNLEN);
 369                                 mystrcat(result, flag, MAXLNLEN);
 370                                 free(flag);
 371                             }
 372                             mystrcat(result, "\n", MAXLNLEN);
 373                       }
 374                       he = he->next_homonym;
 375                     } while (he);
 376                 }
 377
 378                 // prefix matched but no root word was found
 379                 // if aeXPRODUCT is allowed, try again but now
 380                 // ross checked combined with a suffix
 381
 382                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 383                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
 384                      FLAG_NULL, needflag);
 385                    if (st) {
 386                         mystrcat(result, st, MAXLNLEN);
 387                         free(st);
 388                    }
 389                 }
 390             }
 391      }
 392
 393     if (*result) return mystrdup(result);
 394     return NULL;
 395 }
 396
 397 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
 398 {
 399   // register affix manager
 400   pmyMgr = pmgr;
 401
 402   // set up its initial values
 403   aflag = dp->aflag;         // char flag
 404   strip = dp->strip;         // string to strip
 405   appnd = dp->appnd;         // string to append
 406   stripl = dp->stripl;       // length of strip string
 407   appndl = dp->appndl;       // length of append string
 408   numconds = dp->numconds;   // length of the condition
 409   opts = dp->opts;           // cross product flag
 410
 411   // then copy over all of the conditions
 412   if (opts & aeLONGCOND) {
 413     memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
 414     c.l.conds2 = dp->c.l.conds2;
 415   } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
 416
 417   rappnd = myrevstrdup(appnd);
 418   morphcode = dp->morphcode;
 419   contclass = dp->contclass;
 420   contclasslen = dp->contclasslen;
 421 }
 422
 423
 424 SfxEntry::~SfxEntry()
 425 {
 426     aflag = 0;
 427     if (appnd) free(appnd);
 428     if (rappnd) free(rappnd);
 429     if (strip) free(strip);
 430     pmyMgr = NULL;
 431     appnd = NULL;
 432     strip = NULL;
 433     if (opts & aeLONGCOND) free(c.l.conds2);
 434     if (morphcode && !(opts & aeALIASM)) free(morphcode);
 435     if (contclass && !(opts & aeALIASF)) free(contclass);
 436 }
 437
 438 // add suffix to this word assuming conditions hold
 439 char * SfxEntry::add(const char * word, int len)
 440 {
 441     char                tword[MAXWORDUTF8LEN + 4];
 442
 443      /* make sure all conditions match */
 444      if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
 445         (len >= numconds) && test_condition(word + len, word) &&
 446         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
 447         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
 448               /* we have a match so add suffix */
 449               strcpy(tword,word);
 450               if (appndl) {
 451                   strcpy(tword + len - stripl, appnd);
 452               } else {
 453                   *(tword + len - stripl) = '\0';
 454               }
 455               return mystrdup(tword);
 456      }
 457      return NULL;
 458 }
 459
 460 inline char * SfxEntry::nextchar(char * p) {
 461     if (p) {
 462         p++;
 463         if (opts & aeLONGCOND) {
 464             // jump to the 2nd part of the condition
 465             if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
 466         // end of the MAXCONDLEN length condition
 467         } else if (p == c.conds + MAXCONDLEN) return NULL;
 468         return *p ? p : NULL;
 469     }
 470     return NULL;
 471 }
 472
 473 inline int SfxEntry::test_condition(const char * st, const char * beg)
 474 {
         // Matches the end of a word against this entry's condition pattern.
         // st is decremented before its first use, so it is expected to point
         // just past the last character to test; beg is the lowest position
         // that may be read (the callers in this file pass the end and the
         // start of the same word).  The pattern is walked forward with
         // nextchar() while the input is walked backward.
         // Returns 1 when the pattern is satisfied or numconds is 0, else 0.
 475     const char * pos = NULL;    // input position where the open group started; NULL outside groups
 476     bool neg = false;           // negation flag, set by '^'
 477     bool ingroup = false;       // a member of the open group matched the input
 478     if (numconds == 0) return 1; // no condition: always a match
 479     char * p = c.conds;
 480     st--;
         // i is advanced once per closed group and once per matched character
         // outside a group; it is compared with numconds below to finish early
 481     int i = 1;
 482     while (1) {
 483       switch (*p) {
 484         case '\0': return 1; // pattern finished without a mismatch
 485         case '[': { p = nextchar(p); pos = st; break; }
 486         case '^': { p = nextchar(p); neg = true; break; }
             // a plain group without a matching member fails (a negated group
             // with a matching member has already failed in the default branch)
 487         case ']': { if (!neg && !ingroup) return 0;
 488                 i++;
 489                 // skip the next character (only needed when no member matched;
                 // with aeUTF8 set, bytes of the form 10xxxxxx are stepped over first)
 490                 if (!ingroup) {
 491                     for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
 492                     st--;
 493                 }
 494                 pos = NULL;
 495                 neg = false;
 496                 ingroup = false;
 497                 p = nextchar(p);
 498                 if (st < beg && p) return 0; // word <= condition
 499                 break;
 500             }
 501         case '.': if (!pos) { // dots are not metacharacters in groups: [.]
 502                 p = nextchar(p);
 503                 // skip the next character
 504                 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
 505                 if (st < beg) { // word <= condition
 506                     if (p) return 0; else return 1;
 507                 }
 508                 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
 509                     st--;
 510                     if (st < beg) { // word <= condition
 511                         if (p) return 0; else return 1;
 512                     }
 513                 }
 514                 break;
 515             }
             // no break here: inside a group (pos set) a '.' is handled by default
 516     default: {
 517                 if (*st == *p) {
 518                     p = nextchar(p);
 519                     if ((opts & aeUTF8) && (*st & 0x80)) {
                         // byte with the high bit set: compare further bytes,
                         // moving backward in the input and forward in the pattern
 520                         st--;
 521                         while (p && (st >= beg)) {
 522                             if (*p != *st) {
                                 // mismatch: fail outside a group, otherwise
                                 // rewind the input to the start of the group
 523                                 if (!pos) return 0;
 524                                 st = pos;
 525                                 break;
 526                             }
 527                             // first byte of the UTF-8 multibyte character
 528                             if ((*p & 0xc0) != 0x80) break;
 529                             p = nextchar(p);
 530                             st--;
 531                         }
                         // st moved away from pos: a group member matched
 532                         if (pos && st != pos) {
 533                             if (neg) return 0;
 534                             else if (i == numconds) return 1;
 535                             ingroup = true;
                             // skip the remaining members up to ']'
 536                             while (p && *p != ']' && (p = nextchar(p)));
 537                             st--;
 538                         }
 539                         if (p && *p != ']') p = nextchar(p);
 540                     } else if (pos) {
                         // byte matched a group member: a negated group fails,
                         // a match at the last condition position succeeds
 541                         if (neg) return 0;
 542                         else if (i == numconds) return 1;
 543                         ingroup = true;
                         // skip the remaining members up to ']'
 544                         while (p && *p != ']' && (p = nextchar(p)));
 545 //                      if (p && *p != ']') p = nextchar(p);
 546                         st--;
 547                     }
                     // matched character outside a group: count it and step back
 548                     if (!pos) {
 549                         i++;
 550                         st--;
 551                     }
 552                     if (st < beg && p && *p != ']') return 0; // word <= condition
 553                 } else if (pos) { // group
                     // this member did not match, try the next one
 554                     p = nextchar(p);
 555                 } else return 0;
 556             }
 557       }
       // nextchar() returned NULL: the pattern is exhausted, so it matched
 558       if (!p) return 1;
 559     }
 560 }
 561
 562 // see if this suffix is present in the word
 563 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
 564     PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
 565     const FLAG badflag)
 566 {
 567     int                 tmpl;            // length of tmpword
 568     struct hentry *     he;              // hash entry pointer
 569     unsigned char *     cp;
 570     char                tmpword[MAXWORDUTF8LEN + 4];
 571     PfxEntry* ep = ppfx;
 572
 573     // if this suffix is being cross checked with a prefix
 574     // but it does not support cross products skip it
 575
 576     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
 577         return NULL;
 578
 579     // upon entry suffix is 0 length or already matches the end of the word.
 580     // So if the remaining root word has positive length
 581     // and if there are enough chars in root word and added back strip chars
 582     // to meet the number of characters conditions, then test it
 583
 584     tmpl = len - appndl;
 585     // the second condition is not enough for UTF-8 strings
 586     // it checked in test_condition()
 587
 588     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 589         (tmpl + stripl >= numconds)) {
 590
 591             // generate new root word by removing suffix and adding
 592             // back any characters that would have been stripped or
 593             // or null terminating the shorter string
 594
 595             strcpy (tmpword, word);
 596             cp = (unsigned char *)(tmpword + tmpl);
 597             if (stripl) {
 598                 strcpy ((char *)cp, strip);
 599                 tmpl += stripl;
 600                 cp = (unsigned char *)(tmpword + tmpl);
 601             } else *cp = '\0';
 602
 603             // now make sure all of the conditions on characters
 604             // are met.  Please see the appendix at the end of
 605             // this file for more info on exactly what is being
 606             // tested
 607
 608             // if all conditions are met then check if resulting
 609             // root word in the dictionary
 610
 611             if (test_condition((char *) cp, (char *) tmpword)) {
 612
 613 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
 614                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
 615 #endif
 616                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 617                     do {
 618                         // check conditional suffix (enabled by prefix)
 619                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
 620                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 621                             (((optflags & aeXPRODUCT) == 0) ||
 622                             (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
 623                              // enabled by prefix
 624                             ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
 625                             ) &&
 626                             // handle cont. class
 627                             ((!cclass) ||
 628                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 629                             ) &&
 630                             // check only in compound homonyms (bad flags)
 631                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
 632                             ) &&
 633                             // handle required flag
 634                             ((!needflag) ||
 635                               (TESTAFF(he->astr, needflag, he->alen) ||
 636                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 637                             )
 638                         ) return he;
 639                         he = he->next_homonym; // check homonyms
 640                     } while (he);
 641
 642                 // obsolote stemming code (used only by the
 643                 // experimental SuffixMgr:suggest_pos_stems)
 644                 // store resulting root in wlst
 645                 } else if (wlst && (*ns < maxSug)) {
 646                     int cwrd = 1;
 647                     for (int k=0; k < *ns; k++)
 648                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
 649                     if (cwrd) {
 650                         wlst[*ns] = mystrdup(tmpword);
 651                         if (wlst[*ns] == NULL) {
 652                             for (int j=0; j<*ns; j++) free(wlst[j]);
 653                             *ns = -1;
 654                             return NULL;
 655                         }
 656                         (*ns)++;
 657                     }
 658                 }
 659             }
 660     }
 661     return NULL;
 662 }
 663
 664 // see if two-level suffix is present in the word
 665 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
 666     PfxEntry* ppfx, const FLAG needflag)
 667 {
 668     int                 tmpl;            // length of tmpword
 669     struct hentry *     he;              // hash entry pointer
 670     unsigned char *     cp;
 671     char                tmpword[MAXWORDUTF8LEN + 4];
 672     PfxEntry* ep = ppfx;
 673
 674
 675     // if this suffix is being cross checked with a prefix
 676     // but it does not support cross products skip it
 677
 678     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 679         return NULL;
 680
 681     // upon entry suffix is 0 length or already matches the end of the word.
 682     // So if the remaining root word has positive length
 683     // and if there are enough chars in root word and added back strip chars
 684     // to meet the number of characters conditions, then test it
 685
 686     tmpl = len - appndl;
 687
 688     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 689        (tmpl + stripl >= numconds)) {
 690
 691             // generate new root word by removing suffix and adding
 692             // back any characters that would have been stripped or
 693             // or null terminating the shorter string
 694
 695             strcpy (tmpword, word);
 696             cp = (unsigned char *)(tmpword + tmpl);
 697             if (stripl) {
 698                 strcpy ((char *)cp, strip);
 699                 tmpl += stripl;
 700                 cp = (unsigned char *)(tmpword + tmpl);
 701             } else *cp = '\0';
 702
 703             // now make sure all of the conditions on characters
 704             // are met.  Please see the appendix at the end of
 705             // this file for more info on exactly what is being
 706             // tested
 707
 708             // if all conditions are met then recall suffix_check
 709
 710             if (test_condition((char *) cp, (char *) tmpword)) {
 711                 if (ppfx) {
 712                     // handle conditional suffix
 713                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 714                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 715                     else
 716                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
 717                 } else {
 718                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 719                 }
 720                 if (he) return he;
 721             }
 722     }
 723     return NULL;
 724 }
 725
 726 // see if two-level suffix is present in the word
 727 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
 728     PfxEntry* ppfx, const FLAG needflag)
 729 {
 730     int                 tmpl;            // length of tmpword
 731     unsigned char *     cp;
 732     char                tmpword[MAXWORDUTF8LEN + 4];
 733     PfxEntry* ep = ppfx;
 734     char * st;
 735
 736     char result[MAXLNLEN];
 737
 738     *result = '\0';
 739
 740     // if this suffix is being cross checked with a prefix
 741     // but it does not support cross products skip it
 742
 743     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 744         return NULL;
 745
 746     // upon entry suffix is 0 length or already matches the end of the word.
 747     // So if the remaining root word has positive length
 748     // and if there are enough chars in root word and added back strip chars
 749     // to meet the number of characters conditions, then test it
 750
 751     tmpl = len - appndl;
 752
 753     if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
 754        (tmpl + stripl >= numconds)) {
 755
 756             // generate new root word by removing suffix and adding
 757             // back any characters that would have been stripped or
 758             // or null terminating the shorter string
 759
 760             strcpy (tmpword, word);
 761             cp = (unsigned char *)(tmpword + tmpl);
 762             if (stripl) {
 763                 strcpy ((char *)cp, strip);
 764                 tmpl += stripl;
 765                 cp = (unsigned char *)(tmpword + tmpl);
 766             } else *cp = '\0';
 767
 768             // now make sure all of the conditions on characters
 769             // are met.  Please see the appendix at the end of
 770             // this file for more info on exactly what is being
 771             // tested
 772
 773             // if all conditions are met then recall suffix_check
 774
 775             if (test_condition((char *) cp, (char *) tmpword)) {
 776                 if (ppfx) {
 777                     // handle conditional suffix
 778                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
 779                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 780                         if (st) {
 781                             if (ppfx->getMorph()) {
 782                                 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
 783                                 mystrcat(result, " ", MAXLNLEN);
 784                             }
 785                             mystrcat(result,st, MAXLNLEN);
 786                             free(st);
 787                             mychomp(result);
 788                         }
 789                     } else {
 790                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
 791                         if (st) {
 792                             mystrcat(result, st, MAXLNLEN);
 793                             free(st);
 794                             mychomp(result);
 795                         }
 796                     }
 797                 } else {
 798                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 799                         if (st) {
 800                             mystrcat(result, st, MAXLNLEN);
 801                             free(st);
 802                             mychomp(result);
 803                         }
 804                 }
 805                 if (*result) return mystrdup(result);
 806             }
 807     }
 808     return NULL;
 809 }
 810
 811 // get next homonym with same affix
 812 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
 813     const FLAG cclass, const FLAG needflag)
 814 {
 815     PfxEntry* ep = ppfx;
 816     FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
 817
 818     while (he->next_homonym) {
 819         he = he->next_homonym;
 820         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 821                             ((optflags & aeXPRODUCT) == 0 ||
 822                             TESTAFF(he->astr, eFlag, he->alen) ||
 823                              // handle conditional suffix
 824                             ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
 825                             ) &&
 826                             // handle cont. class
 827                             ((!cclass) ||
 828                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 829                             ) &&
 830                             // handle required flag
 831                             ((!needflag) ||
 832                               (TESTAFF(he->astr, needflag, he->alen) ||
 833                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 834                             )
 835                         ) return he;
 836     }
 837     return NULL;
 838 }
 839
 840
 841 #if 0
 842
 843 Appendix:  Understanding Affix Code
 844
 845
 846 An affix is either a  prefix or a suffix attached to root words to make
 847 other words.
 848
 849 Basically a Prefix or a Suffix is a set of AffEntry objects
 850 which store information about the prefix or suffix along
 851 with supporting routines to check if a word has a particular
 852 prefix or suffix or a combination.
 853
 854 The structure affentry is defined as follows:
 855
 856 struct affentry
 857 {
 858    unsigned short aflag;    // ID used to represent the affix
 859    char * strip;            // string to strip before adding affix
 860    char * appnd;            // the affix string to add
 861    unsigned char stripl;    // length of the strip string
 862    unsigned char appndl;    // length of the affix string
 863    char numconds;           // the number of conditions that must be met
 864    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
 865    char   conds[SETSIZE];   // array which encodes the conditions to be met
 866 };
 867
 868
 869 Here is a suffix borrowed from the en_US.aff file.  This file
 870 is whitespace delimited.
 871
 872 SFX D Y 4
 873 SFX D   0     d          e
 874 SFX D   y     ied        [^aeiou]y
 875 SFX D   0     ed         [^ey]
 876 SFX D   0     ed         [aeiou]y
 877
 878 This information can be interpreted as follows:
 879
 880 The first line has 4 fields
 881
 882 Field
 883 -----
 884 1     SFX - indicates this is a suffix
 885 2     D   - is the name of the character flag which represents this suffix
 886 3     Y   - indicates it can be combined with prefixes (cross product)
 887 4     4   - indicates that a sequence of 4 affentry structures is needed to
 888                properly store the affix information
 889
 890 The remaining lines describe the unique information for the 4 SfxEntry
 891 objects that make up this affix.  Each line can be interpreted
 892 as follows: (note fields 1 and 2 are as a check against line 1 info)
 893
 894 Field
 895 -----
 896 1     SFX         - indicates this is a suffix
 897 2     D           - is the name of the character flag for this affix
 898 3     y           - the string of chars to strip off before adding affix
 899                          (a 0 here indicates the NULL string)
 900 4     ied         - the string of affix characters to add
 901 5     [^aeiou]y   - the conditions which must be met before the affix
 902                     can be applied
 903
 904 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
 905 there are 2 conditions that must be met.  The first condition is that
 906 the next to the last character in the word must *NOT* be any of the
 907 following "a", "e", "i", "o" or "u".  The second condition is that
 908 the last character of the word must be "y".
 909
 910 So how can we encode this information concisely and be able to
 911 test for both conditions in a fast manner?  The answer is found
 912 by studying the wonderful ispell code of Geoff Kuenning, et al.
 913 (now available under a normal BSD license).
 914
 915 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
 916 using a character (cast to an unsigned char) of a string, we have 8 bits
 917 of information we can store about that character.  Specifically we
 918 could use each bit to say if that character is allowed in any of the
 919 last (or first for prefixes) 8 characters of the word.
 920
 921 Basically, each character at one end of the word (up to the number
 922 of conditions) is used to index into the conds array and the resulting
 923 value found there says whether that character is valid for a
 924 specific character position in the word.
 925
 926 For prefixes, it does this by setting bit 0 if that char is valid
 927 in the first position, bit 1 if valid in the second position, and so on.
 928
 929 If a bit is not set, then that char is not valid for that position in the
 930 word.
 931
 932 If working with suffixes bit 0 is used for the character closest
 933 to the front, bit 1 for the next character towards the end, ...,
 934 with bit numconds-1 representing the last char at the end of the string.
 935
 936 Note: since entries in the conds[] are 8 bits, only 8 conditions
 937 (read that only 8 character positions) can be examined at one
 938 end of a word (the beginning for prefixes and the end for suffixes).
 939
 940 So to make this clearer, let's encode the conds array values for the
 941 first two affentries for the suffix D described earlier.
 942
 943
 944   For the first affentry:
 945      numconds = 1             (only examine the last character)
 946
 947      conds['e'] =  (1 << 0)   (the word must end in an e)
 948      all others are all 0
 949
 950   For the second affentry:
 951      numconds = 2             (only examine the last two characters)
 952
 953      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
 954          where X is all characters *but* a, e, i, o, or u
 955
 956
 957      conds['y'] = (1 << 1)     (the last char must be a y)
 958      all other bits for all other entries in the conds array are zero
 959
 960
 961 #endif
 962