ext/hunspell/affentry.cxx

   1 #include "license.hunspell"
   2 #include "license.myspell"
   3
   4 #ifndef MOZILLA_CLIENT
   5 #include <cstdlib>
   6 #include <cstring>
   7 #include <cctype>
   8 #include <cstdio>
   9 #else
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <stdio.h>
  13 #include <ctype.h>
  14 #endif
  15
  16 #include "affentry.hxx"
  17 #include "csutil.hxx"
  18
  19 #ifndef MOZILLA_CLIENT
  20 #ifndef W32
  21 using namespace std;
  22 #endif
  23 #endif
  24
  25
  26 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
  27 {
  28   // register affix manager
  29   pmyMgr = pmgr;
  30
  31   // set up its intial values
  32
  33   aflag = dp->aflag;         // flag
  34   strip = dp->strip;         // string to strip
  35   appnd = dp->appnd;         // string to append
  36   stripl = dp->stripl;       // length of strip string
  37   appndl = dp->appndl;       // length of append string
  38   numconds = dp->numconds;   // number of conditions to match
  39   opts = dp->opts;         // cross product flag
  40   // then copy over all of the conditions
  41   memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
  42   next = NULL;
  43   nextne = NULL;
  44   nexteq = NULL;
  45 #ifdef HUNSPELL_EXPERIMENTAL
  46   morphcode = dp->morphcode;
  47 #endif
  48   contclass = dp->contclass;
  49   contclasslen = dp->contclasslen;
  50 }
  51
  52
  53 PfxEntry::~PfxEntry()
  54 {
  55     aflag = 0;
  56     if (appnd) free(appnd);
  57     if (strip) free(strip);
  58     pmyMgr = NULL;
  59     appnd = NULL;
  60     strip = NULL;
  61     if (opts & aeUTF8) {
  62         for (int i = 0; i < 8; i++) {
  63             if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
  64         }
  65     }
  66 #ifdef HUNSPELL_EXPERIMENTAL
  67     if (morphcode && !(opts & aeALIASM)) free(morphcode);
  68 #endif
  69     if (contclass && !(opts & aeALIASF)) free(contclass);
  70 }
  71
  72 // add prefix to this word assuming conditions hold
  73 char * PfxEntry::add(const char * word, int len)
  74 {
  75     char tword[MAXWORDUTF8LEN + 4];
  76
  77     if ((len > stripl) && (len >= numconds) && test_condition(word) &&
  78        (!stripl || (strncmp(word, strip, stripl) == 0)) &&
  79        ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
  80     /* we have a match so add prefix */
  81               char * pp = tword;
  82               if (appndl) {
  83                   strcpy(tword,appnd);
  84                   pp += appndl;
  85                }
  86                strcpy(pp, (word + stripl));
  87                return mystrdup(tword);
  88      }
  89      return NULL;
  90 }
  91
  92
  93 inline int PfxEntry::test_condition(const char * st)
  94 {
  95     int cond;
  96     unsigned char * cp = (unsigned char *)st;
  97     if (!(opts & aeUTF8)) { // 256-character codepage
  98         for (cond = 0;  cond < numconds;  cond++) {
  99             if ((conds.base[*cp++] & (1 << cond)) == 0) return 0;
 100         }
 101     } else { // UTF-8 encoding
 102       unsigned short wc;
 103       for (cond = 0;  cond < numconds;  cond++) {
 104         // a simple 7-bit ASCII character in UTF-8
 105         if ((*cp >> 7) == 0) {
 106             // also check limit (end of word)
 107             if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0;
 108         // UTF-8 multibyte character
 109         } else {
 110             // not dot wildcard in rule
 111             if (!conds.utf8.all[cond]) {
 112                 if (conds.utf8.neg[cond]) {
 113                     u8_u16((w_char *) &wc, 1, (char *) cp);
 114                     if (conds.utf8.wchars[cond] &&
 115                         flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
 116                             wc, (short) conds.utf8.wlen[cond])) return 0;
 117                 } else {
 118                     if (!conds.utf8.wchars[cond]) return 0;
 119                     u8_u16((w_char *) &wc, 1, (char *) cp);
 120                     if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
 121                          wc, (short)conds.utf8.wlen[cond])) return 0;
 122                 }
 123             }
 124             // jump to next UTF-8 character
 125             for(cp++; (*cp & 0xc0) == 0x80; cp++);
 126         }
 127       }
 128     }
 129     return 1;
 130 }
 131
 132
 133 // check if this prefix entry matches
 134 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
 135 {
 136     int                 tmpl;   // length of tmpword
 137     struct hentry *     he;     // hash entry of root word or NULL
 138     char                tmpword[MAXWORDUTF8LEN + 4];
 139
 140     // on entry prefix is 0 length or already matches the beginning of the word.
 141     // So if the remaining root word has positive length
 142     // and if there are enough chars in root word and added back strip chars
 143     // to meet the number of characters conditions, then test it
 144
 145      tmpl = len - appndl;
 146
 147      if ((tmpl > 0) &&  (tmpl + stripl >= numconds)) {
 148
 149             // generate new root word by removing prefix and adding
 150             // back any characters that would have been stripped
 151
 152             if (stripl) strcpy (tmpword, strip);
 153             strcpy ((tmpword + stripl), (word + appndl));
 154
 155             // now make sure all of the conditions on characters
 156             // are met.  Please see the appendix at the end of
 157             // this file for more info on exactly what is being
 158             // tested
 159
 160             // if all conditions are met then check if resulting
 161             // root word in the dictionary
 162
 163             if (test_condition(tmpword)) {
 164                 tmpl += stripl;
 165                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 166                    do {
 167                       if (TESTAFF(he->astr, aflag, he->alen) &&
 168                         // forbid single prefixes with pseudoroot flag
 169                         ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
 170                         // needflag
 171                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 172                          (contclass && TESTAFF(contclass, needflag, contclasslen))))
 173                             return he;
 174                       he = he->next_homonym; // check homonyms
 175                    } while (he);
 176                 }
 177
 178                 // prefix matched but no root word was found
 179                 // if aeXPRODUCT is allowed, try again but now
 180                 // ross checked combined with a suffix
 181
 182                 //if ((opts & aeXPRODUCT) && in_compound) {
 183                 if ((opts & aeXPRODUCT)) {
 184                    he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
 185                         0, NULL, FLAG_NULL, needflag, in_compound);
 186                    if (he) return he;
 187                 }
 188             }
 189      }
 190     return NULL;
 191 }
 192
 193 // check if this prefix entry matches
 194 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
 195     char in_compound, const FLAG needflag)
 196 {
 197     int                 tmpl;   // length of tmpword
 198     struct hentry *     he;     // hash entry of root word or NULL
 199     char                tmpword[MAXWORDUTF8LEN + 4];
 200
 201     // on entry prefix is 0 length or already matches the beginning of the word.
 202     // So if the remaining root word has positive length
 203     // and if there are enough chars in root word and added back strip chars
 204     // to meet the number of characters conditions, then test it
 205
 206      tmpl = len - appndl;
 207
 208      if ((tmpl > 0) &&  (tmpl + stripl >= numconds)) {
 209
 210             // generate new root word by removing prefix and adding
 211             // back any characters that would have been stripped
 212
 213             if (stripl) strcpy (tmpword, strip);
 214             strcpy ((tmpword + stripl), (word + appndl));
 215
 216             // now make sure all of the conditions on characters
 217             // are met.  Please see the appendix at the end of
 218             // this file for more info on exactly what is being
 219             // tested
 220
 221             // if all conditions are met then check if resulting
 222             // root word in the dictionary
 223
 224             if (test_condition(tmpword)) {
 225                 tmpl += stripl;
 226
 227                 // prefix matched but no root word was found
 228                 // if aeXPRODUCT is allowed, try again but now
 229                 // cross checked combined with a suffix
 230
 231                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 232                    he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
 233                    if (he) return he;
 234                 }
 235             }
 236      }
 237     return NULL;
 238 }
 239
 240 #ifdef HUNSPELL_EXPERIMENTAL
 241 // check if this prefix entry matches
 242 char * PfxEntry::check_twosfx_morph(const char * word, int len,
 243          char in_compound, const FLAG needflag)
 244 {
 245     int                 tmpl;   // length of tmpword
 246     char                tmpword[MAXWORDUTF8LEN + 4];
 247
 248     // on entry prefix is 0 length or already matches the beginning of the word.
 249     // So if the remaining root word has positive length
 250     // and if there are enough chars in root word and added back strip chars
 251     // to meet the number of characters conditions, then test it
 252
 253      tmpl = len - appndl;
 254
 255      if ((tmpl > 0) &&  (tmpl + stripl >= numconds)) {
 256
 257             // generate new root word by removing prefix and adding
 258             // back any characters that would have been stripped
 259
 260             if (stripl) strcpy (tmpword, strip);
 261             strcpy ((tmpword + stripl), (word + appndl));
 262
 263             // now make sure all of the conditions on characters
 264             // are met.  Please see the appendix at the end of
 265             // this file for more info on exactly what is being
 266             // tested
 267
 268             // if all conditions are met then check if resulting
 269             // root word in the dictionary
 270
 271             if (test_condition(tmpword)) {
 272                 tmpl += stripl;
 273
 274                 // prefix matched but no root word was found
 275                 // if aeXPRODUCT is allowed, try again but now
 276                 // ross checked combined with a suffix
 277
 278                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 279                     return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
 280                              aeXPRODUCT, (AffEntry *)this, needflag);
 281                 }
 282             }
 283      }
 284     return NULL;
 285 }
 286
 287 // check if this prefix entry matches
 288 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
 289 {
 290     int                 tmpl;   // length of tmpword
 291     struct hentry *     he;     // hash entry of root word or NULL
 292     char                tmpword[MAXWORDUTF8LEN + 4];
 293     char                result[MAXLNLEN];
 294     char * st;
 295
 296     *result = '\0';
 297
 298     // on entry prefix is 0 length or already matches the beginning of the word.
 299     // So if the remaining root word has positive length
 300     // and if there are enough chars in root word and added back strip chars
 301     // to meet the number of characters conditions, then test it
 302
 303      tmpl = len - appndl;
 304
 305      if ((tmpl > 0) &&  (tmpl + stripl >= numconds)) {
 306
 307             // generate new root word by removing prefix and adding
 308             // back any characters that would have been stripped
 309
 310             if (stripl) strcpy (tmpword, strip);
 311             strcpy ((tmpword + stripl), (word + appndl));
 312
 313             // now make sure all of the conditions on characters
 314             // are met.  Please see the appendix at the end of
 315             // this file for more info on exactly what is being
 316             // tested
 317
 318             // if all conditions are met then check if resulting
 319             // root word in the dictionary
 320
 321             if (test_condition(tmpword)) {
 322                 tmpl += stripl;
 323                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 324                     do {
 325                       if (TESTAFF(he->astr, aflag, he->alen) &&
 326                         // forbid single prefixes with pseudoroot flag
 327                         ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
 328                         // needflag
 329                         ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
 330                          (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
 331                             if (morphcode) strcat(result, morphcode); else strcat(result,getKey());
 332                             if (he->description) {
 333                                 if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word);
 334                                 strcat(result,he->description);
 335                             }
 336                             strcat(result, "\n");
 337                       }
 338                       he = he->next_homonym;
 339                     } while (he);
 340                 }
 341
 342                 // prefix matched but no root word was found
 343                 // if aeXPRODUCT is allowed, try again but now
 344                 // ross checked combined with a suffix
 345
 346                 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
 347                    st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
 348                      FLAG_NULL, needflag);
 349                    if (st) {
 350                         strcat(result, st);
 351                         free(st);
 352                    }
 353                 }
 354             }
 355      }
 356
 357     if (*result) return mystrdup(result);
 358     return NULL;
 359 }
 360 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
 361
 362 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
 363 {
 364   // register affix manager
 365   pmyMgr = pmgr;
 366
 367   // set up its intial values
 368   aflag = dp->aflag;         // char flag
 369   strip = dp->strip;         // string to strip
 370   appnd = dp->appnd;         // string to append
 371   stripl = dp->stripl;       // length of strip string
 372   appndl = dp->appndl;       // length of append string
 373   numconds = dp->numconds;   // number of conditions to match
 374   opts = dp->opts;         // cross product flag
 375
 376   // then copy over all of the conditions
 377   memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
 378
 379   rappnd = myrevstrdup(appnd);
 380
 381 #ifdef HUNSPELL_EXPERIMENTAL
 382   morphcode = dp->morphcode;
 383 #endif
 384   contclass = dp->contclass;
 385   contclasslen = dp->contclasslen;
 386 }
 387
 388
 389 SfxEntry::~SfxEntry()
 390 {
 391     aflag = 0;
 392     if (appnd) free(appnd);
 393     if (rappnd) free(rappnd);
 394     if (strip) free(strip);
 395     pmyMgr = NULL;
 396     appnd = NULL;
 397     strip = NULL;
 398     if (opts & aeUTF8) {
 399         for (int i = 0; i < 8; i++) {
 400             if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
 401         }
 402     }
 403 #ifdef HUNSPELL_EXPERIMENTAL
 404     if (morphcode && !(opts & aeALIASM)) free(morphcode);
 405 #endif
 406     if (contclass && !(opts & aeALIASF)) free(contclass);
 407 }
 408
 409 // add suffix to this word assuming conditions hold
 410 char * SfxEntry::add(const char * word, int len)
 411 {
 412     char                tword[MAXWORDUTF8LEN + 4];
 413
 414      /* make sure all conditions match */
 415      if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
 416         (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
 417         ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
 418               /* we have a match so add suffix */
 419               strcpy(tword,word);
 420               if (appndl) {
 421                   strcpy(tword + len - stripl, appnd);
 422               } else {
 423                   *(tword + len - stripl) = '\0';
 424               }
 425               return mystrdup(tword);
 426      }
 427      return NULL;
 428 }
 429
 430
 431 inline int SfxEntry::test_condition(const char * st, const char * beg)
 432 {
 433     int cond;
 434     unsigned char * cp = (unsigned char *) st;
 435     if (!(opts & aeUTF8)) { // 256-character codepage
 436         // Dömölki affix algorithm
 437         for (cond = numconds;  --cond >= 0; ) {
 438             if ((conds.base[*--cp] & (1 << cond)) == 0) return 0;
 439         }
 440     } else { // UTF-8 encoding
 441       unsigned short wc;
 442       for (cond = numconds;  --cond >= 0; ) {
 443         // go to next character position and check limit
 444         if ((char *) --cp < beg) return 0;
 445         // a simple 7-bit ASCII character in UTF-8
 446         if ((*cp >> 7) == 0) {
 447             if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0;
 448         // UTF-8 multibyte character
 449         } else {
 450             // go to first character of UTF-8 multibyte character
 451             for (; (*cp & 0xc0) == 0x80; cp--);
 452             // not dot wildcard in rule
 453             if (!conds.utf8.all[cond]) {
 454                 if (conds.utf8.neg[cond]) {
 455                     u8_u16((w_char *) &wc, 1, (char *) cp);
 456                     if (conds.utf8.wchars[cond] &&
 457                         flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
 458                             wc, (short) conds.utf8.wlen[cond])) return 0;
 459                 } else {
 460                     if (!conds.utf8.wchars[cond]) return 0;
 461                     u8_u16((w_char *) &wc, 1, (char *) cp);
 462                     if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
 463                          wc, (short)conds.utf8.wlen[cond])) return 0;
 464                 }
 465             }
 466         }
 467       }
 468     }
 469     return 1;
 470 }
 471
 472
 473
 474 // see if this suffix is present in the word
 475 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
 476     AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
 477     const FLAG badflag)
 478 {
 479     int                 tmpl;            // length of tmpword
 480     struct hentry *     he;              // hash entry pointer
 481     unsigned char *     cp;
 482     char                tmpword[MAXWORDUTF8LEN + 4];
 483     PfxEntry* ep = (PfxEntry *) ppfx;
 484
 485     // if this suffix is being cross checked with a prefix
 486     // but it does not support cross products skip it
 487
 488     if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
 489         return NULL;
 490
 491     // upon entry suffix is 0 length or already matches the end of the word.
 492     // So if the remaining root word has positive length
 493     // and if there are enough chars in root word and added back strip chars
 494     // to meet the number of characters conditions, then test it
 495
 496     tmpl = len - appndl;
 497     // the second condition is not enough for UTF-8 strings
 498     // it checked in test_condition()
 499
 500     if ((tmpl > 0)  &&  (tmpl + stripl >= numconds)) {
 501
 502             // generate new root word by removing suffix and adding
 503             // back any characters that would have been stripped or
 504             // or null terminating the shorter string
 505
 506             strcpy (tmpword, word);
 507             cp = (unsigned char *)(tmpword + tmpl);
 508             if (stripl) {
 509                 strcpy ((char *)cp, strip);
 510                 tmpl += stripl;
 511                 cp = (unsigned char *)(tmpword + tmpl);
 512             } else *cp = '\0';
 513
 514             // now make sure all of the conditions on characters
 515             // are met.  Please see the appendix at the end of
 516             // this file for more info on exactly what is being            // tested
 517
 518             // if all conditions are met then check if resulting
 519             // root word in the dictionary
 520
 521             if (test_condition((char *) cp, (char *) tmpword)) {
 522
 523 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
 524                 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
 525 #endif
 526                 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
 527                     do {
 528                         // check conditional suffix (enabled by prefix)
 529                         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
 530                                     TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 531                             (((optflags & aeXPRODUCT) == 0) ||
 532                             TESTAFF(he->astr, ep->getFlag(), he->alen) ||
 533                              // enabled by prefix
 534                             ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 535                             ) &&
 536                             // handle cont. class
 537                             ((!cclass) ||
 538                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 539                             ) &&
 540                             // check only in compound homonyms (bad flags)
 541                             (!badflag || !TESTAFF(he->astr, badflag, he->alen)
 542                             ) &&
 543                             // handle required flag
 544                             ((!needflag) ||
 545                               (TESTAFF(he->astr, needflag, he->alen) ||
 546                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 547                             )
 548                         ) return he;
 549                         he = he->next_homonym; // check homonyms
 550                     } while (he);
 551
 552                 // obsolote stemming code (used only by the
 553                 // experimental SuffixMgr:suggest_pos_stems)
 554                 // store resulting root in wlst
 555                 } else if (wlst && (*ns < maxSug)) {
 556                     int cwrd = 1;
 557                     for (int k=0; k < *ns; k++)
 558                         if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
 559                     if (cwrd) {
 560                         wlst[*ns] = mystrdup(tmpword);
 561                         if (wlst[*ns] == NULL) {
 562                             for (int j=0; j<*ns; j++) free(wlst[j]);
 563                             *ns = -1;
 564                             return NULL;
 565                         }
 566                         (*ns)++;
 567                     }
 568                 }
 569             }
 570     }
 571     return NULL;
 572 }
 573
 574 // see if two-level suffix is present in the word
 575 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
 576     AffEntry* ppfx, const FLAG needflag)
 577 {
 578     int                 tmpl;            // length of tmpword
 579     struct hentry *     he;              // hash entry pointer
 580     unsigned char *     cp;
 581     char                tmpword[MAXWORDUTF8LEN + 4];
 582     PfxEntry* ep = (PfxEntry *) ppfx;
 583
 584
 585     // if this suffix is being cross checked with a prefix
 586     // but it does not support cross products skip it
 587
 588     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 589         return NULL;
 590
 591     // upon entry suffix is 0 length or already matches the end of the word.
 592     // So if the remaining root word has positive length
 593     // and if there are enough chars in root word and added back strip chars
 594     // to meet the number of characters conditions, then test it
 595
 596     tmpl = len - appndl;
 597
 598     if ((tmpl > 0)  &&  (tmpl + stripl >= numconds)) {
 599
 600             // generate new root word by removing suffix and adding
 601             // back any characters that would have been stripped or
 602             // or null terminating the shorter string
 603
 604             strcpy (tmpword, word);
 605             cp = (unsigned char *)(tmpword + tmpl);
 606             if (stripl) {
 607                 strcpy ((char *)cp, strip);
 608                 tmpl += stripl;
 609                 cp = (unsigned char *)(tmpword + tmpl);
 610             } else *cp = '\0';
 611
 612             // now make sure all of the conditions on characters
 613             // are met.  Please see the appendix at the end of
 614             // this file for more info on exactly what is being
 615             // tested
 616
 617             // if all conditions are met then recall suffix_check
 618
 619             if (test_condition((char *) cp, (char *) tmpword)) {
 620                 if (ppfx) {
 621                     // handle conditional suffix
 622                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 623                         he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 624                     else
 625                         he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
 626                 } else {
 627                     he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
 628                 }
 629                 if (he) return he;
 630             }
 631     }
 632     return NULL;
 633 }
 634
 635 #ifdef HUNSPELL_EXPERIMENTAL
 636 // see if two-level suffix is present in the word
 637 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
 638     AffEntry* ppfx, const FLAG needflag)
 639 {
 640     int                 tmpl;            // length of tmpword
 641     unsigned char *     cp;
 642     char                tmpword[MAXWORDUTF8LEN + 4];
 643     PfxEntry* ep = (PfxEntry *) ppfx;
 644     char * st;
 645
 646     char result[MAXLNLEN];
 647
 648     *result = '\0';
 649
 650     // if this suffix is being cross checked with a prefix
 651     // but it does not support cross products skip it
 652
 653     if ((optflags & aeXPRODUCT) != 0 &&  (opts & aeXPRODUCT) == 0)
 654         return NULL;
 655
 656     // upon entry suffix is 0 length or already matches the end of the word.
 657     // So if the remaining root word has positive length
 658     // and if there are enough chars in root word and added back strip chars
 659     // to meet the number of characters conditions, then test it
 660
 661     tmpl = len - appndl;
 662
 663     if ((tmpl > 0)  &&  (tmpl + stripl >= numconds)) {
 664
 665             // generate new root word by removing suffix and adding
 666             // back any characters that would have been stripped or
 667             // or null terminating the shorter string
 668
 669             strcpy (tmpword, word);
 670             cp = (unsigned char *)(tmpword + tmpl);
 671             if (stripl) {
 672                 strcpy ((char *)cp, strip);
 673                 tmpl += stripl;
 674                 cp = (unsigned char *)(tmpword + tmpl);
 675             } else *cp = '\0';
 676
 677             // now make sure all of the conditions on characters
 678             // are met.  Please see the appendix at the end of
 679             // this file for more info on exactly what is being
 680             // tested
 681
 682             // if all conditions are met then recall suffix_check
 683
 684             if (test_condition((char *) cp, (char *) tmpword)) {
 685                 if (ppfx) {
 686                     // handle conditional suffix
 687                     if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
 688                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 689                         if (st) {
 690                             if (((PfxEntry *) ppfx)->getMorph()) {
 691                                 strcat(result, ((PfxEntry *) ppfx)->getMorph());
 692                             }
 693                             strcat(result,st);
 694                             free(st);
 695                             mychomp(result);
 696                         }
 697                     } else {
 698                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
 699                         if (st) {
 700                             strcat(result, st);
 701                             free(st);
 702                             mychomp(result);
 703                         }
 704                     }
 705                 } else {
 706                         st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
 707                         if (st) {
 708                             strcat(result, st);
 709                             free(st);
 710                             mychomp(result);
 711                         }
 712                 }
 713                 if (*result) return mystrdup(result);
 714             }
 715     }
 716     return NULL;
 717 }
 718 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
 719
 720 // get next homonym with same affix
 721 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
 722     const FLAG cclass, const FLAG needflag)
 723 {
 724     PfxEntry* ep = (PfxEntry *) ppfx;
 725
 726     while (he->next_homonym) {
 727         he = he->next_homonym;
 728         if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
 729                             ((optflags & aeXPRODUCT) == 0 ||
 730                             TESTAFF(he->astr, ep->getFlag(), he->alen) ||
 731                              // handle conditional suffix
 732                             ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
 733                             ) &&
 734                             // handle cont. class
 735                             ((!cclass) ||
 736                                 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
 737                             ) &&
 738                             // handle required flag
 739                             ((!needflag) ||
 740                               (TESTAFF(he->astr, needflag, he->alen) ||
 741                               ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
 742                             )
 743                         ) return he;
 744     }
 745     return NULL;
 746 }
 747
 748
 749 #if 0
 750
 751 Appendix:  Understanding Affix Code
 752
 753
 754 An affix is either a  prefix or a suffix attached to root words to make
 755 other words.
 756
  757 Basically, a Prefix or a Suffix is a set of AffEntry objects
 758 which store information about the prefix or suffix along
 759 with supporting routines to check if a word has a particular
 760 prefix or suffix or a combination.
 761
 762 The structure affentry is defined as follows:
 763
 764 struct affentry
 765 {
 766    unsigned short aflag;    // ID used to represent the affix
 767    char * strip;            // string to strip before adding affix
 768    char * appnd;            // the affix string to add
 769    unsigned char stripl;    // length of the strip string
 770    unsigned char appndl;    // length of the affix string
 771    char numconds;           // the number of conditions that must be met
 772    char opts;               // flag: aeXPRODUCT- combine both prefix and suffix
 773    char   conds[SETSIZE];   // array which encodes the conditions to be met
 774 };
 775
 776
 777 Here is a suffix borrowed from the en_US.aff file.  This file
 778 is whitespace delimited.
 779
 780 SFX D Y 4
 781 SFX D   0     e          d
 782 SFX D   y     ied        [^aeiou]y
 783 SFX D   0     ed         [^ey]
 784 SFX D   0     ed         [aeiou]y
 785
 786 This information can be interpreted as follows:
 787
 788 The first line has 4 fields
 789
 790 Field
 791 -----
 792 1     SFX - indicates this is a suffix
 793 2     D   - is the name of the character flag which represents this suffix
 794 3     Y   - indicates it can be combined with prefixes (cross product)
 795 4     4   - indicates that a sequence of 4 affentry structures is needed to
 796                properly store the affix information
 797
 798 The remaining lines describe the unique information for the 4 SfxEntry
 799 objects that make up this affix.  Each line can be interpreted
 800 as follows: (note fields 1 and 2 are as a check against line 1 info)
 801
 802 Field
 803 -----
 804 1     SFX         - indicates this is a suffix
 805 2     D           - is the name of the character flag for this affix
 806 3     y           - the string of chars to strip off before adding affix
 807                          (a 0 here indicates the NULL string)
 808 4     ied         - the string of affix characters to add
 809 5     [^aeiou]y   - the conditions which must be met before the affix
 810                     can be applied
 811
 812 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
 813 there are 2 conditions that must be met.  The first condition is that
 814 the next to the last character in the word must *NOT* be any of the
 815 following "a", "e", "i", "o" or "u".  The second condition is that
 816 the last character of the word must be "y".
 817
 818 So how can we encode this information concisely and be able to
 819 test for both conditions in a fast manner?  The answer is found
 820 by studying the wonderful ispell code of Geoff Kuenning, et al.
 821 (now available under a normal BSD license).
 822
 823 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
 824 using a character (cast to an unsigned char) of a string, we have 8 bits
 825 of information we can store about that character.  Specifically we
 826 could use each bit to say if that character is allowed in any of the
 827 last (or first for prefixes) 8 characters of the word.
 828
 829 Basically, each character at one end of the word (up to the number
 830 of conditions) is used to index into the conds array and the resulting
 831 value found there says whether that character is valid for a
 832 specific character position in the word.
 833
 834 For prefixes, it does this by setting bit 0 if that char is valid
 835 in the first position, bit 1 if valid in the second position, and so on.
 836
 837 If a bit is not set, then that char is not valid for that position in the
 838 word.
 839
 840 If working with suffixes bit 0 is used for the character closest
 841 to the front, bit 1 for the next character towards the end, ...,
 842 with bit numconds-1 representing the last char at the end of the string.
 843
 844 Note: since entries in the conds[] are 8 bits, only 8 conditions
 845 (read that only 8 character positions) can be examined at one
 846 end of a word (the beginning for prefixes and the end for suffixes).
 847
 848 So to make this clearer, lets encode the conds array values for the
 849 first two affentries for the suffix D described earlier.
 850
 851
 852   For the first affentry:
 853      numconds = 1             (only examine the last character)
 854
 855      conds['e'] =  (1 << 0)   (the word must end in an 'e')
 856      all others are all 0
 857
 858   For the second affentry:
 859      numconds = 2             (only examine the last two characters)
 860
 861      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
 862          where X is all characters *but* a, e, i, o, or u
 863
 864
 865      conds['y'] = (1 << 1)     (the last char must be a y)
 866      all other bits for all other entries in the conds array are zero
 867
 868
 869 #endif
 870