ext/hunspell/affixmgr.cxx

   1 #include "license.hunspell"
   2 #include "license.myspell"
   3
   4 #ifndef MOZILLA_CLIENT
   5 #include <cstdlib>
   6 #include <cstring>
   7 #include <cctype>
   8 #include <cstdio>
   9 #else
  10 #include <stdlib.h>
  11 #include <string.h>
  12 #include <stdio.h>
  13 #include <ctype.h>
  14 #endif
  15
  16 #include "affixmgr.hxx"
  17 #include "affentry.hxx"
  18 #include "langnum.hxx"
  19
  20 #include "csutil.hxx"
  21
  22 #ifndef MOZILLA_CLIENT
  23 #ifndef W32
  24 using namespace std;
  25 #endif
  26 #endif
  27
  28 AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
  29 {
  30   // register hash manager and load affix data from aff file
  31   pHMgr = ptr;
  32   trystring = NULL;
  33   encoding=NULL;
  34   utf8 = 0;
  35   complexprefixes = 0;
  36   maptable = NULL;
  37   nummap = 0;
  38   breaktable = NULL;
  39   numbreak = 0;
  40   reptable = NULL;
  41   numrep = 0;
  42   checkcpdtable = NULL;
  43   numcheckcpd = 0;
  44   defcpdtable = NULL;
  45   numdefcpd = 0;
  46   compoundflag = FLAG_NULL; // permits word in compound forms
  47   compoundbegin = FLAG_NULL; // may be first word in compound forms
  48   compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  49   compoundend = FLAG_NULL; // may be last word in compound forms
  50   compoundroot = FLAG_NULL; // compound word signing flag
  51   compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  52   compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  53   checkcompounddup = 0; // forbid double words in compounds
  54   checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  55   checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  56   checkcompoundtriple = 0; // forbid compounds with triple letters
  57   forbiddenword = FLAG_NULL; // forbidden word signing flag
  58   nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  59   lang = NULL; // language
  60   langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  61   pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes
  62   cpdwordmax = -1; // default: unlimited wordcount in compound words
  63   cpdmin = -1;  // undefined
  64   cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  65   cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  66   cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  67   cpdvowels_utf16_len=0; // vowels
  68   pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  69   sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  70   cpdsyllablenum=NULL; // syllable count incrementing flag
  71   checknum=0; // checking numbers, and word with numbers
  72   wordchars=NULL; // letters + spec. word characters
  73   wordchars_utf16=NULL; // letters + spec. word characters
  74   wordchars_utf16_len=0; // letters + spec. word characters
  75   ignorechars=NULL; // letters + spec. word characters
  76   ignorechars_utf16=NULL; // letters + spec. word characters
  77   ignorechars_utf16_len=0; // letters + spec. word characters
  78   version=NULL; // affix and dictionary file version string
  79   havecontclass=0; // flags of possible continuing classes (double affix)
  80   // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  81   // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  82   lemma_present = FLAG_NULL;
  83   circumfix = FLAG_NULL;
  84   onlyincompound = FLAG_NULL;
  85   flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  86   maxngramsugs = -1; // undefined
  87   nosplitsugs = 0;
  88   sugswithdots = 0;
  89   keepcase = 0;
  90   checksharps = 0;
  91
  92   derived = NULL; // XXX not threadsafe variable for experimental stemming
  93   sfx = NULL;
  94   pfx = NULL;
  95
  96   for (int i=0; i < SETSIZE; i++) {
  97      pStart[i] = NULL;
  98      sStart[i] = NULL;
  99      pFlag[i] = NULL;
 100      sFlag[i] = NULL;
 101   }
 102
 103   for (int j=0; j < CONTSIZE; j++) {
 104     contclasses[j] = 0;
 105   }
 106
 107   if (parse_file(affpath)) {
 108      HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
 109      wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
 110   }
 111
 112   if (cpdmin == -1) cpdmin = MINCPDLEN;
 113
 114 }
 115
 116
 117 AffixMgr::~AffixMgr()
 118 {
 119
 120   // pass through linked prefix entries and clean up
 121   for (int i=0; i < SETSIZE ;i++) {
 122        pFlag[i] = NULL;
 123        PfxEntry * ptr = (PfxEntry *)pStart[i];
 124        PfxEntry * nptr = NULL;
 125        while (ptr) {
 126             nptr = ptr->getNext();
 127             delete(ptr);
 128             ptr = nptr;
 129             nptr = NULL;
 130        }
 131   }
 132
 133   // pass through linked suffix entries and clean up
 134   for (int j=0; j < SETSIZE ; j++) {
 135        sFlag[j] = NULL;
 136        SfxEntry * ptr = (SfxEntry *)sStart[j];
 137        SfxEntry * nptr = NULL;
 138        while (ptr) {
 139             nptr = ptr->getNext();
 140             delete(ptr);
 141             ptr = nptr;
 142             nptr = NULL;
 143        }
 144        sStart[j] = NULL;
 145   }
 146
 147   if (trystring) free(trystring);
 148   trystring=NULL;
 149   if (encoding) free(encoding);
 150   encoding=NULL;
 151   if (maptable) {
 152      for (int j=0; j < nummap; j++) {
 153         if (maptable[j].set) free(maptable[j].set);
 154         if (maptable[j].set_utf16) free(maptable[j].set_utf16);
 155         maptable[j].set = NULL;
 156         maptable[j].len = 0;
 157      }
 158      free(maptable);
 159      maptable = NULL;
 160   }
 161   nummap = 0;
 162   if (breaktable) {
 163      for (int j=0; j < numbreak; j++) {
 164         if (breaktable[j]) free(breaktable[j]);
 165         breaktable[j] = NULL;
 166      }
 167      free(breaktable);
 168      breaktable = NULL;
 169   }
 170   numbreak = 0;
 171   if (reptable) {
 172      for (int j=0; j < numrep; j++) {
 173         free(reptable[j].pattern);
 174         free(reptable[j].pattern2);
 175         reptable[j].pattern = NULL;
 176         reptable[j].pattern2 = NULL;
 177      }
 178      free(reptable);
 179      reptable = NULL;
 180   }
 181   if (defcpdtable) {
 182      for (int j=0; j < numdefcpd; j++) {
 183         free(defcpdtable[j].def);
 184         defcpdtable[j].def = NULL;
 185      }
 186      free(defcpdtable);
 187      defcpdtable = NULL;
 188   }
 189   numrep = 0;
 190   if (checkcpdtable) {
 191      for (int j=0; j < numcheckcpd; j++) {
 192         free(checkcpdtable[j].pattern);
 193         free(checkcpdtable[j].pattern2);
 194         checkcpdtable[j].pattern = NULL;
 195         checkcpdtable[j].pattern2 = NULL;
 196      }
 197      free(checkcpdtable);
 198      checkcpdtable = NULL;
 199   }
 200   numcheckcpd = 0;
 201   FREE_FLAG(compoundflag);
 202   FREE_FLAG(compoundbegin);
 203   FREE_FLAG(compoundmiddle);
 204   FREE_FLAG(compoundend);
 205   FREE_FLAG(compoundpermitflag);
 206   FREE_FLAG(compoundforbidflag);
 207   FREE_FLAG(compoundroot);
 208   FREE_FLAG(forbiddenword);
 209   FREE_FLAG(nosuggest);
 210   FREE_FLAG(pseudoroot);
 211   FREE_FLAG(lemma_present);
 212   FREE_FLAG(circumfix);
 213   FREE_FLAG(onlyincompound);
 214
 215   cpdwordmax = 0;
 216   pHMgr = NULL;
 217   cpdmin = 0;
 218   cpdmaxsyllable = 0;
 219   if (cpdvowels) free(cpdvowels);
 220   if (cpdvowels_utf16) free(cpdvowels_utf16);
 221   if (cpdsyllablenum) free(cpdsyllablenum);
 222   free_utf_tbl();
 223   if (lang) free(lang);
 224   if (wordchars) free(wordchars);
 225   if (wordchars_utf16) free(wordchars_utf16);
 226   if (ignorechars) free(ignorechars);
 227   if (ignorechars_utf16) free(ignorechars_utf16);
 228   if (version) free(version);
 229   if (derived) free(derived);
 230   checknum=0;
 231 }
 232
 233
 234 // read in aff file and build up prefix and suffix entry objects
 235 int  AffixMgr::parse_file(const char * affpath)
 236 {
 237
 238   // io buffers
 239   char line[MAXLNLEN+1];
 240
 241   // affix type
 242   char ft;
 243
 244   // checking flag duplication
 245   char dupflags[CONTSIZE];
 246   char dupflags_ini = 1;
 247
 248   // first line indicator for removing byte order mark
 249   int firstline = 1;
 250
 251   // open the affix file
 252   FILE * afflst;
 253   afflst = fopen(affpath,"r");
 254   if (!afflst) {
 255     HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
 256     return 1;
 257   }
 258
 259   // step one is to parse the affix file building up the internal
 260   // affix data structures
 261
 262
 263     // read in each line ignoring any that do not
 264     // start with a known line type indicator
 265     while (fgets(line,MAXLNLEN,afflst)) {
 266        mychomp(line);
 267
 268        /* remove byte order mark */
 269        if (firstline) {
 270          firstline = 0;
 271          if (strncmp(line,"",3) == 0) {
 272             memmove(line, line+3, strlen(line+3)+1);
 273             HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
 274          }
 275        }
 276
 277        /* parse in the try string */
 278        if (strncmp(line,"TRY",3) == 0) {
 279           if (parse_string(line, &trystring, "TRY")) {
 280              fclose(afflst);
 281              return 1;
 282           }
 283        }
 284
 285        /* parse in the name of the character set used by the .dict and .aff */
 286        if (strncmp(line,"SET",3) == 0) {
 287           if (parse_string(line, &encoding, "SET")) {
 288              fclose(afflst);
 289              return 1;
 290           }
 291           if (strcmp(encoding, "UTF-8") == 0) {
 292              utf8 = 1;
 293 #ifndef OPENOFFICEORG
 294 #ifndef MOZILLA_CLIENT
 295              if (initialize_utf_tbl()) {
 296                fclose(afflst);
 297                return 1;
 298              }
 299 #endif
 300 #endif
 301           }
 302        }
 303
 304        /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
 305        if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
 306                    complexprefixes = 1;
 307
 308        /* parse in the flag used by the controlled compound words */
 309        if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
 310           if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
 311              fclose(afflst);
 312              return 1;
 313           }
 314        }
 315
 316        /* parse in the flag used by compound words */
 317        if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
 318           if (complexprefixes) {
 319             if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
 320               fclose(afflst);
 321               return 1;
 322             }
 323           } else {
 324             if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
 325               fclose(afflst);
 326               return 1;
 327             }
 328           }
 329        }
 330
 331        /* parse in the flag used by compound words */
 332        if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
 333           if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
 334              fclose(afflst);
 335              return 1;
 336           }
 337        }
 338        /* parse in the flag used by compound words */
 339        if (strncmp(line,"COMPOUNDEND",11) == 0) {
 340           if (complexprefixes) {
 341             if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
 342               fclose(afflst);
 343               return 1;
 344             }
 345           } else {
 346             if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
 347               fclose(afflst);
 348               return 1;
 349             }
 350           }
 351        }
 352
 353        /* parse in the data used by compound_check() method */
 354        if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
 355           if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
 356              fclose(afflst);
 357              return 1;
 358           }
 359        }
 360
 361        /* parse in the flag sign compounds in dictionary */
 362        if (strncmp(line,"COMPOUNDROOT",12) == 0) {
 363           if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
 364              fclose(afflst);
 365              return 1;
 366           }
 367        }
 368
 369        /* parse in the flag used by compound_check() method */
 370        if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
 371           if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
 372              fclose(afflst);
 373              return 1;
 374           }
 375        }
 376
 377        /* parse in the flag used by compound_check() method */
 378        if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
 379           if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
 380              fclose(afflst);
 381              return 1;
 382           }
 383        }
 384
 385        if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
 386                    checkcompounddup = 1;
 387        }
 388
 389        if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
 390                    checkcompoundrep = 1;
 391        }
 392
 393        if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
 394                    checkcompoundtriple = 1;
 395        }
 396
 397        if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
 398                    checkcompoundcase = 1;
 399        }
 400
 401        if (strncmp(line,"NOSUGGEST",9) == 0) {
 402           if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
 403              fclose(afflst);
 404              return 1;
 405           }
 406        }
 407
 408        /* parse in the flag used by forbidden words */
 409        if (strncmp(line,"FORBIDDENWORD",13) == 0) {
 410           if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
 411              fclose(afflst);
 412              return 1;
 413           }
 414        }
 415
 416        /* parse in the flag used by forbidden words */
 417        if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
 418           if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
 419              fclose(afflst);
 420              return 1;
 421           }
 422        }
 423
 424        /* parse in the flag used by circumfixes */
 425        if (strncmp(line,"CIRCUMFIX",9) == 0) {
 426           if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
 427              fclose(afflst);
 428              return 1;
 429           }
 430        }
 431
 432        /* parse in the flag used by fogemorphemes */
 433        if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
 434           if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
 435              fclose(afflst);
 436              return 1;
 437           }
 438        }
 439
 440        /* parse in the flag used by `pseudoroots' */
 441        if (strncmp(line,"PSEUDOROOT",10) == 0) {
 442           if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) {
 443              fclose(afflst);
 444              return 1;
 445           }
 446        }
 447
 448        /* parse in the flag used by `pseudoroots' */
 449        if (strncmp(line,"NEEDAFFIX",9) == 0) {
 450           if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) {
 451              fclose(afflst);
 452              return 1;
 453           }
 454        }
 455
 456        /* parse in the minimal length for words in compounds */
 457        if (strncmp(line,"COMPOUNDMIN",11) == 0) {
 458           if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
 459              fclose(afflst);
 460              return 1;
 461           }
 462           if (cpdmin < 1) cpdmin = 1;
 463        }
 464
 465        /* parse in the max. words and syllables in compounds */
 466        if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
 467           if (parse_cpdsyllable(line)) {
 468              fclose(afflst);
 469              return 1;
 470           }
 471        }
 472
 473        /* parse in the flag used by compound_check() method */
 474        if (strncmp(line,"SYLLABLENUM",11) == 0) {
 475           if (parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) {
 476              fclose(afflst);
 477              return 1;
 478           }
 479        }
 480
 481        /* parse in the flag used by the controlled compound words */
 482        if (strncmp(line,"CHECKNUM",8) == 0) {
 483            checknum=1;
 484        }
 485
 486        /* parse in the extra word characters */
 487        if (strncmp(line,"WORDCHARS",9) == 0) {
 488           if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8)) {
 489              fclose(afflst);
 490              return 1;
 491           }
 492        }
 493
 494        /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
 495        if (strncmp(line,"IGNORE",6) == 0) {
 496           if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
 497              fclose(afflst);
 498              return 1;
 499           }
 500        }
 501
 502        /* parse in the typical fault correcting table */
 503        if (strncmp(line,"REP",3) == 0) {
 504           if (parse_reptable(line, afflst)) {
 505              fclose(afflst);
 506              return 1;
 507           }
 508        }
 509
 510        /* parse in the checkcompoundpattern table */
 511        if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
 512           if (parse_checkcpdtable(line, afflst)) {
 513              fclose(afflst);
 514              return 1;
 515           }
 516        }
 517
 518        /* parse in the defcompound table */
 519        if (strncmp(line,"COMPOUNDRULE",12) == 0) {
 520           if (parse_defcpdtable(line, afflst)) {
 521              fclose(afflst);
 522              return 1;
 523           }
 524        }
 525
 526        /* parse in the related character map table */
 527        if (strncmp(line,"MAP",3) == 0) {
 528           if (parse_maptable(line, afflst)) {
 529              fclose(afflst);
 530              return 1;
 531           }
 532        }
 533
 534        /* parse in the word breakpoints table */
 535        if (strncmp(line,"BREAK",5) == 0) {
 536           if (parse_breaktable(line, afflst)) {
 537              fclose(afflst);
 538              return 1;
 539           }
 540        }
 541
 542        /* parse in the language for language specific codes */
 543        if (strncmp(line,"LANG",4) == 0) {
 544           if (parse_string(line, &lang, "LANG")) {
 545              fclose(afflst);
 546              return 1;
 547           }
 548           langnum = get_lang_num(lang);
 549        }
 550
 551        if (strncmp(line,"VERSION",7) == 0) {
 552           if (parse_string(line, &version, "VERSION")) {
 553              fclose(afflst);
 554              return 1;
 555           }
 556        }
 557
 558        if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
 559           if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
 560              fclose(afflst);
 561              return 1;
 562           }
 563        }
 564
 565        if (strncmp(line,"NOSPLITSUGS",11) == 0) {
 566                    nosplitsugs=1;
 567        }
 568
 569        if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
 570                    sugswithdots=1;
 571        }
 572
 573        /* parse in the flag used by forbidden words */
 574        if (strncmp(line,"KEEPCASE",8) == 0) {
 575           if (parse_flag(line, &keepcase, "KEEPCASE")) {
 576              fclose(afflst);
 577              return 1;
 578           }
 579        }
 580
 581        if (strncmp(line,"CHECKSHARPS",11) == 0) {
 582                    checksharps=1;
 583        }
 584
 585        /* parse this affix: P - prefix, S - suffix */
 586        ft = ' ';
 587        if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
 588        if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
 589        if (ft != ' ') {
 590           if (dupflags_ini) {
 591             for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
 592             dupflags_ini = 0;
 593           }
 594           if (parse_affix(line, ft, afflst, dupflags)) {
 595              fclose(afflst);
 596              process_pfx_tree_to_list();
 597              process_sfx_tree_to_list();
 598              return 1;
 599           }
 600        }
 601
 602     }
 603     fclose(afflst);
 604
 605     // convert affix trees to sorted list
 606     process_pfx_tree_to_list();
 607     process_sfx_tree_to_list();
 608
 609     // now we can speed up performance greatly taking advantage of the
 610     // relationship between the affixes and the idea of "subsets".
 611
 612     // View each prefix as a potential leading subset of another and view
 613     // each suffix (reversed) as a potential trailing subset of another.
 614
 615     // To illustrate this relationship if we know the prefix "ab" is found in the
 616     // word to examine, only prefixes that "ab" is a leading subset of need be examined.
 617     // Furthermore is "ab" is not present then none of the prefixes that "ab" is
 618     // is a subset need be examined.
 619     // The same argument goes for suffix string that are reversed.
 620
 621     // Then to top this off why not examine the first char of the word to quickly
 622     // limit the set of prefixes to examine (i.e. the prefixes to examine must
 623     // be leading supersets of the first character of the word (if they exist)
 624
 625     // To take advantage of this "subset" relationship, we need to add two links
 626     // from entry.  One to take next if the current prefix is found (call it nexteq)
 627     // and one to take next if the current prefix is not found (call it nextne).
 628
 629     // Since we have built ordered lists, all that remains is to properly intialize
 630     // the nextne and nexteq pointers that relate them
 631
 632     process_pfx_order();
 633     process_sfx_order();
 634
 635     // expand wordchars string, based on csutil (for external tokenization)
 636
 637     char * enc = get_encoding();
 638     csconv = get_current_cs(enc);
 639     free(enc);
 640     enc = NULL;
 641
 642     char expw[MAXLNLEN];
 643     if (wordchars) {
 644         strcpy(expw, wordchars);
 645         free(wordchars);
 646     } else *expw = '\0';
 647
 648     for (int i = 0; i <= 255; i++) {
 649         if ( (csconv[i].cupper != csconv[i].clower) &&
 650             (! strchr(expw, (char) i))) {
 651                 *(expw + strlen(expw) + 1) = '\0';
 652                 *(expw + strlen(expw)) = (char) i;
 653         }
 654     }
 655
 656     wordchars = mystrdup(expw);
 657
 658     // temporary BREAK definition for German dash handling (OOo issue 64400)
 659     if ((langnum == LANG_de) && (!breaktable)) {
 660         breaktable = (char **) malloc(sizeof(char *));
 661         if (!breaktable) return 1;
 662         breaktable[0] = mystrdup("-");
 663         numbreak = 1;
 664     }
 665     return 0;
 666 }
 667
 668
 669 // we want to be able to quickly access prefix information
 670 // both by prefix flag, and sorted by prefix string itself
 671 // so we need to set up two indexes
 672
 673 int AffixMgr::build_pfxtree(AffEntry* pfxptr)
 674 {
 675   PfxEntry * ptr;
 676   PfxEntry * pptr;
 677   PfxEntry * ep = (PfxEntry*) pfxptr;
 678
 679   // get the right starting points
 680   const char * key = ep->getKey();
 681   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
 682
 683   // first index by flag which must exist
 684   ptr = (PfxEntry*)pFlag[flg];
 685   ep->setFlgNxt(ptr);
 686   pFlag[flg] = (AffEntry *) ep;
 687
 688
 689   // handle the special case of null affix string
 690   if (strlen(key) == 0) {
 691     // always inset them at head of list at element 0
 692      ptr = (PfxEntry*)pStart[0];
 693      ep->setNext(ptr);
 694      pStart[0] = (AffEntry*)ep;
 695      return 0;
 696   }
 697
 698   // now handle the normal case
 699   ep->setNextEQ(NULL);
 700   ep->setNextNE(NULL);
 701
 702   unsigned char sp = *((const unsigned char *)key);
 703   ptr = (PfxEntry*)pStart[sp];
 704
 705   // handle the first insert
 706   if (!ptr) {
 707      pStart[sp] = (AffEntry*)ep;
 708      return 0;
 709   }
 710
 711
 712   // otherwise use binary tree insertion so that a sorted
 713   // list can easily be generated later
 714   pptr = NULL;
 715   for (;;) {
 716     pptr = ptr;
 717     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
 718        ptr = ptr->getNextEQ();
 719        if (!ptr) {
 720           pptr->setNextEQ(ep);
 721           break;
 722        }
 723     } else {
 724        ptr = ptr->getNextNE();
 725        if (!ptr) {
 726           pptr->setNextNE(ep);
 727           break;
 728        }
 729     }
 730   }
 731   return 0;
 732 }
 733
 734 // we want to be able to quickly access suffix information
 735 // both by suffix flag, and sorted by the reverse of the
 736 // suffix string itself; so we need to set up two indexes
 737 int AffixMgr::build_sfxtree(AffEntry* sfxptr)
 738 {
 739   SfxEntry * ptr;
 740   SfxEntry * pptr;
 741   SfxEntry * ep = (SfxEntry *) sfxptr;
 742
 743   /* get the right starting point */
 744   const char * key = ep->getKey();
 745   const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);
 746
 747   // first index by flag which must exist
 748   ptr = (SfxEntry*)sFlag[flg];
 749   ep->setFlgNxt(ptr);
 750   sFlag[flg] = (AffEntry *) ep;
 751
 752   // next index by affix string
 753
 754   // handle the special case of null affix string
 755   if (strlen(key) == 0) {
 756     // always inset them at head of list at element 0
 757      ptr = (SfxEntry*)sStart[0];
 758      ep->setNext(ptr);
 759      sStart[0] = (AffEntry*)ep;
 760      return 0;
 761   }
 762
 763   // now handle the normal case
 764   ep->setNextEQ(NULL);
 765   ep->setNextNE(NULL);
 766
 767   unsigned char sp = *((const unsigned char *)key);
 768   ptr = (SfxEntry*)sStart[sp];
 769
 770   // handle the first insert
 771   if (!ptr) {
 772      sStart[sp] = (AffEntry*)ep;
 773      return 0;
 774   }
 775
 776   // otherwise use binary tree insertion so that a sorted
 777   // list can easily be generated later
 778   pptr = NULL;
 779   for (;;) {
 780     pptr = ptr;
 781     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
 782        ptr = ptr->getNextEQ();
 783        if (!ptr) {
 784           pptr->setNextEQ(ep);
 785           break;
 786        }
 787     } else {
 788        ptr = ptr->getNextNE();
 789        if (!ptr) {
 790           pptr->setNextNE(ep);
 791           break;
 792        }
 793     }
 794   }
 795   return 0;
 796 }
 797
 798 // convert from binary tree to sorted list
 799 int AffixMgr::process_pfx_tree_to_list()
 800 {
 801   for (int i=1; i< SETSIZE; i++) {
 802     pStart[i] = process_pfx_in_order(pStart[i],NULL);
 803   }
 804   return 0;
 805 }
 806
 807
 808 AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
 809 {
 810   if (ptr) {
 811     nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
 812     ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
 813     nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
 814   }
 815   return nptr;
 816 }
 817
 818
 819 // convert from binary tree to sorted list
 820 int AffixMgr:: process_sfx_tree_to_list()
 821 {
 822   for (int i=1; i< SETSIZE; i++) {
 823     sStart[i] = process_sfx_in_order(sStart[i],NULL);
 824   }
 825   return 0;
 826 }
 827
 828 AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
 829 {
 830   if (ptr) {
 831     nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
 832     ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
 833     nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
 834   }
 835   return nptr;
 836 }
 837
 838
 839 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
 840 // using the idea of leading subsets this time
 841 int AffixMgr::process_pfx_order()
 842 {
 843     PfxEntry* ptr;
 844
 845     // loop through each prefix list starting point
 846     for (int i=1; i < SETSIZE; i++) {
 847
 848          ptr = (PfxEntry*)pStart[i];
 849
 850          // look through the remainder of the list
 851          //  and find next entry with affix that
 852          // the current one is not a subset of
 853          // mark that as destination for NextNE
 854          // use next in list that you are a subset
 855          // of as NextEQ
 856
 857          for (; ptr != NULL; ptr = ptr->getNext()) {
 858
 859              PfxEntry * nptr = ptr->getNext();
 860              for (; nptr != NULL; nptr = nptr->getNext()) {
 861                  if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
 862              }
 863              ptr->setNextNE(nptr);
 864              ptr->setNextEQ(NULL);
 865              if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
 866                  ptr->setNextEQ(ptr->getNext());
 867          }
 868
 869          // now clean up by adding smart search termination strings:
 870          // if you are already a superset of the previous prefix
 871          // but not a subset of the next, search can end here
 872          // so set NextNE properly
 873
 874          ptr = (PfxEntry *) pStart[i];
 875          for (; ptr != NULL; ptr = ptr->getNext()) {
 876              PfxEntry * nptr = ptr->getNext();
 877              PfxEntry * mptr = NULL;
 878              for (; nptr != NULL; nptr = nptr->getNext()) {
 879                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
 880                  mptr = nptr;
 881              }
 882              if (mptr) mptr->setNextNE(NULL);
 883          }
 884     }
 885     return 0;
 886 }
 887
 888 // initialize the SfxEntry links NextEQ and NextNE to speed searching
 889 // using the idea of leading subsets this time
 890 int AffixMgr::process_sfx_order()
 891 {
 892     SfxEntry* ptr;
 893
 894     // loop through each prefix list starting point
 895     for (int i=1; i < SETSIZE; i++) {
 896
 897          ptr = (SfxEntry *) sStart[i];
 898
 899          // look through the remainder of the list
 900          //  and find next entry with affix that
 901          // the current one is not a subset of
 902          // mark that as destination for NextNE
 903          // use next in list that you are a subset
 904          // of as NextEQ
 905
 906          for (; ptr != NULL; ptr = ptr->getNext()) {
 907              SfxEntry * nptr = ptr->getNext();
 908              for (; nptr != NULL; nptr = nptr->getNext()) {
 909                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
 910              }
 911              ptr->setNextNE(nptr);
 912              ptr->setNextEQ(NULL);
 913              if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey()))
 914                  ptr->setNextEQ(ptr->getNext());
 915          }
 916
 917
 918          // now clean up by adding smart search termination strings:
 919          // if you are already a superset of the previous suffix
 920          // but not a subset of the next, search can end here
 921          // so set NextNE properly
 922
 923          ptr = (SfxEntry *) sStart[i];
 924          for (; ptr != NULL; ptr = ptr->getNext()) {
 925              SfxEntry * nptr = ptr->getNext();
 926              SfxEntry * mptr = NULL;
 927              for (; nptr != NULL; nptr = nptr->getNext()) {
 928                  if (! isSubset(ptr->getKey(),nptr->getKey())) break;
 929                  mptr = nptr;
 930              }
 931              if (mptr) mptr->setNextNE(NULL);
 932          }
 933     }
 934     return 0;
 935 }
 936
 937
 938
// Takes an aff file condition string and creates the conds array - please
// see the appendix at the end of the file affentry.cxx which describes what
// is going on here in much more detail.
//
//   ptr - affix entry whose conds tables and numconds are filled in
//   cs  - condition string: a sequence of conditions, each one being a single
//         character, '.' (any character) or a bracket group "[...]" /
//         "[^...]" (complemented group); the string "." alone means
//         "no condition"
//
// Condition number n is recorded as bit n of the per-character tables.
// In 8-bit mode all characters go to conds.base[]. In UTF-8 mode bytes below
// 0x80 go to conds.utf8.ascii[], other characters are decoded with u8_u16()
// and stored per condition in conds.utf8.wchars[n] / wlen[n]; neg[n] and
// all[n] record complemented groups and the '.' wildcard.
//
// Returns 0 on success, 1 if a malloc() for the UTF-8 tables fails.
//
// NOTE(review): n is never range-checked here. Bit n is stored into an
// unsigned char and n also indexes the conds.utf8.* per-condition arrays
// (their size is not visible from this function) - confirm that the number
// of conditions is limited elsewhere.

int AffixMgr::encodeit(struct affentry * ptr, char * cs)
{
  unsigned char c;
  int i, j, k;
  unsigned char mbr[MAXLNLEN];   // raw bytes of the group currently collected
  w_char wmbr[MAXLNLEN];         // decoded non-ASCII members of that group (UTF-8 mode)
  w_char * wpos = wmbr;          // next free slot in wmbr

  // clear the conditions array
  for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0;

  // now parse the string to create the conds array
  int nc = strlen(cs);
  unsigned char neg = 0;   // complement indicator
  int grp = 0;   // group indicator
  unsigned char n = 0;     // number of conditions
  int ec = 0;    // end condition indicator
  int nm = 0;    // number of member in group

  // if no condition just return
  if (strcmp(cs,".")==0) {
    ptr->numconds = 0;
    return 0;
  }

  i = 0;
  while (i < nc) {
    c = *((unsigned char *)(cs + i));

    // Each of the following tests consumes the character by setting c to 0,
    // so that at most one of them acts on it.

    // start group indicator
    if (c == '[') {
       grp = 1;
       c = 0;
    }

    // complement flag (recognized at any position inside a group)
    if ((grp == 1) && (c == '^')) {
       neg = 1;
       c = 0;
    }

    // end group indicator
    if (c == ']') {
       ec = 1;
       c = 0;
    }

    // add character of group to list
    if ((grp == 1) && (c != 0)) {
      *(mbr + nm) = c;
      nm++;
      c = 0;
    }

    // end of condition: a character outside a group is a condition of its own
    if (c != 0) {
       ec = 1;
    }

  // a complete condition (group or single character) has been read: encode it as bit n
  if (ec) {
    if (!utf8) {
      if (grp == 1) {
        if (neg == 0) {
          // set the proper bits in the condition array vals for those chars
          for (j=0;j<nm;j++) {
             k = (unsigned int) mbr[j];
             ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n);
          }
        } else {
          // complement so set all of them and then unset indicated ones
           for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
           for (j=0;j<nm;j++) {
             k = (unsigned int) mbr[j];
             ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n);
           }
        }
        // group finished: reset the group state
        neg = 0;
        grp = 0;
        nm = 0;
      } else {
         // not a group so just set the proper bit for this char
         // but first handle special case of . inside condition
         if (c == '.') {
            // wild card character so set them all
            for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);
         } else {
            ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n);
         }
      }
      n++;
      ec = 0;
    } else { // UTF-8 character set
      if (grp == 1) {
        ptr->conds.utf8.neg[n] = neg;
        if (neg == 0) {
          // set the proper bits in the condition array vals for those chars
          for (j=0;j<nm;j++) {
             k = (unsigned int) mbr[j];
             if (k >> 7) {
                // non-ASCII lead byte: decode one character into wmbr and
                // skip its continuation bytes (the loop's j++ skips the lead byte)
                u8_u16(wpos, 1, (char *) mbr + j);
                wpos++;
                if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
             } else {
                ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n);
             }
          }
        } else { // neg == 1
          // complement so set all of them and then unset indicated ones
           for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
           for (j=0;j<nm;j++) {
             k = (unsigned int) mbr[j];
             if (k >> 7) {
                // non-ASCII members of a complemented group are only collected
                // in wmbr; the complement itself is recorded in neg[n] above
                u8_u16(wpos, 1, (char *) mbr + j);
                wpos++;
                if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
             } else {
                ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n);
             }
           }
        }
        // group finished: reset the group state
        neg = 0;
        grp = 0;
        nm = 0;
        // store a sorted copy of the collected non-ASCII members for condition n
        // NOTE(review): when the group has no non-ASCII member, wchars[n] is
        // not assigned in this branch (only wlen[n] = 0) - confirm it is
        // initialized elsewhere
        ptr->conds.utf8.wlen[n] = wpos - wmbr;
        if ((wpos - wmbr) != 0) {
            ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr));
            if (!ptr->conds.utf8.wchars[n]) return 1;
            memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr));
            flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]);
            wpos = wmbr;
        }
      } else { // grp == 0
         // is UTF-8 character?
         if (c >> 7) {
            // single non-ASCII character: stored as a one-element wchars list
            ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char));
            if (!ptr->conds.utf8.wchars[n]) return 1;
            ptr->conds.utf8.wlen[n] = 1;
            u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i);
            // skip the continuation bytes; the i++ at the end of the loop skips the lead byte
            if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UTF-8 character
         } else {
            ptr->conds.utf8.wchars[n] = NULL;
            // not a group so just set the proper bit for this char
            // but first handle special case of . inside condition
            if (c == '.') {
                ptr->conds.utf8.all[n] = 1;
                // wild card character so set them all
                for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);
            } else {
                ptr->conds.utf8.all[n] = 0;
                ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n);
            }
         }
         neg = 0;
      }
      n++;
      ec = 0;
      neg = 0;
    }
  }

    i++;
  }
  // number of conditions that were encoded
  ptr->numconds = n;
  return 0;
}
1109
1110  // return 1 if s1 is a leading subset of s2
1111 /* inline int AffixMgr::isSubset(const char * s1, const char * s2)
1112  {
1113     while ((*s1 == *s2) && *s1) {
1114         s1++;
1115         s2++;
1116     }
1117     return (*s1 == '\0');
1118  }
1119 */
1120
1121  // return 1 if s1 is a leading subset of s2 (dots are for infixes)
1122 inline int AffixMgr::isSubset(const char * s1, const char * s2)
1123  {
1124     while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
1125         s1++;
1126         s2++;
1127     }
1128     return (*s1 == '\0');
1129  }
1130
1131
1132 // check word for prefixes
1133 struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
1134     const FLAG needflag)
1135 {
1136     struct hentry * rv= NULL;
1137
1138     pfx = NULL;
1139     pfxappnd = NULL;
1140     sfxappnd = NULL;
1141
1142     // first handle the special case of 0 length prefixes
1143     PfxEntry * pe = (PfxEntry *) pStart[0];
1144     while (pe) {
1145         if (
1146             // fogemorpheme
1147               ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
1148                   (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
1149             // permit prefixes in compounds
1150               ((in_compound != IN_CPD_END) || (pe->getCont() &&
1151                   (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
1152               ) {
1153                     // check prefix
1154                     rv = pe->checkword(word, len, in_compound, needflag);
1155                     if (rv) {
1156                         pfx=(AffEntry *)pe; // BUG: pfx not stateless
1157                         return rv;
1158                     }
1159              }
1160        pe = pe->getNext();
1161     }
1162
1163     // now handle the general case
1164     unsigned char sp = *((const unsigned char *)word);
1165     PfxEntry * pptr = (PfxEntry *)pStart[sp];
1166
1167     while (pptr) {
1168         if (isSubset(pptr->getKey(),word)) {
1169              if (
1170             // fogemorpheme
1171               ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
1172                   (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
1173             // permit prefixes in compounds
1174               ((in_compound != IN_CPD_END) || (pptr->getCont() &&
1175                   (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
1176               ) {
1177             // check prefix
1178                   rv = pptr->checkword(word, len, in_compound, needflag);
1179                   if (rv) {
1180                     pfx=(AffEntry *)pptr; // BUG: pfx not stateless
1181                     return rv;
1182                   }
1183              }
1184              pptr = pptr->getNextEQ();
1185         } else {
1186              pptr = pptr->getNextNE();
1187         }
1188     }
1189
1190     return NULL;
1191 }
1192
1193 // check word for prefixes
1194 struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
1195     char in_compound, const FLAG needflag)
1196 {
1197     struct hentry * rv= NULL;
1198
1199     pfx = NULL;
1200     sfxappnd = NULL;
1201
1202     // first handle the special case of 0 length prefixes
1203     PfxEntry * pe = (PfxEntry *) pStart[0];
1204
1205     while (pe) {
1206         rv = pe->check_twosfx(word, len, in_compound, needflag);
1207         if (rv) return rv;
1208         pe = pe->getNext();
1209     }
1210
1211     // now handle the general case
1212     unsigned char sp = *((const unsigned char *)word);
1213     PfxEntry * pptr = (PfxEntry *)pStart[sp];
1214
1215     while (pptr) {
1216         if (isSubset(pptr->getKey(),word)) {
1217             rv = pptr->check_twosfx(word, len, in_compound, needflag);
1218             if (rv) {
1219                 pfx = (AffEntry *)pptr;
1220                 return rv;
1221             }
1222             pptr = pptr->getNextEQ();
1223         } else {
1224              pptr = pptr->getNextNE();
1225         }
1226     }
1227
1228     return NULL;
1229 }
1230
1231 #ifdef HUNSPELL_EXPERIMENTAL
1232 // check word for prefixes
1233 char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
1234     const FLAG needflag)
1235 {
1236     char * st;
1237
1238     char result[MAXLNLEN];
1239     result[0] = '\0';
1240
1241     pfx = NULL;
1242     sfxappnd = NULL;
1243
1244     // first handle the special case of 0 length prefixes
1245     PfxEntry * pe = (PfxEntry *) pStart[0];
1246     while (pe) {
1247        st = pe->check_morph(word,len,in_compound, needflag);
1248        if (st) {
1249             strcat(result, st);
1250             free(st);
1251        }
1252        // if (rv) return rv;
1253        pe = pe->getNext();
1254     }
1255
1256     // now handle the general case
1257     unsigned char sp = *((const unsigned char *)word);
1258     PfxEntry * pptr = (PfxEntry *)pStart[sp];
1259
1260     while (pptr) {
1261         if (isSubset(pptr->getKey(),word)) {
1262             st = pptr->check_morph(word,len,in_compound, needflag);
1263             if (st) {
1264               // fogemorpheme
1265               if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
1266                         (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
1267                     strcat(result, st);
1268                     pfx = (AffEntry *)pptr;
1269                 }
1270                 free(st);
1271             }
1272             pptr = pptr->getNextEQ();
1273         } else {
1274             pptr = pptr->getNextNE();
1275         }
1276     }
1277
1278     if (*result) return mystrdup(result);
1279     return NULL;
1280 }
1281
1282
1283 // check word for prefixes
1284 char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
1285     char in_compound, const FLAG needflag)
1286 {
1287     char * st;
1288
1289     char result[MAXLNLEN];
1290     result[0] = '\0';
1291
1292     pfx = NULL;
1293     sfxappnd = NULL;
1294
1295     // first handle the special case of 0 length prefixes
1296     PfxEntry * pe = (PfxEntry *) pStart[0];
1297     while (pe) {
1298         st = pe->check_twosfx_morph(word,len,in_compound, needflag);
1299         if (st) {
1300             strcat(result, st);
1301             free(st);
1302         }
1303         pe = pe->getNext();
1304     }
1305
1306     // now handle the general case
1307     unsigned char sp = *((const unsigned char *)word);
1308     PfxEntry * pptr = (PfxEntry *)pStart[sp];
1309
1310     while (pptr) {
1311         if (isSubset(pptr->getKey(),word)) {
1312             st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
1313             if (st) {
1314                 strcat(result, st);
1315                 free(st);
1316                 pfx = (AffEntry *)pptr;
1317             }
1318             pptr = pptr->getNextEQ();
1319         } else {
1320             pptr = pptr->getNextNE();
1321         }
1322     }
1323
1324     if (*result) return mystrdup(result);
1325     return NULL;
1326 }
1327 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1328
1329
1330 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1331 int AffixMgr::cpdrep_check(const char * word, int wl)
1332 {
1333   char candidate[MAXLNLEN];
1334   const char * r;
1335   int lenr, lenp;
1336
1337   if ((wl < 2) || !numrep) return 0;
1338
1339   for (int i=0; i < numrep; i++ ) {
1340       r = word;
1341       lenr = strlen(reptable[i].pattern2);
1342       lenp = strlen(reptable[i].pattern);
1343       // search every occurence of the pattern in the word
1344       while ((r=strstr(r, reptable[i].pattern)) != NULL) {
1345           strcpy(candidate, word);
1346           if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
1347           strcpy(candidate+(r-word),reptable[i].pattern2);
1348           strcpy(candidate+(r-word)+lenr, r+lenp);
1349           if (candidate_check(candidate,strlen(candidate))) return 1;
1350           r++; // search for the next letter
1351       }
1352    }
1353    return 0;
1354 }
1355
1356 // forbid compoundings when there are special patterns at word bound
1357 int AffixMgr::cpdpat_check(const char * word, int pos)
1358 {
1359   int len;
1360   for (int i = 0; i < numcheckcpd; i++) {
1361       if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
1362         (len = strlen(checkcpdtable[i].pattern)) && (pos > len) &&
1363         (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1;
1364   }
1365   return 0;
1366 }
1367
1368 // forbid compounding with neighbouring upper and lower case characters at word bounds
1369 int AffixMgr::cpdcase_check(const char * word, int pos)
1370 {
1371   if (utf8) {
1372       w_char u, w;
1373       const char * p;
1374       u8_u16(&u, 1, word + pos);
1375       for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
1376       u8_u16(&w, 1, p);
1377       unsigned short a = (u.h << 8) + u.l;
1378       unsigned short b = (w.h << 8) + w.l;
1379       if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b))) return 1;
1380   } else {
1381       unsigned char a = *(word + pos - 1);
1382       unsigned char b = *(word + pos);
1383       if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
1384   }
1385   return 0;
1386 }
1387
// check compound patterns
//
// Tests whether the word list (*words)[0..wnum], with rv stored as its last
// element, matches one of the numdefcpd patterns in defcpdtable. A pattern is
// a sequence of flags (defcpdtable[i].def, length defcpdtable[i].len); a flag
// followed by '*' or '?' is quantified: '*' may take words up to position
// wnum, '?' at most the word at the current position. A word matches a flag
// when TESTAFF() finds the flag in the word's astr/alen flag list.
//
//   words - in/out pointer to the word list; if *words is NULL, def is used
//           as the list for this call
//   wnum  - index at which rv is stored
//   rv    - dictionary entry of the newest compound member
//   def   - fallback buffer for the word list
//   all   - if 0, the first return after the matching loop also accepts a
//           pattern that was not consumed to its end
//
// Returns 1 on a match; rv stays in (*words)[wnum] and *words keeps pointing
// to def if def was substituted. Returns 0 if no pattern matches; then
// (*words)[wnum] is reset to NULL and *words is set back to NULL if def had
// been substituted.
//
// NOTE(review): if *words is NULL and def is NULL as well, the store to
// (*words)[wnum] dereferences NULL - callers apparently pass a NULL def only
// when words is already set; confirm.
int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
{
  signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  int btnum[MAXWORDLEN]; // number of words matched at each metacharacter position
  short bt = 0; // number of entries on the backtracking stack
  int i;
  int ok;
  int w = 0; // set when def was substituted for a NULL *words
  if (!*words) {
    w = 1;
    *words = def;
  }
  (*words)[wnum] = rv;

  for (i = 0; i < numdefcpd; i++) {
    signed short pp = 0; // pattern position
    signed short wp = 0; // "words" position
    int ok2; // result of the last quantified (*, ?) element
    ok = 1; // result of the last plain flag element
    ok2 = 1;
    do {
      while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
        if (((pp+1) < defcpdtable[i].len) &&
          ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
            // quantified flag: '?' may consume only the current word, '*' all words up to wnum
            int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
            ok2 = 1;
            pp+=2;
            // remember where this element started, for backtracking
            btpp[bt] = pp;
            btwp[bt] = wp;
            // greedily consume words carrying the flag (def[pp-2] is the quantified flag)
            // NOTE(review): unlike the plain-flag branch below, (*words)[wp]
            // is dereferenced here without a NULL check - confirm it cannot be NULL
            while (wp <= wend) {
                if (!(*words)[wp]->alen ||
                  !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
                    ok2 = 0;
                    break;
                }
                wp++;
            }
            if (wp <= wnum) ok2 = 0;
            btnum[bt] = wp - btwp[bt];
            // keep the stack entry only if at least one word was consumed
            if (btnum[bt] > 0) bt++;
            if (ok2) break;
        } else {
            // plain flag: the current word must exist and carry it
            ok2 = 1;
            if (!(*words)[wp] || !(*words)[wp]->alen ||
              !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
                ok = 0;
                break;
            }
            pp++;
            wp++;
            // pattern used up while words remain: no match on this path
            if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
        }
      }
    if (ok && ok2) {
        // skip trailing quantified elements (they may match zero words);
        // the pattern matches if nothing else is left
        int r = pp;
        while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
            ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
        if (defcpdtable[i].len <= r) return 1;
    }
    // backtrack: give back one word from the most recent quantified element;
    // elements that have nothing left to give are popped from the stack
    if (bt) do {
        ok = 1;
        btnum[bt - 1]--;
        pp = btpp[bt - 1];
        wp = btwp[bt - 1] + btnum[bt - 1];
    } while ((btnum[bt - 1] < 0) && --bt);
  } while (bt); // the loop ends with an empty backtracking stack

  // with all == 0 an unfinished pattern is accepted as well
  if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;
  // check zero ending
  while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
    ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  }
  // no pattern matched: undo the changes made to the word list
  (*words)[wnum] = NULL;
  if (w) *words = NULL;
  return 0;
}
1468
1469 inline int AffixMgr::candidate_check(const char * word, int len)
1470 {
1471   struct hentry * rv=NULL;
1472
1473   rv = lookup(word);
1474   if (rv) return 1;
1475
1476 //  rv = prefix_check(word,len,1);
1477 //  if (rv) return 1;
1478
1479   rv = affix_check(word,len);
1480   if (rv) return 1;
1481   return 0;
1482 }
1483
1484 // calculate number of syllable for compound-checking
1485 short AffixMgr::get_syllable(const char * word, int wlen)
1486 {
1487     if (cpdmaxsyllable==0) return 0;
1488
1489     short num=0;
1490
1491     if (!utf8) {
1492         for (int i=0; i<wlen; i++) {
1493             if (strchr(cpdvowels, word[i])) num++;
1494         }
1495     } else if (cpdvowels_utf16) {
1496         w_char w[MAXWORDUTF8LEN];
1497         int i = u8_u16(w, MAXWORDUTF8LEN, word);
1498         for (; i; i--) {
1499             if (flag_bsearch((unsigned short *) cpdvowels_utf16,
1500                 ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
1501         }
1502     }
1503     return num;
1504 }
1505
1506 // check if compound word is correctly spelled
1507 // hu_mov_rule = spec. Hungarian rule (XXX)
1508 struct hentry * AffixMgr::compound_check(const char * word, int len,
1509     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
1510     char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
1511 {
1512     int i;
1513     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1514     int oldcmpdstemnum = 0;
1515     struct hentry * rv = NULL;
1516     struct hentry * rv_first;
1517     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1518     char st [MAXWORDUTF8LEN + 4];
1519     char ch;
1520     int cmin;
1521     int cmax;
1522
1523     int checked_prefix;
1524
1525 #ifdef HUNSTEM
1526     if (cmpdstemnum) {
1527         if (wordnum == 0) {
1528             *cmpdstemnum = 1;
1529         } else {
1530             (*cmpdstemnum)++;
1531         }
1532     }
1533 #endif
1534     if (utf8) {
1535         for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
1536           cmin++;
1537           for (; (word[cmin] & 0xc0) == 0x80; cmin++);
1538         }
1539         for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
1540           cmax--;
1541           for (; (word[cmax] & 0xc0) == 0x80; cmax--);
1542         }
1543     } else {
1544         cmin = cpdmin;
1545         cmax = len - cpdmin + 1;
1546     }
1547
1548     strcpy(st, word);
1549
1550     for (i = cmin; i < cmax; i++) {
1551
1552         oldnumsyllable = numsyllable;
1553         oldwordnum = wordnum;
1554         checked_prefix = 0;
1555
1556         // go to end of the UTF-8 character
1557         if (utf8) {
1558             for (; (st[i] & 0xc0) == 0x80; i++);
1559             if (i >= cmax) return NULL;
1560         }
1561
1562
1563         ch = st[i];
1564         st[i] = '\0';
1565
1566         sfx = NULL;
1567         pfx = NULL;
1568
1569         // FIRST WORD
1570
1571         rv = lookup(st); // perhaps without prefix
1572
1573         // search homonym with compound flag
1574         while ((rv) && !hu_mov_rule &&
1575             ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
1576                 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1577                   (compoundbegin && !wordnum &&
1578                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1579                   (compoundmiddle && wordnum && !words &&
1580                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1581                   (numdefcpd &&
1582                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1583                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
1584                   ))) {
1585             rv = rv->next_homonym;
1586         }
1587
1588         if (!rv) {
1589             if (compoundflag &&
1590              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1591                 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1592                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
1593                     ((SfxEntry*)sfx)->getCont() &&
1594                         ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1595                             ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
1596                         TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
1597                             ((SfxEntry*)sfx)->getContLen())))) {
1598                         rv = NULL;
1599                 }
1600             }
1601             if (rv ||
1602               (((wordnum == 0) && compoundbegin &&
1603                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1604                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1605               ((wordnum > 0) && compoundmiddle &&
1606                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1607                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
1608               ) checked_prefix = 1;
1609         // else check forbiddenwords and pseudoroot
1610         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1611             TESTAFF(rv->astr, pseudoroot, rv->alen) ||
1612             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
1613              )) {
1614                 st[i] = ch;
1615                 continue;
1616         }
1617
1618             // check non_compound flag in suffix and prefix
1619             if ((rv) && !hu_mov_rule &&
1620                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1621                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
1622                         ((PfxEntry*)pfx)->getContLen())) ||
1623                 (sfx && ((SfxEntry*)sfx)->getCont() &&
1624                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1625                         ((SfxEntry*)sfx)->getContLen())))) {
1626                     rv = NULL;
1627             }
1628
1629             // check compoundend flag in suffix and prefix
1630             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
1631                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1632                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
1633                         ((PfxEntry*)pfx)->getContLen())) ||
1634                 (sfx && ((SfxEntry*)sfx)->getCont() &&
1635                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
1636                         ((SfxEntry*)sfx)->getContLen())))) {
1637                     rv = NULL;
1638             }
1639
1640             // check compoundmiddle flag in suffix and prefix
1641             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
1642                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1643                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
1644                         ((PfxEntry*)pfx)->getContLen())) ||
1645                 (sfx && ((SfxEntry*)sfx)->getCont() &&
1646                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
1647                         ((SfxEntry*)sfx)->getContLen())))) {
1648                     rv = NULL;
1649             }
1650
1651         // check forbiddenwords
1652         if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1653             (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
1654                 return NULL;
1655             }
1656
1657         // increment word number, if the second root has a compoundroot flag
1658         if ((rv) && compoundroot &&
1659             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1660                 wordnum++;
1661         }
1662
1663         // first word is acceptable in compound words?
1664         if (((rv) &&
1665           ( checked_prefix || (words && words[wnum]) ||
1666             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1667             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1668             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
1669 //            (numdefcpd && )
1670
1671 // LANG_hu section: spec. Hungarian rule
1672             || ((langnum == LANG_hu) && hu_mov_rule && (
1673                     TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
1674                     TESTAFF(rv->astr, 'G', rv->alen) ||
1675                     TESTAFF(rv->astr, 'H', rv->alen)
1676                 )
1677               )
1678 // END of LANG_hu section
1679           )
1680           && ! (( checkcompoundtriple && // test triple letters
1681                    (word[i-1]==word[i]) && (
1682                       ((i>1) && (word[i-1]==word[i-2])) ||
1683                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
1684                    )
1685                ) ||
1686                (
1687                  // test CHECKCOMPOUNDPATTERN
1688                  numcheckcpd && cpdpat_check(word, i)
1689                ) ||
1690                (
1691                  checkcompoundcase && cpdcase_check(word, i)
1692                ))
1693          )
1694 // LANG_hu section: spec. Hungarian rule
1695          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
1696               (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
1697                         TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
1698                         TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
1699                     )
1700                )
1701              )
1702 // END of LANG_hu section
1703          ) {
1704
1705 // LANG_hu section: spec. Hungarian rule
1706             if (langnum == LANG_hu) {
1707                 // calculate syllable number of the word
1708                 numsyllable += get_syllable(st, i);
1709
1710                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1711                 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
1712             }
1713 // END of LANG_hu section
1714
1715 #ifdef HUNSTEM
1716             if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i;
1717 #endif
1718
1719             // NEXT WORD(S)
1720             rv_first = rv;
1721             rv = lookup((word+i)); // perhaps without prefix
1722
1723         // search homonym with compound flag
1724         while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
1725                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1726                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
1727                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
1728             rv = rv->next_homonym;
1729         }
1730
1731             if (rv && words && words[wnum + 1]) return rv;
1732
1733             oldnumsyllable2 = numsyllable;
1734             oldwordnum2 = wordnum;
1735
1736 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1737             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
1738                 numsyllable--;
1739             }
1740 // END of LANG_hu section
1741
1742             // increment word number, if the second root has a compoundroot flag
1743             if ((rv) && (compoundroot) &&
1744                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1745                     wordnum++;
1746             }
1747
1748             // check forbiddenwords
1749             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1750                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1751
1752             // second word is acceptable, as a root?
1753             // hungarian conventions: compounding is acceptable,
1754             // when compound forms consist of 2 words, or if more,
1755             // then the syllable number of root words must be 6, or lesser.
1756
1757             if ((rv) && (
1758                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1759                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
1760                     )
1761                 && (
1762                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
1763                       ((cpdmaxsyllable==0) ||
1764                           (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
1765                     )
1766                 && (
1767                      (!checkcompounddup || (rv != rv_first))
1768                    )
1769                 )
1770                  {
1771                       // forbid compound word, if it is a non compound word with typical fault
1772                       if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
1773                       return rv;
1774             }
1775
1776             numsyllable = oldnumsyllable2 ;
1777             wordnum = oldwordnum2;
1778
1779             // perhaps second word has prefix or/and suffix
1780             sfx = NULL;
1781             sfxflag = FLAG_NULL;
1782             rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
1783             if (!rv && compoundend) {
1784                 sfx = NULL;
1785                 pfx = NULL;
1786                 rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
1787             }
1788
1789             if (!rv && numdefcpd && words) {
1790                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
1791                 if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv;
1792             }
1793
1794             // check non_compound flag in suffix and prefix
1795             if ((rv) &&
1796                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
1797                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
1798                         ((PfxEntry*)pfx)->getContLen())) ||
1799                 (sfx && ((SfxEntry*)sfx)->getCont() &&
1800                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1801                         ((SfxEntry*)sfx)->getContLen())))) {
1802                     rv = NULL;
1803             }
1804
1805             // check forbiddenwords
1806             if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
1807                (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
1808
1809             // pfxappnd = prefix of word+i, or NULL
1810             // calculate syllable number of prefix.
1811             // hungarian convention: when syllable number of prefix is more,
1812             // than 1, the prefix+word counts as two words.
1813
1814             if (langnum == LANG_hu) {
1815                 // calculate syllable number of the word
1816                 numsyllable += get_syllable(word + i, strlen(word + i));
1817
1818                 // - affix syllable num.
1819                 // XXX only second suffix (inflections, not derivations)
1820                 if (sfxappnd) {
1821                     char * tmp = myrevstrdup(sfxappnd);
1822                     numsyllable -= get_syllable(tmp, strlen(tmp));
1823                     free(tmp);
1824                 }
1825
1826                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1827                 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
1828
1829                 // increment syllable num, if last word has a SYLLABLENUM flag
1830                 // and the suffix is beginning `s'
1831
1832                 if (cpdsyllablenum) {
1833                     switch (sfxflag) {
1834                         case 'c': { numsyllable+=2; break; }
1835                         case 'J': { numsyllable += 1; break; }
1836                         case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
1837                     }
1838                 }
1839             }
1840
1841             // increment word number, if the second word has a compoundroot flag
1842             if ((rv) && (compoundroot) &&
1843                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
1844                     wordnum++;
1845             }
1846
1847             // second word is acceptable, as a word with prefix or/and suffix?
1848             // hungarian conventions: compounding is acceptable,
1849             // when compound forms consist 2 word, otherwise
1850             // the syllable number of root words is 6, or lesser.
1851             if ((rv) &&
1852                     (
1853                       ((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
1854                       ((cpdmaxsyllable == 0) ||
1855                           (numsyllable <= cpdmaxsyllable))
1856                     )
1857                 && (
1858                    (!checkcompounddup || (rv != rv_first))
1859                    )) {
1860                     // forbid compound word, if it is a non compound word with typical fault
1861                     if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1862                     return rv;
1863             }
1864
1865             numsyllable = oldnumsyllable2;
1866             wordnum = oldwordnum2;
1867 #ifdef HUNSTEM
1868             if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum;
1869 #endif
1870             // perhaps second word is a compound word (recursive call)
1871             if (wordnum < maxwordnum) {
1872                 rv = compound_check((word+i),strlen(word+i), wordnum+1,
1873                      numsyllable, maxwordnum, wnum + 1, words,
1874                      0, cmpdstemnum, cmpdstem, is_sug);
1875             } else {
1876                 rv=NULL;
1877             }
1878             if (rv) {
1879                 // forbid compound word, if it is a non compound word with typical fault
1880                 if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
1881                 return rv;
1882             } else {
1883 #ifdef HUNSTEM
1884             if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum;
1885 #endif
1886             }
1887         }
1888         st[i] = ch;
1889         wordnum = oldwordnum;
1890         numsyllable = oldnumsyllable;
1891     }
1892
1893     return NULL;
1894 }
1895
1896 #ifdef HUNSPELL_EXPERIMENTAL
1897 // Morphological-analysis counterpart of the compound check.
1898 // hu_mov_rule = spec. Hungarian rule (XXX)
     //
     // Every split position of `word` between cmin and cmax is tried.  For each
     // split whose first and second part are both acceptable, one
     // "\n"-terminated analysis line is appended to *result.
     //
     //   word, len    word to analyse and its length in bytes
     //   wordnum      0 for the first compound part, > 0 for later parts
     //                (the recursion passes wordnum + 1)
     //   numsyllable  running syllable count used by the LANG_hu rules
     //   maxwordnum   recursion is only entered while wordnum < maxwordnum
     //   wnum, words  index and table handed to defcpd_check()
     //   result       *result is extended with strcat(); it must point to a
     //                writable NUL-terminated buffer whenever a split can match
     //   partresult   analysis of the parts already consumed (may be NULL);
     //                it is copied to the front of every line produced here
     //
     // Always returns 0; the analysis is delivered only through *result.
     // Side effects: overwrites the members sfx, sfxflag and pfx.
     // NOTE(review): st, presult and *result are filled with unbounded
     // strcpy()/strcat(); the size limits are presumably enforced by the
     // caller -- confirm.
1899 int AffixMgr::compound_check_morph(const char * word, int len,
1900     short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
1901     char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
1902 {
1903     int i;
1904     short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
1905     int ok = 0;
1906
1907     struct hentry * rv = NULL;
1908     struct hentry * rv_first;
1909     struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
1910     char st [MAXWORDUTF8LEN + 4];
1911     char ch;
1912
1913     int checked_prefix;
1914     char presult[MAXLNLEN];
1915
1916     int cmin;
1917     int cmax;
1918
         // cmin / cmax: first and one-past-last byte offset at which the word may
         // be split; in UTF-8 mode cpdmin is counted in characters, not bytes
1919     if (utf8) {
1920         for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
1921           cmin++;
1922           for (; (word[cmin] & 0xc0) == 0x80; cmin++);
1923         }
1924         for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
1925           cmax--;
1926           for (; (word[cmax] & 0xc0) == 0x80; cmax--);
1927         }
1928     } else {
1929         cmin = cpdmin;
1930         cmax = len - cpdmin + 1;
1931     }
1932
         // st is a private copy that gets cut at the split position
1933     strcpy(st, word);
1934
1935     for (i = cmin; i < cmax; i++) {
1936         oldnumsyllable = numsyllable;
1937         oldwordnum = wordnum;
1938         checked_prefix = 0;
1939
1940         // go to end of the UTF-8 character
1941         if (utf8) {
1942             for (; (st[i] & 0xc0) == 0x80; i++);
1943             if (i >= cmax) return 0;
1944         }
1945
             // cut st after the first part; ch remembers the overwritten byte
1946         ch = st[i];
1947         st[i] = '\0';
1948         sfx = NULL;
1949
1950         // FIRST WORD
1951         *presult = '\0';
1952         if (partresult) strcat(presult, partresult);
1953
1954         rv = lookup(st); // perhaps without prefix
1955
1956         // search homonym with compound flag
1957         while ((rv) && !hu_mov_rule &&
1958             ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
1959                 !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
1960                 (compoundbegin && !wordnum &&
1961                         TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
1962                 (compoundmiddle && wordnum && !words &&
1963                     TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
1964                   (numdefcpd &&
1965                     ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
1966                     (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
1967                   ))) {
1968             rv = rv->next_homonym;
1969         }
1970
             // root found directly: start the line with the stem (unless the
             // entry is flagged lemma_present) followed by its description
1971         if (rv)  {
1972             if (rv->description) {
1973                 if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
1974                                         strcat(presult, st);
1975                 strcat(presult, rv->description);
1976             }
1977         }
1978
1979         if (!rv) {
                 // no plain root: try the first part as an affixed form
1980             if (compoundflag &&
1981              !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
1982                 if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
1983                         FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
1984                     ((SfxEntry*)sfx)->getCont() &&
1985                         ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
1986                             ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
1987                         TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
1988                             ((SfxEntry*)sfx)->getContLen())))) {
1989                         rv = NULL;
1990                 }
1991             }
1992
1993             if (rv ||
1994               (((wordnum == 0) && compoundbegin &&
1995                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1996                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
1997               ((wordnum > 0) && compoundmiddle &&
1998                 ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
1999                 (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
2000               ) {
2001                 //char * p = prefix_check_morph(st, i, 0, compound);
                     // p is released with free() below, the same way the
                     // affix_check_morph() results of the second word are
2002                 char * p = NULL;
2003                 if (compoundflag) p = affix_check_morph(st, i, compoundflag);
2004                 if (!p || (*p == '\0')) {
2005                    if ((wordnum == 0) && compoundbegin) {
                          free(p); // drop the empty analysis before replacing it
2006                      p = affix_check_morph(st, i, compoundbegin);
2007                    } else if ((wordnum > 0) && compoundmiddle) {
                          free(p); // drop the empty analysis before replacing it
2008                      p = affix_check_morph(st, i, compoundmiddle);
2009                    }
2010                 }
                     // p stays NULL when no flag applied or the analysis failed
2011                 if (p && (*p != '\0')) {
2012                     line_uniq(p);
2013                     if (strchr(p, '\n')) {
2014                         strcat(presult, "(");
2015                         strcat(presult, line_join(p, '|'));
2016                         strcat(presult, ")");
2017                       } else {
2018                         strcat(presult, p);
2019                       }
2020                 }
                     free(p); // its text has been copied into presult
                     // strip one trailing newline; presult may still be empty here
2021                 if (*presult && (presult[strlen(presult) - 1] == '\n')) {
2022                     presult[strlen(presult) - 1] = '\0';
2023                 }
2024                 checked_prefix = 1;
2025                 //strcat(presult, "+");
2026             }
2027         // else check forbiddenwords
2028         } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
2029             TESTAFF(rv->astr, pseudoroot, rv->alen))) {
2030                 st[i] = ch;
2031                 continue;
2032         }
2033
2034             // check non_compound flag in suffix and prefix
2035             if ((rv) && !hu_mov_rule &&
2036                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2037                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
2038                         ((PfxEntry*)pfx)->getContLen())) ||
2039                 (sfx && ((SfxEntry*)sfx)->getCont() &&
2040                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
2041                         ((SfxEntry*)sfx)->getContLen())))) {
                         st[i] = ch; // undo the cut before trying the next split
2042                     continue;
2043             }
2044
2045             // check compoundend flag in suffix and prefix
2046             if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
2047                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2048                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
2049                         ((PfxEntry*)pfx)->getContLen())) ||
2050                 (sfx && ((SfxEntry*)sfx)->getCont() &&
2051                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
2052                         ((SfxEntry*)sfx)->getContLen())))) {
                         st[i] = ch; // undo the cut before trying the next split
2053                     continue;
2054             }
2055
2056             // check compoundmiddle flag in suffix and prefix
2057             if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
2058                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2059                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
2060                         ((PfxEntry*)pfx)->getContLen())) ||
2061                 (sfx && ((SfxEntry*)sfx)->getCont() &&
2062                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
2063                         ((SfxEntry*)sfx)->getContLen())))) {
2064                     rv = NULL;
2065             }
2066
2067         // check forbiddenwords
2068         if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
                 st[i] = ch; // undo the cut before trying the next split
                 continue;
             }
2069
2070         // increment word number, if the second root has a compoundroot flag
2071         if ((rv) && (compoundroot) &&
2072             (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2073                 wordnum++;
2074         }
2075
2076         // first word is acceptable in compound words?
2077         if (((rv) &&
2078           ( checked_prefix || (words && words[wnum]) ||
2079             (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2080             ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
2081             ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
2082 // LANG_hu section: spec. Hungarian rule
2083             || ((langnum == LANG_hu) && // hu_mov_rule
2084                 hu_mov_rule && (
2085                     TESTAFF(rv->astr, 'F', rv->alen) ||
2086                     TESTAFF(rv->astr, 'G', rv->alen) ||
2087                     TESTAFF(rv->astr, 'H', rv->alen)
2088                 )
2089               )
2090 // END of LANG_hu section
2091           )
2092           && ! (( checkcompoundtriple && // test triple letters
2093                    (word[i-1]==word[i]) && (
2094                       ((i>1) && (word[i-1]==word[i-2])) ||
2095                       ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
2096                    )
2097                ) ||
2098                (
2099                    // test CHECKCOMPOUNDPATTERN
2100                    numcheckcpd && cpdpat_check(word, i)
2101                ) ||
2102                (
2103                  checkcompoundcase && cpdcase_check(word, i)
2104                ))
2105          )
2106 // LANG_hu section: spec. Hungarian rule
2107          || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
2108               (sfx && ((SfxEntry*)sfx)->getCont() && (
2109                         TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
2110                         TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
2111                     )
2112                )
2113              )
2114 // END of LANG_hu section
2115          ) {
2116
2117 // LANG_hu section: spec. Hungarian rule
2118             if (langnum == LANG_hu) {
2119                 // calculate syllable number of the word
2120                 numsyllable += get_syllable(st, i);
2121
2122                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2123                 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
2124             }
2125 // END of LANG_hu section
2126
2127             // NEXT WORD(S)
2128             rv_first = rv;
2129             rv = lookup((word+i)); // perhaps without prefix
2130
2131         // search homonym with compound flag
             // defcpd_check() gets no fallback table here (NULL), so it is only
             // consulted when `words` exists -- same guard as the equivalent
             // loop in compound_check()
2132         while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
2133                         !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2134                           (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
2135                            (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
2136             rv = rv->next_homonym;
2137         }
2138
                 // the COMPOUND pattern accepted the second root: emit and stop
2139             if (rv && words && words[wnum + 1]) {
2140                   strcat(*result, presult);
2141                   if (complexprefixes && rv->description) strcat(*result, rv->description);
2142                   if (rv->description && ((!rv->astr) ||
2143                      !TESTAFF(rv->astr, lemma_present, rv->alen)))
2144                         strcat(*result, rv->word);
2145                   if (!complexprefixes && rv->description) strcat(*result, rv->description);
2146                   strcat(*result, "\n");
2147                   ok = 1;
2148                   return 0;
2149             }
2150
2151             oldnumsyllable2 = numsyllable;
2152             oldwordnum2 = wordnum;
2153
2154 // LANG_hu section: spec. Hungarian rule
2155             if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
2156                 numsyllable--;
2157             }
2158 // END of LANG_hu section
2159             // increment word number, if the second root has a compoundroot flag
2160             if ((rv) && (compoundroot) &&
2161                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2162                     wordnum++;
2163             }
2164
2165             // check forbiddenwords
2166             if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
2167                 st[i] = ch;
                     // the counters were advanced for this split; the next
                     // split has to start from the values seen at loop entry
                     wordnum = oldwordnum;
                     numsyllable = oldnumsyllable;
2168                 continue;
2169             }
2170
2171             // second word is acceptable, as a root?
2172             // hungarian conventions: compounding is acceptable,
2173             // when compound forms consist of 2 words, or if more,
2174             // then the syllable number of root words must be 6, or lesser.
2175             if ((rv) && (
2176                       (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
2177                       (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
2178                     )
2179                 && (
2180                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2181                       ((cpdmaxsyllable==0) ||
2182                           (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
2183                     )
2184                 && (
2185                      (!checkcompounddup || (rv != rv_first))
2186                    )
2187                 )
2188                  {
2189                       // accepted: first part + analysis of the second root
2190                       strcat(*result, presult);
2191
2192                       if (rv->description) {
2193                         if (complexprefixes) strcat(*result, rv->description);
2194                         if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
2195                                                strcat(*result, rv->word);
2196                         if (!complexprefixes) strcat(*result, rv->description);
2197                       }
2198                       strcat(*result, "\n");
2199                               ok = 1;
2200             }
2201
2202             numsyllable = oldnumsyllable2 ;
2203             wordnum = oldwordnum2;
2204
2205             // perhaps second word has prefix or/and suffix
2206             sfx = NULL;
2207             sfxflag = FLAG_NULL;
2208
2209             if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
2210
2211             if (!rv && compoundend) {
2212                 sfx = NULL;
2213                 pfx = NULL;
2214                 rv = affix_check((word+i),strlen(word+i), compoundend);
2215             }
2216
2217             if (!rv && numdefcpd && words) {
2218                 rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
2219                 if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
2220                       char * m = NULL;
2221                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2222                       if ((!m || *m == '\0') && compoundend) {
                                 free(m); // drop the empty analysis before replacing it
2223                             m = affix_check_morph((word+i),strlen(word+i), compoundend);
                           }
2224                       strcat(*result, presult);
2225                       if (m) {
2226                         line_uniq(m);
2227                         if (strchr(m, '\n')) {
2228                             strcat(*result, "(");
2229                             strcat(*result, line_join(m, '|'));
2230                             strcat(*result, ")");
2231                         } else {
2232                             strcat(*result, m);
2233                         }
2234                         free(m);
2235                       }
2236                       strcat(*result, "\n");
2237                       ok = 1;
2238                 }
2239             }
2240
2241             // check non_compound flag in suffix and prefix
2242             if ((rv) &&
2243                 ((pfx && ((PfxEntry*)pfx)->getCont() &&
2244                     TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
2245                         ((PfxEntry*)pfx)->getContLen())) ||
2246                 (sfx && ((SfxEntry*)sfx)->getCont() &&
2247                     TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
2248                         ((SfxEntry*)sfx)->getContLen())))) {
2249                     rv = NULL;
2250             }
2251
2252             // check forbiddenwords
2253             if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen))
2254                     && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) {
2255                         st[i] = ch;
                             // restore the loop-entry counters, see above
                             wordnum = oldwordnum;
                             numsyllable = oldnumsyllable;
2256                         continue;
2257                     }
2258
2259             if (langnum == LANG_hu) {
2260                 // calculate syllable number of the word
2261                 numsyllable += get_syllable(word + i, strlen(word + i));
2262
2263                 // - affix syllable num.
2264                 // XXX only second suffix (inflections, not derivations)
2265                 if (sfxappnd) {
2266                     char * tmp = myrevstrdup(sfxappnd);
2267                     numsyllable -= get_syllable(tmp, strlen(tmp));
2268                     free(tmp);
2269                 }
2270
2271                 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2272                 if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
2273
2274                 // increment syllable num, if last word has a SYLLABLENUM flag
2275                 // and the suffix is beginning `s'
2276
2277                 if (cpdsyllablenum) {
2278                     switch (sfxflag) {
2279                         case 'c': { numsyllable+=2; break; }
2280                         case 'J': { numsyllable += 1; break; }
2281                         case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
2282                     }
2283                 }
2284             }
2285
2286             // increment word number, if the second word has a compoundroot flag
2287             if ((rv) && (compoundroot) &&
2288                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
2289                     wordnum++;
2290             }
2291             // second word is acceptable, as a word with prefix or/and suffix?
2292             // hungarian conventions: compounding is acceptable,
2293             // when compound forms consist 2 word, otherwise
2294             // the syllable number of root words is 6, or lesser.
2295             if ((rv) &&
2296                     (
2297                       ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) ||
2298                       ((cpdmaxsyllable==0) ||
2299                           (numsyllable <= cpdmaxsyllable))
2300                     )
2301                 && (
2302                    (!checkcompounddup || (rv != rv_first))
2303                    )) {
2304                       char * m = NULL;
2305                       if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
2306                       if ((!m || *m == '\0') && compoundend) {
                                 free(m); // drop the empty analysis before replacing it
2307                             m = affix_check_morph((word+i),strlen(word+i), compoundend);
                           }
2308                       strcat(*result, presult);
2309                       if (m) {
2310                         line_uniq(m);
2311                         if (strchr(m, '\n')) {
2312                             strcat(*result, "(");
2313                             strcat(*result, line_join(m, '|'));
2314                             strcat(*result, ")");
2315                         } else {
2316                             strcat(*result, m);
2317                         }
2318                         free(m);
2319                       }
2320                       strcat(*result, "\n");
2321                       ok = 1;
2322             }
2323
2324             numsyllable = oldnumsyllable2;
2325             wordnum = oldwordnum2;
2326
2327             // perhaps second word is a compound word (recursive call)
                 // presult becomes the partresult of the nested analysis; `ok`
                 // is never reset, so once any line was emitted no further
                 // recursion is attempted for later split positions either
2328             if ((wordnum < maxwordnum) && (ok == 0)) {
2329                         compound_check_morph((word+i),strlen(word+i), wordnum+1,
2330                              numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
2331             } else {
2332                 rv=NULL;
2333             }
2334         }
             // undo the cut and the per-split counters before the next position
2335         st[i] = ch;
2336         wordnum = oldwordnum;
2337         numsyllable = oldnumsyllable;
2338     }
2339     return 0;
2340 }
2341 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
2342
2343  // return 1 if s1 (reversed) is a leading subset of end of s2
2344 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2345  {
2346     while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2347         s1++;
2348         end_of_s2--;
2349         len--;
2350     }
2351     return (*s1 == '\0');
2352  }
2353  */
2354
2355 inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2356  {
         // s1 is read forwards while s2 is read backwards from end_of_s2, for at
         // most len characters.  A '.' in s1 matches any character of s2.
         // The result is 1 when all of s1 was consumed, 0 otherwise.
2357     for (int remaining = len; remaining > 0; remaining--, s1++, end_of_s2--) {
2358         if (*s1 == '\0') break;                            // whole key matched
2359         if ((*s1 != '.') && (*s1 != *end_of_s2)) break;    // mismatch
2360     }
2361
2362     return (*s1 == '\0') ? 1 : 0;
2363  }
2364
2365 // check word for suffixes
     //
     // Tries the zero-length suffix entries first (sStart[0]), then the entries
     // indexed by the last byte of `word`.  The first entry whose checkword()
     // succeeds wins.
     //
     //   word, len    candidate word and its length in bytes
     //   sfxopts, wlst, maxSug, ns
     //                passed through unchanged to SfxEntry::checkword()
     //   ppfx         prefix entry already stripped from the word, or NULL
     //   cclass       when set, only suffixes with continuation flags are
     //                considered among the zero-length entries
     //   needflag     passed through to checkword()
     //   in_compound  position inside a compound word (0 = not in a compound)
     //
     // Returns the dictionary entry found by checkword(), or NULL.
     // Side effects: sets the member sfx; in the general case it also sets
     // sfxflag, possibly sfxappnd, and appends `word` to the newline-separated
     // member `derived`.
2366
2367 struct hentry * AffixMgr::suffix_check (const char * word, int len,
2368        int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns,
2369        const FLAG cclass, const FLAG needflag, char in_compound)
2370 {
2371     struct hentry * rv = NULL;
2373
2374     PfxEntry* ep = (PfxEntry *) ppfx;
2375
2376     // first handle the special case of 0 length suffixes
2377     SfxEntry * se = (SfxEntry *) sStart[0];
2378
2379     while (se) {
2380         if (!cclass || se->getCont()) {
2381             // suffixes are not allowed in beginning of compounds
2382             if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2383              // except when signed with compoundpermitflag flag
2384              (se->getCont() && compoundpermitflag &&
2385                 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2386               // no circumfix flag in prefix and suffix
2387               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2388                    circumfix, ep->getContLen())) &&
2389                (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2390               // circumfix flag in prefix AND suffix
2391               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2392                    circumfix, ep->getContLen())) &&
2393                (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&
2394             // fogemorpheme: suffixes flagged onlyincompound need in_compound
2395               (in_compound ||
2396                  !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2397             // pseudoroot on prefix or first suffix
2398               (cclass ||
2399                    !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
2400                    (ppfx && !((ep->getCont()) &&
2401                      TESTAFF(ep->getCont(), pseudoroot,
2402                        ep->getContLen())))
2403               )
2404             ) {
2405                 rv = se->checkword(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass,
2406                     needflag, (in_compound ? 0 : onlyincompound));
2407                 if (rv) {
2408                     sfx=(AffEntry *)se; // BUG: sfx not stateless
2409                     return rv;
2410                 }
2411             }
2412         }
2413        se = se->getNext();
2414     }
2415
2416     // now handle the general case
         // an empty word has no last byte to index sStart[] with
         if (len <= 0) return NULL;
2417     unsigned char sp = *((const unsigned char *)(word + len - 1));
2418     SfxEntry * sptr = (SfxEntry *) sStart[sp];
2419
2420     while (sptr) {
2421         if (isRevSubset(sptr->getKey(), word + len - 1, len)
2422         ) {
2423             // suffixes are not allowed in beginning of compounds
2424             if ((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2425              // except when signed with compoundpermitflag flag
2426              (sptr->getCont() && compoundpermitflag &&
2427                 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2428               // no circumfix flag in prefix and suffix
2429               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2430                    circumfix, ep->getContLen())) &&
2431                (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2432               // circumfix flag in prefix AND suffix
2433               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2434                    circumfix, ep->getContLen())) &&
2435                (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&
2436             // fogemorpheme: suffixes flagged onlyincompound need in_compound
2437               (in_compound ||
2438                  !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2439             // pseudoroot on prefix or first suffix
2440               (cclass ||
2441                   !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
2442                   (ppfx && !((ep->getCont()) &&
2443                      TESTAFF(ep->getCont(), pseudoroot,
2444                        ep->getContLen())))
2445               )
2446             ) {
2447                 rv = sptr->checkword(word,len, sfxopts, ppfx, wlst,
2448                     maxSug, ns, cclass, needflag, (in_compound ? 0 : onlyincompound));
2449                 if (rv) {
2450                     sfx=(AffEntry *)sptr; // BUG: sfx not stateless
2451                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2452                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2453                     if (cclass || sptr->getCont()) {
2454                                 if (!derived) {
2455                                         derived = mystrdup(word);
2456                                 } else {
                                             // append "\n" + word; the buffer is sized
                                             // exactly, so a long list cannot overflow
                                             size_t dlen = strlen(derived);
                                             size_t wlen = strlen(word);
                                             char * joined = (char *) malloc(dlen + wlen + 2);
                                             if (joined) {
                                                 memcpy(joined, derived, dlen);
                                                 joined[dlen] = '\n';
                                                 memcpy(joined + dlen + 1, word, wlen + 1);
2460                                         free(derived);
                                                 derived = joined;
                                             }
                                             // on allocation failure the old list is kept
2462                                 }
2463                     }
2464                     return rv;
2465                 }
2466              }
2467              sptr = sptr->getNextEQ();
2468         } else {
2469              sptr = sptr->getNextNE();
2470         }
2471     }
2472
2473     return NULL;
2474 }
2475
2476 // check word for two-level suffixes
2477
2478 struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
2479        int sfxopts, AffEntry * ppfx, const FLAG needflag)
2480 {
2481     struct hentry * rv = NULL;
2482
2483     // first handle the special case of 0 length suffixes
2484     SfxEntry * se = (SfxEntry *) sStart[0];
2485     while (se) {
2486         if (contclasses[se->getFlag()])
2487         {
2488             rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
2489             if (rv) return rv;
2490         }
2491         se = se->getNext();
2492     }
2493
2494     // now handle the general case
2495     unsigned char sp = *((const unsigned char *)(word + len - 1));
2496     SfxEntry * sptr = (SfxEntry *) sStart[sp];
2497
2498     while (sptr) {
2499         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2500             if (contclasses[sptr->getFlag()])
2501             {
2502                 rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
2503                 if (rv) {
2504                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2505                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2506                     return rv;
2507                 }
2508             }
2509             sptr = sptr->getNextEQ();
2510         } else {
2511              sptr = sptr->getNextNE();
2512         }
2513     }
2514
2515     return NULL;
2516 }
2517
2518 #ifdef HUNSPELL_EXPERIMENTAL
2519 char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
2520        int sfxopts, AffEntry * ppfx, const FLAG needflag)
2521 {
2522     char result[MAXLNLEN];
2523     char result2[MAXLNLEN];
2524     char result3[MAXLNLEN];
2525
2526     char * st;
2527
2528     result[0] = '\0';
2529     result2[0] = '\0';
2530     result3[0] = '\0';
2531
2532     // first handle the special case of 0 length suffixes
2533     SfxEntry * se = (SfxEntry *) sStart[0];
2534     while (se) {
2535         if (contclasses[se->getFlag()])
2536         {
2537             st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2538             if (st) {
2539                 if (ppfx) {
2540                     if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
2541                 }
2542                 strcat(result, st);
2543                 free(st);
2544                 if (se->getMorph()) strcat(result, se->getMorph());
2545                 strcat(result, "\n");
2546             }
2547         }
2548         se = se->getNext();
2549     }
2550
2551     // now handle the general case
2552     unsigned char sp = *((const unsigned char *)(word + len - 1));
2553     SfxEntry * sptr = (SfxEntry *) sStart[sp];
2554
2555     while (sptr) {
2556         if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
2557             if (contclasses[sptr->getFlag()])
2558             {
2559                 st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
2560                 if (st) {
2561                     sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
2562                     if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
2563                     strcpy(result2, st);
2564                     free(st);
2565
2566                 result3[0] = '\0';
2567 #ifdef DEBUG
2568                 unsigned short flag = sptr->getFlag();
2569                 if (flag_mode == FLAG_NUM) {
2570                     sprintf(result3, "<%d>", sptr->getKey());
2571                 } else if (flag_mode == FLAG_LONG) {
2572                     sprintf(result3, "<%c%c>", flag >> 8, (flag << 8) >>8);
2573                 } else sprintf(result3, "<%c>", flag);
2574                 strcat(result3, ":");
2575 #endif
2576                 if (sptr->getMorph()) strcat(result3, sptr->getMorph());
2577                 strlinecat(result2, result3);
2578                 strcat(result2, "\n");
2579                 strcat(result,  result2);
2580                 }
2581             }
2582             sptr = sptr->getNextEQ();
2583         } else {
2584              sptr = sptr->getNextNE();
2585         }
2586     }
2587     if (result) return mystrdup(result);
2588     return NULL;
2589 }
2590
2591 char * AffixMgr::suffix_check_morph(const char * word, int len,
2592        int sfxopts, AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
2593 {
2594     char result[MAXLNLEN];
2595
2596     struct hentry * rv = NULL;
2597
2598     result[0] = '\0';
2599
2600     PfxEntry* ep = (PfxEntry *) ppfx;
2601
2602     // first handle the special case of 0 length suffixes
2603     SfxEntry * se = (SfxEntry *) sStart[0];
2604     while (se) {
2605         if (!cclass || se->getCont()) {
2606             // suffixes are not allowed in beginning of compounds
2607             if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2608              // except when signed with compoundpermitflag flag
2609              (se->getCont() && compoundpermitflag &&
2610                 TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
2611               // no circumfix flag in prefix and suffix
2612               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2613                    circumfix, ep->getContLen())) &&
2614                (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
2615               // circumfix flag in prefix AND suffix
2616               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2617                    circumfix, ep->getContLen())) &&
2618                (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen())))))  &&
2619             // fogemorpheme
2620               (in_compound ||
2621                  !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
2622             // pseudoroot on prefix or first suffix
2623               (cclass ||
2624                    !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
2625                    (ppfx && !((ep->getCont()) &&
2626                      TESTAFF(ep->getCont(), pseudoroot,
2627                        ep->getContLen())))
2628               )
2629             ))
2630             rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2631          while (rv) {
2632            if (ppfx) {
2633                 if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
2634             }
2635             if (complexprefixes && rv->description) strcat(result, rv->description);
2636             if (rv->description && ((!rv->astr) ||
2637                                         !TESTAFF(rv->astr, lemma_present, rv->alen)))
2638                                                strcat(result, rv->word);
2639             if (!complexprefixes && rv->description) strcat(result, rv->description);
2640             if (se->getMorph()) strcat(result, se->getMorph());
2641             strcat(result, "\n");
2642             rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2643          }
2644        }
2645        se = se->getNext();
2646     }
2647
2648     // now handle the general case
2649     unsigned char sp = *((const unsigned char *)(word + len - 1));
2650     SfxEntry * sptr = (SfxEntry *) sStart[sp];
2651
2652     while (sptr) {
2653         if (isRevSubset(sptr->getKey(), word + len - 1, len)
2654         ) {
2655             // suffixes are not allowed in beginning of compounds
2656             if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
2657              // except when signed with compoundpermitflag flag
2658              (sptr->getCont() && compoundpermitflag &&
2659                 TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
2660               // no circumfix flag in prefix and suffix
2661               ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
2662                    circumfix, ep->getContLen())) &&
2663                (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
2664               // circumfix flag in prefix AND suffix
2665               ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
2666                    circumfix, ep->getContLen())) &&
2667                (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))))  &&
2668             // fogemorpheme
2669               (in_compound ||
2670                  !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
2671             // pseudoroot on first suffix
2672               (cclass || !(sptr->getCont() &&
2673                    TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())))
2674             )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
2675             while (rv) {
2676                     if (ppfx) {
2677                         if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
2678                     }
2679                     if (complexprefixes && rv->description) strcat(result, rv->description);
2680                     if (rv->description && ((!rv->astr) ||
2681                         !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word);
2682                     if (!complexprefixes && rv->description) strcat(result, rv->description);
2683 #ifdef DEBUG
2684                 unsigned short flag = sptr->getFlag();
2685                 if (flag_mode == FLAG_NUM) {
2686                     sprintf(result, "<%d>", sptr->getKey());
2687                 } else if (flag_mode == FLAG_LONG) {
2688                     sprintf(result, "<%c%c>", flag >> 8, (flag << 8) >>8);
2689                 } else sprintf(result, "<%c>", flag);
2690                 strcat(result, ":");
2691 #endif
2692
2693                 if (sptr->getMorph()) strcat(result, sptr->getMorph());
2694                 strcat(result, "\n");
2695                 rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
2696             }
2697              sptr = sptr->getNextEQ();
2698         } else {
2699              sptr = sptr->getNextNE();
2700         }
2701     }
2702
2703     if (*result) return mystrdup(result);
2704     return NULL;
2705 }
2706 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
2707
2708
2709 // check if word with affixes is correctly spelled
2710 struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
2711 {
2712     struct hentry * rv= NULL;
2713     if (derived) free(derived);
2714     derived =  NULL;
2715
2716     // check all prefixes (also crossed with suffixes if allowed)
2717     rv = prefix_check(word, len, in_compound, needflag);
2718     if (rv) return rv;
2719
2720     // if still not found check all suffixes
2721     rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
2722
2723     if (havecontclass) {
2724         sfx = NULL;
2725         pfx = NULL;
2726         if (rv) return rv;
2727         // if still not found check all two-level suffixes
2728         rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
2729         if (rv) return rv;
2730         // if still not found check all two-level suffixes
2731         rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
2732     }
2733     return rv;
2734 }
2735
2736 #ifdef HUNSPELL_EXPERIMENTAL
2737 // check if word with affixes is correctly spelled
2738 char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
2739 {
2740     char result[MAXLNLEN];
2741     char * st = NULL;
2742
2743     *result = '\0';
2744
2745     // check all prefixes (also crossed with suffixes if allowed)
2746     st = prefix_check_morph(word, len, in_compound);
2747     if (st) {
2748         strcat(result, st);
2749         free(st);
2750     }
2751
2752     // if still not found check all suffixes
2753     st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
2754     if (st) {
2755         strcat(result, st);
2756         free(st);
2757     }
2758
2759     if (havecontclass) {
2760         sfx = NULL;
2761         pfx = NULL;
2762         // if still not found check all two-level suffixes
2763         st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
2764         if (st) {
2765             strcat(result, st);
2766             free(st);
2767         }
2768
2769         // if still not found check all two-level suffixes
2770         st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
2771         if (st) {
2772             strcat(result, st);
2773             free(st);
2774         }
2775     }
2776
2777     return mystrdup(result);
2778 }
2779 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
2780
2781
2782 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
2783     int wl, const unsigned short * ap, unsigned short al, char * bad, int badl)
2784 {
2785
2786     int nh=0;
2787
2788     // first add root word to list
2789     if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) ||
2790          (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
2791        wlst[nh].word = mystrdup(ts);
2792        wlst[nh].allow = (1 == 0);
2793        nh++;
2794     }
2795
2796     // handle suffixes
2797     for (int i = 0; i < al; i++) {
2798        unsigned short c = (unsigned short) ap[i];
2799        SfxEntry * sptr = (SfxEntry *)sFlag[c];
2800        while (sptr) {
2801          if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
2802                 (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) &&
2803                 // check pseudoroot flag
2804                 !(sptr->getCont() && ((pseudoroot &&
2805                       TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
2806                   (circumfix &&
2807                       TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) ||
2808                   (onlyincompound &&
2809                       TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
2810                 ) {
2811             char * newword = sptr->add(ts, wl);
2812             if (newword) {
2813                 if (nh < maxn) {
2814                     wlst[nh].word = newword;
2815                     wlst[nh].allow = sptr->allowCross();
2816                 nh++;
2817                 } else {
2818                     free(newword);
2819                 }
2820             }
2821          }
2822          sptr = (SfxEntry *)sptr ->getFlgNxt();
2823        }
2824     }
2825
2826     int n = nh;
2827
2828     // handle cross products of prefixes and suffixes
2829     for (int j=1;j<n ;j++)
2830        if (wlst[j].allow) {
2831           for (int k = 0; k < al; k++) {
2832              unsigned short c = (unsigned short) ap[k];
2833              PfxEntry * cptr = (PfxEntry *) pFlag[c];
2834              while (cptr) {
2835                 if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
2836                         (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
2837                     int l1 = strlen(wlst[j].word);
2838                     char * newword = cptr->add(wlst[j].word, l1);
2839                     if (newword) {
2840                        if (nh < maxn) {
2841                           wlst[nh].word = newword;
2842                           wlst[nh].allow = cptr->allowCross();
2843                           nh++;
2844                        } else {
2845                           free(newword);
2846                        }
2847                     }
2848                 }
2849                 cptr = (PfxEntry *)cptr ->getFlgNxt();
2850              }
2851           }
2852        }
2853
2854
2855     // now handle pure prefixes
2856     for (int m = 0; m < al; m ++) {
2857        unsigned short c = (unsigned short) ap[m];
2858        PfxEntry * ptr = (PfxEntry *) pFlag[c];
2859        while (ptr) {
2860          if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
2861                 (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) &&
2862                 // check pseudoroot flag
2863                 !(ptr->getCont() && ((pseudoroot &&
2864                       TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) ||
2865                      (circumfix &&
2866                       TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) ||
2867                   (onlyincompound &&
2868                       TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
2869                 ) {
2870             char * newword = ptr->add(ts, wl);
2871             if (newword) {
2872                 if (nh < maxn) {
2873                     wlst[nh].word = newword;
2874                     wlst[nh].allow = ptr->allowCross();
2875                     nh++;
2876                 } else {
2877                     free(newword);
2878                 }
2879             }
2880          }
2881          ptr = (PfxEntry *)ptr ->getFlgNxt();
2882        }
2883     }
2884
2885     return nh;
2886 }
2887
2888
2889
2890 // return length of replacing table
2891 int AffixMgr::get_numrep()
2892 {
2893   return numrep;
2894 }
2895
2896 // return replacing table
2897 struct replentry * AffixMgr::get_reptable()
2898 {
2899   if (! reptable ) return NULL;
2900   return reptable;
2901 }
2902
2903 // return length of character map table
2904 int AffixMgr::get_nummap()
2905 {
2906   return nummap;
2907 }
2908
2909 // return character map table
2910 struct mapentry * AffixMgr::get_maptable()
2911 {
2912   if (! maptable ) return NULL;
2913   return maptable;
2914 }
2915
2916 // return length of word break table
2917 int AffixMgr::get_numbreak()
2918 {
2919   return numbreak;
2920 }
2921
2922 // return character map table
2923 char ** AffixMgr::get_breaktable()
2924 {
2925   if (! breaktable ) return NULL;
2926   return breaktable;
2927 }
2928
2929 // return text encoding of dictionary
2930 char * AffixMgr::get_encoding()
2931 {
2932   if (! encoding ) {
2933       encoding = mystrdup("ISO8859-1");
2934   }
2935   return mystrdup(encoding);
2936 }
2937
2938 // return text encoding of dictionary
2939 int AffixMgr::get_langnum()
2940 {
2941   return langnum;
2942 }
2943
2944 // return double prefix option
2945 int AffixMgr::get_complexprefixes()
2946 {
2947   return complexprefixes;
2948 }
2949
2950 FLAG AffixMgr::get_keepcase()
2951 {
2952   return keepcase;
2953 }
2954
2955 int AffixMgr::get_checksharps()
2956 {
2957   return checksharps;
2958 }
2959
2960 // return the preferred ignore string for suggestions
2961 char * AffixMgr::get_ignore()
2962 {
2963   if (!ignorechars) return NULL;
2964   return mystrdup(ignorechars);
2965 }
2966
2967 // return the preferred ignore string for suggestions
2968 unsigned short * AffixMgr::get_ignore_utf16(int * len)
2969 {
2970   *len = ignorechars_utf16_len;
2971   return ignorechars_utf16;
2972 }
2973
2974 // return the preferred try string for suggestions
2975 char * AffixMgr::get_try_string()
2976 {
2977   if (! trystring ) return NULL;
2978   return mystrdup(trystring);
2979 }
2980
2981 // return the preferred try string for suggestions
2982 const char * AffixMgr::get_wordchars()
2983 {
2984   return wordchars;
2985 }
2986
2987 unsigned short * AffixMgr::get_wordchars_utf16(int * len)
2988 {
2989   *len = wordchars_utf16_len;
2990   return wordchars_utf16;
2991 }
2992
2993 // is there compounding?
2994 int AffixMgr::get_compound()
2995 {
2996   return compoundflag || compoundbegin || numdefcpd;
2997 }
2998
2999 // return the compound words control flag
3000 FLAG AffixMgr::get_compoundflag()
3001 {
3002   return compoundflag;
3003 }
3004
3005 // return the forbidden words control flag
3006 FLAG AffixMgr::get_forbiddenword()
3007 {
3008   return forbiddenword;
3009 }
3010
3011 // return the forbidden words control flag
3012 FLAG AffixMgr::get_nosuggest()
3013 {
3014   return nosuggest;
3015 }
3016
3017 // return the forbidden words flag modify flag
3018 FLAG AffixMgr::get_pseudoroot()
3019 {
3020   return pseudoroot;
3021 }
3022
3023 // return the onlyincompound flag
3024 FLAG AffixMgr::get_onlyincompound()
3025 {
3026   return onlyincompound;
3027 }
3028
3029 // return the compound word signal flag
3030 FLAG AffixMgr::get_compoundroot()
3031 {
3032   return compoundroot;
3033 }
3034
3035 // return the compound begin signal flag
3036 FLAG AffixMgr::get_compoundbegin()
3037 {
3038   return compoundbegin;
3039 }
3040
3041 // return the value of checknum
3042 int AffixMgr::get_checknum()
3043 {
3044   return checknum;
3045 }
3046
3047 // return the value of prefix
3048 const char * AffixMgr::get_prefix()
3049 {
3050   if (pfx) return ((PfxEntry *)pfx)->getKey();
3051   return NULL;
3052 }
3053
3054 // return the value of suffix
3055 const char * AffixMgr::get_suffix()
3056 {
3057   return sfxappnd;
3058 }
3059
3060 // return the value of derived form (base word with first suffix).
3061 const char * AffixMgr::get_derived()
3062 {
3063   return derived;
3064 }
3065
3066 // return the value of suffix
3067 const char * AffixMgr::get_version()
3068 {
3069   return version;
3070 }
3071
3072 // return lemma_present flag
3073 FLAG AffixMgr::get_lemma_present()
3074 {
3075   return lemma_present;
3076 }
3077
3078 // utility method to look up root words in hash table
3079 struct hentry * AffixMgr::lookup(const char * word)
3080 {
3081   if (! pHMgr) return NULL;
3082   return pHMgr->lookup(word);
3083 }
3084
3085 // return the value of suffix
3086 const int AffixMgr::have_contclass()
3087 {
3088   return havecontclass;
3089 }
3090
3091 // return utf8
3092 int AffixMgr::get_utf8()
3093 {
3094   return utf8;
3095 }
3096
3097 // return nosplitsugs
3098 int AffixMgr::get_maxngramsugs(void)
3099 {
3100   return maxngramsugs;
3101 }
3102
3103 // return nosplitsugs
3104 int AffixMgr::get_nosplitsugs(void)
3105 {
3106   return nosplitsugs;
3107 }
3108
3109 // return sugswithdots
3110 int AffixMgr::get_sugswithdots(void)
3111 {
3112   return sugswithdots;
3113 }
3114
3115 /* parse flag */
3116 int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) {
3117    char * s = NULL;
3118    if (*out != FLAG_NULL) {
3119       HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
3120       return 1;
3121    }
3122    if (parse_string(line, &s, name)) return 1;
3123    *out = pHMgr->decode_flag(s);
3124    free(s);
3125    return 0;
3126 }
3127
3128 /* parse num */
3129 int AffixMgr::parse_num(char * line, int * out, const char * name) {
3130    char * s = NULL;
3131    if (*out != -1) {
3132       HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name);
3133       return 1;
3134    }
3135    if (parse_string(line, &s, name)) return 1;
3136    *out = atoi(s);
3137    free(s);
3138    return 0;
3139 }
3140
3141 /* parse in the max syllablecount of compound words and  */
3142 int  AffixMgr::parse_cpdsyllable(char * line)
3143 {
3144    char * tp = line;
3145    char * piece;
3146    int i = 0;
3147    int np = 0;
3148    w_char w[MAXWORDLEN];
3149    piece = mystrsep(&tp, 0);
3150    while (piece) {
3151       if (*piece != '\0') {
3152           switch(i) {
3153              case 0: { np++; break; }
3154              case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
3155              case 2: {
3156                 if (!utf8) {
3157                     cpdvowels = mystrdup(piece);
3158                 } else {
3159                     int n = u8_u16(w, MAXWORDLEN, piece);
3160                     if (n > 0) {
3161                         flag_qsort((unsigned short *) w, 0, n);
3162                         cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
3163                         if (!cpdvowels_utf16) return 1;
3164                         memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
3165                     }
3166                     cpdvowels_utf16_len = n;
3167                 }
3168                 np++;
3169                 break;
3170              }
3171              default: break;
3172           }
3173           i++;
3174       }
3175       free(piece);
3176       piece = mystrsep(&tp, 0);
3177    }
3178    if (np < 2) {
3179       HUNSPELL_WARNING(stderr, "error: missing compoundsyllable information\n");
3180       return 1;
3181    }
3182    if (np == 2) cpdvowels = mystrdup("aeiouAEIOU");
3183    return 0;
3184 }
3185
3186 /* parse in the typical fault correcting table */
3187 int  AffixMgr::parse_reptable(char * line, FILE * af)
3188 {
3189    if (numrep != 0) {
3190       HUNSPELL_WARNING(stderr, "error: duplicate REP tables used\n");
3191       return 1;
3192    }
3193    char * tp = line;
3194    char * piece;
3195    int i = 0;
3196    int np = 0;
3197    piece = mystrsep(&tp, 0);
3198    while (piece) {
3199        if (*piece != '\0') {
3200           switch(i) {
3201              case 0: { np++; break; }
3202              case 1: {
3203                        numrep = atoi(piece);
3204                        if (numrep < 1) {
3205                           HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n");
3206                           free(piece);
3207                           return 1;
3208                        }
3209                        reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
3210                        if (!reptable) {
3211                          free(piece);
3212                          return 1;
3213                        }
3214                        np++;
3215                        break;
3216                      }
3217              default: break;
3218           }
3219           i++;
3220        }
3221        free(piece);
3222        piece = mystrsep(&tp, 0);
3223    }
3224    if (np != 2) {
3225       HUNSPELL_WARNING(stderr, "error: missing replacement table information\n");
3226       return 1;
3227    }
3228
3229    /* now parse the numrep lines to read in the remainder of the table */
3230    char * nl = line;
3231    for (int j=0; j < numrep; j++) {
3232         if (!fgets(nl,MAXLNLEN,af)) return 1;
3233         mychomp(nl);
3234         tp = nl;
3235         i = 0;
3236         reptable[j].pattern = NULL;
3237         reptable[j].pattern2 = NULL;
3238         piece = mystrsep(&tp, 0);
3239         while (piece) {
3240            if (*piece != '\0') {
3241                switch(i) {
3242                   case 0: {
3243                              if (strncmp(piece,"REP",3) != 0) {
3244                                  HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
3245                                  free(piece);
3246                                  return 1;
3247                              }
3248                              break;
3249                           }
3250                   case 1: { reptable[j].pattern = mystrrep(mystrdup(piece),"_"," "); break; }
3251                   case 2: { reptable[j].pattern2 = mystrrep(mystrdup(piece),"_"," "); break; }
3252                   default: break;
3253                }
3254                i++;
3255            }
3256            free(piece);
3257            piece = mystrsep(&tp, 0);
3258         }
3259         if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
3260              HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n");
3261              return 1;
3262         }
3263    }
3264    return 0;
3265 }
3266
3267 /* parse in the checkcompoundpattern table */
3268 int  AffixMgr::parse_checkcpdtable(char * line, FILE * af)
3269 {
3270    if (numcheckcpd != 0) {
3271       HUNSPELL_WARNING(stderr, "error: duplicate compound pattern tables used\n");
3272       return 1;
3273    }
3274    char * tp = line;
3275    char * piece;
3276    int i = 0;
3277    int np = 0;
3278    piece = mystrsep(&tp, 0);
3279    while (piece) {
3280        if (*piece != '\0') {
3281           switch(i) {
3282              case 0: { np++; break; }
3283              case 1: {
3284                        numcheckcpd = atoi(piece);
3285                        if (numcheckcpd < 1) {
3286                           HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n");
3287                           free(piece);
3288                           return 1;
3289                        }
3290                        checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry));
3291                        if (!checkcpdtable) {
3292                          free(piece);
3293                          return 1;
3294                        }
3295                        np++;
3296                        break;
3297                      }
3298              default: break;
3299           }
3300           i++;
3301        }
3302        free(piece);
3303        piece = mystrsep(&tp, 0);
3304    }
3305    if (np != 2) {
3306       HUNSPELL_WARNING(stderr, "error: missing compound pattern table information\n");
3307       return 1;
3308    }
3309
3310    /* now parse the numcheckcpd lines to read in the remainder of the table */
3311    char * nl = line;
3312    for (int j=0; j < numcheckcpd; j++) {
3313         if (!fgets(nl,MAXLNLEN,af)) return 1;
3314         mychomp(nl);
3315         tp = nl;
3316         i = 0;
3317         checkcpdtable[j].pattern = NULL;
3318         checkcpdtable[j].pattern2 = NULL;
3319         piece = mystrsep(&tp, 0);
3320         while (piece) {
3321            if (*piece != '\0') {
3322                switch(i) {
3323                   case 0: {
3324                              if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
3325                                  HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
3326                                  free(piece);
3327                                  return 1;
3328                              }
3329                              break;
3330                           }
3331                   case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; }
3332                   case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; }
3333                   default: break;
3334                }
3335                i++;
3336            }
3337            free(piece);
3338            piece = mystrsep(&tp, 0);
3339         }
3340         if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
3341              HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n");
3342              return 1;
3343         }
3344    }
3345    return 0;
3346 }
3347
3348 /* parse in the compound rule table */
3349 int  AffixMgr::parse_defcpdtable(char * line, FILE * af)
3350 {
3351    if (numdefcpd != 0) {
3352       HUNSPELL_WARNING(stderr, "error: duplicate compound rule tables used\n");
3353       return 1;
3354    }
3355    char * tp = line;
3356    char * piece;
3357    int i = 0;
3358    int np = 0;
3359    piece = mystrsep(&tp, 0);
3360    while (piece) {
3361        if (*piece != '\0') {
3362           switch(i) {
3363              case 0: { np++; break; }
3364              case 1: {
3365                        numdefcpd = atoi(piece);
3366                        if (numdefcpd < 1) {
3367                           HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n");
3368                           free(piece);
3369                           return 1;
3370                        }
3371                        defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
3372                        if (!defcpdtable) {
3373                            free(piece);
3374                            return 1;
3375                        }
3376                        np++;
3377                        break;
3378                      }
3379              default: break;
3380           }
3381           i++;
3382        }
3383        free(piece);
3384        piece = mystrsep(&tp, 0);
3385    }
3386    if (np != 2) {
3387       HUNSPELL_WARNING(stderr, "error: missing compound rule table information\n");
3388       return 1;
3389    }
3390
3391    /* now parse the numdefcpd lines to read in the remainder of the table */
3392    char * nl = line;
3393    for (int j=0; j < numdefcpd; j++) {
3394         if (!fgets(nl,MAXLNLEN,af)) return 1;
3395         mychomp(nl);
3396         tp = nl;
3397         i = 0;
3398         defcpdtable[j].def = NULL;
3399         piece = mystrsep(&tp, 0);
3400         while (piece) {
3401            if (*piece != '\0') {
3402                switch(i) {
3403                   case 0: {
3404                              if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
3405                                  HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
3406                                  free(piece);
3407                                  return 1;
3408                              }
3409                              break;
3410                           }
3411                   case 1: {
3412                             defcpdtable[j].len =
3413                                 pHMgr->decode_flags(&(defcpdtable[j].def), piece);
3414                             break;
3415                            }
3416                   default: break;
3417                }
3418                i++;
3419            }
3420            free(piece);
3421            piece = mystrsep(&tp, 0);
3422         }
3423         if (!defcpdtable[j].len) {
3424              HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n");
3425              return 1;
3426         }
3427    }
3428    return 0;
3429 }
3430
3431
3432 /* parse in the character map table */
3433 int  AffixMgr::parse_maptable(char * line, FILE * af)
3434 {
3435    if (nummap != 0) {
3436       HUNSPELL_WARNING(stderr, "error: duplicate MAP tables used\n");
3437       return 1;
3438    }
3439    char * tp = line;
3440    char * piece;
3441    int i = 0;
3442    int np = 0;
3443    piece = mystrsep(&tp, 0);
3444    while (piece) {
3445        if (*piece != '\0') {
3446           switch(i) {
3447              case 0: { np++; break; }
3448              case 1: {
3449                        nummap = atoi(piece);
3450                        if (nummap < 1) {
3451                           HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n");
3452                           free(piece);
3453                           return 1;
3454                        }
3455                        maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
3456                        if (!maptable) {
3457                          free(piece);
3458                          return 1;
3459                        }
3460                        np++;
3461                        break;
3462                      }
3463              default: break;
3464           }
3465           i++;
3466        }
3467        free(piece);
3468        piece = mystrsep(&tp, 0);
3469    }
3470    if (np != 2) {
3471       HUNSPELL_WARNING(stderr, "error: missing map table information\n");
3472       return 1;
3473    }
3474
3475    /* now parse the nummap lines to read in the remainder of the table */
3476    char * nl = line;
3477    for (int j=0; j < nummap; j++) {
3478         if (!fgets(nl,MAXLNLEN,af)) return 1;
3479         mychomp(nl);
3480         tp = nl;
3481         i = 0;
3482         maptable[j].set = NULL;
3483         maptable[j].len = 0;
3484         piece = mystrsep(&tp, 0);
3485         while (piece) {
3486            if (*piece != '\0') {
3487                switch(i) {
3488                   case 0: {
3489                              if (strncmp(piece,"MAP",3) != 0) {
3490                                  HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
3491                                  free(piece);
3492                                  return 1;
3493                              }
3494                              break;
3495                           }
3496                   case 1: {
3497                             maptable[j].len = 0;
3498                             maptable[j].set = NULL;
3499                             maptable[j].set_utf16 = NULL;
3500                             if (!utf8) {
3501                                 maptable[j].set = mystrdup(piece);
3502                                 maptable[j].len = strlen(maptable[j].set);
3503                             } else {
3504                                 w_char w[MAXWORDLEN];
3505                                 int n = u8_u16(w, MAXWORDLEN, piece);
3506                                 if (n > 0) {
3507                                     flag_qsort((unsigned short *) w, 0, n);
3508                                     maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char));
3509                                     if (!maptable[j].set_utf16) return 1;
3510                                     memcpy(maptable[j].set_utf16, w, n * sizeof(w_char));
3511                                 }
3512                                 maptable[j].len = n;
3513                             }
3514                             break; }
3515                   default: break;
3516                }
3517                i++;
3518            }
3519            free(piece);
3520            piece = mystrsep(&tp, 0);
3521         }
3522         if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) {
3523              HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
3524              return 1;
3525         }
3526    }
3527    return 0;
3528 }
3529
3530 /* parse in the word breakpoint table */
3531 int  AffixMgr::parse_breaktable(char * line, FILE * af)
3532 {
3533    if (numbreak != 0) {
3534       HUNSPELL_WARNING(stderr, "error: duplicate word breakpoint tables used\n");
3535       return 1;
3536    }
3537    char * tp = line;
3538    char * piece;
3539    int i = 0;
3540    int np = 0;
3541    piece = mystrsep(&tp, 0);
3542    while (piece) {
3543        if (*piece != '\0') {
3544           switch(i) {
3545              case 0: { np++; break; }
3546              case 1: {
3547                        numbreak = atoi(piece);
3548                        if (numbreak < 1) {
3549                           HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n");
3550                           free(piece);
3551                           return 1;
3552                        }
3553                        breaktable = (char **) malloc(numbreak * sizeof(char *));
3554                        if (!breaktable) {
3555                          free(piece);
3556                          return 1;
3557                        }
3558                        np++;
3559                        break;
3560                      }
3561              default: break;
3562           }
3563           i++;
3564        }
3565        free(piece);
3566        piece = mystrsep(&tp, 0);
3567    }
3568    if (np != 2) {
3569       HUNSPELL_WARNING(stderr, "error: missing word breakpoint table information\n");
3570       return 1;
3571    }
3572
3573    /* now parse the numbreak lines to read in the remainder of the table */
3574    char * nl = line;
3575    for (int j=0; j < numbreak; j++) {
3576         if (!fgets(nl,MAXLNLEN,af)) return 1;
3577         mychomp(nl);
3578         tp = nl;
3579         i = 0;
3580         piece = mystrsep(&tp, 0);
3581         while (piece) {
3582            if (*piece != '\0') {
3583                switch(i) {
3584                   case 0: {
3585                              if (strncmp(piece,"BREAK",5) != 0) {
3586                                  HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
3587                                  free(piece);
3588                                  return 1;
3589                              }
3590                              break;
3591                           }
3592                   case 1: {
3593                             breaktable[j] = mystrdup(piece);
3594                             break;
3595                           }
3596                   default: break;
3597                }
3598                i++;
3599            }
3600            free(piece);
3601            piece = mystrsep(&tp, 0);
3602         }
3603         if (!breaktable) {
3604              HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n");
3605              return 1;
3606         }
3607    }
3608    return 0;
3609 }
3610
3611 int  AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags)
3612 {
3613    int numents = 0;      // number of affentry structures to parse
3614
3615    unsigned short aflag = 0;      // affix char identifier
3616
3617    char ff=0;
3618    struct affentry * ptr= NULL;
3619    struct affentry * nptr= NULL;
3620
3621    char * tp = line;
3622    char * nl = line;
3623    char * piece;
3624    int i = 0;
3625
3626    // checking lines with bad syntax
3627 #ifdef DEBUG
3628    int basefieldnum = 0;
3629 #endif
3630
3631    // split affix header line into pieces
3632
3633    int np = 0;
3634    piece = mystrsep(&tp, 0);
3635    while (piece) {
3636       if (*piece != '\0') {
3637           switch(i) {
3638              // piece 1 - is type of affix
3639              case 0: { np++; break; }
3640
3641              // piece 2 - is affix char
3642              case 1: {
3643                     np++;
3644                     aflag = pHMgr->decode_flag(piece);
3645                     if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
3646                         ((at == 'P') && (dupflags[aflag] & dupPFX))) {
3647                         HUNSPELL_WARNING(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl);
3648                         // return 1; XXX permissive mode for bad dictionaries
3649                     }
3650                     dupflags[aflag] += ((at == 'S') ? dupSFX : dupPFX);
3651                     break;
3652                     }
3653              // piece 3 - is cross product indicator
3654              case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
3655
3656              // piece 4 - is number of affentries
3657              case 3: {
3658                        np++;
3659                        numents = atoi(piece);
3660                        if (numents == 0) {
3661                            char * err = pHMgr->encode_flag(aflag);
3662                            HUNSPELL_WARNING(stderr, "error: affix %s header has incorrect entry count in line %s\n",
3663                                    err, nl);
3664                            free(err);
3665                            return 1;
3666                        }
3667                        ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
3668                        if (!ptr) return 1;
3669                        ptr->opts = ff;
3670                        if (utf8) ptr->opts += aeUTF8;
3671                        if (pHMgr->is_aliasf()) ptr->opts += aeALIASF;
3672 #ifdef HUNSPELL_EXPERIMENTAL
3673                        if (pHMgr->is_aliasm()) ptr->opts += aeALIASM;
3674 #endif
3675                        ptr->aflag = aflag;
3676                      }
3677
3678              default: break;
3679           }
3680           i++;
3681       }
3682       free(piece);
3683       piece = mystrsep(&tp, 0);
3684    }
3685    // check to make sure we parsed enough pieces
3686    if (np != 4) {
3687        char * err = pHMgr->encode_flag(aflag);
3688        HUNSPELL_WARNING(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl);
3689        free(err);
3690        free(ptr);
3691        return 1;
3692    }
3693
3694    // store away ptr to first affentry
3695    nptr = ptr;
3696
3697    // now parse numents affentries for this affix
3698    for (int j=0; j < numents; j++) {
3699       if (!fgets(nl,MAXLNLEN,af)) return 1;
3700       mychomp(nl);
3701       tp = nl;
3702       i = 0;
3703       np = 0;
3704
3705       // split line into pieces
3706       piece = mystrsep(&tp, 0);
3707       while (piece) {
3708          if (*piece != '\0') {
3709              switch(i) {
3710                 // piece 1 - is type
3711                 case 0: {
3712                           np++;
3713                           if (nptr != ptr) nptr->opts = ptr->opts;
3714                           break;
3715                         }
3716
3717                 // piece 2 - is affix char
3718                 case 1: {
3719                           np++;
3720                           if (pHMgr->decode_flag(piece) != aflag) {
3721                               char * err = pHMgr->encode_flag(aflag);
3722                               HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
3723                               HUNSPELL_WARNING(stderr, "error: possible incorrect count\n");
3724                               free(err);
3725                               free(piece);
3726                               return 1;
3727                           }
3728
3729                           if (nptr != ptr) nptr->aflag = ptr->aflag;
3730                           break;
3731                         }
3732
3733                 // piece 3 - is string to strip or 0 for null
3734                 case 2: {
3735                           np++;
3736                           if (complexprefixes) {
3737                             if (utf8) reverseword_utf(piece); else reverseword(piece);
3738                           }
3739                           nptr->strip = mystrdup(piece);
3740                           nptr->stripl = (unsigned char) strlen(nptr->strip);
3741                           if (strcmp(nptr->strip,"0") == 0) {
3742                               free(nptr->strip);
3743                               nptr->strip=mystrdup("");
3744                               nptr->stripl = 0;
3745                           }
3746                           break;
3747                         }
3748
3749                 // piece 4 - is affix string or 0 for null
3750                 case 3: {
3751                           char * dash;
3752 #ifdef HUNSPELL_EXPERIMENTAL
3753                           nptr->morphcode = NULL;
3754 #endif
3755                           nptr->contclass = NULL;
3756                           nptr->contclasslen = 0;
3757                           np++;
3758                           dash = strchr(piece, '/');
3759                           if (dash) {
3760                             *dash = '\0';
3761
3762                             if (ignorechars) {
3763                               if (utf8) {
3764                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
3765                               } else {
3766                                 remove_ignored_chars(piece,ignorechars);
3767                               }
3768                             }
3769
3770                             if (complexprefixes) {
3771                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
3772                             }
3773                             nptr->appnd = mystrdup(piece);
3774
3775                             if (pHMgr->is_aliasf()) {
3776                                 int index = atoi(dash + 1);
3777                                 nptr->contclasslen = (unsigned short) pHMgr->get_aliasf(index, &(nptr->contclass));
3778                             } else {
3779                                 nptr->contclasslen = (unsigned short) pHMgr->decode_flags(&(nptr->contclass), dash + 1);
3780                                 flag_qsort(nptr->contclass, 0, nptr->contclasslen);
3781                             }
3782                             *dash = '/';
3783
3784                             havecontclass = 1;
3785                             for (unsigned short _i = 0; _i < nptr->contclasslen; _i++) {
3786                               contclasses[(nptr->contclass)[_i]] = 1;
3787                             }
3788                           } else {
3789                             if (ignorechars) {
3790                               if (utf8) {
3791                                 remove_ignored_chars_utf(piece, ignorechars_utf16, ignorechars_utf16_len);
3792                               } else {
3793                                 remove_ignored_chars(piece,ignorechars);
3794                               }
3795                             }
3796
3797                             if (complexprefixes) {
3798                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
3799                             }
3800                             nptr->appnd = mystrdup(piece);
3801                           }
3802
3803                           nptr->appndl = (unsigned char) strlen(nptr->appnd);
3804                           if (strcmp(nptr->appnd,"0") == 0) {
3805                               free(nptr->appnd);
3806                               nptr->appnd=mystrdup("");
3807                               nptr->appndl = 0;
3808                           }
3809                           break;
3810                         }
3811
3812                 // piece 5 - is the conditions descriptions
3813                 case 4: {
3814                           np++;
3815                           if (complexprefixes) {
3816                             int neg = 0;
3817                             if (utf8) reverseword_utf(piece); else reverseword(piece);
3818                             // reverse condition
3819                             for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
3820                                 switch(*k) {
3821                                   case '[': {
3822                                         if (neg) *(k+1) = '['; else *k = ']';
3823                                         break;
3824                                     }
3825                                   case ']': {
3826                                         *k = '[';
3827                                         if (neg) *(k+1) = '^';
3828                                         neg = 0;
3829                                         break;
3830                                     }
3831                                   case '^': {
3832                                        if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
3833                                        break;
3834                                     }
3835                                   default: {
3836                                     if (neg) *(k+1) = *k;
3837                                   }
3838                                }
3839                             }
3840                           }
3841                           if (nptr->stripl && (strcmp(piece, ".") != 0) &&
3842                             redundant_condition(at, nptr->strip, nptr->stripl, piece, nl))
3843                                 strcpy(piece, ".");
3844                           if (encodeit(nptr,piece)) return 1;
3845                          break;
3846                 }
3847
3848 #ifdef HUNSPELL_EXPERIMENTAL
3849                 case 5: {
3850                           np++;
3851                           if (pHMgr->is_aliasm()) {
3852                             int index = atoi(piece);
3853                             nptr->morphcode = pHMgr->get_aliasm(index);
3854                           } else {
3855                             if (complexprefixes) {
3856                                 if (utf8) reverseword_utf(piece); else reverseword(piece);
3857                             }
3858                             nptr->morphcode = mystrdup(piece);
3859                           }
3860                           break;
3861                 }
3862 #endif
3863
3864                 default: break;
3865              }
3866              i++;
3867          }
3868          free(piece);
3869          piece = mystrsep(&tp, 0);
3870       }
3871       // check to make sure we parsed enough pieces
3872       if (np < 4) {
3873           char * err = pHMgr->encode_flag(aflag);
3874           HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
3875           free(err);
3876           free(ptr);
3877           return 1;
3878       }
3879
3880 #ifdef DEBUG
3881 #ifdef HUNSPELL_EXPERIMENTAL
3882       // detect unnecessary fields, excepting comments
3883       if (basefieldnum) {
3884         int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
3885           if (fieldnum != basefieldnum)
3886             HUNSPELL_WARNING(stderr, "warning: bad field number:\n%s\n", nl);
3887       } else {
3888         basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
3889       }
3890 #endif
3891 #endif
3892       nptr++;
3893    }
3894
3895    // now create SfxEntry or PfxEntry objects and use links to
3896    // build an ordered (sorted by affix string) list
3897    nptr = ptr;
3898    for (int k = 0; k < numents; k++) {
3899       if (at == 'P') {
3900           PfxEntry * pfxptr = new PfxEntry(this,nptr);
3901           build_pfxtree((AffEntry *)pfxptr);
3902       } else {
3903           SfxEntry * sfxptr = new SfxEntry(this,nptr);
3904           build_sfxtree((AffEntry *)sfxptr);
3905       }
3906       nptr++;
3907    }
3908    free(ptr);
3909    return 0;
3910 }
3911
3912 int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) {
3913   int condl = strlen(cond);
3914   int i;
3915   int j;
3916   int neg;
3917   int in;
3918   if (ft == 'P') { // prefix
3919     if (strncmp(strip, cond, condl) == 0) return 1;
3920     if (utf8) {
3921     } else {
3922       for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
3923         if (cond[j] != '[') {
3924           if (cond[j] != strip[i]) {
3925             HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
3926           }
3927         } else {
3928           neg = (cond[j+1] == '^') ? 1 : 0;
3929           in = 0;
3930           do {
3931             j++;
3932             if (strip[i] == cond[j]) in = 1;
3933           } while ((j < (condl - 1)) && (cond[j] != ']'));
3934           if (j == (condl - 1) && (cond[j] != ']')) {
3935             HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line);
3936             return 0;
3937           }
3938           if ((!neg && !in) || (neg && in)) {
3939             HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
3940             return 0;
3941           }
3942         }
3943       }
3944       if (j >= condl) return 1;
3945     }
3946   } else { // suffix
3947     if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
3948     if (utf8) {
3949     } else {
3950       for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
3951         if (cond[j] != ']') {
3952           if (cond[j] != strip[i]) {
3953             HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
3954           }
3955         } else {
3956           in = 0;
3957           do {
3958             j--;
3959             if (strip[i] == cond[j]) in = 1;
3960           } while ((j > 0) && (cond[j] != '['));
3961           if ((j == 0) && (cond[j] != '[')) {
3962             HUNSPELL_WARNING(stderr, "error: missing ] in condition:\n%s\n", line);
3963             return 0;
3964           }
3965           neg = (cond[j+1] == '^') ? 1 : 0;
3966           if ((!neg && !in) || (neg && in)) {
3967             HUNSPELL_WARNING(stderr, "warning: incompatible stripping characters and condition:\n%s\n", line);
3968             return 0;
3969           }
3970         }
3971       }
3972       if (j < 0) return 1;
3973     }
3974   }
3975   return 0;
3976 }