1 #include "license.hunspell"
2 #include "license.myspell"
11 #include "affixmgr.hxx"
12 #include "affentry.hxx"
13 #include "langnum.hxx"
17 AffixMgr::AffixMgr(const char * affpath
, HashMgr
** ptr
, int * md
, const char * key
)
19 // register hash manager and load affix data from aff file
38 // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
44 compoundflag
= FLAG_NULL
; // permits word in compound forms
45 compoundbegin
= FLAG_NULL
; // may be first word in compound forms
46 compoundmiddle
= FLAG_NULL
; // may be middle word in compound forms
47 compoundend
= FLAG_NULL
; // may be last word in compound forms
48 compoundroot
= FLAG_NULL
; // compound word signing flag
49 compoundpermitflag
= FLAG_NULL
; // compound permitting flag for suffixed word
50 compoundforbidflag
= FLAG_NULL
; // compound forbidden flag for suffixed word
51 checkcompounddup
= 0; // forbid double words in compounds
52 checkcompoundrep
= 0; // forbid bad compounds (may be non compound word with a REP substitution)
53 checkcompoundcase
= 0; // forbid upper and lowercase combinations at word bounds
54 checkcompoundtriple
= 0; // forbid compounds with triple letters
55 simplifiedtriple
= 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
56 forbiddenword
= FORBIDDENWORD
; // forbidden word signing flag
57 nosuggest
= FLAG_NULL
; // don't suggest words signed with NOSUGGEST flag
58 nongramsuggest
= FLAG_NULL
;
59 lang
= NULL
; // language
60 langnum
= 0; // language code (see http://l10n.openoffice.org/languages.html)
61 needaffix
= FLAG_NULL
; // forbidden root, allowed only with suffixes
62 cpdwordmax
= -1; // default: unlimited wordcount in compound words
63 cpdmin
= -1; // undefined
64 cpdmaxsyllable
= 0; // default: unlimited syllablecount in compound words
65 cpdvowels
=NULL
; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
66 cpdvowels_utf16
=NULL
; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
67 cpdvowels_utf16_len
=0; // length of cpdvowels_utf16
68 pfxappnd
=NULL
; // previous prefix for counting the syllables of prefix BUG
69 sfxappnd
=NULL
; // previous suffix for counting a special syllables BUG
70 cpdsyllablenum
=NULL
; // syllable count incrementing flag
71 checknum
=0; // checking numbers, and word with numbers
72 wordchars
=NULL
; // letters + spec. word characters
73 wordchars_utf16
=NULL
; // letters + spec. word characters
74 wordchars_utf16_len
=0; // letters + spec. word characters
75 ignorechars
=NULL
; // ignored characters (see the IGNORE keyword)
76 ignorechars_utf16
=NULL
; // ignored characters, UTF-16 variant (see the IGNORE keyword)
77 ignorechars_utf16_len
=0; // length of ignorechars_utf16
78 version
=NULL
; // affix and dictionary file version string
79 havecontclass
=0; // flags of possible continuing classes (double affix)
80 // LEMMA_PRESENT: do not put the root into the morphological output. The lemma is present
81 // in the morphological description in the dictionary file. It's often combined with PSEUDOROOT.
82 lemma_present
= FLAG_NULL
;
83 circumfix
= FLAG_NULL
;
84 onlyincompound
= FLAG_NULL
;
85 maxngramsugs
= -1; // undefined
86 maxdiff
= -1; // undefined
88 maxcpdsugs
= -1; // undefined
96 substandard
= FLAG_NULL
;
102 for (int i
=0; i
< SETSIZE
; i
++) {
109 for (int j
=0; j
< CONTSIZE
; j
++) {
113 if (parse_file(affpath
, key
)) {
114 HUNSPELL_WARNING(stderr
, "Failure loading aff file %s\n",affpath
);
117 if (cpdmin
== -1) cpdmin
= MINCPDLEN
;
122 AffixMgr::~AffixMgr()
124 // pass through linked prefix entries and clean up
125 for (int i
=0; i
< SETSIZE
;i
++) {
127 PfxEntry
* ptr
= pStart
[i
];
128 PfxEntry
* nptr
= NULL
;
130 nptr
= ptr
->getNext();
137 // pass through linked suffix entries and clean up
138 for (int j
=0; j
< SETSIZE
; j
++) {
140 SfxEntry
* ptr
= sStart
[j
];
141 SfxEntry
* nptr
= NULL
;
143 nptr
= ptr
->getNext();
151 if (keystring
) free(keystring
);
153 if (trystring
) free(trystring
);
155 if (encoding
) free(encoding
);
158 for (int j
=0; j
< nummap
; j
++) {
159 for (int k
=0; k
< maptable
[j
].len
; k
++) {
160 if (maptable
[j
].set
[k
]) free(maptable
[j
].set
[k
]);
162 free(maptable
[j
].set
);
163 maptable
[j
].set
= NULL
;
171 for (int j
=0; j
< numbreak
; j
++) {
172 if (breaktable
[j
]) free(breaktable
[j
]);
173 breaktable
[j
] = NULL
;
180 for (int j
=0; j
< numrep
; j
++) {
181 free(reptable
[j
].pattern
);
182 free(reptable
[j
].pattern2
);
187 if (iconvtable
) delete iconvtable
;
188 if (oconvtable
) delete oconvtable
;
189 if (phone
&& phone
->rules
) {
190 for (int j
=0; j
< phone
->num
+ 1; j
++) {
191 free(phone
->rules
[j
* 2]);
192 free(phone
->rules
[j
* 2 + 1]);
200 for (int j
=0; j
< numdefcpd
; j
++) {
201 free(defcpdtable
[j
].def
);
202 defcpdtable
[j
].def
= NULL
;
209 for (int j
=0; j
< numcheckcpd
; j
++) {
210 free(checkcpdtable
[j
].pattern
);
211 free(checkcpdtable
[j
].pattern2
);
212 free(checkcpdtable
[j
].pattern3
);
213 checkcpdtable
[j
].pattern
= NULL
;
214 checkcpdtable
[j
].pattern2
= NULL
;
215 checkcpdtable
[j
].pattern3
= NULL
;
218 checkcpdtable
= NULL
;
221 FREE_FLAG(compoundflag
);
222 FREE_FLAG(compoundbegin
);
223 FREE_FLAG(compoundmiddle
);
224 FREE_FLAG(compoundend
);
225 FREE_FLAG(compoundpermitflag
);
226 FREE_FLAG(compoundforbidflag
);
227 FREE_FLAG(compoundroot
);
228 FREE_FLAG(forbiddenword
);
229 FREE_FLAG(nosuggest
);
230 FREE_FLAG(nongramsuggest
);
231 FREE_FLAG(needaffix
);
232 FREE_FLAG(lemma_present
);
233 FREE_FLAG(circumfix
);
234 FREE_FLAG(onlyincompound
);
240 if (cpdvowels
) free(cpdvowels
);
241 if (cpdvowels_utf16
) free(cpdvowels_utf16
);
242 if (cpdsyllablenum
) free(cpdsyllablenum
);
244 if (lang
) free(lang
);
245 if (wordchars
) free(wordchars
);
246 if (wordchars_utf16
) free(wordchars_utf16
);
247 if (ignorechars
) free(ignorechars
);
248 if (ignorechars_utf16
) free(ignorechars_utf16
);
249 if (version
) free(version
);
251 #ifdef MOZILLA_CLIENT
257 // read in aff file and build up prefix and suffix entry objects
258 int AffixMgr::parse_file(const char * affpath
, const char * key
)
260 char * line
; // io buffers
261 char ft
; // affix type
263 // checking flag duplication
264 char dupflags
[CONTSIZE
];
265 char dupflags_ini
= 1;
267 // first line indicator for removing byte order mark
270 // open the affix file
271 FileMgr
* afflst
= new FileMgr(affpath
, key
);
273 HUNSPELL_WARNING(stderr
, "error: could not open affix description file %s\n",affpath
);
277 // step one is to parse the affix file building up the internal
278 // affix data structures
280 // read in each line ignoring any that do not
281 // start with a known line type indicator
282 while ((line
= afflst
->getline())) {
285 /* remove byte order mark */
288 // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
289 if (strncmp(line
,"\xEF\xBB\xBF",3) == 0) {
290 memmove(line
, line
+3, strlen(line
+3)+1);
294 /* parse in the keyboard string */
295 if (strncmp(line
,"KEY",3) == 0) {
296 if (parse_string(line
, &keystring
, afflst
->getlinenum())) {
302 /* parse in the try string */
303 if (strncmp(line
,"TRY",3) == 0) {
304 if (parse_string(line
, &trystring
, afflst
->getlinenum())) {
310 /* parse in the name of the character set used by the .dict and .aff */
311 if (strncmp(line
,"SET",3) == 0) {
312 if (parse_string(line
, &encoding
, afflst
->getlinenum())) {
316 if (strcmp(encoding
, "UTF-8") == 0) {
318 #ifndef OPENOFFICEORG
319 #ifndef MOZILLA_CLIENT
320 if (initialize_utf_tbl()) return 1;
326 /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
327 if (strncmp(line
,"COMPLEXPREFIXES",15) == 0)
330 /* parse in the flag used by the controlled compound words */
331 if (strncmp(line
,"COMPOUNDFLAG",12) == 0) {
332 if (parse_flag(line
, &compoundflag
, afflst
)) {
338 /* parse in the flag used by compound words */
339 if (strncmp(line
,"COMPOUNDBEGIN",13) == 0) {
340 if (complexprefixes
) {
341 if (parse_flag(line
, &compoundend
, afflst
)) {
346 if (parse_flag(line
, &compoundbegin
, afflst
)) {
353 /* parse in the flag used by compound words */
354 if (strncmp(line
,"COMPOUNDMIDDLE",14) == 0) {
355 if (parse_flag(line
, &compoundmiddle
, afflst
)) {
360 /* parse in the flag used by compound words */
361 if (strncmp(line
,"COMPOUNDEND",11) == 0) {
362 if (complexprefixes
) {
363 if (parse_flag(line
, &compoundbegin
, afflst
)) {
368 if (parse_flag(line
, &compoundend
, afflst
)) {
375 /* parse in the data used by compound_check() method */
376 if (strncmp(line
,"COMPOUNDWORDMAX",15) == 0) {
377 if (parse_num(line
, &cpdwordmax
, afflst
)) {
383 /* parse in the flag sign compounds in dictionary */
384 if (strncmp(line
,"COMPOUNDROOT",12) == 0) {
385 if (parse_flag(line
, &compoundroot
, afflst
)) {
391 /* parse in the flag used by compound_check() method */
392 if (strncmp(line
,"COMPOUNDPERMITFLAG",18) == 0) {
393 if (parse_flag(line
, &compoundpermitflag
, afflst
)) {
399 /* parse in the flag used by compound_check() method */
400 if (strncmp(line
,"COMPOUNDFORBIDFLAG",18) == 0) {
401 if (parse_flag(line
, &compoundforbidflag
, afflst
)) {
407 if (strncmp(line
,"CHECKCOMPOUNDDUP",16) == 0) {
408 checkcompounddup
= 1;
411 if (strncmp(line
,"CHECKCOMPOUNDREP",16) == 0) {
412 checkcompoundrep
= 1;
415 if (strncmp(line
,"CHECKCOMPOUNDTRIPLE",19) == 0) {
416 checkcompoundtriple
= 1;
419 if (strncmp(line
,"SIMPLIFIEDTRIPLE",16) == 0) {
420 simplifiedtriple
= 1;
423 if (strncmp(line
,"CHECKCOMPOUNDCASE",17) == 0) {
424 checkcompoundcase
= 1;
427 if (strncmp(line
,"NOSUGGEST",9) == 0) {
428 if (parse_flag(line
, &nosuggest
, afflst
)) {
434 if (strncmp(line
,"NONGRAMSUGGEST",14) == 0) {
435 if (parse_flag(line
, &nongramsuggest
, afflst
)) {
441 /* parse in the flag used by forbidden words */
442 if (strncmp(line
,"FORBIDDENWORD",13) == 0) {
443 if (parse_flag(line
, &forbiddenword
, afflst
)) {
449 /* parse in the flag used by LEMMA_PRESENT */
450 if (strncmp(line
,"LEMMA_PRESENT",13) == 0) {
451 if (parse_flag(line
, &lemma_present
, afflst
)) {
457 /* parse in the flag used by circumfixes */
458 if (strncmp(line
,"CIRCUMFIX",9) == 0) {
459 if (parse_flag(line
, &circumfix
, afflst
)) {
465 /* parse in the flag used by fogemorphemes */
466 if (strncmp(line
,"ONLYINCOMPOUND",14) == 0) {
467 if (parse_flag(line
, &onlyincompound
, afflst
)) {
473 /* parse in the flag used by `needaffixs' */
474 if (strncmp(line
,"PSEUDOROOT",10) == 0) {
475 if (parse_flag(line
, &needaffix
, afflst
)) {
481 /* parse in the flag used by `needaffixs' */
482 if (strncmp(line
,"NEEDAFFIX",9) == 0) {
483 if (parse_flag(line
, &needaffix
, afflst
)) {
489 /* parse in the minimal length for words in compounds */
490 if (strncmp(line
,"COMPOUNDMIN",11) == 0) {
491 if (parse_num(line
, &cpdmin
, afflst
)) {
495 if (cpdmin
< 1) cpdmin
= 1;
498 /* parse in the max. words and syllables in compounds */
499 if (strncmp(line
,"COMPOUNDSYLLABLE",16) == 0) {
500 if (parse_cpdsyllable(line
, afflst
)) {
506 /* parse in the flag used by compound_check() method */
507 if (strncmp(line
,"SYLLABLENUM",11) == 0) {
508 if (parse_string(line
, &cpdsyllablenum
, afflst
->getlinenum())) {
514 /* parse in the CHECKNUM keyword (checking numbers, and words with numbers) */
515 if (strncmp(line
,"CHECKNUM",8) == 0) {
519 /* parse in the extra word characters */
520 if (strncmp(line
,"WORDCHARS",9) == 0) {
521 if (parse_array(line
, &wordchars
, &wordchars_utf16
, &wordchars_utf16_len
, utf8
, afflst
->getlinenum())) {
527 /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
528 if (strncmp(line
,"IGNORE",6) == 0) {
529 if (parse_array(line
, &ignorechars
, &ignorechars_utf16
, &ignorechars_utf16_len
, utf8
, afflst
->getlinenum())) {
535 /* parse in the typical fault correcting table */
536 if (strncmp(line
,"REP",3) == 0) {
537 if (parse_reptable(line
, afflst
)) {
543 /* parse in the input conversion table */
544 if (strncmp(line
,"ICONV",5) == 0) {
545 if (parse_convtable(line
, afflst
, &iconvtable
, "ICONV")) {
551 /* parse in the output conversion table */
552 if (strncmp(line
,"OCONV",5) == 0) {
553 if (parse_convtable(line
, afflst
, &oconvtable
, "OCONV")) {
559 /* parse in the phonetic translation table */
560 if (strncmp(line
,"PHONE",5) == 0) {
561 if (parse_phonetable(line
, afflst
)) {
567 /* parse in the checkcompoundpattern table */
568 if (strncmp(line
,"CHECKCOMPOUNDPATTERN",20) == 0) {
569 if (parse_checkcpdtable(line
, afflst
)) {
575 /* parse in the defcompound table */
576 if (strncmp(line
,"COMPOUNDRULE",12) == 0) {
577 if (parse_defcpdtable(line
, afflst
)) {
583 /* parse in the related character map table */
584 if (strncmp(line
,"MAP",3) == 0) {
585 if (parse_maptable(line
, afflst
)) {
591 /* parse in the word breakpoints table */
592 if (strncmp(line
,"BREAK",5) == 0) {
593 if (parse_breaktable(line
, afflst
)) {
599 /* parse in the language for language specific codes */
600 if (strncmp(line
,"LANG",4) == 0) {
601 if (parse_string(line
, &lang
, afflst
->getlinenum())) {
605 langnum
= get_lang_num(lang
);
608 if (strncmp(line
,"VERSION",7) == 0) {
609 for(line
= line
+ 7; *line
== ' ' || *line
== '\t'; line
++);
610 version
= mystrdup(line
);
613 if (strncmp(line
,"MAXNGRAMSUGS",12) == 0) {
614 if (parse_num(line
, &maxngramsugs
, afflst
)) {
620 if (strncmp(line
,"ONLYMAXDIFF", 11) == 0)
623 if (strncmp(line
,"MAXDIFF",7) == 0) {
624 if (parse_num(line
, &maxdiff
, afflst
)) {
630 if (strncmp(line
,"MAXCPDSUGS",10) == 0) {
631 if (parse_num(line
, &maxcpdsugs
, afflst
)) {
637 if (strncmp(line
,"NOSPLITSUGS",11) == 0) {
641 if (strncmp(line
,"FULLSTRIP",9) == 0) {
645 if (strncmp(line
,"SUGSWITHDOTS",12) == 0) {
649 /* parse in the flag used by KEEPCASE */
650 if (strncmp(line
,"KEEPCASE",8) == 0) {
651 if (parse_flag(line
, &keepcase
, afflst
)) {
657 /* parse in the flag used by `forceucase' */
658 if (strncmp(line
,"FORCEUCASE",10) == 0) {
659 if (parse_flag(line
, &forceucase
, afflst
)) {
665 /* parse in the flag used by `warn' */
666 if (strncmp(line
,"WARN",4) == 0) {
667 if (parse_flag(line
, &warn
, afflst
)) {
673 if (strncmp(line
,"FORBIDWARN",10) == 0) {
677 /* parse in the flag used by the affix generator */
678 if (strncmp(line
,"SUBSTANDARD",11) == 0) {
679 if (parse_flag(line
, &substandard
, afflst
)) {
685 if (strncmp(line
,"CHECKSHARPS",11) == 0) {
689 /* parse this affix: P - prefix, S - suffix */
691 if (strncmp(line
,"PFX",3) == 0) ft
= complexprefixes
? 'S' : 'P';
692 if (strncmp(line
,"SFX",3) == 0) ft
= complexprefixes
? 'P' : 'S';
695 memset(dupflags
, 0, sizeof(dupflags
));
698 if (parse_affix(line
, ft
, afflst
, dupflags
)) {
700 process_pfx_tree_to_list();
701 process_sfx_tree_to_list();
709 // convert affix trees to sorted list
710 process_pfx_tree_to_list();
711 process_sfx_tree_to_list();
713 // now we can speed up performance greatly taking advantage of the
714 // relationship between the affixes and the idea of "subsets".
716 // View each prefix as a potential leading subset of another and view
717 // each suffix (reversed) as a potential trailing subset of another.
719 // To illustrate this relationship: if we know the prefix "ab" is found in the
720 // word to examine, only prefixes that "ab" is a leading subset of need be examined.
721 // Furthermore, if "ab" is not present then none of the prefixes that "ab"
722 // is a leading subset of need be examined.
723 // The same argument goes for suffix strings that are reversed.
725 // Then to top this off why not examine the first char of the word to quickly
726 // limit the set of prefixes to examine (i.e. the prefixes to examine must
727 // be leading supersets of the first character of the word, if they exist).
729 // To take advantage of this "subset" relationship, we need to add two links
730 // from entry. One to take next if the current prefix is found (call it nexteq)
731 // and one to take next if the current prefix is not found (call it nextne).
733 // Since we have built ordered lists, all that remains is to properly initialize
734 // the nextne and nexteq pointers that relate them
739 /* get encoding for CHECKCOMPOUNDCASE */
741 char * enc
= get_encoding();
742 csconv
= get_current_cs(enc
);
748 strcpy(expw
, wordchars
);
752 for (int i
= 0; i
<= 255; i
++) {
753 if ( (csconv
[i
].cupper
!= csconv
[i
].clower
) &&
754 (! strchr(expw
, (char) i
))) {
755 *(expw
+ strlen(expw
) + 1) = '\0';
756 *(expw
+ strlen(expw
)) = (char) i
;
760 wordchars
= mystrdup(expw
);
763 // default BREAK definition
764 if (numbreak
== -1) {
765 breaktable
= (char **) malloc(sizeof(char *) * 3);
766 if (!breaktable
) return 1;
767 breaktable
[0] = mystrdup("-");
768 breaktable
[1] = mystrdup("^-");
769 breaktable
[2] = mystrdup("-$");
770 if (breaktable
[0] && breaktable
[1] && breaktable
[2]) numbreak
= 3;
776 // we want to be able to quickly access prefix information
777 // both by prefix flag, and sorted by prefix string itself
778 // so we need to set up two indexes
780 int AffixMgr::build_pfxtree(PfxEntry
* pfxptr
)
784 PfxEntry
* ep
= pfxptr
;
786 // get the right starting points
787 const char * key
= ep
->getKey();
788 const unsigned char flg
= (unsigned char) (ep
->getFlag() & 0x00FF);
790 // first index by flag which must exist
796 // handle the special case of null affix string
797 if (strlen(key
) == 0) {
798 // always insert them at head of list at element 0
805 // now handle the normal case
809 unsigned char sp
= *((const unsigned char *)key
);
812 // handle the first insert
819 // otherwise use binary tree insertion so that a sorted
820 // list can easily be generated later
824 if (strcmp(ep
->getKey(), ptr
->getKey() ) <= 0) {
825 ptr
= ptr
->getNextEQ();
831 ptr
= ptr
->getNextNE();
841 // we want to be able to quickly access suffix information
842 // both by suffix flag, and sorted by the reverse of the
843 // suffix string itself; so we need to set up two indexes
844 int AffixMgr::build_sfxtree(SfxEntry
* sfxptr
)
848 SfxEntry
* ep
= sfxptr
;
850 /* get the right starting point */
851 const char * key
= ep
->getKey();
852 const unsigned char flg
= (unsigned char) (ep
->getFlag() & 0x00FF);
854 // first index by flag which must exist
859 // next index by affix string
861 // handle the special case of null affix string
862 if (strlen(key
) == 0) {
863 // always insert them at head of list at element 0
870 // now handle the normal case
874 unsigned char sp
= *((const unsigned char *)key
);
877 // handle the first insert
883 // otherwise use binary tree insertion so that a sorted
884 // list can easily be generated later
888 if (strcmp(ep
->getKey(), ptr
->getKey() ) <= 0) {
889 ptr
= ptr
->getNextEQ();
895 ptr
= ptr
->getNextNE();
905 // convert from binary tree to sorted list
906 int AffixMgr::process_pfx_tree_to_list()
908 for (int i
=1; i
< SETSIZE
; i
++) {
909 pStart
[i
] = process_pfx_in_order(pStart
[i
],NULL
);
915 PfxEntry
* AffixMgr::process_pfx_in_order(PfxEntry
* ptr
, PfxEntry
* nptr
)
918 nptr
= process_pfx_in_order(ptr
->getNextNE(), nptr
);
920 nptr
= process_pfx_in_order(ptr
->getNextEQ(), ptr
);
926 // convert from binary tree to sorted list
927 int AffixMgr:: process_sfx_tree_to_list()
929 for (int i
=1; i
< SETSIZE
; i
++) {
930 sStart
[i
] = process_sfx_in_order(sStart
[i
],NULL
);
935 SfxEntry
* AffixMgr::process_sfx_in_order(SfxEntry
* ptr
, SfxEntry
* nptr
)
938 nptr
= process_sfx_in_order(ptr
->getNextNE(), nptr
);
940 nptr
= process_sfx_in_order(ptr
->getNextEQ(), ptr
);
946 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
947 // using the idea of leading subsets this time
948 int AffixMgr::process_pfx_order()
952 // loop through each prefix list starting point
953 for (int i
=1; i
< SETSIZE
; i
++) {
957 // look through the remainder of the list
958 // and find next entry with affix that
959 // the current one is not a subset of
960 // mark that as destination for NextNE
961 // use next in list that you are a subset
964 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
966 PfxEntry
* nptr
= ptr
->getNext();
967 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
968 if (! isSubset( ptr
->getKey() , nptr
->getKey() )) break;
970 ptr
->setNextNE(nptr
);
971 ptr
->setNextEQ(NULL
);
972 if ((ptr
->getNext()) && isSubset(ptr
->getKey() , (ptr
->getNext())->getKey()))
973 ptr
->setNextEQ(ptr
->getNext());
976 // now clean up by adding smart search termination strings:
977 // if you are already a superset of the previous prefix
978 // but not a subset of the next, search can end here
979 // so set NextNE properly
982 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
983 PfxEntry
* nptr
= ptr
->getNext();
984 PfxEntry
* mptr
= NULL
;
985 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
986 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
989 if (mptr
) mptr
->setNextNE(NULL
);
995 // initialize the SfxEntry links NextEQ and NextNE to speed searching
996 // using the idea of leading subsets this time
997 int AffixMgr::process_sfx_order()
1001 // loop through each suffix list starting point
1002 for (int i
=1; i
< SETSIZE
; i
++) {
1006 // look through the remainder of the list
1007 // and find next entry with affix that
1008 // the current one is not a subset of
1009 // mark that as destination for NextNE
1010 // use next in list that you are a subset
1013 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
1014 SfxEntry
* nptr
= ptr
->getNext();
1015 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
1016 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
1018 ptr
->setNextNE(nptr
);
1019 ptr
->setNextEQ(NULL
);
1020 if ((ptr
->getNext()) && isSubset(ptr
->getKey(),(ptr
->getNext())->getKey()))
1021 ptr
->setNextEQ(ptr
->getNext());
1025 // now clean up by adding smart search termination strings:
1026 // if you are already a superset of the previous suffix
1027 // but not a subset of the next, search can end here
1028 // so set NextNE properly
1031 for (; ptr
!= NULL
; ptr
= ptr
->getNext()) {
1032 SfxEntry
* nptr
= ptr
->getNext();
1033 SfxEntry
* mptr
= NULL
;
1034 for (; nptr
!= NULL
; nptr
= nptr
->getNext()) {
1035 if (! isSubset(ptr
->getKey(),nptr
->getKey())) break;
1038 if (mptr
) mptr
->setNextNE(NULL
);
1044 // add flags to the result for dictionary debugging
1045 void AffixMgr::debugflag(char * result
, unsigned short flag
) {
1046 char * st
= encode_flag(flag
);
1047 mystrcat(result
, " ", MAXLNLEN
);
1048 mystrcat(result
, MORPH_FLAG
, MAXLNLEN
);
1050 mystrcat(result
, st
, MAXLNLEN
);
1055 // calculate the character length of the condition
1056 int AffixMgr::condlen(char * st
)
1064 } else if (*st
== ']') group
= false;
1065 else if (!group
&& (!utf8
||
1066 (!(*st
& 0x80) || ((*st
& 0xc0) == 0x80)))) l
++;
1071 int AffixMgr::encodeit(affentry
&entry
, char * cs
)
1073 if (strcmp(cs
,".") != 0) {
1074 entry
.numconds
= (char) condlen(cs
);
1075 strncpy(entry
.c
.conds
, cs
, MAXCONDLEN
);
1076 // long condition (end of conds padded by strncpy)
1077 if (entry
.c
.conds
[MAXCONDLEN
- 1] && cs
[MAXCONDLEN
]) {
1078 entry
.opts
+= aeLONGCOND
;
1079 entry
.c
.l
.conds2
= mystrdup(cs
+ MAXCONDLEN_1
);
1080 if (!entry
.c
.l
.conds2
) return 1;
1084 entry
.c
.conds
[0] = '\0';
1089 // return 1 if s1 is a leading subset of s2 (dots are for infixes)
1090 inline int AffixMgr::isSubset(const char * s1
, const char * s2
)
1092 while (((*s1
== *s2
) || (*s1
== '.')) && (*s1
!= '\0')) {
1096 return (*s1
== '\0');
1100 // check word for prefixes
1101 struct hentry
* AffixMgr::prefix_check(const char * word
, int len
, char in_compound
,
1102 const FLAG needflag
)
1104 struct hentry
* rv
= NULL
;
1110 // first handle the special case of 0 length prefixes
1111 PfxEntry
* pe
= pStart
[0];
1115 ((in_compound
!= IN_CPD_NOT
) || !(pe
->getCont() &&
1116 (TESTAFF(pe
->getCont(), onlyincompound
, pe
->getContLen())))) &&
1117 // permit prefixes in compounds
1118 ((in_compound
!= IN_CPD_END
) || (pe
->getCont() &&
1119 (TESTAFF(pe
->getCont(), compoundpermitflag
, pe
->getContLen()))))
1122 rv
= pe
->checkword(word
, len
, in_compound
, needflag
);
1124 pfx
=pe
; // BUG: pfx not stateless
1131 // now handle the general case
1132 unsigned char sp
= *((const unsigned char *)word
);
1133 PfxEntry
* pptr
= pStart
[sp
];
1136 if (isSubset(pptr
->getKey(),word
)) {
1139 ((in_compound
!= IN_CPD_NOT
) || !(pptr
->getCont() &&
1140 (TESTAFF(pptr
->getCont(), onlyincompound
, pptr
->getContLen())))) &&
1141 // permit prefixes in compounds
1142 ((in_compound
!= IN_CPD_END
) || (pptr
->getCont() &&
1143 (TESTAFF(pptr
->getCont(), compoundpermitflag
, pptr
->getContLen()))))
1146 rv
= pptr
->checkword(word
, len
, in_compound
, needflag
);
1148 pfx
=pptr
; // BUG: pfx not stateless
1152 pptr
= pptr
->getNextEQ();
1154 pptr
= pptr
->getNextNE();
1161 // check word for prefixes
1162 struct hentry
* AffixMgr::prefix_check_twosfx(const char * word
, int len
,
1163 char in_compound
, const FLAG needflag
)
1165 struct hentry
* rv
= NULL
;
1170 // first handle the special case of 0 length prefixes
1171 PfxEntry
* pe
= pStart
[0];
1174 rv
= pe
->check_twosfx(word
, len
, in_compound
, needflag
);
1179 // now handle the general case
1180 unsigned char sp
= *((const unsigned char *)word
);
1181 PfxEntry
* pptr
= pStart
[sp
];
1184 if (isSubset(pptr
->getKey(),word
)) {
1185 rv
= pptr
->check_twosfx(word
, len
, in_compound
, needflag
);
1190 pptr
= pptr
->getNextEQ();
1192 pptr
= pptr
->getNextNE();
1199 // check word for prefixes
1200 char * AffixMgr::prefix_check_morph(const char * word
, int len
, char in_compound
,
1201 const FLAG needflag
)
1205 char result
[MAXLNLEN
];
1211 // first handle the special case of 0 length prefixes
1212 PfxEntry
* pe
= pStart
[0];
1214 st
= pe
->check_morph(word
,len
,in_compound
, needflag
);
1216 mystrcat(result
, st
, MAXLNLEN
);
1219 // if (rv) return rv;
1223 // now handle the general case
1224 unsigned char sp
= *((const unsigned char *)word
);
1225 PfxEntry
* pptr
= pStart
[sp
];
1228 if (isSubset(pptr
->getKey(),word
)) {
1229 st
= pptr
->check_morph(word
,len
,in_compound
, needflag
);
1232 if ((in_compound
!= IN_CPD_NOT
) || !((pptr
->getCont() &&
1233 (TESTAFF(pptr
->getCont(), onlyincompound
, pptr
->getContLen()))))) {
1234 mystrcat(result
, st
, MAXLNLEN
);
1239 pptr
= pptr
->getNextEQ();
1241 pptr
= pptr
->getNextNE();
1245 if (*result
) return mystrdup(result
);
1250 // check word for prefixes
1251 char * AffixMgr::prefix_check_twosfx_morph(const char * word
, int len
,
1252 char in_compound
, const FLAG needflag
)
1256 char result
[MAXLNLEN
];
1262 // first handle the special case of 0 length prefixes
1263 PfxEntry
* pe
= pStart
[0];
1265 st
= pe
->check_twosfx_morph(word
,len
,in_compound
, needflag
);
1267 mystrcat(result
, st
, MAXLNLEN
);
1273 // now handle the general case
1274 unsigned char sp
= *((const unsigned char *)word
);
1275 PfxEntry
* pptr
= pStart
[sp
];
1278 if (isSubset(pptr
->getKey(),word
)) {
1279 st
= pptr
->check_twosfx_morph(word
, len
, in_compound
, needflag
);
1281 mystrcat(result
, st
, MAXLNLEN
);
1285 pptr
= pptr
->getNextEQ();
1287 pptr
= pptr
->getNextNE();
1291 if (*result
) return mystrdup(result
);
1295 // Is word a non compound with a REP substitution (see checkcompoundrep)?
1296 int AffixMgr::cpdrep_check(const char * word
, int wl
)
1298 char candidate
[MAXLNLEN
];
1302 if ((wl
< 2) || !numrep
) return 0;
1304 for (int i
=0; i
< numrep
; i
++ ) {
1306 lenr
= strlen(reptable
[i
].pattern2
);
1307 lenp
= strlen(reptable
[i
].pattern
);
1308 // search every occurrence of the pattern in the word
1309 while ((r
=strstr(r
, reptable
[i
].pattern
)) != NULL
) {
1310 strcpy(candidate
, word
);
1311 if (r
-word
+ lenr
+ strlen(r
+lenp
) >= MAXLNLEN
) break;
1312 strcpy(candidate
+(r
-word
),reptable
[i
].pattern2
);
1313 strcpy(candidate
+(r
-word
)+lenr
, r
+lenp
);
1314 if (candidate_check(candidate
,strlen(candidate
))) return 1;
1315 r
++; // search for the next letter
1321 // forbid compoundings when there are special patterns at word bound
1322 int AffixMgr::cpdpat_check(const char * word
, int pos
, hentry
* r1
, hentry
* r2
, const char affixed
)
1325 for (int i
= 0; i
< numcheckcpd
; i
++) {
1326 if (isSubset(checkcpdtable
[i
].pattern2
, word
+ pos
) &&
1327 (!r1
|| !checkcpdtable
[i
].cond
||
1328 (r1
->astr
&& TESTAFF(r1
->astr
, checkcpdtable
[i
].cond
, r1
->alen
))) &&
1329 (!r2
|| !checkcpdtable
[i
].cond2
||
1330 (r2
->astr
&& TESTAFF(r2
->astr
, checkcpdtable
[i
].cond2
, r2
->alen
))) &&
1331 // zero length pattern => only TESTAFF
1332 // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
1333 (!*(checkcpdtable
[i
].pattern
) || (
1334 (*(checkcpdtable
[i
].pattern
)=='0' && r1
->blen
<= pos
&& strncmp(word
+ pos
- r1
->blen
, r1
->word
, r1
->blen
) == 0) ||
1335 (*(checkcpdtable
[i
].pattern
)!='0' && (len
= strlen(checkcpdtable
[i
].pattern
)) &&
1336 strncmp(word
+ pos
- len
, checkcpdtable
[i
].pattern
, len
) == 0)))) {
1343 // forbid compounding with neighbouring upper and lower case characters at word bounds
1344 int AffixMgr::cpdcase_check(const char * word
, int pos
)
1349 u8_u16(&u
, 1, word
+ pos
);
1350 for (p
= word
+ pos
- 1; (*p
& 0xc0) == 0x80; p
--);
1352 unsigned short a
= (u
.h
<< 8) + u
.l
;
1353 unsigned short b
= (w
.h
<< 8) + w
.l
;
1354 if (((unicodetoupper(a
, langnum
) == a
) || (unicodetoupper(b
, langnum
) == b
)) &&
1355 (a
!= '-') && (b
!= '-')) return 1;
1357 unsigned char a
= *(word
+ pos
- 1);
1358 unsigned char b
= *(word
+ pos
);
1359 if ((csconv
[a
].ccase
|| csconv
[b
].ccase
) && (a
!= '-') && (b
!= '-')) return 1;
1364 // check compound patterns
1365 int AffixMgr::defcpd_check(hentry
*** words
, short wnum
, hentry
* rv
, hentry
** def
, char all
)
1367 signed short btpp
[MAXWORDLEN
]; // metacharacter (*, ?) positions for backtracking
1368 signed short btwp
[MAXWORDLEN
]; // word positions for metacharacters
1369 int btnum
[MAXWORDLEN
]; // number of matched characters in metacharacter positions
1384 (*words
)[wnum
] = rv
;
1386 // has the last word COMPOUNDRULE flag?
1387 if (rv
->alen
== 0) {
1388 (*words
)[wnum
] = NULL
;
1389 if (w
) *words
= NULL
;
1393 for (i
= 0; i
< numdefcpd
; i
++) {
1394 for (j
= 0; j
< defcpdtable
[i
].len
; j
++) {
1395 if (defcpdtable
[i
].def
[j
] != '*' && defcpdtable
[i
].def
[j
] != '?' &&
1396 TESTAFF(rv
->astr
, defcpdtable
[i
].def
[j
], rv
->alen
)) ok
= 1;
1400 (*words
)[wnum
] = NULL
;
1401 if (w
) *words
= NULL
;
1405 for (i
= 0; i
< numdefcpd
; i
++) {
1406 signed short pp
= 0; // pattern position
1407 signed short wp
= 0; // "words" position
1412 while ((pp
< defcpdtable
[i
].len
) && (wp
<= wnum
)) {
1413 if (((pp
+1) < defcpdtable
[i
].len
) &&
1414 ((defcpdtable
[i
].def
[pp
+1] == '*') || (defcpdtable
[i
].def
[pp
+1] == '?'))) {
1415 int wend
= (defcpdtable
[i
].def
[pp
+1] == '?') ? wp
: wnum
;
1420 while (wp
<= wend
) {
1421 if (!(*words
)[wp
]->alen
||
1422 !TESTAFF((*words
)[wp
]->astr
, defcpdtable
[i
].def
[pp
-2], (*words
)[wp
]->alen
)) {
1428 if (wp
<= wnum
) ok2
= 0;
1429 btnum
[bt
] = wp
- btwp
[bt
];
1430 if (btnum
[bt
] > 0) bt
++;
1434 if (!(*words
)[wp
] || !(*words
)[wp
]->alen
||
1435 !TESTAFF((*words
)[wp
]->astr
, defcpdtable
[i
].def
[pp
], (*words
)[wp
]->alen
)) {
1441 if ((defcpdtable
[i
].len
== pp
) && !(wp
> wnum
)) ok
= 0;
1446 while ((defcpdtable
[i
].len
> r
) && ((r
+1) < defcpdtable
[i
].len
) &&
1447 ((defcpdtable
[i
].def
[r
+1] == '*') || (defcpdtable
[i
].def
[r
+1] == '?'))) r
+=2;
1448 if (defcpdtable
[i
].len
<= r
) return 1;
1455 wp
= btwp
[bt
- 1] + (signed short) btnum
[bt
- 1];
1456 } while ((btnum
[bt
- 1] < 0) && --bt
);
1459 if (ok
&& ok2
&& (!all
|| (defcpdtable
[i
].len
<= pp
))) return 1;
1461 // check zero ending
1462 while (ok
&& ok2
&& (defcpdtable
[i
].len
> pp
) && ((pp
+1) < defcpdtable
[i
].len
) &&
1463 ((defcpdtable
[i
].def
[pp
+1] == '*') || (defcpdtable
[i
].def
[pp
+1] == '?'))) pp
+=2;
1464 if (ok
&& ok2
&& (defcpdtable
[i
].len
<= pp
)) return 1;
1466 (*words
)[wnum
] = NULL
;
1467 if (w
) *words
= NULL
;
// candidate_check: runs affix_check() on `word`/`len` and keeps the returned
// hash entry in `rv`. The prefix-only variant below is disabled (commented out).
// NOTE(review): how `rv` is converted into the int return value is not shown
// next to this call -- presumably a non-NULL entry maps to 1; confirm.
1471 inline int AffixMgr::candidate_check(const char * word
, int len
)
1473 struct hentry
* rv
=NULL
;
1478 // rv = prefix_check(word,len,1);
1479 // if (rv) return 1;
1481 rv
= affix_check(word
,len
);
1486 // calculate the number of syllables (counted as vowels) for compound checking
// Returns 0 at once when cpdmaxsyllable is 0 (no syllable limit configured).
// Byte path: counts the bytes word[0..wlen) that occur in cpdvowels.
// UTF-16 path (taken when cpdvowels_utf16 is set): converts the whole
// NUL-terminated `word` with u8_u16() -- wlen is not used on this path -- and
// counts the code units that flag_bsearch() finds in cpdvowels_utf16.
// NOTE(review): strchr() also matches the terminating NUL, and cpdvowels is
// NULL until it is configured; confirm callers can never reach either case.
1487 short AffixMgr::get_syllable(const char * word
, int wlen
)
1489 if (cpdmaxsyllable
==0) return 0;
1494 for (int i
=0; i
<wlen
; i
++) {
1495 if (strchr(cpdvowels
, word
[i
])) num
++;
1497 } else if (cpdvowels_utf16
) {
1498 w_char w
[MAXWORDUTF8LEN
];
1499 int i
= u8_u16(w
, MAXWORDUTF8LEN
, word
);
// walk the converted buffer from its last code unit down to the first
1500 for (; i
> 0; i
--) {
1501 if (flag_bsearch((unsigned short *) cpdvowels_utf16
,
1502 ((unsigned short *) w
)[i
- 1], cpdvowels_utf16_len
)) num
++;
// setcminmax: compute the range of split offsets tried by the compound
// checkers (they iterate i = *cmin while i < *cmax).
//   cmin, cmax - outputs: first candidate offset and exclusive upper bound
//   word, len  - the word being split and its length in bytes
// Character-wise form: *cmin is advanced over cpdmin characters and *cmax is
// moved back from len over cpdmin - 1 characters; bytes of the form 10xxxxxx
// (UTF-8 continuation bytes) are skipped so offsets never land inside a
// multi-byte character.
// Byte-wise form: *cmax = len - cpdmin + 1.
1508 void AffixMgr::setcminmax(int * cmin
, int * cmax
, const char * word
, int len
) {
// forward scan; stops at the terminating NUL because of the word[*cmin] test
1511 for (*cmin
= 0, i
= 0; (i
< cpdmin
) && word
[*cmin
]; i
++) {
1512 for ((*cmin
)++; (word
[*cmin
] & 0xc0) == 0x80; (*cmin
)++);
1514 for (*cmax
= len
, i
= 0; (i
< (cpdmin
- 1)) && *cmax
; i
++) {
// Stop at offset 0: without this bound a word that begins with stray
// continuation bytes would make the backward scan read before word[0].
1515 for ((*cmax
)--; (*cmax
> 0) && (word
[*cmax
] & 0xc0) == 0x80; (*cmax
)--);
1519 *cmax
= len
- cpdmin
+ 1;
1524 // check if compound word is correctly spelled
1525 // hu_mov_rule = spec. Hungarian rule (XXX)
// compound_check: decide whether `word` (length `len`) can be split into
// dictionary parts that are allowed to form a compound.
// For each split offset i in [cmin, cmax) (limits from setcminmax) the first
// part (st, length i) is tried with lookup() and then prefix_check() /
// suffix_check(); the remainder is tried with lookup(), affix_check() and
// finally a recursive compound_check() call.
//   wordnum     - number of parts accepted so far; the recursion passes
//                 wordnum+1 and only happens while wordnum < maxwordnum
//   numsyllable - running syllable count compared with cpdmaxsyllable
//   wnum, words - position and buffer handed to defcpd_check()
//   hu_mov_rule - enables the hard-wired Hungarian flag tests ('F', 'G', 'H',
//                 'x', '%') and disables several of the generic flag checks
//   is_sug      - when set, entries carrying the nosuggest flag are rejected
//   info        - if non-NULL, its SPELL_ORIGCAP bit is consulted together
//                 with forceucase
// The exits visible here return rv_first on success and NULL on rejection.
1526 struct hentry
* AffixMgr::compound_check(const char * word
, int len
,
1527 short wordnum
, short numsyllable
, short maxwordnum
, short wnum
, hentry
** words
= NULL
,
1528 char hu_mov_rule
= 0, char is_sug
= 0, int * info
= NULL
)
1531 short oldnumsyllable
, oldnumsyllable2
, oldwordnum
, oldwordnum2
;
1532 struct hentry
* rv
= NULL
;
1533 struct hentry
* rv_first
;
1534 struct hentry
* rwords
[MAXWORDLEN
]; // buffer for COMPOUND pattern checking
1535 char st
[MAXWORDUTF8LEN
+ 4];
1545 int checkedstriple
= 0;
1548 hentry
** oldwords
= words
;
1552 setcminmax(&cmin
, &cmax
, word
, len
);
1556 for (i
= cmin
; i
< cmax
; i
++) {
1557 // go to end of the UTF-8 character
1559 for (; (st
[i
] & 0xc0) == 0x80; i
++);
1560 if (i
>= cmax
) return NULL
;
1564 onlycpdrule
= (words
) ? 1 : 0;
1566 do { // onlycpdrule loop
1568 oldnumsyllable
= numsyllable
;
1569 oldwordnum
= wordnum
;
1573 do { // simplified checkcompoundpattern loop
1576 for (; scpd
<= numcheckcpd
&& (!checkcpdtable
[scpd
-1].pattern3
||
1577 strncmp(word
+ i
, checkcpdtable
[scpd
-1].pattern3
, strlen(checkcpdtable
[scpd
-1].pattern3
)) != 0); scpd
++);
1579 if (scpd
> numcheckcpd
) break; // break simplified checkcompoundpattern loop
// NOTE(review): the three strcpy() calls below write into st[MAXWORDUTF8LEN + 4]
// without a length check visible here -- confirm the pattern lengths are bounded.
1580 strcpy(st
+ i
, checkcpdtable
[scpd
-1].pattern
);
1582 i
+= strlen(checkcpdtable
[scpd
-1].pattern
);
1583 strcpy(st
+ i
, checkcpdtable
[scpd
-1].pattern2
);
1584 strcpy(st
+ i
+ strlen(checkcpdtable
[scpd
-1].pattern2
), word
+ soldi
+ strlen(checkcpdtable
[scpd
-1].pattern3
));
1587 len
+= strlen(checkcpdtable
[scpd
-1].pattern
) + strlen(checkcpdtable
[scpd
-1].pattern2
) - strlen(checkcpdtable
[scpd
-1].pattern3
);
1590 setcminmax(&cmin
, &cmax
, st
, len
);
1592 cmax
= len
- cpdmin
+ 1;
1604 rv
= lookup(st
); // perhaps without prefix
1606 // search homonym with compound flag
1607 while ((rv
) && !hu_mov_rule
&&
1608 ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
1609 !((compoundflag
&& !words
&& !onlycpdrule
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1610 (compoundbegin
&& !wordnum
&& !onlycpdrule
&&
1611 TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
1612 (compoundmiddle
&& wordnum
&& !words
&& !onlycpdrule
&&
1613 TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
)) ||
1614 (numdefcpd
&& onlycpdrule
&&
1615 ((!words
&& !wordnum
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0)) ||
1616 (words
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0))))) ||
1617 (scpd
!= 0 && checkcpdtable
[scpd
-1].cond
!= FLAG_NULL
&&
1618 !TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond
, rv
->alen
)))
1620 rv
= rv
->next_homonym
;
1623 if (rv
) affixed
= 0;
1626 if (onlycpdrule
) break;
1628 !(rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundflag
))) {
1629 if ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
,
1630 FLAG_NULL
, compoundflag
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) && !hu_mov_rule
&&
1632 ((compoundforbidflag
&& TESTAFF(sfx
->getCont(), compoundforbidflag
,
1633 sfx
->getContLen())) || (compoundend
&&
1634 TESTAFF(sfx
->getCont(), compoundend
,
1635 sfx
->getContLen())))) {
1641 (((wordnum
== 0) && compoundbegin
&&
1642 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundbegin
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
1643 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundbegin
)))) ||
1644 ((wordnum
> 0) && compoundmiddle
&&
1645 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundmiddle
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
1646 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundmiddle
)))))
1647 ) checked_prefix
= 1;
1648 // else check forbiddenwords and needaffix
1649 } else if (rv
->astr
&& (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1650 TESTAFF(rv
->astr
, needaffix
, rv
->alen
) ||
1651 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1652 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
))
1659 // check non_compound flag in suffix and prefix
1660 if ((rv
) && !hu_mov_rule
&&
1661 ((pfx
&& pfx
->getCont() &&
1662 TESTAFF(pfx
->getCont(), compoundforbidflag
,
1663 pfx
->getContLen())) ||
1664 (sfx
&& sfx
->getCont() &&
1665 TESTAFF(sfx
->getCont(), compoundforbidflag
,
1666 sfx
->getContLen())))) {
1670 // check compoundend flag in suffix and prefix
1671 if ((rv
) && !checked_prefix
&& compoundend
&& !hu_mov_rule
&&
1672 ((pfx
&& pfx
->getCont() &&
1673 TESTAFF(pfx
->getCont(), compoundend
,
1674 pfx
->getContLen())) ||
1675 (sfx
&& sfx
->getCont() &&
1676 TESTAFF(sfx
->getCont(), compoundend
,
1677 sfx
->getContLen())))) {
1681 // check compoundmiddle flag in suffix and prefix
1682 if ((rv
) && !checked_prefix
&& (wordnum
==0) && compoundmiddle
&& !hu_mov_rule
&&
1683 ((pfx
&& pfx
->getCont() &&
1684 TESTAFF(pfx
->getCont(), compoundmiddle
,
1685 pfx
->getContLen())) ||
1686 (sfx
&& sfx
->getCont() &&
1687 TESTAFF(sfx
->getCont(), compoundmiddle
,
1688 sfx
->getContLen())))) {
1692 // check forbiddenwords
1693 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1694 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1695 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) {
1699 // increment word number, if the second root has a compoundroot flag
1700 if ((rv
) && compoundroot
&&
1701 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1705 // first word is acceptable in compound words?
1707 ( checked_prefix
|| (words
&& words
[wnum
]) ||
1708 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1709 ((oldwordnum
== 0) && compoundbegin
&& TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
1710 ((oldwordnum
> 0) && compoundmiddle
&& TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
))// ||
1713 // LANG_hu section: spec. Hungarian rule
1714 || ((langnum
== LANG_hu
) && hu_mov_rule
&& (
1715 TESTAFF(rv
->astr
, 'F', rv
->alen
) || // XXX hardwired Hungarian dictionary codes
1716 TESTAFF(rv
->astr
, 'G', rv
->alen
) ||
1717 TESTAFF(rv
->astr
, 'H', rv
->alen
)
1720 // END of LANG_hu section
1723 // test CHECKCOMPOUNDPATTERN conditions
1724 scpd
== 0 || checkcpdtable
[scpd
-1].cond
== FLAG_NULL
||
1725 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond
, rv
->alen
)
1727 && ! (( checkcompoundtriple
&& scpd
== 0 && !words
&& // test triple letters
1728 (word
[i
-1]==word
[i
]) && (
1729 ((i
>1) && (word
[i
-1]==word
[i
-2])) ||
1730 ((word
[i
-1]==word
[i
+1])) // may be word[i+1] == '\0'
1734 checkcompoundcase
&& scpd
== 0 && !words
&& cpdcase_check(word
, i
)
1737 // LANG_hu section: spec. Hungarian rule
1738 || ((!rv
) && (langnum
== LANG_hu
) && hu_mov_rule
&& (rv
= affix_check(st
,i
)) &&
1739 (sfx
&& sfx
->getCont() && ( // XXX hardwired Hungarian dic. codes
1740 TESTAFF(sfx
->getCont(), (unsigned short) 'x', sfx
->getContLen()) ||
1741 TESTAFF(sfx
->getCont(), (unsigned short) '%', sfx
->getContLen())
1745 ) { // first word is ok condition
1747 // LANG_hu section: spec. Hungarian rule
1748 if (langnum
== LANG_hu
) {
1749 // calculate syllable number of the word
1750 numsyllable
+= get_syllable(st
, i
);
1751 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1752 if (pfx
&& (get_syllable(pfx
->getKey(),strlen(pfx
->getKey())) > 1)) wordnum
++;
1754 // END of LANG_hu section
1760 do { // striple loop
1762 // check simplifiedtriple
1763 if (simplifiedtriple
) {
1766 i
--; // check "fahrt" instead of "ahrt" in "Schiffahrt"
1767 } else if (i
> 2 && *(word
+i
- 1) == *(word
+ i
- 2)) striple
= 1;
1770 rv
= lookup((st
+i
)); // perhaps without prefix
1772 // search homonym with compound flag
1773 while ((rv
) && ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
1774 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1775 (compoundend
&& !words
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
)) ||
1776 (numdefcpd
&& words
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
,1))) ||
1777 (scpd
!= 0 && checkcpdtable
[scpd
-1].cond2
!= FLAG_NULL
&&
1778 !TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))
1780 rv
= rv
->next_homonym
;
// drop an entry carrying forceucase unless info reports SPELL_ORIGCAP
// (rv is tested twice in this condition; the second test is redundant)
1784 if (rv
&& forceucase
&& (rv
) &&
1785 (TESTAFF(rv
->astr
, forceucase
, rv
->alen
)) && !(info
&& *info
& SPELL_ORIGCAP
)) rv
= NULL
;
1787 if (rv
&& words
&& words
[wnum
+ 1]) return rv_first
;
1789 oldnumsyllable2
= numsyllable
;
1790 oldwordnum2
= wordnum
;
1793 // LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
1794 if ((rv
) && (langnum
== LANG_hu
) && (TESTAFF(rv
->astr
, 'I', rv
->alen
)) && !(TESTAFF(rv
->astr
, 'J', rv
->alen
))) {
1797 // END of LANG_hu section
1799 // increment word number, if the second root has a compoundroot flag
1800 if ((rv
) && (compoundroot
) &&
1801 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1805 // check forbiddenwords
1806 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1807 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1808 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) return NULL
;
1810 // second word is acceptable, as a root?
1811 // hungarian conventions: compounding is acceptable,
1812 // when compound forms consist of 2 words, or if more,
1813 // then the syllable number of root words must be 6, or lesser.
1816 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
1817 (compoundend
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
))
1820 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
1821 ((cpdmaxsyllable
!=0) &&
1822 (numsyllable
+ get_syllable(HENTRY_WORD(rv
), rv
->clen
)<=cpdmaxsyllable
))
1825 // test CHECKCOMPOUNDPATTERN
1826 !numcheckcpd
|| scpd
!= 0 || !cpdpat_check(word
, i
, rv_first
, rv
, 0)
1829 (!checkcompounddup
|| (rv
!= rv_first
))
1831 // test CHECKCOMPOUNDPATTERN conditions
1832 && (scpd
== 0 || checkcpdtable
[scpd
-1].cond2
== FLAG_NULL
||
1833 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))
1836 // forbid compound word, if it is a non compound word with typical fault
1837 if (checkcompoundrep
&& cpdrep_check(word
,len
)) return NULL
;
1841 numsyllable
= oldnumsyllable2
;
1842 wordnum
= oldwordnum2
;
1844 // perhaps second word has prefix or/and suffix
1846 sfxflag
= FLAG_NULL
;
1847 rv
= (compoundflag
&& !onlycpdrule
) ? affix_check((word
+i
),strlen(word
+i
), compoundflag
, IN_CPD_END
) : NULL
;
1848 if (!rv
&& compoundend
&& !onlycpdrule
) {
1851 rv
= affix_check((word
+i
),strlen(word
+i
), compoundend
, IN_CPD_END
);
1854 if (!rv
&& numdefcpd
&& words
) {
1855 rv
= affix_check((word
+i
),strlen(word
+i
), 0, IN_CPD_END
);
1856 if (rv
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
, 1)) return rv_first
;
1860 // test CHECKCOMPOUNDPATTERN conditions (allowed forms)
1861 if (rv
&& !(scpd
== 0 || checkcpdtable
[scpd
-1].cond2
== FLAG_NULL
||
1862 TESTAFF(rv
->astr
, checkcpdtable
[scpd
-1].cond2
, rv
->alen
))) rv
= NULL
;
1864 // test CHECKCOMPOUNDPATTERN conditions (forbidden compounds)
1865 if (rv
&& numcheckcpd
&& scpd
== 0 && cpdpat_check(word
, i
, rv_first
, rv
, affixed
)) rv
= NULL
;
1867 // check non_compound flag in suffix and prefix
1869 ((pfx
&& pfx
->getCont() &&
1870 TESTAFF(pfx
->getCont(), compoundforbidflag
,
1871 pfx
->getContLen())) ||
1872 (sfx
&& sfx
->getCont() &&
1873 TESTAFF(sfx
->getCont(), compoundforbidflag
,
1874 sfx
->getContLen())))) {
1879 if (rv
&& forceucase
&& (rv
) &&
1880 (TESTAFF(rv
->astr
, forceucase
, rv
->alen
)) && !(info
&& *info
& SPELL_ORIGCAP
)) rv
= NULL
;
1882 // check forbiddenwords
1883 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
1884 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1885 (is_sug
&& nosuggest
&& TESTAFF(rv
->astr
, nosuggest
, rv
->alen
)))) return NULL
;
1887 // pfxappnd = prefix of word+i, or NULL
1888 // calculate syllable number of prefix.
1889 // hungarian convention: when syllable number of prefix is more,
1890 // than 1, the prefix+word counts as two words.
1892 if (langnum
== LANG_hu
) {
1893 // calculate syllable number of the word
1894 numsyllable
+= get_syllable(word
+ i
, strlen(word
+ i
));
1896 // - affix syllable num.
1897 // XXX only second suffix (inflections, not derivations)
1899 char * tmp
= myrevstrdup(sfxappnd
);
1900 numsyllable
-= get_syllable(tmp
, strlen(tmp
));
1904 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
1905 if (pfx
&& (get_syllable(pfx
->getKey(),strlen(pfx
->getKey())) > 1)) wordnum
++;
1907 // increment syllable num, if last word has a SYLLABLENUM flag
1908 // and the suffix is beginning `s'
1910 if (cpdsyllablenum
) {
1912 case 'c': { numsyllable
+=2; break; }
1913 case 'J': { numsyllable
+= 1; break; }
1914 case 'I': { if (rv
&& TESTAFF(rv
->astr
, 'J', rv
->alen
)) numsyllable
+= 1; break; }
1919 // increment word number, if the second word has a compoundroot flag
1920 if ((rv
) && (compoundroot
) &&
1921 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
1925 // second word is acceptable, as a word with prefix or/and suffix?
1926 // hungarian conventions: compounding is acceptable,
1927 // when compound forms consist 2 word, otherwise
1928 // the syllable number of root words is 6, or lesser.
1931 ((cpdwordmax
== -1) || (wordnum
+ 1 < cpdwordmax
)) ||
1932 ((cpdmaxsyllable
!= 0) &&
1933 (numsyllable
<= cpdmaxsyllable
))
1936 (!checkcompounddup
|| (rv
!= rv_first
))
1938 // forbid compound word, if it is a non compound word with typical fault
1939 if (checkcompoundrep
&& cpdrep_check(word
, len
)) return NULL
;
1943 numsyllable
= oldnumsyllable2
;
1944 wordnum
= oldwordnum2
;
1946 // perhaps second word is a compound word (recursive call)
1947 if (wordnum
< maxwordnum
) {
1948 rv
= compound_check((st
+i
),strlen(st
+i
), wordnum
+1,
1949 numsyllable
, maxwordnum
, wnum
+ 1, words
, 0, is_sug
, info
);
1951 if (rv
&& numcheckcpd
&& ((scpd
== 0 && cpdpat_check(word
, i
, rv_first
, rv
, affixed
)) ||
1952 (scpd
!= 0 && !cpdpat_check(word
, i
, rv_first
, rv
, affixed
)))) rv
= NULL
;
1957 // forbid compound word, if it is a non compound word with typical fault
1958 if (checkcompoundrep
|| forbiddenword
) {
1959 struct hentry
* rv2
= NULL
;
1961 if (checkcompoundrep
&& cpdrep_check(word
, len
)) return NULL
;
1964 if (strncmp(rv
->word
, word
+ i
, rv
->blen
) == 0) {
// temporarily cut st after the matched second part; the saved byte r is
// written back on the paths shown below
1965 char r
= *(st
+ i
+ rv
->blen
);
1966 *(st
+ i
+ rv
->blen
) = '\0';
1968 if (checkcompoundrep
&& cpdrep_check(st
, i
+ rv
->blen
)) {
1969 *(st
+ i
+ rv
->blen
) = r
;
1973 if (forbiddenword
) {
1975 if (!rv2
) rv2
= affix_check(word
, len
);
1976 if (rv2
&& rv2
->astr
&& TESTAFF(rv2
->astr
, forbiddenword
, rv2
->alen
) &&
1977 (strncmp(rv2
->word
, st
, i
+ rv
->blen
) == 0)) {
1981 *(st
+ i
+ rv
->blen
) = r
;
1986 } while (striple
&& !checkedstriple
); // end of striple loop
1988 if (checkedstriple
) {
1994 } // first word is ok condition
2006 } while (!onlycpdrule
&& simplifiedcpd
&& scpd
<= numcheckcpd
); // end of simplifiedcpd loop
2009 wordnum
= oldwordnum
;
2010 numsyllable
= oldnumsyllable
;
2014 strcpy(st
, word
); // XXX add more optim.
// the assignment in the condition is intentional: when the other tests hold
// it flips onlycpdrule to 1 and the body runs one more time
2018 } while (numdefcpd
&& oldwordnum
== 0 && !onlycpdrule
&& (onlycpdrule
= 1)); // end of onlycpd loop
2025 // check if compound word is correctly spelled
2026 // hu_mov_rule = spec. Hungarian rule (XXX)
// compound_check_morph: morphological-analysis counterpart of compound_check.
// It walks the split offsets cmin..cmax (from setcminmax) and, for every
// accepted split, appends a description to *result: presult (the analysis of
// the first part, seeded from partresult) followed by MORPH_PART plus the
// second part and either its MORPH_STEM / HENTRY_DATA2 fields or the
// affix_check_morph() output. The recursive call passes presult on as the new
// partresult.
//   wordnum, numsyllable, maxwordnum, wnum, words, hu_mov_rule
//              - used in the same flag, COMPOUND-pattern and syllable tests
//                as in compound_check
//   result     - pointer to the output buffer; extended with
//                mystrcat(..., MAXLNLEN) and sprintf()
//   partresult - analysis text collected by the caller for earlier parts
// NOTE(review): *result is dereferenced unconditionally although the declared
// default is NULL; callers presumably always pass a buffer -- confirm.
// NOTE(review): the sprintf() calls into presult / *result are not
// length-checked.
2027 int AffixMgr::compound_check_morph(const char * word
, int len
,
2028 short wordnum
, short numsyllable
, short maxwordnum
, short wnum
, hentry
** words
,
2029 char hu_mov_rule
= 0, char ** result
= NULL
, char * partresult
= NULL
)
2032 short oldnumsyllable
, oldnumsyllable2
, oldwordnum
, oldwordnum2
;
2035 struct hentry
* rv
= NULL
;
2036 struct hentry
* rv_first
;
2037 struct hentry
* rwords
[MAXWORDLEN
]; // buffer for COMPOUND pattern checking
2038 char st
[MAXWORDUTF8LEN
+ 4];
2042 char presult
[MAXLNLEN
];
2049 hentry
** oldwords
= words
;
2051 setcminmax(&cmin
, &cmax
, word
, len
);
2055 for (i
= cmin
; i
< cmax
; i
++) {
2056 oldnumsyllable
= numsyllable
;
2057 oldwordnum
= wordnum
;
2060 // go to end of the UTF-8 character
2062 for (; (st
[i
] & 0xc0) == 0x80; i
++);
2063 if (i
>= cmax
) return 0;
2067 onlycpdrule
= (words
) ? 1 : 0;
2069 do { // onlycpdrule loop
2071 oldnumsyllable
= numsyllable
;
2072 oldwordnum
= wordnum
;
2084 if (partresult
) mystrcat(presult
, partresult
, MAXLNLEN
);
2086 rv
= lookup(st
); // perhaps without prefix
2088 // search homonym with compound flag
2089 while ((rv
) && !hu_mov_rule
&&
2090 ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
2091 !((compoundflag
&& !words
&& !onlycpdrule
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2092 (compoundbegin
&& !wordnum
&& !onlycpdrule
&&
2093 TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
2094 (compoundmiddle
&& wordnum
&& !words
&& !onlycpdrule
&&
2095 TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
)) ||
2096 (numdefcpd
&& onlycpdrule
&&
2097 ((!words
&& !wordnum
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0)) ||
2098 (words
&& defcpd_check(&words
, wnum
, rv
, (hentry
**) &rwords
, 0))))
2100 rv
= rv
->next_homonym
;
2103 if (rv
) affixed
= 0;
2106 sprintf(presult
+ strlen(presult
), "%c%s%s", MSEP_FLD
, MORPH_PART
, st
);
2107 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
2108 sprintf(presult
+ strlen(presult
), "%c%s%s", MSEP_FLD
, MORPH_STEM
, st
);
2110 // store the pointer of the hash entry
2111 // sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv);
2112 if (HENTRY_DATA(rv
)) {
2113 sprintf(presult
+ strlen(presult
), "%c%s", MSEP_FLD
, HENTRY_DATA2(rv
));
2118 if (onlycpdrule
) break;
2120 !(rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundflag
))) {
2121 if ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
,
2122 FLAG_NULL
, compoundflag
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) && !hu_mov_rule
&&
2124 ((compoundforbidflag
&& TESTAFF(sfx
->getCont(), compoundforbidflag
,
2125 sfx
->getContLen())) || (compoundend
&&
2126 TESTAFF(sfx
->getCont(), compoundend
,
2127 sfx
->getContLen())))) {
2133 (((wordnum
== 0) && compoundbegin
&&
2134 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundbegin
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
2135 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundbegin
)))) ||
2136 ((wordnum
> 0) && compoundmiddle
&&
2137 ((rv
= suffix_check(st
, i
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, compoundmiddle
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
)) ||
2138 (rv
= prefix_check(st
, i
, hu_mov_rule
? IN_CPD_OTHER
: IN_CPD_BEGIN
, compoundmiddle
)))))
2140 // char * p = prefix_check_morph(st, i, 0, compound);
2142 if (compoundflag
) p
= affix_check_morph(st
, i
, compoundflag
);
2143 if (!p
|| (*p
== '\0')) {
2146 if ((wordnum
== 0) && compoundbegin
) {
2147 p
= affix_check_morph(st
, i
, compoundbegin
);
2148 } else if ((wordnum
> 0) && compoundmiddle
) {
2149 p
= affix_check_morph(st
, i
, compoundmiddle
);
2152 if (p
&& (*p
!= '\0')) {
2153 sprintf(presult
+ strlen(presult
), "%c%s%s%s", MSEP_FLD
,
2154 MORPH_PART
, st
, line_uniq_app(&p
, MSEP_REC
));
2159 // else check forbiddenwords
2160 } else if (rv
->astr
&& (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
2161 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
2162 TESTAFF(rv
->astr
, needaffix
, rv
->alen
))) {
2167 // check non_compound flag in suffix and prefix
2168 if ((rv
) && !hu_mov_rule
&&
2169 ((pfx
&& pfx
->getCont() &&
2170 TESTAFF(pfx
->getCont(), compoundforbidflag
,
2171 pfx
->getContLen())) ||
2172 (sfx
&& sfx
->getCont() &&
2173 TESTAFF(sfx
->getCont(), compoundforbidflag
,
2174 sfx
->getContLen())))) {
2178 // check compoundend flag in suffix and prefix
2179 if ((rv
) && !checked_prefix
&& compoundend
&& !hu_mov_rule
&&
2180 ((pfx
&& pfx
->getCont() &&
2181 TESTAFF(pfx
->getCont(), compoundend
,
2182 pfx
->getContLen())) ||
2183 (sfx
&& sfx
->getCont() &&
2184 TESTAFF(sfx
->getCont(), compoundend
,
2185 sfx
->getContLen())))) {
2189 // check compoundmiddle flag in suffix and prefix
2190 if ((rv
) && !checked_prefix
&& (wordnum
==0) && compoundmiddle
&& !hu_mov_rule
&&
2191 ((pfx
&& pfx
->getCont() &&
2192 TESTAFF(pfx
->getCont(), compoundmiddle
,
2193 pfx
->getContLen())) ||
2194 (sfx
&& sfx
->getCont() &&
2195 TESTAFF(sfx
->getCont(), compoundmiddle
,
2196 sfx
->getContLen())))) {
2200 // check forbiddenwords
2201 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
)
2202 || TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
))) continue;
2204 // increment word number, if the second root has a compoundroot flag
2205 if ((rv
) && (compoundroot
) &&
2206 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2210 // first word is acceptable in compound words?
2212 ( checked_prefix
|| (words
&& words
[wnum
]) ||
2213 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2214 ((oldwordnum
== 0) && compoundbegin
&& TESTAFF(rv
->astr
, compoundbegin
, rv
->alen
)) ||
2215 ((oldwordnum
> 0) && compoundmiddle
&& TESTAFF(rv
->astr
, compoundmiddle
, rv
->alen
))
2216 // LANG_hu section: spec. Hungarian rule
2217 || ((langnum
== LANG_hu
) && // hu_mov_rule
2219 TESTAFF(rv
->astr
, 'F', rv
->alen
) ||
2220 TESTAFF(rv
->astr
, 'G', rv
->alen
) ||
2221 TESTAFF(rv
->astr
, 'H', rv
->alen
)
2224 // END of LANG_hu section
2226 && ! (( checkcompoundtriple
&& !words
&& // test triple letters
2227 (word
[i
-1]==word
[i
]) && (
2228 ((i
>1) && (word
[i
-1]==word
[i
-2])) ||
2229 ((word
[i
-1]==word
[i
+1])) // may be word[i+1] == '\0'
2233 // test CHECKCOMPOUNDPATTERN
2234 numcheckcpd
&& !words
&& cpdpat_check(word
, i
, rv
, NULL
, affixed
)
2237 checkcompoundcase
&& !words
&& cpdcase_check(word
, i
)
2240 // LANG_hu section: spec. Hungarian rule
2241 || ((!rv
) && (langnum
== LANG_hu
) && hu_mov_rule
&& (rv
= affix_check(st
,i
)) &&
2242 (sfx
&& sfx
->getCont() && (
2243 TESTAFF(sfx
->getCont(), (unsigned short) 'x', sfx
->getContLen()) ||
2244 TESTAFF(sfx
->getCont(), (unsigned short) '%', sfx
->getContLen())
2248 // END of LANG_hu section
2251 // LANG_hu section: spec. Hungarian rule
2252 if (langnum
== LANG_hu
) {
2253 // calculate syllable number of the word
2254 numsyllable
+= get_syllable(st
, i
);
2256 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2257 if (pfx
&& (get_syllable(pfx
->getKey(),strlen(pfx
->getKey())) > 1)) wordnum
++;
2259 // END of LANG_hu section
2263 rv
= lookup((word
+i
)); // perhaps without prefix
2265 // search homonym with compound flag
2266 while ((rv
) && ((needaffix
&& TESTAFF(rv
->astr
, needaffix
, rv
->alen
)) ||
2267 !((compoundflag
&& !words
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2268 (compoundend
&& !words
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
)) ||
2269 (numdefcpd
&& words
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
,1))))) {
2270 rv
= rv
->next_homonym
;
2273 if (rv
&& words
&& words
[wnum
+ 1]) {
2274 mystrcat(*result
, presult
, MAXLNLEN
);
2275 mystrcat(*result
, " ", MAXLNLEN
);
2276 mystrcat(*result
, MORPH_PART
, MAXLNLEN
);
2277 mystrcat(*result
, word
+i
, MAXLNLEN
);
2278 if (complexprefixes
&& HENTRY_DATA(rv
)) mystrcat(*result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2279 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
2280 mystrcat(*result
, " ", MAXLNLEN
);
2281 mystrcat(*result
, MORPH_STEM
, MAXLNLEN
);
2282 mystrcat(*result
, HENTRY_WORD(rv
), MAXLNLEN
);
2284 // store the pointer of the hash entry
2285 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2286 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2287 mystrcat(*result
, " ", MAXLNLEN
);
2288 mystrcat(*result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2290 mystrcat(*result
, "\n", MAXLNLEN
);
2295 oldnumsyllable2
= numsyllable
;
2296 oldwordnum2
= wordnum
;
2298 // LANG_hu section: spec. Hungarian rule
2299 if ((rv
) && (langnum
== LANG_hu
) && (TESTAFF(rv
->astr
, 'I', rv
->alen
)) && !(TESTAFF(rv
->astr
, 'J', rv
->alen
))) {
2302 // END of LANG_hu section
2303 // increment word number, if the second root has a compoundroot flag
2304 if ((rv
) && (compoundroot
) &&
2305 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2309 // check forbiddenwords
2310 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
, forbiddenword
, rv
->alen
) ||
2311 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
))) {
2316 // second word is acceptable, as a root?
2317 // hungarian conventions: compounding is acceptable,
2318 // when compound forms consist of 2 words, or if more,
2319 // then the syllable number of root words must be 6, or lesser.
2321 (compoundflag
&& TESTAFF(rv
->astr
, compoundflag
, rv
->alen
)) ||
2322 (compoundend
&& TESTAFF(rv
->astr
, compoundend
, rv
->alen
))
2325 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
2326 ((cpdmaxsyllable
!=0) &&
2327 (numsyllable
+get_syllable(HENTRY_WORD(rv
),rv
->blen
)<=cpdmaxsyllable
))
2330 (!checkcompounddup
|| (rv
!= rv_first
))
2334 // bad compound word
2335 mystrcat(*result
, presult
, MAXLNLEN
);
2336 mystrcat(*result
, " ", MAXLNLEN
);
2337 mystrcat(*result
, MORPH_PART
, MAXLNLEN
);
2338 mystrcat(*result
, word
+i
, MAXLNLEN
);
2340 if (HENTRY_DATA(rv
)) {
2341 if (complexprefixes
) mystrcat(*result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2342 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2343 mystrcat(*result
, " ", MAXLNLEN
);
2344 mystrcat(*result
, MORPH_STEM
, MAXLNLEN
);
2345 mystrcat(*result
, HENTRY_WORD(rv
), MAXLNLEN
);
2347 // store the pointer of the hash entry
2348 // sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv);
2349 if (!complexprefixes
) {
2350 mystrcat(*result
, " ", MAXLNLEN
);
2351 mystrcat(*result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2354 mystrcat(*result
, "\n", MAXLNLEN
);
2358 numsyllable
= oldnumsyllable2
;
2359 wordnum
= oldwordnum2
;
2361 // perhaps second word has prefix or/and suffix
2363 sfxflag
= FLAG_NULL
;
2365 if (compoundflag
&& !onlycpdrule
) rv
= affix_check((word
+i
),strlen(word
+i
), compoundflag
); else rv
= NULL
;
2367 if (!rv
&& compoundend
&& !onlycpdrule
) {
2370 rv
= affix_check((word
+i
),strlen(word
+i
), compoundend
);
2373 if (!rv
&& numdefcpd
&& words
) {
2374 rv
= affix_check((word
+i
),strlen(word
+i
), 0, IN_CPD_END
);
2375 if (rv
&& words
&& defcpd_check(&words
, wnum
+ 1, rv
, NULL
, 1)) {
2377 if (compoundflag
) m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundflag
);
2378 if ((!m
|| *m
== '\0') && compoundend
) {
2380 m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundend
);
2382 mystrcat(*result
, presult
, MAXLNLEN
);
// m can still be NULL here when neither affix_check_morph() call produced an
// analysis, so it must be tested before it is dereferenced or handed to
// line_uniq_app(); this is the same test the later affixed-second-word
// branch of this function uses.
2383 if (m
&& (*m
!= '\0')) {
2384 sprintf(*result
+ strlen(*result
), "%c%s%s%s", MSEP_FLD
,
2385 MORPH_PART
, word
+ i
, line_uniq_app(&m
, MSEP_REC
));
2388 mystrcat(*result
, "\n", MAXLNLEN
);
2393 // check non_compound flag in suffix and prefix
2395 ((pfx
&& pfx
->getCont() &&
2396 TESTAFF(pfx
->getCont(), compoundforbidflag
,
2397 pfx
->getContLen())) ||
2398 (sfx
&& sfx
->getCont() &&
2399 TESTAFF(sfx
->getCont(), compoundforbidflag
,
2400 sfx
->getContLen())))) {
2404 // check forbiddenwords
2405 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
,forbiddenword
,rv
->alen
) ||
2406 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
))
2407 && (! TESTAFF(rv
->astr
, needaffix
, rv
->alen
))) {
2412 if (langnum
== LANG_hu
) {
2413 // calculate syllable number of the word
2414 numsyllable
+= get_syllable(word
+ i
, strlen(word
+ i
));
2416 // - affix syllable num.
2417 // XXX only second suffix (inflections, not derivations)
2419 char * tmp
= myrevstrdup(sfxappnd
);
2420 numsyllable
-= get_syllable(tmp
, strlen(tmp
));
2424 // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
2425 if (pfx
&& (get_syllable(pfx
->getKey(),strlen(pfx
->getKey())) > 1)) wordnum
++;
2427 // increment syllable num, if last word has a SYLLABLENUM flag
2428 // and the suffix is beginning `s'
2430 if (cpdsyllablenum
) {
2432 case 'c': { numsyllable
+=2; break; }
2433 case 'J': { numsyllable
+= 1; break; }
2434 case 'I': { if (rv
&& TESTAFF(rv
->astr
, 'J', rv
->alen
)) numsyllable
+= 1; break; }
2439 // increment word number, if the second word has a compoundroot flag
2440 if ((rv
) && (compoundroot
) &&
2441 (TESTAFF(rv
->astr
, compoundroot
, rv
->alen
))) {
2444 // second word is acceptable, as a word with prefix or/and suffix?
2445 // hungarian conventions: compounding is acceptable,
2446 // when compound forms consist 2 word, otherwise
2447 // the syllable number of root words is 6, or lesser.
2450 ((cpdwordmax
==-1) || (wordnum
+1<cpdwordmax
)) ||
2451 ((cpdmaxsyllable
!=0) &&
2452 (numsyllable
<= cpdmaxsyllable
))
2455 (!checkcompounddup
|| (rv
!= rv_first
))
2458 if (compoundflag
) m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundflag
);
2459 if ((!m
|| *m
== '\0') && compoundend
) {
2461 m
= affix_check_morph((word
+i
),strlen(word
+i
), compoundend
);
2463 mystrcat(*result
, presult
, MAXLNLEN
);
2464 if (m
&& (*m
!= '\0')) {
2465 sprintf(*result
+ strlen(*result
), "%c%s%s%s", MSEP_FLD
,
2466 MORPH_PART
, word
+ i
, line_uniq_app(&m
, MSEP_REC
));
2469 sprintf(*result
+ strlen(*result
), "%c", MSEP_REC
);
2473 numsyllable
= oldnumsyllable2
;
2474 wordnum
= oldwordnum2
;
2476 // perhaps second word is a compound word (recursive call)
2477 if ((wordnum
< maxwordnum
) && (ok
== 0)) {
2478 compound_check_morph((word
+i
),strlen(word
+i
), wordnum
+1,
2479 numsyllable
, maxwordnum
, wnum
+ 1, words
, 0, result
, presult
);
2485 wordnum
= oldwordnum
;
2486 numsyllable
= oldnumsyllable
;
// the assignment in the condition is intentional: when the other tests hold
// it flips onlycpdrule to 1 and the body runs one more time
2488 } while (numdefcpd
&& oldwordnum
== 0 && !onlycpdrule
&& (onlycpdrule
= 1)); // end of onlycpd loop
2494 // return 1 if s1 (reversed) is a leading subset of end of s2
2495 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len)
2497 while ((len > 0) && *s1 && (*s1 == *end_of_s2)) {
2502 return (*s1 == '\0');
// isRevSubset: compare the key s1 with the text that ends at end_of_s2.
//   s1        - NUL-terminated key; a '.' in it matches any character
//   end_of_s2 - pointer to the last character of the text to compare against
//   len       - upper bound on the number of characters compared
// The loop runs while len is positive, s1 is not exhausted and the current
// characters match; the result is nonzero only if all of s1 was consumed.
// NOTE(review): the stepping of s1 / end_of_s2 / len inside the loop is not
// shown beside the condition -- presumably s1++, end_of_s2--, len--; confirm.
2506 inline int AffixMgr::isRevSubset(const char * s1
, const char * end_of_s2
, int len
)
2508 while ((len
> 0) && (*s1
!= '\0') && ((*s1
== *end_of_s2
) || (*s1
== '.'))) {
2513 return (*s1
== '\0');
2516 // check word for suffixes
// Parameters as used by the visible code:
//   word, len   - candidate word and its length in bytes
//   sfxopts     - forwarded unchanged to SfxEntry::checkword
//   ppfx        - prefix entry already stripped by the caller, or NULL
//   wlst, maxSug, ns - forwarded unchanged to SfxEntry::checkword
//   cclass      - when non-zero, zero-length suffixes without continuation
//                 flags are not tried at all
//   needflag    - forwarded unchanged to SfxEntry::checkword
//   in_compound - position inside a compound (compared with IN_CPD_BEGIN and
//                 IN_CPD_END)
// The result is collected in rv (initially NULL). The loop headers and return
// statements are not visible in this listing.
2518 struct hentry
* AffixMgr::suffix_check (const char * word
, int len
,
2519 int sfxopts
, PfxEntry
* ppfx
, char ** wlst
, int maxSug
, int * ns
,
2520 const FLAG cclass
, const FLAG needflag
, char in_compound
)
2522 struct hentry
* rv
= NULL
;
// ep is just an alias of ppfx used inside the long conditions below
2523 PfxEntry
* ep
= ppfx
;
2525 // first handle the special case of 0 length suffixes
// sStart[0] is the head of the list of suffix entries with an empty key
2526 SfxEntry
* se
= sStart
[0];
2529 if (!cclass
|| se
->getCont()) {
2530 // suffixes are not allowed in beginning of compounds
2531 if ((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2532 // except when signed with compoundpermitflag flag
2533 (se
->getCont() && compoundpermitflag
&&
2534 TESTAFF(se
->getCont(),compoundpermitflag
,se
->getContLen()))) && (!circumfix
||
2535 // no circumfix flag in prefix and suffix
2536 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2537 circumfix
, ep
->getContLen())) &&
2538 (!se
->getCont() || !(TESTAFF(se
->getCont(),circumfix
,se
->getContLen())))) ||
2539 // circumfix flag in prefix AND suffix
2540 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2541 circumfix
, ep
->getContLen())) &&
2542 (se
->getCont() && (TESTAFF(se
->getCont(),circumfix
,se
->getContLen()))))) &&
2545 !(se
->getCont() && (TESTAFF(se
->getCont(), onlyincompound
, se
->getContLen())))) &&
2546 // needaffix on prefix or first suffix
2548 !(se
->getCont() && TESTAFF(se
->getCont(), needaffix
, se
->getContLen())) ||
2549 (ppfx
&& !((ep
->getCont()) &&
2550 TESTAFF(ep
->getCont(), needaffix
,
// the last argument is 0 inside compounds, otherwise the onlyincompound flag
2553 rv
= se
->checkword(word
,len
, sfxopts
, ppfx
, wlst
, maxSug
, ns
, (FLAG
) cclass
,
2554 needflag
, (in_compound
? 0 : onlyincompound
));
2556 sfx
=se
; // BUG: sfx not stateless
2564 // now handle the general case
2565 if (len
== 0) return NULL
; // FULLSTRIP
// the suffix lists in sStart[] are indexed by the LAST byte of the word
2566 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2567 SfxEntry
* sptr
= sStart
[sp
];
2570 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)
2572 // suffixes are not allowed in beginning of compounds
2573 if ((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2574 // except when signed with compoundpermitflag flag
2575 (sptr
->getCont() && compoundpermitflag
&&
2576 TESTAFF(sptr
->getCont(),compoundpermitflag
,sptr
->getContLen()))) && (!circumfix
||
2577 // no circumfix flag in prefix and suffix
2578 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2579 circumfix
, ep
->getContLen())) &&
2580 (!sptr
->getCont() || !(TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen())))) ||
2581 // circumfix flag in prefix AND suffix
2582 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2583 circumfix
, ep
->getContLen())) &&
2584 (sptr
->getCont() && (TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen()))))) &&
2587 !((sptr
->getCont() && (TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))) &&
2588 // needaffix on prefix or first suffix
2590 !(sptr
->getCont() && TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())) ||
2591 (ppfx
&& !((ep
->getCont()) &&
2592 TESTAFF(ep
->getCont(), needaffix
,
// at the end of a compound an onlyincompound suffix is tried only together
// with a prefix
2595 ) if (in_compound
!= IN_CPD_END
|| ppfx
|| !(sptr
->getCont() && TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))) {
2596 rv
= sptr
->checkword(word
,len
, sfxopts
, ppfx
, wlst
,
2597 maxSug
, ns
, cclass
, needflag
, (in_compound
? 0 : onlyincompound
));
// the matching suffix is remembered in members of AffixMgr; as the existing
// BUG notes say, this makes the function depend on and modify object state
2599 sfx
=sptr
; // BUG: sfx not stateless
2600 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2601 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
// getNextEQ / getNextNE select the next list entry; which branch applies is
// decided by lines not visible here (presumably key matched / did not match)
2605 sptr
= sptr
->getNextEQ();
2607 sptr
= sptr
->getNextNE();
2614 // check word for two-level suffixes
// Only suffix entries whose flag is marked in contclasses[] are tried; each
// candidate is handed to SfxEntry::check_twosfx together with sfxopts, ppfx
// and needflag. The result is collected in rv (initially NULL); loop headers
// and return statements are not visible in this listing.
2616 struct hentry
* AffixMgr::suffix_check_twosfx(const char * word
, int len
,
2617 int sfxopts
, PfxEntry
* ppfx
, const FLAG needflag
)
2619 struct hentry
* rv
= NULL
;
2621 // first handle the special case of 0 length suffixes
2622 SfxEntry
* se
= sStart
[0];
2624 if (contclasses
[se
->getFlag()])
2626 rv
= se
->check_twosfx(word
,len
, sfxopts
, ppfx
, needflag
);
2632 // now handle the general case
2633 if (len
== 0) return NULL
; // FULLSTRIP
// suffix lists are indexed by the last byte of the word
2634 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2635 SfxEntry
* sptr
= sStart
[sp
];
2638 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)) {
2639 if (contclasses
[sptr
->getFlag()])
2641 rv
= sptr
->check_twosfx(word
,len
, sfxopts
, ppfx
, needflag
);
// matching suffix is recorded in AffixMgr members (see the BUG notes)
2643 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2644 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
2648 sptr
= sptr
->getNextEQ();
2650 sptr
= sptr
->getNextNE();
// Morphological-analysis counterpart of suffix_check_twosfx: instead of a
// dictionary entry it assembles text in the local buffer `result` and, when
// that buffer is non-empty, hands back a copy made with mystrdup.
// Each analysis line is built from the optional prefix morph, the string `st`
// from check_twosfx_morph and the suffix morph (or a debug flag when the
// entry has no morph text), terminated by "\n".
// NOTE(review): what is returned for an empty result, and who releases `st`,
// is not visible in this listing.
2657 char * AffixMgr::suffix_check_twosfx_morph(const char * word
, int len
,
2658 int sfxopts
, PfxEntry
* ppfx
, const FLAG needflag
)
2660 char result
[MAXLNLEN
];
2661 char result2
[MAXLNLEN
];
2662 char result3
[MAXLNLEN
];
2670 // first handle the special case of 0 length suffixes
2671 SfxEntry
* se
= sStart
[0];
2673 if (contclasses
[se
->getFlag()])
2675 st
= se
->check_twosfx_morph(word
,len
, sfxopts
, ppfx
, needflag
);
// NOTE(review): ppfx is dereferenced here; the guard against a NULL ppfx is
// not visible in this listing - confirm it exists
2678 if (ppfx
->getMorph()) {
2679 mystrcat(result
, ppfx
->getMorph(), MAXLNLEN
);
2680 mystrcat(result
, " ", MAXLNLEN
);
2681 } else debugflag(result
, ppfx
->getFlag());
2683 mystrcat(result
, st
, MAXLNLEN
);
2685 if (se
->getMorph()) {
2686 mystrcat(result
, " ", MAXLNLEN
);
2687 mystrcat(result
, se
->getMorph(), MAXLNLEN
);
2688 } else debugflag(result
, se
->getFlag());
2689 mystrcat(result
, "\n", MAXLNLEN
);
2695 // now handle the general case
2696 if (len
== 0) return NULL
; // FULLSTRIP
2697 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2698 SfxEntry
* sptr
= sStart
[sp
];
2701 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)) {
2702 if (contclasses
[sptr
->getFlag()])
2704 st
= sptr
->check_twosfx_morph(word
,len
, sfxopts
, ppfx
, needflag
);
2706 sfxflag
= sptr
->getFlag(); // BUG: sfxflag not stateless
2707 if (!sptr
->getCont()) sfxappnd
=sptr
->getKey(); // BUG: sfxappnd not stateless
// NOTE(review): strcpy is unbounded while result2 holds MAXLNLEN bytes; the
// other appends in this function go through mystrcat with an explicit limit
2708 strcpy(result2
, st
);
2713 if (sptr
->getMorph()) {
2714 mystrcat(result3
, " ", MAXLNLEN
);
2715 mystrcat(result3
, sptr
->getMorph(), MAXLNLEN
);
2716 } else debugflag(result3
, sptr
->getFlag());
2717 strlinecat(result2
, result3
);
2718 mystrcat(result2
, "\n", MAXLNLEN
);
2719 mystrcat(result
, result2
, MAXLNLEN
);
2722 sptr
= sptr
->getNextEQ();
2724 sptr
= sptr
->getNextNE();
// a non-empty analysis is returned as a mystrdup copy of the local buffer
2727 if (*result
) return mystrdup(result
);
// Morphological-analysis counterpart of suffix_check: applies the same
// compound / circumfix / onlyincompound / needaffix filters to each suffix
// entry, calls SfxEntry::checkword without a suggestion list (NULL, 0, 0),
// and for every dictionary entry found appends one "\n"-terminated line to
// the local buffer `result`. Further homonyms are fetched with
// get_next_homonym. A non-empty buffer is returned as a mystrdup copy.
// NOTE(review): the loop headers, the NULL checks around ppfx and the final
// return for an empty result are not visible in this listing.
2731 char * AffixMgr::suffix_check_morph(const char * word
, int len
,
2732 int sfxopts
, PfxEntry
* ppfx
, const FLAG cclass
, const FLAG needflag
, char in_compound
)
2734 char result
[MAXLNLEN
];
2736 struct hentry
* rv
= NULL
;
2740 PfxEntry
* ep
= ppfx
;
2742 // first handle the special case of 0 length suffixes
2743 SfxEntry
* se
= sStart
[0];
2745 if (!cclass
|| se
->getCont()) {
2746 // suffixes are not allowed in beginning of compounds
2747 if (((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2748 // except when signed with compoundpermitflag flag
2749 (se
->getCont() && compoundpermitflag
&&
2750 TESTAFF(se
->getCont(),compoundpermitflag
,se
->getContLen()))) && (!circumfix
||
2751 // no circumfix flag in prefix and suffix
2752 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2753 circumfix
, ep
->getContLen())) &&
2754 (!se
->getCont() || !(TESTAFF(se
->getCont(),circumfix
,se
->getContLen())))) ||
2755 // circumfix flag in prefix AND suffix
2756 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2757 circumfix
, ep
->getContLen())) &&
2758 (se
->getCont() && (TESTAFF(se
->getCont(),circumfix
,se
->getContLen()))))) &&
2761 !((se
->getCont() && (TESTAFF(se
->getCont(), onlyincompound
, se
->getContLen()))))) &&
2762 // needaffix on prefix or first suffix
2764 !(se
->getCont() && TESTAFF(se
->getCont(), needaffix
, se
->getContLen())) ||
2765 (ppfx
&& !((ep
->getCont()) &&
2766 TESTAFF(ep
->getCont(), needaffix
,
2770 rv
= se
->checkword(word
, len
, sfxopts
, ppfx
, NULL
, 0, 0, cclass
, needflag
);
// prefix part of the analysis line: morph text if present, else a debug flag
2773 if (ppfx
->getMorph()) {
2774 mystrcat(result
, ppfx
->getMorph(), MAXLNLEN
);
2775 mystrcat(result
, " ", MAXLNLEN
);
2776 } else debugflag(result
, ppfx
->getFlag());
// with complexprefixes the dictionary data goes BEFORE the stem field,
// otherwise it is appended after it (see the !complexprefixes branch below)
2778 if (complexprefixes
&& HENTRY_DATA(rv
)) mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
// add an explicit stem field only when the entry does not carry one already
2779 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2780 mystrcat(result
, " ", MAXLNLEN
);
2781 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
2782 mystrcat(result
, HENTRY_WORD(rv
), MAXLNLEN
);
2784 // store the pointer of the hash entry
2785 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2787 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2788 mystrcat(result
, " ", MAXLNLEN
);
2789 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2791 if (se
->getMorph()) {
2792 mystrcat(result
, " ", MAXLNLEN
);
2793 mystrcat(result
, se
->getMorph(), MAXLNLEN
);
2794 } else debugflag(result
, se
->getFlag());
2795 mystrcat(result
, "\n", MAXLNLEN
);
2796 rv
= se
->get_next_homonym(rv
, sfxopts
, ppfx
, cclass
, needflag
);
2802 // now handle the general case
2803 if (len
== 0) return NULL
; // FULLSTRIP
// suffix lists are indexed by the last byte of the word
2804 unsigned char sp
= *((const unsigned char *)(word
+ len
- 1));
2805 SfxEntry
* sptr
= sStart
[sp
];
2808 if (isRevSubset(sptr
->getKey(), word
+ len
- 1, len
)
2810 // suffixes are not allowed in beginning of compounds
2811 if (((((in_compound
!= IN_CPD_BEGIN
)) || // && !cclass
2812 // except when signed with compoundpermitflag flag
2813 (sptr
->getCont() && compoundpermitflag
&&
2814 TESTAFF(sptr
->getCont(),compoundpermitflag
,sptr
->getContLen()))) && (!circumfix
||
2815 // no circumfix flag in prefix and suffix
2816 ((!ppfx
|| !(ep
->getCont()) || !TESTAFF(ep
->getCont(),
2817 circumfix
, ep
->getContLen())) &&
2818 (!sptr
->getCont() || !(TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen())))) ||
2819 // circumfix flag in prefix AND suffix
2820 ((ppfx
&& (ep
->getCont()) && TESTAFF(ep
->getCont(),
2821 circumfix
, ep
->getContLen())) &&
2822 (sptr
->getCont() && (TESTAFF(sptr
->getCont(),circumfix
,sptr
->getContLen()))))) &&
2825 !((sptr
->getCont() && (TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))) &&
2826 // needaffix on first suffix
// (here the needaffix test is waived when cclass is set; the zero-length
// branch above instead combines it with a test on the prefix)
2827 (cclass
|| !(sptr
->getCont() &&
2828 TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())))
2829 )) rv
= sptr
->checkword(word
,len
, sfxopts
, ppfx
, NULL
, 0, 0, cclass
, needflag
);
2832 if (ppfx
->getMorph()) {
2833 mystrcat(result
, ppfx
->getMorph(), MAXLNLEN
);
2834 mystrcat(result
, " ", MAXLNLEN
);
2835 } else debugflag(result
, ppfx
->getFlag());
2837 if (complexprefixes
&& HENTRY_DATA(rv
)) mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2838 if (! HENTRY_FIND(rv
, MORPH_STEM
)) {
2839 mystrcat(result
, " ", MAXLNLEN
);
2840 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
2841 mystrcat(result
, HENTRY_WORD(rv
), MAXLNLEN
);
2843 // store the pointer of the hash entry
2844 // sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv);
2846 if (!complexprefixes
&& HENTRY_DATA(rv
)) {
2847 mystrcat(result
, " ", MAXLNLEN
);
2848 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
2851 if (sptr
->getMorph()) {
2852 mystrcat(result
, " ", MAXLNLEN
);
2853 mystrcat(result
, sptr
->getMorph(), MAXLNLEN
);
2854 } else debugflag(result
, sptr
->getFlag());
2855 mystrcat(result
, "\n", MAXLNLEN
);
2856 rv
= sptr
->get_next_homonym(rv
, sfxopts
, ppfx
, cclass
, needflag
);
2858 sptr
= sptr
->getNextEQ();
2860 sptr
= sptr
->getNextNE();
// a non-empty analysis is returned as a mystrdup copy of the local buffer
2864 if (*result
) return mystrdup(result
);
2868 // check if word with affixes is correctly spelled
// Tries the affix analyses in a fixed order: prefixes, plain suffixes, and -
// only when havecontclass is set - two-level suffixes followed by prefixes
// combined with two-level suffixes. Each result is stored in rv; the early
// returns between the steps are not visible in this listing.
2869 struct hentry
* AffixMgr::affix_check (const char * word
, int len
, const FLAG needflag
, char in_compound
)
2871 struct hentry
* rv
= NULL
;
2873 // check all prefixes (also crossed with suffixes if allowed)
2874 rv
= prefix_check(word
, len
, in_compound
, needflag
);
2877 // if still not found check all suffixes
2878 rv
= suffix_check(word
, len
, 0, NULL
, NULL
, 0, NULL
, FLAG_NULL
, needflag
, in_compound
);
2880 if (havecontclass
) {
2885 // if still not found check all two-level suffixes
2886 rv
= suffix_check_twosfx(word
, len
, 0, NULL
, needflag
);
2889 // if still not found check prefixes combined with two-level suffixes
// (this call always passes IN_CPD_NOT rather than the caller's in_compound)
2890 rv
= prefix_check_twosfx(word
, len
, IN_CPD_NOT
, needflag
);
2896 // collect the morphological analyses of an affixed word
// Runs the *_morph variants in the same order as affix_check and appends each
// returned string `st` to the local buffer `result` (bounded by MAXLNLEN).
// The function ends by returning a mystrdup copy of that buffer.
// NOTE(review): the conditions guarding each append and the release of `st`
// are not visible in this listing.
2897 char * AffixMgr::affix_check_morph(const char * word
, int len
, const FLAG needflag
, char in_compound
)
2899 char result
[MAXLNLEN
];
2904 // check all prefixes (also crossed with suffixes if allowed)
// NOTE(review): needflag is not passed to prefix_check_morph here, unlike the
// three calls below - confirm that this is intended
2905 st
= prefix_check_morph(word
, len
, in_compound
);
2907 mystrcat(result
, st
, MAXLNLEN
);
2911 // if still not found check all suffixes
2912 st
= suffix_check_morph(word
, len
, 0, NULL
, '\0', needflag
, in_compound
);
2914 mystrcat(result
, st
, MAXLNLEN
);
2918 if (havecontclass
) {
2921 // if still not found check all two-level suffixes
2922 st
= suffix_check_twosfx_morph(word
, len
, 0, NULL
, needflag
);
2924 mystrcat(result
, st
, MAXLNLEN
);
2928 // if still not found check prefixes combined with two-level suffixes
2929 st
= prefix_check_twosfx_morph(word
, len
, IN_CPD_NOT
, needflag
);
2931 mystrcat(result
, st
, MAXLNLEN
);
2936 return mystrdup(result
);
// Generate a word form whose morphological description equals targetmorph.
//   ts, wl      - stem text and its length
//   ap, al      - affix flags of the stem and their count
//   morph       - morphological description of the stem (NULL => returns NULL)
//   targetmorph - description the generated form has to match
//   level       - recursion depth; secondary suffixes are only tried at 0
// Returns NULL when the stem carries the substandard flag, and a mystrdup
// copy of ts when morph already matches targetmorph. The remaining return
// statements are not visible in this listing.
2939 char * AffixMgr::morphgen(char * ts
, int wl
, const unsigned short * ap
,
2940 unsigned short al
, char * morph
, char * targetmorph
, int level
)
2944 char * stemmorphcatpos
;
2945 char mymorph
[MAXLNLEN
];
2947 if (!morph
) return NULL
;
2949 // check substandard flag
2950 if (TESTAFF(ap
, substandard
, al
)) return NULL
;
2952 if (morphcmp(morph
, targetmorph
) == 0) return mystrdup(ts
);
2954 // int targetcount = get_sfxcount(targetmorph);
2956 // use input suffix fields, if exist
2957 if (strstr(morph
, MORPH_INFL_SFX
) || strstr(morph
, MORPH_DERI_SFX
)) {
2958 stemmorph
= mymorph
;
// NOTE(review): unbounded strcpy of morph into the MAXLNLEN-sized mymorph;
// the append on the next line is bounded, this copy is not
2959 strcpy(stemmorph
, morph
);
2960 mystrcat(stemmorph
, " ", MAXLNLEN
);
// position where each candidate suffix morph is written after the stem morph
2961 stemmorphcatpos
= stemmorph
+ strlen(stemmorph
);
2964 stemmorphcatpos
= NULL
;
2967 for (int i
= 0; i
< al
; i
++) {
// sFlag[] is indexed by the low byte of the flag; the full flag value is
// compared inside the loop
2968 const unsigned char c
= (unsigned char) (ap
[i
] & 0x00FF);
2969 SfxEntry
* sptr
= sFlag
[c
];
2971 if (sptr
->getFlag() == ap
[i
] && sptr
->getMorph() && ((sptr
->getContLen() == 0) ||
2972 // don't generate forms with substandard affixes
2973 !TESTAFF(sptr
->getCont(), substandard
, sptr
->getContLen()))) {
// NOTE(review): this strcpy into the tail of mymorph is unbounded as well
2975 if (stemmorphcatpos
) strcpy(stemmorphcatpos
, sptr
->getMorph());
2976 else stemmorph
= (char *) sptr
->getMorph();
2978 int cmp
= morphcmp(stemmorph
, targetmorph
);
2981 char * newword
= sptr
->add(ts
, wl
);
// the generated form is looked up so that forbidden or ONLYUPCASE dictionary
// entries can be filtered out
2983 hentry
* check
= pHMgr
->lookup(newword
); // XXX extra dic
2984 if (!check
|| !check
->astr
||
2985 !(TESTAFF(check
->astr
, forbiddenword
, check
->alen
) ||
2986 TESTAFF(check
->astr
, ONLYUPCASEFLAG
, check
->alen
))) {
2993 // recursive call for secondary suffixes
2994 if ((level
== 0) && (cmp
== 1) && (sptr
->getContLen() > 0) &&
2995 // (get_sfxcount(stemmorph) < targetcount) &&
2996 !TESTAFF(sptr
->getCont(), substandard
, sptr
->getContLen())) {
// NOTE(review): second declaration named newword (see line 2981); whether it
// shadows the first depends on braces not visible in this listing
2997 char * newword
= sptr
->add(ts
, wl
);
// the recursion passes level 1, so it goes at most one level deep
2999 char * newword2
= morphgen(newword
, strlen(newword
), sptr
->getCont(),
3000 sptr
->getContLen(), stemmorph
, targetmorph
, 1);
3011 sptr
= sptr
->getFlgNxt();
// Fill wlst with the root word ts and the forms obtained by applying the
// affixes listed in ap (al flags): first the root itself, then suffixed
// forms, then prefix x suffix cross products, then purely prefixed forms.
// bad / badl is the misspelled word and its length; an affix with a
// non-empty key is applied only when bad begins (prefix) or ends (suffix)
// with it. When phon is set, an extra "phonetic" entry is stored whose
// .orig points at a copy of the real word.
// NOTE(review): the end of the parameter list (where phon is declared), the
// declaration of nh / n and the final return are not visible in this listing.
3018 int AffixMgr::expand_rootword(struct guessword
* wlst
, int maxn
, const char * ts
,
3019 int wl
, const unsigned short * ap
, unsigned short al
, char * bad
, int badl
,
3023 // first add root word to list
// the root is skipped when it carries the needaffix or onlyincompound flag
3024 if ((nh
< maxn
) && !(al
&& ((needaffix
&& TESTAFF(ap
, needaffix
, al
)) ||
3025 (onlyincompound
&& TESTAFF(ap
, onlyincompound
, al
))))) {
3026 wlst
[nh
].word
= mystrdup(ts
);
3027 if (!wlst
[nh
].word
) return 0;
3028 wlst
[nh
].allow
= (1 == 0);
3029 wlst
[nh
].orig
= NULL
;
3031 // add special phonetic version
3032 if (phon
&& (nh
< maxn
)) {
3033 wlst
[nh
].word
= mystrdup(phon
);
// NOTE(review): allocation failures return nh - 1 here but 0 above; confirm
// which count callers expect after a partial fill
3034 if (!wlst
[nh
].word
) return nh
- 1;
3035 wlst
[nh
].allow
= (1 == 0);
3036 wlst
[nh
].orig
= mystrdup(ts
);
3037 if (!wlst
[nh
].orig
) return nh
- 1;
// suffixed forms: walk the suffix list of every flag of the root
3043 for (int i
= 0; i
< al
; i
++) {
3044 const unsigned char c
= (unsigned char) (ap
[i
] & 0x00FF);
3045 SfxEntry
* sptr
= sFlag
[c
];
3047 if ((sptr
->getFlag() == ap
[i
]) && (!sptr
->getKeyLen() || ((badl
> sptr
->getKeyLen()) &&
3048 (strcmp(sptr
->getAffix(), bad
+ badl
- sptr
->getKeyLen()) == 0))) &&
3049 // check needaffix flag
3050 !(sptr
->getCont() && ((needaffix
&&
3051 TESTAFF(sptr
->getCont(), needaffix
, sptr
->getContLen())) ||
3053 TESTAFF(sptr
->getCont(), circumfix
, sptr
->getContLen())) ||
3055 TESTAFF(sptr
->getCont(), onlyincompound
, sptr
->getContLen()))))
3057 char * newword
= sptr
->add(ts
, wl
);
3060 wlst
[nh
].word
= newword
;
3061 wlst
[nh
].allow
= sptr
->allowCross();
3062 wlst
[nh
].orig
= NULL
;
3064 // add special phonetic version
3065 if (phon
&& (nh
< maxn
)) {
3066 char st
[MAXWORDUTF8LEN
];
// NOTE(review): strcat into the fixed-size st has no visible length check;
// the statement that first fills st is not visible in this listing
3068 strcat(st
, sptr
->getKey());
// only the part appended after the phon prefix is reversed
3069 reverseword(st
+ strlen(phon
));
3070 wlst
[nh
].word
= mystrdup(st
);
3071 if (!wlst
[nh
].word
) return nh
- 1;
3072 wlst
[nh
].allow
= (1 == 0);
3073 wlst
[nh
].orig
= mystrdup(newword
);
3074 if (!wlst
[nh
].orig
) return nh
- 1;
3082 sptr
= sptr
->getFlgNxt();
3088 // handle cross products of prefixes and suffixes
// starts at index 1, i.e. entry 0 (the root word slot) is never crossed
3089 for (int j
=1;j
<n
;j
++)
3090 if (wlst
[j
].allow
) {
3091 for (int k
= 0; k
< al
; k
++) {
3092 const unsigned char c
= (unsigned char) (ap
[k
] & 0x00FF);
3093 PfxEntry
* cptr
= pFlag
[c
];
3095 if ((cptr
->getFlag() == ap
[k
]) && cptr
->allowCross() && (!cptr
->getKeyLen() || ((badl
> cptr
->getKeyLen()) &&
3096 (strncmp(cptr
->getKey(), bad
, cptr
->getKeyLen()) == 0)))) {
3097 int l1
= strlen(wlst
[j
].word
);
3098 char * newword
= cptr
->add(wlst
[j
].word
, l1
);
3101 wlst
[nh
].word
= newword
;
3102 wlst
[nh
].allow
= cptr
->allowCross();
3103 wlst
[nh
].orig
= NULL
;
3110 cptr
= cptr
->getFlgNxt();
3116 // now handle pure prefixes
3117 for (int m
= 0; m
< al
; m
++) {
3118 const unsigned char c
= (unsigned char) (ap
[m
] & 0x00FF);
3119 PfxEntry
* ptr
= pFlag
[c
];
3121 if ((ptr
->getFlag() == ap
[m
]) && (!ptr
->getKeyLen() || ((badl
> ptr
->getKeyLen()) &&
3122 (strncmp(ptr
->getKey(), bad
, ptr
->getKeyLen()) == 0))) &&
3123 // check needaffix flag
3124 !(ptr
->getCont() && ((needaffix
&&
3125 TESTAFF(ptr
->getCont(), needaffix
, ptr
->getContLen())) ||
3127 TESTAFF(ptr
->getCont(), circumfix
, ptr
->getContLen())) ||
3129 TESTAFF(ptr
->getCont(), onlyincompound
, ptr
->getContLen()))))
3131 char * newword
= ptr
->add(ts
, wl
);
3134 wlst
[nh
].word
= newword
;
3135 wlst
[nh
].allow
= ptr
->allowCross();
3136 wlst
[nh
].orig
= NULL
;
3143 ptr
= ptr
->getFlgNxt();
3150 // return length of replacing table
// (presumably the numrep entry count; the body is not visible in this listing)
3151 int AffixMgr::get_numrep() const
3156 // return replacing table
// yields NULL while no table is loaded (reptable is null)
3157 struct replentry
* AffixMgr::get_reptable() const
3159 if (! reptable
) return NULL
;
3163 // return iconv table
// yields NULL while no table is loaded (iconvtable is null)
3164 RepList
* AffixMgr::get_iconvtable() const
3166 if (! iconvtable
) return NULL
;
3170 // return oconv table
// yields NULL while no table is loaded (oconvtable is null)
3171 RepList
* AffixMgr::get_oconvtable() const
3173 if (! oconvtable
) return NULL
;
3177 // return phonetic table
// yields NULL while no table is loaded (phone is null)
3178 struct phonetable
* AffixMgr::get_phonetable() const
3180 if (! phone
) return NULL
;
3184 // return length of character map table
// (presumably the nummap entry count; the body is not visible in this listing)
3185 int AffixMgr::get_nummap() const
3190 // return character map table
// yields NULL while no table is loaded (maptable is null)
3191 struct mapentry
* AffixMgr::get_maptable() const
3193 if (! maptable
) return NULL
;
3197 // return length of word break table
// (the body is not visible in this listing)
3198 int AffixMgr::get_numbreak() const
3203 // return word break table
// yields NULL while no table is loaded (breaktable is null)
3204 char ** AffixMgr::get_breaktable() const
3206 if (! breaktable
) return NULL
;
3210 // return text encoding of dictionary
// Not const: when no encoding has been set yet, the member is first
// initialised with a copy of SPELL_ENCODING. Every call then returns a fresh
// mystrdup copy of the member, not the member itself
// (presumably to be released by the caller - confirm).
3211 char * AffixMgr::get_encoding()
3213 if (! encoding
) encoding
= mystrdup(SPELL_ENCODING
);
3214 return mystrdup(encoding
);
3217 // return language number of dictionary
// (presumably the langnum member; the body is not visible in this listing)
3218 int AffixMgr::get_langnum() const
3223 // return double prefix option
// (the value of the complexprefixes member)
3224 int AffixMgr::get_complexprefixes() const
3226 return complexprefixes
;
3229 // return FULLSTRIP option
// (the body is not visible in this listing)
3230 int AffixMgr::get_fullstrip() const
// accessor, presumably for the keep-case flag (body not visible in this listing)
3235 FLAG
AffixMgr::get_keepcase() const
// accessor, presumably for the force-uppercase flag (body not visible in this listing)
3240 FLAG
AffixMgr::get_forceucase() const
// accessor, presumably for the warn flag (body not visible in this listing)
3245 FLAG
AffixMgr::get_warn() const
// accessor, presumably for the forbidwarn option (body not visible in this listing)
3250 int AffixMgr::get_forbidwarn() const
// accessor, presumably for the checksharps option (body not visible in this listing)
3255 int AffixMgr::get_checksharps() const
// thin wrapper: forwards aflag to the hash manager (pHMgr) and returns the
// char * it produces unchanged
3260 char * AffixMgr::encode_flag(unsigned short aflag
) const
3262 return pHMgr
->encode_flag(aflag
);
3266 // return the preferred ignore string for suggestions
// yields NULL when no ignore characters are set (ignorechars is null)
3267 char * AffixMgr::get_ignore() const
3269 if (!ignorechars
) return NULL
;
3273 // return the ignore characters as UTF-16 code units
// *len receives ignorechars_utf16_len; the returned pointer is the
// ignorechars_utf16 member itself, not a copy
3274 unsigned short * AffixMgr::get_ignore_utf16(int * len
) const
3276 *len
= ignorechars_utf16_len
;
3277 return ignorechars_utf16
;
3280 // return the keyboard string for suggestions
// Not const: an unset keystring is first initialised with a copy of
// SPELL_KEYSTRING. Every call returns a fresh mystrdup copy of the member
// (presumably to be released by the caller - confirm).
3281 char * AffixMgr::get_key_string()
3283 if (! keystring
) keystring
= mystrdup(SPELL_KEYSTRING
);
3284 return mystrdup(keystring
);
3287 // return the preferred try string for suggestions
// NULL when no try string is set; otherwise a fresh mystrdup copy of it
3288 char * AffixMgr::get_try_string() const
3290 if (! trystring
) return NULL
;
3291 return mystrdup(trystring
);
3294 // return the word characters
// (presumably the wordchars member; the body is not visible in this listing)
3295 const char * AffixMgr::get_wordchars() const
// return the word characters as UTF-16 code units
// *len receives wordchars_utf16_len; the returned pointer is the
// wordchars_utf16 member itself, not a copy
3300 unsigned short * AffixMgr::get_wordchars_utf16(int * len
) const
3302 *len
= wordchars_utf16_len
;
3303 return wordchars_utf16
;
3306 // is there compounding?
// true when compoundflag or compoundbegin is set, or when numdefcpd is
// non-zero (numdefcpd is the entry count stored by parse_defcpdtable)
3307 int AffixMgr::get_compound() const
3309 return compoundflag
|| compoundbegin
|| numdefcpd
;
3312 // return the compound words control flag
// (the value of the compoundflag member)
3313 FLAG
AffixMgr::get_compoundflag() const
3315 return compoundflag
;
3318 // return the forbidden words control flag
// (the value of the forbiddenword member)
3319 FLAG
AffixMgr::get_forbiddenword() const
3321 return forbiddenword
;
3324 // return the nosuggest flag
// (presumably the nosuggest member; the body is not visible in this listing)
3325 FLAG
AffixMgr::get_nosuggest() const
3330 // return the nongramsuggest flag
// (the value of the nongramsuggest member)
3331 FLAG
AffixMgr::get_nongramsuggest() const
3333 return nongramsuggest
;
3336 // return the needaffix flag
// (presumably the needaffix member; the body is not visible in this listing)
3337 FLAG
AffixMgr::get_needaffix() const
3342 // return the onlyincompound flag
// (the value of the onlyincompound member)
3343 FLAG
AffixMgr::get_onlyincompound() const
3345 return onlyincompound
;
3348 // return the compound word signal flag
// (the value of the compoundroot member)
3349 FLAG
AffixMgr::get_compoundroot() const
3351 return compoundroot
;
3354 // return the compound begin signal flag
// (the value of the compoundbegin member)
3355 FLAG
AffixMgr::get_compoundbegin() const
3357 return compoundbegin
;
3360 // return the value of checknum
// (the body is not visible in this listing)
3361 int AffixMgr::get_checknum() const
3366 // return the value of prefix
// i.e. the key of the pfx entry when pfx is set; what is returned for an
// unset pfx is not visible in this listing
3367 const char * AffixMgr::get_prefix() const
3369 if (pfx
) return pfx
->getKey();
3373 // return the value of suffix
// (the body is not visible in this listing)
3374 const char * AffixMgr::get_suffix() const
3379 // return the version string
// (the body is not visible in this listing)
3380 const char * AffixMgr::get_version() const
3385 // return lemma_present flag
// (the value of the lemma_present member)
3386 FLAG
AffixMgr::get_lemma_present() const
3388 return lemma_present
;
3391 // utility method to look up root words in hash table
// Queries the dictionaries alldic[0] .. alldic[*maxdic - 1] in order and
// stops at the first one that knows the word; he stays NULL when none does.
3392 struct hentry
* AffixMgr::lookup(const char * word
)
3395 struct hentry
* he
= NULL
;
3396 for (i
= 0; i
< *maxdic
&& !he
; i
++) {
3397 he
= (alldic
[i
])->lookup(word
);
3402 // return the havecontclass setting
// (the same member gates the two-level suffix checks in affix_check)
3403 int AffixMgr::have_contclass() const
3405 return havecontclass
;
// accessor, presumably for the utf8 setting (body not visible in this listing)
3409 int AffixMgr::get_utf8() const
// return the value of the maxngramsugs member
3414 int AffixMgr::get_maxngramsugs(void) const
3416 return maxngramsugs
;
// accessor, presumably for the maxcpdsugs setting (body not visible in this listing)
3419 int AffixMgr::get_maxcpdsugs(void) const
// accessor, presumably for the maxdiff setting (body not visible in this listing)
3424 int AffixMgr::get_maxdiff(void) const
// accessor, presumably for the onlymaxdiff setting (body not visible in this listing)
3429 int AffixMgr::get_onlymaxdiff(void) const
3434 // return nosplitsugs
// (the body is not visible in this listing)
3435 int AffixMgr::get_nosplitsugs(void) const
3440 // return sugswithdots
// (the value of the sugswithdots member)
3441 int AffixMgr::get_sugswithdots(void) const
3443 return sugswithdots
;
// Parse one flag-valued parameter line of the affix file into *out.
//   line - the parameter line, handed to parse_string
//   out  - destination flag; it is expected to still be FLAG_NULL or at least
//          DEFAULTFLAGS, otherwise a "multiple definitions" warning is issued
//   af   - affix file reader, used here for line numbers in messages
// Returns 1 when parse_string fails. The success return and the handling
// after the warning are not visible in this listing.
3447 int AffixMgr::parse_flag(char * line
, unsigned short * out
, FileMgr
* af
) {
3449 if (*out
!= FLAG_NULL
&& !(*out
>= DEFAULTFLAGS
)) {
3450 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix file parameter\n", af
->getlinenum());
3453 if (parse_string(line
, &s
, af
->getlinenum())) return 1;
// the textual flag is converted by the hash manager
3454 *out
= pHMgr
->decode_flag(s
);
// Parse one numeric parameter line of the affix file into *out.
// Issues the same "multiple definitions" warning as parse_flag (the condition
// that triggers it is not visible in this listing) and returns 1 when
// parse_string fails. The conversion of the parsed string to a number is
// also outside the visible lines.
3460 int AffixMgr::parse_num(char * line
, int * out
, FileMgr
* af
) {
3463 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix file parameter\n", af
->getlinenum());
3466 if (parse_string(line
, &s
, af
->getlinenum())) return 1;
3472 /* parse in the max syllablecount of compound words and the vowel list */
// Fields of the line, split with mystrsep:
//   0 - keyword (skipped)
//   1 - cpdmaxsyllable, converted with atoi
//   2 - vowel characters: kept as a copy in cpdvowels and, in the UTF-16
//       path, converted with u8_u16, sorted with flag_qsort and copied into
//       the freshly allocated cpdvowels_utf16 (length cpdvowels_utf16_len)
// When only two fields are present, cpdvowels defaults to "aeiouAEIOU".
// Returns 1 when the UTF-16 buffer cannot be allocated. The other return
// statements are not visible in this listing.
3473 int AffixMgr::parse_cpdsyllable(char * line
, FileMgr
* af
)
3479 w_char w
[MAXWORDLEN
];
3480 piece
= mystrsep(&tp
, 0);
3482 if (*piece
!= '\0') {
3484 case 0: { np
++; break; }
3485 case 1: { cpdmaxsyllable
= atoi(piece
); np
++; break; }
3488 cpdvowels
= mystrdup(piece
);
// NOTE(review): the check on n (u8_u16 result) before the malloc below is not
// visible in this listing - confirm n > 0 is guaranteed there
3490 int n
= u8_u16(w
, MAXWORDLEN
, piece
);
3492 flag_qsort((unsigned short *) w
, 0, n
);
3493 cpdvowels_utf16
= (w_char
*) malloc(n
* sizeof(w_char
));
3494 if (!cpdvowels_utf16
) return 1;
3495 memcpy(cpdvowels_utf16
, w
, n
* sizeof(w_char
));
3497 cpdvowels_utf16_len
= n
;
3506 piece
= mystrsep(&tp
, 0);
3509 HUNSPELL_WARNING(stderr
, "error: line %d: missing compoundsyllable information\n", af
->getlinenum());
// no vowel field given: fall back to the default vowel set
3512 if (np
== 2) cpdvowels
= mystrdup("aeiouAEIOU");
3516 /* parse in the typical fault correcting table */
// Header line: keyword followed by the entry count (numrep); the table is
// allocated with malloc for numrep entries.
// Each following line must start with "REP" and supplies two fields:
//   1 - pattern:  a leading '^' sets .start and is dropped; a trailing '$'
//                 sets .end and is cut off; '_' is replaced by a space
//   2 - pattern2: replacement text, '_' replaced by a space
// A line lacking either field is reported as a corrupt table.
// Returns 1 on allocation failure or when the file ends early. The other
// return statements are not visible in this listing.
3517 int AffixMgr::parse_reptable(char * line
, FileMgr
* af
)
3520 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3527 piece
= mystrsep(&tp
, 0);
3529 if (*piece
!= '\0') {
3531 case 0: { np
++; break; }
3533 numrep
= atoi(piece
);
3535 HUNSPELL_WARNING(stderr
, "error: line %d: incorrect entry number\n", af
->getlinenum());
3538 reptable
= (replentry
*) malloc(numrep
* sizeof(struct replentry
));
3539 if (!reptable
) return 1;
3547 piece
= mystrsep(&tp
, 0);
3550 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3554 /* now parse the numrep lines to read in the remainder of the table */
3556 for (int j
=0; j
< numrep
; j
++) {
3557 if (!(nl
= af
->getline())) return 1;
// both patterns start out NULL so that a missing field can be detected below
3561 reptable
[j
].pattern
= NULL
;
3562 reptable
[j
].pattern2
= NULL
;
3563 piece
= mystrsep(&tp
, 0);
3565 if (*piece
!= '\0') {
// only the first three characters are compared, so any keyword that merely
// begins with REP passes this test
3568 if (strncmp(piece
,"REP",3) != 0) {
3569 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3576 if (*piece
== '^') reptable
[j
].start
= true; else reptable
[j
].start
= false;
// int(start) is 1 for an anchored pattern, which skips the leading '^'
3577 reptable
[j
].pattern
= mystrrep(mystrdup(piece
+ int(reptable
[j
].start
)),"_"," ");
// NOTE(review): for an empty pattern (e.g. the field is just "^") lr becomes
// -1 and pattern[lr] reads before the buffer; the pattern pointer is also
// used here without a visible NULL check
3578 int lr
= strlen(reptable
[j
].pattern
) - 1;
3579 if (reptable
[j
].pattern
[lr
] == '$') {
3580 reptable
[j
].end
= true;
3581 reptable
[j
].pattern
[lr
] = '\0';
3582 } else reptable
[j
].end
= false;
3585 case 2: { reptable
[j
].pattern2
= mystrrep(mystrdup(piece
),"_"," "); break; }
3590 piece
= mystrsep(&tp
, 0);
3592 if ((!(reptable
[j
].pattern
)) || (!(reptable
[j
].pattern2
))) {
3593 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3601 /* parse in a conversion table (RepList) introduced by the given keyword */
// Header line: keyword followed by the entry count (numrl); a RepList of that
// size is created and stored in *rl.
// Each following line must start with `keyword` and supplies two fields,
// pattern and pattern2, in which '_' is replaced by a space; the pair is then
// added to the list.
// Returns 1 when the file ends early. The other return statements are not
// visible in this listing.
3602 int AffixMgr::parse_convtable(char * line
, FileMgr
* af
, RepList
** rl
, const char * keyword
)
3605 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3613 piece
= mystrsep(&tp
, 0);
3615 if (*piece
!= '\0') {
3617 case 0: { np
++; break; }
3619 numrl
= atoi(piece
);
3621 HUNSPELL_WARNING(stderr
, "error: line %d: incorrect entry number\n", af
->getlinenum());
3624 *rl
= new RepList(numrl
);
3633 piece
= mystrsep(&tp
, 0);
3636 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3640 /* now parse the num lines to read in the remainder of the table */
3642 for (int j
=0; j
< numrl
; j
++) {
3643 if (!(nl
= af
->getline())) return 1;
3647 char * pattern
= NULL
;
3648 char * pattern2
= NULL
;
3649 piece
= mystrsep(&tp
, 0);
3651 if (*piece
!= '\0') {
// NOTE(review): keyword is a `const char *` parameter, so sizeof(keyword) is
// the size of a pointer (typically 4 or 8), not the keyword length. The
// number of characters compared is therefore platform dependent and can be
// longer or shorter than the keyword; strlen(keyword) looks like the intent.
3654 if (strncmp(piece
, keyword
, sizeof(keyword
)) != 0) {
3655 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3662 case 1: { pattern
= mystrrep(mystrdup(piece
),"_"," "); break; }
3664 pattern2
= mystrrep(mystrdup(piece
),"_"," ");
3671 piece
= mystrsep(&tp
, 0);
3673 if (!pattern
|| !pattern2
) {
3678 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3681 (*rl
)->add(pattern
, pattern2
);
3687 /* parse in the phonetic (PHONE) table */
// Header line: keyword followed by the rule count; a phonetable is allocated,
// its utf8 field is copied from the utf8 member and a rules array with room
// for 2 * (num + 1) strings is reserved.
// Each following line must start with "PHONE" and supplies two fields that
// are stored at rules[2*j] and rules[2*j + 1] with every '_' removed.
// After the last rule a pair of empty strings is appended and
// init_phonet_hash is called on the finished table.
// Returns 1 on allocation failure or when the file ends early. The other
// return statements are not visible in this listing.
3688 int AffixMgr::parse_phonetable(char * line
, FileMgr
* af
)
3691 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3698 piece
= mystrsep(&tp
, 0);
3700 if (*piece
!= '\0') {
3702 case 0: { np
++; break; }
3704 phone
= (phonetable
*) malloc(sizeof(struct phonetable
));
3705 if (!phone
) return 1;
3706 phone
->num
= atoi(piece
);
3707 phone
->rules
= NULL
;
3708 phone
->utf8
= (char) utf8
;
3709 if (phone
->num
< 1) {
3710 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// two strings per rule plus one extra pair for the terminating empty rule
3713 phone
->rules
= (char * *) malloc(2 * (phone
->num
+ 1) * sizeof(char *));
3714 if (!phone
->rules
) {
3726 piece
= mystrsep(&tp
, 0);
3729 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3733 /* now parse the phone->num lines to read in the remainder of the table */
3735 for (int j
=0; j
< phone
->num
; j
++) {
3736 if (!(nl
= af
->getline())) return 1;
// both halves start out NULL so that a missing field can be detected below
3740 phone
->rules
[j
* 2] = NULL
;
3741 phone
->rules
[j
* 2 + 1] = NULL
;
3742 piece
= mystrsep(&tp
, 0);
3744 if (*piece
!= '\0') {
3747 if (strncmp(piece
,"PHONE",5) != 0) {
3748 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// unlike the REP table, '_' is replaced by the empty string here
3754 case 1: { phone
->rules
[j
* 2] = mystrrep(mystrdup(piece
),"_",""); break; }
3755 case 2: { phone
->rules
[j
* 2 + 1] = mystrrep(mystrdup(piece
),"_",""); break; }
3760 piece
= mystrsep(&tp
, 0);
3762 if ((!(phone
->rules
[j
* 2])) || (!(phone
->rules
[j
* 2 + 1]))) {
3763 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// terminating pair of empty strings after the last rule
3768 phone
->rules
[phone
->num
* 2] = mystrdup("");
3769 phone
->rules
[phone
->num
* 2 + 1] = mystrdup("");
3770 init_phonet_hash(*phone
);
3774 /* parse in the checkcompoundpattern table */
// A second table definition (numcheckcpd already non-zero) is reported.
// Header line: keyword followed by the entry count (must be >= 1); the table
// is allocated with malloc for that many patentry records.
// Each following line must start with "CHECKCOMPOUNDPATTERN" and supplies:
//   1 - pattern,  optionally followed by "/flag" which is decoded into .cond
//   2 - pattern2, optionally followed by "/flag" which is decoded into .cond2
//   3 - pattern3 (optional); its presence switches simplifiedcpd on
// pattern and pattern2 are mandatory, otherwise the table is reported corrupt.
// Returns 1 on allocation failure or when the file ends early. The other
// return statements are not visible in this listing.
3775 int AffixMgr::parse_checkcpdtable(char * line
, FileMgr
* af
)
3777 if (numcheckcpd
!= 0) {
3778 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3785 piece
= mystrsep(&tp
, 0);
3787 if (*piece
!= '\0') {
3789 case 0: { np
++; break; }
3791 numcheckcpd
= atoi(piece
);
3792 if (numcheckcpd
< 1) {
3793 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
3796 checkcpdtable
= (patentry
*) malloc(numcheckcpd
* sizeof(struct patentry
));
3797 if (!checkcpdtable
) return 1;
3805 piece
= mystrsep(&tp
, 0);
3808 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3812 /* now parse the numcheckcpd lines to read in the remainder of the table */
3814 for (int j
=0; j
< numcheckcpd
; j
++) {
3815 if (!(nl
= af
->getline())) return 1;
// reset every field of the entry before the line is parsed
3819 checkcpdtable
[j
].pattern
= NULL
;
3820 checkcpdtable
[j
].pattern2
= NULL
;
3821 checkcpdtable
[j
].pattern3
= NULL
;
3822 checkcpdtable
[j
].cond
= FLAG_NULL
;
3823 checkcpdtable
[j
].cond2
= FLAG_NULL
;
3824 piece
= mystrsep(&tp
, 0);
3826 if (*piece
!= '\0') {
3829 if (strncmp(piece
,"CHECKCOMPOUNDPATTERN",20) != 0) {
3830 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3837 checkcpdtable
[j
].pattern
= mystrdup(piece
);
// an optional "/flag" suffix carries the condition flag of the pattern
// NOTE(review): strchr is applied to the mystrdup result without a visible
// NULL check
3838 char * p
= strchr(checkcpdtable
[j
].pattern
, '/');
3841 checkcpdtable
[j
].cond
= pHMgr
->decode_flag(p
+ 1);
3845 checkcpdtable
[j
].pattern2
= mystrdup(piece
);
3846 char * p
= strchr(checkcpdtable
[j
].pattern2
, '/');
3849 checkcpdtable
[j
].cond2
= pHMgr
->decode_flag(p
+ 1);
3853 case 3: { checkcpdtable
[j
].pattern3
= mystrdup(piece
); simplifiedcpd
= 1; break; }
3858 piece
= mystrsep(&tp
, 0);
3860 if ((!(checkcpdtable
[j
].pattern
)) || (!(checkcpdtable
[j
].pattern2
))) {
3861 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3869 /* parse in the compound rule table */
// A second table definition (numdefcpd already non-zero) is reported.
// Header line: keyword followed by the entry count (must be >= 1); the table
// is allocated with malloc for that many flagentry records.
// Each following line must start with "COMPOUNDRULE" and supplies one rule:
//   - with parentheses: the rule is split at '(' and ')' and each part is
//     either a '*' / '?' operator stored as-is or a flag group decoded with
//     decode_flags and appended to .def
//   - without parentheses: the whole field is decoded by decode_flags
// An entry that ends up with length 0 is reported as a corrupt table.
// Returns 1 on allocation failure of the table or when the file ends early.
// The other return statements are not visible in this listing.
3870 int AffixMgr::parse_defcpdtable(char * line
, FileMgr
* af
)
3872 if (numdefcpd
!= 0) {
3873 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3880 piece
= mystrsep(&tp
, 0);
3882 if (*piece
!= '\0') {
3884 case 0: { np
++; break; }
3886 numdefcpd
= atoi(piece
);
3887 if (numdefcpd
< 1) {
3888 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
3891 defcpdtable
= (flagentry
*) malloc(numdefcpd
* sizeof(flagentry
));
3892 if (!defcpdtable
) return 1;
3900 piece
= mystrsep(&tp
, 0);
3903 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
3907 /* now parse the numdefcpd lines to read in the remainder of the table */
3909 for (int j
=0; j
< numdefcpd
; j
++) {
3910 if (!(nl
= af
->getline())) return 1;
3914 defcpdtable
[j
].def
= NULL
;
3915 piece
= mystrsep(&tp
, 0);
3917 if (*piece
!= '\0') {
3920 if (strncmp(piece
, "COMPOUNDRULE", 12) != 0) {
3921 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3927 case 1: { // handle parenthesized flags
3928 if (strchr(piece
, '(')) {
// one FLAG slot per input character is reserved for the rule
// NOTE(review): no NULL check on this malloc is visible before .def is
// written through further down - confirm
3929 defcpdtable
[j
].def
= (FLAG
*) malloc(strlen(piece
) * sizeof(FLAG
));
3930 defcpdtable
[j
].len
= 0;
// find the end of the current part: next '(' or ')' or end of string
3934 char * par
= piece
+ 1;
3935 while (*par
!= '(' && *par
!= ')' && *par
!= '\0') par
++;
// the delimiter is overwritten with NUL, i.e. the input line is modified
3936 if (*par
== '\0') end
= 1; else *par
= '\0';
3937 if (*piece
== '(') piece
++;
// '*' and '?' are stored as their character codes, not decoded as flags
3938 if (*piece
== '*' || *piece
== '?') {
3939 defcpdtable
[j
].def
[defcpdtable
[j
].len
++] = (FLAG
) *piece
;
3940 } else if (*piece
!= '\0') {
3941 int l
= pHMgr
->decode_flags(&conv
, piece
, af
);
3942 for (int k
= 0; k
< l
; k
++) defcpdtable
[j
].def
[defcpdtable
[j
].len
++] = conv
[k
];
// no parentheses: let decode_flags allocate and fill .def directly
3948 defcpdtable
[j
].len
= pHMgr
->decode_flags(&(defcpdtable
[j
].def
), piece
, af
);
3956 piece
= mystrsep(&tp
, 0);
3958 if (!defcpdtable
[j
].len
) {
3959 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
3968 /* parse in the character map table */
// Reads the character map table: a header line carrying the number of
// entries, followed by that many MAP rows. Each row is split into a set of
// strings stored in maptable[j].set with the element count in maptable[j].len.
//   line - header line (tokenised with mystrsep; the setup of tp is not
//          visible in this excerpt)
//   af   - affix file reader, used for further lines and for line numbers
// Returns 1 on the failure paths visible below (allocation failure, premature
// end of file). NOTE(review): the success return value is not visible in this
// excerpt - presumably 0; confirm.
3969 int AffixMgr::parse_maptable(char * line
, FileMgr
* af
)
3972 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
3979 piece
= mystrsep(&tp
, 0);
// Empty pieces are ignored; only non-empty tokens are counted/processed.
3981 if (*piece
!= '\0') {
3983 case 0: { np
++; break; }
// Entry count as announced by the header line.
// NOTE(review): the range check guarding the warning below is not visible in
// this excerpt - confirm it rejects values below 1.
3985 nummap
= atoi(piece
);
3987 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// One mapentry per announced row; allocation failure aborts parsing.
3990 maptable
= (mapentry
*) malloc(nummap
* sizeof(struct mapentry
));
3991 if (!maptable
) return 1;
3999 piece
= mystrsep(&tp
, 0);
4002 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
4006 /* now parse the nummap lines to read in the remainder of the table */
4008 for (int j
=0; j
< nummap
; j
++) {
// Running out of lines before nummap rows were read is a failure.
4009 if (!(nl
= af
->getline())) return 1;
// Start each row empty so the check after the row can detect rows that
// produced nothing.
4013 maptable
[j
].set
= NULL
;
4014 maptable
[j
].len
= 0;
4015 piece
= mystrsep(&tp
, 0);
4017 if (*piece
!= '\0') {
// Every row has to begin with the MAP keyword.
4020 if (strncmp(piece
,"MAP",3) != 0) {
4021 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// len is first set to the byte length of the piece, which serves both as the
// capacity of the pointer array and as the bound of the scan loop; it is
// replaced by the real element count (setn) once the scan is done.
4029 maptable
[j
].len
= strlen(piece
);
4030 maptable
[j
].set
= (char **) malloc(maptable
[j
].len
* sizeof(char*));
4031 if (!maptable
[j
].set
) return 1;
4032 for (int k
= 0; k
< maptable
[j
].len
; k
++) {
// A '(' opens a multi-character element that runs up to the matching ')';
// chl is the number of bytes between the two parentheses.
4035 if (piece
[k
] == '(') {
4036 char * parpos
= strchr(piece
+ k
, ')');
4037 if (parpos
!= NULL
) {
4039 chl
= (int)(parpos
- piece
) - k
- 1;
// In UTF-8 mode a byte with both top bits set starts a multi-byte sequence;
// the empty-bodied loop advances k across the following continuation bytes
// (top bits 10).
4043 if (utf8
&& (piece
[k
] & 0xc0) == 0xc0) {
4044 for (k
++; utf8
&& (piece
[k
] & 0xc0) == 0x80; k
++);
// Copy the element (chl bytes starting at offset chb) into its own
// NUL-terminated buffer. chb, chl and setn are declared/updated on lines
// that are not visible in this excerpt.
4049 maptable
[j
].set
[setn
] = (char *) malloc(chl
+ 1);
4050 if (!maptable
[j
].set
[setn
]) return 1;
4051 strncpy(maptable
[j
].set
[setn
], piece
+ chb
, chl
);
4052 maptable
[j
].set
[setn
][chl
] = '\0';
// Replace the provisional byte length with the number of elements stored.
4055 maptable
[j
].len
= setn
;
4061 piece
= mystrsep(&tp
, 0);
// A row without any stored element is treated as a corrupt table.
4063 if (!maptable
[j
].set
|| !maptable
[j
].len
) {
4064 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
4072 /* parse in the word breakpoint table */
// Reads the word break table: a header line carrying the number of entries,
// followed by that many BREAK rows; each row's pattern is duplicated into
// breaktable[j].
//   line - header line (tokenised with mystrsep; the setup of tp is not
//          visible in this excerpt)
//   af   - affix file reader, used for further lines and for line numbers
// Returns 0 early when the header announces zero entries and 1 on the failure
// paths visible below. NOTE(review): the regular success return is not
// visible in this excerpt - presumably 0; confirm.
4073 int AffixMgr::parse_breaktable(char * line
, FileMgr
* af
)
// Any value above -1 means a table header was already processed (so an
// explicit "0 entries" header also counts as a definition).
4075 if (numbreak
> -1) {
4076 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
4083 piece
= mystrsep(&tp
, 0);
// Empty pieces are ignored; only non-empty tokens are counted/processed.
4085 if (*piece
!= '\0') {
4087 case 0: { np
++; break; }
// Entry count as announced by the header line.
// NOTE(review): the range check guarding the warning below is not visible in
// this excerpt - confirm which values it rejects.
4089 numbreak
= atoi(piece
);
4091 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
// An empty table is legal: nothing to allocate or read, report success.
4094 if (numbreak
== 0) return 0;
// One string pointer per announced row; allocation failure aborts parsing.
4095 breaktable
= (char **) malloc(numbreak
* sizeof(char *));
4096 if (!breaktable
) return 1;
4104 piece
= mystrsep(&tp
, 0);
4107 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
4111 /* now parse the numbreak lines to read in the remainder of the table */
4113 for (int j
=0; j
< numbreak
; j
++) {
// Running out of lines before numbreak rows were read is a failure.
4114 if (!(nl
= af
->getline())) return 1;
4118 piece
= mystrsep(&tp
, 0);
4120 if (*piece
!= '\0') {
// Every row has to begin with the BREAK keyword.
4123 if (strncmp(piece
,"BREAK",5) != 0) {
4124 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// Keep a private copy of the break pattern.
// NOTE(review): the result of mystrdup is not checked on the visible lines.
4131 breaktable
[j
] = mystrdup(piece
);
4138 piece
= mystrsep(&tp
, 0);
4141 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// Rewrites a condition string in place, visiting its characters from the last
// one back to the first. parse_affix calls it on a condition that has just
// been reversed with reverseword/reverseword_utf (complexprefixes mode), so
// its job is to put bracket expressions such as "[abc]" / "[^abc]" back into
// a well-formed orientation after the character order was flipped.
// NOTE(review): the construct that selects between the four statements below
// (presumably a switch on *k) and the declaration of neg are not visible in
// this excerpt; the per-line notes are therefore hedged.
4149 void AffixMgr::reverse_condition(char * piece
) {
// NOTE(review): the loop ends by decrementing k to one before piece, and for
// an empty string it starts at piece - 1; forming a pointer before the start
// of the buffer is undefined behaviour in C++ even if it is never
// dereferenced. An index-based loop would avoid that.
4151 for (char * k
= piece
+ strlen(piece
) - 1; k
>= piece
; k
--) {
// While neg is set, characters are written one position to the right
// (k + 1) instead of in place - presumably to make room for the '^' marker
// that has to follow the opening bracket; confirm against the full source.
// NOTE(review): *(k+1) is only inside the string when k is not the last
// character - confirm neg can never be set at that position.
4154 if (neg
) *(k
+1) = '['; else *k
= ']';
4159 if (neg
) *(k
+1) = '^';
// A ']' immediately to the right marks the start of negated-class handling.
4164 if (*(k
+1) == ']') neg
= 1; else *(k
+1) = *k
;
4168 if (neg
) *(k
+1) = *k
;
// Parses one complete prefix or suffix definition from the affix file: the
// header line (type, flag, cross-product marker, entry count) followed by
// numents entry lines, then turns every entry into a PfxEntry or SfxEntry and
// links it in with build_pfxtree / build_sfxtree.
//   line     - header line (tokenised with mystrsep; the setup of tp is not
//              visible in this excerpt)
//   at       - affix type; compared against 'S' and 'P' below
//   af       - affix file reader, used for further lines and for line numbers
//   dupflags - per-flag bookkeeping (indexed by the decoded flag) recording
//              whether a flag was already defined as prefix and/or suffix
// Returns 1 on the failure paths visible below. NOTE(review): the success
// return value is not visible in this excerpt - presumably 0; confirm.
4174 int AffixMgr::parse_affix(char * line
, const char at
, FileMgr
* af
, char * dupflags
)
4176 int numents
= 0; // number of affentry structures to parse
4178 unsigned short aflag
= 0; // affix char identifier
4181 std::vector
<affentry
> affentries
;
4188 // checking lines with bad syntax
4190 int basefieldnum
= 0;
4193 // split affix header line into pieces
4197 piece
= mystrsep(&tp
, 0);
// Empty pieces are ignored; only non-empty tokens are counted/processed.
4199 if (*piece
!= '\0') {
4201 // piece 1 - is type of affix
4202 case 0: { np
++; break; }
4204 // piece 2 - is affix char
4207 aflag
= pHMgr
->decode_flag(piece
);
// A flag that was already registered for this affix type is only reported;
// parsing deliberately continues (see the disabled return below).
4208 if (((at
== 'S') && (dupflags
[aflag
] & dupSFX
)) ||
4209 ((at
== 'P') && (dupflags
[aflag
] & dupPFX
))) {
4210 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of an affix flag\n",
4212 // return 1; XXX permissive mode for bad dictionaries
// Remember that this flag now has a definition of the given type.
// NOTE(review): this uses += rather than |=. On the permissive duplicate
// path above the bit is already set, so adding it again would carry into a
// neighbouring bit - confirm against the values of dupSFX / dupPFX.
4214 dupflags
[aflag
] += (char) ((at
== 'S') ? dupSFX
: dupPFX
);
4217 // piece 3 - is cross product indicator
4218 case 2: { np
++; if (*piece
== 'Y') ff
= aeXPRODUCT
; break; }
4220 // piece 4 - is number of affentries
4223 numents
= atoi(piece
);
4225 char * err
= pHMgr
->encode_flag(aflag
);
4227 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n",
// Create all entries up front. The first entry carries the option bits
// (cross product, UTF-8, flag/morph aliasing) and the flag; the entry loop
// below copies them to the remaining entries.
// NOTE(review): affentries[0] requires numents >= 1, and a negative numents
// would convert to a huge size in resize(). The guard in front of the
// warning above is not visible in this excerpt - confirm it covers both.
4233 affentries
.resize(numents
);
4234 affentries
[0].opts
= ff
;
4235 if (utf8
) affentries
[0].opts
+= aeUTF8
;
4236 if (pHMgr
->is_aliasf()) affentries
[0].opts
+= aeALIASF
;
4237 if (pHMgr
->is_aliasm()) affentries
[0].opts
+= aeALIASM
;
4238 affentries
[0].aflag
= aflag
;
4245 piece
= mystrsep(&tp
, 0);
4247 // check to make sure we parsed enough pieces
4249 char * err
= pHMgr
->encode_flag(aflag
);
4251 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
4257 // now parse numents affentries for this affix
4258 std::vector
<affentry
>::iterator start
= affentries
.begin();
4259 std::vector
<affentry
>::iterator end
= affentries
.end();
4260 for (std::vector
<affentry
>::iterator entry
= start
; entry
!= end
; ++entry
) {
// Running out of lines before numents entries were read is a failure.
4261 if (!(nl
= af
->getline())) return 1;
4267 // split line into pieces
4268 piece
= mystrsep(&tp
, 0);
4270 if (*piece
!= '\0') {
4272 // piece 1 - is type
// Entries after the first inherit the header-level option bits from the
// first entry (masked to the four header options).
4275 if (entry
!= start
) entry
->opts
= start
->opts
&
4276 (char) (aeXPRODUCT
+ aeUTF8
+ aeALIASF
+ aeALIASM
);
4280 // piece 2 - is affix char
// The flag on every entry line has to match the flag of the header line.
4283 if (pHMgr
->decode_flag(piece
) != aflag
) {
4284 char * err
= pHMgr
->encode_flag(aflag
);
4286 HUNSPELL_WARNING(stderr
, "error: line %d: affix %s is corrupt\n",
4287 af
->getlinenum(), err
);
4293 if (entry
!= start
) entry
->aflag
= start
->aflag
;
4297 // piece 3 - is string to strip or 0 for null
// In complexprefixes mode the strings of an entry are stored reversed.
4300 if (complexprefixes
) {
4301 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
// The strip length is stored in an unsigned char, so longer strings are
// truncated modulo 256 by the cast. A literal "0" stands for "strip nothing"
// and is replaced by an empty string.
4303 entry
->strip
= mystrdup(piece
);
4304 entry
->stripl
= (unsigned char) strlen(entry
->strip
);
4305 if (strcmp(entry
->strip
,"0") == 0) {
4307 entry
->strip
=mystrdup("");
4313 // piece 4 - is affix string or 0 for null
4316 entry
->morphcode
= NULL
;
4317 entry
->contclass
= NULL
;
4318 entry
->contclasslen
= 0;
// A '/' separates the affix string from its continuation-class flags; what
// follows the slash (dash + 1) is decoded into contclass below.
4320 dash
= strchr(piece
, '/');
4326 remove_ignored_chars_utf(piece
, ignorechars_utf16
, ignorechars_utf16_len
);
4328 remove_ignored_chars(piece
,ignorechars
);
4332 if (complexprefixes
) {
4333 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4335 entry
->appnd
= mystrdup(piece
);
// With flag aliasing enabled the text after '/' is a numeric index into the
// alias table; otherwise it is a literal flag list that is decoded and then
// sorted.
4337 if (pHMgr
->is_aliasf()) {
4338 int index
= atoi(dash
+ 1);
4339 entry
->contclasslen
= (unsigned short) pHMgr
->get_aliasf(index
, &(entry
->contclass
), af
);
4340 if (!entry
->contclasslen
) HUNSPELL_WARNING(stderr
, "error: bad affix flag alias: \"%s\"\n", dash
+1);
4342 entry
->contclasslen
= (unsigned short) pHMgr
->decode_flags(&(entry
->contclass
), dash
+ 1, af
);
4343 flag_qsort(entry
->contclass
, 0, entry
->contclasslen
);
// Record every continuation flag seen in the contclasses lookup table.
4348 for (unsigned short _i
= 0; _i
< entry
->contclasslen
; _i
++) {
4349 contclasses
[(entry
->contclass
)[_i
]] = 1;
// Same clean-up / reversal / copy for an affix string that has no '/' part
// (presumably the else branch of the dash test - the branch line itself is
// not visible in this excerpt).
4354 remove_ignored_chars_utf(piece
, ignorechars_utf16
, ignorechars_utf16_len
);
4356 remove_ignored_chars(piece
,ignorechars
);
4360 if (complexprefixes
) {
4361 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4363 entry
->appnd
= mystrdup(piece
);
// As with the strip string: the length is truncated to an unsigned char and
// a literal "0" means an empty affix string.
4366 entry
->appndl
= (unsigned char) strlen(entry
->appnd
);
4367 if (strcmp(entry
->appnd
,"0") == 0) {
4369 entry
->appnd
=mystrdup("");
4375 // piece 5 - is the conditions descriptions
// In complexprefixes mode the condition is reversed as well; reversing the
// characters also flips bracket expressions, which reverse_condition repairs.
4378 if (complexprefixes
) {
4379 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4380 reverse_condition(piece
);
// Only a non-trivial condition (not ".") on an entry that strips something
// is checked for redundancy against the strip string.
// NOTE(review): the statement executed when redundant_condition returns
// non-zero is not visible in this excerpt.
4382 if (entry
->stripl
&& (strcmp(piece
, ".") != 0) &&
4383 redundant_condition(at
, entry
->strip
, entry
->stripl
, piece
, af
->getlinenum()))
4387 reverse_condition(piece
);
// Store the condition on the entry; a failure in encodeit aborts parsing.
4389 if (encodeit(*entry
, piece
)) return 1;
// Optional morphological field: with morph aliasing it is a numeric index
// resolved through get_aliasm, otherwise the text itself is copied.
4395 if (pHMgr
->is_aliasm()) {
4396 int index
= atoi(piece
);
4397 entry
->morphcode
= pHMgr
->get_aliasm(index
);
4399 if (complexprefixes
) { // XXX - fix me for morph. gen.
4400 if (utf8
) reverseword_utf(piece
); else reverseword(piece
);
4402 // add the remaining of the line
// Moving tp to the end of the line consumes everything that is left, so the
// tokeniser yields no further pieces for this line.
4405 tp
= tp
+ strlen(tp
);
4407 entry
->morphcode
= mystrdup(piece
);
4408 if (!entry
->morphcode
) return 1;
4416 piece
= mystrsep(&tp
, 0);
4418 // check to make sure we parsed enough pieces
4420 char * err
= pHMgr
->encode_flag(aflag
);
4422 HUNSPELL_WARNING(stderr
, "error: line %d: affix %s is corrupt\n",
4423 af
->getlinenum(), err
);
4430 // detect unnecessary fields, excepting comments
// An entry counts as having 5 fields when it has no morphological field or
// when that field starts with '#' (a trailing comment), otherwise 6. Entries
// whose count differs from the recorded base count trigger a warning.
4432 int fieldnum
= !(entry
->morphcode
) ? 5 : ((*(entry
->morphcode
)=='#') ? 5 : 6);
4433 if (fieldnum
!= basefieldnum
)
4434 HUNSPELL_WARNING(stderr
, "warning: line %d: bad field number\n", af
->getlinenum());
// Records the base field count (presumably taken from the first entry - the
// enclosing condition is not visible in this excerpt).
4436 basefieldnum
= !(entry
->morphcode
) ? 5 : ((*(entry
->morphcode
)=='#') ? 5 : 6);
4441 // now create SfxEntry or PfxEntry objects and use links to
4442 // build an ordered (sorted by affix string) list
4443 for (std::vector
<affentry
>::iterator entry
= start
; entry
!= end
; ++entry
) {
// The constructors receive the address of the vector element.
// NOTE(review): affentries is a local vector, so this is only safe if the
// entry objects copy what they need - confirm in PfxEntry / SfxEntry.
4445 PfxEntry
* pfxptr
= new PfxEntry(this,&(*entry
));
4446 build_pfxtree(pfxptr
);
4448 SfxEntry
* sfxptr
= new SfxEntry(this,&(*entry
));
4449 build_sfxtree(sfxptr
);
// Checks whether an affix condition adds nothing beyond what the strip string
// already guarantees, and reports conditions that contradict the strip
// string.
//   ft      - affix type; 'P' selects the prefix (front-to-back) comparison,
//             the remaining code compares from the end (suffix)
//   strip   - characters the affix removes from the word
//   stripl  - length of strip
//   cond    - condition text; may contain bracket classes such as [abc] and
//             negated classes such as [^abc]
//   linenum - affix file line number used in diagnostics
// Returns 1 on the paths below where the condition is found to be redundant.
// NOTE(review): the returns taken after the warnings and the final return are
// not visible in this excerpt - presumably 0; confirm.
4455 int AffixMgr::redundant_condition(char ft
, char * strip
, int stripl
, const char * cond
, int linenum
) {
4456 int condl
= strlen(cond
);
4461 if (ft
== 'P') { // prefix
// Condition is literally the beginning of the strip string: redundant.
4462 if (strncmp(strip
, cond
, condl
) == 0) return 1;
// Otherwise walk strip and cond side by side from the front, one stripped
// character per condition element.
4465 for (i
= 0, j
= 0; (i
< stripl
) && (j
< condl
); i
++, j
++) {
// Plain condition character: it has to equal the stripped character.
4466 if (cond
[j
] != '[') {
4467 if (cond
[j
] != strip
[i
]) {
4468 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// Bracket class: '^' right after '[' marks a negated class.
4472 neg
= (cond
[j
+1] == '^') ? 1 : 0;
// Scan the class members up to the closing ']'; in records whether the
// stripped character occurs among them.
4476 if (strip
[i
] == cond
[j
]) in
= 1;
4477 } while ((j
< (condl
- 1)) && (cond
[j
] != ']'));
// Reached the end of the condition without finding the closing bracket.
4478 if (j
== (condl
- 1) && (cond
[j
] != ']')) {
4479 HUNSPELL_WARNING(stderr
, "error: line %d: missing ] in condition:\n%s\n", linenum
, cond
);
// The class rejects the stripped character (not listed in a positive class,
// or listed in a negated one): condition and strip string contradict.
4482 if ((!neg
&& !in
) || (neg
&& in
)) {
4483 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// The whole condition was matched against stripped characters: redundant.
4488 if (j
>= condl
) return 1;
// Suffix case: condition is literally the tail of the strip string.
4491 if ((stripl
>= condl
) && strcmp(strip
+ stripl
- condl
, cond
) == 0) return 1;
// Otherwise walk strip and cond side by side from the end towards the front.
4494 for (i
= stripl
- 1, j
= condl
- 1; (i
>= 0) && (j
>= 0); i
--, j
--) {
// Going backwards a bracket class is entered at its closing ']'.
4495 if (cond
[j
] != ']') {
4496 if (cond
[j
] != strip
[i
]) {
4497 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// Scan the class members backwards down to the opening '['.
4504 if (strip
[i
] == cond
[j
]) in
= 1;
4505 } while ((j
> 0) && (cond
[j
] != '['));
// NOTE(review): this scan is looking for the opening '[', yet the message
// reports a missing ']' and is formatted "line: %d" unlike every other
// diagnostic here ("line %d") - looks like a copy/paste slip; confirm before
// changing the runtime text.
4506 if ((j
== 0) && (cond
[j
] != '[')) {
4507 HUNSPELL_WARNING(stderr
, "error: line: %d: missing ] in condition:\n%s\n", linenum
, cond
);
// j now indexes the '[' so the negation marker, if any, is the next char.
4510 neg
= (cond
[j
+1] == '^') ? 1 : 0;
4511 if ((!neg
&& !in
) || (neg
&& in
)) {
4512 HUNSPELL_WARNING(stderr
, "warning: line %d: incompatible stripping characters and condition\n", linenum
);
// The whole condition was matched against stripped characters: redundant.
4517 if (j
< 0) return 1;