1 #include "license.hunspell"
2 #include "license.myspell"
16 #include "affentry.hxx"
19 #ifndef MOZILLA_CLIENT
26 PfxEntry::PfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
28 // register affix manager
31 // set up its initial values
33 aflag
= dp
->aflag
; // flag
34 strip
= dp
->strip
; // string to strip
35 appnd
= dp
->appnd
; // string to append
36 stripl
= dp
->stripl
; // length of strip string
37 appndl
= dp
->appndl
; // length of append string
38 numconds
= dp
->numconds
; // number of conditions to match
39 opts
= dp
->opts
; // cross product flag
40 // then copy over all of the conditions
41 memcpy(&conds
.base
[0],&dp
->conds
.base
[0],SETSIZE
*sizeof(conds
.base
[0]));
45 #ifdef HUNSPELL_EXPERIMENTAL
46 morphcode
= dp
->morphcode
;
48 contclass
= dp
->contclass
;
49 contclasslen
= dp
->contclasslen
;
56 if (appnd
) free(appnd
);
57 if (strip
) free(strip
);
62 for (int i
= 0; i
< 8; i
++) {
63 if (conds
.utf8
.wchars
[i
]) free(conds
.utf8
.wchars
[i
]);
66 #ifdef HUNSPELL_EXPERIMENTAL
67 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
69 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
72 // add prefix to this word assuming conditions hold
73 char * PfxEntry::add(const char * word
, int len
)
75 char tword
[MAXWORDUTF8LEN
+ 4];
77 if ((len
> stripl
) && (len
>= numconds
) && test_condition(word
) &&
78 (!stripl
|| (strncmp(word
, strip
, stripl
) == 0)) &&
79 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
80 /* we have a match so add prefix */
86 strcpy(pp
, (word
+ stripl
));
87 return mystrdup(tword
);
93 inline int PfxEntry::test_condition(const char * st
)
96 unsigned char * cp
= (unsigned char *)st
;
97 if (!(opts
& aeUTF8
)) { // 256-character codepage
98 for (cond
= 0; cond
< numconds
; cond
++) {
99 if ((conds
.base
[*cp
++] & (1 << cond
)) == 0) return 0;
101 } else { // UTF-8 encoding
103 for (cond
= 0; cond
< numconds
; cond
++) {
104 // a simple 7-bit ASCII character in UTF-8
105 if ((*cp
>> 7) == 0) {
106 // also check limit (end of word)
107 if ((!*cp
) || ((conds
.utf8
.ascii
[*cp
++] & (1 << cond
)) == 0)) return 0;
108 // UTF-8 multibyte character
110 // not dot wildcard in rule
111 if (!conds
.utf8
.all
[cond
]) {
112 if (conds
.utf8
.neg
[cond
]) {
113 u8_u16((w_char
*) &wc
, 1, (char *) cp
);
114 if (conds
.utf8
.wchars
[cond
] &&
115 flag_bsearch((unsigned short *)conds
.utf8
.wchars
[cond
],
116 wc
, (short) conds
.utf8
.wlen
[cond
])) return 0;
118 if (!conds
.utf8
.wchars
[cond
]) return 0;
119 u8_u16((w_char
*) &wc
, 1, (char *) cp
);
120 if (!flag_bsearch((unsigned short *)conds
.utf8
.wchars
[cond
],
121 wc
, (short)conds
.utf8
.wlen
[cond
])) return 0;
124 // jump to next UTF-8 character
125 for(cp
++; (*cp
& 0xc0) == 0x80; cp
++);
133 // check if this prefix entry matches
134 struct hentry
* PfxEntry::checkword(const char * word
, int len
, char in_compound
, const FLAG needflag
)
136 int tmpl
; // length of tmpword
137 struct hentry
* he
; // hash entry of root word or NULL
138 char tmpword
[MAXWORDUTF8LEN
+ 4];
140 // on entry prefix is 0 length or already matches the beginning of the word.
141 // So if the remaining root word has positive length
142 // and if there are enough chars in root word and added back strip chars
143 // to meet the number of characters conditions, then test it
147 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
149 // generate new root word by removing prefix and adding
150 // back any characters that would have been stripped
152 if (stripl
) strcpy (tmpword
, strip
);
153 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
155 // now make sure all of the conditions on characters
156 // are met. Please see the appendix at the end of
157 // this file for more info on exactly what is being
160 // if all conditions are met then check if resulting
161 // root word in the dictionary
163 if (test_condition(tmpword
)) {
165 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
167 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
168 // forbid single prefixes with pseudoroot flag
169 ! TESTAFF(contclass
, pmyMgr
->get_pseudoroot(), contclasslen
) &&
171 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
172 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
))))
174 he
= he
->next_homonym
; // check homonyms
178 // prefix matched but no root word was found
179 // if aeXPRODUCT is allowed, try again but now
180 // cross checked combined with a suffix
182 //if ((opts & aeXPRODUCT) && in_compound) {
183 if ((opts
& aeXPRODUCT
)) {
184 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this, NULL
,
185 0, NULL
, FLAG_NULL
, needflag
, in_compound
);
193 // check if this prefix entry matches
194 struct hentry
* PfxEntry::check_twosfx(const char * word
, int len
,
195 char in_compound
, const FLAG needflag
)
197 int tmpl
; // length of tmpword
198 struct hentry
* he
; // hash entry of root word or NULL
199 char tmpword
[MAXWORDUTF8LEN
+ 4];
201 // on entry prefix is 0 length or already matches the beginning of the word.
202 // So if the remaining root word has positive length
203 // and if there are enough chars in root word and added back strip chars
204 // to meet the number of characters conditions, then test it
208 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
210 // generate new root word by removing prefix and adding
211 // back any characters that would have been stripped
213 if (stripl
) strcpy (tmpword
, strip
);
214 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
216 // now make sure all of the conditions on characters
217 // are met. Please see the appendix at the end of
218 // this file for more info on exactly what is being
221 // if all conditions are met then check if resulting
222 // root word in the dictionary
224 if (test_condition(tmpword
)) {
227 // prefix matched but no root word was found
228 // if aeXPRODUCT is allowed, try again but now
229 // cross checked combined with a suffix
231 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
232 he
= pmyMgr
->suffix_check_twosfx(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this, needflag
);
240 #ifdef HUNSPELL_EXPERIMENTAL
241 // check if this prefix entry matches
242 char * PfxEntry::check_twosfx_morph(const char * word
, int len
,
243 char in_compound
, const FLAG needflag
)
245 int tmpl
; // length of tmpword
246 char tmpword
[MAXWORDUTF8LEN
+ 4];
248 // on entry prefix is 0 length or already matches the beginning of the word.
249 // So if the remaining root word has positive length
250 // and if there are enough chars in root word and added back strip chars
251 // to meet the number of characters conditions, then test it
255 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
257 // generate new root word by removing prefix and adding
258 // back any characters that would have been stripped
260 if (stripl
) strcpy (tmpword
, strip
);
261 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
263 // now make sure all of the conditions on characters
264 // are met. Please see the appendix at the end of
265 // this file for more info on exactly what is being
268 // if all conditions are met then check if resulting
269 // root word in the dictionary
271 if (test_condition(tmpword
)) {
274 // prefix matched but no root word was found
275 // if aeXPRODUCT is allowed, try again but now
276 // cross checked combined with a suffix
278 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
279 return pmyMgr
->suffix_check_twosfx_morph(tmpword
, tmpl
,
280 aeXPRODUCT
, (AffEntry
*)this, needflag
);
287 // check if this prefix entry matches
288 char * PfxEntry::check_morph(const char * word
, int len
, char in_compound
, const FLAG needflag
)
290 int tmpl
; // length of tmpword
291 struct hentry
* he
; // hash entry of root word or NULL
292 char tmpword
[MAXWORDUTF8LEN
+ 4];
293 char result
[MAXLNLEN
];
298 // on entry prefix is 0 length or already matches the beginning of the word.
299 // So if the remaining root word has positive length
300 // and if there are enough chars in root word and added back strip chars
301 // to meet the number of characters conditions, then test it
305 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
307 // generate new root word by removing prefix and adding
308 // back any characters that would have been stripped
310 if (stripl
) strcpy (tmpword
, strip
);
311 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
313 // now make sure all of the conditions on characters
314 // are met. Please see the appendix at the end of
315 // this file for more info on exactly what is being
318 // if all conditions are met then check if resulting
319 // root word in the dictionary
321 if (test_condition(tmpword
)) {
323 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
325 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
326 // forbid single prefixes with pseudoroot flag
327 ! TESTAFF(contclass
, pmyMgr
->get_pseudoroot(), contclasslen
) &&
329 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
330 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
)))) {
331 if (morphcode
) strcat(result
, morphcode
); else strcat(result
,getKey());
332 if (he
->description
) {
333 if ((*(he
->description
)=='[')||(*(he
->description
)=='<')) strcat(result
,he
->word
);
334 strcat(result
,he
->description
);
336 strcat(result
, "\n");
338 he
= he
->next_homonym
;
342 // prefix matched but no root word was found
343 // if aeXPRODUCT is allowed, try again but now
344 // cross checked combined with a suffix
346 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
347 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, aeXPRODUCT
, (AffEntry
*)this,
348 FLAG_NULL
, needflag
);
357 if (*result
) return mystrdup(result
);
360 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
362 SfxEntry::SfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
364 // register affix manager
367 // set up its initial values
368 aflag
= dp
->aflag
; // char flag
369 strip
= dp
->strip
; // string to strip
370 appnd
= dp
->appnd
; // string to append
371 stripl
= dp
->stripl
; // length of strip string
372 appndl
= dp
->appndl
; // length of append string
373 numconds
= dp
->numconds
; // number of conditions to match
374 opts
= dp
->opts
; // cross product flag
376 // then copy over all of the conditions
377 memcpy(&conds
.base
[0],&dp
->conds
.base
[0],SETSIZE
*sizeof(conds
.base
[0]));
379 rappnd
= myrevstrdup(appnd
);
381 #ifdef HUNSPELL_EXPERIMENTAL
382 morphcode
= dp
->morphcode
;
384 contclass
= dp
->contclass
;
385 contclasslen
= dp
->contclasslen
;
389 SfxEntry::~SfxEntry()
392 if (appnd
) free(appnd
);
393 if (rappnd
) free(rappnd
);
394 if (strip
) free(strip
);
399 for (int i
= 0; i
< 8; i
++) {
400 if (conds
.utf8
.wchars
[i
]) free(conds
.utf8
.wchars
[i
]);
403 #ifdef HUNSPELL_EXPERIMENTAL
404 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
406 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
409 // add suffix to this word assuming conditions hold
410 char * SfxEntry::add(const char * word
, int len
)
412 char tword
[MAXWORDUTF8LEN
+ 4];
414 /* make sure all conditions match */
415 if ((len
> stripl
) && (len
>= numconds
) && test_condition(word
+ len
, word
) &&
416 (!stripl
|| (strcmp(word
+ len
- stripl
, strip
) == 0)) &&
417 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
418 /* we have a match so add suffix */
421 strcpy(tword
+ len
- stripl
, appnd
);
423 *(tword
+ len
- stripl
) = '\0';
425 return mystrdup(tword
);
431 inline int SfxEntry::test_condition(const char * st
, const char * beg
)
434 unsigned char * cp
= (unsigned char *) st
;
435 if (!(opts
& aeUTF8
)) { // 256-character codepage
436 // Dömölki affix algorithm
437 for (cond
= numconds
; --cond
>= 0; ) {
438 if ((conds
.base
[*--cp
] & (1 << cond
)) == 0) return 0;
440 } else { // UTF-8 encoding
442 for (cond
= numconds
; --cond
>= 0; ) {
443 // go to next character position and check limit
444 if ((char *) --cp
< beg
) return 0;
445 // a simple 7-bit ASCII character in UTF-8
446 if ((*cp
>> 7) == 0) {
447 if ((conds
.utf8
.ascii
[*cp
] & (1 << cond
)) == 0) return 0;
448 // UTF-8 multibyte character
450 // go to first character of UTF-8 multibyte character
451 for (; (*cp
& 0xc0) == 0x80; cp
--);
452 // not dot wildcard in rule
453 if (!conds
.utf8
.all
[cond
]) {
454 if (conds
.utf8
.neg
[cond
]) {
455 u8_u16((w_char
*) &wc
, 1, (char *) cp
);
456 if (conds
.utf8
.wchars
[cond
] &&
457 flag_bsearch((unsigned short *)conds
.utf8
.wchars
[cond
],
458 wc
, (short) conds
.utf8
.wlen
[cond
])) return 0;
460 if (!conds
.utf8
.wchars
[cond
]) return 0;
461 u8_u16((w_char
*) &wc
, 1, (char *) cp
);
462 if (!flag_bsearch((unsigned short *)conds
.utf8
.wchars
[cond
],
463 wc
, (short)conds
.utf8
.wlen
[cond
])) return 0;
474 // see if this suffix is present in the word
475 struct hentry
* SfxEntry::checkword(const char * word
, int len
, int optflags
,
476 AffEntry
* ppfx
, char ** wlst
, int maxSug
, int * ns
, const FLAG cclass
, const FLAG needflag
,
479 int tmpl
; // length of tmpword
480 struct hentry
* he
; // hash entry pointer
482 char tmpword
[MAXWORDUTF8LEN
+ 4];
483 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
485 // if this suffix is being cross checked with a prefix
486 // but it does not support cross products skip it
488 if (((optflags
& aeXPRODUCT
) != 0) && ((opts
& aeXPRODUCT
) == 0))
491 // upon entry suffix is 0 length or already matches the end of the word.
492 // So if the remaining root word has positive length
493 // and if there are enough chars in root word and added back strip chars
494 // to meet the number of characters conditions, then test it
497 // the second condition is not enough for UTF-8 strings
498 // it is checked in test_condition()
500 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
502 // generate new root word by removing suffix and adding
503 // back any characters that would have been stripped or
504 // null terminating the shorter string
506 strcpy (tmpword
, word
);
507 cp
= (unsigned char *)(tmpword
+ tmpl
);
509 strcpy ((char *)cp
, strip
);
511 cp
= (unsigned char *)(tmpword
+ tmpl
);
514 // now make sure all of the conditions on characters
515 // are met. Please see the appendix at the end of
516 // this file for more info on exactly what is being // tested
518 // if all conditions are met then check if resulting
519 // root word in the dictionary
521 if (test_condition((char *) cp
, (char *) tmpword
)) {
523 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
524 fprintf(stdout
,"%s %s %c\n", word
, tmpword
, aflag
);
526 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
528 // check conditional suffix (enabled by prefix)
529 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() &&
530 TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
531 (((optflags
& aeXPRODUCT
) == 0) ||
532 TESTAFF(he
->astr
, ep
->getFlag(), he
->alen
) ||
534 ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
536 // handle cont. class
538 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
540 // check only in compound homonyms (bad flags)
541 (!badflag
|| !TESTAFF(he
->astr
, badflag
, he
->alen
)
543 // handle required flag
545 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
546 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
549 he
= he
->next_homonym
; // check homonyms
552 // obsolete stemming code (used only by the
553 // experimental SuffixMgr:suggest_pos_stems)
554 // store resulting root in wlst
555 } else if (wlst
&& (*ns
< maxSug
)) {
557 for (int k
=0; k
< *ns
; k
++)
558 if (strcmp(tmpword
, wlst
[k
]) == 0) cwrd
= 0;
560 wlst
[*ns
] = mystrdup(tmpword
);
561 if (wlst
[*ns
] == NULL
) {
562 for (int j
=0; j
<*ns
; j
++) free(wlst
[j
]);
574 // see if two-level suffix is present in the word
575 struct hentry
* SfxEntry::check_twosfx(const char * word
, int len
, int optflags
,
576 AffEntry
* ppfx
, const FLAG needflag
)
578 int tmpl
; // length of tmpword
579 struct hentry
* he
; // hash entry pointer
581 char tmpword
[MAXWORDUTF8LEN
+ 4];
582 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
585 // if this suffix is being cross checked with a prefix
586 // but it does not support cross products skip it
588 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
591 // upon entry suffix is 0 length or already matches the end of the word.
592 // So if the remaining root word has positive length
593 // and if there are enough chars in root word and added back strip chars
594 // to meet the number of characters conditions, then test it
598 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
600 // generate new root word by removing suffix and adding
601 // back any characters that would have been stripped or
602 // null terminating the shorter string
604 strcpy (tmpword
, word
);
605 cp
= (unsigned char *)(tmpword
+ tmpl
);
607 strcpy ((char *)cp
, strip
);
609 cp
= (unsigned char *)(tmpword
+ tmpl
);
612 // now make sure all of the conditions on characters
613 // are met. Please see the appendix at the end of
614 // this file for more info on exactly what is being
617 // if all conditions are met then recall suffix_check
619 if (test_condition((char *) cp
, (char *) tmpword
)) {
621 // handle conditional suffix
622 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
623 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
625 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, optflags
, ppfx
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
627 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
635 #ifdef HUNSPELL_EXPERIMENTAL
636 // see if two-level suffix is present in the word
637 char * SfxEntry::check_twosfx_morph(const char * word
, int len
, int optflags
,
638 AffEntry
* ppfx
, const FLAG needflag
)
640 int tmpl
; // length of tmpword
642 char tmpword
[MAXWORDUTF8LEN
+ 4];
643 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
646 char result
[MAXLNLEN
];
650 // if this suffix is being cross checked with a prefix
651 // but it does not support cross products skip it
653 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
656 // upon entry suffix is 0 length or already matches the end of the word.
657 // So if the remaining root word has positive length
658 // and if there are enough chars in root word and added back strip chars
659 // to meet the number of characters conditions, then test it
663 if ((tmpl
> 0) && (tmpl
+ stripl
>= numconds
)) {
665 // generate new root word by removing suffix and adding
666 // back any characters that would have been stripped or
667 // null terminating the shorter string
669 strcpy (tmpword
, word
);
670 cp
= (unsigned char *)(tmpword
+ tmpl
);
672 strcpy ((char *)cp
, strip
);
674 cp
= (unsigned char *)(tmpword
+ tmpl
);
677 // now make sure all of the conditions on characters
678 // are met. Please see the appendix at the end of
679 // this file for more info on exactly what is being
682 // if all conditions are met then recall suffix_check
684 if (test_condition((char *) cp
, (char *) tmpword
)) {
686 // handle conditional suffix
687 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
)) {
688 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
690 if (((PfxEntry
*) ppfx
)->getMorph()) {
691 strcat(result
, ((PfxEntry
*) ppfx
)->getMorph());
698 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, optflags
, ppfx
, aflag
, needflag
);
706 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
713 if (*result
) return mystrdup(result
);
718 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
720 // get next homonym with same affix
721 struct hentry
* SfxEntry::get_next_homonym(struct hentry
* he
, int optflags
, AffEntry
* ppfx
,
722 const FLAG cclass
, const FLAG needflag
)
724 PfxEntry
* ep
= (PfxEntry
*) ppfx
;
726 while (he
->next_homonym
) {
727 he
= he
->next_homonym
;
728 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() && TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
729 ((optflags
& aeXPRODUCT
) == 0 ||
730 TESTAFF(he
->astr
, ep
->getFlag(), he
->alen
) ||
731 // handle conditional suffix
732 ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
734 // handle cont. class
736 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
738 // handle required flag
740 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
741 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
751 Appendix
: Understanding Affix Code
754 An affix is either a prefix
or a suffix attached to root words to make
757 Basically a Prefix
or a Suffix is a set of AffEntry objects
758 which store information about the prefix
or suffix along
759 with supporting routines to check
if a word has a particular
760 prefix
or suffix
or a combination
.
762 The structure affentry is defined as follows
:
766 unsigned short aflag
; // ID used to represent the affix
767 char * strip
; // string to strip before adding affix
768 char * appnd
; // the affix string to add
769 unsigned char stripl
; // length of the strip string
770 unsigned char appndl
; // length of the affix string
771 char numconds
; // the number of conditions that must be met
772 char opts
; // flag: aeXPRODUCT- combine both prefix and suffix
773 char conds
[SETSIZE
]; // array which encodes the conditions to be met
777 Here is a suffix borrowed from the en_US
.aff file
. This file
778 is whitespace delimited
.
782 SFX D y ied
[^aeiou
]y
786 This information can be interpreted as follows
:
788 The first line has
4 fields
792 1 SFX
- indicates
this is a suffix
793 2 D
- is the name of the character flag which represents
this suffix
794 3 Y
- indicates it can be combined with
prefixes (cross product
)
795 4 4 - indicates that a sequence of
4 affentry structures is needed to
796 properly store the affix information
798 The remaining lines describe the unique information
for the
4 SfxEntry
799 objects that make up
this affix
. Each line can be interpreted
800 as follows
: (note fields
1 and 2 are as a check against line
1 info
)
804 1 SFX
- indicates
this is a suffix
805 2 D
- is the name of the character flag
for this affix
806 3 y
- the string of chars to strip off before adding affix
807 (a
0 here indicates the NULL string
)
808 4 ied
- the string of affix characters to add
809 5 [^aeiou
]y
- the conditions which must be met before the affix
812 Field
5 is interesting
. Since
this is a suffix
, field
5 tells us that
813 there are
2 conditions that must be met
. The first condition is that
814 the next to the last character in the word must
*NOT
* be any of the
815 following
"a", "e", "i", "o" or "u". The second condition is that
816 the last character of the word must end in
"y".
818 So how can we encode
this information concisely
and be able to
819 test
for both conditions in a fast manner
? The answer is found
820 by studying the wonderful ispell code of Geoff Kuenning
, et al
.
821 (now available under a normal BSD license
).
823 If we set up a conds array of
256 bytes
indexed (0 to
255) and access it
824 using a
character (cast to an
unsigned char) of a string
, we have
8 bits
825 of information we can store about that character
. Specifically we
826 could use each bit to say
if that character is allowed in any of the
827 last (or first
for prefixes
) 8 characters of the word
.
829 Basically
, each character at one end of the
word (up to the number
830 of conditions
) is used to index into the conds array
and the resulting
831 value found there says whether that character is valid
for a
832 specific character position in the word
.
834 For prefixes
, it does
this by setting bit
0 if that
char is valid
835 in the first position
, bit
1 if valid in the second position
, and so on
.
837 If a bit is
not set
, then that
char is
not valid
for that position in the
840 If working with suffixes bit
0 is used
for the character closest
841 to the front
, bit
1 for the next character towards the end
, ...,
842 with bit numconds
-1 representing the last
char at the end of the string
.
844 Note
: since entries in the conds
[] are
8 bits
, only
8 conditions
845 (read that only
8 character positions
) can be examined at one
846 end of a
word (the beginning
for prefixes
and the end
for suffixes
).
848 So to make
this clearer
, let's encode the conds array values
for the
849 first two affentries
for the suffix D described earlier
.
852 For the first affentry
:
853 numconds
= 1 (only examine the last character
)
855 conds
['e'] = (1 << 0) (the word must end in an E
)
858 For the second affentry
:
859 numconds
= 2 (only examine the last two characters
)
861 conds
[X
] = conds
[X
] | (1 << 0) (aeiou are
not allowed
)
862 where X is all characters
*but
* a
, e
, i
, o
, or u
865 conds
['y'] = (1 << 1) (the last
char must be a y
)
866 all other bits
for all other entries in the conds array are zero