1 #include "license.hunspell"
2 #include "license.myspell"
9 #include "affentry.hxx"
12 PfxEntry::PfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
14 // register affix manager
17 // set up its initial values
19 aflag
= dp
->aflag
; // flag
20 strip
= dp
->strip
; // string to strip
21 appnd
= dp
->appnd
; // string to append
22 stripl
= dp
->stripl
; // length of strip string
23 appndl
= dp
->appndl
; // length of append string
24 numconds
= dp
->numconds
; // length of the condition
25 opts
= dp
->opts
; // cross product flag
26 // then copy over all of the conditions
27 if (opts
& aeLONGCOND
) {
28 memcpy(c
.conds
, dp
->c
.l
.conds1
, MAXCONDLEN_1
);
29 c
.l
.conds2
= dp
->c
.l
.conds2
;
30 } else memcpy(c
.conds
, dp
->c
.conds
, MAXCONDLEN
);
34 morphcode
= dp
->morphcode
;
35 contclass
= dp
->contclass
;
36 contclasslen
= dp
->contclasslen
;
43 if (appnd
) free(appnd
);
44 if (strip
) free(strip
);
48 if (opts
& aeLONGCOND
) free(c
.l
.conds2
);
49 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
50 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
53 // add prefix to this word assuming conditions hold
54 char * PfxEntry::add(const char * word
, int len
)
56 char tword
[MAXWORDUTF8LEN
+ 4];
58 if ((len
> stripl
|| (len
== 0 && pmyMgr
->get_fullstrip())) &&
59 (len
>= numconds
) && test_condition(word
) &&
60 (!stripl
|| (strncmp(word
, strip
, stripl
) == 0)) &&
61 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
62 /* we have a match so add prefix */
68 strcpy(pp
, (word
+ stripl
));
69 return mystrdup(tword
);
74 inline char * PfxEntry::nextchar(char * p
) {
77 if (opts
& aeLONGCOND
) {
78 // jump to the 2nd part of the condition
79 if (p
== c
.conds
+ MAXCONDLEN_1
) return c
.l
.conds2
;
80 // end of the MAXCONDLEN length condition
81 } else if (p
== c
.conds
+ MAXCONDLEN
) return NULL
;
87 inline int PfxEntry::test_condition(const char * st
)
89 const char * pos
= NULL
; // group with pos input position
90 bool neg
= false; // complementer
91 bool ingroup
= false; // character in the group
92 if (numconds
== 0) return 1;
103 case '^': { p
= nextchar(p
); neg
= true; break; }
105 if ((neg
&& ingroup
) || (!neg
&& !ingroup
)) return 0;
108 // skip the next character
109 if (!ingroup
&& *st
) for (st
++; (opts
& aeUTF8
) && (*st
& 0xc0) == 0x80; st
++);
110 if (*st
== '\0' && p
) return 0; // word <= condition
113 case '.': if (!pos
) { // dots are not metacharacters in groups: [.]
115 // skip the next character
116 for (st
++; (opts
& aeUTF8
) && (*st
& 0xc0) == 0x80; st
++);
117 if (*st
== '\0' && p
) return 0; // word <= condition
124 if ((opts
& aeUTF8
) && (*(st
- 1) & 0x80)) { // multibyte
125 while (p
&& (*p
& 0xc0) == 0x80) { // character
134 if (pos
&& st
!= pos
) {
136 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
140 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
142 } else if (pos
) { // group
151 // check if this prefix entry matches
152 struct hentry
* PfxEntry::checkword(const char * word
, int len
, char in_compound
, const FLAG needflag
)
154 int tmpl
; // length of tmpword
155 struct hentry
* he
; // hash entry of root word or NULL
156 char tmpword
[MAXWORDUTF8LEN
+ 4];
158 // on entry prefix is 0 length or already matches the beginning of the word.
159 // So if the remaining root word has positive length
160 // and if there are enough chars in root word and added back strip chars
161 // to meet the number of characters conditions, then test it
165 if (tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) {
167 // generate new root word by removing prefix and adding
168 // back any characters that would have been stripped
170 if (stripl
) strcpy (tmpword
, strip
);
171 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
173 // now make sure all of the conditions on characters
174 // are met. Please see the appendix at the end of
175 // this file for more info on exactly what is being
178 // if all conditions are met then check if resulting
179 // root word in the dictionary
181 if (test_condition(tmpword
)) {
183 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
185 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
186 // forbid single prefixes with needaffix flag
187 ! TESTAFF(contclass
, pmyMgr
->get_needaffix(), contclasslen
) &&
189 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
190 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
))))
192 he
= he
->next_homonym
; // check homonyms
196 // prefix matched but no root word was found
197 // if aeXPRODUCT is allowed, try again but now
198 // cross checked combined with a suffix
200 //if ((opts & aeXPRODUCT) && in_compound) {
201 if ((opts
& aeXPRODUCT
)) {
202 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, aeXPRODUCT
, this, NULL
,
203 0, NULL
, FLAG_NULL
, needflag
, in_compound
);
211 // check if this prefix entry matches
212 struct hentry
* PfxEntry::check_twosfx(const char * word
, int len
,
213 char in_compound
, const FLAG needflag
)
215 int tmpl
; // length of tmpword
216 struct hentry
* he
; // hash entry of root word or NULL
217 char tmpword
[MAXWORDUTF8LEN
+ 4];
219 // on entry prefix is 0 length or already matches the beginning of the word.
220 // So if the remaining root word has positive length
221 // and if there are enough chars in root word and added back strip chars
222 // to meet the number of characters conditions, then test it
226 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
227 (tmpl
+ stripl
>= numconds
)) {
229 // generate new root word by removing prefix and adding
230 // back any characters that would have been stripped
232 if (stripl
) strcpy (tmpword
, strip
);
233 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
235 // now make sure all of the conditions on characters
236 // are met. Please see the appendix at the end of
237 // this file for more info on exactly what is being
240 // if all conditions are met then check if resulting
241 // root word in the dictionary
243 if (test_condition(tmpword
)) {
246 // prefix matched but no root word was found
247 // if aeXPRODUCT is allowed, try again but now
248 // cross checked combined with a suffix
250 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
251 he
= pmyMgr
->suffix_check_twosfx(tmpword
, tmpl
, aeXPRODUCT
, this, needflag
);
259 // check if this prefix entry matches
260 char * PfxEntry::check_twosfx_morph(const char * word
, int len
,
261 char in_compound
, const FLAG needflag
)
263 int tmpl
; // length of tmpword
264 char tmpword
[MAXWORDUTF8LEN
+ 4];
266 // on entry prefix is 0 length or already matches the beginning of the word.
267 // So if the remaining root word has positive length
268 // and if there are enough chars in root word and added back strip chars
269 // to meet the number of characters conditions, then test it
273 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
274 (tmpl
+ stripl
>= numconds
)) {
276 // generate new root word by removing prefix and adding
277 // back any characters that would have been stripped
279 if (stripl
) strcpy (tmpword
, strip
);
280 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
282 // now make sure all of the conditions on characters
283 // are met. Please see the appendix at the end of
284 // this file for more info on exactly what is being
287 // if all conditions are met then check if resulting
288 // root word in the dictionary
290 if (test_condition(tmpword
)) {
293 // prefix matched but no root word was found
294 // if aeXPRODUCT is allowed, try again but now
295 // cross checked combined with a suffix
297 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
298 return pmyMgr
->suffix_check_twosfx_morph(tmpword
, tmpl
,
299 aeXPRODUCT
, this, needflag
);
306 // check if this prefix entry matches
307 char * PfxEntry::check_morph(const char * word
, int len
, char in_compound
, const FLAG needflag
)
309 int tmpl
; // length of tmpword
310 struct hentry
* he
; // hash entry of root word or NULL
311 char tmpword
[MAXWORDUTF8LEN
+ 4];
312 char result
[MAXLNLEN
];
317 // on entry prefix is 0 length or already matches the beginning of the word.
318 // So if the remaining root word has positive length
319 // and if there are enough chars in root word and added back strip chars
320 // to meet the number of characters conditions, then test it
324 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
325 (tmpl
+ stripl
>= numconds
)) {
327 // generate new root word by removing prefix and adding
328 // back any characters that would have been stripped
330 if (stripl
) strcpy (tmpword
, strip
);
331 strcpy ((tmpword
+ stripl
), (word
+ appndl
));
333 // now make sure all of the conditions on characters
334 // are met. Please see the appendix at the end of
335 // this file for more info on exactly what is being
338 // if all conditions are met then check if resulting
339 // root word in the dictionary
341 if (test_condition(tmpword
)) {
343 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
345 if (TESTAFF(he
->astr
, aflag
, he
->alen
) &&
346 // forbid single prefixes with needaffix flag
347 ! TESTAFF(contclass
, pmyMgr
->get_needaffix(), contclasslen
) &&
349 ((!needflag
) || TESTAFF(he
->astr
, needflag
, he
->alen
) ||
350 (contclass
&& TESTAFF(contclass
, needflag
, contclasslen
)))) {
352 mystrcat(result
, " ", MAXLNLEN
);
353 mystrcat(result
, morphcode
, MAXLNLEN
);
354 } else mystrcat(result
,getKey(), MAXLNLEN
);
355 if (!HENTRY_FIND(he
, MORPH_STEM
)) {
356 mystrcat(result
, " ", MAXLNLEN
);
357 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
358 mystrcat(result
, HENTRY_WORD(he
), MAXLNLEN
);
360 // store the pointer of the hash entry
361 if (HENTRY_DATA(he
)) {
362 mystrcat(result
, " ", MAXLNLEN
);
363 mystrcat(result
, HENTRY_DATA2(he
), MAXLNLEN
);
365 // return with debug information
366 char * flag
= pmyMgr
->encode_flag(getFlag());
367 mystrcat(result
, " ", MAXLNLEN
);
368 mystrcat(result
, MORPH_FLAG
, MAXLNLEN
);
369 mystrcat(result
, flag
, MAXLNLEN
);
372 mystrcat(result
, "\n", MAXLNLEN
);
374 he
= he
->next_homonym
;
378 // prefix matched but no root word was found
379 // if aeXPRODUCT is allowed, try again but now
380 // cross checked combined with a suffix
382 if ((opts
& aeXPRODUCT
) && (in_compound
!= IN_CPD_BEGIN
)) {
383 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, aeXPRODUCT
, this,
384 FLAG_NULL
, needflag
);
386 mystrcat(result
, st
, MAXLNLEN
);
393 if (*result
) return mystrdup(result
);
397 SfxEntry::SfxEntry(AffixMgr
* pmgr
, affentry
* dp
)
399 // register affix manager
402 // set up its initial values
403 aflag
= dp
->aflag
; // char flag
404 strip
= dp
->strip
; // string to strip
405 appnd
= dp
->appnd
; // string to append
406 stripl
= dp
->stripl
; // length of strip string
407 appndl
= dp
->appndl
; // length of append string
408 numconds
= dp
->numconds
; // length of the condition
409 opts
= dp
->opts
; // cross product flag
411 // then copy over all of the conditions
412 if (opts
& aeLONGCOND
) {
413 memcpy(c
.l
.conds1
, dp
->c
.l
.conds1
, MAXCONDLEN_1
);
414 c
.l
.conds2
= dp
->c
.l
.conds2
;
415 } else memcpy(c
.conds
, dp
->c
.conds
, MAXCONDLEN
);
417 rappnd
= myrevstrdup(appnd
);
418 morphcode
= dp
->morphcode
;
419 contclass
= dp
->contclass
;
420 contclasslen
= dp
->contclasslen
;
424 SfxEntry::~SfxEntry()
427 if (appnd
) free(appnd
);
428 if (rappnd
) free(rappnd
);
429 if (strip
) free(strip
);
433 if (opts
& aeLONGCOND
) free(c
.l
.conds2
);
434 if (morphcode
&& !(opts
& aeALIASM
)) free(morphcode
);
435 if (contclass
&& !(opts
& aeALIASF
)) free(contclass
);
438 // add suffix to this word assuming conditions hold
439 char * SfxEntry::add(const char * word
, int len
)
441 char tword
[MAXWORDUTF8LEN
+ 4];
443 /* make sure all conditions match */
444 if ((len
> stripl
|| (len
== 0 && pmyMgr
->get_fullstrip())) &&
445 (len
>= numconds
) && test_condition(word
+ len
, word
) &&
446 (!stripl
|| (strcmp(word
+ len
- stripl
, strip
) == 0)) &&
447 ((MAXWORDUTF8LEN
+ 4) > (len
+ appndl
- stripl
))) {
448 /* we have a match so add suffix */
451 strcpy(tword
+ len
- stripl
, appnd
);
453 *(tword
+ len
- stripl
) = '\0';
455 return mystrdup(tword
);
460 inline char * SfxEntry::nextchar(char * p
) {
463 if (opts
& aeLONGCOND
) {
464 // jump to the 2nd part of the condition
465 if (p
== c
.l
.conds1
+ MAXCONDLEN_1
) return c
.l
.conds2
;
466 // end of the MAXCONDLEN length condition
467 } else if (p
== c
.conds
+ MAXCONDLEN
) return NULL
;
468 return *p
? p
: NULL
;
473 inline int SfxEntry::test_condition(const char * st
, const char * beg
)
475 const char * pos
= NULL
; // group with pos input position
476 bool neg
= false; // complementer
477 bool ingroup
= false; // character in the group
478 if (numconds
== 0) return 1;
485 case '[': { p
= nextchar(p
); pos
= st
; break; }
486 case '^': { p
= nextchar(p
); neg
= true; break; }
487 case ']': { if (!neg
&& !ingroup
) return 0;
489 // skip the next character
491 for (; (opts
& aeUTF8
) && (st
>= beg
) && (*st
& 0xc0) == 0x80; st
--);
498 if (st
< beg
&& p
) return 0; // word <= condition
501 case '.': if (!pos
) { // dots are not metacharacters in groups: [.]
503 // skip the next character
504 for (st
--; (opts
& aeUTF8
) && (st
>= beg
) && (*st
& 0xc0) == 0x80; st
--);
505 if (st
< beg
) { // word <= condition
506 if (p
) return 0; else return 1;
508 if ((opts
& aeUTF8
) && (*st
& 0x80)) { // head of the UTF-8 character
510 if (st
< beg
) { // word <= condition
511 if (p
) return 0; else return 1;
519 if ((opts
& aeUTF8
) && (*st
& 0x80)) {
521 while (p
&& (st
>= beg
)) {
527 // first byte of the UTF-8 multibyte character
528 if ((*p
& 0xc0) != 0x80) break;
532 if (pos
&& st
!= pos
) {
534 else if (i
== numconds
) return 1;
536 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
539 if (p
&& *p
!= ']') p
= nextchar(p
);
542 else if (i
== numconds
) return 1;
544 while (p
&& *p
!= ']' && (p
= nextchar(p
)));
545 // if (p && *p != ']') p = nextchar(p);
552 if (st
< beg
&& p
&& *p
!= ']') return 0; // word <= condition
553 } else if (pos
) { // group
562 // see if this suffix is present in the word
563 struct hentry
* SfxEntry::checkword(const char * word
, int len
, int optflags
,
564 PfxEntry
* ppfx
, char ** wlst
, int maxSug
, int * ns
, const FLAG cclass
, const FLAG needflag
,
567 int tmpl
; // length of tmpword
568 struct hentry
* he
; // hash entry pointer
570 char tmpword
[MAXWORDUTF8LEN
+ 4];
573 // if this suffix is being cross checked with a prefix
574 // but it does not support cross products skip it
576 if (((optflags
& aeXPRODUCT
) != 0) && ((opts
& aeXPRODUCT
) == 0))
579 // upon entry suffix is 0 length or already matches the end of the word.
580 // So if the remaining root word has positive length
581 // and if there are enough chars in root word and added back strip chars
582 // to meet the number of characters conditions, then test it
585 // the second condition is not enough for UTF-8 strings
586 // it is checked in test_condition()
588 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
589 (tmpl
+ stripl
>= numconds
)) {
591 // generate new root word by removing suffix and adding
592 // back any characters that would have been stripped or
593 // null terminating the shorter string
595 strcpy (tmpword
, word
);
596 cp
= (unsigned char *)(tmpword
+ tmpl
);
598 strcpy ((char *)cp
, strip
);
600 cp
= (unsigned char *)(tmpword
+ tmpl
);
603 // now make sure all of the conditions on characters
604 // are met. Please see the appendix at the end of
605 // this file for more info on exactly what is being
608 // if all conditions are met then check if resulting
609 // root word in the dictionary
611 if (test_condition((char *) cp
, (char *) tmpword
)) {
613 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
614 fprintf(stdout
,"%s %s %c\n", word
, tmpword
, aflag
);
616 if ((he
= pmyMgr
->lookup(tmpword
)) != NULL
) {
618 // check conditional suffix (enabled by prefix)
619 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() &&
620 TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
621 (((optflags
& aeXPRODUCT
) == 0) ||
622 (ep
&& TESTAFF(he
->astr
, ep
->getFlag(), he
->alen
)) ||
624 ((contclass
) && (ep
&& TESTAFF(contclass
, ep
->getFlag(), contclasslen
)))
626 // handle cont. class
628 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
630 // check only in compound homonyms (bad flags)
631 (!badflag
|| !TESTAFF(he
->astr
, badflag
, he
->alen
)
633 // handle required flag
635 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
636 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
639 he
= he
->next_homonym
; // check homonyms
642 // obsolete stemming code (used only by the
643 // experimental SuffixMgr:suggest_pos_stems)
644 // store resulting root in wlst
645 } else if (wlst
&& (*ns
< maxSug
)) {
647 for (int k
=0; k
< *ns
; k
++)
648 if (strcmp(tmpword
, wlst
[k
]) == 0) cwrd
= 0;
650 wlst
[*ns
] = mystrdup(tmpword
);
651 if (wlst
[*ns
] == NULL
) {
652 for (int j
=0; j
<*ns
; j
++) free(wlst
[j
]);
664 // see if two-level suffix is present in the word
665 struct hentry
* SfxEntry::check_twosfx(const char * word
, int len
, int optflags
,
666 PfxEntry
* ppfx
, const FLAG needflag
)
668 int tmpl
; // length of tmpword
669 struct hentry
* he
; // hash entry pointer
671 char tmpword
[MAXWORDUTF8LEN
+ 4];
675 // if this suffix is being cross checked with a prefix
676 // but it does not support cross products skip it
678 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
681 // upon entry suffix is 0 length or already matches the end of the word.
682 // So if the remaining root word has positive length
683 // and if there are enough chars in root word and added back strip chars
684 // to meet the number of characters conditions, then test it
688 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
689 (tmpl
+ stripl
>= numconds
)) {
691 // generate new root word by removing suffix and adding
692 // back any characters that would have been stripped or
693 // null terminating the shorter string
695 strcpy (tmpword
, word
);
696 cp
= (unsigned char *)(tmpword
+ tmpl
);
698 strcpy ((char *)cp
, strip
);
700 cp
= (unsigned char *)(tmpword
+ tmpl
);
703 // now make sure all of the conditions on characters
704 // are met. Please see the appendix at the end of
705 // this file for more info on exactly what is being
708 // if all conditions are met then recall suffix_check
710 if (test_condition((char *) cp
, (char *) tmpword
)) {
712 // handle conditional suffix
713 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
))
714 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
716 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, optflags
, ppfx
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
718 he
= pmyMgr
->suffix_check(tmpword
, tmpl
, 0, NULL
, NULL
, 0, NULL
, (FLAG
) aflag
, needflag
);
726 // see if two-level suffix is present in the word
727 char * SfxEntry::check_twosfx_morph(const char * word
, int len
, int optflags
,
728 PfxEntry
* ppfx
, const FLAG needflag
)
730 int tmpl
; // length of tmpword
732 char tmpword
[MAXWORDUTF8LEN
+ 4];
736 char result
[MAXLNLEN
];
740 // if this suffix is being cross checked with a prefix
741 // but it does not support cross products skip it
743 if ((optflags
& aeXPRODUCT
) != 0 && (opts
& aeXPRODUCT
) == 0)
746 // upon entry suffix is 0 length or already matches the end of the word.
747 // So if the remaining root word has positive length
748 // and if there are enough chars in root word and added back strip chars
749 // to meet the number of characters conditions, then test it
753 if ((tmpl
> 0 || (tmpl
== 0 && pmyMgr
->get_fullstrip())) &&
754 (tmpl
+ stripl
>= numconds
)) {
756 // generate new root word by removing suffix and adding
757 // back any characters that would have been stripped or
758 // null terminating the shorter string
760 strcpy (tmpword
, word
);
761 cp
= (unsigned char *)(tmpword
+ tmpl
);
763 strcpy ((char *)cp
, strip
);
765 cp
= (unsigned char *)(tmpword
+ tmpl
);
768 // now make sure all of the conditions on characters
769 // are met. Please see the appendix at the end of
770 // this file for more info on exactly what is being
773 // if all conditions are met then recall suffix_check
775 if (test_condition((char *) cp
, (char *) tmpword
)) {
777 // handle conditional suffix
778 if ((contclass
) && TESTAFF(contclass
, ep
->getFlag(), contclasslen
)) {
779 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
781 if (ppfx
->getMorph()) {
782 mystrcat(result
, ppfx
->getMorph(), MAXLNLEN
);
783 mystrcat(result
, " ", MAXLNLEN
);
785 mystrcat(result
,st
, MAXLNLEN
);
790 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, optflags
, ppfx
, aflag
, needflag
);
792 mystrcat(result
, st
, MAXLNLEN
);
798 st
= pmyMgr
->suffix_check_morph(tmpword
, tmpl
, 0, NULL
, aflag
, needflag
);
800 mystrcat(result
, st
, MAXLNLEN
);
805 if (*result
) return mystrdup(result
);
811 // get next homonym with same affix
812 struct hentry
* SfxEntry::get_next_homonym(struct hentry
* he
, int optflags
, PfxEntry
* ppfx
,
813 const FLAG cclass
, const FLAG needflag
)
816 FLAG eFlag
= ep
? ep
->getFlag() : FLAG_NULL
;
818 while (he
->next_homonym
) {
819 he
= he
->next_homonym
;
820 if ((TESTAFF(he
->astr
, aflag
, he
->alen
) || (ep
&& ep
->getCont() && TESTAFF(ep
->getCont(), aflag
, ep
->getContLen()))) &&
821 ((optflags
& aeXPRODUCT
) == 0 ||
822 TESTAFF(he
->astr
, eFlag
, he
->alen
) ||
823 // handle conditional suffix
824 ((contclass
) && TESTAFF(contclass
, eFlag
, contclasslen
))
826 // handle cont. class
828 ((contclass
) && TESTAFF(contclass
, cclass
, contclasslen
))
830 // handle required flag
832 (TESTAFF(he
->astr
, needflag
, he
->alen
) ||
833 ((contclass
) && TESTAFF(contclass
, needflag
, contclasslen
)))
843 Appendix
: Understanding Affix Code
846 An affix is either a prefix
or a suffix attached to root words to make
849 Basically a Prefix
or a Suffix is a set of AffEntry objects
850 which store information about the prefix
or suffix along
851 with supporting routines to check
if a word has a particular
852 prefix
or suffix
or a combination
.
854 The structure affentry is defined as follows
:
858 unsigned short aflag
; // ID used to represent the affix
859 char * strip
; // string to strip before adding affix
860 char * appnd
; // the affix string to add
861 unsigned char stripl
; // length of the strip string
862 unsigned char appndl
; // length of the affix string
863 char numconds
; // the number of conditions that must be met
864 char opts
; // flag: aeXPRODUCT- combine both prefix and suffix
865 char conds
[SETSIZE
]; // array which encodes the conditions to be met
869 Here is a suffix borrowed from the en_US
.aff file
. This file
870 is whitespace delimited
.
874 SFX D y ied
[^aeiou
]y
878 This information can be interpreted as follows
:
880 The first line has
4 fields
884 1 SFX
- indicates
this is a suffix
885 2 D
- is the name of the character flag which represents
this suffix
886 3 Y
- indicates it can be combined with
prefixes (cross product
)
887 4 4 - indicates that a sequence of
4 affentry structures is needed to
888 properly store the affix information
890 The remaining lines describe the unique information
for the
4 SfxEntry
891 objects that make up
this affix
. Each line can be interpreted
892 as follows
: (note fields
1 and 2 are as a check against line
1 info
)
896 1 SFX
- indicates
this is a suffix
897 2 D
- is the name of the character flag
for this affix
898 3 y
- the string of chars to strip off before adding affix
899 (a
0 here indicates the NULL string
)
900 4 ied
- the string of affix characters to add
901 5 [^aeiou
]y
- the conditions which must be met before the affix
904 Field
5 is interesting
. Since
this is a suffix
, field
5 tells us that
905 there are
2 conditions that must be met
. The first condition is that
906 the next to the last character in the word must
*NOT
* be any of the
907 following
"a", "e", "i", "o" or "u". The second condition is that
908 the last character of the word must be
"y".
910 So how can we encode
this information concisely
and be able to
911 test
for both conditions in a fast manner
? The answer is found
912 by studying the wonderful ispell code of Geoff Kuenning, et al.
913 (now available under a normal BSD license
).
915 If we set up a conds array of
256 bytes
indexed (0 to
255) and access it
916 using a
character (cast to an
unsigned char) of a string
, we have
8 bits
917 of information we can store about that character
. Specifically we
918 could use each bit to say
if that character is allowed in any of the
919 last (or first
for prefixes
) 8 characters of the word
.
921 Basically
, each character at one end of the
word (up to the number
922 of conditions
) is used to index into the conds array
and the resulting
923 value found there says whether that character is valid
for a
924 specific character position in the word
.
926 For prefixes
, it does
this by setting bit
0 if that
char is valid
927 in the first position
, bit
1 if valid in the second position
, and so on
.
929 If a bit is
not set
, then that
char is
not valid
for that position in the
932 If working with suffixes bit
0 is used
for the character closest
933 to the front
, bit
1 for the next character towards the end
, ...,
934 with bit numconds
-1 representing the last
char at the end of the string
.
936 Note
: since entries in the conds
[] are
8 bits
, only
8 conditions
937 (read that only
8 character positions
) can be examined at one
938 end of a
word (the beginning
for prefixes
and the end
for suffixes).
940 So to make
this clearer
, let's encode the conds array values
for the
941 first two affentries
for the suffix D described earlier
.
944 For the first affentry
:
945 numconds
= 1 (only examine the last character
)
947 conds
['e'] = (1 << 0) (the word must end in an E
)
950 For the second affentry
:
951 numconds
= 2 (only examine the last two characters
)
953 conds
[X
] = conds
[X
] | (1 << 0) (aeiou are
not allowed
)
954 where X is all characters
*but
* a
, e
, i
, o
, or u
957 conds
['y'] = (1 << 1) (the last
char must be a y
)
958 all other bits
for all other entries in the conds array are zero