code cleanup
[TortoiseGit.git] / ext / hunspell / affentry.cxx
blobce727b011050060d58221baea848ddc9074dac07
1 #include "license.hunspell"
2 #include "license.myspell"
4 #ifndef MOZILLA_CLIENT
5 #include <cstdlib>
6 #include <cstring>
7 #include <cctype>
8 #include <cstdio>
9 #else
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <ctype.h>
14 #endif
16 #include "affentry.hxx"
17 #include "csutil.hxx"
19 #ifndef MOZILLA_CLIENT
20 #ifndef W32
21 using namespace std;
22 #endif
23 #endif
26 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
28 // register affix manager
29 pmyMgr = pmgr;
31 // set up its intial values
33 aflag = dp->aflag; // flag
34 strip = dp->strip; // string to strip
35 appnd = dp->appnd; // string to append
36 stripl = dp->stripl; // length of strip string
37 appndl = dp->appndl; // length of append string
38 numconds = dp->numconds; // number of conditions to match
39 opts = dp->opts; // cross product flag
40 // then copy over all of the conditions
41 memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
42 next = NULL;
43 nextne = NULL;
44 nexteq = NULL;
45 #ifdef HUNSPELL_EXPERIMENTAL
46 morphcode = dp->morphcode;
47 #endif
48 contclass = dp->contclass;
49 contclasslen = dp->contclasslen;
53 PfxEntry::~PfxEntry()
55 aflag = 0;
56 if (appnd) free(appnd);
57 if (strip) free(strip);
58 pmyMgr = NULL;
59 appnd = NULL;
60 strip = NULL;
61 if (opts & aeUTF8) {
62 for (int i = 0; i < 8; i++) {
63 if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
66 #ifdef HUNSPELL_EXPERIMENTAL
67 if (morphcode && !(opts & aeALIASM)) free(morphcode);
68 #endif
69 if (contclass && !(opts & aeALIASF)) free(contclass);
72 // add prefix to this word assuming conditions hold
73 char * PfxEntry::add(const char * word, int len)
75 char tword[MAXWORDUTF8LEN + 4];
77 if ((len > stripl) && (len >= numconds) && test_condition(word) &&
78 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
79 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
80 /* we have a match so add prefix */
81 char * pp = tword;
82 if (appndl) {
83 strcpy(tword,appnd);
84 pp += appndl;
86 strcpy(pp, (word + stripl));
87 return mystrdup(tword);
89 return NULL;
93 inline int PfxEntry::test_condition(const char * st)
95 int cond;
96 unsigned char * cp = (unsigned char *)st;
97 if (!(opts & aeUTF8)) { // 256-character codepage
98 for (cond = 0; cond < numconds; cond++) {
99 if ((conds.base[*cp++] & (1 << cond)) == 0) return 0;
101 } else { // UTF-8 encoding
102 unsigned short wc;
103 for (cond = 0; cond < numconds; cond++) {
104 // a simple 7-bit ASCII character in UTF-8
105 if ((*cp >> 7) == 0) {
106 // also check limit (end of word)
107 if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0;
108 // UTF-8 multibyte character
109 } else {
110 // not dot wildcard in rule
111 if (!conds.utf8.all[cond]) {
112 if (conds.utf8.neg[cond]) {
113 u8_u16((w_char *) &wc, 1, (char *) cp);
114 if (conds.utf8.wchars[cond] &&
115 flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
116 wc, (short) conds.utf8.wlen[cond])) return 0;
117 } else {
118 if (!conds.utf8.wchars[cond]) return 0;
119 u8_u16((w_char *) &wc, 1, (char *) cp);
120 if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
121 wc, (short)conds.utf8.wlen[cond])) return 0;
124 // jump to next UTF-8 character
125 for(cp++; (*cp & 0xc0) == 0x80; cp++);
129 return 1;
133 // check if this prefix entry matches
134 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
136 int tmpl; // length of tmpword
137 struct hentry * he; // hash entry of root word or NULL
138 char tmpword[MAXWORDUTF8LEN + 4];
140 // on entry prefix is 0 length or already matches the beginning of the word.
141 // So if the remaining root word has positive length
142 // and if there are enough chars in root word and added back strip chars
143 // to meet the number of characters conditions, then test it
145 tmpl = len - appndl;
147 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
149 // generate new root word by removing prefix and adding
150 // back any characters that would have been stripped
152 if (stripl) strcpy (tmpword, strip);
153 strcpy ((tmpword + stripl), (word + appndl));
155 // now make sure all of the conditions on characters
156 // are met. Please see the appendix at the end of
157 // this file for more info on exactly what is being
158 // tested
160 // if all conditions are met then check if resulting
161 // root word in the dictionary
163 if (test_condition(tmpword)) {
164 tmpl += stripl;
165 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
166 do {
167 if (TESTAFF(he->astr, aflag, he->alen) &&
168 // forbid single prefixes with pseudoroot flag
169 ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
170 // needflag
171 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
172 (contclass && TESTAFF(contclass, needflag, contclasslen))))
173 return he;
174 he = he->next_homonym; // check homonyms
175 } while (he);
178 // prefix matched but no root word was found
179 // if aeXPRODUCT is allowed, try again but now
180 // ross checked combined with a suffix
182 //if ((opts & aeXPRODUCT) && in_compound) {
183 if ((opts & aeXPRODUCT)) {
184 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
185 0, NULL, FLAG_NULL, needflag, in_compound);
186 if (he) return he;
190 return NULL;
193 // check if this prefix entry matches
194 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
195 char in_compound, const FLAG needflag)
197 int tmpl; // length of tmpword
198 struct hentry * he; // hash entry of root word or NULL
199 char tmpword[MAXWORDUTF8LEN + 4];
201 // on entry prefix is 0 length or already matches the beginning of the word.
202 // So if the remaining root word has positive length
203 // and if there are enough chars in root word and added back strip chars
204 // to meet the number of characters conditions, then test it
206 tmpl = len - appndl;
208 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
210 // generate new root word by removing prefix and adding
211 // back any characters that would have been stripped
213 if (stripl) strcpy (tmpword, strip);
214 strcpy ((tmpword + stripl), (word + appndl));
216 // now make sure all of the conditions on characters
217 // are met. Please see the appendix at the end of
218 // this file for more info on exactly what is being
219 // tested
221 // if all conditions are met then check if resulting
222 // root word in the dictionary
224 if (test_condition(tmpword)) {
225 tmpl += stripl;
227 // prefix matched but no root word was found
228 // if aeXPRODUCT is allowed, try again but now
229 // cross checked combined with a suffix
231 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
232 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
233 if (he) return he;
237 return NULL;
240 #ifdef HUNSPELL_EXPERIMENTAL
241 // check if this prefix entry matches
242 char * PfxEntry::check_twosfx_morph(const char * word, int len,
243 char in_compound, const FLAG needflag)
245 int tmpl; // length of tmpword
246 char tmpword[MAXWORDUTF8LEN + 4];
248 // on entry prefix is 0 length or already matches the beginning of the word.
249 // So if the remaining root word has positive length
250 // and if there are enough chars in root word and added back strip chars
251 // to meet the number of characters conditions, then test it
253 tmpl = len - appndl;
255 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
257 // generate new root word by removing prefix and adding
258 // back any characters that would have been stripped
260 if (stripl) strcpy (tmpword, strip);
261 strcpy ((tmpword + stripl), (word + appndl));
263 // now make sure all of the conditions on characters
264 // are met. Please see the appendix at the end of
265 // this file for more info on exactly what is being
266 // tested
268 // if all conditions are met then check if resulting
269 // root word in the dictionary
271 if (test_condition(tmpword)) {
272 tmpl += stripl;
274 // prefix matched but no root word was found
275 // if aeXPRODUCT is allowed, try again but now
276 // ross checked combined with a suffix
278 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
279 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
280 aeXPRODUCT, (AffEntry *)this, needflag);
284 return NULL;
287 // check if this prefix entry matches
288 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
290 int tmpl; // length of tmpword
291 struct hentry * he; // hash entry of root word or NULL
292 char tmpword[MAXWORDUTF8LEN + 4];
293 char result[MAXLNLEN];
294 char * st;
296 *result = '\0';
298 // on entry prefix is 0 length or already matches the beginning of the word.
299 // So if the remaining root word has positive length
300 // and if there are enough chars in root word and added back strip chars
301 // to meet the number of characters conditions, then test it
303 tmpl = len - appndl;
305 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
307 // generate new root word by removing prefix and adding
308 // back any characters that would have been stripped
310 if (stripl) strcpy (tmpword, strip);
311 strcpy ((tmpword + stripl), (word + appndl));
313 // now make sure all of the conditions on characters
314 // are met. Please see the appendix at the end of
315 // this file for more info on exactly what is being
316 // tested
318 // if all conditions are met then check if resulting
319 // root word in the dictionary
321 if (test_condition(tmpword)) {
322 tmpl += stripl;
323 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
324 do {
325 if (TESTAFF(he->astr, aflag, he->alen) &&
326 // forbid single prefixes with pseudoroot flag
327 ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
328 // needflag
329 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
330 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
331 if (morphcode) strcat(result, morphcode); else strcat(result,getKey());
332 if (he->description) {
333 if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word);
334 strcat(result,he->description);
336 strcat(result, "\n");
338 he = he->next_homonym;
339 } while (he);
342 // prefix matched but no root word was found
343 // if aeXPRODUCT is allowed, try again but now
344 // ross checked combined with a suffix
346 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
347 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
348 FLAG_NULL, needflag);
349 if (st) {
350 strcat(result, st);
351 free(st);
357 if (*result) return mystrdup(result);
358 return NULL;
360 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
362 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
364 // register affix manager
365 pmyMgr = pmgr;
367 // set up its intial values
368 aflag = dp->aflag; // char flag
369 strip = dp->strip; // string to strip
370 appnd = dp->appnd; // string to append
371 stripl = dp->stripl; // length of strip string
372 appndl = dp->appndl; // length of append string
373 numconds = dp->numconds; // number of conditions to match
374 opts = dp->opts; // cross product flag
376 // then copy over all of the conditions
377 memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
379 rappnd = myrevstrdup(appnd);
381 #ifdef HUNSPELL_EXPERIMENTAL
382 morphcode = dp->morphcode;
383 #endif
384 contclass = dp->contclass;
385 contclasslen = dp->contclasslen;
389 SfxEntry::~SfxEntry()
391 aflag = 0;
392 if (appnd) free(appnd);
393 if (rappnd) free(rappnd);
394 if (strip) free(strip);
395 pmyMgr = NULL;
396 appnd = NULL;
397 strip = NULL;
398 if (opts & aeUTF8) {
399 for (int i = 0; i < 8; i++) {
400 if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
403 #ifdef HUNSPELL_EXPERIMENTAL
404 if (morphcode && !(opts & aeALIASM)) free(morphcode);
405 #endif
406 if (contclass && !(opts & aeALIASF)) free(contclass);
409 // add suffix to this word assuming conditions hold
410 char * SfxEntry::add(const char * word, int len)
412 char tword[MAXWORDUTF8LEN + 4];
414 /* make sure all conditions match */
415 if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
416 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
417 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
418 /* we have a match so add suffix */
419 strcpy(tword,word);
420 if (appndl) {
421 strcpy(tword + len - stripl, appnd);
422 } else {
423 *(tword + len - stripl) = '\0';
425 return mystrdup(tword);
427 return NULL;
431 inline int SfxEntry::test_condition(const char * st, const char * beg)
433 int cond;
434 unsigned char * cp = (unsigned char *) st;
435 if (!(opts & aeUTF8)) { // 256-character codepage
436 // Dömölki affix algorithm
437 for (cond = numconds; --cond >= 0; ) {
438 if ((conds.base[*--cp] & (1 << cond)) == 0) return 0;
440 } else { // UTF-8 encoding
441 unsigned short wc;
442 for (cond = numconds; --cond >= 0; ) {
443 // go to next character position and check limit
444 if ((char *) --cp < beg) return 0;
445 // a simple 7-bit ASCII character in UTF-8
446 if ((*cp >> 7) == 0) {
447 if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0;
448 // UTF-8 multibyte character
449 } else {
450 // go to first character of UTF-8 multibyte character
451 for (; (*cp & 0xc0) == 0x80; cp--);
452 // not dot wildcard in rule
453 if (!conds.utf8.all[cond]) {
454 if (conds.utf8.neg[cond]) {
455 u8_u16((w_char *) &wc, 1, (char *) cp);
456 if (conds.utf8.wchars[cond] &&
457 flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
458 wc, (short) conds.utf8.wlen[cond])) return 0;
459 } else {
460 if (!conds.utf8.wchars[cond]) return 0;
461 u8_u16((w_char *) &wc, 1, (char *) cp);
462 if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
463 wc, (short)conds.utf8.wlen[cond])) return 0;
469 return 1;
474 // see if this suffix is present in the word
475 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
476 AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
477 const FLAG badflag)
479 int tmpl; // length of tmpword
480 struct hentry * he; // hash entry pointer
481 unsigned char * cp;
482 char tmpword[MAXWORDUTF8LEN + 4];
483 PfxEntry* ep = (PfxEntry *) ppfx;
485 // if this suffix is being cross checked with a prefix
486 // but it does not support cross products skip it
488 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
489 return NULL;
491 // upon entry suffix is 0 length or already matches the end of the word.
492 // So if the remaining root word has positive length
493 // and if there are enough chars in root word and added back strip chars
494 // to meet the number of characters conditions, then test it
496 tmpl = len - appndl;
497 // the second condition is not enough for UTF-8 strings
498 // it checked in test_condition()
500 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
502 // generate new root word by removing suffix and adding
503 // back any characters that would have been stripped or
504 // or null terminating the shorter string
506 strcpy (tmpword, word);
507 cp = (unsigned char *)(tmpword + tmpl);
508 if (stripl) {
509 strcpy ((char *)cp, strip);
510 tmpl += stripl;
511 cp = (unsigned char *)(tmpword + tmpl);
512 } else *cp = '\0';
514 // now make sure all of the conditions on characters
515 // are met. Please see the appendix at the end of
516 // this file for more info on exactly what is being // tested
518 // if all conditions are met then check if resulting
519 // root word in the dictionary
521 if (test_condition((char *) cp, (char *) tmpword)) {
523 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
524 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
525 #endif
526 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
527 do {
528 // check conditional suffix (enabled by prefix)
529 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
530 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
531 (((optflags & aeXPRODUCT) == 0) ||
532 TESTAFF(he->astr, ep->getFlag(), he->alen) ||
533 // enabled by prefix
534 ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
535 ) &&
536 // handle cont. class
537 ((!cclass) ||
538 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
539 ) &&
540 // check only in compound homonyms (bad flags)
541 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
542 ) &&
543 // handle required flag
544 ((!needflag) ||
545 (TESTAFF(he->astr, needflag, he->alen) ||
546 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
548 ) return he;
549 he = he->next_homonym; // check homonyms
550 } while (he);
552 // obsolote stemming code (used only by the
553 // experimental SuffixMgr:suggest_pos_stems)
554 // store resulting root in wlst
555 } else if (wlst && (*ns < maxSug)) {
556 int cwrd = 1;
557 for (int k=0; k < *ns; k++)
558 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
559 if (cwrd) {
560 wlst[*ns] = mystrdup(tmpword);
561 if (wlst[*ns] == NULL) {
562 for (int j=0; j<*ns; j++) free(wlst[j]);
563 *ns = -1;
564 return NULL;
566 (*ns)++;
571 return NULL;
574 // see if two-level suffix is present in the word
575 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
576 AffEntry* ppfx, const FLAG needflag)
578 int tmpl; // length of tmpword
579 struct hentry * he; // hash entry pointer
580 unsigned char * cp;
581 char tmpword[MAXWORDUTF8LEN + 4];
582 PfxEntry* ep = (PfxEntry *) ppfx;
585 // if this suffix is being cross checked with a prefix
586 // but it does not support cross products skip it
588 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
589 return NULL;
591 // upon entry suffix is 0 length or already matches the end of the word.
592 // So if the remaining root word has positive length
593 // and if there are enough chars in root word and added back strip chars
594 // to meet the number of characters conditions, then test it
596 tmpl = len - appndl;
598 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
600 // generate new root word by removing suffix and adding
601 // back any characters that would have been stripped or
602 // or null terminating the shorter string
604 strcpy (tmpword, word);
605 cp = (unsigned char *)(tmpword + tmpl);
606 if (stripl) {
607 strcpy ((char *)cp, strip);
608 tmpl += stripl;
609 cp = (unsigned char *)(tmpword + tmpl);
610 } else *cp = '\0';
612 // now make sure all of the conditions on characters
613 // are met. Please see the appendix at the end of
614 // this file for more info on exactly what is being
615 // tested
617 // if all conditions are met then recall suffix_check
619 if (test_condition((char *) cp, (char *) tmpword)) {
620 if (ppfx) {
621 // handle conditional suffix
622 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
623 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
624 else
625 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
626 } else {
627 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
629 if (he) return he;
632 return NULL;
635 #ifdef HUNSPELL_EXPERIMENTAL
636 // see if two-level suffix is present in the word
637 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
638 AffEntry* ppfx, const FLAG needflag)
640 int tmpl; // length of tmpword
641 unsigned char * cp;
642 char tmpword[MAXWORDUTF8LEN + 4];
643 PfxEntry* ep = (PfxEntry *) ppfx;
644 char * st;
646 char result[MAXLNLEN];
648 *result = '\0';
650 // if this suffix is being cross checked with a prefix
651 // but it does not support cross products skip it
653 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
654 return NULL;
656 // upon entry suffix is 0 length or already matches the end of the word.
657 // So if the remaining root word has positive length
658 // and if there are enough chars in root word and added back strip chars
659 // to meet the number of characters conditions, then test it
661 tmpl = len - appndl;
663 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
665 // generate new root word by removing suffix and adding
666 // back any characters that would have been stripped or
667 // or null terminating the shorter string
669 strcpy (tmpword, word);
670 cp = (unsigned char *)(tmpword + tmpl);
671 if (stripl) {
672 strcpy ((char *)cp, strip);
673 tmpl += stripl;
674 cp = (unsigned char *)(tmpword + tmpl);
675 } else *cp = '\0';
677 // now make sure all of the conditions on characters
678 // are met. Please see the appendix at the end of
679 // this file for more info on exactly what is being
680 // tested
682 // if all conditions are met then recall suffix_check
684 if (test_condition((char *) cp, (char *) tmpword)) {
685 if (ppfx) {
686 // handle conditional suffix
687 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
688 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
689 if (st) {
690 if (((PfxEntry *) ppfx)->getMorph()) {
691 strcat(result, ((PfxEntry *) ppfx)->getMorph());
693 strcat(result,st);
694 free(st);
695 mychomp(result);
697 } else {
698 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
699 if (st) {
700 strcat(result, st);
701 free(st);
702 mychomp(result);
705 } else {
706 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
707 if (st) {
708 strcat(result, st);
709 free(st);
710 mychomp(result);
713 if (*result) return mystrdup(result);
716 return NULL;
718 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
720 // get next homonym with same affix
721 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
722 const FLAG cclass, const FLAG needflag)
724 PfxEntry* ep = (PfxEntry *) ppfx;
726 while (he->next_homonym) {
727 he = he->next_homonym;
728 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
729 ((optflags & aeXPRODUCT) == 0 ||
730 TESTAFF(he->astr, ep->getFlag(), he->alen) ||
731 // handle conditional suffix
732 ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
733 ) &&
734 // handle cont. class
735 ((!cclass) ||
736 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
737 ) &&
738 // handle required flag
739 ((!needflag) ||
740 (TESTAFF(he->astr, needflag, he->alen) ||
741 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
743 ) return he;
745 return NULL;
749 #if 0
751 Appendix: Understanding Affix Code
754 An affix is either a prefix or a suffix attached to root words to make
755 other words.
757 Basically a Prefix or a Suffix is set of AffEntry objects
758 which store information about the prefix or suffix along
759 with supporting routines to check if a word has a particular
760 prefix or suffix or a combination.
762 The structure affentry is defined as follows:
764 struct affentry
766 unsigned short aflag; // ID used to represent the affix
767 char * strip; // string to strip before adding affix
768 char * appnd; // the affix string to add
769 unsigned char stripl; // length of the strip string
770 unsigned char appndl; // length of the affix string
771 char numconds; // the number of conditions that must be met
772 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
773 char conds[SETSIZE]; // array which encodes the conditions to be met
777 Here is a suffix borrowed from the en_US.aff file. This file
778 is whitespace delimited.
780 SFX D Y 4
SFX D 0 d e
782 SFX D y ied [^aeiou]y
783 SFX D 0 ed [^ey]
784 SFX D 0 ed [aeiou]y
786 This information can be interpreted as follows:
The first line has 4 fields:
790 Field
791 -----
792 1 SFX - indicates this is a suffix
793 2 D - is the name of the character flag which represents this suffix
794 3 Y - indicates it can be combined with prefixes (cross product)
795 4 4 - indicates that sequence of 4 affentry structures are needed to
796 properly store the affix information
798 The remaining lines describe the unique information for the 4 SfxEntry
799 objects that make up this affix. Each line can be interpreted
800 as follows: (note fields 1 and 2 are as a check against line 1 info)
802 Field
803 -----
804 1 SFX - indicates this is a suffix
805 2 D - is the name of the character flag for this affix
806 3 y - the string of chars to strip off before adding affix
807 (a 0 here indicates the NULL string)
808 4 ied - the string of affix characters to add
809 5 [^aeiou]y - the conditions which must be met before the affix
810 can be applied
812 Field 5 is interesting. Since this is a suffix, field 5 tells us that
813 there are 2 conditions that must be met. The first condition is that
814 the next to the last character in the word must *NOT* be any of the
815 following "a", "e", "i", "o" or "u". The second condition is that
816 the last character of the word must end in "y".
So how can we encode this information concisely and be able to
test for both conditions in a fast manner? The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
823 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
824 using a character (cast to an unsigned char) of a string, we have 8 bits
825 of information we can store about that character. Specifically we
826 could use each bit to say if that character is allowed in any of the
827 last (or first for prefixes) 8 characters of the word.
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
834 For prefixes, it does this by setting bit 0 if that char is valid
835 in the first position, bit 1 if valid in the second position, and so on.
If a bit is not set, then that char is not valid for that position in the
word.
840 If working with suffixes bit 0 is used for the character closest
841 to the front, bit 1 for the next character towards the end, ...,
842 with bit numconds-1 representing the last char at the end of the string.
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
848 So to make this clearer, lets encode the conds array values for the
849 first two affentries for the suffix D described earlier.
852 For the first affentry:
853 numconds = 1 (only examine the last character)
855 conds['e'] = (1 << 0) (the word must end in an E)
856 all others are all 0
858 For the second affentry:
859 numconds = 2 (only examine the last two characters)
861 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
862 where X is all characters *but* a, e, i, o, or u
conds['y'] = conds['y'] | (1 << 1) (the last char must be a y; 'y' also
keeps bit 0 from the rule above because it is not a vowel)
all other bits for all other entries in the conds array are zero
869 #endif