Updated hunspell to 1.3.2
[TortoiseGit.git] / ext / hunspell / affentry.cxx
blob246084e95a9c2744a89e7b190db970a015cb319d
1 #include "license.hunspell"
2 #include "license.myspell"
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
7 #include <ctype.h>
9 #include "affentry.hxx"
10 #include "csutil.hxx"
12 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
14 // register affix manager
15 pmyMgr = pmgr;
17 // set up its initial values
19 aflag = dp->aflag; // flag
20 strip = dp->strip; // string to strip
21 appnd = dp->appnd; // string to append
22 stripl = dp->stripl; // length of strip string
23 appndl = dp->appndl; // length of append string
24 numconds = dp->numconds; // length of the condition
25 opts = dp->opts; // cross product flag
26 // then copy over all of the conditions
27 if (opts & aeLONGCOND) {
28 memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
29 c.l.conds2 = dp->c.l.conds2;
30 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
31 next = NULL;
32 nextne = NULL;
33 nexteq = NULL;
34 morphcode = dp->morphcode;
35 contclass = dp->contclass;
36 contclasslen = dp->contclasslen;
40 PfxEntry::~PfxEntry()
42 aflag = 0;
43 if (appnd) free(appnd);
44 if (strip) free(strip);
45 pmyMgr = NULL;
46 appnd = NULL;
47 strip = NULL;
48 if (opts & aeLONGCOND) free(c.l.conds2);
49 if (morphcode && !(opts & aeALIASM)) free(morphcode);
50 if (contclass && !(opts & aeALIASF)) free(contclass);
53 // add prefix to this word assuming conditions hold
54 char * PfxEntry::add(const char * word, int len)
56 char tword[MAXWORDUTF8LEN + 4];
58 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
59 (len >= numconds) && test_condition(word) &&
60 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
61 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
62 /* we have a match so add prefix */
63 char * pp = tword;
64 if (appndl) {
65 strcpy(tword,appnd);
66 pp += appndl;
68 strcpy(pp, (word + stripl));
69 return mystrdup(tword);
71 return NULL;
74 inline char * PfxEntry::nextchar(char * p) {
75 if (p) {
76 p++;
77 if (opts & aeLONGCOND) {
78 // jump to the 2nd part of the condition
79 if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
80 // end of the MAXCONDLEN length condition
81 } else if (p == c.conds + MAXCONDLEN) return NULL;
82 return *p ? p : NULL;
84 return NULL;
// Tests the start of `st` against this entry's condition string.
// Returns 1 when the condition is empty (numconds == 0) or every element
// of the condition was matched, 0 otherwise.
// The condition is read from c.conds through nextchar(), so a long
// condition continues into its second part.
// Elements handled: '[' ... ']' character groups, '^' negation, '.' as
// "any character" outside a group, and literal bytes.
// With aeUTF8 set, UTF-8 continuation bytes (10xxxxxx) are consumed together
// with their lead byte.
// NOTE(review): brace-only lines are missing from this listing, so the
// nesting of the statements below has to be inferred - confirm against the
// upstream file before editing.
87 inline int PfxEntry::test_condition(const char * st)
89 const char * pos = NULL; // inside a group: input position where the group started
90 bool neg = false; // set by '^'
91 bool ingroup = false; // a member of the current group matched
92 if (numconds == 0) return 1;
93 char * p = c.conds;
94 while (1) {
95 switch (*p) {
96 case '\0': return 1;
// group start: reset the group state and remember the input position
97 case '[': {
98 neg = false;
99 ingroup = false;
100 p = nextchar(p);
101 pos = st; break;
103 case '^': { p = nextchar(p); neg = true; break; }
// group end: fail on a hit in a negated group or on no hit in a plain group
104 case ']': {
105 if ((neg && ingroup) || (!neg && !ingroup)) return 0;
106 pos = NULL;
107 p = nextchar(p);
108 // skip the next character
109 if (!ingroup && *st) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
110 if (*st == '\0' && p) return 0; // word <= condition
111 break;
113 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
114 p = nextchar(p);
115 // skip the next character
116 for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);
117 if (*st == '\0' && p) return 0; // word <= condition
118 break;
// literal byte (also reached for '.' when pos is set)
120 default: {
121 if (*st == *p) {
122 st++;
123 p = nextchar(p);
124 if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
125 while (p && (*p & 0xc0) == 0x80) { // character
126 if (*p != *st) {
// mismatch in a continuation byte: fatal outside a group, otherwise rewind
127 if (!pos) return 0;
128 st = pos;
129 break;
131 p = nextchar(p);
132 st++;
134 if (pos && st != pos) {
135 ingroup = true;
// advance p until it is at ']' or the condition ends
136 while (p && *p != ']' && (p = nextchar(p)));
138 } else if (pos) {
139 ingroup = true;
140 while (p && *p != ']' && (p = nextchar(p)));
142 } else if (pos) { // group
143 p = nextchar(p);
144 } else return 0;
// condition exhausted: everything matched
147 if (!p) return 1;
151 // check if this prefix entry matches
152 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
154 int tmpl; // length of tmpword
155 struct hentry * he; // hash entry of root word or NULL
156 char tmpword[MAXWORDUTF8LEN + 4];
158 // on entry prefix is 0 length or already matches the beginning of the word.
159 // So if the remaining root word has positive length
160 // and if there are enough chars in root word and added back strip chars
161 // to meet the number of characters conditions, then test it
163 tmpl = len - appndl;
165 if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {
167 // generate new root word by removing prefix and adding
168 // back any characters that would have been stripped
170 if (stripl) strcpy (tmpword, strip);
171 strcpy ((tmpword + stripl), (word + appndl));
173 // now make sure all of the conditions on characters
174 // are met. Please see the appendix at the end of
175 // this file for more info on exactly what is being
176 // tested
178 // if all conditions are met then check if resulting
179 // root word in the dictionary
181 if (test_condition(tmpword)) {
182 tmpl += stripl;
183 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
184 do {
185 if (TESTAFF(he->astr, aflag, he->alen) &&
186 // forbid single prefixes with needaffix flag
187 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
188 // needflag
189 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
190 (contclass && TESTAFF(contclass, needflag, contclasslen))))
191 return he;
192 he = he->next_homonym; // check homonyms
193 } while (he);
196 // prefix matched but no root word was found
197 // if aeXPRODUCT is allowed, try again but now
198 // ross checked combined with a suffix
200 //if ((opts & aeXPRODUCT) && in_compound) {
201 if ((opts & aeXPRODUCT)) {
202 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,
203 0, NULL, FLAG_NULL, needflag, in_compound);
204 if (he) return he;
208 return NULL;
211 // check if this prefix entry matches
212 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
213 char in_compound, const FLAG needflag)
215 int tmpl; // length of tmpword
216 struct hentry * he; // hash entry of root word or NULL
217 char tmpword[MAXWORDUTF8LEN + 4];
219 // on entry prefix is 0 length or already matches the beginning of the word.
220 // So if the remaining root word has positive length
221 // and if there are enough chars in root word and added back strip chars
222 // to meet the number of characters conditions, then test it
224 tmpl = len - appndl;
226 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
227 (tmpl + stripl >= numconds)) {
229 // generate new root word by removing prefix and adding
230 // back any characters that would have been stripped
232 if (stripl) strcpy (tmpword, strip);
233 strcpy ((tmpword + stripl), (word + appndl));
235 // now make sure all of the conditions on characters
236 // are met. Please see the appendix at the end of
237 // this file for more info on exactly what is being
238 // tested
240 // if all conditions are met then check if resulting
241 // root word in the dictionary
243 if (test_condition(tmpword)) {
244 tmpl += stripl;
246 // prefix matched but no root word was found
247 // if aeXPRODUCT is allowed, try again but now
248 // cross checked combined with a suffix
250 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
251 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);
252 if (he) return he;
256 return NULL;
259 // check if this prefix entry matches
260 char * PfxEntry::check_twosfx_morph(const char * word, int len,
261 char in_compound, const FLAG needflag)
263 int tmpl; // length of tmpword
264 char tmpword[MAXWORDUTF8LEN + 4];
266 // on entry prefix is 0 length or already matches the beginning of the word.
267 // So if the remaining root word has positive length
268 // and if there are enough chars in root word and added back strip chars
269 // to meet the number of characters conditions, then test it
271 tmpl = len - appndl;
273 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
274 (tmpl + stripl >= numconds)) {
276 // generate new root word by removing prefix and adding
277 // back any characters that would have been stripped
279 if (stripl) strcpy (tmpword, strip);
280 strcpy ((tmpword + stripl), (word + appndl));
282 // now make sure all of the conditions on characters
283 // are met. Please see the appendix at the end of
284 // this file for more info on exactly what is being
285 // tested
287 // if all conditions are met then check if resulting
288 // root word in the dictionary
290 if (test_condition(tmpword)) {
291 tmpl += stripl;
293 // prefix matched but no root word was found
294 // if aeXPRODUCT is allowed, try again but now
295 // ross checked combined with a suffix
297 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
298 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
299 aeXPRODUCT, this, needflag);
303 return NULL;
306 // check if this prefix entry matches
307 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
309 int tmpl; // length of tmpword
310 struct hentry * he; // hash entry of root word or NULL
311 char tmpword[MAXWORDUTF8LEN + 4];
312 char result[MAXLNLEN];
313 char * st;
315 *result = '\0';
317 // on entry prefix is 0 length or already matches the beginning of the word.
318 // So if the remaining root word has positive length
319 // and if there are enough chars in root word and added back strip chars
320 // to meet the number of characters conditions, then test it
322 tmpl = len - appndl;
324 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
325 (tmpl + stripl >= numconds)) {
327 // generate new root word by removing prefix and adding
328 // back any characters that would have been stripped
330 if (stripl) strcpy (tmpword, strip);
331 strcpy ((tmpword + stripl), (word + appndl));
333 // now make sure all of the conditions on characters
334 // are met. Please see the appendix at the end of
335 // this file for more info on exactly what is being
336 // tested
338 // if all conditions are met then check if resulting
339 // root word in the dictionary
341 if (test_condition(tmpword)) {
342 tmpl += stripl;
343 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
344 do {
345 if (TESTAFF(he->astr, aflag, he->alen) &&
346 // forbid single prefixes with needaffix flag
347 ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
348 // needflag
349 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
350 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
351 if (morphcode) {
352 mystrcat(result, " ", MAXLNLEN);
353 mystrcat(result, morphcode, MAXLNLEN);
354 } else mystrcat(result,getKey(), MAXLNLEN);
355 if (!HENTRY_FIND(he, MORPH_STEM)) {
356 mystrcat(result, " ", MAXLNLEN);
357 mystrcat(result, MORPH_STEM, MAXLNLEN);
358 mystrcat(result, HENTRY_WORD(he), MAXLNLEN);
360 // store the pointer of the hash entry
361 if (HENTRY_DATA(he)) {
362 mystrcat(result, " ", MAXLNLEN);
363 mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);
364 } else {
365 // return with debug information
366 char * flag = pmyMgr->encode_flag(getFlag());
367 mystrcat(result, " ", MAXLNLEN);
368 mystrcat(result, MORPH_FLAG, MAXLNLEN);
369 mystrcat(result, flag, MAXLNLEN);
370 free(flag);
372 mystrcat(result, "\n", MAXLNLEN);
374 he = he->next_homonym;
375 } while (he);
378 // prefix matched but no root word was found
379 // if aeXPRODUCT is allowed, try again but now
380 // ross checked combined with a suffix
382 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
383 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,
384 FLAG_NULL, needflag);
385 if (st) {
386 mystrcat(result, st, MAXLNLEN);
387 free(st);
393 if (*result) return mystrdup(result);
394 return NULL;
397 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
399 // register affix manager
400 pmyMgr = pmgr;
402 // set up its initial values
403 aflag = dp->aflag; // char flag
404 strip = dp->strip; // string to strip
405 appnd = dp->appnd; // string to append
406 stripl = dp->stripl; // length of strip string
407 appndl = dp->appndl; // length of append string
408 numconds = dp->numconds; // length of the condition
409 opts = dp->opts; // cross product flag
411 // then copy over all of the conditions
412 if (opts & aeLONGCOND) {
413 memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
414 c.l.conds2 = dp->c.l.conds2;
415 } else memcpy(c.conds, dp->c.conds, MAXCONDLEN);
417 rappnd = myrevstrdup(appnd);
418 morphcode = dp->morphcode;
419 contclass = dp->contclass;
420 contclasslen = dp->contclasslen;
424 SfxEntry::~SfxEntry()
426 aflag = 0;
427 if (appnd) free(appnd);
428 if (rappnd) free(rappnd);
429 if (strip) free(strip);
430 pmyMgr = NULL;
431 appnd = NULL;
432 strip = NULL;
433 if (opts & aeLONGCOND) free(c.l.conds2);
434 if (morphcode && !(opts & aeALIASM)) free(morphcode);
435 if (contclass && !(opts & aeALIASF)) free(contclass);
438 // add suffix to this word assuming conditions hold
439 char * SfxEntry::add(const char * word, int len)
441 char tword[MAXWORDUTF8LEN + 4];
443 /* make sure all conditions match */
444 if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&
445 (len >= numconds) && test_condition(word + len, word) &&
446 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
447 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
448 /* we have a match so add suffix */
449 strcpy(tword,word);
450 if (appndl) {
451 strcpy(tword + len - stripl, appnd);
452 } else {
453 *(tword + len - stripl) = '\0';
455 return mystrdup(tword);
457 return NULL;
460 inline char * SfxEntry::nextchar(char * p) {
461 if (p) {
462 p++;
463 if (opts & aeLONGCOND) {
464 // jump to the 2nd part of the condition
465 if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;
466 // end of the MAXCONDLEN length condition
467 } else if (p == c.conds + MAXCONDLEN) return NULL;
468 return *p ? p : NULL;
470 return NULL;
// Tests the end of a word against this entry's condition string.
//   st   points just past the last byte to test (it is decremented before
//        the first comparison)
//   beg  points to the first byte of the word; st never has to go below it
// Returns 1 when the condition is empty (numconds == 0) or was satisfied,
// 0 otherwise. The input is walked backwards (st--) while the condition is
// read forwards through nextchar().
// NOTE(review): this presumably relies on the condition being stored in
// reversed order by the affix parser - confirm in AffixMgr.
// NOTE(review): brace-only lines are missing from this listing, so the
// nesting of the statements below has to be inferred - confirm against the
// upstream file before editing.
473 inline int SfxEntry::test_condition(const char * st, const char * beg)
475 const char * pos = NULL; // inside a group: input position where the group started
476 bool neg = false; // set by '^', cleared at ']'
477 bool ingroup = false; // a member of the current group matched
478 if (numconds == 0) return 1;
479 char * p = c.conds;
480 st--;
// i is incremented per processed condition element and compared with numconds
481 int i = 1;
482 while (1) {
483 switch (*p) {
484 case '\0': return 1;
485 case '[': { p = nextchar(p); pos = st; break; }
486 case '^': { p = nextchar(p); neg = true; break; }
// group end: a plain group without a hit fails
487 case ']': { if (!neg && !ingroup) return 0;
488 i++;
489 // skip the next character
490 if (!ingroup) {
491 for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
492 st--;
494 pos = NULL;
495 neg = false;
496 ingroup = false;
497 p = nextchar(p);
498 if (st < beg && p) return 0; // word <= condition
499 break;
501 case '.': if (!pos) { // dots are not metacharacters in groups: [.]
502 p = nextchar(p);
503 // skip the next character
504 for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);
505 if (st < beg) { // word <= condition
506 if (p) return 0; else return 1;
508 if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
509 st--;
510 if (st < beg) { // word <= condition
511 if (p) return 0; else return 1;
514 break;
// literal byte (also reached for '.' when pos is set)
516 default: {
517 if (*st == *p) {
518 p = nextchar(p);
519 if ((opts & aeUTF8) && (*st & 0x80)) {
520 st--;
521 while (p && (st >= beg)) {
522 if (*p != *st) {
// mismatch inside a multibyte character: fatal outside a group, otherwise rewind
523 if (!pos) return 0;
524 st = pos;
525 break;
527 // first byte of the UTF-8 multibyte character
528 if ((*p & 0xc0) != 0x80) break;
529 p = nextchar(p);
530 st--;
532 if (pos && st != pos) {
// a hit in a negated group fails; a hit on the last condition succeeds at once
533 if (neg) return 0;
534 else if (i == numconds) return 1;
535 ingroup = true;
// advance p until it is at ']' or the condition ends
536 while (p && *p != ']' && (p = nextchar(p)));
537 st--;
539 if (p && *p != ']') p = nextchar(p);
540 } else if (pos) {
541 if (neg) return 0;
542 else if (i == numconds) return 1;
543 ingroup = true;
544 while (p && *p != ']' && (p = nextchar(p)));
545 // if (p && *p != ']') p = nextchar(p);
546 st--;
548 if (!pos) {
549 i++;
550 st--;
552 if (st < beg && p && *p != ']') return 0; // word <= condition
553 } else if (pos) { // group
554 p = nextchar(p);
555 } else return 0;
// condition exhausted: everything matched
558 if (!p) return 1;
562 // see if this suffix is present in the word
563 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
564 PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
565 const FLAG badflag)
567 int tmpl; // length of tmpword
568 struct hentry * he; // hash entry pointer
569 unsigned char * cp;
570 char tmpword[MAXWORDUTF8LEN + 4];
571 PfxEntry* ep = ppfx;
573 // if this suffix is being cross checked with a prefix
574 // but it does not support cross products skip it
576 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
577 return NULL;
579 // upon entry suffix is 0 length or already matches the end of the word.
580 // So if the remaining root word has positive length
581 // and if there are enough chars in root word and added back strip chars
582 // to meet the number of characters conditions, then test it
584 tmpl = len - appndl;
585 // the second condition is not enough for UTF-8 strings
586 // it checked in test_condition()
588 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
589 (tmpl + stripl >= numconds)) {
591 // generate new root word by removing suffix and adding
592 // back any characters that would have been stripped or
593 // or null terminating the shorter string
595 strcpy (tmpword, word);
596 cp = (unsigned char *)(tmpword + tmpl);
597 if (stripl) {
598 strcpy ((char *)cp, strip);
599 tmpl += stripl;
600 cp = (unsigned char *)(tmpword + tmpl);
601 } else *cp = '\0';
603 // now make sure all of the conditions on characters
604 // are met. Please see the appendix at the end of
605 // this file for more info on exactly what is being
606 // tested
608 // if all conditions are met then check if resulting
609 // root word in the dictionary
611 if (test_condition((char *) cp, (char *) tmpword)) {
613 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
614 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
615 #endif
616 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
617 do {
618 // check conditional suffix (enabled by prefix)
619 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
620 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
621 (((optflags & aeXPRODUCT) == 0) ||
622 (ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
623 // enabled by prefix
624 ((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))
625 ) &&
626 // handle cont. class
627 ((!cclass) ||
628 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
629 ) &&
630 // check only in compound homonyms (bad flags)
631 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
632 ) &&
633 // handle required flag
634 ((!needflag) ||
635 (TESTAFF(he->astr, needflag, he->alen) ||
636 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
638 ) return he;
639 he = he->next_homonym; // check homonyms
640 } while (he);
642 // obsolote stemming code (used only by the
643 // experimental SuffixMgr:suggest_pos_stems)
644 // store resulting root in wlst
645 } else if (wlst && (*ns < maxSug)) {
646 int cwrd = 1;
647 for (int k=0; k < *ns; k++)
648 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
649 if (cwrd) {
650 wlst[*ns] = mystrdup(tmpword);
651 if (wlst[*ns] == NULL) {
652 for (int j=0; j<*ns; j++) free(wlst[j]);
653 *ns = -1;
654 return NULL;
656 (*ns)++;
661 return NULL;
664 // see if two-level suffix is present in the word
665 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
666 PfxEntry* ppfx, const FLAG needflag)
668 int tmpl; // length of tmpword
669 struct hentry * he; // hash entry pointer
670 unsigned char * cp;
671 char tmpword[MAXWORDUTF8LEN + 4];
672 PfxEntry* ep = ppfx;
675 // if this suffix is being cross checked with a prefix
676 // but it does not support cross products skip it
678 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
679 return NULL;
681 // upon entry suffix is 0 length or already matches the end of the word.
682 // So if the remaining root word has positive length
683 // and if there are enough chars in root word and added back strip chars
684 // to meet the number of characters conditions, then test it
686 tmpl = len - appndl;
688 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
689 (tmpl + stripl >= numconds)) {
691 // generate new root word by removing suffix and adding
692 // back any characters that would have been stripped or
693 // or null terminating the shorter string
695 strcpy (tmpword, word);
696 cp = (unsigned char *)(tmpword + tmpl);
697 if (stripl) {
698 strcpy ((char *)cp, strip);
699 tmpl += stripl;
700 cp = (unsigned char *)(tmpword + tmpl);
701 } else *cp = '\0';
703 // now make sure all of the conditions on characters
704 // are met. Please see the appendix at the end of
705 // this file for more info on exactly what is being
706 // tested
708 // if all conditions are met then recall suffix_check
710 if (test_condition((char *) cp, (char *) tmpword)) {
711 if (ppfx) {
712 // handle conditional suffix
713 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
714 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
715 else
716 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
717 } else {
718 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
720 if (he) return he;
723 return NULL;
726 // see if two-level suffix is present in the word
727 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
728 PfxEntry* ppfx, const FLAG needflag)
730 int tmpl; // length of tmpword
731 unsigned char * cp;
732 char tmpword[MAXWORDUTF8LEN + 4];
733 PfxEntry* ep = ppfx;
734 char * st;
736 char result[MAXLNLEN];
738 *result = '\0';
740 // if this suffix is being cross checked with a prefix
741 // but it does not support cross products skip it
743 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
744 return NULL;
746 // upon entry suffix is 0 length or already matches the end of the word.
747 // So if the remaining root word has positive length
748 // and if there are enough chars in root word and added back strip chars
749 // to meet the number of characters conditions, then test it
751 tmpl = len - appndl;
753 if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
754 (tmpl + stripl >= numconds)) {
756 // generate new root word by removing suffix and adding
757 // back any characters that would have been stripped or
758 // or null terminating the shorter string
760 strcpy (tmpword, word);
761 cp = (unsigned char *)(tmpword + tmpl);
762 if (stripl) {
763 strcpy ((char *)cp, strip);
764 tmpl += stripl;
765 cp = (unsigned char *)(tmpword + tmpl);
766 } else *cp = '\0';
768 // now make sure all of the conditions on characters
769 // are met. Please see the appendix at the end of
770 // this file for more info on exactly what is being
771 // tested
773 // if all conditions are met then recall suffix_check
775 if (test_condition((char *) cp, (char *) tmpword)) {
776 if (ppfx) {
777 // handle conditional suffix
778 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
779 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
780 if (st) {
781 if (ppfx->getMorph()) {
782 mystrcat(result, ppfx->getMorph(), MAXLNLEN);
783 mystrcat(result, " ", MAXLNLEN);
785 mystrcat(result,st, MAXLNLEN);
786 free(st);
787 mychomp(result);
789 } else {
790 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
791 if (st) {
792 mystrcat(result, st, MAXLNLEN);
793 free(st);
794 mychomp(result);
797 } else {
798 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
799 if (st) {
800 mystrcat(result, st, MAXLNLEN);
801 free(st);
802 mychomp(result);
805 if (*result) return mystrdup(result);
808 return NULL;
811 // get next homonym with same affix
812 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,
813 const FLAG cclass, const FLAG needflag)
815 PfxEntry* ep = ppfx;
816 FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
818 while (he->next_homonym) {
819 he = he->next_homonym;
820 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
821 ((optflags & aeXPRODUCT) == 0 ||
822 TESTAFF(he->astr, eFlag, he->alen) ||
823 // handle conditional suffix
824 ((contclass) && TESTAFF(contclass, eFlag, contclasslen))
825 ) &&
826 // handle cont. class
827 ((!cclass) ||
828 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
829 ) &&
830 // handle required flag
831 ((!needflag) ||
832 (TESTAFF(he->astr, needflag, he->alen) ||
833 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
835 ) return he;
837 return NULL;
841 #if 0
843 Appendix: Understanding Affix Code
846 An affix is either a prefix or a suffix attached to root words to make
847 other words.
849 Basically a Prefix or a Suffix is set of AffEntry objects
850 which store information about the prefix or suffix along
851 with supporting routines to check if a word has a particular
852 prefix or suffix or a combination.
854 The structure affentry is defined as follows:
856 struct affentry
858 unsigned short aflag; // ID used to represent the affix
859 char * strip; // string to strip before adding affix
860 char * appnd; // the affix string to add
861 unsigned char stripl; // length of the strip string
862 unsigned char appndl; // length of the affix string
863 char numconds; // the number of conditions that must be met
864 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
865 char conds[SETSIZE]; // array which encodes the conditions to be met
869 Here is a suffix borrowed from the en_US.aff file. This file
870 is whitespace delimited.
872 SFX D Y 4
873 SFX D 0 e d
874 SFX D y ied [^aeiou]y
875 SFX D 0 ed [^ey]
876 SFX D 0 ed [aeiou]y
878 This information can be interpreted as follows:
880 The first line has 4 fields
882 Field
883 -----
884 1 SFX - indicates this is a suffix
885 2 D - is the name of the character flag which represents this suffix
886 3 Y - indicates it can be combined with prefixes (cross product)
887 4 4 - indicates that sequence of 4 affentry structures are needed to
888 properly store the affix information
890 The remaining lines describe the unique information for the 4 SfxEntry
891 objects that make up this affix. Each line can be interpreted
892 as follows: (note fields 1 and 2 are as a check against line 1 info)
894 Field
895 -----
896 1 SFX - indicates this is a suffix
897 2 D - is the name of the character flag for this affix
898 3 y - the string of chars to strip off before adding affix
899 (a 0 here indicates the NULL string)
900 4 ied - the string of affix characters to add
901 5 [^aeiou]y - the conditions which must be met before the affix
902 can be applied
904 Field 5 is interesting. Since this is a suffix, field 5 tells us that
905 there are 2 conditions that must be met. The first condition is that
906 the next to the last character in the word must *NOT* be any of the
907 following "a", "e", "i", "o" or "u". The second condition is that
908 the last character of the word must end in "y".
910 So how can we encode this information concisely and be able to
911 test for both conditions in a fast manner? The answer is found
912 by studying the wonderful ispell code of Geoff Kuenning, et al.
913 (now available under a normal BSD license).
915 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
916 using a character (cast to an unsigned char) of a string, we have 8 bits
917 of information we can store about that character. Specifically we
918 could use each bit to say if that character is allowed in any of the
919 last (or first for prefixes) 8 characters of the word.
921 Basically, each character at one end of the word (up to the number
922 of conditions) is used to index into the conds array and the resulting
923 value found there says whether that character is valid for a
924 specific character position in the word.
926 For prefixes, it does this by setting bit 0 if that char is valid
927 in the first position, bit 1 if valid in the second position, and so on.
929 If a bit is not set, then that char is not valid for that position in the
930 word.
932 If working with suffixes bit 0 is used for the character closest
933 to the front, bit 1 for the next character towards the end, ...,
934 with bit numconds-1 representing the last char at the end of the string.
936 Note: since entries in the conds[] are 8 bits, only 8 conditions
937 (read that only 8 character positions) can be examined at one
938 end of a word (the beginning for prefixes and the end for suffixes).
940 So to make this clearer, lets encode the conds array values for the
941 first two affentries for the suffix D described earlier.
944 For the first affentry:
945 numconds = 1 (only examine the last character)
947 conds['e'] = (1 << 0) (the word must end in an E)
948 all others are all 0
950 For the second affentry:
951 numconds = 2 (only examine the last two characters)
953 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
954 where X is all characters *but* a, e, i, o, or u
957 conds['y'] = (1 << 1) (the last char must be a y)
958 all other bits for all other entries in the conds array are zero
961 #endif