Updated hunspell to 1.3.2
[TortoiseGit.git] / ext / hunspell / hunspell.cxx
blob2d4f45150d606fe03250c6ab02c6829a1c1480da
1 #include "license.hunspell"
2 #include "license.myspell"
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdio.h>
8 #include "hunspell.hxx"
9 #include "hunspell.h"
10 #ifndef MOZILLA_CLIENT
11 # include "config.h"
12 #endif
13 #include "csutil.hxx"
15 Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
17 encoding = NULL;
18 csconv = NULL;
19 utf8 = 0;
20 complexprefixes = 0;
21 affixpath = mystrdup(affpath);
22 maxdic = 0;
24 /* first set up the hash manager */
25 pHMgr[0] = new HashMgr(dpath, affpath, key);
26 if (pHMgr[0]) maxdic = 1;
28 /* next set up the affix manager */
29 /* it needs access to the hash manager lookup methods */
30 pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
32 /* get the preferred try string and the dictionary */
33 /* encoding from the Affix Manager for that dictionary */
34 char * try_string = pAMgr->get_try_string();
35 encoding = pAMgr->get_encoding();
36 langnum = pAMgr->get_langnum();
37 utf8 = pAMgr->get_utf8();
38 if (!utf8)
39 csconv = get_current_cs(encoding);
40 complexprefixes = pAMgr->get_complexprefixes();
41 wordbreak = pAMgr->get_breaktable();
43 /* and finally set up the suggestion manager */
44 pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
45 if (try_string) free(try_string);
48 Hunspell::~Hunspell()
50 if (pSMgr) delete pSMgr;
51 if (pAMgr) delete pAMgr;
52 for (int i = 0; i < maxdic; i++) delete pHMgr[i];
53 maxdic = 0;
54 pSMgr = NULL;
55 pAMgr = NULL;
56 #ifdef MOZILLA_CLIENT
57 delete [] csconv;
58 #endif
59 csconv= NULL;
60 if (encoding) free(encoding);
61 encoding = NULL;
62 if (affixpath) free(affixpath);
63 affixpath = NULL;
66 // load extra dictionaries
67 int Hunspell::add_dic(const char * dpath, const char * key) {
68 if (maxdic == MAXDIC || !affixpath) return 1;
69 pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
70 if (pHMgr[maxdic]) maxdic++; else return 1;
71 return 0;
74 // make a copy of src at destination while removing all leading
75 // blanks and removing any trailing periods after recording
76 // their presence with the abbreviation flag
77 // also since already going through character by character,
78 // set the capitalization type
79 // return the length of the "cleaned" (and UTF-8 encoded) word
81 int Hunspell::cleanword2(char * dest, const char * src,
82 w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
84 unsigned char * p = (unsigned char *) dest;
85 const unsigned char * q = (const unsigned char * ) src;
87 // first skip over any leading blanks
88 while ((*q != '\0') && (*q == ' ')) q++;
90 // now strip off any trailing periods (recording their presence)
91 *pabbrev = 0;
92 int nl = strlen((const char *)q);
93 while ((nl > 0) && (*(q+nl-1)=='.')) {
94 nl--;
95 (*pabbrev)++;
98 // if no characters are left it can't be capitalized
99 if (nl <= 0) {
100 *pcaptype = NOCAP;
101 *p = '\0';
102 return 0;
105 strncpy(dest, (char *) q, nl);
106 *(dest + nl) = '\0';
107 nl = strlen(dest);
108 if (utf8) {
109 *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
110 // don't check too long words
111 if (*nc >= MAXWORDLEN) return 0;
112 if (*nc == -1) { // big Unicode character (non BMP area)
113 *pcaptype = NOCAP;
114 return nl;
116 *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
117 } else {
118 *pcaptype = get_captype(dest, nl, csconv);
119 *nc = nl;
121 return nl;
124 int Hunspell::cleanword(char * dest, const char * src,
125 int * pcaptype, int * pabbrev)
127 unsigned char * p = (unsigned char *) dest;
128 const unsigned char * q = (const unsigned char * ) src;
129 int firstcap = 0;
131 // first skip over any leading blanks
132 while ((*q != '\0') && (*q == ' ')) q++;
134 // now strip off any trailing periods (recording their presence)
135 *pabbrev = 0;
136 int nl = strlen((const char *)q);
137 while ((nl > 0) && (*(q+nl-1)=='.')) {
138 nl--;
139 (*pabbrev)++;
142 // if no characters are left it can't be capitalized
143 if (nl <= 0) {
144 *pcaptype = NOCAP;
145 *p = '\0';
146 return 0;
149 // now determine the capitalization type of the first nl letters
150 int ncap = 0;
151 int nneutral = 0;
152 int nc = 0;
154 if (!utf8) {
155 while (nl > 0) {
156 nc++;
157 if (csconv[(*q)].ccase) ncap++;
158 if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
159 *p++ = *q++;
160 nl--;
162 // remember to terminate the destination string
163 *p = '\0';
164 firstcap = csconv[(unsigned char)(*dest)].ccase;
165 } else {
166 unsigned short idx;
167 w_char t[MAXWORDLEN];
168 nc = u8_u16(t, MAXWORDLEN, src);
169 for (int i = 0; i < nc; i++) {
170 idx = (t[i].h << 8) + t[i].l;
171 unsigned short low = unicodetolower(idx, langnum);
172 if (idx != low) ncap++;
173 if (unicodetoupper(idx, langnum) == low) nneutral++;
175 u16_u8(dest, MAXWORDUTF8LEN, t, nc);
176 if (ncap) {
177 idx = (t[0].h << 8) + t[0].l;
178 firstcap = (idx != unicodetolower(idx, langnum));
182 // now finally set the captype
183 if (ncap == 0) {
184 *pcaptype = NOCAP;
185 } else if ((ncap == 1) && firstcap) {
186 *pcaptype = INITCAP;
187 } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
188 *pcaptype = ALLCAP;
189 } else if ((ncap > 1) && firstcap) {
190 *pcaptype = HUHINITCAP;
191 } else {
192 *pcaptype = HUHCAP;
194 return strlen(dest);
197 void Hunspell::mkallcap(char * p)
199 if (utf8) {
200 w_char u[MAXWORDLEN];
201 int nc = u8_u16(u, MAXWORDLEN, p);
202 unsigned short idx;
203 for (int i = 0; i < nc; i++) {
204 idx = (u[i].h << 8) + u[i].l;
205 if (idx != unicodetoupper(idx, langnum)) {
206 u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
207 u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
210 u16_u8(p, MAXWORDUTF8LEN, u, nc);
211 } else {
212 while (*p != '\0') {
213 *p = csconv[((unsigned char) *p)].cupper;
214 p++;
219 int Hunspell::mkallcap2(char * p, w_char * u, int nc)
221 if (utf8) {
222 unsigned short idx;
223 for (int i = 0; i < nc; i++) {
224 idx = (u[i].h << 8) + u[i].l;
225 unsigned short up = unicodetoupper(idx, langnum);
226 if (idx != up) {
227 u[i].h = (unsigned char) (up >> 8);
228 u[i].l = (unsigned char) (up & 0x00FF);
231 u16_u8(p, MAXWORDUTF8LEN, u, nc);
232 return strlen(p);
233 } else {
234 while (*p != '\0') {
235 *p = csconv[((unsigned char) *p)].cupper;
236 p++;
239 return nc;
243 void Hunspell::mkallsmall(char * p)
245 while (*p != '\0') {
246 *p = csconv[((unsigned char) *p)].clower;
247 p++;
251 int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
253 if (utf8) {
254 unsigned short idx;
255 for (int i = 0; i < nc; i++) {
256 idx = (u[i].h << 8) + u[i].l;
257 unsigned short low = unicodetolower(idx, langnum);
258 if (idx != low) {
259 u[i].h = (unsigned char) (low >> 8);
260 u[i].l = (unsigned char) (low & 0x00FF);
263 u16_u8(p, MAXWORDUTF8LEN, u, nc);
264 return strlen(p);
265 } else {
266 while (*p != '\0') {
267 *p = csconv[((unsigned char) *p)].clower;
268 p++;
271 return nc;
274 // convert UTF-8 sharp S codes to latin 1
275 char * Hunspell::sharps_u8_l1(char * dest, char * source) {
276 char * p = dest;
277 *p = *source;
278 for (p++, source++; *(source - 1); p++, source++) {
279 *p = *source;
280 if (*source == '\x9F') *--p = '\xDF';
282 return dest;
285 // recursive search for right ss - sharp s permutations
286 hentry * Hunspell::spellsharps(char * base, char * pos, int n,
287 int repnum, char * tmp, int * info, char **root) {
288 pos = strstr(pos, "ss");
289 if (pos && (n < MAXSHARPS)) {
290 *pos = '\xC3';
291 *(pos + 1) = '\x9F';
292 hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
293 if (h) return h;
294 *pos = 's';
295 *(pos + 1) = 's';
296 h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
297 if (h) return h;
298 } else if (repnum > 0) {
299 if (utf8) return checkword(base, info, root);
300 return checkword(sharps_u8_l1(tmp, base), info, root);
302 return NULL;
305 int Hunspell::is_keepcase(const hentry * rv) {
306 return pAMgr && rv->astr && pAMgr->get_keepcase() &&
307 TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
310 /* insert a word to the beginning of the suggestion array and return ns */
311 int Hunspell::insert_sug(char ***slst, char * word, int ns) {
312 char * dup = mystrdup(word);
313 if (!dup) return ns;
314 if (ns == MAXSUGGESTION) {
315 ns--;
316 free((*slst)[ns]);
318 for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
319 (*slst)[0] = dup;
320 return ns + 1;
// Check the spelling of one word.
//   word  NUL-terminated input; the affix manager's input conversion table
//         is applied first when there is one
//   info  optional; reset to 0 once the early checks have passed and then
//         updated with SPELL_* flags (SPELL_ORIGCAP, SPELL_WARN, and
//         whatever checkword() adds)
//   root  optional; set to NULL once the early checks have passed and
//         passed on to checkword()
// Returns 0 for a rejected word, 1 / HUNSPELL_OK for an accepted one and
// HUNSPELL_OK_WARN when the matching entry carries the warn flag.
// The early returns (SPELL_XML literal, over-long input, empty word, no
// dictionary) happen before *info and *root are touched.
323 int Hunspell::spell(const char * word, int * info, char ** root)
325 struct hentry * rv=NULL;
326 // need larger vector. For example, Turkish capital letter I converted a
327 // 2-byte UTF-8 character (dotless i) by mkallsmall.
328 char cw[MAXWORDUTF8LEN];
329 char wspace[MAXWORDUTF8LEN];
330 w_char unicw[MAXWORDLEN];
331 // Hunspell supports XML input of the simplified API (see manual)
332 if (strcmp(word, SPELL_XML) == 0) return 1;
333 int nc = strlen(word);
334 int wl2 = 0;
// over-long input is rejected; the limit depends on the encoding
335 if (utf8) {
336 if (nc >= MAXWORDUTF8LEN) return 0;
337 } else {
338 if (nc >= MAXWORDLEN) return 0;
340 int captype = 0;
341 int abbv = 0;
342 int wl = 0;
344 // input conversion
345 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
346 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
347 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
349 int info2 = 0;
// an empty cleaned word, or having no dictionary at all, counts as correct
350 if (wl == 0 || maxdic == 0) return 1;
351 if (root) *root = NULL;
353 // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
354 enum { NBEGIN, NNUM, NSEP };
355 int nstate = NBEGIN;
356 int i;
358 for (i = 0; (i < wl); i++) {
359 if ((cw[i] <= '9') && (cw[i] >= '0')) {
360 nstate = NNUM;
361 } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
362 if ((nstate == NSEP) || (i == 0)) break;
363 nstate = NSEP;
364 } else break;
// accepted only if the whole word was consumed and it ends in a digit
366 if ((i == wl) && (nstate == NNUM)) return 1;
// without a caller-supplied info the flags go to the local info2
367 if (!info) info = &info2; else *info = 0;
// HUHCAP and HUHINITCAP fall through into NOCAP; ALLCAP falls through into
// INITCAP when none of its own attempts found the word
369 switch(captype) {
370 case HUHCAP:
371 case HUHINITCAP:
372 *info += SPELL_ORIGCAP;
373 case NOCAP: {
374 rv = checkword(cw, info, root);
// retry with one period re-attached when periods were stripped
375 if ((abbv) && !(rv)) {
376 memcpy(wspace,cw,wl);
377 *(wspace+wl) = '.';
378 *(wspace+wl+1) = '\0';
379 rv = checkword(wspace, info, root);
381 break;
383 case ALLCAP: {
384 *info += SPELL_ORIGCAP;
385 rv = checkword(cw, info, root);
386 if (rv) break;
387 if (abbv) {
388 memcpy(wspace,cw,wl);
389 *(wspace+wl) = '.';
390 *(wspace+wl+1) = '\0';
391 rv = checkword(wspace, info, root);
392 if (rv) break;
394 // Spec. prefix handling for Catalan, French, Italian:
395 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
396 if (pAMgr && strchr(cw, '\'')) {
397 wl = mkallsmall2(cw, unicw, nc);
398 //There are no really sane circumstances where this could fail,
399 //but anyway...
400 if (char * apostrophe = strchr(cw, '\'')) {
401 if (utf8) {
402 w_char tmpword[MAXWORDLEN];
// count the characters in front of the apostrophe
403 *apostrophe = '\0';
404 wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
405 *apostrophe = '\'';
406 if (wl2 < nc) {
407 mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
408 rv = checkword(cw, info, root);
409 if (rv) break;
411 } else {
412 mkinitcap2(apostrophe + 1, unicw, nc);
413 rv = checkword(cw, info, root);
414 if (rv) break;
417 mkinitcap2(cw, unicw, nc);
418 rv = checkword(cw, info, root);
419 if (rv) break;
// CHECKSHARPS: try the sharp s permutations of an upper-case "SS"
421 if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
422 char tmpword[MAXWORDUTF8LEN];
423 wl = mkallsmall2(cw, unicw, nc);
424 memcpy(wspace,cw,(wl+1));
425 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
426 if (!rv) {
427 wl2 = mkinitcap2(cw, unicw, nc);
428 rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
430 if ((abbv) && !(rv)) {
431 *(wspace+wl) = '.';
432 *(wspace+wl+1) = '\0';
433 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
434 if (!rv) {
435 memcpy(wspace, cw, wl2);
436 *(wspace+wl2) = '.';
437 *(wspace+wl2+1) = '\0';
438 rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
441 if (rv) break;
444 case INITCAP: {
445 *info += SPELL_ORIGCAP;
// wspace keeps the all-lower-case form, cw gets the initial-capital form
446 wl = mkallsmall2(cw, unicw, nc);
447 memcpy(wspace,cw,(wl+1));
448 wl2 = mkinitcap2(cw, unicw, nc);
// SPELL_INITCAP is set only for the duration of this lookup
449 if (captype == INITCAP) *info += SPELL_INITCAP;
450 rv = checkword(cw, info, root);
451 if (captype == INITCAP) *info -= SPELL_INITCAP;
452 // forbid bad capitalization
453 // (for example, ijs -> Ijs instead of IJs in Dutch)
454 // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
455 if (*info & SPELL_FORBIDDEN) {
456 rv = NULL;
457 break;
459 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
460 if (rv) break;
462 rv = checkword(wspace, info, root);
463 if (abbv && !rv) {
465 *(wspace+wl) = '.';
466 *(wspace+wl+1) = '\0';
467 rv = checkword(wspace, info, root);
468 if (!rv) {
469 memcpy(wspace, cw, wl2);
470 *(wspace+wl2) = '.';
471 *(wspace+wl2+1) = '\0';
472 if (captype == INITCAP) *info += SPELL_INITCAP;
473 rv = checkword(wspace, info, root);
474 if (captype == INITCAP) *info -= SPELL_INITCAP;
475 if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
476 break;
479 if (rv && is_keepcase(rv) &&
480 ((captype == ALLCAP) ||
481 // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
482 // in INITCAP form, too.
483 !(pAMgr->get_checksharps() &&
484 ((utf8 && strstr(wspace, "\xC3\x9F")) ||
485 (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
486 break;
// a hit: report it, honouring the warn flag of the entry
490 if (rv) {
491 if (pAMgr && pAMgr->get_warn() && rv->astr &&
492 TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
493 *info += SPELL_WARN;
494 if (pAMgr->get_forbidwarn()) return 0;
495 return HUNSPELL_OK_WARN;
497 return HUNSPELL_OK;
500 // recursive breaking at break points
501 if (wordbreak) {
502 char * s;
503 char r;
504 int nbr = 0;
505 wl = strlen(cw);
506 int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
508 // calculate break points for recursion limit
509 for (int j = 0; j < numbreak; j++) {
510 s = cw;
511 do {
512 s = (char *) strstr(s, wordbreak[j]);
513 if (s) {
514 nbr++;
515 s++;
517 } while (s);
// words with 10 or more break points are rejected without recursing
519 if (nbr >= 10) return 0;
521 // check boundary patterns (^begin and end$)
522 for (int j = 0; j < numbreak; j++) {
523 int plen = strlen(wordbreak[j]);
524 if (plen == 1 || plen > wl) continue;
525 if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
526 && spell(cw + plen - 1)) return 1;
527 if (wordbreak[j][plen - 1] == '$' &&
528 strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
// cut the pattern off temporarily and check what is in front of it
529 r = cw[wl - plen + 1];
530 cw[wl - plen + 1] = '\0';
531 if (spell(cw)) return 1;
532 cw[wl - plen + 1] = r;
536 // other patterns
537 for (int j = 0; j < numbreak; j++) {
538 int plen = strlen(wordbreak[j]);
539 s=(char *) strstr(cw, wordbreak[j]);
// only the first occurrence is used, and only strictly inside the word
540 if (s && (s > cw) && (s < cw + wl - plen)) {
541 if (!spell(s + plen)) continue;
542 r = *s;
543 *s = '\0';
544 // examine 2 sides of the break point
545 if (spell(cw)) return 1;
546 *s = r;
548 // LANG_hu: spec. dash rule
549 if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
550 r = s[1];
551 s[1] = '\0';
552 if (spell(cw)) return 1; // check the first part with dash
553 s[1] = r;
555 // end of LANG specific region
561 return 0;
// Look up one already-cleaned word form: direct dictionary hit first, then
// affix stripping, then (if the affix file enables it) compounding.
//   w     NUL-terminated word form to look up
//   info  optional; SPELL_FORBIDDEN / SPELL_COMPOUND are added to it, and
//         its SPELL_INITCAP bit is read to reject ONLYUPCASEFLAG entries
//   root  optional; receives a mystrdup() copy of the matched entry's word
// Returns the matching entry or NULL.
// NOTE(review): pAMgr is dereferenced unconditionally (get_ignore,
// get_needaffix, get_compound) although other conditions here test it for
// NULL - confirm it can never be NULL at this point.
// NOTE(review): the strcpy() calls into w2[MAXWORDUTF8LEN] rely on the
// callers to bound the length of w.
564 struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
566 struct hentry * he = NULL;
567 int len, i;
568 char w2[MAXWORDUTF8LEN];
569 const char * word;
// with an ignore list, work on a copy with the ignored characters removed
571 char * ignoredchars = pAMgr->get_ignore();
572 if (ignoredchars != NULL) {
573 strcpy(w2, w);
574 if (utf8) {
575 int ignoredchars_utf16_len;
576 unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
577 remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
578 } else {
579 remove_ignored_chars(w2,ignoredchars);
581 word = w2;
582 } else word = w;
584 len = strlen(word);
586 if (!len)
587 return NULL;
589 // word reversing wrapper for complex prefixes
590 if (complexprefixes) {
// make sure the reversal happens on the private copy, never on w
591 if (word != w2) {
592 strcpy(w2, word);
593 word = w2;
595 if (utf8) reverseword_utf(w2); else reverseword(w2);
598 // look word in hash table
// the first dictionary that knows the word wins
599 for (i = 0; (i < maxdic) && !he; i ++) {
600 he = (pHMgr[i])->lookup(word);
602 // check forbidden and onlyincompound words
603 if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
604 if (info) *info += SPELL_FORBIDDEN;
605 // LANG_hu section: set dash information for suggestions
606 if (langnum == LANG_hu) {
607 if (pAMgr->get_compoundflag() &&
608 TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
609 if (info) *info += SPELL_COMPOUND;
612 return NULL;
615 // he = next not needaffix, onlyincompound homonym or onlyupcase word
616 while (he && (he->astr) &&
617 ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
618 (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
619 (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
620 )) he = he->next_homonym;
623 // check with affixes
624 if (!he && pAMgr) {
625 // try stripping off affixes */
626 he = pAMgr->affix_check(word, len, 0);
628 // check compound restriction and onlyupcase
629 if (he && he->astr && (
630 (pAMgr->get_onlyincompound() &&
631 TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
632 (info && (*info & SPELL_INITCAP) &&
633 TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
634 he = NULL;
637 if (he) {
638 if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
639 if (info) *info += SPELL_FORBIDDEN;
640 return NULL;
642 if (root) {
643 *root = mystrdup(he->word);
// the lookup ran on the reversed word, so turn the root back around
644 if (*root && complexprefixes) {
645 if (utf8) reverseword_utf(*root); else reverseword(*root);
648 // try check compound word
649 } else if (pAMgr->get_compound()) {
650 he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
651 // LANG_hu section: `moving rule' with last dash
652 if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
// retry on a heap copy without the trailing dash
653 char * dup = mystrdup(word);
654 if (!dup) return NULL;
655 dup[len-1] = '\0';
656 he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info);
657 free(dup);
659 // end of LANG specific region
660 if (he) {
661 if (root) {
662 *root = mystrdup(he->word);
663 if (*root && complexprefixes) {
664 if (utf8) reverseword_utf(*root); else reverseword(*root);
667 if (info) *info += SPELL_COMPOUND;
673 return he;
// Produce spelling suggestions for `word`.
//   slst  out-parameter for the suggestion array. It is left untouched when
//         there is no suggestion manager or no dictionary; otherwise it is
//         set to NULL first and is NULL again on return when no suggestion
//         survives the filtering steps at the end.
//   word  NUL-terminated input; XML requests of the simplified API are
//         forwarded to spellml()
// Returns the number of suggestions stored in *slst, 0 for over-long or
// empty input, and -1 when the array cannot be allocated in the FORCEUCASE
// branch.
// NOTE(review): the realloc() calls in this function are used without
// checking for failure.
676 int Hunspell::suggest(char*** slst, const char * word)
678 int onlycmpdsug = 0;
679 char cw[MAXWORDUTF8LEN];
680 char wspace[MAXWORDUTF8LEN];
681 if (!pSMgr || maxdic == 0) return 0;
682 w_char unicw[MAXWORDLEN];
683 *slst = NULL;
684 // process XML input of the simplified API (see manual)
685 if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
686 return spellml(slst, word);
688 int nc = strlen(word);
689 if (utf8) {
690 if (nc >= MAXWORDUTF8LEN) return 0;
691 } else {
692 if (nc >= MAXWORDLEN) return 0;
694 int captype = 0;
695 int abbv = 0;
696 int wl = 0;
698 // input conversion
699 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
700 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
701 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
703 if (wl == 0) return 0;
704 int ns = 0;
705 int capwords = 0;
707 // check capitalized form for FORCEUCASE
708 if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
709 int info = SPELL_ORIGCAP;
710 char ** wlst;
711 if (checkword(cw, &info, NULL)) {
712 if (*slst) {
713 wlst = *slst;
714 } else {
715 wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *));
716 if (wlst == NULL) return -1;
717 *slst = wlst;
718 for (int i = 0; i < MAXSUGGESTION; i++) {
719 wlst[i] = NULL;
// the only suggestion is the capitalized form of the word itself
// NOTE(review): the mystrdup() result is not checked before mkinitcap()
722 wlst[0] = mystrdup(cw);
723 mkinitcap(wlst[0]);
724 return 1;
// main suggestion pass, chosen by the capitalization of the input
728 switch(captype) {
729 case NOCAP: {
730 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
731 break;
734 case INITCAP: {
735 capwords = 1;
736 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
737 if (ns == -1) break;
738 memcpy(wspace,cw,(wl+1));
739 mkallsmall2(wspace, unicw, nc);
740 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
741 break;
// HUHINITCAP only sets capwords and then falls through into HUHCAP
743 case HUHINITCAP:
744 capwords = 1;
745 case HUHCAP: {
746 ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
747 if (ns != -1) {
748 int prevns;
749 // something.The -> something. The
750 char * dot = strchr(cw, '.');
751 if (dot && (dot > cw)) {
752 int captype_;
753 if (utf8) {
754 w_char w_[MAXWORDLEN];
755 int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
756 captype_ = get_captype_utf8(w_, wl_, langnum);
757 } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
758 if (captype_ == INITCAP) {
759 char * st = mystrdup(cw);
// grow the copy by one byte for the inserted space
// NOTE(review): if realloc() fails, st becomes NULL and the mystrdup()
// buffer is no longer reachable
760 if (st) st = (char *) realloc(st, wl + 2);
761 if (st) {
762 st[(dot - cw) + 1] = ' ';
763 strcpy(st + (dot - cw) + 2, dot + 1);
764 ns = insert_sug(slst, st, ns);
765 free(st);
769 if (captype == HUHINITCAP) {
770 // TheOpenOffice.org -> The OpenOffice.org
771 memcpy(wspace,cw,(wl+1));
772 mkinitsmall2(wspace, unicw, nc);
773 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
775 memcpy(wspace,cw,(wl+1));
776 mkallsmall2(wspace, unicw, nc);
777 if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
// suggestions added from here on are re-examined by the loop below
778 prevns = ns;
779 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
780 if (captype == HUHINITCAP) {
781 mkinitcap2(wspace, unicw, nc);
782 if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
783 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
785 // aNew -> "a New" (instead of "a new")
786 for (int j = prevns; j < ns; j++) {
787 char * space = strchr((*slst)[j],' ');
788 if (space) {
789 int slen = strlen(space + 1);
790 // different case after space (need capitalisation)
791 if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
792 w_char w[MAXWORDLEN];
793 int wc = 0;
794 char * r = (*slst)[j];
795 if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
796 mkinitcap2(space + 1, w, wc);
797 // set as first suggestion
798 for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
799 (*slst)[0] = r;
804 break;
807 case ALLCAP: {
808 memcpy(wspace, cw, (wl+1));
809 mkallsmall2(wspace, unicw, nc);
810 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
811 if (ns == -1) break;
812 if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
813 ns = insert_sug(slst, wspace, ns);
814 mkinitcap2(wspace, unicw, nc);
815 ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
// return the suggestions in the capitalization of the input
816 for (int j=0; j < ns; j++) {
817 mkallcap((*slst)[j]);
818 if (pAMgr && pAMgr->get_checksharps()) {
819 char * pos;
820 if (utf8) {
// the two-byte sharp s is overwritten in place with "SS"
821 pos = strstr((*slst)[j], "\xC3\x9F");
822 while (pos) {
823 *pos = 'S';
824 *(pos+1) = 'S';
825 pos = strstr(pos+2, "\xC3\x9F");
827 } else {
// the one-byte sharp s needs one more byte per replacement
828 pos = strchr((*slst)[j], '\xDF');
829 while (pos) {
830 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
831 mystrrep((*slst)[j], "\xDF", "SS");
832 pos = strchr((*slst)[j], '\xDF');
837 break;
841 // LANG_hu section: replace '-' with ' ' in Hungarian
842 if (langnum == LANG_hu) {
843 for (int j=0; j < ns; j++) {
844 char * pos = strchr((*slst)[j],'-');
845 if (pos) {
846 int info;
847 char w[MAXWORDUTF8LEN];
// spell-check the suggestion with its first dash removed
848 *pos = '\0';
849 strcpy(w, (*slst)[j]);
850 strcat(w, pos + 1);
851 spell(w, &info, NULL);
852 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
853 *pos = ' ';
854 } else *pos = '-';
858 // END OF LANG_hu section
860 // try ngram approach since found nothing or only compound words
861 if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) {
862 switch(captype) {
863 case NOCAP: {
864 ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
865 break;
867 case HUHINITCAP:
868 capwords = 1;
869 case HUHCAP: {
870 memcpy(wspace,cw,(wl+1));
871 mkallsmall2(wspace, unicw, nc);
872 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
873 break;
875 case INITCAP: {
876 capwords = 1;
877 memcpy(wspace,cw,(wl+1));
878 mkallsmall2(wspace, unicw, nc);
879 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
880 break;
882 case ALLCAP: {
883 memcpy(wspace,cw,(wl+1));
884 mkallsmall2(wspace, unicw, nc);
885 int oldns = ns;
886 ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
// only the newly added ngram suggestions still need upper-casing
887 for (int j = oldns; j < ns; j++)
888 mkallcap((*slst)[j]);
889 break;
894 // try dash suggestion (Afo-American -> Afro-American)
895 if (char * pos = strchr(cw, '-')) {
896 char * ppos = cw;
897 int nodashsug = 1;
898 char ** nlst = NULL;
899 int nn = 0;
900 int last = 0;
// skipped when an existing suggestion already contains a dash
901 if (*slst) {
902 for (int j = 0; j < ns && nodashsug == 1; j++) {
903 if (strchr((*slst)[j], '-')) nodashsug = 0;
// walk the dash-separated parts; the first misspelled part gets its own
// suggestions (recursive call) and ends the search
906 while (nodashsug && !last) {
907 if (*pos == '\0') last = 1; else *pos = '\0';
908 if (!spell(ppos)) {
909 nn = suggest(&nlst, ppos);
910 for (int j = nn - 1; j >= 0; j--) {
// rebuild the word as: text before the part + suggestion + rest
911 strncpy(wspace, cw, ppos - cw);
912 strcpy(wspace + (ppos - cw), nlst[j]);
913 if (!last) {
914 strcat(wspace, "-");
915 strcat(wspace, pos + 1);
917 ns = insert_sug(slst, wspace, ns);
918 free(nlst[j]);
920 if (nlst != NULL) free(nlst);
921 nodashsug = 0;
923 if (!last) {
924 *pos = '-';
925 ppos = pos + 1;
926 pos = strchr(ppos, '-');
928 if (!pos) pos = cw + strlen(cw);
932 // word reversing wrapper for complex prefixes
933 if (complexprefixes) {
934 for (int j = 0; j < ns; j++) {
935 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
939 // capitalize
940 if (capwords) for (int j=0; j < ns; j++) {
941 mkinitcap((*slst)[j]);
944 // expand suggestions with dot(s)
945 if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
946 for (int j = 0; j < ns; j++) {
// append the last abbv characters of the original input
947 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
948 strcat((*slst)[j], word + strlen(word) - abbv);
952 // remove bad capitalized and forbidden forms
953 if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
954 switch (captype) {
955 case INITCAP:
956 case ALLCAP: {
// l is the write index of the compacted list
957 int l = 0;
958 for (int j=0; j < ns; j++) {
959 if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
960 char s[MAXSWUTF8L];
961 w_char w[MAXSWL];
962 int len;
963 if (utf8) {
964 len = u8_u16(w, MAXSWL, (*slst)[j]);
965 } else {
966 strcpy(s, (*slst)[j]);
967 len = strlen(s);
// a rejected suggestion is retried in lower case, then with an initial
// capital, and dropped if neither form is accepted
969 mkallsmall2(s, w, len);
970 free((*slst)[j]);
971 if (spell(s)) {
972 (*slst)[l] = mystrdup(s);
973 if ((*slst)[l]) l++;
974 } else {
975 mkinitcap2(s, w, len);
976 if (spell(s)) {
977 (*slst)[l] = mystrdup(s);
978 if ((*slst)[l]) l++;
981 } else {
982 (*slst)[l] = (*slst)[j];
983 l++;
986 ns = l;
991 // remove duplications
992 int l = 0;
993 for (int j = 0; j < ns; j++) {
994 (*slst)[l] = (*slst)[j];
995 for (int k = 0; k < l; k++) {
996 if (strcmp((*slst)[k], (*slst)[j]) == 0) {
997 free((*slst)[j]);
998 l--;
999 break;
1002 l++;
1004 ns = l;
1006 // output conversion
1007 rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
1008 for (int j = 0; rl && j < ns; j++) {
1009 if (rl->conv((*slst)[j], wspace)) {
1010 free((*slst)[j]);
1011 (*slst)[j] = mystrdup(wspace);
1015 // if suggestions removed by nosuggest, onlyincompound parameters
1016 if (l == 0 && *slst) {
1017 free(*slst);
1018 *slst = NULL;
1020 return l;
1023 void Hunspell::free_list(char *** slst, int n) {
1024 freelist(slst, n);
1027 char * Hunspell::get_dic_encoding()
1029 return encoding;
1032 #ifdef HUNSPELL_EXPERIMENTAL
1033 // XXX need UTF-8 support
// Experimental variant of suggest() built on SuggestMgr::suggest_auto().
//   slst  out-parameter for the suggestion array; left untouched on the
//         early returns, otherwise reset to NULL before the search
//   word  NUL-terminated input word
// Returns the number of suggestions, or 0 when there is no suggestion
// manager or dictionary, or the input is over-long or empty.
// NOTE(review): the case conversions here use mkallsmall(), which is
// byte-wise only; this seems to be what the XXX note above refers to.
1034 int Hunspell::suggest_auto(char*** slst, const char * word)
1036 char cw[MAXWORDUTF8LEN];
1037 char wspace[MAXWORDUTF8LEN];
1038 if (!pSMgr || maxdic == 0) return 0;
1039 int wl = strlen(word);
1040 if (utf8) {
1041 if (wl >= MAXWORDUTF8LEN) return 0;
1042 } else {
1043 if (wl >= MAXWORDLEN) return 0;
1045 int captype = 0;
1046 int abbv = 0;
1047 wl = cleanword(cw, word, &captype, &abbv);
1048 if (wl == 0) return 0;
1049 int ns = 0;
1050 *slst = NULL; // HU, nsug in pSMgr->suggest
1052 switch(captype) {
1053 case NOCAP: {
1054 ns = pSMgr->suggest_auto(slst, cw, ns);
// both paths leave the switch; the conditional break is redundant
1055 if (ns>0) break;
1056 break;
1059 case INITCAP: {
// suggest for the lower-case form, capitalize those results, then add
// suggestions for the word as typed
1060 memcpy(wspace,cw,(wl+1));
1061 mkallsmall(wspace);
1062 ns = pSMgr->suggest_auto(slst, wspace, ns);
1063 for (int j=0; j < ns; j++)
1064 mkinitcap((*slst)[j]);
1065 ns = pSMgr->suggest_auto(slst, cw, ns);
1066 break;
1070 case HUHINITCAP:
1071 case HUHCAP: {
1072 ns = pSMgr->suggest_auto(slst, cw, ns);
// the lower-case form is tried only when the original form gave nothing
1073 if (ns == 0) {
1074 memcpy(wspace,cw,(wl+1));
1075 mkallsmall(wspace);
1076 ns = pSMgr->suggest_auto(slst, wspace, ns);
1078 break;
1081 case ALLCAP: {
1082 memcpy(wspace,cw,(wl+1));
1083 mkallsmall(wspace);
1084 ns = pSMgr->suggest_auto(slst, wspace, ns);
1086 mkinitcap(wspace);
1087 ns = pSMgr->suggest_auto(slst, wspace, ns);
// return the suggestions in the capitalization of the input
1089 for (int j=0; j < ns; j++)
1090 mkallcap((*slst)[j]);
1091 break;
1095 // word reversing wrapper for complex prefixes
1096 if (complexprefixes) {
1097 for (int j = 0; j < ns; j++) {
1098 if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
1102 // expand suggestions with dot(s)
1103 if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
1104 for (int j = 0; j < ns; j++) {
// NOTE(review): the realloc() result is used without a failure check
1105 (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
1106 strcat((*slst)[j], word + strlen(word) - abbv);
1110 // LANG_hu section: replace '-' with ' ' in Hungarian
1111 if (langnum == LANG_hu) {
1112 for (int j=0; j < ns; j++) {
1113 char * pos = strchr((*slst)[j],'-');
1114 if (pos) {
1115 int info;
1116 char w[MAXWORDUTF8LEN];
// spell-check the suggestion with its first dash removed
1117 *pos = '\0';
1118 strcpy(w, (*slst)[j]);
1119 strcat(w, pos + 1);
1120 spell(w, &info, NULL);
1121 if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
1122 *pos = ' ';
1123 } else *pos = '-';
1127 // END OF LANG_hu section
1128 return ns;
1130 #endif
// Build the list of stems from n morphological descriptions.
//   slst  out-parameter; set to NULL first, then filled by line_tok() from
//         the collected results
//   desc  array of n NUL-terminated morphological description strings
//   n     number of entries in desc
// Returns the value of uniqlist() applied to the resulting list, or 0 when
// n is 0.
// NOTE(review): result, result2 and tok are fixed MAXLNLEN buffers filled
// with strcpy()/sprintf()/copy_field() without a length check here.
1132 int Hunspell::stem(char*** slst, char ** desc, int n)
1134 char result[MAXLNLEN];
1135 char result2[MAXLNLEN];
1136 *slst = NULL;
1137 if (n == 0) return 0;
// result2 accumulates the stems of all descriptions, each preceded by
// MSEP_REC
1138 *result2 = '\0';
1139 for (int i = 0; i < n; i++) {
// result holds the compound parts in front of the last part of desc[i]
1140 *result = '\0';
1141 // add compound word parts (except the last one)
1142 char * s = (char *) desc[i];
1143 char * part = strstr(s, MORPH_PART);
1144 if (part) {
1145 char * nextpart = strstr(part + 1, MORPH_PART);
1146 while (nextpart) {
1147 copy_field(result + strlen(result), part, MORPH_PART);
1148 part = nextpart;
1149 nextpart = strstr(part + 1, MORPH_PART);
// continue with the last part only
1151 s = part;
1154 char **pl;
1155 char tok[MAXLNLEN];
1156 strcpy(tok, s);
// turn every " | " into a MSEP_ALT separator so line_tok() can split the
// alternatives
1157 char * alt = strstr(tok, " | ");
1158 while (alt) {
1159 alt[1] = MSEP_ALT;
1160 alt = strstr(alt, " | ");
1162 int pln = line_tok(tok, &pl, MSEP_ALT);
1163 for (int k = 0; k < pln; k++) {
1164 // add derivational suffixes
1165 if (strstr(pl[k], MORPH_DERI_SFX)) {
1166 // remove inflectional suffixes
1167 char * is = strstr(pl[k], MORPH_INFL_SFX);
1168 if (is) *is = '\0';
1169 char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
1170 if (sg) {
1171 char ** gen;
1172 int genl = line_tok(sg, &gen, MSEP_REC);
1173 free(sg);
// every generated form becomes a record of its own, prefixed with the
// compound parts collected above
1174 for (int j = 0; j < genl; j++) {
1175 sprintf(result2 + strlen(result2), "%c%s%s",
1176 MSEP_REC, result, gen[j]);
1178 freelist(&gen, genl);
1180 } else {
// no derivational suffix: record = compound parts + surface prefix field
// (if present) + stem field
1181 sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
1182 if (strstr(pl[k], MORPH_SURF_PFX)) {
1183 copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
1185 copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
1188 freelist(&pl, pln);
1190 int sln = line_tok(result2, slst, MSEP_REC);
1191 return uniqlist(*slst, sln);
1195 int Hunspell::stem(char*** slst, const char * word)
1197 char ** pl;
1198 int pln = analyze(&pl, word);
1199 int pln2 = stem(slst, pl, pln);
1200 freelist(&pl, pln);
1201 return pln2;
1204 #ifdef HUNSPELL_EXPERIMENTAL
1205 int Hunspell::suggest_pos_stems(char*** slst, const char * word)
1207 char cw[MAXWORDUTF8LEN];
1208 char wspace[MAXWORDUTF8LEN];
1209 if (! pSMgr || maxdic == 0) return 0;
1210 int wl = strlen(word);
1211 if (utf8) {
1212 if (wl >= MAXWORDUTF8LEN) return 0;
1213 } else {
1214 if (wl >= MAXWORDLEN) return 0;
1216 int captype = 0;
1217 int abbv = 0;
1218 wl = cleanword(cw, word, &captype, &abbv);
1219 if (wl == 0) return 0;
1221 int ns = 0; // ns=0 = normalized input
1223 *slst = NULL; // HU, nsug in pSMgr->suggest
1225 switch(captype) {
1226 case HUHCAP:
1227 case NOCAP: {
1228 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1230 if ((abbv) && (ns == 0)) {
1231 memcpy(wspace,cw,wl);
1232 *(wspace+wl) = '.';
1233 *(wspace+wl+1) = '\0';
1234 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1237 break;
1240 case INITCAP: {
1242 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1244 if (ns == 0 || ((*slst)[0][0] == '#')) {
1245 memcpy(wspace,cw,(wl+1));
1246 mkallsmall(wspace);
1247 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1250 break;
1254 case ALLCAP: {
1255 ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1256 if (ns != 0) break;
1258 memcpy(wspace,cw,(wl+1));
1259 mkallsmall(wspace);
1260 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1262 if (ns == 0) {
1263 mkinitcap(wspace);
1264 ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1266 break;
1270 return ns;
1272 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1274 const char * Hunspell::get_wordchars()
1276 return pAMgr->get_wordchars();
1279 unsigned short * Hunspell::get_wordchars_utf16(int * len)
1281 return pAMgr->get_wordchars_utf16(len);
1284 void Hunspell::mkinitcap(char * p)
1286 if (!utf8) {
1287 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1288 } else {
1289 int len;
1290 w_char u[MAXWORDLEN];
1291 len = u8_u16(u, MAXWORDLEN, p);
1292 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1293 u[0].h = (unsigned char) (i >> 8);
1294 u[0].l = (unsigned char) (i & 0x00FF);
1295 u16_u8(p, MAXWORDUTF8LEN, u, len);
1299 int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
1301 if (!utf8) {
1302 if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1303 } else if (nc > 0) {
1304 unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1305 u[0].h = (unsigned char) (i >> 8);
1306 u[0].l = (unsigned char) (i & 0x00FF);
1307 u16_u8(p, MAXWORDUTF8LEN, u, nc);
1308 return strlen(p);
1310 return nc;
1313 int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
1315 if (!utf8) {
1316 if (*p != '\0') *p = csconv[((unsigned char)*p)].clower;
1317 } else if (nc > 0) {
1318 unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum);
1319 u[0].h = (unsigned char) (i >> 8);
1320 u[0].l = (unsigned char) (i & 0x00FF);
1321 u16_u8(p, MAXWORDUTF8LEN, u, nc);
1322 return strlen(p);
1324 return nc;
1327 int Hunspell::add(const char * word)
1329 if (pHMgr[0]) return (pHMgr[0])->add(word);
1330 return 0;
1333 int Hunspell::add_with_affix(const char * word, const char * example)
1335 if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example);
1336 return 0;
1339 int Hunspell::remove(const char * word)
1341 if (pHMgr[0]) return (pHMgr[0])->remove(word);
1342 return 0;
1345 const char * Hunspell::get_version()
1347 return pAMgr->get_version();
1350 struct cs_info * Hunspell::get_csconv()
1352 return csconv;
1355 void Hunspell::cat_result(char * result, char * st)
1357 if (st) {
1358 if (*result) mystrcat(result, "\n", MAXLNLEN);
1359 mystrcat(result, st, MAXLNLEN);
1360 free(st);
1364 int Hunspell::analyze(char*** slst, const char * word)
1366 char cw[MAXWORDUTF8LEN];
1367 char wspace[MAXWORDUTF8LEN];
1368 w_char unicw[MAXWORDLEN];
1369 int wl2 = 0;
1370 *slst = NULL;
1371 if (! pSMgr || maxdic == 0) return 0;
1372 int nc = strlen(word);
1373 if (utf8) {
1374 if (nc >= MAXWORDUTF8LEN) return 0;
1375 } else {
1376 if (nc >= MAXWORDLEN) return 0;
1378 int captype = 0;
1379 int abbv = 0;
1380 int wl = 0;
1382 // input conversion
1383 RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
1384 if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
1385 else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
1387 if (wl == 0) {
1388 if (abbv) {
1389 for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
1390 cw[wl] = '\0';
1391 abbv = 0;
1392 } else return 0;
1395 char result[MAXLNLEN];
1396 char * st = NULL;
1398 *result = '\0';
1400 int n = 0;
1401 int n2 = 0;
1402 int n3 = 0;
1404 // test numbers
1405 // LANG_hu section: set dash information for suggestions
1406 if (langnum == LANG_hu) {
1407 while ((n < wl) &&
1408 (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
1409 n++;
1410 if ((cw[n] == '.') || (cw[n] == ',')) {
1411 if (((n2 == 0) && (n > 3)) ||
1412 ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
1413 n2++;
1414 n3 = n;
1418 if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
1419 if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
1420 mystrcat(result, cw, MAXLNLEN);
1421 result[n - 1] = '\0';
1422 if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1423 else {
1424 char sign = cw[n];
1425 cw[n] = '\0';
1426 cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1427 mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
1428 cw[n] = sign;
1429 cat_result(result, pSMgr->suggest_morph(cw + n));
1431 return line_tok(result, slst, MSEP_REC);
1434 // END OF LANG_hu section
1436 switch(captype) {
1437 case HUHCAP:
1438 case HUHINITCAP:
1439 case NOCAP: {
1440 cat_result(result, pSMgr->suggest_morph(cw));
1441 if (abbv) {
1442 memcpy(wspace,cw,wl);
1443 *(wspace+wl) = '.';
1444 *(wspace+wl+1) = '\0';
1445 cat_result(result, pSMgr->suggest_morph(wspace));
1447 break;
1449 case INITCAP: {
1450 wl = mkallsmall2(cw, unicw, nc);
1451 memcpy(wspace,cw,(wl+1));
1452 wl2 = mkinitcap2(cw, unicw, nc);
1453 cat_result(result, pSMgr->suggest_morph(wspace));
1454 cat_result(result, pSMgr->suggest_morph(cw));
1455 if (abbv) {
1456 *(wspace+wl) = '.';
1457 *(wspace+wl+1) = '\0';
1458 cat_result(result, pSMgr->suggest_morph(wspace));
1460 memcpy(wspace, cw, wl2);
1461 *(wspace+wl2) = '.';
1462 *(wspace+wl2+1) = '\0';
1464 cat_result(result, pSMgr->suggest_morph(wspace));
1466 break;
1468 case ALLCAP: {
1469 cat_result(result, pSMgr->suggest_morph(cw));
1470 if (abbv) {
1471 memcpy(wspace,cw,wl);
1472 *(wspace+wl) = '.';
1473 *(wspace+wl+1) = '\0';
1474 cat_result(result, pSMgr->suggest_morph(cw));
1476 wl = mkallsmall2(cw, unicw, nc);
1477 memcpy(wspace,cw,(wl+1));
1478 wl2 = mkinitcap2(cw, unicw, nc);
1480 cat_result(result, pSMgr->suggest_morph(wspace));
1481 cat_result(result, pSMgr->suggest_morph(cw));
1482 if (abbv) {
1483 *(wspace+wl) = '.';
1484 *(wspace+wl+1) = '\0';
1485 cat_result(result, pSMgr->suggest_morph(wspace));
1487 memcpy(wspace, cw, wl2);
1488 *(wspace+wl2) = '.';
1489 *(wspace+wl2+1) = '\0';
1491 cat_result(result, pSMgr->suggest_morph(wspace));
1493 break;
1497 if (*result) {
1498 // word reversing wrapper for complex prefixes
1499 if (complexprefixes) {
1500 if (utf8) reverseword_utf(result); else reverseword(result);
1502 return line_tok(result, slst, MSEP_REC);
1505 // compound word with dash (HU) I18n
1506 char * dash = NULL;
1507 int nresult = 0;
1508 // LANG_hu section: set dash information for suggestions
1509 if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
1510 if ((langnum == LANG_hu) && dash) {
1511 *dash='\0';
1512 // examine 2 sides of the dash
1513 if (dash[1] == '\0') { // base word ending with dash
1514 if (spell(cw)) {
1515 char * p = pSMgr->suggest_morph(cw);
1516 if (p) {
1517 int ret = line_tok(p, slst, MSEP_REC);
1518 free(p);
1519 return ret;
1523 } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
1524 if (spell(cw) && (spell("-e"))) {
1525 st = pSMgr->suggest_morph(cw);
1526 if (st) {
1527 mystrcat(result, st, MAXLNLEN);
1528 free(st);
1530 mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
1531 st = pSMgr->suggest_morph("-e");
1532 if (st) {
1533 mystrcat(result, st, MAXLNLEN);
1534 free(st);
1536 return line_tok(result, slst, MSEP_REC);
1538 } else {
1539 // first word ending with dash: word- XXX ???
1540 char r2 = *(dash + 1);
1541 dash[0]='-';
1542 dash[1]='\0';
1543 nresult = spell(cw);
1544 dash[1] = r2;
1545 dash[0]='\0';
1546 if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
1547 ((dash[1] > '0') && (dash[1] < '9')))) {
1548 st = pSMgr->suggest_morph(cw);
1549 if (st) {
1550 mystrcat(result, st, MAXLNLEN);
1551 free(st);
1552 mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
1554 st = pSMgr->suggest_morph(dash+1);
1555 if (st) {
1556 mystrcat(result, st, MAXLNLEN);
1557 free(st);
1559 return line_tok(result, slst, MSEP_REC);
1562 // affixed number in correct word
1563 if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
1564 (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
1565 *dash='-';
1566 n = 1;
1567 if (*(dash - n) == '.') n++;
1568 // search first not a number character to left from dash
1569 while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
1570 n++;
1572 if ((dash - n) < cw) n--;
1573 // numbers: valami1000000-hoz
1574 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1575 // 56-hoz, 6-hoz
1576 for(; n >= 1; n--) {
1577 if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
1578 mystrcat(result, cw, MAXLNLEN);
1579 result[dash - cw - n] = '\0';
1580 st = pSMgr->suggest_morph(dash - n);
1581 if (st) {
1582 mystrcat(result, st, MAXLNLEN);
1583 free(st);
1585 return line_tok(result, slst, MSEP_REC);
1590 return 0;
1593 int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
1595 *slst = NULL;
1596 if (!pSMgr || !pln) return 0;
1597 char **pl2;
1598 int pl2n = analyze(&pl2, word);
1599 int captype = 0;
1600 int abbv = 0;
1601 char cw[MAXWORDUTF8LEN];
1602 cleanword(cw, word, &captype, &abbv);
1603 char result[MAXLNLEN];
1604 *result = '\0';
1606 for (int i = 0; i < pln; i++) {
1607 cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i]));
1609 freelist(&pl2, pl2n);
1611 if (*result) {
1612 // allcap
1613 if (captype == ALLCAP) mkallcap(result);
1615 // line split
1616 int linenum = line_tok(result, slst, MSEP_REC);
1618 // capitalize
1619 if (captype == INITCAP || captype == HUHINITCAP) {
1620 for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]);
1623 // temporary filtering of prefix related errors (eg.
1624 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
1626 int r = 0;
1627 for (int j=0; j < linenum; j++) {
1628 if (!spell((*slst)[j])) {
1629 free((*slst)[j]);
1630 (*slst)[j] = NULL;
1631 } else {
1632 if (r < j) (*slst)[r] = (*slst)[j];
1633 r++;
1636 if (r > 0) return r;
1637 free(*slst);
1638 *slst = NULL;
1640 return 0;
1643 int Hunspell::generate(char*** slst, const char * word, const char * pattern)
1645 char **pl;
1646 int pln = analyze(&pl, pattern);
1647 int n = generate(slst, word, pl, pln);
1648 freelist(&pl, pln);
1649 return uniqlist(*slst, n);
1652 // minimal XML parser functions
1653 int Hunspell::get_xml_par(char * dest, const char * par, int max)
1655 char * d = dest;
1656 if (!par) return 0;
1657 char end = *par;
1658 char * dmax = dest + max;
1659 if (end == '>') end = '<';
1660 else if (end != '\'' && end != '"') return 0; // bad XML
1661 for (par++; d < dmax && *par != '\0' && *par != end; par++, d++) *d = *par;
1662 *d = '\0';
1663 mystrrep(dest, "&lt;", "<");
1664 mystrrep(dest, "&amp;", "&");
1665 return (int)(d - dest);
1668 int Hunspell::get_langnum() const
1670 return langnum;
1673 // return the beginning of the element (attr == NULL) or the attribute
1674 const char * Hunspell::get_xml_pos(const char * s, const char * attr)
1676 const char * end = strchr(s, '>');
1677 const char * p = s;
1678 if (attr == NULL) return end;
1679 do {
1680 p = strstr(p, attr);
1681 if (!p || p >= end) return 0;
1682 } while (*(p-1) != ' ' && *(p-1) != '\n');
1683 return p + strlen(attr);
1686 int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
1687 char cw[MAXWORDUTF8LEN];
1688 if (get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1) &&
1689 strcmp(cw, value) == 0) return 1;
1690 return 0;
1693 int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
1694 int n = 0;
1695 char * p;
1696 if (!list) return 0;
1697 for (p = list; (p = strstr(p, tag)); p++) n++;
1698 if (n == 0) return 0;
1699 *slst = (char **) malloc(sizeof(char *) * n);
1700 if (!*slst) return 0;
1701 for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) {
1702 int l = strlen(p);
1703 (*slst)[n] = (char *) malloc(l + 1);
1704 if (!(*slst)[n]) return n;
1705 if (!get_xml_par((*slst)[n], p + strlen(tag) - 1, l)) {
1706 free((*slst)[n]);
1707 break;
1710 return n;
1713 int Hunspell::spellml(char*** slst, const char * word)
1715 char *q, *q2;
1716 char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
1717 q = (char *) strstr(word, "<query");
1718 if (!q) return 0; // bad XML input
1719 q2 = strchr(q, '>');
1720 if (!q2) return 0; // bad XML input
1721 q2 = strstr(q2, "<word");
1722 if (!q2) return 0; // bad XML input
1723 if (check_xml_par(q, "type=", "analyze")) {
1724 int n = 0, s = 0;
1725 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw);
1726 if (n == 0) return 0;
1727 // convert the result to <code><a>ana1</a><a>ana2</a></code> format
1728 for (int i = 0; i < n; i++) s+= strlen((*slst)[i]);
1729 char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp;
1730 if (!r) return 0;
1731 strcpy(r, "<code>");
1732 for (int i = 0; i < n; i++) {
1733 int l = strlen(r);
1734 strcpy(r + l, "<a>");
1735 strcpy(r + l + 3, (*slst)[i]);
1736 mystrrep(r + l + 3, "\t", " ");
1737 mystrrep(r + l + 3, "<", "&lt;");
1738 mystrrep(r + l + 3, "&", "&amp;");
1739 strcat(r, "</a>");
1740 free((*slst)[i]);
1742 strcat(r, "</code>");
1743 (*slst)[0] = r;
1744 return 1;
1745 } else if (check_xml_par(q, "type=", "stem")) {
1746 if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw);
1747 } else if (check_xml_par(q, "type=", "generate")) {
1748 int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
1749 if (n == 0) return 0;
1750 char * q3 = strstr(q2 + 1, "<word");
1751 if (q3) {
1752 if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
1753 return generate(slst, cw, cw2);
1755 } else {
1756 if ((q2 = strstr(q2 + 1, "<code"))) {
1757 char ** slst2;
1758 if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) {
1759 int n2 = generate(slst, cw, slst2, n);
1760 freelist(&slst2, n);
1761 return uniqlist(*slst, n2);
1763 freelist(&slst2, n);
1767 return 0;
1771 #ifdef HUNSPELL_EXPERIMENTAL
1772 // XXX need UTF-8 support
1773 char * Hunspell::morph_with_correction(const char * word)
1775 char cw[MAXWORDUTF8LEN];
1776 char wspace[MAXWORDUTF8LEN];
1777 if (! pSMgr || maxdic == 0) return NULL;
1778 int wl = strlen(word);
1779 if (utf8) {
1780 if (wl >= MAXWORDUTF8LEN) return NULL;
1781 } else {
1782 if (wl >= MAXWORDLEN) return NULL;
1784 int captype = 0;
1785 int abbv = 0;
1786 wl = cleanword(cw, word, &captype, &abbv);
1787 if (wl == 0) return NULL;
1789 char result[MAXLNLEN];
1790 char * st = NULL;
1792 *result = '\0';
1795 switch(captype) {
1796 case NOCAP: {
1797 st = pSMgr->suggest_morph_for_spelling_error(cw);
1798 if (st) {
1799 mystrcat(result, st, MAXLNLEN);
1800 free(st);
1802 if (abbv) {
1803 memcpy(wspace,cw,wl);
1804 *(wspace+wl) = '.';
1805 *(wspace+wl+1) = '\0';
1806 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1807 if (st) {
1808 if (*result) mystrcat(result, "\n", MAXLNLEN);
1809 mystrcat(result, st, MAXLNLEN);
1810 free(st);
1813 break;
1815 case INITCAP: {
1816 memcpy(wspace,cw,(wl+1));
1817 mkallsmall(wspace);
1818 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1819 if (st) {
1820 mystrcat(result, st, MAXLNLEN);
1821 free(st);
1823 st = pSMgr->suggest_morph_for_spelling_error(cw);
1824 if (st) {
1825 if (*result) mystrcat(result, "\n", MAXLNLEN);
1826 mystrcat(result, st, MAXLNLEN);
1827 free(st);
1829 if (abbv) {
1830 memcpy(wspace,cw,wl);
1831 *(wspace+wl) = '.';
1832 *(wspace+wl+1) = '\0';
1833 mkallsmall(wspace);
1834 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1835 if (st) {
1836 if (*result) mystrcat(result, "\n", MAXLNLEN);
1837 mystrcat(result, st, MAXLNLEN);
1838 free(st);
1840 mkinitcap(wspace);
1841 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1842 if (st) {
1843 if (*result) mystrcat(result, "\n", MAXLNLEN);
1844 mystrcat(result, st, MAXLNLEN);
1845 free(st);
1848 break;
1850 case HUHCAP: {
1851 st = pSMgr->suggest_morph_for_spelling_error(cw);
1852 if (st) {
1853 mystrcat(result, st, MAXLNLEN);
1854 free(st);
1856 memcpy(wspace,cw,(wl+1));
1857 mkallsmall(wspace);
1858 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1859 if (st) {
1860 if (*result) mystrcat(result, "\n", MAXLNLEN);
1861 mystrcat(result, st, MAXLNLEN);
1862 free(st);
1864 break;
1866 case ALLCAP: {
1867 memcpy(wspace,cw,(wl+1));
1868 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1869 if (st) {
1870 mystrcat(result, st, MAXLNLEN);
1871 free(st);
1873 mkallsmall(wspace);
1874 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1875 if (st) {
1876 if (*result) mystrcat(result, "\n", MAXLNLEN);
1877 mystrcat(result, st, MAXLNLEN);
1878 free(st);
1880 mkinitcap(wspace);
1881 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1882 if (st) {
1883 if (*result) mystrcat(result, "\n", MAXLNLEN);
1884 mystrcat(result, st, MAXLNLEN);
1885 free(st);
1887 if (abbv) {
1888 memcpy(wspace,cw,(wl+1));
1889 *(wspace+wl) = '.';
1890 *(wspace+wl+1) = '\0';
1891 if (*result) mystrcat(result, "\n", MAXLNLEN);
1892 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1893 if (st) {
1894 mystrcat(result, st, MAXLNLEN);
1895 free(st);
1897 mkallsmall(wspace);
1898 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1899 if (st) {
1900 if (*result) mystrcat(result, "\n", MAXLNLEN);
1901 mystrcat(result, st, MAXLNLEN);
1902 free(st);
1904 mkinitcap(wspace);
1905 st = pSMgr->suggest_morph_for_spelling_error(wspace);
1906 if (st) {
1907 if (*result) mystrcat(result, "\n", MAXLNLEN);
1908 mystrcat(result, st, MAXLNLEN);
1909 free(st);
1912 break;
1916 if (*result) return mystrdup(result);
1917 return NULL;
1920 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1922 Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
1924 return (Hunhandle*)(new Hunspell(affpath, dpath));
1927 Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
1928 const char * key)
1930 return (Hunhandle*)(new Hunspell(affpath, dpath, key));
1933 void Hunspell_destroy(Hunhandle *pHunspell)
1935 delete (Hunspell*)(pHunspell);
1938 int Hunspell_spell(Hunhandle *pHunspell, const char *word)
1940 return ((Hunspell*)pHunspell)->spell(word);
1943 char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
1945 return ((Hunspell*)pHunspell)->get_dic_encoding();
1948 int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
1950 return ((Hunspell*)pHunspell)->suggest(slst, word);
1953 int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
1955 return ((Hunspell*)pHunspell)->analyze(slst, word);
1958 int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
1960 return ((Hunspell*)pHunspell)->stem(slst, word);
1963 int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n)
1965 return ((Hunspell*)pHunspell)->stem(slst, desc, n);
1968 int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
1969 const char * word2)
1971 return ((Hunspell*)pHunspell)->generate(slst, word, word2);
1974 int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
1975 char** desc, int n)
1977 return ((Hunspell*)pHunspell)->generate(slst, word, desc, n);
1980 /* functions for run-time modification of the dictionary */
1982 /* add word to the run-time dictionary */
1984 int Hunspell_add(Hunhandle *pHunspell, const char * word) {
1985 return ((Hunspell*)pHunspell)->add(word);
1988 /* add word to the run-time dictionary with affix flags of
1989 * the example (a dictionary word): Hunspell will recognize
1990 * affixed forms of the new word, too.
1993 int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
1994 const char * example) {
1995 return ((Hunspell*)pHunspell)->add_with_affix(word, example);
1998 /* remove word from the run-time dictionary */
2000 int Hunspell_remove(Hunhandle *pHunspell, const char * word) {
2001 return ((Hunspell*)pHunspell)->remove(word);
2004 void Hunspell_free_list(Hunhandle *, char *** slst, int n) {
2005 freelist(slst, n);