1 #include "license.hunspell"
2 #include "license.myspell"
8 #include "hunspell.hxx"
10 #ifndef MOZILLA_CLIENT
15 Hunspell::Hunspell(const char * affpath
, const char * dpath
, const char * key
)
21 affixpath
= mystrdup(affpath
);
24 /* first set up the hash manager */
25 pHMgr
[0] = new HashMgr(dpath
, affpath
, key
);
26 if (pHMgr
[0]) maxdic
= 1;
28 /* next set up the affix manager */
29 /* it needs access to the hash manager lookup methods */
30 pAMgr
= new AffixMgr(affpath
, pHMgr
, &maxdic
, key
);
32 /* get the preferred try string and the dictionary */
33 /* encoding from the Affix Manager for that dictionary */
34 char * try_string
= pAMgr
->get_try_string();
35 encoding
= pAMgr
->get_encoding();
36 langnum
= pAMgr
->get_langnum();
37 utf8
= pAMgr
->get_utf8();
39 csconv
= get_current_cs(encoding
);
40 complexprefixes
= pAMgr
->get_complexprefixes();
41 wordbreak
= pAMgr
->get_breaktable();
43 /* and finally set up the suggestion manager */
44 pSMgr
= new SuggestMgr(try_string
, MAXSUGGESTION
, pAMgr
);
45 if (try_string
) free(try_string
);
50 if (pSMgr
) delete pSMgr
;
51 if (pAMgr
) delete pAMgr
;
52 for (int i
= 0; i
< maxdic
; i
++) delete pHMgr
[i
];
60 if (encoding
) free(encoding
);
62 if (affixpath
) free(affixpath
);
66 // load extra dictionaries
67 int Hunspell::add_dic(const char * dpath
, const char * key
) {
68 if (maxdic
== MAXDIC
|| !affixpath
) return 1;
69 pHMgr
[maxdic
] = new HashMgr(dpath
, affixpath
, key
);
70 if (pHMgr
[maxdic
]) maxdic
++; else return 1;
74 // make a copy of src at destination while removing all leading
75 // blanks and removing any trailing periods after recording
76 // their presence with the abbreviation flag
77 // also since already going through character by character,
78 // set the capitalization type
79 // return the length of the "cleaned" (and UTF-8 encoded) word
81 int Hunspell::cleanword2(char * dest
, const char * src
,
82 w_char
* dest_utf
, int * nc
, int * pcaptype
, int * pabbrev
)
84 unsigned char * p
= (unsigned char *) dest
;
85 const unsigned char * q
= (const unsigned char * ) src
;
87 // first skip over any leading blanks
88 while ((*q
!= '\0') && (*q
== ' ')) q
++;
90 // now strip off any trailing periods (recording their presence)
92 int nl
= strlen((const char *)q
);
93 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
98 // if no characters are left it can't be capitalized
105 strncpy(dest
, (char *) q
, nl
);
109 *nc
= u8_u16(dest_utf
, MAXWORDLEN
, dest
);
110 // don't check too long words
111 if (*nc
>= MAXWORDLEN
) return 0;
112 if (*nc
== -1) { // big Unicode character (non BMP area)
116 *pcaptype
= get_captype_utf8(dest_utf
, *nc
, langnum
);
118 *pcaptype
= get_captype(dest
, nl
, csconv
);
124 int Hunspell::cleanword(char * dest
, const char * src
,
125 int * pcaptype
, int * pabbrev
)
127 unsigned char * p
= (unsigned char *) dest
;
128 const unsigned char * q
= (const unsigned char * ) src
;
131 // first skip over any leading blanks
132 while ((*q
!= '\0') && (*q
== ' ')) q
++;
134 // now strip off any trailing periods (recording their presence)
136 int nl
= strlen((const char *)q
);
137 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
142 // if no characters are left it can't be capitalized
149 // now determine the capitalization type of the first nl letters
157 if (csconv
[(*q
)].ccase
) ncap
++;
158 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
162 // remember to terminate the destination string
164 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
167 w_char t
[MAXWORDLEN
];
168 nc
= u8_u16(t
, MAXWORDLEN
, src
);
169 for (int i
= 0; i
< nc
; i
++) {
170 idx
= (t
[i
].h
<< 8) + t
[i
].l
;
171 unsigned short low
= unicodetolower(idx
, langnum
);
172 if (idx
!= low
) ncap
++;
173 if (unicodetoupper(idx
, langnum
) == low
) nneutral
++;
175 u16_u8(dest
, MAXWORDUTF8LEN
, t
, nc
);
177 idx
= (t
[0].h
<< 8) + t
[0].l
;
178 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
182 // now finally set the captype
185 } else if ((ncap
== 1) && firstcap
) {
187 } else if ((ncap
== nc
) || ((ncap
+ nneutral
) == nc
)){
189 } else if ((ncap
> 1) && firstcap
) {
190 *pcaptype
= HUHINITCAP
;
197 void Hunspell::mkallcap(char * p
)
200 w_char u
[MAXWORDLEN
];
201 int nc
= u8_u16(u
, MAXWORDLEN
, p
);
203 for (int i
= 0; i
< nc
; i
++) {
204 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
205 if (idx
!= unicodetoupper(idx
, langnum
)) {
206 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
207 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
210 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
213 *p
= csconv
[((unsigned char) *p
)].cupper
;
219 int Hunspell::mkallcap2(char * p
, w_char
* u
, int nc
)
223 for (int i
= 0; i
< nc
; i
++) {
224 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
225 unsigned short up
= unicodetoupper(idx
, langnum
);
227 u
[i
].h
= (unsigned char) (up
>> 8);
228 u
[i
].l
= (unsigned char) (up
& 0x00FF);
231 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
235 *p
= csconv
[((unsigned char) *p
)].cupper
;
243 void Hunspell::mkallsmall(char * p
)
246 *p
= csconv
[((unsigned char) *p
)].clower
;
251 int Hunspell::mkallsmall2(char * p
, w_char
* u
, int nc
)
255 for (int i
= 0; i
< nc
; i
++) {
256 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
257 unsigned short low
= unicodetolower(idx
, langnum
);
259 u
[i
].h
= (unsigned char) (low
>> 8);
260 u
[i
].l
= (unsigned char) (low
& 0x00FF);
263 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
267 *p
= csconv
[((unsigned char) *p
)].clower
;
274 // convert UTF-8 sharp S codes to latin 1
275 char * Hunspell::sharps_u8_l1(char * dest
, char * source
) {
278 for (p
++, source
++; *(source
- 1); p
++, source
++) {
280 if (*source
== '\x9F') *--p
= '\xDF';
285 // recursive search for right ss - sharp s permutations
286 hentry
* Hunspell::spellsharps(char * base
, char * pos
, int n
,
287 int repnum
, char * tmp
, int * info
, char **root
) {
288 pos
= strstr(pos
, "ss");
289 if (pos
&& (n
< MAXSHARPS
)) {
292 hentry
* h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
+ 1, tmp
, info
, root
);
296 h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
, tmp
, info
, root
);
298 } else if (repnum
> 0) {
299 if (utf8
) return checkword(base
, info
, root
);
300 return checkword(sharps_u8_l1(tmp
, base
), info
, root
);
305 int Hunspell::is_keepcase(const hentry
* rv
) {
306 return pAMgr
&& rv
->astr
&& pAMgr
->get_keepcase() &&
307 TESTAFF(rv
->astr
, pAMgr
->get_keepcase(), rv
->alen
);
310 /* insert a word at the beginning of the suggestion array and return ns */
311 int Hunspell::insert_sug(char ***slst
, char * word
, int ns
) {
312 char * dup
= mystrdup(word
);
314 if (ns
== MAXSUGGESTION
) {
318 for (int k
= ns
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
323 int Hunspell::spell(const char * word
, int * info
, char ** root
)
325 struct hentry
* rv
=NULL
;
326 // need larger vector. For example, the Turkish capital letter I is converted to a
327 // 2-byte UTF-8 character (dotless i) by mkallsmall.
328 char cw
[MAXWORDUTF8LEN
];
329 char wspace
[MAXWORDUTF8LEN
];
330 w_char unicw
[MAXWORDLEN
];
331 // Hunspell supports XML input of the simplified API (see manual)
332 if (strcmp(word
, SPELL_XML
) == 0) return 1;
333 int nc
= strlen(word
);
336 if (nc
>= MAXWORDUTF8LEN
) return 0;
338 if (nc
>= MAXWORDLEN
) return 0;
345 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
346 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
347 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
350 if (wl
== 0 || maxdic
== 0) return 1;
351 if (root
) *root
= NULL
;
353 // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
354 enum { NBEGIN
, NNUM
, NSEP
};
358 for (i
= 0; (i
< wl
); i
++) {
359 if ((cw
[i
] <= '9') && (cw
[i
] >= '0')) {
361 } else if ((cw
[i
] == ',') || (cw
[i
] == '.') || (cw
[i
] == '-')) {
362 if ((nstate
== NSEP
) || (i
== 0)) break;
366 if ((i
== wl
) && (nstate
== NNUM
)) return 1;
367 if (!info
) info
= &info2
; else *info
= 0;
372 *info
+= SPELL_ORIGCAP
;
374 rv
= checkword(cw
, info
, root
);
375 if ((abbv
) && !(rv
)) {
376 memcpy(wspace
,cw
,wl
);
378 *(wspace
+wl
+1) = '\0';
379 rv
= checkword(wspace
, info
, root
);
384 *info
+= SPELL_ORIGCAP
;
385 rv
= checkword(cw
, info
, root
);
388 memcpy(wspace
,cw
,wl
);
390 *(wspace
+wl
+1) = '\0';
391 rv
= checkword(wspace
, info
, root
);
394 // Spec. prefix handling for Catalan, French, Italian:
395 // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
396 if (pAMgr
&& strchr(cw
, '\'')) {
397 wl
= mkallsmall2(cw
, unicw
, nc
);
398 //There are no really sane circumstances where this could fail,
400 if (char * apostrophe
= strchr(cw
, '\'')) {
402 w_char tmpword
[MAXWORDLEN
];
404 wl2
= u8_u16(tmpword
, MAXWORDLEN
, cw
);
407 mkinitcap2(apostrophe
+ 1, unicw
+ wl2
+ 1, nc
- wl2
- 1);
408 rv
= checkword(cw
, info
, root
);
412 mkinitcap2(apostrophe
+ 1, unicw
, nc
);
413 rv
= checkword(cw
, info
, root
);
417 mkinitcap2(cw
, unicw
, nc
);
418 rv
= checkword(cw
, info
, root
);
421 if (pAMgr
&& pAMgr
->get_checksharps() && strstr(cw
, "SS")) {
422 char tmpword
[MAXWORDUTF8LEN
];
423 wl
= mkallsmall2(cw
, unicw
, nc
);
424 memcpy(wspace
,cw
,(wl
+1));
425 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
427 wl2
= mkinitcap2(cw
, unicw
, nc
);
428 rv
= spellsharps(cw
, cw
, 0, 0, tmpword
, info
, root
);
430 if ((abbv
) && !(rv
)) {
432 *(wspace
+wl
+1) = '\0';
433 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
435 memcpy(wspace
, cw
, wl2
);
437 *(wspace
+wl2
+1) = '\0';
438 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
445 *info
+= SPELL_ORIGCAP
;
446 wl
= mkallsmall2(cw
, unicw
, nc
);
447 memcpy(wspace
,cw
,(wl
+1));
448 wl2
= mkinitcap2(cw
, unicw
, nc
);
449 if (captype
== INITCAP
) *info
+= SPELL_INITCAP
;
450 rv
= checkword(cw
, info
, root
);
451 if (captype
== INITCAP
) *info
-= SPELL_INITCAP
;
452 // forbid bad capitalization
453 // (for example, ijs -> Ijs instead of IJs in Dutch)
454 // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
455 if (*info
& SPELL_FORBIDDEN
) {
459 if (rv
&& is_keepcase(rv
) && (captype
== ALLCAP
)) rv
= NULL
;
462 rv
= checkword(wspace
, info
, root
);
466 *(wspace
+wl
+1) = '\0';
467 rv
= checkword(wspace
, info
, root
);
469 memcpy(wspace
, cw
, wl2
);
471 *(wspace
+wl2
+1) = '\0';
472 if (captype
== INITCAP
) *info
+= SPELL_INITCAP
;
473 rv
= checkword(wspace
, info
, root
);
474 if (captype
== INITCAP
) *info
-= SPELL_INITCAP
;
475 if (rv
&& is_keepcase(rv
) && (captype
== ALLCAP
)) rv
= NULL
;
479 if (rv
&& is_keepcase(rv
) &&
480 ((captype
== ALLCAP
) ||
481 // if CHECKSHARPS: KEEPCASE words with \xDF are allowed
482 // in INITCAP form, too.
483 !(pAMgr
->get_checksharps() &&
484 ((utf8
&& strstr(wspace
, "\xC3\x9F")) ||
485 (!utf8
&& strchr(wspace
, '\xDF')))))) rv
= NULL
;
491 if (pAMgr
&& pAMgr
->get_warn() && rv
->astr
&&
492 TESTAFF(rv
->astr
, pAMgr
->get_warn(), rv
->alen
)) {
494 if (pAMgr
->get_forbidwarn()) return 0;
495 return HUNSPELL_OK_WARN
;
500 // recursive breaking at break points
506 int numbreak
= pAMgr
? pAMgr
->get_numbreak() : 0;
508 // calculate break points for recursion limit
509 for (int j
= 0; j
< numbreak
; j
++) {
512 s
= (char *) strstr(s
, wordbreak
[j
]);
519 if (nbr
>= 10) return 0;
521 // check boundary patterns (^begin and end$)
522 for (int j
= 0; j
< numbreak
; j
++) {
523 int plen
= strlen(wordbreak
[j
]);
524 if (plen
== 1 || plen
> wl
) continue;
525 if (wordbreak
[j
][0] == '^' && strncmp(cw
, wordbreak
[j
] + 1, plen
- 1) == 0
526 && spell(cw
+ plen
- 1)) return 1;
527 if (wordbreak
[j
][plen
- 1] == '$' &&
528 strncmp(cw
+ wl
- plen
+ 1, wordbreak
[j
], plen
- 1) == 0) {
529 r
= cw
[wl
- plen
+ 1];
530 cw
[wl
- plen
+ 1] = '\0';
531 if (spell(cw
)) return 1;
532 cw
[wl
- plen
+ 1] = r
;
537 for (int j
= 0; j
< numbreak
; j
++) {
538 int plen
= strlen(wordbreak
[j
]);
539 s
=(char *) strstr(cw
, wordbreak
[j
]);
540 if (s
&& (s
> cw
) && (s
< cw
+ wl
- plen
)) {
541 if (!spell(s
+ plen
)) continue;
544 // examine 2 sides of the break point
545 if (spell(cw
)) return 1;
548 // LANG_hu: spec. dash rule
549 if (langnum
== LANG_hu
&& strcmp(wordbreak
[j
], "-") == 0) {
552 if (spell(cw
)) return 1; // check the first part with dash
555 // end of LANG specific region
564 struct hentry
* Hunspell::checkword(const char * w
, int * info
, char ** root
)
566 struct hentry
* he
= NULL
;
568 char w2
[MAXWORDUTF8LEN
];
571 char * ignoredchars
= pAMgr
->get_ignore();
572 if (ignoredchars
!= NULL
) {
575 int ignoredchars_utf16_len
;
576 unsigned short * ignoredchars_utf16
= pAMgr
->get_ignore_utf16(&ignoredchars_utf16_len
);
577 remove_ignored_chars_utf(w2
, ignoredchars_utf16
, ignoredchars_utf16_len
);
579 remove_ignored_chars(w2
,ignoredchars
);
589 // word reversing wrapper for complex prefixes
590 if (complexprefixes
) {
595 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
598 // look up the word in the hash tables
599 for (i
= 0; (i
< maxdic
) && !he
; i
++) {
600 he
= (pHMgr
[i
])->lookup(word
);
602 // check forbidden and onlyincompound words
603 if ((he
) && (he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
604 if (info
) *info
+= SPELL_FORBIDDEN
;
605 // LANG_hu section: set dash information for suggestions
606 if (langnum
== LANG_hu
) {
607 if (pAMgr
->get_compoundflag() &&
608 TESTAFF(he
->astr
, pAMgr
->get_compoundflag(), he
->alen
)) {
609 if (info
) *info
+= SPELL_COMPOUND
;
615 // he = next not needaffix, onlyincompound homonym or onlyupcase word
616 while (he
&& (he
->astr
) &&
617 ((pAMgr
->get_needaffix() && TESTAFF(he
->astr
, pAMgr
->get_needaffix(), he
->alen
)) ||
618 (pAMgr
->get_onlyincompound() && TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) ||
619 (info
&& (*info
& SPELL_INITCAP
) && TESTAFF(he
->astr
, ONLYUPCASEFLAG
, he
->alen
))
620 )) he
= he
->next_homonym
;
623 // check with affixes
625 // try stripping off affixes
626 he
= pAMgr
->affix_check(word
, len
, 0);
628 // check compound restriction and onlyupcase
629 if (he
&& he
->astr
&& (
630 (pAMgr
->get_onlyincompound() &&
631 TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) ||
632 (info
&& (*info
& SPELL_INITCAP
) &&
633 TESTAFF(he
->astr
, ONLYUPCASEFLAG
, he
->alen
)))) {
638 if ((he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
639 if (info
) *info
+= SPELL_FORBIDDEN
;
643 *root
= mystrdup(he
->word
);
644 if (*root
&& complexprefixes
) {
645 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
648 // try check compound word
649 } else if (pAMgr
->get_compound()) {
650 he
= pAMgr
->compound_check(word
, len
, 0, 0, 100, 0, NULL
, 0, 0, info
);
651 // LANG_hu section: `moving rule' with last dash
652 if ((!he
) && (langnum
== LANG_hu
) && (word
[len
-1] == '-')) {
653 char * dup
= mystrdup(word
);
654 if (!dup
) return NULL
;
656 he
= pAMgr
->compound_check(dup
, len
-1, -5, 0, 100, 0, NULL
, 1, 0, info
);
659 // end of LANG specific region
662 *root
= mystrdup(he
->word
);
663 if (*root
&& complexprefixes
) {
664 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
667 if (info
) *info
+= SPELL_COMPOUND
;
676 int Hunspell::suggest(char*** slst
, const char * word
)
679 char cw
[MAXWORDUTF8LEN
];
680 char wspace
[MAXWORDUTF8LEN
];
681 if (!pSMgr
|| maxdic
== 0) return 0;
682 w_char unicw
[MAXWORDLEN
];
684 // process XML input of the simplified API (see manual)
685 if (strncmp(word
, SPELL_XML
, sizeof(SPELL_XML
) - 3) == 0) {
686 return spellml(slst
, word
);
688 int nc
= strlen(word
);
690 if (nc
>= MAXWORDUTF8LEN
) return 0;
692 if (nc
>= MAXWORDLEN
) return 0;
699 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
700 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
701 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
703 if (wl
== 0) return 0;
707 // check capitalized form for FORCEUCASE
708 if (pAMgr
&& captype
== NOCAP
&& pAMgr
->get_forceucase()) {
709 int info
= SPELL_ORIGCAP
;
711 if (checkword(cw
, &info
, NULL
)) {
715 wlst
= (char **) malloc(MAXSUGGESTION
* sizeof(char *));
716 if (wlst
== NULL
) return -1;
718 for (int i
= 0; i
< MAXSUGGESTION
; i
++) {
722 wlst
[0] = mystrdup(cw
);
730 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
736 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
738 memcpy(wspace
,cw
,(wl
+1));
739 mkallsmall2(wspace
, unicw
, nc
);
740 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
746 ns
= pSMgr
->suggest(slst
, cw
, ns
, &onlycmpdsug
);
749 // something.The -> something. The
750 char * dot
= strchr(cw
, '.');
751 if (dot
&& (dot
> cw
)) {
754 w_char w_
[MAXWORDLEN
];
755 int wl_
= u8_u16(w_
, MAXWORDLEN
, dot
+ 1);
756 captype_
= get_captype_utf8(w_
, wl_
, langnum
);
757 } else captype_
= get_captype(dot
+1, strlen(dot
+1), csconv
);
758 if (captype_
== INITCAP
) {
759 char * st
= mystrdup(cw
);
760 if (st
) st
= (char *) realloc(st
, wl
+ 2);
762 st
[(dot
- cw
) + 1] = ' ';
763 strcpy(st
+ (dot
- cw
) + 2, dot
+ 1);
764 ns
= insert_sug(slst
, st
, ns
);
769 if (captype
== HUHINITCAP
) {
770 // TheOpenOffice.org -> The OpenOffice.org
771 memcpy(wspace
,cw
,(wl
+1));
772 mkinitsmall2(wspace
, unicw
, nc
);
773 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
775 memcpy(wspace
,cw
,(wl
+1));
776 mkallsmall2(wspace
, unicw
, nc
);
777 if (spell(wspace
)) ns
= insert_sug(slst
, wspace
, ns
);
779 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
780 if (captype
== HUHINITCAP
) {
781 mkinitcap2(wspace
, unicw
, nc
);
782 if (spell(wspace
)) ns
= insert_sug(slst
, wspace
, ns
);
783 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
785 // aNew -> "a New" (instead of "a new")
786 for (int j
= prevns
; j
< ns
; j
++) {
787 char * space
= strchr((*slst
)[j
],' ');
789 int slen
= strlen(space
+ 1);
790 // different case after space (need capitalisation)
791 if ((slen
< wl
) && strcmp(cw
+ wl
- slen
, space
+ 1)) {
792 w_char w
[MAXWORDLEN
];
794 char * r
= (*slst
)[j
];
795 if (utf8
) wc
= u8_u16(w
, MAXWORDLEN
, space
+ 1);
796 mkinitcap2(space
+ 1, w
, wc
);
797 // set as first suggestion
798 for (int k
= j
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
808 memcpy(wspace
, cw
, (wl
+1));
809 mkallsmall2(wspace
, unicw
, nc
);
810 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
812 if (pAMgr
&& pAMgr
->get_keepcase() && spell(wspace
))
813 ns
= insert_sug(slst
, wspace
, ns
);
814 mkinitcap2(wspace
, unicw
, nc
);
815 ns
= pSMgr
->suggest(slst
, wspace
, ns
, &onlycmpdsug
);
816 for (int j
=0; j
< ns
; j
++) {
817 mkallcap((*slst
)[j
]);
818 if (pAMgr
&& pAMgr
->get_checksharps()) {
821 pos
= strstr((*slst
)[j
], "\xC3\x9F");
825 pos
= strstr(pos
+2, "\xC3\x9F");
828 pos
= strchr((*slst
)[j
], '\xDF');
830 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 2);
831 mystrrep((*slst
)[j
], "\xDF", "SS");
832 pos
= strchr((*slst
)[j
], '\xDF');
841 // LANG_hu section: replace '-' with ' ' in Hungarian
842 if (langnum
== LANG_hu
) {
843 for (int j
=0; j
< ns
; j
++) {
844 char * pos
= strchr((*slst
)[j
],'-');
847 char w
[MAXWORDUTF8LEN
];
849 strcpy(w
, (*slst
)[j
]);
851 spell(w
, &info
, NULL
);
852 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
858 // END OF LANG_hu section
860 // try ngram approach since found nothing or only compound words
861 if (pAMgr
&& (ns
== 0 || onlycmpdsug
) && (pAMgr
->get_maxngramsugs() != 0) && (*slst
)) {
864 ns
= pSMgr
->ngsuggest(*slst
, cw
, ns
, pHMgr
, maxdic
);
870 memcpy(wspace
,cw
,(wl
+1));
871 mkallsmall2(wspace
, unicw
, nc
);
872 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
877 memcpy(wspace
,cw
,(wl
+1));
878 mkallsmall2(wspace
, unicw
, nc
);
879 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
883 memcpy(wspace
,cw
,(wl
+1));
884 mkallsmall2(wspace
, unicw
, nc
);
886 ns
= pSMgr
->ngsuggest(*slst
, wspace
, ns
, pHMgr
, maxdic
);
887 for (int j
= oldns
; j
< ns
; j
++)
888 mkallcap((*slst
)[j
]);
894 // try dash suggestion (Afo-American -> Afro-American)
895 if (char * pos
= strchr(cw
, '-')) {
902 for (int j
= 0; j
< ns
&& nodashsug
== 1; j
++) {
903 if (strchr((*slst
)[j
], '-')) nodashsug
= 0;
906 while (nodashsug
&& !last
) {
907 if (*pos
== '\0') last
= 1; else *pos
= '\0';
909 nn
= suggest(&nlst
, ppos
);
910 for (int j
= nn
- 1; j
>= 0; j
--) {
911 strncpy(wspace
, cw
, ppos
- cw
);
912 strcpy(wspace
+ (ppos
- cw
), nlst
[j
]);
915 strcat(wspace
, pos
+ 1);
917 ns
= insert_sug(slst
, wspace
, ns
);
920 if (nlst
!= NULL
) free(nlst
);
926 pos
= strchr(ppos
, '-');
928 if (!pos
) pos
= cw
+ strlen(cw
);
932 // word reversing wrapper for complex prefixes
933 if (complexprefixes
) {
934 for (int j
= 0; j
< ns
; j
++) {
935 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
940 if (capwords
) for (int j
=0; j
< ns
; j
++) {
941 mkinitcap((*slst
)[j
]);
944 // expand suggestions with dot(s)
945 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
946 for (int j
= 0; j
< ns
; j
++) {
947 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
948 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
952 // remove bad capitalized and forbidden forms
953 if (pAMgr
&& (pAMgr
->get_keepcase() || pAMgr
->get_forbiddenword())) {
958 for (int j
=0; j
< ns
; j
++) {
959 if (!strchr((*slst
)[j
],' ') && !spell((*slst
)[j
])) {
964 len
= u8_u16(w
, MAXSWL
, (*slst
)[j
]);
966 strcpy(s
, (*slst
)[j
]);
969 mkallsmall2(s
, w
, len
);
972 (*slst
)[l
] = mystrdup(s
);
975 mkinitcap2(s
, w
, len
);
977 (*slst
)[l
] = mystrdup(s
);
982 (*slst
)[l
] = (*slst
)[j
];
991 // remove duplicates
993 for (int j
= 0; j
< ns
; j
++) {
994 (*slst
)[l
] = (*slst
)[j
];
995 for (int k
= 0; k
< l
; k
++) {
996 if (strcmp((*slst
)[k
], (*slst
)[j
]) == 0) {
1006 // output conversion
1007 rl
= (pAMgr
) ? pAMgr
->get_oconvtable() : NULL
;
1008 for (int j
= 0; rl
&& j
< ns
; j
++) {
1009 if (rl
->conv((*slst
)[j
], wspace
)) {
1011 (*slst
)[j
] = mystrdup(wspace
);
1015 // if suggestions removed by nosuggest, onlyincompound parameters
1016 if (l
== 0 && *slst
) {
1023 void Hunspell::free_list(char *** slst
, int n
) {
1027 char * Hunspell::get_dic_encoding()
1032 #ifdef HUNSPELL_EXPERIMENTAL
1033 // XXX need UTF-8 support
1034 int Hunspell::suggest_auto(char*** slst
, const char * word
)
1036 char cw
[MAXWORDUTF8LEN
];
1037 char wspace
[MAXWORDUTF8LEN
];
1038 if (!pSMgr
|| maxdic
== 0) return 0;
1039 int wl
= strlen(word
);
1041 if (wl
>= MAXWORDUTF8LEN
) return 0;
1043 if (wl
>= MAXWORDLEN
) return 0;
1047 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1048 if (wl
== 0) return 0;
1050 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1054 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1060 memcpy(wspace
,cw
,(wl
+1));
1062 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1063 for (int j
=0; j
< ns
; j
++)
1064 mkinitcap((*slst
)[j
]);
1065 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1072 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
1074 memcpy(wspace
,cw
,(wl
+1));
1076 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1082 memcpy(wspace
,cw
,(wl
+1));
1084 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1087 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
1089 for (int j
=0; j
< ns
; j
++)
1090 mkallcap((*slst
)[j
]);
1095 // word reversing wrapper for complex prefixes
1096 if (complexprefixes
) {
1097 for (int j
= 0; j
< ns
; j
++) {
1098 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
1102 // expand suggestions with dot(s)
1103 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
1104 for (int j
= 0; j
< ns
; j
++) {
1105 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
1106 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
1110 // LANG_hu section: replace '-' with ' ' in Hungarian
1111 if (langnum
== LANG_hu
) {
1112 for (int j
=0; j
< ns
; j
++) {
1113 char * pos
= strchr((*slst
)[j
],'-');
1116 char w
[MAXWORDUTF8LEN
];
1118 strcpy(w
, (*slst
)[j
]);
1120 spell(w
, &info
, NULL
);
1121 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
1127 // END OF LANG_hu section
1132 int Hunspell::stem(char*** slst
, char ** desc
, int n
)
1134 char result
[MAXLNLEN
];
1135 char result2
[MAXLNLEN
];
1137 if (n
== 0) return 0;
1139 for (int i
= 0; i
< n
; i
++) {
1141 // add compound word parts (except the last one)
1142 char * s
= (char *) desc
[i
];
1143 char * part
= strstr(s
, MORPH_PART
);
1145 char * nextpart
= strstr(part
+ 1, MORPH_PART
);
1147 copy_field(result
+ strlen(result
), part
, MORPH_PART
);
1149 nextpart
= strstr(part
+ 1, MORPH_PART
);
1157 char * alt
= strstr(tok
, " | ");
1160 alt
= strstr(alt
, " | ");
1162 int pln
= line_tok(tok
, &pl
, MSEP_ALT
);
1163 for (int k
= 0; k
< pln
; k
++) {
1164 // add derivational suffixes
1165 if (strstr(pl
[k
], MORPH_DERI_SFX
)) {
1166 // remove inflectional suffixes
1167 char * is
= strstr(pl
[k
], MORPH_INFL_SFX
);
1169 char * sg
= pSMgr
->suggest_gen(&(pl
[k
]), 1, pl
[k
]);
1172 int genl
= line_tok(sg
, &gen
, MSEP_REC
);
1174 for (int j
= 0; j
< genl
; j
++) {
1175 sprintf(result2
+ strlen(result2
), "%c%s%s",
1176 MSEP_REC
, result
, gen
[j
]);
1178 freelist(&gen
, genl
);
1181 sprintf(result2
+ strlen(result2
), "%c%s", MSEP_REC
, result
);
1182 if (strstr(pl
[k
], MORPH_SURF_PFX
)) {
1183 copy_field(result2
+ strlen(result2
), pl
[k
], MORPH_SURF_PFX
);
1185 copy_field(result2
+ strlen(result2
), pl
[k
], MORPH_STEM
);
1190 int sln
= line_tok(result2
, slst
, MSEP_REC
);
1191 return uniqlist(*slst
, sln
);
1195 int Hunspell::stem(char*** slst
, const char * word
)
1198 int pln
= analyze(&pl
, word
);
1199 int pln2
= stem(slst
, pl
, pln
);
1204 #ifdef HUNSPELL_EXPERIMENTAL
1205 int Hunspell::suggest_pos_stems(char*** slst
, const char * word
)
1207 char cw
[MAXWORDUTF8LEN
];
1208 char wspace
[MAXWORDUTF8LEN
];
1209 if (! pSMgr
|| maxdic
== 0) return 0;
1210 int wl
= strlen(word
);
1212 if (wl
>= MAXWORDUTF8LEN
) return 0;
1214 if (wl
>= MAXWORDLEN
) return 0;
1218 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1219 if (wl
== 0) return 0;
1221 int ns
= 0; // ns=0 = normalized input
1223 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1228 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1230 if ((abbv
) && (ns
== 0)) {
1231 memcpy(wspace
,cw
,wl
);
1233 *(wspace
+wl
+1) = '\0';
1234 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1242 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1244 if (ns
== 0 || ((*slst
)[0][0] == '#')) {
1245 memcpy(wspace
,cw
,(wl
+1));
1247 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1255 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1258 memcpy(wspace
,cw
,(wl
+1));
1260 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1264 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1272 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1274 const char * Hunspell::get_wordchars()
1276 return pAMgr
->get_wordchars();
1279 unsigned short * Hunspell::get_wordchars_utf16(int * len
)
1281 return pAMgr
->get_wordchars_utf16(len
);
1284 void Hunspell::mkinitcap(char * p
)
1287 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1290 w_char u
[MAXWORDLEN
];
1291 len
= u8_u16(u
, MAXWORDLEN
, p
);
1292 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1293 u
[0].h
= (unsigned char) (i
>> 8);
1294 u
[0].l
= (unsigned char) (i
& 0x00FF);
1295 u16_u8(p
, MAXWORDUTF8LEN
, u
, len
);
1299 int Hunspell::mkinitcap2(char * p
, w_char
* u
, int nc
)
1302 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1303 } else if (nc
> 0) {
1304 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1305 u
[0].h
= (unsigned char) (i
>> 8);
1306 u
[0].l
= (unsigned char) (i
& 0x00FF);
1307 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1313 int Hunspell::mkinitsmall2(char * p
, w_char
* u
, int nc
)
1316 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].clower
;
1317 } else if (nc
> 0) {
1318 unsigned short i
= unicodetolower((u
[0].h
<< 8) + u
[0].l
, langnum
);
1319 u
[0].h
= (unsigned char) (i
>> 8);
1320 u
[0].l
= (unsigned char) (i
& 0x00FF);
1321 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1327 int Hunspell::add(const char * word
)
1329 if (pHMgr
[0]) return (pHMgr
[0])->add(word
);
1333 int Hunspell::add_with_affix(const char * word
, const char * example
)
1335 if (pHMgr
[0]) return (pHMgr
[0])->add_with_affix(word
, example
);
1339 int Hunspell::remove(const char * word
)
1341 if (pHMgr
[0]) return (pHMgr
[0])->remove(word
);
1345 const char * Hunspell::get_version()
1347 return pAMgr
->get_version();
1350 struct cs_info
* Hunspell::get_csconv()
1355 void Hunspell::cat_result(char * result
, char * st
)
1358 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1359 mystrcat(result
, st
, MAXLNLEN
);
1364 int Hunspell::analyze(char*** slst
, const char * word
)
1366 char cw
[MAXWORDUTF8LEN
];
1367 char wspace
[MAXWORDUTF8LEN
];
1368 w_char unicw
[MAXWORDLEN
];
1371 if (! pSMgr
|| maxdic
== 0) return 0;
1372 int nc
= strlen(word
);
1374 if (nc
>= MAXWORDUTF8LEN
) return 0;
1376 if (nc
>= MAXWORDLEN
) return 0;
1383 RepList
* rl
= (pAMgr
) ? pAMgr
->get_iconvtable() : NULL
;
1384 if (rl
&& rl
->conv(word
, wspace
)) wl
= cleanword2(cw
, wspace
, unicw
, &nc
, &captype
, &abbv
);
1385 else wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
1389 for (wl
= 0; wl
< abbv
; wl
++) cw
[wl
] = '.';
1395 char result
[MAXLNLEN
];
1405 // LANG_hu section: set dash information for suggestions
1406 if (langnum
== LANG_hu
) {
1408 (((cw
[n
] <= '9') && (cw
[n
] >= '0')) || (((cw
[n
] == '.') || (cw
[n
] == ',')) && (n
> 0)))) {
1410 if ((cw
[n
] == '.') || (cw
[n
] == ',')) {
1411 if (((n2
== 0) && (n
> 3)) ||
1412 ((n2
> 0) && ((cw
[n
-1] == '.') || (cw
[n
-1] == ',')))) break;
1418 if ((n
== wl
) && (n3
> 0) && (n
- n3
> 3)) return 0;
1419 if ((n
== wl
) || ((n
>0) && ((cw
[n
]=='%') || (cw
[n
]=='\xB0')) && checkword(cw
+n
, NULL
, NULL
))) {
1420 mystrcat(result
, cw
, MAXLNLEN
);
1421 result
[n
- 1] = '\0';
1422 if (n
== wl
) cat_result(result
, pSMgr
->suggest_morph(cw
+ n
- 1));
1426 cat_result(result
, pSMgr
->suggest_morph(cw
+ n
- 1));
1427 mystrcat(result
, "+", MAXLNLEN
); // XXX SPEC. MORPHCODE
1429 cat_result(result
, pSMgr
->suggest_morph(cw
+ n
));
1431 return line_tok(result
, slst
, MSEP_REC
);
1434 // END OF LANG_hu section
1440 cat_result(result
, pSMgr
->suggest_morph(cw
));
1442 memcpy(wspace
,cw
,wl
);
1444 *(wspace
+wl
+1) = '\0';
1445 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1450 wl
= mkallsmall2(cw
, unicw
, nc
);
1451 memcpy(wspace
,cw
,(wl
+1));
1452 wl2
= mkinitcap2(cw
, unicw
, nc
);
1453 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1454 cat_result(result
, pSMgr
->suggest_morph(cw
));
1457 *(wspace
+wl
+1) = '\0';
1458 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1460 memcpy(wspace
, cw
, wl2
);
1461 *(wspace
+wl2
) = '.';
1462 *(wspace
+wl2
+1) = '\0';
1464 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1469 cat_result(result
, pSMgr
->suggest_morph(cw
));
1471 memcpy(wspace
,cw
,wl
);
1473 *(wspace
+wl
+1) = '\0';
1474 cat_result(result
, pSMgr
->suggest_morph(cw
));
1476 wl
= mkallsmall2(cw
, unicw
, nc
);
1477 memcpy(wspace
,cw
,(wl
+1));
1478 wl2
= mkinitcap2(cw
, unicw
, nc
);
1480 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1481 cat_result(result
, pSMgr
->suggest_morph(cw
));
1484 *(wspace
+wl
+1) = '\0';
1485 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1487 memcpy(wspace
, cw
, wl2
);
1488 *(wspace
+wl2
) = '.';
1489 *(wspace
+wl2
+1) = '\0';
1491 cat_result(result
, pSMgr
->suggest_morph(wspace
));
1498 // word reversing wrapper for complex prefixes
1499 if (complexprefixes
) {
1500 if (utf8
) reverseword_utf(result
); else reverseword(result
);
1502 return line_tok(result
, slst
, MSEP_REC
);
1505 // compound word with dash (HU) I18n
1508 // LANG_hu section: set dash information for suggestions
1509 if (langnum
== LANG_hu
) dash
= (char *) strchr(cw
,'-');
1510 if ((langnum
== LANG_hu
) && dash
) {
1512 // examine 2 sides of the dash
1513 if (dash
[1] == '\0') { // base word ending with dash
1515 char * p
= pSMgr
->suggest_morph(cw
);
1517 int ret
= line_tok(p
, slst
, MSEP_REC
);
1523 } else if ((dash
[1] == 'e') && (dash
[2] == '\0')) { // XXX (HU) -e hat.
1524 if (spell(cw
) && (spell("-e"))) {
1525 st
= pSMgr
->suggest_morph(cw
);
1527 mystrcat(result
, st
, MAXLNLEN
);
1530 mystrcat(result
,"+", MAXLNLEN
); // XXX spec. separator in MORPHCODE
1531 st
= pSMgr
->suggest_morph("-e");
1533 mystrcat(result
, st
, MAXLNLEN
);
1536 return line_tok(result
, slst
, MSEP_REC
);
1539 // first word ending with dash: word- XXX ???
1540 char r2
= *(dash
+ 1);
1543 nresult
= spell(cw
);
1546 if (nresult
&& spell(dash
+1) && ((strlen(dash
+1) > 1) ||
1547 ((dash
[1] > '0') && (dash
[1] < '9')))) {
1548 st
= pSMgr
->suggest_morph(cw
);
1550 mystrcat(result
, st
, MAXLNLEN
);
1552 mystrcat(result
,"+", MAXLNLEN
); // XXX spec. separator in MORPHCODE
1554 st
= pSMgr
->suggest_morph(dash
+1);
1556 mystrcat(result
, st
, MAXLNLEN
);
1559 return line_tok(result
, slst
, MSEP_REC
);
1562 // affixed number in correct word
1563 if (nresult
&& (dash
> cw
) && (((*(dash
-1)<='9') &&
1564 (*(dash
-1)>='0')) || (*(dash
-1)=='.'))) {
1567 if (*(dash
- n
) == '.') n
++;
1568 // search first not a number character to left from dash
1569 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
1572 if ((dash
- n
) < cw
) n
--;
1573 // numbers: valami1000000-hoz
1574 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1576 for(; n
>= 1; n
--) {
1577 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') && checkword(dash
- n
, NULL
, NULL
)) {
1578 mystrcat(result
, cw
, MAXLNLEN
);
1579 result
[dash
- cw
- n
] = '\0';
1580 st
= pSMgr
->suggest_morph(dash
- n
);
1582 mystrcat(result
, st
, MAXLNLEN
);
1585 return line_tok(result
, slst
, MSEP_REC
);
1593 int Hunspell::generate(char*** slst
, const char * word
, char ** pl
, int pln
)
1596 if (!pSMgr
|| !pln
) return 0;
1598 int pl2n
= analyze(&pl2
, word
);
1601 char cw
[MAXWORDUTF8LEN
];
1602 cleanword(cw
, word
, &captype
, &abbv
);
1603 char result
[MAXLNLEN
];
1606 for (int i
= 0; i
< pln
; i
++) {
1607 cat_result(result
, pSMgr
->suggest_gen(pl2
, pl2n
, pl
[i
]));
1609 freelist(&pl2
, pl2n
);
1613 if (captype
== ALLCAP
) mkallcap(result
);
1616 int linenum
= line_tok(result
, slst
, MSEP_REC
);
1619 if (captype
== INITCAP
|| captype
== HUHINITCAP
) {
1620 for (int j
=0; j
< linenum
; j
++) mkinitcap((*slst
)[j
]);
1623 // temporary filtering of prefix related errors (eg.
1624 // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
1627 for (int j
=0; j
< linenum
; j
++) {
1628 if (!spell((*slst
)[j
])) {
1632 if (r
< j
) (*slst
)[r
] = (*slst
)[j
];
1636 if (r
> 0) return r
;
1643 int Hunspell::generate(char*** slst
, const char * word
, const char * pattern
)
1646 int pln
= analyze(&pl
, pattern
);
1647 int n
= generate(slst
, word
, pl
, pln
);
1649 return uniqlist(*slst
, n
);
1652 // minimal XML parser functions
1653 int Hunspell::get_xml_par(char * dest
, const char * par
, int max
)
1658 char * dmax
= dest
+ max
;
1659 if (end
== '>') end
= '<';
1660 else if (end
!= '\'' && end
!= '"') return 0; // bad XML
1661 for (par
++; d
< dmax
&& *par
!= '\0' && *par
!= end
; par
++, d
++) *d
= *par
;
1663 mystrrep(dest
, "<", "<");
1664 mystrrep(dest
, "&", "&");
1665 return (int)(d
- dest
);
1668 int Hunspell::get_langnum() const
1673 // return the beginning of the element (attr == NULL) or the attribute
1674 const char * Hunspell::get_xml_pos(const char * s
, const char * attr
)
1676 const char * end
= strchr(s
, '>');
1678 if (attr
== NULL
) return end
;
1680 p
= strstr(p
, attr
);
1681 if (!p
|| p
>= end
) return 0;
1682 } while (*(p
-1) != ' ' && *(p
-1) != '\n');
1683 return p
+ strlen(attr
);
1686 int Hunspell::check_xml_par(const char * q
, const char * attr
, const char * value
) {
1687 char cw
[MAXWORDUTF8LEN
];
1688 if (get_xml_par(cw
, get_xml_pos(q
, attr
), MAXWORDUTF8LEN
- 1) &&
1689 strcmp(cw
, value
) == 0) return 1;
1693 int Hunspell::get_xml_list(char ***slst
, char * list
, const char * tag
) {
1696 if (!list
) return 0;
1697 for (p
= list
; (p
= strstr(p
, tag
)); p
++) n
++;
1698 if (n
== 0) return 0;
1699 *slst
= (char **) malloc(sizeof(char *) * n
);
1700 if (!*slst
) return 0;
1701 for (p
= list
, n
= 0; (p
= strstr(p
, tag
)); p
++, n
++) {
1703 (*slst
)[n
] = (char *) malloc(l
+ 1);
1704 if (!(*slst
)[n
]) return n
;
1705 if (!get_xml_par((*slst
)[n
], p
+ strlen(tag
) - 1, l
)) {
1713 int Hunspell::spellml(char*** slst
, const char * word
)
1716 char cw
[MAXWORDUTF8LEN
], cw2
[MAXWORDUTF8LEN
];
1717 q
= (char *) strstr(word
, "<query");
1718 if (!q
) return 0; // bad XML input
1719 q2
= strchr(q
, '>');
1720 if (!q2
) return 0; // bad XML input
1721 q2
= strstr(q2
, "<word");
1722 if (!q2
) return 0; // bad XML input
1723 if (check_xml_par(q
, "type=", "analyze")) {
1725 if (get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
- 10)) n
= analyze(slst
, cw
);
1726 if (n
== 0) return 0;
1727 // convert the result to <code><a>ana1</a><a>ana2</a></code> format
1728 for (int i
= 0; i
< n
; i
++) s
+= strlen((*slst
)[i
]);
1729 char * r
= (char *) malloc(6 + 5 * s
+ 7 * n
+ 7 + 1); // XXX 5*s->&->&
1731 strcpy(r
, "<code>");
1732 for (int i
= 0; i
< n
; i
++) {
1734 strcpy(r
+ l
, "<a>");
1735 strcpy(r
+ l
+ 3, (*slst
)[i
]);
1736 mystrrep(r
+ l
+ 3, "\t", " ");
1737 mystrrep(r
+ l
+ 3, "<", "<");
1738 mystrrep(r
+ l
+ 3, "&", "&");
1742 strcat(r
, "</code>");
1745 } else if (check_xml_par(q
, "type=", "stem")) {
1746 if (get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
- 1)) return stem(slst
, cw
);
1747 } else if (check_xml_par(q
, "type=", "generate")) {
1748 int n
= get_xml_par(cw
, strchr(q2
, '>'), MAXWORDUTF8LEN
- 1);
1749 if (n
== 0) return 0;
1750 char * q3
= strstr(q2
+ 1, "<word");
1752 if (get_xml_par(cw2
, strchr(q3
, '>'), MAXWORDUTF8LEN
- 1)) {
1753 return generate(slst
, cw
, cw2
);
1756 if ((q2
= strstr(q2
+ 1, "<code"))) {
1758 if ((n
= get_xml_list(&slst2
, strchr(q2
, '>'), "<a>"))) {
1759 int n2
= generate(slst
, cw
, slst2
, n
);
1760 freelist(&slst2
, n
);
1761 return uniqlist(*slst
, n2
);
1763 freelist(&slst2
, n
);
1771 #ifdef HUNSPELL_EXPERIMENTAL
1772 // XXX need UTF-8 support
// Experimental: morphological analysis that also tries corrections of a
// misspelled word.
//
// Several spelling variants of the cleaned word (kept in `cw` and
// `wspace`) are passed to
// SuggestMgr::suggest_morph_for_spelling_error(); every answer is appended
// to `result`, separated by "\n" once `result` is non-empty.
// Returns a mystrdup() copy of the collected text when it is non-empty.
// NULL is returned when there is no suggestion manager or no dictionary,
// when the word is too long, or when the cleaned word is empty.
// NOTE(review): the statements that choose between the variants are not
// part of this excerpt - confirm against the complete function.
1773 char * Hunspell::morph_with_correction(const char * word
)
1775 char cw
[MAXWORDUTF8LEN
];
1776 char wspace
[MAXWORDUTF8LEN
];
// nothing to do without a suggestion manager or a loaded dictionary
1777 if (! pSMgr
|| maxdic
== 0) return NULL
;
1778 int wl
= strlen(word
);
// reject words that do not fit the work buffers
1780 if (wl
>= MAXWORDUTF8LEN
) return NULL
;
1782 if (wl
>= MAXWORDLEN
) return NULL
;
// from here on wl is the length reported by cleanword()
1786 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1787 if (wl
== 0) return NULL
;
1789 char result
[MAXLNLEN
];
1797 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1799 mystrcat(result
, st
, MAXLNLEN
);
// variant built in wspace from the first wl bytes of cw; the terminator
// goes to wl + 1 (presumably a trailing '.' is stored at wl - confirm)
1803 memcpy(wspace
,cw
,wl
);
1805 *(wspace
+wl
+1) = '\0';
1806 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1808 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1809 mystrcat(result
, st
, MAXLNLEN
);
1816 memcpy(wspace
,cw
,(wl
+1));
1818 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1820 mystrcat(result
, st
, MAXLNLEN
);
1823 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1825 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1826 mystrcat(result
, st
, MAXLNLEN
);
1830 memcpy(wspace
,cw
,wl
);
1832 *(wspace
+wl
+1) = '\0';
1834 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1836 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1837 mystrcat(result
, st
, MAXLNLEN
);
1841 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1843 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1844 mystrcat(result
, st
, MAXLNLEN
);
1851 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1853 mystrcat(result
, st
, MAXLNLEN
);
1856 memcpy(wspace
,cw
,(wl
+1));
1858 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1860 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1861 mystrcat(result
, st
, MAXLNLEN
);
1867 memcpy(wspace
,cw
,(wl
+1));
1868 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1870 mystrcat(result
, st
, MAXLNLEN
);
1874 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1876 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1877 mystrcat(result
, st
, MAXLNLEN
);
1881 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1883 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1884 mystrcat(result
, st
, MAXLNLEN
);
1888 memcpy(wspace
,cw
,(wl
+1));
1890 *(wspace
+wl
+1) = '\0';
1891 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1892 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1894 mystrcat(result
, st
, MAXLNLEN
);
1898 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1900 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1901 mystrcat(result
, st
, MAXLNLEN
);
1905 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1907 if (*result
) mystrcat(result
, "\n", MAXLNLEN
);
1908 mystrcat(result
, st
, MAXLNLEN
);
// hand the collected text to the caller as a fresh copy
1916 if (*result
) return mystrdup(result
);
1920 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1922 Hunhandle
*Hunspell_create(const char * affpath
, const char * dpath
)
1924 return (Hunhandle
*)(new Hunspell(affpath
, dpath
));
1927 Hunhandle
*Hunspell_create_key(const char * affpath
, const char * dpath
,
1930 return (Hunhandle
*)(new Hunspell(affpath
, dpath
, key
));
1933 void Hunspell_destroy(Hunhandle
*pHunspell
)
1935 delete (Hunspell
*)(pHunspell
);
1938 int Hunspell_spell(Hunhandle
*pHunspell
, const char *word
)
1940 return ((Hunspell
*)pHunspell
)->spell(word
);
1943 char *Hunspell_get_dic_encoding(Hunhandle
*pHunspell
)
1945 return ((Hunspell
*)pHunspell
)->get_dic_encoding();
1948 int Hunspell_suggest(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1950 return ((Hunspell
*)pHunspell
)->suggest(slst
, word
);
1953 int Hunspell_analyze(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1955 return ((Hunspell
*)pHunspell
)->analyze(slst
, word
);
1958 int Hunspell_stem(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1960 return ((Hunspell
*)pHunspell
)->stem(slst
, word
);
1963 int Hunspell_stem2(Hunhandle
*pHunspell
, char*** slst
, char** desc
, int n
)
1965 return ((Hunspell
*)pHunspell
)->stem(slst
, desc
, n
);
1968 int Hunspell_generate(Hunhandle
*pHunspell
, char*** slst
, const char * word
,
1971 return ((Hunspell
*)pHunspell
)->generate(slst
, word
, word2
);
1974 int Hunspell_generate2(Hunhandle
*pHunspell
, char*** slst
, const char * word
,
1977 return ((Hunspell
*)pHunspell
)->generate(slst
, word
, desc
, n
);
1980 /* functions for run-time modification of the dictionary */
1982 /* add word to the run-time dictionary */
1984 int Hunspell_add(Hunhandle
*pHunspell
, const char * word
) {
1985 return ((Hunspell
*)pHunspell
)->add(word
);
/* add word to the run-time dictionary with affix flags of
 * the example (a dictionary word): Hunspell will recognize
 * affixed forms of the new word, too.
 */
1993 int Hunspell_add_with_affix(Hunhandle
*pHunspell
, const char * word
,
1994 const char * example
) {
1995 return ((Hunspell
*)pHunspell
)->add_with_affix(word
, example
);
1998 /* remove word from the run-time dictionary */
2000 int Hunspell_remove(Hunhandle
*pHunspell
, const char * word
) {
2001 return ((Hunspell
*)pHunspell
)->remove(word
);
2004 void Hunspell_free_list(Hunhandle
*, char *** slst
, int n
) {