1
#include "license.hunspell"
2 #include "license.myspell"
14 #include "hunspell.hxx"
17 #ifndef MOZILLA_CLIENT
23 Hunspell::Hunspell(const char * affpath
, const char * dpath
)
30 /* first set up the hash manager */
31 pHMgr
= new HashMgr(dpath
, affpath
);
33 /* next set up the affix manager */
34 /* it needs access to the hash manager lookup methods */
35 pAMgr
= new AffixMgr(affpath
,pHMgr
);
37 /* get the preferred try string and the dictionary */
38 /* encoding from the Affix Manager for that dictionary */
39 char * try_string
= pAMgr
->get_try_string();
40 encoding
= pAMgr
->get_encoding();
41 csconv
= get_current_cs(encoding
);
42 langnum
= pAMgr
->get_langnum();
43 utf8
= pAMgr
->get_utf8();
44 complexprefixes
= pAMgr
->get_complexprefixes();
45 wordbreak
= pAMgr
->get_breaktable();
47 /* and finally set up the suggestion manager */
48 pSMgr
= new SuggestMgr(try_string
, MAXSUGGESTION
, pAMgr
);
49 if (try_string
) free(try_string
);
55 if (pSMgr
) delete pSMgr
;
56 if (pAMgr
) delete pAMgr
;
57 if (pHMgr
) delete pHMgr
;
62 if (encoding
) free(encoding
);
67 // make a copy of src at destination while removing all leading
68 // blanks and removing any trailing periods after recording
69 // their presence with the abbreviation flag
70 // also since already going through character by character,
71 // set the capitalization type
72 // return the length of the "cleaned" (and UTF-8 encoded) word
74 int Hunspell::cleanword2(char * dest
, const char * src
,
75 w_char
* dest_utf
, int * nc
, int * pcaptype
, int * pabbrev
)
77 unsigned char * p
= (unsigned char *) dest
;
78 const unsigned char * q
= (const unsigned char * ) src
;
81 // first skip over any leading blanks
82 while ((*q
!= '\0') && (*q
== ' ')) q
++;
84 // now strip off any trailing periods (recording their presence)
86 int nl
= strlen((const char *)q
);
87 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
92 // if no characters are left it can't be capitalized
99 // now determine the capitalization type of the first nl letters
107 if (csconv
[(*q
)].ccase
) ncap
++;
108 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
112 // remember to terminate the destination string
115 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
119 *nc
= u8_u16(dest_utf
, MAXWORDLEN
, (const char *) q
);
120 // don't check too long words
121 if (*nc
>= MAXWORDLEN
) return 0;
122 if (*nc
== -1) { // big Unicode character (non BMP area)
124 strcpy((char *) p
, (char *) q
);
128 for (int i
= 0; i
< *nc
; i
++) {
129 idx
= (dest_utf
[i
].h
<< 8) + dest_utf
[i
].l
;
130 if (idx
!= unicodetolower(idx
, langnum
)) ncap
++;
131 if (unicodetoupper(idx
, langnum
) == unicodetolower(idx
, langnum
)) nneutral
++;
133 u16_u8(dest
, MAXWORDUTF8LEN
, dest_utf
, *nc
);
135 idx
= (dest_utf
[0].h
<< 8) + dest_utf
[0].l
;
136 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
140 // now finally set the captype
143 } else if ((ncap
== 1) && firstcap
) {
145 } else if ((ncap
== *nc
) || ((ncap
+ nneutral
) == *nc
)) {
147 } else if ((ncap
> 1) && firstcap
) {
148 *pcaptype
= HUHINITCAP
;
155 int Hunspell::cleanword(char * dest
, const char * src
,
156 int * pcaptype
, int * pabbrev
)
158 unsigned char * p
= (unsigned char *) dest
;
159 const unsigned char * q
= (const unsigned char * ) src
;
162 // first skip over any leading blanks
163 while ((*q
!= '\0') && (*q
== ' ')) q
++;
165 // now strip off any trailing periods (recording their presence)
167 int nl
= strlen((const char *)q
);
168 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
173 // if no characters are left it can't be capitalized
180 // now determine the capitalization type of the first nl letters
188 if (csconv
[(*q
)].ccase
) ncap
++;
189 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
193 // remember to terminate the destination string
195 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
198 w_char t
[MAXWORDLEN
];
199 nc
= u8_u16(t
, MAXWORDLEN
, src
);
200 for (int i
= 0; i
< nc
; i
++) {
201 idx
= (t
[i
].h
<< 8) + t
[i
].l
;
202 if (idx
!= unicodetolower(idx
, langnum
)) ncap
++;
203 if (unicodetoupper(idx
, langnum
) == unicodetolower(idx
, langnum
)) nneutral
++;
205 u16_u8(dest
, MAXWORDUTF8LEN
, t
, nc
);
207 idx
= (t
[0].h
<< 8) + t
[0].l
;
208 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
212 // now finally set the captype
215 } else if ((ncap
== 1) && firstcap
) {
217 } else if ((ncap
== nc
) || ((ncap
+ nneutral
) == nc
)){
219 } else if ((ncap
> 1) && firstcap
) {
220 *pcaptype
= HUHINITCAP
;
228 void Hunspell::mkallcap(char * p
)
231 w_char u
[MAXWORDLEN
];
232 int nc
= u8_u16(u
, MAXWORDLEN
, p
);
234 for (int i
= 0; i
< nc
; i
++) {
235 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
236 if (idx
!= unicodetoupper(idx
, langnum
)) {
237 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
238 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
241 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
244 *p
= csconv
[((unsigned char) *p
)].cupper
;
250 int Hunspell::mkallcap2(char * p
, w_char
* u
, int nc
)
254 for (int i
= 0; i
< nc
; i
++) {
255 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
256 if (idx
!= unicodetoupper(idx
, langnum
)) {
257 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
258 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
261 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
265 *p
= csconv
[((unsigned char) *p
)].cupper
;
273 void Hunspell::mkallsmall(char * p
)
276 *p
= csconv
[((unsigned char) *p
)].clower
;
281 int Hunspell::mkallsmall2(char * p
, w_char
* u
, int nc
)
285 for (int i
= 0; i
< nc
; i
++) {
286 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
287 if (idx
!= unicodetolower(idx
, langnum
)) {
288 u
[i
].h
= (unsigned char) (unicodetolower(idx
, langnum
) >> 8);
289 u
[i
].l
= (unsigned char) (unicodetolower(idx
, langnum
) & 0x00FF);
292 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
296 *p
= csconv
[((unsigned char) *p
)].clower
;
303 // convert UTF-8 sharp S codes to latin 1
304 char * Hunspell::sharps_u8_l1(char * dest
, char * source
) {
307 for (p
++, source
++; *(source
- 1); p
++, source
++) {
309 if (*source
== '?') *--p
= '?';
314 // recursive search for the right "ss" / sharp-s permutations
315 hentry
* Hunspell::spellsharps(char * base
, char * pos
, int n
,
316 int repnum
, char * tmp
, int * info
, char **root
) {
317 pos
= strstr(pos
, "ss");
318 if (pos
&& (n
< MAXSHARPS
)) {
321 hentry
* h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
+ 1, tmp
, info
, root
);
325 h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
, tmp
, info
, root
);
327 } else if (repnum
> 0) {
328 if (utf8
) return checkword(base
, info
, root
);
329 return checkword(sharps_u8_l1(tmp
, base
), info
, root
);
334 int Hunspell::is_keepcase(const hentry
* rv
) {
335 return pAMgr
&& rv
->astr
&& pAMgr
->get_keepcase() &&
336 TESTAFF(rv
->astr
, pAMgr
->get_keepcase(), rv
->alen
);
339 /* check and insert a word to beginning of the suggestion array */
340 int Hunspell::insert_sug(char ***slst
, char * word
, int *ns
) {
342 if (*ns
== MAXSUGGESTION
) {
346 for (int k
= *ns
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
347 (*slst
)[0] = mystrdup(word
);
353 int Hunspell::spell(const char * word
, int * info
, char ** root
)
355 struct hentry
* rv
=NULL
;
356 // a larger vector is needed: for example, the Turkish capital letter I is converted to a
357 // 2-byte UTF-8 character (dotless i) by mkallsmall.
358 char cw
[MAXWORDUTF8LEN
+ 4];
359 char wspace
[MAXWORDUTF8LEN
+ 4];
360 w_char unicw
[MAXWORDLEN
+ 1];
361 int nc
= strlen(word
);
364 if (nc
>= MAXWORDUTF8LEN
) return 0;
366 if (nc
>= MAXWORDLEN
) return 0;
370 int wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
372 if (wl
== 0) return 1;
375 if (root
) *root
= NULL
;
377 // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.)
378 enum { NBEGIN
, NNUM
, NSEP
};
382 for (i
= 0; (i
< wl
); i
++) {
383 if ((cw
[i
] <= '9') && (cw
[i
] >= '0')) {
385 } else if ((cw
[i
] == ',') || (cw
[i
] == '.') || (cw
[i
] == '-')) {
386 if ((nstate
== NSEP
) || (i
== 0)) break;
390 if ((i
== wl
) && (nstate
== NNUM
)) return 1;
392 // LANG_hu section: number(s) + (percent or degree) with suffixes
393 if (langnum
== LANG_hu
) {
394 if ((nstate
== NNUM
) && ((cw
[i
] == '%') || (cw
[i
] == '?'))
395 && checkword(cw
+ i
, info
, root
)) return 1;
397 // END of LANG_hu section
403 rv
= checkword(cw
, info
, root
);
404 if ((abbv
) && !(rv
)) {
405 memcpy(wspace
,cw
,wl
);
407 *(wspace
+wl
+1) = '\0';
408 rv
= checkword(wspace
, info
, root
);
413 rv
= checkword(cw
, info
, root
);
416 memcpy(wspace
,cw
,wl
);
418 *(wspace
+wl
+1) = '\0';
419 rv
= checkword(wspace
, info
, root
);
422 if (pAMgr
&& pAMgr
->get_checksharps() && strstr(cw
, "SS")) {
423 char tmpword
[MAXWORDUTF8LEN
];
424 wl
= mkallsmall2(cw
, unicw
, nc
);
425 memcpy(wspace
,cw
,(wl
+1));
426 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
428 wl2
= mkinitcap2(cw
, unicw
, nc
);
429 rv
= spellsharps(cw
, cw
, 0, 0, tmpword
, info
, root
);
431 if ((abbv
) && !(rv
)) {
433 *(wspace
+wl
+1) = '\0';
434 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
436 memcpy(wspace
, cw
, wl2
);
438 *(wspace
+wl2
+1) = '\0';
439 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
446 wl
= mkallsmall2(cw
, unicw
, nc
);
447 memcpy(wspace
,cw
,(wl
+1));
448 rv
= checkword(wspace
, info
, root
);
449 if (!rv
|| (is_keepcase(rv
) && !((captype
== INITCAP
) &&
450 // if CHECKSHARPS: KEEPCASE words with sharp s are allowed
451 // in INITCAP form, too.
452 pAMgr
->get_checksharps() && ((utf8
&& strstr(wspace
, "脽")) ||
453 (!utf8
&& strchr(wspace
, '?')))))) {
454 wl2
= mkinitcap2(cw
, unicw
, nc
);
455 rv
= checkword(cw
, info
, root
);
456 if (rv
&& (captype
== ALLCAP
) && is_keepcase(rv
)) rv
= NULL
;
460 *(wspace
+wl
+1) = '\0';
461 rv
= checkword(wspace
, info
, root
);
462 if (!rv
|| is_keepcase(rv
)) {
463 memcpy(wspace
, cw
, wl2
);
465 *(wspace
+wl2
+1) = '\0';
466 rv
= checkword(wspace
, info
, root
);
467 if (rv
&& ((captype
== ALLCAP
) && is_keepcase(rv
))) rv
= NULL
;
476 // recursive breaking at break points (not good for morphological analysis)
480 for (int j
= 0; j
< pAMgr
->get_numbreak(); j
++) {
481 s
=(char *) strstr(cw
, wordbreak
[j
]);
485 // examine 2 sides of the break point
486 if (spell(cw
) && spell(s
+ strlen(wordbreak
[j
]))) {
495 // LANG_hu: compoundings with dashes and n-dashes XXX deprecated!
496 if (langnum
== LANG_hu
) {
498 // compound word with dash (HU) I18n
502 dash
= (char *) strstr(cw
,"-");
503 if (dash
&& !wordbreak
) {
505 // examine 2 sides of the dash
506 if (spell(cw
) && spell(dash
+ 3)) {
512 dash
= (char *) strchr(cw
,'-');
515 // examine 2 sides of the dash
516 if (dash
[1] == '\0') { // base word ending with dash
517 if (spell(cw
)) return 1;
519 // first word ending with dash: word-
520 char r2
= *(dash
+ 1);
526 if (result
&& spell(dash
+1) && ((strlen(dash
+1) > 1) || (dash
[1] == 'e') ||
527 ((dash
[1] > '0') && (dash
[1] < '9')))) return 1;
529 // affixed number in correct word
530 if (result
&& (dash
> cw
) && (((*(dash
-1)<='9') && (*(dash
-1)>='0')) || (*(dash
-1)>='.'))) {
533 if (*(dash
- n
) == '.') n
++;
534 // search first not a number character to left from dash
535 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
538 if ((dash
- n
) < cw
) n
--;
539 // numbers: deprecated
541 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') &&
542 checkword(dash
- n
, info
, root
)) return 1;
550 //int Hunspell::spell(const char * word) {
551 // return spell(word, NULL, NULL);
554 struct hentry
* Hunspell::checkword(const char * w
, int * info
, char ** root
)
556 struct hentry
* he
= NULL
;
558 char w2
[MAXWORDUTF8LEN
];
561 char * ignoredchars
= pAMgr
->get_ignore();
562 if (ignoredchars
!= NULL
) {
565 int ignoredchars_utf16_len
;
566 unsigned short * ignoredchars_utf16
= pAMgr
->get_ignore_utf16(&ignoredchars_utf16_len
);
567 remove_ignored_chars_utf(w2
, ignoredchars_utf16
, ignoredchars_utf16_len
);
569 remove_ignored_chars(w2
,ignoredchars
);
574 // word reversing wrapper for complex prefixes
575 if (complexprefixes
) {
580 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
583 // look word in hash table
584 if (pHMgr
) he
= pHMgr
->lookup(word
);
586 // check forbidden and onlyincompound words
587 if ((he
) && (he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
588 info
+= SPELL_FORBIDDEN
;
589 // LANG_hu section: set dash information for suggestions
590 if (langnum
== LANG_hu
) {
591 if (pAMgr
->get_compoundflag() &&
592 TESTAFF(he
->astr
, pAMgr
->get_compoundflag(), he
->alen
)) {
593 info
+= SPELL_COMPOUND
;
599 // he = next not pseudoroot and not onlyincompound homonym or NULL
600 while (he
&& (he
->astr
) &&
601 ((pAMgr
->get_pseudoroot() && TESTAFF(he
->astr
, pAMgr
->get_pseudoroot(), he
->alen
)) ||
602 (pAMgr
->get_onlyincompound() && TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
))
603 )) he
= he
->next_homonym
;
605 // check with affixes
607 // try stripping off affixes */
609 he
= pAMgr
->affix_check(word
, len
, 0);
611 // check compound restriction
612 if (he
&& he
->astr
&& pAMgr
->get_onlyincompound() &&
613 TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) he
= NULL
;
616 if ((he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
617 info
+= SPELL_FORBIDDEN
;
621 *root
= mystrdup(he
->word
);
622 if (complexprefixes
) {
623 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
626 // try check compound word
627 } else if (pAMgr
->get_compound()) {
628 he
= pAMgr
->compound_check(word
, len
,
629 0,0,100,0,NULL
,0,NULL
,NULL
,0);
630 // LANG_hu section: `moving rule' with last dash
631 if ((!he
) && (langnum
== LANG_hu
) && (word
[len
-1]=='-')) {
632 char * dup
= mystrdup(word
);
634 he
= pAMgr
->compound_check(dup
, len
-1,
635 -5,0,100,0,NULL
,1,NULL
,NULL
,0);
638 // end of LANG specific region
641 *root
= mystrdup(he
->word
);
642 if (complexprefixes
) {
643 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
646 if (info
) *info
+= SPELL_COMPOUND
;
655 int Hunspell::suggest(char*** slst
, const char * word
)
657 char cw
[MAXWORDUTF8LEN
+ 4];
658 char wspace
[MAXWORDUTF8LEN
+ 4];
659 if (! pSMgr
) return 0;
660 w_char unicw
[MAXWORDLEN
+ 1];
661 int nc
= strlen(word
);
663 if (nc
>= MAXWORDUTF8LEN
) return 0;
665 if (nc
>= MAXWORDLEN
) return 0;
669 int wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
670 if (wl
== 0) return 0;
678 ns
= pSMgr
->suggest(slst
, cw
, ns
);
684 ns
= pSMgr
->suggest(slst
, cw
, ns
);
686 memcpy(wspace
,cw
,(wl
+1));
687 mkallsmall2(wspace
, unicw
, nc
);
688 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
694 ns
= pSMgr
->suggest(slst
, cw
, ns
);
697 if (captype
== HUHINITCAP
) {
698 // TheOpenOffice.org -> The OpenOffice.org
699 memcpy(wspace
,cw
,(wl
+1));
700 mkinitsmall2(wspace
, unicw
, nc
);
701 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
703 memcpy(wspace
,cw
,(wl
+1));
704 mkallsmall2(wspace
, unicw
, nc
);
705 insert_sug(slst
, wspace
, &ns
);
707 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
708 if (captype
== HUHINITCAP
) {
709 mkinitcap2(wspace
, unicw
, nc
);
710 insert_sug(slst
, wspace
, &ns
);
711 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
713 // aNew -> "a New" (instead of "a new")
714 for (int j
= prevns
; j
< ns
; j
++) {
715 char * space
= strchr((*slst
)[j
],' ');
717 int slen
= strlen(space
+ 1);
718 // different case after space (need capitalisation)
719 if ((slen
< wl
) && strcmp(cw
+ wl
- slen
, space
+ 1)) {
720 w_char w
[MAXWORDLEN
+ 1];
722 char * r
= (*slst
)[j
];
723 if (utf8
) wc
= u8_u16(w
, MAXWORDLEN
, space
+ 1);
724 mkinitcap2(space
+ 1, w
, wc
);
725 // set as first suggestion
726 for (int k
= j
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
736 memcpy(wspace
, cw
, (wl
+1));
737 mkallsmall2(wspace
, unicw
, nc
);
738 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
740 if (pAMgr
&& pAMgr
->get_keepcase()) insert_sug(slst
, wspace
, &ns
);
741 mkinitcap2(wspace
, unicw
, nc
);
742 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
743 for (int j
=0; j
< ns
; j
++) {
744 mkallcap((*slst
)[j
]);
745 if (pAMgr
&& pAMgr
->get_checksharps()) {
748 pos
= strstr((*slst
)[j
], "脽");
752 pos
= strstr(pos
+2, "脽");
755 pos
= strchr((*slst
)[j
], '?');
757 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 2);
758 mystrrep((*slst
)[j
], "?", "SS");
759 pos
= strchr((*slst
)[j
], '?');
768 // LANG_hu section: replace '-' with ' ' in Hungarian
769 if (langnum
== LANG_hu
) {
770 for (int j
=0; j
< ns
; j
++) {
771 char * pos
= strchr((*slst
)[j
],'-');
774 char w
[MAXWORDUTF8LEN
];
776 strcpy(w
, (*slst
)[j
]);
778 spell(w
, &info
, NULL
);
779 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
785 // END OF LANG_hu section
787 // try ngram approach since found nothing
788 if ((ns
== 0) && pAMgr
&& (pAMgr
->get_maxngramsugs() != 0)) {
792 ns
= pSMgr
->ngsuggest(*slst
, cw
, pHMgr
);
796 memcpy(wspace
,cw
,(wl
+1));
797 mkallsmall2(wspace
, unicw
, nc
);
798 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
803 memcpy(wspace
,cw
,(wl
+1));
804 mkallsmall2(wspace
, unicw
, nc
);
805 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
809 memcpy(wspace
,cw
,(wl
+1));
810 mkallsmall2(wspace
, unicw
, nc
);
811 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
812 for (int j
=0; j
< ns
; j
++)
813 mkallcap((*slst
)[j
]);
819 // word reversing wrapper for complex prefixes
820 if (complexprefixes
) {
821 for (int j
= 0; j
< ns
; j
++) {
822 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
827 if (capwords
) for (int j
=0; j
< ns
; j
++) {
828 mkinitcap((*slst
)[j
]);
831 // expand suggestions with dot(s)
832 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
833 for (int j
= 0; j
< ns
; j
++) {
834 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
835 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
840 if (pAMgr
->get_keepcase()) {
845 for (int j
=0; j
< ns
; j
++) {
846 if (!spell((*slst
)[j
])) {
851 len
= u8_u16(w
, MAXSWL
, (*slst
)[j
]);
853 strcpy(s
, (*slst
)[j
]);
856 mkallsmall2(s
, w
, len
);
859 (*slst
)[l
] = mystrdup(s
);
862 mkinitcap2(s
, w
, len
);
864 (*slst
)[l
] = mystrdup(s
);
869 (*slst
)[l
] = (*slst
)[j
];
878 // remove duplications
880 for (int j
= 0; j
< ns
; j
++) {
881 (*slst
)[l
] = (*slst
)[j
];
882 for (int k
= 0; k
< l
; k
++) {
883 if (strcmp((*slst
)[k
], (*slst
)[j
]) == 0) {
893 char * Hunspell::get_dic_encoding()
898 #ifdef HUNSPELL_EXPERIMENTAL
899 // XXX need UTF-8 support
900 int Hunspell::suggest_auto(char*** slst
, const char * word
)
902 char cw
[MAXWORDUTF8LEN
+ 4];
903 char wspace
[MAXWORDUTF8LEN
+ 4];
904 if (! pSMgr
) return 0;
905 int wl
= strlen(word
);
907 if (wl
>= MAXWORDUTF8LEN
) return 0;
909 if (wl
>= MAXWORDLEN
) return 0;
913 wl
= cleanword(cw
, word
, &captype
, &abbv
);
914 if (wl
== 0) return 0;
916 *slst
= NULL
; // HU, nsug in pSMgr->suggest
920 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
926 memcpy(wspace
,cw
,(wl
+1));
928 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
929 for (int j
=0; j
< ns
; j
++)
930 mkinitcap((*slst
)[j
]);
931 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
937 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
939 memcpy(wspace
,cw
,(wl
+1));
941 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
947 memcpy(wspace
,cw
,(wl
+1));
949 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
952 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
954 for (int j
=0; j
< ns
; j
++)
955 mkallcap((*slst
)[j
]);
960 // word reversing wrapper for complex prefixes
961 if (complexprefixes
) {
962 for (int j
= 0; j
< ns
; j
++) {
963 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
967 // expand suggestions with dot(s)
968 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
969 for (int j
= 0; j
< ns
; j
++) {
970 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
971 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
975 // LANG_hu section: replace '-' with ' ' in Hungarian
976 if (langnum
== LANG_hu
) {
977 for (int j
=0; j
< ns
; j
++) {
978 char * pos
= strchr((*slst
)[j
],'-');
981 char w
[MAXWORDUTF8LEN
];
983 strcpy(w
, (*slst
)[j
]);
985 spell(w
, &info
, NULL
);
986 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
992 // END OF LANG_hu section
996 // XXX need UTF-8 support
997 int Hunspell::stem(char*** slst
, const char * word
)
999 char cw
[MAXWORDUTF8LEN
+ 4];
1000 char wspace
[MAXWORDUTF8LEN
+ 4];
1001 if (! pSMgr
) return 0;
1002 int wl
= strlen(word
);
1004 if (wl
>= MAXWORDUTF8LEN
) return 0;
1006 if (wl
>= MAXWORDLEN
) return 0;
1010 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1011 if (wl
== 0) return 0;
1015 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1020 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1022 if ((abbv
) && (ns
== 0)) {
1023 memcpy(wspace
,cw
,wl
);
1025 *(wspace
+wl
+1) = '\0';
1026 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1034 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1037 memcpy(wspace
,cw
,(wl
+1));
1039 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1043 if ((abbv
) && (ns
== 0)) {
1044 memcpy(wspace
,cw
,wl
);
1047 *(wspace
+wl
+1) = '\0';
1048 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1056 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1059 memcpy(wspace
,cw
,(wl
+1));
1061 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1065 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1068 if ((abbv
) && (ns
== 0)) {
1069 memcpy(wspace
,cw
,wl
);
1072 *(wspace
+wl
+1) = '\0';
1073 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1084 int Hunspell::suggest_pos_stems(char*** slst
, const char * word
)
1086 char cw
[MAXWORDUTF8LEN
+ 4];
1087 char wspace
[MAXWORDUTF8LEN
+ 4];
1088 if (! pSMgr
) return 0;
1089 int wl
= strlen(word
);
1091 if (wl
>= MAXWORDUTF8LEN
) return 0;
1093 if (wl
>= MAXWORDLEN
) return 0;
1097 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1098 if (wl
== 0) return 0;
1100 int ns
= 0; // ns=0 = normalized input
1102 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1107 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1109 if ((abbv
) && (ns
== 0)) {
1110 memcpy(wspace
,cw
,wl
);
1112 *(wspace
+wl
+1) = '\0';
1113 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1121 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1123 if (ns
== 0 || ((*slst
)[0][0] == '#')) {
1124 memcpy(wspace
,cw
,(wl
+1));
1126 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1134 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1137 memcpy(wspace
,cw
,(wl
+1));
1139 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1143 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1151 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1153 const char * Hunspell::get_wordchars()
1155 return pAMgr
->get_wordchars();
1158 unsigned short * Hunspell::get_wordchars_utf16(int * len
)
1160 return pAMgr
->get_wordchars_utf16(len
);
1163 void Hunspell::mkinitcap(char * p
)
1166 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1169 w_char u
[MAXWORDLEN
];
1170 len
= u8_u16(u
, MAXWORDLEN
, p
);
1171 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1172 u
[0].h
= (unsigned char) (i
>> 8);
1173 u
[0].l
= (unsigned char) (i
& 0x00FF);
1174 u16_u8(p
, MAXWORDUTF8LEN
, u
, len
);
1178 int Hunspell::mkinitcap2(char * p
, w_char
* u
, int nc
)
1181 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1182 } else if (nc
> 0) {
1183 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1184 u
[0].h
= (unsigned char) (i
>> 8);
1185 u
[0].l
= (unsigned char) (i
& 0x00FF);
1186 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1192 int Hunspell::mkinitsmall2(char * p
, w_char
* u
, int nc
)
1195 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].clower
;
1196 } else if (nc
> 0) {
1197 unsigned short i
= unicodetolower((u
[0].h
<< 8) + u
[0].l
, langnum
);
1198 u
[0].h
= (unsigned char) (i
>> 8);
1199 u
[0].l
= (unsigned char) (i
& 0x00FF);
1200 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1206 int Hunspell::put_word(const char * word
)
1209 return pHMgr
->put_word(word
, strlen(word
), NULL
);
1214 int Hunspell::put_word_pattern(const char * word
, const char * pattern
)
1217 return pHMgr
->put_word_pattern(word
, strlen(word
), pattern
);
1222 const char * Hunspell::get_version()
1224 return pAMgr
->get_version();
1227 struct cs_info
* Hunspell::get_csconv()
1232 #ifdef HUNSPELL_EXPERIMENTAL
1233 // XXX need UTF-8 support
1234 char * Hunspell::morph(const char * word
)
1236 char cw
[MAXWORDUTF8LEN
+ 4];
1237 char wspace
[MAXWORDUTF8LEN
+ 4];
1238 if (! pSMgr
) return 0;
1239 int wl
= strlen(word
);
1241 if (wl
>= MAXWORDUTF8LEN
) return 0;
1243 if (wl
>= MAXWORDLEN
) return 0;
1247 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1250 for (wl
= 0; wl
< abbv
; wl
++) cw
[wl
] = '.';
1256 char result
[MAXLNLEN
];
1266 // LANG_hu section: set dash information for suggestions
1267 if (langnum
== LANG_hu
) {
1269 (((cw
[n
] <= '9') && (cw
[n
] >= '0')) || (((cw
[n
] == '.') || (cw
[n
] == ',')) && (n
> 0)))) {
1271 if ((cw
[n
] == '.') || (cw
[n
] == ',')) {
1272 if (((n2
== 0) && (n
> 3)) ||
1273 ((n2
> 0) && ((cw
[n
-1] == '.') || (cw
[n
-1] == ',')))) break;
1279 if ((n
== wl
) && (n3
> 0) && (n
- n3
> 3)) return NULL
;
1280 if ((n
== wl
) || ((n
>0) && ((cw
[n
]=='%') || (cw
[n
]=='?)) && checkword(cw+n, NULL, NULL))) {
1282 result[n - 1] = '\
0';
1284 st
= pSMgr
->suggest_morph(cw
+ n
- 1);
1292 st
= pSMgr
->suggest_morph(cw
+ n
- 1);
1297 strcat(result
, "+"); // XXX SPEC. MORPHCODE
1299 st
= pSMgr
->suggest_morph(cw
+ n
);
1305 return mystrdup(result
);
1308 // END OF LANG_hu section
1312 st
= pSMgr
->suggest_morph(cw
);
1318 memcpy(wspace
,cw
,wl
);
1320 *(wspace
+wl
+1) = '\0';
1321 st
= pSMgr
->suggest_morph(wspace
);
1323 if (*result
) strcat(result
, "\n");
1331 memcpy(wspace
,cw
,(wl
+1));
1333 st
= pSMgr
->suggest_morph(wspace
);
1338 st
= pSMgr
->suggest_morph(cw
);
1340 if (*result
) strcat(result
, "\n");
1345 memcpy(wspace
,cw
,wl
);
1347 *(wspace
+wl
+1) = '\0';
1349 st
= pSMgr
->suggest_morph(wspace
);
1351 if (*result
) strcat(result
, "\n");
1356 st
= pSMgr
->suggest_morph(wspace
);
1358 if (*result
) strcat(result
, "\n");
1366 st
= pSMgr
->suggest_morph(cw
);
1372 memcpy(wspace
,cw
,(wl
+1));
1374 st
= pSMgr
->suggest_morph(wspace
);
1376 if (*result
) strcat(result
, "\n");
1384 memcpy(wspace
,cw
,(wl
+1));
1385 st
= pSMgr
->suggest_morph(wspace
);
1391 st
= pSMgr
->suggest_morph(wspace
);
1393 if (*result
) strcat(result
, "\n");
1398 st
= pSMgr
->suggest_morph(wspace
);
1400 if (*result
) strcat(result
, "\n");
1405 memcpy(wspace
,cw
,(wl
+1));
1407 *(wspace
+wl
+1) = '\0';
1408 if (*result
) strcat(result
, "\n");
1409 st
= pSMgr
->suggest_morph(wspace
);
1415 st
= pSMgr
->suggest_morph(wspace
);
1417 if (*result
) strcat(result
, "\n");
1422 st
= pSMgr
->suggest_morph(wspace
);
1424 if (*result
) strcat(result
, "\n");
1433 if (result
&& (*result
)) {
1434 // word reversing wrapper for complex prefixes
1435 if (complexprefixes
) {
1436 if (utf8
) reverseword_utf(result
); else reverseword(result
);
1438 return mystrdup(result
);
1441 // compound word with dash (HU) I18n
1444 // LANG_hu section: set dash information for suggestions
1445 if (langnum
== LANG_hu
) dash
= (char *) strchr(cw
,'-');
1446 if ((langnum
== LANG_hu
) && dash
) {
1448 // examine 2 sides of the dash
1449 if (dash
[1] == '\0') { // base word ending with dash
1450 if (spell(cw
)) return pSMgr
->suggest_morph(cw
);
1451 } else if ((dash
[1] == 'e') && (dash
[2] == '\0')) { // XXX (HU) -e hat.
1452 if (spell(cw
) && (spell("-e"))) {
1453 st
= pSMgr
->suggest_morph(cw
);
1458 strcat(result
,"+"); // XXX spec. separator in MORPHCODE
1459 st
= pSMgr
->suggest_morph("-e");
1464 return mystrdup(result
);
1467 // first word ending with dash: word- XXX ???
1468 char r2
= *(dash
+ 1);
1471 nresult
= spell(cw
);
1474 if (nresult
&& spell(dash
+1) && ((strlen(dash
+1) > 1) ||
1475 ((dash
[1] > '0') && (dash
[1] < '9')))) {
1480 strcat(result
,"+"); // XXX spec. separator in MORPHCODE
1487 return mystrdup(result
);
1490 // affixed number in correct word
1491 if (nresult
&& (dash
> cw
) && (((*(dash
-1)<='9') &&
1492 (*(dash
-1)>='0')) || (*(dash
-1)=='.'))) {
1495 if (*(dash
- n
) == '.') n
++;
1496 // search first not a number character to left from dash
1497 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
1500 if ((dash
- n
) < cw
) n
--;
1501 // numbers: valami1000000-hoz
1502 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1504 for(; n
>= 1; n
--) {
1505 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') && checkword(dash
- n
, NULL
, NULL
)) {
1507 result
[dash
- cw
- n
] = '\0';
1508 st
= pSMgr
->suggest_morph(dash
- n
);
1513 return mystrdup(result
);
1521 // XXX need UTF-8 support
1522 char * Hunspell::morph_with_correction(const char * word
)
1524 char cw
[MAXWORDUTF8LEN
+ 4];
1525 char wspace
[MAXWORDUTF8LEN
+ 4];
1526 if (! pSMgr
) return 0;
1527 int wl
= strlen(word
);
1529 if (wl
>= MAXWORDUTF8LEN
) return 0;
1531 if (wl
>= MAXWORDLEN
) return 0;
1535 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1536 if (wl
== 0) return 0;
1538 char result
[MAXLNLEN
];
1546 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1552 memcpy(wspace
,cw
,wl
);
1554 *(wspace
+wl
+1) = '\0';
1555 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1557 if (*result
) strcat(result
, "\n");
1565 memcpy(wspace
,cw
,(wl
+1));
1567 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1572 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1574 if (*result
) strcat(result
, "\n");
1579 memcpy(wspace
,cw
,wl
);
1581 *(wspace
+wl
+1) = '\0';
1583 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1585 if (*result
) strcat(result
, "\n");
1590 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1592 if (*result
) strcat(result
, "\n");
1600 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1605 memcpy(wspace
,cw
,(wl
+1));
1607 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1609 if (*result
) strcat(result
, "\n");
1616 memcpy(wspace
,cw
,(wl
+1));
1617 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1623 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1625 if (*result
) strcat(result
, "\n");
1630 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1632 if (*result
) strcat(result
, "\n");
1637 memcpy(wspace
,cw
,(wl
+1));
1639 *(wspace
+wl
+1) = '\0';
1640 if (*result
) strcat(result
, "\n");
1641 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1647 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1649 if (*result
) strcat(result
, "\n");
1654 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1656 if (*result
) strcat(result
, "\n");
1665 if (result
) return mystrdup(result
);
1671 * XXX need a better data structure for morphological analysis */
1672 int Hunspell::analyze(char ***out
, const char *word
) {
1674 if (!word
) return 0;
1675 char * m
= morph(word
);
1677 if (!out
) return line_tok(m
, out
);
1679 // without memory allocation
1680 /* BUG missing buffer size checking */
1682 for(p
= 0, i
= 0; m
[i
]; i
++) {
1683 if(m
[i
] == '\n' || !m
[i
+1]) {
1685 strncpy((*out
)[n
++], m
+ p
, i
- p
+ 1);
1686 if (m
[i
] == '\n') (*out
)[n
++][i
- p
] = '\0';
1695 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1697 Hunhandle
*Hunspell_create(const char * affpath
, const char * dpath
)
1699 return (Hunhandle
*)(new Hunspell(affpath
, dpath
));
1702 void Hunspell_destroy(Hunhandle
*pHunspell
)
1704 delete (Hunspell
*)(pHunspell
);
1707 int Hunspell_spell(Hunhandle
*pHunspell
, const char *word
)
1709 return ((Hunspell
*)pHunspell
)->spell(word
);
1712 char *Hunspell_get_dic_encoding(Hunhandle
*pHunspell
)
1714 return ((Hunspell
*)pHunspell
)->get_dic_encoding();
1717 int Hunspell_suggest(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1719 return ((Hunspell
*)pHunspell
)->suggest(slst
, word
);