1
#include "license.hunspell"
2 #include "license.myspell"
14 #include "hunspell.hxx"
17 #ifndef MOZILLA_CLIENT
23 Hunspell::Hunspell(const char * affpath
, const char * dpath
)
30 /* first set up the hash manager */
31 pHMgr
= new HashMgr(dpath
, affpath
);
33 /* next set up the affix manager */
34 /* it needs access to the hash manager lookup methods */
35 pAMgr
= new AffixMgr(affpath
,pHMgr
);
37 /* get the preferred try string and the dictionary */
38 /* encoding from the Affix Manager for that dictionary */
39 char * try_string
= pAMgr
->get_try_string();
40 encoding
= pAMgr
->get_encoding();
41 csconv
= get_current_cs(encoding
);
42 langnum
= pAMgr
->get_langnum();
43 utf8
= pAMgr
->get_utf8();
44 complexprefixes
= pAMgr
->get_complexprefixes();
45 wordbreak
= pAMgr
->get_breaktable();
47 /* and finally set up the suggestion manager */
48 pSMgr
= new SuggestMgr(try_string
, MAXSUGGESTION
, pAMgr
);
49 if (try_string
) free(try_string
);
55 if (pSMgr
) delete pSMgr
;
56 if (pAMgr
) delete pAMgr
;
57 if (pHMgr
) delete pHMgr
;
62 if (encoding
) free(encoding
);
67 // make a copy of src at destination while removing all leading
68 // blanks and removing any trailing periods after recording
69 // their presence with the abbreviation flag
70 // also since already going through character by character,
71 // set the capitalization type
72 // return the length of the "cleaned" (and UTF-8 encoded) word
74 int Hunspell::cleanword2(char * dest
, const char * src
,
75 w_char
* dest_utf
, int * nc
, int * pcaptype
, int * pabbrev
)
77 unsigned char * p
= (unsigned char *) dest
;
78 const unsigned char * q
= (const unsigned char * ) src
;
81 // first skip over any leading blanks
82 while ((*q
!= '\0') && (*q
== ' ')) q
++;
84 // now strip off any trailing periods (recording their presence)
86 int nl
= strlen((const char *)q
);
87 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
92 // if no characters are left it can't be capitalized
99 // now determine the capitalization type of the first nl letters
107 if (csconv
[(*q
)].ccase
) ncap
++;
108 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
112 // remember to terminate the destination string
115 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
119 *nc
= u8_u16(dest_utf
, MAXWORDLEN
, (const char *) q
);
120 // don't check too long words
121 if (*nc
>= MAXWORDLEN
) return 0;
122 if (*nc
== -1) { // big Unicode character (non BMP area)
124 strcpy((char *) p
, (char *) q
);
128 for (int i
= 0; i
< *nc
; i
++) {
129 idx
= (dest_utf
[i
].h
<< 8) + dest_utf
[i
].l
;
130 if (idx
!= unicodetolower(idx
, langnum
)) ncap
++;
131 if (unicodetoupper(idx
, langnum
) == unicodetolower(idx
, langnum
)) nneutral
++;
133 u16_u8(dest
, MAXWORDUTF8LEN
, dest_utf
, *nc
);
135 idx
= (dest_utf
[0].h
<< 8) + dest_utf
[0].l
;
136 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
140 // now finally set the captype
143 } else if ((ncap
== 1) && firstcap
) {
145 } else if ((ncap
== *nc
) || ((ncap
+ nneutral
) == *nc
)) {
147 } else if ((ncap
> 1) && firstcap
) {
148 *pcaptype
= HUHINITCAP
;
155 int Hunspell::cleanword(char * dest
, const char * src
,
156 int * pcaptype
, int * pabbrev
)
158 unsigned char * p
= (unsigned char *) dest
;
159 const unsigned char * q
= (const unsigned char * ) src
;
162 // first skip over any leading blanks
163 while ((*q
!= '\0') && (*q
== ' ')) q
++;
165 // now strip off any trailing periods (recording their presence)
167 int nl
= strlen((const char *)q
);
168 while ((nl
> 0) && (*(q
+nl
-1)=='.')) {
173 // if no characters are left it can't be capitalized
180 // now determine the capitalization type of the first nl letters
188 if (csconv
[(*q
)].ccase
) ncap
++;
189 if (csconv
[(*q
)].cupper
== csconv
[(*q
)].clower
) nneutral
++;
193 // remember to terminate the destination string
195 firstcap
= csconv
[(unsigned char)(*dest
)].ccase
;
198 w_char t
[MAXWORDLEN
];
199 nc
= u8_u16(t
, MAXWORDLEN
, src
);
200 for (int i
= 0; i
< nc
; i
++) {
201 idx
= (t
[i
].h
<< 8) + t
[i
].l
;
202 if (idx
!= unicodetolower(idx
, langnum
)) ncap
++;
203 if (unicodetoupper(idx
, langnum
) == unicodetolower(idx
, langnum
)) nneutral
++;
205 u16_u8(dest
, MAXWORDUTF8LEN
, t
, nc
);
207 idx
= (t
[0].h
<< 8) + t
[0].l
;
208 firstcap
= (idx
!= unicodetolower(idx
, langnum
));
212 // now finally set the captype
215 } else if ((ncap
== 1) && firstcap
) {
217 } else if ((ncap
== nc
) || ((ncap
+ nneutral
) == nc
)){
219 } else if ((ncap
> 1) && firstcap
) {
220 *pcaptype
= HUHINITCAP
;
228 void Hunspell::mkallcap(char * p
)
231 w_char u
[MAXWORDLEN
];
232 int nc
= u8_u16(u
, MAXWORDLEN
, p
);
234 for (int i
= 0; i
< nc
; i
++) {
235 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
236 if (idx
!= unicodetoupper(idx
, langnum
)) {
237 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
238 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
241 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
244 *p
= csconv
[((unsigned char) *p
)].cupper
;
250 int Hunspell::mkallcap2(char * p
, w_char
* u
, int nc
)
254 for (int i
= 0; i
< nc
; i
++) {
255 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
256 if (idx
!= unicodetoupper(idx
, langnum
)) {
257 u
[i
].h
= (unsigned char) (unicodetoupper(idx
, langnum
) >> 8);
258 u
[i
].l
= (unsigned char) (unicodetoupper(idx
, langnum
) & 0x00FF);
261 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
265 *p
= csconv
[((unsigned char) *p
)].cupper
;
273 void Hunspell::mkallsmall(char * p
)
276 *p
= csconv
[((unsigned char) *p
)].clower
;
281 int Hunspell::mkallsmall2(char * p
, w_char
* u
, int nc
)
285 for (int i
= 0; i
< nc
; i
++) {
286 idx
= (u
[i
].h
<< 8) + u
[i
].l
;
287 if (idx
!= unicodetolower(idx
, langnum
)) {
288 u
[i
].h
= (unsigned char) (unicodetolower(idx
, langnum
) >> 8);
289 u
[i
].l
= (unsigned char) (unicodetolower(idx
, langnum
) & 0x00FF);
292 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
296 *p
= csconv
[((unsigned char) *p
)].clower
;
303 // convert UTF-8 sharp S codes to latin 1
304 char * Hunspell::sharps_u8_l1(char * dest
, char * source
) {
307 for (p
++, source
++; *(source
- 1); p
++, source
++) {
309 if (*source
== '?') *--p
= '?';
314 // recursive search for right ss-?permutations
315 hentry
* Hunspell::spellsharps(char * base
, char * pos
, int n
,
316 int repnum
, char * tmp
, int * info
, char **root
) {
317 pos
= strstr(pos
, "ss");
318 if (pos
&& (n
< MAXSHARPS
)) {
321 hentry
* h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
+ 1, tmp
, info
, root
);
325 h
= spellsharps(base
, pos
+ 2, n
+ 1, repnum
, tmp
, info
, root
);
327 } else if (repnum
> 0) {
328 if (utf8
) return checkword(base
, info
, root
);
329 return checkword(sharps_u8_l1(tmp
, base
), info
, root
);
334 int Hunspell::is_keepcase(const hentry
* rv
) {
335 return pAMgr
&& rv
->astr
&& pAMgr
->get_keepcase() &&
336 TESTAFF(rv
->astr
, pAMgr
->get_keepcase(), rv
->alen
);
339 /* check and insert a word to beginning of the suggestion array */
340 int Hunspell::insert_sug(char ***slst
, char * word
, int *ns
) {
342 if (*ns
== MAXSUGGESTION
) {
346 for (int k
= *ns
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
347 (*slst
)[0] = mystrdup(word
);
353 int Hunspell::spell(const char * word
, int * info
, char ** root
)
355 struct hentry
* rv
=NULL
;
356 // need larger vector. For example, Turkish capital letter I converted a
357 // 2-byte UTF-8 character (dotless i) by mkallsmall.
358 char cw
[MAXWORDUTF8LEN
+ 4];
359 char wspace
[MAXWORDUTF8LEN
+ 4];
360 w_char unicw
[MAXWORDLEN
+ 1];
361 int nc
= strlen(word
);
364 if (nc
>= MAXWORDUTF8LEN
) return 0;
366 if (nc
>= MAXWORDLEN
) return 0;
370 int wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
372 if (wl
== 0) return 1;
375 if (root
) *root
= NULL
;
377 // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.)
378 enum { NBEGIN
, NNUM
, NSEP
};
382 for (i
= 0; (i
< wl
); i
++) {
383 if ((cw
[i
] <= '9') && (cw
[i
] >= '0')) {
385 } else if ((cw
[i
] == ',') || (cw
[i
] == '.') || (cw
[i
] == '-')) {
386 if ((nstate
== NSEP
) || (i
== 0)) break;
390 if ((i
== wl
) && (nstate
== NNUM
)) return 1;
392 // LANG_hu section: number(s) + (percent or degree) with suffixes
393 if (langnum
== LANG_hu
) {
394 if ((nstate
== NNUM
) && ((cw
[i
] == '%') || (cw
[i
] == '?'))
395 && checkword(cw
+ i
, info
, root
)) return 1;
397 // END of LANG_hu section
403 rv
= checkword(cw
, info
, root
);
404 if ((abbv
) && !(rv
)) {
405 memcpy(wspace
,cw
,wl
);
407 *(wspace
+wl
+1) = '\0';
408 rv
= checkword(wspace
, info
, root
);
413 rv
= checkword(cw
, info
, root
);
416 memcpy(wspace
,cw
,wl
);
418 *(wspace
+wl
+1) = '\0';
419 rv
= checkword(wspace
, info
, root
);
422 if (pAMgr
&& pAMgr
->get_checksharps() && strstr(cw
, "SS")) {
423 char tmpword
[MAXWORDUTF8LEN
];
424 wl
= mkallsmall2(cw
, unicw
, nc
);
425 memcpy(wspace
,cw
,(wl
+1));
426 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
428 wl2
= mkinitcap2(cw
, unicw
, nc
);
429 rv
= spellsharps(cw
, cw
, 0, 0, tmpword
, info
, root
);
431 if ((abbv
) && !(rv
)) {
433 *(wspace
+wl
+1) = '\0';
434 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
436 memcpy(wspace
, cw
, wl2
);
438 *(wspace
+wl2
+1) = '\0';
439 rv
= spellsharps(wspace
, wspace
, 0, 0, tmpword
, info
, root
);
446 wl
= mkallsmall2(cw
, unicw
, nc
);
447 memcpy(wspace
,cw
,(wl
+1));
448 rv
= checkword(wspace
, info
, root
);
449 if (!rv
|| (is_keepcase(rv
) && !((captype
== INITCAP
) &&
450 // if CHECKSHARPS: KEEPCASE words with ?are allowed
451 // in INITCAP form, too.
452 pAMgr
->get_checksharps() && ((utf8
&& strstr(wspace
, "脽")) ||
453 (!utf8
&& strchr(wspace
, '?')))))) {
454 wl2
= mkinitcap2(cw
, unicw
, nc
);
455 rv
= checkword(cw
, info
, root
);
456 if (rv
&& (captype
== ALLCAP
) && is_keepcase(rv
)) rv
= NULL
;
460 *(wspace
+wl
+1) = '\0';
461 rv
= checkword(wspace
, info
, root
);
462 if (!rv
|| is_keepcase(rv
)) {
463 memcpy(wspace
, cw
, wl2
);
465 *(wspace
+wl2
+1) = '\0';
466 rv
= checkword(wspace
, info
, root
);
467 if (rv
&& ((captype
== ALLCAP
) && is_keepcase(rv
))) rv
= NULL
;
476 // recursive breaking at break points (not good for morphological analysis)
480 for (int j
= 0; j
< pAMgr
->get_numbreak(); j
++) {
481 s
=(char *) strstr(cw
, wordbreak
[j
]);
485 // examine 2 sides of the break point
486 if (spell(cw
) && spell(s
+ strlen(wordbreak
[j
]))) {
495 // LANG_hu: compoundings with dashes and n-dashes XXX deprecated!
496 if (langnum
== LANG_hu
) {
498 // compound word with dash (HU) I18n
502 dash
= (char *) strstr(cw
,"-");
503 if (dash
&& !wordbreak
) {
505 // examine 2 sides of the dash
506 if (spell(cw
) && spell(dash
+ 3)) {
512 dash
= (char *) strchr(cw
,'-');
515 // examine 2 sides of the dash
516 if (dash
[1] == '\0') { // base word ending with dash
517 if (spell(cw
)) return 1;
519 // first word ending with dash: word-
520 char r2
= *(dash
+ 1);
526 if (result
&& spell(dash
+1) && ((strlen(dash
+1) > 1) || (dash
[1] == 'e') ||
527 ((dash
[1] > '0') && (dash
[1] < '9')))) return 1;
529 // affixed number in correct word
530 if (result
&& (dash
> cw
) && (((*(dash
-1)<='9') && (*(dash
-1)>='0')) || (*(dash
-1)>='.'))) {
533 if (*(dash
- n
) == '.') n
++;
534 // search first not a number character to left from dash
535 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
538 if ((dash
- n
) < cw
) n
--;
539 // numbers: deprecated
541 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') &&
542 checkword(dash
- n
, info
, root
)) return 1;
550 //int Hunspell::spell(const char * word) {
551 // return spell(word, NULL, NULL);
554 struct hentry
* Hunspell::checkword(const char * w
, int * info
, char ** root
)
556 struct hentry
* he
= NULL
;
558 char w2
[MAXWORDUTF8LEN
];
561 char * ignoredchars
= pAMgr
->get_ignore();
562 if (ignoredchars
!= NULL
) {
565 int ignoredchars_utf16_len
;
566 unsigned short * ignoredchars_utf16
= pAMgr
->get_ignore_utf16(&ignoredchars_utf16_len
);
567 remove_ignored_chars_utf(w2
, ignoredchars_utf16
, ignoredchars_utf16_len
);
569 remove_ignored_chars(w2
,ignoredchars
);
575 // word reversing wrapper for complex prefixes
576 if (complexprefixes
) {
581 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
584 // look word in hash table
585 if (pHMgr
) he
= pHMgr
->lookup(word
);
587 // check forbidden and onlyincompound words
588 if ((he
) && (he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
589 info
+= SPELL_FORBIDDEN
;
590 // LANG_hu section: set dash information for suggestions
591 if (langnum
== LANG_hu
) {
592 if (pAMgr
->get_compoundflag() &&
593 TESTAFF(he
->astr
, pAMgr
->get_compoundflag(), he
->alen
)) {
594 info
+= SPELL_COMPOUND
;
600 // he = next not pseudoroot and not onlyincompound homonym or NULL
601 while (he
&& (he
->astr
) &&
602 ((pAMgr
->get_pseudoroot() && TESTAFF(he
->astr
, pAMgr
->get_pseudoroot(), he
->alen
)) ||
603 (pAMgr
->get_onlyincompound() && TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
))
604 )) he
= he
->next_homonym
;
606 // check with affixes
608 // try stripping off affixes */
610 he
= pAMgr
->affix_check(word
, len
, 0);
612 // check compound restriction
613 if (he
&& he
->astr
&& pAMgr
->get_onlyincompound() &&
614 TESTAFF(he
->astr
, pAMgr
->get_onlyincompound(), he
->alen
)) he
= NULL
;
617 if ((he
->astr
) && (pAMgr
) && TESTAFF(he
->astr
, pAMgr
->get_forbiddenword(), he
->alen
)) {
618 info
+= SPELL_FORBIDDEN
;
622 *root
= mystrdup(he
->word
);
623 if (complexprefixes
) {
624 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
627 // try check compound word
628 } else if (pAMgr
->get_compound()) {
629 he
= pAMgr
->compound_check(word
, len
,
630 0,0,100,0,NULL
,0,NULL
,NULL
,0);
631 // LANG_hu section: `moving rule' with last dash
632 if ((!he
) && (langnum
== LANG_hu
) && (word
[len
-1]=='-')) {
633 char * dup
= mystrdup(word
);
635 he
= pAMgr
->compound_check(dup
, len
-1,
636 -5,0,100,0,NULL
,1,NULL
,NULL
,0);
639 // end of LANG speficic region
642 *root
= mystrdup(he
->word
);
643 if (complexprefixes
) {
644 if (utf8
) reverseword_utf(*root
); else reverseword(*root
);
647 if (info
) *info
+= SPELL_COMPOUND
;
656 int Hunspell::suggest(char*** slst
, const char * word
)
658 char cw
[MAXWORDUTF8LEN
+ 4];
659 char wspace
[MAXWORDUTF8LEN
+ 4];
660 if (! pSMgr
) return 0;
661 w_char unicw
[MAXWORDLEN
+ 1];
662 int nc
= strlen(word
);
664 if (nc
>= MAXWORDUTF8LEN
) return 0;
666 if (nc
>= MAXWORDLEN
) return 0;
670 int wl
= cleanword2(cw
, word
, unicw
, &nc
, &captype
, &abbv
);
671 if (wl
== 0) return 0;
679 ns
= pSMgr
->suggest(slst
, cw
, ns
);
685 ns
= pSMgr
->suggest(slst
, cw
, ns
);
687 memcpy(wspace
,cw
,(wl
+1));
688 mkallsmall2(wspace
, unicw
, nc
);
689 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
695 ns
= pSMgr
->suggest(slst
, cw
, ns
);
698 if (captype
== HUHINITCAP
) {
699 // TheOpenOffice.org -> The OpenOffice.org
700 memcpy(wspace
,cw
,(wl
+1));
701 mkinitsmall2(wspace
, unicw
, nc
);
702 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
704 memcpy(wspace
,cw
,(wl
+1));
705 mkallsmall2(wspace
, unicw
, nc
);
706 insert_sug(slst
, wspace
, &ns
);
708 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
709 if (captype
== HUHINITCAP
) {
710 mkinitcap2(wspace
, unicw
, nc
);
711 insert_sug(slst
, wspace
, &ns
);
712 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
714 // aNew -> "a New" (instead of "a new")
715 for (int j
= prevns
; j
< ns
; j
++) {
716 char * space
= strchr((*slst
)[j
],' ');
718 int slen
= strlen(space
+ 1);
719 // different case after space (need capitalisation)
720 if ((slen
< wl
) && strcmp(cw
+ wl
- slen
, space
+ 1)) {
721 w_char w
[MAXWORDLEN
+ 1];
723 char * r
= (*slst
)[j
];
724 if (utf8
) wc
= u8_u16(w
, MAXWORDLEN
, space
+ 1);
725 mkinitcap2(space
+ 1, w
, wc
);
726 // set as first suggestion
727 for (int k
= j
; k
> 0; k
--) (*slst
)[k
] = (*slst
)[k
- 1];
737 memcpy(wspace
, cw
, (wl
+1));
738 mkallsmall2(wspace
, unicw
, nc
);
739 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
741 if (pAMgr
&& pAMgr
->get_keepcase()) insert_sug(slst
, wspace
, &ns
);
742 mkinitcap2(wspace
, unicw
, nc
);
743 ns
= pSMgr
->suggest(slst
, wspace
, ns
);
744 for (int j
=0; j
< ns
; j
++) {
745 mkallcap((*slst
)[j
]);
746 if (pAMgr
&& pAMgr
->get_checksharps()) {
749 pos
= strstr((*slst
)[j
], "脽");
753 pos
= strstr(pos
+2, "脽");
756 pos
= strchr((*slst
)[j
], '?');
758 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 2);
759 mystrrep((*slst
)[j
], "?", "SS");
760 pos
= strchr((*slst
)[j
], '?');
769 // LANG_hu section: replace '-' with ' ' in Hungarian
770 if (langnum
== LANG_hu
) {
771 for (int j
=0; j
< ns
; j
++) {
772 char * pos
= strchr((*slst
)[j
],'-');
775 char w
[MAXWORDUTF8LEN
];
777 strcpy(w
, (*slst
)[j
]);
779 spell(w
, &info
, NULL
);
780 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
786 // END OF LANG_hu section
788 // try ngram approach since found nothing
789 if ((ns
== 0) && pAMgr
&& (pAMgr
->get_maxngramsugs() != 0)) {
793 ns
= pSMgr
->ngsuggest(*slst
, cw
, pHMgr
);
797 memcpy(wspace
,cw
,(wl
+1));
798 mkallsmall2(wspace
, unicw
, nc
);
799 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
804 memcpy(wspace
,cw
,(wl
+1));
805 mkallsmall2(wspace
, unicw
, nc
);
806 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
810 memcpy(wspace
,cw
,(wl
+1));
811 mkallsmall2(wspace
, unicw
, nc
);
812 ns
= pSMgr
->ngsuggest(*slst
, wspace
, pHMgr
);
813 for (int j
=0; j
< ns
; j
++)
814 mkallcap((*slst
)[j
]);
820 // word reversing wrapper for complex prefixes
821 if (complexprefixes
) {
822 for (int j
= 0; j
< ns
; j
++) {
823 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
828 if (capwords
) for (int j
=0; j
< ns
; j
++) {
829 mkinitcap((*slst
)[j
]);
832 // expand suggestions with dot(s)
833 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
834 for (int j
= 0; j
< ns
; j
++) {
835 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
836 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
841 if (pAMgr
->get_keepcase()) {
846 for (int j
=0; j
< ns
; j
++) {
847 if (!spell((*slst
)[j
])) {
852 len
= u8_u16(w
, MAXSWL
, (*slst
)[j
]);
854 strcpy(s
, (*slst
)[j
]);
857 mkallsmall2(s
, w
, len
);
860 (*slst
)[l
] = mystrdup(s
);
863 mkinitcap2(s
, w
, len
);
865 (*slst
)[l
] = mystrdup(s
);
870 (*slst
)[l
] = (*slst
)[j
];
879 // remove duplications
881 for (int j
= 0; j
< ns
; j
++) {
882 (*slst
)[l
] = (*slst
)[j
];
883 for (int k
= 0; k
< l
; k
++) {
884 if (strcmp((*slst
)[k
], (*slst
)[j
]) == 0) {
894 char * Hunspell::get_dic_encoding()
899 #ifdef HUNSPELL_EXPERIMENTAL
900 // XXX need UTF-8 support
901 int Hunspell::suggest_auto(char*** slst
, const char * word
)
903 char cw
[MAXWORDUTF8LEN
+ 4];
904 char wspace
[MAXWORDUTF8LEN
+ 4];
905 if (! pSMgr
) return 0;
906 int wl
= strlen(word
);
908 if (wl
>= MAXWORDUTF8LEN
) return 0;
910 if (wl
>= MAXWORDLEN
) return 0;
914 wl
= cleanword(cw
, word
, &captype
, &abbv
);
915 if (wl
== 0) return 0;
917 *slst
= NULL
; // HU, nsug in pSMgr->suggest
921 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
927 memcpy(wspace
,cw
,(wl
+1));
929 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
930 for (int j
=0; j
< ns
; j
++)
931 mkinitcap((*slst
)[j
]);
932 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
938 ns
= pSMgr
->suggest_auto(slst
, cw
, ns
);
940 memcpy(wspace
,cw
,(wl
+1));
942 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
948 memcpy(wspace
,cw
,(wl
+1));
950 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
953 ns
= pSMgr
->suggest_auto(slst
, wspace
, ns
);
955 for (int j
=0; j
< ns
; j
++)
956 mkallcap((*slst
)[j
]);
961 // word reversing wrapper for complex prefixes
962 if (complexprefixes
) {
963 for (int j
= 0; j
< ns
; j
++) {
964 if (utf8
) reverseword_utf((*slst
)[j
]); else reverseword((*slst
)[j
]);
968 // expand suggestions with dot(s)
969 if (abbv
&& pAMgr
&& pAMgr
->get_sugswithdots()) {
970 for (int j
= 0; j
< ns
; j
++) {
971 (*slst
)[j
] = (char *) realloc((*slst
)[j
], strlen((*slst
)[j
]) + 1 + abbv
);
972 strcat((*slst
)[j
], word
+ strlen(word
) - abbv
);
976 // LANG_hu section: replace '-' with ' ' in Hungarian
977 if (langnum
== LANG_hu
) {
978 for (int j
=0; j
< ns
; j
++) {
979 char * pos
= strchr((*slst
)[j
],'-');
982 char w
[MAXWORDUTF8LEN
];
984 strcpy(w
, (*slst
)[j
]);
986 spell(w
, &info
, NULL
);
987 if ((info
& SPELL_COMPOUND
) && (info
& SPELL_FORBIDDEN
)) {
993 // END OF LANG_hu section
997 // XXX need UTF-8 support
998 int Hunspell::stem(char*** slst
, const char * word
)
1000 char cw
[MAXWORDUTF8LEN
+ 4];
1001 char wspace
[MAXWORDUTF8LEN
+ 4];
1002 if (! pSMgr
) return 0;
1003 int wl
= strlen(word
);
1005 if (wl
>= MAXWORDUTF8LEN
) return 0;
1007 if (wl
>= MAXWORDLEN
) return 0;
1011 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1012 if (wl
== 0) return 0;
1016 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1021 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1023 if ((abbv
) && (ns
== 0)) {
1024 memcpy(wspace
,cw
,wl
);
1026 *(wspace
+wl
+1) = '\0';
1027 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1035 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1038 memcpy(wspace
,cw
,(wl
+1));
1040 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1044 if ((abbv
) && (ns
== 0)) {
1045 memcpy(wspace
,cw
,wl
);
1048 *(wspace
+wl
+1) = '\0';
1049 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1057 ns
= pSMgr
->suggest_stems(slst
, cw
, ns
);
1060 memcpy(wspace
,cw
,(wl
+1));
1062 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1066 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1069 if ((abbv
) && (ns
== 0)) {
1070 memcpy(wspace
,cw
,wl
);
1073 *(wspace
+wl
+1) = '\0';
1074 ns
= pSMgr
->suggest_stems(slst
, wspace
, ns
);
1085 int Hunspell::suggest_pos_stems(char*** slst
, const char * word
)
1087 char cw
[MAXWORDUTF8LEN
+ 4];
1088 char wspace
[MAXWORDUTF8LEN
+ 4];
1089 if (! pSMgr
) return 0;
1090 int wl
= strlen(word
);
1092 if (wl
>= MAXWORDUTF8LEN
) return 0;
1094 if (wl
>= MAXWORDLEN
) return 0;
1098 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1099 if (wl
== 0) return 0;
1101 int ns
= 0; // ns=0 = normalized input
1103 *slst
= NULL
; // HU, nsug in pSMgr->suggest
1108 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1110 if ((abbv
) && (ns
== 0)) {
1111 memcpy(wspace
,cw
,wl
);
1113 *(wspace
+wl
+1) = '\0';
1114 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1122 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1124 if (ns
== 0 || ((*slst
)[0][0] == '#')) {
1125 memcpy(wspace
,cw
,(wl
+1));
1127 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1135 ns
= pSMgr
->suggest_pos_stems(slst
, cw
, ns
);
1138 memcpy(wspace
,cw
,(wl
+1));
1140 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1144 ns
= pSMgr
->suggest_pos_stems(slst
, wspace
, ns
);
1152 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1154 const char * Hunspell::get_wordchars()
1156 return pAMgr
->get_wordchars();
1159 unsigned short * Hunspell::get_wordchars_utf16(int * len
)
1161 return pAMgr
->get_wordchars_utf16(len
);
1164 void Hunspell::mkinitcap(char * p
)
1167 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1170 w_char u
[MAXWORDLEN
];
1171 len
= u8_u16(u
, MAXWORDLEN
, p
);
1172 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1173 u
[0].h
= (unsigned char) (i
>> 8);
1174 u
[0].l
= (unsigned char) (i
& 0x00FF);
1175 u16_u8(p
, MAXWORDUTF8LEN
, u
, len
);
1179 int Hunspell::mkinitcap2(char * p
, w_char
* u
, int nc
)
1182 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].cupper
;
1183 } else if (nc
> 0) {
1184 unsigned short i
= unicodetoupper((u
[0].h
<< 8) + u
[0].l
, langnum
);
1185 u
[0].h
= (unsigned char) (i
>> 8);
1186 u
[0].l
= (unsigned char) (i
& 0x00FF);
1187 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1193 int Hunspell::mkinitsmall2(char * p
, w_char
* u
, int nc
)
1196 if (*p
!= '\0') *p
= csconv
[((unsigned char)*p
)].clower
;
1197 } else if (nc
> 0) {
1198 unsigned short i
= unicodetolower((u
[0].h
<< 8) + u
[0].l
, langnum
);
1199 u
[0].h
= (unsigned char) (i
>> 8);
1200 u
[0].l
= (unsigned char) (i
& 0x00FF);
1201 u16_u8(p
, MAXWORDUTF8LEN
, u
, nc
);
1207 int Hunspell::put_word(const char * word
)
1210 return pHMgr
->put_word(word
, strlen(word
), NULL
);
1215 int Hunspell::put_word_pattern(const char * word
, const char * pattern
)
1218 return pHMgr
->put_word_pattern(word
, strlen(word
), pattern
);
1223 const char * Hunspell::get_version()
1225 return pAMgr
->get_version();
1228 struct cs_info
* Hunspell::get_csconv()
1233 #ifdef HUNSPELL_EXPERIMENTAL
1234 // XXX need UTF-8 support
1235 char * Hunspell::morph(const char * word
)
1237 char cw
[MAXWORDUTF8LEN
+ 4];
1238 char wspace
[MAXWORDUTF8LEN
+ 4];
1239 if (! pSMgr
) return 0;
1240 int wl
= strlen(word
);
1242 if (wl
>= MAXWORDUTF8LEN
) return 0;
1244 if (wl
>= MAXWORDLEN
) return 0;
1248 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1251 for (wl
= 0; wl
< abbv
; wl
++) cw
[wl
] = '.';
1257 char result
[MAXLNLEN
];
1267 // LANG_hu section: set dash information for suggestions
1268 if (langnum
== LANG_hu
) {
1270 (((cw
[n
] <= '9') && (cw
[n
] >= '0')) || (((cw
[n
] == '.') || (cw
[n
] == ',')) && (n
> 0)))) {
1272 if ((cw
[n
] == '.') || (cw
[n
] == ',')) {
1273 if (((n2
== 0) && (n
> 3)) ||
1274 ((n2
> 0) && ((cw
[n
-1] == '.') || (cw
[n
-1] == ',')))) break;
1280 if ((n
== wl
) && (n3
> 0) && (n
- n3
> 3)) return NULL
;
1281 if ((n
== wl
) || ((n
>0) && ((cw
[n
]=='%') || (cw
[n
]=='?)) && checkword(cw+n, NULL, NULL))) {
1283 result[n - 1] = '\
0';
1285 st
= pSMgr
->suggest_morph(cw
+ n
- 1);
1293 st
= pSMgr
->suggest_morph(cw
+ n
- 1);
1298 strcat(result
, "+"); // XXX SPEC. MORPHCODE
1300 st
= pSMgr
->suggest_morph(cw
+ n
);
1306 return mystrdup(result
);
1309 // END OF LANG_hu section
1313 st
= pSMgr
->suggest_morph(cw
);
1319 memcpy(wspace
,cw
,wl
);
1321 *(wspace
+wl
+1) = '\0';
1322 st
= pSMgr
->suggest_morph(wspace
);
1324 if (*result
) strcat(result
, "\n");
1332 memcpy(wspace
,cw
,(wl
+1));
1334 st
= pSMgr
->suggest_morph(wspace
);
1339 st
= pSMgr
->suggest_morph(cw
);
1341 if (*result
) strcat(result
, "\n");
1346 memcpy(wspace
,cw
,wl
);
1348 *(wspace
+wl
+1) = '\0';
1350 st
= pSMgr
->suggest_morph(wspace
);
1352 if (*result
) strcat(result
, "\n");
1357 st
= pSMgr
->suggest_morph(wspace
);
1359 if (*result
) strcat(result
, "\n");
1367 st
= pSMgr
->suggest_morph(cw
);
1373 memcpy(wspace
,cw
,(wl
+1));
1375 st
= pSMgr
->suggest_morph(wspace
);
1377 if (*result
) strcat(result
, "\n");
1385 memcpy(wspace
,cw
,(wl
+1));
1386 st
= pSMgr
->suggest_morph(wspace
);
1392 st
= pSMgr
->suggest_morph(wspace
);
1394 if (*result
) strcat(result
, "\n");
1399 st
= pSMgr
->suggest_morph(wspace
);
1401 if (*result
) strcat(result
, "\n");
1406 memcpy(wspace
,cw
,(wl
+1));
1408 *(wspace
+wl
+1) = '\0';
1409 if (*result
) strcat(result
, "\n");
1410 st
= pSMgr
->suggest_morph(wspace
);
1416 st
= pSMgr
->suggest_morph(wspace
);
1418 if (*result
) strcat(result
, "\n");
1423 st
= pSMgr
->suggest_morph(wspace
);
1425 if (*result
) strcat(result
, "\n");
1434 if (result
&& (*result
)) {
1435 // word reversing wrapper for complex prefixes
1436 if (complexprefixes
) {
1437 if (utf8
) reverseword_utf(result
); else reverseword(result
);
1439 return mystrdup(result
);
1442 // compound word with dash (HU) I18n
1445 // LANG_hu section: set dash information for suggestions
1446 if (langnum
== LANG_hu
) dash
= (char *) strchr(cw
,'-');
1447 if ((langnum
== LANG_hu
) && dash
) {
1449 // examine 2 sides of the dash
1450 if (dash
[1] == '\0') { // base word ending with dash
1451 if (spell(cw
)) return pSMgr
->suggest_morph(cw
);
1452 } else if ((dash
[1] == 'e') && (dash
[2] == '\0')) { // XXX (HU) -e hat.
1453 if (spell(cw
) && (spell("-e"))) {
1454 st
= pSMgr
->suggest_morph(cw
);
1459 strcat(result
,"+"); // XXX spec. separator in MORPHCODE
1460 st
= pSMgr
->suggest_morph("-e");
1465 return mystrdup(result
);
1468 // first word ending with dash: word- XXX ???
1469 char r2
= *(dash
+ 1);
1472 nresult
= spell(cw
);
1475 if (nresult
&& spell(dash
+1) && ((strlen(dash
+1) > 1) ||
1476 ((dash
[1] > '0') && (dash
[1] < '9')))) {
1481 strcat(result
,"+"); // XXX spec. separator in MORPHCODE
1488 return mystrdup(result
);
1491 // affixed number in correct word
1492 if (nresult
&& (dash
> cw
) && (((*(dash
-1)<='9') &&
1493 (*(dash
-1)>='0')) || (*(dash
-1)=='.'))) {
1496 if (*(dash
- n
) == '.') n
++;
1497 // search first not a number character to left from dash
1498 while (((dash
- n
)>=cw
) && ((*(dash
- n
)=='0') || (n
< 3)) && (n
< 6)) {
1501 if ((dash
- n
) < cw
) n
--;
1502 // numbers: valami1000000-hoz
1503 // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
1505 for(; n
>= 1; n
--) {
1506 if ((*(dash
- n
) >= '0') && (*(dash
- n
) <= '9') && checkword(dash
- n
, NULL
, NULL
)) {
1508 result
[dash
- cw
- n
] = '\0';
1509 st
= pSMgr
->suggest_morph(dash
- n
);
1514 return mystrdup(result
);
1522 // XXX need UTF-8 support
1523 char * Hunspell::morph_with_correction(const char * word
)
1525 char cw
[MAXWORDUTF8LEN
+ 4];
1526 char wspace
[MAXWORDUTF8LEN
+ 4];
1527 if (! pSMgr
) return 0;
1528 int wl
= strlen(word
);
1530 if (wl
>= MAXWORDUTF8LEN
) return 0;
1532 if (wl
>= MAXWORDLEN
) return 0;
1536 wl
= cleanword(cw
, word
, &captype
, &abbv
);
1537 if (wl
== 0) return 0;
1539 char result
[MAXLNLEN
];
1547 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1553 memcpy(wspace
,cw
,wl
);
1555 *(wspace
+wl
+1) = '\0';
1556 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1558 if (*result
) strcat(result
, "\n");
1566 memcpy(wspace
,cw
,(wl
+1));
1568 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1573 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1575 if (*result
) strcat(result
, "\n");
1580 memcpy(wspace
,cw
,wl
);
1582 *(wspace
+wl
+1) = '\0';
1584 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1586 if (*result
) strcat(result
, "\n");
1591 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1593 if (*result
) strcat(result
, "\n");
1601 st
= pSMgr
->suggest_morph_for_spelling_error(cw
);
1606 memcpy(wspace
,cw
,(wl
+1));
1608 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1610 if (*result
) strcat(result
, "\n");
1617 memcpy(wspace
,cw
,(wl
+1));
1618 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1624 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1626 if (*result
) strcat(result
, "\n");
1631 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1633 if (*result
) strcat(result
, "\n");
1638 memcpy(wspace
,cw
,(wl
+1));
1640 *(wspace
+wl
+1) = '\0';
1641 if (*result
) strcat(result
, "\n");
1642 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1648 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1650 if (*result
) strcat(result
, "\n");
1655 st
= pSMgr
->suggest_morph_for_spelling_error(wspace
);
1657 if (*result
) strcat(result
, "\n");
1666 if (result
) return mystrdup(result
);
1672 * XXX need a better data structure for morphological analysis */
1673 int Hunspell::analyze(char ***out
, const char *word
) {
1675 if (!word
) return 0;
1676 char * m
= morph(word
);
1678 if (!out
) return line_tok(m
, out
);
1680 // without memory allocation
1681 /* BUG missing buffer size checking */
1683 for(p
= 0, i
= 0; m
[i
]; i
++) {
1684 if(m
[i
] == '\n' || !m
[i
+1]) {
1686 strncpy((*out
)[n
++], m
+ p
, i
- p
+ 1);
1687 if (m
[i
] == '\n') (*out
)[n
++][i
- p
] = '\0';
1696 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1698 Hunhandle
*Hunspell_create(const char * affpath
, const char * dpath
)
1700 return (Hunhandle
*)(new Hunspell(affpath
, dpath
));
1703 void Hunspell_destroy(Hunhandle
*pHunspell
)
1705 delete (Hunspell
*)(pHunspell
);
1708 int Hunspell_spell(Hunhandle
*pHunspell
, const char *word
)
1710 return ((Hunspell
*)pHunspell
)->spell(word
);
1713 char *Hunspell_get_dic_encoding(Hunhandle
*pHunspell
)
1715 return ((Hunspell
*)pHunspell
)->get_dic_encoding();
1718 int Hunspell_suggest(Hunhandle
*pHunspell
, char*** slst
, const char * word
)
1720 return ((Hunspell
*)pHunspell
)->suggest(slst
, word
);