1 #include "license.hunspell"
2 #include "license.myspell"
9 #include "suggestmgr.hxx"
13 const w_char W_VLINE
= { '\0', '|' };
15 SuggestMgr::SuggestMgr(const char * tryme
, int maxn
,
19 // register affix manager and check in string of chars to
20 // try when building candidate suggestions
23 csconv
= get_current_cs(SPELL_ENCODING
);
39 maxngramsugs
= MAXNGRAMSUGS
;
40 maxcpdsugs
= MAXCOMPOUNDSUGS
;
43 langnum
= pAMgr
->get_langnum();
44 ckey
= pAMgr
->get_key_string();
45 nosplitsugs
= pAMgr
->get_nosplitsugs();
46 if (pAMgr
->get_maxngramsugs() >= 0)
47 maxngramsugs
= pAMgr
->get_maxngramsugs();
48 utf8
= pAMgr
->get_utf8();
49 if (pAMgr
->get_maxcpdsugs() >= 0)
50 maxcpdsugs
= pAMgr
->get_maxcpdsugs();
53 char * enc
= pAMgr
->get_encoding();
54 csconv
= get_current_cs(enc
);
57 complexprefixes
= pAMgr
->get_complexprefixes();
63 ckeyl
= u8_u16(t
, MAXSWL
, ckey
);
64 ckey_utf
= (w_char
*) malloc(ckeyl
* sizeof(w_char
));
65 if (ckey_utf
) memcpy(ckey_utf
, t
, ckeyl
* sizeof(w_char
));
73 ctry
= mystrdup(tryme
);
74 if (ctry
) ctryl
= strlen(ctry
);
77 ctryl
= u8_u16(t
, MAXSWL
, tryme
);
78 ctry_utf
= (w_char
*) malloc(ctryl
* sizeof(w_char
));
79 if (ctry_utf
) memcpy(ctry_utf
, t
, ctryl
* sizeof(w_char
));
86 SuggestMgr::~SuggestMgr()
91 if (ckey_utf
) free(ckey_utf
);
96 if (ctry_utf
) free(ctry_utf
);
100 #ifdef MOZILLA_CLIENT
105 int SuggestMgr::testsug(char** wlst
, const char * candidate
, int wl
, int ns
, int cpdsuggest
,
106 int * timer
, clock_t * timelimit
) {
108 if (ns
== maxSug
) return maxSug
;
109 for (int k
=0; k
< ns
; k
++) {
110 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
112 if ((cwrd
) && checkword(candidate
, wl
, cpdsuggest
, timer
, timelimit
)) {
113 wlst
[ns
] = mystrdup(candidate
);
114 if (wlst
[ns
] == NULL
) {
115 for (int j
=0; j
<ns
; j
++) free(wlst
[j
]);
123 // generate suggestions for a misspelled word
124 // pass in address of array of char * pointers
125 // onlycompoundsug: probably bad suggestions (needed for ngram sugs, too)
127 int SuggestMgr::suggest(char*** slst
, const char * w
, int nsug
,
128 int * onlycompoundsug
)
130 int nocompoundtwowords
= 0;
132 w_char word_utf
[MAXSWL
];
135 char w2
[MAXWORDUTF8LEN
];
136 const char * word
= w
;
139 // word reversing wrapper for complex prefixes
140 if (complexprefixes
) {
142 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
149 wlst
= (char **) malloc(maxSug
* sizeof(char *));
150 if (wlst
== NULL
) return -1;
151 for (int i
= 0; i
< maxSug
; i
++) {
157 wl
= u8_u16(word_utf
, MAXSWL
, word
);
164 for (int cpdsuggest
=0; (cpdsuggest
<2) && (nocompoundtwowords
==0); cpdsuggest
++) {
166 // limit compound suggestion
167 if (cpdsuggest
> 0) oldSug
= nsug
;
169 // suggestions for an uppercase word (html -> HTML)
170 if ((nsug
< maxSug
) && (nsug
> -1)) {
171 nsug
= (utf8
) ? capchars_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
172 capchars(wlst
, word
, nsug
, cpdsuggest
);
175 // perhaps we made a typical fault of spelling
176 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
177 nsug
= replchars(wlst
, word
, nsug
, cpdsuggest
);
180 // perhaps we chose the wrong char from a related set
181 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
182 nsug
= mapchars(wlst
, word
, nsug
, cpdsuggest
);
185 // only suggest compound words when no other suggestion
186 if ((cpdsuggest
== 0) && (nsug
> nsugorig
)) nocompoundtwowords
=1;
188 // did we swap the order of chars by mistake
189 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
190 nsug
= (utf8
) ? swapchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
191 swapchar(wlst
, word
, nsug
, cpdsuggest
);
194 // did we swap the order of non adjacent chars by mistake
195 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
196 nsug
= (utf8
) ? longswapchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
197 longswapchar(wlst
, word
, nsug
, cpdsuggest
);
200 // did we just hit the wrong key in place of a good char (case and keyboard)
201 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
202 nsug
= (utf8
) ? badcharkey_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
203 badcharkey(wlst
, word
, nsug
, cpdsuggest
);
206 // did we add a char that should not be there
207 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
208 nsug
= (utf8
) ? extrachar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
209 extrachar(wlst
, word
, nsug
, cpdsuggest
);
213 // did we forget a char
214 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
215 nsug
= (utf8
) ? forgotchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
216 forgotchar(wlst
, word
, nsug
, cpdsuggest
);
219 // did we move a char
220 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
221 nsug
= (utf8
) ? movechar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
222 movechar(wlst
, word
, nsug
, cpdsuggest
);
225 // did we just hit the wrong key in place of a good char
226 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
227 nsug
= (utf8
) ? badchar_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
228 badchar(wlst
, word
, nsug
, cpdsuggest
);
231 // did we double two characters
232 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
233 nsug
= (utf8
) ? doubletwochars_utf(wlst
, word_utf
, wl
, nsug
, cpdsuggest
) :
234 doubletwochars(wlst
, word
, nsug
, cpdsuggest
);
237 // perhaps we forgot to hit space and two words ran together
238 if (!nosplitsugs
&& (nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
))) {
239 nsug
= twowords(wlst
, word
, nsug
, cpdsuggest
);
242 } // repeating ``for'' statement compounding support
245 // we ran out of memory - we should free up as much as possible
246 for (int i
= 0; i
< maxSug
; i
++)
247 if (wlst
[i
] != NULL
) free(wlst
[i
]);
252 if (!nocompoundtwowords
&& (nsug
> 0) && onlycompoundsug
) *onlycompoundsug
= 1;
258 // generate suggestions for a word with typical mistake
259 // pass in address of array of char * pointers
260 #ifdef HUNSPELL_EXPERIMENTAL
261 int SuggestMgr::suggest_auto(char*** slst
, const char * w
, int nsug
)
263 int nocompoundtwowords
= 0;
267 char w2
[MAXWORDUTF8LEN
];
268 const char * word
= w
;
270 // word reversing wrapper for complex prefixes
271 if (complexprefixes
) {
273 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
280 wlst
= (char **) malloc(maxSug
* sizeof(char *));
281 if (wlst
== NULL
) return -1;
284 for (int cpdsuggest
=0; (cpdsuggest
<2) && (nocompoundtwowords
==0); cpdsuggest
++) {
286 // limit compound suggestion
287 if (cpdsuggest
> 0) oldSug
= nsug
;
289 // perhaps we made a typical fault of spelling
290 if ((nsug
< maxSug
) && (nsug
> -1))
291 nsug
= replchars(wlst
, word
, nsug
, cpdsuggest
);
293 // perhaps we chose the wrong char from a related set
294 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
)))
295 nsug
= mapchars(wlst
, word
, nsug
, cpdsuggest
);
297 if ((cpdsuggest
==0) && (nsug
>0)) nocompoundtwowords
=1;
299 // perhaps we forgot to hit space and two words ran together
301 if ((nsug
< maxSug
) && (nsug
> -1) && (!cpdsuggest
|| (nsug
< oldSug
+ maxcpdsugs
)) && check_forbidden(word
, strlen(word
))) {
302 nsug
= twowords(wlst
, word
, nsug
, cpdsuggest
);
305 } // repeating ``for'' statement compounding support
308 for (int i
=0;i
<maxSug
; i
++)
309 if (wlst
[i
] != NULL
) free(wlst
[i
]);
317 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
319 // suggestions for an uppercase word (html -> HTML)
320 int SuggestMgr::capchars_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
322 char candidate
[MAXSWUTF8L
];
323 w_char candidate_utf
[MAXSWL
];
324 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
325 mkallcap_utf(candidate_utf
, wl
, langnum
);
326 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
327 return testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
330 // suggestions for an uppercase word (html -> HTML)
331 int SuggestMgr::capchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
333 char candidate
[MAXSWUTF8L
];
334 strcpy(candidate
, word
);
335 mkallcap(candidate
, csconv
);
336 return testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
339 // suggestions for when chose the wrong char out of a related set
340 int SuggestMgr::mapchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
342 char candidate
[MAXSWUTF8L
];
347 int wl
= strlen(word
);
348 if (wl
< 2 || ! pAMgr
) return ns
;
350 int nummap
= pAMgr
->get_nummap();
351 struct mapentry
* maptable
= pAMgr
->get_maptable();
352 if (maptable
==NULL
) return ns
;
356 return map_related(word
, (char *) &candidate
, 0, 0, wlst
, cpdsuggest
, ns
, maptable
, nummap
, &timer
, &timelimit
);
359 int SuggestMgr::map_related(const char * word
, char * candidate
, int wn
, int cn
,
360 char** wlst
, int cpdsuggest
, int ns
,
361 const mapentry
* maptable
, int nummap
, int * timer
, clock_t * timelimit
)
363 if (*(word
+ wn
) == '\0') {
365 *(candidate
+ cn
) = '\0';
366 int wl
= strlen(candidate
);
367 for (int m
=0; m
< ns
; m
++)
368 if (strcmp(candidate
, wlst
[m
]) == 0) cwrd
= 0;
369 if ((cwrd
) && checkword(candidate
, wl
, cpdsuggest
, timer
, timelimit
)) {
371 wlst
[ns
] = mystrdup(candidate
);
372 if (wlst
[ns
] == NULL
) return -1;
379 for (int j
= 0; j
< nummap
; j
++) {
380 for (int k
= 0; k
< maptable
[j
].len
; k
++) {
381 int len
= strlen(maptable
[j
].set
[k
]);
382 if (strncmp(maptable
[j
].set
[k
], word
+ wn
, len
) == 0) {
384 for (int l
= 0; l
< maptable
[j
].len
; l
++) {
385 strcpy(candidate
+ cn
, maptable
[j
].set
[l
]);
386 ns
= map_related(word
, candidate
, wn
+ len
, strlen(candidate
), wlst
,
387 cpdsuggest
, ns
, maptable
, nummap
, timer
, timelimit
);
388 if (!(*timer
)) return ns
;
394 *(candidate
+ cn
) = *(word
+ wn
);
395 ns
= map_related(word
, candidate
, wn
+ 1, cn
+ 1, wlst
, cpdsuggest
,
396 ns
, maptable
, nummap
, timer
, timelimit
);
401 // suggestions for a typical spelling fault that
402 // differs by more than 1 letter from the correct form.
403 int SuggestMgr::replchars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
405 char candidate
[MAXSWUTF8L
];
408 int wl
= strlen(word
);
409 if (wl
< 2 || ! pAMgr
) return ns
;
410 int numrep
= pAMgr
->get_numrep();
411 struct replentry
* reptable
= pAMgr
->get_reptable();
412 if (reptable
==NULL
) return ns
;
413 for (int i
=0; i
< numrep
; i
++ ) {
415 lenr
= strlen(reptable
[i
].pattern2
);
416 lenp
= strlen(reptable
[i
].pattern
);
417 // search every occurrence of the pattern in the word
418 while ((r
=strstr(r
, reptable
[i
].pattern
)) != NULL
&& (!reptable
[i
].end
|| strlen(r
) == strlen(reptable
[i
].pattern
)) &&
419 (!reptable
[i
].start
|| r
== word
)) {
420 strcpy(candidate
, word
);
421 if (r
-word
+ lenr
+ strlen(r
+lenp
) >= MAXSWUTF8L
) break;
422 strcpy(candidate
+(r
-word
),reptable
[i
].pattern2
);
423 strcpy(candidate
+(r
-word
)+lenr
, r
+lenp
);
424 ns
= testsug(wlst
, candidate
, wl
-lenp
+lenr
, ns
, cpdsuggest
, NULL
, NULL
);
425 if (ns
== -1) return -1;
426 // check REP suggestions with space
427 char * sp
= strchr(candidate
, ' ');
429 char * prev
= candidate
;
432 if (checkword(prev
, strlen(prev
), 0, NULL
, NULL
)) {
435 ns
= testsug(wlst
, sp
+ 1, strlen(sp
+ 1), ns
, cpdsuggest
, NULL
, NULL
);
436 if (ns
== -1) return -1;
439 wlst
[ns
- 1] = mystrdup(candidate
);
440 if (!wlst
[ns
- 1]) return -1;
445 sp
= strchr(prev
, ' ');
448 r
++; // search for the next letter
454 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
455 int SuggestMgr::doubletwochars(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
457 char candidate
[MAXSWUTF8L
];
459 int wl
= strlen(word
);
460 if (wl
< 5 || ! pAMgr
) return ns
;
461 for (int i
=2; i
< wl
; i
++ ) {
462 if (word
[i
]==word
[i
-2]) {
465 strcpy(candidate
,word
);
466 strcpy(candidate
+i
-1,word
+i
+1);
467 ns
= testsug(wlst
, candidate
, wl
-2, ns
, cpdsuggest
, NULL
, NULL
);
468 if (ns
== -1) return -1;
478 // perhaps we doubled two characters (pattern aba -> ababa, for example vacation -> vacacation)
479 int SuggestMgr::doubletwochars_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
481 w_char candidate_utf
[MAXSWL
];
482 char candidate
[MAXSWUTF8L
];
484 if (wl
< 5 || ! pAMgr
) return ns
;
485 for (int i
=2; i
< wl
; i
++) {
486 if (w_char_eq(word
[i
], word
[i
-2])) {
489 memcpy(candidate_utf
, word
, (i
- 1) * sizeof(w_char
));
490 memcpy(candidate_utf
+i
-1, word
+i
+1, (wl
-i
-1) * sizeof(w_char
));
491 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
-2);
492 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
493 if (ns
== -1) return -1;
503 // error is wrong char in place of correct one (case and keyboard related version)
504 int SuggestMgr::badcharkey(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
507 char candidate
[MAXSWUTF8L
];
508 int wl
= strlen(word
);
509 strcpy(candidate
, word
);
510 // swap out each char one by one and try uppercase and neighbor
511 // keyboard chars in its place to see if that makes a good word
513 for (int i
=0; i
< wl
; i
++) {
515 // check with uppercase letters
516 candidate
[i
] = csconv
[((unsigned char)tmpc
)].cupper
;
517 if (tmpc
!= candidate
[i
]) {
518 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
519 if (ns
== -1) return -1;
522 // check neighbor characters in keyboard string
524 char * loc
= strchr(ckey
, tmpc
);
526 if ((loc
> ckey
) && (*(loc
- 1) != '|')) {
527 candidate
[i
] = *(loc
- 1);
528 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
529 if (ns
== -1) return -1;
531 if ((*(loc
+ 1) != '|') && (*(loc
+ 1) != '\0')) {
532 candidate
[i
] = *(loc
+ 1);
533 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
534 if (ns
== -1) return -1;
536 loc
= strchr(loc
+ 1, tmpc
);
543 // error is wrong char in place of correct one (case and keyboard related version)
544 int SuggestMgr::badcharkey_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
547 w_char candidate_utf
[MAXSWL
];
548 char candidate
[MAXSWUTF8L
];
549 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
550 // swap out each char one by one and try uppercase and neighbor
551 // keyboard chars in its place to see if that makes a good word
552 for (int i
=0; i
< wl
; i
++) {
553 tmpc
= candidate_utf
[i
];
554 // check with uppercase letters
555 mkallcap_utf(candidate_utf
+ i
, 1, langnum
);
556 if (!w_char_eq(tmpc
, candidate_utf
[i
])) {
557 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
558 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
559 if (ns
== -1) return -1;
560 candidate_utf
[i
] = tmpc
;
562 // check neighbor characters in keyboard string
564 w_char
* loc
= ckey_utf
;
565 while ((loc
< (ckey_utf
+ ckeyl
)) && !w_char_eq(*loc
, tmpc
)) loc
++;
566 while (loc
< (ckey_utf
+ ckeyl
)) {
567 if ((loc
> ckey_utf
) && !w_char_eq(*(loc
- 1), W_VLINE
)) {
568 candidate_utf
[i
] = *(loc
- 1);
569 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
570 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
571 if (ns
== -1) return -1;
573 if (((loc
+ 1) < (ckey_utf
+ ckeyl
)) && !w_char_eq(*(loc
+ 1), W_VLINE
)) {
574 candidate_utf
[i
] = *(loc
+ 1);
575 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
576 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
577 if (ns
== -1) return -1;
579 do { loc
++; } while ((loc
< (ckey_utf
+ ckeyl
)) && !w_char_eq(*loc
, tmpc
));
581 candidate_utf
[i
] = tmpc
;
586 // error is wrong char in place of correct one
587 int SuggestMgr::badchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
590 char candidate
[MAXSWUTF8L
];
591 clock_t timelimit
= clock();
592 int timer
= MINTIMER
;
593 int wl
= strlen(word
);
594 strcpy(candidate
, word
);
595 // swap out each char one by one and try all the tryme
596 // chars in its place to see if that makes a good word
597 for (int j
=0; j
< ctryl
; j
++) {
598 for (int i
=wl
-1; i
>= 0; i
--) {
600 if (ctry
[j
] == tmpc
) continue;
601 candidate
[i
] = ctry
[j
];
602 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, &timer
, &timelimit
);
603 if (ns
== -1) return -1;
604 if (!timer
) return ns
;
611 // error is wrong char in place of correct one
612 int SuggestMgr::badchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
615 w_char candidate_utf
[MAXSWL
];
616 char candidate
[MAXSWUTF8L
];
617 clock_t timelimit
= clock();
618 int timer
= MINTIMER
;
619 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
620 // swap out each char one by one and try all the tryme
621 // chars in its place to see if that makes a good word
622 for (int j
=0; j
< ctryl
; j
++) {
623 for (int i
=wl
-1; i
>= 0; i
--) {
624 tmpc
= candidate_utf
[i
];
625 if (w_char_eq(tmpc
, ctry_utf
[j
])) continue;
626 candidate_utf
[i
] = ctry_utf
[j
];
627 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
628 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, &timer
, &timelimit
);
629 if (ns
== -1) return -1;
630 if (!timer
) return ns
;
631 candidate_utf
[i
] = tmpc
;
637 // error is word has an extra letter it does not need
638 int SuggestMgr::extrachar_utf(char** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
640 char candidate
[MAXSWUTF8L
];
641 w_char candidate_utf
[MAXSWL
];
643 w_char tmpc
= W_VLINE
; // not used value, only for VCC warning message
644 if (wl
< 2) return ns
;
645 // try omitting one char of word at a time
646 memcpy(candidate_utf
, word
, wl
* sizeof(w_char
));
647 for (p
= candidate_utf
+ wl
- 1; p
>= candidate_utf
; p
--) {
649 if (p
< candidate_utf
+ wl
- 1) *p
= tmpc
;
650 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
- 1);
651 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
652 if (ns
== -1) return -1;
658 // error is word has an extra letter it does not need
659 int SuggestMgr::extrachar(char** wlst
, const char * word
, int ns
, int cpdsuggest
)
662 char candidate
[MAXSWUTF8L
];
664 int wl
= strlen(word
);
665 if (wl
< 2) return ns
;
666 // try omitting one char of word at a time
667 strcpy (candidate
, word
);
668 for (p
= candidate
+ wl
- 1; p
>=candidate
; p
--) {
671 ns
= testsug(wlst
, candidate
, wl
-1, ns
, cpdsuggest
, NULL
, NULL
);
672 if (ns
== -1) return -1;
678 // error is missing a letter it needs
679 int SuggestMgr::forgotchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
681 char candidate
[MAXSWUTF8L
];
683 clock_t timelimit
= clock();
684 int timer
= MINTIMER
;
685 int wl
= strlen(word
);
686 // try inserting a tryme character before every letter (and the null terminator)
687 for (int i
= 0; i
< ctryl
; i
++) {
688 strcpy(candidate
, word
);
689 for (p
= candidate
+ wl
; p
>= candidate
; p
--) {
692 ns
= testsug(wlst
, candidate
, wl
+1, ns
, cpdsuggest
, &timer
, &timelimit
);
693 if (ns
== -1) return -1;
694 if (!timer
) return ns
;
700 // error is missing a letter it needs
701 int SuggestMgr::forgotchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
703 w_char candidate_utf
[MAXSWL
];
704 char candidate
[MAXSWUTF8L
];
706 clock_t timelimit
= clock();
707 int timer
= MINTIMER
;
708 // try inserting a tryme character at the end of the word and before every letter
709 for (int i
= 0; i
< ctryl
; i
++) {
710 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
711 for (p
= candidate_utf
+ wl
; p
>= candidate_utf
; p
--) {
714 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
+ 1);
715 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, &timer
, &timelimit
);
716 if (ns
== -1) return -1;
717 if (!timer
) return ns
;
724 /* error is should have been two words */
725 int SuggestMgr::twowords(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
727 char candidate
[MAXSWUTF8L
];
734 if (wl
< 3) return ns
;
736 if (langnum
== LANG_hu
) forbidden
= check_forbidden(word
, wl
);
738 strcpy(candidate
+ 1, word
);
739 // split the string into two pieces after every char
740 // if both pieces are good words make them a suggestion
741 for (p
= candidate
+ 1; p
[1] != '\0'; p
++) {
743 // go to end of the UTF-8 character
744 while (utf8
&& ((p
[1] & 0xc0) == 0x80)) {
748 if (utf8
&& p
[1] == '\0') break; // last UTF-8 character
750 c1
= checkword(candidate
,strlen(candidate
), cpdsuggest
, NULL
, NULL
);
752 c2
= checkword((p
+1),strlen(p
+1), cpdsuggest
, NULL
, NULL
);
756 // spec. Hungarian code (need a better compound word support)
757 if ((langnum
== LANG_hu
) && !forbidden
&&
758 // if 3 repeating letters, use - instead of space
759 (((p
[-1] == p
[1]) && (((p
>candidate
+1) && (p
[-1] == p
[-2])) || (p
[-1] == p
[2]))) ||
760 // or multiple compounding, with more than 6 syllables
761 ((c1
== 3) && (c2
>= 2)))) *p
= '-';
764 for (int k
=0; k
< ns
; k
++)
765 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
768 wlst
[ns
] = mystrdup(candidate
);
769 if (wlst
[ns
] == NULL
) return -1;
773 // add two word suggestion with dash, if TRY string contains
775 // NOTE: cwrd isn't modified for REP twoword sugg.
776 if (ctry
&& (strchr(ctry
, 'a') || strchr(ctry
, '-')) &&
777 mystrlen(p
+ 1) > 1 &&
778 mystrlen(candidate
) - mystrlen(p
) > 1) {
780 for (int k
=0; k
< ns
; k
++)
781 if (strcmp(candidate
,wlst
[k
]) == 0) cwrd
= 0;
784 wlst
[ns
] = mystrdup(candidate
);
785 if (wlst
[ns
] == NULL
) return -1;
797 // error is adjacent letter were swapped
798 int SuggestMgr::swapchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
800 char candidate
[MAXSWUTF8L
];
804 // try swapping adjacent chars one by one
805 strcpy(candidate
, word
);
806 for (p
= candidate
; p
[1] != 0; p
++) {
810 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
811 if (ns
== -1) return -1;
815 // try double swaps for short words
816 // ahev -> have, owudl -> would
817 if (wl
== 4 || wl
== 5) {
818 candidate
[0] = word
[1];
819 candidate
[1] = word
[0];
820 candidate
[2] = word
[2];
821 candidate
[wl
- 2] = word
[wl
- 1];
822 candidate
[wl
- 1] = word
[wl
- 2];
823 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
824 if (ns
== -1) return -1;
826 candidate
[0] = word
[0];
827 candidate
[1] = word
[2];
828 candidate
[2] = word
[1];
829 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
830 if (ns
== -1) return -1;
836 // error is adjacent letter were swapped
837 int SuggestMgr::swapchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
839 w_char candidate_utf
[MAXSWL
];
840 char candidate
[MAXSWUTF8L
];
844 // try swapping adjacent chars one by one
845 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
846 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
- 1); p
++) {
850 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
851 if (len
== 0) len
= strlen(candidate
);
852 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
853 if (ns
== -1) return -1;
857 // try double swaps for short words
858 // ahev -> have, owudl -> would, suodn -> sound
859 if (wl
== 4 || wl
== 5) {
860 candidate_utf
[0] = word
[1];
861 candidate_utf
[1] = word
[0];
862 candidate_utf
[2] = word
[2];
863 candidate_utf
[wl
- 2] = word
[wl
- 1];
864 candidate_utf
[wl
- 1] = word
[wl
- 2];
865 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
866 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
867 if (ns
== -1) return -1;
869 candidate_utf
[0] = word
[0];
870 candidate_utf
[1] = word
[2];
871 candidate_utf
[2] = word
[1];
872 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
873 ns
= testsug(wlst
, candidate
, len
, ns
, cpdsuggest
, NULL
, NULL
);
874 if (ns
== -1) return -1;
880 // error is non-adjacent letters were swapped
881 int SuggestMgr::longswapchar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
883 char candidate
[MAXSWUTF8L
];
888 // try swapping not adjacent chars one by one
889 strcpy(candidate
, word
);
890 for (p
= candidate
; *p
!= 0; p
++) {
891 for (q
= candidate
; *q
!= 0; q
++) {
892 if (abs((int)(p
-q
)) > 1) {
896 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
897 if (ns
== -1) return -1;
907 // error is non-adjacent letters were swapped
908 int SuggestMgr::longswapchar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
910 w_char candidate_utf
[MAXSWL
];
911 char candidate
[MAXSWUTF8L
];
915 // try swapping not adjacent chars
916 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
917 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
); p
++) {
918 for (q
= candidate_utf
; q
< (candidate_utf
+ wl
); q
++) {
919 if (abs((int)(p
-q
)) > 1) {
923 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
924 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
925 if (ns
== -1) return -1;
934 // error is a letter was moved
935 int SuggestMgr::movechar(char ** wlst
, const char * word
, int ns
, int cpdsuggest
)
937 char candidate
[MAXSWUTF8L
];
944 strcpy(candidate
, word
);
945 for (p
= candidate
; *p
!= 0; p
++) {
946 for (q
= p
+ 1; (*q
!= 0) && ((q
- p
) < 10); q
++) {
950 if ((q
-p
) < 2) continue; // omit swap char
951 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
952 if (ns
== -1) return -1;
954 strcpy(candidate
, word
);
956 for (p
= candidate
+ wl
- 1; p
> candidate
; p
--) {
957 for (q
= p
- 1; (q
>= candidate
) && ((p
- q
) < 10); q
--) {
961 if ((p
-q
) < 2) continue; // omit swap char
962 ns
= testsug(wlst
, candidate
, wl
, ns
, cpdsuggest
, NULL
, NULL
);
963 if (ns
== -1) return -1;
965 strcpy(candidate
, word
);
970 // error is a letter was moved
971 int SuggestMgr::movechar_utf(char ** wlst
, const w_char
* word
, int wl
, int ns
, int cpdsuggest
)
973 w_char candidate_utf
[MAXSWL
];
974 char candidate
[MAXSWUTF8L
];
979 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
980 for (p
= candidate_utf
; p
< (candidate_utf
+ wl
); p
++) {
981 for (q
= p
+ 1; (q
< (candidate_utf
+ wl
)) && ((q
- p
) < 10); q
++) {
985 if ((q
-p
) < 2) continue; // omit swap char
986 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
987 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
988 if (ns
== -1) return -1;
990 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
992 for (p
= candidate_utf
+ wl
- 1; p
> candidate_utf
; p
--) {
993 for (q
= p
- 1; (q
>= candidate_utf
) && ((p
- q
) < 10); q
--) {
997 if ((p
-q
) < 2) continue; // omit swap char
998 u16_u8(candidate
, MAXSWUTF8L
, candidate_utf
, wl
);
999 ns
= testsug(wlst
, candidate
, strlen(candidate
), ns
, cpdsuggest
, NULL
, NULL
);
1000 if (ns
== -1) return -1;
1002 memcpy (candidate_utf
, word
, wl
* sizeof(w_char
));
1007 // generate a set of suggestions for very poorly spelled words
1008 int SuggestMgr::ngsuggest(char** wlst
, char * w
, int ns
, HashMgr
** pHMgr
, int md
)
1017 // exhaustively search through all root words
1018 // keeping track of the MAX_ROOTS most similar root words
1019 struct hentry
* roots
[MAX_ROOTS
];
1020 char * rootsphon
[MAX_ROOTS
];
1021 int scores
[MAX_ROOTS
];
1022 int scoresphon
[MAX_ROOTS
];
1023 for (i
= 0; i
< MAX_ROOTS
; i
++) {
1025 scores
[i
] = -100 * i
;
1026 rootsphon
[i
] = NULL
;
1027 scoresphon
[i
] = -100 * i
;
1030 lpphon
= MAX_ROOTS
- 1;
1032 int low
= NGRAM_LOWERING
;
1034 char w2
[MAXWORDUTF8LEN
];
1038 // word reversing wrapper for complex prefixes
1039 if (complexprefixes
) {
1041 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
1045 char mw
[MAXSWUTF8L
];
1047 int nc
= strlen(word
);
1048 int n
= (utf8
) ? u8_u16(u8
, MAXSWL
, word
) : nc
;
1050 // set character based ngram suggestion for words with non-BMP Unicode characters
1052 utf8
= 0; // XXX not state-free
1058 struct hentry
* hp
= NULL
;
1060 phonetable
* ph
= (pAMgr
) ? pAMgr
->get_phonetable() : NULL
;
1061 char target
[MAXSWUTF8L
];
1062 char candidate
[MAXSWUTF8L
];
1066 int _wl
= u8_u16(_w
, MAXSWL
, word
);
1067 mkallcap_utf(_w
, _wl
, langnum
);
1068 u16_u8(candidate
, MAXSWUTF8L
, _w
, _wl
);
1070 strcpy(candidate
, word
);
1071 if (!nonbmp
) mkallcap(candidate
, csconv
);
1073 phonet(candidate
, target
, nc
, *ph
); // XXX phonet() is 8-bit (nc, not n)
1076 FLAG forbiddenword
= pAMgr
? pAMgr
->get_forbiddenword() : FLAG_NULL
;
1077 FLAG nosuggest
= pAMgr
? pAMgr
->get_nosuggest() : FLAG_NULL
;
1078 FLAG nongramsuggest
= pAMgr
? pAMgr
->get_nongramsuggest() : FLAG_NULL
;
1079 FLAG onlyincompound
= pAMgr
? pAMgr
->get_onlyincompound() : FLAG_NULL
;
1081 for (i
= 0; i
< md
; i
++) {
1082 while (0 != (hp
= (pHMgr
[i
])->walk_hashtable(col
, hp
))) {
1083 if ((hp
->astr
) && (pAMgr
) &&
1084 (TESTAFF(hp
->astr
, forbiddenword
, hp
->alen
) ||
1085 TESTAFF(hp
->astr
, ONLYUPCASEFLAG
, hp
->alen
) ||
1086 TESTAFF(hp
->astr
, nosuggest
, hp
->alen
) ||
1087 TESTAFF(hp
->astr
, nongramsuggest
, hp
->alen
) ||
1088 TESTAFF(hp
->astr
, onlyincompound
, hp
->alen
))) continue;
1090 sc
= ngram(3, word
, HENTRY_WORD(hp
), NGRAM_LONGER_WORSE
+ low
) +
1091 leftcommonsubstring(word
, HENTRY_WORD(hp
));
1093 // check special pronunciation
1094 if ((hp
->var
& H_OPT_PHON
) && copy_field(f
, HENTRY_DATA(hp
), MORPH_PHON
)) {
1095 int sc2
= ngram(3, word
, f
, NGRAM_LONGER_WORSE
+ low
) +
1096 + leftcommonsubstring(word
, f
);
1097 if (sc2
> sc
) sc
= sc2
;
1101 if (ph
&& (sc
> 2) && (abs(n
- (int) hp
->clen
) <= 3)) {
1102 char target2
[MAXSWUTF8L
];
1105 int _wl
= u8_u16(_w
, MAXSWL
, HENTRY_WORD(hp
));
1106 mkallcap_utf(_w
, _wl
, langnum
);
1107 u16_u8(candidate
, MAXSWUTF8L
, _w
, _wl
);
1109 strcpy(candidate
, HENTRY_WORD(hp
));
1110 mkallcap(candidate
, csconv
);
1112 phonet(candidate
, target2
, -1, *ph
);
1113 scphon
= 2 * ngram(3, target
, target2
, NGRAM_LONGER_WORSE
);
1116 if (sc
> scores
[lp
]) {
1120 for (j
=0; j
< MAX_ROOTS
; j
++)
1121 if (scores
[j
] < lval
) {
1128 if (scphon
> scoresphon
[lpphon
]) {
1129 scoresphon
[lpphon
] = scphon
;
1130 rootsphon
[lpphon
] = HENTRY_WORD(hp
);
1132 for (j
=0; j
< MAX_ROOTS
; j
++)
1133 if (scoresphon
[j
] < lval
) {
1135 lval
= scoresphon
[j
];
1140 // find minimum threshold for a passable suggestion
1141 // mangle original word three different ways
1142 // and score them to generate a minimum acceptable score
1144 for (int sp
= 1; sp
< 4; sp
++) {
1146 for (int k
=sp
; k
< n
; k
+=4) *((unsigned short *) u8
+ k
) = '*';
1147 u16_u8(mw
, MAXSWUTF8L
, u8
, n
);
1148 thresh
= thresh
+ ngram(n
, word
, mw
, NGRAM_ANY_MISMATCH
+ low
);
1151 for (int k
=sp
; k
< n
; k
+=4) *(mw
+ k
) = '*';
1152 thresh
= thresh
+ ngram(n
, word
, mw
, NGRAM_ANY_MISMATCH
+ low
);
1155 thresh
= thresh
/ 3;
1158 // now expand affixes on each of these root words and
1159 // use length adjusted ngram scores to select
1160 // possible suggestions
1161 char * guess
[MAX_GUESS
];
1162 char * guessorig
[MAX_GUESS
];
1163 int gscore
[MAX_GUESS
];
1164 for(i
=0;i
<MAX_GUESS
;i
++) {
1166 guessorig
[i
] = NULL
;
1167 gscore
[i
] = -100 * i
;
1172 struct guessword
* glst
;
1173 glst
= (struct guessword
*) calloc(MAX_WORDS
,sizeof(struct guessword
));
1175 if (nonbmp
) utf8
= 1;
1179 for (i
= 0; i
< MAX_ROOTS
; i
++) {
1181 struct hentry
* rp
= roots
[i
];
1182 int nw
= pAMgr
->expand_rootword(glst
, MAX_WORDS
, HENTRY_WORD(rp
), rp
->blen
,
1183 rp
->astr
, rp
->alen
, word
, nc
,
1184 ((rp
->var
& H_OPT_PHON
) ? copy_field(f
, HENTRY_DATA(rp
), MORPH_PHON
) : NULL
));
1186 for (int k
= 0; k
< nw
; k
++) {
1187 sc
= ngram(n
, word
, glst
[k
].word
, NGRAM_ANY_MISMATCH
+ low
) +
1188 leftcommonsubstring(word
, glst
[k
].word
);
1191 if (sc
> gscore
[lp
]) {
1194 if (guessorig
[lp
]) {
1195 free(guessorig
[lp
]);
1196 guessorig
[lp
] = NULL
;
1200 guess
[lp
] = glst
[k
].word
;
1201 guessorig
[lp
] = glst
[k
].orig
;
1203 for (j
=0; j
< MAX_GUESS
; j
++)
1204 if (gscore
[j
] < lval
) {
1210 if (glst
[k
].orig
) free(glst
[k
].orig
);
1214 if (glst
[k
].orig
) free(glst
[k
].orig
);
1221 // now we are done generating guesses
1222 // sort in order of decreasing score
1225 bubblesort(&guess
[0], &guessorig
[0], &gscore
[0], MAX_GUESS
);
1226 if (ph
) bubblesort(&rootsphon
[0], NULL
, &scoresphon
[0], MAX_ROOTS
);
1228 // weight suggestions with a similarity index, based on
1229 // the longest common subsequent algorithm and resort
1235 int maxd
= pAMgr
->get_maxdiff();
1236 if (maxd
>= 0) fact
= (10.0 - maxd
)/5.0;
1239 for (i
=0; i
< MAX_GUESS
; i
++) {
1241 // lowering guess[i]
1242 char gl
[MAXSWUTF8L
];
1246 len
= u8_u16(_w
, MAXSWL
, guess
[i
]);
1247 mkallsmall_utf(_w
, len
, langnum
);
1248 u16_u8(gl
, MAXSWUTF8L
, _w
, len
);
1250 strcpy(gl
, guess
[i
]);
1251 if (!nonbmp
) mkallsmall(gl
, csconv
);
1252 len
= strlen(guess
[i
]);
1255 int _lcs
= lcslen(word
, gl
);
1257 // same characters with different casing
1258 if ((n
== len
) && (n
== _lcs
)) {
1262 // using 2-gram instead of 3, and other weightening
1264 re
= ngram(2, word
, gl
, NGRAM_ANY_MISMATCH
+ low
+ NGRAM_WEIGHTED
) +
1265 ngram(2, gl
, word
, NGRAM_ANY_MISMATCH
+ low
+ NGRAM_WEIGHTED
);
1268 // length of longest common subsequent minus length difference
1269 2 * _lcs
- abs((int) (n
- len
)) +
1270 // weight length of the left common substring
1271 leftcommonsubstring(word
, gl
) +
1272 // weight equal character positions
1273 (!nonbmp
&& commoncharacterpositions(word
, gl
, &is_swap
) ? 1: 0) +
1274 // swap character (not neighboring)
1275 ((is_swap
) ? 10 : 0) +
1277 ngram(4, word
, gl
, NGRAM_ANY_MISMATCH
+ low
) +
1280 // different limit for dictionaries with PHONE rules
1281 (ph
? (re
< len
* fact
? -1000 : 0) : (re
< (n
+ len
)*fact
? -1000 : 0));
1285 bubblesort(&guess
[0], &guessorig
[0], &gscore
[0], MAX_GUESS
);
1288 if (ph
) for (i
=0; i
< MAX_ROOTS
; i
++) {
1290 // lowering rootphon[i]
1291 char gl
[MAXSWUTF8L
];
1295 len
= u8_u16(_w
, MAXSWL
, rootsphon
[i
]);
1296 mkallsmall_utf(_w
, len
, langnum
);
1297 u16_u8(gl
, MAXSWUTF8L
, _w
, len
);
1299 strcpy(gl
, rootsphon
[i
]);
1300 if (!nonbmp
) mkallsmall(gl
, csconv
);
1301 len
= strlen(rootsphon
[i
]);
1304 // heuristic weigthing of ngram scores
1305 scoresphon
[i
] += 2 * lcslen(word
, gl
) - abs((int) (n
- len
)) +
1306 // weight length of the left common substring
1307 leftcommonsubstring(word
, gl
);
1311 if (ph
) bubblesort(&rootsphon
[0], NULL
, &scoresphon
[0], MAX_ROOTS
);
1317 for (i
=0; i
< MAX_GUESS
; i
++) {
1319 if ((ns
< oldns
+ maxngramsugs
) && (ns
< maxSug
) && (!same
|| (gscore
[i
] > 1000))) {
1321 // leave only excellent suggestions, if exists
1322 if (gscore
[i
] > 1000) same
= 1; else if (gscore
[i
] < -100) {
1324 // keep the best ngram suggestions, unless in ONLYMAXDIFF mode
1325 if (ns
> oldns
|| (pAMgr
&& pAMgr
->get_onlymaxdiff())) {
1327 if (guessorig
[i
]) free(guessorig
[i
]);
1331 for (j
= 0; j
< ns
; j
++) {
1332 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1333 if ((!guessorig
[i
] && strstr(guess
[i
], wlst
[j
])) ||
1334 (guessorig
[i
] && strstr(guessorig
[i
], wlst
[j
])) ||
1335 // check forbidden words
1336 !checkword(guess
[i
], strlen(guess
[i
]), 0, NULL
, NULL
)) unique
= 0;
1339 wlst
[ns
++] = guess
[i
];
1342 wlst
[ns
-1] = guessorig
[i
];
1346 if (guessorig
[i
]) free(guessorig
[i
]);
1350 if (guessorig
[i
]) free(guessorig
[i
]);
1356 if (ph
) for (i
=0; i
< MAX_ROOTS
; i
++) {
1358 if ((ns
< oldns
+ MAXPHONSUGS
) && (ns
< maxSug
)) {
1360 for (j
= 0; j
< ns
; j
++) {
1361 // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
1362 if (strstr(rootsphon
[i
], wlst
[j
]) ||
1363 // check forbidden words
1364 !checkword(rootsphon
[i
], strlen(rootsphon
[i
]), 0, NULL
, NULL
)) unique
= 0;
1367 wlst
[ns
++] = mystrdup(rootsphon
[i
]);
1368 if (!wlst
[ns
- 1]) return ns
- 1;
1374 if (nonbmp
) utf8
= 1;
1379 // see if a candidate suggestion is spelled correctly
1380 // needs to check both root words and words with affixes
1382 // obsolote MySpell-HU modifications:
1383 // return value 2 and 3 marks compounding with hyphen (-)
1384 // `3' marks roots without suffix
1385 int SuggestMgr::checkword(const char * word
, int len
, int cpdsuggest
, int * timer
, clock_t * timelimit
)
1387 struct hentry
* rv
=NULL
;
1388 struct hentry
* rv2
=NULL
;
1394 if (!(*timer
) && timelimit
) {
1395 if ((clock() - *timelimit
) > TIMELIMIT
) return 0;
1396 *timer
= MAXPLUSTIMER
;
1401 if (cpdsuggest
==1) {
1402 if (pAMgr
->get_compound()) {
1403 rv
= pAMgr
->compound_check(word
, len
, 0, 0, 100, 0, NULL
, 0, 1, 0); //EXT
1404 if (rv
&& (!(rv2
= pAMgr
->lookup(word
)) || !rv2
->astr
||
1405 !(TESTAFF(rv2
->astr
,pAMgr
->get_forbiddenword(),rv2
->alen
) ||
1406 TESTAFF(rv2
->astr
,pAMgr
->get_nosuggest(),rv2
->alen
)))) return 3; // XXX obsolote categorisation + only ICONV needs affix flag check?
1411 rv
= pAMgr
->lookup(word
);
1414 if ((rv
->astr
) && (TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
)
1415 || TESTAFF(rv
->astr
,pAMgr
->get_nosuggest(),rv
->alen
))) return 0;
1417 if (rv
->astr
&& (TESTAFF(rv
->astr
,pAMgr
->get_needaffix(),rv
->alen
) ||
1418 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1419 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) {
1420 rv
= rv
->next_homonym
;
1423 } else rv
= pAMgr
->prefix_check(word
, len
, 0); // only prefix, and prefix + suffix XXX
1428 rv
= pAMgr
->suffix_check(word
, len
, 0, NULL
, NULL
, 0, NULL
); // only suffix
1431 if (!rv
&& pAMgr
->have_contclass()) {
1432 rv
= pAMgr
->suffix_check_twosfx(word
, len
, 0, NULL
, FLAG_NULL
);
1433 if (!rv
) rv
= pAMgr
->prefix_check_twosfx(word
, len
, 1, FLAG_NULL
);
1436 // check forbidden words
1437 if ((rv
) && (rv
->astr
) && (TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
) ||
1438 TESTAFF(rv
->astr
, ONLYUPCASEFLAG
, rv
->alen
) ||
1439 TESTAFF(rv
->astr
,pAMgr
->get_nosuggest(),rv
->alen
) ||
1440 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) return 0;
1442 if (rv
) { // XXX obsolote
1443 if ((pAMgr
->get_compoundflag()) &&
1444 TESTAFF(rv
->astr
, pAMgr
->get_compoundflag(), rv
->alen
)) return 2 + nosuffix
;
1451 int SuggestMgr::check_forbidden(const char * word
, int len
)
1453 struct hentry
* rv
= NULL
;
1456 rv
= pAMgr
->lookup(word
);
1457 if (rv
&& rv
->astr
&& (TESTAFF(rv
->astr
,pAMgr
->get_needaffix(),rv
->alen
) ||
1458 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) rv
= NULL
;
1459 if (!(pAMgr
->prefix_check(word
,len
,1)))
1460 rv
= pAMgr
->suffix_check(word
,len
, 0, NULL
, NULL
, 0, NULL
); // prefix+suffix, suffix
1461 // check forbidden words
1462 if ((rv
) && (rv
->astr
) && TESTAFF(rv
->astr
,pAMgr
->get_forbiddenword(),rv
->alen
)) return 1;
1467 #ifdef HUNSPELL_EXPERIMENTAL
1468 // suggest possible stems
1469 int SuggestMgr::suggest_pos_stems(char*** slst
, const char * w
, int nsug
)
1473 struct hentry
* rv
= NULL
;
1475 char w2
[MAXSWUTF8L
];
1476 const char * word
= w
;
1478 // word reversing wrapper for complex prefixes
1479 if (complexprefixes
) {
1481 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
1485 int wl
= strlen(word
);
1491 wlst
= (char **) calloc(maxSug
, sizeof(char *));
1492 if (wlst
== NULL
) return -1;
1495 rv
= pAMgr
->suffix_check(word
, wl
, 0, NULL
, wlst
, maxSug
, &nsug
);
1497 // delete dash from end of word
1499 for (int j
=0; j
< nsug
; j
++) {
1500 if (wlst
[j
][strlen(wlst
[j
]) - 1] == '-') wlst
[j
][strlen(wlst
[j
]) - 1] = '\0';
1507 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1510 char * SuggestMgr::suggest_morph(const char * w
)
1512 char result
[MAXLNLEN
];
1513 char * r
= (char *) result
;
1516 struct hentry
* rv
= NULL
;
1520 if (! pAMgr
) return NULL
;
1522 char w2
[MAXSWUTF8L
];
1523 const char * word
= w
;
1525 // word reversing wrapper for complex prefixes
1526 if (complexprefixes
) {
1528 if (utf8
) reverseword_utf(w2
); else reverseword(w2
);
1532 rv
= pAMgr
->lookup(word
);
1535 if ((!rv
->astr
) || !(TESTAFF(rv
->astr
, pAMgr
->get_forbiddenword(), rv
->alen
) ||
1536 TESTAFF(rv
->astr
, pAMgr
->get_needaffix(), rv
->alen
) ||
1537 TESTAFF(rv
->astr
,pAMgr
->get_onlyincompound(),rv
->alen
))) {
1538 if (!HENTRY_FIND(rv
, MORPH_STEM
)) {
1539 mystrcat(result
, " ", MAXLNLEN
);
1540 mystrcat(result
, MORPH_STEM
, MAXLNLEN
);
1541 mystrcat(result
, word
, MAXLNLEN
);
1543 if (HENTRY_DATA(rv
)) {
1544 mystrcat(result
, " ", MAXLNLEN
);
1545 mystrcat(result
, HENTRY_DATA2(rv
), MAXLNLEN
);
1547 mystrcat(result
, "\n", MAXLNLEN
);
1549 rv
= rv
->next_homonym
;
1552 st
= pAMgr
->affix_check_morph(word
,strlen(word
));
1554 mystrcat(result
, st
, MAXLNLEN
);
1558 if (pAMgr
->get_compound() && (*result
== '\0'))
1559 pAMgr
->compound_check_morph(word
, strlen(word
),
1560 0, 0, 100, 0,NULL
, 0, &r
, NULL
);
1562 return (*result
) ? mystrdup(line_uniq(result
, MSEP_REC
)) : NULL
;
1565 #ifdef HUNSPELL_EXPERIMENTAL
1566 char * SuggestMgr::suggest_morph_for_spelling_error(const char * word
)
1569 char ** wlst
= (char **) calloc(maxSug
, sizeof(char *));
1570 if (!**wlst
) return NULL
;
1571 // we will use only the first suggestion
1572 for (int i
= 0; i
< maxSug
- 1; i
++) wlst
[i
] = "";
1573 int ns
= suggest(&wlst
, word
, maxSug
- 1, NULL
);
1575 p
= suggest_morph(wlst
[maxSug
- 1]);
1576 free(wlst
[maxSug
- 1]);
1578 if (wlst
) free(wlst
);
1581 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
1584 char * SuggestMgr::suggest_hentry_gen(hentry
* rv
, char * pattern
)
1586 char result
[MAXLNLEN
];
1588 int sfxcount
= get_sfxcount(pattern
);
1590 if (get_sfxcount(HENTRY_DATA(rv
)) > sfxcount
) return NULL
;
1592 if (HENTRY_DATA(rv
)) {
1593 char * aff
= pAMgr
->morphgen(HENTRY_WORD(rv
), rv
->blen
, rv
->astr
, rv
->alen
,
1594 HENTRY_DATA(rv
), pattern
, 0);
1596 mystrcat(result
, aff
, MAXLNLEN
);
1597 mystrcat(result
, "\n", MAXLNLEN
);
1602 // check all allomorphs
1603 char allomorph
[MAXLNLEN
];
1605 if (HENTRY_DATA(rv
)) p
= (char *) strstr(HENTRY_DATA2(rv
), MORPH_ALLOMORPH
);
1607 struct hentry
* rv2
= NULL
;
1609 int plen
= fieldlen(p
);
1610 strncpy(allomorph
, p
, plen
);
1611 allomorph
[plen
] = '\0';
1612 rv2
= pAMgr
->lookup(allomorph
);
1614 // if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) {
1615 if (HENTRY_DATA(rv2
)) {
1616 char * st
= (char *) strstr(HENTRY_DATA2(rv2
), MORPH_STEM
);
1617 if (st
&& (strncmp(st
+ MORPH_TAG_LEN
,
1618 HENTRY_WORD(rv
), fieldlen(st
+ MORPH_TAG_LEN
)) == 0)) {
1619 char * aff
= pAMgr
->morphgen(HENTRY_WORD(rv2
), rv2
->blen
, rv2
->astr
, rv2
->alen
,
1620 HENTRY_DATA(rv2
), pattern
, 0);
1622 mystrcat(result
, aff
, MAXLNLEN
);
1623 mystrcat(result
, "\n", MAXLNLEN
);
1628 rv2
= rv2
->next_homonym
;
1630 p
= strstr(p
+ plen
, MORPH_ALLOMORPH
);
1633 return (*result
) ? mystrdup(result
) : NULL
;
1636 char * SuggestMgr::suggest_gen(char ** desc
, int n
, char * pattern
) {
1637 char result
[MAXLNLEN
];
1638 char result2
[MAXLNLEN
];
1639 char newpattern
[MAXLNLEN
];
1641 if (n
== 0) return 0;
1643 struct hentry
* rv
= NULL
;
1644 if (!pAMgr
) return NULL
;
1646 // search affixed forms with and without derivational suffixes
1649 for (int k
= 0; k
< n
; k
++) {
1651 // add compound word parts (except the last one)
1652 char * s
= (char *) desc
[k
];
1653 char * part
= strstr(s
, MORPH_PART
);
1655 char * nextpart
= strstr(part
+ 1, MORPH_PART
);
1657 copy_field(result
+ strlen(result
), part
, MORPH_PART
);
1659 nextpart
= strstr(part
+ 1, MORPH_PART
);
1667 char * alt
= strstr(tok
, " | ");
1670 alt
= strstr(alt
, " | ");
1672 int pln
= line_tok(tok
, &pl
, MSEP_ALT
);
1673 for (int i
= 0; i
< pln
; i
++) {
1674 // remove inflectional and terminal suffixes
1675 char * is
= strstr(pl
[i
], MORPH_INFL_SFX
);
1677 char * ts
= strstr(pl
[i
], MORPH_TERM_SFX
);
1680 ts
= strstr(pl
[i
], MORPH_TERM_SFX
);
1682 char * st
= strstr(s
, MORPH_STEM
);
1684 copy_field(tok
, st
, MORPH_STEM
);
1685 rv
= pAMgr
->lookup(tok
);
1687 char newpat
[MAXLNLEN
];
1688 strcpy(newpat
, pl
[i
]);
1689 strcat(newpat
, pattern
);
1690 char * sg
= suggest_hentry_gen(rv
, newpat
);
1691 if (!sg
) sg
= suggest_hentry_gen(rv
, pattern
);
1694 int genl
= line_tok(sg
, &gen
, MSEP_REC
);
1697 for (int j
= 0; j
< genl
; j
++) {
1698 if (strstr(pl
[i
], MORPH_SURF_PFX
)) {
1699 int r2l
= strlen(result2
);
1700 result2
[r2l
] = MSEP_REC
;
1701 strcpy(result2
+ r2l
+ 1, result
);
1702 copy_field(result2
+ strlen(result2
), pl
[i
], MORPH_SURF_PFX
);
1703 mystrcat(result2
, gen
[j
], MAXLNLEN
);
1705 sprintf(result2
+ strlen(result2
), "%c%s%s",
1706 MSEP_REC
, result
, gen
[j
]);
1709 freelist(&gen
, genl
);
1711 rv
= rv
->next_homonym
;
1718 if (*result2
|| !strstr(pattern
, MORPH_DERI_SFX
)) break;
1719 strcpy(newpattern
, pattern
);
1720 pattern
= newpattern
;
1721 char * ds
= strstr(pattern
, MORPH_DERI_SFX
);
1723 strncpy(ds
, MORPH_TERM_SFX
, MORPH_TAG_LEN
);
1724 ds
= strstr(pattern
, MORPH_DERI_SFX
);
1727 return (*result2
? mystrdup(result2
) : NULL
);
1731 // generate an n-gram score comparing s1 and s2
1732 int SuggestMgr::ngram(int n
, char * s1
, const char * s2
, int opt
)
1743 l1
= u8_u16(su1
, MAXSWL
, s1
);
1744 l2
= u8_u16(su2
, MAXSWL
, s2
);
1745 if ((l2
<= 0) || (l1
== -1)) return 0;
1746 // lowering dictionary word
1747 if (opt
& NGRAM_LOWERING
) mkallsmall_utf(su2
, l2
, langnum
);
1748 for (int j
= 1; j
<= n
; j
++) {
1750 for (int i
= 0; i
<= (l1
-j
); i
++) {
1752 for (int l
= 0; l
<= (l2
-j
); l
++) {
1753 for (k
= 0; k
< j
; k
++) {
1754 w_char
* c1
= su1
+ i
+ k
;
1755 w_char
* c2
= su2
+ l
+ k
;
1756 if ((c1
->l
!= c2
->l
) || (c1
->h
!= c2
->h
)) break;
1763 if (k
!= j
&& opt
& NGRAM_WEIGHTED
) {
1766 if (i
== 0 || i
== l1
-j
) ns
--; // side weight
1769 nscore
= nscore
+ ns
;
1770 if (ns
< 2 && !(opt
& NGRAM_WEIGHTED
)) break;
1774 if (l2
== 0) return 0;
1776 char *t
= mystrdup(s2
);
1777 if (opt
& NGRAM_LOWERING
) mkallsmall(t
, csconv
);
1778 for (int j
= 1; j
<= n
; j
++) {
1780 for (int i
= 0; i
<= (l1
-j
); i
++) {
1781 char c
= *(s1
+ i
+ j
);
1782 *(s1
+ i
+ j
) = '\0';
1783 if (strstr(t
,(s1
+i
))) {
1785 } else if (opt
& NGRAM_WEIGHTED
) {
1788 if (i
== 0 || i
== l1
-j
) ns
--; // side weight
1792 nscore
= nscore
+ ns
;
1793 if (ns
< 2 && !(opt
& NGRAM_WEIGHTED
)) break;
1799 if (opt
& NGRAM_LONGER_WORSE
) ns
= (l2
-l1
)-2;
1800 if (opt
& NGRAM_ANY_MISMATCH
) ns
= abs(l2
-l1
)-2;
1801 ns
= (nscore
- ((ns
> 0) ? ns
: 0));
1805 // length of the left common substring of s1 and (decapitalised) s2
1806 int SuggestMgr::leftcommonsubstring(char * s1
, const char * s2
) {
1810 su1
[0].l
= su2
[0].l
= su1
[0].h
= su2
[0].h
= 0;
1811 // decapitalize dictionary word
1812 if (complexprefixes
) {
1813 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1814 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1815 if (*((short *)su1
+l1
-1) == *((short *)su2
+l2
-1)) return 1;
1820 unsigned short idx
= (su2
->h
<< 8) + su2
->l
;
1821 unsigned short otheridx
= (su1
->h
<< 8) + su1
->l
;
1822 if (otheridx
!= idx
&&
1823 (otheridx
!= unicodetolower(idx
, langnum
))) return 0;
1824 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1825 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1826 for(i
= 1; (i
< l1
) && (i
< l2
) &&
1827 (su1
[i
].l
== su2
[i
].l
) && (su1
[i
].h
== su2
[i
].h
); i
++);
1831 if (complexprefixes
) {
1832 int l1
= strlen(s1
);
1833 int l2
= strlen(s2
);
1834 if (*(s2
+l1
-1) == *(s2
+l2
-1)) return 1;
1837 // decapitalise dictionary word
1838 if ((*s1
!= *s2
) && (*s1
!= csconv
[((unsigned char)*s2
)].clower
)) return 0;
1841 } while ((*s1
== *s2
) && (*s1
!= '\0'));
1842 return (int)(s1
- olds
);
1848 int SuggestMgr::commoncharacterpositions(char * s1
, const char * s2
, int * is_swap
) {
1856 int l1
= u8_u16(su1
, MAXSWL
, s1
);
1857 int l2
= u8_u16(su2
, MAXSWL
, s2
);
1858 // decapitalize dictionary word
1859 if (complexprefixes
) {
1860 mkallsmall_utf(su2
+l2
-1, 1, langnum
);
1862 mkallsmall_utf(su2
, 1, langnum
);
1864 for (int i
= 0; (i
< l1
) && (i
< l2
); i
++) {
1865 if (((short *) su1
)[i
] == ((short *) su2
)[i
]) {
1868 if (diff
< 2) diffpos
[diff
] = i
;
1872 if ((diff
== 2) && (l1
== l2
) &&
1873 (((short *) su1
)[diffpos
[0]] == ((short *) su2
)[diffpos
[1]]) &&
1874 (((short *) su1
)[diffpos
[1]] == ((short *) su2
)[diffpos
[0]])) *is_swap
= 1;
1879 // decapitalize dictionary word
1880 if (complexprefixes
) {
1882 *(t
+l2
-1) = csconv
[((unsigned char)*(t
+l2
-1))].clower
;
1884 mkallsmall(t
, csconv
);
1886 for (i
= 0; (*(s1
+i
) != 0) && (*(t
+i
) != 0); i
++) {
1887 if (*(s1
+i
) == *(t
+i
)) {
1890 if (diff
< 2) diffpos
[diff
] = i
;
1894 if ((diff
== 2) && (*(s1
+i
) == 0) && (*(t
+i
) == 0) &&
1895 (*(s1
+diffpos
[0]) == *(t
+diffpos
[1])) &&
1896 (*(s1
+diffpos
[1]) == *(t
+diffpos
[0]))) *is_swap
= 1;
1901 int SuggestMgr::mystrlen(const char * word
) {
1904 return u8_u16(w
, MAXSWL
, word
);
1905 } else return strlen(word
);
1908 // sort in decreasing order of score
1909 void SuggestMgr::bubblesort(char** rword
, char** rword2
, int* rsc
, int n
)
1915 if (rsc
[j
-1] < rsc
[j
]) {
1916 int sctmp
= rsc
[j
-1];
1917 char * wdtmp
= rword
[j
-1];
1919 rword
[j
-1] = rword
[j
];
1923 wdtmp
= rword2
[j
-1];
1924 rword2
[j
-1] = rword2
[j
];
1935 // longest common subsequence
1936 void SuggestMgr::lcs(const char * s
, const char * s2
, int * l1
, int * l2
, char ** result
) {
1945 m
= u8_u16(su
, MAXSWL
, s
);
1946 n
= u8_u16(su2
, MAXSWL
, s2
);
1951 c
= (char *) malloc((m
+ 1) * (n
+ 1));
1952 b
= (char *) malloc((m
+ 1) * (n
+ 1));
1959 for (i
= 1; i
<= m
; i
++) c
[i
*(n
+1)] = 0;
1960 for (j
= 0; j
<= n
; j
++) c
[j
] = 0;
1961 for (i
= 1; i
<= m
; i
++) {
1962 for (j
= 1; j
<= n
; j
++) {
1963 if ( ((utf8
) && (*((short *) su
+i
-1) == *((short *)su2
+j
-1)))
1964 || ((!utf8
) && ((*(s
+i
-1)) == (*(s2
+j
-1))))) {
1965 c
[i
*(n
+1) + j
] = c
[(i
-1)*(n
+1) + j
-1]+1;
1966 b
[i
*(n
+1) + j
] = LCS_UPLEFT
;
1967 } else if (c
[(i
-1)*(n
+1) + j
] >= c
[i
*(n
+1) + j
-1]) {
1968 c
[i
*(n
+1) + j
] = c
[(i
-1)*(n
+1) + j
];
1969 b
[i
*(n
+1) + j
] = LCS_UP
;
1971 c
[i
*(n
+1) + j
] = c
[i
*(n
+1) + j
-1];
1972 b
[i
*(n
+1) + j
] = LCS_LEFT
;
1982 int SuggestMgr::lcslen(const char * s
, const char* s2
) {
1989 lcs(s
, s2
, &m
, &n
, &result
);
1990 if (!result
) return 0;
1993 while ((i
!= 0) && (j
!= 0)) {
1994 if (result
[i
*(n
+1) + j
] == LCS_UPLEFT
) {
1998 } else if (result
[i
*(n
+1) + j
] == LCS_UP
) {