Enable Submodule Add dialog resizable
[TortoiseGit.git] / ext / hunspell / hashmgr.cxx
blob1d01cbfc7e7f2a6a4b38724d53d65c4686c54b0d
1 #include "license.hunspell"
2 #include "license.myspell"
4 #ifndef MOZILLA_CLIENT
5 #include <cstdlib>
6 #include <cstring>
7 #include <cstdio>
8 #include <cctype>
9 #else
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <ctype.h>
14 #endif
16 #include "hashmgr.hxx"
17 #include "csutil.hxx"
18 #include "atypes.hxx"
20 #ifdef MOZILLA_CLIENT
21 #ifdef __SUNPRO_CC // for SunONE Studio compiler
22 using namespace std;
23 #endif
24 #else
25 #ifndef W32
26 using namespace std;
27 #endif
28 #endif
30 // build a hash table from a munched word list
32 HashMgr::HashMgr(const char * tpath, const char * apath)
34 tablesize = 0;
35 tableptr = NULL;
36 flag_mode = FLAG_CHAR;
37 complexprefixes = 0;
38 utf8 = 0;
39 ignorechars = NULL;
40 ignorechars_utf16 = NULL;
41 ignorechars_utf16_len = 0;
42 numaliasf = 0;
43 aliasf = NULL;
44 numaliasm = 0;
45 aliasm = NULL;
46 load_config(apath);
47 int ec = load_tables(tpath);
48 if (ec) {
49 /* error condition - what should we do here */
50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
51 if (tableptr) {
52 free(tableptr);
54 tablesize = 0;
59 HashMgr::~HashMgr()
61 if (tableptr) {
62 // now pass through hash table freeing up everything
63 // go through column by column of the table
64 for (int i=0; i < tablesize; i++) {
65 struct hentry * pt = &tableptr[i];
66 struct hentry * nt = NULL;
67 if (pt) {
68 if (pt->astr && !aliasf) free(pt->astr);
69 if (pt->word) free(pt->word);
70 #ifdef HUNSPELL_EXPERIMENTAL
71 if (pt->description && !aliasm) free(pt->description);
72 #endif
73 pt = pt->next;
75 while(pt) {
76 nt = pt->next;
77 if (pt->astr && !aliasf) free(pt->astr);
78 if (pt->word) free(pt->word);
79 #ifdef HUNSPELL_EXPERIMENTAL
80 if (pt->description && !aliasm) free(pt->description);
81 #endif
82 free(pt);
83 pt = nt;
86 free(tableptr);
88 tablesize = 0;
90 if (aliasf) {
91 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
92 free(aliasf);
93 aliasf = NULL;
94 if (aliasflen) {
95 free(aliasflen);
96 aliasflen = NULL;
99 if (aliasm) {
100 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
101 free(aliasm);
102 aliasm = NULL;
105 if (ignorechars) free(ignorechars);
106 if (ignorechars_utf16) free(ignorechars_utf16);
109 // lookup a root word in the hashtable
111 struct hentry * HashMgr::lookup(const char *word) const
113 struct hentry * dp;
114 if (tableptr) {
115 dp = &tableptr[hash(word)];
116 if (dp->word == NULL) return NULL;
117 for ( ; dp != NULL; dp = dp->next) {
118 if (strcmp(word,dp->word) == 0) return dp;
121 return NULL;
124 // add a word to the hash table (private)
126 int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc)
128 char * st = mystrdup(word);
129 if (wl && !st) return 1;
130 if (ignorechars != NULL) {
131 if (utf8) {
132 remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len);
133 } else {
134 remove_ignored_chars(st, ignorechars);
137 if (complexprefixes) {
138 if (utf8) reverseword_utf(st); else reverseword(st);
140 int i = hash(st);
141 struct hentry * dp = &tableptr[i];
142 if (dp->word == NULL) {
143 dp->wlen = (short) wl;
144 dp->alen = (short) al;
145 dp->word = st;
146 dp->astr = aff;
147 dp->next = NULL;
148 dp->next_homonym = NULL;
149 #ifdef HUNSPELL_EXPERIMENTAL
150 if (aliasm) {
151 dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
152 } else {
153 dp->description = mystrdup(desc);
154 if (desc && !dp->description) return 1;
155 if (dp->description && complexprefixes) {
156 if (utf8) reverseword_utf(dp->description); else reverseword(dp->description);
159 #endif
160 } else {
161 struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry));
162 if (!hp) return 1;
163 hp->wlen = (short) wl;
164 hp->alen = (short) al;
165 hp->word = st;
166 hp->astr = aff;
167 hp->next = NULL;
168 hp->next_homonym = NULL;
169 #ifdef HUNSPELL_EXPERIMENTAL
170 if (aliasm) {
171 hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
172 } else {
173 hp->description = mystrdup(desc);
174 if (desc && !hp->description) return 1;
175 if (dp->description && complexprefixes) {
176 if (utf8) reverseword_utf(hp->description); else reverseword(hp->description);
179 #endif
180 while (dp->next != NULL) {
181 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
182 dp=dp->next;
184 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
185 dp->next = hp;
187 return 0;
190 // add a custom dic. word to the hash table (public)
191 int HashMgr::put_word(const char * word, int wl, char * aff)
193 unsigned short * flags;
194 int al = 0;
195 if (aff) {
196 al = decode_flags(&flags, aff);
197 flag_qsort(flags, 0, al);
198 } else {
199 flags = NULL;
201 add_word(word, wl, flags, al, NULL);
202 return 0;
205 int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern)
207 unsigned short * flags;
208 struct hentry * dp = lookup(pattern);
209 if (!dp || !dp->astr) return 1;
210 flags = (unsigned short *) malloc (dp->alen * sizeof(short));
211 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
212 add_word(word, wl, flags, dp->alen, NULL);
213 return 0;
216 // walk the hash table entry by entry - null at end
217 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
219 //reset to start
220 if ((col < 0) || (hp == NULL)) {
221 col = -1;
222 hp = NULL;
225 if (hp && hp->next != NULL) {
226 hp = hp->next;
227 } else {
228 col++;
229 hp = (col < tablesize) ? &tableptr[col] : NULL;
230 // search for next non-blank column entry
231 while (hp && (hp->word == NULL)) {
232 col ++;
233 hp = (col < tablesize) ? &tableptr[col] : NULL;
235 if (col < tablesize) return hp;
236 hp = NULL;
237 col = -1;
239 return hp;
242 // load a munched word list and build a hash table on the fly
243 int HashMgr::load_tables(const char * tpath)
245 int wl, al;
246 char * ap;
247 char * dp;
248 unsigned short * flags;
250 // raw dictionary - munched file
251 FILE * rawdict = fopen(tpath, "r");
252 if (rawdict == NULL) return 1;
254 // first read the first line of file to get hash table size */
255 char ts[MAXDELEN];
256 if (! fgets(ts, MAXDELEN-1,rawdict)) return 2;
257 mychomp(ts);
259 /* remove byte order mark */
260 if (strncmp(ts,"",3) == 0) {
261 memmove(ts, ts+3, strlen(ts+3)+1);
262 HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
265 if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n");
266 tablesize = atoi(ts);
267 if (!tablesize) return 4;
268 tablesize = tablesize + 5 + USERWORD;
269 if ((tablesize %2) == 0) tablesize++;
271 // allocate the hash table
272 tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
273 if (! tableptr) return 3;
274 for (int i=0; i<tablesize; i++) tableptr[i].word = NULL;
276 // loop through all words on much list and add to hash
277 // table and create word and affix strings
279 while (fgets(ts,MAXDELEN-1,rawdict)) {
280 mychomp(ts);
281 // split each line into word and morphological description
282 dp = strchr(ts,'\t');
284 if (dp) {
285 *dp = '\0';
286 dp++;
287 } else {
288 dp = NULL;
291 // split each line into word and affix char strings
292 // "\/" signs slash in words (not affix separator)
293 // "/" at beginning of the line is word character (not affix separator)
294 ap = strchr(ts,'/');
295 while (ap) {
296 if (ap == ts) {
297 ap++;
298 continue;
299 } else if (*(ap - 1) != '\\') break;
300 // replace "\/" with "/"
301 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
302 ap = strchr(ap,'/');
305 if (ap) {
306 *ap = '\0';
307 if (aliasf) {
308 int index = atoi(ap + 1);
309 al = get_aliasf(index, &flags);
310 if (!al) {
311 HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts);
312 *ap = '\0';
314 } else {
315 al = decode_flags(&flags, ap + 1);
316 flag_qsort(flags, 0, al);
318 } else {
319 al = 0;
320 ap = NULL;
321 flags = NULL;
324 wl = strlen(ts);
326 // add the word and its index
327 if (add_word(ts,wl,flags,al,dp)) return 5;
331 fclose(rawdict);
332 return 0;
336 // the hash function is a simple load and rotate
337 // algorithm borrowed
339 int HashMgr::hash(const char * word) const
341 long hv = 0;
342 for (int i=0; i < 4 && *word != 0; i++)
343 hv = (hv << 8) | (*word++);
344 while (*word != 0) {
345 ROTATE(hv,ROTATE_LEN);
346 hv ^= (*word++);
348 return (unsigned long) hv % tablesize;
351 int HashMgr::decode_flags(unsigned short ** result, char * flags) {
352 int len;
353 switch (flag_mode) {
354 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
355 len = strlen(flags);
356 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags);
357 len = len/2;
358 *result = (unsigned short *) malloc(len * sizeof(short));
359 for (int i = 0; i < len; i++) {
360 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
362 break;
364 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
365 len = 1;
366 char * src = flags;
367 unsigned short * dest;
368 char * p;
369 for (p = flags; *p; p++) {
370 if (*p == ',') len++;
372 *result = (unsigned short *) malloc(len * sizeof(short));
373 dest = *result;
374 for (p = flags; *p; p++) {
375 if (*p == ',') {
376 *dest = (unsigned short) atoi(src);
377 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
378 src = p + 1;
379 dest++;
382 *dest = (unsigned short) atoi(src);
383 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
384 break;
386 case FLAG_UNI: { // UTF-8 characters
387 w_char w[MAXDELEN/2];
388 len = u8_u16(w, MAXDELEN/2, flags);
389 *result = (unsigned short *) malloc(len * sizeof(short));
390 memcpy(*result, w, len * sizeof(short));
391 break;
393 default: { // Ispell's one-character flags (erfg -> e r f g)
394 unsigned short * dest;
395 len = strlen(flags);
396 *result = (unsigned short *) malloc(len * sizeof(short));
397 dest = *result;
398 for (unsigned char * p = (unsigned char *) flags; *p; p++) {
399 *dest = (unsigned short) *p;
400 dest++;
404 return len;
407 unsigned short HashMgr::decode_flag(const char * f) {
408 unsigned short s = 0;
409 switch (flag_mode) {
410 case FLAG_LONG:
411 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
412 break;
413 case FLAG_NUM:
414 s = (unsigned short) atoi(f);
415 break;
416 case FLAG_UNI:
417 u8_u16((w_char *) &s, 1, f);
418 break;
419 default:
420 s = (unsigned short) *((unsigned char *)f);
422 if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
423 return s;
426 char * HashMgr::encode_flag(unsigned short f) {
427 unsigned char ch[10];
428 if (f==0) return mystrdup("(NULL)");
429 if (flag_mode == FLAG_LONG) {
430 ch[0] = (unsigned char) (f >> 8);
431 ch[1] = (unsigned char) (f - ((f >> 8) << 8));
432 ch[2] = '\0';
433 } else if (flag_mode == FLAG_NUM) {
434 sprintf((char *) ch, "%d", f);
435 } else if (flag_mode == FLAG_UNI) {
436 u16_u8((char *) &ch, 10, (w_char *) &f, 1);
437 } else {
438 ch[0] = (unsigned char) (f);
439 ch[1] = '\0';
441 return mystrdup((char *) ch);
444 // read in aff file and set flag mode
445 int HashMgr::load_config(const char * affpath)
447 int firstline = 1;
449 // io buffers
450 char line[MAXDELEN+1];
452 // open the affix file
453 FILE * afflst;
454 afflst = fopen(affpath,"r");
455 if (!afflst) {
456 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
457 return 1;
460 // read in each line ignoring any that do not
461 // start with a known line type indicator
463 while (fgets(line,MAXDELEN,afflst)) {
464 mychomp(line);
466 /* remove byte order mark */
467 if (firstline) {
468 firstline = 0;
469 if (strncmp(line,"",3) == 0) memmove(line, line+3, strlen(line+3)+1);
472 /* parse in the try string */
473 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
474 if (flag_mode != FLAG_CHAR) {
475 HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n");
477 if (strstr(line, "long")) flag_mode = FLAG_LONG;
478 if (strstr(line, "num")) flag_mode = FLAG_NUM;
479 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
480 if (flag_mode == FLAG_CHAR) {
481 HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line);
484 if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1;
486 /* parse in the ignored characters (for example, Arabic optional diacritics characters */
487 if (strncmp(line,"IGNORE",6) == 0) {
488 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
489 fclose(afflst);
490 return 1;
494 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
495 if (parse_aliasf(line, afflst)) {
496 fclose(afflst);
497 return 1;
501 #ifdef HUNSPELL_EXPERIMENTAL
502 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
503 if (parse_aliasm(line, afflst)) {
504 fclose(afflst);
505 return 1;
508 #endif
509 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
510 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
512 fclose(afflst);
513 return 0;
516 /* parse in the ALIAS table */
517 int HashMgr::parse_aliasf(char * line, FILE * af)
519 if (numaliasf != 0) {
520 HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n");
521 return 1;
523 char * tp = line;
524 char * piece;
525 int i = 0;
526 int np = 0;
527 piece = mystrsep(&tp, 0);
528 while (piece) {
529 if (*piece != '\0') {
530 switch(i) {
531 case 0: { np++; break; }
532 case 1: {
533 numaliasf = atoi(piece);
534 if (numaliasf < 1) {
535 numaliasf = 0;
536 aliasf = NULL;
537 aliasflen = NULL;
538 HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n");
539 free(piece);
540 return 1;
542 aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
543 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
544 if (!aliasf || !aliasflen) {
545 numaliasf = 0;
546 if (aliasf) free(aliasf);
547 if (aliasflen) free(aliasflen);
548 aliasf = NULL;
549 aliasflen = NULL;
550 return 1;
552 np++;
553 break;
555 default: break;
557 i++;
559 free(piece);
560 piece = mystrsep(&tp, 0);
562 if (np != 2) {
563 numaliasf = 0;
564 free(aliasf);
565 free(aliasflen);
566 aliasf = NULL;
567 aliasflen = NULL;
568 HUNSPELL_WARNING(stderr, "error: missing AF table information\n");
569 return 1;
572 /* now parse the numaliasf lines to read in the remainder of the table */
573 char * nl = line;
574 for (int j=0; j < numaliasf; j++) {
575 if (!fgets(nl,MAXDELEN,af)) return 1;
576 mychomp(nl);
577 tp = nl;
578 i = 0;
579 aliasf[j] = NULL;
580 aliasflen[j] = 0;
581 piece = mystrsep(&tp, 0);
582 while (piece) {
583 if (*piece != '\0') {
584 switch(i) {
585 case 0: {
586 if (strncmp(piece,"AF",2) != 0) {
587 numaliasf = 0;
588 free(aliasf);
589 free(aliasflen);
590 aliasf = NULL;
591 aliasflen = NULL;
592 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
593 free(piece);
594 return 1;
596 break;
598 case 1: {
599 aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece);
600 flag_qsort(aliasf[j], 0, aliasflen[j]);
601 break;
603 default: break;
605 i++;
607 free(piece);
608 piece = mystrsep(&tp, 0);
610 if (!aliasf[j]) {
611 free(aliasf);
612 free(aliasflen);
613 aliasf = NULL;
614 aliasflen = NULL;
615 numaliasf = 0;
616 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
617 return 1;
620 return 0;
623 int HashMgr::is_aliasf() {
624 return (aliasf != NULL);
627 int HashMgr::get_aliasf(int index, unsigned short ** fvec) {
628 if ((index > 0) && (index <= numaliasf)) {
629 *fvec = aliasf[index - 1];
630 return aliasflen[index - 1];
632 HUNSPELL_WARNING(stderr, "error: bad flag alias index: %d\n", index);
633 *fvec = NULL;
634 return 0;
637 #ifdef HUNSPELL_EXPERIMENTAL
638 /* parse morph alias definitions */
639 int HashMgr::parse_aliasm(char * line, FILE * af)
641 if (numaliasm != 0) {
642 HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n");
643 return 1;
645 char * tp = line;
646 char * piece;
647 int i = 0;
648 int np = 0;
649 piece = mystrsep(&tp, 0);
650 while (piece) {
651 if (*piece != '\0') {
652 switch(i) {
653 case 0: { np++; break; }
654 case 1: {
655 numaliasm = atoi(piece);
656 if (numaliasm < 1) {
657 HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n");
658 free(piece);
659 return 1;
661 aliasm = (char **) malloc(numaliasm * sizeof(char *));
662 if (!aliasm) {
663 numaliasm = 0;
664 return 1;
666 np++;
667 break;
669 default: break;
671 i++;
673 free(piece);
674 piece = mystrsep(&tp, 0);
676 if (np != 2) {
677 numaliasm = 0;
678 free(aliasm);
679 aliasm = NULL;
680 HUNSPELL_WARNING(stderr, "error: missing AM alias information\n");
681 return 1;
684 /* now parse the numaliasm lines to read in the remainder of the table */
685 char * nl = line;
686 for (int j=0; j < numaliasm; j++) {
687 if (!fgets(nl,MAXDELEN,af)) return 1;
688 mychomp(nl);
689 tp = nl;
690 i = 0;
691 aliasm[j] = NULL;
692 piece = mystrsep(&tp, 0);
693 while (piece) {
694 if (*piece != '\0') {
695 switch(i) {
696 case 0: {
697 if (strncmp(piece,"AM",2) != 0) {
698 HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n");
699 free(piece);
700 numaliasm = 0;
701 free(aliasm);
702 aliasm = NULL;
703 return 1;
705 break;
707 case 1: {
708 if (complexprefixes) {
709 if (utf8) reverseword_utf(piece);
710 else reverseword(piece);
712 aliasm[j] = mystrdup(piece);
713 break; }
714 default: break;
716 i++;
718 free(piece);
719 piece = mystrsep(&tp, 0);
721 if (!aliasm[j]) {
722 numaliasm = 0;
723 free(aliasm);
724 aliasm = NULL;
725 HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
726 return 1;
729 return 0;
732 int HashMgr::is_aliasm() {
733 return (aliasm != NULL);
736 char * HashMgr::get_aliasm(int index) {
737 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
738 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
739 return NULL;
741 #endif