Use true instead of TRUE
[TortoiseGit.git] / ext / hunspell / hashmgr.cxx
blob57be70b9909b0b1a05d4394f92c0f5995cdcd06d
1 #include "license.hunspell"
2 #include "license.myspell"
4 #ifndef MOZILLA_CLIENT
5 #include <cstdlib>
6 #include <cstring>
7 #include <cstdio>
8 #include <cctype>
9 #else
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <ctype.h>
14 #endif
16 #include "hashmgr.hxx"
17 #include "csutil.hxx"
18 #include "atypes.hxx"
20 #ifdef MOZILLA_CLIENT
21 #ifdef __SUNPRO_CC // for SunONE Studio compiler
22 using namespace std;
23 #endif
24 #else
25 #ifndef W32
26 using namespace std;
27 #endif
28 #endif
30 // build a hash table from a munched word list
32 HashMgr::HashMgr(const char * tpath, const char * apath)
34 tablesize = 0;
35 tableptr = NULL;
36 flag_mode = FLAG_CHAR;
37 complexprefixes = 0;
38 utf8 = 0;
39 ignorechars = NULL;
40 ignorechars_utf16 = NULL;
41 ignorechars_utf16_len = 0;
42 numaliasf = 0;
43 aliasf = NULL;
44 numaliasm = 0;
45 aliasm = NULL;
46 load_config(apath);
47 int ec = load_tables(tpath);
48 if (ec) {
49 /* error condition - what should we do here */
50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
51 if (tableptr) {
52 free(tableptr);
54 tablesize = 0;
59 HashMgr::~HashMgr()
61 if (tableptr) {
62 // now pass through hash table freeing up everything
63 // go through column by column of the table
64 for (int i=0; i < tablesize; i++) {
65 struct hentry * pt = &tableptr[i];
66 struct hentry * nt = NULL;
67 if (pt) {
68 if (pt->astr && !aliasf) free(pt->astr);
69 if (pt->word) free(pt->word);
70 #ifdef HUNSPELL_EXPERIMENTAL
71 if (pt->description && !aliasm) free(pt->description);
72 #endif
73 pt = pt->next;
75 while(pt) {
76 nt = pt->next;
77 if (pt->astr && !aliasf) free(pt->astr);
78 if (pt->word) free(pt->word);
79 #ifdef HUNSPELL_EXPERIMENTAL
80 if (pt->description && !aliasm) free(pt->description);
81 #endif
82 free(pt);
83 pt = nt;
86 free(tableptr);
88 tablesize = 0;
90 if (aliasf) {
91 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
92 free(aliasf);
93 aliasf = NULL;
94 if (aliasflen) {
95 free(aliasflen);
96 aliasflen = NULL;
99 if (aliasm) {
100 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
101 free(aliasm);
102 aliasm = NULL;
105 if (ignorechars) free(ignorechars);
106 if (ignorechars_utf16) free(ignorechars_utf16);
109 // lookup a root word in the hashtable
111 struct hentry * HashMgr::lookup(const char *word) const
113 struct hentry * dp;
114 if (tableptr) {
115 dp = &tableptr[hash(word)];
116 if (dp->word == NULL) return NULL;
117 for ( ; dp != NULL; dp = dp->next) {
118 if (strcmp(word,dp->word) == 0) return dp;
121 return NULL;
124 // add a word to the hash table (private)
126 int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc)
128 char * st = mystrdup(word);
129 if (wl && !st) return 1;
130 if (ignorechars != NULL) {
131 if (utf8) {
132 remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len);
133 } else {
134 remove_ignored_chars(st, ignorechars);
137 if (complexprefixes) {
138 if (utf8) reverseword_utf(st); else reverseword(st);
140 int i = hash(st);
141 struct hentry * dp = &tableptr[i];
142 if (dp->word == NULL) {
143 dp->wlen = (short) wl;
144 dp->alen = (short) al;
145 dp->word = st;
146 dp->astr = aff;
147 dp->next = NULL;
148 dp->next_homonym = NULL;
149 #ifdef HUNSPELL_EXPERIMENTAL
150 if (aliasm) {
151 dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
152 } else {
153 dp->description = mystrdup(desc);
154 if (desc && !dp->description) return 1;
155 if (dp->description && complexprefixes) {
156 if (utf8) reverseword_utf(dp->description); else reverseword(dp->description);
159 #endif
160 } else {
161 struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry));
162 if (!hp) return 1;
163 hp->wlen = (short) wl;
164 hp->alen = (short) al;
165 hp->word = st;
166 hp->astr = aff;
167 hp->next = NULL;
168 hp->next_homonym = NULL;
169 #ifdef HUNSPELL_EXPERIMENTAL
170 if (aliasm) {
171 hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
172 } else {
173 hp->description = mystrdup(desc);
174 if (desc && !hp->description) return 1;
175 if (dp->description && complexprefixes) {
176 if (utf8) reverseword_utf(hp->description); else reverseword(hp->description);
179 #endif
180 while (dp->next != NULL) {
181 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
182 dp=dp->next;
184 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
185 dp->next = hp;
187 return 0;
190 // add a custom dic. word to the hash table (public)
191 int HashMgr::put_word(const char * word, int wl, char * aff)
193 unsigned short * flags;
194 int al = 0;
195 if (aff) {
196 al = decode_flags(&flags, aff);
197 flag_qsort(flags, 0, al);
198 } else {
199 flags = NULL;
201 add_word(word, wl, flags, al, NULL);
202 return 0;
205 int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern)
207 unsigned short * flags;
208 struct hentry * dp = lookup(pattern);
209 if (!dp || !dp->astr) return 1;
210 flags = (unsigned short *) malloc (dp->alen * sizeof(short));
211 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
212 add_word(word, wl, flags, dp->alen, NULL);
213 return 0;
216 // walk the hash table entry by entry - null at end
217 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
219 //reset to start
220 if ((col < 0) || (hp == NULL)) {
221 col = -1;
222 hp = NULL;
225 if (hp && hp->next != NULL) {
226 hp = hp->next;
227 } else {
228 col++;
229 hp = (col < tablesize) ? &tableptr[col] : NULL;
230 // search for next non-blank column entry
231 while (hp && (hp->word == NULL)) {
232 col ++;
233 hp = (col < tablesize) ? &tableptr[col] : NULL;
235 if (col < tablesize) return hp;
236 hp = NULL;
237 col = -1;
239 return hp;
242 // load a munched word list and build a hash table on the fly
243 int HashMgr::load_tables(const char * tpath)
245 int wl, al;
246 char * ap;
247 char * dp;
248 unsigned short * flags;
250 // raw dictionary - munched file
251 FILE * rawdict = fopen(tpath, "r");
252 if (rawdict == NULL) return 1;
254 // first read the first line of file to get hash table size */
255 char ts[MAXDELEN];
256 if (! fgets(ts, MAXDELEN-1,rawdict)) {
257 fclose(rawdict);
258 return 2;
260 mychomp(ts);
262 /* remove byte order mark */
263 if (strncmp(ts,"?",3) == 0) {
264 memmove(ts, ts+3, strlen(ts+3)+1);
265 HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
268 if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n");
269 tablesize = atoi(ts);
270 if (!tablesize) {
271 fclose(rawdict);
272 return 4;
274 tablesize = tablesize + 5 + USERWORD;
275 if ((tablesize %2) == 0) tablesize++;
277 // allocate the hash table
278 tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
279 if (!tableptr) {
280 fclose(rawdict);
281 return 3;
284 for (int i=0; i<tablesize; i++) tableptr[i].word = NULL;
286 // loop through all words on much list and add to hash
287 // table and create word and affix strings
289 while (fgets(ts,MAXDELEN-1,rawdict)) {
290 mychomp(ts);
291 // split each line into word and morphological description
292 dp = strchr(ts,'\t');
294 if (dp) {
295 *dp = '\0';
296 dp++;
297 } else {
298 dp = NULL;
301 // split each line into word and affix char strings
302 // "\/" signs slash in words (not affix separator)
303 // "/" at beginning of the line is word character (not affix separator)
304 ap = strchr(ts,'/');
305 while (ap) {
306 if (ap == ts) {
307 ap++;
308 continue;
309 } else if (*(ap - 1) != '\\') break;
310 // replace "\/" with "/"
311 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
312 ap = strchr(ap,'/');
315 if (ap) {
316 *ap = '\0';
317 if (aliasf) {
318 int index = atoi(ap + 1);
319 al = get_aliasf(index, &flags);
320 if (!al) {
321 HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts);
322 *ap = '\0';
324 } else {
325 al = decode_flags(&flags, ap + 1);
326 flag_qsort(flags, 0, al);
328 } else {
329 al = 0;
330 ap = NULL;
331 flags = NULL;
334 wl = strlen(ts);
336 // add the word and its index
337 if (add_word(ts,wl,flags,al,dp)) return 5;
341 fclose(rawdict);
342 return 0;
346 // the hash function is a simple load and rotate
347 // algorithm borrowed
349 int HashMgr::hash(const char * word) const
351 long hv = 0;
352 for (int i=0; i < 4 && *word != 0; i++)
353 hv = (hv << 8) | (*word++);
354 while (*word != 0) {
355 ROTATE(hv,ROTATE_LEN);
356 hv ^= (*word++);
358 return (unsigned long) hv % tablesize;
361 int HashMgr::decode_flags(unsigned short ** result, char * flags) {
362 int len;
363 switch (flag_mode) {
364 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
365 len = strlen(flags);
366 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags);
367 len = len/2;
368 *result = (unsigned short *) malloc(len * sizeof(short));
369 for (int i = 0; i < len; i++) {
370 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
372 break;
374 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
375 len = 1;
376 char * src = flags;
377 unsigned short * dest;
378 char * p;
379 for (p = flags; *p; p++) {
380 if (*p == ',') len++;
382 *result = (unsigned short *) malloc(len * sizeof(short));
383 dest = *result;
384 for (p = flags; *p; p++) {
385 if (*p == ',') {
386 *dest = (unsigned short) atoi(src);
387 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
388 src = p + 1;
389 dest++;
392 *dest = (unsigned short) atoi(src);
393 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
394 break;
396 case FLAG_UNI: { // UTF-8 characters
397 w_char w[MAXDELEN/2];
398 len = u8_u16(w, MAXDELEN/2, flags);
399 *result = (unsigned short *) malloc(len * sizeof(short));
400 memcpy(*result, w, len * sizeof(short));
401 break;
403 default: { // Ispell's one-character flags (erfg -> e r f g)
404 unsigned short * dest;
405 len = strlen(flags);
406 *result = (unsigned short *) malloc(len * sizeof(short));
407 dest = *result;
408 for (unsigned char * p = (unsigned char *) flags; *p; p++) {
409 *dest = (unsigned short) *p;
410 dest++;
414 return len;
417 unsigned short HashMgr::decode_flag(const char * f) {
418 unsigned short s = 0;
419 switch (flag_mode) {
420 case FLAG_LONG:
421 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
422 break;
423 case FLAG_NUM:
424 s = (unsigned short) atoi(f);
425 break;
426 case FLAG_UNI:
427 u8_u16((w_char *) &s, 1, f);
428 break;
429 default:
430 s = (unsigned short) *((unsigned char *)f);
432 if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
433 return s;
436 char * HashMgr::encode_flag(unsigned short f) {
437 unsigned char ch[10];
438 if (f==0) return mystrdup("(NULL)");
439 if (flag_mode == FLAG_LONG) {
440 ch[0] = (unsigned char) (f >> 8);
441 ch[1] = (unsigned char) (f - ((f >> 8) << 8));
442 ch[2] = '\0';
443 } else if (flag_mode == FLAG_NUM) {
444 sprintf((char *) ch, "%d", f);
445 } else if (flag_mode == FLAG_UNI) {
446 u16_u8((char *) &ch, 10, (w_char *) &f, 1);
447 } else {
448 ch[0] = (unsigned char) (f);
449 ch[1] = '\0';
451 return mystrdup((char *) ch);
454 // read in aff file and set flag mode
455 int HashMgr::load_config(const char * affpath)
457 int firstline = 1;
459 // io buffers
460 char line[MAXDELEN+1];
462 // open the affix file
463 FILE * afflst;
464 afflst = fopen(affpath,"r");
465 if (!afflst) {
466 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
467 return 1;
470 // read in each line ignoring any that do not
471 // start with a known line type indicator
473 while (fgets(line,MAXDELEN,afflst)) {
474 mychomp(line);
476 /* remove byte order mark */
477 if (firstline) {
478 firstline = 0;
479 if (strncmp(line,"",3) == 0) memmove(line, line+3, strlen(line+3)+1);
482 /* parse in the try string */
483 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
484 if (flag_mode != FLAG_CHAR) {
485 HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n");
487 if (strstr(line, "long")) flag_mode = FLAG_LONG;
488 if (strstr(line, "num")) flag_mode = FLAG_NUM;
489 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
490 if (flag_mode == FLAG_CHAR) {
491 HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line);
494 if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1;
496 /* parse in the ignored characters (for example, Arabic optional diacritics characters */
497 if (strncmp(line,"IGNORE",6) == 0) {
498 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
499 fclose(afflst);
500 return 1;
504 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
505 if (parse_aliasf(line, afflst)) {
506 fclose(afflst);
507 return 1;
511 #ifdef HUNSPELL_EXPERIMENTAL
512 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
513 if (parse_aliasm(line, afflst)) {
514 fclose(afflst);
515 return 1;
518 #endif
519 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
520 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
522 fclose(afflst);
523 return 0;
526 /* parse in the ALIAS table */
527 int HashMgr::parse_aliasf(char * line, FILE * af)
529 if (numaliasf != 0) {
530 HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n");
531 return 1;
533 char * tp = line;
534 char * piece;
535 int i = 0;
536 int np = 0;
537 piece = mystrsep(&tp, 0);
538 while (piece) {
539 if (*piece != '\0') {
540 switch(i) {
541 case 0: { np++; break; }
542 case 1: {
543 numaliasf = atoi(piece);
544 if (numaliasf < 1) {
545 numaliasf = 0;
546 aliasf = NULL;
547 aliasflen = NULL;
548 HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n");
549 free(piece);
550 return 1;
552 aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
553 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
554 if (!aliasf || !aliasflen) {
555 numaliasf = 0;
556 if (aliasf) free(aliasf);
557 if (aliasflen) free(aliasflen);
558 aliasf = NULL;
559 aliasflen = NULL;
560 free(piece);
561 return 1;
563 np++;
564 break;
566 default: break;
568 i++;
570 free(piece);
571 piece = mystrsep(&tp, 0);
573 if (np != 2) {
574 numaliasf = 0;
575 free(aliasf);
576 free(aliasflen);
577 aliasf = NULL;
578 aliasflen = NULL;
579 HUNSPELL_WARNING(stderr, "error: missing AF table information\n");
580 return 1;
583 /* now parse the numaliasf lines to read in the remainder of the table */
584 char * nl = line;
585 for (int j=0; j < numaliasf; j++) {
586 if (!fgets(nl,MAXDELEN,af)) return 1;
587 mychomp(nl);
588 tp = nl;
589 i = 0;
590 aliasf[j] = NULL;
591 aliasflen[j] = 0;
592 piece = mystrsep(&tp, 0);
593 while (piece) {
594 if (*piece != '\0') {
595 switch(i) {
596 case 0: {
597 if (strncmp(piece,"AF",2) != 0) {
598 numaliasf = 0;
599 free(aliasf);
600 free(aliasflen);
601 aliasf = NULL;
602 aliasflen = NULL;
603 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
604 free(piece);
605 return 1;
607 break;
609 case 1: {
610 aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece);
611 flag_qsort(aliasf[j], 0, aliasflen[j]);
612 break;
614 default: break;
616 i++;
618 free(piece);
619 piece = mystrsep(&tp, 0);
621 if (!aliasf[j]) {
622 free(aliasf);
623 free(aliasflen);
624 aliasf = NULL;
625 aliasflen = NULL;
626 numaliasf = 0;
627 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
628 return 1;
631 return 0;
634 int HashMgr::is_aliasf() {
635 return (aliasf != NULL);
638 int HashMgr::get_aliasf(int index, unsigned short ** fvec) {
639 if ((index > 0) && (index <= numaliasf)) {
640 *fvec = aliasf[index - 1];
641 return aliasflen[index - 1];
643 HUNSPELL_WARNING(stderr, "error: bad flag alias index: %d\n", index);
644 *fvec = NULL;
645 return 0;
648 #ifdef HUNSPELL_EXPERIMENTAL
649 /* parse morph alias definitions */
650 int HashMgr::parse_aliasm(char * line, FILE * af)
652 if (numaliasm != 0) {
653 HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n");
654 return 1;
656 char * tp = line;
657 char * piece;
658 int i = 0;
659 int np = 0;
660 piece = mystrsep(&tp, 0);
661 while (piece) {
662 if (*piece != '\0') {
663 switch(i) {
664 case 0: { np++; break; }
665 case 1: {
666 numaliasm = atoi(piece);
667 if (numaliasm < 1) {
668 HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n");
669 free(piece);
670 return 1;
672 aliasm = (char **) malloc(numaliasm * sizeof(char *));
673 if (!aliasm) {
674 numaliasm = 0;
675 return 1;
677 np++;
678 break;
680 default: break;
682 i++;
684 free(piece);
685 piece = mystrsep(&tp, 0);
687 if (np != 2) {
688 numaliasm = 0;
689 free(aliasm);
690 aliasm = NULL;
691 HUNSPELL_WARNING(stderr, "error: missing AM alias information\n");
692 return 1;
695 /* now parse the numaliasm lines to read in the remainder of the table */
696 char * nl = line;
697 for (int j=0; j < numaliasm; j++) {
698 if (!fgets(nl,MAXDELEN,af)) return 1;
699 mychomp(nl);
700 tp = nl;
701 i = 0;
702 aliasm[j] = NULL;
703 piece = mystrsep(&tp, 0);
704 while (piece) {
705 if (*piece != '\0') {
706 switch(i) {
707 case 0: {
708 if (strncmp(piece,"AM",2) != 0) {
709 HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n");
710 free(piece);
711 numaliasm = 0;
712 free(aliasm);
713 aliasm = NULL;
714 return 1;
716 break;
718 case 1: {
719 if (complexprefixes) {
720 if (utf8) reverseword_utf(piece);
721 else reverseword(piece);
723 aliasm[j] = mystrdup(piece);
724 break; }
725 default: break;
727 i++;
729 free(piece);
730 piece = mystrsep(&tp, 0);
732 if (!aliasm[j]) {
733 numaliasm = 0;
734 free(aliasm);
735 aliasm = NULL;
736 HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
737 return 1;
740 return 0;
743 int HashMgr::is_aliasm() {
744 return (aliasm != NULL);
747 char * HashMgr::get_aliasm(int index) {
748 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
749 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
750 return NULL;
752 #endif