1 #include "license.hunspell"
2 #include "license.myspell"
16 #include "hashmgr.hxx"
21 #ifdef __SUNPRO_CC // for SunONE Studio compiler
// Constructor. Sets flag_mode to FLAG_CHAR, clears the UTF-16 ignore-character
// buffer and its length, then calls load_tables(tpath) to build the table.
// NOTE(review): this listing omits some original lines (see the gaps in the
// embedded line numbers), so the use of `apath` and the guard around the
// warning below are not visible here -- confirm against the full file.
30 // build a hash table from a munched word list
32 HashMgr::HashMgr(const char * tpath
, const char * apath
)
36 flag_mode
= FLAG_CHAR
;
40 ignorechars_utf16
= NULL
;
41 ignorechars_utf16_len
= 0;
// ec holds the status code returned by load_tables().
47 int ec
= load_tables(tpath
);
49 /* error condition - what should we do here */
// The status code is reported through HUNSPELL_WARNING; no other recovery is
// visible in this listing.
50 HUNSPELL_WARNING(stderr
, "Hash Manager Error : %d\n",ec
);
// Cleanup code that releases everything owned by the hash manager.
// NOTE(review): the enclosing function header is not part of this listing;
// presumably this is the body of HashMgr::~HashMgr -- confirm.
62 // now pass through hash table freeing up everything
63 // go through column by column of the table
64 for (int i
=0; i
< tablesize
; i
++) {
// pt starts at the entry stored directly in column i of the table.
65 struct hentry
* pt
= &tableptr
[i
];
66 struct hentry
* nt
= NULL
;
// The flag vector is freed only when no alias table is in use (!aliasf).
68 if (pt
->astr
&& !aliasf
) free(pt
->astr
);
69 if (pt
->word
) free(pt
->word
);
70 #ifdef HUNSPELL_EXPERIMENTAL
// The description is freed only when no morphological alias table is in use.
71 if (pt
->description
&& !aliasm
) free(pt
->description
);
// The same three frees are repeated below.
// NOTE(review): presumably the first group handles the in-table entry and the
// second group handles chained entries; the statements that advance `pt` are
// not visible in this listing -- confirm.
77 if (pt
->astr
&& !aliasf
) free(pt
->astr
);
78 if (pt
->word
) free(pt
->word
);
79 #ifdef HUNSPELL_EXPERIMENTAL
80 if (pt
->description
&& !aliasm
) free(pt
->description
);
// Release each of the numaliasf flag-vector aliases.
91 for (int j
= 0; j
< (numaliasf
); j
++) free(aliasf
[j
]);
// Release each of the numaliasm morphological aliases.
100 for (int j
= 0; j
< (numaliasm
); j
++) free(aliasm
[j
]);
// Release the ignore-character buffers when they were allocated.
105 if (ignorechars
) free(ignorechars
);
106 if (ignorechars_utf16
) free(ignorechars_utf16
);
// Finds the entry for `word`: hash(word) selects a column of tableptr, then
// the `next` chain of that column is searched with strcmp.
// Returns NULL when the column's first entry has no word, and the matching
// entry when one is found.
// NOTE(review): the return statement for "chain exhausted" is not visible in
// this listing.
109 // lookup a root word in the hashtable
111 struct hentry
* HashMgr::lookup(const char *word
) const
115 dp
= &tableptr
[hash(word
)];
// An entry whose word is NULL marks an unused column.
116 if (dp
->word
== NULL
) return NULL
;
117 for ( ; dp
!= NULL
; dp
= dp
->next
) {
118 if (strcmp(word
,dp
->word
) == 0) return dp
;
// Inserts a word into the hash table.
//   word - text to insert; a copy made with mystrdup is what gets processed
//   wl   - word length, stored in the entry's wlen field (cast to short)
//   aff  - flag vector for the word (its assignment is not visible here)
//   al   - number of flags, stored in the entry's alen field (cast to short)
//   desc - optional description; used in the HUNSPELL_EXPERIMENTAL section
// Returns 1 when a required mystrdup fails.
// NOTE(review): many original lines are absent from this listing (see gaps in
// the embedded numbering), including the computation of the column index `i`,
// the word/astr/next assignments, the else branches and the success return.
124 // add a word to the hash table (private)
126 int HashMgr::add_word(const char * word
, int wl
, unsigned short * aff
, int al
, const char * desc
)
128 char * st
= mystrdup(word
);
// A failed copy is only an error when the word is non-empty.
129 if (wl
&& !st
) return 1;
// Strip ignored characters from the copy; a UTF-16 based and a byte based
// routine are both called below (the condition choosing between them is not
// visible in this listing).
130 if (ignorechars
!= NULL
) {
132 remove_ignored_chars_utf(st
, ignorechars_utf16
, ignorechars_utf16_len
);
134 remove_ignored_chars(st
, ignorechars
);
// With complexprefixes set, the copy is reversed before it is stored.
137 if (complexprefixes
) {
138 if (utf8
) reverseword_utf(st
); else reverseword(st
);
141 struct hentry
* dp
= &tableptr
[i
];
// Case 1: the column is unused, so the in-table entry itself is filled in.
142 if (dp
->word
== NULL
) {
143 dp
->wlen
= (short) wl
;
144 dp
->alen
= (short) al
;
148 dp
->next_homonym
= NULL
;
149 #ifdef HUNSPELL_EXPERIMENTAL
// Two ways of setting the description follow: through get_aliasm() with the
// numeric value of desc, or as a private copy of desc.
// NOTE(review): presumably selected by whether an alias table exists -- the
// controlling condition is not visible here.
151 dp
->description
= (desc
) ? get_aliasm(atoi(desc
)) : mystrdup(desc
);
153 dp
->description
= mystrdup(desc
);
154 if (desc
&& !dp
->description
) return 1;
155 if (dp
->description
&& complexprefixes
) {
156 if (utf8
) reverseword_utf(dp
->description
); else reverseword(dp
->description
);
// Case 2: a separate node is allocated for the new word.
161 struct hentry
* hp
= (struct hentry
*) malloc (sizeof(struct hentry
));
163 hp
->wlen
= (short) wl
;
164 hp
->alen
= (short) al
;
168 hp
->next_homonym
= NULL
;
169 #ifdef HUNSPELL_EXPERIMENTAL
171 hp
->description
= (desc
) ? get_aliasm(atoi(desc
)) : mystrdup(desc
);
173 hp
->description
= mystrdup(desc
);
// NOTE(review): this early return happens before hp is linked into the chain
// (linking is only visible further down), so hp appears to be leaked on this
// path -- confirm against the full file.
174 if (desc
&& !hp
->description
) return 1;
// NOTE(review): the test below reads dp->description while the statements
// around it operate on hp->description; this looks like it was meant to test
// hp -- confirm.
175 if (dp
->description
&& complexprefixes
) {
176 if (utf8
) reverseword_utf(hp
->description
); else reverseword(hp
->description
);
// Walk the chain; the first entry with the same word and no homonym link yet
// gets its next_homonym pointed at the new node.
180 while (dp
->next
!= NULL
) {
181 if ((!dp
->next_homonym
) && (strcmp(hp
->word
, dp
->word
) == 0)) dp
->next_homonym
= hp
;
// Same homonym check, repeated for the entry reached after the loop.
184 if ((!dp
->next_homonym
) && (strcmp(hp
->word
, dp
->word
) == 0)) dp
->next_homonym
= hp
;
// Adds a custom dictionary word: decodes the flag string `aff` into a flag
// vector, sorts it with flag_qsort, and inserts the word through add_word()
// with no description (NULL).
// NOTE(review): the handling of a missing `aff` and the return value are not
// visible in this listing.
190 // add a custom dic. word to the hash table (public)
191 int HashMgr::put_word(const char * word
, int wl
, char * aff
)
193 unsigned short * flags
;
// al receives the number of flags produced by decode_flags().
196 al
= decode_flags(&flags
, aff
);
197 flag_qsort(flags
, 0, al
);
201 add_word(word
, wl
, flags
, al
, NULL
);
// Adds `word` using a copy of the flag vector of the existing entry `pattern`.
// Returns 1 when `pattern` is not found by lookup() or has no flag vector.
// NOTE(review): the return value on success is not visible in this listing.
205 int HashMgr::put_word_pattern(const char * word
, int wl
, const char * pattern
)
207 unsigned short * flags
;
208 struct hentry
* dp
= lookup(pattern
);
209 if (!dp
|| !dp
->astr
) return 1;
// Duplicate the pattern's alen flags into a fresh buffer.
// NOTE(review): the malloc result is handed to memcpy on the next numbered
// line without a NULL check.
210 flags
= (unsigned short *) malloc (dp
->alen
* sizeof(short));
211 memcpy((void *) flags
, (void *) dp
->astr
, dp
->alen
* sizeof(short));
212 add_word(word
, wl
, flags
, dp
->alen
, NULL
);
// Iterator over the hash table. `col` is the caller-held column cursor
// (passed by reference) and `hp` is the entry returned by the previous call.
// A negative col or a NULL hp is treated as a special case (its body is not
// visible here; presumably it starts the walk -- confirm).
// NOTE(review): the statements that advance `col`, follow hp->next, and
// return NULL at the end are not visible in this listing.
216 // walk the hash table entry by entry - null at end
217 struct hentry
* HashMgr::walk_hashtable(int &col
, struct hentry
* hp
) const
220 if ((col
< 0) || (hp
== NULL
)) {
// Checks whether the current entry has a successor in its chain.
225 if (hp
&& hp
->next
!= NULL
) {
// Select the in-table entry of column col, or NULL once col is past the end.
229 hp
= (col
< tablesize
) ? &tableptr
[col
] : NULL
;
230 // search for next non-blank column entry
231 while (hp
&& (hp
->word
== NULL
)) {
233 hp
= (col
< tablesize
) ? &tableptr
[col
] : NULL
;
235 if (col
< tablesize
) return hp
;
// Reads the dictionary file at `tpath` and fills the hash table.
// Returns 1 when the file cannot be opened and 5 when add_word() fails.
// NOTE(review): other return codes, the allocation-failure handling and the
// closing of the file are on lines that are not visible in this listing.
242 // load a munched word list and build a hash table on the fly
243 int HashMgr::load_tables(const char * tpath
)
248 unsigned short * flags
;
250 // raw dictionary - munched file
251 FILE * rawdict
= fopen(tpath
, "r");
252 if (rawdict
== NULL
) return 1;
254 // first read the first line of file to get hash table size
256 if (! fgets(ts
, MAXDELEN
-1,rawdict
)) {
262 /* remove byte order mark */
// NOTE(review): the literal compared over 3 bytes shows up as "?" in this
// listing; presumably it was a 3-byte byte order mark that got mangled in
// transit -- verify against the real file.
263 if (strncmp(ts
,"?",3) == 0) {
264 memmove(ts
, ts
+3, strlen(ts
+3)+1);
265 HUNSPELL_WARNING(stderr
, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
// The first line must start with a digit 1-9 (the word count).
268 if ((*ts
< '1') || (*ts
> '9')) HUNSPELL_WARNING(stderr
, "error - missing word count in dictionary file\n");
269 tablesize
= atoi(ts
);
// Enlarge the table by 5 + USERWORD slots, then force the size to be odd.
274 tablesize
= tablesize
+ 5 + USERWORD
;
275 if ((tablesize
%2) == 0) tablesize
++;
277 // allocate the hash table
278 tableptr
= (struct hentry
*) calloc(tablesize
, sizeof(struct hentry
));
// Mark every column as unused (word == NULL).
284 for (int i
=0; i
<tablesize
; i
++) tableptr
[i
].word
= NULL
;
286 // loop through all words of the munched word list and add them to the hash
287 // table and create word and affix strings
289 while (fgets(ts
,MAXDELEN
-1,rawdict
)) {
291 // split each line into word and morphological description
// dp points at the first tab in the line, if any.
292 dp
= strchr(ts
,'\t');
301 // split each line into word and affix char strings
302 // "\/" signs slash in words (not affix separator)
303 // "/" at beginning of the line is word character (not affix separator)
309 } else if (*(ap
- 1) != '\\') break;
310 // replace "\/" with "/"
// Shifts the rest of the string one position left, overwriting the backslash.
311 for (char * sp
= ap
- 1; *sp
; *sp
= *(sp
+ 1), sp
++);
// Two ways of obtaining the flag vector follow: through a numeric alias index
// resolved by get_aliasf(), or by decoding and sorting the flag text.
// NOTE(review): the condition choosing between them is not visible here.
318 int index
= atoi(ap
+ 1);
319 al
= get_aliasf(index
, &flags
);
321 HUNSPELL_WARNING(stderr
, "error - bad flag vector alias: %s\n", ts
);
325 al
= decode_flags(&flags
, ap
+ 1);
326 flag_qsort(flags
, 0, al
);
336 // add the word and its index
// NOTE(review): this return leaves rawdict open; no fclose is part of this
// statement.
337 if (add_word(ts
,wl
,flags
,al
,dp
)) return 5;
// Maps `word` to a column index: the result is hv reduced modulo tablesize.
// NOTE(review): the declaration/initialisation of hv and the loop around the
// ROTATE step are not visible in this listing.
346 // the hash function is a simple load and rotate
347 // algorithm borrowed
349 int HashMgr::hash(const char * word
) const
// Load up to the first four characters of the word into hv, one byte each.
352 for (int i
=0; i
< 4 && *word
!= 0; i
++)
353 hv
= (hv
<< 8) | (*word
++);
355 ROTATE(hv
,ROTATE_LEN
);
358 return (unsigned long) hv
% tablesize
;
// Converts the textual flag string `flags` into a malloc'ed array of
// unsigned short stored in *result. Four encodings are handled, one per
// switch case below.
// NOTE(review): the switch header, the computation of `len`, the
// initialisation of `dest`/`src` and the return statement are not visible in
// this listing; callers in this file use the return value as the flag count.
361 int HashMgr::decode_flags(unsigned short ** result
, char * flags
) {
364 case FLAG_LONG
: { // two-character flags (1x2yZz -> 1x 2y Zz)
366 if (len
%2 == 1) HUNSPELL_WARNING(stderr
, "error: length of FLAG_LONG flagvector is odd: %s\n", flags
);
// NOTE(review): the malloc result is indexed on the next numbered lines
// without a NULL check.
368 *result
= (unsigned short *) malloc(len
* sizeof(short));
369 for (int i
= 0; i
< len
; i
++) {
// Pack two consecutive characters into one 16-bit flag: first char in the
// high byte, second char added as the low part.
370 (*result
)[i
] = (((unsigned short) flags
[i
* 2]) << 8) + (unsigned short) flags
[i
* 2 + 1];
374 case FLAG_NUM
: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
377 unsigned short * dest
;
// First pass: count the commas to size the result.
379 for (p
= flags
; *p
; p
++) {
380 if (*p
== ',') len
++;
382 *result
= (unsigned short *) malloc(len
* sizeof(short));
// Second pass: convert each number with atoi; a value of 0 is reported.
384 for (p
= flags
; *p
; p
++) {
386 *dest
= (unsigned short) atoi(src
);
387 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
// Same conversion again (presumably for the final number after the last
// comma -- confirm).
392 *dest
= (unsigned short) atoi(src
);
393 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
396 case FLAG_UNI
: { // UTF-8 characters
// Convert into a local buffer of MAXDELEN/2 w_char, then copy len 16-bit
// units into the result.
397 w_char w
[MAXDELEN
/2];
398 len
= u8_u16(w
, MAXDELEN
/2, flags
);
// NOTE(review): the malloc result is passed to memcpy on the next numbered
// line without a NULL check.
399 *result
= (unsigned short *) malloc(len
* sizeof(short));
400 memcpy(*result
, w
, len
* sizeof(short));
403 default: { // Ispell's one-character flags (erfg -> e r f g)
404 unsigned short * dest
;
406 *result
= (unsigned short *) malloc(len
* sizeof(short));
// Each byte, read as unsigned char, becomes one flag.
408 for (unsigned char * p
= (unsigned char *) flags
; *p
; p
++) {
409 *dest
= (unsigned short) *p
;
// Decodes a single flag from the text `f` into a 16-bit value `s`.
// Four conversions appear below: two characters packed into one value,
// decimal text via atoi, UTF-8 via u8_u16, and a single unsigned byte.
// A result of 0 is reported as an invalid flag id.
// NOTE(review): the branch selection (presumably on flag_mode) and the return
// statement are not visible in this listing.
417 unsigned short HashMgr::decode_flag(const char * f
) {
418 unsigned short s
= 0;
// First character in the high byte, second character added as the low part.
421 s
= ((unsigned short) f
[0] << 8) + (unsigned short) f
[1];
424 s
= (unsigned short) atoi(f
);
// Convert at most one UTF-16 unit directly into s.
427 u8_u16((w_char
*) &s
, 1, f
);
430 s
= (unsigned short) *((unsigned char *)f
);
432 if (!s
) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
// Converts the flag value `f` into text according to flag_mode and returns
// a copy made with mystrdup.
// A flag of 0 yields a copy of the string "(NULL)".
// NOTE(review): the lines that terminate the ch buffer in the non-sprintf
// branches are not visible in this listing.
436 char * HashMgr::encode_flag(unsigned short f
) {
437 unsigned char ch
[10];
438 if (f
==0) return mystrdup("(NULL)");
439 if (flag_mode
== FLAG_LONG
) {
// High byte first, then the low byte (f minus its high-byte part).
440 ch
[0] = (unsigned char) (f
>> 8);
441 ch
[1] = (unsigned char) (f
- ((f
>> 8) << 8));
443 } else if (flag_mode
== FLAG_NUM
) {
// Decimal text of the flag value.
444 sprintf((char *) ch
, "%d", f
);
445 } else if (flag_mode
== FLAG_UNI
) {
// One UTF-16 unit converted to UTF-8, with 10 passed as the buffer size.
446 u16_u8((char *) &ch
, 10, (w_char
*) &f
, 1);
// Remaining case: the flag is stored as a single byte.
448 ch
[0] = (unsigned char) (f
);
451 return mystrdup((char *) ch
);
// Scans the affix file at `affpath` line by line and picks up the settings
// this class needs: FLAG, SET ... UTF-8, IGNORE, AF, AM (experimental builds
// only) and COMPLEXPREFIXES. Scanning stops at the first SFX or PFX line.
// NOTE(review): the return values, the error returns inside the parse_*
// branches and the closing of the file are not visible in this listing.
454 // read in aff file and set flag mode
455 int HashMgr::load_config(const char * affpath
)
460 char line
[MAXDELEN
+1];
462 // open the affix file
464 afflst
= fopen(affpath
,"r");
466 HUNSPELL_WARNING(stderr
, "Error - could not open affix description file %s\n",affpath
);
470 // read in each line ignoring any that do not
471 // start with a known line type indicator
473 while (fgets(line
,MAXDELEN
,afflst
)) {
476 /* remove byte order mark */
479 if (strncmp(line
,"",3) == 0) memmove(line
, line
+3, strlen(line
+3)+1);
482 /* parse in the flag encoding (FLAG line) */
483 if ((strncmp(line
,"FLAG",4) == 0) && isspace(line
[4])) {
// flag_mode starts as FLAG_CHAR, so any other value means FLAG was seen before.
484 if (flag_mode
!= FLAG_CHAR
) {
485 HUNSPELL_WARNING(stderr
, "error: duplicate FLAG parameter\n");
// The mode is chosen by substring search anywhere in the line.
487 if (strstr(line
, "long")) flag_mode
= FLAG_LONG
;
488 if (strstr(line
, "num")) flag_mode
= FLAG_NUM
;
489 if (strstr(line
, "UTF-8")) flag_mode
= FLAG_UNI
;
// Still FLAG_CHAR here means none of the three keywords was present.
490 if (flag_mode
== FLAG_CHAR
) {
491 HUNSPELL_WARNING(stderr
, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line
);
// A SET line that mentions UTF-8 switches on UTF-8 handling.
494 if ((strncmp(line
,"SET",3) == 0) && isspace(line
[3]) && strstr(line
, "UTF-8")) utf8
= 1;
496 /* parse in the ignored characters (for example, Arabic optional diacritics characters) */
497 if (strncmp(line
,"IGNORE",6) == 0) {
498 if (parse_array(line
, &ignorechars
, &ignorechars_utf16
, &ignorechars_utf16_len
, "IGNORE", utf8
)) {
// AF: table of flag-vector aliases.
504 if ((strncmp(line
,"AF",2) == 0) && isspace(line
[2])) {
505 if (parse_aliasf(line
, afflst
)) {
511 #ifdef HUNSPELL_EXPERIMENTAL
// AM: table of morphological description aliases.
512 if ((strncmp(line
,"AM",2) == 0) && isspace(line
[2])) {
513 if (parse_aliasm(line
, afflst
)) {
519 if (strncmp(line
,"COMPLEXPREFIXES",15) == 0) complexprefixes
= 1;
// The first suffix or prefix definition ends the scan.
520 if (((strncmp(line
,"SFX",3) == 0) || (strncmp(line
,"PFX",3) == 0)) && isspace(line
[3])) break;
// Parses the AF (flag-vector alias) table. `line` is the header line already
// read by the caller; the following numaliasf entries are read from `af`.
// Each entry is decoded with decode_flags(), sorted with flag_qsort(), and
// stored in aliasf[j] with its length in aliasflen[j].
// Returns 1 when an entry line cannot be read with fgets.
// NOTE(review): the switch headers, counters, other error returns and the
// success return are not visible in this listing.
526 /* parse in the ALIAS table */
527 int HashMgr::parse_aliasf(char * line
, FILE * af
)
// A non-zero numaliasf means a table was already loaded.
529 if (numaliasf
!= 0) {
530 HUNSPELL_WARNING(stderr
, "error: duplicate AF (alias for flag vector) tables used\n");
// Tokenise the header line with mystrsep.
537 piece
= mystrsep(&tp
, 0);
539 if (*piece
!= '\0') {
541 case 0: { np
++; break; }
// The entry count is read from the header with atoi.
543 numaliasf
= atoi(piece
);
548 HUNSPELL_WARNING(stderr
, "incorrect number of entries in AF table\n");
// One pointer slot and one length slot per alias.
552 aliasf
= (unsigned short **) malloc(numaliasf
* sizeof(unsigned short *));
553 aliasflen
= (unsigned short *) malloc(numaliasf
* sizeof(short));
// If either allocation failed, release whichever one succeeded.
554 if (!aliasf
|| !aliasflen
) {
556 if (aliasf
) free(aliasf
);
557 if (aliasflen
) free(aliasflen
);
571 piece
= mystrsep(&tp
, 0);
579 HUNSPELL_WARNING(stderr
, "error: missing AF table information\n");
583 /* now parse the numaliasf lines to read in the remainder of the table */
585 for (int j
=0; j
< numaliasf
; j
++) {
586 if (!fgets(nl
,MAXDELEN
,af
)) return 1;
592 piece
= mystrsep(&tp
, 0);
594 if (*piece
!= '\0') {
// Each entry line is checked for the "AF" keyword.
597 if (strncmp(piece
,"AF",2) != 0) {
603 HUNSPELL_WARNING(stderr
, "error: AF table is corrupt\n");
// Decode this entry's flags and keep them sorted.
610 aliasflen
[j
] = (unsigned short) decode_flags(&(aliasf
[j
]), piece
);
611 flag_qsort(aliasf
[j
], 0, aliasflen
[j
]);
619 piece
= mystrsep(&tp
, 0);
627 HUNSPELL_WARNING(stderr
, "error: AF table is corrupt\n");
// Returns non-zero when a flag-vector alias table is present (aliasf is not
// NULL), zero otherwise.
634 int HashMgr::is_aliasf() {
635 return (aliasf
!= NULL
);
// Looks up flag-vector alias number `index`. Valid indices run from 1 to
// numaliasf and map to slot index - 1.
// On success *fvec is set to the stored vector (not a copy) and its length
// from aliasflen is returned.
// An out-of-range index is reported with HUNSPELL_WARNING.
// NOTE(review): the value returned after the warning is not visible in this
// listing.
638 int HashMgr::get_aliasf(int index
, unsigned short ** fvec
) {
639 if ((index
> 0) && (index
<= numaliasf
)) {
640 *fvec
= aliasf
[index
- 1];
641 return aliasflen
[index
- 1];
643 HUNSPELL_WARNING(stderr
, "error: bad flag alias index: %d\n", index
);
648 #ifdef HUNSPELL_EXPERIMENTAL
// Parses the AM (morphological description alias) table. `line` is the
// header line already read by the caller; the following numaliasm entries
// are read from `af`. Each entry is stored in aliasm[j] as a mystrdup copy,
// reversed first when complexprefixes is set.
// Returns 1 when an entry line cannot be read with fgets.
// NOTE(review): the switch headers, counters, other error returns and the
// success return are not visible in this listing.
649 /* parse morph alias definitions */
650 int HashMgr::parse_aliasm(char * line
, FILE * af
)
// A non-zero numaliasm means a table was already loaded.
652 if (numaliasm
!= 0) {
653 HUNSPELL_WARNING(stderr
, "error: duplicate AM (aliases for morphological descriptions) tables used\n");
// Tokenise the header line with mystrsep.
660 piece
= mystrsep(&tp
, 0);
662 if (*piece
!= '\0') {
664 case 0: { np
++; break; }
// The entry count is read from the header with atoi.
666 numaliasm
= atoi(piece
);
668 HUNSPELL_WARNING(stderr
, "incorrect number of entries in AM table\n");
// One string pointer slot per alias.
672 aliasm
= (char **) malloc(numaliasm
* sizeof(char *));
685 piece
= mystrsep(&tp
, 0);
691 HUNSPELL_WARNING(stderr
, "error: missing AM alias information\n");
695 /* now parse the numaliasm lines to read in the remainder of the table */
697 for (int j
=0; j
< numaliasm
; j
++) {
698 if (!fgets(nl
,MAXDELEN
,af
)) return 1;
703 piece
= mystrsep(&tp
, 0);
705 if (*piece
!= '\0') {
// Each entry line is checked for the "AM" keyword.
708 if (strncmp(piece
,"AM",2) != 0) {
709 HUNSPELL_WARNING(stderr
, "error: AM table is corrupt\n");
// The text is reversed in place before copying when complexprefixes is set.
719 if (complexprefixes
) {
720 if (utf8
) reverseword_utf(piece
);
721 else reverseword(piece
);
723 aliasm
[j
] = mystrdup(piece
);
730 piece
= mystrsep(&tp
, 0);
// NOTE(review): this message says "map table" while the other diagnostics in
// this function say "AM table"; it looks like a copy/paste leftover. It is a
// runtime string, so it is left unchanged here.
736 HUNSPELL_WARNING(stderr
, "error: map table is corrupt\n");
// Returns non-zero when a morphological alias table is present (aliasm is
// not NULL), zero otherwise.
743 int HashMgr::is_aliasm() {
744 return (aliasm
!= NULL
);
// Returns morphological alias number `index`. Valid indices run from 1 to
// numaliasm and map to slot index - 1; the stored pointer itself is returned,
// not a copy.
// An out-of-range index is reported with HUNSPELL_WARNING.
// NOTE(review): the value returned after the warning is not visible in this
// listing.
747 char * HashMgr::get_aliasm(int index
) {
748 if ((index
> 0) && (index
<= numaliasm
)) return aliasm
[index
- 1];
749 HUNSPELL_WARNING(stderr
, "error: bad morph. alias index: %d\n", index
);