1 #include "license.hunspell"
2 #include "license.myspell"
16 #include "hashmgr.hxx"
21 #ifdef __SUNPRO_CC // for SunONE Studio compiler
30 // build a hash table from a munched word list
32 HashMgr::HashMgr(const char * tpath
, const char * apath
)
36 flag_mode
= FLAG_CHAR
;
40 ignorechars_utf16
= NULL
;
41 ignorechars_utf16_len
= 0;
47 int ec
= load_tables(tpath
);
49 /* error condition - what should we do here */
50 HUNSPELL_WARNING(stderr
, "Hash Manager Error : %d\n",ec
);
62 // now pass through hash table freeing up everything
63 // go through column by column of the table
64 for (int i
=0; i
< tablesize
; i
++) {
65 struct hentry
* pt
= &tableptr
[i
];
66 struct hentry
* nt
= NULL
;
68 if (pt
->astr
&& !aliasf
) free(pt
->astr
);
69 if (pt
->word
) free(pt
->word
);
70 #ifdef HUNSPELL_EXPERIMENTAL
71 if (pt
->description
&& !aliasm
) free(pt
->description
);
77 if (pt
->astr
&& !aliasf
) free(pt
->astr
);
78 if (pt
->word
) free(pt
->word
);
79 #ifdef HUNSPELL_EXPERIMENTAL
80 if (pt
->description
&& !aliasm
) free(pt
->description
);
91 for (int j
= 0; j
< (numaliasf
); j
++) free(aliasf
[j
]);
100 for (int j
= 0; j
< (numaliasm
); j
++) free(aliasm
[j
]);
105 if (ignorechars
) free(ignorechars
);
106 if (ignorechars_utf16
) free(ignorechars_utf16
);
109 // lookup a root word in the hashtable
111 struct hentry
* HashMgr::lookup(const char *word
) const
115 dp
= &tableptr
[hash(word
)];
116 if (dp
->word
== NULL
) return NULL
;
117 for ( ; dp
!= NULL
; dp
= dp
->next
) {
118 if (strcmp(word
,dp
->word
) == 0) return dp
;
124 // add a word to the hash table (private)
126 int HashMgr::add_word(const char * word
, int wl
, unsigned short * aff
, int al
, const char * desc
)
128 char * st
= mystrdup(word
);
129 if (wl
&& !st
) return 1;
130 if (ignorechars
!= NULL
) {
132 remove_ignored_chars_utf(st
, ignorechars_utf16
, ignorechars_utf16_len
);
134 remove_ignored_chars(st
, ignorechars
);
137 if (complexprefixes
) {
138 if (utf8
) reverseword_utf(st
); else reverseword(st
);
141 struct hentry
* dp
= &tableptr
[i
];
142 if (dp
->word
== NULL
) {
143 dp
->wlen
= (short) wl
;
144 dp
->alen
= (short) al
;
148 dp
->next_homonym
= NULL
;
149 #ifdef HUNSPELL_EXPERIMENTAL
151 dp
->description
= (desc
) ? get_aliasm(atoi(desc
)) : mystrdup(desc
);
153 dp
->description
= mystrdup(desc
);
154 if (desc
&& !dp
->description
) return 1;
155 if (dp
->description
&& complexprefixes
) {
156 if (utf8
) reverseword_utf(dp
->description
); else reverseword(dp
->description
);
161 struct hentry
* hp
= (struct hentry
*) malloc (sizeof(struct hentry
));
163 hp
->wlen
= (short) wl
;
164 hp
->alen
= (short) al
;
168 hp
->next_homonym
= NULL
;
169 #ifdef HUNSPELL_EXPERIMENTAL
171 hp
->description
= (desc
) ? get_aliasm(atoi(desc
)) : mystrdup(desc
);
173 hp
->description
= mystrdup(desc
);
174 if (desc
&& !hp
->description
) return 1;
175 if (dp
->description
&& complexprefixes
) {
176 if (utf8
) reverseword_utf(hp
->description
); else reverseword(hp
->description
);
180 while (dp
->next
!= NULL
) {
181 if ((!dp
->next_homonym
) && (strcmp(hp
->word
, dp
->word
) == 0)) dp
->next_homonym
= hp
;
184 if ((!dp
->next_homonym
) && (strcmp(hp
->word
, dp
->word
) == 0)) dp
->next_homonym
= hp
;
190 // add a custom dic. word to the hash table (public)
191 int HashMgr::put_word(const char * word
, int wl
, char * aff
)
193 unsigned short * flags
;
196 al
= decode_flags(&flags
, aff
);
197 flag_qsort(flags
, 0, al
);
201 add_word(word
, wl
, flags
, al
, NULL
);
205 int HashMgr::put_word_pattern(const char * word
, int wl
, const char * pattern
)
207 unsigned short * flags
;
208 struct hentry
* dp
= lookup(pattern
);
209 if (!dp
|| !dp
->astr
) return 1;
210 flags
= (unsigned short *) malloc (dp
->alen
* sizeof(short));
211 memcpy((void *) flags
, (void *) dp
->astr
, dp
->alen
* sizeof(short));
212 add_word(word
, wl
, flags
, dp
->alen
, NULL
);
216 // walk the hash table entry by entry - null at end
217 struct hentry
* HashMgr::walk_hashtable(int &col
, struct hentry
* hp
) const
220 if ((col
< 0) || (hp
== NULL
)) {
225 if (hp
&& hp
->next
!= NULL
) {
229 hp
= (col
< tablesize
) ? &tableptr
[col
] : NULL
;
230 // search for next non-blank column entry
231 while (hp
&& (hp
->word
== NULL
)) {
233 hp
= (col
< tablesize
) ? &tableptr
[col
] : NULL
;
235 if (col
< tablesize
) return hp
;
242 // load a munched word list and build a hash table on the fly
243 int HashMgr::load_tables(const char * tpath
)
248 unsigned short * flags
;
250 // raw dictionary - munched file
251 FILE * rawdict
= fopen(tpath
, "r");
252 if (rawdict
== NULL
) return 1;
254 // first read the first line of the file to get the hash table size
256 if (! fgets(ts
, MAXDELEN
-1,rawdict
)) return 2;
259 /* remove byte order mark */
260 if (strncmp(ts
,"",3) == 0) {
261 memmove(ts
, ts
+3, strlen(ts
+3)+1);
262 HUNSPELL_WARNING(stderr
, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
265 if ((*ts
< '1') || (*ts
> '9')) HUNSPELL_WARNING(stderr
, "error - missing word count in dictionary file\n");
266 tablesize
= atoi(ts
);
267 if (!tablesize
) return 4;
268 tablesize
= tablesize
+ 5 + USERWORD
;
269 if ((tablesize
%2) == 0) tablesize
++;
271 // allocate the hash table
272 tableptr
= (struct hentry
*) calloc(tablesize
, sizeof(struct hentry
));
273 if (! tableptr
) return 3;
274 for (int i
=0; i
<tablesize
; i
++) tableptr
[i
].word
= NULL
;
276 // loop through all words on the munched list and add to hash
277 // table and create word and affix strings
279 while (fgets(ts
,MAXDELEN
-1,rawdict
)) {
281 // split each line into word and morphological description
282 dp
= strchr(ts
,'\t');
291 // split each line into word and affix char strings
292 // "\/" denotes a literal slash inside a word (not the affix separator)
293 // a "/" at the beginning of the line is a word character (not the affix separator)
299 } else if (*(ap
- 1) != '\\') break;
300 // replace "\/" with "/"
301 for (char * sp
= ap
- 1; *sp
; *sp
= *(sp
+ 1), sp
++);
308 int index
= atoi(ap
+ 1);
309 al
= get_aliasf(index
, &flags
);
311 HUNSPELL_WARNING(stderr
, "error - bad flag vector alias: %s\n", ts
);
315 al
= decode_flags(&flags
, ap
+ 1);
316 flag_qsort(flags
, 0, al
);
326 // add the word and its index
327 if (add_word(ts
,wl
,flags
,al
,dp
)) return 5;
336 // the hash function is a simple load and rotate
337 // algorithm borrowed
339 int HashMgr::hash(const char * word
) const
342 for (int i
=0; i
< 4 && *word
!= 0; i
++)
343 hv
= (hv
<< 8) | (*word
++);
345 ROTATE(hv
,ROTATE_LEN
);
348 return (unsigned long) hv
% tablesize
;
351 int HashMgr::decode_flags(unsigned short ** result
, char * flags
) {
354 case FLAG_LONG
: { // two-character flags (1x2yZz -> 1x 2y Zz)
356 if (len
%2 == 1) HUNSPELL_WARNING(stderr
, "error: length of FLAG_LONG flagvector is odd: %s\n", flags
);
358 *result
= (unsigned short *) malloc(len
* sizeof(short));
359 for (int i
= 0; i
< len
; i
++) {
360 (*result
)[i
] = (((unsigned short) flags
[i
* 2]) << 8) + (unsigned short) flags
[i
* 2 + 1];
364 case FLAG_NUM
: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
367 unsigned short * dest
;
369 for (p
= flags
; *p
; p
++) {
370 if (*p
== ',') len
++;
372 *result
= (unsigned short *) malloc(len
* sizeof(short));
374 for (p
= flags
; *p
; p
++) {
376 *dest
= (unsigned short) atoi(src
);
377 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
382 *dest
= (unsigned short) atoi(src
);
383 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
386 case FLAG_UNI
: { // UTF-8 characters
387 w_char w
[MAXDELEN
/2];
388 len
= u8_u16(w
, MAXDELEN
/2, flags
);
389 *result
= (unsigned short *) malloc(len
* sizeof(short));
390 memcpy(*result
, w
, len
* sizeof(short));
393 default: { // Ispell's one-character flags (erfg -> e r f g)
394 unsigned short * dest
;
396 *result
= (unsigned short *) malloc(len
* sizeof(short));
398 for (unsigned char * p
= (unsigned char *) flags
; *p
; p
++) {
399 *dest
= (unsigned short) *p
;
407 unsigned short HashMgr::decode_flag(const char * f
) {
408 unsigned short s
= 0;
411 s
= ((unsigned short) f
[0] << 8) + (unsigned short) f
[1];
414 s
= (unsigned short) atoi(f
);
417 u8_u16((w_char
*) &s
, 1, f
);
420 s
= (unsigned short) *((unsigned char *)f
);
422 if (!s
) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
426 char * HashMgr::encode_flag(unsigned short f
) {
427 unsigned char ch
[10];
428 if (f
==0) return mystrdup("(NULL)");
429 if (flag_mode
== FLAG_LONG
) {
430 ch
[0] = (unsigned char) (f
>> 8);
431 ch
[1] = (unsigned char) (f
- ((f
>> 8) << 8));
433 } else if (flag_mode
== FLAG_NUM
) {
434 sprintf((char *) ch
, "%d", f
);
435 } else if (flag_mode
== FLAG_UNI
) {
436 u16_u8((char *) &ch
, 10, (w_char
*) &f
, 1);
438 ch
[0] = (unsigned char) (f
);
441 return mystrdup((char *) ch
);
444 // read in aff file and set flag mode
445 int HashMgr::load_config(const char * affpath
)
450 char line
[MAXDELEN
+1];
452 // open the affix file
454 afflst
= fopen(affpath
,"r");
456 HUNSPELL_WARNING(stderr
, "Error - could not open affix description file %s\n",affpath
);
460 // read in each line ignoring any that do not
461 // start with a known line type indicator
463 while (fgets(line
,MAXDELEN
,afflst
)) {
466 /* remove byte order mark */
469 if (strncmp(line
,"",3) == 0) memmove(line
, line
+3, strlen(line
+3)+1);
472 /* parse in the flag mode (FLAG parameter) */
473 if ((strncmp(line
,"FLAG",4) == 0) && isspace(line
[4])) {
474 if (flag_mode
!= FLAG_CHAR
) {
475 HUNSPELL_WARNING(stderr
, "error: duplicate FLAG parameter\n");
477 if (strstr(line
, "long")) flag_mode
= FLAG_LONG
;
478 if (strstr(line
, "num")) flag_mode
= FLAG_NUM
;
479 if (strstr(line
, "UTF-8")) flag_mode
= FLAG_UNI
;
480 if (flag_mode
== FLAG_CHAR
) {
481 HUNSPELL_WARNING(stderr
, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line
);
484 if ((strncmp(line
,"SET",3) == 0) && isspace(line
[3]) && strstr(line
, "UTF-8")) utf8
= 1;
486 /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
487 if (strncmp(line
,"IGNORE",6) == 0) {
488 if (parse_array(line
, &ignorechars
, &ignorechars_utf16
, &ignorechars_utf16_len
, "IGNORE", utf8
)) {
494 if ((strncmp(line
,"AF",2) == 0) && isspace(line
[2])) {
495 if (parse_aliasf(line
, afflst
)) {
501 #ifdef HUNSPELL_EXPERIMENTAL
502 if ((strncmp(line
,"AM",2) == 0) && isspace(line
[2])) {
503 if (parse_aliasm(line
, afflst
)) {
509 if (strncmp(line
,"COMPLEXPREFIXES",15) == 0) complexprefixes
= 1;
510 if (((strncmp(line
,"SFX",3) == 0) || (strncmp(line
,"PFX",3) == 0)) && isspace(line
[3])) break;
516 /* parse in the AF (alias for flag vector) table */
517 int HashMgr::parse_aliasf(char * line
, FILE * af
)
519 if (numaliasf
!= 0) {
520 HUNSPELL_WARNING(stderr
, "error: duplicate AF (alias for flag vector) tables used\n");
527 piece
= mystrsep(&tp
, 0);
529 if (*piece
!= '\0') {
531 case 0: { np
++; break; }
533 numaliasf
= atoi(piece
);
538 HUNSPELL_WARNING(stderr
, "incorrect number of entries in AF table\n");
542 aliasf
= (unsigned short **) malloc(numaliasf
* sizeof(unsigned short *));
543 aliasflen
= (unsigned short *) malloc(numaliasf
* sizeof(short));
544 if (!aliasf
|| !aliasflen
) {
546 if (aliasf
) free(aliasf
);
547 if (aliasflen
) free(aliasflen
);
560 piece
= mystrsep(&tp
, 0);
568 HUNSPELL_WARNING(stderr
, "error: missing AF table information\n");
572 /* now parse the numaliasf lines to read in the remainder of the table */
574 for (int j
=0; j
< numaliasf
; j
++) {
575 if (!fgets(nl
,MAXDELEN
,af
)) return 1;
581 piece
= mystrsep(&tp
, 0);
583 if (*piece
!= '\0') {
586 if (strncmp(piece
,"AF",2) != 0) {
592 HUNSPELL_WARNING(stderr
, "error: AF table is corrupt\n");
599 aliasflen
[j
] = (unsigned short) decode_flags(&(aliasf
[j
]), piece
);
600 flag_qsort(aliasf
[j
], 0, aliasflen
[j
]);
608 piece
= mystrsep(&tp
, 0);
616 HUNSPELL_WARNING(stderr
, "error: AF table is corrupt\n");
623 int HashMgr::is_aliasf() {
624 return (aliasf
!= NULL
);
627 int HashMgr::get_aliasf(int index
, unsigned short ** fvec
) {
628 if ((index
> 0) && (index
<= numaliasf
)) {
629 *fvec
= aliasf
[index
- 1];
630 return aliasflen
[index
- 1];
632 HUNSPELL_WARNING(stderr
, "error: bad flag alias index: %d\n", index
);
637 #ifdef HUNSPELL_EXPERIMENTAL
638 /* parse morph alias definitions */
639 int HashMgr::parse_aliasm(char * line
, FILE * af
)
641 if (numaliasm
!= 0) {
642 HUNSPELL_WARNING(stderr
, "error: duplicate AM (aliases for morphological descriptions) tables used\n");
649 piece
= mystrsep(&tp
, 0);
651 if (*piece
!= '\0') {
653 case 0: { np
++; break; }
655 numaliasm
= atoi(piece
);
657 HUNSPELL_WARNING(stderr
, "incorrect number of entries in AM table\n");
661 aliasm
= (char **) malloc(numaliasm
* sizeof(char *));
674 piece
= mystrsep(&tp
, 0);
680 HUNSPELL_WARNING(stderr
, "error: missing AM alias information\n");
684 /* now parse the numaliasm lines to read in the remainder of the table */
686 for (int j
=0; j
< numaliasm
; j
++) {
687 if (!fgets(nl
,MAXDELEN
,af
)) return 1;
692 piece
= mystrsep(&tp
, 0);
694 if (*piece
!= '\0') {
697 if (strncmp(piece
,"AM",2) != 0) {
698 HUNSPELL_WARNING(stderr
, "error: AM table is corrupt\n");
708 if (complexprefixes
) {
709 if (utf8
) reverseword_utf(piece
);
710 else reverseword(piece
);
712 aliasm
[j
] = mystrdup(piece
);
719 piece
= mystrsep(&tp
, 0);
725 HUNSPELL_WARNING(stderr
, "error: map table is corrupt\n");
732 int HashMgr::is_aliasm() {
733 return (aliasm
!= NULL
);
736 char * HashMgr::get_aliasm(int index
) {
737 if ((index
> 0) && (index
<= numaliasm
)) return aliasm
[index
- 1];
738 HUNSPELL_WARNING(stderr
, "error: bad morph. alias index: %d\n", index
);