1 #include "license.hunspell"
2 #include "license.myspell"
// Constructor. In the visible lines it resets flag_mode to FLAG_CHAR, clears
// the UTF-16 ignore-character buffer and its length, sets forbiddenword to
// FORBIDDENWORD, then calls load_config(apath, key) followed by
// load_tables(tpath, key).
//   tpath - forwarded to load_tables
//   apath - forwarded to load_config
//   key   - forwarded unchanged to both loaders
// NOTE(review): numbered lines are missing from this extract, so any other
// member initialisation cannot be confirmed from here.
13 // build a hash table from a munched word list
15 HashMgr::HashMgr(const char * tpath
, const char * apath
, const char * key
)
19 flag_mode
= FLAG_CHAR
;
27 ignorechars_utf16
= NULL
;
28 ignorechars_utf16_len
= 0;
33 forbiddenword
= FORBIDDENWORD
; // forbidden word signing flag
// load_config runs before load_tables; its return value is not captured in
// the visible code.
34 load_config(apath
, key
);
// ec is the status returned by load_tables; below it is only reported via
// HUNSPELL_WARNING (the guarding condition is not visible in this extract).
35 int ec
= load_tables(tpath
, key
);
37 /* error condition - what should we do here */
38 HUNSPELL_WARNING(stderr
, "Hash Manager Error : %d\n",ec
);
// Teardown fragment: walks every bucket of tableptr, then releases the alias
// tables and the ignore-character buffers.
// NOTE(review): the enclosing function header is not visible in this extract;
// presumably this is the destructor body - confirm.
51 // now pass through hash table freeing up everything
52 // go through column by column of the table
53 for (int i
=0; i
< tablesize
; i
++) {
54 struct hentry
* pt
= tableptr
[i
];
55 struct hentry
* nt
= NULL
;
// An entry's flag vector is freed only when no flag-alias table is in use, or
// when the entry carries ONLYUPCASEFLAG. Presumably aliased vectors are
// shared with aliasf (released separately below) while hidden upper-case
// entries hold their own malloc'd copy - confirm.
58 if (pt
->astr
&& (!aliasf
|| TESTAFF(pt
->astr
, ONLYUPCASEFLAG
, pt
->alen
))) free(pt
->astr
);
// release each flag-alias vector
68 for (int j
= 0; j
< (numaliasf
); j
++) free(aliasf
[j
]);
// release each morphological-alias string
77 for (int j
= 0; j
< (numaliasm
); j
++) free(aliasm
[j
]);
83 #ifndef MOZILLA_CLIENT
84 if (utf8
) free_utf_tbl();
91 if (ignorechars
) free(ignorechars
);
92 if (ignorechars_utf16
) free(ignorechars_utf16
);
// Finds the entry for `word`: picks the bucket tableptr[hash(word)] and walks
// its `next` chain, returning the first entry whose stored word is equal
// according to strcmp. Returns NULL when the bucket is empty.
// NOTE(review): the value returned when the chain is exhausted, and any guard
// on tableptr itself, are not visible in this extract.
99 // lookup a root word in the hashtable
101 struct hentry
* HashMgr::lookup(const char *word
) const
105 dp
= tableptr
[hash(word
)];
106 if (!dp
) return NULL
;
107 for ( ; dp
!= NULL
; dp
= dp
->next
) {
108 if (strcmp(word
, dp
->word
) == 0) return dp
;
// Allocates one variable-length hentry for `word` and links it into the hash
// table.
//   word       - the word text (the copy into the record is not visible here)
//   wbl, wcl   - stored as hp->blen and hp->clen (narrowed to unsigned char)
//   aff, al    - flag vector and its length; al is stored as hp->alen
//   desc       - optional description; stored inline after the word, or as an
//                alias pointer when aliasm is set
//   onlyupcase - its use is not visible in this extract
// NOTE(review): many numbered lines (braces, else branches, the return
// statements) are missing from this extract.
114 // add a word to the hash table (private)
115 int HashMgr::add_word(const char * word
, int wbl
, int wcl
, unsigned short * aff
,
116 int al
, const char * desc
, bool onlyupcase
)
118 bool upcasehomonym
= false;
// Extra bytes reserved for the description: sizeof(short) when morphological
// aliases are in use, otherwise the string plus its terminator.
// NOTE(review): the aliasm branch below calls store_pointer with the char*
// returned by get_aliasm; if store_pointer writes a whole pointer,
// sizeof(short) looks too small for it - confirm.
119 int descl
= desc
? (aliasm
? sizeof(short) : strlen(desc
) + 1) : 0;
120 // variable-length hash record with word and optional fields
122 (struct hentry
*) malloc (sizeof(struct hentry
) + wbl
+ descl
);
124 char * hpw
= hp
->word
;
// strip ignored characters from the stored word (UTF-16 table or byte table)
126 if (ignorechars
!= NULL
) {
128 remove_ignored_chars_utf(hpw
, ignorechars_utf16
, ignorechars_utf16_len
);
130 remove_ignored_chars(hpw
, ignorechars
);
// with COMPLEXPREFIXES the stored word is reversed
133 if (complexprefixes
) {
134 if (utf8
) reverseword_utf(hpw
); else reverseword(hpw
);
// NOTE(review): wbl and wcl are narrowed to unsigned char here; a length
// above 255 would wrap - confirm callers bound the word length.
139 hp
->blen
= (unsigned char) wbl
;
140 hp
->clen
= (unsigned char) wcl
;
141 hp
->alen
= (short) al
;
144 hp
->next_homonym
= NULL
;
146 // store the description string or its pointer
150 hp
->var
+= H_OPT_ALIASM
;
// the description slot starts right after the word's terminator (hpw + wbl + 1)
151 store_pointer(hpw
+ wbl
+ 1, get_aliasm(atoi(desc
)));
153 strcpy(hpw
+ wbl
+ 1, desc
);
154 if (complexprefixes
) {
155 if (utf8
) reverseword_utf(HENTRY_DATA(hp
));
156 else reverseword(HENTRY_DATA(hp
));
// mark records whose description contains the MORPH_PHON field
159 if (strstr(HENTRY_DATA(hp
), MORPH_PHON
)) hp
->var
+= H_OPT_PHON
;
162 struct hentry
* dp
= tableptr
[i
];
// Walk the bucket chain. An existing entry with the same word and no homonym
// yet either is a hidden ONLYUPCASEFLAG entry (handled in lines not visible
// here) or gets hp attached as its next_homonym.
167 while (dp
->next
!= NULL
) {
168 if ((!dp
->next_homonym
) && (strcmp(hp
->word
, dp
->word
) == 0)) {
169 // remove hidden onlyupcase homonym
171 if ((dp
->astr
) && TESTAFF(dp
->astr
, ONLYUPCASEFLAG
, dp
->alen
)) {
178 dp
->next_homonym
= hp
;
181 upcasehomonym
= true;
// The same comparison is repeated once after the loop, presumably for the
// last entry of the chain, which the `dp->next != NULL` condition skips -
// confirm.
186 if (strcmp(hp
->word
, dp
->word
) == 0) {
187 // remove hidden onlyupcase homonym
189 if ((dp
->astr
) && TESTAFF(dp
->astr
, ONLYUPCASEFLAG
, dp
->alen
)) {
196 dp
->next_homonym
= hp
;
199 upcasehomonym
= true;
202 if (!upcasehomonym
) {
205 // remove hidden onlyupcase homonym
206 if (hp
->astr
) free(hp
->astr
);
// For HUHCAP and HUHINITCAP words, and for ALLCAP words that have flags, adds
// a second entry in initial-capital form whose flag vector is the original
// one plus ONLYUPCASEFLAG. Words carrying the forbiddenword flag are skipped.
//   word, wbl, wcl - word and its lengths, forwarded to add_word
//   flags, al      - original flag vector and its length
//   dp             - description, forwarded to add_word
//   captype        - capitalisation class that decides whether anything is added
// Returns 1 when the flag copy cannot be allocated, otherwise the result of
// add_word on the paths visible here.
212 int HashMgr::add_hidden_capitalized_word(char * word
, int wbl
, int wcl
,
213 unsigned short * flags
, int al
, char * dp
, int captype
)
215 // add inner capitalized forms to handle the following allcap forms:
216 // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
217 // Allcaps with suffixes: CIA's -> CIA'S
218 if (((captype
== HUHCAP
) || (captype
== HUHINITCAP
) ||
219 ((captype
== ALLCAP
) && (flags
!= NULL
))) &&
220 !((flags
!= NULL
) && TESTAFF(flags
, forbiddenword
, al
))) {
// private copy of the flags with room for one extra entry
221 unsigned short * flags2
= (unsigned short *) malloc (sizeof(unsigned short) * (al
+1));
222 if (!flags2
) return 1;
223 if (al
) memcpy(flags2
, flags
, al
* sizeof(unsigned short));
224 flags2
[al
] = ONLYUPCASEFLAG
;
// UTF-8 path: convert to UTF-16, lower-case everything, upper-case the first
// character, convert back into st and add that.
// NOTE(review): the declarations of w and st and the branch condition are not
// visible in this extract.
228 int wlen
= u8_u16(w
, BUFSIZE
, word
);
229 mkallsmall_utf(w
, wlen
, langnum
);
230 mkallcap_utf(w
, 1, langnum
);
231 u16_u8(st
, BUFSIZE
, w
, wlen
);
232 return add_word(st
,wbl
,wcl
,flags2
,al
+1,dp
, true);
// 8-bit path: `word` itself is handed to mkallsmall/mkinitcap.
// NOTE(review): presumably these rewrite the caller's buffer in place -
// confirm.
234 mkallsmall(word
, csconv
);
235 mkinitcap(word
, csconv
);
236 return add_word(word
,wbl
,wcl
,flags2
,al
+1,dp
, true);
// Writes the capitalisation class of `word` to *captype. In the first visible
// branch the word is converted to UTF-16 with u8_u16 and classified with
// get_captype_utf8; in the other it is classified with get_captype using
// csconv.
// NOTE(review): the branch condition, the declaration and initial value of
// len, and the return statement are not visible in this extract.
242 // detect captype and modify word length for UTF-8 encoding
243 int HashMgr::get_clen_and_captype(const char * word
, int wbl
, int * captype
) {
246 w_char dest_utf
[BUFSIZE
];
247 len
= u8_u16(dest_utf
, BUFSIZE
, word
);
248 *captype
= get_captype_utf8(dest_utf
, len
, langnum
);
251 *captype
= get_captype((char *) word
, len
, csconv
);
// Marks `word` as forbidden: for each entry in its homonym chain that does
// not already carry forbiddenword, builds a new flag vector one element
// longer, appends forbiddenword and sorts it with flag_qsort.
// Returns 1 when that allocation fails.
// NOTE(review): the loop header, the assignment of the new vector to the
// entry, the handling of the old dp->astr and the final return are not
// visible in this extract.
256 // remove word (personal dictionary function for standalone applications)
257 int HashMgr::remove(const char * word
)
259 struct hentry
* dp
= lookup(word
);
261 if (dp
->alen
== 0 || !TESTAFF(dp
->astr
, forbiddenword
, dp
->alen
)) {
262 unsigned short * flags
=
263 (unsigned short *) malloc(sizeof(short) * (dp
->alen
+ 1));
264 if (!flags
) return 1;
265 for (int i
= 0; i
< dp
->alen
; i
++) flags
[i
] = dp
->astr
[i
];
266 flags
[dp
->alen
] = forbiddenword
;
269 flag_qsort(flags
, 0, dp
->alen
);
// continue with the next homonym of the same word
271 dp
= dp
->next_homonym
;
// For each entry in the homonym chain of `word` that carries forbiddenword:
// when it is the only flag, alen is set to 0; otherwise a new vector is built
// containing every flag except forbiddenword and installed as dp->astr.
// Returns 1 when that allocation fails.
// NOTE(review): the loop header, the else keyword, the declarations of i and
// j and the final return are not visible in this extract.
276 /* remove forbidden flag to add a personal word to the hash */
277 int HashMgr::remove_forbidden_flag(const char * word
) {
278 struct hentry
* dp
= lookup(word
);
281 if (dp
->astr
&& TESTAFF(dp
->astr
, forbiddenword
, dp
->alen
)) {
282 if (dp
->alen
== 1) dp
->alen
= 0; // XXX forbidden words of personal dic.
// NOTE(review): the buffer holds alen - 1 flags, which is exact only when
// forbiddenword occurs once in astr - confirm.
284 unsigned short * flags2
=
285 (unsigned short *) malloc(sizeof(short) * (dp
->alen
- 1));
286 if (!flags2
) return 1;
288 for (i
= 0; i
< dp
->alen
; i
++) {
289 if (dp
->astr
[i
] != forbiddenword
) flags2
[j
++] = dp
->astr
[i
];
// NOTE(review): no free of the previous dp->astr and no update of dp->alen
// are visible before this assignment - confirm against the full source.
292 dp
->astr
= flags2
; // XXX allowed forbidden words
295 dp
= dp
->next_homonym
;
// Adds `word` with no flags (flags is NULL): after a call to
// remove_forbidden_flag it computes the byte length and character length,
// calls add_word, then returns the result of add_hidden_capitalized_word.
// NOTE(review): the declarations of al and captype are not visible in this
// extract.
300 // add a custom dic. word to the hash table (public)
301 int HashMgr::add(const char * word
)
303 unsigned short * flags
= NULL
;
305 if (remove_forbidden_flag(word
)) {
307 int wbl
= strlen(word
);
308 int wcl
= get_clen_and_captype(word
, wbl
, &captype
);
// NOTE(review): add_word's status is discarded here, although load_tables
// checks it.
309 add_word(word
, wbl
, wcl
, flags
, al
, NULL
, false);
// NOTE(review): const is cast away; add_hidden_capitalized_word passes this
// pointer to mkallsmall/mkinitcap on its 8-bit path - confirm that the
// caller's string may be modified.
310 return add_hidden_capitalized_word((char *) word
, wbl
, wcl
, flags
, al
, NULL
, captype
);
// Adds `word` using the flag vector of the existing entry `example`: looks up
// example, clears any forbidden flag on word, and when example has flags adds
// word with them, then returns the result of add_hidden_capitalized_word.
// Two add_word variants are visible: one passes dp->astr directly, the other
// passes a malloc'd copy.
// NOTE(review): the condition selecting between the variants, any null check
// on the copy, and the declaration of captype are not visible in this
// extract; presumably the choice depends on whether flag aliases are in use -
// confirm.
315 int HashMgr::add_with_affix(const char * word
, const char * example
)
317 // detect captype and modify word length for UTF-8 encoding
318 struct hentry
* dp
= lookup(example
);
319 remove_forbidden_flag(word
);
320 if (dp
&& dp
->astr
) {
322 int wbl
= strlen(word
);
323 int wcl
= get_clen_and_captype(word
, wbl
, &captype
);
// variant 1: the new entry uses the example's flag vector itself
325 add_word(word
, wbl
, wcl
, dp
->astr
, dp
->alen
, NULL
, false);
// variant 2: the new entry gets its own copy of the example's flag vector
327 unsigned short * flags
= (unsigned short *) malloc (dp
->alen
* sizeof(short));
329 memcpy((void *) flags
, (void *) dp
->astr
, dp
->alen
* sizeof(short));
330 add_word(word
, wbl
, wcl
, flags
, dp
->alen
, NULL
, false);
333 return add_hidden_capitalized_word((char *) word
, wbl
, wcl
, dp
->astr
, dp
->alen
, NULL
, captype
);
// Iteration step over all entries: when hp has a successor in its chain that
// successor is returned; otherwise col is advanced until a non-empty bucket
// is found and its head is returned. col is a reference, so the caller's
// column index is updated.
// NOTE(review): the statements that run once col reaches tablesize are not
// visible in this extract.
338 // walk the hash table entry by entry - null at end
339 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
340 struct hentry
* HashMgr::walk_hashtable(int &col
, struct hentry
* hp
) const
342 if (hp
&& hp
->next
!= NULL
) return hp
->next
;
343 for (col
++; col
< tablesize
; col
++) {
344 if (tableptr
[col
]) return tableptr
[col
];
346 // null at end and reset to start
// Reads the dictionary file at tpath through FileMgr. The first line supplies
// the word count used to size the hash table. Every following line is split
// into word, optional flag field (after '/') and optional morphological
// description, then passed to add_word and add_hidden_capitalized_word.
//   tpath - dictionary file path, passed to FileMgr
//   key   - passed to FileMgr unchanged
// Returns 1 when the FileMgr pointer is NULL; other return paths are not
// visible in this extract.
351 // load a munched word list and build a hash table on the fly
352 int HashMgr::load_tables(const char * tpath
, const char * key
)
358 unsigned short * flags
;
361 // open dictionary file
// NOTE(review): a plain new-expression reports failure by throwing, not by
// yielding NULL, so the check below presumably never fires unless exceptions
// are disabled or operator new is replaced - confirm.
362 FileMgr
* dict
= new FileMgr(tpath
, key
);
363 if (dict
== NULL
) return 1;
365 // first read the first line of file to get hash table size
366 if (!(ts
= dict
->getline())) {
367 HUNSPELL_WARNING(stderr
, "error: empty dic file\n");
373 /* remove byte order mark */
374 if (strncmp(ts
,"\xEF\xBB\xBF",3) == 0) {
375 memmove(ts
, ts
+3, strlen(ts
+3)+1);
376 // warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions
// NOTE(review): only a zero count is rejected in the visible code; a negative
// number on line 1 would reach the malloc size computation - confirm.
379 tablesize
= atoi(ts
);
380 if (tablesize
== 0) {
381 HUNSPELL_WARNING(stderr
, "error: line 1: missing or bad word count in the dic file\n");
// leave headroom for USERWORD extra entries and make the size odd
385 tablesize
= tablesize
+ 5 + USERWORD
;
386 if ((tablesize
%2) == 0) tablesize
++;
388 // allocate the hash table
389 tableptr
= (struct hentry
**) malloc(tablesize
* sizeof(struct hentry
*));
394 for (int i
=0; i
<tablesize
; i
++) tableptr
[i
] = NULL
;
396 // loop through all words on the munched list and add to hash
397 // table and create word and affix strings
399 while ((ts
= dict
->getline())) {
401 // split each line into word and morphological description
// A ':' whose third-previous character is a space or tab is treated as the
// start of a morphological field; dp is then moved back over the preceding
// whitespace.
403 while ((dp
= strchr(dp
, ':'))) {
404 if ((dp
> ts
+ 3) && (*(dp
- 3) == ' ' || *(dp
- 3) == '\t')) {
405 for (dp
-= 4; dp
>= ts
&& (*dp
== ' ' || *dp
== '\t'); dp
--);
406 if (dp
< ts
) { // missing word
417 // tabulator is the old morphological field separator
418 dp2
= strchr(ts
, '\t');
419 if (dp2
&& (!dp
|| dp2
< dp
)) {
424 // split each line into word and affix char strings
425 // "\/" signs slash in words (not affix separator)
426 // "/" at beginning of the line is word character (not affix separator)
432 } else if (*(ap
- 1) != '\\') break;
433 // replace "\/" with "/"
// shifts the rest of the string one position left, overwriting the backslash
434 for (char * sp
= ap
- 1; *sp
; *sp
= *(sp
+ 1), sp
++);
// flag field given as a numeric alias index
441 int index
= atoi(ap
+ 1);
442 al
= get_aliasf(index
, &flags
, dict
);
444 HUNSPELL_WARNING(stderr
, "error: line %d: bad flag vector alias\n", dict
->getlinenum());
// flag field given literally
448 al
= decode_flags(&flags
, ap
+ 1, dict
);
450 HUNSPELL_WARNING(stderr
, "Can't allocate memory.\n");
454 flag_qsort(flags
, 0, al
);
463 int wbl
= strlen(ts
);
464 int wcl
= get_clen_and_captype(ts
, wbl
, &captype
);
465 // add the word and its index plus its capitalized form optionally
466 if (add_word(ts
,wbl
,wcl
,flags
,al
,dp
, false) ||
467 add_hidden_capitalized_word(ts
, wbl
, wcl
, flags
, al
, dp
, captype
)) {
// Computes the bucket index for `word`: the first (up to) four bytes are
// packed into hv, further processing applies ROTATE(hv, ROTATE_LEN), and the
// result is reduced modulo tablesize.
// NOTE(review): the declaration of hv and the loop around ROTATE are not
// visible in this extract.
477 // the hash function is a simple load and rotate
478 // algorithm borrowed
480 int HashMgr::hash(const char * word
) const
// NOTE(review): *word is plain char; where char is signed, bytes >= 0x80 are
// sign-extended by integer promotion before the OR - confirm this is intended.
483 for (int i
=0; i
< 4 && *word
!= 0; i
++)
484 hv
= (hv
<< 8) | (*word
++);
486 ROTATE(hv
,ROTATE_LEN
);
// NOTE(review): tablesize is the divisor; confirm this is never reached while
// tablesize is 0.
489 return (unsigned long) hv
% tablesize
;
// Decodes the textual flag field `flags` into a newly malloc'd array stored in
// *result, using one of four encodings (FLAG_LONG, FLAG_NUM, FLAG_UNI, or the
// default one-character form).
//   result - receives the allocated array
//   flags  - textual flag field
//   af     - used only for line numbers in warnings
// Returns -1 when the allocation fails.
// NOTE(review): the switch header, the handling of an empty string, the
// computation of len in some branches and the normal return are not visible
// in this extract.
492 int HashMgr::decode_flags(unsigned short ** result
, char * flags
, FileMgr
* af
) {
494 if (*flags
== '\0') {
499 case FLAG_LONG
: { // two-character flags (1x2yZz -> 1x 2y Zz)
501 if (len
%2 == 1) HUNSPELL_WARNING(stderr
, "error: line %d: bad flagvector\n", af
->getlinenum());
503 *result
= (unsigned short *) malloc(len
* sizeof(short));
504 if (!*result
) return -1;
// each flag is built from two consecutive bytes: first byte high, second low
505 for (int i
= 0; i
< len
; i
++) {
506 (*result
)[i
] = (((unsigned short) flags
[i
* 2]) << 8) + (unsigned short) flags
[i
* 2 + 1];
510 case FLAG_NUM
: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
514 unsigned short * dest
;
// first pass: count commas to size the array
516 for (p
= flags
; *p
; p
++) {
517 if (*p
== ',') len
++;
519 *result
= (unsigned short *) malloc(len
* sizeof(short));
520 if (!*result
) return -1;
// second pass: store each number.
// NOTE(review): in the visible lines an id >= DEFAULTFLAGS or an id of 0 only
// produces a warning and is still stored (narrowed to unsigned short).
522 for (p
= flags
; *p
; p
++) {
525 if (i
>= DEFAULTFLAGS
) HUNSPELL_WARNING(stderr
, "error: line %d: flag id %d is too large (max: %d)\n",
526 af
->getlinenum(), i
, DEFAULTFLAGS
- 1);
527 *dest
= (unsigned short) i
;
528 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: line %d: 0 is wrong flag id\n", af
->getlinenum());
// the same checks repeated, presumably for the number after the last comma -
// confirm
534 if (i
>= DEFAULTFLAGS
) HUNSPELL_WARNING(stderr
, "error: line %d: flag id %d is too large (max: %d)\n",
535 af
->getlinenum(), i
, DEFAULTFLAGS
- 1);
536 *dest
= (unsigned short) i
;
537 if (*dest
== 0) HUNSPELL_WARNING(stderr
, "error: line %d: 0 is wrong flag id\n", af
->getlinenum());
540 case FLAG_UNI
: { // UTF-8 characters
// NOTE(review): len comes straight from u8_u16; confirm it cannot be negative
// before it reaches malloc and memcpy.
542 len
= u8_u16(w
, BUFSIZE
/2, flags
);
543 *result
= (unsigned short *) malloc(len
* sizeof(short));
544 if (!*result
) return -1;
545 memcpy(*result
, w
, len
* sizeof(short));
548 default: { // Ispell's one-character flags (erfg -> e r f g)
549 unsigned short * dest
;
551 *result
= (unsigned short *) malloc(len
* sizeof(short));
552 if (!*result
) return -1;
// bytes are read as unsigned char, so each flag value is 0..255
554 for (unsigned char * p
= (unsigned char *) flags
; *p
; p
++) {
555 *dest
= (unsigned short) *p
;
// Decodes a single flag from its textual form `f`. Four decodings are
// visible: two bytes combined high/low, a number narrowed to unsigned short,
// one UTF-16 unit written by u8_u16, and a single unsigned byte. A result of
// 0 triggers a warning.
// NOTE(review): the switch/case lines selecting the decoding, the computation
// of i and the return statement are not visible in this extract.
563 unsigned short HashMgr::decode_flag(const char * f
) {
564 unsigned short s
= 0;
568 s
= ((unsigned short) f
[0] << 8) + (unsigned short) f
[1];
// an over-large id is only reported; the value is still narrowed and stored
572 if (i
>= DEFAULTFLAGS
) HUNSPELL_WARNING(stderr
, "error: flag id %d is too large (max: %d)\n", i
, DEFAULTFLAGS
- 1);
573 s
= (unsigned short) i
;
// s is written through a w_char* cast, with room for exactly one unit
576 u8_u16((w_char
*) &s
, 1, f
);
579 s
= (unsigned short) *((unsigned char *)f
);
581 if (s
== 0) HUNSPELL_WARNING(stderr
, "error: 0 is wrong flag id\n");
// Converts flag value f to text according to flag_mode and returns a copy
// made with mystrdup. A value of 0 yields the string "(NULL)".
// NOTE(review): the result comes from mystrdup, so presumably the caller must
// release it - confirm.
// NOTE(review): no write of a terminating NUL into ch is visible for the
// FLAG_LONG, FLAG_UNI and single-character branches; the numbered lines that
// would hold it are missing from this extract - confirm against the full
// source.
585 char * HashMgr::encode_flag(unsigned short f
) {
586 unsigned char ch
[10];
587 if (f
==0) return mystrdup("(NULL)");
588 if (flag_mode
== FLAG_LONG
) {
// high byte first, then low byte
589 ch
[0] = (unsigned char) (f
>> 8);
590 ch
[1] = (unsigned char) (f
- ((f
>> 8) << 8));
592 } else if (flag_mode
== FLAG_NUM
) {
// an unsigned short prints as at most 5 digits plus NUL, which fits in ch[10]
593 sprintf((char *) ch
, "%d", f
);
594 } else if (flag_mode
== FLAG_UNI
) {
595 u16_u8((char *) &ch
, 10, (w_char
*) &f
, 1);
597 ch
[0] = (unsigned char) (f
);
600 return mystrdup((char *) ch
);
// Scans the affix file line by line and handles the keywords FLAG,
// FORBIDDENWORD, SET, LANG, IGNORE, AF, AM and COMPLEXPREFIXES. Scanning
// stops at the first SFX or PFX line. Afterwards csconv falls back to
// get_current_cs(SPELL_ENCODING) when no encoding was set.
//   affpath - affix file path, passed to FileMgr
//   key     - passed to FileMgr unchanged
// NOTE(review): FLAG, AF, AM, SFX and PFX also require whitespace after the
// keyword, but FORBIDDENWORD, SET, LANG, IGNORE and COMPLEXPREFIXES are
// matched by prefix only, so any keyword starting with those letters would
// match too - confirm this is acceptable.
// NOTE(review): error-path bodies and the return statements are not visible
// in this extract.
603 // read in aff file and set flag mode
604 int HashMgr::load_config(const char * affpath
, const char * key
)
606 char * line
; // io buffers
609 // open the affix file
610 FileMgr
* afflst
= new FileMgr(affpath
, key
);
612 HUNSPELL_WARNING(stderr
, "Error - could not open affix description file %s\n",affpath
);
616 // read in each line ignoring any that do not
617 // start with a known line type indicator
619 while ((line
= afflst
->getline())) {
622 /* remove byte order mark */
625 if (strncmp(line
,"\xEF\xBB\xBF",3) == 0) memmove(line
, line
+3, strlen(line
+3)+1);
628 /* parse in the FLAG mode parameter */
629 if ((strncmp(line
,"FLAG",4) == 0) && isspace(line
[4])) {
630 if (flag_mode
!= FLAG_CHAR
) {
631 HUNSPELL_WARNING(stderr
, "error: line %d: multiple definitions of the FLAG affix file parameter\n", afflst
->getlinenum());
// substring tests run in sequence, so a later match overrides an earlier one
633 if (strstr(line
, "long")) flag_mode
= FLAG_LONG
;
634 if (strstr(line
, "num")) flag_mode
= FLAG_NUM
;
635 if (strstr(line
, "UTF-8")) flag_mode
= FLAG_UNI
;
636 if (flag_mode
== FLAG_CHAR
) {
637 HUNSPELL_WARNING(stderr
, "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n", afflst
->getlinenum());
640 if (strncmp(line
,"FORBIDDENWORD",13) == 0) {
642 if (parse_string(line
, &st
, afflst
->getlinenum())) {
646 forbiddenword
= decode_flag(st
);
649 if (strncmp(line
, "SET", 3) == 0) {
650 if (parse_string(line
, &enc
, afflst
->getlinenum())) {
654 if (strcmp(enc
, "UTF-8") == 0) {
656 #ifndef OPENOFFICEORG
657 #ifndef MOZILLA_CLIENT
658 initialize_utf_tbl();
661 } else csconv
= get_current_cs(enc
);
663 if (strncmp(line
, "LANG", 4) == 0) {
664 if (parse_string(line
, &lang
, afflst
->getlinenum())) {
668 langnum
= get_lang_num(lang
);
671 /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
672 if (strncmp(line
,"IGNORE",6) == 0) {
673 if (parse_array(line
, &ignorechars
, &ignorechars_utf16
,
674 &ignorechars_utf16_len
, utf8
, afflst
->getlinenum())) {
680 if ((strncmp(line
,"AF",2) == 0) && isspace(line
[2])) {
681 if (parse_aliasf(line
, afflst
)) {
687 if ((strncmp(line
,"AM",2) == 0) && isspace(line
[2])) {
688 if (parse_aliasm(line
, afflst
)) {
694 if (strncmp(line
,"COMPLEXPREFIXES",15) == 0) complexprefixes
= 1;
// the first affix rule ends the configuration scan
695 if (((strncmp(line
,"SFX",3) == 0) || (strncmp(line
,"PFX",3) == 0)) && isspace(line
[3])) break;
697 if (csconv
== NULL
) csconv
= get_current_cs(SPELL_ENCODING
);
// Parses the flag-alias table. The header line supplies the entry count
// (numaliasf); aliasf and aliasflen are allocated to that size; then
// numaliasf further lines are read from af, each expected to start with "AF",
// and each flag field is decoded with decode_flags and sorted with flag_qsort.
//   line - the header line
//   af   - source of the remaining lines and of line numbers for warnings
// Returns 1 when the file ends before all entries were read; other return
// paths are not visible in this extract.
702 /* parse in the ALIAS table */
703 int HashMgr::parse_aliasf(char * line
, FileMgr
* af
)
// a second table definition is reported
705 if (numaliasf
!= 0) {
706 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
713 piece
= mystrsep(&tp
, 0);
715 if (*piece
!= '\0') {
717 case 0: { np
++; break; }
719 numaliasf
= atoi(piece
);
724 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
727 aliasf
= (unsigned short **) malloc(numaliasf
* sizeof(unsigned short *));
728 aliasflen
= (unsigned short *) malloc(numaliasf
* sizeof(short));
// when either allocation failed, release whichever one succeeded
729 if (!aliasf
|| !aliasflen
) {
731 if (aliasf
) free(aliasf
);
732 if (aliasflen
) free(aliasflen
);
744 piece
= mystrsep(&tp
, 0);
752 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
756 /* now parse the numaliasf lines to read in the remainder of the table */
758 for (int j
=0; j
< numaliasf
; j
++) {
759 if (!(nl
= af
->getline())) return 1;
765 piece
= mystrsep(&tp
, 0);
767 if (*piece
!= '\0') {
770 if (strncmp(piece
,"AF",2) != 0) {
776 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// NOTE(review): decode_flags returns -1 on allocation failure; the cast below
// turns that into 65535, which is then passed to flag_qsort as the bound -
// confirm this case is handled.
782 aliasflen
[j
] = (unsigned short) decode_flags(&(aliasf
[j
]), piece
, af
);
783 flag_qsort(aliasf
[j
], 0, aliasflen
[j
]);
790 piece
= mystrsep(&tp
, 0);
798 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// Returns non-zero when a flag-alias table is present (aliasf is not NULL).
805 int HashMgr::is_aliasf() {
806 return (aliasf
!= NULL
);
// Resolves a 1-based flag-alias index. For 1 <= index <= numaliasf it stores
// aliasf[index - 1] in *fvec and returns aliasflen[index - 1]; the stored
// pointer is the table's own vector, not a copy. Any other index produces a
// warning that uses af for the line number.
// NOTE(review): the value returned, and whatever is written to *fvec, after
// the warning are not visible in this extract.
809 int HashMgr::get_aliasf(int index
, unsigned short ** fvec
, FileMgr
* af
) {
810 if ((index
> 0) && (index
<= numaliasf
)) {
811 *fvec
= aliasf
[index
- 1];
812 return aliasflen
[index
- 1];
814 HUNSPELL_WARNING(stderr
, "error: line %d: bad flag alias index: %d\n", af
->getlinenum(), index
);
// Parses the morphological-alias table. The header line supplies the entry
// count (numaliasm); aliasm is allocated to that size; then numaliasm further
// lines are read from af, each expected to start with "AM", and the rest of
// each line is stored as a mystrdup copy (reversed first when complexprefixes
// is set).
//   line - the header line
//   af   - source of the remaining lines and of line numbers for warnings
// Returns 1 when the file ends before all entries were read; other return
// paths, and any check of the malloc result, are not visible in this extract.
819 /* parse morph alias definitions */
820 int HashMgr::parse_aliasm(char * line
, FileMgr
* af
)
// a second table definition is reported
822 if (numaliasm
!= 0) {
823 HUNSPELL_WARNING(stderr
, "error: line %d: multiple table definitions\n", af
->getlinenum());
830 piece
= mystrsep(&tp
, 0);
832 if (*piece
!= '\0') {
834 case 0: { np
++; break; }
836 numaliasm
= atoi(piece
);
838 HUNSPELL_WARNING(stderr
, "error: line %d: bad entry number\n", af
->getlinenum());
841 aliasm
= (char **) malloc(numaliasm
* sizeof(char *));
853 piece
= mystrsep(&tp
, 0);
859 HUNSPELL_WARNING(stderr
, "error: line %d: missing data\n", af
->getlinenum());
863 /* now parse the numaliasm lines to read in the remainder of the table */
865 for (int j
=0; j
< numaliasm
; j
++) {
866 if (!(nl
= af
->getline())) return 1;
// entry lines are split on a space here, whereas the header uses delimiter 0
871 piece
= mystrsep(&tp
, ' ');
873 if (*piece
!= '\0') {
876 if (strncmp(piece
,"AM",2) != 0) {
877 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
886 // add the remaining of the line
// moves tp to the end of the line so no further pieces are split off
889 tp
= tp
+ strlen(tp
);
891 if (complexprefixes
) {
892 if (utf8
) reverseword_utf(piece
);
893 else reverseword(piece
);
895 aliasm
[j
] = mystrdup(piece
);
907 piece
= mystrsep(&tp
, ' ');
913 HUNSPELL_WARNING(stderr
, "error: line %d: table is corrupt\n", af
->getlinenum());
// Returns non-zero when a morphological-alias table is present (aliasm is not
// NULL).
920 int HashMgr::is_aliasm() {
921 return (aliasm
!= NULL
);
// Resolves a 1-based morphological-alias index. For 1 <= index <= numaliasm
// it returns aliasm[index - 1], which is the table's own string, not a copy.
// Any other index produces a warning.
// NOTE(review): the value returned after the warning is not visible in this
// extract.
924 char * HashMgr::get_aliasm(int index
) {
925 if ((index
> 0) && (index
<= numaliasm
)) return aliasm
[index
- 1];
926 HUNSPELL_WARNING(stderr
, "error: bad morph. alias index: %d\n", index
);