disable sd filters test for now
[LibreOffice.git] / hyphen / hyphen-2.7.1-2.8.3.patch
blob047ce13608ac81865bfa04a7de2dcc3b641f2371
1 --- misc/build/hyphen-2.7.1/hyphen.c.old 2011-10-10 15:58:33.317260138 +0200
2 +++ misc/build/hyphen-2.7.1/hyphen.c 2011-10-10 15:58:55.221260136 +0200
3 @@ -226,115 +226,61 @@
6 #ifdef VERBOSE
7 -HashTab *global;
8 +HashTab *global[1];
10 static char *
11 -get_state_str (int state)
12 +get_state_str (int state, int level)
14 int i;
15 HashEntry *e;
17 for (i = 0; i < HASH_SIZE; i++)
18 - for (e = global->entries[i]; e; e = e->next)
19 + for (e = global[level]->entries[i]; e; e = e->next)
20 if (e->val == state)
21 return e->key;
22 return NULL;
24 #endif
26 -HyphenDict *
27 -hnj_hyphen_load (const char *fn)
29 - HyphenDict *dict[2];
30 - HashTab *hashtab;
31 - FILE *f;
32 - char buf[MAX_CHARS];
33 +void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
34 + int i, j;
35 char word[MAX_CHARS];
36 char pattern[MAX_CHARS];
37 char * repl;
38 signed char replindex;
39 signed char replcut;
40 - int state_num = 0, last_state;
41 - int i, j, k;
42 + int state_num = 0;
43 + int last_state;
44 char ch;
45 int found;
46 - HashEntry *e;
47 - int nextlevel = 0;
49 - f = fopen (fn, "r");
50 - if (f == NULL)
51 - return NULL;
53 -// loading one or two dictionaries (separated by NEXTLEVEL keyword)
54 -for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
55 - hashtab = hnj_hash_new ();
56 -#ifdef VERBOSE
57 - global = hashtab;
58 -#endif
59 - hnj_hash_insert (hashtab, "", 0);
60 - dict[k] = hnj_malloc (sizeof(HyphenDict));
61 - dict[k]->num_states = 1;
62 - dict[k]->states = hnj_malloc (sizeof(HyphenState));
63 - dict[k]->states[0].match = NULL;
64 - dict[k]->states[0].repl = NULL;
65 - dict[k]->states[0].fallback_state = -1;
66 - dict[k]->states[0].num_trans = 0;
67 - dict[k]->states[0].trans = NULL;
68 - dict[k]->nextlevel = NULL;
69 - dict[k]->lhmin = 0;
70 - dict[k]->rhmin = 0;
71 - dict[k]->clhmin = 0;
72 - dict[k]->crhmin = 0;
73 - dict[k]->nohyphen = NULL;
74 - dict[k]->nohyphenl = 0;
76 - /* read in character set info */
77 - if (k == 0) {
78 - for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
79 - fgets(dict[k]->cset, sizeof(dict[k]->cset),f);
80 - for (i=0;i<MAX_NAME;i++)
81 - if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
82 - dict[k]->cset[i] = 0;
83 - dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
84 - } else {
85 - strcpy(dict[k]->cset, dict[0]->cset);
86 - dict[k]->utf8 = dict[0]->utf8;
87 - }
89 - while (fgets (buf, sizeof(buf), f) != NULL)
90 - {
91 - if (buf[0] != '%')
92 - {
93 - if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
94 - nextlevel = 1;
95 - break;
96 - } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
97 - dict[k]->lhmin = atoi(buf + 13);
98 - continue;
99 + if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
100 + dict->lhmin = atoi(buf + 13);
101 + return;
102 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
103 - dict[k]->rhmin = atoi(buf + 14);
104 - continue;
105 + dict->rhmin = atoi(buf + 14);
106 + return;
107 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
108 - dict[k]->clhmin = atoi(buf + 21);
109 - continue;
110 + dict->clhmin = atoi(buf + 21);
111 + return;
112 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
113 - dict[k]->crhmin = atoi(buf + 22);
114 - continue;
115 + dict->crhmin = atoi(buf + 22);
116 + return;
117 } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
118 char * space = buf + 8;
119 while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
120 - if (*buf != '\0') dict[k]->nohyphen = hnj_strdup(space);
121 - if (dict[k]->nohyphen) {
122 - char * nhe = dict[k]->nohyphen + strlen(dict[k]->nohyphen) - 1;
123 + if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
124 + if (dict->nohyphen) {
125 + char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
126 *nhe = 0;
127 - for (nhe = nhe - 1; nhe > dict[k]->nohyphen; nhe--) {
128 + for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
129 if (*nhe == ',') {
130 - dict[k]->nohyphenl++;
131 + dict->nohyphenl++;
132 *nhe = 0;
136 - continue;
137 + return;
139 j = 0;
140 pattern[j] = '0';
141 @@ -379,7 +325,7 @@
142 } else {
143 if (*word == '.') i++;
144 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
145 - if (dict[k]->utf8) {
146 + if (dict->utf8) {
147 int pu = -1; /* unicode character position */
148 int ps = -1; /* unicode start position (original replindex) */
149 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
150 @@ -403,14 +349,14 @@
151 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl);
152 #endif
153 found = hnj_hash_lookup (hashtab, word);
154 - state_num = hnj_get_state (dict[k], hashtab, word);
155 - dict[k]->states[state_num].match = hnj_strdup (pattern + i);
156 - dict[k]->states[state_num].repl = repl;
157 - dict[k]->states[state_num].replindex = replindex;
158 + state_num = hnj_get_state (dict, hashtab, word);
159 + dict->states[state_num].match = hnj_strdup (pattern + i);
160 + dict->states[state_num].repl = repl;
161 + dict->states[state_num].replindex = replindex;
162 if (!replcut) {
163 - dict[k]->states[state_num].replcut = (signed char) strlen(word);
164 + dict->states[state_num].replcut = (signed char) strlen(word);
165 } else {
166 - dict[k]->states[state_num].replcut = replcut;
167 + dict->states[state_num].replcut = replcut;
170 /* now, put in the prefix transitions */
171 @@ -420,11 +366,82 @@
172 ch = word[j - 1];
173 word[j - 1] = '\0';
174 found = hnj_hash_lookup (hashtab, word);
175 - state_num = hnj_get_state (dict[k], hashtab, word);
176 - hnj_add_trans (dict[k], state_num, last_state, ch);
177 + state_num = hnj_get_state (dict, hashtab, word);
178 + hnj_add_trans (dict, state_num, last_state, ch);
183 +HyphenDict *
184 +hnj_hyphen_load (const char *fn)
186 + HyphenDict *dict[2];
187 + HashTab *hashtab;
188 + FILE *f;
189 + char buf[MAX_CHARS];
190 + int nextlevel = 0;
191 + int i, j, k;
192 + HashEntry *e;
193 + int state_num = 0;
195 + f = fopen (fn, "r");
196 + if (f == NULL)
197 + return NULL;
199 +// loading one or two dictionaries (separated by NEXTLEVEL keyword)
200 +for (k = 0; k < 2; k++) {
201 + hashtab = hnj_hash_new ();
202 +#ifdef VERBOSE
203 + global[k] = hashtab;
204 +#endif
205 + hnj_hash_insert (hashtab, "", 0);
206 + dict[k] = hnj_malloc (sizeof(HyphenDict));
207 + dict[k]->num_states = 1;
208 + dict[k]->states = hnj_malloc (sizeof(HyphenState));
209 + dict[k]->states[0].match = NULL;
210 + dict[k]->states[0].repl = NULL;
211 + dict[k]->states[0].fallback_state = -1;
212 + dict[k]->states[0].num_trans = 0;
213 + dict[k]->states[0].trans = NULL;
214 + dict[k]->nextlevel = NULL;
215 + dict[k]->lhmin = 0;
216 + dict[k]->rhmin = 0;
217 + dict[k]->clhmin = 0;
218 + dict[k]->crhmin = 0;
219 + dict[k]->nohyphen = NULL;
220 + dict[k]->nohyphenl = 0;
222 + /* read in character set info */
223 + if (k == 0) {
224 + for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
225 + fgets(dict[k]->cset, sizeof(dict[k]->cset),f);
226 + for (i=0;i<MAX_NAME;i++)
227 + if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
228 + dict[k]->cset[i] = 0;
229 + dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
230 + } else {
231 + strcpy(dict[k]->cset, dict[0]->cset);
232 + dict[k]->utf8 = dict[0]->utf8;
235 + if (k == 0 || nextlevel) {
236 + while (fgets (buf, sizeof(buf), f) != NULL) {
237 + if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
238 + nextlevel = 1;
239 + break;
240 + } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
242 + } else if (k == 1) {
243 + /* default first level: hyphen and ASCII apostrophe */
244 + if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN '\n", dict[k], hashtab);
245 + else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99\n", dict[k], hashtab);
246 + strcpy(buf, "1-1/=,1,1\n"); // buf rewritten by hnj_hyphen_load here
247 + hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
248 + hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
249 + if (dict[0]->utf8) {
250 + hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
251 + hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
255 /* Could do unioning of matches here (instead of the preprocessor script).
256 If we did, the pseudocode would look something like this:
257 @@ -476,7 +493,20 @@
258 state_num = 0;
260 fclose(f);
261 - if (k == 2) dict[0]->nextlevel = dict[1];
262 + if (nextlevel) dict[0]->nextlevel = dict[1];
263 + else {
264 + dict[1] -> nextlevel = dict[0];
265 + dict[1]->lhmin = dict[0]->lhmin;
266 + dict[1]->rhmin = dict[0]->rhmin;
267 + dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
268 + dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
269 +#ifdef VERBOSE
270 + HashTab *r = global[0];
271 + global[0] = global[1];
272 + global[1] = r;
273 +#endif
274 + return dict[1];
276 return dict[0];
279 @@ -527,8 +557,13 @@
280 j = 0;
281 prep_word[j++] = '.';
283 - for (i = 0; i < word_size; i++)
284 + for (i = 0; i < word_size; i++) {
285 + if (word[i] <= '9' && word[i] >= '0') {
286 + prep_word[j++] = '.';
287 + } else {
288 prep_word[j++] = word[i];
292 prep_word[j++] = '.';
293 prep_word[j] = '\0';
294 @@ -557,7 +592,7 @@
296 #ifdef VERBOSE
297 char *state_str;
298 - state_str = get_state_str (state);
299 + state_str = get_state_str (state, 0);
301 for (k = 0; k < i - strlen (state_str); k++)
302 putchar (' ');
303 @@ -670,6 +705,9 @@
304 i += hnj_ligature(word[2]);
307 + // ignore numbers
308 + for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
310 for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
311 // check length of the non-standard part
312 if (*rep && *pos && *cut && (*rep)[j]) {
313 @@ -696,9 +734,13 @@
314 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
315 char *** rep, int ** pos, int ** cut, int rhmin)
317 - int i;
318 - int j = word_size - 2;
319 - for (i = 1; i < rhmin && j > 0; j--) {
320 + int i = 1;
321 + int j;
323 + // ignore numbers
324 + for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
326 + for (j = word_size - 2; i < rhmin && j > 0; j--) {
327 // check length of the non-standard part
328 if (*rep && *pos && *cut && (*rep)[j]) {
329 char * rh = strchr((*rep)[j], '=');
330 @@ -756,8 +798,15 @@
331 j = 0;
332 prep_word[j++] = '.';
334 - for (i = 0; i < word_size; i++)
335 + for (i = 0; i < word_size; i++) {
336 + if (word[i] <= '9' && word[i] >= '0') {
337 + prep_word[j++] = '.';
338 + } else {
339 prep_word[j++] = word[i];
345 prep_word[j++] = '.';
346 prep_word[j] = '\0';
347 @@ -786,7 +835,7 @@
349 #ifdef VERBOSE
350 char *state_str;
351 - state_str = get_state_str (state);
352 + state_str = get_state_str (state, 1);
354 for (k = 0; k < i - strlen (state_str); k++)
355 putchar (' ');
356 @@ -1033,6 +1082,9 @@
359 hyphens[j + 1] = '\0';
360 +#ifdef VERBOSE
361 + printf ("nums: %s\n", hyphens);
362 +#endif
363 return 0;
366 @@ -1074,8 +1126,8 @@
367 for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
368 char * nhy = (char *) strstr(word, nh);
369 while (nhy) {
370 - hyphens[nhy - word + strlen(nh) - 1] = 0;
371 - if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0;
372 + hyphens[nhy - word + strlen(nh) - 1] = '0';
373 + if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0';
374 nhy = (char *) strstr(nhy + 1, nh);
376 nh = nh + strlen(nh) + 1;
377 @@ -1084,6 +1136,9 @@
379 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
380 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
381 +#ifdef VERBOSE
382 + printf ("nums: %s\n", hyphens);
383 +#endif
384 return 0;
387 @@ -1093,8 +1148,10 @@
388 char *hyphword, char *** rep, int ** pos, int ** cut,
389 int lhmin, int rhmin, int clhmin, int crhmin)
391 - lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
392 - rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
393 + lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
394 + rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
395 + clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
396 + crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
397 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
398 clhmin, crhmin, 1, 1);
399 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,