hyphen/hyphen-2.7.1-2.8.3.patch

   1 --- misc/build/hyphen-2.7.1/hyphen.c.old        2011-10-10 15:58:33.317260138 +0200
   2 +++ misc/build/hyphen-2.7.1/hyphen.c    2011-10-10 15:58:55.221260136 +0200
   3 @@ -226,115 +226,61 @@
   4  }
   5
   6  #ifdef VERBOSE
   7 -HashTab *global;
   8 +HashTab *global[2]; /* VERBOSE only: one hash table per dictionary level; hnj_hyphen_load() fills global[0] and global[1] */
   9
  10  static char *
  11 -get_state_str (int state)
  12 +get_state_str (int state, int level)
  13  {
  14    int i;
  15    HashEntry *e;
  16
  17    for (i = 0; i < HASH_SIZE; i++)
  18 -    for (e = global->entries[i]; e; e = e->next)
  19 +    for (e = global[level]->entries[i]; e; e = e->next)
  20        if (e->val == state)
  21         return e->key;
  22    return NULL;
  23  }
  24  #endif
  25
  26 -HyphenDict *
  27 -hnj_hyphen_load (const char *fn)
  28 -{
  29 -  HyphenDict *dict[2];
  30 -  HashTab *hashtab;
  31 -  FILE *f;
  32 -  char buf[MAX_CHARS];
  33 +void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
  34 +  int i, j;
  35    char word[MAX_CHARS];
  36    char pattern[MAX_CHARS];
  37    char * repl;
  38    signed char replindex;
  39    signed char replcut;
  40 -  int state_num = 0, last_state;
  41 -  int i, j, k;
  42 +  int state_num = 0;
  43 +  int last_state;
  44    char ch;
  45    int found;
  46 -  HashEntry *e;
  47 -  int nextlevel = 0;
  48 -
  49 -  f = fopen (fn, "r");
  50 -  if (f == NULL)
  51 -    return NULL;
  52 -
  53 -// loading one or two dictionaries (separated by NEXTLEVEL keyword)
  54 -for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
  55 -  hashtab = hnj_hash_new ();
  56 -#ifdef VERBOSE
  57 -  global = hashtab;
  58 -#endif
  59 -  hnj_hash_insert (hashtab, "", 0);
  60 -  dict[k] = hnj_malloc (sizeof(HyphenDict));
  61 -  dict[k]->num_states = 1;
  62 -  dict[k]->states = hnj_malloc (sizeof(HyphenState));
  63 -  dict[k]->states[0].match = NULL;
  64 -  dict[k]->states[0].repl = NULL;
  65 -  dict[k]->states[0].fallback_state = -1;
  66 -  dict[k]->states[0].num_trans = 0;
  67 -  dict[k]->states[0].trans = NULL;
  68 -  dict[k]->nextlevel = NULL;
  69 -  dict[k]->lhmin = 0;
  70 -  dict[k]->rhmin = 0;
  71 -  dict[k]->clhmin = 0;
  72 -  dict[k]->crhmin = 0;
  73 -  dict[k]->nohyphen = NULL;
  74 -  dict[k]->nohyphenl = 0;
  75 -
  76 -  /* read in character set info */
  77 -  if (k == 0) {
  78 -    for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
  79 -    fgets(dict[k]->cset,  sizeof(dict[k]->cset),f);
  80 -    for (i=0;i<MAX_NAME;i++)
  81 -      if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
  82 -        dict[k]->cset[i] = 0;
  83 -    dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
  84 -  } else {
  85 -    strcpy(dict[k]->cset, dict[0]->cset);
  86 -    dict[k]->utf8 = dict[0]->utf8;
  87 -  }
  88
  89 -  while (fgets (buf, sizeof(buf), f) != NULL)
  90 -    {
  91 -      if (buf[0] != '%')
  92 -       {
  93 -         if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
  94 -           nextlevel = 1;
  95 -           break;
  96 -         } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
  97 -           dict[k]->lhmin = atoi(buf + 13);
  98 -           continue;
  99 +         if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
 100 +           dict->lhmin = atoi(buf + 13);
 101 +           return;
 102           } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
 103 -           dict[k]->rhmin = atoi(buf + 14);
 104 -           continue;
 105 +           dict->rhmin = atoi(buf + 14);
 106 +           return;
 107           } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
 108 -           dict[k]->clhmin = atoi(buf + 21);
 109 -           continue;
 110 +           dict->clhmin = atoi(buf + 21);
 111 +           return;
 112           } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
 113 -           dict[k]->crhmin = atoi(buf + 22);
 114 -           continue;
 115 +           dict->crhmin = atoi(buf + 22);
 116 +           return;
 117           } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
 118             char * space = buf + 8;
 119             while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
 120 -           if (*buf != '\0') dict[k]->nohyphen = hnj_strdup(space);
 121 -           if (dict[k]->nohyphen) {
 122 -               char * nhe = dict[k]->nohyphen + strlen(dict[k]->nohyphen) - 1;
 123 +           if (*space != '\0') dict->nohyphen = hnj_strdup(space); /* test the value, not buf: buf always starts with "NOHYPHEN" here */
 124 +           if (dict->nohyphen) {
 125 +               char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
 126                 *nhe = 0;
 127 -               for (nhe = nhe - 1; nhe > dict[k]->nohyphen; nhe--) {
 128 +               for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
 129                         if (*nhe == ',') {
 130 -                           dict[k]->nohyphenl++;
 131 +                           dict->nohyphenl++;
 132                             *nhe = 0;
 133                         }
 134                 }
 135             }
 136 -           continue;
 137 +           return;
 138           }
 139           j = 0;
 140           pattern[j] = '0';
 141 @@ -379,7 +325,7 @@
 142            } else {
 143              if (*word == '.') i++;
 144              /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
 145 -            if (dict[k]->utf8) {
 146 +            if (dict->utf8) {
 147                  int pu = -1;        /* unicode character position */
 148                  int ps = -1;        /* unicode start position (original replindex) */
 149                  int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
 150 @@ -403,14 +349,14 @@
 151           printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
 152  #endif
 153           found = hnj_hash_lookup (hashtab, word);
 154 -         state_num = hnj_get_state (dict[k], hashtab, word);
 155 -         dict[k]->states[state_num].match = hnj_strdup (pattern + i);
 156 -         dict[k]->states[state_num].repl = repl;
 157 -         dict[k]->states[state_num].replindex = replindex;
 158 +         state_num = hnj_get_state (dict, hashtab, word);
 159 +         dict->states[state_num].match = hnj_strdup (pattern + i);
 160 +         dict->states[state_num].repl = repl;
 161 +         dict->states[state_num].replindex = replindex;
 162            if (!replcut) {
 163 -            dict[k]->states[state_num].replcut = (signed char) strlen(word);
 164 +            dict->states[state_num].replcut = (signed char) strlen(word);
 165            } else {
 166 -            dict[k]->states[state_num].replcut = replcut;
 167 +            dict->states[state_num].replcut = replcut;
 168            }
 169
 170           /* now, put in the prefix transitions */
 171 @@ -420,11 +366,82 @@
 172               ch = word[j - 1];
 173               word[j - 1] = '\0';
 174               found = hnj_hash_lookup (hashtab, word);
 175 -             state_num = hnj_get_state (dict[k], hashtab, word);
 176 -             hnj_add_trans (dict[k], state_num, last_state, ch);
 177 +             state_num = hnj_get_state (dict, hashtab, word);
 178 +             hnj_add_trans (dict, state_num, last_state, ch);
 179             }
 180 -       }
 181 +}
 182 +
 183 +HyphenDict *
 184 +hnj_hyphen_load (const char *fn)
 185 +{
 186 +  HyphenDict *dict[2];
 187 +  HashTab *hashtab;
 188 +  FILE *f;
 189 +  char buf[MAX_CHARS];
 190 +  int nextlevel = 0;
 191 +  int i, j, k;
 192 +  HashEntry *e;
 193 +  int state_num = 0;
 194 +
 195 +  f = fopen (fn, "r");
 196 +  if (f == NULL)
 197 +    return NULL;
 198 +
 199 +// loading one or two dictionaries (separated by NEXTLEVEL keyword)
 200 +for (k = 0; k < 2; k++) {
 201 +  hashtab = hnj_hash_new ();
 202 +#ifdef VERBOSE
 203 +  global[k] = hashtab;
 204 +#endif
 205 +  hnj_hash_insert (hashtab, "", 0);
 206 +  dict[k] = hnj_malloc (sizeof(HyphenDict));
 207 +  dict[k]->num_states = 1;
 208 +  dict[k]->states = hnj_malloc (sizeof(HyphenState));
 209 +  dict[k]->states[0].match = NULL;
 210 +  dict[k]->states[0].repl = NULL;
 211 +  dict[k]->states[0].fallback_state = -1;
 212 +  dict[k]->states[0].num_trans = 0;
 213 +  dict[k]->states[0].trans = NULL;
 214 +  dict[k]->nextlevel = NULL;
 215 +  dict[k]->lhmin = 0;
 216 +  dict[k]->rhmin = 0;
 217 +  dict[k]->clhmin = 0;
 218 +  dict[k]->crhmin = 0;
 219 +  dict[k]->nohyphen = NULL;
 220 +  dict[k]->nohyphenl = 0;
 221 +
 222 +  /* read in character set info */
 223 +  if (k == 0) {
 224 +    for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
 225 +    fgets(dict[k]->cset,  sizeof(dict[k]->cset),f);
 226 +    for (i=0;i<MAX_NAME;i++)
 227 +      if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
 228 +        dict[k]->cset[i] = 0;
 229 +    dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
 230 +  } else {
 231 +    strcpy(dict[k]->cset, dict[0]->cset);
 232 +    dict[k]->utf8 = dict[0]->utf8;
 233 +  }
 234 +
 235 +  if (k == 0 || nextlevel) {
 236 +    while (fgets (buf, sizeof(buf), f) != NULL) {
 237 +      if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
 238 +       nextlevel = 1;
 239 +       break;
 240 +      } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
 241 +    }
 242 +  } else if (k == 1) {
 243 +    /* default first level: hyphen and ASCII apostrophe */
 244 +    if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN '\n", dict[k], hashtab);
 245 +    else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99\n", dict[k], hashtab);
 246 +    strcpy(buf, "1-1/=,1,1\n"); // buf rewritten by hnj_hyphen_load here
 247 +    hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
 248 +    hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
 249 +    if (dict[0]->utf8) {
 250 +      hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
 251 +      hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
 252      }
 253 +  }
 254
 255    /* Could do unioning of matches here (instead of the preprocessor script).
 256       If we did, the pseudocode would look something like this:
 257 @@ -476,7 +493,20 @@
 258    state_num = 0;
 259  }
 260    fclose(f);
 261 -  if (k == 2) dict[0]->nextlevel = dict[1];
 262 +  if (nextlevel) dict[0]->nextlevel = dict[1];
 263 +  else {
 264 +    dict[1] -> nextlevel = dict[0];
 265 +    dict[1]->lhmin = dict[0]->lhmin;
 266 +    dict[1]->rhmin = dict[0]->rhmin;
 267 +    dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
 268 +    dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
 269 +#ifdef VERBOSE
 270 +    HashTab *r = global[0];
 271 +    global[0] = global[1];
 272 +    global[1] = r;
 273 +#endif
 274 +    return dict[1];
 275 +  }
 276    return dict[0];
 277  }
 278
 279 @@ -527,8 +557,13 @@
 280    j = 0;
 281    prep_word[j++] = '.';
 282
 283 -  for (i = 0; i < word_size; i++)
 284 +  for (i = 0; i < word_size; i++) {
 285 +    if (word[i] <= '9' && word[i] >= '0') {
 286 +      prep_word[j++] = '.';
 287 +    } else {
 288        prep_word[j++] = word[i];
 289 +    }
 290 +  }
 291
 292    prep_word[j++] = '.';
 293    prep_word[j] = '\0';
 294 @@ -557,7 +592,7 @@
 295
 296  #ifdef VERBOSE
 297           char *state_str;
 298 -         state_str = get_state_str (state);
 299 +         state_str = get_state_str (state, 0);
 300
 301           for (k = 0; k < i - strlen (state_str); k++)
 302             putchar (' ');
 303 @@ -670,6 +705,9 @@
 304        i += hnj_ligature(word[2]);
 305      }
 306
 307 +    // ignore numbers
 308 +    for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
 309 +
 310      for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
 311        // check length of the non-standard part
 312        if (*rep && *pos && *cut && (*rep)[j]) {
 313 @@ -696,9 +734,13 @@
 314  int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
 315         char *** rep, int ** pos, int ** cut, int rhmin)
 316  {
 317 -    int i;
 318 -    int j = word_size - 2;
 319 -    for (i = 1; i < rhmin && j > 0; j--) {
 320 +    int i = 1;
 321 +    int j;
 322 +
 323 +    // ignore numbers
 324 +    for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
 325 +
 326 +    for (j = word_size - 2; i < rhmin && j > 0; j--) {
 327        // check length of the non-standard part
 328        if (*rep && *pos && *cut && (*rep)[j]) {
 329          char * rh = strchr((*rep)[j], '=');
 330 @@ -756,8 +798,15 @@
 331    j = 0;
 332    prep_word[j++] = '.';
 333
 334 -  for (i = 0; i < word_size; i++)
 335 +  for (i = 0; i < word_size; i++) {
 336 +    if (word[i] <= '9' && word[i] >= '0') {
 337 +      prep_word[j++] = '.';
 338 +    } else {
 339        prep_word[j++] = word[i];
 340 +    }
 341 +  }
 342 +
 343 +
 344
 345    prep_word[j++] = '.';
 346    prep_word[j] = '\0';
 347 @@ -786,7 +835,7 @@
 348
 349  #ifdef VERBOSE
 350           char *state_str;
 351 -         state_str = get_state_str (state);
 352 +         state_str = get_state_str (state, 1);
 353
 354           for (k = 0; k < i - strlen (state_str); k++)
 355             putchar (' ');
 356 @@ -1033,6 +1082,9 @@
 357      }
 358    }
 359    hyphens[j + 1] = '\0';
 360 +#ifdef VERBOSE
 361 +  printf ("nums: %s\n", hyphens);
 362 +#endif
 363    return 0;
 364  }
 365
 366 @@ -1074,8 +1126,8 @@
 367      for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
 368          char * nhy = (char *) strstr(word, nh);
 369          while (nhy) {
 370 -            hyphens[nhy - word + strlen(nh) - 1] = 0;
 371 -            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
 372 +            hyphens[nhy - word + strlen(nh) - 1] = '0';
 373 +            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
 374              nhy = (char *) strstr(nhy + 1, nh);
 375          }
 376          nh = nh + strlen(nh) + 1;
 377 @@ -1084,6 +1136,9 @@
 378
 379    if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
 380    if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
 381 +#ifdef VERBOSE
 382 +  printf ("nums: %s\n", hyphens);
 383 +#endif
 384    return 0;
 385  }
 386
 387 @@ -1093,8 +1148,10 @@
 388         char *hyphword, char *** rep, int ** pos, int ** cut,
 389         int lhmin, int rhmin, int clhmin, int crhmin)
 390  {
 391 -  lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
 392 -  rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
 393 +  lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
 394 +  rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
 395 +  clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
 396 +  crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
 397    hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
 398      clhmin, crhmin, 1, 1);
 399    hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,