source/texk/web2c/luatexdir/lang/hyphen.w

   1 % hyphen.w
   2 %
   3 % Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
   4 % licenses follows.
   5 %
   6 %
   7 % LibHnj - a library for high quality hyphenation and justification
   8 % Copyright (C) 1998 Raph Levien,
   9 %            (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
  10 %           (C) 2001 Peter Novodvorsky (nidd@@cs.msu.su)
  11 %
  12 % This library is free software; you can redistribute it and/or
  13 % modify it under the terms of the GNU Library General Public
  14 % License as published by the Free Software Foundation; either
  15 % version 2 of the License, or (at your option) any later version.
  16 %
  17 % This library is distributed in the hope that it will be useful,
  18 % but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 % Library General Public License for more details.
  21 %
  22 % You should have received a copy of the GNU Library General Public
  23 % License along with this library; if not, write to the
  24 % Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  25 % Boston, MA  02111-1307  USA.
  26 %
  27 %
  28 %
  29 % The contents of this file are subject to the Mozilla Public License
  30 % Version 1.0 (the "MPL"); you may not use this file except in
  31 % compliance with the MPL.  You may obtain a copy of the MPL at
  32 % http://www.mozilla.org/MPL/
  33 %
  34 % Software distributed under the MPL is distributed on an "AS IS" basis,
  35 % WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
  36 % for the specific language governing rights and limitations under the
  37 % MPL.
  38
  39
  40 @ @c
  41
  42
  43 #include "ptexlib.h"
  44 #include "lua/luatex-api.h"
  45
  46 #include <stdlib.h>             /* for NULL, malloc */
  47 #include <stdio.h>              /* for fprintf */
  48 #include <string.h>             /* for strdup */
  49 #include <stdlib.h>             /* for malloc used by substring inclusion */
  50
  51 #define MAXPATHS 40960
  52
  53 #ifdef UNX
  54 #  include <unistd.h>           /* for exit */
  55 #endif
  56
  57 #include <kpathsea/c-ctype.h>
  58
  59 #define noVERBOSE
  60
  61 #include "lang/hnjalloc.h"
  62
  63 @ TODO: should be moved to separate library
  64
  65 @c
  66 static unsigned char *hnj_strdup(const unsigned char *s)
  67 {
  68     unsigned char *new;
  69     size_t l;
  70
  71     l = strlen((const char *) s);
  72     new = hnj_malloc((int) l + 1);
  73     memcpy(new, s, l);
  74     new[l] = 0;
  75     return new;
  76 }
  77
  78 @* Type definitions.
  79
  80 @ a little bit of a hash table implementation. This simply maps strings
  81    to state numbers
  82
  83 @c
  84 typedef struct _HashTab HashTab;
  85 typedef struct _HashEntry HashEntry;
  86 typedef struct _HashIter HashIter;
  87 typedef union _HashVal HashVal;
  88
  89 /* A cheap, but effective, hack. */
  90 #define HASH_SIZE 31627
  91
  92 struct _HashTab {
  93     HashEntry *entries[HASH_SIZE];
  94 };
  95
  96 union _HashVal {
  97     int state;
  98     char *hyppat;
  99 };
 100
 101 struct _HashEntry {
 102     HashEntry *next;
 103     unsigned char *key;
 104     HashVal u;
 105 };
 106
 107 struct _HashIter {
 108     HashEntry **e;
 109     HashEntry *cur;
 110     int ndx;
 111 };
 112
 113 @ State machine
 114
 115 @c
 116 typedef struct _HyphenState HyphenState;
 117 typedef struct _HyphenTrans HyphenTrans;
 118 #define MAX_CHARS 256
 119 #define MAX_NAME 20
 120
 121 struct _HyphenDict {
 122     int num_states;
 123     int pat_length;
 124     char cset[MAX_NAME];
 125     HyphenState *states;
 126     HashTab *patterns;
 127     HashTab *merged;
 128     HashTab *state_num;
 129 };
 130
 131 struct _HyphenState {
 132     char *match;
 133     /*char *repl; */
 134     /*signed char replindex; */
 135     /*signed char replcut; */
 136     int fallback_state;
 137     int num_trans;
 138     HyphenTrans *trans;
 139 };
 140
 141 struct _HyphenTrans {
 142     int uni_ch;
 143     int new_state;
 144 };
 145
 146
 147 @ Combine two right-aligned number patterns, 04000 + 020 becomes 04020
 148
 149 @c
 150 static char *combine(char *expr, const char *subexpr)
 151 {
 152     size_t l1 = strlen(expr);
 153     size_t l2 = strlen(subexpr);
 154     size_t off = l1 - l2;
 155     unsigned j;
 156     /* this works also for utf8 sequences because the substring is identical
 157      to the last substring-length bytes of expr except for the (single byte)
 158      hyphenation encoders
 159      */
 160     for (j = 0; j < l2; j++) {
 161         if (expr[off + j] < subexpr[j])
 162             expr[off + j] = subexpr[j];
 163     }
 164     return expr;
 165 }
 166
 167
 168 @ ORIGINAL CODE
 169 @c
 170 static HashIter *new_HashIter(HashTab * h)
 171 {
 172     HashIter *i = hnj_malloc(sizeof(HashIter));
 173     i->e = h->entries;
 174     i->cur = NULL;
 175     i->ndx = -1;
 176     return i;
 177 }
 178
 179
 180 static int nextHashStealPattern(HashIter * i, unsigned char **word, char **pattern)
 181 {
 182     while (i->cur == NULL) {
 183         if (i->ndx >= HASH_SIZE - 1)
 184             return 0;
 185         i->cur = i->e[++i->ndx];
 186     }
 187     *word = i->cur->key;
 188     *pattern = i->cur->u.hyppat;
 189     i->cur->u.hyppat = NULL;
 190     i->cur = i->cur->next;
 191     return 1;
 192 }
 193
 194
 195 static int nextHash(HashIter * i, unsigned char **word)
 196 {
 197     while (i->cur == NULL) {
 198         if (i->ndx >= HASH_SIZE - 1)
 199             return 0;
 200         i->cur = i->e[++i->ndx];
 201     }
 202     *word = i->cur->key;
 203     i->cur = i->cur->next;
 204     return 1;
 205 }
 206
 207
 208 static int eachHash(HashIter * i, unsigned char **word, char **pattern)
 209 {
 210     while (i->cur == NULL) {
 211         if (i->ndx >= HASH_SIZE - 1)
 212             return 0;
 213         i->cur = i->e[++i->ndx];
 214     }
 215     *word = i->cur->key;
 216     *pattern = i->cur->u.hyppat;
 217     i->cur = i->cur->next;
 218     return 1;
 219 }
 220
 221
 222 static void delete_HashIter(HashIter * i)
 223 {
 224     hnj_free(i);
 225 }
 226
 227
 228 @ a |char*| hash function from ASU - adapted from Gtk+
 229
 230 @c
 231 static unsigned int hnj_string_hash(const unsigned char *s)
 232 {
 233     const unsigned char *p;
 234     unsigned int h = 0, g;
 235
 236     for (p = s; *p != '\0'; p += 1) {
 237         h = (h << 4) + *p;
 238         if ((g = (h & 0xf0000000))) {
 239             h = h ^ (g >> 24);
 240             h = h ^ g;
 241         }
 242     }
 243     return h /* \% M */ ;
 244 }
 245
 246
 247 @ assumes that key is not already present!
 248
 249 @c
 250 static void state_insert(HashTab * hashtab, unsigned char *key, int state)
 251 {
 252     int i;
 253     HashEntry *e;
 254
 255     i = (int) (hnj_string_hash(key) % HASH_SIZE);
 256     e = hnj_malloc(sizeof(HashEntry));
 257     e->next = hashtab->entries[i];
 258     e->key = key;
 259     e->u.state = state;
 260     hashtab->entries[i] = e;
 261 }
 262
 263
@ If |key| is already present its pattern is replaced and the duplicate |key| is freed; otherwise a new entry is added.
 265
 266 @c
 267 static void hyppat_insert(HashTab * hashtab, unsigned char *key, char *hyppat)
 268 {
 269     int i;
 270     HashEntry *e;
 271
 272     i = (int) (hnj_string_hash(key) % HASH_SIZE);
 273     for (e = hashtab->entries[i]; e; e = e->next) {
 274         if (strcmp((char *) e->key, (char *) key) == 0) {
 275             if (e->u.hyppat) {
 276                 if (hyppat
 277                     && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
 278                     print_err("Conflicting pattern ignored");
 279                     error();
 280                 }
 281                 hnj_free(e->u.hyppat);
 282             }
 283             e->u.hyppat = hyppat;
 284             hnj_free(key);
 285             return;
 286         }
 287     }
 288     e = hnj_malloc(sizeof(HashEntry));
 289     e->next = hashtab->entries[i];
 290     e->key = key;
 291     e->u.hyppat = hyppat;
 292     hashtab->entries[i] = e;
 293 }
 294
 295
 296 @ return state if found, otherwise $-1$
 297
 298 @c
 299 static int state_lookup(HashTab * hashtab, const unsigned char *key)
 300 {
 301     int i;
 302     HashEntry *e;
 303
 304     i = (int) (hnj_string_hash(key) % HASH_SIZE);
 305     for (e = hashtab->entries[i]; e; e = e->next) {
 306         if (!strcmp((const char *) key, (const char *) e->key)) {
 307             return e->u.state;
 308         }
 309     }
 310     return -1;
 311 }
 312
 313
@ return the pattern stored for the first |l| bytes of |chars|, or |NULL| if there is none
 315
 316 @c
 317 static char *hyppat_lookup(HashTab * hashtab, const unsigned char *chars, int l)
 318 {
 319     int i;
 320     HashEntry *e;
 321     unsigned char key[256];     /* should be ample */
 322     strncpy((char *) key, (const char *) chars, (size_t) l);
 323     key[l] = 0;
 324     i = (int) (hnj_string_hash(key) % HASH_SIZE);
 325     for (e = hashtab->entries[i]; e; e = e->next) {
 326         if (!strcmp((char *) key, (char *) e->key)) {
 327             return e->u.hyppat;
 328         }
 329     }
 330     return NULL;
 331 }
 332
 333
 334 @ Get the state number, allocating a new state if necessary.
 335
 336 @c
 337 static int hnj_get_state(HyphenDict * dict,
 338                          const unsigned char *str, int *state_num)
 339 {
 340     *state_num = state_lookup(dict->state_num, str);
 341
 342     if (*state_num >= 0)
 343         return *state_num;
 344
 345     state_insert(dict->state_num, hnj_strdup(str), dict->num_states);
 346     /* predicate is true if |dict->num_states| is a power of two */
 347     if (!(dict->num_states & (dict->num_states - 1))) {
 348         dict->states = hnj_realloc(dict->states,
 349                                    (int) ((dict->num_states << 1) *
 350                                           (int) sizeof(HyphenState)));
 351     }
 352     dict->states[dict->num_states].match = NULL;
 353     dict->states[dict->num_states].fallback_state = -1;
 354     dict->states[dict->num_states].num_trans = 0;
 355     dict->states[dict->num_states].trans = NULL;
 356     return dict->num_states++;
 357 }
 358
 359
 360 @ Add a transition from state1 to state2 through ch - assumes that the
 361    transition does not already exist
 362
 363 @c
 364 static void hnj_add_trans(HyphenDict * dict, int state1, int state2, int uni_ch)
 365 {
 366     int num_trans;
 367     /* TH: this test was a bit too strict, it is quite normal for old
 368        patterns to have chars in the range 0-31 or 127-159 (inclusive).
 369        To ease the transition, let's only disallow NUL for now
 370        (this is probably a requirement of the code anyway).
 371      */
 372     if (uni_ch == 0) {
 373         char errmsg[256]; /* temp hack ... we will have a formatted error */
 374         snprintf(errmsg, 255, "character out of bounds: u%04x", uni_ch);
 375         errmsg[255] = '\0';
 376         normal_error("hyphenation",errmsg); /* todo */
 377     }
 378     num_trans = dict->states[state1].num_trans;
 379     if (num_trans == 0) {
 380         dict->states[state1].trans = hnj_malloc(sizeof(HyphenTrans));
 381     } else {
 382         /* TH: The old version did
 383            } else if (!(num_trans & (num_trans - 1))) {
 384              ... hnj_realloc(dict->states[state1].trans,
 385                                                  (int) ((num_trans << 1) *
 386                                                         sizeof(HyphenTrans)));
 387            but that is incredibly nasty when adding patters one-at-a-time.
 388            Controlled growth would be nicer than the current +1, but if
 389            noone complains, this is good enough ;)
 390          */
 391         dict->states[state1].trans = hnj_realloc(dict->states[state1].trans,
 392                                                  (int) ((num_trans + 1) *
 393                                                         sizeof(HyphenTrans)));
 394     }
 395     dict->states[state1].trans[num_trans].uni_ch = uni_ch;
 396     dict->states[state1].trans[num_trans].new_state = state2;
 397     dict->states[state1].num_trans++;
 398 }
 399
 400
 401 #ifdef VERBOSE
 402
 403 static unsigned char *get_state_str(int state)
 404 {
 405     int i;
 406     HashEntry *e;
 407
 408     for (i = 0; i < HASH_SIZE; i++)
 409         for (e = global->entries[i]; e; e = e->next)
 410             if (e->u.state == state)
 411                 return e->key;
 412     return NULL;
 413 }
 414 #endif
 415
 416
 417 @ I've changed the semantics a bit here: |hnj_hyphen_load| used to
 418    operate on a file, but now the argument is a string buffer.
 419
 420 @c
 421 static const unsigned char *next_pattern(size_t * length,
 422                                          const unsigned char **buf)
 423 {
 424     const unsigned char *here, *rover = *buf;
 425     while (*rover && isspace(*rover))
 426         rover++;
 427     here = rover;
 428     while (*rover) {
 429         if (isspace(*rover)) {
 430             *length = (size_t) (rover - here);
 431             *buf = rover;
 432             return here;
 433         }
 434         rover++;
 435     }
 436     *length = (size_t) (rover - here);
 437     *buf = rover;
 438     return *length ? here : NULL;       /* zero sensed */
 439 }
 440
 441 static void init_hash(HashTab ** h)
 442 {
 443     int i;
 444     if (*h)
 445         return;
 446     *h = hnj_malloc(sizeof(HashTab));
 447     for (i = 0; i < HASH_SIZE; i++)
 448         (*h)->entries[i] = NULL;
 449 }
 450
 451
 452 static void clear_state_hash(HashTab ** h)
 453 {
 454     int i;
 455     if (*h == NULL)
 456         return;
 457     for (i = 0; i < HASH_SIZE; i++) {
 458         HashEntry *e, *next;
 459         for (e = (*h)->entries[i]; e; e = next) {
 460             next = e->next;
 461             hnj_free(e->key);
 462             hnj_free(e);
 463         }
 464     }
 465     hnj_free(*h);
 466     *h = NULL;
 467 }
 468
 469
 470 static void clear_hyppat_hash(HashTab ** h)
 471 {
 472     int i;
 473     if (*h == NULL)
 474         return;
 475     for (i = 0; i < HASH_SIZE; i++) {
 476         HashEntry *e, *next;
 477         for (e = (*h)->entries[i]; e; e = next) {
 478             next = e->next;
 479             hnj_free(e->key);
 480             if (e->u.hyppat)
 481                 hnj_free(e->u.hyppat);
 482             hnj_free(e);
 483         }
 484     }
 485     hnj_free(*h);
 486     *h = NULL;
 487 }
 488
 489
 490 static void init_dict(HyphenDict * dict)
 491 {
 492     dict->num_states = 1;
 493     dict->pat_length = 0;
 494     dict->states = hnj_malloc(sizeof(HyphenState));
 495     dict->states[0].match = NULL;
 496     dict->states[0].fallback_state = -1;
 497     dict->states[0].num_trans = 0;
 498     dict->states[0].trans = NULL;
 499     dict->patterns = NULL;
 500     dict->merged = NULL;
 501     dict->state_num = NULL;
 502     init_hash(&dict->patterns);
 503 }
 504
 505
 506 static void clear_dict(HyphenDict * dict)
 507 {
 508     int state_num;
 509     for (state_num = 0; state_num < dict->num_states; state_num++) {
 510         HyphenState *hstate = &dict->states[state_num];
 511         if (hstate->match)
 512             hnj_free(hstate->match);
 513         if (hstate->trans)
 514             hnj_free(hstate->trans);
 515     }
 516     hnj_free(dict->states);
 517     clear_hyppat_hash(&dict->patterns);
 518     clear_hyppat_hash(&dict->merged);
 519     clear_state_hash(&dict->state_num);
 520 }
 521
 522
 523
 524 HyphenDict *hnj_hyphen_new(void)
 525 {
 526     HyphenDict *dict = hnj_malloc(sizeof(HyphenDict));
 527     init_dict(dict);
 528     return dict;
 529 }
 530
 531
 532 void hnj_hyphen_clear(HyphenDict * dict)
 533 {
 534     clear_dict(dict);
 535     init_dict(dict);
 536 }
 537
 538
 539 void hnj_hyphen_free(HyphenDict * dict)
 540 {
 541     clear_dict(dict);
 542     hnj_free(dict);
 543 }
 544
 545 unsigned char *hnj_serialize(HyphenDict * dict)
 546 {
 547     HashIter *v;
 548     unsigned char *word;
 549     char *pattern;
 550     unsigned char *buf = hnj_malloc(dict->pat_length);
 551     unsigned char *cur = buf;
 552     v = new_HashIter(dict->patterns);
 553     while (eachHash(v, &word, &pattern)) {
 554         int i = 0, e = 0;
 555         while (word[e + i]) {
 556             if (pattern[i] != '0')
 557                 *cur++ = (unsigned char) pattern[i];
 558             *cur++ = word[e + i++];
 559             while (is_utf8_follow(word[e + i]))
 560                 *cur++ = word[i + e++];
 561         }
 562         if (pattern[i] != '0')
 563             *cur++ = (unsigned char) pattern[i];
 564         *cur++ = ' ';
 565     }
 566     delete_HashIter(v);
 567     *cur = 0;
 568     return buf;
 569 }
 570
 571
 572 void hnj_free_serialize(unsigned char *c)
 573 {
 574     hnj_free(c);
 575 }
 576
 577
 578 @ hyphenation pattern:
 579
 580 signed bytes
 581
 582 0 indicates end (actually any negative number)
 583
 584 : prio(1+),startpos,length,len1,[replace],len2,[replace]
 585
 586 most basic example is:
 587
 588 p n 0 0 0
 589
 590 for a hyphenation point between characters
 591
 592
 593 @c
 594 void hnj_hyphen_load(HyphenDict * dict, const unsigned char *f)
 595 {
 596     int state_num, last_state;
 597     int ch;
 598     int found;
 599     HashEntry *e;
 600     HashIter *v;
 601     unsigned char *word;
 602     char *pattern;
 603     size_t l = 0;
 604
 605     const unsigned char *format;
 606     const unsigned char *begin = f;
 607     unsigned char *pat;
 608     char *org;
 609     while ((format = next_pattern(&l, &f)) != NULL) {
 610         int i, j, e1;
 611         if (l>=255) {
 612            help1("Individual patterns should not be longer than 254 bytes total.");
 613            print_err("Pattern of enormous length ignored");
 614            error();
 615            continue;
 616         }
 617 #if 0
 618            printf("%s\n",format);
 619            char* repl = strnchr(format, '/',l);
 620            int replindex = 0;
 621            int replcut = 0;
 622            if (repl) {
 623            int clen = l-(repl-format);
 624            l = repl-format;
 625            char * index = strnchr(repl + 1, ',',clen);
 626            if (index) {
 627            char * index2 = strnchr(index + 1, ',',clen-(index-repl));
 628            if (index2) {
 629            replindex = (signed char) atoi(index + 1) - 1;
 630            replcut = (signed char) atoi(index2 + 1);
 631            }
 632            } else {
 633            hnj_strchomp(repl + 1);
 634            replindex = 0;
 635            replcut = strlen(buf);
 636            }
 637            repl = hnj_strdup(repl + 1);
 638            }
 639 #endif
 640         for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
 641             if (format[i] >= '0' && format[i] <= '9')
 642                 j++;
 643             if (is_utf8_follow(format[i]))
 644                 e1++;
 645         }
 646         /* |l-e1|   => number of {\it characters} not {\it bytes} */
 647         /* |l-j|   => number of pattern bytes */
 648         /* |l-e1-j| => number of pattern characters */
 649         pat = (unsigned char *) malloc((1 + l - (size_t) j));
 650         org = (char *) malloc((size_t) (2 + l - (size_t) e1 - (size_t) j));
 651         /* remove hyphenation encoders (digits) from pat */
 652         org[0] = '0';
 653         for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
 654             unsigned char c = format[i];
 655             if (is_utf8_follow(c)) {
 656                 pat[j + e1++] = c;
 657             } else if (c < '0' || c > '9') {
 658                 pat[e1 + j++] = c;
 659                 org[j] = '0';
 660             } else {
 661                 org[j] = (char) c;
 662             }
 663         }
 664         pat[e1 + j] = 0;
 665         org[j + 1] = 0;
 666         hyppat_insert(dict->patterns, pat, org);
 667     }
 668     dict->pat_length += (int) ((f - begin) + 2);        /* 2 for spurious spaces */
 669     init_hash(&dict->merged);
 670     v = new_HashIter(dict->patterns);
 671     while (nextHash(v, &word)) {
 672         int wordsize = (int) strlen((char *) word);
 673         int j1, l1;
 674         for (l1 = 1; l1 <= wordsize; l1++) {
 675             if (is_utf8_follow(word[l1]))
 676                 continue;       /* Do not clip an utf8 sequence */
 677             for (j1 = 1; j1 <= l1; j1++) {
 678                 char *subpat_pat;
 679                 int i1 = l1 - j1;
 680                 if (is_utf8_follow(word[i1]))
 681                     continue;   /* Do not start halfway an utf8 sequence */
 682                 if ((subpat_pat =
 683                      hyppat_lookup(dict->patterns, word + i1, j1)) != NULL) {
 684                     char *newpat_pat;
 685                     if ((newpat_pat =
 686                          hyppat_lookup(dict->merged, word, l1)) == NULL) {
 687                         char *neworg;
 688                         unsigned char *newword =
 689                             (unsigned char *) malloc((size_t) (l1 + 1));
 690                         int e1 = 0;
 691                         strncpy((char *) newword, (char *) word, (size_t) l1);
 692                         newword[l1] = 0;
 693                         for (i1 = 0; i1 < l1; i1++)
 694                             if (is_utf8_follow(newword[i1]))
 695                                 e1++;
 696                         neworg = malloc((size_t) (l1 + 2 - e1));
 697                         sprintf(neworg, "%0*d", l1 + 1 - e1, 0);  /* fill with right amount of '0' */
 698                         hyppat_insert(dict->merged, newword, combine(neworg, subpat_pat));
 699                     } else {
 700                         combine(newpat_pat, subpat_pat);
 701                     }
 702                 }
 703             }
 704         }
 705     }
 706     delete_HashIter(v);
 707
 708     init_hash(&dict->state_num);
 709     state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0);
 710     v = new_HashIter(dict->merged);
 711     while (nextHashStealPattern(v, &word, &pattern)) {
 712         static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 };
 713         int j1 = (int) strlen((char *) word);
 714 #ifdef VERBOSE
 715         printf("word %s pattern %s, j = %d\n", word, pattern, j1);
 716 #endif
 717         state_num = hnj_get_state(dict, word, &found);
 718         dict->states[state_num].match = pattern;
 719
 720         /* now, put in the prefix transitions */
 721         while (found < 0) {
 722             j1--;
 723             last_state = state_num;
 724             ch = word[j1];
 725             if (ch >= 0x80) {
 726                 int m;
 727                 int i1 = 1;
 728                 while (is_utf8_follow(word[j1 - i1]))
 729                     i1++;
 730                 ch = word[j1 - i1] & mask[i1];
 731                 m = j1 - i1;
 732                 while (i1--) {
 733                     ch = (ch << 6) + (0x3F & word[j1 - i1]);
 734                 }
 735                 j1 = m;
 736             }
 737             word[j1] = '\0';
 738             state_num = hnj_get_state(dict, word, &found);
 739             hnj_add_trans(dict, state_num, last_state, ch);
 740         }
 741     }
 742     delete_HashIter(v);
 743     clear_hyppat_hash(&dict->merged);
 744
 745     /* put in the fallback states */
 746     {
 747     int i, j = 0;
 748     for (i = 0; i < HASH_SIZE; i++) {
 749         for (e = dict->state_num->entries[i]; e; e = e->next) {
 750             /* do not do state==0 otherwise things get confused */
 751             if (e->u.state) {
 752                 for (j = 1; 1; j++) {
 753                     state_num = state_lookup(dict->state_num, e->key + j);
 754                     if (state_num >= 0)
 755                         break;
 756                 }
 757                 dict->states[e->u.state].fallback_state = state_num;
 758             }
 759         }
 760     }
 761 #ifdef VERBOSE
 762     for (i = 0; i < HASH_SIZE; i++) {
 763         for (e = dict->state_num->entries[i]; e; e = e->next) {
 764             printf("%d string %s state %d, fallback=%d\n",
 765                 i, e->key, e->u.state, dict->states[e->u.state].fallback_state);
 766             for (j = 0; j < dict->states[e->u.state].num_trans; j++) {
 767                 printf(" u%4x->%d\n",
 768                    (int) dict->states[e->u.state].trans[j].uni_ch,
 769                    dict->states[e->u.state].trans[j].new_state);
 770             }
 771         }
 772     }
 773 #endif
 774     }
 775     clear_state_hash(&dict->state_num);
 776 }
 777
 778 @ @c
 779 void hnj_hyphen_hyphenate(HyphenDict * dict,
 780                           halfword first1,
 781                           halfword last1,
 782                           int length,
 783                           halfword left, halfword right, lang_variables * lan)
 784 {
 785     int char_num;
 786     halfword here;
 787     int state = 0;
 788     /* +2 for dots at each end, +1 for points /outside/ characters */
 789     int ext_word_len = length + 2;
 790     int hyphen_len = ext_word_len + 1;
 791     char *hyphens = hnj_malloc(hyphen_len + 1);
 792
 793     /* Add a '.' to beginning and end to facilitate matching */
 794     set_vlink(begin_point, first1);
 795     set_vlink(end_point, get_vlink(last1));
 796     set_vlink(last1, end_point);
 797
 798     for (char_num = 0; char_num < hyphen_len; char_num++) {
 799         hyphens[char_num] = '0';
 800     }
 801     hyphens[hyphen_len] = 0;
 802
 803     /* now, run the finite state machine */
 804     for (char_num = 0, here = begin_point; here != get_vlink(end_point);
 805          here = get_vlink(here)) {
 806
 807         int ch;
 808         if (here == begin_point || here == end_point)
 809             ch = '.';
 810         else
 811             ch = get_lc_code(get_character(here));
 812         while (state != -1) {
 813 #if 0
 814             printf("%*s%s%c",char_num-strlen(get_state_str(state)),"",get_state_str(state),(char)ch);
 815 #endif
 816             HyphenState *hstate = &dict->states[state];
 817             int k;
 818             for (k = 0; k < hstate->num_trans; k++) {
 819                 if (hstate->trans[k].uni_ch == ch) {
 820                     char *match;
 821                     state = hstate->trans[k].new_state;
 822 #if 0
 823                     printf(" state %d\n",state);
 824 #endif
 825                     match = dict->states[state].match;
 826                     if (match) {
 827                         /* +2 because:
 828                          1 string length is one bigger than offset
 829                          1 hyphenation starts before first character
 830                          */
 831                         int offset = (int) (char_num + 2 - (int) strlen(match));
 832 #if 0
 833                         printf("%*s%s\n", offset,"", match);
 834 #endif
 835                         int m;
 836                         for (m = 0; match[m]; m++) {
 837                             if (hyphens[offset + m] < match[m])
 838                                 hyphens[offset + m] = match[m];
 839                         }
 840                     }
 841                     goto try_next_letter;
 842                 }
 843             }
 844             state = hstate->fallback_state;
 845 #if 0
 846             printf(" back to %d\n", state);
 847 #endif
 848         }
 849         /* nothing worked, let's go to the next character */
 850         state = 0;
 851       try_next_letter:;
 852         char_num++;
 853     }
 854
 855     /* restore the correct pointers */
 856     set_vlink(last1, get_vlink(end_point));
 857
 858     /* pattern is \.{\^.\^w\^o\^r\^d\^.\^}   |word_len|=4, |ext_word_len|=6, |hyphens|=7
 859      * check      \.{    \^ \^ \^    }   so drop first two and stop after |word_len-1|
 860      */
 861     for (here = first1, char_num = 2; here != left; here = get_vlink(here))
 862         char_num++;
 863     for (; here != right; here = get_vlink(here)) {
 864         if (hyphens[char_num] & 1)
 865             here = insert_syllable_discretionary(here, lan);
 866         char_num++;
 867     }
 868     hnj_free(hyphens);
 869 }