source/texk/web2c/luatexdir/lang/texlang.w

   1 % texlang.w
   2 %
   3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
   4 %
   5 % This file is part of LuaTeX.
   6 %
   7 % LuaTeX is free software; you can redistribute it and/or modify it under
   8 % the terms of the GNU General Public License as published by the Free
   9 % Software Foundation; either version 2 of the License, or (at your
  10 % option) any later version.
  11 %
  12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
  13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 % FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 % License for more details.
  16 %
  17 % You should have received a copy of the GNU General Public License along
  18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
  19
  20 @ @c
  21
  22 #include "ptexlib.h"
  23 #include <string.h>
  24 #include "lua/luatex-api.h"
  25
  26 @ Low-level helpers
  27
  28 @ @c
/* the macro defined here is |unVERBOSE|, so |VERBOSE| itself stays undefined
   and the diagnostics guarded by |VERBOSE| further down are compiled out */
#define unVERBOSE

/* number of slots in the |tex_languages| table; language ids index into it */
#define MAX_TEX_LANGUAGES  16384

/* shorthand that expands to |int_par(ex_hyphen_char_code)| */
#define ex_hyphen_char int_par(ex_hyphen_char_code)

/* language records indexed by id; a |NULL| slot means the id is not in use */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* id at which |new_language| starts looking when it has to pick a free slot */
static int next_lang_id = 0;
  38
  39 struct tex_language *new_language(int n)
  40 {
  41     struct tex_language *lang;
  42     unsigned l;
  43     if (n >= 0) {
  44         l = (unsigned) n;
  45         if (l != (MAX_TEX_LANGUAGES - 1))
  46             if (next_lang_id <= n)
  47                 next_lang_id = n + 1;
  48     } else {
  49         while (tex_languages[next_lang_id] != NULL)
  50             next_lang_id++;
  51         l = (unsigned) next_lang_id++;
  52     }
  53     if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
  54         lang = xmalloc(sizeof(struct tex_language));
  55         tex_languages[l] = lang;
  56         lang->id = (int) l;
  57         lang->exceptions = 0;
  58         lang->patterns = NULL;
  59         lang->pre_hyphen_char = '-';
  60         lang->post_hyphen_char = 0;
  61         lang->pre_exhyphen_char = 0;
  62         lang->post_exhyphen_char = 0;
  63         lang->hyphenation_min = -1;
  64         if (int_par(saving_hyph_codes_code)) {
  65             hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
  66         }
  67         return lang;
  68     } else {
  69         return NULL;
  70     }
  71 }
  72
  73 struct tex_language *get_language(int n)
  74 {
  75     if (n >= 0 && n < MAX_TEX_LANGUAGES) {
  76         if (tex_languages[n] != NULL) {
  77             return tex_languages[n];
  78         } else {
  79             return new_language(n);
  80         }
  81     } else {
  82         return NULL;
  83     }
  84 }
  85
  86 @ @c
  87 void set_pre_hyphen_char(int n, int v)
  88 {
  89     struct tex_language *l = get_language((int) n);
  90     if (l != NULL)
  91         l->pre_hyphen_char = (int) v;
  92 }
  93
  94 void set_post_hyphen_char(int n, int v)
  95 {
  96     struct tex_language *l = get_language((int) n);
  97     if (l != NULL)
  98         l->post_hyphen_char = (int) v;
  99 }
 100
 101 void set_pre_exhyphen_char(int n, int v)
 102 {
 103     struct tex_language *l = get_language((int) n);
 104     if (l != NULL)
 105         l->pre_exhyphen_char = (int) v;
 106 }
 107
 108 void set_post_exhyphen_char(int n, int v)
 109 {
 110     struct tex_language *l = get_language((int) n);
 111     if (l != NULL)
 112         l->post_exhyphen_char = (int) v;
 113 }
 114
 115 int get_pre_hyphen_char(int n)
 116 {
 117     struct tex_language *l = get_language((int) n);
 118     if (l == NULL)
 119         return -1;
 120     return (int) l->pre_hyphen_char;
 121 }
 122
 123 int get_post_hyphen_char(int n)
 124 {
 125     struct tex_language *l = get_language((int) n);
 126     if (l == NULL)
 127         return -1;
 128     return (int) l->post_hyphen_char;
 129 }
 130
 131 int get_pre_exhyphen_char(int n)
 132 {
 133     struct tex_language *l = get_language((int) n);
 134     if (l == NULL)
 135         return -1;
 136     return (int) l->pre_exhyphen_char;
 137 }
 138
 139 int get_post_exhyphen_char(int n)
 140 {
 141     struct tex_language *l = get_language((int) n);
 142     if (l == NULL)
 143         return -1;
 144     return (int) l->post_exhyphen_char;
 145 }
 146
 147 void set_hyphenation_min(int n, int v)
 148 {
 149     struct tex_language *l = get_language((int) n);
 150     if (l != NULL)
 151         l->hyphenation_min = (int) v;
 152 }
 153
 154 int get_hyphenation_min(int n)
 155 {
 156     struct tex_language *l = get_language((int) n);
 157     if (l == NULL)
 158         return -1;
 159     return (int) l->hyphenation_min;
 160 }
 161
 162 void load_patterns(struct tex_language *lang, const unsigned char *buff)
 163 {
 164     if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
 165         return;
 166     if (lang->patterns == NULL) {
 167         lang->patterns = hnj_hyphen_new();
 168     }
 169     hnj_hyphen_load(lang->patterns, buff);
 170 }
 171
 172 void clear_patterns(struct tex_language *lang)
 173 {
 174     if (lang == NULL)
 175         return;
 176     if (lang->patterns != NULL) {
 177         hnj_hyphen_clear(lang->patterns);
 178     }
 179 }
 180
 181 void load_tex_patterns(int curlang, halfword head)
 182 {
 183     char *s = tokenlist_to_cstring(head, 1, NULL);
 184     load_patterns(get_language(curlang), (unsigned char *) s);
 185 }
 186
 187 @ @c
 188 #define STORE_CHAR(l,x) do { \
 189     unsigned xx = get_hj_code(l,x); \
 190     if (!xx) { \
 191         xx = x; \
 192     } \
 193     uindex = uni2string(uindex, xx); \
 194 } while (0)
 195
@ Cleans one word, which is returned in |cleaned|, and returns the new offset
into |buff|.
 198
 199 @c
 200 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
 201 {
 202     int items = 0;
 203     unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
 204     unsigned uword[MAX_WORD_LEN + 1] = { 0 };  /* work buffer for unicode */
 205     int u = 0; /* unicode buffer value */
 206     int i = 0; /* index into buffer */
 207     char *uindex = (char *)word;
 208     const char *s = buff;
 209
 210     while (*s && !isspace((unsigned char)*s)) {
 211         word[i++] = (unsigned)*s;
 212         s++;
 213         if ((s-buff)>MAX_WORD_LEN) {
 214             /* todo: this is too strict, should count unicode, not bytes */
 215             *cleaned = NULL;
 216             tex_error("exception too long", NULL);
 217             return s;
 218         }
 219     }
 220     /* now convert the input to unicode */
 221     word[i] = '\0';
 222     utf2uni_strcpy(uword, (const char *)word);
 223
 224     /* build the new word string */
 225     i = 0;
 226     while (uword[i]>0) {
 227         u = uword[i++];
 228         if (u == '-') {        /* skip */
 229         } else if (u == '=') {
 230             STORE_CHAR(id,'-');
 231         } else if (u == '{') {
 232             u = uword[i++];
 233             items = 0;
 234             while (u && u != '}') {
 235                 u = uword[i++];
 236             }
 237             if (u == '}') {
 238                 items++;
 239                 u = uword[i++];
 240             }
 241             while (u && u != '}') {
 242                 u = uword[i++];
 243             }
 244             if (u == '}') {
 245                 items++;
 246                 u = uword[i++];;
 247             }
 248             if (u == '{') {
 249                 u = uword[i++];;
 250             }
 251             while (u && u != '}') {
 252                 STORE_CHAR(id,u);
 253                 u = uword[i++];
 254             }
 255             if (u == '}') {
 256                 items++;
 257             }
 258             if (items != 3) {   /* syntax error */
 259                 *cleaned = NULL;
 260                 tex_error("exception syntax error", NULL);
 261                 return s;
 262             }
 263         } else {
 264             STORE_CHAR(id,u);
 265         }
 266     }
 267     *uindex = '\0';
 268     *cleaned = xstrdup((char *) word);
 269     return s;
 270 }
 271
 272 @ @c
 273 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
 274 {
 275     const char *s;
 276     const char *value;
 277     char *cleaned;
 278     int id ;
 279     lua_State *L = Luas;
 280     if (lang == NULL)
 281         return;
 282     if (lang->exceptions == 0) {
 283         lua_newtable(L);
 284         lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
 285     }
 286     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 287     s = (const char *) buff;
 288     id = lang->id;
 289     while (*s) {
 290         while (isspace((unsigned char)*s))
 291             s++;
 292         if (*s) {
 293             value = s;
 294             s = clean_hyphenation(id, s, &cleaned);
 295             if (cleaned != NULL) {
 296                 if ((s - value) > 0) {
 297                     lua_pushstring(L, cleaned);
 298                     lua_pushlstring(L, value, (size_t) (s - value));
 299                     lua_rawset(L, -3);
 300                 }
 301                 free(cleaned);
 302             } else {
 303 #ifdef VERBOSE
 304                 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
 305 #endif
 306             }
 307         }
 308     }
 309 }
 310
 311 void clear_hyphenation(struct tex_language *lang)
 312 {
 313     if (lang == NULL)
 314         return;
 315     if (lang->exceptions != 0) {
 316         luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 317         lang->exceptions = 0;
 318     }
 319 }
 320
 321
 322 void load_tex_hyphenation(int curlang, halfword head)
 323 {
 324     char *s = tokenlist_to_cstring(head, 1, NULL);
 325     load_hyphenation(get_language(curlang), (unsigned char *) s);
 326 }
 327
 328 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very
 329    nice, but needed. Also, in the post-break, it would be nicer to get the
 330    attribute list from |vlink(n)|. No rush, as it is currently not used much.
 331
 332 @c
 333 halfword insert_discretionary(halfword t, halfword pre, halfword post,
 334                               halfword replace, int penalty)
 335 {
 336     halfword g, n;
 337     int f;
 338     n = new_node(disc_node, syllable_disc);
 339     disc_penalty(n) = penalty;
 340     try_couple_nodes(n, vlink(t));
 341     couple_nodes(t, n);
 342     if (replace != null)
 343         f = font(replace);
 344     else
 345         f = get_cur_font();     /* for compound words following explicit hyphens */
 346     for (g = pre; g != null; g = vlink(g)) {
 347         font(g) = f;
 348         if (node_attr(t) != null) {
 349             delete_attribute_ref(node_attr(g));
 350             node_attr(g) = node_attr(t);
 351             attr_list_ref(node_attr(t)) += 1;
 352         }
 353     }
 354     for (g = post; g != null; g = vlink(g)) {
 355         font(g) = f;
 356         if (node_attr(t) != null) {
 357             delete_attribute_ref(node_attr(g));
 358             node_attr(g) = node_attr(t);
 359             attr_list_ref(node_attr(t)) += 1;
 360         }
 361     }
 362     for (g = replace; g != null; g = vlink(g)) {
 363         if (node_attr(t) != null) {
 364             delete_attribute_ref(node_attr(g));
 365             node_attr(g) = node_attr(t);
 366             attr_list_ref(node_attr(t)) += 1;
 367         }
 368     }
 369     if (node_attr(t) != null) {
 370         delete_attribute_ref(node_attr(vlink(t)));
 371         node_attr(vlink(t)) = node_attr(t);
 372         attr_list_ref(node_attr(t)) += 1;
 373     }
 374     t = vlink(t);
 375     set_disc_field(pre_break(t), pre);
 376     set_disc_field(post_break(t), post);
 377     set_disc_field(no_break(t), replace);
 378     return t;
 379 }
 380
 381 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
 382 {
 383     halfword g, n;
 384     n = new_node(disc_node, syllable_disc);
 385     disc_penalty(n) = int_par(hyphen_penalty_code);
 386     couple_nodes(n, vlink(t));
 387     couple_nodes(t, n);
 388     delete_attribute_ref(node_attr(n));
 389     if (node_attr(t) != null) {
 390         node_attr(n) = node_attr(t);
 391         attr_list_ref(node_attr(t))++;
 392     } else {
 393         node_attr(n) = null;
 394     }
 395     if (lan->pre_hyphen_char > 0) {
 396         g = raw_glyph_node();
 397         set_to_character(g);
 398         character(g) = lan->pre_hyphen_char;
 399         font(g) = font(t);
 400         lang_data(g) = lang_data(t);
 401         if (node_attr(t) != null) {
 402             node_attr(g) = node_attr(t);
 403             attr_list_ref(node_attr(t))++;
 404         }
 405         set_disc_field(pre_break(n), g);
 406     }
 407
 408     if (lan->post_hyphen_char > 0) {
 409         t = vlink(n);
 410         g = raw_glyph_node();
 411         set_to_character(g);
 412         character(g) = lan->post_hyphen_char;
 413         font(g) = font(t);
 414         lang_data(g) = lang_data(t);
 415         if (node_attr(t) != null) {
 416             node_attr(g) = node_attr(t);
 417             attr_list_ref(node_attr(t)) += 1;
 418         }
 419         set_disc_field(post_break(n), g);
 420     }
 421     return n;
 422 }
 423
 424 halfword insert_word_discretionary(halfword t, lang_variables * lan)
 425 {
 426     halfword pre = null, pos = null;
 427     if (lan->pre_exhyphen_char > 0)
 428         pre = insert_character(null, lan->pre_exhyphen_char);
 429     if (lan->post_exhyphen_char > 0)
 430         pos = insert_character(null, lan->post_exhyphen_char);
 431     return insert_discretionary(t, pre, pos, null,int_par(ex_hyphen_penalty_code));
 432 }
 433
 434 @ @c
 435 halfword compound_word_break(halfword t, int clang)
 436 {
 437     int disc;
 438     lang_variables langdata;
 439     langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
 440     langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
 441     disc = insert_word_discretionary(t, &langdata);
 442     return disc;
 443 }
 444
 445 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
 446                                       halfword pre, halfword pos,
 447                                       halfword replace)
 448 {
 449     (void) lan;
 450     return insert_discretionary(t, pre, pos, replace,int_par(hyphen_penalty_code));
 451 }
 452
 453 halfword insert_character(halfword t, int c)
 454 {
 455     halfword p;
 456     p = new_node(glyph_node, 0);
 457     set_to_character(p);
 458     character(p) = c;
 459     if (t != null) {
 460         couple_nodes(t, p);
 461     }
 462     return p;
 463 }
 464
 465 @ @c
 466 void set_disc_field(halfword f, halfword t)
 467 {
 468     if (t != null) {
 469         couple_nodes(f, t);
 470         tlink(f) = tail_of_list(t);
 471     } else {
 472         vlink(f) = null;
 473         tlink(f) = null;
 474     }
 475 }
 476
 477 @ @c
 478 static char *hyphenation_exception(int exceptions, char *w)
 479 {
 480     char *ret = NULL;
 481     lua_State *L = Luas;
 482     lua_checkstack(L, 2);
 483     lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
 484     if (lua_istable(L, -1)) {   /* ?? */
 485         lua_pushstring(L, w);   /* word table */
 486         lua_rawget(L, -2);
 487         if (lua_type(L, -1) == LUA_TSTRING) {
 488             ret = xstrdup(lua_tostring(L, -1));
 489         }
 490         lua_pop(L, 2);
 491     } else {
 492         lua_pop(L, 1);
 493     }
 494     return ret;
 495 }
 496
 497 @ @c
 498 char *exception_strings(struct tex_language *lang)
 499 {
 500     const char *value;
 501     size_t size = 0, current = 0;
 502     size_t l = 0;
 503     char *ret = NULL;
 504     lua_State *L = Luas;
 505     if (lang->exceptions == 0)
 506         return NULL;
 507     lua_checkstack(L, 2);
 508     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 509     if (lua_istable(L, -1)) {
 510         /* iterate and join */
 511         lua_pushnil(L);         /* first key */
 512         while (lua_next(L, -2) != 0) {
 513             value = lua_tolstring(L, -1, &l);
 514             if (current + 2 + l > size) {
 515                 ret =
 516                     xrealloc(ret,
 517                              (unsigned) ((size + size / 5) + current + l + 1024));
 518                 size = (size + size / 5) + current + l + 1024;
 519             }
 520             *(ret + current) = ' ';
 521             strcpy(ret + current + 1, value);
 522             current += l + 1;
 523             lua_pop(L, 1);
 524         }
 525     }
 526     return ret;
 527 }
 528
@ The sequence from |wordstart| to |r| can contain only normal characters.
It could be faster to modify a halfword pointer and return an integer.
 531
 532 @c
 533 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
 534 {
 535     halfword g = null, gg = null;
 536     register unsigned i = *j;
 537     i++;                        /* this puts uword[i] on the |{| */
 538     while (i < (unsigned) len && uword[i + 1] != '}') {
 539         if (g == null) {
 540             gg = new_char(0, (int) uword[i + 1]);
 541             g = gg;
 542         } else {
 543             halfword s = new_char(0, (int) uword[i + 1]);
 544             couple_nodes(g, s);
 545             g = vlink(g);
 546         }
 547         i++;
 548     }
 549     *j = ++i;
 550     return gg;
 551 }
 552
 553 static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
 554 {
 555     int ret = 0;
 556     register unsigned i = *j;
 557     i++;                        /* this puts uword[i] on the |{| */
 558     while (i < (unsigned) len && uword[i + 1] != '}') {
 559         ret++;
 560         i++;
 561     }
 562     *j = ++i;
 563     return ret;
 564 }
 565
 566 @ @c
/* |NULL|-terminated help text passed along with the ``broken pattern''
   errors raised in |do_exception| */
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};
 572
 573 /*
 574     The exceptions are taken as-is: no min values are taken into account. One can
 575     add normal patterns on-the-fly if needed.
 576 */
 577
 578 static void do_exception(halfword wordstart, halfword r, char *replacement)
 579 {
 580     unsigned i;
 581     halfword t;
 582     unsigned len;
 583     int clang;
 584     lang_variables langdata;
 585     unsigned uword[MAX_WORD_LEN + 1] = { 0 };
 586     utf2uni_strcpy(uword, replacement);
 587     len = u_length(uword);
 588     i = 0;
 589     t = wordstart;
 590     clang = char_lang(wordstart);
 591     langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 592     langdata.post_hyphen_char = get_post_hyphen_char(clang);
 593
 594     for (i = 0; i < len; i++) {
 595         if (uword[i + 1] == '-') {      /* a hyphen follows */
 596             while (vlink(t) != r
 597                    && (type(t) != glyph_node || !is_simple_character(t)))
 598                 t = vlink(t);
 599             if (vlink(t) == r)
 600                 break;
 601             insert_syllable_discretionary(t, &langdata);
 602             t = vlink(t);       /* skip the new disc */
 603         } else if (uword[i + 1] == '=') {
 604             /* do nothing ? */
 605             t = vlink(t);
 606         } else if (uword[i + 1] == '{') {
 607             halfword gg, hh, replace = null;
 608             int repl;
 609             gg = find_exception_part(&i, uword, (int) len);
 610             if (i == len || uword[i + 1] != '{') {
 611                 tex_error("broken pattern 1", PAT_ERROR);
 612             }
 613             hh = find_exception_part(&i, uword, (int) len);
 614             if (i == len || uword[i + 1] != '{') {
 615                 tex_error("broken pattern 2", PAT_ERROR);
 616             }
 617             repl = count_exception_part(&i, uword, (int) len);
 618             if (i == len) {
 619                 tex_error("broken pattern 3", PAT_ERROR);
 620             }
 621             /*i++;  *//* jump over the last right brace */
 622             if (vlink(t) == r)
 623                 break;
 624             if (repl > 0) {
 625                 halfword q = t;
 626                 replace = vlink(q);
 627                 while (repl > 0 && q != null) {
 628                     q = vlink(q);
 629                     if (type(q) == glyph_node) {
 630                         repl--;
 631                     }
 632                 }
 633                 try_couple_nodes(t, vlink(q));
 634                 vlink(q) = null;
 635             }
 636             t = insert_discretionary(t, gg, hh, replace,int_par(hyphen_penalty_code));
 637             t = vlink(t);       /* skip the new disc */
 638         } else {
 639             t = vlink(t);
 640         }
 641     }
 642 }
 643
 644 @ This is a documentation section from the pascal web file. It is not
 645 true any more, but I do not have time right now to rewrite it -- Taco
 646
 647 When the line-breaking routine is unable to find a feasible sequence of
 648 breakpoints, it makes a second pass over the paragraph, attempting to
 649 hyphenate the hyphenatable words. The goal of hyphenation is to insert
 650 discretionary material into the paragraph so that there are more
 651 potential places to break.
 652
 653 The general rules for hyphenation are somewhat complex and technical,
 654 because we want to be able to hyphenate words that are preceded or
 655 followed by punctuation marks, and because we want the rules to work
 656 for languages other than English. We also must contend with the fact
 657 that hyphens might radically alter the ligature and kerning structure
 658 of a word.
 659
 660 A sequence of characters will be considered for hyphenation only if it
 661 belongs to a ``potentially hyphenatable part'' of the current paragraph.
 662 This is a sequence of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node,
 663 $p_1\ldots p_{m-1}$ are either character or ligature or whatsit or
 664 implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust
 665 or mark or whatsit or explicit kern node.  (Therefore hyphenation is
 666 disabled by boxes, math formulas, and discretionary nodes already inserted
 667 by the user.) The ligature nodes among $p_1\ldots p_{m-1}$ are effectively
 668 expanded into the original non-ligature characters; the kern nodes and
 669 whatsits are ignored. Each character |c| is now classified as either a
 670 nonletter (if |lc_code(c)=0|), a lowercase letter (if
 671 |lc_code(c)=c|), or an uppercase letter (otherwise); an uppercase letter
 672 is treated as if it were |lc_code(c)| for purposes of hyphenation. The
 673 characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let
 674 $c_1$ be the first letter that is not in the middle of a ligature. Whatsit
 675 nodes preceding $c_1$ are ignored; a whatsit found after $c_1$ will be the
 676 terminating node $p_m$. All characters that do not have the same font as
 677 $c_1$ will be treated as nonletters. The |hyphen_char| for that font
 678 must be between 0 and 255, otherwise hyphenation will not be attempted.
 679 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as
 680 possible; however, |n| must be less than 64, so a character that would
 681 otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
 682 not be in the middle of a ligature.  In this way we obtain a string of
 683 letters $c_1\ldots c_n$ that are generated by nodes $p_a\ldots p_b$, where
 684 |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this string qualifies for hyphenation;
 685 however, |uc_hyph| must be positive, if $c_1$ is uppercase.
 686
 687 The hyphenation process takes place in three stages. First, the candidate
 688 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens
 689 are determined by referring to hyphenation tables; and finally, the nodes
 690 $p_a\ldots p_b$ are replaced by a new sequence of nodes that includes the
 691 discretionary breaks found.
 692
 693 Fortunately, we do not have to do all this calculation very often, because
 694 of the way it has been taken out of \TeX's inner loop. For example, when
 695 the second edition of the author's 700-page book {\sl Seminumerical
 696 Algorithms} was typeset by \TeX, only about 1.2 hyphenations needed to be
 697 @^Knuth, Donald Ervin@>
 698 tried per paragraph, since the line breaking algorithm needed to use two
 699 passes on only about 5 per cent of the paragraphs.
 700
When a word has been set up to contain a candidate for hyphenation,
 702 \TeX\ first looks to see if it is in the user's exception dictionary. If not,
 703 hyphens are inserted based on patterns that appear within the given word,
 704 using an algorithm due to Frank~M. Liang.
 705 @^Liang, Franklin Mark@>
 706
@ This is incompatible with \TeX\ because the first word of a paragraph
can be hyphenated, but most European users seem to agree that
prohibiting hyphenation there was not the best idea ever.
 710
 711 @c
 712 static halfword find_next_wordstart(halfword r)
 713 {
 714     register int l;
 715     register int start_ok = 1;
 716     int mathlevel = 1;
 717     int chr ;
 718     halfword t ;
 719     while (r != null) {
 720         switch (type(r)) {
 721         case boundary_node:
 722         case whatsit_node:
 723             break;
 724         case glue_node:
 725             start_ok = 1;
 726             break;
 727         case math_node:
 728             while (mathlevel > 0) {
 729                 r = vlink(r);
 730                 if (r == null)
 731                     return r;
 732                 if (type(r) == math_node) {
 733                     if (subtype(r) == before) {
 734                         mathlevel++;
 735                     } else {
 736                         mathlevel--;
 737                     }
 738                 }
 739             }
 740             break;
 741         case glyph_node:
 742             if (is_simple_character(r)) {
 743                 chr = character(r) ;
 744                 if (chr == ex_hyphen_char) {
 745                     /*
 746                         We only accept an explicit hyphen when there is a preceding glyph and
 747                         we skip a sequence of explicit hyphens as that normally indicates a
 748                         -- or --- ligature in which case we can in a worse case usage get bad
 749                         node lists later on due to messed up ligature building as these dashes
 750                         are ligatures in base fonts. This is a side effect of the separating the
 751                         hyphenation, ligaturing and kerning steps. A test is cmr with ------.
 752                     */
 753                     t = vlink(r) ;
 754                     if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
 755                         t = compound_word_break(r, char_lang(r));
 756                         subtype(t) = automatic_disc;
 757                         start_ok = 1 ;
 758                     } else {
 759                         start_ok = 0;
 760                     }
 761                 } else if (start_ok && (char_lang(r)>0) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
 762                     if (char_uchyph(r) || l == chr) {
 763                         return r;
 764                     } else {
 765                         start_ok = 0;
 766                     }
 767                 }
 768             }
 769             break;
 770         default:
 771             start_ok = 0;
 772             break;
 773         }
 774         r = vlink(r);
 775     }
 776     return r;
 777 }
 778
 779 @ @c
 780 static int valid_wordend(halfword s)
 781 {
 782     register halfword r = s;
 783     register int clang = char_lang(s);
 784     if (r == null)
 785         return 1;
 786     while ((r != null) && (   (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
 787                            || (type(r) == kern_node && (subtype(r) == normal))
 788            )) {
 789         r = vlink(r);
 790     }
 791     if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
 792                   ||  type(r) == glue_node
 793                   ||  type(r) == boundary_node
 794                   ||  type(r) == whatsit_node
 795                   ||  type(r) == ins_node
 796                   ||  type(r) == adjust_node
 797                   ||  type(r) == penalty_node
 798                   || (type(r) == kern_node && (subtype(r) == explicit_kern ||
 799                                                subtype(r) == italic_kern   ||
 800                                                subtype(r) == accent_kern   )))
 801         return 1;
 802     return 0;
 803 }
 804
 805 @ @c
 806 void hnj_hyphenation(halfword head, halfword tail)
 807 {
 808     int lchar, i;
 809     struct tex_language *lang;
 810     lang_variables langdata;
 811     char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
 812     int wordlen = 0;
 813     char *hy = utf8word;
 814     char *replacement = NULL;
 815     boolean explicit_hyphen = false;
 816     halfword s, r = head, wordstart = null, save_tail1 = null, left =
 817         null, right = null;
 818
 819     /* this first movement assures two things:
 820      \item{a)} that we won't waste lots of time on something that has been
 821       handled already (in that case, none of the glyphs match |simple_character|).
 822      \item{b)} that the first word can be hyphenated. if the movement was
 823      not explicit, then the indentation at the start of a paragraph
 824      list would make |find_next_wordstart()| look too far ahead.
 825      */
 826
 827     while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
 828         r = vlink(r);
 829     }
 830     /* this will make |r| a glyph node with subtype character */
 831     r = find_next_wordstart(r);
 832     if (r == null)
 833         return;
 834
 835     assert(tail != null);
 836     save_tail1 = vlink(tail);
 837     s = new_penalty(0);
 838     couple_nodes(tail, s);
 839
 840     while (r != null) {         /* could be while(1), but let's be paranoid */
 841         int clang, lhmin, rhmin, hmin;
 842         halfword hyf_font;
 843         halfword end_word = r;
 844         wordstart = r;
 845         assert(is_simple_character(wordstart));
 846         hyf_font = font(wordstart);
 847         if (hyphen_char(hyf_font) < 0)  /* for backward compat */
 848             hyf_font = 0;
 849         clang = char_lang(wordstart);
 850         lhmin = char_lhmin(wordstart);
 851         rhmin = char_rhmin(wordstart);
 852         hmin = get_hyphenation_min(clang);
 853         langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 854         langdata.post_hyphen_char = get_post_hyphen_char(clang);
 855         while (r != null && type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r) &&
 856               (((clang > 0) && (lchar = get_hj_code(clang,character(r))) > 0) || (character(r) == ex_hyphen_char && (lchar = ex_hyphen_char)))) {
 857             if (character(r) == ex_hyphen_char)
 858                 explicit_hyphen = true;
 859             wordlen++;
 860             hy = uni2string(hy, (unsigned) lchar);
 861             /* this should not be needed  any more */
 862             /*if (vlink(r)!=null) alink(vlink(r))=r; */
 863             end_word = r;
 864             r = vlink(r);
 865         }
 866         if (valid_wordend(r) && wordlen >= lhmin + rhmin && (hmin <= 0 || wordlen >= hmin)
 867             && (hyf_font != 0) && clang >=0 && (lang = tex_languages[clang]) != NULL) {
 868             *hy = 0;
 869             if (lang->exceptions != 0 &&
 870                 (replacement =
 871                  hyphenation_exception(lang->exceptions, utf8word)) != NULL) {
 872 #ifdef VERBOSE
 873                 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
 874 #endif
 875                 do_exception(wordstart, r, replacement);
 876                 free(replacement);
 877             } else if (explicit_hyphen == true) {
 878                 /* insert an explicit discretionary after each of the last in a
 879                    set of explicit hyphens */
 880                 halfword rr = r;
 881                 halfword t = null;
 882 #ifdef VERBOSE
 883                 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
 884 #endif
 885                 while (rr != wordstart) {
 886                     if (is_simple_character(rr)) {
 887                         if (character(rr) == ex_hyphen_char) {
 888                             t = compound_word_break(rr, clang);
 889                             subtype(t) = automatic_disc;
 890                             while(character(alink(rr)) == ex_hyphen_char)
 891                                rr = alink(rr);
 892                             if (rr == wordstart)
 893                                break;
 894                         }
 895                     }
 896                     rr = alink(rr);
 897                 }
 898
 899             } else if (lang->patterns != NULL) {
 900
 901                 left = wordstart;
 902                 for (i = lhmin; i > 1; i--) {
 903                     left = vlink(left);
 904                     while (!is_simple_character(left))
 905                         left = vlink(left);
 906                 }
 907                 right = r;
 908                 for (i = rhmin; i > 0; i--) {
 909                     right = alink(right);
 910                     while (!is_simple_character(right))
 911                         right = alink(right);
 912                 }
 913
 914 #ifdef VERBOSE
 915                 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
 916                     utf8word, clang, lhmin, rhmin, character(left),
 917                     character(right));
 918 #endif
 919                 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
 920             }
 921         }
 922         explicit_hyphen = false;
 923         wordlen = 0;
 924         hy = utf8word;
 925         if (r == null)
 926             break;
 927         r = find_next_wordstart(r);
 928     }
 929     flush_node(vlink(tail));
 930     vlink(tail) = save_tail1;
 931 }
 932
 933 @ @c
 934 void new_hyphenation(halfword head, halfword tail)
 935 {
 936     register int callback_id = 0;
 937     if (head == null || vlink(head) == null)
 938         return;
 939     fix_node_list(head);
 940     callback_id = callback_defined(hyphenate_callback);
 941     if (callback_id > 0) {
 942         lua_State *L = Luas;
 943         if (!get_callback(L, callback_id)) {
 944             lua_pop(L, 2);
 945             return;
 946         }
 947         nodelist_to_lua(L, head);
 948         nodelist_to_lua(L, tail);
 949         if (lua_pcall(L, 2, 0, 0) != 0) {
 950             formatted_warning("hyphenation","bad specification: %s",lua_tostring(L, -1));
 951             lua_pop(L, 2);
 952             lua_error(L);
 953             return;
 954         }
 955         lua_pop(L, 1);
 956     } else if (callback_id == 0) {
 957         hnj_hyphenation(head, tail);
 958     }
 959 }
 960
@ Dumping and undumping languages
 962
 963 @c
 964 #define dump_string(a)                \
 965   if (a!=NULL) {                      \
 966       x = (int)strlen(a)+1;           \
 967     dump_int(x);  dump_things(*a, x); \
 968   } else {                            \
 969     x = 0; dump_int(x);               \
 970   }
 971
 972 static void dump_one_language(int i)
 973 {
 974     char *s = NULL;
 975     int x = 0;
 976     struct tex_language *lang;
 977     lang = tex_languages[i];
 978     dump_int(lang->id);
 979     dump_int(lang->pre_hyphen_char);
 980     dump_int(lang->post_hyphen_char);
 981     dump_int(lang->pre_exhyphen_char);
 982     dump_int(lang->post_exhyphen_char);
 983     dump_int(lang->hyphenation_min);
 984     if (lang->patterns != NULL) {
 985         s = (char *) hnj_serialize(lang->patterns);
 986     }
 987     dump_string(s);
 988     if (s != NULL) {
 989         free(s);
 990         s = NULL;
 991     }
 992     if (lang->exceptions != 0)
 993         s = exception_strings(lang);
 994     dump_string(s);
 995     if (s != NULL) {
 996         free(s);
 997     }
 998     free(lang);
 999 }
1000
1001 void dump_language_data(void)
1002 {
1003     int i;
1004     dump_int(next_lang_id);
1005     for (i = 0; i < next_lang_id; i++) {
1006         if (tex_languages[i]) {
1007             dump_int(1);
1008             dump_one_language(i);
1009         } else {
1010             dump_int(0);
1011         }
1012     }
1013 }
1014
1015 static void undump_one_language(int i)
1016 {
1017     char *s = NULL;
1018     int x = 0;
1019     struct tex_language *lang = get_language(i);
1020     undump_int(x);
1021     lang->id = x;
1022     undump_int(x);
1023     lang->pre_hyphen_char = x;
1024     undump_int(x);
1025     lang->post_hyphen_char = x;
1026     undump_int(x);
1027     lang->pre_exhyphen_char = x;
1028     undump_int(x);
1029     lang->post_exhyphen_char = x;
1030     undump_int(x);
1031     lang->hyphenation_min = x;
1032     /* patterns */
1033     undump_int(x);
1034     if (x > 0) {
1035         s = xmalloc((unsigned) x);
1036         undump_things(*s, x);
1037         load_patterns(lang, (unsigned char *) s);
1038         free(s);
1039     }
1040     /* exceptions */
1041     undump_int(x);
1042     if (x > 0) {
1043         s = xmalloc((unsigned) x);
1044         undump_things(*s, x);
1045         load_hyphenation(lang, (unsigned char *) s);
1046         free(s);
1047     }
1048 }
1049
1050 void undump_language_data(void)
1051 {
1052     int i, x, numlangs;
1053     undump_int(numlangs);
1054     next_lang_id = numlangs;
1055     for (i = 0; i < numlangs; i++) {
1056         undump_int(x);
1057         if (x == 1) {
1058             undump_one_language(i);
1059         }
1060     }
1061 }
1062
1063 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1064 |new_hyph_exceptions| to do the right thing.
1065
1066 @c
1067 void new_hyph_exceptions(void)
1068 {                               /* enters new exceptions */
1069     (void) scan_toks(false, true);
1070     load_tex_hyphenation(int_par(language_code), def_ref);
1071     flush_list(def_ref);
1072 }
1073
1074 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1075 procedure named |new_patterns|.
1076
1077 @c
1078 void new_patterns(void)
1079 {                               /* initializes the hyphenation pattern data */
1080     (void) scan_toks(false, true);
1081     load_tex_patterns(int_par(language_code), def_ref);
1082     flush_list(def_ref);
1083 }
1084
@ `\.{\\prehyphenchar}' sets the |pre_break| character, and
`\.{\\posthyphenchar}' sets the |post_break| character. Their respective
defaults are the ASCII hyphen ("-") and zero (nul).
1088
1089 @c
1090 void new_pre_hyphen_char(void)
1091 {
1092     scan_optional_equals();
1093     scan_int();
1094     set_pre_hyphen_char(int_par(language_code), cur_val);
1095 }
1096
1097 void new_post_hyphen_char(void)
1098 {
1099     scan_optional_equals();
1100     scan_int();
1101     set_post_hyphen_char(int_par(language_code), cur_val);
1102 }
1103
@ `\.{\\preexhyphenchar}' sets the |pre_break| character, and
`\.{\\postexhyphenchar}' sets the |post_break| character. Their
defaults are both zero (nul).
1107
1108 @c
1109 void new_pre_exhyphen_char(void)
1110 {
1111     scan_optional_equals();
1112     scan_int();
1113     set_pre_exhyphen_char(int_par(language_code), cur_val);
1114 }
1115
1116 void new_post_exhyphen_char(void)
1117 {
1118     scan_optional_equals();
1119     scan_int();
1120     set_post_exhyphen_char(int_par(language_code), cur_val);
1121 }
1122
1123 void new_hyphenation_min(void)
1124 {
1125     scan_optional_equals();
1126     scan_int();
1127     set_hyphenation_min(int_par(language_code), cur_val);
1128 }
1129
1130 void new_hj_code(void)
1131 {
1132     int i ;
1133     scan_int();
1134     i = cur_val;
1135     scan_optional_equals();
1136     scan_int();
1137     set_hj_code(int_par(language_code), i, cur_val, -1);
1138 }