source/texk/web2c/luatexdir/lang/texlang.w

   1 % texlang.w
   2 %
   3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
   4 %
   5 % This file is part of LuaTeX.
   6 %
   7 % LuaTeX is free software; you can redistribute it and/or modify it under
   8 % the terms of the GNU General Public License as published by the Free
   9 % Software Foundation; either version 2 of the License, or (at your
  10 % option) any later version.
  11 %
  12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
  13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 % FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 % License for more details.
  16 %
  17 % You should have received a copy of the GNU General Public License along
  18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
  19
  20 @ @c
  21
  22 #include "ptexlib.h"
  23 #include <string.h>
  24 #include "lua/luatex-api.h"
  25
  26 @ Low-level helpers
  27
  28 @ @c
/* rename this to |VERBOSE| to compile in the warnings guarded by |#ifdef VERBOSE| below */
#define unVERBOSE

/* number of slots in the |tex_languages| table */
#define MAX_TEX_LANGUAGES  16384

/* shorthand for the integer parameter selected by |ex_hyphen_char_code| */
#define ex_hyphen_char int_par(ex_hyphen_char_code)

/* language records indexed by language id; |NULL| marks a slot that is not in use */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* where |new_language| starts looking for a free slot when it is called with a negative id */
static int next_lang_id = 0;
  38
  39 struct tex_language *new_language(int n)
  40 {
  41     struct tex_language *lang;
  42     unsigned l;
  43     if (n >= 0) {
  44         l = (unsigned) n;
  45         if (l != (MAX_TEX_LANGUAGES - 1))
  46             if (next_lang_id <= n)
  47                 next_lang_id = n + 1;
  48     } else {
  49         while (tex_languages[next_lang_id] != NULL)
  50             next_lang_id++;
  51         l = (unsigned) next_lang_id++;
  52     }
  53     if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
  54         lang = xmalloc(sizeof(struct tex_language));
  55         tex_languages[l] = lang;
  56         lang->id = (int) l;
  57         lang->exceptions = 0;
  58         lang->patterns = NULL;
  59         lang->pre_hyphen_char = '-';
  60         lang->post_hyphen_char = 0;
  61         lang->pre_exhyphen_char = 0;
  62         lang->post_exhyphen_char = 0;
  63         lang->hyphenation_min = -1;
  64         if (int_par(saving_hyph_codes_code)) {
  65             hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
  66         }
  67         return lang;
  68     } else {
  69         return NULL;
  70     }
  71 }
  72
  73 struct tex_language *get_language(int n)
  74 {
  75     if (n >= 0 && n < MAX_TEX_LANGUAGES) {
  76         if (tex_languages[n] != NULL) {
  77             return tex_languages[n];
  78         } else {
  79             return new_language(n);
  80         }
  81     } else {
  82         return NULL;
  83     }
  84 }
  85
  86 @ @c
  87 void set_pre_hyphen_char(int n, int v)
  88 {
  89     struct tex_language *l = get_language((int) n);
  90     if (l != NULL)
  91         l->pre_hyphen_char = (int) v;
  92 }
  93
  94 void set_post_hyphen_char(int n, int v)
  95 {
  96     struct tex_language *l = get_language((int) n);
  97     if (l != NULL)
  98         l->post_hyphen_char = (int) v;
  99 }
 100
 101 void set_pre_exhyphen_char(int n, int v)
 102 {
 103     struct tex_language *l = get_language((int) n);
 104     if (l != NULL)
 105         l->pre_exhyphen_char = (int) v;
 106 }
 107
 108 void set_post_exhyphen_char(int n, int v)
 109 {
 110     struct tex_language *l = get_language((int) n);
 111     if (l != NULL)
 112         l->post_exhyphen_char = (int) v;
 113 }
 114
 115 int get_pre_hyphen_char(int n)
 116 {
 117     struct tex_language *l = get_language((int) n);
 118     if (l == NULL)
 119         return -1;
 120     return (int) l->pre_hyphen_char;
 121 }
 122
 123 int get_post_hyphen_char(int n)
 124 {
 125     struct tex_language *l = get_language((int) n);
 126     if (l == NULL)
 127         return -1;
 128     return (int) l->post_hyphen_char;
 129 }
 130
 131 int get_pre_exhyphen_char(int n)
 132 {
 133     struct tex_language *l = get_language((int) n);
 134     if (l == NULL)
 135         return -1;
 136     return (int) l->pre_exhyphen_char;
 137 }
 138
 139 int get_post_exhyphen_char(int n)
 140 {
 141     struct tex_language *l = get_language((int) n);
 142     if (l == NULL)
 143         return -1;
 144     return (int) l->post_exhyphen_char;
 145 }
 146
 147 void set_hyphenation_min(int n, int v)
 148 {
 149     struct tex_language *l = get_language((int) n);
 150     if (l != NULL)
 151         l->hyphenation_min = (int) v;
 152 }
 153
 154 int get_hyphenation_min(int n)
 155 {
 156     struct tex_language *l = get_language((int) n);
 157     if (l == NULL)
 158         return -1;
 159     return (int) l->hyphenation_min;
 160 }
 161
 162 void load_patterns(struct tex_language *lang, const unsigned char *buff)
 163 {
 164     if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
 165         return;
 166     if (lang->patterns == NULL) {
 167         lang->patterns = hnj_hyphen_new();
 168     }
 169     hnj_hyphen_load(lang->patterns, buff);
 170 }
 171
 172 void clear_patterns(struct tex_language *lang)
 173 {
 174     if (lang == NULL)
 175         return;
 176     if (lang->patterns != NULL) {
 177         hnj_hyphen_clear(lang->patterns);
 178     }
 179 }
 180
 181 void load_tex_patterns(int curlang, halfword head)
 182 {
 183     char *s = tokenlist_to_cstring(head, 1, NULL);
 184     load_patterns(get_language(curlang), (unsigned char *) s);
 185 }
 186
 187 @ @c
 188 #define STORE_CHAR(l,x) do { \
 189     unsigned xx = get_hj_code(l,x); \
 190     if (!xx) { \
 191         xx = x; \
 192     } \
 193     uindex = uni2string(uindex, xx); \
 194 } while (0)
 195
  196 @ Cleans one word, which is returned in |cleaned|, and returns the new position
  197 in |buff|.
 198
 199 @c
 200 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
 201 {
 202     int items = 0;
 203     unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
 204     unsigned uword[MAX_WORD_LEN + 1] = { 0 };  /* work buffer for unicode */
 205     int u = 0; /* unicode buffer value */
 206     int i = 0; /* index into buffer */
 207     char *uindex = (char *)word;
 208     const char *s = buff;
 209
 210     while (*s && !isspace((unsigned char)*s)) {
 211         word[i++] = (unsigned)*s;
 212         s++;
 213         if ((s-buff)>MAX_WORD_LEN) {
 214             /* todo: this is too strict, should count unicode, not bytes */
 215             *cleaned = NULL;
 216             tex_error("exception too long", NULL);
 217             return s;
 218         }
 219     }
 220     /* now convert the input to unicode */
 221     word[i] = '\0';
 222     utf2uni_strcpy(uword, (const char *)word);
 223
 224     /* build the new word string */
 225     i = 0;
 226     while (uword[i]>0) {
 227         u = uword[i++];
 228         if (u == '-') {        /* skip */
 229         } else if (u == '=') {
 230             STORE_CHAR(id,'-');
 231         } else if (u == '{') {
 232             u = uword[i++];
 233             items = 0;
 234             while (u && u != '}') {
 235                 u = uword[i++];
 236             }
 237             if (u == '}') {
 238                 items++;
 239                 u = uword[i++];
 240             }
 241             while (u && u != '}') {
 242                 u = uword[i++];
 243             }
 244             if (u == '}') {
 245                 items++;
 246                 u = uword[i++];
 247             }
 248             if (u == '{') {
 249                 u = uword[i++];
 250             }
 251             while (u && u != '}') {
 252                 STORE_CHAR(id,u);
 253                 u = uword[i++];
 254             }
 255             if (u == '}') {
 256                 items++;
 257             }
 258             if (items != 3) {   /* syntax error */
 259                 *cleaned = NULL;
 260                 tex_error("exception syntax error", NULL);
 261                 return s;
 262             }
 263         } else {
 264             STORE_CHAR(id,u);
 265         }
 266     }
 267     *uindex = '\0';
 268     *cleaned = xstrdup((char *) word);
 269     return s;
 270 }
 271
 272 @ @c
 273 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
 274 {
 275     const char *s;
 276     const char *value;
 277     char *cleaned;
 278     int id ;
 279     lua_State *L = Luas;
 280     if (lang == NULL)
 281         return;
 282     if (lang->exceptions == 0) {
 283         lua_newtable(L);
 284         lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
 285     }
 286     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 287     s = (const char *) buff;
 288     id = lang->id;
 289     while (*s) {
 290         while (isspace((unsigned char)*s))
 291             s++;
 292         if (*s) {
 293             value = s;
 294             s = clean_hyphenation(id, s, &cleaned);
 295             if (cleaned != NULL) {
 296                 if ((s - value) > 0) {
 297                     lua_pushstring(L, cleaned);
 298                     lua_pushlstring(L, value, (size_t) (s - value));
 299                     lua_rawset(L, -3);
 300                 }
 301                 free(cleaned);
 302             } else {
 303 #ifdef VERBOSE
 304                 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
 305 #endif
 306             }
 307         }
 308     }
 309 }
 310
 311 void clear_hyphenation(struct tex_language *lang)
 312 {
 313     if (lang == NULL)
 314         return;
 315     if (lang->exceptions != 0) {
 316         luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 317         lang->exceptions = 0;
 318     }
 319 }
 320
 321 void load_tex_hyphenation(int curlang, halfword head)
 322 {
 323     char *s = tokenlist_to_cstring(head, 1, NULL);
 324     load_hyphenation(get_language(curlang), (unsigned char *) s);
 325 }
 326
 327 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very nice,
 328 but needed. Also, in the post-break, it would be nicer to get the attribute list
 329 from |vlink(n)|. No rush, as it is currently not used much.
 330
 331 @c
 332 halfword insert_discretionary(halfword t, halfword pre, halfword post,
 333                               halfword replace, int penalty)
 334 {
 335     halfword g, n;
 336     int f;
 337     n = new_node(disc_node, syllable_disc);
 338     disc_penalty(n) = penalty;
 339     try_couple_nodes(n, vlink(t));
 340     couple_nodes(t, n);
 341     if (replace != null)
 342         f = font(replace);
 343     else
 344         f = get_cur_font();     /* for compound words following explicit hyphens */
 345     for (g = pre; g != null; g = vlink(g)) {
 346         font(g) = f;
 347         if (node_attr(t) != null) {
 348             delete_attribute_ref(node_attr(g));
 349             node_attr(g) = node_attr(t);
 350             attr_list_ref(node_attr(t)) += 1;
 351         }
 352     }
 353     for (g = post; g != null; g = vlink(g)) {
 354         font(g) = f;
 355         if (node_attr(t) != null) {
 356             delete_attribute_ref(node_attr(g));
 357             node_attr(g) = node_attr(t);
 358             attr_list_ref(node_attr(t)) += 1;
 359         }
 360     }
 361     for (g = replace; g != null; g = vlink(g)) {
 362         if (node_attr(t) != null) {
 363             delete_attribute_ref(node_attr(g));
 364             node_attr(g) = node_attr(t);
 365             attr_list_ref(node_attr(t)) += 1;
 366         }
 367     }
 368     if (node_attr(t) != null) {
 369         delete_attribute_ref(node_attr(vlink(t)));
 370         node_attr(vlink(t)) = node_attr(t);
 371         attr_list_ref(node_attr(t)) += 1;
 372     }
 373     t = vlink(t);
 374     set_disc_field(pre_break(t), pre);
 375     set_disc_field(post_break(t), post);
 376     set_disc_field(no_break(t), replace);
 377     return t;
 378 }
 379
 380 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
 381 {
 382     halfword g, n;
 383     n = new_node(disc_node, syllable_disc);
 384     disc_penalty(n) = int_par(hyphen_penalty_code);
 385     couple_nodes(n, vlink(t));
 386     couple_nodes(t, n);
 387     delete_attribute_ref(node_attr(n));
 388     if (node_attr(t) != null) {
 389         node_attr(n) = node_attr(t);
 390         attr_list_ref(node_attr(t))++;
 391     } else {
 392         node_attr(n) = null;
 393     }
 394     if (lan->pre_hyphen_char > 0) {
 395         g = raw_glyph_node();
 396         set_to_character(g);
 397         character(g) = lan->pre_hyphen_char;
 398         font(g) = font(t);
 399         lang_data(g) = lang_data(t);
 400         if (node_attr(t) != null) {
 401             node_attr(g) = node_attr(t);
 402             attr_list_ref(node_attr(t))++;
 403         }
 404         set_disc_field(pre_break(n), g);
 405     }
 406
 407     if (lan->post_hyphen_char > 0) {
 408         t = vlink(n);
 409         g = raw_glyph_node();
 410         set_to_character(g);
 411         character(g) = lan->post_hyphen_char;
 412         font(g) = font(t);
 413         lang_data(g) = lang_data(t);
 414         if (node_attr(t) != null) {
 415             node_attr(g) = node_attr(t);
 416             attr_list_ref(node_attr(t)) += 1;
 417         }
 418         set_disc_field(post_break(n), g);
 419     }
 420     return n;
 421 }
 422
 423 halfword insert_word_discretionary(halfword t, lang_variables * lan)
 424 {
 425     halfword pre = null, pos = null;
 426     if (lan->pre_exhyphen_char > 0)
 427         pre = insert_character(null, lan->pre_exhyphen_char);
 428     if (lan->post_exhyphen_char > 0)
 429         pos = insert_character(null, lan->post_exhyphen_char);
 430     return insert_discretionary(t, pre, pos, null,int_par(ex_hyphen_penalty_code));
 431 }
 432
 433 @ @c
 434 halfword compound_word_break(halfword t, int clang)
 435 {
 436     int disc;
 437     lang_variables langdata;
 438     langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
 439     langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
 440     disc = insert_word_discretionary(t, &langdata);
 441     return disc;
 442 }
 443
 444 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
 445                                       halfword pre, halfword pos,
 446                                       halfword replace)
 447 {
 448     (void) lan;
 449     return insert_discretionary(t, pre, pos, replace,int_par(hyphen_penalty_code));
 450 }
 451
 452 halfword insert_character(halfword t, int c)
 453 {
 454     halfword p;
 455     p = new_node(glyph_node, 0);
 456     set_to_character(p);
 457     character(p) = c;
 458     if (t != null) {
 459         couple_nodes(t, p);
 460     }
 461     return p;
 462 }
 463
 464 @ @c
 465 void set_disc_field(halfword f, halfword t)
 466 {
 467     if (t != null) {
 468         couple_nodes(f, t);
 469         tlink(f) = tail_of_list(t);
 470     } else {
 471         vlink(f) = null;
 472         tlink(f) = null;
 473     }
 474 }
 475
 476 @ @c
 477 static char *hyphenation_exception(int exceptions, char *w)
 478 {
 479     char *ret = NULL;
 480     lua_State *L = Luas;
 481     lua_checkstack(L, 2);
 482     lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
 483     if (lua_istable(L, -1)) {   /* ?? */
 484         lua_pushstring(L, w);   /* word table */
 485         lua_rawget(L, -2);
 486         if (lua_type(L, -1) == LUA_TSTRING) {
 487             ret = xstrdup(lua_tostring(L, -1));
 488         }
 489         lua_pop(L, 2);
 490     } else {
 491         lua_pop(L, 1);
 492     }
 493     return ret;
 494 }
 495
 496 @ @c
 497 char *exception_strings(struct tex_language *lang)
 498 {
 499     const char *value;
 500     size_t size = 0, current = 0;
 501     size_t l = 0;
 502     char *ret = NULL;
 503     lua_State *L = Luas;
 504     if (lang->exceptions == 0)
 505         return NULL;
 506     lua_checkstack(L, 2);
 507     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 508     if (lua_istable(L, -1)) {
 509         /* iterate and join */
 510         lua_pushnil(L);         /* first key */
 511         while (lua_next(L, -2) != 0) {
 512             value = lua_tolstring(L, -1, &l);
 513             if (current + 2 + l > size) {
 514                 ret = xrealloc(ret, (unsigned) ((size + size / 5) + current + l + 1024));
 515                 size = (size + size / 5) + current + l + 1024;
 516             }
 517             *(ret + current) = ' ';
 518             strcpy(ret + current + 1, value);
 519             current += l + 1;
 520             lua_pop(L, 1);
 521         }
 522     }
 523     return ret;
 524 }
 525
  526 @ The sequence from |wordstart| to |r| can contain only normal characters. It
  527 could be faster to modify a halfword pointer and return an integer.
 528
 529 @c
 530 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
 531 {
 532     halfword g = null, gg = null;
 533     register unsigned i = *j;
 534     i++;                        /* this puts uword[i] on the |{| */
 535     while (i < (unsigned) len && uword[i + 1] != '}') {
 536         if (g == null) {
 537             gg = new_char(0, (int) uword[i + 1]);
 538             g = gg;
 539         } else {
 540             halfword s = new_char(0, (int) uword[i + 1]);
 541             couple_nodes(g, s);
 542             g = vlink(g);
 543         }
 544         i++;
 545     }
 546     *j = ++i;
 547     return gg;
 548 }
 549
 550 static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
 551 {
 552     int ret = 0;
 553     register unsigned i = *j;
 554     i++;                        /* this puts uword[i] on the |{| */
 555     while (i < (unsigned) len && uword[i + 1] != '}') {
 556         ret++;
 557         i++;
 558     }
 559     *j = ++i;
 560     return ret;
 561 }
 562
 563 @ @c
/* message lines handed to |tex_error| by |do_exception| when a braced exception is malformed; |NULL| ends the list */
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};
 569
 570 /*
 571     The exceptions are taken as-is: no min values are taken into account. One can
 572     add normal patterns on-the-fly if needed.
 573 */
 574
/*
    Apply the exception |replacement| to the word that starts at |wordstart|;
    |r| is the node where processing has to stop. The exception is read as code
    points while |t| walks along the node list. A hyphen inserts a syllable
    discretionary, a braced triple inserts a discretionary with explicit pre,
    post and replacement material, and every other character just moves |t| on
    by one node.
*/
static void do_exception(halfword wordstart, halfword r, char *replacement)
{
    unsigned i;
    halfword t;
    unsigned len;
    int clang;
    lang_variables langdata;    /* only the two hyphen characters are filled in */
    unsigned uword[MAX_WORD_LEN + 1] = { 0 };
    utf2uni_strcpy(uword, replacement);
    len = u_length(uword);
    i = 0;
    t = wordstart;
    clang = char_lang(wordstart);
    langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = get_post_hyphen_char(clang);

    /* the loop always looks at the character after position |i|, so |uword[0]| is never tested */
    for (i = 0; i < len; i++) {
        if (uword[i + 1] == '-') {      /* a hyphen follows */
            /* move on to a simple character glyph, but never beyond the word */
            while (vlink(t) != r && (type(t) != glyph_node || !is_simple_character(t)))
                t = vlink(t);
            if (vlink(t) == r)
                break;
            insert_syllable_discretionary(t, &langdata);
            t = vlink(t);       /* skip the new disc */
        } else if (uword[i + 1] == '=') {
            /* do nothing ? */
            t = vlink(t);
        } else if (uword[i + 1] == '{') {
            halfword gg, hh, replace = null;
            int repl;
            /*
                The three braced parts: pre-break glyphs, post-break glyphs and
                the number of glyphs to replace. Each helper moves |i| on.

                NOTE(review): for an unterminated part the helpers leave |i| at
                |len + 1|, which the |i == len| tests below do not catch, and
                |uword[i + 1]| is then read past the terminator; confirm that
                only well-formed exceptions can reach this point. Processing
                also continues after |tex_error| has been called.
            */
            gg = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 1", PAT_ERROR);
            }
            hh = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 2", PAT_ERROR);
            }
            repl = count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_error("broken pattern 3", PAT_ERROR);
            }
            /*i++;  *//* jump over the last right brace */
            if (vlink(t) == r)
                break;
            if (repl > 0) {
                /*
                    Cut the next |repl| glyphs (and whatever sits between them)
                    out of the list; they become the replacement text.

                    NOTE(review): when the list ends before |repl| glyphs were
                    seen, |q| becomes null and is still used in |type(q)| and
                    |vlink(q)|; nothing here stops at |r| either.
                */
                halfword q = t;
                replace = vlink(q);
                while (repl > 0 && q != null) {
                    q = vlink(q);
                    if (type(q) == glyph_node) {
                        repl--;
                    }
                }
                try_couple_nodes(t, vlink(q));
                vlink(q) = null;
            }
            t = insert_discretionary(t, gg, hh, replace,int_par(hyphen_penalty_code));
            t = vlink(t);       /* skip the new disc */
        } else {
            t = vlink(t);
        }
    }
}
 639
 640 @ This is a documentation section from the pascal web file. It is not true any
 641 more, but I do not have time right now to rewrite it -- Taco
 642
 643 When the line-breaking routine is unable to find a feasible sequence of
 644 breakpoints, it makes a second pass over the paragraph, attempting to hyphenate
 645 the hyphenatable words. The goal of hyphenation is to insert discretionary
 646 material into the paragraph so that there are more potential places to break.
 647
 648 The general rules for hyphenation are somewhat complex and technical, because we
 649 want to be able to hyphenate words that are preceded or followed by punctuation
 650 marks, and because we want the rules to work for languages other than English. We
 651 also must contend with the fact that hyphens might radically alter the ligature
 652 and kerning structure of a word.
 653
 654 A sequence of characters will be considered for hyphenation only if it belongs to
 655 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
 656 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are
 657 either character or ligature or whatsit or implicit kern nodes, and $p_m$ is a
 658 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
 659 (Therefore hyphenation is disabled by boxes, math formulas, and discretionary
 660 nodes already inserted by the user.) The ligature nodes among $p_1\ldots p_{m-1}$
 661 are effectively expanded into the original non-ligature characters; the kern
 662 nodes and whatsits are ignored. Each character |c| is now classified as either a
 663 nonletter (if |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an
 664 uppercase letter (otherwise); an uppercase letter is treated as if it were
 665 |lc_code(c)| for purposes of hyphenation. The characters generated by $p_1\ldots
 666 p_{m-1}$ may begin with nonletters; let $c_1$ be the first letter that is not in
 667 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a whatsit
 668 found after $c_1$ will be the terminating node $p_m$. All characters that do not
 669 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
 670 that font must be between 0 and 255, otherwise hyphenation will not be attempted.
 671 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible;
 672 however, |n| must be less than 64, so a character that would otherwise be
 673 $c_{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
 674 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
 675 generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|,
 676 this string qualifies for hyphenation; however, |uc_hyph| must be positive, if
 677 $c_1$ is uppercase.
 678
 679 The hyphenation process takes place in three stages. First, the candidate
 680 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens are
 681 determined by referring to hyphenation tables; and finally, the nodes $p_a\ldots
 682 p_b$ are replaced by a new sequence of nodes that includes the discretionary
 683 breaks found.
 684
 685 Fortunately, we do not have to do all this calculation very often, because of the
 686 way it has been taken out of \TeX's inner loop. For example, when the second
 687 edition of the author's 700-page book {\sl Seminumerical Algorithms} was typeset
 688 by \TeX, only about 1.2 hyphenations needed to be @^Knuth, Donald Ervin@> tried
 689 per paragraph, since the line breaking algorithm needed to use two passes on only
 690 about 5 per cent of the paragraphs.
 691
  692 When a word has been set up to contain a candidate for hyphenation, \TeX\ first looks
 693 to see if it is in the user's exception dictionary. If not, hyphens are inserted
 694 based on patterns that appear within the given word, using an algorithm due to
 695 Frank~M. Liang. @^Liang, Franklin Mark@>
 696
  697 @ This is incompatible with \TeX\ because the first word of a paragraph can be
  698 hyphenated, but most European users seem to agree that prohibiting hyphenation
  699 there was not the best idea ever.
 700
 701 @c
 702 static halfword find_next_wordstart(halfword r, halfword first_language)
 703 {
 704     register int l;
 705     register int start_ok = 1;
 706     int mathlevel = 1;
 707     int chr ;
 708     halfword t ;
 709     while (r != null) {
 710         switch (type(r)) {
 711         case boundary_node:
 712         case whatsit_node:
 713             break;
 714         case glue_node:
 715             start_ok = 1;
 716             break;
 717         case math_node:
 718             while (mathlevel > 0) {
 719                 r = vlink(r);
 720                 if (r == null)
 721                     return r;
 722                 if (type(r) == math_node) {
 723                     if (subtype(r) == before) {
 724                         mathlevel++;
 725                     } else {
 726                         mathlevel--;
 727                     }
 728                 }
 729             }
 730             break;
 731         case glyph_node:
 732             if (is_simple_character(r)) {
 733                 chr = character(r) ;
 734                 if (chr == ex_hyphen_char) {
 735                     /*
 736                         We only accept an explicit hyphen when there is a preceding glyph and
 737                         we skip a sequence of explicit hyphens as that normally indicates a
 738                         -- or --- ligature in which case we can in a worse case usage get bad
 739                         node lists later on due to messed up ligature building as these dashes
 740                         are ligatures in base fonts. This is a side effect of the separating the
 741                         hyphenation, ligaturing and kerning steps. A test is cmr with ------.
 742                     */
 743                     t = vlink(r) ;
 744                     if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
 745                         t = compound_word_break(r, char_lang(r));
 746                         subtype(t) = automatic_disc;
 747                         start_ok = 1 ;
 748                     } else {
 749                         start_ok = 0;
 750                     }
 751                 } else if (start_ok && (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
 752                     if (char_uchyph(r) || l == chr) {
 753                         return r;
 754                     } else {
 755                         start_ok = 0;
 756                     }
 757                 }
 758             }
 759             break;
 760         default:
 761             start_ok = 0;
 762             break;
 763         }
 764         r = vlink(r);
 765     }
 766     return r;
 767 }
 768
 769 @ @c
 770 static int valid_wordend(halfword s)
 771 {
 772     register halfword r = s;
 773     register int clang = char_lang(s);
 774     if (r == null)
 775         return 1;
 776     while ((r != null) && (   (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
 777                            || (type(r) == kern_node && (subtype(r) == normal))
 778            )) {
 779         r = vlink(r);
 780     }
 781     if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
 782                   ||  type(r) == glue_node
 783                   ||  type(r) == boundary_node
 784                   ||  type(r) == whatsit_node
 785                   ||  type(r) == ins_node
 786                   ||  type(r) == adjust_node
 787                   ||  type(r) == penalty_node
 788                   || (type(r) == kern_node && (subtype(r) == explicit_kern ||
 789                                                subtype(r) == italic_kern   ||
 790                                                subtype(r) == accent_kern   )))
 791         return 1;
 792     return 0;
 793 }
 794
 795 @ @c
 796 void hnj_hyphenation(halfword head, halfword tail)
 797 {
 798     int lchar, i;
 799     struct tex_language *lang;
 800     lang_variables langdata;
 801     char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
 802     int wordlen = 0;
 803     char *hy = utf8word;
 804     char *replacement = NULL;
 805     boolean explicit_hyphen = false;
 806     halfword first_language = int_par(first_valid_language_code);
 807     halfword s, r = head, wordstart = null, save_tail1 = null, left = null, right = null;
 808
 809     /* this first movement assures two things:
 810      \item{a)} that we won't waste lots of time on something that has been
 811       handled already (in that case, none of the glyphs match |simple_character|).
 812      \item{b)} that the first word can be hyphenated. if the movement was
 813      not explicit, then the indentation at the start of a paragraph
 814      list would make |find_next_wordstart()| look too far ahead.
 815      */
 816
 817     while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
 818         r = vlink(r);
 819     }
 820     /* this will make |r| a glyph node with subtype character */
 821     r = find_next_wordstart(r,first_language);
 822     if (r == null)
 823         return;
 824
 825     assert(tail != null);
 826     save_tail1 = vlink(tail);
 827     s = new_penalty(0);
 828     couple_nodes(tail, s);
 829
 830     while (r != null) {         /* could be while(1), but let's be paranoid */
 831         int clang, lhmin, rhmin, hmin;
 832         halfword hyf_font;
 833         halfword end_word = r;
 834         wordstart = r;
 835         assert(is_simple_character(wordstart));
 836         hyf_font = font(wordstart);
 837         if (hyphen_char(hyf_font) < 0)  /* for backward compat */
 838             hyf_font = 0;
 839         clang = char_lang(wordstart);
 840         lhmin = char_lhmin(wordstart);
 841         rhmin = char_rhmin(wordstart);
 842         hmin = get_hyphenation_min(clang);
 843         langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 844         langdata.post_hyphen_char = get_post_hyphen_char(clang);
 845         while (    r != null
 846                 && type(r) == glyph_node
 847                 && is_simple_character(r)
 848                 && clang == char_lang(r)
 849                 && (    (     (clang >= first_language)
 850                            && (lchar = get_hj_code(clang,character(r))) > 0
 851                         )
 852                      || (     character(r) == ex_hyphen_char
 853                            && (lchar = ex_hyphen_char)
 854                         )
 855                    )
 856               ) {
 857             if (character(r) == ex_hyphen_char)
 858                 explicit_hyphen = true;
 859             wordlen++;
 860             hy = uni2string(hy, (unsigned) lchar);
 861             /* this should not be needed  any more */
 862             /*if (vlink(r)!=null) alink(vlink(r))=r; */
 863             end_word = r;
 864             r = vlink(r);
 865         }
 866         if (     valid_wordend(r)
 867               && clang >= first_language
 868               && wordlen >= lhmin + rhmin
 869               && (hmin <= 0 || wordlen >= hmin)
 870               && (hyf_font != 0)
 871               && (lang = tex_languages[clang]) != NULL
 872            ) {
 873             *hy = 0;
 874             if (    lang->exceptions != 0
 875                  && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
 876                ) {
 877 #ifdef VERBOSE
 878                 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
 879 #endif
 880                 do_exception(wordstart, r, replacement);
 881                 free(replacement);
 882             } else if (explicit_hyphen == true) {
 883                 /* insert an explicit discretionary after each of the last in a
 884                    set of explicit hyphens */
 885                 halfword rr = r;
 886                 halfword t = null;
 887 #ifdef VERBOSE
 888                 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
 889 #endif
 890                 while (rr != wordstart) {
 891                     if (is_simple_character(rr)) {
 892                         if (character(rr) == ex_hyphen_char) {
 893                             t = compound_word_break(rr, clang);
 894                             subtype(t) = automatic_disc;
 895                             while (character(alink(rr)) == ex_hyphen_char)
 896                                 rr = alink(rr);
 897                             if (rr == wordstart)
 898                                 break;
 899                         }
 900                     }
 901                     rr = alink(rr);
 902                 }
 903             } else if (lang->patterns != NULL) {
 904                 left = wordstart;
 905                 for (i = lhmin; i > 1; i--) {
 906                     left = vlink(left);
 907                     while (!is_simple_character(left))
 908                         left = vlink(left);
 909                 }
 910                 right = r;
 911                 for (i = rhmin; i > 0; i--) {
 912                     right = alink(right);
 913                     while (!is_simple_character(right))
 914                         right = alink(right);
 915                 }
 916 #ifdef VERBOSE
 917                 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
 918                     utf8word, clang, lhmin, rhmin, character(left), character(right));
 919 #endif
 920                 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
 921             }
 922         }
 923         explicit_hyphen = false;
 924         wordlen = 0;
 925         hy = utf8word;
 926         if (r == null)
 927             break;
 928         r = find_next_wordstart(r,first_language);
 929     }
 930     flush_node(vlink(tail));
 931     vlink(tail) = save_tail1;
 932 }
 933
 934 @ @c
 935 void new_hyphenation(halfword head, halfword tail)
 936 {
 937     register int callback_id = 0;
 938     if (head == null || vlink(head) == null)
 939         return;
 940     fix_node_list(head);
 941     callback_id = callback_defined(hyphenate_callback);
 942     if (callback_id > 0) {
 943         lua_State *L = Luas;
 944         if (!get_callback(L, callback_id)) {
 945             lua_pop(L, 2);
 946             return;
 947         }
 948         nodelist_to_lua(L, head);
 949         nodelist_to_lua(L, tail);
 950         if (lua_pcall(L, 2, 0, 0) != 0) {
 951             formatted_warning("hyphenation","bad specification: %s",lua_tostring(L, -1));
 952             lua_pop(L, 2);
 953             lua_error(L);
 954             return;
 955         }
 956         lua_pop(L, 1);
 957     } else if (callback_id == 0) {
 958         hnj_hyphenation(head, tail);
 959     }
 960 }
 961
@ Dumping and undumping languages
 963
 964 @c
 965 #define dump_string(a)                \
 966   if (a!=NULL) {                      \
 967       x = (int)strlen(a)+1;           \
 968     dump_int(x);  dump_things(*a, x); \
 969   } else {                            \
 970     x = 0; dump_int(x);               \
 971   }
 972
 973 static void dump_one_language(int i)
 974 {
 975     char *s = NULL;
 976     int x = 0;
 977     struct tex_language *lang;
 978     lang = tex_languages[i];
 979     dump_int(lang->id);
 980     dump_int(lang->pre_hyphen_char);
 981     dump_int(lang->post_hyphen_char);
 982     dump_int(lang->pre_exhyphen_char);
 983     dump_int(lang->post_exhyphen_char);
 984     dump_int(lang->hyphenation_min);
 985     if (lang->patterns != NULL) {
 986         s = (char *) hnj_serialize(lang->patterns);
 987     }
 988     dump_string(s);
 989     if (s != NULL) {
 990         free(s);
 991         s = NULL;
 992     }
 993     if (lang->exceptions != 0)
 994         s = exception_strings(lang);
 995     dump_string(s);
 996     if (s != NULL) {
 997         free(s);
 998     }
 999     free(lang);
1000 }
1001
1002 void dump_language_data(void)
1003 {
1004     int i;
1005     dump_int(next_lang_id);
1006     for (i = 0; i < next_lang_id; i++) {
1007         if (tex_languages[i]) {
1008             dump_int(1);
1009             dump_one_language(i);
1010         } else {
1011             dump_int(0);
1012         }
1013     }
1014 }
1015
1016 static void undump_one_language(int i)
1017 {
1018     char *s = NULL;
1019     int x = 0;
1020     struct tex_language *lang = get_language(i);
1021     undump_int(x);
1022     lang->id = x;
1023     undump_int(x);
1024     lang->pre_hyphen_char = x;
1025     undump_int(x);
1026     lang->post_hyphen_char = x;
1027     undump_int(x);
1028     lang->pre_exhyphen_char = x;
1029     undump_int(x);
1030     lang->post_exhyphen_char = x;
1031     undump_int(x);
1032     lang->hyphenation_min = x;
1033     /* patterns */
1034     undump_int(x);
1035     if (x > 0) {
1036         s = xmalloc((unsigned) x);
1037         undump_things(*s, x);
1038         load_patterns(lang, (unsigned char *) s);
1039         free(s);
1040     }
1041     /* exceptions */
1042     undump_int(x);
1043     if (x > 0) {
1044         s = xmalloc((unsigned) x);
1045         undump_things(*s, x);
1046         load_hyphenation(lang, (unsigned char *) s);
1047         free(s);
1048     }
1049 }
1050
1051 void undump_language_data(void)
1052 {
1053     int i, x, numlangs;
1054     undump_int(numlangs);
1055     next_lang_id = numlangs;
1056     for (i = 0; i < numlangs; i++) {
1057         undump_int(x);
1058         if (x == 1) {
1059             undump_one_language(i);
1060         }
1061     }
1062 }
1063
1064 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1065 |new_hyph_exceptions| to do the right thing.
1066
1067 @c
1068 void new_hyph_exceptions(void)
1069 {                               /* enters new exceptions */
1070     (void) scan_toks(false, true);
1071     load_tex_hyphenation(int_par(language_code), def_ref);
1072     flush_list(def_ref);
1073 }
1074
1075 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1076 procedure named |new_patterns|.
1077
1078 @c
1079 void new_patterns(void)
1080 {                               /* initializes the hyphenation pattern data */
1081     (void) scan_toks(false, true);
1082     load_tex_patterns(int_par(language_code), def_ref);
1083     flush_list(def_ref);
1084 }
1085
@ `\.{\\prehyphenchar}' sets the |pre_break| character, and
`\.{\\posthyphenchar}' the |post_break| character. Their respective defaults are
ASCII hyphen ("-") and zero (nul).
1089
1090 @c
1091 void new_pre_hyphen_char(void)
1092 {
1093     scan_optional_equals();
1094     scan_int();
1095     set_pre_hyphen_char(int_par(language_code), cur_val);
1096 }
1097
1098 void new_post_hyphen_char(void)
1099 {
1100     scan_optional_equals();
1101     scan_int();
1102     set_post_hyphen_char(int_par(language_code), cur_val);
1103 }
1104
@ `\.{\\preexhyphenchar}' sets the |pre_break| character, and
`\.{\\postexhyphenchar}' the |post_break| character. Their defaults are both zero
(nul).
1108
1109 @c
1110 void new_pre_exhyphen_char(void)
1111 {
1112     scan_optional_equals();
1113     scan_int();
1114     set_pre_exhyphen_char(int_par(language_code), cur_val);
1115 }
1116
1117 void new_post_exhyphen_char(void)
1118 {
1119     scan_optional_equals();
1120     scan_int();
1121     set_post_exhyphen_char(int_par(language_code), cur_val);
1122 }
1123
1124 void new_hyphenation_min(void)
1125 {
1126     scan_optional_equals();
1127     scan_int();
1128     set_hyphenation_min(int_par(language_code), cur_val);
1129 }
1130
1131 void new_hj_code(void)
1132 {
1133     int i ;
1134     scan_int();
1135     i = cur_val;
1136     scan_optional_equals();
1137     scan_int();
1138     set_hj_code(int_par(language_code), i, cur_val, -1);
1139 }