source/texk/web2c/luatexdir/lang/texlang.w

   1 % texlang.w
   2 %
   3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
   4 %
   5 % This file is part of LuaTeX.
   6 %
   7 % LuaTeX is free software; you can redistribute it and/or modify it under
   8 % the terms of the GNU General Public License as published by the Free
   9 % Software Foundation; either version 2 of the License, or (at your
  10 % option) any later version.
  11 %
  12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
  13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 % FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 % License for more details.
  16 %
  17 % You should have received a copy of the GNU General Public License along
  18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
  19
  20 @ @c
  21
  22 #include "ptexlib.h"
  23 #include <string.h>
  24 #include "lua/luatex-api.h"
  25
  26 @ Low-level helpers
  27
  28 @ @c
  29 #define unVERBOSE
  30
  31 #define MAX_TEX_LANGUAGES  16384
  32
  33 #define ex_hyphen_char int_par(ex_hyphen_char_code)
  34
  35 static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };
  36
  37 static int next_lang_id = 0;
  38
  39 struct tex_language *new_language(int n)
  40 {
  41     struct tex_language *lang;
  42     unsigned l;
  43     if (n >= 0) {
  44         l = (unsigned) n;
  45         if (l != (MAX_TEX_LANGUAGES - 1))
  46             if (next_lang_id <= n)
  47                 next_lang_id = n + 1;
  48     } else {
  49         while (tex_languages[next_lang_id] != NULL)
  50             next_lang_id++;
  51         l = (unsigned) next_lang_id++;
  52     }
  53     if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
  54         lang = xmalloc(sizeof(struct tex_language));
  55         tex_languages[l] = lang;
  56         lang->id = (int) l;
  57         lang->exceptions = 0;
  58         lang->patterns = NULL;
  59         lang->pre_hyphen_char = '-';
  60         lang->post_hyphen_char = 0;
  61         lang->pre_exhyphen_char = 0;
  62         lang->post_exhyphen_char = 0;
  63         lang->hyphenation_min = -1;
  64         if (int_par(saving_hyph_codes_code)) {
  65             hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
  66         }
  67         return lang;
  68     } else {
  69         return NULL;
  70     }
  71 }
  72
  73 struct tex_language *get_language(int n)
  74 {
  75     if (n >= 0 && n < MAX_TEX_LANGUAGES) {
  76         if (tex_languages[n] != NULL) {
  77             return tex_languages[n];
  78         } else {
  79             return new_language(n);
  80         }
  81     } else {
  82         return NULL;
  83     }
  84 }
  85
  86 @ @c
  87 void set_pre_hyphen_char(int n, int v)
  88 {
  89     struct tex_language *l = get_language((int) n);
  90     if (l != NULL)
  91         l->pre_hyphen_char = (int) v;
  92 }
  93
  94 void set_post_hyphen_char(int n, int v)
  95 {
  96     struct tex_language *l = get_language((int) n);
  97     if (l != NULL)
  98         l->post_hyphen_char = (int) v;
  99 }
 100
 101 void set_pre_exhyphen_char(int n, int v)
 102 {
 103     struct tex_language *l = get_language((int) n);
 104     if (l != NULL)
 105         l->pre_exhyphen_char = (int) v;
 106 }
 107
 108 void set_post_exhyphen_char(int n, int v)
 109 {
 110     struct tex_language *l = get_language((int) n);
 111     if (l != NULL)
 112         l->post_exhyphen_char = (int) v;
 113 }
 114
 115 int get_pre_hyphen_char(int n)
 116 {
 117     struct tex_language *l = get_language((int) n);
 118     if (l == NULL)
 119         return -1;
 120     return (int) l->pre_hyphen_char;
 121 }
 122
 123 int get_post_hyphen_char(int n)
 124 {
 125     struct tex_language *l = get_language((int) n);
 126     if (l == NULL)
 127         return -1;
 128     return (int) l->post_hyphen_char;
 129 }
 130
 131 int get_pre_exhyphen_char(int n)
 132 {
 133     struct tex_language *l = get_language((int) n);
 134     if (l == NULL)
 135         return -1;
 136     return (int) l->pre_exhyphen_char;
 137 }
 138
 139 int get_post_exhyphen_char(int n)
 140 {
 141     struct tex_language *l = get_language((int) n);
 142     if (l == NULL)
 143         return -1;
 144     return (int) l->post_exhyphen_char;
 145 }
 146
 147 void set_hyphenation_min(int n, int v)
 148 {
 149     struct tex_language *l = get_language((int) n);
 150     if (l != NULL)
 151         l->hyphenation_min = (int) v;
 152 }
 153
 154 int get_hyphenation_min(int n)
 155 {
 156     struct tex_language *l = get_language((int) n);
 157     if (l == NULL)
 158         return -1;
 159     return (int) l->hyphenation_min;
 160 }
 161
 162 void load_patterns(struct tex_language *lang, const unsigned char *buff)
 163 {
 164     if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
 165         return;
 166     if (lang->patterns == NULL) {
 167         lang->patterns = hnj_hyphen_new();
 168     }
 169     hnj_hyphen_load(lang->patterns, buff);
 170 }
 171
 172 void clear_patterns(struct tex_language *lang)
 173 {
 174     if (lang == NULL)
 175         return;
 176     if (lang->patterns != NULL) {
 177         hnj_hyphen_clear(lang->patterns);
 178     }
 179 }
 180
 181 void load_tex_patterns(int curlang, halfword head)
 182 {
 183     char *s = tokenlist_to_cstring(head, 1, NULL);
 184     load_patterns(get_language(curlang), (unsigned char *) s);
 185 }
 186
 187 @ @c
 188 #define STORE_CHAR(l,x) do { \
 189     unsigned xx = get_hj_code(l,x); \
 190     if (!xx) { \
 191         xx = x; \
 192     } \
 193     uindex = uni2string(uindex, xx); \
 194 } while (0)
 195
  196 @ Cleans one word, which is returned in |cleaned|, and returns the new offset
  197 into |buff|.
 198
 199 @c
 200 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
 201 {
 202     int items = 0;
 203     unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
 204     unsigned uword[MAX_WORD_LEN + 1] = { 0 };  /* work buffer for unicode */
 205     int u = 0; /* unicode buffer value */
 206     int i = 0; /* index into buffer */
 207     char *uindex = (char *)word;
 208     const char *s = buff;
 209
 210     while (*s && !isspace((unsigned char)*s)) {
 211         word[i++] = (unsigned)*s;
 212         s++;
 213         if ((s-buff)>MAX_WORD_LEN) {
 214             /* todo: this is too strict, should count unicode, not bytes */
 215             *cleaned = NULL;
 216             tex_error("exception too long", NULL);
 217             return s;
 218         }
 219     }
 220     /* now convert the input to unicode */
 221     word[i] = '\0';
 222     utf2uni_strcpy(uword, (const char *)word);
 223
 224     /* build the new word string */
 225     i = 0;
 226     while (uword[i]>0) {
 227         u = uword[i++];
 228         if (u == '-') {        /* skip */
 229         } else if (u == '=') {
 230             STORE_CHAR(id,'-');
 231         } else if (u == '{') {
 232             u = uword[i++];
 233             items = 0;
 234             while (u && u != '}') {
 235                 u = uword[i++];
 236             }
 237             if (u == '}') {
 238                 items++;
 239                 u = uword[i++];
 240             }
 241             while (u && u != '}') {
 242                 u = uword[i++];
 243             }
 244             if (u == '}') {
 245                 items++;
 246                 u = uword[i++];
 247             }
 248             if (u == '{') {
 249                 u = uword[i++];
 250             }
 251             while (u && u != '}') {
 252                 STORE_CHAR(id,u);
 253                 u = uword[i++];
 254             }
 255             if (u == '}') {
 256                 items++;
 257             }
 258             if (items != 3) {   /* syntax error */
 259                 *cleaned = NULL;
 260                 tex_error("exception syntax error", NULL);
 261                 return s;
 262             }
 263         } else {
 264             STORE_CHAR(id,u);
 265         }
 266     }
 267     *uindex = '\0';
 268     *cleaned = xstrdup((char *) word);
 269     return s;
 270 }
 271
 272 @ @c
 273 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
 274 {
 275     const char *s;
 276     const char *value;
 277     char *cleaned;
 278     int id ;
 279     lua_State *L = Luas;
 280     if (lang == NULL)
 281         return;
 282     if (lang->exceptions == 0) {
 283         lua_newtable(L);
 284         lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
 285     }
 286     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 287     s = (const char *) buff;
 288     id = lang->id;
 289     while (*s) {
 290         while (isspace((unsigned char)*s))
 291             s++;
 292         if (*s) {
 293             value = s;
 294             s = clean_hyphenation(id, s, &cleaned);
 295             if (cleaned != NULL) {
 296                 if ((s - value) > 0) {
 297                     lua_pushstring(L, cleaned);
 298                     lua_pushlstring(L, value, (size_t) (s - value));
 299                     lua_rawset(L, -3);
 300                 }
 301                 free(cleaned);
 302             } else {
 303 #ifdef VERBOSE
 304                 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
 305 #endif
 306             }
 307         }
 308     }
 309 }
 310
 311 void clear_hyphenation(struct tex_language *lang)
 312 {
 313     if (lang == NULL)
 314         return;
 315     if (lang->exceptions != 0) {
 316         luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 317         lang->exceptions = 0;
 318     }
 319 }
 320
 321 void load_tex_hyphenation(int curlang, halfword head)
 322 {
 323     char *s = tokenlist_to_cstring(head, 1, NULL);
 324     load_hyphenation(get_language(curlang), (unsigned char *) s);
 325 }
 326
 327 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very nice,
 328 but needed. Also, in the post-break, it would be nicer to get the attribute list
 329 from |vlink(n)|. No rush, as it is currently not used much.
 330
 331 @c
 332 halfword insert_discretionary(halfword t, halfword pre, halfword post,
 333                               halfword replace, int penalty)
 334 {
 335     halfword g, n;
 336     int f;
 337     n = new_node(disc_node, syllable_disc);
 338     disc_penalty(n) = penalty;
 339     try_couple_nodes(n, vlink(t));
 340     couple_nodes(t, n);
 341     if (replace != null)
 342         f = font(replace);
 343     else
 344         f = get_cur_font();     /* for compound words following explicit hyphens */
 345     for (g = pre; g != null; g = vlink(g)) {
 346         font(g) = f;
 347         if (node_attr(t) != null) {
 348             delete_attribute_ref(node_attr(g));
 349             node_attr(g) = node_attr(t);
 350             attr_list_ref(node_attr(t)) += 1;
 351         }
 352     }
 353     for (g = post; g != null; g = vlink(g)) {
 354         font(g) = f;
 355         if (node_attr(t) != null) {
 356             delete_attribute_ref(node_attr(g));
 357             node_attr(g) = node_attr(t);
 358             attr_list_ref(node_attr(t)) += 1;
 359         }
 360     }
 361     for (g = replace; g != null; g = vlink(g)) {
 362         if (node_attr(t) != null) {
 363             delete_attribute_ref(node_attr(g));
 364             node_attr(g) = node_attr(t);
 365             attr_list_ref(node_attr(t)) += 1;
 366         }
 367     }
 368     if (node_attr(t) != null) {
 369         delete_attribute_ref(node_attr(vlink(t)));
 370         node_attr(vlink(t)) = node_attr(t);
 371         attr_list_ref(node_attr(t)) += 1;
 372     }
 373     t = vlink(t);
 374     set_disc_field(pre_break(t), pre);
 375     set_disc_field(post_break(t), post);
 376     set_disc_field(no_break(t), replace);
 377     return t;
 378 }
 379
 380 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
 381 {
 382     halfword g, n;
 383     n = new_node(disc_node, syllable_disc);
 384     disc_penalty(n) = int_par(hyphen_penalty_code);
 385     couple_nodes(n, vlink(t));
 386     couple_nodes(t, n);
 387     delete_attribute_ref(node_attr(n));
 388     if (node_attr(t) != null) {
 389         node_attr(n) = node_attr(t);
 390         attr_list_ref(node_attr(t))++;
 391     } else {
 392         node_attr(n) = null;
 393     }
 394     if (lan->pre_hyphen_char > 0) {
 395         g = raw_glyph_node();
 396         set_to_character(g);
 397         character(g) = lan->pre_hyphen_char;
 398         font(g) = font(t);
 399         lang_data(g) = lang_data(t);
 400         if (node_attr(t) != null) {
 401             node_attr(g) = node_attr(t);
 402             attr_list_ref(node_attr(t))++;
 403         }
 404         set_disc_field(pre_break(n), g);
 405     }
 406
 407     if (lan->post_hyphen_char > 0) {
 408         t = vlink(n);
 409         g = raw_glyph_node();
 410         set_to_character(g);
 411         character(g) = lan->post_hyphen_char;
 412         font(g) = font(t);
 413         lang_data(g) = lang_data(t);
 414         if (node_attr(t) != null) {
 415             node_attr(g) = node_attr(t);
 416             attr_list_ref(node_attr(t)) += 1;
 417         }
 418         set_disc_field(post_break(n), g);
 419     }
 420     return n;
 421 }
 422
 423 halfword insert_word_discretionary(halfword t, lang_variables * lan)
 424 {
 425     halfword pre = null, pos = null;
 426     if (lan->pre_exhyphen_char > 0)
 427         pre = insert_character(null, lan->pre_exhyphen_char);
 428     if (lan->post_exhyphen_char > 0)
 429         pos = insert_character(null, lan->post_exhyphen_char);
 430     return insert_discretionary(t, pre, pos, null,int_par(ex_hyphen_penalty_code));
 431 }
 432
 433 @ @c
 434 halfword compound_word_break(halfword t, int clang)
 435 {
 436     int disc;
 437     lang_variables langdata;
 438     langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
 439     langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
 440     disc = insert_word_discretionary(t, &langdata);
 441     return disc;
 442 }
 443
 444 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
 445                                       halfword pre, halfword pos,
 446                                       halfword replace)
 447 {
 448     (void) lan;
 449     return insert_discretionary(t, pre, pos, replace,int_par(hyphen_penalty_code));
 450 }
 451
 452 halfword insert_character(halfword t, int c)
 453 {
 454     halfword p;
 455     p = new_node(glyph_node, 0);
 456     set_to_character(p);
 457     character(p) = c;
 458     if (t != null) {
 459         couple_nodes(t, p);
 460     }
 461     return p;
 462 }
 463
 464 @ @c
 465 void set_disc_field(halfword f, halfword t)
 466 {
 467     if (t != null) {
 468         couple_nodes(f, t);
 469         tlink(f) = tail_of_list(t);
 470     } else {
 471         vlink(f) = null;
 472         tlink(f) = null;
 473     }
 474 }
 475
 476 @ @c
 477 static char *hyphenation_exception(int exceptions, char *w)
 478 {
 479     char *ret = NULL;
 480     lua_State *L = Luas;
 481     lua_checkstack(L, 2);
 482     lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
 483     if (lua_istable(L, -1)) {   /* ?? */
 484         lua_pushstring(L, w);   /* word table */
 485         lua_rawget(L, -2);
 486         if (lua_type(L, -1) == LUA_TSTRING) {
 487             ret = xstrdup(lua_tostring(L, -1));
 488         }
 489         lua_pop(L, 2);
 490     } else {
 491         lua_pop(L, 1);
 492     }
 493     return ret;
 494 }
 495
 496 @ @c
 497 char *exception_strings(struct tex_language *lang)
 498 {
 499     const char *value;
 500     size_t size = 0, current = 0;
 501     size_t l = 0;
 502     char *ret = NULL;
 503     lua_State *L = Luas;
 504     if (lang->exceptions == 0)
 505         return NULL;
 506     lua_checkstack(L, 2);
 507     lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
 508     if (lua_istable(L, -1)) {
 509         /* iterate and join */
 510         lua_pushnil(L);         /* first key */
 511         while (lua_next(L, -2) != 0) {
 512             value = lua_tolstring(L, -1, &l);
 513             if (current + 2 + l > size) {
 514                 ret = xrealloc(ret, (unsigned) ((size + size / 5) + current + l + 1024));
 515                 size = (size + size / 5) + current + l + 1024;
 516             }
 517             *(ret + current) = ' ';
 518             strcpy(ret + current + 1, value);
 519             current += l + 1;
 520             lua_pop(L, 1);
 521         }
 522     }
 523     return ret;
 524 }
 525
  526 @ The sequence from |wordstart| to |r| can contain only normal characters. It
  527 could be faster to modify a halfword pointer and return an integer.
 528
 529 @c
 530 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
 531 {
 532     halfword g = null, gg = null;
 533     register unsigned i = *j;
 534     i++;                        /* this puts uword[i] on the |{| */
 535     while (i < (unsigned) len && uword[i + 1] != '}') {
 536         if (g == null) {
 537             gg = new_char(0, (int) uword[i + 1]);
 538             g = gg;
 539         } else {
 540             halfword s = new_char(0, (int) uword[i + 1]);
 541             couple_nodes(g, s);
 542             g = vlink(g);
 543         }
 544         i++;
 545     }
 546     *j = ++i;
 547     return gg;
 548 }
 549
 550 static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
 551 {
 552     int ret = 0;
 553     register unsigned i = *j;
 554     i++;                        /* this puts uword[i] on the |{| */
 555     while (i < (unsigned) len && uword[i + 1] != '}') {
 556         ret++;
 557         i++;
 558     }
 559     *j = ++i;
 560     return ret;
 561 }
 562
 563 @ @c
 564 static const char *PAT_ERROR[] = {
 565     "Exception discretionaries should contain three pairs of braced items.",
 566     "No intervening spaces are allowed.",
 567     NULL
 568 };
 569
 570 /*
 571     The exceptions are taken as-is: no min values are taken into account. One can
 572     add normal patterns on-the-fly if needed.
 573 */
 574
 575 static void do_exception(halfword wordstart, halfword r, char *replacement)
 576 {
 577     unsigned i;
 578     halfword t;
 579     unsigned len;
 580     int clang;
 581     lang_variables langdata;
 582     unsigned uword[MAX_WORD_LEN + 1] = { 0 };
 583     utf2uni_strcpy(uword, replacement);
 584     len = u_length(uword);
 585     i = 0;
 586     t = wordstart;
 587     clang = char_lang(wordstart);
 588     langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 589     langdata.post_hyphen_char = get_post_hyphen_char(clang);
 590
 591     for (i = 0; i < len; i++) {
 592         if (uword[i + 1] == '-') {      /* a hyphen follows */
 593             while (vlink(t) != r && (type(t) != glyph_node || !is_simple_character(t)))
 594                 t = vlink(t);
 595             if (vlink(t) == r)
 596                 break;
 597             insert_syllable_discretionary(t, &langdata);
 598             t = vlink(t);       /* skip the new disc */
 599         } else if (uword[i + 1] == '=') {
 600             /* do nothing ? */
 601             t = vlink(t);
 602         } else if (uword[i + 1] == '{') {
 603             halfword gg, hh, replace = null;
 604             int repl;
 605             gg = find_exception_part(&i, uword, (int) len);
 606             if (i == len || uword[i + 1] != '{') {
 607                 tex_error("broken pattern 1", PAT_ERROR);
 608             }
 609             hh = find_exception_part(&i, uword, (int) len);
 610             if (i == len || uword[i + 1] != '{') {
 611                 tex_error("broken pattern 2", PAT_ERROR);
 612             }
 613             repl = count_exception_part(&i, uword, (int) len);
 614             if (i == len) {
 615                 tex_error("broken pattern 3", PAT_ERROR);
 616             }
 617             /*i++;  *//* jump over the last right brace */
 618             if (vlink(t) == r)
 619                 break;
 620             if (repl > 0) {
 621                 halfword q = t;
 622                 replace = vlink(q);
 623                 while (repl > 0 && q != null) {
 624                     q = vlink(q);
 625                     if (type(q) == glyph_node) {
 626                         repl--;
 627                     }
 628                 }
 629                 try_couple_nodes(t, vlink(q));
 630                 vlink(q) = null;
 631             }
 632             t = insert_discretionary(t, gg, hh, replace,int_par(hyphen_penalty_code));
 633             t = vlink(t);       /* skip the new disc */
 634         } else {
 635             t = vlink(t);
 636         }
 637     }
 638 }
 639
 640 @ This is a documentation section from the pascal web file. It is not true any
 641 more, but I do not have time right now to rewrite it -- Taco
 642
 643 When the line-breaking routine is unable to find a feasible sequence of
 644 breakpoints, it makes a second pass over the paragraph, attempting to hyphenate
 645 the hyphenatable words. The goal of hyphenation is to insert discretionary
 646 material into the paragraph so that there are more potential places to break.
 647
 648 The general rules for hyphenation are somewhat complex and technical, because we
 649 want to be able to hyphenate words that are preceded or followed by punctuation
 650 marks, and because we want the rules to work for languages other than English. We
 651 also must contend with the fact that hyphens might radically alter the ligature
 652 and kerning structure of a word.
 653
 654 A sequence of characters will be considered for hyphenation only if it belongs to
 655 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
 656 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are
 657 either character or ligature or whatsit or implicit kern nodes, and $p_m$ is a
 658 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
 659 (Therefore hyphenation is disabled by boxes, math formulas, and discretionary
 660 nodes already inserted by the user.) The ligature nodes among $p_1\ldots p_{m-1}$
 661 are effectively expanded into the original non-ligature characters; the kern
 662 nodes and whatsits are ignored. Each character |c| is now classified as either a
 663 nonletter (if |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an
 664 uppercase letter (otherwise); an uppercase letter is treated as if it were
 665 |lc_code(c)| for purposes of hyphenation. The characters generated by $p_1\ldots
 666 p_{m-1}$ may begin with nonletters; let $c_1$ be the first letter that is not in
 667 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a whatsit
 668 found after $c_1$ will be the terminating node $p_m$. All characters that do not
 669 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
 670 that font must be between 0 and 255, otherwise hyphenation will not be attempted.
 671 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible;
 672 however, |n| must be less than 64, so a character that would otherwise be
 673 $c_{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
 674 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
 675 generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|,
 676 this string qualifies for hyphenation; however, |uc_hyph| must be positive, if
 677 $c_1$ is uppercase.
 678
 679 The hyphenation process takes place in three stages. First, the candidate
 680 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens are
 681 determined by referring to hyphenation tables; and finally, the nodes $p_a\ldots
 682 p_b$ are replaced by a new sequence of nodes that includes the discretionary
 683 breaks found.
 684
 685 Fortunately, we do not have to do all this calculation very often, because of the
 686 way it has been taken out of \TeX's inner loop. For example, when the second
 687 edition of the author's 700-page book {\sl Seminumerical Algorithms} was typeset
 688 by \TeX, only about 1.2 hyphenations needed to be @^Knuth, Donald Ervin@> tried
 689 per paragraph, since the line breaking algorithm needed to use two passes on only
 690 about 5 per cent of the paragraphs.
 691
  692 When a word has been set up to contain a candidate for hyphenation, \TeX\ first looks
 693 to see if it is in the user's exception dictionary. If not, hyphens are inserted
 694 based on patterns that appear within the given word, using an algorithm due to
 695 Frank~M. Liang. @^Liang, Franklin Mark@>
 696
 697 @ This is incompatible with TEX because the first word of a paragraph can be
 698 hyphenated, but most european users seem to agree that prohibiting hyphenation
 699 there was not the best idea ever.
 700
 701 @c
 702 static halfword find_next_wordstart(halfword r, halfword first_language)
 703 {
 704     register int l;
 705     register int start_ok = 1;
 706     int mathlevel = 1;
 707     int chr ;
 708     halfword t ;
 709     while (r != null) {
 710         switch (type(r)) {
 711         case boundary_node:
 712             if (subtype(r) == word_boundary) {
 713                 start_ok = 1;
 714             }
 715             break;
 716         case whatsit_node:
 717             break;
 718         case glue_node:
 719             start_ok = 1;
 720             break;
 721         case math_node:
 722             while (mathlevel > 0) {
 723                 r = vlink(r);
 724                 if (r == null)
 725                     return r;
 726                 if (type(r) == math_node) {
 727                     if (subtype(r) == before) {
 728                         mathlevel++;
 729                     } else {
 730                         mathlevel--;
 731                     }
 732                 }
 733             }
 734             break;
 735         case glyph_node:
 736             if (is_simple_character(r)) {
 737                 chr = character(r) ;
 738                 if (chr == ex_hyphen_char) {
 739                     /*
 740                         We only accept an explicit hyphen when there is a preceding glyph and
 741                         we skip a sequence of explicit hyphens as that normally indicates a
 742                         -- or --- ligature in which case we can in a worse case usage get bad
 743                         node lists later on due to messed up ligature building as these dashes
 744                         are ligatures in base fonts. This is a side effect of the separating the
 745                         hyphenation, ligaturing and kerning steps. A test is cmr with ------.
 746                     */
 747                     t = vlink(r) ;
 748                     if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
 749                         t = compound_word_break(r, char_lang(r));
 750                         subtype(t) = automatic_disc;
 751                         start_ok = 1 ;
 752                     } else {
 753                         start_ok = 0;
 754                     }
 755                 } else if (start_ok && (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
 756                     if (char_uchyph(r) || l == chr) {
 757                         return r;
 758                     } else {
 759                         start_ok = 0;
 760                     }
 761                 }
 762             }
 763             break;
 764         default:
 765             start_ok = 0;
 766             break;
 767         }
 768         r = vlink(r);
 769     }
 770     return r;
 771 }
 772
 773 @ @c
 774 static int valid_wordend(halfword s)
 775 {
 776     register halfword r = s;
 777     register int clang = char_lang(s);
 778     if (r == null)
 779         return 1;
 780     while ((r != null) && (   (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
 781                            || (type(r) == kern_node && (subtype(r) == normal))
 782            )) {
 783         r = vlink(r);
 784     }
 785     if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
 786                   ||  type(r) == glue_node
 787                   ||  type(r) == boundary_node
 788                   ||  type(r) == whatsit_node
 789                   ||  type(r) == ins_node
 790                   ||  type(r) == adjust_node
 791                   ||  type(r) == penalty_node
 792                   || (type(r) == kern_node && (subtype(r) == explicit_kern ||
 793                                                subtype(r) == italic_kern   ||
 794                                                subtype(r) == accent_kern   )))
 795         return 1;
 796     return 0;
 797 }
 798
 799 @ @c
 800 void hnj_hyphenation(halfword head, halfword tail)
 801 {
 802     int lchar, i;
 803     struct tex_language *lang;
 804     lang_variables langdata;
 805     char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
 806     int wordlen = 0;
 807     char *hy = utf8word;
 808     char *replacement = NULL;
 809     boolean explicit_hyphen = false;
 810     halfword first_language = int_par(first_valid_language_code);
 811     halfword s, r = head, wordstart = null, save_tail1 = null, left = null, right = null;
 812
 813     /* this first movement assures two things:
 814      \item{a)} that we won't waste lots of time on something that has been
 815       handled already (in that case, none of the glyphs match |simple_character|).
 816      \item{b)} that the first word can be hyphenated. if the movement was
 817      not explicit, then the indentation at the start of a paragraph
 818      list would make |find_next_wordstart()| look too far ahead.
 819      */
 820
 821     while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
 822         r = vlink(r);
 823     }
 824     /* this will make |r| a glyph node with subtype character */
 825     r = find_next_wordstart(r,first_language);
 826     if (r == null)
 827         return;
 828
 829     assert(tail != null);
 830     save_tail1 = vlink(tail);
 831     s = new_penalty(0);
 832     couple_nodes(tail, s);
 833
 834     while (r != null) {         /* could be while(1), but let's be paranoid */
 835         int clang, lhmin, rhmin, hmin;
 836         halfword hyf_font;
 837         halfword end_word = r;
 838         wordstart = r;
 839         assert(is_simple_character(wordstart));
 840         hyf_font = font(wordstart);
 841         if (hyphen_char(hyf_font) < 0)  /* for backward compat */
 842             hyf_font = 0;
 843         clang = char_lang(wordstart);
 844         lhmin = char_lhmin(wordstart);
 845         rhmin = char_rhmin(wordstart);
 846         hmin = get_hyphenation_min(clang);
 847         langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 848         langdata.post_hyphen_char = get_post_hyphen_char(clang);
 849         while (    r != null
 850                 && type(r) == glyph_node
 851                 && is_simple_character(r)
 852                 && clang == char_lang(r)
 853                 && (    (     (clang >= first_language)
 854                            && (lchar = get_hj_code(clang,character(r))) > 0
 855                         )
 856                      || (     character(r) == ex_hyphen_char
 857                            && (lchar = ex_hyphen_char)
 858                         )
 859                    )
 860               ) {
 861             if (character(r) == ex_hyphen_char)
 862                 explicit_hyphen = true;
 863             wordlen++;
 864             hy = uni2string(hy, (unsigned) lchar);
 865             /* this should not be needed  any more */
 866             /*if (vlink(r)!=null) alink(vlink(r))=r; */
 867             end_word = r;
 868             r = vlink(r);
 869         }
 870         if (     valid_wordend(r)
 871               && clang >= first_language
 872               && wordlen >= lhmin + rhmin
 873               && (hmin <= 0 || wordlen >= hmin)
 874               && (hyf_font != 0)
 875               && (lang = tex_languages[clang]) != NULL
 876            ) {
 877             *hy = 0;
 878             if (    lang->exceptions != 0
 879                  && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
 880                ) {
 881 #ifdef VERBOSE
 882                 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
 883 #endif
 884                 do_exception(wordstart, r, replacement);
 885                 free(replacement);
 886             } else if (explicit_hyphen == true) {
 887                 /* insert an explicit discretionary after each of the last in a
 888                    set of explicit hyphens */
 889                 halfword rr = r;
 890                 halfword t = null;
 891 #ifdef VERBOSE
 892                 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
 893 #endif
 894                 while (rr != wordstart) {
 895                     if (is_simple_character(rr)) {
 896                         if (character(rr) == ex_hyphen_char) {
 897                             t = compound_word_break(rr, clang);
 898                             subtype(t) = automatic_disc;
 899                             while (character(alink(rr)) == ex_hyphen_char)
 900                                 rr = alink(rr);
 901                             if (rr == wordstart)
 902                                 break;
 903                         }
 904                     }
 905                     rr = alink(rr);
 906                 }
 907             } else if (lang->patterns != NULL) {
 908                 left = wordstart;
 909                 for (i = lhmin; i > 1; i--) {
 910                     left = vlink(left);
 911                     while (!is_simple_character(left))
 912                         left = vlink(left);
 913                 }
 914                 right = r;
 915                 for (i = rhmin; i > 0; i--) {
 916                     right = alink(right);
 917                     while (!is_simple_character(right))
 918                         right = alink(right);
 919                 }
 920 #ifdef VERBOSE
 921                 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
 922                     utf8word, clang, lhmin, rhmin, character(left), character(right));
 923 #endif
 924                 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
 925             }
 926         }
 927         explicit_hyphen = false;
 928         wordlen = 0;
 929         hy = utf8word;
 930         if (r == null)
 931             break;
 932         r = find_next_wordstart(r,first_language);
 933     }
 934     flush_node(vlink(tail));
 935     vlink(tail) = save_tail1;
 936 }
 937
 938 @ @c
 939 void new_hyphenation(halfword head, halfword tail)
 940 {
 941     register int callback_id = 0;
 942     if (head == null || vlink(head) == null)
 943         return;
 944     fix_node_list(head);
 945     callback_id = callback_defined(hyphenate_callback);
 946     if (callback_id > 0) {
 947         lua_State *L = Luas;
 948         if (!get_callback(L, callback_id)) {
 949             lua_pop(L, 2);
 950             return;
 951         }
 952         nodelist_to_lua(L, head);
 953         nodelist_to_lua(L, tail);
 954         if (lua_pcall(L, 2, 0, 0) != 0) {
 955             formatted_warning("hyphenation","bad specification: %s",lua_tostring(L, -1));
 956             lua_pop(L, 2);
 957             lua_error(L);
 958             return;
 959         }
 960         lua_pop(L, 1);
 961     } else if (callback_id == 0) {
 962         hnj_hyphenation(head, tail);
 963     }
 964 }
 965
@ Dumping and undumping languages.
 967
 968 @c
 969 #define dump_string(a)                \
 970   if (a!=NULL) {                      \
 971       x = (int)strlen(a)+1;           \
 972     dump_int(x);  dump_things(*a, x); \
 973   } else {                            \
 974     x = 0; dump_int(x);               \
 975   }
 976
 977 static void dump_one_language(int i)
 978 {
 979     char *s = NULL;
 980     int x = 0;
 981     struct tex_language *lang;
 982     lang = tex_languages[i];
 983     dump_int(lang->id);
 984     dump_int(lang->pre_hyphen_char);
 985     dump_int(lang->post_hyphen_char);
 986     dump_int(lang->pre_exhyphen_char);
 987     dump_int(lang->post_exhyphen_char);
 988     dump_int(lang->hyphenation_min);
 989     if (lang->patterns != NULL) {
 990         s = (char *) hnj_serialize(lang->patterns);
 991     }
 992     dump_string(s);
 993     if (s != NULL) {
 994         free(s);
 995         s = NULL;
 996     }
 997     if (lang->exceptions != 0)
 998         s = exception_strings(lang);
 999     dump_string(s);
1000     if (s != NULL) {
1001         free(s);
1002     }
1003     free(lang);
1004 }
1005
1006 void dump_language_data(void)
1007 {
1008     int i;
1009     dump_int(next_lang_id);
1010     for (i = 0; i < next_lang_id; i++) {
1011         if (tex_languages[i]) {
1012             dump_int(1);
1013             dump_one_language(i);
1014         } else {
1015             dump_int(0);
1016         }
1017     }
1018 }
1019
1020 static void undump_one_language(int i)
1021 {
1022     char *s = NULL;
1023     int x = 0;
1024     struct tex_language *lang = get_language(i);
1025     undump_int(x);
1026     lang->id = x;
1027     undump_int(x);
1028     lang->pre_hyphen_char = x;
1029     undump_int(x);
1030     lang->post_hyphen_char = x;
1031     undump_int(x);
1032     lang->pre_exhyphen_char = x;
1033     undump_int(x);
1034     lang->post_exhyphen_char = x;
1035     undump_int(x);
1036     lang->hyphenation_min = x;
1037     /* patterns */
1038     undump_int(x);
1039     if (x > 0) {
1040         s = xmalloc((unsigned) x);
1041         undump_things(*s, x);
1042         load_patterns(lang, (unsigned char *) s);
1043         free(s);
1044     }
1045     /* exceptions */
1046     undump_int(x);
1047     if (x > 0) {
1048         s = xmalloc((unsigned) x);
1049         undump_things(*s, x);
1050         load_hyphenation(lang, (unsigned char *) s);
1051         free(s);
1052     }
1053 }
1054
1055 void undump_language_data(void)
1056 {
1057     int i, x, numlangs;
1058     undump_int(numlangs);
1059     next_lang_id = numlangs;
1060     for (i = 0; i < numlangs; i++) {
1061         undump_int(x);
1062         if (x == 1) {
1063             undump_one_language(i);
1064         }
1065     }
1066 }
1067
1068 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1069 |new_hyph_exceptions| to do the right thing.
1070
1071 @c
1072 void new_hyph_exceptions(void)
1073 {                               /* enters new exceptions */
1074     (void) scan_toks(false, true);
1075     load_tex_hyphenation(int_par(language_code), def_ref);
1076     flush_list(def_ref);
1077 }
1078
1079 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1080 procedure named |new_patterns|.
1081
1082 @c
1083 void new_patterns(void)
1084 {                               /* initializes the hyphenation pattern data */
1085     (void) scan_toks(false, true);
1086     load_tex_patterns(int_par(language_code), def_ref);
1087     flush_list(def_ref);
1088 }
1089
@ `\.{\\prehyphenchar}' sets the |pre_break| character, and
`\.{\\posthyphenchar}' sets the |post_break| character. Their respective
defaults are the ASCII hyphen ("-") and zero (nul).
1093
1094 @c
1095 void new_pre_hyphen_char(void)
1096 {
1097     scan_optional_equals();
1098     scan_int();
1099     set_pre_hyphen_char(int_par(language_code), cur_val);
1100 }
1101
1102 void new_post_hyphen_char(void)
1103 {
1104     scan_optional_equals();
1105     scan_int();
1106     set_post_hyphen_char(int_par(language_code), cur_val);
1107 }
1108
@ `\.{\\preexhyphenchar}' sets the |pre_break| character, and
`\.{\\postexhyphenchar}' sets the |post_break| character. Their defaults are
both zero (nul).
1112
1113 @c
1114 void new_pre_exhyphen_char(void)
1115 {
1116     scan_optional_equals();
1117     scan_int();
1118     set_pre_exhyphen_char(int_par(language_code), cur_val);
1119 }
1120
1121 void new_post_exhyphen_char(void)
1122 {
1123     scan_optional_equals();
1124     scan_int();
1125     set_post_exhyphen_char(int_par(language_code), cur_val);
1126 }
1127
1128 void new_hyphenation_min(void)
1129 {
1130     scan_optional_equals();
1131     scan_int();
1132     set_hyphenation_min(int_par(language_code), cur_val);
1133 }
1134
1135 void new_hj_code(void)
1136 {
1137     int i ;
1138     scan_int();
1139     i = cur_val;
1140     scan_optional_equals();
1141     scan_int();
1142     set_hj_code(int_par(language_code), i, cur_val, -1);
1143 }