source/texk/web2c/luatexdir/lang/texlang.w

   1 % texlang.w
   2 %
   3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
   4 %
   5 % This file is part of LuaTeX.
   6 %
   7 % LuaTeX is free software; you can redistribute it and/or modify it under
   8 % the terms of the GNU General Public License as published by the Free
   9 % Software Foundation; either version 2 of the License, or (at your
  10 % option) any later version.
  11 %
  12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
  13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 % FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 % License for more details.
  16 %
  17 % You should have received a copy of the GNU General Public License along
  18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
  19
  20 @ @c
  21
  22 #include "ptexlib.h"
  23 #include <string.h>
  24 #include "lua/luatex-api.h"
  25
  26 @ Low-level helpers
  27
  28 @ @c
/* only |unVERBOSE| is defined here, so the warning guarded by |VERBOSE| in |load_hyphenation| is compiled out */
#define unVERBOSE

/* number of slots in the language table below */
#define MAX_TEX_LANGUAGES  16384

/* language objects indexed by language id; an unused slot holds |NULL| */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* slot at which |new_language| starts looking when it has to pick an id itself */
static int next_lang_id = 0;
  36
  37 struct tex_language *new_language(int n)
  38 {
  39     struct tex_language *lang;
  40     unsigned l;
  41     if (n >= 0) {
  42         l = (unsigned) n;
  43         if (l != (MAX_TEX_LANGUAGES - 1))
  44             if (next_lang_id <= n)
  45                 next_lang_id = n + 1;
  46     } else {
  47         while (tex_languages[next_lang_id] != NULL)
  48             next_lang_id++;
  49         l = (unsigned) next_lang_id++;
  50     }
  51     if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
  52         lang = xmalloc(sizeof(struct tex_language));
  53         tex_languages[l] = lang;
  54         lang->id = (int) l;
  55         lang->exceptions = 0;
  56         lang->patterns = NULL;
  57         lang->pre_hyphen_char = '-';
  58         lang->post_hyphen_char = 0;
  59         lang->pre_exhyphen_char = 0;
  60         lang->post_exhyphen_char = 0;
  61         lang->hyphenation_min = -1;
  62         if (saving_hyph_codes_par) {
  63             hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
  64         }
  65         return lang;
  66     } else {
  67         return NULL;
  68     }
  69 }
  70
  71 struct tex_language *get_language(int n)
  72 {
  73     if (n >= 0 && n < MAX_TEX_LANGUAGES) {
  74         if (tex_languages[n] != NULL) {
  75             return tex_languages[n];
  76         } else {
  77             return new_language(n);
  78         }
  79     } else {
  80         return NULL;
  81     }
  82 }
  83
  84 @ @c
  85 void set_pre_hyphen_char(int n, int v)
  86 {
  87     struct tex_language *l = get_language((int) n);
  88     if (l != NULL)
  89         l->pre_hyphen_char = (int) v;
  90 }
  91
  92 void set_post_hyphen_char(int n, int v)
  93 {
  94     struct tex_language *l = get_language((int) n);
  95     if (l != NULL)
  96         l->post_hyphen_char = (int) v;
  97 }
  98
  99 void set_pre_exhyphen_char(int n, int v)
 100 {
 101     struct tex_language *l = get_language((int) n);
 102     if (l != NULL)
 103         l->pre_exhyphen_char = (int) v;
 104 }
 105
 106 void set_post_exhyphen_char(int n, int v)
 107 {
 108     struct tex_language *l = get_language((int) n);
 109     if (l != NULL)
 110         l->post_exhyphen_char = (int) v;
 111 }
 112
 113 int get_pre_hyphen_char(int n)
 114 {
 115     struct tex_language *l = get_language((int) n);
 116     if (l == NULL)
 117         return -1;
 118     return (int) l->pre_hyphen_char;
 119 }
 120
 121 int get_post_hyphen_char(int n)
 122 {
 123     struct tex_language *l = get_language((int) n);
 124     if (l == NULL)
 125         return -1;
 126     return (int) l->post_hyphen_char;
 127 }
 128
 129 int get_pre_exhyphen_char(int n)
 130 {
 131     struct tex_language *l = get_language((int) n);
 132     if (l == NULL)
 133         return -1;
 134     return (int) l->pre_exhyphen_char;
 135 }
 136
 137 int get_post_exhyphen_char(int n)
 138 {
 139     struct tex_language *l = get_language((int) n);
 140     if (l == NULL)
 141         return -1;
 142     return (int) l->post_exhyphen_char;
 143 }
 144
 145 void set_hyphenation_min(int n, int v)
 146 {
 147     struct tex_language *l = get_language((int) n);
 148     if (l != NULL)
 149         l->hyphenation_min = (int) v;
 150 }
 151
 152 int get_hyphenation_min(int n)
 153 {
 154     struct tex_language *l = get_language((int) n);
 155     if (l == NULL)
 156         return -1;
 157     return (int) l->hyphenation_min;
 158 }
 159
 160 void load_patterns(struct tex_language *lang, const unsigned char *buff)
 161 {
 162     if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
 163         return;
 164     if (lang->patterns == NULL) {
 165         lang->patterns = hnj_hyphen_new();
 166     }
 167     hnj_hyphen_load(lang->patterns, buff);
 168 }
 169
 170 void clear_patterns(struct tex_language *lang)
 171 {
 172     if (lang == NULL)
 173         return;
 174     if (lang->patterns != NULL) {
 175         hnj_hyphen_clear(lang->patterns);
 176     }
 177 }
 178
 179 void load_tex_patterns(int curlang, halfword head)
 180 {
 181     char *s = tokenlist_to_cstring(head, 1, NULL);
 182     load_patterns(get_language(curlang), (unsigned char *) s);
 183 }
 184
 185 @ @c
 186 #define STORE_CHAR(l,x) do { \
 187     unsigned xx = get_hj_code(l,x); \
 188     if (!xx || xx <= 32) { \
 189         xx = x; \
 190     } \
 191     uindex = uni2string(uindex, xx); \
 192 } while (0)
 193
@ Cleans one word, which is returned in |cleaned|; returns the new offset into
|buff|.
 196
 197 @c
 198 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
 199 {
 200     int items = 0;
 201     unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
 202     unsigned uword[MAX_WORD_LEN + 1] = { 0 };  /* work buffer for unicode */
 203     int u = 0; /* unicode buffer value */
 204     int i = 0; /* index into buffer */
 205     char *uindex = (char *)word;
 206     const char *s = buff;
 207
 208     while (*s && !isspace((unsigned char)*s)) {
 209         word[i++] = (unsigned)*s;
 210         s++;
 211         if ((s-buff)>MAX_WORD_LEN) {
 212             /* todo: this is too strict, should count unicode, not bytes */
 213             *cleaned = NULL;
 214             tex_error("exception too long", NULL);
 215             return s;
 216         }
 217     }
 218     /* now convert the input to unicode */
 219     word[i] = '\0';
 220     utf2uni_strcpy(uword, (const char *)word);
 221
 222     /* build the new word string */
 223     i = 0;
 224     while (uword[i]>0) {
 225         u = uword[i++];
 226         if (u == '-') {        /* skip */
 227         } else if (u == '=') {
 228             STORE_CHAR(id,'-');
 229         } else if (u == '{') {
 230             u = uword[i++];
 231             items = 0;
 232             while (u && u != '}') {
 233                 u = uword[i++];
 234             }
 235             if (u == '}') {
 236                 items++;
 237                 u = uword[i++];
 238             }
 239             while (u && u != '}') {
 240                 u = uword[i++];
 241             }
 242             if (u == '}') {
 243                 items++;
 244                 u = uword[i++];
 245             }
 246             if (u == '{') {
 247                 u = uword[i++];
 248             }
 249             while (u && u != '}') {
 250                 STORE_CHAR(id,u);
 251                 u = uword[i++];
 252             }
 253             if (u == '}') {
 254                 items++;
 255             }
 256             if (items != 3) {   /* syntax error */
 257                 *cleaned = NULL;
 258                 tex_error("exception syntax error", NULL);
 259                 return s;
 260             }
 261         } else {
 262             STORE_CHAR(id,u);
 263         }
 264     }
 265     *uindex = '\0';
 266     *cleaned = xstrdup((char *) word);
 267     return s;
 268 }
 269
 270 @ @c
 271 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
 272 {
 273     const char *s;
 274     const char *value;
 275     char *cleaned;
 276     int id ;
 277     if (lang == NULL)
 278         return;
 279     if (lang->exceptions == 0) {
 280         lua_newtable(Luas);
 281         lang->exceptions = luaL_ref(Luas, LUA_REGISTRYINDEX);
 282     }
 283     lua_rawgeti(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 284     s = (const char *) buff;
 285     id = lang->id;
 286     while (*s) {
 287         while (isspace((unsigned char)*s))
 288             s++;
 289         if (*s) {
 290             value = s;
 291             s = clean_hyphenation(id, s, &cleaned);
 292             if (cleaned != NULL) {
 293                 if ((s - value) > 0) {
 294                     lua_pushstring(Luas, cleaned);
 295                     lua_pushlstring(Luas, value, (size_t) (s - value));
 296                     lua_rawset(Luas, -3);
 297                 }
 298                 free(cleaned);
 299             } else {
 300 #ifdef VERBOSE
 301                 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
 302 #endif
 303             }
 304         }
 305     }
 306 }
 307
 308 void clear_hyphenation(struct tex_language *lang)
 309 {
 310     if (lang == NULL)
 311         return;
 312     if (lang->exceptions != 0) {
 313         luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 314         lang->exceptions = 0;
 315     }
 316 }
 317
 318 void load_tex_hyphenation(int curlang, halfword head)
 319 {
 320     char *s = tokenlist_to_cstring(head, 1, NULL);
 321     load_hyphenation(get_language(curlang), (unsigned char *) s);
 322 }
 323
 324 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very nice,
 325 but needed. Also, in the post-break, it would be nicer to get the attribute list
 326 from |vlink(n)|. No rush, as it is currently not used much.
 327
 328 @c
 329 halfword insert_discretionary(halfword t, halfword pre, halfword post,
 330                               halfword replace, int penalty)
 331 {
 332     halfword g, n;
 333     int f;
 334     n = new_node(disc_node, syllable_disc);
 335     disc_penalty(n) = penalty;
 336     try_couple_nodes(n, vlink(t));
 337     couple_nodes(t, n);
 338     if (replace != null)
 339         f = font(replace);
 340     else
 341         f = get_cur_font();     /* for compound words following explicit hyphens */
 342     for (g = pre; g != null; g = vlink(g)) {
 343         font(g) = f;
 344         if (node_attr(t) != null) {
 345             delete_attribute_ref(node_attr(g));
 346             node_attr(g) = node_attr(t);
 347             attr_list_ref(node_attr(t)) += 1;
 348         }
 349     }
 350     for (g = post; g != null; g = vlink(g)) {
 351         font(g) = f;
 352         if (node_attr(t) != null) {
 353             delete_attribute_ref(node_attr(g));
 354             node_attr(g) = node_attr(t);
 355             attr_list_ref(node_attr(t)) += 1;
 356         }
 357     }
 358     for (g = replace; g != null; g = vlink(g)) {
 359         if (node_attr(t) != null) {
 360             delete_attribute_ref(node_attr(g));
 361             node_attr(g) = node_attr(t);
 362             attr_list_ref(node_attr(t)) += 1;
 363         }
 364     }
 365     if (node_attr(t) != null) {
 366         delete_attribute_ref(node_attr(vlink(t)));
 367         node_attr(vlink(t)) = node_attr(t);
 368         attr_list_ref(node_attr(t)) += 1;
 369     }
 370     t = vlink(t);
 371     set_disc_field(pre_break(t), pre);
 372     set_disc_field(post_break(t), post);
 373     set_disc_field(no_break(t), replace);
 374     return t;
 375 }
 376
 377 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
 378 {
 379     halfword g, n;
 380     n = new_node(disc_node, syllable_disc);
 381     disc_penalty(n) = hyphen_penalty_par;
 382     couple_nodes(n, vlink(t));
 383     couple_nodes(t, n);
 384     delete_attribute_ref(node_attr(n));
 385     if (node_attr(t) != null) {
 386         node_attr(n) = node_attr(t);
 387         attr_list_ref(node_attr(t))++;
 388     } else {
 389         node_attr(n) = null;
 390     }
 391     if (lan->pre_hyphen_char > 0) {
 392         g = raw_glyph_node();
 393         set_to_character(g);
 394         character(g) = lan->pre_hyphen_char;
 395         font(g) = font(t);
 396         lang_data(g) = lang_data(t);
 397         if (node_attr(t) != null) {
 398             node_attr(g) = node_attr(t);
 399             attr_list_ref(node_attr(t))++;
 400         }
 401         set_disc_field(pre_break(n), g);
 402     }
 403
 404     if (lan->post_hyphen_char > 0) {
 405         t = vlink(n);
 406         g = raw_glyph_node();
 407         set_to_character(g);
 408         character(g) = lan->post_hyphen_char;
 409         font(g) = font(t);
 410         lang_data(g) = lang_data(t);
 411         if (node_attr(t) != null) {
 412             node_attr(g) = node_attr(t);
 413             attr_list_ref(node_attr(t)) += 1;
 414         }
 415         set_disc_field(post_break(n), g);
 416     }
 417     return n;
 418 }
 419
 420 halfword insert_word_discretionary(halfword t, lang_variables * lan)
 421 {
 422     halfword pre = null, pos = null;
 423     if (lan->pre_exhyphen_char > 0)
 424         pre = insert_character(null, lan->pre_exhyphen_char);
 425     if (lan->post_exhyphen_char > 0)
 426         pos = insert_character(null, lan->post_exhyphen_char);
 427     return insert_discretionary(t, pre, pos, null,ex_hyphen_penalty_par);
 428 }
 429
 430 @ @c
 431 halfword compound_word_break(halfword t, int clang)
 432 {
 433     int disc;
 434     lang_variables langdata;
 435     langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
 436     langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
 437     disc = insert_word_discretionary(t, &langdata);
 438     return disc;
 439 }
 440
 441 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
 442                                       halfword pre, halfword pos,
 443                                       halfword replace)
 444 {
 445     (void) lan;
 446     return insert_discretionary(t, pre, pos, replace,hyphen_penalty_par);
 447 }
 448
 449 halfword insert_character(halfword t, int c)
 450 {
 451     halfword p;
 452     p = new_node(glyph_node, 0);
 453     set_to_character(p);
 454     character(p) = c;
 455     if (t != null) {
 456         couple_nodes(t, p);
 457     }
 458     return p;
 459 }
 460
 461 @ @c
 462 void set_disc_field(halfword f, halfword t)
 463 {
 464     if (t != null) {
 465         /*
 466             couple_nodes(f, t); // better not expose f as prev pointer
 467         */
 468         vlink(f) = t ;
 469         alink(t) = null ;
 470         tlink(f) = tail_of_list(t);
 471     } else {
 472         vlink(f) = null;
 473         tlink(f) = null;
 474     }
 475 }
 476
 477 @ @c
 478 static char *hyphenation_exception(int exceptions, char *w)
 479 {
 480     char *ret = NULL;
 481     lua_checkstack(Luas, 2);
 482     lua_rawgeti(Luas, LUA_REGISTRYINDEX, exceptions);
 483     if (lua_istable(Luas, -1)) {   /* ?? */
 484         lua_pushstring(Luas, w);   /* word table */
 485         lua_rawget(Luas, -2);
 486         if (lua_type(Luas, -1) == LUA_TSTRING) {
 487             ret = xstrdup(lua_tostring(Luas, -1));
 488         }
 489         lua_pop(Luas, 2);
 490     } else {
 491         lua_pop(Luas, 1);
 492     }
 493     return ret;
 494 }
 495
 496 @ @c
 497 char *exception_strings(struct tex_language *lang)
 498 {
 499     const char *value;
 500     size_t size = 0, current = 0;
 501     size_t l = 0;
 502     char *ret = NULL;
 503     if (lang->exceptions == 0)
 504         return NULL;
 505     lua_checkstack(Luas, 2);
 506     lua_rawgeti(Luas, LUA_REGISTRYINDEX, lang->exceptions);
 507     if (lua_istable(Luas, -1)) {
 508         /* iterate and join */
 509         lua_pushnil(Luas);         /* first key */
 510         while (lua_next(Luas, -2) != 0) {
 511             value = lua_tolstring(Luas, -1, &l);
 512             if (current + 2 + l > size) {
 513                 ret = xrealloc(ret, (unsigned) ((size + size / 5) + current + l + 1024));
 514                 size = (size + size / 5) + current + l + 1024;
 515             }
 516             *(ret + current) = ' ';
 517             strcpy(ret + current + 1, value);
 518             current += l + 1;
 519             lua_pop(Luas, 1);
 520         }
 521     }
 522     return ret;
 523 }
 524
@ The sequence from |wordstart| to |r| can contain only normal characters. It
could be faster to modify a halfword pointer and return an integer.
 527
 528 @c
 529 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
 530 {
 531     halfword g = null, gg = null;
 532     register unsigned i = *j;
 533     i++;                        /* this puts uword[i] on the |{| */
 534     while (i < (unsigned) len && uword[i + 1] != '}') {
 535         if (g == null) {
 536             gg = new_char(0, (int) uword[i + 1]);
 537             g = gg;
 538         } else {
 539             halfword s = new_char(0, (int) uword[i + 1]);
 540             couple_nodes(g, s);
 541             g = vlink(g);
 542         }
 543         i++;
 544     }
 545     *j = ++i;
 546     return gg;
 547 }
 548
 549 static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
 550 {
 551     int ret = 0;
 552     register unsigned i = *j;
 553     i++;                        /* this puts uword[i] on the |{| */
 554     while (i < (unsigned) len && uword[i + 1] != '}') {
 555         ret++;
 556         i++;
 557     }
 558     *j = ++i;
 559     return ret;
 560 }
 561
 562 @ @c
/* help text passed to |tex_error| by |do_exception| for malformed brace groups; the list ends with |NULL| */
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};
 568
 569 /*
 570     The exceptions are taken as-is: no min values are taken into account. One can
 571     add normal patterns on-the-fly if needed.
 572 */
 573
/*
    Apply the exception text |replacement| to the word whose first node is
    |wordstart|; |r| is the node at which the walk stops. The text is
    converted to unicode and scanned while |t| moves along the node list.
    The scan always looks at |uword[i + 1]|, the character following
    position |i|:

    a hyphen inserts a syllable discretionary after |t|;
    an opening brace starts a pre/post/replace group that becomes a full
    discretionary, with the counted number of following glyphs moved into
    its replacement list;
    anything else just advances |t|.
*/
static void do_exception(halfword wordstart, halfword r, char *replacement)
{
    unsigned i;
    halfword t;
    unsigned len;
    int clang;
    lang_variables langdata;
    unsigned uword[MAX_WORD_LEN + 1] = { 0 };
    utf2uni_strcpy(uword, replacement);
    len = u_length(uword);
    i = 0;
    t = wordstart;
    clang = char_lang(wordstart);
    /* only the two syllable hyphen characters are filled in; they are what |insert_syllable_discretionary| reads */
    langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = get_post_hyphen_char(clang);

    for (i = 0; i < len; i++) {
        if (uword[i + 1] == '-') {      /* a hyphen follows */
            /* move |t| to a simple character glyph, giving up when the end of the word is reached */
            while (vlink(t) != r && (type(t) != glyph_node || !is_simple_character(t)))
                t = vlink(t);
            if (vlink(t) == r)
                break;
            insert_syllable_discretionary(t, &langdata);
            t = vlink(t);       /* skip the new disc */
        } else if (uword[i + 1] == '=') {
            /* do nothing ? */
            t = vlink(t);
        } else if (uword[i + 1] == '{') {
            halfword gg, hh, replace = null;
            int repl;
            /* |gg| is the pre-break list, |hh| the post-break list, |repl| the length of the replaced text */
            gg = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 1", PAT_ERROR);
            }
            hh = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 2", PAT_ERROR);
            }
            repl = count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_error("broken pattern 3", PAT_ERROR);
            }
            /*i++;  *//* jump over the last right brace */
            /* NOTE(review): leaving here does not release the |gg| and |hh| lists built above */
            if (vlink(t) == r)
                break;
            if (repl > 0) {
                /* cut the next |repl| glyphs (and whatever sits between them) out of the list */
                halfword q = t;
                replace = vlink(q);
                /* NOTE(review): |q| is tested before it is advanced, so |type(q)| can be evaluated on |null| */
                while (repl > 0 && q != null) {
                    q = vlink(q);
                    if (type(q) == glyph_node) {
                        repl--;
                    }
                }
                try_couple_nodes(t, vlink(q));
                vlink(q) = null;
            }
            t = insert_discretionary(t, gg, hh, replace, hyphen_penalty_par);
            t = vlink(t);       /* skip the new disc */
        } else {
            t = vlink(t);
        }
    }
}
 638
 639 @ This is a documentation section from the pascal web file. It is not true any
 640 more, but I do not have time right now to rewrite it -- Taco
 641
 642 When the line-breaking routine is unable to find a feasible sequence of
 643 breakpoints, it makes a second pass over the paragraph, attempting to hyphenate
 644 the hyphenatable words. The goal of hyphenation is to insert discretionary
 645 material into the paragraph so that there are more potential places to break.
 646
 647 The general rules for hyphenation are somewhat complex and technical, because we
 648 want to be able to hyphenate words that are preceded or followed by punctuation
 649 marks, and because we want the rules to work for languages other than English. We
 650 also must contend with the fact that hyphens might radically alter the ligature
 651 and kerning structure of a word.
 652
 653 A sequence of characters will be considered for hyphenation only if it belongs to
 654 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
 655 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are
 656 either character or ligature or whatsit or implicit kern nodes, and $p_m$ is a
 657 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
 658 (Therefore hyphenation is disabled by boxes, math formulas, and discretionary
 659 nodes already inserted by the user.) The ligature nodes among $p_1\ldots p_{m-1}$
 660 are effectively expanded into the original non-ligature characters; the kern
 661 nodes and whatsits are ignored. Each character |c| is now classified as either a
 662 nonletter (if |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an
 663 uppercase letter (otherwise); an uppercase letter is treated as if it were
 664 |lc_code(c)| for purposes of hyphenation. The characters generated by $p_1\ldots
 665 p_{m-1}$ may begin with nonletters; let $c_1$ be the first letter that is not in
 666 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a whatsit
 667 found after $c_1$ will be the terminating node $p_m$. All characters that do not
 668 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
 669 that font must be between 0 and 255, otherwise hyphenation will not be attempted.
 670 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible;
 671 however, |n| must be less than 64, so a character that would otherwise be
 672 $c_{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
 673 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
 674 generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|,
 675 this string qualifies for hyphenation; however, |uc_hyph| must be positive, if
 676 $c_1$ is uppercase.
 677
 678 The hyphenation process takes place in three stages. First, the candidate
 679 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens are
 680 determined by referring to hyphenation tables; and finally, the nodes $p_a\ldots
 681 p_b$ are replaced by a new sequence of nodes that includes the discretionary
 682 breaks found.
 683
 684 Fortunately, we do not have to do all this calculation very often, because of the
 685 way it has been taken out of \TeX's inner loop. For example, when the second
 686 edition of the author's 700-page book {\sl Seminumerical Algorithms} was typeset
 687 by \TeX, only about 1.2 hyphenations needed to be @^Knuth, Donald Ervin@> tried
 688 per paragraph, since the line breaking algorithm needed to use two passes on only
 689 about 5 per cent of the paragraphs.
 690
When a word has been set up to contain a candidate for hyphenation, \TeX\ first looks
to see if it is in the user's exception dictionary. If not, hyphens are inserted
based on patterns that appear within the given word, using an algorithm due to
Frank~M. Liang. @^Liang, Franklin Mark@>
 695
@ This is incompatible with \TeX\ because the first word of a paragraph can be
hyphenated, but most European users seem to agree that prohibiting hyphenation
there was not the best idea ever.
 699
 700 @c
 701 /*
 702     More strict: \hyphenationbounds
 703
 704     0 = not strict
 705     1 = strict start
 706     2 = strict end
 707     3 = strict start and strict end
 708
 709     \parindent0pt \hsize=1.1cm
 710     12-34-56 \par
 711     12-34-\hbox{56} \par
 712     12-34-\vrule width 1em height 1.5ex \par
 713     12-\hbox{34}-56 \par
 714     12-\vrule width 1em height 1.5ex-56 \par
 715     \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
 716     12-34-56 \par
 717     12-34-\hbox{56} \par
 718     12-34-\vrule width 1em height 1.5ex \par
 719     12-\hbox{34}-56 \par
 720     12-\vrule width 1em height 1.5ex-56 \par
 721
 722 */
 723
 724 static halfword find_next_wordstart(halfword r, halfword first_language, halfword strict_bound)
 725 {
 726     register int l;
 727     register int start_ok = 1;
 728     int mathlevel = 1;
 729     int chr ;
 730     halfword t ;
 731     while (r != null) {
 732         switch (type(r)) {
 733         case boundary_node:
 734             if (subtype(r) == word_boundary) {
 735                 start_ok = 1;
 736             }
 737             break;
 738         case hlist_node: /* new > 0.95 */
 739         case vlist_node: /* new > 0.95 */
 740         case rule_node:  /* new > 0.95 */
 741         case dir_node:
 742         case whatsit_node:
 743             if (strict_bound == 1 || strict_bound == 3) {
 744                 start_ok = 0;
 745             }
 746             break;
 747         case glue_node:
 748             start_ok = 1;
 749             break;
 750         case math_node:
 751             while (mathlevel > 0) {
 752                 r = vlink(r);
 753                 if (r == null)
 754                     return r;
 755                 if (type(r) == math_node) {
 756                     if (subtype(r) == before) {
 757                         mathlevel++;
 758                     } else {
 759                         mathlevel--;
 760                     }
 761                 }
 762             }
 763             break;
 764         case glyph_node:
 765             if (is_simple_character(r)) {
 766                 chr = character(r) ;
 767                 if (chr == ex_hyphen_char_par) {
 768                     /*
 769                         We only accept an explicit hyphen when there is a preceding glyph and
 770                         we skip a sequence of explicit hyphens as that normally indicates a
 771                         -- or --- ligature in which case we can in a worse case usage get bad
 772                         node lists later on due to messed up ligature building as these dashes
 773                         are ligatures in base fonts. This is a side effect of the separating the
 774                         hyphenation, ligaturing and kerning steps. A test is cmr with ------.
 775                     */
 776                     t = vlink(r) ;
 777                     if ((start_ok == 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char_par)) {
 778                         t = compound_word_break(r, char_lang(r));
 779                         subtype(t) = automatic_disc;
 780                         start_ok = 1 ;
 781                     } else {
 782                         start_ok = 0;
 783                     }
 784                 } else if (start_ok && (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
 785                     if (char_uchyph(r) || l == chr || l <= 32) {
 786                         return r;
 787                     } else {
 788                         start_ok = 0;
 789                     }
 790                 }
 791             }
 792             break;
 793         default:
 794             start_ok = 0;
 795             break;
 796         }
 797         r = vlink(r);
 798     }
 799     return r;
 800 }
 801
 802 @ @c
 803 static int valid_wordend(halfword s, halfword strict_bound)
 804 {
 805     register halfword r = s;
 806     register int clang = char_lang(s);
 807     if (r == null)
 808         return 1;
 809     while ( (r != null) &&
 810            (    (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
 811              || (type(r) == kern_node && (subtype(r) == normal))
 812             )
 813            ) {
 814         r = vlink(r);
 815     }
 816     if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
 817                   ||  type(r) == glue_node
 818                   ||  type(r) == penalty_node
 819                   || (type(r) == kern_node && (subtype(r) == explicit_kern || /* so why not italic correction ? */
 820                                                subtype(r) == italic_kern   ||
 821                                                subtype(r) == accent_kern   ))
 822                   ||  ((type(r) == hlist_node   || /* new > 0.95 */
 823                         type(r) == vlist_node   || /* new > 0.95 */
 824                         type(r) == rule_node    || /* new > 0.95 */
 825                         type(r) == dir_node     || /* new > 0.97 */
 826                         type(r) == whatsit_node ||
 827                         type(r) == ins_node     || /* yes or no strict test */
 828                         type(r) == adjust_node     /* yes or no strict test */
 829                        ) && ! (strict_bound == 2 || strict_bound == 3))
 830                   ||  type(r) == boundary_node
 831         )
 832         return 1;
 833     return 0;
 834 }
 835
 836 @ @c
 837 void hnj_hyphenation(halfword head, halfword tail)
 838 {
 839     int lchar, i;
 840     struct tex_language *lang;
 841     lang_variables langdata;
 842     char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
 843     int wordlen = 0;
 844     char *hy = utf8word;
 845     char *replacement = NULL;
 846     boolean explicit_hyphen = false;
 847     halfword first_language = first_valid_language_par;
 848     halfword strict_bound = hyphenation_bounds_par;
 849     halfword s, r = head, wordstart = null, save_tail1 = null, left = null, right = null;
 850
 851     /* this first movement assures two things:
 852      \item{a)} that we won't waste lots of time on something that has been
 853       handled already (in that case, none of the glyphs match |simple_character|).
 854      \item{b)} that the first word can be hyphenated. if the movement was
 855      not explicit, then the indentation at the start of a paragraph
 856      list would make |find_next_wordstart()| look too far ahead.
 857      */
 858
 859     while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
 860         r = vlink(r);
 861     }
 862     /* this will make |r| a glyph node with subtype character */
 863     r = find_next_wordstart(r,first_language,strict_bound);
 864     if (r == null)
 865         return;
 866
 867     assert(tail != null);
 868     save_tail1 = vlink(tail);
 869     s = new_penalty(0);
 870     couple_nodes(tail, s);
 871
 872     while (r != null) {         /* could be while(1), but let's be paranoid */
 873         int clang, lhmin, rhmin, hmin;
 874         halfword hyf_font;
 875         halfword end_word = r;
 876         wordstart = r;
 877         assert(is_simple_character(wordstart));
 878         hyf_font = font(wordstart);
 879         if (hyphen_char(hyf_font) < 0)  /* for backward compat */
 880             hyf_font = 0;
 881         clang = char_lang(wordstart);
 882         lhmin = char_lhmin(wordstart);
 883         rhmin = char_rhmin(wordstart);
 884         hmin = get_hyphenation_min(clang);
 885         langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
 886         langdata.post_hyphen_char = get_post_hyphen_char(clang);
 887         while (    r != null
 888                 && type(r) == glyph_node
 889                 && is_simple_character(r)
 890                 && clang == char_lang(r)
 891                 && (    (     (clang >= first_language)
 892                            && (lchar = get_hj_code(clang,character(r))) > 0
 893                         )
 894                      || (     character(r) == ex_hyphen_char_par
 895                            && (lchar = ex_hyphen_char_par)
 896                         )
 897                    )
 898               ) {
 899             if (character(r) == ex_hyphen_char_par) {
 900                 explicit_hyphen = true;
 901             }
 902             wordlen++;
 903             if (lchar <= 32) {
 904                 if (lchar == 32) {
 905                     lchar = 0 ;
 906                 }
 907                 if (wordlen <= lhmin) {
 908                     lhmin = lhmin - lchar + 1 ;
 909                     if (lhmin < 0)
 910                         lhmin = 1;
 911                 }
 912                 if (wordlen >= rhmin) {
 913                     rhmin = rhmin - lchar + 1 ;
 914                     if (rhmin < 0)
 915                         rhmin = 1;
 916                 }
 917                 hmin = hmin - lchar + 1 ;
 918                 if (hmin < 0)
 919                     rhmin = 1;
 920                 lchar = character(r) ;
 921             }
 922             hy = uni2string(hy, (unsigned) lchar);
 923             /* this should not be needed  any more */
 924             /*if (vlink(r)!=null) alink(vlink(r))=r; */
 925             end_word = r;
 926             r = vlink(r);
 927         }
 928         if (     valid_wordend(r,strict_bound)
 929               && clang >= first_language
 930               && wordlen >= lhmin + rhmin
 931               && (hmin <= 0 || wordlen >= hmin)
 932               && (hyf_font != 0)
 933               && (lang = tex_languages[clang]) != NULL
 934            ) {
 935             *hy = 0;
 936             if (    lang->exceptions != 0
 937                  && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
 938                ) {
 939 #ifdef VERBOSE
 940                 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
 941 #endif
 942                 do_exception(wordstart, r, replacement);
 943                 free(replacement);
 944             } else if (explicit_hyphen == true) {
 945                 /*
 946                     insert an explicit discretionary after each of the last in a
 947                     set of explicit hyphens
 948                 */
 949                 halfword rr = r;
 950                 halfword t = null;
 951 #ifdef VERBOSE
 952                 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
 953 #endif
 954                 while (rr != wordstart) {
 955                 if (is_simple_character(rr)) {
 956                         if (character(rr) == ex_hyphen_char_par) {
 957                             t = compound_word_break(rr, clang);
 958                             subtype(t) = automatic_disc;
 959                             while (character(alink(rr)) == ex_hyphen_char_par)
 960                                 rr = alink(rr);
 961                             if (rr == wordstart)
 962                                 break;
 963                         }
 964                     }
 965                     rr = alink(rr);
 966                 }
 967             } else if (lang->patterns != NULL) {
 968                 left = wordstart;
 969                 for (i = lhmin; i > 1; i--) {
 970                     left = vlink(left);
 971                     while (!is_simple_character(left)) {
 972                         left = vlink(left);
 973                     }
 974                     /*
 975                     if (!left)
 976                         break ;
 977                     */
 978                     /* what is left overruns right .. a bit messy */
 979                 }
 980                 right = r;
 981                 for (i = rhmin; i > 0; i--) {
 982                     right = alink(right);
 983                     while (!is_simple_character(right)) {
 984                         right = alink(right);
 985                     }
 986                     /*
 987                     if (!right)
 988                         break ;
 989                     */
 990                     /* what is right overruns left .. a bit messy */
 991                 }
 992                 /* maybe an extra check ... */
 993                 /* if (left && right) { */
 994 #ifdef VERBOSE
 995                     formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
 996                         utf8word, clang, lhmin, rhmin, character(left), character(right));
 997 #endif
 998                     (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
 999                 /* } */
1000             }
1001         }
1002         explicit_hyphen = false;
1003         wordlen = 0;
1004         hy = utf8word;
1005         if (r == null)
1006             break;
1007         r = find_next_wordstart(r,first_language,strict_bound);
1008     }
1009     flush_node(vlink(tail));
1010     vlink(tail) = save_tail1;
1011 }
1012
1013 @ @c
1014 void new_hyphenation(halfword head, halfword tail)
1015 {
1016     register int callback_id = 0;
1017     if (head == null || vlink(head) == null)
1018         return;
1019     fix_node_list(head);
1020     callback_id = callback_defined(hyphenate_callback);
1021     if (callback_id > 0) {
1022         if (!get_callback(Luas, callback_id)) {
1023             lua_pop(Luas, 2);
1024             return;
1025         }
1026         nodelist_to_lua(Luas, head);
1027         nodelist_to_lua(Luas, tail);
1028         if (lua_pcall(Luas, 2, 0, 0) != 0) {
1029             formatted_warning("hyphenation","bad specification: %s",lua_tostring(Luas, -1));
1030             lua_pop(Luas, 2);
1031             lua_error(Luas);
1032             return;
1033         }
1034         lua_pop(Luas, 1);
1035     } else if (callback_id == 0) {
1036         hnj_hyphenation(head, tail);
1037     }
1038 }
1039
1040 @ dumping and undumping languages
1041
1042 @c
/*
    Dump the C string |a|: first its length including the terminating zero,
    then the bytes themselves. A null pointer is dumped as length zero with
    no bytes. The macro needs an |int| named |x| in the calling scope and
    evaluates |a| more than once. It is wrapped in a do-while so that it
    acts as one statement, also inside an unbraced if/else.
*/
#define dump_string(a)                  \
  do {                                  \
    if ((a) != NULL) {                  \
      x = (int) strlen(a) + 1;          \
      dump_int(x);                      \
      dump_things(*(a), x);             \
    } else {                            \
      x = 0;                            \
      dump_int(x);                      \
    }                                   \
  } while (0)
1050
1051 static void dump_one_language(int i)
1052 {
1053     char *s = NULL;
1054     int x = 0;
1055     struct tex_language *lang;
1056     lang = tex_languages[i];
1057     dump_int(lang->id);
1058     dump_int(lang->pre_hyphen_char);
1059     dump_int(lang->post_hyphen_char);
1060     dump_int(lang->pre_exhyphen_char);
1061     dump_int(lang->post_exhyphen_char);
1062     dump_int(lang->hyphenation_min);
1063     if (lang->patterns != NULL) {
1064         s = (char *) hnj_serialize(lang->patterns);
1065     }
1066     dump_string(s);
1067     if (s != NULL) {
1068         free(s);
1069         s = NULL;
1070     }
1071     if (lang->exceptions != 0)
1072         s = exception_strings(lang);
1073     dump_string(s);
1074     if (s != NULL) {
1075         free(s);
1076     }
1077     free(lang);
1078 }
1079
1080 void dump_language_data(void)
1081 {
1082     int i;
1083     dump_int(next_lang_id);
1084     for (i = 0; i < next_lang_id; i++) {
1085         if (tex_languages[i]) {
1086             dump_int(1);
1087             dump_one_language(i);
1088         } else {
1089             dump_int(0);
1090         }
1091     }
1092 }
1093
1094 static void undump_one_language(int i)
1095 {
1096     char *s = NULL;
1097     int x = 0;
1098     struct tex_language *lang = get_language(i);
1099     undump_int(x);
1100     lang->id = x;
1101     undump_int(x);
1102     lang->pre_hyphen_char = x;
1103     undump_int(x);
1104     lang->post_hyphen_char = x;
1105     undump_int(x);
1106     lang->pre_exhyphen_char = x;
1107     undump_int(x);
1108     lang->post_exhyphen_char = x;
1109     undump_int(x);
1110     lang->hyphenation_min = x;
1111     /* patterns */
1112     undump_int(x);
1113     if (x > 0) {
1114         s = xmalloc((unsigned) x);
1115         undump_things(*s, x);
1116         load_patterns(lang, (unsigned char *) s);
1117         free(s);
1118     }
1119     /* exceptions */
1120     undump_int(x);
1121     if (x > 0) {
1122         s = xmalloc((unsigned) x);
1123         undump_things(*s, x);
1124         load_hyphenation(lang, (unsigned char *) s);
1125         free(s);
1126     }
1127 }
1128
1129 void undump_language_data(void)
1130 {
1131     int i, x, numlangs;
1132     undump_int(numlangs);
1133     next_lang_id = numlangs;
1134     for (i = 0; i < numlangs; i++) {
1135         undump_int(x);
1136         if (x == 1) {
1137             undump_one_language(i);
1138         }
1139     }
1140 }
1141
1142 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1143 |new_hyph_exceptions| to do the right thing.
1144
1145 @c
1146 void new_hyph_exceptions(void)
1147 {                               /* enters new exceptions */
1148     (void) scan_toks(false, true);
1149     load_tex_hyphenation(language_par, def_ref);
1150     flush_list(def_ref);
1151 }
1152
1153 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1154 procedure named |new_patterns|.
1155
1156 @c
1157 void new_patterns(void)
1158 {                               /* initializes the hyphenation pattern data */
1159     (void) scan_toks(false, true);
1160     load_tex_patterns(language_par, def_ref);
1161     flush_list(def_ref);
1162 }
1163
@ `\.{\\prehyphenchar}' sets the |pre_break| character and
`\.{\\posthyphenchar}' the |post_break| character of the current language.
Their respective defaults are the ASCII hyphen ("-") and zero (nul).
1167
1168 @c
1169 void new_pre_hyphen_char(void)
1170 {
1171     scan_optional_equals();
1172     scan_int();
1173     set_pre_hyphen_char(language_par, cur_val);
1174 }
1175
1176 void new_post_hyphen_char(void)
1177 {
1178     scan_optional_equals();
1179     scan_int();
1180     set_post_hyphen_char(language_par, cur_val);
1181 }
1182
@ `\.{\\preexhyphenchar}' sets the |pre_break| character and
`\.{\\postexhyphenchar}' the |post_break| character of the current language.
Both default to zero (nul).
1186
1187 @c
1188 void new_pre_exhyphen_char(void)
1189 {
1190     scan_optional_equals();
1191     scan_int();
1192     set_pre_exhyphen_char(language_par, cur_val);
1193 }
1194
1195 void new_post_exhyphen_char(void)
1196 {
1197     scan_optional_equals();
1198     scan_int();
1199     set_post_exhyphen_char(language_par, cur_val);
1200 }
1201
1202 void new_hyphenation_min(void)
1203 {
1204     scan_optional_equals();
1205     scan_int();
1206     set_hyphenation_min(language_par, cur_val);
1207 }
1208
1209 void new_hj_code(void)
1210 {
1211     int i ;
1212     scan_int();
1213     i = cur_val;
1214     scan_optional_equals();
1215     scan_int();
1216     set_hj_code(language_par, i, cur_val, -1);
1217 }