sync with experimental
[luatex.git] / source / texk / web2c / luatexdir / lang / texlang.w
blob8e0deb3357d21ea2829d3af243f541b3989ca81e
1 % texlang.w
3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
22 #include "ptexlib.h"
23 #include <string.h>
24 #include "lua/luatex-api.h"
26 @ Low-level helpers
28 @ @c
/* |VERBOSE| is left undefined on purpose; the diagnostic warnings further
   down are only compiled when it is defined. */
#define unVERBOSE

/* capacity of the language table */
#define MAX_TEX_LANGUAGES 16384

/* shorthand for the current value of the explicit hyphen character parameter */
#define ex_hyphen_char int_par(ex_hyphen_char_code)

/* one slot per language id; an empty slot is |NULL| */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* lowest id that is a candidate for automatic allocation */
static int next_lang_id = 0;
39 struct tex_language *new_language(int n)
41 struct tex_language *lang;
42 unsigned l;
43 if (n >= 0) {
44 l = (unsigned) n;
45 if (l != (MAX_TEX_LANGUAGES - 1))
46 if (next_lang_id <= n)
47 next_lang_id = n + 1;
48 } else {
49 while (tex_languages[next_lang_id] != NULL)
50 next_lang_id++;
51 l = (unsigned) next_lang_id++;
53 if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
54 lang = xmalloc(sizeof(struct tex_language));
55 tex_languages[l] = lang;
56 lang->id = (int) l;
57 lang->exceptions = 0;
58 lang->patterns = NULL;
59 lang->pre_hyphen_char = '-';
60 lang->post_hyphen_char = 0;
61 lang->pre_exhyphen_char = 0;
62 lang->post_exhyphen_char = 0;
63 lang->hyphenation_min = -1;
64 if (int_par(saving_hyph_codes_code)) {
65 hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
67 return lang;
68 } else {
69 return NULL;
73 struct tex_language *get_language(int n)
75 if (n >= 0 && n < MAX_TEX_LANGUAGES) {
76 if (tex_languages[n] != NULL) {
77 return tex_languages[n];
78 } else {
79 return new_language(n);
81 } else {
82 return NULL;
86 @ @c
87 void set_pre_hyphen_char(int n, int v)
89 struct tex_language *l = get_language((int) n);
90 if (l != NULL)
91 l->pre_hyphen_char = (int) v;
94 void set_post_hyphen_char(int n, int v)
96 struct tex_language *l = get_language((int) n);
97 if (l != NULL)
98 l->post_hyphen_char = (int) v;
101 void set_pre_exhyphen_char(int n, int v)
103 struct tex_language *l = get_language((int) n);
104 if (l != NULL)
105 l->pre_exhyphen_char = (int) v;
108 void set_post_exhyphen_char(int n, int v)
110 struct tex_language *l = get_language((int) n);
111 if (l != NULL)
112 l->post_exhyphen_char = (int) v;
115 int get_pre_hyphen_char(int n)
117 struct tex_language *l = get_language((int) n);
118 if (l == NULL)
119 return -1;
120 return (int) l->pre_hyphen_char;
123 int get_post_hyphen_char(int n)
125 struct tex_language *l = get_language((int) n);
126 if (l == NULL)
127 return -1;
128 return (int) l->post_hyphen_char;
131 int get_pre_exhyphen_char(int n)
133 struct tex_language *l = get_language((int) n);
134 if (l == NULL)
135 return -1;
136 return (int) l->pre_exhyphen_char;
139 int get_post_exhyphen_char(int n)
141 struct tex_language *l = get_language((int) n);
142 if (l == NULL)
143 return -1;
144 return (int) l->post_exhyphen_char;
147 void set_hyphenation_min(int n, int v)
149 struct tex_language *l = get_language((int) n);
150 if (l != NULL)
151 l->hyphenation_min = (int) v;
154 int get_hyphenation_min(int n)
156 struct tex_language *l = get_language((int) n);
157 if (l == NULL)
158 return -1;
159 return (int) l->hyphenation_min;
162 void load_patterns(struct tex_language *lang, const unsigned char *buff)
164 if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
165 return;
166 if (lang->patterns == NULL) {
167 lang->patterns = hnj_hyphen_new();
169 hnj_hyphen_load(lang->patterns, buff);
172 void clear_patterns(struct tex_language *lang)
174 if (lang == NULL)
175 return;
176 if (lang->patterns != NULL) {
177 hnj_hyphen_clear(lang->patterns);
181 void load_tex_patterns(int curlang, halfword head)
183 char *s = tokenlist_to_cstring(head, 1, NULL);
184 load_patterns(get_language(curlang), (unsigned char *) s);
187 @ @c
/* Append character |x|, mapped through the hj code table of language |l|, to
   the output position |uindex| of the enclosing function. A zero hj code
   means the character is stored unchanged. The arguments are parenthesized
   and |x| is evaluated exactly once. */
#define STORE_CHAR(l,x) do {                               \
        const int store_chr = (x);                         \
        unsigned xx = get_hj_code((l), store_chr);         \
        if (!xx) {                                         \
            xx = (unsigned) store_chr;                     \
        }                                                  \
        uindex = uni2string(uindex, xx);                   \
    } while (0)
@ Cleans one word, which is returned in |cleaned|, and returns a pointer to the
position in |buff| just after the consumed input.
200 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
202 int items = 0;
203 unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
204 unsigned uword[MAX_WORD_LEN + 1] = { 0 }; /* work buffer for unicode */
205 int u = 0; /* unicode buffer value */
206 int i = 0; /* index into buffer */
207 char *uindex = (char *)word;
208 const char *s = buff;
210 while (*s && !isspace((unsigned char)*s)) {
211 word[i++] = (unsigned)*s;
212 s++;
213 if ((s-buff)>MAX_WORD_LEN) {
214 /* todo: this is too strict, should count unicode, not bytes */
215 *cleaned = NULL;
216 tex_error("exception too long", NULL);
217 return s;
220 /* now convert the input to unicode */
221 word[i] = '\0';
222 utf2uni_strcpy(uword, (const char *)word);
224 /* build the new word string */
225 i = 0;
226 while (uword[i]>0) {
227 u = uword[i++];
228 if (u == '-') { /* skip */
229 } else if (u == '=') {
230 STORE_CHAR(id,'-');
231 } else if (u == '{') {
232 u = uword[i++];
233 items = 0;
234 while (u && u != '}') {
235 u = uword[i++];
237 if (u == '}') {
238 items++;
239 u = uword[i++];
241 while (u && u != '}') {
242 u = uword[i++];
244 if (u == '}') {
245 items++;
246 u = uword[i++];;
248 if (u == '{') {
249 u = uword[i++];;
251 while (u && u != '}') {
252 STORE_CHAR(id,u);
253 u = uword[i++];
255 if (u == '}') {
256 items++;
258 if (items != 3) { /* syntax error */
259 *cleaned = NULL;
260 tex_error("exception syntax error", NULL);
261 return s;
263 } else {
264 STORE_CHAR(id,u);
267 *uindex = '\0';
268 *cleaned = xstrdup((char *) word);
269 return s;
272 @ @c
273 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
275 const char *s;
276 const char *value;
277 char *cleaned;
278 int id ;
279 lua_State *L = Luas;
280 if (lang == NULL)
281 return;
282 if (lang->exceptions == 0) {
283 lua_newtable(L);
284 lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
286 lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
287 s = (const char *) buff;
288 id = lang->id;
289 while (*s) {
290 while (isspace((unsigned char)*s))
291 s++;
292 if (*s) {
293 value = s;
294 s = clean_hyphenation(id, s, &cleaned);
295 if (cleaned != NULL) {
296 if ((s - value) > 0) {
297 lua_pushstring(L, cleaned);
298 lua_pushlstring(L, value, (size_t) (s - value));
299 lua_rawset(L, -3);
301 free(cleaned);
302 } else {
303 #ifdef VERBOSE
304 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
305 #endif
311 void clear_hyphenation(struct tex_language *lang)
313 if (lang == NULL)
314 return;
315 if (lang->exceptions != 0) {
316 luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
317 lang->exceptions = 0;
322 void load_tex_hyphenation(int curlang, halfword head)
324 char *s = tokenlist_to_cstring(head, 1, NULL);
325 load_hyphenation(get_language(curlang), (unsigned char *) s);
328 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very
329 nice, but needed. Also, in the post-break, it would be nicer to get the
330 attribute list from |vlink(n)|. No rush, as it is currently not used much.
333 halfword insert_discretionary(halfword t, halfword pre, halfword post,
334 halfword replace, int penalty)
336 halfword g, n;
337 int f;
338 n = new_node(disc_node, syllable_disc);
339 disc_penalty(n) = penalty;
340 try_couple_nodes(n, vlink(t));
341 couple_nodes(t, n);
342 if (replace != null)
343 f = font(replace);
344 else
345 f = get_cur_font(); /* for compound words following explicit hyphens */
346 for (g = pre; g != null; g = vlink(g)) {
347 font(g) = f;
348 if (node_attr(t) != null) {
349 delete_attribute_ref(node_attr(g));
350 node_attr(g) = node_attr(t);
351 attr_list_ref(node_attr(t)) += 1;
354 for (g = post; g != null; g = vlink(g)) {
355 font(g) = f;
356 if (node_attr(t) != null) {
357 delete_attribute_ref(node_attr(g));
358 node_attr(g) = node_attr(t);
359 attr_list_ref(node_attr(t)) += 1;
362 for (g = replace; g != null; g = vlink(g)) {
363 if (node_attr(t) != null) {
364 delete_attribute_ref(node_attr(g));
365 node_attr(g) = node_attr(t);
366 attr_list_ref(node_attr(t)) += 1;
369 if (node_attr(t) != null) {
370 delete_attribute_ref(node_attr(vlink(t)));
371 node_attr(vlink(t)) = node_attr(t);
372 attr_list_ref(node_attr(t)) += 1;
374 t = vlink(t);
375 set_disc_field(pre_break(t), pre);
376 set_disc_field(post_break(t), post);
377 set_disc_field(no_break(t), replace);
378 return t;
381 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
383 halfword g, n;
384 n = new_node(disc_node, syllable_disc);
385 disc_penalty(n) = int_par(hyphen_penalty_code);
386 couple_nodes(n, vlink(t));
387 couple_nodes(t, n);
388 delete_attribute_ref(node_attr(n));
389 if (node_attr(t) != null) {
390 node_attr(n) = node_attr(t);
391 attr_list_ref(node_attr(t))++;
392 } else {
393 node_attr(n) = null;
395 if (lan->pre_hyphen_char > 0) {
396 g = raw_glyph_node();
397 set_to_character(g);
398 character(g) = lan->pre_hyphen_char;
399 font(g) = font(t);
400 lang_data(g) = lang_data(t);
401 if (node_attr(t) != null) {
402 node_attr(g) = node_attr(t);
403 attr_list_ref(node_attr(t))++;
405 set_disc_field(pre_break(n), g);
408 if (lan->post_hyphen_char > 0) {
409 t = vlink(n);
410 g = raw_glyph_node();
411 set_to_character(g);
412 character(g) = lan->post_hyphen_char;
413 font(g) = font(t);
414 lang_data(g) = lang_data(t);
415 if (node_attr(t) != null) {
416 node_attr(g) = node_attr(t);
417 attr_list_ref(node_attr(t)) += 1;
419 set_disc_field(post_break(n), g);
421 return n;
424 halfword insert_word_discretionary(halfword t, lang_variables * lan)
426 halfword pre = null, pos = null;
427 if (lan->pre_exhyphen_char > 0)
428 pre = insert_character(null, lan->pre_exhyphen_char);
429 if (lan->post_exhyphen_char > 0)
430 pos = insert_character(null, lan->post_exhyphen_char);
431 return insert_discretionary(t, pre, pos, null,int_par(ex_hyphen_penalty_code));
434 @ @c
435 halfword compound_word_break(halfword t, int clang)
437 int disc;
438 lang_variables langdata;
439 langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
440 langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
441 disc = insert_word_discretionary(t, &langdata);
442 return disc;
445 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
446 halfword pre, halfword pos,
447 halfword replace)
449 (void) lan;
450 return insert_discretionary(t, pre, pos, replace,int_par(hyphen_penalty_code));
453 halfword insert_character(halfword t, int c)
455 halfword p;
456 p = new_node(glyph_node, 0);
457 set_to_character(p);
458 character(p) = c;
459 if (t != null) {
460 couple_nodes(t, p);
462 return p;
465 @ @c
466 void set_disc_field(halfword f, halfword t)
468 if (t != null) {
469 couple_nodes(f, t);
470 tlink(f) = tail_of_list(t);
471 } else {
472 vlink(f) = null;
473 tlink(f) = null;
477 @ @c
478 static char *hyphenation_exception(int exceptions, char *w)
480 char *ret = NULL;
481 lua_State *L = Luas;
482 lua_checkstack(L, 2);
483 lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
484 if (lua_istable(L, -1)) { /* ?? */
485 lua_pushstring(L, w); /* word table */
486 lua_rawget(L, -2);
487 if (lua_type(L, -1) == LUA_TSTRING) {
488 ret = xstrdup(lua_tostring(L, -1));
490 lua_pop(L, 2);
491 } else {
492 lua_pop(L, 1);
494 return ret;
497 @ @c
498 char *exception_strings(struct tex_language *lang)
500 const char *value;
501 size_t size = 0, current = 0;
502 size_t l = 0;
503 char *ret = NULL;
504 lua_State *L = Luas;
505 if (lang->exceptions == 0)
506 return NULL;
507 lua_checkstack(L, 2);
508 lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
509 if (lua_istable(L, -1)) {
510 /* iterate and join */
511 lua_pushnil(L); /* first key */
512 while (lua_next(L, -2) != 0) {
513 value = lua_tolstring(L, -1, &l);
514 if (current + 2 + l > size) {
515 ret =
516 xrealloc(ret,
517 (unsigned) ((size + size / 5) + current + l + 1024));
518 size = (size + size / 5) + current + l + 1024;
520 *(ret + current) = ' ';
521 strcpy(ret + current + 1, value);
522 current += l + 1;
523 lua_pop(L, 1);
526 return ret;
@ The sequence from |wordstart| to |r| can contain only normal characters.
It could be faster to modify a halfword pointer and return an integer.
533 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
535 halfword g = null, gg = null;
536 register unsigned i = *j;
537 i++; /* this puts uword[i] on the |{| */
538 while (i < (unsigned) len && uword[i + 1] != '}') {
539 if (g == null) {
540 gg = new_char(0, (int) uword[i + 1]);
541 g = gg;
542 } else {
543 halfword s = new_char(0, (int) uword[i + 1]);
544 couple_nodes(g, s);
545 g = vlink(g);
547 i++;
549 *j = ++i;
550 return gg;
/* Count the characters in one braced part of an exception. On entry
   |uword[*j + 1]| is the opening brace; on exit |*j| is the index of the
   closing brace. When the closing brace is missing, |*j| is set to |len| and
   nothing beyond the word is read, so the caller can detect the broken part
   by comparing against |len|. */
static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int ret = 0;
    const unsigned end = (unsigned) len;
    unsigned i = *j + 1;        /* this puts uword[i] on the |{| */
    while (i + 1 < end && uword[i + 1] != '}') {
        ret++;
        i++;
    }
    *j = (i + 1 < end) ? i + 1 : end;
    return ret;
}
566 @ @c
/* help lines passed to |tex_error| when an exception discretionary is malformed */
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};
574 The exceptions are taken as-is: no min values are taken into account. One can
575 add normal patterns on-the-fly if needed.
578 static void do_exception(halfword wordstart, halfword r, char *replacement)
580 unsigned i;
581 halfword t;
582 unsigned len;
583 int clang;
584 lang_variables langdata;
585 unsigned uword[MAX_WORD_LEN + 1] = { 0 };
586 utf2uni_strcpy(uword, replacement);
587 len = u_length(uword);
588 i = 0;
589 t = wordstart;
590 clang = char_lang(wordstart);
591 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
592 langdata.post_hyphen_char = get_post_hyphen_char(clang);
594 for (i = 0; i < len; i++) {
595 if (uword[i + 1] == '-') { /* a hyphen follows */
596 while (vlink(t) != r
597 && (type(t) != glyph_node || !is_simple_character(t)))
598 t = vlink(t);
599 if (vlink(t) == r)
600 break;
601 insert_syllable_discretionary(t, &langdata);
602 t = vlink(t); /* skip the new disc */
603 } else if (uword[i + 1] == '=') {
604 /* do nothing ? */
605 t = vlink(t);
606 } else if (uword[i + 1] == '{') {
607 halfword gg, hh, replace = null;
608 int repl;
609 gg = find_exception_part(&i, uword, (int) len);
610 if (i == len || uword[i + 1] != '{') {
611 tex_error("broken pattern 1", PAT_ERROR);
613 hh = find_exception_part(&i, uword, (int) len);
614 if (i == len || uword[i + 1] != '{') {
615 tex_error("broken pattern 2", PAT_ERROR);
617 repl = count_exception_part(&i, uword, (int) len);
618 if (i == len) {
619 tex_error("broken pattern 3", PAT_ERROR);
621 /*i++; *//* jump over the last right brace */
622 if (vlink(t) == r)
623 break;
624 if (repl > 0) {
625 halfword q = t;
626 replace = vlink(q);
627 while (repl > 0 && q != null) {
628 q = vlink(q);
629 if (type(q) == glyph_node) {
630 repl--;
633 try_couple_nodes(t, vlink(q));
634 vlink(q) = null;
636 t = insert_discretionary(t, gg, hh, replace,int_par(hyphen_penalty_code));
637 t = vlink(t); /* skip the new disc */
638 } else {
639 t = vlink(t);
644 @ This is a documentation section from the pascal web file. It is not
645 true any more, but I do not have time right now to rewrite it -- Taco
647 When the line-breaking routine is unable to find a feasible sequence of
648 breakpoints, it makes a second pass over the paragraph, attempting to
649 hyphenate the hyphenatable words. The goal of hyphenation is to insert
650 discretionary material into the paragraph so that there are more
651 potential places to break.
653 The general rules for hyphenation are somewhat complex and technical,
654 because we want to be able to hyphenate words that are preceded or
655 followed by punctuation marks, and because we want the rules to work
656 for languages other than English. We also must contend with the fact
657 that hyphens might radically alter the ligature and kerning structure
658 of a word.
660 A sequence of characters will be considered for hyphenation only if it
661 belongs to a ``potentially hyphenatable part'' of the current paragraph.
662 This is a sequence of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node,
663 $p_1\ldots p_{m-1}$ are either character or ligature or whatsit or
664 implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust
665 or mark or whatsit or explicit kern node. (Therefore hyphenation is
666 disabled by boxes, math formulas, and discretionary nodes already inserted
667 by the user.) The ligature nodes among $p_1\ldots p_{m-1}$ are effectively
668 expanded into the original non-ligature characters; the kern nodes and
669 whatsits are ignored. Each character |c| is now classified as either a
670 nonletter (if |lc_code(c)=0|), a lowercase letter (if
671 |lc_code(c)=c|), or an uppercase letter (otherwise); an uppercase letter
672 is treated as if it were |lc_code(c)| for purposes of hyphenation. The
673 characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let
674 $c_1$ be the first letter that is not in the middle of a ligature. Whatsit
675 nodes preceding $c_1$ are ignored; a whatsit found after $c_1$ will be the
676 terminating node $p_m$. All characters that do not have the same font as
677 $c_1$ will be treated as nonletters. The |hyphen_char| for that font
678 must be between 0 and 255, otherwise hyphenation will not be attempted.
679 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as
680 possible; however, |n| must be less than 64, so a character that would
681 otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
682 not be in the middle of a ligature. In this way we obtain a string of
683 letters $c_1\ldots c_n$ that are generated by nodes $p_a\ldots p_b$, where
684 |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this string qualifies for hyphenation;
685 however, |uc_hyph| must be positive, if $c_1$ is uppercase.
687 The hyphenation process takes place in three stages. First, the candidate
688 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens
689 are determined by referring to hyphenation tables; and finally, the nodes
690 $p_a\ldots p_b$ are replaced by a new sequence of nodes that includes the
691 discretionary breaks found.
693 Fortunately, we do not have to do all this calculation very often, because
694 of the way it has been taken out of \TeX's inner loop. For example, when
695 the second edition of the author's 700-page book {\sl Seminumerical
696 Algorithms} was typeset by \TeX, only about 1.2 hyphenations needed to be
697 @^Knuth, Donald Ervin@>
698 tried per paragraph, since the line breaking algorithm needed to use two
699 passes on only about 5 per cent of the paragraphs.
When a word has been set up to contain a candidate for hyphenation,
702 \TeX\ first looks to see if it is in the user's exception dictionary. If not,
703 hyphens are inserted based on patterns that appear within the given word,
704 using an algorithm due to Frank~M. Liang.
705 @^Liang, Franklin Mark@>
@ This is incompatible with \TeX\ because the first word of a paragraph
can be hyphenated, but most European users seem to agree that
prohibiting hyphenation there was not the best idea ever.
712 static halfword find_next_wordstart(halfword r)
714 register int l;
715 register int start_ok = 1;
716 int mathlevel = 1;
717 int chr ;
718 halfword t ;
719 while (r != null) {
720 switch (type(r)) {
721 case boundary_node:
722 case whatsit_node:
723 break;
724 case glue_node:
725 start_ok = 1;
726 break;
727 case math_node:
728 while (mathlevel > 0) {
729 r = vlink(r);
730 if (r == null)
731 return r;
732 if (type(r) == math_node) {
733 if (subtype(r) == before) {
734 mathlevel++;
735 } else {
736 mathlevel--;
740 break;
741 case glyph_node:
742 if (is_simple_character(r)) {
743 chr = character(r) ;
744 if (chr == ex_hyphen_char) {
746 We only accept an explicit hyphen when there is a preceding glyph and
747 we skip a sequence of explicit hyphens as that normally indicates a
748 -- or --- ligature in which case we can in a worse case usage get bad
749 node lists later on due to messed up ligature building as these dashes
750 are ligatures in base fonts. This is a side effect of the separating the
751 hyphenation, ligaturing and kerning steps. A test is cmr with ------.
753 t = vlink(r) ;
754 if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
755 t = compound_word_break(r, char_lang(r));
756 subtype(t) = automatic_disc;
757 start_ok = 1 ;
758 } else {
759 start_ok = 0;
761 } else if (start_ok && (char_lang(r)>0) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
762 if (char_uchyph(r) || l == chr) {
763 return r;
764 } else {
765 start_ok = 0;
769 break;
770 default:
771 start_ok = 0;
772 break;
774 r = vlink(r);
776 return r;
779 @ @c
780 static int valid_wordend(halfword s)
782 register halfword r = s;
783 register int clang = char_lang(s);
784 if (r == null)
785 return 1;
786 while ((r != null) && ( (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
787 || (type(r) == kern_node && (subtype(r) == normal))
788 )) {
789 r = vlink(r);
791 if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
792 || type(r) == glue_node
793 || type(r) == boundary_node
794 || type(r) == whatsit_node
795 || type(r) == ins_node
796 || type(r) == adjust_node
797 || type(r) == penalty_node
798 || (type(r) == kern_node && (subtype(r) == explicit_kern ||
799 subtype(r) == italic_kern ||
800 subtype(r) == accent_kern )))
801 return 1;
802 return 0;
805 @ @c
806 void hnj_hyphenation(halfword head, halfword tail)
808 int lchar, i;
809 struct tex_language *lang;
810 lang_variables langdata;
811 char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
812 int wordlen = 0;
813 char *hy = utf8word;
814 char *replacement = NULL;
815 boolean explicit_hyphen = false;
816 halfword s, r = head, wordstart = null, save_tail1 = null, left =
817 null, right = null;
819 /* this first movement assures two things:
820 \item{a)} that we won't waste lots of time on something that has been
821 handled already (in that case, none of the glyphs match |simple_character|).
822 \item{b)} that the first word can be hyphenated. if the movement was
823 not explicit, then the indentation at the start of a paragraph
824 list would make |find_next_wordstart()| look too far ahead.
827 while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
828 r = vlink(r);
830 /* this will make |r| a glyph node with subtype character */
831 r = find_next_wordstart(r);
832 if (r == null)
833 return;
835 assert(tail != null);
836 save_tail1 = vlink(tail);
837 s = new_penalty(0);
838 couple_nodes(tail, s);
840 while (r != null) { /* could be while(1), but let's be paranoid */
841 int clang, lhmin, rhmin, hmin;
842 halfword hyf_font;
843 halfword end_word = r;
844 wordstart = r;
845 assert(is_simple_character(wordstart));
846 hyf_font = font(wordstart);
847 if (hyphen_char(hyf_font) < 0) /* for backward compat */
848 hyf_font = 0;
849 clang = char_lang(wordstart);
850 lhmin = char_lhmin(wordstart);
851 rhmin = char_rhmin(wordstart);
852 hmin = get_hyphenation_min(clang);
853 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
854 langdata.post_hyphen_char = get_post_hyphen_char(clang);
855 while (r != null && type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r) &&
856 (((clang > 0) && (lchar = get_hj_code(clang,character(r))) > 0) || (character(r) == ex_hyphen_char && (lchar = ex_hyphen_char)))) {
857 if (character(r) == ex_hyphen_char)
858 explicit_hyphen = true;
859 wordlen++;
860 hy = uni2string(hy, (unsigned) lchar);
861 /* this should not be needed any more */
862 /*if (vlink(r)!=null) alink(vlink(r))=r; */
863 end_word = r;
864 r = vlink(r);
866 if (valid_wordend(r) && wordlen >= lhmin + rhmin && (hmin <= 0 || wordlen >= hmin)
867 && (hyf_font != 0) && clang >=0 && (lang = tex_languages[clang]) != NULL) {
868 *hy = 0;
869 if (lang->exceptions != 0 &&
870 (replacement =
871 hyphenation_exception(lang->exceptions, utf8word)) != NULL) {
872 #ifdef VERBOSE
873 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
874 #endif
875 do_exception(wordstart, r, replacement);
876 free(replacement);
877 } else if (explicit_hyphen == true) {
878 /* insert an explicit discretionary after each of the last in a
879 set of explicit hyphens */
880 halfword rr = r;
881 halfword t = null;
882 #ifdef VERBOSE
883 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
884 #endif
885 while (rr != wordstart) {
886 if (is_simple_character(rr)) {
887 if (character(rr) == ex_hyphen_char) {
888 t = compound_word_break(rr, clang);
889 subtype(t) = automatic_disc;
890 while(character(alink(rr)) == ex_hyphen_char)
891 rr = alink(rr);
892 if (rr == wordstart)
893 break;
896 rr = alink(rr);
899 } else if (lang->patterns != NULL) {
901 left = wordstart;
902 for (i = lhmin; i > 1; i--) {
903 left = vlink(left);
904 while (!is_simple_character(left))
905 left = vlink(left);
907 right = r;
908 for (i = rhmin; i > 0; i--) {
909 right = alink(right);
910 while (!is_simple_character(right))
911 right = alink(right);
914 #ifdef VERBOSE
915 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
916 utf8word, clang, lhmin, rhmin, character(left),
917 character(right));
918 #endif
919 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
922 explicit_hyphen = false;
923 wordlen = 0;
924 hy = utf8word;
925 if (r == null)
926 break;
927 r = find_next_wordstart(r);
929 flush_node(vlink(tail));
930 vlink(tail) = save_tail1;
933 @ @c
934 void new_hyphenation(halfword head, halfword tail)
936 register int callback_id = 0;
937 if (head == null || vlink(head) == null)
938 return;
939 fix_node_list(head);
940 callback_id = callback_defined(hyphenate_callback);
941 if (callback_id > 0) {
942 lua_State *L = Luas;
943 if (!get_callback(L, callback_id)) {
944 lua_pop(L, 2);
945 return;
947 nodelist_to_lua(L, head);
948 nodelist_to_lua(L, tail);
949 if (lua_pcall(L, 2, 0, 0) != 0) {
950 formatted_warning("hyphenation","bad specification: %s",lua_tostring(L, -1));
951 lua_pop(L, 2);
952 lua_error(L);
953 return;
955 lua_pop(L, 1);
956 } else if (callback_id == 0) {
957 hnj_hyphenation(head, tail);
961 @ dumping and undumping languages
/* Dump the C string |a| as its length including the terminator, followed by
   its bytes; a |NULL| string is dumped as length zero. The macro uses the
   integer |x| of the enclosing function as scratch variable. It is wrapped
   so that it acts as one statement. */
#define dump_string(a) do {                  \
        if ((a) != NULL) {                   \
            x = (int) strlen(a) + 1;         \
            dump_int(x);                     \
            dump_things(*(a), x);            \
        } else {                             \
            x = 0;                           \
            dump_int(x);                     \
        }                                    \
    } while (0)
972 static void dump_one_language(int i)
974 char *s = NULL;
975 int x = 0;
976 struct tex_language *lang;
977 lang = tex_languages[i];
978 dump_int(lang->id);
979 dump_int(lang->pre_hyphen_char);
980 dump_int(lang->post_hyphen_char);
981 dump_int(lang->pre_exhyphen_char);
982 dump_int(lang->post_exhyphen_char);
983 dump_int(lang->hyphenation_min);
984 if (lang->patterns != NULL) {
985 s = (char *) hnj_serialize(lang->patterns);
987 dump_string(s);
988 if (s != NULL) {
989 free(s);
990 s = NULL;
992 if (lang->exceptions != 0)
993 s = exception_strings(lang);
994 dump_string(s);
995 if (s != NULL) {
996 free(s);
998 free(lang);
1001 void dump_language_data(void)
1003 int i;
1004 dump_int(next_lang_id);
1005 for (i = 0; i < next_lang_id; i++) {
1006 if (tex_languages[i]) {
1007 dump_int(1);
1008 dump_one_language(i);
1009 } else {
1010 dump_int(0);
1015 static void undump_one_language(int i)
1017 char *s = NULL;
1018 int x = 0;
1019 struct tex_language *lang = get_language(i);
1020 undump_int(x);
1021 lang->id = x;
1022 undump_int(x);
1023 lang->pre_hyphen_char = x;
1024 undump_int(x);
1025 lang->post_hyphen_char = x;
1026 undump_int(x);
1027 lang->pre_exhyphen_char = x;
1028 undump_int(x);
1029 lang->post_exhyphen_char = x;
1030 undump_int(x);
1031 lang->hyphenation_min = x;
1032 /* patterns */
1033 undump_int(x);
1034 if (x > 0) {
1035 s = xmalloc((unsigned) x);
1036 undump_things(*s, x);
1037 load_patterns(lang, (unsigned char *) s);
1038 free(s);
1040 /* exceptions */
1041 undump_int(x);
1042 if (x > 0) {
1043 s = xmalloc((unsigned) x);
1044 undump_things(*s, x);
1045 load_hyphenation(lang, (unsigned char *) s);
1046 free(s);
1050 void undump_language_data(void)
1052 int i, x, numlangs;
1053 undump_int(numlangs);
1054 next_lang_id = numlangs;
1055 for (i = 0; i < numlangs; i++) {
1056 undump_int(x);
1057 if (x == 1) {
1058 undump_one_language(i);
1063 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1064 |new_hyph_exceptions| to do the right thing.
1067 void new_hyph_exceptions(void)
1068 { /* enters new exceptions */
1069 (void) scan_toks(false, true);
1070 load_tex_hyphenation(int_par(language_code), def_ref);
1071 flush_list(def_ref);
1074 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1075 procedure named |new_patterns|.
1078 void new_patterns(void)
1079 { /* initializes the hyphenation pattern data */
1080 (void) scan_toks(false, true);
1081 load_tex_patterns(int_par(language_code), def_ref);
1082 flush_list(def_ref);
1085 @ `\.{\\prehyphenchar}', sets the |pre_break| character, and
1086 `\.{\\posthyphenchar}' the |post_break| character. Their respective
1087 defaults are ascii hyphen ("-") and zero (nul).
1090 void new_pre_hyphen_char(void)
1092 scan_optional_equals();
1093 scan_int();
1094 set_pre_hyphen_char(int_par(language_code), cur_val);
1097 void new_post_hyphen_char(void)
1099 scan_optional_equals();
1100 scan_int();
1101 set_post_hyphen_char(int_par(language_code), cur_val);
1104 @ `\.{\\preexhyphenchar}', sets the |pre_break| character, and
1105 `\.{\\postexhyphenchar}' the |post_break| character. Their
1106 defaults are both zero (nul).
1109 void new_pre_exhyphen_char(void)
1111 scan_optional_equals();
1112 scan_int();
1113 set_pre_exhyphen_char(int_par(language_code), cur_val);
1116 void new_post_exhyphen_char(void)
1118 scan_optional_equals();
1119 scan_int();
1120 set_post_exhyphen_char(int_par(language_code), cur_val);
1123 void new_hyphenation_min(void)
1125 scan_optional_equals();
1126 scan_int();
1127 set_hyphenation_min(int_par(language_code), cur_val);
1130 void new_hj_code(void)
1132 int i ;
1133 scan_int();
1134 i = cur_val;
1135 scan_optional_equals();
1136 scan_int();
1137 set_hj_code(int_par(language_code), i, cur_val, -1);