boundary nodes made consistent (cleanup and document): WARNING: bump the format numbe...
[luatex.git] / source / texk / web2c / luatexdir / lang / texlang.w
blob68cb0581cd218cff779ab63ba22527f0c64aa186
1 % texlang.w
3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
22 #include "ptexlib.h"
23 #include <string.h>
24 #include "lua/luatex-api.h"
26 @ Low-level helpers
28 @ @c
/*
    |VERBOSE| is deliberately left undefined (the macro below has a different
    name), so the |#ifdef VERBOSE| diagnostics in this file are compiled out.
*/
#define unVERBOSE

/* number of slots in the language table */
#define MAX_TEX_LANGUAGES 16384

/* shorthand for the current value of the |ex_hyphen_char| integer parameter */
#define ex_hyphen_char int_par(ex_hyphen_char_code)

/* language records indexed by language id; unused slots are NULL */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* one more than the highest id handed out so far; also where the search for a free slot starts */
static int next_lang_id = 0;
39 struct tex_language *new_language(int n)
41 struct tex_language *lang;
42 unsigned l;
43 if (n >= 0) {
44 l = (unsigned) n;
45 if (l != (MAX_TEX_LANGUAGES - 1))
46 if (next_lang_id <= n)
47 next_lang_id = n + 1;
48 } else {
49 while (tex_languages[next_lang_id] != NULL)
50 next_lang_id++;
51 l = (unsigned) next_lang_id++;
53 if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
54 lang = xmalloc(sizeof(struct tex_language));
55 tex_languages[l] = lang;
56 lang->id = (int) l;
57 lang->exceptions = 0;
58 lang->patterns = NULL;
59 lang->pre_hyphen_char = '-';
60 lang->post_hyphen_char = 0;
61 lang->pre_exhyphen_char = 0;
62 lang->post_exhyphen_char = 0;
63 lang->hyphenation_min = -1;
64 if (int_par(saving_hyph_codes_code)) {
65 hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
67 return lang;
68 } else {
69 return NULL;
73 struct tex_language *get_language(int n)
75 if (n >= 0 && n < MAX_TEX_LANGUAGES) {
76 if (tex_languages[n] != NULL) {
77 return tex_languages[n];
78 } else {
79 return new_language(n);
81 } else {
82 return NULL;
86 @ @c
87 void set_pre_hyphen_char(int n, int v)
89 struct tex_language *l = get_language((int) n);
90 if (l != NULL)
91 l->pre_hyphen_char = (int) v;
94 void set_post_hyphen_char(int n, int v)
96 struct tex_language *l = get_language((int) n);
97 if (l != NULL)
98 l->post_hyphen_char = (int) v;
101 void set_pre_exhyphen_char(int n, int v)
103 struct tex_language *l = get_language((int) n);
104 if (l != NULL)
105 l->pre_exhyphen_char = (int) v;
108 void set_post_exhyphen_char(int n, int v)
110 struct tex_language *l = get_language((int) n);
111 if (l != NULL)
112 l->post_exhyphen_char = (int) v;
115 int get_pre_hyphen_char(int n)
117 struct tex_language *l = get_language((int) n);
118 if (l == NULL)
119 return -1;
120 return (int) l->pre_hyphen_char;
123 int get_post_hyphen_char(int n)
125 struct tex_language *l = get_language((int) n);
126 if (l == NULL)
127 return -1;
128 return (int) l->post_hyphen_char;
131 int get_pre_exhyphen_char(int n)
133 struct tex_language *l = get_language((int) n);
134 if (l == NULL)
135 return -1;
136 return (int) l->pre_exhyphen_char;
139 int get_post_exhyphen_char(int n)
141 struct tex_language *l = get_language((int) n);
142 if (l == NULL)
143 return -1;
144 return (int) l->post_exhyphen_char;
147 void set_hyphenation_min(int n, int v)
149 struct tex_language *l = get_language((int) n);
150 if (l != NULL)
151 l->hyphenation_min = (int) v;
154 int get_hyphenation_min(int n)
156 struct tex_language *l = get_language((int) n);
157 if (l == NULL)
158 return -1;
159 return (int) l->hyphenation_min;
162 void load_patterns(struct tex_language *lang, const unsigned char *buff)
164 if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
165 return;
166 if (lang->patterns == NULL) {
167 lang->patterns = hnj_hyphen_new();
169 hnj_hyphen_load(lang->patterns, buff);
172 void clear_patterns(struct tex_language *lang)
174 if (lang == NULL)
175 return;
176 if (lang->patterns != NULL) {
177 hnj_hyphen_clear(lang->patterns);
181 void load_tex_patterns(int curlang, halfword head)
183 char *s = tokenlist_to_cstring(head, 1, NULL);
184 load_patterns(get_language(curlang), (unsigned char *) s);
187 @ @c
/*
    Append character |x| to the string being built at |uindex| (a variable
    that must exist at the point of use). The hj code of |x| in language |l|
    is stored when it is nonzero, otherwise |x| itself.
*/
#define STORE_CHAR(l,x) do {                                        \
    unsigned hj_ = get_hj_code((l),(x));                            \
    uindex = uni2string(uindex, hj_ ? hj_ : (unsigned) (x));        \
} while (0)
196 @ Cleans one word which is returned in |cleaned|, returns the new offset into
197 |buffer|
200 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
202 int items = 0;
203 unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
204 unsigned uword[MAX_WORD_LEN + 1] = { 0 }; /* work buffer for unicode */
205 int u = 0; /* unicode buffer value */
206 int i = 0; /* index into buffer */
207 char *uindex = (char *)word;
208 const char *s = buff;
210 while (*s && !isspace((unsigned char)*s)) {
211 word[i++] = (unsigned)*s;
212 s++;
213 if ((s-buff)>MAX_WORD_LEN) {
214 /* todo: this is too strict, should count unicode, not bytes */
215 *cleaned = NULL;
216 tex_error("exception too long", NULL);
217 return s;
220 /* now convert the input to unicode */
221 word[i] = '\0';
222 utf2uni_strcpy(uword, (const char *)word);
224 /* build the new word string */
225 i = 0;
226 while (uword[i]>0) {
227 u = uword[i++];
228 if (u == '-') { /* skip */
229 } else if (u == '=') {
230 STORE_CHAR(id,'-');
231 } else if (u == '{') {
232 u = uword[i++];
233 items = 0;
234 while (u && u != '}') {
235 u = uword[i++];
237 if (u == '}') {
238 items++;
239 u = uword[i++];
241 while (u && u != '}') {
242 u = uword[i++];
244 if (u == '}') {
245 items++;
246 u = uword[i++];
248 if (u == '{') {
249 u = uword[i++];
251 while (u && u != '}') {
252 STORE_CHAR(id,u);
253 u = uword[i++];
255 if (u == '}') {
256 items++;
258 if (items != 3) { /* syntax error */
259 *cleaned = NULL;
260 tex_error("exception syntax error", NULL);
261 return s;
263 } else {
264 STORE_CHAR(id,u);
267 *uindex = '\0';
268 *cleaned = xstrdup((char *) word);
269 return s;
272 @ @c
273 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
275 const char *s;
276 const char *value;
277 char *cleaned;
278 int id ;
279 lua_State *L = Luas;
280 if (lang == NULL)
281 return;
282 if (lang->exceptions == 0) {
283 lua_newtable(L);
284 lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
286 lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
287 s = (const char *) buff;
288 id = lang->id;
289 while (*s) {
290 while (isspace((unsigned char)*s))
291 s++;
292 if (*s) {
293 value = s;
294 s = clean_hyphenation(id, s, &cleaned);
295 if (cleaned != NULL) {
296 if ((s - value) > 0) {
297 lua_pushstring(L, cleaned);
298 lua_pushlstring(L, value, (size_t) (s - value));
299 lua_rawset(L, -3);
301 free(cleaned);
302 } else {
303 #ifdef VERBOSE
304 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
305 #endif
311 void clear_hyphenation(struct tex_language *lang)
313 if (lang == NULL)
314 return;
315 if (lang->exceptions != 0) {
316 luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
317 lang->exceptions = 0;
321 void load_tex_hyphenation(int curlang, halfword head)
323 char *s = tokenlist_to_cstring(head, 1, NULL);
324 load_hyphenation(get_language(curlang), (unsigned char *) s);
327 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very nice,
328 but needed. Also, in the post-break, it would be nicer to get the attribute list
329 from |vlink(n)|. No rush, as it is currently not used much.
332 halfword insert_discretionary(halfword t, halfword pre, halfword post,
333 halfword replace, int penalty)
335 halfword g, n;
336 int f;
337 n = new_node(disc_node, syllable_disc);
338 disc_penalty(n) = penalty;
339 try_couple_nodes(n, vlink(t));
340 couple_nodes(t, n);
341 if (replace != null)
342 f = font(replace);
343 else
344 f = get_cur_font(); /* for compound words following explicit hyphens */
345 for (g = pre; g != null; g = vlink(g)) {
346 font(g) = f;
347 if (node_attr(t) != null) {
348 delete_attribute_ref(node_attr(g));
349 node_attr(g) = node_attr(t);
350 attr_list_ref(node_attr(t)) += 1;
353 for (g = post; g != null; g = vlink(g)) {
354 font(g) = f;
355 if (node_attr(t) != null) {
356 delete_attribute_ref(node_attr(g));
357 node_attr(g) = node_attr(t);
358 attr_list_ref(node_attr(t)) += 1;
361 for (g = replace; g != null; g = vlink(g)) {
362 if (node_attr(t) != null) {
363 delete_attribute_ref(node_attr(g));
364 node_attr(g) = node_attr(t);
365 attr_list_ref(node_attr(t)) += 1;
368 if (node_attr(t) != null) {
369 delete_attribute_ref(node_attr(vlink(t)));
370 node_attr(vlink(t)) = node_attr(t);
371 attr_list_ref(node_attr(t)) += 1;
373 t = vlink(t);
374 set_disc_field(pre_break(t), pre);
375 set_disc_field(post_break(t), post);
376 set_disc_field(no_break(t), replace);
377 return t;
380 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
382 halfword g, n;
383 n = new_node(disc_node, syllable_disc);
384 disc_penalty(n) = int_par(hyphen_penalty_code);
385 couple_nodes(n, vlink(t));
386 couple_nodes(t, n);
387 delete_attribute_ref(node_attr(n));
388 if (node_attr(t) != null) {
389 node_attr(n) = node_attr(t);
390 attr_list_ref(node_attr(t))++;
391 } else {
392 node_attr(n) = null;
394 if (lan->pre_hyphen_char > 0) {
395 g = raw_glyph_node();
396 set_to_character(g);
397 character(g) = lan->pre_hyphen_char;
398 font(g) = font(t);
399 lang_data(g) = lang_data(t);
400 if (node_attr(t) != null) {
401 node_attr(g) = node_attr(t);
402 attr_list_ref(node_attr(t))++;
404 set_disc_field(pre_break(n), g);
407 if (lan->post_hyphen_char > 0) {
408 t = vlink(n);
409 g = raw_glyph_node();
410 set_to_character(g);
411 character(g) = lan->post_hyphen_char;
412 font(g) = font(t);
413 lang_data(g) = lang_data(t);
414 if (node_attr(t) != null) {
415 node_attr(g) = node_attr(t);
416 attr_list_ref(node_attr(t)) += 1;
418 set_disc_field(post_break(n), g);
420 return n;
423 halfword insert_word_discretionary(halfword t, lang_variables * lan)
425 halfword pre = null, pos = null;
426 if (lan->pre_exhyphen_char > 0)
427 pre = insert_character(null, lan->pre_exhyphen_char);
428 if (lan->post_exhyphen_char > 0)
429 pos = insert_character(null, lan->post_exhyphen_char);
430 return insert_discretionary(t, pre, pos, null,int_par(ex_hyphen_penalty_code));
433 @ @c
434 halfword compound_word_break(halfword t, int clang)
436 int disc;
437 lang_variables langdata;
438 langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
439 langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
440 disc = insert_word_discretionary(t, &langdata);
441 return disc;
444 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
445 halfword pre, halfword pos,
446 halfword replace)
448 (void) lan;
449 return insert_discretionary(t, pre, pos, replace,int_par(hyphen_penalty_code));
452 halfword insert_character(halfword t, int c)
454 halfword p;
455 p = new_node(glyph_node, 0);
456 set_to_character(p);
457 character(p) = c;
458 if (t != null) {
459 couple_nodes(t, p);
461 return p;
464 @ @c
465 void set_disc_field(halfword f, halfword t)
467 if (t != null) {
468 couple_nodes(f, t);
469 tlink(f) = tail_of_list(t);
470 } else {
471 vlink(f) = null;
472 tlink(f) = null;
476 @ @c
477 static char *hyphenation_exception(int exceptions, char *w)
479 char *ret = NULL;
480 lua_State *L = Luas;
481 lua_checkstack(L, 2);
482 lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
483 if (lua_istable(L, -1)) { /* ?? */
484 lua_pushstring(L, w); /* word table */
485 lua_rawget(L, -2);
486 if (lua_type(L, -1) == LUA_TSTRING) {
487 ret = xstrdup(lua_tostring(L, -1));
489 lua_pop(L, 2);
490 } else {
491 lua_pop(L, 1);
493 return ret;
496 @ @c
497 char *exception_strings(struct tex_language *lang)
499 const char *value;
500 size_t size = 0, current = 0;
501 size_t l = 0;
502 char *ret = NULL;
503 lua_State *L = Luas;
504 if (lang->exceptions == 0)
505 return NULL;
506 lua_checkstack(L, 2);
507 lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
508 if (lua_istable(L, -1)) {
509 /* iterate and join */
510 lua_pushnil(L); /* first key */
511 while (lua_next(L, -2) != 0) {
512 value = lua_tolstring(L, -1, &l);
513 if (current + 2 + l > size) {
514 ret = xrealloc(ret, (unsigned) ((size + size / 5) + current + l + 1024));
515 size = (size + size / 5) + current + l + 1024;
517 *(ret + current) = ' ';
518 strcpy(ret + current + 1, value);
519 current += l + 1;
520 lua_pop(L, 1);
523 return ret;
526 @ The sequence from |wordstart| to |r| can contain only normal characters. It
527 could be faster to modify a halfword pointer and return an integer.
530 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
532 halfword g = null, gg = null;
533 register unsigned i = *j;
534 i++; /* this puts uword[i] on the |{| */
535 while (i < (unsigned) len && uword[i + 1] != '}') {
536 if (g == null) {
537 gg = new_char(0, (int) uword[i + 1]);
538 g = gg;
539 } else {
540 halfword s = new_char(0, (int) uword[i + 1]);
541 couple_nodes(g, s);
542 g = vlink(g);
544 i++;
546 *j = ++i;
547 return gg;
/*
    Count the characters in one braced group of an exception. On entry
    |uword[*j + 1]| is the opening brace; on exit |*j| is the index of the
    closing brace, or lies past |len| when no closing brace was found.
*/
static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int count = 0;
    register unsigned pos = *j + 1;     /* |uword[pos]| is the |{| */
    for (; pos < (unsigned) len && uword[pos + 1] != '}'; pos++) {
        count++;
    }
    *j = pos + 1;
    return count;
}
563 @ @c
/* help text passed to |tex_error| for malformed exception discretionaries; NULL terminated */
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};
571 The exceptions are taken as-is: no min values are taken into account. One can
572 add normal patterns on-the-fly if needed.
575 static void do_exception(halfword wordstart, halfword r, char *replacement)
577 unsigned i;
578 halfword t;
579 unsigned len;
580 int clang;
581 lang_variables langdata;
582 unsigned uword[MAX_WORD_LEN + 1] = { 0 };
583 utf2uni_strcpy(uword, replacement);
584 len = u_length(uword);
585 i = 0;
586 t = wordstart;
587 clang = char_lang(wordstart);
588 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
589 langdata.post_hyphen_char = get_post_hyphen_char(clang);
591 for (i = 0; i < len; i++) {
592 if (uword[i + 1] == '-') { /* a hyphen follows */
593 while (vlink(t) != r && (type(t) != glyph_node || !is_simple_character(t)))
594 t = vlink(t);
595 if (vlink(t) == r)
596 break;
597 insert_syllable_discretionary(t, &langdata);
598 t = vlink(t); /* skip the new disc */
599 } else if (uword[i + 1] == '=') {
600 /* do nothing ? */
601 t = vlink(t);
602 } else if (uword[i + 1] == '{') {
603 halfword gg, hh, replace = null;
604 int repl;
605 gg = find_exception_part(&i, uword, (int) len);
606 if (i == len || uword[i + 1] != '{') {
607 tex_error("broken pattern 1", PAT_ERROR);
609 hh = find_exception_part(&i, uword, (int) len);
610 if (i == len || uword[i + 1] != '{') {
611 tex_error("broken pattern 2", PAT_ERROR);
613 repl = count_exception_part(&i, uword, (int) len);
614 if (i == len) {
615 tex_error("broken pattern 3", PAT_ERROR);
617 /*i++; *//* jump over the last right brace */
618 if (vlink(t) == r)
619 break;
620 if (repl > 0) {
621 halfword q = t;
622 replace = vlink(q);
623 while (repl > 0 && q != null) {
624 q = vlink(q);
625 if (type(q) == glyph_node) {
626 repl--;
629 try_couple_nodes(t, vlink(q));
630 vlink(q) = null;
632 t = insert_discretionary(t, gg, hh, replace,int_par(hyphen_penalty_code));
633 t = vlink(t); /* skip the new disc */
634 } else {
635 t = vlink(t);
640 @ This is a documentation section from the pascal web file. It is not true any
641 more, but I do not have time right now to rewrite it -- Taco
643 When the line-breaking routine is unable to find a feasible sequence of
644 breakpoints, it makes a second pass over the paragraph, attempting to hyphenate
645 the hyphenatable words. The goal of hyphenation is to insert discretionary
646 material into the paragraph so that there are more potential places to break.
648 The general rules for hyphenation are somewhat complex and technical, because we
649 want to be able to hyphenate words that are preceded or followed by punctuation
650 marks, and because we want the rules to work for languages other than English. We
651 also must contend with the fact that hyphens might radically alter the ligature
652 and kerning structure of a word.
654 A sequence of characters will be considered for hyphenation only if it belongs to
655 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
656 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are
657 either character or ligature or whatsit or implicit kern nodes, and $p_m$ is a
658 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
659 (Therefore hyphenation is disabled by boxes, math formulas, and discretionary
660 nodes already inserted by the user.) The ligature nodes among $p_1\ldots p_{m-1}$
661 are effectively expanded into the original non-ligature characters; the kern
662 nodes and whatsits are ignored. Each character |c| is now classified as either a
663 nonletter (if |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an
664 uppercase letter (otherwise); an uppercase letter is treated as if it were
665 |lc_code(c)| for purposes of hyphenation. The characters generated by $p_1\ldots
666 p_{m-1}$ may begin with nonletters; let $c_1$ be the first letter that is not in
667 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a whatsit
668 found after $c_1$ will be the terminating node $p_m$. All characters that do not
669 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
670 that font must be between 0 and 255, otherwise hyphenation will not be attempted.
671 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible;
672 however, |n| must be less than 64, so a character that would otherwise be
673 $c_{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
674 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
675 generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|,
676 this string qualifies for hyphenation; however, |uc_hyph| must be positive, if
677 $c_1$ is uppercase.
679 The hyphenation process takes place in three stages. First, the candidate
680 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens are
681 determined by referring to hyphenation tables; and finally, the nodes $p_a\ldots
682 p_b$ are replaced by a new sequence of nodes that includes the discretionary
683 breaks found.
685 Fortunately, we do not have to do all this calculation very often, because of the
686 way it has been taken out of \TeX's inner loop. For example, when the second
687 edition of the author's 700-page book {\sl Seminumerical Algorithms} was typeset
688 by \TeX, only about 1.2 hyphenations needed to be @^Knuth, Donald Ervin@> tried
689 per paragraph, since the line breaking algorithm needed to use two passes on only
690 about 5 per cent of the paragraphs.
692 When a word has been set up to contain a candidate for hyphenation, \TeX\ first looks
693 to see if it is in the user's exception dictionary. If not, hyphens are inserted
694 based on patterns that appear within the given word, using an algorithm due to
695 Frank~M. Liang. @^Liang, Franklin Mark@>
697 @ This is incompatible with TEX because the first word of a paragraph can be
698 hyphenated, but most european users seem to agree that prohibiting hyphenation
699 there was not the best idea ever.
702 static halfword find_next_wordstart(halfword r, halfword first_language)
704 register int l;
705 register int start_ok = 1;
706 int mathlevel = 1;
707 int chr ;
708 halfword t ;
709 while (r != null) {
710 switch (type(r)) {
711 case boundary_node:
712 if (subtype(r) == word_boundary) {
713 start_ok = 1;
715 break;
716 case whatsit_node:
717 break;
718 case glue_node:
719 start_ok = 1;
720 break;
721 case math_node:
722 while (mathlevel > 0) {
723 r = vlink(r);
724 if (r == null)
725 return r;
726 if (type(r) == math_node) {
727 if (subtype(r) == before) {
728 mathlevel++;
729 } else {
730 mathlevel--;
734 break;
735 case glyph_node:
736 if (is_simple_character(r)) {
737 chr = character(r) ;
738 if (chr == ex_hyphen_char) {
740 We only accept an explicit hyphen when there is a preceding glyph and
741 we skip a sequence of explicit hyphens as that normally indicates a
742 -- or --- ligature in which case we can in a worse case usage get bad
743 node lists later on due to messed up ligature building as these dashes
744 are ligatures in base fonts. This is a side effect of the separating the
745 hyphenation, ligaturing and kerning steps. A test is cmr with ------.
747 t = vlink(r) ;
748 if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
749 t = compound_word_break(r, char_lang(r));
750 subtype(t) = automatic_disc;
751 start_ok = 1 ;
752 } else {
753 start_ok = 0;
755 } else if (start_ok && (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
756 if (char_uchyph(r) || l == chr) {
757 return r;
758 } else {
759 start_ok = 0;
763 break;
764 default:
765 start_ok = 0;
766 break;
768 r = vlink(r);
770 return r;
773 @ @c
774 static int valid_wordend(halfword s)
776 register halfword r = s;
777 register int clang = char_lang(s);
778 if (r == null)
779 return 1;
780 while ((r != null) && ( (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
781 || (type(r) == kern_node && (subtype(r) == normal))
782 )) {
783 r = vlink(r);
785 if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
786 || type(r) == glue_node
787 || type(r) == boundary_node
788 || type(r) == whatsit_node
789 || type(r) == ins_node
790 || type(r) == adjust_node
791 || type(r) == penalty_node
792 || (type(r) == kern_node && (subtype(r) == explicit_kern ||
793 subtype(r) == italic_kern ||
794 subtype(r) == accent_kern )))
795 return 1;
796 return 0;
799 @ @c
800 void hnj_hyphenation(halfword head, halfword tail)
802 int lchar, i;
803 struct tex_language *lang;
804 lang_variables langdata;
805 char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
806 int wordlen = 0;
807 char *hy = utf8word;
808 char *replacement = NULL;
809 boolean explicit_hyphen = false;
810 halfword first_language = int_par(first_valid_language_code);
811 halfword s, r = head, wordstart = null, save_tail1 = null, left = null, right = null;
813 /* this first movement assures two things:
814 \item{a)} that we won't waste lots of time on something that has been
815 handled already (in that case, none of the glyphs match |simple_character|).
816 \item{b)} that the first word can be hyphenated. if the movement was
817 not explicit, then the indentation at the start of a paragraph
818 list would make |find_next_wordstart()| look too far ahead.
821 while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
822 r = vlink(r);
824 /* this will make |r| a glyph node with subtype character */
825 r = find_next_wordstart(r,first_language);
826 if (r == null)
827 return;
829 assert(tail != null);
830 save_tail1 = vlink(tail);
831 s = new_penalty(0);
832 couple_nodes(tail, s);
834 while (r != null) { /* could be while(1), but let's be paranoid */
835 int clang, lhmin, rhmin, hmin;
836 halfword hyf_font;
837 halfword end_word = r;
838 wordstart = r;
839 assert(is_simple_character(wordstart));
840 hyf_font = font(wordstart);
841 if (hyphen_char(hyf_font) < 0) /* for backward compat */
842 hyf_font = 0;
843 clang = char_lang(wordstart);
844 lhmin = char_lhmin(wordstart);
845 rhmin = char_rhmin(wordstart);
846 hmin = get_hyphenation_min(clang);
847 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
848 langdata.post_hyphen_char = get_post_hyphen_char(clang);
849 while ( r != null
850 && type(r) == glyph_node
851 && is_simple_character(r)
852 && clang == char_lang(r)
853 && ( ( (clang >= first_language)
854 && (lchar = get_hj_code(clang,character(r))) > 0
856 || ( character(r) == ex_hyphen_char
857 && (lchar = ex_hyphen_char)
861 if (character(r) == ex_hyphen_char)
862 explicit_hyphen = true;
863 wordlen++;
864 hy = uni2string(hy, (unsigned) lchar);
865 /* this should not be needed any more */
866 /*if (vlink(r)!=null) alink(vlink(r))=r; */
867 end_word = r;
868 r = vlink(r);
870 if ( valid_wordend(r)
871 && clang >= first_language
872 && wordlen >= lhmin + rhmin
873 && (hmin <= 0 || wordlen >= hmin)
874 && (hyf_font != 0)
875 && (lang = tex_languages[clang]) != NULL
877 *hy = 0;
878 if ( lang->exceptions != 0
879 && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
881 #ifdef VERBOSE
882 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
883 #endif
884 do_exception(wordstart, r, replacement);
885 free(replacement);
886 } else if (explicit_hyphen == true) {
887 /* insert an explicit discretionary after each of the last in a
888 set of explicit hyphens */
889 halfword rr = r;
890 halfword t = null;
891 #ifdef VERBOSE
892 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
893 #endif
894 while (rr != wordstart) {
895 if (is_simple_character(rr)) {
896 if (character(rr) == ex_hyphen_char) {
897 t = compound_word_break(rr, clang);
898 subtype(t) = automatic_disc;
899 while (character(alink(rr)) == ex_hyphen_char)
900 rr = alink(rr);
901 if (rr == wordstart)
902 break;
905 rr = alink(rr);
907 } else if (lang->patterns != NULL) {
908 left = wordstart;
909 for (i = lhmin; i > 1; i--) {
910 left = vlink(left);
911 while (!is_simple_character(left))
912 left = vlink(left);
914 right = r;
915 for (i = rhmin; i > 0; i--) {
916 right = alink(right);
917 while (!is_simple_character(right))
918 right = alink(right);
920 #ifdef VERBOSE
921 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
922 utf8word, clang, lhmin, rhmin, character(left), character(right));
923 #endif
924 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
927 explicit_hyphen = false;
928 wordlen = 0;
929 hy = utf8word;
930 if (r == null)
931 break;
932 r = find_next_wordstart(r,first_language);
934 flush_node(vlink(tail));
935 vlink(tail) = save_tail1;
938 @ @c
939 void new_hyphenation(halfword head, halfword tail)
941 register int callback_id = 0;
942 if (head == null || vlink(head) == null)
943 return;
944 fix_node_list(head);
945 callback_id = callback_defined(hyphenate_callback);
946 if (callback_id > 0) {
947 lua_State *L = Luas;
948 if (!get_callback(L, callback_id)) {
949 lua_pop(L, 2);
950 return;
952 nodelist_to_lua(L, head);
953 nodelist_to_lua(L, tail);
954 if (lua_pcall(L, 2, 0, 0) != 0) {
955 formatted_warning("hyphenation","bad specification: %s",lua_tostring(L, -1));
956 lua_pop(L, 2);
957 lua_error(L);
958 return;
960 lua_pop(L, 1);
961 } else if (callback_id == 0) {
962 hnj_hyphenation(head, tail);
966 @ dumping and undumping languages
/*
    Dump C string |a| as its length including the terminator, followed by
    its bytes; a NULL string is dumped as length 0. An |int x| must be in
    scope at the point of use.
*/
#define dump_string(a) do {                 \
    if ((a) == NULL) {                      \
        x = 0; dump_int(x);                 \
    } else {                                \
        x = (int)strlen(a)+1;               \
        dump_int(x); dump_things(*(a), x);  \
    }                                       \
} while (0)
977 static void dump_one_language(int i)
979 char *s = NULL;
980 int x = 0;
981 struct tex_language *lang;
982 lang = tex_languages[i];
983 dump_int(lang->id);
984 dump_int(lang->pre_hyphen_char);
985 dump_int(lang->post_hyphen_char);
986 dump_int(lang->pre_exhyphen_char);
987 dump_int(lang->post_exhyphen_char);
988 dump_int(lang->hyphenation_min);
989 if (lang->patterns != NULL) {
990 s = (char *) hnj_serialize(lang->patterns);
992 dump_string(s);
993 if (s != NULL) {
994 free(s);
995 s = NULL;
997 if (lang->exceptions != 0)
998 s = exception_strings(lang);
999 dump_string(s);
1000 if (s != NULL) {
1001 free(s);
1003 free(lang);
1006 void dump_language_data(void)
1008 int i;
1009 dump_int(next_lang_id);
1010 for (i = 0; i < next_lang_id; i++) {
1011 if (tex_languages[i]) {
1012 dump_int(1);
1013 dump_one_language(i);
1014 } else {
1015 dump_int(0);
1020 static void undump_one_language(int i)
1022 char *s = NULL;
1023 int x = 0;
1024 struct tex_language *lang = get_language(i);
1025 undump_int(x);
1026 lang->id = x;
1027 undump_int(x);
1028 lang->pre_hyphen_char = x;
1029 undump_int(x);
1030 lang->post_hyphen_char = x;
1031 undump_int(x);
1032 lang->pre_exhyphen_char = x;
1033 undump_int(x);
1034 lang->post_exhyphen_char = x;
1035 undump_int(x);
1036 lang->hyphenation_min = x;
1037 /* patterns */
1038 undump_int(x);
1039 if (x > 0) {
1040 s = xmalloc((unsigned) x);
1041 undump_things(*s, x);
1042 load_patterns(lang, (unsigned char *) s);
1043 free(s);
1045 /* exceptions */
1046 undump_int(x);
1047 if (x > 0) {
1048 s = xmalloc((unsigned) x);
1049 undump_things(*s, x);
1050 load_hyphenation(lang, (unsigned char *) s);
1051 free(s);
1055 void undump_language_data(void)
1057 int i, x, numlangs;
1058 undump_int(numlangs);
1059 next_lang_id = numlangs;
1060 for (i = 0; i < numlangs; i++) {
1061 undump_int(x);
1062 if (x == 1) {
1063 undump_one_language(i);
1068 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1069 |new_hyph_exceptions| to do the right thing.
1072 void new_hyph_exceptions(void)
1073 { /* enters new exceptions */
1074 (void) scan_toks(false, true);
1075 load_tex_hyphenation(int_par(language_code), def_ref);
1076 flush_list(def_ref);
1079 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1080 procedure named |new_patterns|.
1083 void new_patterns(void)
1084 { /* initializes the hyphenation pattern data */
1085 (void) scan_toks(false, true);
1086 load_tex_patterns(int_par(language_code), def_ref);
1087 flush_list(def_ref);
1090 @ `\.{\\prehyphenchar}', sets the |pre_break| character, and
1091 `\.{\\posthyphenchar}' the |post_break| character. Their respective defaults are
1092 ascii hyphen ("-") and zero (nul).
1095 void new_pre_hyphen_char(void)
1097 scan_optional_equals();
1098 scan_int();
1099 set_pre_hyphen_char(int_par(language_code), cur_val);
1102 void new_post_hyphen_char(void)
1104 scan_optional_equals();
1105 scan_int();
1106 set_post_hyphen_char(int_par(language_code), cur_val);
1109 @ `\.{\\preexhyphenchar}', sets the |pre_break| character, and
1110 `\.{\\postexhyphenchar}' the |post_break| character. Their defaults are both zero
1111 (nul).
1114 void new_pre_exhyphen_char(void)
1116 scan_optional_equals();
1117 scan_int();
1118 set_pre_exhyphen_char(int_par(language_code), cur_val);
1121 void new_post_exhyphen_char(void)
1123 scan_optional_equals();
1124 scan_int();
1125 set_post_exhyphen_char(int_par(language_code), cur_val);
1128 void new_hyphenation_min(void)
1130 scan_optional_equals();
1131 scan_int();
1132 set_hyphenation_min(int_par(language_code), cur_val);
1135 void new_hj_code(void)
1137 int i ;
1138 scan_int();
1139 i = cur_val;
1140 scan_optional_equals();
1141 scan_int();
1142 set_hj_code(int_par(language_code), i, cur_val, -1);