fix getsup (HH)
[luatex.git] / source / texk / web2c / luatexdir / lang / texlang.w
blobcf88928e8d4c4c711297bb4c1ce71bd850c5d93a
1 % texlang.w
3 % Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
22 #include "ptexlib.h"
23 #include <string.h>
24 #include "lua/luatex-api.h"
26 @ Low-level helpers
28 @ @c
/* Only |unVERBOSE| is defined here, so the |#ifdef VERBOSE| diagnostics in this
   file stay compiled out unless VERBOSE is defined elsewhere (TODO confirm). */
29 #define unVERBOSE
/* Number of slots in the language table below. */
31 #define MAX_TEX_LANGUAGES 16384
/* Language records indexed by language id; a NULL entry is an unused slot. */
33 static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };
/* Where |new_language| starts looking for a free slot when called with a negative id;
   also the number of slots written by |dump_language_data|. */
35 static int next_lang_id = 0;
37 struct tex_language *new_language(int n)
39 struct tex_language *lang;
40 unsigned l;
41 if (n >= 0) {
42 l = (unsigned) n;
43 if (l != (MAX_TEX_LANGUAGES - 1))
44 if (next_lang_id <= n)
45 next_lang_id = n + 1;
46 } else {
47 while (tex_languages[next_lang_id] != NULL)
48 next_lang_id++;
49 l = (unsigned) next_lang_id++;
51 if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
52 lang = xmalloc(sizeof(struct tex_language));
53 tex_languages[l] = lang;
54 lang->id = (int) l;
55 lang->exceptions = 0;
56 lang->patterns = NULL;
57 lang->pre_hyphen_char = '-';
58 lang->post_hyphen_char = 0;
59 lang->pre_exhyphen_char = 0;
60 lang->post_exhyphen_char = 0;
61 lang->hyphenation_min = -1;
62 if (saving_hyph_codes_par) {
63 hj_codes_from_lc_codes(l); /* for now, we might just use specific value for whatever task */
65 return lang;
66 } else {
67 return NULL;
71 struct tex_language *get_language(int n)
73 if (n >= 0 && n < MAX_TEX_LANGUAGES) {
74 if (tex_languages[n] != NULL) {
75 return tex_languages[n];
76 } else {
77 return new_language(n);
79 } else {
80 return NULL;
84 @ @c
85 void set_pre_hyphen_char(int n, int v)
87 struct tex_language *l = get_language((int) n);
88 if (l != NULL)
89 l->pre_hyphen_char = (int) v;
92 void set_post_hyphen_char(int n, int v)
94 struct tex_language *l = get_language((int) n);
95 if (l != NULL)
96 l->post_hyphen_char = (int) v;
99 void set_pre_exhyphen_char(int n, int v)
101 struct tex_language *l = get_language((int) n);
102 if (l != NULL)
103 l->pre_exhyphen_char = (int) v;
106 void set_post_exhyphen_char(int n, int v)
108 struct tex_language *l = get_language((int) n);
109 if (l != NULL)
110 l->post_exhyphen_char = (int) v;
113 int get_pre_hyphen_char(int n)
115 struct tex_language *l = get_language((int) n);
116 if (l == NULL)
117 return -1;
118 return (int) l->pre_hyphen_char;
121 int get_post_hyphen_char(int n)
123 struct tex_language *l = get_language((int) n);
124 if (l == NULL)
125 return -1;
126 return (int) l->post_hyphen_char;
129 int get_pre_exhyphen_char(int n)
131 struct tex_language *l = get_language((int) n);
132 if (l == NULL)
133 return -1;
134 return (int) l->pre_exhyphen_char;
137 int get_post_exhyphen_char(int n)
139 struct tex_language *l = get_language((int) n);
140 if (l == NULL)
141 return -1;
142 return (int) l->post_exhyphen_char;
145 void set_hyphenation_min(int n, int v)
147 struct tex_language *l = get_language((int) n);
148 if (l != NULL)
149 l->hyphenation_min = (int) v;
152 int get_hyphenation_min(int n)
154 struct tex_language *l = get_language((int) n);
155 if (l == NULL)
156 return -1;
157 return (int) l->hyphenation_min;
160 void load_patterns(struct tex_language *lang, const unsigned char *buff)
162 if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
163 return;
164 if (lang->patterns == NULL) {
165 lang->patterns = hnj_hyphen_new();
167 hnj_hyphen_load(lang->patterns, buff);
170 void clear_patterns(struct tex_language *lang)
172 if (lang == NULL)
173 return;
174 if (lang->patterns != NULL) {
175 hnj_hyphen_clear(lang->patterns);
179 void load_tex_patterns(int curlang, halfword head)
181 char *s = tokenlist_to_cstring(head, 1, NULL);
182 load_patterns(get_language(curlang), (unsigned char *) s);
185 @ @c
/*
    Append character |x| to the output cursor |uindex| (a variable of the
    expanding function), after mapping it through the hj code of language |l|.
    Codes of 32 or less are not used as substitutes: the character itself is
    stored instead.
*/
#define STORE_CHAR(l,x) do {                    \
    unsigned xx = get_hj_code((l),(x));         \
    if (xx <= 32) {                             \
        xx = (x);                               \
    }                                           \
    uindex = uni2string(uindex, xx);            \
} while (0)
194 @ Cleans one word, which is returned in |cleaned|, and returns the new offset into
195 |buff|.
198 const char *clean_hyphenation(int id, const char *buff, char **cleaned)
200 int items = 0;
201 unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
202 unsigned uword[MAX_WORD_LEN + 1] = { 0 }; /* work buffer for unicode */
203 int u = 0; /* unicode buffer value */
204 int i = 0; /* index into buffer */
205 char *uindex = (char *)word;
206 const char *s = buff;
208 while (*s && !isspace((unsigned char)*s)) {
209 word[i++] = (unsigned)*s;
210 s++;
211 if ((s-buff)>MAX_WORD_LEN) {
212 /* todo: this is too strict, should count unicode, not bytes */
213 *cleaned = NULL;
214 tex_error("exception too long", NULL);
215 return s;
218 /* now convert the input to unicode */
219 word[i] = '\0';
220 utf2uni_strcpy(uword, (const char *)word);
222 /* build the new word string */
223 i = 0;
224 while (uword[i]>0) {
225 u = uword[i++];
226 if (u == '-') { /* skip */
227 } else if (u == '=') {
228 STORE_CHAR(id,'-');
229 } else if (u == '{') {
230 u = uword[i++];
231 items = 0;
232 while (u && u != '}') {
233 u = uword[i++];
235 if (u == '}') {
236 items++;
237 u = uword[i++];
239 while (u && u != '}') {
240 u = uword[i++];
242 if (u == '}') {
243 items++;
244 u = uword[i++];
246 if (u == '{') {
247 u = uword[i++];
249 while (u && u != '}') {
250 STORE_CHAR(id,u);
251 u = uword[i++];
253 if (u == '}') {
254 items++;
256 if (items != 3) { /* syntax error */
257 *cleaned = NULL;
258 tex_error("exception syntax error", NULL);
259 return s;
261 } else {
262 STORE_CHAR(id,u);
265 *uindex = '\0';
266 *cleaned = xstrdup((char *) word);
267 return s;
270 @ @c
271 void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
273 const char *s;
274 const char *value;
275 char *cleaned;
276 int id ;
277 if (lang == NULL)
278 return;
279 if (lang->exceptions == 0) {
280 lua_newtable(Luas);
281 lang->exceptions = luaL_ref(Luas, LUA_REGISTRYINDEX);
283 lua_rawgeti(Luas, LUA_REGISTRYINDEX, lang->exceptions);
284 s = (const char *) buff;
285 id = lang->id;
286 while (*s) {
287 while (isspace((unsigned char)*s))
288 s++;
289 if (*s) {
290 value = s;
291 s = clean_hyphenation(id, s, &cleaned);
292 if (cleaned != NULL) {
293 if ((s - value) > 0) {
294 lua_pushstring(Luas, cleaned);
295 lua_pushlstring(Luas, value, (size_t) (s - value));
296 lua_rawset(Luas, -3);
298 free(cleaned);
299 } else {
300 #ifdef VERBOSE
301 formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value);
302 #endif
308 void clear_hyphenation(struct tex_language *lang)
310 if (lang == NULL)
311 return;
312 if (lang->exceptions != 0) {
313 luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
314 lang->exceptions = 0;
318 void load_tex_hyphenation(int curlang, halfword head)
320 char *s = tokenlist_to_cstring(head, 1, NULL);
321 load_hyphenation(get_language(curlang), (unsigned char *) s);
324 @ TODO: clean this up. The |delete_attribute_ref()| statements are not very nice,
325 but needed. Also, in the post-break, it would be nicer to get the attribute list
326 from |vlink(n)|. No rush, as it is currently not used much.
329 halfword insert_discretionary(halfword t, halfword pre, halfword post,
330 halfword replace, int penalty)
332 halfword g, n;
333 int f;
334 n = new_node(disc_node, syllable_disc);
335 disc_penalty(n) = penalty;
336 try_couple_nodes(n, vlink(t));
337 couple_nodes(t, n);
338 if (replace != null)
339 f = font(replace);
340 else
341 f = get_cur_font(); /* for compound words following explicit hyphens */
342 for (g = pre; g != null; g = vlink(g)) {
343 font(g) = f;
344 if (node_attr(t) != null) {
345 delete_attribute_ref(node_attr(g));
346 node_attr(g) = node_attr(t);
347 attr_list_ref(node_attr(t)) += 1;
350 for (g = post; g != null; g = vlink(g)) {
351 font(g) = f;
352 if (node_attr(t) != null) {
353 delete_attribute_ref(node_attr(g));
354 node_attr(g) = node_attr(t);
355 attr_list_ref(node_attr(t)) += 1;
358 for (g = replace; g != null; g = vlink(g)) {
359 if (node_attr(t) != null) {
360 delete_attribute_ref(node_attr(g));
361 node_attr(g) = node_attr(t);
362 attr_list_ref(node_attr(t)) += 1;
365 if (node_attr(t) != null) {
366 delete_attribute_ref(node_attr(vlink(t)));
367 node_attr(vlink(t)) = node_attr(t);
368 attr_list_ref(node_attr(t)) += 1;
370 t = vlink(t);
371 set_disc_field(pre_break(t), pre);
372 set_disc_field(post_break(t), post);
373 set_disc_field(no_break(t), replace);
374 return t;
377 halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
379 halfword g, n;
380 n = new_node(disc_node, syllable_disc);
381 disc_penalty(n) = hyphen_penalty_par;
382 couple_nodes(n, vlink(t));
383 couple_nodes(t, n);
384 delete_attribute_ref(node_attr(n));
385 if (node_attr(t) != null) {
386 node_attr(n) = node_attr(t);
387 attr_list_ref(node_attr(t))++;
388 } else {
389 node_attr(n) = null;
391 if (lan->pre_hyphen_char > 0) {
392 g = raw_glyph_node();
393 set_to_character(g);
394 character(g) = lan->pre_hyphen_char;
395 font(g) = font(t);
396 lang_data(g) = lang_data(t);
397 if (node_attr(t) != null) {
398 node_attr(g) = node_attr(t);
399 attr_list_ref(node_attr(t))++;
401 set_disc_field(pre_break(n), g);
404 if (lan->post_hyphen_char > 0) {
405 t = vlink(n);
406 g = raw_glyph_node();
407 set_to_character(g);
408 character(g) = lan->post_hyphen_char;
409 font(g) = font(t);
410 lang_data(g) = lang_data(t);
411 if (node_attr(t) != null) {
412 node_attr(g) = node_attr(t);
413 attr_list_ref(node_attr(t)) += 1;
415 set_disc_field(post_break(n), g);
417 return n;
420 halfword insert_word_discretionary(halfword t, lang_variables * lan)
422 halfword pre = null, pos = null;
423 if (lan->pre_exhyphen_char > 0)
424 pre = insert_character(null, lan->pre_exhyphen_char);
425 if (lan->post_exhyphen_char > 0)
426 pos = insert_character(null, lan->post_exhyphen_char);
427 return insert_discretionary(t, pre, pos, null,ex_hyphen_penalty_par);
430 @ @c
431 halfword compound_word_break(halfword t, int clang)
433 int disc;
434 lang_variables langdata;
435 langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
436 langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
437 disc = insert_word_discretionary(t, &langdata);
438 return disc;
441 halfword insert_complex_discretionary(halfword t, lang_variables * lan,
442 halfword pre, halfword pos,
443 halfword replace)
445 (void) lan;
446 return insert_discretionary(t, pre, pos, replace,hyphen_penalty_par);
449 halfword insert_character(halfword t, int c)
451 halfword p;
452 p = new_node(glyph_node, 0);
453 set_to_character(p);
454 character(p) = c;
455 if (t != null) {
456 couple_nodes(t, p);
458 return p;
461 @ @c
462 void set_disc_field(halfword f, halfword t)
464 if (t != null) {
466 couple_nodes(f, t); // better not expose f as prev pointer
468 vlink(f) = t ;
469 alink(t) = null ;
470 tlink(f) = tail_of_list(t);
471 } else {
472 vlink(f) = null;
473 tlink(f) = null;
477 @ @c
478 static char *hyphenation_exception(int exceptions, char *w)
480 char *ret = NULL;
481 lua_checkstack(Luas, 2);
482 lua_rawgeti(Luas, LUA_REGISTRYINDEX, exceptions);
483 if (lua_istable(Luas, -1)) { /* ?? */
484 lua_pushstring(Luas, w); /* word table */
485 lua_rawget(Luas, -2);
486 if (lua_type(Luas, -1) == LUA_TSTRING) {
487 ret = xstrdup(lua_tostring(Luas, -1));
489 lua_pop(Luas, 2);
490 } else {
491 lua_pop(Luas, 1);
493 return ret;
496 @ @c
497 char *exception_strings(struct tex_language *lang)
499 const char *value;
500 size_t size = 0, current = 0;
501 size_t l = 0;
502 char *ret = NULL;
503 if (lang->exceptions == 0)
504 return NULL;
505 lua_checkstack(Luas, 2);
506 lua_rawgeti(Luas, LUA_REGISTRYINDEX, lang->exceptions);
507 if (lua_istable(Luas, -1)) {
508 /* iterate and join */
509 lua_pushnil(Luas); /* first key */
510 while (lua_next(Luas, -2) != 0) {
511 value = lua_tolstring(Luas, -1, &l);
512 if (current + 2 + l > size) {
513 ret = xrealloc(ret, (unsigned) ((size + size / 5) + current + l + 1024));
514 size = (size + size / 5) + current + l + 1024;
516 *(ret + current) = ' ';
517 strcpy(ret + current + 1, value);
518 current += l + 1;
519 lua_pop(Luas, 1);
522 return ret;
525 @ The sequence from |wordstart| to |r| can contain only normal characters. It
526 could be faster to modify a halfword pointer and return an integer.
529 static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
531 halfword g = null, gg = null;
532 register unsigned i = *j;
533 i++; /* this puts uword[i] on the |{| */
534 while (i < (unsigned) len && uword[i + 1] != '}') {
535 if (g == null) {
536 gg = new_char(0, (int) uword[i + 1]);
537 g = gg;
538 } else {
539 halfword s = new_char(0, (int) uword[i + 1]);
540 couple_nodes(g, s);
541 g = vlink(g);
543 i++;
545 *j = ++i;
546 return gg;
/*
    Count the characters of one braced item of an exception. On entry
    |uword[*j + 1]| is the opening brace; on exit |*j| indexes the closing
    brace, or |len + 1| when the item is not closed.
*/
static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int count = 0;
    unsigned pos = *j + 1; /* this puts uword[pos] on the |{| */
    for (; pos < (unsigned) len && uword[pos + 1] != '}'; pos++) {
        count++;
    }
    *j = pos + 1;
    return count;
}
562 @ @c
/* Help lines passed to |tex_error| by |do_exception| when a braced exception
   item is malformed; the list is terminated by NULL. */
563 static const char *PAT_ERROR[] = {
564 "Exception discretionaries should contain three pairs of braced items.",
565 "No intervening spaces are allowed.",
566 NULL
570 The exceptions are taken as-is: no min values are taken into account. One can
571 add normal patterns on-the-fly if needed.
/*
    Apply the exception string |replacement| to the node run that starts at
    |wordstart| and stops before |r|. The string is converted to unicode and
    walked with a one-character lookahead (|uword[i + 1]|) while |t| walks the
    node list: |-| inserts a syllable discretionary after |t|, and a
    |{pre}{post}{replace}| triple inserts a discretionary built from the braced
    items. Processing stops early once |vlink(t)| reaches |r|.
*/
574 static void do_exception(halfword wordstart, halfword r, char *replacement)
576 unsigned i;
577 halfword t;
578 unsigned len;
579 int clang;
580 lang_variables langdata;
581 unsigned uword[MAX_WORD_LEN + 1] = { 0 };
582 utf2uni_strcpy(uword, replacement);
583 len = u_length(uword);
584 i = 0;
585 t = wordstart;
586 clang = char_lang(wordstart);
/* only the two fields read by |insert_syllable_discretionary| are set */
587 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
588 langdata.post_hyphen_char = get_post_hyphen_char(clang);
590 for (i = 0; i < len; i++) {
591 if (uword[i + 1] == '-') { /* a hyphen follows */
/* move |t| forward to a simple character glyph, but never past |r| */
592 while (vlink(t) != r && (type(t) != glyph_node || !is_simple_character(t)))
593 t = vlink(t);
594 if (vlink(t) == r)
595 break;
596 insert_syllable_discretionary(t, &langdata);
597 t = vlink(t); /* skip the new disc */
598 } else if (uword[i + 1] == '=') {
599 /* do nothing ? */
600 t = vlink(t);
601 } else if (uword[i + 1] == '{') {
602 halfword gg, hh, replace = null;
603 int repl;
/* first braced item: glyph list for the pre-break text; |i| is advanced by the helper */
604 gg = find_exception_part(&i, uword, (int) len);
/* NOTE(review): the helper leaves |i| at |len + 1| for an unclosed item, so |i == len| looks unreachable; confirm */
605 if (i == len || uword[i + 1] != '{') {
606 tex_error("broken pattern 1", PAT_ERROR);
/* second braced item: glyph list for the post-break text */
608 hh = find_exception_part(&i, uword, (int) len);
609 if (i == len || uword[i + 1] != '{') {
610 tex_error("broken pattern 2", PAT_ERROR);
/* third braced item: only its length matters; that many glyphs are taken from the node list */
612 repl = count_exception_part(&i, uword, (int) len);
613 if (i == len) {
614 tex_error("broken pattern 3", PAT_ERROR);
616 /*i++; *//* jump over the last right brace */
617 if (vlink(t) == r)
618 break;
619 if (repl > 0) {
/* cut the next |repl| glyph nodes after |t| out of the list; they become the replacement */
620 halfword q = t;
621 replace = vlink(q);
622 while (repl > 0 && q != null) {
623 q = vlink(q);
624 if (type(q) == glyph_node) {
625 repl--;
628 try_couple_nodes(t, vlink(q));
629 vlink(q) = null;
631 t = insert_discretionary(t, gg, hh, replace, hyphen_penalty_par);
632 t = vlink(t); /* skip the new disc */
633 } else {
/* ordinary character: step to the next node */
634 t = vlink(t);
639 @ This is a documentation section from the pascal web file. It is not true any
640 more, but I do not have time right now to rewrite it -- Taco
642 When the line-breaking routine is unable to find a feasible sequence of
643 breakpoints, it makes a second pass over the paragraph, attempting to hyphenate
644 the hyphenatable words. The goal of hyphenation is to insert discretionary
645 material into the paragraph so that there are more potential places to break.
647 The general rules for hyphenation are somewhat complex and technical, because we
648 want to be able to hyphenate words that are preceded or followed by punctuation
649 marks, and because we want the rules to work for languages other than English. We
650 also must contend with the fact that hyphens might radically alter the ligature
651 and kerning structure of a word.
653 A sequence of characters will be considered for hyphenation only if it belongs to
654 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
655 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are
656 either character or ligature or whatsit or implicit kern nodes, and $p_m$ is a
657 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
658 (Therefore hyphenation is disabled by boxes, math formulas, and discretionary
659 nodes already inserted by the user.) The ligature nodes among $p_1\ldots p_{m-1}$
660 are effectively expanded into the original non-ligature characters; the kern
661 nodes and whatsits are ignored. Each character |c| is now classified as either a
662 nonletter (if |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an
663 uppercase letter (otherwise); an uppercase letter is treated as if it were
664 |lc_code(c)| for purposes of hyphenation. The characters generated by $p_1\ldots
665 p_{m-1}$ may begin with nonletters; let $c_1$ be the first letter that is not in
666 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a whatsit
667 found after $c_1$ will be the terminating node $p_m$. All characters that do not
668 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
669 that font must be between 0 and 255, otherwise hyphenation will not be attempted.
670 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible;
671 however, |n| must be less than 64, so a character that would otherwise be
672 $c_{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
673 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
674 generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|,
675 this string qualifies for hyphenation; however, |uc_hyph| must be positive, if
676 $c_1$ is uppercase.
678 The hyphenation process takes place in three stages. First, the candidate
679 sequence $c_1\ldots c_n$ is found; then potential positions for hyphens are
680 determined by referring to hyphenation tables; and finally, the nodes $p_a\ldots
681 p_b$ are replaced by a new sequence of nodes that includes the discretionary
682 breaks found.
684 Fortunately, we do not have to do all this calculation very often, because of the
685 way it has been taken out of \TeX's inner loop. For example, when the second
686 edition of the author's 700-page book {\sl Seminumerical Algorithms} was typeset
687 by \TeX, only about 1.2 hyphenations needed to be @^Knuth, Donald Ervin@> tried
688 per paragraph, since the line breaking algorithm needed to use two passes on only
689 about 5 per cent of the paragraphs.
691 When a word has been set up to contain a candidate for hyphenation, \TeX\ first looks
692 to see if it is in the user's exception dictionary. If not, hyphens are inserted
693 based on patterns that appear within the given word, using an algorithm due to
694 Frank~M. Liang. @^Liang, Franklin Mark@>
696 @ This is incompatible with TEX because the first word of a paragraph can be
697 hyphenated, but most european users seem to agree that prohibiting hyphenation
698 there was not the best idea ever.
702 More strict: \hyphenationbounds
704 0 = not strict
705 1 = strict start
706 2 = strict end
707 3 = strict start and strict end
709 \parindent0pt \hsize=1.1cm
710 12-34-56 \par
711 12-34-\hbox{56} \par
712 12-34-\vrule width 1em height 1.5ex \par
713 12-\hbox{34}-56 \par
714 12-\vrule width 1em height 1.5ex-56 \par
715 \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
716 12-34-56 \par
717 12-34-\hbox{56} \par
718 12-34-\vrule width 1em height 1.5ex \par
719 12-\hbox{34}-56 \par
720 12-\vrule width 1em height 1.5ex-56 \par
724 static halfword find_next_wordstart(halfword r, halfword first_language, halfword strict_bound)
726 register int l;
727 register int start_ok = 1;
728 int mathlevel = 1;
729 int chr ;
730 halfword t ;
731 while (r != null) {
732 switch (type(r)) {
733 case boundary_node:
734 if (subtype(r) == word_boundary) {
735 start_ok = 1;
737 break;
738 case hlist_node: /* new > 0.95 */
739 case vlist_node: /* new > 0.95 */
740 case rule_node: /* new > 0.95 */
741 case dir_node:
742 case whatsit_node:
743 if (strict_bound == 1 || strict_bound == 3) {
744 start_ok = 0;
746 break;
747 case glue_node:
748 start_ok = 1;
749 break;
750 case math_node:
751 while (mathlevel > 0) {
752 r = vlink(r);
753 if (r == null)
754 return r;
755 if (type(r) == math_node) {
756 if (subtype(r) == before) {
757 mathlevel++;
758 } else {
759 mathlevel--;
763 break;
764 case glyph_node:
765 if (is_simple_character(r)) {
766 chr = character(r) ;
767 if (chr == ex_hyphen_char_par) {
769 We only accept an explicit hyphen when there is a preceding glyph and
770 we skip a sequence of explicit hyphens as that normally indicates a
771 -- or --- ligature in which case we can in a worse case usage get bad
772 node lists later on due to messed up ligature building as these dashes
773 are ligatures in base fonts. This is a side effect of the separating the
774 hyphenation, ligaturing and kerning steps. A test is cmr with ------.
776 t = vlink(r) ;
777 if ((start_ok == 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char_par)) {
778 t = compound_word_break(r, char_lang(r));
779 subtype(t) = automatic_disc;
780 start_ok = 1 ;
781 } else {
782 start_ok = 0;
784 } else if (start_ok && (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
785 if (char_uchyph(r) || l == chr || l <= 32) {
786 return r;
787 } else {
788 start_ok = 0;
792 break;
793 default:
794 start_ok = 0;
795 break;
797 r = vlink(r);
799 return r;
802 @ @c
803 static int valid_wordend(halfword s, halfword strict_bound)
805 register halfword r = s;
806 register int clang = char_lang(s);
807 if (r == null)
808 return 1;
809 while ( (r != null) &&
810 ( (type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r))
811 || (type(r) == kern_node && (subtype(r) == normal))
814 r = vlink(r);
816 if (r == null || (type(r) == glyph_node && is_simple_character(r) && clang != char_lang(r))
817 || type(r) == glue_node
818 || type(r) == penalty_node
819 || (type(r) == kern_node && (subtype(r) == explicit_kern || /* so why not italic correction ? */
820 subtype(r) == italic_kern ||
821 subtype(r) == accent_kern ))
822 || ((type(r) == hlist_node || /* new > 0.95 */
823 type(r) == vlist_node || /* new > 0.95 */
824 type(r) == rule_node || /* new > 0.95 */
825 type(r) == dir_node || /* new > 0.97 */
826 type(r) == whatsit_node ||
827 type(r) == ins_node || /* yes or no strict test */
828 type(r) == adjust_node /* yes or no strict test */
829 ) && ! (strict_bound == 2 || strict_bound == 3))
830 || type(r) == boundary_node
832 return 1;
833 return 0;
836 @ @c
837 void hnj_hyphenation(halfword head, halfword tail)
839 int lchar, i;
840 struct tex_language *lang;
841 lang_variables langdata;
842 char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
843 int wordlen = 0;
844 char *hy = utf8word;
845 char *replacement = NULL;
846 boolean explicit_hyphen = false;
847 halfword first_language = first_valid_language_par;
848 halfword strict_bound = hyphenation_bounds_par;
849 halfword s, r = head, wordstart = null, save_tail1 = null, left = null, right = null;
851 /* this first movement assures two things:
852 \item{a)} that we won't waste lots of time on something that has been
853 handled already (in that case, none of the glyphs match |simple_character|).
854 \item{b)} that the first word can be hyphenated. if the movement was
855 not explicit, then the indentation at the start of a paragraph
856 list would make |find_next_wordstart()| look too far ahead.
859 while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
860 r = vlink(r);
862 /* this will make |r| a glyph node with subtype character */
863 r = find_next_wordstart(r,first_language,strict_bound);
864 if (r == null)
865 return;
867 assert(tail != null);
868 save_tail1 = vlink(tail);
869 s = new_penalty(0);
870 couple_nodes(tail, s);
872 while (r != null) { /* could be while(1), but let's be paranoid */
873 int clang, lhmin, rhmin, hmin;
874 halfword hyf_font;
875 halfword end_word = r;
876 wordstart = r;
877 assert(is_simple_character(wordstart));
878 hyf_font = font(wordstart);
879 if (hyphen_char(hyf_font) < 0) /* for backward compat */
880 hyf_font = 0;
881 clang = char_lang(wordstart);
882 lhmin = char_lhmin(wordstart);
883 rhmin = char_rhmin(wordstart);
884 hmin = get_hyphenation_min(clang);
885 langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
886 langdata.post_hyphen_char = get_post_hyphen_char(clang);
887 while ( r != null
888 && type(r) == glyph_node
889 && is_simple_character(r)
890 && clang == char_lang(r)
891 && ( ( (clang >= first_language)
892 && (lchar = get_hj_code(clang,character(r))) > 0
894 || ( character(r) == ex_hyphen_char_par
895 && (lchar = ex_hyphen_char_par)
899 if (character(r) == ex_hyphen_char_par) {
900 explicit_hyphen = true;
902 wordlen++;
903 if (lchar <= 32) {
904 if (lchar == 32) {
905 lchar = 0 ;
907 if (wordlen <= lhmin) {
908 lhmin = lhmin - lchar + 1 ;
909 if (lhmin < 0)
910 lhmin = 1;
912 if (wordlen >= rhmin) {
913 rhmin = rhmin - lchar + 1 ;
914 if (rhmin < 0)
915 rhmin = 1;
917 hmin = hmin - lchar + 1 ;
918 if (hmin < 0)
919 rhmin = 1;
920 lchar = character(r) ;
922 hy = uni2string(hy, (unsigned) lchar);
923 /* this should not be needed any more */
924 /*if (vlink(r)!=null) alink(vlink(r))=r; */
925 end_word = r;
926 r = vlink(r);
928 if ( valid_wordend(r,strict_bound)
929 && clang >= first_language
930 && wordlen >= lhmin + rhmin
931 && (hmin <= 0 || wordlen >= hmin)
932 && (hyf_font != 0)
933 && (lang = tex_languages[clang]) != NULL
935 *hy = 0;
936 if ( lang->exceptions != 0
937 && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
939 #ifdef VERBOSE
940 formatted_warning("hyphenation","replacing %s (c=%d) by %s", utf8word, clang, replacement);
941 #endif
942 do_exception(wordstart, r, replacement);
943 free(replacement);
944 } else if (explicit_hyphen == true) {
946 insert an explicit discretionary after each of the last in a
947 set of explicit hyphens
949 halfword rr = r;
950 halfword t = null;
951 #ifdef VERBOSE
952 formatted_warning("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word, clang);
953 #endif
954 while (rr != wordstart) {
955 if (is_simple_character(rr)) {
956 if (character(rr) == ex_hyphen_char_par) {
957 t = compound_word_break(rr, clang);
958 subtype(t) = automatic_disc;
959 while (character(alink(rr)) == ex_hyphen_char_par)
960 rr = alink(rr);
961 if (rr == wordstart)
962 break;
965 rr = alink(rr);
967 } else if (lang->patterns != NULL) {
968 left = wordstart;
969 for (i = lhmin; i > 1; i--) {
970 left = vlink(left);
971 while (!is_simple_character(left)) {
972 left = vlink(left);
975 if (!left)
976 break ;
978 /* what is left overruns right .. a bit messy */
980 right = r;
981 for (i = rhmin; i > 0; i--) {
982 right = alink(right);
983 while (!is_simple_character(right)) {
984 right = alink(right);
987 if (!right)
988 break ;
990 /* what is right overruns left .. a bit messy */
992 /* maybe an extra check ... */
993 /* if (left && right) { */
994 #ifdef VERBOSE
995 formatted_warning("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
996 utf8word, clang, lhmin, rhmin, character(left), character(right));
997 #endif
998 (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word, wordlen, left, right, &langdata);
999 /* } */
1002 explicit_hyphen = false;
1003 wordlen = 0;
1004 hy = utf8word;
1005 if (r == null)
1006 break;
1007 r = find_next_wordstart(r,first_language,strict_bound);
1009 flush_node(vlink(tail));
1010 vlink(tail) = save_tail1;
1013 @ @c
1014 void new_hyphenation(halfword head, halfword tail)
1016 register int callback_id = 0;
1017 if (head == null || vlink(head) == null)
1018 return;
1019 fix_node_list(head);
1020 callback_id = callback_defined(hyphenate_callback);
1021 if (callback_id > 0) {
1022 if (!get_callback(Luas, callback_id)) {
1023 lua_pop(Luas, 2);
1024 return;
1026 nodelist_to_lua(Luas, head);
1027 nodelist_to_lua(Luas, tail);
1028 if (lua_pcall(Luas, 2, 0, 0) != 0) {
1029 formatted_warning("hyphenation","bad specification: %s",lua_tostring(Luas, -1));
1030 lua_pop(Luas, 2);
1031 lua_error(Luas);
1032 return;
1034 lua_pop(Luas, 1);
1035 } else if (callback_id == 0) {
1036 hnj_hyphenation(head, tail);
1040 @ dumping and undumping languages
/*
    Dump the C string |a|: first its length including the terminator, then the
    bytes; a NULL string is dumped as length 0. Uses the |int x| of the
    expanding function as scratch. Wrapped in |do ... while (0)| so that it
    behaves as one statement at every call site.
*/
#define dump_string(a) do {                 \
    if ((a) != NULL) {                      \
        x = (int) strlen(a) + 1;            \
        dump_int(x);                        \
        dump_things(*(a), x);               \
    } else {                                \
        x = 0;                              \
        dump_int(x);                        \
    }                                       \
} while (0)
1051 static void dump_one_language(int i)
1053 char *s = NULL;
1054 int x = 0;
1055 struct tex_language *lang;
1056 lang = tex_languages[i];
1057 dump_int(lang->id);
1058 dump_int(lang->pre_hyphen_char);
1059 dump_int(lang->post_hyphen_char);
1060 dump_int(lang->pre_exhyphen_char);
1061 dump_int(lang->post_exhyphen_char);
1062 dump_int(lang->hyphenation_min);
1063 if (lang->patterns != NULL) {
1064 s = (char *) hnj_serialize(lang->patterns);
1066 dump_string(s);
1067 if (s != NULL) {
1068 free(s);
1069 s = NULL;
1071 if (lang->exceptions != 0)
1072 s = exception_strings(lang);
1073 dump_string(s);
1074 if (s != NULL) {
1075 free(s);
1077 free(lang);
1080 void dump_language_data(void)
1082 int i;
1083 dump_int(next_lang_id);
1084 for (i = 0; i < next_lang_id; i++) {
1085 if (tex_languages[i]) {
1086 dump_int(1);
1087 dump_one_language(i);
1088 } else {
1089 dump_int(0);
1094 static void undump_one_language(int i)
1096 char *s = NULL;
1097 int x = 0;
1098 struct tex_language *lang = get_language(i);
1099 undump_int(x);
1100 lang->id = x;
1101 undump_int(x);
1102 lang->pre_hyphen_char = x;
1103 undump_int(x);
1104 lang->post_hyphen_char = x;
1105 undump_int(x);
1106 lang->pre_exhyphen_char = x;
1107 undump_int(x);
1108 lang->post_exhyphen_char = x;
1109 undump_int(x);
1110 lang->hyphenation_min = x;
1111 /* patterns */
1112 undump_int(x);
1113 if (x > 0) {
1114 s = xmalloc((unsigned) x);
1115 undump_things(*s, x);
1116 load_patterns(lang, (unsigned char *) s);
1117 free(s);
1119 /* exceptions */
1120 undump_int(x);
1121 if (x > 0) {
1122 s = xmalloc((unsigned) x);
1123 undump_things(*s, x);
1124 load_hyphenation(lang, (unsigned char *) s);
1125 free(s);
1129 void undump_language_data(void)
1131 int i, x, numlangs;
1132 undump_int(numlangs);
1133 next_lang_id = numlangs;
1134 for (i = 0; i < numlangs; i++) {
1135 undump_int(x);
1136 if (x == 1) {
1137 undump_one_language(i);
1142 @ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1143 |new_hyph_exceptions| to do the right thing.
1146 void new_hyph_exceptions(void)
1147 { /* enters new exceptions */
1148 (void) scan_toks(false, true);
1149 load_tex_hyphenation(language_par, def_ref);
1150 flush_list(def_ref);
1153 @ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1154 procedure named |new_patterns|.
1157 void new_patterns(void)
1158 { /* initializes the hyphenation pattern data */
1159 (void) scan_toks(false, true);
1160 load_tex_patterns(language_par, def_ref);
1161 flush_list(def_ref);
1164 @ `\.{\\prehyphenchar}', sets the |pre_break| character, and
1165 `\.{\\posthyphenchar}' the |post_break| character. Their respective defaults are
1166 ascii hyphen ("-") and zero (nul).
1169 void new_pre_hyphen_char(void)
1171 scan_optional_equals();
1172 scan_int();
1173 set_pre_hyphen_char(language_par, cur_val);
1176 void new_post_hyphen_char(void)
1178 scan_optional_equals();
1179 scan_int();
1180 set_post_hyphen_char(language_par, cur_val);
1183 @ `\.{\\preexhyphenchar}', sets the |pre_break| character, and
1184 `\.{\\postexhyphenchar}' the |post_break| character. Their defaults are both zero
1185 (nul).
1188 void new_pre_exhyphen_char(void)
1190 scan_optional_equals();
1191 scan_int();
1192 set_pre_exhyphen_char(language_par, cur_val);
1195 void new_post_exhyphen_char(void)
1197 scan_optional_equals();
1198 scan_int();
1199 set_post_exhyphen_char(language_par, cur_val);
1202 void new_hyphenation_min(void)
1204 scan_optional_equals();
1205 scan_int();
1206 set_hyphenation_min(language_par, cur_val);
/* Scan a character code, an optional equals sign and a value, and pass them
   to |set_hj_code| for the current language (the final argument is -1). */
1209 void new_hj_code(void)
1211 int i ;
1212 scan_int();
/* keep the first integer: the next |scan_int| overwrites |cur_val| */
1213 i = cur_val;
1214 scan_optional_equals();
1215 scan_int();
1216 set_hj_code(language_par, i, cur_val, -1);