3 % Copyright
2006-2012 Taco Hoekwater
<taco@@luatex.org
>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software
; you can redistribute it and
/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation
; either version
2 of the License
, or
(at your
10 % option
) any later version.
12 % LuaTeX is distributed in the hope that it will be useful
, but WITHOUT
13 % ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX
; if not
, see
<http
://www.gnu.org
/licenses
/>.
24 #include
"lua/luatex-api.h"
31 #define MAX_TEX_LANGUAGES
16384
33 static struct tex_language
*tex_languages
[MAX_TEX_LANGUAGES
] = { NULL };
35 static int next_lang_id
= 0;
37 struct tex_language
*new_language
(int n
)
39 struct tex_language
*lang
;
43 if
(l
!= (MAX_TEX_LANGUAGES
- 1))
44 if
(next_lang_id
<= n
)
47 while
(tex_languages
[next_lang_id
] != NULL)
49 l
= (unsigned
) next_lang_id
++;
51 if
(l
< (MAX_TEX_LANGUAGES
- 1) && tex_languages[l] == NULL) {
52 lang
= xmalloc
(sizeof
(struct tex_language
));
53 tex_languages
[l
] = lang
;
56 lang-
>patterns
= NULL;
57 lang-
>pre_hyphen_char
= '
-'
;
58 lang-
>post_hyphen_char
= 0;
59 lang-
>pre_exhyphen_char
= 0;
60 lang-
>post_exhyphen_char
= 0;
61 lang-
>hyphenation_min
= -1;
62 if
(saving_hyph_codes_par
) {
63 hj_codes_from_lc_codes
(l
); /* for now
, we might just use specific value for whatever task
*/
71 struct tex_language
*get_language
(int n
)
73 if
(n
>= 0 && n < MAX_TEX_LANGUAGES) {
74 if
(tex_languages
[n
] != NULL) {
75 return tex_languages
[n
];
77 return new_language
(n
);
85 void set_pre_hyphen_char
(int n
, int v
)
87 struct tex_language
*l
= get_language
((int
) n
);
89 l-
>pre_hyphen_char
= (int
) v
;
92 void set_post_hyphen_char
(int n
, int v
)
94 struct tex_language
*l
= get_language
((int
) n
);
96 l-
>post_hyphen_char
= (int
) v
;
99 void set_pre_exhyphen_char
(int n
, int v
)
101 struct tex_language
*l
= get_language
((int
) n
);
103 l-
>pre_exhyphen_char
= (int
) v
;
106 void set_post_exhyphen_char
(int n
, int v
)
108 struct tex_language
*l
= get_language
((int
) n
);
110 l-
>post_exhyphen_char
= (int
) v
;
113 int get_pre_hyphen_char
(int n
)
115 struct tex_language
*l
= get_language
((int
) n
);
118 return
(int
) l-
>pre_hyphen_char
;
121 int get_post_hyphen_char
(int n
)
123 struct tex_language
*l
= get_language
((int
) n
);
126 return
(int
) l-
>post_hyphen_char
;
129 int get_pre_exhyphen_char
(int n
)
131 struct tex_language
*l
= get_language
((int
) n
);
134 return
(int
) l-
>pre_exhyphen_char
;
137 int get_post_exhyphen_char
(int n
)
139 struct tex_language
*l
= get_language
((int
) n
);
142 return
(int
) l-
>post_exhyphen_char
;
145 void set_hyphenation_min
(int n
, int v
)
147 struct tex_language
*l
= get_language
((int
) n
);
149 l-
>hyphenation_min
= (int
) v
;
152 int get_hyphenation_min
(int n
)
154 struct tex_language
*l
= get_language
((int
) n
);
157 return
(int
) l-
>hyphenation_min
;
160 void load_patterns
(struct tex_language
*lang
, const unsigned char
*buff
)
162 if
(lang
== NULL || buff
== NULL || strlen
((const char
*) buff
) == 0)
164 if
(lang-
>patterns
== NULL) {
165 lang-
>patterns
= hnj_hyphen_new
();
167 hnj_hyphen_load
(lang-
>patterns
, buff
);
170 void clear_patterns
(struct tex_language
*lang
)
174 if
(lang-
>patterns
!= NULL) {
175 hnj_hyphen_clear
(lang-
>patterns
);
179 void load_tex_patterns
(int curlang
, halfword head
)
181 char
*s
= tokenlist_to_cstring
(head
, 1, NULL);
182 load_patterns
(get_language
(curlang
), (unsigned char
*) s
);
186 #define STORE_CHAR
(l
,x
) do
{ \
187 unsigned xx
= get_hj_code
(l
,x
); \
188 if
(!xx || xx
<= 32) { \
191 uindex
= uni2string
(uindex
, xx
); \
194 @ Cleans one word which is returned in |cleaned|
, returns the new offset into
198 const char
*clean_hyphenation
(int id
, const char
*buff
, char
**cleaned
)
201 unsigned char word
[MAX_WORD_LEN
+ 1]; /* work buffer for bytes
*/
202 unsigned uword
[MAX_WORD_LEN
+ 1] = { 0 }; /* work buffer for unicode
*/
203 int u
= 0; /* unicode buffer value
*/
204 int i
= 0; /* index into buffer
*/
205 char
*uindex
= (char
*)word
;
206 const char
*s
= buff
;
208 while
(*s
&& !isspace((unsigned char)*s)) {
209 word
[i
++] = (unsigned
)*s
;
211 if
((s-buff
)>MAX_WORD_LEN
) {
212 /* todo
: this is too strict
, should count unicode
, not bytes
*/
214 tex_error
("exception too long", NULL);
218 /* now convert the input to unicode
*/
220 utf2uni_strcpy
(uword
, (const char
*)word
);
222 /* build the new word string
*/
226 if
(u
== '
-'
) { /* skip
*/
227 } else if
(u
== '
='
) {
229 } else if
(u
== '
{'
) {
232 while
(u
&& u != '}') {
239 while
(u
&& u != '}') {
249 while
(u
&& u != '}') {
256 if
(items
!= 3) { /* syntax error
*/
258 tex_error
("exception syntax error", NULL);
266 *cleaned
= xstrdup
((char
*) word
);
271 void load_hyphenation
(struct tex_language
*lang
, const unsigned char
*buff
)
279 if
(lang-
>exceptions
== 0) {
281 lang-
>exceptions
= luaL_ref
(Luas
, LUA_REGISTRYINDEX
);
283 lua_rawgeti
(Luas
, LUA_REGISTRYINDEX
, lang-
>exceptions
);
284 s
= (const char
*) buff
;
287 while
(isspace
((unsigned char
)*s
))
291 s
= clean_hyphenation
(id
, s
, &cleaned);
292 if
(cleaned
!= NULL) {
293 if
((s
- value
) > 0) {
294 lua_pushstring
(Luas
, cleaned
);
295 lua_pushlstring
(Luas
, value
, (size_t
) (s
- value
));
296 lua_rawset
(Luas
, -3);
301 formatted_warning
("hyphenation","skipping invalid hyphenation exception: %s", value
);
308 void clear_hyphenation
(struct tex_language
*lang
)
312 if
(lang-
>exceptions
!= 0) {
313 luaL_unref
(Luas
, LUA_REGISTRYINDEX
, lang-
>exceptions
);
314 lang-
>exceptions
= 0;
318 void load_tex_hyphenation
(int curlang
, halfword head
)
320 char
*s
= tokenlist_to_cstring
(head
, 1, NULL);
321 load_hyphenation
(get_language
(curlang
), (unsigned char
*) s
);
324 @ TODO
: clean this up. The |delete_attribute_ref
()| statements are not very nice
,
325 but needed. Also
, in the post-break
, it would be nicer to get the attribute list
326 from |vlink
(n
)|. No rush
, as it is currently not used much.
329 halfword insert_discretionary
(halfword t
, halfword pre
, halfword post
,
330 halfword replace
, int penalty
)
334 n
= new_node
(disc_node
, syllable_disc
);
335 disc_penalty
(n
) = penalty
;
336 try_couple_nodes
(n
, vlink
(t
));
341 f
= get_cur_font
(); /* for compound words following explicit hyphens
*/
342 for
(g
= pre
; g
!= null
; g
= vlink
(g
)) {
344 if
(node_attr
(t
) != null
) {
345 delete_attribute_ref
(node_attr
(g
));
346 node_attr
(g
) = node_attr
(t
);
347 attr_list_ref
(node_attr
(t
)) += 1;
350 for
(g
= post
; g
!= null
; g
= vlink
(g
)) {
352 if
(node_attr
(t
) != null
) {
353 delete_attribute_ref
(node_attr
(g
));
354 node_attr
(g
) = node_attr
(t
);
355 attr_list_ref
(node_attr
(t
)) += 1;
358 for
(g
= replace
; g
!= null
; g
= vlink
(g
)) {
359 if
(node_attr
(t
) != null
) {
360 delete_attribute_ref
(node_attr
(g
));
361 node_attr
(g
) = node_attr
(t
);
362 attr_list_ref
(node_attr
(t
)) += 1;
365 if
(node_attr
(t
) != null
) {
366 delete_attribute_ref
(node_attr
(vlink
(t
)));
367 node_attr
(vlink
(t
)) = node_attr
(t
);
368 attr_list_ref
(node_attr
(t
)) += 1;
371 set_disc_field
(pre_break
(t
), pre
);
372 set_disc_field
(post_break
(t
), post
);
373 set_disc_field
(no_break
(t
), replace
);
377 halfword insert_syllable_discretionary
(halfword t
, lang_variables
* lan
)
380 n
= new_node
(disc_node
, syllable_disc
);
381 disc_penalty
(n
) = hyphen_penalty_par
;
382 couple_nodes
(n
, vlink
(t
));
384 delete_attribute_ref
(node_attr
(n
));
385 if
(node_attr
(t
) != null
) {
386 node_attr
(n
) = node_attr
(t
);
387 attr_list_ref
(node_attr
(t
))++;
391 if
(lan-
>pre_hyphen_char
> 0) {
392 g
= raw_glyph_node
();
394 character
(g
) = lan-
>pre_hyphen_char
;
396 lang_data
(g
) = lang_data
(t
);
397 if
(node_attr
(t
) != null
) {
398 node_attr
(g
) = node_attr
(t
);
399 attr_list_ref
(node_attr
(t
))++;
401 set_disc_field
(pre_break
(n
), g
);
404 if
(lan-
>post_hyphen_char
> 0) {
406 g
= raw_glyph_node
();
408 character
(g
) = lan-
>post_hyphen_char
;
410 lang_data
(g
) = lang_data
(t
);
411 if
(node_attr
(t
) != null
) {
412 node_attr
(g
) = node_attr
(t
);
413 attr_list_ref
(node_attr
(t
)) += 1;
415 set_disc_field
(post_break
(n
), g
);
420 halfword insert_word_discretionary
(halfword t
, lang_variables
* lan
)
422 halfword pre
= null
, pos
= null
;
423 if
(lan-
>pre_exhyphen_char
> 0)
424 pre
= insert_character
(null
, lan-
>pre_exhyphen_char
);
425 if
(lan-
>post_exhyphen_char
> 0)
426 pos
= insert_character
(null
, lan-
>post_exhyphen_char
);
427 return insert_discretionary
(t
, pre
, pos
, null
,ex_hyphen_penalty_par
);
431 halfword compound_word_break
(halfword t
, int clang
)
434 lang_variables langdata
;
435 langdata.pre_exhyphen_char
= get_pre_exhyphen_char
(clang
);
436 langdata.post_exhyphen_char
= get_post_exhyphen_char
(clang
);
437 disc
= insert_word_discretionary
(t
, &langdata);
441 halfword insert_complex_discretionary
(halfword t
, lang_variables
* lan
,
442 halfword pre
, halfword pos
,
446 return insert_discretionary
(t
, pre
, pos
, replace
,hyphen_penalty_par
);
449 halfword insert_character
(halfword t
, int c
)
452 p
= new_node
(glyph_node
, 0);
462 void set_disc_field
(halfword f
, halfword t
)
466 couple_nodes
(f
, t
); // better not expose f as prev pointer
470 tlink
(f
) = tail_of_list
(t
);
478 static char
*hyphenation_exception
(int exceptions
, char
*w
)
481 lua_checkstack
(Luas
, 2);
482 lua_rawgeti
(Luas
, LUA_REGISTRYINDEX
, exceptions
);
483 if
(lua_istable
(Luas
, -1)) { /* ??
*/
484 lua_pushstring
(Luas
, w
); /* word table
*/
485 lua_rawget
(Luas
, -2);
486 if
(lua_type
(Luas
, -1) == LUA_TSTRING
) {
487 ret
= xstrdup
(lua_tostring
(Luas
, -1));
497 char
*exception_strings
(struct tex_language
*lang
)
500 size_t size
= 0, current
= 0;
503 if
(lang-
>exceptions
== 0)
505 lua_checkstack
(Luas
, 2);
506 lua_rawgeti
(Luas
, LUA_REGISTRYINDEX
, lang-
>exceptions
);
507 if
(lua_istable
(Luas
, -1)) {
508 /* iterate and join
*/
509 lua_pushnil
(Luas
); /* first key
*/
510 while
(lua_next
(Luas
, -2) != 0) {
511 value
= lua_tolstring
(Luas
, -1, &l);
512 if
(current
+ 2 + l
> size
) {
513 ret
= xrealloc
(ret
, (unsigned
) ((size
+ size
/ 5) + current
+ l
+ 1024));
514 size
= (size
+ size
/ 5) + current
+ l
+ 1024;
516 *(ret
+ current
) = ' '
;
517 strcpy
(ret
+ current
+ 1, value
);
525 @ The sequence from |wordstart| to |r| can contain only normal characters. It
526 could be faster to modify a halfword pointer and return an integer.
529 static halfword find_exception_part
(unsigned int
*j
, unsigned int
*uword
, int len
)
531 halfword g
= null
, gg
= null
;
532 register unsigned i
= *j
;
533 i
++; /* this puts uword
[i
] on the |
{|
*/
534 while
(i
< (unsigned
) len
&& uword[i + 1] != '}') {
536 gg
= new_char
(0, (int
) uword
[i
+ 1]);
539 halfword s
= new_char
(0, (int
) uword
[i
+ 1]);
549 static int count_exception_part
(unsigned int
*j
, unsigned int
*uword
, int len
)
552 register unsigned i
= *j
;
553 i
++; /* this puts uword
[i
] on the |
{|
*/
554 while
(i
< (unsigned
) len
&& uword[i + 1] != '}') {
563 static const char
*PAT_ERROR
[] = {
564 "Exception discretionaries should contain three pairs of braced items.",
565 "No intervening spaces are allowed.",
570 The exceptions are taken as-is
: no min values are taken into account. One can
571 add normal patterns on-the-fly if needed.
574 static void do_exception
(halfword wordstart
, halfword r
, char
*replacement
)
580 lang_variables langdata
;
581 unsigned uword
[MAX_WORD_LEN
+ 1] = { 0 };
582 utf2uni_strcpy
(uword
, replacement
);
583 len
= u_length
(uword
);
586 clang
= char_lang
(wordstart
);
587 langdata.pre_hyphen_char
= get_pre_hyphen_char
(clang
);
588 langdata.post_hyphen_char
= get_post_hyphen_char
(clang
);
590 for
(i
= 0; i
< len
; i
++) {
591 if
(uword
[i
+ 1] == '
-'
) { /* a hyphen follows
*/
592 while
(vlink
(t
) != r
&& (type(t) != glyph_node || !is_simple_character(t)))
596 insert_syllable_discretionary
(t
, &langdata);
597 t
= vlink
(t
); /* skip the new disc
*/
598 } else if
(uword
[i
+ 1] == '
='
) {
601 } else if
(uword
[i
+ 1] == '
{'
) {
602 halfword gg
, hh
, replace
= null
;
604 gg
= find_exception_part
(&i, uword, (int) len);
605 if
(i
== len || uword
[i
+ 1] != '
{'
) {
606 tex_error
("broken pattern 1", PAT_ERROR
);
608 hh
= find_exception_part
(&i, uword, (int) len);
609 if
(i
== len || uword
[i
+ 1] != '
{'
) {
610 tex_error
("broken pattern 2", PAT_ERROR
);
612 repl
= count_exception_part
(&i, uword, (int) len);
614 tex_error
("broken pattern 3", PAT_ERROR
);
616 /*i
++; *//* jump over the last right brace
*/
622 while
(repl
> 0 && q != null) {
624 if
(type
(q
) == glyph_node
) {
628 try_couple_nodes
(t
, vlink
(q
));
631 t
= insert_discretionary
(t
, gg
, hh
, replace
, hyphen_penalty_par
);
632 t
= vlink
(t
); /* skip the new disc
*/
639 @ This is a documentation section from the Pascal web file. It is not true any
640 more
, but I do not have time right now to rewrite it
-- Taco
642 When the line-breaking routine is unable to find a feasible sequence of
643 breakpoints
, it makes a second pass over the paragraph
, attempting to hyphenate
644 the hyphenatable words. The goal of hyphenation is to insert discretionary
645 material into the paragraph so that there are more potential places to break.
647 The general rules for hyphenation are somewhat complex and technical
, because we
648 want to be able to hyphenate words that are preceded or followed by punctuation
649 marks
, and because we want the rules to work for languages other than English. We
650 also must contend with the fact that hyphens might radically alter the ligature
651 and kerning structure of a word.
653 A sequence of characters will be considered for hyphenation only if it belongs to
654 a ``potentially hyphenatable part'' of the current paragraph. This is a sequence
655 of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node
, $p_1\ldots p_
{m-1
}$ are
656 either character or ligature or whatsit or implicit kern nodes
, and $p_m$ is a
657 glue or penalty or insertion or adjust or mark or whatsit or explicit kern node.
658 (Therefore hyphenation is disabled by boxes
, math formulas
, and discretionary
659 nodes already inserted by the user.
) The ligature nodes among $p_1\ldots p_
{m-1
}$
660 are effectively expanded into the original non-ligature characters
; the kern
661 nodes and whatsits are ignored. Each character |c| is now classified as either a
662 nonletter
(if |lc_code
(c
)=0|
), a lowercase letter
(if |lc_code
(c
)=c|
), or an
663 uppercase letter
(otherwise
); an uppercase letter is treated as if it were
664 |lc_code
(c
)| for purposes of hyphenation. The characters generated by $p_1\ldots
665 p_
{m-1
}$ may begin with nonletters
; let $c_1$ be the first letter that is not in
666 the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored
; a whatsit
667 found after $c_1$ will be the terminating node $p_m$. All characters that do not
668 have the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for
669 that font must be between
0 and
255, otherwise hyphenation will not be attempted.
670 \TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as possible
;
671 however
, |n| must be less than
64, so a character that would otherwise be
672 $c_
{64}$ is effectively not a letter. Furthermore $c_n$ must not be in the middle
673 of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$ that are
674 generated by nodes $p_a\ldots p_b$
, where |
1<=a
<=b
+1<=m|. If |n
>=l_hyf
+r_hyf|
,
675 this string qualifies for hyphenation
; however
, |uc_hyph| must be positive
, if
678 The hyphenation process takes place in three stages. First
, the candidate
679 sequence $c_1\ldots c_n$ is found
; then potential positions for hyphens are
680 determined by referring to hyphenation tables
; and finally
, the nodes $p_a\ldots
681 p_b$ are replaced by a new sequence of nodes that includes the discretionary
684 Fortunately
, we do not have to do all this calculation very often
, because of the
685 way it has been taken out of \TeX's inner loop. For example
, when the second
686 edition of the author's
700-page book
{\sl Seminumerical Algorithms
} was typeset
687 by \TeX
, only about
1.2 hyphenations needed to be @^Knuth
, Donald Ervin@
> tried
688 per paragraph
, since the line breaking algorithm needed to use two passes on only
689 about
5 per cent of the paragraphs.
691 When a word has been set up to contain a candidate for hyphenation
, \TeX\ first looks
692 to see if it is in the user's exception dictionary. If not
, hyphens are inserted
693 based on patterns that appear within the given word
, using an algorithm due to
694 Frank~M. Liang. @^Liang
, Franklin Mark@
>
696 @ This is incompatible with \TeX\ because the first word of a paragraph can be
697 hyphenated
, but most European users seem to agree that prohibiting hyphenation
698 there was not the best idea ever.
702 More strict
: \hyphenationbounds
707 3 = strict start and strict end
709 \parindent0pt \hsize
=1.1cm
712 12-34-\vrule width
1em height
1.5ex \par
714 12-\vrule width
1em height
1.5ex-56 \par
715 \hjcode`\
1=`\
1 \hjcode`\
2=`\
2 \hjcode`\
3=`\
3 \hjcode`\
4=`\
4 \vskip
.5cm
718 12-34-\vrule width
1em height
1.5ex \par
720 12-\vrule width
1em height
1.5ex-56 \par
724 static halfword find_next_wordstart
(halfword r
, halfword first_language
, halfword strict_bound
)
727 register int start_ok
= 1;
734 if
(subtype
(r
) == word_boundary
) {
738 case hlist_node
: /* new
> 0.95 */
739 case vlist_node
: /* new
> 0.95 */
740 case rule_node
: /* new
> 0.95 */
743 if
(strict_bound
== 1 || strict_bound
== 3) {
751 while
(mathlevel
> 0) {
755 if
(type
(r
) == math_node
) {
756 if
(subtype
(r
) == before
) {
765 if
(is_simple_character
(r
)) {
767 if
(chr
== ex_hyphen_char_par
) {
769 We only accept an explicit hyphen when there is a preceding glyph and
770 we skip a sequence of explicit hyphens as that normally indicates a
771 -- or
--- ligature, in which case we can in the worst case get bad
772 node lists later on due to messed up ligature building as these dashes
773 are ligatures in base fonts. This is a side effect of separating the
774 hyphenation
, ligaturing and kerning steps. A test is cmr with
------.
777 if
((start_ok
== 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char_par)) {
778 t
= compound_word_break
(r
, char_lang
(r
));
779 subtype
(t
) = automatic_disc
;
784 } else if
(start_ok
&& (char_lang(r)>=first_language) && ((l = get_hj_code(char_lang(r),chr)) > 0)) {
785 if
(char_uchyph
(r
) || l
== chr || l
<= 32) {
803 static int valid_wordend
(halfword s
, halfword strict_bound
)
805 register halfword r
= s
;
806 register int clang
= char_lang
(s
);
809 while
( (r
!= null
) &&
810 ( (type
(r
) == glyph_node
&& is_simple_character(r) && clang == char_lang(r))
811 ||
(type
(r
) == kern_node
&& (subtype(r) == normal))
816 if
(r
== null ||
(type
(r
) == glyph_node
&& is_simple_character(r) && clang != char_lang(r))
817 || type
(r
) == glue_node
818 || type
(r
) == penalty_node
819 ||
(type
(r
) == kern_node
&& (subtype(r) == explicit_kern || /* so why not italic correction ? */
820 subtype
(r
) == italic_kern ||
821 subtype
(r
) == accent_kern
))
822 ||
((type
(r
) == hlist_node ||
/* new
> 0.95 */
823 type
(r
) == vlist_node ||
/* new
> 0.95 */
824 type
(r
) == rule_node ||
/* new
> 0.95 */
825 type
(r
) == dir_node ||
/* new
> 0.97 */
826 type
(r
) == whatsit_node ||
827 type
(r
) == ins_node ||
/* yes or no strict test
*/
828 type
(r
) == adjust_node
/* yes or no strict test
*/
829 ) && ! (strict_bound == 2 || strict_bound == 3))
830 || type
(r
) == boundary_node
837 void hnj_hyphenation
(halfword head
, halfword tail
)
840 struct tex_language
*lang
;
841 lang_variables langdata
;
842 char utf8word
[(4 * MAX_WORD_LEN
) + 1] = { 0 };
845 char
*replacement
= NULL;
846 boolean explicit_hyphen
= false
;
847 halfword first_language
= first_valid_language_par
;
848 halfword strict_bound
= hyphenation_bounds_par
;
849 halfword s
, r
= head
, wordstart
= null
, save_tail1
= null
, left
= null
, right
= null
;
851 /* this first movement assures two things
:
852 \item
{a
)} that we won't waste lots of time on something that has been
853 handled already
(in that case
, none of the glyphs match |simple_character|
).
854 \item
{b
)} that the first word can be hyphenated. If the movement was
855 not explicit
, then the indentation at the start of a paragraph
856 list would make |find_next_wordstart
()| look too far ahead.
859 while
(r
!= null
&& (type(r) != glyph_node || !is_simple_character(r))) {
862 /* this will make |r| a glyph node with subtype character
*/
863 r
= find_next_wordstart
(r
,first_language
,strict_bound
);
867 assert
(tail
!= null
);
868 save_tail1
= vlink
(tail
);
870 couple_nodes
(tail
, s
);
872 while
(r
!= null
) { /* could be while
(1), but let's be paranoid
*/
873 int clang
, lhmin
, rhmin
, hmin
;
875 halfword end_word
= r
;
877 assert
(is_simple_character
(wordstart
));
878 hyf_font
= font
(wordstart
);
879 if
(hyphen_char
(hyf_font
) < 0) /* for backward compat
*/
881 clang
= char_lang
(wordstart
);
882 lhmin
= char_lhmin
(wordstart
);
883 rhmin
= char_rhmin
(wordstart
);
884 hmin
= get_hyphenation_min
(clang
);
885 langdata.pre_hyphen_char
= get_pre_hyphen_char
(clang
);
886 langdata.post_hyphen_char
= get_post_hyphen_char
(clang
);
888 && type(r) == glyph_node
889 && is_simple_character(r)
890 && clang == char_lang(r)
891 && ( ( (clang >= first_language)
892 && (lchar = get_hj_code(clang,character(r))) > 0
894 ||
( character
(r
) == ex_hyphen_char_par
895 && (lchar = ex_hyphen_char_par)
899 if
(character
(r
) == ex_hyphen_char_par
) {
900 explicit_hyphen
= true
;
907 if
(wordlen
<= lhmin
) {
908 lhmin
= lhmin
- lchar
+ 1 ;
912 if
(wordlen
>= rhmin
) {
913 rhmin
= rhmin
- lchar
+ 1 ;
917 hmin
= hmin
- lchar
+ 1 ;
920 lchar
= character
(r
) ;
922 hy
= uni2string
(hy
, (unsigned
) lchar
);
923 /* this should not be needed any more
*/
924 /*if
(vlink
(r
)!=null
) alink
(vlink
(r
))=r
; */
928 if
( valid_wordend
(r
,strict_bound
)
929 && clang >= first_language
930 && wordlen >= lhmin + rhmin
931 && (hmin <= 0 || wordlen >= hmin)
933 && (lang = tex_languages[clang]) != NULL
936 if
( lang-
>exceptions
!= 0
937 && (replacement = hyphenation_exception(lang->exceptions, utf8word)) != NULL
940 formatted_warning
("hyphenation","replacing %s (c=%d) by %s", utf8word
, clang
, replacement
);
942 do_exception
(wordstart
, r
, replacement
);
944 } else if
(explicit_hyphen
== true
) {
946 insert an explicit discretionary after each of the last in a
947 set of explicit hyphens
952 formatted_warning
("hyphenation","explicit hyphen(s) found in %s (c=%d)", utf8word
, clang
);
954 while
(rr
!= wordstart
) {
955 if
(is_simple_character
(rr
)) {
956 if
(character
(rr
) == ex_hyphen_char_par
) {
957 t
= compound_word_break
(rr
, clang
);
958 subtype
(t
) = automatic_disc
;
959 while
(character
(alink
(rr
)) == ex_hyphen_char_par
)
967 } else if
(lang-
>patterns
!= NULL) {
969 for
(i
= lhmin
; i
> 1; i--
) {
971 while
(!is_simple_character
(left
)) {
978 /* what is left overruns right .. a bit messy
*/
981 for
(i
= rhmin
; i
> 0; i--
) {
982 right
= alink
(right
);
983 while
(!is_simple_character
(right
)) {
984 right
= alink
(right
);
990 /* what is right overruns left .. a bit messy
*/
992 /* maybe an extra check ...
*/
993 /* if
(left
&& right) { */
995 formatted_warning
("hyphenation","hyphenate %s (c=%d,l=%d,r=%d) from %c to %c",
996 utf8word
, clang
, lhmin
, rhmin
, character
(left
), character
(right
));
998 (void
) hnj_hyphen_hyphenate
(lang-
>patterns
, wordstart
, end_word
, wordlen
, left
, right
, &langdata);
1002 explicit_hyphen
= false
;
1007 r
= find_next_wordstart
(r
,first_language
,strict_bound
);
1009 flush_node
(vlink
(tail
));
1010 vlink
(tail
) = save_tail1
;
1014 void new_hyphenation
(halfword head
, halfword tail
)
1016 register int callback_id
= 0;
1017 if
(head
== null || vlink
(head
) == null
)
1019 fix_node_list
(head
);
1020 callback_id
= callback_defined
(hyphenate_callback
);
1021 if
(callback_id
> 0) {
1022 if
(!get_callback
(Luas
, callback_id
)) {
1026 nodelist_to_lua
(Luas
, head
);
1027 nodelist_to_lua
(Luas
, tail
);
1028 if
(lua_pcall
(Luas
, 2, 0, 0) != 0) {
1029 formatted_warning
("hyphenation","bad specification: %s",lua_tostring
(Luas
, -1));
1035 } else if
(callback_id
== 0) {
1036 hnj_hyphenation
(head
, tail
);
1040 @ dumping and undumping languages
1043 #define dump_string
(a
) \
1045 x
= (int
)strlen
(a
)+1; \
1046 dump_int
(x
); dump_things
(*a
, x
); \
1048 x
= 0; dump_int
(x
); \
1051 static void dump_one_language
(int i
)
1055 struct tex_language
*lang
;
1056 lang
= tex_languages
[i
];
1058 dump_int
(lang-
>pre_hyphen_char
);
1059 dump_int
(lang-
>post_hyphen_char
);
1060 dump_int
(lang-
>pre_exhyphen_char
);
1061 dump_int
(lang-
>post_exhyphen_char
);
1062 dump_int
(lang-
>hyphenation_min
);
1063 if
(lang-
>patterns
!= NULL) {
1064 s
= (char
*) hnj_serialize
(lang-
>patterns
);
1071 if
(lang-
>exceptions
!= 0)
1072 s
= exception_strings
(lang
);
1080 void dump_language_data
(void
)
1083 dump_int
(next_lang_id
);
1084 for
(i
= 0; i
< next_lang_id
; i
++) {
1085 if
(tex_languages
[i
]) {
1087 dump_one_language
(i
);
1094 static void undump_one_language
(int i
)
1098 struct tex_language
*lang
= get_language
(i
);
1102 lang-
>pre_hyphen_char
= x
;
1104 lang-
>post_hyphen_char
= x
;
1106 lang-
>pre_exhyphen_char
= x
;
1108 lang-
>post_exhyphen_char
= x
;
1110 lang-
>hyphenation_min
= x
;
1114 s
= xmalloc
((unsigned
) x
);
1115 undump_things
(*s
, x
);
1116 load_patterns
(lang
, (unsigned char
*) s
);
1122 s
= xmalloc
((unsigned
) x
);
1123 undump_things
(*s
, x
);
1124 load_hyphenation
(lang
, (unsigned char
*) s
);
1129 void undump_language_data
(void
)
1132 undump_int
(numlangs
);
1133 next_lang_id
= numlangs
;
1134 for
(i
= 0; i
< numlangs
; i
++) {
1137 undump_one_language
(i
);
1142 @ When \TeX\ has scanned `\.
{\\hyphenation
}'
, it calls on a procedure named
1143 |new_hyph_exceptions| to do the right thing.
1146 void new_hyph_exceptions
(void
)
1147 { /* enters new exceptions
*/
1148 (void
) scan_toks
(false
, true
);
1149 load_tex_hyphenation
(language_par
, def_ref
);
1150 flush_list
(def_ref
);
1153 @ Similarly
, when \TeX\ has scanned `\.
{\\patterns
}'
, it calls on a
1154 procedure named |new_patterns|.
1157 void new_patterns
(void
)
1158 { /* initializes the hyphenation pattern data
*/
1159 (void
) scan_toks
(false
, true
);
1160 load_tex_patterns
(language_par
, def_ref
);
1161 flush_list
(def_ref
);
1164 @ `\.
{\\prehyphenchar
}'
, sets the |pre_break| character
, and
1165 `\.
{\\posthyphenchar
}' the |post_break| character. Their respective defaults are
1166 ascii hyphen
("-") and zero
(nul
).
1169 void new_pre_hyphen_char
(void
)
1171 scan_optional_equals
();
1173 set_pre_hyphen_char
(language_par
, cur_val
);
1176 void new_post_hyphen_char
(void
)
1178 scan_optional_equals
();
1180 set_post_hyphen_char
(language_par
, cur_val
);
1183 @ `\.
{\\preexhyphenchar
}'
, sets the |pre_break| character
, and
1184 `\.
{\\postexhyphenchar
}' the |post_break| character. Their defaults are both zero
1188 void new_pre_exhyphen_char
(void
)
1190 scan_optional_equals
();
1192 set_pre_exhyphen_char
(language_par
, cur_val
);
1195 void new_post_exhyphen_char
(void
)
1197 scan_optional_equals
();
1199 set_post_exhyphen_char
(language_par
, cur_val
);
1202 void new_hyphenation_min
(void
)
1204 scan_optional_equals
();
1206 set_hyphenation_min
(language_par
, cur_val
);
1209 void new_hj_code
(void
)
1214 scan_optional_equals
();
1216 set_hj_code
(language_par
, i
, cur_val
, -1);