3 % Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
7 % LibHnj
- a library for high quality hyphenation and justification
8 % Copyright
(C
) 1998 Raph Levien
,
9 % (C
) 2001 ALTLinux
, Moscow
(http
://www.alt-linux.org
),
10 % (C
) 2001 Peter Novodvorsky
(nidd@@cs.msu.su
)
12 % This library is free software
; you can redistribute it and
/or
13 % modify it under the terms of the GNU Library General Public
14 % License as published by the Free Software Foundation
; either
15 % version
2 of the License
, or
(at your option
) any later version.
17 % This library is distributed in the hope that it will be useful
,
18 % but WITHOUT
ANY WARRANTY
; without even the implied warranty of
19 % MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU
20 % Library General Public License for more details.
22 % You should have received a copy of the GNU Library General Public
23 % License along with this library
; if not
, write to the
24 % Free Software Foundation
, Inc.
, 59 Temple Place
- Suite
330,
25 % Boston
, MA
02111-1307 USA.
29 % The contents of this file are subject to the Mozilla Public License
30 % Version
1.0 (the
"MPL"); you may not use this file except in
31 % compliance with the MPL. You may obtain a copy of the MPL at
32 % http
://www.mozilla.org
/MPL
/
34 % Software distributed under the MPL is distributed on an
"AS IS" basis
,
35 % WITHOUT WARRANTY
OF ANY KIND
, either express or implied. See the MPL
36 % for the specific language governing rights and limitations under the
44 #include
"lua/luatex-api.h"
46 #include
<stdlib.h
> /* for
NULL, malloc
*/
47 #include
<stdio.h
> /* for fprintf
*/
48 #include
<string.h
> /* for strdup
*/
49 #include
<stdlib.h
> /* for malloc used by substring inclusion
*/
51 #define MAXPATHS
40960
54 # include
<unistd.h
> /* for exit
*/
57 #include
<kpathsea
/c-ctype.h
>
61 #include
"lang/hnjalloc.h"
63 @ TODO
: should be moved to separate library
66 static unsigned char
*hnj_strdup
(const unsigned char
*s
)
71 l
= strlen
((const char
*) s
);
72 new
= hnj_malloc
((int
) l
+ 1);
80 @ a little bit of a hash table implementation. This simply maps strings
84 typedef struct _HashTab HashTab
;
85 typedef struct _HashEntry HashEntry
;
86 typedef struct _HashIter HashIter
;
87 typedef union _HashVal HashVal
;
89 /* A cheap
, but effective
, hack.
*/
90 #define HASH_SIZE
31627
93 HashEntry
*entries
[HASH_SIZE
];
116 typedef struct _HyphenState HyphenState
;
117 typedef struct _HyphenTrans HyphenTrans
;
118 #define MAX_CHARS
256
131 struct _HyphenState
{
134 /*signed char replindex
; */
135 /*signed char replcut
; */
141 struct _HyphenTrans
{
147 @ Combine two right-aligned number patterns
, 04000 + 020 becomes
04020
150 static char
*combine
(char
*expr
, const char
*subexpr
)
152 size_t l1
= strlen
(expr
);
153 size_t l2
= strlen
(subexpr
);
154 size_t off
= l1
- l2
;
156 /* this works also for utf8 sequences because the substring is identical
157 to the last substring-length bytes of expr except for the
(single byte
)
160 for
(j
= 0; j
< l2
; j
++) {
161 if
(expr
[off
+ j
] < subexpr
[j
])
162 expr
[off
+ j
] = subexpr
[j
];
170 static HashIter
*new_HashIter
(HashTab
* h
)
172 HashIter
*i
= hnj_malloc
(sizeof
(HashIter
));
180 static int nextHashStealPattern
(HashIter
* i
, unsigned char
**word
, char
**pattern
)
182 while
(i-
>cur
== NULL) {
183 if
(i-
>ndx
>= HASH_SIZE
- 1)
185 i-
>cur
= i-
>e
[++i-
>ndx
];
188 *pattern
= i-
>cur-
>u.hyppat
;
189 i-
>cur-
>u.hyppat
= NULL;
190 i-
>cur
= i-
>cur-
>next
;
195 static int nextHash
(HashIter
* i
, unsigned char
**word
)
197 while
(i-
>cur
== NULL) {
198 if
(i-
>ndx
>= HASH_SIZE
- 1)
200 i-
>cur
= i-
>e
[++i-
>ndx
];
203 i-
>cur
= i-
>cur-
>next
;
208 static int eachHash
(HashIter
* i
, unsigned char
**word
, char
**pattern
)
210 while
(i-
>cur
== NULL) {
211 if
(i-
>ndx
>= HASH_SIZE
- 1)
213 i-
>cur
= i-
>e
[++i-
>ndx
];
216 *pattern
= i-
>cur-
>u.hyppat
;
217 i-
>cur
= i-
>cur-
>next
;
222 static void delete_HashIter
(HashIter
* i
)
228 @ a |char
*| hash function from ASU
- adapted from Gtk
+
231 static unsigned int hnj_string_hash
(const unsigned char
*s
)
233 const unsigned char
*p
;
234 unsigned int h
= 0, g
;
236 for
(p
= s
; *p
!= '\
0'
; p
+= 1) {
238 if
((g
= (h
& 0xf0000000))) {
243 return h
/* \
% M
*/ ;
247 @ assumes that key is not already present
!
250 static void state_insert
(HashTab
* hashtab
, unsigned char
*key
, int state
)
255 i
= (int
) (hnj_string_hash
(key
) % HASH_SIZE
);
256 e
= hnj_malloc
(sizeof
(HashEntry
));
257 e-
>next
= hashtab-
>entries
[i
];
260 hashtab-
>entries
[i
] = e
;
264 @ assumes that key is not already present
!
267 static void hyppat_insert
(HashTab
* hashtab
, unsigned char
*key
, char
*hyppat
)
272 i
= (int
) (hnj_string_hash
(key
) % HASH_SIZE
);
273 for
(e
= hashtab-
>entries
[i
]; e
; e
= e-
>next
) {
274 if
(strcmp
((char
*) e-
>key
, (char
*) key
) == 0) {
277 && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
278 print_err
("Conflicting pattern ignored");
281 hnj_free
(e-
>u.hyppat
);
283 e-
>u.hyppat
= hyppat
;
288 e
= hnj_malloc
(sizeof
(HashEntry
));
289 e-
>next
= hashtab-
>entries
[i
];
291 e-
>u.hyppat
= hyppat
;
292 hashtab-
>entries
[i
] = e
;
296 @ return state if found
, otherwise $
-1$
299 static int state_lookup
(HashTab
* hashtab
, const unsigned char
*key
)
304 i
= (int
) (hnj_string_hash
(key
) % HASH_SIZE
);
305 for
(e
= hashtab-
>entries
[i
]; e
; e
= e-
>next
) {
306 if
(!strcmp
((const char
*) key
, (const char
*) e-
>key
)) {
314 @ return state if found
, otherwise $
-1$
317 static char
*hyppat_lookup
(HashTab
* hashtab
, const unsigned char
*chars
, int l
)
321 unsigned char key
[256]; /* should be ample
*/
322 strncpy
((char
*) key
, (const char
*) chars
, (size_t
) l
);
324 i
= (int
) (hnj_string_hash
(key
) % HASH_SIZE
);
325 for
(e
= hashtab-
>entries
[i
]; e
; e
= e-
>next
) {
326 if
(!strcmp
((char
*) key
, (char
*) e-
>key
)) {
334 @ Get the state number
, allocating a new state if necessary.
337 static int hnj_get_state
(HyphenDict
* dict
,
338 const unsigned char
*str
, int
*state_num
)
340 *state_num
= state_lookup
(dict-
>state_num
, str
);
345 state_insert
(dict-
>state_num
, hnj_strdup
(str
), dict-
>num_states
);
346 /* predicate is true if |dict-
>num_states| is a power of two
*/
347 if
(!(dict-
>num_states
& (dict->num_states - 1))) {
348 dict-
>states
= hnj_realloc
(dict-
>states
,
349 (int
) ((dict-
>num_states
<< 1) *
350 (int
) sizeof
(HyphenState
)));
352 dict-
>states
[dict-
>num_states
].match
= NULL;
353 dict-
>states
[dict-
>num_states
].fallback_state
= -1;
354 dict-
>states
[dict-
>num_states
].num_trans
= 0;
355 dict-
>states
[dict-
>num_states
].trans
= NULL;
356 return dict-
>num_states
++;
360 @ Add a transition from state1 to state2 through ch
- assumes that the
361 transition does not already exist
364 static void hnj_add_trans
(HyphenDict
* dict
, int state1
, int state2
, int uni_ch
)
367 /* TH
: this test was a bit too strict
, it is quite normal for old
368 patterns to have chars in the range
0-31 or
127-159 (inclusive
).
369 To ease the transition
, let's only disallow NUL for now
370 (this is probably a requirement of the code anyway
).
373 char errmsg
[256]; /* temp hack ... we will have a formatted error
*/
374 snprintf
(errmsg
, 255, "character out of bounds: u%04x", uni_ch
);
376 normal_error
("hyphenation",errmsg
); /* todo
*/
378 num_trans
= dict-
>states
[state1
].num_trans
;
379 if
(num_trans
== 0) {
380 dict-
>states
[state1
].trans
= hnj_malloc
(sizeof
(HyphenTrans
));
382 /* TH
: The old version did
383 } else if
(!(num_trans
& (num_trans - 1))) {
384 ... hnj_realloc
(dict-
>states
[state1
].trans
,
385 (int
) ((num_trans
<< 1) *
386 sizeof
(HyphenTrans
)));
387 but that is incredibly nasty when adding patters one-at-a-time.
388 Controlled growth would be nicer than the current
+1, but if
389 noone complains
, this is good enough
;)
391 dict-
>states
[state1
].trans
= hnj_realloc
(dict-
>states
[state1
].trans
,
392 (int
) ((num_trans
+ 1) *
393 sizeof
(HyphenTrans
)));
395 dict-
>states
[state1
].trans
[num_trans
].uni_ch
= uni_ch
;
396 dict-
>states
[state1
].trans
[num_trans
].new_state
= state2
;
397 dict-
>states
[state1
].num_trans
++;
403 static unsigned char
*get_state_str
(int state
)
408 for
(i
= 0; i
< HASH_SIZE
; i
++)
409 for
(e
= global-
>entries
[i
]; e
; e
= e-
>next
)
410 if
(e-
>u.state
== state
)
417 @ I've changed the semantics a bit here
: |hnj_hyphen_load| used to
418 operate on a file
, but now the argument is a string buffer.
421 static const unsigned char
*next_pattern
(size_t
* length
,
422 const unsigned char
**buf
)
424 const unsigned char
*here
, *rover
= *buf
;
425 while
(*rover
&& isspace(*rover))
429 if
(isspace
(*rover
)) {
430 *length
= (size_t
) (rover
- here
);
436 *length
= (size_t
) (rover
- here
);
438 return
*length ? here
: NULL; /* zero sensed
*/
441 static void init_hash
(HashTab
** h
)
446 *h
= hnj_malloc
(sizeof
(HashTab
));
447 for
(i
= 0; i
< HASH_SIZE
; i
++)
448 (*h
)->entries
[i
] = NULL;
452 static void clear_state_hash
(HashTab
** h
)
457 for
(i
= 0; i
< HASH_SIZE
; i
++) {
459 for
(e
= (*h
)->entries
[i
]; e
; e
= next
) {
470 static void clear_hyppat_hash
(HashTab
** h
)
475 for
(i
= 0; i
< HASH_SIZE
; i
++) {
477 for
(e
= (*h
)->entries
[i
]; e
; e
= next
) {
481 hnj_free
(e-
>u.hyppat
);
490 static void init_dict
(HyphenDict
* dict
)
492 dict-
>num_states
= 1;
493 dict-
>pat_length
= 0;
494 dict-
>states
= hnj_malloc
(sizeof
(HyphenState
));
495 dict-
>states
[0].match
= NULL;
496 dict-
>states
[0].fallback_state
= -1;
497 dict-
>states
[0].num_trans
= 0;
498 dict-
>states
[0].trans
= NULL;
499 dict-
>patterns
= NULL;
501 dict-
>state_num
= NULL;
502 init_hash
(&dict->patterns);
506 static void clear_dict
(HyphenDict
* dict
)
509 for
(state_num
= 0; state_num
< dict-
>num_states
; state_num
++) {
510 HyphenState
*hstate
= &dict->states[state_num];
512 hnj_free
(hstate-
>match
);
514 hnj_free
(hstate-
>trans
);
516 hnj_free
(dict-
>states
);
517 clear_hyppat_hash
(&dict->patterns);
518 clear_hyppat_hash
(&dict->merged);
519 clear_state_hash
(&dict->state_num);
524 HyphenDict
*hnj_hyphen_new
(void
)
526 HyphenDict
*dict
= hnj_malloc
(sizeof
(HyphenDict
));
532 void hnj_hyphen_clear
(HyphenDict
* dict
)
539 void hnj_hyphen_free
(HyphenDict
* dict
)
545 unsigned char
*hnj_serialize
(HyphenDict
* dict
)
550 unsigned char
*buf
= hnj_malloc
(dict-
>pat_length
);
551 unsigned char
*cur
= buf
;
552 v
= new_HashIter
(dict-
>patterns
);
553 while
(eachHash
(v
, &word, &pattern)) {
555 while
(word
[e
+ i
]) {
556 if
(pattern
[i
] != '
0'
)
557 *cur
++ = (unsigned char
) pattern
[i
];
558 *cur
++ = word
[e
+ i
++];
559 while
(is_utf8_follow
(word
[e
+ i
]))
560 *cur
++ = word
[i
+ e
++];
562 if
(pattern
[i
] != '
0'
)
563 *cur
++ = (unsigned char
) pattern
[i
];
572 void hnj_free_serialize
(unsigned char
*c
)
578 @ hyphenation pattern
:
582 0 indicates end
(actually any negative number
)
584 : prio
(1+),startpos
,length
,len1
,[replace
],len2
,[replace
]
586 most basic example is
:
590 for a hyphenation point between characters
594 void hnj_hyphen_load
(HyphenDict
* dict
, const unsigned char
*f
)
596 int state_num
, last_state
;
605 const unsigned char
*format
;
606 const unsigned char
*begin
= f
;
609 while
((format
= next_pattern
(&l, &f)) != NULL) {
612 help1
("Individual patterns should not be longer than 254 bytes total.");
613 print_err
("Pattern of enormous length ignored");
618 printf
("%s\n",format
);
619 char
* repl
= strnchr
(format
, '
/'
,l
);
623 int clen
= l-
(repl-format
);
625 char
* index
= strnchr
(repl
+ 1, '
,'
,clen
);
627 char
* index2
= strnchr
(index
+ 1, '
,'
,clen-
(index-repl
));
629 replindex
= (signed char
) atoi
(index
+ 1) - 1;
630 replcut
= (signed char
) atoi
(index2
+ 1);
633 hnj_strchomp
(repl
+ 1);
635 replcut
= strlen
(buf
);
637 repl
= hnj_strdup
(repl
+ 1);
640 for
(i
= 0, j
= 0, e1
= 0; (unsigned
) i
< l
; i
++) {
641 if
(format
[i
] >= '
0'
&& format[i] <= '9')
643 if
(is_utf8_follow
(format
[i
]))
646 /* |l-e1|
=> number of
{\it characters
} not
{\it bytes
} */
647 /* |l-j|
=> number of pattern bytes
*/
648 /* |l-e1-j|
=> number of pattern characters
*/
649 pat
= (unsigned char
*) malloc
((1 + l
- (size_t
) j
));
650 org
= (char
*) malloc
((size_t
) (2 + l
- (size_t
) e1
- (size_t
) j
));
651 /* remove hyphenation encoders
(digits
) from pat
*/
653 for
(i
= 0, j
= 0, e1
= 0; (unsigned
) i
< l
; i
++) {
654 unsigned char c
= format
[i
];
655 if
(is_utf8_follow
(c
)) {
657 } else if
(c
< '
0' || c
> '
9'
) {
666 hyppat_insert
(dict-
>patterns
, pat
, org
);
668 dict-
>pat_length
+= (int
) ((f
- begin
) + 2); /* 2 for spurious spaces
*/
669 init_hash
(&dict->merged);
670 v
= new_HashIter
(dict-
>patterns
);
671 while
(nextHash
(v
, &word)) {
672 int wordsize
= (int
) strlen
((char
*) word
);
674 for
(l1
= 1; l1
<= wordsize
; l1
++) {
675 if
(is_utf8_follow
(word
[l1
]))
676 continue
; /* Do not clip an utf8 sequence
*/
677 for
(j1
= 1; j1
<= l1
; j1
++) {
680 if
(is_utf8_follow
(word
[i1
]))
681 continue
; /* Do not start halfway an utf8 sequence
*/
683 hyppat_lookup
(dict-
>patterns
, word
+ i1
, j1
)) != NULL) {
686 hyppat_lookup
(dict-
>merged
, word
, l1
)) == NULL) {
688 unsigned char
*newword
=
689 (unsigned char
*) malloc
((size_t
) (l1
+ 1));
691 strncpy
((char
*) newword
, (char
*) word
, (size_t
) l1
);
693 for
(i1
= 0; i1
< l1
; i1
++)
694 if
(is_utf8_follow
(newword
[i1
]))
696 neworg
= malloc
((size_t
) (l1
+ 2 - e1
));
697 sprintf
(neworg
, "%0*d", l1
+ 1 - e1
, 0); /* fill with right amount of '
0'
*/
698 hyppat_insert
(dict-
>merged
, newword
, combine
(neworg
, subpat_pat
));
700 combine
(newpat_pat
, subpat_pat
);
708 init_hash
(&dict->state_num);
709 state_insert
(dict-
>state_num
, hnj_strdup
((const unsigned char
*) ""), 0);
710 v
= new_HashIter
(dict-
>merged
);
711 while
(nextHashStealPattern
(v
, &word, &pattern)) {
712 static unsigned char mask
[] = { 0x3F, 0x1F, 0xF, 0x7 };
713 int j1
= (int
) strlen
((char
*) word
);
715 printf
("word %s pattern %s, j = %d\n", word
, pattern
, j1
);
717 state_num
= hnj_get_state
(dict
, word
, &found);
718 dict-
>states
[state_num
].match
= pattern
;
720 /* now
, put in the prefix transitions
*/
723 last_state
= state_num
;
728 while
(is_utf8_follow
(word
[j1
- i1
]))
730 ch
= word
[j1
- i1
] & mask[i1];
733 ch
= (ch
<< 6) + (0x3F & word[j1 - i1]);
738 state_num
= hnj_get_state
(dict
, word
, &found);
739 hnj_add_trans
(dict
, state_num
, last_state
, ch
);
743 clear_hyppat_hash
(&dict->merged);
745 /* put in the fallback states
*/
748 for
(i
= 0; i
< HASH_SIZE
; i
++) {
749 for
(e
= dict-
>state_num-
>entries
[i
]; e
; e
= e-
>next
) {
750 /* do not do state
==0 otherwise things get confused
*/
752 for
(j
= 1; 1; j
++) {
753 state_num
= state_lookup
(dict-
>state_num
, e-
>key
+ j
);
757 dict-
>states
[e-
>u.state
].fallback_state
= state_num
;
762 for
(i
= 0; i
< HASH_SIZE
; i
++) {
763 for
(e
= dict-
>state_num-
>entries
[i
]; e
; e
= e-
>next
) {
764 printf
("%d string %s state %d, fallback=%d\n",
765 i
, e-
>key
, e-
>u.state
, dict-
>states
[e-
>u.state
].fallback_state
);
766 for
(j
= 0; j
< dict-
>states
[e-
>u.state
].num_trans
; j
++) {
767 printf
(" u%4x->%d\n",
768 (int
) dict-
>states
[e-
>u.state
].trans
[j
].uni_ch
,
769 dict-
>states
[e-
>u.state
].trans
[j
].new_state
);
775 clear_state_hash
(&dict->state_num);
779 void hnj_hyphen_hyphenate
(HyphenDict
* dict
,
783 halfword left
, halfword right
, lang_variables
* lan
)
788 /* +2 for dots at each end
, +1 for points
/outside
/ characters
*/
789 int ext_word_len
= length
+ 2;
790 int hyphen_len
= ext_word_len
+ 1;
791 char
*hyphens
= hnj_malloc
(hyphen_len
+ 1);
793 /* Add a '.' to beginning and end to facilitate matching
*/
794 set_vlink
(begin_point
, first1
);
795 set_vlink
(end_point
, get_vlink
(last1
));
796 set_vlink
(last1
, end_point
);
798 for
(char_num
= 0; char_num
< hyphen_len
; char_num
++) {
799 hyphens
[char_num
] = '
0'
;
801 hyphens
[hyphen_len
] = 0;
803 /* now
, run the finite state machine
*/
804 for
(char_num
= 0, here
= begin_point
; here
!= get_vlink
(end_point
);
805 here
= get_vlink
(here
)) {
808 if
(here
== begin_point || here
== end_point
)
811 ch
= get_lc_code
(get_character
(here
));
812 while
(state
!= -1) {
814 printf
("%*s%s%c",char_num-strlen
(get_state_str
(state
)),"",get_state_str
(state
),(char
)ch
);
816 HyphenState
*hstate
= &dict->states[state];
818 for
(k
= 0; k
< hstate-
>num_trans
; k
++) {
819 if
(hstate-
>trans
[k
].uni_ch
== ch
) {
821 state
= hstate-
>trans
[k
].new_state
;
823 printf
(" state %d\n",state
);
825 match
= dict-
>states
[state
].match
;
828 1 string length is one bigger than offset
829 1 hyphenation starts before first character
831 int offset
= (int
) (char_num
+ 2 - (int
) strlen
(match
));
833 printf
("%*s%s\n", offset
,"", match
);
836 for
(m
= 0; match
[m
]; m
++) {
837 if
(hyphens
[offset
+ m
] < match
[m
])
838 hyphens
[offset
+ m
] = match
[m
];
841 goto try_next_letter
;
844 state
= hstate-
>fallback_state
;
846 printf
(" back to %d\n", state
);
849 /* nothing worked
, let's go to the next character
*/
855 /* restore the correct pointers
*/
856 set_vlink
(last1
, get_vlink
(end_point
));
858 /* pattern is \.
{\^.\^w\^o\^r\^d\^.\^
} |word_len|
=4, |ext_word_len|
=6, |hyphens|
=7
859 * check \.
{ \^ \^ \^
} so drop first two and stop after |word_len-1|
861 for
(here
= first1
, char_num
= 2; here
!= left
; here
= get_vlink
(here
))
863 for
(; here
!= right
; here
= get_vlink
(here
)) {
864 if
(hyphens
[char_num
] & 1)
865 here
= insert_syllable_discretionary
(here
, lan
);