6 #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
7 /* this value is used (as a literal) in suftab.c */
8 /* to encode possible hyphenation points in suffixes. */
9 /* it could be changed, by widening the tables */
10 /* to be shorts instead of chars. */
18 int hexsize
= 0; /* hyphenation exception list size */
19 char *hbufp
= NULL
; /* base of list */
20 char *nexth
= NULL
; /* first free slot in list */
23 #define THRESH 160 /* digram goodness threshold */
27 static int alpha(Tchar
);
29 void hyphen(Tchar
*wp
)
42 hyend
= wdend
= --i
- 1;
47 if (wdend
- wdstart
< 4) /* 4 chars is too short to hyphenate */
53 /* for now, try exceptions first, then tex (if hyphalg is non-zero),
54 then suffix and digram if tex didn't hyphenate it at all.
57 if (!exword() && !texhyphen() && !suffix())
60 /* this appears to sort hyphenation points into increasing order */
65 for (hyp
= hyptr
+ 1; *hyp
!= 0; hyp
++) {
66 if (*(hyp
- 1) > *hyp
) {
76 static int alpha(Tchar i
) /* non-zero if really alphabetic */
80 else if (cbits(i
) >= ALPHABET
) /* this isn't very elegant, but there's */
81 return 0; /* no good way to make sure i is in range for */
82 else /* the call of isalpha */
83 return isalpha(cbits(i
));
96 void caseha(void) /* set hyphenation algorithm */
107 void caseht(void) /* set hyphenation threshold; not in manual! */
118 char *growh(char *where
)
123 if ((new = grow(hbufp
, hexsize
, sizeof(char))) == NULL
)
129 diff
= where
- hbufp
;
143 if ((nexth
= hbufp
= grow(hbufp
, NHEX
, sizeof(char))) == NULL
) {
144 ERROR
"No space for exception word list." WARN
;
151 if ((j
= nexth
) >= hbufp
+ hexsize
- 2)
152 if ((j
= nexth
= growh(j
)) == NULL
)
155 if (ismot(t
= getch()))
158 if (i
== ' ' || i
== '\n') {
171 *j
++ = maplow(i
) | k
;
173 if (j
>= hbufp
+ hexsize
- 2)
174 if ((j
= growh(j
)) == NULL
)
180 ERROR
"Cannot grow exception word list." WARN
;
193 if (e
== NULL
|| *e
== 0)
196 while (*e
&& w
<= hyend
&& (*e
& 0177) == maplow(cbits(*w
))) {
201 if (w
-1 == hyend
|| (w
== wdend
&& maplow(cbits(*w
)) == 's')) {
203 for (e
= save
; *e
; e
++) {
206 if (hyp
> hyptr
+ NHYP
- 1)
207 hyp
= hyptr
+ NHYP
- 1;
227 extern char *suftab
[];
235 if ((s0
= suftab
[i
-'a']) == 0)
238 if ((i
= *s0
& 017) == 0)
242 while (s
> s0
&& w
>= wdstart
&& (*s
& 0177) == maplow(cbits(*w
))) {
259 if (*s0
& 0100) /* 0100 used in suftab to encode something too */
298 Tchar
*chkvow(Tchar
*w
)
300 while (--w
>= wdstart
)
301 if (vowel(cbits(*w
)))
311 Tchar
*nhyend
, *maxw
;
313 extern char bxh
[26][13], bxxh
[26][13], xxh
[26][13], xhx
[26][13], hxx
[26][13];
316 if (!(w
= chkvow(hyend
+ 1)))
319 if (!(w
= chkvow(hyend
)))
324 while (++w
< hyend
&& w
< wdend
- 1) {
327 val
*= dilook('a', cbits(*w
), bxh
);
328 else if (w
== wdstart
+ 1)
329 val
*= dilook(cbits(*(w
-1)), cbits(*w
), bxxh
);
331 val
*= dilook(cbits(*(w
-1)), cbits(*w
), xxh
);
332 val
*= dilook(cbits(*w
), cbits(*(w
+1)), xhx
);
333 val
*= dilook(cbits(*(w
+1)), cbits(*(w
+2)), hxx
);
346 int dilook(int a
, int b
, char t
[26][13])
350 i
= t
[maplow(a
) - 'a'][(j
= maplow(b
) - 'a') / 2];
357 /* here beginneth the tex hyphenation code, as interpreted freely */
358 /* the main difference is that there is no attempt to squeeze space */
359 /* as tightly as tex does. */
361 static int texit(Tchar
*, Tchar
*);
362 static int readpats(void);
363 static void install(char *);
364 static void fixup(void);
365 static int trieindex(int, int);
367 static char pats
[50000]; /* size ought to be computed dynamically */
368 static char *nextpat
= pats
;
369 static char *trie
[27*27]; /* english-specific sizes */
373 static int loaded
= 0; /* -1: couldn't find tex file */
375 if (hyphalg
== 0 || loaded
== -1) /* non-zero => tex for now */
383 return texit(wdstart
, wdend
);
386 static int texit(Tchar
*start
, Tchar
*end
) /* hyphenate as in tex, return # found */
388 int nw
, i
, k
, equal
, cnt
[500];
389 char w
[500+1], *np
, *pp
, *wp
, *xpp
, *xwp
;
392 for (nw
= 1; start
<= end
&& nw
< 500-1; nw
++, start
++)
393 w
[nw
] = maplow(tolower(cbits(*start
)));
398 * printf("try %s\n", w);
400 for (i
= 0; i
<= nw
; i
++)
403 for (wp
= w
; wp
+1 < w
+nw
; wp
++) {
404 for (pp
= trie
[trieindex(*wp
, *(wp
+1))]; pp
< nextpat
; ) {
405 if (pp
== 0 /* no trie entry */
406 || *pp
!= *wp
/* no match on 1st letter */
407 || *(pp
+1) != *(wp
+1)) /* no match on 2nd letter */
408 break; /* so move to next letter of word */
410 for (xpp
= pp
+2, xwp
= wp
+2; *xpp
; )
411 if (*xpp
++ != *xwp
++) {
416 np
= xpp
+1; /* numpat */
417 for (k
= wp
-w
; *np
; k
++, np
++)
421 * printf("match: %s %s\n", pp, xpp+1);
424 pp
+= *(pp
-1); /* skip over pattern and numbers to next */
428 * for (i = 0; i < nw; i++) printf("%c", w[i]);
430 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
434 * for (i = 1; i < nw - 1; i++) {
435 * if (i > 2 && i < nw - 3 && cnt[i] % 2)
437 * if (cbits(start[i-1]) != '.')
438 * printf("%c", cbits(start[i-1]));
442 for (i
= 1; i
< nw
-1; i
++)
443 if (i
> 2 && i
< nw
- 3 && cnt
[i
] % 2)
444 *hyp
++ = start
+ i
- 1;
445 return hyp
- hyptr
; /* non-zero if a hyphen was found */
449 This code assumes that hyphen.tex looks like
451 \patterns{ % more comments
452 pat5ter4ns, 1 per line, SORTED, nothing else
455 \hyphenation{ % more comments
456 ex-cep-tions, one per line; i ignore this part for now
459 this code is NOT robust against variations. unfortunately,
460 it looks like every local language version of this file has
461 a different format. i have also made no provision for weird
465 static int readpats(void)
468 char buf
[200], buf1
[200];
470 if ((fp
= fopen(TEXHYPHENS
, "r")) == NULL
&&
471 (fp
= fopen(ALTHYPHENS
, "r")) == NULL
) {
472 ERROR
"warning: can't find hyphen.tex" WARN
;
476 while (fgets(buf
, sizeof buf
, fp
) != NULL
) {
477 sscanf(buf
, "%s", buf1
);
478 if (strcmp(buf1
, "\\patterns{") == 0)
481 while (fgets(buf
, sizeof buf
, fp
) != NULL
) {
491 static void install(char *s
) /* map ab4c5de to: 12 abcde \0 00405 \0 */
494 char num
[500], *onextpat
= nextpat
;
497 *nextpat
++ = ' '; /* fill in with count later */
498 for (npat
= lastpat
= 0; *s
!= '\n' && *s
!= '\0'; s
++) {
509 if (nextpat
> pats
+ sizeof(pats
)-20) {
510 ERROR
"tex hyphenation table overflow, tail end ignored" WARN
;
514 strcat(nextpat
, num
);
515 nextpat
+= strlen(nextpat
) + 1;
518 static void fixup(void) /* build indexes of where . a b c ... start */
523 for (lastc
= pats
, p
= pats
+1; p
< nextpat
; p
++)
529 for (p
= pats
+1; p
< nextpat
; ) {
530 n
= trieindex(p
[0], p
[1]);
535 /* printf("pats = %d\n", nextpat - pats); */
538 static int trieindex(int d1
, int d2
)
542 i
= 27*(d1
== '.'? 0: d1
- 'a' + 1) + (d2
== '.'? 0: d2
- 'a' + 1);
543 assert(0 <= i
&& i
< 27*27);