hyph: allow utf-8 hyphenation patterns
[neatroff.git] / hyph.c
blob2e88c617f210a87fdba4c189481b9165721453be
1 /* hyphenation */
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "roff.h"
6 #include "hyen.h"
8 #define HYPATLEN (NHYPHS * 16) /* hyphenation pattern length */
/*
 * the hyphenation dictionary (.hw)
 *
 * Words are stored back to back, each NUL-terminated, in hwword[].
 * hwhyph[] is indexed by the same offsets; hw_add() sets a byte to 1
 * at the offset of the character that followed a '-' in the word.
 */
static char hwword[HYPATLEN];	/* buffer for .hw words */
static char hwhyph[HYPATLEN];	/* buffer for .hw hyphenations */
static int hwword_len;		/* used hwword[] length */
/*
 * word lists (per starting character) for dictionary entries;
 * entry 0 is never used, so 0 in hwhead[]/hwnext[] ends a list
 */
static int hwhead[256];		/* the head of hw_*[] lists */
static int hwnext[NHYPHS];	/* the next word with the same initial */
static int hwidx[NHYPHS];	/* the offset of this word in hwword[] */
static int hwlen[NHYPHS];	/* the length of the word */
static int hw_n = 1;		/* number of words in hw_*[] lists */
22 /* functions for the hyphenation dictionary */
24 static void hw_add(char *word)
26 char *s = word;
27 char *d = hwword + hwword_len;
28 int c, i;
29 if (hw_n == LEN(hwidx) || hwword_len + 128 > sizeof(hwword))
30 return;
31 i = hw_n++;
32 while ((c = *s++)) {
33 if (c == '-')
34 hwhyph[d - hwword] = 1;
35 else
36 *d++ = c;
38 *d++ = '\0';
39 hwidx[i] = hwword_len;
40 hwword_len = d - hwword;
41 hwlen[i] = hwword_len - hwidx[i] - 1;
42 hwnext[i] = hwhead[(unsigned char) word[0]];
43 hwhead[(unsigned char) word[0]] = i;
/*
 * Copy s to d, converting ASCII letters to lower case.  Bytes with the
 * high bit set are copied unchanged.  d must have room for strlen(s) + 1
 * bytes.
 */
static void hw_strcpy(char *d, char *s)
{
	char ch;
	for (; (ch = *s) != '\0'; s++, d++) {
		if (ch & 0x80)
			*d = ch;
		else
			*d = tolower(ch);
	}
	*d = '\0';
}
58 static char *hw_lookup(char *s)
60 char word[ILNLEN];
61 int i;
62 hw_strcpy(word, s);
63 /* finding a dictionary entry that matches a prefix of the input */
64 i = hwhead[(unsigned char) word[0]];
65 while (i > 0) {
66 if (!strncmp(word, hwword + hwidx[i], hwlen[i]))
67 return hwhyph + hwidx[i];
68 i = hwnext[i];
70 return NULL;
73 void tr_hw(char **args)
75 int i;
76 for (i = 1; i < NARGS && args[i]; i++)
77 hw_add(args[i]);
/*
 * the tex hyphenation algorithm
 *
 * Patterns are stored back to back, each NUL-terminated, in hypats[] with
 * their digits removed (see hy_ins()).  hynums[] is indexed by the same
 * offsets and holds the digit that preceded each pattern byte.
 */
static int hyinit;		/* hyphenation data initialized */
static char hypats[HYPATLEN];	/* the patterns */
static char hynums[HYPATLEN];	/* numbers in the patterns */
static int hypats_len;		/* used hypats[] length */
/*
 * lists (one per pair of starting bytes, see hy_idx()) for storing
 * patterns; entry 0 is never used, so 0 in hyhead[]/hynext[] ends a list
 */
static int hyhead[256 * 256];	/* the head of hy_*[] lists */
static int hynext[NHYPHS];	/* the next pattern with the same initial */
static int hyoff[NHYPHS];	/* the offset of this pattern in hypats[] */
static int hy_n = 1;		/* number of words in hy_*[] lists */
/* map '.' to 0; every other byte value maps to itself */
#define HYC_MAP(c)	((c) == '.' ? 0 : (c))

/*
 * Index in hyhead[] of the list for strings starting with the bytes s[0]
 * and s[1]: s[0] is the low byte of the index and s[1] the high byte,
 * both passed through HYC_MAP().  s[1] is read even for a one-byte
 * string, in which case it is the terminating NUL.
 */
static int hy_idx(char *s)
{
	int lo = HYC_MAP((unsigned char) s[0]);
	int hi = HYC_MAP((unsigned char) s[1]);
	return (hi << 8) | lo;
}
/*
 * Copy s to d in the form used for pattern matching: a '.' is put before
 * and after the word, ASCII letters are lower-cased and every other ASCII
 * byte becomes '.'.  Bytes with the high bit set are copied unchanged.
 * d must have room for strlen(s) + 3 bytes.
 */
static void hy_strcpy(char *d, char *s)
{
	unsigned char *in = (unsigned char *) s;
	*d++ = '.';
	for (; *in; in++) {
		if (*in & 0x80)
			*d++ = *in;
		else if (isalpha(*in))
			*d++ = tolower(*in);
		else
			*d++ = '.';
	}
	*d++ = '.';
	*d = '\0';
}
112 /* find the patterns matching s and update hyphenation values in n */
113 static void hy_find(char *s, char *n)
115 int plen;
116 char *p, *np;
117 int j;
118 int idx = hyhead[hy_idx(s)];
119 while (idx > 0) {
120 p = hypats + hyoff[idx];
121 np = hynums + (p - hypats);
122 plen = strlen(p);
123 if (!strncmp(s + 2, p + 2, plen - 2))
124 for (j = 0; j < plen; j++)
125 if (n[j] < np[j])
126 n[j] = np[j];
127 idx = hynext[idx];
131 /* mark the hyphenation points of word in hyph */
132 static void hy_dohyph(char *hyph, char *word, int flg)
134 char n[ILNLEN] = {0};
135 char w[ILNLEN];
136 int c[ILNLEN]; /* start of the i-th character in w */
137 int nc = 0;
138 int i, wlen;
139 hy_strcpy(w, word);
140 wlen = strlen(w);
141 for (i = 0; i < wlen - 1; i += utf8len((unsigned int) w[i]))
142 c[nc++] = i;
143 for (i = 0; i < nc - 1; i++)
144 hy_find(w + c[i], n + c[i]);
145 memset(hyph, 0, wlen * sizeof(hyph[0]));
146 for (i = 3; i < nc - 2; i++)
147 if (n[i] % 2 && w[c[i - 1]] != '.' && w[c[i - 2]] != '.' && w[c[i + 1]] != '.')
148 hyph[c[i - 1]] = (~flg & HY_FINAL2 || w[c[i + 2]] != '.') &&
149 (~flg & HY_FIRST2 || w[c[i - 3]] != '.');
152 /* insert pattern s into hypats[] and hynums[] */
153 static void hy_ins(char *s)
155 char *p = hypats + hypats_len;
156 char *n = hynums + hypats_len;
157 int i = 0, idx;
158 if (hy_n >= NHYPHS || hypats_len + 64 >= sizeof(hypats))
159 return;
160 idx = hy_n++;
161 while (*s) {
162 if (*s >= '0' && *s <= '9')
163 n[i] = *s++ - '0';
164 else
165 p[i++] = *s++;
167 p[i] = '\0';
168 hyoff[idx] = hypats_len;
169 hynext[idx] = hyhead[hy_idx(p)];
170 hyhead[hy_idx(p)] = idx;
171 hypats_len += i + 1;
174 static void hyph_readpatterns(char *s)
176 char word[ILNLEN];
177 char *d;
178 while (*s) {
179 d = word;
180 while (*s && !isspace((unsigned char) *s))
181 *d++ = *s++;
182 *d = '\0';
183 hy_ins(word);
184 while (*s && isspace((unsigned char) *s))
185 s++;
189 static void hyph_readexceptions(char *s)
191 char word[ILNLEN];
192 char *d;
193 while (*s) {
194 d = word;
195 while (*s && !isspace((unsigned char) *s))
196 *d++ = *s++;
197 *d = '\0';
198 hw_add(word);
199 while (*s && isspace((unsigned char) *s))
200 s++;
204 void hyphenate(char *hyph, char *word, int flg)
206 char *r;
207 if (!hyinit) {
208 hyinit = 1;
209 hyph_readpatterns(en_patterns);
210 hyph_readexceptions(en_exceptions);
212 r = hw_lookup(word);
213 if (r)
214 memcpy(hyph, r, strlen(word) + 1);
215 else
216 hy_dohyph(hyph, word, flg);
219 void tr_hpfa(char **args)
221 char tok[ILNLEN];
222 FILE *filp;
223 /* reading patterns */
224 if (args[1]) {
225 hyinit = 1;
226 filp = fopen(args[1], "r");
227 while (fscanf(filp, "%s", tok) == 1)
228 hy_ins(tok);
229 fclose(filp);
231 /* reading exceptions */
232 if (args[2]) {
233 filp = fopen(args[1], "r");
234 while (fscanf(filp, "%s", tok) == 1)
235 hw_add(tok);
236 fclose(filp);
240 void tr_hpf(char **args)
242 /* reseting the patterns */
243 hypats_len = 0;
244 hy_n = 1;
245 memset(hyhead, 0, sizeof(hyhead));
246 memset(hynext, 0, sizeof(hynext));
247 /* reseting the dictionary */
248 hwword_len = 0;
249 hw_n = 1;
250 memset(hwhead, 0, sizeof(hwhead));
251 memset(hwnext, 0, sizeof(hwnext));
252 /* reading */
253 tr_hpfa(args);