reg: use snprintf for string values in num_str()
[neatroff.git] / hyph.c
blob4107a40158e09d7d7109598a5725f6c751605064
1 /* hyphenation */
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "roff.h"
6 #include "hyen.h"
8 #define HYPATLEN (NHYPHS * 16) /* hyphenation pattern length */
10 static void hcode_strcpy(char *d, char *s, int *map, int dots);
11 static int hcode_mapchar(char *s);
13 /* the hyphenation dictionary (.hw) */
15 static char hwword[HYPATLEN]; /* buffer for .hw words */
16 static char hwhyph[HYPATLEN]; /* buffer for .hw hyphenations */
17 static int hwword_len; /* used hwword[] length */
18 static struct dict *hwdict; /* map words to their index in hwoff[] */
19 static int hwoff[NHYPHS]; /* the offset of words in hwword[] */
20 static int hw_n; /* the number of dictionary words */
22 /* read a single character from s into d; return the number of characters read */
23 static int hy_cget(char *d, char *s)
25 int i = 0;
26 if (s[0] != '\\')
27 return utf8read(&s, d);
28 if (s[1] == '[') {
29 s += 2;
30 while (*s && *s != ']' && i < GNLEN - 1)
31 d[i++] = *s++;
32 d[i] = '\0';
33 return *s ? i + 3 : i + 2;
35 if (s[1] == '(') {
36 s += 2;
37 i += utf8read(&s, d + i);
38 i += utf8read(&s, d + i);
39 return 2 + i;
41 if (s[1] == 'C') {
42 int q = s[2];
43 s += 3;
44 while (*s && *s != q && i < GNLEN - 1)
45 d[i++] = *s++;
46 d[i] = '\0';
47 return *s ? i + 4 : i + 3;
49 *d++ = *s++;
50 return 1 + utf8read(&s, d);
53 /* append character s to d; return the number of characters written */
54 int hy_cput(char *d, char *s)
56 if (!s[0] || !s[1] || utf8one(s))
57 strcpy(d, s);
58 else if (s[0] == '\\')
59 strcpy(d, s);
60 else if (!s[2])
61 snprintf(d, GNLEN, "\\[%s]", s);
62 return strlen(d);
65 /* insert word s into hwword[] and hwhyph[] */
66 static void hw_add(char *s)
68 char *p = hwword + hwword_len;
69 char *n = hwhyph + hwword_len;
70 int len = strlen(s) + 1;
71 int i = 0, c;
72 if (hw_n == NHYPHS || hwword_len + len > sizeof(hwword))
73 return;
74 memset(n, 0, len);
75 while ((c = (unsigned char) *s++)) {
76 if (c == '-')
77 n[i] = 1;
78 else
79 p[i++] = c;
81 p[i] = '\0';
82 hwoff[hw_n] = hwword_len;
83 dict_put(hwdict, hwword + hwoff[hw_n], hw_n);
84 hwword_len += i + 1;
85 hw_n++;
88 static int hw_lookup(char *word, char *hyph)
90 char word2[WORDLEN] = {0};
91 char *hyph2;
92 int map[WORDLEN] = {0};
93 int off = 0;
94 int i, j, idx = -1;
95 hcode_strcpy(word2, word, map, 0);
96 while (word2[off] == '.') /* skip unknown characters at the front */
97 off++;
98 i = dict_prefix(hwdict, word2 + off, &idx);
99 if (i < 0)
100 return 1;
101 hyph2 = hwhyph + hwoff[i];
102 for (j = 0; word2[j + off]; j++)
103 if (hyph2[j])
104 hyph[map[j + off]] = hyph2[j];
105 return 0;
108 void tr_hw(char **args)
110 char word[WORDLEN];
111 char *c;
112 int i;
113 for (i = 1; i < NARGS && args[i]; i++) {
114 char *s = args[i];
115 char *d = word;
116 while (d - word < WORDLEN - GNLEN && !escread(&s, &c)) {
117 if (strcmp("-", c))
118 hcode_mapchar(c);
119 d += hy_cput(d, c);
121 hw_add(word);
125 /* the tex hyphenation algorithm */
127 static int hyinit; /* hyphenation data initialized */
128 static char hypats[HYPATLEN]; /* hyphenation patterns */
129 static char hynums[HYPATLEN]; /* hyphenation pattern numbers */
130 static int hypats_len; /* used hypats[] and hynums[] length */
131 static struct dict *hydict; /* map patterns to their index in hyoff[] */
132 static int hyoff[NHYPHS]; /* the offset of this pattern in hypats[] */
133 static int hy_n; /* the number of patterns */
135 /* find the patterns matching s and update hyphenation values in n */
136 static void hy_find(char *s, char *n)
138 int plen;
139 char *p, *np;
140 int i, j;
141 int idx = -1;
142 while ((i = dict_prefix(hydict, s, &idx)) >= 0) {
143 p = hypats + hyoff[i];
144 np = hynums + (p - hypats);
145 plen = strlen(p) + 1;
146 for (j = 0; j < plen; j++)
147 if (n[j] < np[j])
148 n[j] = np[j];
152 /* mark the hyphenation points of word in hyph */
153 static void hy_dohyph(char *hyph, char *word, int flg)
155 char w[WORDLEN] = {0}; /* cleaned-up word[]; "Abc" -> ".abc." */
156 char n[WORDLEN] = {0}; /* the hyphenation value for w[] */
157 int c[WORDLEN]; /* start of the i-th character in w */
158 int wmap[WORDLEN] = {0}; /* w[i] corresponds to word[wmap[i]] */
159 char ch[GNLEN];
160 int nc = 0;
161 int i, wlen;
162 hcode_strcpy(w, word, wmap, 1);
163 wlen = strlen(w);
164 for (i = 0; i < wlen - 1; i += hy_cget(ch, w + i))
165 c[nc++] = i;
166 for (i = 0; i < nc - 1; i++)
167 hy_find(w + c[i], n + c[i]);
168 memset(hyph, 0, wlen * sizeof(hyph[0]));
169 for (i = 3; i < nc - 2; i++)
170 if (n[c[i]] % 2 && w[c[i - 1]] != '.' && w[c[i]] != '.' &&
171 w[c[i - 2]] != '.' && w[c[i + 1]] != '.' &&
172 (~flg & HY_FINAL2 || w[c[i + 2]] != '.') &&
173 (~flg & HY_FIRST2 || w[c[i - 3]] != '.'))
174 hyph[wmap[c[i]]] = 1;
177 /* insert pattern s into hypats[] and hynums[] */
178 static void hy_add(char *s)
180 char *p = hypats + hypats_len;
181 char *n = hynums + hypats_len;
182 int len = strlen(s) + 1;
183 int i = 0, c;
184 if (hy_n >= NHYPHS || hypats_len + len >= sizeof(hypats))
185 return;
186 memset(n, 0, len);
187 while ((c = (unsigned char) *s++)) {
188 if (c >= '0' && c <= '9')
189 n[i] = c - '0';
190 else
191 p[i++] = c;
193 p[i] = '\0';
194 hyoff[hy_n] = hypats_len;
195 dict_put(hydict, hypats + hyoff[hy_n], hy_n);
196 hypats_len += i + 1;
197 hy_n++;
200 /* .hcode request */
201 static struct dict *hcodedict;
202 static char hcodesrc[NHCODES][GNLEN];
203 static char hcodedst[NHCODES][GNLEN];
204 static int hcode_n;
206 /* replace the character in s after .hcode mapping; returns s's new length */
207 static int hcode_mapchar(char *s)
209 int i = dict_get(hcodedict, s);
210 if (i >= 0)
211 strcpy(s, hcodedst[i]);
212 else if (!s[1])
213 *s = isalpha((unsigned char) *s) ? tolower((unsigned char) *s) : '.';
214 return strlen(s);
217 /* copy s to d after .hcode mappings; s[map[j]] corresponds to d[j] */
218 static void hcode_strcpy(char *d, char *s, int *map, int dots)
220 char c[GNLEN];
221 int di = 0, si = 0;
222 if (dots)
223 d[di++] = '.';
224 while (di < WORDLEN - GNLEN && s[si]) {
225 map[di] = si;
226 si += hy_cget(c, s + si);
227 hcode_mapchar(c);
228 di += hy_cput(d + di, c);
230 if (dots)
231 d[di++] = '.';
232 d[di] = '\0';
235 static void hcode_add(char *c1, char *c2)
237 int i = dict_get(hcodedict, c1);
238 if (i >= 0) {
239 strcpy(hcodedst[i], c2);
240 } else if (hcode_n < NHCODES) {
241 strcpy(hcodesrc[hcode_n], c1);
242 strcpy(hcodedst[hcode_n], c2);
243 dict_put(hcodedict, hcodesrc[hcode_n], hcode_n);
244 hcode_n++;
248 void tr_hcode(char **args)
250 char c1[GNLEN], c2[GNLEN];
251 char *s = args[1];
252 while (s && charread(&s, c1) >= 0 && charread(&s, c2) >= 0)
253 hcode_add(c1, c2);
256 static void hyph_readpatterns(char *s)
258 char word[WORDLEN];
259 char *d;
260 while (*s) {
261 d = word;
262 while (*s && !isspace((unsigned char) *s))
263 *d++ = *s++;
264 *d = '\0';
265 hy_add(word);
266 while (*s && isspace((unsigned char) *s))
267 s++;
271 static void hyph_readexceptions(char *s)
273 char word[WORDLEN];
274 char *d;
275 while (*s) {
276 d = word;
277 while (*s && !isspace((unsigned char) *s))
278 *d++ = *s++;
279 *d = '\0';
280 hw_add(word);
281 while (*s && isspace((unsigned char) *s))
282 s++;
286 void hyphenate(char *hyph, char *word, int flg)
288 if (!hyinit) {
289 hyinit = 1;
290 hyph_readpatterns(en_patterns);
291 hyph_readexceptions(en_exceptions);
293 if (hw_lookup(word, hyph))
294 hy_dohyph(hyph, word, flg);
/* pairs of {lowercase, uppercase} characters, used for .hcode mappings */
static char *hycase[][2] = {
	{"a", "A"}, {"á", "Á"}, {"à", "À"}, {"ă", "Ă"}, {"â", "Â"},
	{"ǎ", "Ǎ"}, {"å", "Å"}, {"ä", "Ä"}, {"ã", "Ã"}, {"ą", "Ą"},
	{"ā", "Ā"}, {"æ", "Æ"}, {"ǽ", "Ǽ"}, {"b", "B"}, {"c", "C"},
	{"ć", "Ć"}, {"ĉ", "Ĉ"}, {"č", "Č"}, {"ç", "Ç"}, {"d", "D"},
	{"ď", "Ď"}, {"đ", "Đ"}, {"ḍ", "Ḍ"}, {"ð", "Ð"}, {"e", "E"},
	{"é", "É"}, {"è", "È"}, {"ê", "Ê"}, {"ě", "Ě"}, {"ë", "Ë"},
	{"ė", "Ė"}, {"ę", "Ę"}, {"ē", "Ē"}, {"f", "F"}, {"g", "G"},
	{"ğ", "Ğ"}, {"ĝ", "Ĝ"}, {"ģ", "Ģ"}, {"h", "H"}, {"ĥ", "Ĥ"},
	{"ḥ", "Ḥ"}, {"ḫ", "Ḫ"}, {"i", "I"}, {"ı", "I"}, {"í", "Í"},
	{"ì", "Ì"}, {"î", "Î"}, {"ǐ", "Ǐ"}, {"ï", "Ï"}, {"į", "Į"},
	{"ī", "Ī"}, {"j", "J"}, {"ĵ", "Ĵ"}, {"k", "K"}, {"ķ", "Ķ"},
	{"l", "L"}, {"ľ", "Ľ"}, {"ł", "Ł"}, {"ļ", "Ļ"}, {"ḷ", "Ḷ"},
	{"m", "M"}, {"ṁ", "Ṁ"}, {"ṃ", "Ṃ"}, {"n", "N"}, {"ń", "Ń"},
	{"ň", "Ň"}, {"ñ", "Ñ"}, {"ṅ", "Ṅ"}, {"ņ", "Ņ"}, {"ṇ", "Ṇ"},
	{"œ", "Œ"}, {"o", "O"}, {"ó", "Ó"}, {"ò", "Ò"}, {"ô", "Ô"},
	{"ǒ", "Ǒ"}, {"ö", "Ö"}, {"ő", "Ő"}, {"õ", "Õ"}, {"ø", "Ø"},
	{"ō", "Ō"}, {"p", "P"}, {"q", "Q"}, {"r", "R"}, {"ŕ", "Ŕ"},
	{"ř", "Ř"}, {"s", "S"}, {"ś", "Ś"}, {"ŝ", "Ŝ"}, {"š", "Š"},
	{"ş", "Ş"}, {"ṣ", "Ṣ"}, {"t", "T"}, {"ť", "Ť"}, {"ț", "Ț"},
	{"ṭ", "Ṭ"}, {"u", "U"}, {"ú", "Ú"}, {"ù", "Ù"}, {"ŭ", "Ŭ"},
	{"û", "Û"}, {"ǔ", "Ǔ"}, {"ů", "Ů"}, {"ü", "Ü"}, {"ǘ", "Ǘ"},
	{"ǜ", "Ǜ"}, {"ǚ", "Ǚ"}, {"ǖ", "Ǖ"}, {"ű", "Ű"}, {"ų", "Ų"},
	{"ū", "Ū"}, {"v", "V"}, {"w", "W"}, {"x", "X"}, {"y", "Y"},
	{"ý", "Ý"}, {"z", "Z"}, {"ź", "Ź"}, {"ž", "Ž"}, {"ż", "Ż"},
	{"þ", "Þ"}, {"α", "Α"}, {"ά", "Ά"}, {"β", "Β"}, {"ϐ", "Β"},
	{"γ", "Γ"}, {"δ", "Δ"}, {"ϫ", "Ϫ"}, {"ε", "Ε"}, {"έ", "Έ"},
	{"ζ", "Ζ"}, {"ϩ", "Ϩ"}, {"η", "Η"}, {"ή", "Ή"}, {"θ", "Θ"},
	{"ι", "Ι"}, {"ί", "Ί"}, {"ϊ", "Ϊ"}, {"κ", "Κ"}, {"ϧ", "Ϧ"},
	{"λ", "Λ"}, {"μ", "Μ"}, {"ν", "Ν"}, {"ξ", "Ξ"}, {"ο", "Ο"},
	{"ό", "Ό"}, {"π", "Π"}, {"ρ", "Ρ"}, {"ϲ", "Ϲ"}, {"σ", "Σ"},
	{"ς", "Σ"}, {"ϭ", "Ϭ"}, {"τ", "Τ"}, {"ϯ", "Ϯ"}, {"υ", "Υ"},
	{"ύ", "Ύ"}, {"ϋ", "Ϋ"}, {"φ", "Φ"}, {"ϥ", "Ϥ"}, {"χ", "Χ"},
	{"ψ", "Ψ"}, {"ϣ", "Ϣ"}, {"ω", "Ω"}, {"ώ", "Ώ"}, {"а", "А"},
	{"ӓ", "Ӓ"}, {"б", "Б"}, {"в", "В"}, {"г", "Г"}, {"ґ", "Ґ"},
	{"д", "Д"}, {"ђ", "Ђ"}, {"е", "Е"}, {"ѐ", "Ѐ"}, {"є", "Є"},
	{"ё", "Ё"}, {"ж", "Ж"}, {"з", "З"}, {"ѕ", "Ѕ"}, {"и", "И"},
	{"ѝ", "Ѝ"}, {"ӥ", "Ӥ"}, {"і", "І"}, {"ї", "Ї"}, {"й", "Й"},
	{"ј", "Ј"}, {"к", "К"}, {"л", "Л"}, {"љ", "Љ"}, {"м", "М"},
	{"н", "Н"}, {"њ", "Њ"}, {"ᲂ", "О"}, {"о", "О"}, {"ӧ", "Ӧ"},
	{"ө", "Ө"}, {"п", "П"}, {"р", "Р"}, {"с", "С"}, {"т", "Т"},
	{"ћ", "Ћ"}, {"у", "У"}, {"ӱ", "Ӱ"}, {"ү", "Ү"}, {"ў", "Ў"},
	{"ф", "Ф"}, {"х", "Х"}, {"ц", "Ц"}, {"ч", "Ч"}, {"џ", "Џ"},
	{"ш", "Ш"}, {"щ", "Щ"}, {"ᲆ", "Ъ"}, {"ъ", "Ъ"}, {"ы", "Ы"},
	{"ӹ", "Ӹ"}, {"ь", "Ь"}, {"э", "Э"}, {"ӭ", "Ӭ"}, {"ю", "Ю"},
	{"я", "Я"}, {"ա", "Ա"}, {"բ", "Բ"}, {"գ", "Գ"}, {"դ", "Դ"},
	{"ե", "Ե"}, {"զ", "Զ"}, {"է", "Է"}, {"ը", "Ը"}, {"թ", "Թ"},
	{"ժ", "Ժ"}, {"ի", "Ի"}, {"լ", "Լ"}, {"խ", "Խ"}, {"ծ", "Ծ"},
	{"կ", "Կ"}, {"հ", "Հ"}, {"ձ", "Ձ"}, {"ղ", "Ղ"}, {"ճ", "Ճ"},
	{"մ", "Մ"}, {"յ", "Յ"}, {"ն", "Ն"}, {"շ", "Շ"}, {"ո", "Ո"},
	{"չ", "Չ"}, {"պ", "Պ"}, {"ջ", "Ջ"}, {"ռ", "Ռ"}, {"ս", "Ս"},
	{"վ", "Վ"}, {"տ", "Տ"}, {"ր", "Ր"}, {"ց", "Ց"}, {"փ", "Փ"},
	{"ք", "Ք"}, {"օ", "Օ"},
};
353 void tr_hpfa(char **args)
355 char tok[128], c1[GNLEN], c2[GNLEN];
356 FILE *filp;
357 hyinit = 1;
358 /* load english hyphenation patterns with no arguments */
359 if (!args[1]) {
360 hyph_readpatterns(en_patterns);
361 hyph_readexceptions(en_exceptions);
363 /* reading patterns */
364 if (args[1] && (filp = fopen(args[1], "r"))) {
365 while (fscanf(filp, "%128s", tok) == 1)
366 if (strlen(tok) < WORDLEN)
367 hy_add(tok);
368 fclose(filp);
370 /* reading exceptions */
371 if (args[2] && (filp = fopen(args[2], "r"))) {
372 while (fscanf(filp, "%128s", tok) == 1)
373 if (strlen(tok) < WORDLEN)
374 hw_add(tok);
375 fclose(filp);
377 /* reading hcode mappings */
378 if (args[3] && (filp = fopen(args[3], "r"))) {
379 while (fscanf(filp, "%128s", tok) == 1) {
380 char *s = tok;
381 if (utf8read(&s, c1) && utf8read(&s, c2) && !*s)
382 hcode_add(c2, c1); /* inverting */
384 fclose(filp);
386 /* lowercase-uppercase character hcode mappings */
387 if (args[3] && !strcmp("-", args[3])) {
388 int i;
389 for (i = 0; i < LEN(hycase); i++)
390 hcode_add(hycase[i][1], hycase[i][0]);
394 void hyph_init(void)
396 hwdict = dict_make(-1, 0, 2);
397 hydict = dict_make(-1, 0, 2);
398 hcodedict = dict_make(-1, 0, 1);
401 void hyph_done(void)
403 if (hwdict)
404 dict_free(hwdict);
405 if (hydict)
406 dict_free(hydict);
407 if (hcodedict)
408 dict_free(hcodedict);
411 void tr_hpf(char **args)
413 /* reseting the patterns */
414 hypats_len = 0;
415 hy_n = 0;
416 dict_free(hydict);
417 /* reseting the dictionary */
418 hwword_len = 0;
419 hw_n = 0;
420 dict_free(hwdict);
421 /* reseting hcode mappings */
422 hcode_n = 0;
423 dict_free(hcodedict);
424 /* reading */
425 hyph_init();
426 tr_hpfa(args);