Merge branch 'obsd-master'
[tmux.git] / utf8.c
blob1b3acd5323e218f8510c8bb53d1eb1da1061c05a
1 /* $OpenBSD$ */
3 /*
4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/types.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wchar.h>
27 #include "tmux.h"
/*
 * Code points that are always given a width of two columns, whatever the
 * platform width function would report. The table is searched with bsearch()
 * so the entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
	0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC,
	0x1F3FD, 0x1F3FE, 0x1F3FF, 0x1F442, 0x1F443, 0x1F446,
	0x1F447, 0x1F448, 0x1F449, 0x1F44A, 0x1F44B, 0x1F44C,
	0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467,
	0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475,
	0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482,
	0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F, 0x1F491,
	0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595,
	0x1F596, 0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C,
	0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5,
	0x1F6B6, 0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918,
	0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D, 0x1F91E,
	0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933,
	0x1F934, 0x1F935, 0x1F936, 0x1F937, 0x1F938, 0x1F939,
	0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8,
	0x1F9B9, 0x1F9BB, 0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1,
	0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
194 struct utf8_item {
195 RB_ENTRY(utf8_item) index_entry;
196 u_int index;
198 RB_ENTRY(utf8_item) data_entry;
199 char data[UTF8_SIZE];
200 u_char size;
203 static int
204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
206 if (ui1->size < ui2->size)
207 return (-1);
208 if (ui1->size > ui2->size)
209 return (1);
210 return (memcmp(ui1->data, ui2->data, ui1->size));
212 RB_HEAD(utf8_data_tree, utf8_item);
213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
216 static int
217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
219 if (ui1->index < ui2->index)
220 return (-1);
221 if (ui1->index > ui2->index)
222 return (1);
223 return (0);
225 RB_HEAD(utf8_index_tree, utf8_item);
226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
229 static u_int utf8_next_index;
/*
 * Fields packed into a utf8_char. The low 24 bits carry the payload (the
 * bytes themselves or an item index), the five bits from bit 24 carry the
 * size, and the bits from bit 29 upwards carry the width plus one.
 */

/* Size field. */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)

/* Width field, stored with an offset of one. */
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
237 /* Get a UTF-8 item from data. */
238 static struct utf8_item *
239 utf8_item_by_data(const u_char *data, size_t size)
241 struct utf8_item ui;
243 memcpy(ui.data, data, size);
244 ui.size = size;
246 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
249 /* Get a UTF-8 item from data. */
250 static struct utf8_item *
251 utf8_item_by_index(u_int index)
253 struct utf8_item ui;
255 ui.index = index;
257 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
260 /* Add a UTF-8 item. */
261 static int
262 utf8_put_item(const u_char *data, size_t size, u_int *index)
264 struct utf8_item *ui;
266 ui = utf8_item_by_data(data, size);
267 if (ui != NULL) {
268 *index = ui->index;
269 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
270 *index);
271 return (0);
274 if (utf8_next_index == 0xffffff + 1)
275 return (-1);
277 ui = xcalloc(1, sizeof *ui);
278 ui->index = utf8_next_index++;
279 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
281 memcpy(ui->data, data, size);
282 ui->size = size;
283 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
285 *index = ui->index;
286 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
287 return (0);
/* bsearch() comparator for two wchar_t values; returns -1, 0 or 1. */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
302 /* Check if character in table. */
304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
306 wchar_t *found;
308 found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
309 return (found != NULL);
312 /* Get UTF-8 character from data. */
313 enum utf8_state
314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
316 u_int index;
318 if (ud->width > 2)
319 fatalx("invalid UTF-8 width: %u", ud->width);
321 if (ud->size > UTF8_SIZE)
322 goto fail;
323 if (ud->size <= 3) {
324 index = (((utf8_char)ud->data[2] << 16)|
325 ((utf8_char)ud->data[1] << 8)|
326 ((utf8_char)ud->data[0]));
327 } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
328 goto fail;
329 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
330 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
331 (int)ud->size, ud->data, *uc);
332 return (UTF8_DONE);
334 fail:
335 if (ud->width == 0)
336 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
337 else if (ud->width == 1)
338 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
339 else
340 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
341 return (UTF8_ERROR);
344 /* Get UTF-8 data from character. */
345 void
346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
348 struct utf8_item *ui;
349 u_int index;
351 memset(ud, 0, sizeof *ud);
352 ud->size = ud->have = UTF8_GET_SIZE(uc);
353 ud->width = UTF8_GET_WIDTH(uc);
355 if (ud->size <= 3) {
356 ud->data[2] = (uc >> 16);
357 ud->data[1] = ((uc >> 8) & 0xff);
358 ud->data[0] = (uc & 0xff);
359 } else {
360 index = (uc & 0xffffff);
361 if ((ui = utf8_item_by_index(index)) == NULL)
362 memset(ud->data, ' ', ud->size);
363 else
364 memcpy(ud->data, ui->data, ud->size);
367 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
368 (int)ud->size, ud->data);
371 /* Get UTF-8 character from a single ASCII character. */
372 u_int
373 utf8_build_one(u_char ch)
375 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
378 /* Set a single character. */
379 void
380 utf8_set(struct utf8_data *ud, u_char ch)
382 static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
384 memcpy(ud, &empty, sizeof *ud);
385 *ud->data = ch;
388 /* Copy UTF-8 character. */
389 void
390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
392 u_int i;
394 memcpy(to, from, sizeof *to);
396 for (i = to->size; i < sizeof to->data; i++)
397 to->data[i] = '\0';
400 /* Get width of Unicode character. */
401 static enum utf8_state
402 utf8_width(struct utf8_data *ud, int *width)
404 wchar_t wc;
406 if (utf8_towc(ud, &wc) != UTF8_DONE)
407 return (UTF8_ERROR);
408 if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
409 *width = 2;
410 return (UTF8_DONE);
412 #ifdef HAVE_UTF8PROC
413 *width = utf8proc_wcwidth(wc);
414 log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
415 #else
416 *width = wcwidth(wc);
417 log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
418 if (*width < 0) {
420 * C1 control characters are nonprintable, so they are always
421 * zero width.
423 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
425 #endif
426 if (*width >= 0 && *width <= 0xff)
427 return (UTF8_DONE);
428 return (UTF8_ERROR);
431 /* Convert UTF-8 character to wide character. */
432 enum utf8_state
433 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
435 #ifdef HAVE_UTF8PROC
436 switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
437 #else
438 switch (mbtowc(wc, ud->data, ud->size)) {
439 #endif
440 case -1:
441 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
442 errno);
443 mbtowc(NULL, NULL, MB_CUR_MAX);
444 return (UTF8_ERROR);
445 case 0:
446 return (UTF8_ERROR);
448 log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
449 return (UTF8_DONE);
453 * Open UTF-8 sequence.
455 * 11000010-11011111 C2-DF start of 2-byte sequence
456 * 11100000-11101111 E0-EF start of 3-byte sequence
457 * 11110000-11110100 F0-F4 start of 4-byte sequence
459 enum utf8_state
460 utf8_open(struct utf8_data *ud, u_char ch)
462 memset(ud, 0, sizeof *ud);
463 if (ch >= 0xc2 && ch <= 0xdf)
464 ud->size = 2;
465 else if (ch >= 0xe0 && ch <= 0xef)
466 ud->size = 3;
467 else if (ch >= 0xf0 && ch <= 0xf4)
468 ud->size = 4;
469 else
470 return (UTF8_ERROR);
471 utf8_append(ud, ch);
472 return (UTF8_MORE);
475 /* Append character to UTF-8, closing if finished. */
476 enum utf8_state
477 utf8_append(struct utf8_data *ud, u_char ch)
479 int width;
481 if (ud->have >= ud->size)
482 fatalx("UTF-8 character overflow");
483 if (ud->size > sizeof ud->data)
484 fatalx("UTF-8 character size too large");
486 if (ud->have != 0 && (ch & 0xc0) != 0x80)
487 ud->width = 0xff;
489 ud->data[ud->have++] = ch;
490 if (ud->have != ud->size)
491 return (UTF8_MORE);
493 if (ud->width == 0xff)
494 return (UTF8_ERROR);
495 if (utf8_width(ud, &width) != UTF8_DONE)
496 return (UTF8_ERROR);
497 ud->width = width;
499 return (UTF8_DONE);
503 * Encode len characters from src into dst, which is guaranteed to have four
504 * bytes available for each character from src (for \abc or UTF-8) plus space
505 * for \0.
508 utf8_strvis(char *dst, const char *src, size_t len, int flag)
510 struct utf8_data ud;
511 const char *start = dst, *end = src + len;
512 enum utf8_state more;
513 size_t i;
515 while (src < end) {
516 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
517 while (++src < end && more == UTF8_MORE)
518 more = utf8_append(&ud, *src);
519 if (more == UTF8_DONE) {
520 /* UTF-8 character finished. */
521 for (i = 0; i < ud.size; i++)
522 *dst++ = ud.data[i];
523 continue;
525 /* Not a complete, valid UTF-8 character. */
526 src -= ud.have;
528 if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
529 if (isalpha((u_char)src[1]) ||
530 src[1] == '_' ||
531 src[1] == '{')
532 *dst++ = '\\';
533 *dst++ = '$';
534 } else if (src < end - 1)
535 dst = vis(dst, src[0], flag, src[1]);
536 else if (src < end)
537 dst = vis(dst, src[0], flag, '\0');
538 src++;
540 *dst = '\0';
541 return (dst - start);
/*
 * Same as utf8_strvis but allocate the buffer. The whole of the
 * \0-terminated src is encoded; *dst receives the result and the encoded
 * length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Four output bytes for every input byte, plus the terminator. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
558 /* Same as utf8_strvis but allocate the buffer. */
560 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
562 char *buf;
563 int len;
565 buf = xreallocarray(NULL, 4, srclen + 1);
566 len = utf8_strvis(buf, src, srclen, flag);
568 *dst = xrealloc(buf, len + 1);
569 return (len);
572 /* Does this string contain anything that isn't valid UTF-8? */
574 utf8_isvalid(const char *s)
576 struct utf8_data ud;
577 const char *end;
578 enum utf8_state more;
580 end = s + strlen(s);
581 while (s < end) {
582 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
583 while (++s < end && more == UTF8_MORE)
584 more = utf8_append(&ud, *s);
585 if (more == UTF8_DONE)
586 continue;
587 return (0);
589 if (*s < 0x20 || *s > 0x7e)
590 return (0);
591 s++;
593 return (1);
597 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
598 * the returned string. Anything not valid printable ASCII or UTF-8 is
599 * stripped.
601 char *
602 utf8_sanitize(const char *src)
604 char *dst = NULL;
605 size_t n = 0;
606 enum utf8_state more;
607 struct utf8_data ud;
608 u_int i;
610 while (*src != '\0') {
611 dst = xreallocarray(dst, n + 1, sizeof *dst);
612 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
613 while (*++src != '\0' && more == UTF8_MORE)
614 more = utf8_append(&ud, *src);
615 if (more == UTF8_DONE) {
616 dst = xreallocarray(dst, n + ud.width,
617 sizeof *dst);
618 for (i = 0; i < ud.width; i++)
619 dst[n++] = '_';
620 continue;
622 src -= ud.have;
624 if (*src > 0x1f && *src < 0x7f)
625 dst[n++] = *src;
626 else
627 dst[n++] = '_';
628 src++;
630 dst = xreallocarray(dst, n + 1, sizeof *dst);
631 dst[n] = '\0';
632 return (dst);
635 /* Get UTF-8 buffer length. */
636 size_t
637 utf8_strlen(const struct utf8_data *s)
639 size_t i;
641 for (i = 0; s[i].size != 0; i++)
642 /* nothing */;
643 return (i);
646 /* Get UTF-8 string width. */
647 u_int
648 utf8_strwidth(const struct utf8_data *s, ssize_t n)
650 ssize_t i;
651 u_int width = 0;
653 for (i = 0; s[i].size != 0; i++) {
654 if (n != -1 && n == i)
655 break;
656 width += s[i].width;
658 return (width);
662 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
663 * Caller frees.
665 struct utf8_data *
666 utf8_fromcstr(const char *src)
668 struct utf8_data *dst = NULL;
669 size_t n = 0;
670 enum utf8_state more;
672 while (*src != '\0') {
673 dst = xreallocarray(dst, n + 1, sizeof *dst);
674 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
675 while (*++src != '\0' && more == UTF8_MORE)
676 more = utf8_append(&dst[n], *src);
677 if (more == UTF8_DONE) {
678 n++;
679 continue;
681 src -= dst[n].have;
683 utf8_set(&dst[n], *src);
684 n++;
685 src++;
687 dst = xreallocarray(dst, n + 1, sizeof *dst);
688 dst[n].size = 0;
689 return (dst);
692 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
693 char *
694 utf8_tocstr(struct utf8_data *src)
696 char *dst = NULL;
697 size_t n = 0;
699 for(; src->size != 0; src++) {
700 dst = xreallocarray(dst, n + src->size, 1);
701 memcpy(dst + n, src->data, src->size);
702 n += src->size;
704 dst = xreallocarray(dst, n + 1, 1);
705 dst[n] = '\0';
706 return (dst);
709 /* Get width of UTF-8 string. */
710 u_int
711 utf8_cstrwidth(const char *s)
713 struct utf8_data tmp;
714 u_int width;
715 enum utf8_state more;
717 width = 0;
718 while (*s != '\0') {
719 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
720 while (*++s != '\0' && more == UTF8_MORE)
721 more = utf8_append(&tmp, *s);
722 if (more == UTF8_DONE) {
723 width += tmp.width;
724 continue;
726 s -= tmp.have;
728 if (*s > 0x1f && *s != 0x7f)
729 width++;
730 s++;
732 return (width);
735 /* Pad UTF-8 string to width on the left. Caller frees. */
736 char *
737 utf8_padcstr(const char *s, u_int width)
739 size_t slen;
740 char *out;
741 u_int n, i;
743 n = utf8_cstrwidth(s);
744 if (n >= width)
745 return (xstrdup(s));
747 slen = strlen(s);
748 out = xmalloc(slen + 1 + (width - n));
749 memcpy(out, s, slen);
750 for (i = n; i < width; i++)
751 out[slen++] = ' ';
752 out[slen] = '\0';
753 return (out);
756 /* Pad UTF-8 string to width on the right. Caller frees. */
757 char *
758 utf8_rpadcstr(const char *s, u_int width)
760 size_t slen;
761 char *out;
762 u_int n, i;
764 n = utf8_cstrwidth(s);
765 if (n >= width)
766 return (xstrdup(s));
768 slen = strlen(s);
769 out = xmalloc(slen + 1 + (width - n));
770 for (i = 0; i < width - n; i++)
771 out[i] = ' ';
772 memcpy(out + i, s, slen);
773 out[i + slen] = '\0';
774 return (out);
778 utf8_cstrhas(const char *s, const struct utf8_data *ud)
780 struct utf8_data *copy, *loop;
781 int found = 0;
783 copy = utf8_fromcstr(s);
784 for (loop = copy; loop->size != 0; loop++) {
785 if (loop->size != ud->size)
786 continue;
787 if (memcmp(loop->data, ud->data, loop->size) == 0) {
788 found = 1;
789 break;
792 free(copy);
794 return (found);