Merge branch 'obsd-master'
[tmux.git] / utf8.c
blobbc7c8fd29dc1773b0a5b5903dcaa8faf04d19165
1 /* $OpenBSD$ */
3 /*
4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/types.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wchar.h>
27 #include "compat.h"
28 #include "tmux.h"
/*
 * Code points that utf8_width() always reports as two columns wide, without
 * consulting the width lookup. The table is searched with bsearch(), so the
 * entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF, 0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4,
	0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC,
	0x1F3FD, 0x1F3FE, 0x1F3FF, 0x1F442, 0x1F443, 0x1F446,
	0x1F447, 0x1F448, 0x1F449, 0x1F44A, 0x1F44B, 0x1F44C,
	0x1F44D, 0x1F44E, 0x1F44F, 0x1F450, 0x1F466, 0x1F467,
	0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475,
	0x1F476, 0x1F477, 0x1F478, 0x1F47C, 0x1F481, 0x1F482,
	0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F, 0x1F491,
	0x1F4AA, 0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595,
	0x1F596, 0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C,
	0x1F64D, 0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5,
	0x1F6B6, 0x1F6C0, 0x1F6CC, 0x1F90C, 0x1F90F, 0x1F918,
	0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D, 0x1F91E,
	0x1F91F, 0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933,
	0x1F934, 0x1F935, 0x1F936, 0x1F937, 0x1F938, 0x1F939,
	0x1F93D, 0x1F93E, 0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8,
	0x1F9B9, 0x1F9BB, 0x1F9CD, 0x1F9CE, 0x1F9CF, 0x1F9D1,
	0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
195 struct utf8_item {
196 RB_ENTRY(utf8_item) index_entry;
197 u_int index;
199 RB_ENTRY(utf8_item) data_entry;
200 char data[UTF8_SIZE];
201 u_char size;
204 static int
205 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
207 if (ui1->size < ui2->size)
208 return (-1);
209 if (ui1->size > ui2->size)
210 return (1);
211 return (memcmp(ui1->data, ui2->data, ui1->size));
213 RB_HEAD(utf8_data_tree, utf8_item);
214 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
215 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
217 static int
218 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
220 if (ui1->index < ui2->index)
221 return (-1);
222 if (ui1->index > ui2->index)
223 return (1);
224 return (0);
226 RB_HEAD(utf8_index_tree, utf8_item);
227 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
228 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
230 static u_int utf8_next_index;
/*
 * Accessors for a packed utf8_char. The size in bytes lives in the five bits
 * starting at bit 24 and the width, stored as width + 1, in the bits from
 * bit 29 upwards. The bits below bit 24 are left for the character data.
 */
#define UTF8_GET_SIZE(uc)	(((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc)	(((uc) >> 29) - 1)

#define UTF8_SET_SIZE(size)	(((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width)	((((utf8_char)(width)) + 1) << 29)
238 /* Get a UTF-8 item from data. */
239 static struct utf8_item *
240 utf8_item_by_data(const u_char *data, size_t size)
242 struct utf8_item ui;
244 memcpy(ui.data, data, size);
245 ui.size = size;
247 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
250 /* Get a UTF-8 item from data. */
251 static struct utf8_item *
252 utf8_item_by_index(u_int index)
254 struct utf8_item ui;
256 ui.index = index;
258 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
261 /* Add a UTF-8 item. */
262 static int
263 utf8_put_item(const u_char *data, size_t size, u_int *index)
265 struct utf8_item *ui;
267 ui = utf8_item_by_data(data, size);
268 if (ui != NULL) {
269 *index = ui->index;
270 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
271 *index);
272 return (0);
275 if (utf8_next_index == 0xffffff + 1)
276 return (-1);
278 ui = xcalloc(1, sizeof *ui);
279 ui->index = utf8_next_index++;
280 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
282 memcpy(ui->data, data, size);
283 ui->size = size;
284 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
286 *index = ui->index;
287 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
288 return (0);
/*
 * bsearch() comparison callback for arrays of wchar_t: returns -1, 0 or 1 as
 * the first value is less than, equal to or greater than the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
303 /* Check if character in table. */
305 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
307 wchar_t *found;
309 found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
310 return (found != NULL);
313 /* Get UTF-8 character from data. */
314 enum utf8_state
315 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
317 u_int index;
319 if (ud->width > 2)
320 fatalx("invalid UTF-8 width: %u", ud->width);
322 if (ud->size > UTF8_SIZE)
323 goto fail;
324 if (ud->size <= 3) {
325 index = (((utf8_char)ud->data[2] << 16)|
326 ((utf8_char)ud->data[1] << 8)|
327 ((utf8_char)ud->data[0]));
328 } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
329 goto fail;
330 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
331 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
332 (int)ud->size, ud->data, *uc);
333 return (UTF8_DONE);
335 fail:
336 if (ud->width == 0)
337 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
338 else if (ud->width == 1)
339 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
340 else
341 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
342 return (UTF8_ERROR);
345 /* Get UTF-8 data from character. */
346 void
347 utf8_to_data(utf8_char uc, struct utf8_data *ud)
349 struct utf8_item *ui;
350 u_int index;
352 memset(ud, 0, sizeof *ud);
353 ud->size = ud->have = UTF8_GET_SIZE(uc);
354 ud->width = UTF8_GET_WIDTH(uc);
356 if (ud->size <= 3) {
357 ud->data[2] = (uc >> 16);
358 ud->data[1] = ((uc >> 8) & 0xff);
359 ud->data[0] = (uc & 0xff);
360 } else {
361 index = (uc & 0xffffff);
362 if ((ui = utf8_item_by_index(index)) == NULL)
363 memset(ud->data, ' ', ud->size);
364 else
365 memcpy(ud->data, ui->data, ud->size);
368 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
369 (int)ud->size, ud->data);
372 /* Get UTF-8 character from a single ASCII character. */
373 u_int
374 utf8_build_one(u_char ch)
376 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
379 /* Set a single character. */
380 void
381 utf8_set(struct utf8_data *ud, u_char ch)
383 static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
385 memcpy(ud, &empty, sizeof *ud);
386 *ud->data = ch;
389 /* Copy UTF-8 character. */
390 void
391 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
393 u_int i;
395 memcpy(to, from, sizeof *to);
397 for (i = to->size; i < sizeof to->data; i++)
398 to->data[i] = '\0';
401 /* Get width of Unicode character. */
402 static enum utf8_state
403 utf8_width(struct utf8_data *ud, int *width)
405 wchar_t wc;
407 if (utf8_towc(ud, &wc) != UTF8_DONE)
408 return (UTF8_ERROR);
409 if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
410 *width = 2;
411 return (UTF8_DONE);
413 #ifdef HAVE_UTF8PROC
414 *width = utf8proc_wcwidth(wc);
415 log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
416 #else
417 *width = wcwidth(wc);
418 log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
419 if (*width < 0) {
421 * C1 control characters are nonprintable, so they are always
422 * zero width.
424 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
426 #endif
427 if (*width >= 0 && *width <= 0xff)
428 return (UTF8_DONE);
429 return (UTF8_ERROR);
432 /* Convert UTF-8 character to wide character. */
433 enum utf8_state
434 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
436 #ifdef HAVE_UTF8PROC
437 switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
438 #else
439 switch (mbtowc(wc, ud->data, ud->size)) {
440 #endif
441 case -1:
442 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
443 errno);
444 mbtowc(NULL, NULL, MB_CUR_MAX);
445 return (UTF8_ERROR);
446 case 0:
447 return (UTF8_ERROR);
449 log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
450 return (UTF8_DONE);
453 /* Convert wide character to UTF-8 character. */
454 enum utf8_state
455 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
457 int size, width;
459 #ifdef HAVE_UTF8PROC
460 size = utf8proc_wctomb(ud->data, wc);
461 #else
462 size = wctomb(ud->data, wc);
463 #endif
464 if (size < 0) {
465 log_debug("UTF-8 %d, wctomb() %d", wc, errno);
466 wctomb(NULL, 0);
467 return (UTF8_ERROR);
469 if (size == 0)
470 return (UTF8_ERROR);
471 ud->size = ud->have = size;
472 if (utf8_width(ud, &width) == UTF8_DONE) {
473 ud->width = width;
474 return (UTF8_DONE);
476 return (UTF8_ERROR);
480 * Open UTF-8 sequence.
482 * 11000010-11011111 C2-DF start of 2-byte sequence
483 * 11100000-11101111 E0-EF start of 3-byte sequence
484 * 11110000-11110100 F0-F4 start of 4-byte sequence
486 enum utf8_state
487 utf8_open(struct utf8_data *ud, u_char ch)
489 memset(ud, 0, sizeof *ud);
490 if (ch >= 0xc2 && ch <= 0xdf)
491 ud->size = 2;
492 else if (ch >= 0xe0 && ch <= 0xef)
493 ud->size = 3;
494 else if (ch >= 0xf0 && ch <= 0xf4)
495 ud->size = 4;
496 else
497 return (UTF8_ERROR);
498 utf8_append(ud, ch);
499 return (UTF8_MORE);
502 /* Append character to UTF-8, closing if finished. */
503 enum utf8_state
504 utf8_append(struct utf8_data *ud, u_char ch)
506 int width;
508 if (ud->have >= ud->size)
509 fatalx("UTF-8 character overflow");
510 if (ud->size > sizeof ud->data)
511 fatalx("UTF-8 character size too large");
513 if (ud->have != 0 && (ch & 0xc0) != 0x80)
514 ud->width = 0xff;
516 ud->data[ud->have++] = ch;
517 if (ud->have != ud->size)
518 return (UTF8_MORE);
520 if (ud->width == 0xff)
521 return (UTF8_ERROR);
522 if (utf8_width(ud, &width) != UTF8_DONE)
523 return (UTF8_ERROR);
524 ud->width = width;
526 return (UTF8_DONE);
530 * Encode len characters from src into dst, which is guaranteed to have four
531 * bytes available for each character from src (for \abc or UTF-8) plus space
532 * for \0.
535 utf8_strvis(char *dst, const char *src, size_t len, int flag)
537 struct utf8_data ud;
538 const char *start = dst, *end = src + len;
539 enum utf8_state more;
540 size_t i;
542 while (src < end) {
543 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
544 while (++src < end && more == UTF8_MORE)
545 more = utf8_append(&ud, *src);
546 if (more == UTF8_DONE) {
547 /* UTF-8 character finished. */
548 for (i = 0; i < ud.size; i++)
549 *dst++ = ud.data[i];
550 continue;
552 /* Not a complete, valid UTF-8 character. */
553 src -= ud.have;
555 if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
556 if (isalpha((u_char)src[1]) ||
557 src[1] == '_' ||
558 src[1] == '{')
559 *dst++ = '\\';
560 *dst++ = '$';
561 } else if (src < end - 1)
562 dst = vis(dst, src[0], flag, src[1]);
563 else if (src < end)
564 dst = vis(dst, src[0], flag, '\0');
565 src++;
567 *dst = '\0';
568 return (dst - start);
/*
 * Same as utf8_strvis but allocate the buffer. The whole of the \0-terminated
 * src is encoded; *dst receives the result and the encoded length is
 * returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Worst case is four output bytes per input byte, plus the \0. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	/* Trim the buffer down to what was actually used. */
	*dst = xrealloc(buf, len + 1);
	return (len);
}
/*
 * Same as utf8_strvis but allocate the buffer. Exactly srclen bytes of src
 * are encoded; *dst receives the result and the encoded length is returned.
 */
int
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
{
	char	*encoded;
	int	 used;

	/* Worst case is four output bytes per input byte, plus the \0. */
	encoded = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(encoded, src, srclen, flag);

	/* Trim the buffer down to what was actually used. */
	*dst = xrealloc(encoded, used + 1);
	return (used);
}
599 /* Does this string contain anything that isn't valid UTF-8? */
601 utf8_isvalid(const char *s)
603 struct utf8_data ud;
604 const char *end;
605 enum utf8_state more;
607 end = s + strlen(s);
608 while (s < end) {
609 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
610 while (++s < end && more == UTF8_MORE)
611 more = utf8_append(&ud, *s);
612 if (more == UTF8_DONE)
613 continue;
614 return (0);
616 if (*s < 0x20 || *s > 0x7e)
617 return (0);
618 s++;
620 return (1);
624 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
625 * the returned string. Anything not valid printable ASCII or UTF-8 is
626 * stripped.
628 char *
629 utf8_sanitize(const char *src)
631 char *dst = NULL;
632 size_t n = 0;
633 enum utf8_state more;
634 struct utf8_data ud;
635 u_int i;
637 while (*src != '\0') {
638 dst = xreallocarray(dst, n + 1, sizeof *dst);
639 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
640 while (*++src != '\0' && more == UTF8_MORE)
641 more = utf8_append(&ud, *src);
642 if (more == UTF8_DONE) {
643 dst = xreallocarray(dst, n + ud.width,
644 sizeof *dst);
645 for (i = 0; i < ud.width; i++)
646 dst[n++] = '_';
647 continue;
649 src -= ud.have;
651 if (*src > 0x1f && *src < 0x7f)
652 dst[n++] = *src;
653 else
654 dst[n++] = '_';
655 src++;
657 dst = xreallocarray(dst, n + 1, sizeof *dst);
658 dst[n] = '\0';
659 return (dst);
662 /* Get UTF-8 buffer length. */
663 size_t
664 utf8_strlen(const struct utf8_data *s)
666 size_t i;
668 for (i = 0; s[i].size != 0; i++)
669 /* nothing */;
670 return (i);
673 /* Get UTF-8 string width. */
674 u_int
675 utf8_strwidth(const struct utf8_data *s, ssize_t n)
677 ssize_t i;
678 u_int width = 0;
680 for (i = 0; s[i].size != 0; i++) {
681 if (n != -1 && n == i)
682 break;
683 width += s[i].width;
685 return (width);
689 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
690 * Caller frees.
692 struct utf8_data *
693 utf8_fromcstr(const char *src)
695 struct utf8_data *dst = NULL;
696 size_t n = 0;
697 enum utf8_state more;
699 while (*src != '\0') {
700 dst = xreallocarray(dst, n + 1, sizeof *dst);
701 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
702 while (*++src != '\0' && more == UTF8_MORE)
703 more = utf8_append(&dst[n], *src);
704 if (more == UTF8_DONE) {
705 n++;
706 continue;
708 src -= dst[n].have;
710 utf8_set(&dst[n], *src);
711 n++;
712 src++;
714 dst = xreallocarray(dst, n + 1, sizeof *dst);
715 dst[n].size = 0;
716 return (dst);
719 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
720 char *
721 utf8_tocstr(struct utf8_data *src)
723 char *dst = NULL;
724 size_t n = 0;
726 for(; src->size != 0; src++) {
727 dst = xreallocarray(dst, n + src->size, 1);
728 memcpy(dst + n, src->data, src->size);
729 n += src->size;
731 dst = xreallocarray(dst, n + 1, 1);
732 dst[n] = '\0';
733 return (dst);
736 /* Get width of UTF-8 string. */
737 u_int
738 utf8_cstrwidth(const char *s)
740 struct utf8_data tmp;
741 u_int width;
742 enum utf8_state more;
744 width = 0;
745 while (*s != '\0') {
746 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
747 while (*++s != '\0' && more == UTF8_MORE)
748 more = utf8_append(&tmp, *s);
749 if (more == UTF8_DONE) {
750 width += tmp.width;
751 continue;
753 s -= tmp.have;
755 if (*s > 0x1f && *s != 0x7f)
756 width++;
757 s++;
759 return (width);
762 /* Pad UTF-8 string to width on the left. Caller frees. */
763 char *
764 utf8_padcstr(const char *s, u_int width)
766 size_t slen;
767 char *out;
768 u_int n, i;
770 n = utf8_cstrwidth(s);
771 if (n >= width)
772 return (xstrdup(s));
774 slen = strlen(s);
775 out = xmalloc(slen + 1 + (width - n));
776 memcpy(out, s, slen);
777 for (i = n; i < width; i++)
778 out[slen++] = ' ';
779 out[slen] = '\0';
780 return (out);
783 /* Pad UTF-8 string to width on the right. Caller frees. */
784 char *
785 utf8_rpadcstr(const char *s, u_int width)
787 size_t slen;
788 char *out;
789 u_int n, i;
791 n = utf8_cstrwidth(s);
792 if (n >= width)
793 return (xstrdup(s));
795 slen = strlen(s);
796 out = xmalloc(slen + 1 + (width - n));
797 for (i = 0; i < width - n; i++)
798 out[i] = ' ';
799 memcpy(out + i, s, slen);
800 out[i + slen] = '\0';
801 return (out);
805 utf8_cstrhas(const char *s, const struct utf8_data *ud)
807 struct utf8_data *copy, *loop;
808 int found = 0;
810 copy = utf8_fromcstr(s);
811 for (loop = copy; loop->size != 0; loop++) {
812 if (loop->size != ud->size)
813 continue;
814 if (memcmp(loop->data, ud->data, loop->size) == 0) {
815 found = 1;
816 break;
819 free(copy);
821 return (found);