utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wchar.h>
  26
  27 #include "tmux.h"
  28
  29 struct utf8_item {
  30         RB_ENTRY(utf8_item)     index_entry;
  31         u_int                   index;
  32
  33         RB_ENTRY(utf8_item)     data_entry;
  34         char                    data[UTF8_SIZE];
  35         u_char                  size;
  36 };
  37
  38 static int
  39 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  40 {
  41         if (ui1->size < ui2->size)
  42                 return (-1);
  43         if (ui1->size > ui2->size)
  44                 return (1);
  45         return (memcmp(ui1->data, ui2->data, ui1->size));
  46 }
  47 RB_HEAD(utf8_data_tree, utf8_item);
  48 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
  49 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
  50
  51 static int
  52 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  53 {
  54         if (ui1->index < ui2->index)
  55                 return (-1);
  56         if (ui1->index > ui2->index)
  57                 return (1);
  58         return (0);
  59 }
  60 RB_HEAD(utf8_index_tree, utf8_item);
  61 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
  62 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
  63
  64 static u_int utf8_next_index;
  65
  66 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
  67 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
  68
  69 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
  70 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
  71
  72 /* Get a UTF-8 item from data. */
  73 static struct utf8_item *
  74 utf8_item_by_data(const char *data, size_t size)
  75 {
  76         struct utf8_item        ui;
  77
  78         memcpy(ui.data, data, size);
  79         ui.size = size;
  80
  81         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
  82 }
  83
  84 /* Get a UTF-8 item from data. */
  85 static struct utf8_item *
  86 utf8_item_by_index(u_int index)
  87 {
  88         struct utf8_item        ui;
  89
  90         ui.index = index;
  91
  92         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
  93 }
  94
  95 /* Add a UTF-8 item. */
  96 static int
  97 utf8_put_item(const char *data, size_t size, u_int *index)
  98 {
  99         struct utf8_item        *ui;
 100
 101         ui = utf8_item_by_data(data, size);
 102         if (ui != NULL) {
 103                 *index = ui->index;
 104                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 105                     *index);
 106                 return (0);
 107         }
 108
 109         if (utf8_next_index == 0xffffff + 1)
 110                 return (-1);
 111
 112         ui = xcalloc(1, sizeof *ui);
 113         ui->index = utf8_next_index++;
 114         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 115
 116         memcpy(ui->data, data, size);
 117         ui->size = size;
 118         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 119
 120         *index = ui->index;
 121         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 122         return (0);
 123 }
 124
 125 /* Get UTF-8 character from data. */
 126 enum utf8_state
 127 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 128 {
 129         u_int   index;
 130
 131         if (ud->width > 2)
 132                 fatalx("invalid UTF-8 width: %u", ud->width);
 133
 134         if (ud->size > UTF8_SIZE)
 135                 goto fail;
 136         if (ud->size <= 3) {
 137                 index = (((utf8_char)ud->data[2] << 16)|
 138                           ((utf8_char)ud->data[1] << 8)|
 139                           ((utf8_char)ud->data[0]));
 140         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 141                 goto fail;
 142         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 143         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 144             (int)ud->size, ud->data, *uc);
 145         return (UTF8_DONE);
 146
 147 fail:
 148         if (ud->width == 0)
 149                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 150         else if (ud->width == 1)
 151                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 152         else
 153                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 154         return (UTF8_ERROR);
 155 }
 156
 157 /* Get UTF-8 data from character. */
 158 void
 159 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 160 {
 161         struct utf8_item        *ui;
 162         u_int                    index;
 163
 164         memset(ud, 0, sizeof *ud);
 165         ud->size = ud->have = UTF8_GET_SIZE(uc);
 166         ud->width = UTF8_GET_WIDTH(uc);
 167
 168         if (ud->size <= 3) {
 169                 ud->data[2] = (uc >> 16);
 170                 ud->data[1] = ((uc >> 8) & 0xff);
 171                 ud->data[0] = (uc & 0xff);
 172         } else {
 173                 index = (uc & 0xffffff);
 174                 if ((ui = utf8_item_by_index(index)) == NULL)
 175                         memset(ud->data, ' ', ud->size);
 176                 else
 177                         memcpy(ud->data, ui->data, ud->size);
 178         }
 179
 180         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 181             (int)ud->size, ud->data);
 182 }
 183
 184 /* Get UTF-8 character from a single ASCII character. */
 185 u_int
 186 utf8_build_one(u_char ch)
 187 {
 188         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 189 }
 190
 191 /* Set a single character. */
 192 void
 193 utf8_set(struct utf8_data *ud, u_char ch)
 194 {
 195         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 196
 197         memcpy(ud, &empty, sizeof *ud);
 198         *ud->data = ch;
 199 }
 200
 201 /* Copy UTF-8 character. */
 202 void
 203 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 204 {
 205         u_int   i;
 206
 207         memcpy(to, from, sizeof *to);
 208
 209         for (i = to->size; i < sizeof to->data; i++)
 210                 to->data[i] = '\0';
 211 }
 212
 213 /* Get width of Unicode character. */
 214 static enum utf8_state
 215 utf8_width(struct utf8_data *ud, int *width)
 216 {
 217         wchar_t wc;
 218
 219 #ifdef HAVE_UTF8PROC
 220         switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) {
 221 #else
 222         switch (mbtowc(&wc, ud->data, ud->size)) {
 223 #endif
 224         case -1:
 225                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 226                     errno);
 227                 mbtowc(NULL, NULL, MB_CUR_MAX);
 228                 return (UTF8_ERROR);
 229         case 0:
 230                 return (UTF8_ERROR);
 231         }
 232 #ifdef HAVE_UTF8PROC
 233         *width = utf8proc_wcwidth(wc);
 234 #else
 235         *width = wcwidth(wc);
 236 #endif
 237         if (*width >= 0 && *width <= 0xff)
 238                 return (UTF8_DONE);
 239         log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width);
 240         return (UTF8_ERROR);
 241 }
 242
 243 /*
 244  * Open UTF-8 sequence.
 245  *
 246  * 11000010-11011111 C2-DF start of 2-byte sequence
 247  * 11100000-11101111 E0-EF start of 3-byte sequence
 248  * 11110000-11110100 F0-F4 start of 4-byte sequence
 249  */
 250 enum utf8_state
 251 utf8_open(struct utf8_data *ud, u_char ch)
 252 {
 253         memset(ud, 0, sizeof *ud);
 254         if (ch >= 0xc2 && ch <= 0xdf)
 255                 ud->size = 2;
 256         else if (ch >= 0xe0 && ch <= 0xef)
 257                 ud->size = 3;
 258         else if (ch >= 0xf0 && ch <= 0xf4)
 259                 ud->size = 4;
 260         else
 261                 return (UTF8_ERROR);
 262         utf8_append(ud, ch);
 263         return (UTF8_MORE);
 264 }
 265
 266 /* Append character to UTF-8, closing if finished. */
 267 enum utf8_state
 268 utf8_append(struct utf8_data *ud, u_char ch)
 269 {
 270         int     width;
 271
 272         if (ud->have >= ud->size)
 273                 fatalx("UTF-8 character overflow");
 274         if (ud->size > sizeof ud->data)
 275                 fatalx("UTF-8 character size too large");
 276
 277         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 278                 ud->width = 0xff;
 279
 280         ud->data[ud->have++] = ch;
 281         if (ud->have != ud->size)
 282                 return (UTF8_MORE);
 283
 284         if (ud->width == 0xff)
 285                 return (UTF8_ERROR);
 286         if (utf8_width(ud, &width) != UTF8_DONE)
 287                 return (UTF8_ERROR);
 288         ud->width = width;
 289
 290         return (UTF8_DONE);
 291 }
 292
 293 /*
 294  * Encode len characters from src into dst, which is guaranteed to have four
 295  * bytes available for each character from src (for \abc or UTF-8) plus space
 296  * for \0.
 297  */
 298 int
 299 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 300 {
 301         struct utf8_data         ud;
 302         const char              *start = dst, *end = src + len;
 303         enum utf8_state          more;
 304         size_t                   i;
 305
 306         while (src < end) {
 307                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 308                         while (++src < end && more == UTF8_MORE)
 309                                 more = utf8_append(&ud, *src);
 310                         if (more == UTF8_DONE) {
 311                                 /* UTF-8 character finished. */
 312                                 for (i = 0; i < ud.size; i++)
 313                                         *dst++ = ud.data[i];
 314                                 continue;
 315                         }
 316                         /* Not a complete, valid UTF-8 character. */
 317                         src -= ud.have;
 318                 }
 319                 if (src[0] == '$' && src < end - 1) {
 320                         if (isalpha((u_char)src[1]) ||
 321                             src[1] == '_' ||
 322                             src[1] == '{')
 323                                 *dst++ = '\\';
 324                         *dst++ = '$';
 325                 } else if (src < end - 1)
 326                         dst = vis(dst, src[0], flag, src[1]);
 327                 else if (src < end)
 328                         dst = vis(dst, src[0], flag, '\0');
 329                 src++;
 330         }
 331         *dst = '\0';
 332         return (dst - start);
 333 }
 334
 335 /* Same as utf8_strvis but allocate the buffer. */
 336 int
 337 utf8_stravis(char **dst, const char *src, int flag)
 338 {
 339         char    *buf;
 340         int      len;
 341
 342         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 343         len = utf8_strvis(buf, src, strlen(src), flag);
 344
 345         *dst = xrealloc(buf, len + 1);
 346         return (len);
 347 }
 348
 349 /* Same as utf8_strvis but allocate the buffer. */
 350 int
 351 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 352 {
 353         char    *buf;
 354         int      len;
 355
 356         buf = xreallocarray(NULL, 4, srclen + 1);
 357         len = utf8_strvis(buf, src, srclen, flag);
 358
 359         *dst = xrealloc(buf, len + 1);
 360         return (len);
 361 }
 362
 363 /* Does this string contain anything that isn't valid UTF-8? */
 364 int
 365 utf8_isvalid(const char *s)
 366 {
 367         struct utf8_data ud;
 368         const char      *end;
 369         enum utf8_state  more;
 370
 371         end = s + strlen(s);
 372         while (s < end) {
 373                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 374                         while (++s < end && more == UTF8_MORE)
 375                                 more = utf8_append(&ud, *s);
 376                         if (more == UTF8_DONE)
 377                                 continue;
 378                         return (0);
 379                 }
 380                 if (*s < 0x20 || *s > 0x7e)
 381                         return (0);
 382                 s++;
 383         }
 384         return (1);
 385 }
 386
 387 /*
 388  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 389  * the returned string. Anything not valid printable ASCII or UTF-8 is
 390  * stripped.
 391  */
 392 char *
 393 utf8_sanitize(const char *src)
 394 {
 395         char            *dst = NULL;
 396         size_t           n = 0;
 397         enum utf8_state  more;
 398         struct utf8_data ud;
 399         u_int            i;
 400
 401         while (*src != '\0') {
 402                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 403                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 404                         while (*++src != '\0' && more == UTF8_MORE)
 405                                 more = utf8_append(&ud, *src);
 406                         if (more == UTF8_DONE) {
 407                                 dst = xreallocarray(dst, n + ud.width,
 408                                     sizeof *dst);
 409                                 for (i = 0; i < ud.width; i++)
 410                                         dst[n++] = '_';
 411                                 continue;
 412                         }
 413                         src -= ud.have;
 414                 }
 415                 if (*src > 0x1f && *src < 0x7f)
 416                         dst[n++] = *src;
 417                 else
 418                         dst[n++] = '_';
 419                 src++;
 420         }
 421         dst = xreallocarray(dst, n + 1, sizeof *dst);
 422         dst[n] = '\0';
 423         return (dst);
 424 }
 425
 426 /* Get UTF-8 buffer length. */
 427 size_t
 428 utf8_strlen(const struct utf8_data *s)
 429 {
 430         size_t  i;
 431
 432         for (i = 0; s[i].size != 0; i++)
 433                 /* nothing */;
 434         return (i);
 435 }
 436
 437 /* Get UTF-8 string width. */
 438 u_int
 439 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 440 {
 441         ssize_t i;
 442         u_int   width = 0;
 443
 444         for (i = 0; s[i].size != 0; i++) {
 445                 if (n != -1 && n == i)
 446                         break;
 447                 width += s[i].width;
 448         }
 449         return (width);
 450 }
 451
 452 /*
 453  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 454  * Caller frees.
 455  */
 456 struct utf8_data *
 457 utf8_fromcstr(const char *src)
 458 {
 459         struct utf8_data        *dst = NULL;
 460         size_t                   n = 0;
 461         enum utf8_state          more;
 462
 463         while (*src != '\0') {
 464                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 465                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 466                         while (*++src != '\0' && more == UTF8_MORE)
 467                                 more = utf8_append(&dst[n], *src);
 468                         if (more == UTF8_DONE) {
 469                                 n++;
 470                                 continue;
 471                         }
 472                         src -= dst[n].have;
 473                 }
 474                 utf8_set(&dst[n], *src);
 475                 n++;
 476                 src++;
 477         }
 478         dst = xreallocarray(dst, n + 1, sizeof *dst);
 479         dst[n].size = 0;
 480         return (dst);
 481 }
 482
 483 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 484 char *
 485 utf8_tocstr(struct utf8_data *src)
 486 {
 487         char    *dst = NULL;
 488         size_t   n = 0;
 489
 490         for(; src->size != 0; src++) {
 491                 dst = xreallocarray(dst, n + src->size, 1);
 492                 memcpy(dst + n, src->data, src->size);
 493                 n += src->size;
 494         }
 495         dst = xreallocarray(dst, n + 1, 1);
 496         dst[n] = '\0';
 497         return (dst);
 498 }
 499
 500 /* Get width of UTF-8 string. */
 501 u_int
 502 utf8_cstrwidth(const char *s)
 503 {
 504         struct utf8_data        tmp;
 505         u_int                   width;
 506         enum utf8_state         more;
 507
 508         width = 0;
 509         while (*s != '\0') {
 510                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 511                         while (*++s != '\0' && more == UTF8_MORE)
 512                                 more = utf8_append(&tmp, *s);
 513                         if (more == UTF8_DONE) {
 514                                 width += tmp.width;
 515                                 continue;
 516                         }
 517                         s -= tmp.have;
 518                 }
 519                 if (*s > 0x1f && *s != 0x7f)
 520                         width++;
 521                 s++;
 522         }
 523         return (width);
 524 }
 525
 526 /* Pad UTF-8 string to width on the left. Caller frees. */
 527 char *
 528 utf8_padcstr(const char *s, u_int width)
 529 {
 530         size_t   slen;
 531         char    *out;
 532         u_int    n, i;
 533
 534         n = utf8_cstrwidth(s);
 535         if (n >= width)
 536                 return (xstrdup(s));
 537
 538         slen = strlen(s);
 539         out = xmalloc(slen + 1 + (width - n));
 540         memcpy(out, s, slen);
 541         for (i = n; i < width; i++)
 542                 out[slen++] = ' ';
 543         out[slen] = '\0';
 544         return (out);
 545 }
 546
 547 /* Pad UTF-8 string to width on the right. Caller frees. */
 548 char *
 549 utf8_rpadcstr(const char *s, u_int width)
 550 {
 551         size_t   slen;
 552         char    *out;
 553         u_int    n, i;
 554
 555         n = utf8_cstrwidth(s);
 556         if (n >= width)
 557                 return (xstrdup(s));
 558
 559         slen = strlen(s);
 560         out = xmalloc(slen + 1 + (width - n));
 561         for (i = 0; i < width - n; i++)
 562                 out[i] = ' ';
 563         memcpy(out + i, s, slen);
 564         out[i + slen] = '\0';
 565         return (out);
 566 }
 567
 568 int
 569 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 570 {
 571         struct utf8_data        *copy, *loop;
 572         int                      found = 0;
 573
 574         copy = utf8_fromcstr(s);
 575         for (loop = copy; loop->size != 0; loop++) {
 576                 if (loop->size != ud->size)
 577                         continue;
 578                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 579                         found = 1;
 580                         break;
 581                 }
 582         }
 583         free(copy);
 584
 585         return (found);
 586 }