utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <vis.h>
  26 #include <wchar.h>
  27
  28 #include "tmux.h"
  29
  30 struct utf8_item {
  31         RB_ENTRY(utf8_item)     index_entry;
  32         u_int                   index;
  33
  34         RB_ENTRY(utf8_item)     data_entry;
  35         char                    data[UTF8_SIZE];
  36         u_char                  size;
  37 };
  38
  39 static int
  40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  41 {
  42         if (ui1->size < ui2->size)
  43                 return (-1);
  44         if (ui1->size > ui2->size)
  45                 return (1);
  46         return (memcmp(ui1->data, ui2->data, ui1->size));
  47 }
  48 RB_HEAD(utf8_data_tree, utf8_item);
  49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
  50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
  51
  52 static int
  53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  54 {
  55         if (ui1->index < ui2->index)
  56                 return (-1);
  57         if (ui1->index > ui2->index)
  58                 return (1);
  59         return (0);
  60 }
  61 RB_HEAD(utf8_index_tree, utf8_item);
  62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
  63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
  64
  65 static u_int utf8_next_index;
  66
  67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
  68 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
  69
  70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
  71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
  72
  73 /* Get a UTF-8 item from data. */
  74 static struct utf8_item *
  75 utf8_item_by_data(const char *data, size_t size)
  76 {
  77         struct utf8_item        ui;
  78
  79         memcpy(ui.data, data, size);
  80         ui.size = size;
  81
  82         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
  83 }
  84
  85 /* Get a UTF-8 item from data. */
  86 static struct utf8_item *
  87 utf8_item_by_index(u_int index)
  88 {
  89         struct utf8_item        ui;
  90
  91         ui.index = index;
  92
  93         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
  94 }
  95
  96 /* Add a UTF-8 item. */
  97 static int
  98 utf8_put_item(const char *data, size_t size, u_int *index)
  99 {
 100         struct utf8_item        *ui;
 101
 102         ui = utf8_item_by_data(data, size);
 103         if (ui != NULL) {
 104                 *index = ui->index;
 105                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 106                     *index);
 107                 return (0);
 108         }
 109
 110         if (utf8_next_index == 0xffffff + 1)
 111                 return (-1);
 112
 113         ui = xcalloc(1, sizeof *ui);
 114         ui->index = utf8_next_index++;
 115         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 116
 117         memcpy(ui->data, data, size);
 118         ui->size = size;
 119         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 120
 121         *index = ui->index;
 122         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 123         return (0);
 124 }
 125
 126 /* Get UTF-8 character from data. */
 127 enum utf8_state
 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 129 {
 130         u_int   index;
 131
 132         if (ud->width > 2)
 133                 fatalx("invalid UTF-8 width: %u", ud->width);
 134
 135         if (ud->size > UTF8_SIZE)
 136                 goto fail;
 137         if (ud->size <= 3) {
 138                 index = (((utf8_char)ud->data[2] << 16)|
 139                           ((utf8_char)ud->data[1] << 8)|
 140                           ((utf8_char)ud->data[0]));
 141         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 142                 goto fail;
 143         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 144         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 145             (int)ud->size, ud->data, *uc);
 146         return (UTF8_DONE);
 147
 148 fail:
 149         if (ud->width == 0)
 150                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 151         else if (ud->width == 1)
 152                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 153         else
 154                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 155         return (UTF8_ERROR);
 156 }
 157
 158 /* Get UTF-8 data from character. */
 159 void
 160 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 161 {
 162         struct utf8_item        *ui;
 163         u_int                    index;
 164
 165         memset(ud, 0, sizeof *ud);
 166         ud->size = ud->have = UTF8_GET_SIZE(uc);
 167         ud->width = UTF8_GET_WIDTH(uc);
 168
 169         if (ud->size <= 3) {
 170                 ud->data[2] = (uc >> 16);
 171                 ud->data[1] = ((uc >> 8) & 0xff);
 172                 ud->data[0] = (uc & 0xff);
 173         } else {
 174                 index = (uc & 0xffffff);
 175                 if ((ui = utf8_item_by_index(index)) == NULL)
 176                         memset(ud->data, ' ', ud->size);
 177                 else
 178                         memcpy(ud->data, ui->data, ud->size);
 179         }
 180
 181         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 182             (int)ud->size, ud->data);
 183 }
 184
 185 /* Get UTF-8 character from a single ASCII character. */
 186 u_int
 187 utf8_build_one(u_char ch)
 188 {
 189         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 190 }
 191
 192 /* Set a single character. */
 193 void
 194 utf8_set(struct utf8_data *ud, u_char ch)
 195 {
 196         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 197
 198         memcpy(ud, &empty, sizeof *ud);
 199         *ud->data = ch;
 200 }
 201
 202 /* Copy UTF-8 character. */
 203 void
 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 205 {
 206         u_int   i;
 207
 208         memcpy(to, from, sizeof *to);
 209
 210         for (i = to->size; i < sizeof to->data; i++)
 211                 to->data[i] = '\0';
 212 }
 213
 214 /* Get width of Unicode character. */
 215 static enum utf8_state
 216 utf8_width(struct utf8_data *ud, int *width)
 217 {
 218         wchar_t wc;
 219
 220         switch (mbtowc(&wc, ud->data, ud->size)) {
 221         case -1:
 222                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 223                     errno);
 224                 mbtowc(NULL, NULL, MB_CUR_MAX);
 225                 return (UTF8_ERROR);
 226         case 0:
 227                 return (UTF8_ERROR);
 228         }
 229         log_debug("UTF-8 %.*s is %08X", (int)ud->size, ud->data, (u_int)wc);
 230         *width = wcwidth(wc);
 231         log_debug("wcwidth(%08X) returned %d", (u_int)wc, *width);
 232         if (*width < 0) {
 233                 /*
 234                  * C1 control characters are nonprintable, so they are always
 235                  * zero width.
 236                  */
 237                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 238         }
 239         if (*width >= 0 && *width <= 0xff)
 240                 return (UTF8_DONE);
 241         return (UTF8_ERROR);
 242 }
 243
 244 /*
 245  * Open UTF-8 sequence.
 246  *
 247  * 11000010-11011111 C2-DF start of 2-byte sequence
 248  * 11100000-11101111 E0-EF start of 3-byte sequence
 249  * 11110000-11110100 F0-F4 start of 4-byte sequence
 250  */
 251 enum utf8_state
 252 utf8_open(struct utf8_data *ud, u_char ch)
 253 {
 254         memset(ud, 0, sizeof *ud);
 255         if (ch >= 0xc2 && ch <= 0xdf)
 256                 ud->size = 2;
 257         else if (ch >= 0xe0 && ch <= 0xef)
 258                 ud->size = 3;
 259         else if (ch >= 0xf0 && ch <= 0xf4)
 260                 ud->size = 4;
 261         else
 262                 return (UTF8_ERROR);
 263         utf8_append(ud, ch);
 264         return (UTF8_MORE);
 265 }
 266
 267 /* Append character to UTF-8, closing if finished. */
 268 enum utf8_state
 269 utf8_append(struct utf8_data *ud, u_char ch)
 270 {
 271         int     width;
 272
 273         if (ud->have >= ud->size)
 274                 fatalx("UTF-8 character overflow");
 275         if (ud->size > sizeof ud->data)
 276                 fatalx("UTF-8 character size too large");
 277
 278         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 279                 ud->width = 0xff;
 280
 281         ud->data[ud->have++] = ch;
 282         if (ud->have != ud->size)
 283                 return (UTF8_MORE);
 284
 285         if (ud->width == 0xff)
 286                 return (UTF8_ERROR);
 287         if (utf8_width(ud, &width) != UTF8_DONE)
 288                 return (UTF8_ERROR);
 289         ud->width = width;
 290
 291         return (UTF8_DONE);
 292 }
 293
 294 /*
 295  * Encode len characters from src into dst, which is guaranteed to have four
 296  * bytes available for each character from src (for \abc or UTF-8) plus space
 297  * for \0.
 298  */
 299 int
 300 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 301 {
 302         struct utf8_data         ud;
 303         const char              *start = dst, *end = src + len;
 304         enum utf8_state          more;
 305         size_t                   i;
 306
 307         while (src < end) {
 308                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 309                         while (++src < end && more == UTF8_MORE)
 310                                 more = utf8_append(&ud, *src);
 311                         if (more == UTF8_DONE) {
 312                                 /* UTF-8 character finished. */
 313                                 for (i = 0; i < ud.size; i++)
 314                                         *dst++ = ud.data[i];
 315                                 continue;
 316                         }
 317                         /* Not a complete, valid UTF-8 character. */
 318                         src -= ud.have;
 319                 }
 320                 if (src[0] == '$' && src < end - 1) {
 321                         if (isalpha((u_char)src[1]) ||
 322                             src[1] == '_' ||
 323                             src[1] == '{')
 324                                 *dst++ = '\\';
 325                         *dst++ = '$';
 326                 } else if (src < end - 1)
 327                         dst = vis(dst, src[0], flag, src[1]);
 328                 else if (src < end)
 329                         dst = vis(dst, src[0], flag, '\0');
 330                 src++;
 331         }
 332         *dst = '\0';
 333         return (dst - start);
 334 }
 335
 336 /* Same as utf8_strvis but allocate the buffer. */
 337 int
 338 utf8_stravis(char **dst, const char *src, int flag)
 339 {
 340         char    *buf;
 341         int      len;
 342
 343         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 344         len = utf8_strvis(buf, src, strlen(src), flag);
 345
 346         *dst = xrealloc(buf, len + 1);
 347         return (len);
 348 }
 349
 350 /* Same as utf8_strvis but allocate the buffer. */
 351 int
 352 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 353 {
 354         char    *buf;
 355         int      len;
 356
 357         buf = xreallocarray(NULL, 4, srclen + 1);
 358         len = utf8_strvis(buf, src, srclen, flag);
 359
 360         *dst = xrealloc(buf, len + 1);
 361         return (len);
 362 }
 363
 364 /* Does this string contain anything that isn't valid UTF-8? */
 365 int
 366 utf8_isvalid(const char *s)
 367 {
 368         struct utf8_data ud;
 369         const char      *end;
 370         enum utf8_state  more;
 371
 372         end = s + strlen(s);
 373         while (s < end) {
 374                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 375                         while (++s < end && more == UTF8_MORE)
 376                                 more = utf8_append(&ud, *s);
 377                         if (more == UTF8_DONE)
 378                                 continue;
 379                         return (0);
 380                 }
 381                 if (*s < 0x20 || *s > 0x7e)
 382                         return (0);
 383                 s++;
 384         }
 385         return (1);
 386 }
 387
 388 /*
 389  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 390  * the returned string. Anything not valid printable ASCII or UTF-8 is
 391  * stripped.
 392  */
 393 char *
 394 utf8_sanitize(const char *src)
 395 {
 396         char            *dst = NULL;
 397         size_t           n = 0;
 398         enum utf8_state  more;
 399         struct utf8_data ud;
 400         u_int            i;
 401
 402         while (*src != '\0') {
 403                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 404                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 405                         while (*++src != '\0' && more == UTF8_MORE)
 406                                 more = utf8_append(&ud, *src);
 407                         if (more == UTF8_DONE) {
 408                                 dst = xreallocarray(dst, n + ud.width,
 409                                     sizeof *dst);
 410                                 for (i = 0; i < ud.width; i++)
 411                                         dst[n++] = '_';
 412                                 continue;
 413                         }
 414                         src -= ud.have;
 415                 }
 416                 if (*src > 0x1f && *src < 0x7f)
 417                         dst[n++] = *src;
 418                 else
 419                         dst[n++] = '_';
 420                 src++;
 421         }
 422         dst = xreallocarray(dst, n + 1, sizeof *dst);
 423         dst[n] = '\0';
 424         return (dst);
 425 }
 426
 427 /* Get UTF-8 buffer length. */
 428 size_t
 429 utf8_strlen(const struct utf8_data *s)
 430 {
 431         size_t  i;
 432
 433         for (i = 0; s[i].size != 0; i++)
 434                 /* nothing */;
 435         return (i);
 436 }
 437
 438 /* Get UTF-8 string width. */
 439 u_int
 440 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 441 {
 442         ssize_t i;
 443         u_int   width = 0;
 444
 445         for (i = 0; s[i].size != 0; i++) {
 446                 if (n != -1 && n == i)
 447                         break;
 448                 width += s[i].width;
 449         }
 450         return (width);
 451 }
 452
 453 /*
 454  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 455  * Caller frees.
 456  */
 457 struct utf8_data *
 458 utf8_fromcstr(const char *src)
 459 {
 460         struct utf8_data        *dst = NULL;
 461         size_t                   n = 0;
 462         enum utf8_state          more;
 463
 464         while (*src != '\0') {
 465                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 466                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 467                         while (*++src != '\0' && more == UTF8_MORE)
 468                                 more = utf8_append(&dst[n], *src);
 469                         if (more == UTF8_DONE) {
 470                                 n++;
 471                                 continue;
 472                         }
 473                         src -= dst[n].have;
 474                 }
 475                 utf8_set(&dst[n], *src);
 476                 n++;
 477                 src++;
 478         }
 479         dst = xreallocarray(dst, n + 1, sizeof *dst);
 480         dst[n].size = 0;
 481         return (dst);
 482 }
 483
 484 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 485 char *
 486 utf8_tocstr(struct utf8_data *src)
 487 {
 488         char    *dst = NULL;
 489         size_t   n = 0;
 490
 491         for(; src->size != 0; src++) {
 492                 dst = xreallocarray(dst, n + src->size, 1);
 493                 memcpy(dst + n, src->data, src->size);
 494                 n += src->size;
 495         }
 496         dst = xreallocarray(dst, n + 1, 1);
 497         dst[n] = '\0';
 498         return (dst);
 499 }
 500
 501 /* Get width of UTF-8 string. */
 502 u_int
 503 utf8_cstrwidth(const char *s)
 504 {
 505         struct utf8_data        tmp;
 506         u_int                   width;
 507         enum utf8_state         more;
 508
 509         width = 0;
 510         while (*s != '\0') {
 511                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 512                         while (*++s != '\0' && more == UTF8_MORE)
 513                                 more = utf8_append(&tmp, *s);
 514                         if (more == UTF8_DONE) {
 515                                 width += tmp.width;
 516                                 continue;
 517                         }
 518                         s -= tmp.have;
 519                 }
 520                 if (*s > 0x1f && *s != 0x7f)
 521                         width++;
 522                 s++;
 523         }
 524         return (width);
 525 }
 526
 527 /* Pad UTF-8 string to width on the left. Caller frees. */
 528 char *
 529 utf8_padcstr(const char *s, u_int width)
 530 {
 531         size_t   slen;
 532         char    *out;
 533         u_int    n, i;
 534
 535         n = utf8_cstrwidth(s);
 536         if (n >= width)
 537                 return (xstrdup(s));
 538
 539         slen = strlen(s);
 540         out = xmalloc(slen + 1 + (width - n));
 541         memcpy(out, s, slen);
 542         for (i = n; i < width; i++)
 543                 out[slen++] = ' ';
 544         out[slen] = '\0';
 545         return (out);
 546 }
 547
 548 /* Pad UTF-8 string to width on the right. Caller frees. */
 549 char *
 550 utf8_rpadcstr(const char *s, u_int width)
 551 {
 552         size_t   slen;
 553         char    *out;
 554         u_int    n, i;
 555
 556         n = utf8_cstrwidth(s);
 557         if (n >= width)
 558                 return (xstrdup(s));
 559
 560         slen = strlen(s);
 561         out = xmalloc(slen + 1 + (width - n));
 562         for (i = 0; i < width - n; i++)
 563                 out[i] = ' ';
 564         memcpy(out + i, s, slen);
 565         out[i + slen] = '\0';
 566         return (out);
 567 }
 568
 569 int
 570 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 571 {
 572         struct utf8_data        *copy, *loop;
 573         int                      found = 0;
 574
 575         copy = utf8_fromcstr(s);
 576         for (loop = copy; loop->size != 0; loop++) {
 577                 if (loop->size != ud->size)
 578                         continue;
 579                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 580                         found = 1;
 581                         break;
 582                 }
 583         }
 584         free(copy);
 585
 586         return (found);
 587 }