utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <vis.h>
  26 #include <wchar.h>
  27
  28 #include "tmux.h"
  29
  30 struct utf8_item {
  31         RB_ENTRY(utf8_item)     index_entry;
  32         u_int                   index;
  33
  34         RB_ENTRY(utf8_item)     data_entry;
  35         char                    data[UTF8_SIZE];
  36         u_char                  size;
  37 };
  38
  39 static int
  40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  41 {
  42         if (ui1->size < ui2->size)
  43                 return (-1);
  44         if (ui1->size > ui2->size)
  45                 return (1);
  46         return (memcmp(ui1->data, ui2->data, ui1->size));
  47 }
  48 RB_HEAD(utf8_data_tree, utf8_item);
  49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
  50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
  51
  52 static int
  53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  54 {
  55         if (ui1->index < ui2->index)
  56                 return (-1);
  57         if (ui1->index > ui2->index)
  58                 return (1);
  59         return (0);
  60 }
  61 RB_HEAD(utf8_index_tree, utf8_item);
  62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
  63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
  64
  65 static u_int utf8_next_index;
  66
  /* Packing of a utf8_char: low 24 bits payload, then size, then width + 1. */
#define UTF8_SIZE_SHIFT 24
#define UTF8_SIZE_MASK 0x1f
#define UTF8_WIDTH_SHIFT 29

/* Extract the size (five bits from bit 24) and the width (stored plus one). */
#define UTF8_GET_SIZE(uc) (((uc) >> UTF8_SIZE_SHIFT) & UTF8_SIZE_MASK)
#define UTF8_GET_WIDTH(uc) (((uc) >> UTF8_WIDTH_SHIFT) - 1)

/* Build the size and width fields, ready to be ORed with the payload. */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << UTF8_SIZE_SHIFT)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << UTF8_WIDTH_SHIFT)
  72
  73 /* Get a UTF-8 item from data. */
  74 static struct utf8_item *
  75 utf8_item_by_data(const char *data, size_t size)
  76 {
  77         struct utf8_item        ui;
  78
  79         memcpy(ui.data, data, size);
  80         ui.size = size;
  81
  82         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
  83 }
  84
  85 /* Get a UTF-8 item from data. */
  86 static struct utf8_item *
  87 utf8_item_by_index(u_int index)
  88 {
  89         struct utf8_item        ui;
  90
  91         ui.index = index;
  92
  93         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
  94 }
  95
  96 /* Add a UTF-8 item. */
  97 static int
  98 utf8_put_item(const char *data, size_t size, u_int *index)
  99 {
 100         struct utf8_item        *ui;
 101
 102         ui = utf8_item_by_data(data, size);
 103         if (ui != NULL) {
 104                 *index = ui->index;
 105                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 106                     *index);
 107                 return (0);
 108         }
 109
 110         if (utf8_next_index == 0xffffff + 1)
 111                 return (-1);
 112
 113         ui = xcalloc(1, sizeof *ui);
 114         ui->index = utf8_next_index++;
 115         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 116
 117         memcpy(ui->data, data, size);
 118         ui->size = size;
 119         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 120
 121         *index = ui->index;
 122         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 123         return (0);
 124 }
 125
 126 /* Get UTF-8 character from data. */
 127 enum utf8_state
 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 129 {
 130         u_int   index;
 131
 132         if (ud->width > 2)
 133                 fatalx("invalid UTF-8 width: %u", ud->width);
 134
 135         if (ud->size > UTF8_SIZE)
 136                 goto fail;
 137         if (ud->size <= 3) {
 138                 index = (((utf8_char)ud->data[2] << 16)|
 139                           ((utf8_char)ud->data[1] << 8)|
 140                           ((utf8_char)ud->data[0]));
 141         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 142                 goto fail;
 143         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 144         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 145             (int)ud->size, ud->data, *uc);
 146         return (UTF8_DONE);
 147
 148 fail:
 149         if (ud->width == 0)
 150                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 151         else if (ud->width == 1)
 152                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 153         else
 154                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 155         return (UTF8_ERROR);
 156 }
 157
 158 /* Get UTF-8 data from character. */
 159 void
 160 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 161 {
 162         struct utf8_item        *ui;
 163         u_int                    index;
 164
 165         memset(ud, 0, sizeof *ud);
 166         ud->size = ud->have = UTF8_GET_SIZE(uc);
 167         ud->width = UTF8_GET_WIDTH(uc);
 168
 169         if (ud->size <= 3) {
 170                 ud->data[2] = (uc >> 16);
 171                 ud->data[1] = ((uc >> 8) & 0xff);
 172                 ud->data[0] = (uc & 0xff);
 173         } else {
 174                 index = (uc & 0xffffff);
 175                 if ((ui = utf8_item_by_index(index)) == NULL)
 176                         memset(ud->data, ' ', ud->size);
 177                 else
 178                         memcpy(ud->data, ui->data, ud->size);
 179         }
 180
 181         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 182             (int)ud->size, ud->data);
 183 }
 184
 185 /* Get UTF-8 character from a single ASCII character. */
 186 u_int
 187 utf8_build_one(u_char ch)
 188 {
 189         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 190 }
 191
 192 /* Set a single character. */
 193 void
 194 utf8_set(struct utf8_data *ud, u_char ch)
 195 {
 196         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 197
 198         memcpy(ud, &empty, sizeof *ud);
 199         *ud->data = ch;
 200 }
 201
 202 /* Copy UTF-8 character. */
 203 void
 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 205 {
 206         u_int   i;
 207
 208         memcpy(to, from, sizeof *to);
 209
 210         for (i = to->size; i < sizeof to->data; i++)
 211                 to->data[i] = '\0';
 212 }
 213
 214 /* Get width of Unicode character. */
 215 static enum utf8_state
 216 utf8_width(struct utf8_data *ud, int *width)
 217 {
 218         wchar_t wc;
 219
 220         switch (mbtowc(&wc, ud->data, ud->size)) {
 221         case -1:
 222                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 223                     errno);
 224                 mbtowc(NULL, NULL, MB_CUR_MAX);
 225                 return (UTF8_ERROR);
 226         case 0:
 227                 return (UTF8_ERROR);
 228         }
 229         *width = wcwidth(wc);
 230         log_debug("UTF-8 %.*s %#x, wcwidth() %d", (int)ud->size, ud->data,
 231             (u_int)wc, *width);
 232         if (*width >= 0 && *width <= 0xff)
 233                 return (UTF8_DONE);
 234         return (UTF8_ERROR);
 235 }
 236
 237 /*
 238  * Open UTF-8 sequence.
 239  *
 240  * 11000010-11011111 C2-DF start of 2-byte sequence
 241  * 11100000-11101111 E0-EF start of 3-byte sequence
 242  * 11110000-11110100 F0-F4 start of 4-byte sequence
 243  */
 244 enum utf8_state
 245 utf8_open(struct utf8_data *ud, u_char ch)
 246 {
 247         memset(ud, 0, sizeof *ud);
 248         if (ch >= 0xc2 && ch <= 0xdf)
 249                 ud->size = 2;
 250         else if (ch >= 0xe0 && ch <= 0xef)
 251                 ud->size = 3;
 252         else if (ch >= 0xf0 && ch <= 0xf4)
 253                 ud->size = 4;
 254         else
 255                 return (UTF8_ERROR);
 256         utf8_append(ud, ch);
 257         return (UTF8_MORE);
 258 }
 259
 260 /* Append character to UTF-8, closing if finished. */
 261 enum utf8_state
 262 utf8_append(struct utf8_data *ud, u_char ch)
 263 {
 264         int     width;
 265
 266         if (ud->have >= ud->size)
 267                 fatalx("UTF-8 character overflow");
 268         if (ud->size > sizeof ud->data)
 269                 fatalx("UTF-8 character size too large");
 270
 271         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 272                 ud->width = 0xff;
 273
 274         ud->data[ud->have++] = ch;
 275         if (ud->have != ud->size)
 276                 return (UTF8_MORE);
 277
 278         if (ud->width == 0xff)
 279                 return (UTF8_ERROR);
 280         if (utf8_width(ud, &width) != UTF8_DONE)
 281                 return (UTF8_ERROR);
 282         ud->width = width;
 283
 284         return (UTF8_DONE);
 285 }
 286
 287 /*
 288  * Encode len characters from src into dst, which is guaranteed to have four
 289  * bytes available for each character from src (for \abc or UTF-8) plus space
 290  * for \0.
 291  */
 292 int
 293 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 294 {
 295         struct utf8_data         ud;
 296         const char              *start = dst, *end = src + len;
 297         enum utf8_state          more;
 298         size_t                   i;
 299
 300         while (src < end) {
 301                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 302                         while (++src < end && more == UTF8_MORE)
 303                                 more = utf8_append(&ud, *src);
 304                         if (more == UTF8_DONE) {
 305                                 /* UTF-8 character finished. */
 306                                 for (i = 0; i < ud.size; i++)
 307                                         *dst++ = ud.data[i];
 308                                 continue;
 309                         }
 310                         /* Not a complete, valid UTF-8 character. */
 311                         src -= ud.have;
 312                 }
 313                 if (src[0] == '$' && src < end - 1) {
 314                         if (isalpha((u_char)src[1]) ||
 315                             src[1] == '_' ||
 316                             src[1] == '{')
 317                                 *dst++ = '\\';
 318                         *dst++ = '$';
 319                 } else if (src < end - 1)
 320                         dst = vis(dst, src[0], flag, src[1]);
 321                 else if (src < end)
 322                         dst = vis(dst, src[0], flag, '\0');
 323                 src++;
 324         }
 325         *dst = '\0';
 326         return (dst - start);
 327 }
 328
 /* Same as utf8_strvis for a C string, but allocate the buffer into *dst. */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 used;

	/* Worst case is four output bytes for every input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	/* Trim the allocation down to the encoded string and its \0. */
	*dst = xrealloc(out, used + 1);
	return (used);
}
 342
 /* Same as utf8_strvis for srclen bytes, but allocate the buffer into *dst. */
int
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
{
	char	*out;
	int	 used;

	/* Worst case is four output bytes for every input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	/* Trim the allocation down to the encoded string and its \0. */
	*dst = xrealloc(out, used + 1);
	return (used);
}
 356
 357 /* Does this string contain anything that isn't valid UTF-8? */
 358 int
 359 utf8_isvalid(const char *s)
 360 {
 361         struct utf8_data ud;
 362         const char      *end;
 363         enum utf8_state  more;
 364
 365         end = s + strlen(s);
 366         while (s < end) {
 367                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 368                         while (++s < end && more == UTF8_MORE)
 369                                 more = utf8_append(&ud, *s);
 370                         if (more == UTF8_DONE)
 371                                 continue;
 372                         return (0);
 373                 }
 374                 if (*s < 0x20 || *s > 0x7e)
 375                         return (0);
 376                 s++;
 377         }
 378         return (1);
 379 }
 380
 381 /*
 382  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 383  * the returned string. Anything not valid printable ASCII or UTF-8 is
 384  * stripped.
 385  */
 386 char *
 387 utf8_sanitize(const char *src)
 388 {
 389         char            *dst = NULL;
 390         size_t           n = 0;
 391         enum utf8_state  more;
 392         struct utf8_data ud;
 393         u_int            i;
 394
 395         while (*src != '\0') {
 396                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 397                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 398                         while (*++src != '\0' && more == UTF8_MORE)
 399                                 more = utf8_append(&ud, *src);
 400                         if (more == UTF8_DONE) {
 401                                 dst = xreallocarray(dst, n + ud.width,
 402                                     sizeof *dst);
 403                                 for (i = 0; i < ud.width; i++)
 404                                         dst[n++] = '_';
 405                                 continue;
 406                         }
 407                         src -= ud.have;
 408                 }
 409                 if (*src > 0x1f && *src < 0x7f)
 410                         dst[n++] = *src;
 411                 else
 412                         dst[n++] = '_';
 413                 src++;
 414         }
 415         dst = xreallocarray(dst, n + 1, sizeof *dst);
 416         dst[n] = '\0';
 417         return (dst);
 418 }
 419
 420 /* Get UTF-8 buffer length. */
 421 size_t
 422 utf8_strlen(const struct utf8_data *s)
 423 {
 424         size_t  i;
 425
 426         for (i = 0; s[i].size != 0; i++)
 427                 /* nothing */;
 428         return (i);
 429 }
 430
 431 /* Get UTF-8 string width. */
 432 u_int
 433 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 434 {
 435         ssize_t i;
 436         u_int   width = 0;
 437
 438         for (i = 0; s[i].size != 0; i++) {
 439                 if (n != -1 && n == i)
 440                         break;
 441                 width += s[i].width;
 442         }
 443         return (width);
 444 }
 445
 446 /*
 447  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 448  * Caller frees.
 449  */
 450 struct utf8_data *
 451 utf8_fromcstr(const char *src)
 452 {
 453         struct utf8_data        *dst = NULL;
 454         size_t                   n = 0;
 455         enum utf8_state          more;
 456
 457         while (*src != '\0') {
 458                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 459                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 460                         while (*++src != '\0' && more == UTF8_MORE)
 461                                 more = utf8_append(&dst[n], *src);
 462                         if (more == UTF8_DONE) {
 463                                 n++;
 464                                 continue;
 465                         }
 466                         src -= dst[n].have;
 467                 }
 468                 utf8_set(&dst[n], *src);
 469                 n++;
 470                 src++;
 471         }
 472         dst = xreallocarray(dst, n + 1, sizeof *dst);
 473         dst[n].size = 0;
 474         return (dst);
 475 }
 476
 477 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 478 char *
 479 utf8_tocstr(struct utf8_data *src)
 480 {
 481         char    *dst = NULL;
 482         size_t   n = 0;
 483
 484         for(; src->size != 0; src++) {
 485                 dst = xreallocarray(dst, n + src->size, 1);
 486                 memcpy(dst + n, src->data, src->size);
 487                 n += src->size;
 488         }
 489         dst = xreallocarray(dst, n + 1, 1);
 490         dst[n] = '\0';
 491         return (dst);
 492 }
 493
 494 /* Get width of UTF-8 string. */
 495 u_int
 496 utf8_cstrwidth(const char *s)
 497 {
 498         struct utf8_data        tmp;
 499         u_int                   width;
 500         enum utf8_state         more;
 501
 502         width = 0;
 503         while (*s != '\0') {
 504                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 505                         while (*++s != '\0' && more == UTF8_MORE)
 506                                 more = utf8_append(&tmp, *s);
 507                         if (more == UTF8_DONE) {
 508                                 width += tmp.width;
 509                                 continue;
 510                         }
 511                         s -= tmp.have;
 512                 }
 513                 if (*s > 0x1f && *s != 0x7f)
 514                         width++;
 515                 s++;
 516         }
 517         return (width);
 518 }
 519
 520 /* Pad UTF-8 string to width on the left. Caller frees. */
 521 char *
 522 utf8_padcstr(const char *s, u_int width)
 523 {
 524         size_t   slen;
 525         char    *out;
 526         u_int    n, i;
 527
 528         n = utf8_cstrwidth(s);
 529         if (n >= width)
 530                 return (xstrdup(s));
 531
 532         slen = strlen(s);
 533         out = xmalloc(slen + 1 + (width - n));
 534         memcpy(out, s, slen);
 535         for (i = n; i < width; i++)
 536                 out[slen++] = ' ';
 537         out[slen] = '\0';
 538         return (out);
 539 }
 540
 541 /* Pad UTF-8 string to width on the right. Caller frees. */
 542 char *
 543 utf8_rpadcstr(const char *s, u_int width)
 544 {
 545         size_t   slen;
 546         char    *out;
 547         u_int    n, i;
 548
 549         n = utf8_cstrwidth(s);
 550         if (n >= width)
 551                 return (xstrdup(s));
 552
 553         slen = strlen(s);
 554         out = xmalloc(slen + 1 + (width - n));
 555         for (i = 0; i < width - n; i++)
 556                 out[i] = ' ';
 557         memcpy(out + i, s, slen);
 558         out[i + slen] = '\0';
 559         return (out);
 560 }
 561
 562 int
 563 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 564 {
 565         struct utf8_data        *copy, *loop;
 566         int                      found = 0;
 567
 568         copy = utf8_fromcstr(s);
 569         for (loop = copy; loop->size != 0; loop++) {
 570                 if (loop->size != ud->size)
 571                         continue;
 572                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 573                         found = 1;
 574                         break;
 575                 }
 576         }
 577         free(copy);
 578
 579         return (found);
 580 }