utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <vis.h>
  26 #include <wchar.h>
  27
  28 #include "tmux.h"
  29
  30 struct utf8_item {
  31         RB_ENTRY(utf8_item)     index_entry;
  32         u_int                   index;
  33
  34         RB_ENTRY(utf8_item)     data_entry;
  35         char                    data[UTF8_SIZE];
  36         u_char                  size;
  37 };
  38
  39 static int
  40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  41 {
  42         if (ui1->size < ui2->size)
  43                 return (-1);
  44         if (ui1->size > ui2->size)
  45                 return (1);
  46         return (memcmp(ui1->data, ui2->data, ui1->size));
  47 }
  48 RB_HEAD(utf8_data_tree, utf8_item);
  49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
  50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
  51
  52 static int
  53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  54 {
  55         if (ui1->index < ui2->index)
  56                 return (-1);
  57         if (ui1->index > ui2->index)
  58                 return (1);
  59         return (0);
  60 }
  61 RB_HEAD(utf8_index_tree, utf8_item);
  62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
  63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
  64
  65 static u_int utf8_next_index;
  66
  67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
  68 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
  69
  70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
  71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
  72
  73 /* Get a UTF-8 item from data. */
  74 static struct utf8_item *
  75 utf8_item_by_data(const char *data, size_t size)
  76 {
  77         struct utf8_item        ui;
  78
  79         memcpy(ui.data, data, size);
  80         ui.size = size;
  81
  82         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
  83 }
  84
  85 /* Get a UTF-8 item from data. */
  86 static struct utf8_item *
  87 utf8_item_by_index(u_int index)
  88 {
  89         struct utf8_item        ui;
  90
  91         ui.index = index;
  92
  93         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
  94 }
  95
  96 /* Add a UTF-8 item. */
  97 static int
  98 utf8_put_item(const char *data, size_t size, u_int *index)
  99 {
 100         struct utf8_item        *ui;
 101
 102         ui = utf8_item_by_data(data, size);
 103         if (ui != NULL) {
 104                 *index = ui->index;
 105                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 106                     *index);
 107                 return (0);
 108         }
 109
 110         if (utf8_next_index == 0xffffff + 1)
 111                 return (-1);
 112
 113         ui = xcalloc(1, sizeof *ui);
 114         ui->index = utf8_next_index++;
 115         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 116
 117         memcpy(ui->data, data, size);
 118         ui->size = size;
 119         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 120
 121         *index = ui->index;
 122         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 123         return (0);
 124 }
 125
 126 /* Get UTF-8 character from data. */
 127 enum utf8_state
 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 129 {
 130         u_int   index;
 131
 132         if (ud->width > 2)
 133                 fatalx("invalid UTF-8 width: %u", ud->width);
 134
 135         if (ud->size > UTF8_SIZE)
 136                 goto fail;
 137         if (ud->size <= 3) {
 138                 index = (((utf8_char)ud->data[2] << 16)|
 139                           ((utf8_char)ud->data[1] << 8)|
 140                           ((utf8_char)ud->data[0]));
 141         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 142                 goto fail;
 143         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 144         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 145             (int)ud->size, ud->data, *uc);
 146         return (UTF8_DONE);
 147
 148 fail:
 149         if (ud->width == 0)
 150                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 151         else if (ud->width == 1)
 152                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 153         else
 154                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 155         return (UTF8_ERROR);
 156 }
 157
 158 /* Get UTF-8 data from character. */
 159 void
 160 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 161 {
 162         struct utf8_item        *ui;
 163         u_int                    index;
 164
 165         memset(ud, 0, sizeof *ud);
 166         ud->size = ud->have = UTF8_GET_SIZE(uc);
 167         ud->width = UTF8_GET_WIDTH(uc);
 168
 169         if (ud->size <= 3) {
 170                 ud->data[2] = (uc >> 16);
 171                 ud->data[1] = ((uc >> 8) & 0xff);
 172                 ud->data[0] = (uc & 0xff);
 173         } else {
 174                 index = (uc & 0xffffff);
 175                 if ((ui = utf8_item_by_index(index)) == NULL)
 176                         memset(ud->data, ' ', ud->size);
 177                 else
 178                         memcpy(ud->data, ui->data, ud->size);
 179         }
 180
 181         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 182             (int)ud->size, ud->data);
 183 }
 184
 185 /* Get UTF-8 character from a single ASCII character. */
 186 u_int
 187 utf8_build_one(u_char ch)
 188 {
 189         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 190 }
 191
 192 /* Set a single character. */
 193 void
 194 utf8_set(struct utf8_data *ud, u_char ch)
 195 {
 196         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 197
 198         memcpy(ud, &empty, sizeof *ud);
 199         *ud->data = ch;
 200 }
 201
 202 /* Copy UTF-8 character. */
 203 void
 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 205 {
 206         u_int   i;
 207
 208         memcpy(to, from, sizeof *to);
 209
 210         for (i = to->size; i < sizeof to->data; i++)
 211                 to->data[i] = '\0';
 212 }
 213
 214 /* Get width of Unicode character. */
 215 static enum utf8_state
 216 utf8_width(struct utf8_data *ud, int *width)
 217 {
 218         wchar_t wc;
 219
 220         switch (mbtowc(&wc, ud->data, ud->size)) {
 221         case -1:
 222                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 223                     errno);
 224                 mbtowc(NULL, NULL, MB_CUR_MAX);
 225                 return (UTF8_ERROR);
 226         case 0:
 227                 return (UTF8_ERROR);
 228         }
 229         *width = wcwidth(wc);
 230         if (*width < 0 || *width > 0xff) {
 231                 log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
 232                     *width);
 233                 return (UTF8_ERROR);
 234         }
 235         return (UTF8_DONE);
 236 }
 237
 238 /*
 239  * Open UTF-8 sequence.
 240  *
 241  * 11000010-11011111 C2-DF start of 2-byte sequence
 242  * 11100000-11101111 E0-EF start of 3-byte sequence
 243  * 11110000-11110100 F0-F4 start of 4-byte sequence
 244  */
 245 enum utf8_state
 246 utf8_open(struct utf8_data *ud, u_char ch)
 247 {
 248         memset(ud, 0, sizeof *ud);
 249         if (ch >= 0xc2 && ch <= 0xdf)
 250                 ud->size = 2;
 251         else if (ch >= 0xe0 && ch <= 0xef)
 252                 ud->size = 3;
 253         else if (ch >= 0xf0 && ch <= 0xf4)
 254                 ud->size = 4;
 255         else
 256                 return (UTF8_ERROR);
 257         utf8_append(ud, ch);
 258         return (UTF8_MORE);
 259 }
 260
 261 /* Append character to UTF-8, closing if finished. */
 262 enum utf8_state
 263 utf8_append(struct utf8_data *ud, u_char ch)
 264 {
 265         int     width;
 266
 267         if (ud->have >= ud->size)
 268                 fatalx("UTF-8 character overflow");
 269         if (ud->size > sizeof ud->data)
 270                 fatalx("UTF-8 character size too large");
 271
 272         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 273                 ud->width = 0xff;
 274
 275         ud->data[ud->have++] = ch;
 276         if (ud->have != ud->size)
 277                 return (UTF8_MORE);
 278
 279         if (ud->width == 0xff)
 280                 return (UTF8_ERROR);
 281         if (utf8_width(ud, &width) != UTF8_DONE)
 282                 return (UTF8_ERROR);
 283         ud->width = width;
 284
 285         return (UTF8_DONE);
 286 }
 287
 288 /*
 289  * Encode len characters from src into dst, which is guaranteed to have four
 290  * bytes available for each character from src (for \abc or UTF-8) plus space
 291  * for \0.
 292  */
 293 int
 294 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 295 {
 296         struct utf8_data         ud;
 297         const char              *start = dst, *end = src + len;
 298         enum utf8_state          more;
 299         size_t                   i;
 300
 301         while (src < end) {
 302                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 303                         while (++src < end && more == UTF8_MORE)
 304                                 more = utf8_append(&ud, *src);
 305                         if (more == UTF8_DONE) {
 306                                 /* UTF-8 character finished. */
 307                                 for (i = 0; i < ud.size; i++)
 308                                         *dst++ = ud.data[i];
 309                                 continue;
 310                         }
 311                         /* Not a complete, valid UTF-8 character. */
 312                         src -= ud.have;
 313                 }
 314                 if (src[0] == '$' && src < end - 1) {
 315                         if (isalpha((u_char)src[1]) ||
 316                             src[1] == '_' ||
 317                             src[1] == '{')
 318                                 *dst++ = '\\';
 319                         *dst++ = '$';
 320                 } else if (src < end - 1)
 321                         dst = vis(dst, src[0], flag, src[1]);
 322                 else if (src < end)
 323                         dst = vis(dst, src[0], flag, '\0');
 324                 src++;
 325         }
 326         *dst = '\0';
 327         return (dst - start);
 328 }
 329
 330 /* Same as utf8_strvis but allocate the buffer. */
 331 int
 332 utf8_stravis(char **dst, const char *src, int flag)
 333 {
 334         char    *buf;
 335         int      len;
 336
 337         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 338         len = utf8_strvis(buf, src, strlen(src), flag);
 339
 340         *dst = xrealloc(buf, len + 1);
 341         return (len);
 342 }
 343
 344 /* Same as utf8_strvis but allocate the buffer. */
 345 int
 346 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 347 {
 348         char    *buf;
 349         int      len;
 350
 351         buf = xreallocarray(NULL, 4, srclen + 1);
 352         len = utf8_strvis(buf, src, srclen, flag);
 353
 354         *dst = xrealloc(buf, len + 1);
 355         return (len);
 356 }
 357
 358 /* Does this string contain anything that isn't valid UTF-8? */
 359 int
 360 utf8_isvalid(const char *s)
 361 {
 362         struct utf8_data ud;
 363         const char      *end;
 364         enum utf8_state  more;
 365
 366         end = s + strlen(s);
 367         while (s < end) {
 368                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 369                         while (++s < end && more == UTF8_MORE)
 370                                 more = utf8_append(&ud, *s);
 371                         if (more == UTF8_DONE)
 372                                 continue;
 373                         return (0);
 374                 }
 375                 if (*s < 0x20 || *s > 0x7e)
 376                         return (0);
 377                 s++;
 378         }
 379         return (1);
 380 }
 381
 382 /*
 383  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 384  * the returned string. Anything not valid printable ASCII or UTF-8 is
 385  * stripped.
 386  */
 387 char *
 388 utf8_sanitize(const char *src)
 389 {
 390         char            *dst = NULL;
 391         size_t           n = 0;
 392         enum utf8_state  more;
 393         struct utf8_data ud;
 394         u_int            i;
 395
 396         while (*src != '\0') {
 397                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 398                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 399                         while (*++src != '\0' && more == UTF8_MORE)
 400                                 more = utf8_append(&ud, *src);
 401                         if (more == UTF8_DONE) {
 402                                 dst = xreallocarray(dst, n + ud.width,
 403                                     sizeof *dst);
 404                                 for (i = 0; i < ud.width; i++)
 405                                         dst[n++] = '_';
 406                                 continue;
 407                         }
 408                         src -= ud.have;
 409                 }
 410                 if (*src > 0x1f && *src < 0x7f)
 411                         dst[n++] = *src;
 412                 else
 413                         dst[n++] = '_';
 414                 src++;
 415         }
 416         dst = xreallocarray(dst, n + 1, sizeof *dst);
 417         dst[n] = '\0';
 418         return (dst);
 419 }
 420
 421 /* Get UTF-8 buffer length. */
 422 size_t
 423 utf8_strlen(const struct utf8_data *s)
 424 {
 425         size_t  i;
 426
 427         for (i = 0; s[i].size != 0; i++)
 428                 /* nothing */;
 429         return (i);
 430 }
 431
 432 /* Get UTF-8 string width. */
 433 u_int
 434 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 435 {
 436         ssize_t i;
 437         u_int   width = 0;
 438
 439         for (i = 0; s[i].size != 0; i++) {
 440                 if (n != -1 && n == i)
 441                         break;
 442                 width += s[i].width;
 443         }
 444         return (width);
 445 }
 446
 447 /*
 448  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 449  * Caller frees.
 450  */
 451 struct utf8_data *
 452 utf8_fromcstr(const char *src)
 453 {
 454         struct utf8_data        *dst = NULL;
 455         size_t                   n = 0;
 456         enum utf8_state          more;
 457
 458         while (*src != '\0') {
 459                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 460                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 461                         while (*++src != '\0' && more == UTF8_MORE)
 462                                 more = utf8_append(&dst[n], *src);
 463                         if (more == UTF8_DONE) {
 464                                 n++;
 465                                 continue;
 466                         }
 467                         src -= dst[n].have;
 468                 }
 469                 utf8_set(&dst[n], *src);
 470                 n++;
 471                 src++;
 472         }
 473         dst = xreallocarray(dst, n + 1, sizeof *dst);
 474         dst[n].size = 0;
 475         return (dst);
 476 }
 477
 478 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 479 char *
 480 utf8_tocstr(struct utf8_data *src)
 481 {
 482         char    *dst = NULL;
 483         size_t   n = 0;
 484
 485         for(; src->size != 0; src++) {
 486                 dst = xreallocarray(dst, n + src->size, 1);
 487                 memcpy(dst + n, src->data, src->size);
 488                 n += src->size;
 489         }
 490         dst = xreallocarray(dst, n + 1, 1);
 491         dst[n] = '\0';
 492         return (dst);
 493 }
 494
 495 /* Get width of UTF-8 string. */
 496 u_int
 497 utf8_cstrwidth(const char *s)
 498 {
 499         struct utf8_data        tmp;
 500         u_int                   width;
 501         enum utf8_state         more;
 502
 503         width = 0;
 504         while (*s != '\0') {
 505                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 506                         while (*++s != '\0' && more == UTF8_MORE)
 507                                 more = utf8_append(&tmp, *s);
 508                         if (more == UTF8_DONE) {
 509                                 width += tmp.width;
 510                                 continue;
 511                         }
 512                         s -= tmp.have;
 513                 }
 514                 if (*s > 0x1f && *s != 0x7f)
 515                         width++;
 516                 s++;
 517         }
 518         return (width);
 519 }
 520
 521 /* Pad UTF-8 string to width on the left. Caller frees. */
 522 char *
 523 utf8_padcstr(const char *s, u_int width)
 524 {
 525         size_t   slen;
 526         char    *out;
 527         u_int    n, i;
 528
 529         n = utf8_cstrwidth(s);
 530         if (n >= width)
 531                 return (xstrdup(s));
 532
 533         slen = strlen(s);
 534         out = xmalloc(slen + 1 + (width - n));
 535         memcpy(out, s, slen);
 536         for (i = n; i < width; i++)
 537                 out[slen++] = ' ';
 538         out[slen] = '\0';
 539         return (out);
 540 }
 541
 542 /* Pad UTF-8 string to width on the right. Caller frees. */
 543 char *
 544 utf8_rpadcstr(const char *s, u_int width)
 545 {
 546         size_t   slen;
 547         char    *out;
 548         u_int    n, i;
 549
 550         n = utf8_cstrwidth(s);
 551         if (n >= width)
 552                 return (xstrdup(s));
 553
 554         slen = strlen(s);
 555         out = xmalloc(slen + 1 + (width - n));
 556         for (i = 0; i < width - n; i++)
 557                 out[i] = ' ';
 558         memcpy(out + i, s, slen);
 559         out[i + slen] = '\0';
 560         return (out);
 561 }
 562
 563 int
 564 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 565 {
 566         struct utf8_data        *copy, *loop;
 567         int                      found = 0;
 568
 569         copy = utf8_fromcstr(s);
 570         for (loop = copy; loop->size != 0; loop++) {
 571                 if (loop->size != ud->size)
 572                         continue;
 573                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 574                         found = 1;
 575                         break;
 576                 }
 577         }
 578         free(copy);
 579
 580         return (found);
 581 }