utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wchar.h>
  26
  27 #include "tmux.h"
  28
  29 static const wchar_t utf8_force_wide[] = {
  30         0x0261D,
  31         0x026F9,
  32         0x0270A,
  33         0x0270B,
  34         0x0270C,
  35         0x0270D,
  36         0x1F1E6,
  37         0x1F1E7,
  38         0x1F1E8,
  39         0x1F1E9,
  40         0x1F1EA,
  41         0x1F1EB,
  42         0x1F1EC,
  43         0x1F1ED,
  44         0x1F1EE,
  45         0x1F1EF,
  46         0x1F1F0,
  47         0x1F1F1,
  48         0x1F1F2,
  49         0x1F1F3,
  50         0x1F1F4,
  51         0x1F1F5,
  52         0x1F1F6,
  53         0x1F1F7,
  54         0x1F1F8,
  55         0x1F1F9,
  56         0x1F1FA,
  57         0x1F1FB,
  58         0x1F1FC,
  59         0x1F1FD,
  60         0x1F1FE,
  61         0x1F1FF,
  62         0x1F385,
  63         0x1F3C2,
  64         0x1F3C3,
  65         0x1F3C4,
  66         0x1F3C7,
  67         0x1F3CA,
  68         0x1F3CB,
  69         0x1F3CC,
  70         0x1F3FB,
  71         0x1F3FC,
  72         0x1F3FD,
  73         0x1F3FE,
  74         0x1F3FF,
  75         0x1F442,
  76         0x1F443,
  77         0x1F446,
  78         0x1F447,
  79         0x1F448,
  80         0x1F449,
  81         0x1F44A,
  82         0x1F44B,
  83         0x1F44C,
  84         0x1F44D,
  85         0x1F44E,
  86         0x1F44F,
  87         0x1F450,
  88         0x1F466,
  89         0x1F467,
  90         0x1F468,
  91         0x1F469,
  92         0x1F46B,
  93         0x1F46C,
  94         0x1F46D,
  95         0x1F46E,
  96         0x1F470,
  97         0x1F471,
  98         0x1F472,
  99         0x1F473,
 100         0x1F474,
 101         0x1F475,
 102         0x1F476,
 103         0x1F477,
 104         0x1F478,
 105         0x1F47C,
 106         0x1F481,
 107         0x1F482,
 108         0x1F483,
 109         0x1F485,
 110         0x1F486,
 111         0x1F487,
 112         0x1F48F,
 113         0x1F491,
 114         0x1F4AA,
 115         0x1F574,
 116         0x1F575,
 117         0x1F57A,
 118         0x1F590,
 119         0x1F595,
 120         0x1F596,
 121         0x1F645,
 122         0x1F646,
 123         0x1F647,
 124         0x1F64B,
 125         0x1F64C,
 126         0x1F64D,
 127         0x1F64E,
 128         0x1F64F,
 129         0x1F6A3,
 130         0x1F6B4,
 131         0x1F6B5,
 132         0x1F6B6,
 133         0x1F6C0,
 134         0x1F6CC,
 135         0x1F90C,
 136         0x1F90F,
 137         0x1F918,
 138         0x1F919,
 139         0x1F91A,
 140         0x1F91B,
 141         0x1F91C,
 142         0x1F91D,
 143         0x1F91E,
 144         0x1F91F,
 145         0x1F926,
 146         0x1F930,
 147         0x1F931,
 148         0x1F932,
 149         0x1F933,
 150         0x1F934,
 151         0x1F935,
 152         0x1F936,
 153         0x1F937,
 154         0x1F938,
 155         0x1F939,
 156         0x1F93D,
 157         0x1F93E,
 158         0x1F977,
 159         0x1F9B5,
 160         0x1F9B6,
 161         0x1F9B8,
 162         0x1F9B9,
 163         0x1F9BB,
 164         0x1F9CD,
 165         0x1F9CE,
 166         0x1F9CF,
 167         0x1F9D1,
 168         0x1F9D2,
 169         0x1F9D3,
 170         0x1F9D4,
 171         0x1F9D5,
 172         0x1F9D6,
 173         0x1F9D7,
 174         0x1F9D8,
 175         0x1F9D9,
 176         0x1F9DA,
 177         0x1F9DB,
 178         0x1F9DC,
 179         0x1F9DD,
 180         0x1FAC3,
 181         0x1FAC4,
 182         0x1FAC5,
 183         0x1FAF0,
 184         0x1FAF1,
 185         0x1FAF2,
 186         0x1FAF3,
 187         0x1FAF4,
 188         0x1FAF5,
 189         0x1FAF6,
 190         0x1FAF7,
 191         0x1FAF8
 192 };
 193
/*
 * A stored UTF-8 sequence. Every item is linked into two red-black trees:
 * utf8_index_tree (through index_entry) and utf8_data_tree (through
 * data_entry), so it can be found either by index or by its bytes.
 */
struct utf8_item {
        /* Linkage and key for the index tree. */
        RB_ENTRY(utf8_item)     index_entry;
        u_int                   index;

        /* Linkage and key (size bytes of data) for the data tree. */
        RB_ENTRY(utf8_item)     data_entry;
        char                    data[UTF8_SIZE];
        u_char                  size;
};
 202
 203 static int
 204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 205 {
 206         if (ui1->size < ui2->size)
 207                 return (-1);
 208         if (ui1->size > ui2->size)
 209                 return (1);
 210         return (memcmp(ui1->data, ui2->data, ui1->size));
 211 }
/* Red-black tree of items keyed by their bytes, ordered by utf8_data_cmp. */
RB_HEAD(utf8_data_tree, utf8_item);
RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
 215
 216 static int
 217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 218 {
 219         if (ui1->index < ui2->index)
 220                 return (-1);
 221         if (ui1->index > ui2->index)
 222                 return (1);
 223         return (0);
 224 }
/* Red-black tree of items keyed by index, ordered by utf8_index_cmp. */
RB_HEAD(utf8_index_tree, utf8_item);
RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);

/* Index that utf8_put_item() assigns to the next newly added item. */
static u_int utf8_next_index;
 230
/*
 * Extract fields from a packed utf8_char: the size occupies five bits
 * starting at bit 24 and the width is stored, plus one, from bit 29 upwards.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

/* Build the size and width fields of a packed utf8_char. */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
 236
 237 /* Get a UTF-8 item from data. */
 238 static struct utf8_item *
 239 utf8_item_by_data(const u_char *data, size_t size)
 240 {
 241         struct utf8_item        ui;
 242
 243         memcpy(ui.data, data, size);
 244         ui.size = size;
 245
 246         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
 247 }
 248
 249 /* Get a UTF-8 item from data. */
 250 static struct utf8_item *
 251 utf8_item_by_index(u_int index)
 252 {
 253         struct utf8_item        ui;
 254
 255         ui.index = index;
 256
 257         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
 258 }
 259
 260 /* Add a UTF-8 item. */
 261 static int
 262 utf8_put_item(const u_char *data, size_t size, u_int *index)
 263 {
 264         struct utf8_item        *ui;
 265
 266         ui = utf8_item_by_data(data, size);
 267         if (ui != NULL) {
 268                 *index = ui->index;
 269                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 270                     *index);
 271                 return (0);
 272         }
 273
 274         if (utf8_next_index == 0xffffff + 1)
 275                 return (-1);
 276
 277         ui = xcalloc(1, sizeof *ui);
 278         ui->index = utf8_next_index++;
 279         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 280
 281         memcpy(ui->data, data, size);
 282         ui->size = size;
 283         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 284
 285         *index = ui->index;
 286         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 287         return (0);
 288 }
 289
 290 static int
 291 utf8_table_cmp(const void *vp1, const void *vp2)
 292 {
 293         const wchar_t   *wc1 = vp1, *wc2 = vp2;
 294
 295         if (*wc1 < *wc2)
 296                 return (-1);
 297         if (*wc1 > *wc2)
 298                 return (1);
 299         return (0);
 300 }
 301
 302 /* Check if character in table. */
 303 int
 304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
 305 {
 306         wchar_t *found;
 307
 308         found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
 309         return (found != NULL);
 310 }
 311
 312 /* Get UTF-8 character from data. */
 313 enum utf8_state
 314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 315 {
 316         u_int   index;
 317
 318         if (ud->width > 2)
 319                 fatalx("invalid UTF-8 width: %u", ud->width);
 320
 321         if (ud->size > UTF8_SIZE)
 322                 goto fail;
 323         if (ud->size <= 3) {
 324                 index = (((utf8_char)ud->data[2] << 16)|
 325                           ((utf8_char)ud->data[1] << 8)|
 326                           ((utf8_char)ud->data[0]));
 327         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 328                 goto fail;
 329         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 330         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 331             (int)ud->size, ud->data, *uc);
 332         return (UTF8_DONE);
 333
 334 fail:
 335         if (ud->width == 0)
 336                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 337         else if (ud->width == 1)
 338                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 339         else
 340                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 341         return (UTF8_ERROR);
 342 }
 343
 344 /* Get UTF-8 data from character. */
 345 void
 346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 347 {
 348         struct utf8_item        *ui;
 349         u_int                    index;
 350
 351         memset(ud, 0, sizeof *ud);
 352         ud->size = ud->have = UTF8_GET_SIZE(uc);
 353         ud->width = UTF8_GET_WIDTH(uc);
 354
 355         if (ud->size <= 3) {
 356                 ud->data[2] = (uc >> 16);
 357                 ud->data[1] = ((uc >> 8) & 0xff);
 358                 ud->data[0] = (uc & 0xff);
 359         } else {
 360                 index = (uc & 0xffffff);
 361                 if ((ui = utf8_item_by_index(index)) == NULL)
 362                         memset(ud->data, ' ', ud->size);
 363                 else
 364                         memcpy(ud->data, ui->data, ud->size);
 365         }
 366
 367         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 368             (int)ud->size, ud->data);
 369 }
 370
 371 /* Get UTF-8 character from a single ASCII character. */
 372 u_int
 373 utf8_build_one(u_char ch)
 374 {
 375         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 376 }
 377
 378 /* Set a single character. */
 379 void
 380 utf8_set(struct utf8_data *ud, u_char ch)
 381 {
 382         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 383
 384         memcpy(ud, &empty, sizeof *ud);
 385         *ud->data = ch;
 386 }
 387
 388 /* Copy UTF-8 character. */
 389 void
 390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 391 {
 392         u_int   i;
 393
 394         memcpy(to, from, sizeof *to);
 395
 396         for (i = to->size; i < sizeof to->data; i++)
 397                 to->data[i] = '\0';
 398 }
 399
 400 /* Get width of Unicode character. */
 401 static enum utf8_state
 402 utf8_width(struct utf8_data *ud, int *width)
 403 {
 404         wchar_t wc;
 405
 406         if (utf8_towc(ud, &wc) != UTF8_DONE)
 407                 return (UTF8_ERROR);
 408         if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
 409                 *width = 2;
 410                 return (UTF8_DONE);
 411         }
 412 #ifdef HAVE_UTF8PROC
 413         *width = utf8proc_wcwidth(wc);
 414         log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
 415 #else
 416         *width = wcwidth(wc);
 417         log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 418         if (*width < 0) {
 419                 /*
 420                  * C1 control characters are nonprintable, so they are always
 421                  * zero width.
 422                  */
 423                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 424         }
 425 #endif
 426         if (*width >= 0 && *width <= 0xff)
 427                 return (UTF8_DONE);
 428         return (UTF8_ERROR);
 429 }
 430
 431 /* Convert UTF-8 character to wide character. */
 432 enum utf8_state
 433 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
 434 {
 435 #ifdef HAVE_UTF8PROC
 436         switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
 437 #else
 438         switch (mbtowc(wc, ud->data, ud->size)) {
 439 #endif
 440         case -1:
 441                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 442                     errno);
 443                 mbtowc(NULL, NULL, MB_CUR_MAX);
 444                 return (UTF8_ERROR);
 445         case 0:
 446                 return (UTF8_ERROR);
 447         }
 448         log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
 449         return (UTF8_DONE);
 450 }
 451
 452 /*
 453  * Open UTF-8 sequence.
 454  *
 455  * 11000010-11011111 C2-DF start of 2-byte sequence
 456  * 11100000-11101111 E0-EF start of 3-byte sequence
 457  * 11110000-11110100 F0-F4 start of 4-byte sequence
 458  */
 459 enum utf8_state
 460 utf8_open(struct utf8_data *ud, u_char ch)
 461 {
 462         memset(ud, 0, sizeof *ud);
 463         if (ch >= 0xc2 && ch <= 0xdf)
 464                 ud->size = 2;
 465         else if (ch >= 0xe0 && ch <= 0xef)
 466                 ud->size = 3;
 467         else if (ch >= 0xf0 && ch <= 0xf4)
 468                 ud->size = 4;
 469         else
 470                 return (UTF8_ERROR);
 471         utf8_append(ud, ch);
 472         return (UTF8_MORE);
 473 }
 474
 475 /* Append character to UTF-8, closing if finished. */
 476 enum utf8_state
 477 utf8_append(struct utf8_data *ud, u_char ch)
 478 {
 479         int     width;
 480
 481         if (ud->have >= ud->size)
 482                 fatalx("UTF-8 character overflow");
 483         if (ud->size > sizeof ud->data)
 484                 fatalx("UTF-8 character size too large");
 485
 486         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 487                 ud->width = 0xff;
 488
 489         ud->data[ud->have++] = ch;
 490         if (ud->have != ud->size)
 491                 return (UTF8_MORE);
 492
 493         if (ud->width == 0xff)
 494                 return (UTF8_ERROR);
 495         if (utf8_width(ud, &width) != UTF8_DONE)
 496                 return (UTF8_ERROR);
 497         ud->width = width;
 498
 499         return (UTF8_DONE);
 500 }
 501
/*
 * Encode len characters from src into dst, which is guaranteed to have four
 * bytes available for each character from src (for \abc or UTF-8) plus space
 * for \0.
 *
 * Complete multi-byte UTF-8 characters are copied to dst unchanged; every
 * other byte is passed through vis() with the given flag. If flag includes
 * VIS_DQ, a '$' that is not the last byte is written literally, preceded by
 * a backslash when the next byte is a letter, '_' or '{'. dst is always
 * terminated with \0. Returns the number of bytes written, not counting the
 * terminator.
 */
int
utf8_strvis(char *dst, const char *src, size_t len, int flag)
{
        struct utf8_data         ud;
        const char              *start = dst, *end = src + len;
        enum utf8_state          more;
        size_t                   i;

        while (src < end) {
                if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
                        /*
                         * Note that src is advanced before the state is
                         * tested, so it ends up one past the last byte
                         * consumed.
                         */
                        while (++src < end && more == UTF8_MORE)
                                more = utf8_append(&ud, *src);
                        if (more == UTF8_DONE) {
                                /* UTF-8 character finished. */
                                for (i = 0; i < ud.size; i++)
                                        *dst++ = ud.data[i];
                                continue;
                        }
                        /*
                         * Not a complete, valid UTF-8 character. Go back to
                         * its first byte and encode that byte on its own.
                         */
                        src -= ud.have;
                }
                if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
                        if (isalpha((u_char)src[1]) ||
                            src[1] == '_' ||
                            src[1] == '{')
                                *dst++ = '\\';
                        *dst++ = '$';
                } else if (src < end - 1)
                        dst = vis(dst, src[0], flag, src[1]);
                else if (src < end)
                        /* Last byte: there is no following byte to pass. */
                        dst = vis(dst, src[0], flag, '\0');
                src++;
        }
        *dst = '\0';
        return (dst - start);
}
 543
 544 /* Same as utf8_strvis but allocate the buffer. */
 545 int
 546 utf8_stravis(char **dst, const char *src, int flag)
 547 {
 548         char    *buf;
 549         int      len;
 550
 551         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 552         len = utf8_strvis(buf, src, strlen(src), flag);
 553
 554         *dst = xrealloc(buf, len + 1);
 555         return (len);
 556 }
 557
 558 /* Same as utf8_strvis but allocate the buffer. */
 559 int
 560 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 561 {
 562         char    *buf;
 563         int      len;
 564
 565         buf = xreallocarray(NULL, 4, srclen + 1);
 566         len = utf8_strvis(buf, src, srclen, flag);
 567
 568         *dst = xrealloc(buf, len + 1);
 569         return (len);
 570 }
 571
 572 /* Does this string contain anything that isn't valid UTF-8? */
 573 int
 574 utf8_isvalid(const char *s)
 575 {
 576         struct utf8_data ud;
 577         const char      *end;
 578         enum utf8_state  more;
 579
 580         end = s + strlen(s);
 581         while (s < end) {
 582                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 583                         while (++s < end && more == UTF8_MORE)
 584                                 more = utf8_append(&ud, *s);
 585                         if (more == UTF8_DONE)
 586                                 continue;
 587                         return (0);
 588                 }
 589                 if (*s < 0x20 || *s > 0x7e)
 590                         return (0);
 591                 s++;
 592         }
 593         return (1);
 594 }
 595
 596 /*
 597  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 598  * the returned string. Anything not valid printable ASCII or UTF-8 is
 599  * stripped.
 600  */
 601 char *
 602 utf8_sanitize(const char *src)
 603 {
 604         char            *dst = NULL;
 605         size_t           n = 0;
 606         enum utf8_state  more;
 607         struct utf8_data ud;
 608         u_int            i;
 609
 610         while (*src != '\0') {
 611                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 612                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 613                         while (*++src != '\0' && more == UTF8_MORE)
 614                                 more = utf8_append(&ud, *src);
 615                         if (more == UTF8_DONE) {
 616                                 dst = xreallocarray(dst, n + ud.width,
 617                                     sizeof *dst);
 618                                 for (i = 0; i < ud.width; i++)
 619                                         dst[n++] = '_';
 620                                 continue;
 621                         }
 622                         src -= ud.have;
 623                 }
 624                 if (*src > 0x1f && *src < 0x7f)
 625                         dst[n++] = *src;
 626                 else
 627                         dst[n++] = '_';
 628                 src++;
 629         }
 630         dst = xreallocarray(dst, n + 1, sizeof *dst);
 631         dst[n] = '\0';
 632         return (dst);
 633 }
 634
 635 /* Get UTF-8 buffer length. */
 636 size_t
 637 utf8_strlen(const struct utf8_data *s)
 638 {
 639         size_t  i;
 640
 641         for (i = 0; s[i].size != 0; i++)
 642                 /* nothing */;
 643         return (i);
 644 }
 645
 646 /* Get UTF-8 string width. */
 647 u_int
 648 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 649 {
 650         ssize_t i;
 651         u_int   width = 0;
 652
 653         for (i = 0; s[i].size != 0; i++) {
 654                 if (n != -1 && n == i)
 655                         break;
 656                 width += s[i].width;
 657         }
 658         return (width);
 659 }
 660
 661 /*
 662  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 663  * Caller frees.
 664  */
 665 struct utf8_data *
 666 utf8_fromcstr(const char *src)
 667 {
 668         struct utf8_data        *dst = NULL;
 669         size_t                   n = 0;
 670         enum utf8_state          more;
 671
 672         while (*src != '\0') {
 673                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 674                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 675                         while (*++src != '\0' && more == UTF8_MORE)
 676                                 more = utf8_append(&dst[n], *src);
 677                         if (more == UTF8_DONE) {
 678                                 n++;
 679                                 continue;
 680                         }
 681                         src -= dst[n].have;
 682                 }
 683                 utf8_set(&dst[n], *src);
 684                 n++;
 685                 src++;
 686         }
 687         dst = xreallocarray(dst, n + 1, sizeof *dst);
 688         dst[n].size = 0;
 689         return (dst);
 690 }
 691
 692 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 693 char *
 694 utf8_tocstr(struct utf8_data *src)
 695 {
 696         char    *dst = NULL;
 697         size_t   n = 0;
 698
 699         for(; src->size != 0; src++) {
 700                 dst = xreallocarray(dst, n + src->size, 1);
 701                 memcpy(dst + n, src->data, src->size);
 702                 n += src->size;
 703         }
 704         dst = xreallocarray(dst, n + 1, 1);
 705         dst[n] = '\0';
 706         return (dst);
 707 }
 708
 709 /* Get width of UTF-8 string. */
 710 u_int
 711 utf8_cstrwidth(const char *s)
 712 {
 713         struct utf8_data        tmp;
 714         u_int                   width;
 715         enum utf8_state         more;
 716
 717         width = 0;
 718         while (*s != '\0') {
 719                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 720                         while (*++s != '\0' && more == UTF8_MORE)
 721                                 more = utf8_append(&tmp, *s);
 722                         if (more == UTF8_DONE) {
 723                                 width += tmp.width;
 724                                 continue;
 725                         }
 726                         s -= tmp.have;
 727                 }
 728                 if (*s > 0x1f && *s != 0x7f)
 729                         width++;
 730                 s++;
 731         }
 732         return (width);
 733 }
 734
 735 /* Pad UTF-8 string to width on the left. Caller frees. */
 736 char *
 737 utf8_padcstr(const char *s, u_int width)
 738 {
 739         size_t   slen;
 740         char    *out;
 741         u_int    n, i;
 742
 743         n = utf8_cstrwidth(s);
 744         if (n >= width)
 745                 return (xstrdup(s));
 746
 747         slen = strlen(s);
 748         out = xmalloc(slen + 1 + (width - n));
 749         memcpy(out, s, slen);
 750         for (i = n; i < width; i++)
 751                 out[slen++] = ' ';
 752         out[slen] = '\0';
 753         return (out);
 754 }
 755
 756 /* Pad UTF-8 string to width on the right. Caller frees. */
 757 char *
 758 utf8_rpadcstr(const char *s, u_int width)
 759 {
 760         size_t   slen;
 761         char    *out;
 762         u_int    n, i;
 763
 764         n = utf8_cstrwidth(s);
 765         if (n >= width)
 766                 return (xstrdup(s));
 767
 768         slen = strlen(s);
 769         out = xmalloc(slen + 1 + (width - n));
 770         for (i = 0; i < width - n; i++)
 771                 out[i] = ' ';
 772         memcpy(out + i, s, slen);
 773         out[i + slen] = '\0';
 774         return (out);
 775 }
 776
 777 int
 778 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 779 {
 780         struct utf8_data        *copy, *loop;
 781         int                      found = 0;
 782
 783         copy = utf8_fromcstr(s);
 784         for (loop = copy; loop->size != 0; loop++) {
 785                 if (loop->size != ud->size)
 786                         continue;
 787                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 788                         found = 1;
 789                         break;
 790                 }
 791         }
 792         free(copy);
 793
 794         return (found);
 795 }