utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <vis.h>
  26
  27 #include "tmux.h"
  28
  29 static const wchar_t utf8_force_wide[] = {
  30         0x0261D,
  31         0x026F9,
  32         0x0270A,
  33         0x0270B,
  34         0x0270C,
  35         0x0270D,
  36         0x1F1E6,
  37         0x1F1E7,
  38         0x1F1E8,
  39         0x1F1E9,
  40         0x1F1EA,
  41         0x1F1EB,
  42         0x1F1EC,
  43         0x1F1ED,
  44         0x1F1EE,
  45         0x1F1EF,
  46         0x1F1F0,
  47         0x1F1F1,
  48         0x1F1F2,
  49         0x1F1F3,
  50         0x1F1F4,
  51         0x1F1F5,
  52         0x1F1F6,
  53         0x1F1F7,
  54         0x1F1F8,
  55         0x1F1F9,
  56         0x1F1FA,
  57         0x1F1FB,
  58         0x1F1FC,
  59         0x1F1FD,
  60         0x1F1FE,
  61         0x1F1FF,
  62         0x1F385,
  63         0x1F3C2,
  64         0x1F3C3,
  65         0x1F3C4,
  66         0x1F3C7,
  67         0x1F3CA,
  68         0x1F3CB,
  69         0x1F3CC,
  70         0x1F3FB,
  71         0x1F3FC,
  72         0x1F3FD,
  73         0x1F3FE,
  74         0x1F3FF,
  75         0x1F442,
  76         0x1F443,
  77         0x1F446,
  78         0x1F447,
  79         0x1F448,
  80         0x1F449,
  81         0x1F44A,
  82         0x1F44B,
  83         0x1F44C,
  84         0x1F44D,
  85         0x1F44E,
  86         0x1F44F,
  87         0x1F450,
  88         0x1F466,
  89         0x1F467,
  90         0x1F468,
  91         0x1F469,
  92         0x1F46B,
  93         0x1F46C,
  94         0x1F46D,
  95         0x1F46E,
  96         0x1F470,
  97         0x1F471,
  98         0x1F472,
  99         0x1F473,
 100         0x1F474,
 101         0x1F475,
 102         0x1F476,
 103         0x1F477,
 104         0x1F478,
 105         0x1F47C,
 106         0x1F481,
 107         0x1F482,
 108         0x1F483,
 109         0x1F485,
 110         0x1F486,
 111         0x1F487,
 112         0x1F48F,
 113         0x1F491,
 114         0x1F4AA,
 115         0x1F574,
 116         0x1F575,
 117         0x1F57A,
 118         0x1F590,
 119         0x1F595,
 120         0x1F596,
 121         0x1F645,
 122         0x1F646,
 123         0x1F647,
 124         0x1F64B,
 125         0x1F64C,
 126         0x1F64D,
 127         0x1F64E,
 128         0x1F64F,
 129         0x1F6A3,
 130         0x1F6B4,
 131         0x1F6B5,
 132         0x1F6B6,
 133         0x1F6C0,
 134         0x1F6CC,
 135         0x1F90C,
 136         0x1F90F,
 137         0x1F918,
 138         0x1F919,
 139         0x1F91A,
 140         0x1F91B,
 141         0x1F91C,
 142         0x1F91D,
 143         0x1F91E,
 144         0x1F91F,
 145         0x1F926,
 146         0x1F930,
 147         0x1F931,
 148         0x1F932,
 149         0x1F933,
 150         0x1F934,
 151         0x1F935,
 152         0x1F936,
 153         0x1F937,
 154         0x1F938,
 155         0x1F939,
 156         0x1F93D,
 157         0x1F93E,
 158         0x1F977,
 159         0x1F9B5,
 160         0x1F9B6,
 161         0x1F9B8,
 162         0x1F9B9,
 163         0x1F9BB,
 164         0x1F9CD,
 165         0x1F9CE,
 166         0x1F9CF,
 167         0x1F9D1,
 168         0x1F9D2,
 169         0x1F9D3,
 170         0x1F9D4,
 171         0x1F9D5,
 172         0x1F9D6,
 173         0x1F9D7,
 174         0x1F9D8,
 175         0x1F9D9,
 176         0x1F9DA,
 177         0x1F9DB,
 178         0x1F9DC,
 179         0x1F9DD,
 180         0x1FAC3,
 181         0x1FAC4,
 182         0x1FAC5,
 183         0x1FAF0,
 184         0x1FAF1,
 185         0x1FAF2,
 186         0x1FAF3,
 187         0x1FAF4,
 188         0x1FAF5,
 189         0x1FAF6,
 190         0x1FAF7,
 191         0x1FAF8
 192 };
 193
 194 struct utf8_item {
 195         RB_ENTRY(utf8_item)     index_entry;
 196         u_int                   index;
 197
 198         RB_ENTRY(utf8_item)     data_entry;
 199         char                    data[UTF8_SIZE];
 200         u_char                  size;
 201 };
 202
 203 static int
 204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 205 {
 206         if (ui1->size < ui2->size)
 207                 return (-1);
 208         if (ui1->size > ui2->size)
 209                 return (1);
 210         return (memcmp(ui1->data, ui2->data, ui1->size));
 211 }
 212 RB_HEAD(utf8_data_tree, utf8_item);
 213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
 214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
 215
 216 static int
 217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 218 {
 219         if (ui1->index < ui2->index)
 220                 return (-1);
 221         if (ui1->index > ui2->index)
 222                 return (1);
 223         return (0);
 224 }
 225 RB_HEAD(utf8_index_tree, utf8_item);
 226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
 227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
 228
 229 static u_int utf8_next_index;
 230
 231 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
 232 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
 233
 234 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
 235 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
 236
 237 /* Get a UTF-8 item from data. */
 238 static struct utf8_item *
 239 utf8_item_by_data(const u_char *data, size_t size)
 240 {
 241         struct utf8_item        ui;
 242
 243         memcpy(ui.data, data, size);
 244         ui.size = size;
 245
 246         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
 247 }
 248
 249 /* Get a UTF-8 item from data. */
 250 static struct utf8_item *
 251 utf8_item_by_index(u_int index)
 252 {
 253         struct utf8_item        ui;
 254
 255         ui.index = index;
 256
 257         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
 258 }
 259
 260 /* Add a UTF-8 item. */
 261 static int
 262 utf8_put_item(const u_char *data, size_t size, u_int *index)
 263 {
 264         struct utf8_item        *ui;
 265
 266         ui = utf8_item_by_data(data, size);
 267         if (ui != NULL) {
 268                 *index = ui->index;
 269                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 270                     *index);
 271                 return (0);
 272         }
 273
 274         if (utf8_next_index == 0xffffff + 1)
 275                 return (-1);
 276
 277         ui = xcalloc(1, sizeof *ui);
 278         ui->index = utf8_next_index++;
 279         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 280
 281         memcpy(ui->data, data, size);
 282         ui->size = size;
 283         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 284
 285         *index = ui->index;
 286         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 287         return (0);
 288 }
 289
 290 static int
 291 utf8_table_cmp(const void *vp1, const void *vp2)
 292 {
 293         const wchar_t   *wc1 = vp1, *wc2 = vp2;
 294
 295         if (*wc1 < *wc2)
 296                 return (-1);
 297         if (*wc1 > *wc2)
 298                 return (1);
 299         return (0);
 300 }
 301
 302 /* Check if character in table. */
 303 int
 304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
 305 {
 306         wchar_t *found;
 307
 308         found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
 309         return (found != NULL);
 310 }
 311
 312 /* Get UTF-8 character from data. */
 313 enum utf8_state
 314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 315 {
 316         u_int   index;
 317
 318         if (ud->width > 2)
 319                 fatalx("invalid UTF-8 width: %u", ud->width);
 320
 321         if (ud->size > UTF8_SIZE)
 322                 goto fail;
 323         if (ud->size <= 3) {
 324                 index = (((utf8_char)ud->data[2] << 16)|
 325                           ((utf8_char)ud->data[1] << 8)|
 326                           ((utf8_char)ud->data[0]));
 327         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 328                 goto fail;
 329         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 330         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 331             (int)ud->size, ud->data, *uc);
 332         return (UTF8_DONE);
 333
 334 fail:
 335         if (ud->width == 0)
 336                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 337         else if (ud->width == 1)
 338                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 339         else
 340                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 341         return (UTF8_ERROR);
 342 }
 343
 344 /* Get UTF-8 data from character. */
 345 void
 346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 347 {
 348         struct utf8_item        *ui;
 349         u_int                    index;
 350
 351         memset(ud, 0, sizeof *ud);
 352         ud->size = ud->have = UTF8_GET_SIZE(uc);
 353         ud->width = UTF8_GET_WIDTH(uc);
 354
 355         if (ud->size <= 3) {
 356                 ud->data[2] = (uc >> 16);
 357                 ud->data[1] = ((uc >> 8) & 0xff);
 358                 ud->data[0] = (uc & 0xff);
 359         } else {
 360                 index = (uc & 0xffffff);
 361                 if ((ui = utf8_item_by_index(index)) == NULL)
 362                         memset(ud->data, ' ', ud->size);
 363                 else
 364                         memcpy(ud->data, ui->data, ud->size);
 365         }
 366
 367         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 368             (int)ud->size, ud->data);
 369 }
 370
 371 /* Get UTF-8 character from a single ASCII character. */
 372 u_int
 373 utf8_build_one(u_char ch)
 374 {
 375         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 376 }
 377
 378 /* Set a single character. */
 379 void
 380 utf8_set(struct utf8_data *ud, u_char ch)
 381 {
 382         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 383
 384         memcpy(ud, &empty, sizeof *ud);
 385         *ud->data = ch;
 386 }
 387
 388 /* Copy UTF-8 character. */
 389 void
 390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 391 {
 392         u_int   i;
 393
 394         memcpy(to, from, sizeof *to);
 395
 396         for (i = to->size; i < sizeof to->data; i++)
 397                 to->data[i] = '\0';
 398 }
 399
 400 /* Get width of Unicode character. */
 401 static enum utf8_state
 402 utf8_width(struct utf8_data *ud, int *width)
 403 {
 404         wchar_t wc;
 405
 406         if (utf8_towc(ud, &wc) != UTF8_DONE)
 407                 return (UTF8_ERROR);
 408         if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
 409                 *width = 2;
 410                 return (UTF8_DONE);
 411         }
 412
 413         *width = wcwidth(wc);
 414         log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 415         if (*width < 0) {
 416                 /*
 417                  * C1 control characters are nonprintable, so they are always
 418                  * zero width.
 419                  */
 420                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 421         }
 422         if (*width >= 0 && *width <= 0xff)
 423                 return (UTF8_DONE);
 424         return (UTF8_ERROR);
 425 }
 426
 427 /* Convert UTF-8 character to wide character. */
 428 enum utf8_state
 429 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
 430 {
 431         switch (mbtowc(wc, ud->data, ud->size)) {
 432         case -1:
 433                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 434                     errno);
 435                 mbtowc(NULL, NULL, MB_CUR_MAX);
 436                 return (UTF8_ERROR);
 437         case 0:
 438                 return (UTF8_ERROR);
 439         }
 440         log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
 441         return (UTF8_DONE);
 442 }
 443
 444 /*
 445  * Open UTF-8 sequence.
 446  *
 447  * 11000010-11011111 C2-DF start of 2-byte sequence
 448  * 11100000-11101111 E0-EF start of 3-byte sequence
 449  * 11110000-11110100 F0-F4 start of 4-byte sequence
 450  */
 451 enum utf8_state
 452 utf8_open(struct utf8_data *ud, u_char ch)
 453 {
 454         memset(ud, 0, sizeof *ud);
 455         if (ch >= 0xc2 && ch <= 0xdf)
 456                 ud->size = 2;
 457         else if (ch >= 0xe0 && ch <= 0xef)
 458                 ud->size = 3;
 459         else if (ch >= 0xf0 && ch <= 0xf4)
 460                 ud->size = 4;
 461         else
 462                 return (UTF8_ERROR);
 463         utf8_append(ud, ch);
 464         return (UTF8_MORE);
 465 }
 466
 467 /* Append character to UTF-8, closing if finished. */
 468 enum utf8_state
 469 utf8_append(struct utf8_data *ud, u_char ch)
 470 {
 471         int     width;
 472
 473         if (ud->have >= ud->size)
 474                 fatalx("UTF-8 character overflow");
 475         if (ud->size > sizeof ud->data)
 476                 fatalx("UTF-8 character size too large");
 477
 478         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 479                 ud->width = 0xff;
 480
 481         ud->data[ud->have++] = ch;
 482         if (ud->have != ud->size)
 483                 return (UTF8_MORE);
 484
 485         if (ud->width == 0xff)
 486                 return (UTF8_ERROR);
 487         if (utf8_width(ud, &width) != UTF8_DONE)
 488                 return (UTF8_ERROR);
 489         ud->width = width;
 490
 491         return (UTF8_DONE);
 492 }
 493
 494 /*
 495  * Encode len characters from src into dst, which is guaranteed to have four
 496  * bytes available for each character from src (for \abc or UTF-8) plus space
 497  * for \0.
 498  */
 499 int
 500 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 501 {
 502         struct utf8_data         ud;
 503         const char              *start = dst, *end = src + len;
 504         enum utf8_state          more;
 505         size_t                   i;
 506
 507         while (src < end) {
 508                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 509                         while (++src < end && more == UTF8_MORE)
 510                                 more = utf8_append(&ud, *src);
 511                         if (more == UTF8_DONE) {
 512                                 /* UTF-8 character finished. */
 513                                 for (i = 0; i < ud.size; i++)
 514                                         *dst++ = ud.data[i];
 515                                 continue;
 516                         }
 517                         /* Not a complete, valid UTF-8 character. */
 518                         src -= ud.have;
 519                 }
 520                 if (src[0] == '$' && src < end - 1) {
 521                         if (isalpha((u_char)src[1]) ||
 522                             src[1] == '_' ||
 523                             src[1] == '{')
 524                                 *dst++ = '\\';
 525                         *dst++ = '$';
 526                 } else if (src < end - 1)
 527                         dst = vis(dst, src[0], flag, src[1]);
 528                 else if (src < end)
 529                         dst = vis(dst, src[0], flag, '\0');
 530                 src++;
 531         }
 532         *dst = '\0';
 533         return (dst - start);
 534 }
 535
 536 /* Same as utf8_strvis but allocate the buffer. */
 537 int
 538 utf8_stravis(char **dst, const char *src, int flag)
 539 {
 540         char    *buf;
 541         int      len;
 542
 543         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 544         len = utf8_strvis(buf, src, strlen(src), flag);
 545
 546         *dst = xrealloc(buf, len + 1);
 547         return (len);
 548 }
 549
 550 /* Same as utf8_strvis but allocate the buffer. */
 551 int
 552 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 553 {
 554         char    *buf;
 555         int      len;
 556
 557         buf = xreallocarray(NULL, 4, srclen + 1);
 558         len = utf8_strvis(buf, src, srclen, flag);
 559
 560         *dst = xrealloc(buf, len + 1);
 561         return (len);
 562 }
 563
 564 /* Does this string contain anything that isn't valid UTF-8? */
 565 int
 566 utf8_isvalid(const char *s)
 567 {
 568         struct utf8_data ud;
 569         const char      *end;
 570         enum utf8_state  more;
 571
 572         end = s + strlen(s);
 573         while (s < end) {
 574                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 575                         while (++s < end && more == UTF8_MORE)
 576                                 more = utf8_append(&ud, *s);
 577                         if (more == UTF8_DONE)
 578                                 continue;
 579                         return (0);
 580                 }
 581                 if (*s < 0x20 || *s > 0x7e)
 582                         return (0);
 583                 s++;
 584         }
 585         return (1);
 586 }
 587
 588 /*
 589  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 590  * the returned string. Anything not valid printable ASCII or UTF-8 is
 591  * stripped.
 592  */
 593 char *
 594 utf8_sanitize(const char *src)
 595 {
 596         char            *dst = NULL;
 597         size_t           n = 0;
 598         enum utf8_state  more;
 599         struct utf8_data ud;
 600         u_int            i;
 601
 602         while (*src != '\0') {
 603                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 604                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 605                         while (*++src != '\0' && more == UTF8_MORE)
 606                                 more = utf8_append(&ud, *src);
 607                         if (more == UTF8_DONE) {
 608                                 dst = xreallocarray(dst, n + ud.width,
 609                                     sizeof *dst);
 610                                 for (i = 0; i < ud.width; i++)
 611                                         dst[n++] = '_';
 612                                 continue;
 613                         }
 614                         src -= ud.have;
 615                 }
 616                 if (*src > 0x1f && *src < 0x7f)
 617                         dst[n++] = *src;
 618                 else
 619                         dst[n++] = '_';
 620                 src++;
 621         }
 622         dst = xreallocarray(dst, n + 1, sizeof *dst);
 623         dst[n] = '\0';
 624         return (dst);
 625 }
 626
 627 /* Get UTF-8 buffer length. */
 628 size_t
 629 utf8_strlen(const struct utf8_data *s)
 630 {
 631         size_t  i;
 632
 633         for (i = 0; s[i].size != 0; i++)
 634                 /* nothing */;
 635         return (i);
 636 }
 637
 638 /* Get UTF-8 string width. */
 639 u_int
 640 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 641 {
 642         ssize_t i;
 643         u_int   width = 0;
 644
 645         for (i = 0; s[i].size != 0; i++) {
 646                 if (n != -1 && n == i)
 647                         break;
 648                 width += s[i].width;
 649         }
 650         return (width);
 651 }
 652
 653 /*
 654  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 655  * Caller frees.
 656  */
 657 struct utf8_data *
 658 utf8_fromcstr(const char *src)
 659 {
 660         struct utf8_data        *dst = NULL;
 661         size_t                   n = 0;
 662         enum utf8_state          more;
 663
 664         while (*src != '\0') {
 665                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 666                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 667                         while (*++src != '\0' && more == UTF8_MORE)
 668                                 more = utf8_append(&dst[n], *src);
 669                         if (more == UTF8_DONE) {
 670                                 n++;
 671                                 continue;
 672                         }
 673                         src -= dst[n].have;
 674                 }
 675                 utf8_set(&dst[n], *src);
 676                 n++;
 677                 src++;
 678         }
 679         dst = xreallocarray(dst, n + 1, sizeof *dst);
 680         dst[n].size = 0;
 681         return (dst);
 682 }
 683
 684 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 685 char *
 686 utf8_tocstr(struct utf8_data *src)
 687 {
 688         char    *dst = NULL;
 689         size_t   n = 0;
 690
 691         for(; src->size != 0; src++) {
 692                 dst = xreallocarray(dst, n + src->size, 1);
 693                 memcpy(dst + n, src->data, src->size);
 694                 n += src->size;
 695         }
 696         dst = xreallocarray(dst, n + 1, 1);
 697         dst[n] = '\0';
 698         return (dst);
 699 }
 700
 701 /* Get width of UTF-8 string. */
 702 u_int
 703 utf8_cstrwidth(const char *s)
 704 {
 705         struct utf8_data        tmp;
 706         u_int                   width;
 707         enum utf8_state         more;
 708
 709         width = 0;
 710         while (*s != '\0') {
 711                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 712                         while (*++s != '\0' && more == UTF8_MORE)
 713                                 more = utf8_append(&tmp, *s);
 714                         if (more == UTF8_DONE) {
 715                                 width += tmp.width;
 716                                 continue;
 717                         }
 718                         s -= tmp.have;
 719                 }
 720                 if (*s > 0x1f && *s != 0x7f)
 721                         width++;
 722                 s++;
 723         }
 724         return (width);
 725 }
 726
 727 /* Pad UTF-8 string to width on the left. Caller frees. */
 728 char *
 729 utf8_padcstr(const char *s, u_int width)
 730 {
 731         size_t   slen;
 732         char    *out;
 733         u_int    n, i;
 734
 735         n = utf8_cstrwidth(s);
 736         if (n >= width)
 737                 return (xstrdup(s));
 738
 739         slen = strlen(s);
 740         out = xmalloc(slen + 1 + (width - n));
 741         memcpy(out, s, slen);
 742         for (i = n; i < width; i++)
 743                 out[slen++] = ' ';
 744         out[slen] = '\0';
 745         return (out);
 746 }
 747
 748 /* Pad UTF-8 string to width on the right. Caller frees. */
 749 char *
 750 utf8_rpadcstr(const char *s, u_int width)
 751 {
 752         size_t   slen;
 753         char    *out;
 754         u_int    n, i;
 755
 756         n = utf8_cstrwidth(s);
 757         if (n >= width)
 758                 return (xstrdup(s));
 759
 760         slen = strlen(s);
 761         out = xmalloc(slen + 1 + (width - n));
 762         for (i = 0; i < width - n; i++)
 763                 out[i] = ' ';
 764         memcpy(out + i, s, slen);
 765         out[i + slen] = '\0';
 766         return (out);
 767 }
 768
 769 int
 770 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 771 {
 772         struct utf8_data        *copy, *loop;
 773         int                      found = 0;
 774
 775         copy = utf8_fromcstr(s);
 776         for (loop = copy; loop->size != 0; loop++) {
 777                 if (loop->size != ud->size)
 778                         continue;
 779                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 780                         found = 1;
 781                         break;
 782                 }
 783         }
 784         free(copy);
 785
 786         return (found);
 787 }