pith/charconv/utf8.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2013-2021 Eduardo Chappa
   8  * Copyright 2006-2008 University of Washington
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19
  20 /* includable WITHOUT dependency on c-client */
  21 #include "../../c-client/mail.h"
  22 #include "../../c-client/utf8.h"
  23
  24 #ifdef _WINDOWS
  25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  26 #undef ERROR
  27 #else
  28 #define _XOPEN_SOURCE
  29 #endif
  30
  31 #include <system.h>
  32
  33 #include "../../c-client/fs.h"
  34
  35 /* includable WITHOUT dependency on pico */
  36 #include "../../pico/keydefs.h"
  37
  38 #include "../osdep/collate.h"
  39 #include "../filttype.h"
  40
  41 #include "utf8.h"
  42
  43 #include <stdarg.h>
  44
  45
  46 unsigned single_width_chars_a_to_b(UCS *, int, int);
  47
  48
  49 static char locale_charmap[50];
  50
  51 static int   native_utf8;
  52 static void *display_data;
  53
  54 void
  55 init_utf8_display(int utf8, void *rmap)
  56 {
  57     native_utf8 = utf8;
  58     display_data = rmap;
  59 }
  60
  61
  62 /*
  63  * Argument is a UCS-4 wide character.
  64  * Returns the environment dependent cell width of the
  65  * character when printed to the screen.
  66  * This will be -1 if the character is not printable.
  67  * It will be >= zero if it is printable.
  68  *
  69  * Note that in the case it is not printable but it is still sent to
  70  * Writechar, Writechar will print a '?' with width 1.
  71  */
  72 int
  73 wcellwidth(UCS ucs)
  74 {
  75     char dummy[32];
  76     long w;
  77
  78     /*
  79      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  80      * That's the assumption here.
  81      */
  82
  83     if(native_utf8){                    /* display is UTF-8 capable */
  84         w = ucs4_width((unsigned long) ucs);
  85         return((w & U4W_ERROR) ? -1 : w);
  86     }
  87     else if(display_data){
  88         if(wtomb(dummy, ucs) < 0)
  89           return(-1);
  90         else{
  91             w = ucs4_width((unsigned long) ucs);
  92             return((w & U4W_ERROR) ? -1 : w);
  93         }
  94     }
  95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
  96     else
  97       return(wcwidth((wchar_t) ucs));
  98 #else
  99     return(0);
 100 #endif
 101 }
 102
 103 /* ambiguous width zone character function. We use the Windows code until
 104  * we find a better way to do it in general.
 105  */
 106 int
 107 pith_ucs4width(UCS ucs)
 108 {
 109   return (ucs >= 0x2100) ? 2 : 1;
 110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
 111   return wcwidth((wchar_t) ucs);
 112 #else
 113   return (ucs >= 0x2100) ? 2 : 1;
 114 #endif /* _WINDOWS */
 115 }
 116
 117 /*
 118  * Argument is a UCS-4 wide character.
 119  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 120  * Dest is a buffer at least xx chars wide where the multi-byte version
 121  * of the wide character will be written.
 122  * The returned value is the number of bytes written to dest or -1
 123  * if the conversion can't be done.
 124  */
 125 int
 126 wtomb(char *dest, UCS ucs)
 127 {
 128     int rv;
 129     /*
 130      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 131      * That's the assumption here.
 132      */
 133
 134     if(native_utf8){
 135         unsigned char *newdptr;
 136
 137         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 138         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 139     }
 140     else if(display_data){
 141         unsigned long ucs4;
 142         int           ret;
 143
 144         ucs4 = (unsigned long) ucs;
 145         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 146         if(ret >= 0)
 147           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 148         else
 149           ret = -1;
 150
 151         return(ret);
 152     }
 153     else
 154 #if defined(HAVE_WCRTOMB)
 155        rv = wcrtomb(dest, (wchar_t) ucs, NULL);
 156 #elif defined(HAVE_WCTOMB)
 157        rv = wctomb(dest, (wchar_t) ucs);
 158 #else
 159        rv = -1;
 160 #endif
 161    return rv;
 162 }
 163
 164
 165 /*
 166  * This function does not necessarily update inputp and remaining_octets, so
 167  * don't rely on that. The c-client version does but the other doesn't.
 168  */
 169 UCS
 170 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 171 {
 172     UCS ucs;
 173
 174     if(input_cs){
 175         CHARSET *cast_input_cs;
 176
 177         cast_input_cs = (CHARSET *) input_cs;
 178
 179         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 180           case U8G_ENDSTRG:
 181           case U8G_ENDSTRI:
 182             return(CCONV_NEEDMORE);
 183
 184           default:
 185             if(ucs & U8G_ERROR || ucs == UBOGON)
 186               return(CCONV_BADCHAR);
 187
 188             return(ucs);
 189         }
 190     }
 191     else{
 192         size_t ret;
 193         wchar_t w;
 194
 195         /*
 196          * Warning:  input_cs and remaining_octets are unused in this
 197          * half of the if/else.
 198          *
 199          * Unfortunately, we can't tell the difference between a source string
 200          * that is just not long enough and one that has characters that can't
 201          * be converted even though it is long enough. We return NEEDMORE in both cases.
 202          */
 203         ret = mbstowcs(&w, (char *) (*inputp), 1);
 204         if(ret == (size_t)(-1))
 205           return(CCONV_NEEDMORE);
 206         else{
 207           ucs = (UCS) w;
 208           return(ucs);
 209         }
 210     }
 211 }
 212
 213
 214 void
 215 set_locale_charmap(char *charmap)
 216 {
 217     if(charmap){
 218         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 219         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 220     }
 221     else
 222       locale_charmap[0] = '\0';
 223 }
 224
 225
 226 /*
 227  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 228  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 229  * The caller is responsible for freeing the returned value.
 230  *
 231  * Args  str     -- the string to convert
 232  */
 233 char *
 234 convert_to_utf8(char *str, char *fromcharset, int flags)
 235 {
 236     char          *ret = NULL;
 237     char          *fcharset;
 238     SIZEDTEXT      src, result;
 239     const CHARSET *cs = NULL;
 240     int            try;
 241
 242     src.data = (unsigned char *) str;
 243     src.size = strlen(str);
 244
 245     /* already UTF-8, return NULL */
 246     if(!(flags & CU8_NOINFER)
 247        && (cs = utf8_infercharset(&src))
 248        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 249       return(ret);
 250
 251     try = 1;
 252     while(try < 5){
 253         switch(try){
 254           case 1:
 255             fcharset = fromcharset;
 256             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 257               break;    /* give it a try */
 258             else
 259               try++;    /* fall through */
 260
 261           case 2:
 262             if(!(flags & CU8_NOINFER)){
 263                 fcharset = cs ? cs->name : NULL;
 264                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 265                   break;
 266                 else
 267                   try++;        /* fall through */
 268             }
 269             else
 270               try++;    /* fall through */
 271
 272           case 3:
 273             fcharset = locale_charmap;
 274             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 275               break;
 276             else
 277               try++;    /* fall through */
 278
 279           default:
 280             fcharset = "ISO-8859-1";            /* this will "work" */
 281             break;
 282         }
 283
 284         memset(&result, 0, sizeof(result));
 285
 286         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 287             if(!(result.size == src.size && result.data == src.data)){
 288                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 289                 strncpy(ret, (char *) result.data, result.size);
 290                 ret[result.size] = '\0';
 291             }
 292             /* else no conversion necessary */
 293
 294             if(result.data && result.data != src.data)
 295               fs_give((void **) &result.data);
 296             result.size = 0;
 297
 298             return(ret);
 299         }
 300
 301         try++;
 302     }
 303
 304     /* won't make it to here */
 305     return(ret);
 306 }
 307
 308
 309 /*
 310  * Convert from UTF-8 to user's locale charset.
 311  * This actually uses the wtomb routine to do the conversion, and that
 312  * relies on setup_for_input_output having been called.
 313  * If no conversion is necessary, NULL is returned, otherwise an allocated
 314  * string in the locale charset is returned and the caller is responsible
 315  * for freeing it.
 316  */
 317 char *
 318 convert_to_locale(char *utf8str)
 319 {
 320 #define CHNK 500
 321     char *inp, *ret = NULL;
 322     CBUF_S cb;
 323     int alloced;
 324     size_t i = 0;
 325
 326     if(native_utf8 || !utf8str || !utf8str[0])
 327       return(NULL);
 328
 329     cb.cbuf[0] = '\0';
 330     cb.cbufp = cb.cbufend = cb.cbuf;
 331     inp = utf8str;
 332
 333     alloced = CHNK;
 334     ret = (char *) fs_get(alloced * sizeof(char));
 335
 336     /*
 337      * There's gotta be a better way to do this but utf8_to_locale was
 338      * available and everything looks like a nail when all you have
 339      * is a hammer.
 340      */
 341     while(*inp){
 342         /*
 343          * We're placing the outgoing stream of characters in ret, a multi-byte
 344          * array of characters in the user's locale charset. See if there is
 345          * enough room for the next wide characters worth of output chars
 346          * and allocate more space if not.
 347          */
 348         if((alloced - i) < MAX(MB_LEN_MAX,32)){
 349             alloced += CHNK;
 350             fs_resize((void **) &ret, alloced * sizeof(char));
 351         }
 352
 353         i += utf8_to_locale((int) *inp++, &cb,
 354                            (unsigned char *) &ret[i], alloced - i);
 355     }
 356
 357     fs_resize((void **) &ret, i + 1);
 358
 359     ret[i] = '\0';
 360
 361     return(ret);
 362 }
 363
 364
 365 /*
 366  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 367  * filled in with multi-byte characters. The return value is the
 368  * number of valid characters in obuf to be used.
 369  */
 370 int
 371 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 372 {
 373     int outchars = 0;
 374
 375     if(!(cb && cb->cbufp))
 376       return(0);
 377
 378     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 379         unsigned char *inputp;
 380         unsigned long remaining_octets;
 381         UCS ucs;
 382
 383         *(cb->cbufp)++ = (unsigned char) c;
 384         inputp = cb->cbuf;
 385         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 386         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 387
 388         switch(ucs){
 389           case U8G_ENDSTRG:     /* incomplete character, wait */
 390           case U8G_ENDSTRI:     /* incomplete character, wait */
 391             break;
 392
 393           default:
 394             if(ucs & U8G_ERROR || ucs == UBOGON){
 395                 /*
 396                  * None of these cases is supposed to happen. If it
 397                  * does happen then the input stream isn't UTF-8
 398                  * so something is wrong. Treat each character in the
 399                  * input buffer as a separate error character and
 400                  * print a '?' for each.
 401                  */
 402                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 403                   obuf[outchars++] = '?';
 404
 405                 cb->cbufp = cb->cbuf;
 406             }
 407             else{
 408                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 409                     /*
 410                      * This happens when we have a UTF-8 character that
 411                      * we aren't able to print in our locale. For example,
 412                      * if the locale is setup with the terminal
 413                      * expecting ISO-8859-1 characters then there are
 414                      * lots of UTF-8 characters that can't be printed.
 415                      * Print a '?' instead.
 416                      */
 417                     obuf[outchars++] = '?';
 418                 }
 419                 else{
 420                     /*
 421                      * Convert the ucs into the multibyte
 422                      * character that corresponds to the
 423                      * ucs in the users locale.
 424                      */
 425                     outchars = wtomb((char *) obuf, ucs);
 426                     if(outchars < 0){
 427                         obuf[0] = '?';
 428                         outchars = 1;
 429                     }
 430                 }
 431
 432                 /* update the input buffer */
 433                 if(inputp >= cb->cbufp) /* this should be the case */
 434                   cb->cbufp = cb->cbuf;
 435                 else{           /* extra chars for some reason? */
 436                     unsigned char *q, *newcbufp;
 437
 438                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 439                     q = cb->cbuf;
 440                     while(inputp < cb->cbufp)
 441                       *q++ = *inputp++;
 442
 443                     cb->cbufp = newcbufp;
 444                 }
 445             }
 446
 447             break;
 448         }
 449     }
 450     else{                       /* error */
 451         obuf[0] = '?';
 452         outchars = 1;
 453         cb->cbufp = cb->cbuf;   /* start over */
 454     }
 455
 456     return(outchars);
 457 }
 458
 459
 460 /*
 461  * Returns the screen cells width of the UCS-4 string argument.
 462  * The source string is zero terminated.
 463  */
 464 unsigned
 465 ucs4_str_width(UCS *ucsstr)
 466 {
 467     unsigned width = 0;
 468     int w;
 469
 470     if(ucsstr)
 471       while(*ucsstr){
 472         w = wcellwidth(*ucsstr++);
 473         if(w != U4W_CTLSRGT)
 474           width += (w < 0 ? 1 : w);
 475       }
 476
 477     return width;
 478 }
 479
 480
 481 /*
 482  * Returns the screen cells width of the UCS-4 string argument
 483  * from ucsstr[a] through (inclusive) ucsstr[b].
 484  * No checking is done to make sure a starts in the middle
 485  * of a UCS-4 array.
 486  */
 487 unsigned
 488 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 489 {
 490     unsigned width = 0;
 491     int i, w;
 492
 493     if(ucsstr)
 494       for(i = a; i <= b && ucsstr[i]; i++){
 495         w = wcellwidth(ucsstr[i]);
 496         if(w != U4W_CTLSRGT)
 497           width += (w < 0 ? 1 : w);
 498       }
 499
 500     return width;
 501 }
 502
 503
 504 /*
 505  * Returns the screen cells width of the UCS-4 string argument
 506  * from ustart through (exclusive) uend.
 507  * No checking is done to make sure it starts in the middle
 508  * of a UCS-4 array.
 509  */
 510 unsigned
 511 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 512 {
 513     UCS *u;
 514     unsigned width = 0;
 515     int w;
 516
 517     if(!ustart)
 518       return width;
 519
 520     if(ustart)
 521       for(u = ustart; u < uend; u++){
 522         w = wcellwidth(*u);
 523         if(w != U4W_CTLSRGT)
 524           width += (w < 0 ? 1 : w);
 525       }
 526
 527     return(width);
 528 }
 529
 530
 531 /*
 532  * Return the largest possible pointer into ucs4str so that the width
 533  * of the string from ucs4str to the pointer (exclusive)
 534  * is maxwidth or less. Also stops at a null character.
 535  */
 536 UCS *
 537 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 538 {
 539     UCS *u;
 540     int w_consumed = 0, w, done = 0;
 541
 542     u = ucs4str;
 543
 544     if(u)
 545       while(!done && *u && w_consumed <= maxwidth){
 546         w = wcellwidth(*u);
 547         w = (w >= 0 ? w : 1);
 548         if(w_consumed + w <= maxwidth){
 549             w_consumed += w;
 550             ++u;
 551         }
 552         else
 553           ++done;
 554       }
 555
 556     return(u);
 557 }
 558
 559
 560 /*
 561  * Convert and copy a UTF-8 string into a UCS-4 NULL
 562  * terminated array. Just like cpystr only it converts
 563  * from UTF-8 to UCS-4.
 564  *
 565  * Returned UCS-4 string needs to be freed by caller.
 566  */
 567 UCS *
 568 utf8_to_ucs4_cpystr(char *utf8src)
 569 {
 570     size_t         retsize;
 571     UCS           *ret = NULL;
 572     UCS            ucs;
 573     unsigned long  remaining_octets;
 574     unsigned char *readptr;
 575     size_t         arrayindex;
 576
 577     /*
 578      * We don't know how big to allocate the return array
 579      * because variable numbers of octets in the src array
 580      * will combine to make UCS-4 characters. The number of
 581      * UCS-4 characters is less than or equal to the number
 582      * of src characters, though.
 583      */
 584
 585     if(!utf8src)
 586       return NULL;
 587
 588     retsize = strlen(utf8src) + 1;
 589
 590     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 591     memset(ret, 0, retsize * sizeof(*ret));
 592
 593     readptr = (unsigned char *) utf8src;
 594     remaining_octets = retsize-1;
 595     arrayindex = 0;
 596
 597     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 598         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 599
 600         if(ucs & U8G_ERROR || ucs == UBOGON)
 601           remaining_octets = 0;
 602         else
 603           ret[arrayindex++] = ucs;
 604     }
 605
 606     ret[arrayindex] = '\0';
 607
 608     /* get rid of excess size */
 609     if(arrayindex+1 < retsize)
 610       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 611
 612     return ret;
 613 }
 614
 615
 616 /*
 617  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 618  * terminated string. Just like cpystr only it converts
 619  * from UCS-4 to UTF-8.
 620  *
 621  * Returned UTF-8 string needs to be freed by caller.
 622  */
 623 char *
 624 ucs4_to_utf8_cpystr(UCS *ucs4src)
 625 {
 626     unsigned char *ret = NULL;
 627     unsigned char *writeptr;
 628     int            i;
 629
 630     if(!ucs4src)
 631       return NULL;
 632
 633     /*
 634      * Over-allocate and then resize at the end.
 635      */
 636
 637     /* count characters in source */
 638     for(i = 0; ucs4src[i]; i++)
 639       ;
 640
 641     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 642     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 643
 644     writeptr = ret;
 645     for(i = 0; ucs4src[i]; i++)
 646       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 647
 648     /* get rid of excess size */
 649     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 650
 651     return ((char *) ret);
 652 }
 653
 654
 655 /*
 656  * Similar to above but copy a fixed number of source
 657  * characters instead of going until null terminator.
 658  */
 659 char *
 660 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 661 {
 662     unsigned char *ret = NULL;
 663     unsigned char *writeptr;
 664     int            i;
 665
 666     if(!ucs4src)
 667       return NULL;
 668
 669     /*
 670      * Over-allocate and then resize at the end.
 671      */
 672
 673     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 674     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 675
 676     writeptr = ret;
 677     for(i = 0; i < ucs4src_len; i++)
 678       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 679
 680     /* get rid of excess size */
 681     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 682
 683     return ((char *) ret);
 684 }
 685
 686 /*
 687  * Similar to above but copy what is possible to a
 688  * string of a size at most the given retlen.
 689  */
 690 char *
 691 ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
 692 {
 693     unsigned char *ret = NULL;
 694     unsigned char *writeptr;
 695     int            i, oldlen, len;
 696
 697     if(!ucs4src)
 698       return NULL;
 699
 700     /*
 701      * Over-allocate and then resize at the end.
 702      */
 703
 704     /* count characters in source */
 705     for(i = 0; ucs4src[i]; i++)
 706       ;
 707
 708     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
 709     memset(ret, 0, (6*i + 1) * sizeof(unsigned char));
 710
 711     writeptr = ret;
 712     oldlen = len = 0;
 713     for(i = 0; ucs4src[i] && (len < retlen); i++){
 714       oldlen = len;
 715       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 716       len = strlen(ret);
 717     }
 718     if(len > retlen){
 719       ret[oldlen] = '\0';
 720       len = oldlen;
 721     }
 722
 723     /* get rid of excess size */
 724     fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));
 725
 726     return ((char *) ret);
 727 }
 728
 729
 730 #ifdef _WINDOWS
 731 /*
 732  * Convert a UTF-8 argument into an LPTSTR version
 733  * of that argument. The result is allocated here
 734  * and should be freed by the caller.
 735  */
 736 LPTSTR
 737 utf8_to_lptstr(LPSTR arg_utf8)
 738 {
 739      int lptstr_len;
 740      LPTSTR lptstr_ret = NULL;
 741
 742      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 743      if(lptstr_len > 0)
 744      {
 745          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 746          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 747              arg_utf8, -1, lptstr_ret, lptstr_len );
 748      }
 749
 750      if(!lptstr_len)
 751      {
 752          /* check GetLastError()? */
 753          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 754          lptstr_ret[0] = 0;
 755      }
 756
 757      return lptstr_ret;
 758 }
 759
 760
 761 /*
 762  * Convert an LPTSTR argument into a UTF-8 version
 763  * of that argument. The result is allocated here
 764  * and should be freed by the caller.
 765  */
 766 LPSTR
 767 lptstr_to_utf8(LPTSTR arg_lptstr)
 768 {
 769      int utf8str_len;
 770      LPSTR utf8str_ret = NULL;
 771
 772      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 773      if(utf8str_len > 0)
 774      {
 775          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 776          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 777              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 778      }
 779
 780      if(!utf8str_len)
 781      {
 782          /* check GetLastError()? */
 783          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 784          utf8str_ret[0] = 0;
 785      }
 786
 787      return utf8str_ret;
 788 }
 789
 790
 791 /*
 792  * Convert a UCS4 argument into an LPTSTR version
 793  * of that argument. The result is allocated here
 794  * and should be freed by the caller.
 795  */
 796 LPTSTR
 797 ucs4_to_lptstr(UCS *arg_ucs4)
 798 {
 799     LPTSTR ret_lptstr = NULL;
 800     size_t len;
 801     size_t i;
 802
 803     if(arg_ucs4){
 804         len = ucs4_strlen(arg_ucs4);
 805         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 806         /* bogus conversion ignores UTF-16 */
 807         for(i = 0; i < len; i++)
 808           ret_lptstr[i] = arg_ucs4[i];
 809
 810         ret_lptstr[len] = '\0';
 811     }
 812
 813     return(ret_lptstr);
 814 }
 815
 816
 817 /*
 818  * Convert an LPTSTR argument into a UCS4 version
 819  * of that argument. The result is MemAlloc'd here
 820  * and should be freed by the caller.
 821  */
 822 UCS *
 823 lptstr_to_ucs4(LPTSTR arg_lptstr)
 824 {
 825     UCS *ret_ucs4 = NULL;
 826     size_t len;
 827     size_t i;
 828
 829     if(arg_lptstr){
 830         len = _tcslen(arg_lptstr);
 831         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 832         /* bogus conversion ignores UTF-16 */
 833         for(i = 0; i < len; i++)
 834           ret_ucs4[i] = arg_lptstr[i];
 835
 836         ret_ucs4[len] = '\0';
 837     }
 838
 839     return(ret_ucs4);
 840 }
 841
 842 #endif /* _WINDOWS */
 843
 844
 845 /*
 846  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 847  * 1-at-a-time filled in with UCS characters. The return value is the
 848  * number of valid characters in obuf to be used. It can only
 849  * be 1 or 0 characters since we're only getting one UTF-8 character
 850  * at a time.
 851  */
 852 int
 853 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 854 {
 855     int  width = 0, outchars = 0;
 856
 857     if(!(cb && cb->cbufp))
 858       return(0);
 859
 860     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 861         unsigned char *inputp;
 862         unsigned long remaining_octets;
 863         UCS ucs;
 864
 865         *cb->cbufp++ = (unsigned char) c;
 866         inputp = cb->cbuf;
 867         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 868         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 869
 870         switch(ucs){
 871           case U8G_ENDSTRG:     /* incomplete character, wait */
 872           case U8G_ENDSTRI:     /* incomplete character, wait */
 873             break;
 874
 875           default:
 876             if(ucs & U8G_ERROR || ucs == UBOGON){
 877                 /*
 878                  * None of these cases is supposed to happen. If it
 879                  * does happen then the input stream isn't UTF-8
 880                  * so something is wrong.
 881                  */
 882                 outchars++;
 883                 *obuf = '?';
 884                 cb->cbufp = cb->cbuf;
 885                 width = 1;
 886             }
 887             else{
 888                 outchars++;
 889                 if(ucs < 0x80 && ucs >= 0x20)
 890                   width = 1;
 891
 892                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 893                     /*
 894                      * This happens when we have a UTF-8 character that
 895                      * we aren't able to print in our locale. For example,
 896                      * if the locale is setup with the terminal
 897                      * expecting ISO-8859-1 characters then there are
 898                      * lots of UTF-8 characters that can't be printed.
 899                      * Print a '?' instead.
 900                      * Don't think this should happen in Windows.
 901                      */
 902                     *obuf = '?';
 903                 }
 904                 else{
 905                     *obuf = ucs;
 906                 }
 907
 908                 /* update the input buffer */
 909                 if(inputp >= cb->cbufp) /* this should be the case */
 910                   cb->cbufp = cb->cbuf;
 911                 else{           /* extra chars for some reason? */
 912                     unsigned char *q, *newcbufp;
 913
 914                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 915                     q = cb->cbuf;
 916                     while(inputp < cb->cbufp)
 917                       *q++ = *inputp++;
 918
 919                     cb->cbufp = newcbufp;
 920                 }
 921             }
 922
 923             break;
 924         }
 925     }
 926     else{                       /* error */
 927         *obuf = '?';
 928         outchars = 1;
 929         width = 1;
 930         cb->cbufp = cb->cbuf;   /* start over */
 931     }
 932
 933     if(obufwidth)
 934       *obufwidth = width;
 935
 936     return(outchars);
 937 }
 938
 939
 940 /*
 941  * Return an allocated copy of a zero-terminated UCS-4 string.
 942  */
 943 UCS *
 944 ucs4_cpystr(UCS *ucs4src)
 945 {
 946     size_t         arraysize;
 947     UCS           *ret = NULL;
 948     size_t         i;
 949
 950     if(!ucs4src)
 951       return NULL;
 952
 953     arraysize = ucs4_strlen(ucs4src);
 954
 955     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 956     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 957
 958     for(i = 0; i < arraysize; i++)
 959       ret[i] = ucs4src[i];
 960
 961     return ret;
 962 }
 963
 964
 965 UCS *
 966 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 967 {
 968     size_t i;
 969
 970     if(ucs4src && ucs4dst){
 971         for(i = 0; i < n; i++){
 972             ucs4dst[i] = ucs4src[i];
 973             if(ucs4dst[i] == '\0')
 974               break;
 975         }
 976     }
 977
 978     return ucs4dst;
 979 }
 980
 981
 982 UCS *
 983 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 984 {
 985     size_t i;
 986     UCS *u;
 987
 988     if(ucs4src && ucs4dst){
 989         for(u = ucs4dst; *u; u++)
 990           ;
 991
 992         for(i = 0; i < n; i++){
 993             u[i] = ucs4src[i];
 994             if(u[i] == '\0')
 995               break;
 996         }
 997
 998         if(i == n)
 999           u[i] = '\0';
1000     }
1001
1002     return ucs4dst;
1003 }
1004
1005
1006 /*
1007  * Like strlen only this returns the number of non-zero characters
1008  * in a zero-terminated UCS-4 array.
1009  */
1010 size_t
1011 ucs4_strlen(UCS *ucs4str)
1012 {
1013     size_t i = 0;
1014
1015     if(ucs4str)
1016       while(ucs4str[i])
1017         i++;
1018
1019     return(i);
1020 }
1021
1022
1023 int
1024 ucs4_strcmp(UCS *s1, UCS *s2)
1025 {
1026     for(; *s1 == *s2; s1++, s2++)
1027       if(*s1 == '\0')
1028         return 0;
1029
1030     return((*s1 < *s2) ? -1 : 1);
1031 }
1032
1033
1034 UCS *
1035 ucs4_strchr(UCS *s, UCS c)
1036 {
1037     if(!s)
1038       return NULL;
1039
1040     while(*s && *s != c)
1041       s++;
1042
1043     if(*s || !c)
1044       return s;
1045     else
1046       return NULL;
1047 }
1048
1049
1050 UCS *
1051 ucs4_strrchr(UCS *s, UCS c)
1052 {
1053     UCS *ret = NULL;
1054
1055     if(!s)
1056       return ret;
1057
1058     while(*s){
1059         if(*s == c)
1060           ret = s;
1061
1062         s++;
1063     }
1064
1065     return ret;
1066 }
1067
1068
1069 /*
1070  * Returns the screen cells width of the UTF-8 string argument.
1071  */
1072 unsigned
1073 utf8_width(char *str)
1074 {
1075     unsigned width = 0;
1076     int this_width;
1077     UCS ucs;
1078     unsigned long remaining_octets;
1079     char *readptr;
1080
1081     if(!(str && *str))
1082       return(width);
1083
1084     readptr = str;
1085     remaining_octets = readptr ? strlen(readptr) : 0;
1086
1087     while(remaining_octets > 0 && *readptr){
1088
1089         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1090
1091         if(ucs & U8G_ERROR || ucs == UBOGON){
1092             /*
1093              * This should not happen, but do something to handle it anyway.
1094              * Treat each character as a single width character, which is what should
1095              * probably happen when we actually go to write it out.
1096              */
1097             remaining_octets--;
1098             readptr++;
1099             this_width = 1;
1100         }
1101         else{
1102             this_width = wcellwidth(ucs);
1103
1104             /*
1105              * If this_width is -1 that means we can't print this character
1106              * with our current locale. Writechar will print a '?'.
1107              */
1108             if(this_width < 0)
1109               this_width = 1;
1110         }
1111
1112         width += (unsigned) this_width;
1113     }
1114
1115     return(width);
1116 }
1117
1118
1119 /*
1120  * Copy UTF-8 characters from src into dst.
1121  * This is intended to be used if you want to truncate a string at
1122  * the start instead of the end. For example, you have a long string
1123  * like
1124  *       this_is_a_long_string
1125  * but not enough space to fit it into a particular field. You want to
1126  * end up with
1127  *             s_a_long_string
1128  * where that fits in a particular width. Perhaps you'd use this with ...
1129  * to get
1130  *          ...s_a_long_string
1131  * This right adjusts the end of the string in the width space and
1132  * cuts it off at the start. If there is enough width for the whole
1133  * string it will copy the string into dst with no padding.
1134  *
1135  * Copy enough characters so that the result will have screen width of
1136  * want_width screen cells in current locale.
1137  *
1138  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1139  *   to dst. This is just for protection, it shouldn't be relied on to
1140  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1141  *   characters truncated in the middle or something like that.
1142  *
1143  * Returned value is the number of bytes written to dst, not including
1144  *   the possible terminating null.
1145  *
1146  * If we can't hit want_width exactly because of double width characters
1147  *   then we will pad the end of the string with space in order to make
1148  *   the width exact.
1149  */
1150 size_t
1151 utf8_to_width_rhs(char *dst,            /* destination buffer */
1152                   char *src,            /* source string */
1153                   size_t dstlen,        /* space in dest */
1154                   unsigned want_width)  /* desired screen width */
1155 {
1156     int this_width;
1157     unsigned width_consumed = 0;
1158     UCS ucs;
1159     unsigned long remaining_octets;
1160     char *readptr, *goodreadptr, *savereadptr, *endptr;
1161     size_t nb = 0;
1162
1163     if(!src){
1164         if(dstlen > 0)
1165           dst[0] = '\0';
1166
1167         return nb;
1168     }
1169
1170     /*
1171      * Start at the end of the source string and go backwards until we
1172      * get to the desired width, but not more than the width.
1173      */
1174     readptr = src + strlen(src);
1175     endptr = readptr;
1176     goodreadptr = readptr;
1177     width_consumed = 0;
1178     savereadptr = readptr;
1179
1180     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1181         readptr = savereadptr-1){
1182
1183         savereadptr = readptr;
1184         remaining_octets = goodreadptr - readptr;
1185         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1186
1187         /*
1188          * Handling the error case is tough because an error will be the normal thing that
1189          * happens as we back through the string. So we're just going to punt on the
1190          * error for now.
1191          */
1192         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1193             if(remaining_octets > 0){
1194                 /*
1195                  * This means there are some bad octets after this good
1196                  * character so things are not going to work out well.
1197                  * Bail out.
1198                  */
1199                 savereadptr = src;      /* we're done */
1200             }
1201             else{
1202                 this_width = wcellwidth(ucs);
1203
1204                 if(this_width < 0)
1205                   this_width = 1;
1206
1207                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1208                     width_consumed += (unsigned) this_width;
1209                     goodreadptr = savereadptr;
1210                 }
1211                 else
1212                   savereadptr = src;    /* we're done */
1213             }
1214         }
1215     }
1216
1217     /*
1218      * Copy characters from goodreadptr to endptr into dst.
1219      */
1220     nb = MIN(endptr-goodreadptr, dstlen-1);
1221     strncpy(dst, goodreadptr, nb);
1222     dst[nb] = '\0';
1223
1224     /*
1225      * Pad out with spaces in order to hit width exactly.
1226      */
1227     while(width_consumed < want_width && nb < dstlen-1){
1228         dst[nb++] = ' ';
1229         dst[nb] = '\0';
1230         width_consumed++;
1231     }
1232
1233     return nb;
1234 }
1235
1236
/*
 * Pieces of one conversion specification as parsed by utf8_snprintf.
 */
struct utf8_snprintf_spec {
    int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    int min_field_width;        /* -1 when not given */
    int field_precision;        /* -1 when not given */
    int modifier;               /* 'h', 'l', 'L', or 0 when not given */
};


/*
 * Read the run of decimal digits *fmtp points at, leave *fmtp on the
 * first character after the digits, and return the value read.
 */
static int
utf8_snprintf_read_int(char **fmtp)
{
    int val = atoi(*fmtp);

    while(**fmtp >= '0' && **fmtp <= '9')
      (*fmtp)++;

    return val;
}


/*
 * Return the length modifier to pass on to the system snprintf for
 * conversion character conv, or 0 if the modifier should be dropped.
 * Only combinations that utf8_snprintf fetches with a matching
 * argument type are kept.
 */
static int
utf8_snprintf_keep_modifier(int modifier, int conv)
{
    switch(conv){
      case 'd': case 'i': case 'o':
      case 'x': case 'X': case 'u':
        return((modifier == 'h' || modifier == 'l') ? modifier : 0);

      case 'f': case 'e': case 'E':
      case 'g': case 'G':
        return((modifier == 'L') ? modifier : 0);

      default:
        return 0;
    }
}


/*
 * Write into newfmt a single conversion specification made of the
 * flags, width, precision and modifier held in spec followed by the
 * conversion character conv. Width and precision are always written
 * as literal numbers, never as '*'.
 */
static void
utf8_snprintf_spec_to_fmt(char *newfmt, size_t newfmtlen,
                          const struct utf8_snprintf_spec *spec, int conv)
{
    char flags[6], width[20], prec[21], mod[2];
    char *f = flags;

    if(spec->flags_minus)
      *f++ = '-';
    if(spec->flags_plus)
      *f++ = '+';
    if(spec->flags_space)
      *f++ = ' ';
    if(spec->flags_zero)
      *f++ = '0';
    if(spec->flags_pound)
      *f++ = '#';

    *f = '\0';

    width[0] = prec[0] = mod[0] = mod[1] = '\0';

    if(spec->min_field_width >= 0)
      snprintf(width, sizeof(width), "%d", spec->min_field_width);

    if(spec->field_precision >= 0)
      snprintf(prec, sizeof(prec), ".%d", spec->field_precision);

    if(spec->modifier)
      mod[0] = (char) spec->modifier;

    snprintf(newfmt, newfmtlen, "%%%s%s%s%s%c", flags, width, prec, mod, conv);
}


/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a trailing space character if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed. We'll see how
 * that works for us.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Besides %w the conversions d i o x X u c s f e E g G p and %% are
 * understood. The h and l length modifiers are honored for the
 * integer conversions (an l argument is fetched as long or unsigned
 * long) and L is honored for the floating point conversions (the
 * argument is fetched as long double). A modifier on any other
 * conversion is ignored. Any other conversion character trips an
 * assert.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written. In that
 * case there is no room left for the trailing null and dest is
 * not terminated.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    struct utf8_snprintf_spec spec;
    char    newfmt[100], *pdest, *end;
    char   *input_str;
    unsigned got_width;
    size_t  nbytes;
    int     more_flags, ret, w, pad;
    va_list args;

    newfmt[0] = '\0';

    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)                   \
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: Look through the fmt string for %w's. Replace the
     * %w's in the format string with %s's but with possibly different
     * width and precision arguments which will make it come out right.
     * Then call the regular system vsnprintf with the altered format
     * string but same arguments.
     *
     * That would be nice but it doesn't quite work. Why? Because a
     * %*w will need to have the value in the integer argument the *
     * refers to modified. Can't do it as far as I can tell. Or we could
     * remove the integer argument somehow before calling printf. Can't
     * do it. Or we could somehow add an additional conversion specifier
     * that caused nothing to be printed but ate up the integer arg.
     * Can't figure out how to do that either.
     *
     * Since we can't figure out how to do it, the alternative is to
     * construct the result one piece at a time, pasting together the
     * pieces from the different conversions.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){
        if(*fmt == '%'){
            fmt++;

            spec.min_field_width = spec.field_precision = -1;
            spec.flags_minus = spec.flags_plus = spec.flags_space = 0;
            spec.flags_zero = spec.flags_pound = 0;
            spec.modifier = 0;
            w = 0;

            /* flags */
            more_flags = 1;
            while(more_flags){
                switch(*fmt){
                  case '-':
                    spec.flags_minus++;
                    fmt++;
                    break;

                  case '+':
                    spec.flags_plus++;
                    fmt++;
                    break;

                  case ' ':
                    spec.flags_space++;
                    fmt++;
                    break;

                  case '0':
                    spec.flags_zero++;
                    fmt++;
                    break;

                  case '#':
                    spec.flags_pound++;
                    fmt++;
                    break;

                  default:
                    more_flags = 0;
                    break;
                }
            }

            /* minimum field width */
            if(*fmt == '*'){
                spec.min_field_width = va_arg(args, int);
                fmt++;
            }
            else if(*fmt >= '0' && *fmt <= '9')
              spec.min_field_width = utf8_snprintf_read_int(&fmt);

            /* field precision */
            if(*fmt == '.'){
                fmt++;
                if(*fmt == '*'){
                    spec.field_precision = va_arg(args, int);
                    fmt++;
                }
                else if(*fmt >= '0' && *fmt <= '9')
                  spec.field_precision = utf8_snprintf_read_int(&fmt);
            }

            /* length modifier */
            if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
              spec.modifier = *fmt++;

            /* conversion character */
            switch(*fmt){
              case 'w':
                /*
                 * work with va_arg(char *) to figure out width
                 * and precision needed to produce the screen width
                 * and precision asked for in %w using some of the
                 * utf8 width routines we have.
                 */

                input_str = va_arg(args, char *);
                if(spec.field_precision >= 0 || spec.min_field_width >= 0)
                  w = utf8_width(input_str);

                if(spec.field_precision >= 0){
                    if(w <= spec.field_precision)
                      spec.field_precision = -1;  /* print it all */
                    else{
                        /*
                         * We need to cut off some of the input_str
                         * in this case. The precision handed to the
                         * system snprintf becomes a byte count.
                         */
                        end = utf8_count_forw_width(input_str, spec.field_precision, &got_width);
                        spec.field_precision = (int) (end - input_str);
                        /* new w with this field_precision */
                        w = got_width;
                    }
                }

                /*
                 * need some padding: the byte width is the number of
                 * bytes that will be printed plus the missing cells
                 */
                if(spec.min_field_width >= 0){
                    nbytes = (spec.field_precision >= 0) ? (size_t) spec.field_precision
                                                         : strlen(input_str);
                    pad = (spec.min_field_width > w) ? spec.min_field_width - w : 0;
                    spec.min_field_width = (int) (nbytes + pad);
                }

                /*
                 * Now we just need to get the new format string
                 * set correctly in newfmt. It is printed as a plain %s.
                 */
                spec.modifier = 0;
                utf8_snprintf_spec_to_fmt(newfmt, sizeof(newfmt), &spec, 's');

                snprintf(pdest, size - (pdest-dest), newfmt, input_str);
                pdest += strlen(pdest);

                break;

              case '\0':
                /* format ended inside a specifier, back up so the loop sees the end */
                fmt--;
                break;

              default:
                /* make a new format which leaves out the dynamic '*' arguments */
                spec.modifier = utf8_snprintf_keep_modifier(spec.modifier, *fmt);
                utf8_snprintf_spec_to_fmt(newfmt, sizeof(newfmt), &spec, *fmt);

                switch(*fmt){
                  case 'd': case 'i':
                    if(spec.modifier == 'l')
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long));
                    else
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));

                    pdest += strlen(pdest);
                    break;

                  case 'o': case 'x': case 'X': case 'u':
                    if(spec.modifier == 'l')
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, unsigned long));
                    else
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, unsigned int));

                    pdest += strlen(pdest);
                    break;

                  case 'c':
                    snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));
                    pdest += strlen(pdest);
                    break;

                  case 's':
                    input_str = va_arg(args, char *);
                    snprintf(pdest, size - (pdest-dest), newfmt, input_str);
                    pdest += strlen(pdest);
                    break;

                  case 'f': case 'e': case 'E':
                  case 'g': case 'G':
                    if(spec.modifier == 'L')
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long double));
                    else
                      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, double));

                    pdest += strlen(pdest);
                    break;

                  case 'p':
                    snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, void *));
                    pdest += strlen(pdest);
                    break;

                  case '%':
                    if(IS_ROOM_IN_DEST(1))
                      *pdest++ =  '%';

                    break;

                  default:
                    /* didn't think of this type */
                    assert(0);
                    break;
                }

                break;
            }

            fmt++;
        }
        else{
            if(IS_ROOM_IN_DEST(1))
              *pdest++ = *fmt++;
        }
    }

    ret = pdest - dest;

    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1583
1584
1585 /*
1586  * Copy UTF-8 characters from src into dst.
1587  * Copy enough characters so that the result will have (<=) screen width of
1588  * want_width screen cells in current locale.
1589  *
1590  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1591  *   to dst.
1592  *
1593  * Returned value is the number of bytes written to dst, not including
1594  *   the possible terminating null.
1595  * Got_width is another returned value. It is the width in screen cells of
1596  *   the string placed in dst. It will be the same as want_width if there
1597  *   are enough characters in the src to do that and if the character widths
1598  *   hit the width exactly. It will be less than want_width if we run out
1599  *   of src characters or if the next character width would skip over the
1600  *   width we want, because it is double width.
1601  *
1602  * Zero width characters are collected and included at the end of the string.
1603  *   That is, if we make it to want_width but there is still a zero length
1604  *   character sitting in src, we add that to dst. This might be an accent
1605  *   or something like that.
1606  */
1607 size_t
1608 utf8_to_width(char *dst,                /* destination buffer */
1609               char *src,                /* source string */
1610               size_t dstlen,            /* space in dst */
1611               unsigned want_width,      /* desired screen width */
1612               unsigned *got_width)      /* returned screen width in dst */
1613 {
1614     int this_width;
1615     unsigned width_consumed = 0;
1616     UCS ucs;
1617     unsigned long remaining_octets;
1618     char *writeptr, *readptr, *savereadptr, *endptr;
1619     int ran_out_of_space = 0;
1620
1621     readptr = src;
1622
1623     remaining_octets = readptr ? strlen(readptr) : 0;
1624
1625     writeptr = dst;
1626     endptr = writeptr + dstlen;
1627
1628     if(readptr && writeptr){
1629       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1630         savereadptr = readptr;
1631         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1632
1633         if(ucs & U8G_ERROR || ucs == UBOGON)
1634           remaining_octets = 0;
1635         else{
1636           this_width = wcellwidth(ucs);
1637
1638           /*
1639            * If this_width is -1 that means we can't print this character
1640            * with our current locale. Writechar will print a '?'.
1641            */
1642           if(this_width < 0)
1643             this_width = 1;
1644
1645           if(width_consumed + (unsigned) this_width <= want_width){
1646             /* append this utf8 character to dst if it will fit */
1647             if(writeptr + (readptr - savereadptr) < endptr){
1648               width_consumed += this_width;
1649               while(savereadptr < readptr)
1650                 *writeptr++ = *savereadptr++;
1651             }
1652             else
1653               ran_out_of_space++;       /* no more utf8 to dst */
1654           }
1655           else
1656             remaining_octets = 0;       /* we're done */
1657         }
1658       }
1659
1660       if(writeptr < endptr)
1661         *writeptr = '\0';
1662     }
1663
1664     if(got_width)
1665       *got_width = width_consumed;
1666
1667     return(writeptr ? (writeptr - dst) : 0);
1668 }
1669
1670
1671 /*
1672  * Str is a UTF-8 string.
1673  * Count forward width screencell positions and return a pointer to the
1674  * end of the string that is width wide.
1675  * The returned pointer points at the next character (where the null would
1676  * be placed).
1677  *
1678  * Got_width is another returned value. It is the width in screen cells of
1679  *   the string from str to the returned pointer. It will be the same as
1680  *   want_width if there are enough characters in the str to do that
1681  *   and if the character widths hit the width exactly. It will be less
1682  *   than want_width if we run out of characters or if the next character
1683  *   width would skip over the width we want, because it is double width.
1684  */
1685 char *
1686 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1687 {
1688     int this_width;
1689     unsigned width_consumed = 0;
1690     UCS ucs;
1691     unsigned long remaining_octets;
1692     char *readptr;
1693     char *retptr;
1694
1695     retptr = readptr = str;
1696
1697     remaining_octets = readptr ? strlen(readptr) : 0;
1698
1699     while(width_consumed <= want_width && remaining_octets > 0){
1700
1701         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1702
1703         if(ucs & U8G_ERROR || ucs == UBOGON){
1704             /*
1705              * This should not happen, but do something to handle it anyway.
1706              * Treat each character as a single width character, which is what should
1707              * probably happen when we actually go to write it out.
1708              */
1709             remaining_octets--;
1710             readptr++;
1711             this_width = 1;
1712         }
1713         else{
1714             this_width = wcellwidth(ucs);
1715
1716             /*
1717              * If this_width is -1 that means we can't print this character
1718              * with our current locale. Writechar will print a '?'.
1719              */
1720             if(this_width < 0)
1721               this_width = 1;
1722         }
1723
1724         if(width_consumed + (unsigned) this_width <= want_width){
1725             width_consumed += (unsigned) this_width;
1726             retptr = readptr;
1727         }
1728         else
1729           remaining_octets = 0; /* we're done */
1730     }
1731
1732     if(got_width)
1733       *got_width = width_consumed;
1734
1735     return(retptr);
1736 }
1737
1738
1739 /*
1740  * Copy a null terminator into a UTF-8 string in place so that the string is
1741  * no more than a certain screen width wide. If the string is already less
1742  * than or equal in width to the requested width, no change is made.
1743  *
1744  * The actual width accomplished is returned. Note that it may be less than
1745  * max_width due to double width characters as well as due to the fact that
1746  * it fits wholly in the max_width.
1747  *
1748  * Returned value is the actual screen width of str when done.
1749  *
1750  * A side effect is that a terminating null may have been written into
1751  * the passed in string.
1752  */
1753 unsigned
1754 utf8_truncate(char *str, unsigned max_width)
1755 {
1756     int this_width;
1757     unsigned width_consumed = 0;
1758     UCS ucs;
1759     unsigned long remaining_octets;
1760     char *readptr, *savereadptr;
1761
1762     readptr = str;
1763
1764     remaining_octets = readptr ? strlen(readptr) : 0;
1765
1766     if(readptr){
1767       while(width_consumed <= max_width && remaining_octets > 0){
1768
1769         savereadptr = readptr;
1770         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1771
1772         if(ucs & U8G_ERROR || ucs == UBOGON){
1773             /*
1774              * This should not happen, but do something to handle it anyway.
1775              * Treat each character as a single width character, which is what should
1776              * probably happen when we actually go to write it out.
1777              */
1778             remaining_octets--;
1779             readptr++;
1780             this_width = 1;
1781         }
1782         else{
1783             this_width = wcellwidth(ucs);
1784
1785             /*
1786              * If this_width is -1 that means we can't print this character
1787              * with our current locale. Writechar will print a '?'.
1788              */
1789             if(this_width < 0)
1790               this_width = 1;
1791         }
1792
1793         if(width_consumed + (unsigned) this_width <= max_width){
1794             width_consumed += (unsigned) this_width;
1795         }
1796         else{
1797             remaining_octets = 0;       /* we're done */
1798             *savereadptr = '\0';
1799         }
1800       }
1801     }
1802
1803     return(width_consumed);
1804 }
1805
1806
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad on
 * left or right according to left_adjust argument.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. Dst will be null terminated if there is enough room, but not
 *   if that would overflow dst's len.
 *   If dst is NULL nothing is written and 0 is returned.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,            /* destination buffer */
                  char *src,            /* source string */
                  size_t dstlen,        /* space in dst */
                  unsigned want_width,  /* desired screen width */
                  int left_adjust)      /* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   need_more, howmany;
    size_t   len_left, bytes_used;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* each padding space is one byte and one screen cell */
    need_more = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    howmany = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string: slide the existing
             * string over (the regions overlap, so memmove) and
             * fill the gap with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1874
1875
1876 /*
1877  * Str is a UTF-8 string.
1878  * Start_here is a pointer into the string. It points one position past
1879  * the last byte that should be considered a part of the length string.
1880  * Count back want_width screencell positions and return a pointer to the
1881  * start of the string that is want_width wide and ends with start_here.
1882  *
1883  * Since characters may be more than one cell width wide we may end up
1884  * skipping over the exact width. That is, if we need to we'll go back
1885  * too far (by one cell width). Account for that in the call by looking
1886  * at got_width.
1887  *
1888  * Note that this call gives a possible got_width == want_width+1 as
1889  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1890  * That was just what was needed at the time, maybe it needs to be
1891  * optional.
1892  */
1893 char *
1894 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1895 {
1896     unsigned width_consumed = 0;
1897     int this_width;
1898     UCS ucs;
1899     unsigned long remaining_octets;
1900     char *ptr, *savereadptr, *goodreadptr;
1901
1902     savereadptr = start_here;
1903     goodreadptr = start_here;
1904
1905     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1906
1907         savereadptr = ptr;
1908         remaining_octets = goodreadptr - ptr;
1909         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1910
1911         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1912           if(remaining_octets > 0){
1913               /*
1914                * This means there are some bad octets after this good
1915                * character so things are not going to work out well.
1916                * Bail out.
1917                */
1918               savereadptr = str;        /* we're done */
1919           }
1920           else{
1921             this_width = wcellwidth(ucs);
1922
1923             /*
1924              * If this_width is -1 that means we can't print this character
1925              * with our current locale. Writechar will print a '?'.
1926              */
1927             if(this_width < 0)
1928               this_width = 1;
1929
1930             width_consumed += (unsigned) this_width;
1931             goodreadptr = savereadptr;
1932           }
1933         }
1934     }
1935
1936     if(got_width)
1937       *got_width = width_consumed;
1938
1939     return(savereadptr);
1940 }
1941
1942
/*----------------------------------------------------------------------
  Append at most n characters of s at *d, leaving *d pointing at the end
  of what was copied.

  If the terminating null of s is reached within n characters it is
  copied too and *d is left pointing at it. If n characters are copied
  without reaching the null, nothing is terminated and *d points just
  past the last character written.

  The point is to avoid walking a string twice when appending
  (i.e., strcpy(t, x); t += strlen(t)).

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int  left;
    char c;

    for(left = n; left > 0; left--){
        c = *s++;
        **d = c;

        if(c == '\0')           /* copied the terminator, stay on it */
          break;

        (*d)++;
    }
}
1958
1959
1960 /*
1961  * If use_system_routines is set then NULL is the return value and it is
1962  * not an error. Display_charmap and keyboard_charmap should come over as
1963  * malloced strings and will be filled in with the result.
1964  *
1965  * Returns a void pointer to the input_cs CHARSET which is
1966  * passed to mbtow via kbseq().
1967  * If !use_system_routines && NULL is returned, that is an error and err should
1968  * have a message.
1969  * display_charmap and keyboard_charmap should be malloced data and may be
1970  * realloced and changed here.
1971  */
1972 int
1973 setup_for_input_output(int use_system_routines, char **display_charmap,
1974                        char **keyboard_charmap, void **input_cs_arg, char **err)
1975 {
1976     const CHARSET *cs;
1977     const CHARSET *input_cs = NULL;
1978     int already_tried = 0;
1979     int supported = 0;
1980     char buf[1000];
1981
1982 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1983
1984     if(err)
1985       *err = NULL;
1986
1987     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1988         *err = cpstr("Bad call to setup_for_input_output");
1989         return(-1);
1990     }
1991
1992     if(use_system_routines){
1993 #if     PREREQ_FOR_SYS_TRANSLATION
1994         char *dcm;
1995
1996         dcm = nl_langinfo_codeset_wrapper();
1997         dcm = dcm ? dcm : "US-ASCII";
1998
1999         init_utf8_display(0, NULL);
2000         if(*display_charmap){
2001             if(dcm && strucmp(*display_charmap, dcm)){
2002                 snprintf(buf, sizeof(buf),
2003                  _("Display character set \"%s\" is ignored when using system translation"),
2004                      *display_charmap);
2005
2006                 *err = cpstr(buf);
2007             }
2008
2009             fs_give((void **) display_charmap);
2010         }
2011
2012         if(*keyboard_charmap){
2013             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
2014                 snprintf(buf, sizeof(buf),
2015                  _("Keyboard character set \"%s\" is ignored when using system translation"),
2016                      *keyboard_charmap);
2017
2018                 *err = cpstr(buf);
2019             }
2020
2021             fs_give((void **) keyboard_charmap);
2022         }
2023
2024         *display_charmap = cpstr(dcm);
2025         *keyboard_charmap = cpstr(dcm);
2026 #else
2027         *err = cpstr("Bad call to setup_for_input_output");
2028 #endif
2029
2030         *input_cs_arg = NULL;
2031         return(0);
2032     }
2033
2034
2035 try_again1:
2036     if(!(*display_charmap))
2037       *display_charmap = cpstr("US-ASCII");
2038
2039     if(!(*keyboard_charmap))
2040       *keyboard_charmap = cpstr(*display_charmap);
2041
2042     if(*keyboard_charmap){
2043         supported = input_charset_is_supported(*keyboard_charmap);
2044
2045         if(supported){
2046             if(!strucmp(*keyboard_charmap, "utf-8"))
2047               input_cs = utf8_charset(*keyboard_charmap);
2048             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2049               input_cs = cs;
2050         }
2051         else{
2052             if(err && !*err){
2053                 int iso2022jp = 0;
2054
2055                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2056                   iso2022jp = 1;
2057
2058                 snprintf(buf, sizeof(buf),
2059                      /* TRANSLATORS: The first argument is the name of the character
2060                         set the user is trying to use (which is unsupported by alpine).
2061                         The second argument is " (except for posting)" if they are
2062                         trying to use ISO-2022-JP for something other than posting. */
2063                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2064                      *keyboard_charmap,
2065                      iso2022jp ? _(" (except for posting)") : "");
2066
2067                 *err = cpstr(buf);
2068             }
2069
2070             input_cs = NULL;
2071             fs_give((void **) keyboard_charmap);
2072             *keyboard_charmap = cpstr("US-ASCII");
2073             if(!already_tried){
2074                 already_tried++;
2075                 goto try_again1;
2076             }
2077         }
2078     }
2079
2080
2081 try_again2:
2082     if(!(*display_charmap))
2083       *display_charmap = cpstr("US-ASCII");
2084
2085     if(*display_charmap){
2086         supported = output_charset_is_supported(*display_charmap);
2087         if(supported){
2088             if(!strucmp(*display_charmap, "utf-8"))
2089               init_utf8_display(1, NULL);
2090             else if((cs = utf8_charset(*display_charmap)) != NULL)
2091               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2092         }
2093         else{
2094             if(err && !*err){
2095                 int iso2022jp = 0;
2096
2097                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2098                   iso2022jp = 1;
2099
2100                 snprintf(buf, sizeof(buf),
2101                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2102                      *display_charmap,
2103                      iso2022jp ? _(" (except for posting)") : "");
2104
2105                 *err = cpstr(buf);
2106             }
2107
2108             fs_give((void **) display_charmap);
2109             if(!already_tried){
2110                 already_tried++;
2111                 goto try_again2;
2112             }
2113         }
2114     }
2115     else{
2116         if(err && !*err)
2117           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2118     }
2119
2120 #undef cpstr
2121
2122     *input_cs_arg = (void *) input_cs;
2123
2124     return(0);
2125 }
2126
2127
2128 int
2129 input_charset_is_supported(char *input_charset)
2130 {
2131     const CHARSET *cs;
2132
2133     if(!(input_charset && *input_charset))
2134       return 0;
2135
2136     if(!strucmp(input_charset, "utf-8"))
2137       return 1;
2138
2139     if((cs = utf8_charset(input_charset)) != NULL){
2140
2141         /*
2142          * This was true 2006-09-25.
2143          */
2144         switch(cs->type){
2145           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2146           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2147           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2148           case CT_UCS4: case CT_UTF16:
2149             return 1;
2150             break;
2151
2152           default:
2153             break;
2154         }
2155     }
2156
2157     return 0;
2158 }
2159
2160
2161 int
2162 output_charset_is_supported(char *output_charset)
2163 {
2164     const CHARSET *cs;
2165
2166     if(!(output_charset && *output_charset))
2167       return 0;
2168
2169     if(!strucmp(output_charset, "utf-8"))
2170       return 1;
2171
2172     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2173       return 1;
2174
2175     return 0;
2176 }
2177
2178
/*
 * Returns 1 if posting_charset can be used for posting, 0 if not.
 * ISO-2022-JP is accepted here as a special case, anything else has to
 * pass output_charset_is_supported().
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return 0;

    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return 1;

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
2186
2187
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * Result: a charset name that passes output_charset_is_supported(), or
 *         NULL if we can't come up with one. The result is not malloced
 *         and must not be freed. It may point to a static buffer that
 *         is overwritten by the next call.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        /*
         * NOTE(review): "EUCKP"/"EUC-KP" looks like it may have been
         * meant to be "EUCKR"/"EUC-KR". Left as is, needs confirming.
         */
        else if(!strucmp(ret, "EUCKP"))
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else{
            char *p;

            /* check for digits after 8859 */
            p = strstr(ret, "8859");
            if(p){
                p += 4;

                /*
                 * Skip a single separator character, but never step
                 * over the terminating null. If the name ends right
                 * after "8859" there is nothing more to look at.
                 */
                if(*p && !isdigit((unsigned char) *p))
                  p++;

                /* cast because isdigit is undefined for negative chars */
                if(isdigit((unsigned char) *p)){
                    static char buf[12];        /* ISO-8859-NN + null */

                    memset(buf, 0, sizeof(buf));
                    strncpy(buf, "ISO-8859-", sizeof(buf));
                    buf[9] = *p++;
                    if(isdigit((unsigned char) *p))
                      buf[10] = *p;

                    ret = buf;
                }
            }
        }
    }

    /* whatever we ended up with still has to be usable */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2253
2254
2255 /*
2256  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2257  * needed the return value will point to orig. If a conversion is done,
2258  * the return string should be freed by the caller.
2259  * If not possible, returns NULL.
2260  */
2261 char *
2262 utf8_to_charset(char *orig, char *charset, int report_err)
2263 {
2264     SIZEDTEXT src, dst;
2265     char *ret = orig;
2266
2267     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2268       return ret;
2269
2270     src.size = strlen(orig);
2271     src.data = (unsigned char *) orig;
2272
2273     if(!strucmp(charset, "us-ascii")){
2274         size_t i;
2275
2276         for(i = 0; i < src.size; i++)
2277           if(src.data[i] & 0x80)
2278             return NULL;
2279
2280         return ret;
2281     }
2282
2283     /*
2284      * This works for ISO-2022-JP because of special code in utf8_cstext
2285      * but not for other 2022 charsets.
2286      */
2287     memset(&dst, 0, sizeof(dst));
2288     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2289       ret = (char *) dst.data;          /* c-client already null terminates it */
2290     else
2291       ret = NULL;
2292
2293     if((unsigned char *) ret != dst.data && dst.data)
2294       fs_give((void **) &dst.data);
2295
2296     return ret;
2297 }
2298
2299
/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string. Negative numbers
 *                 get a leading '-' followed by the grouped digits.
 *
 * Result: pointer to static string representing number with commas
 * Can use up to 3 comatose results at once.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          digits[3 * sizeof(long) + 2];
    char         *b, *last;
    unsigned long magnitude;
    int           ndigits, i;

    whichbuf = (whichbuf + 1) % 3;

    b    = buf[whichbuf];
    last = buf[whichbuf] + sizeof(buf[0]) - 1;  /* reserved for the null */

    /*
     * Work with the magnitude as an unsigned long so that the digits
     * never carry a sign of their own and LONG_MIN can't overflow.
     */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
                             : (unsigned long) number;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if(ndigits >= (int) sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    if(number < 0)
      *b++ = '-';

    /*
     * Copy the digits, putting a comma in front of each group of three
     * counting from the right. Doing it by digit position instead of by
     * dividing by a fixed power of 1000 means that any size long gets
     * grouped correctly.
     */
    for(i = 0; i < ndigits && b < last; i++){
        if(i > 0 && (ndigits - i) % 3 == 0){
            *b++ = ',';
            if(b >= last)
              break;
        }

        *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
2344
2345
/*
 * Like comatose() but leave out the commas.
 *
 * Result: pointer to static string holding the decimal form of number.
 * Can use up to 3 tose results at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    /* move on to the next of the three result buffers */
    slot = (slot + 1) % 3;
    out  = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2359
2360
2361 /*
2362  * line_paint - where the real work of managing what is displayed gets done.
2363  */
2364 void
2365 line_paint(int offset,                  /* current dot offset into vl */
2366            struct display_line *displ,
2367            int *passwd)                 /* flag to hide display of chars */
2368 {
2369     int i, w, w2, already_got_one = 0;
2370     int vfirst, vlast, dfirst, dlast, vi, di;
2371     int new_vbase;
2372     unsigned (*width_a_to_b)(UCS *, int, int);
2373
2374     /*
2375      * Set passwd to 10 in caller if you want to conceal the
2376      * password but not print asterisks for feedback.
2377      *
2378      * Set passwd to 1 in caller to conceal by printing asterisks.
2379      */
2380     if(passwd && *passwd >= 10){        /* don't show asterisks */
2381         if(*passwd > 10)
2382           return;
2383         else
2384           *passwd = 11;         /* only blat once */
2385
2386         i = 0;
2387         (*displ->movecursor)(displ->row, displ->col);
2388         while(i++ <= displ->dwid)
2389           (*displ->writechar)(' ');
2390
2391         (*displ->movecursor)(displ->row, displ->col);
2392         return;
2393     }
2394
2395     if(passwd && *passwd)
2396       width_a_to_b = single_width_chars_a_to_b;
2397     else
2398       width_a_to_b = ucs4_str_width_a_to_b;
2399
2400     /*
2401      * vl is the virtual line (the actual data). We operate on it by typing
2402      * characters to be added and deleting and so forth. In this routine we
2403      * copy a subset of those UCS-4 characters in vl into dl, the display
2404      * array, and show that subset on the screen.
2405      *
2406      * Offset is the location of the cursor in vl.
2407      *
2408      * We will display the string starting from vbase.
2409      * We have dwid screen cells to work in.
2410      * We may have to adjust vbase in order to display the
2411      * part of the string that contains the cursor.
2412      *
2413      * We'll make the display look like
2414      *   vl    a b c d e f g h i j k l m
2415      *             xxxxxxxxxxxxx  <- width dwid window
2416      *             < d e f g h >
2417      *               |
2418      *             vbase
2419      * The < will be there if vbase > 0.
2420      * The > will be there if the string from vbase to the
2421      * end can't all fit in the window.
2422      */
2423
2424     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2425
2426     /*
2427      * Adjust vbase so offset is not out of the window to the right.
2428      * (The +2 in w + 2 is for a possible " >" if the string goes past
2429      *  the right hand edge of the window and if the last visible character
2430      * is double wide. We don't want the offset to be under that > character.)
2431      */
2432     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2433         displ->dwid > 1 &&
2434         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2435         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2436         /*
2437          * offset is off the window to the right
2438          * It looks like   a b c d e f g h
2439          *                   |         |
2440          *               vbase         offset
2441          * and offset is either past the right edge,
2442          * or right at the right edge (and maybe under >),
2443          * or one before right at the edge (and maybe on space
2444          * for half a character).
2445          *
2446          * Since the characters may be double width it is slightly
2447          * complicated to figure out how far to increase vbase.
2448          * We're going to scoot over past width w/2 characters and
2449          * then see if that's sufficient.
2450          */
2451         new_vbase = displ->vbase + 1;
2452         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2453             w2 < displ->dwid/2;
2454             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2455           new_vbase++;
2456
2457         displ->vbase = new_vbase;
2458     }
2459
2460     /* adjust so offset is not out of the window to the left */
2461     while(displ->vbase > 0 && displ->vbase >= offset){
2462         /* add about dwid/2 more width */
2463         new_vbase = displ->vbase - 1;
2464         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2465             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2466             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2467           new_vbase--;
2468
2469         /* but don't let it get too small, recheck off right end */
2470         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2471             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2472             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2473           new_vbase++;
2474
2475         displ->vbase = MAX(new_vbase, 0);
2476     }
2477
2478     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2479       displ->vbase = 0;
2480
2481     vfirst = displ->vbase;
2482     dfirst = 0;
2483     if(displ->vbase > 0){                       /* off screen cue left */
2484         dfirst = 1;                             /* index which matches vfirst */
2485         displ->dl[0] = '<';
2486     }
2487
2488     vlast = displ->vused-1;                     /* end */
2489     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2490
2491     if(displ->dwid > 0 && w + dfirst > displ->dwid){                    /* off window right */
2492
2493         /* find last ucs character to be printed */
2494         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2495           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2496
2497         /* worry about double-width characters */
2498         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2499             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2500             displ->dl[dlast] = '>';
2501         }
2502         else{
2503             dlast = dfirst + vlast - vfirst + 1;
2504             displ->dl[dlast++] = ' ';
2505             displ->dl[dlast] = '>';
2506         }
2507     }
2508     else
2509       dlast = dfirst + vlast - vfirst;
2510
2511     /*
2512      * Copy the relevant part of the virtual line into the display line.
2513      */
2514     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2515       if(passwd && *passwd)
2516         displ->dl[di] = '*';            /* to conceal password */
2517       else
2518         displ->dl[di] = displ->vl[vi];
2519
2520     /*
2521      * Add spaces to clear the rest of the line.
2522      * We have dwid total space to fill.
2523      */
2524     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2525     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2526       displ->dl[di++] = ' ';
2527
2528     /*
2529      * Draw from left to right, skipping until we get to
2530      * something that is different. Characters may be different
2531      * widths than they were initially so paint from there the
2532      * rest of the way.
2533      */
2534     for(di = 0; displ->dl[di]; di++){
2535         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2536             /* move cursor first time */
2537             if(!already_got_one++){
2538                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2539                 (*displ->movecursor)(displ->row, displ->col + w);
2540             }
2541
2542             (*displ->writechar)(displ->dl[di]);
2543             displ->olddl[di] = displ->dl[di];
2544         }
2545     }
2546
2547     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2548
2549     /*
2550      * Move the cursor to the offset.
2551      *
2552      * The offset is relative to the start of the virtual array. We need
2553      * to find the location on the screen. The offset into the display array
2554      * will be offset-vbase+dfirst. We want to be at the start of that
2555      * character, so we need to find the width of all the characters up
2556      * to that point.
2557      */
2558     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2559
2560     (*displ->movecursor)(displ->row, displ->col + w);
2561 }
2562
2563
2564 /*
2565  * This is just like ucs4_str_width_a_to_b() except all of the characters
2566  * are assumed to be of width 1. This is for printing out *'s when user
2567  * enters a password, while still managing to use the same code to do the
2568  * display.
2569  */
2570 unsigned
2571 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2572 {
2573     unsigned width = 0;
2574     int i;
2575
2576     if(ucsstr)
2577       for(i = a; i <= b && ucsstr[i]; i++)
2578         width++;
2579
2580     return width;
2581 }