pith/charconv/utf8.c

   1 /*
   2  * ========================================================================
   3  * Copyright 2013-2022 Eduardo Chappa
   4  * Copyright 2006-2008 University of Washington
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * ========================================================================
  13  */
  14
  15
  16 /* includable WITHOUT dependency on c-client */
  17 #include "../../c-client/mail.h"
  18 #include "../../c-client/utf8.h"
  19
  20 #ifdef _WINDOWS
  21 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  22 #undef ERROR
  23 #else
  24 #define _XOPEN_SOURCE
  25 #endif
  26
  27 #include <system.h>
  28
  29 #include "../../c-client/fs.h"
  30
  31 /* includable WITHOUT dependency on pico */
  32 #include "../../pico/keydefs.h"
  33
  34 #include "../osdep/collate.h"
  35 #include "../filttype.h"
  36
  37 #include "utf8.h"
  38
  39 #include <stdarg.h>
  40
  41
/* Forward declaration; the definition is not in this part of the file. */
unsigned single_width_chars_a_to_b(UCS *, int, int);


/* Name of the locale's character map, stored by set_locale_charmap(). */
static char locale_charmap[50];

/* Display settings recorded by init_utf8_display(). */
static int   native_utf8;	/* non-zero when the display is UTF-8 capable */
static void *display_data;	/* map handed to ucs4_rmaplen()/ucs4_rmapbuf() */
  49
  50 void
  51 init_utf8_display(int utf8, void *rmap)
  52 {
  53     native_utf8 = utf8;
  54     display_data = rmap;
  55 }
  56
  57
  58 /*
  59  * Argument is a UCS-4 wide character.
  60  * Returns the environment dependent cell width of the
  61  * character when printed to the screen.
  62  * This will be -1 if the character is not printable.
  63  * It will be >= zero if it is printable.
  64  *
  65  * Note that in the case it is not printable but it is still sent to
  66  * Writechar, Writechar will print a '?' with width 1.
  67  */
  68 int
  69 wcellwidth(UCS ucs)
  70 {
  71     char dummy[32];
  72     long w;
  73
  74     /*
  75      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  76      * That's the assumption here.
  77      */
  78
  79     if(native_utf8){                    /* display is UTF-8 capable */
  80         w = ucs4_width((unsigned long) ucs);
  81         return((w & U4W_ERROR) ? -1 : w);
  82     }
  83     else if(display_data){
  84         if(wtomb(dummy, ucs) < 0)
  85           return(-1);
  86         else{
  87             w = ucs4_width((unsigned long) ucs);
  88             return((w & U4W_ERROR) ? -1 : w);
  89         }
  90     }
  91 #if !defined(_WINDOWS) && HAVE_WCWIDTH
  92     else
  93       return(wcwidth((wchar_t) ucs));
  94 #else
  95     return(0);
  96 #endif
  97 }
  98
  99 /* ambiguous width zone character function. We use the Windows code until
 100  * we find a better way to do it in general.
 101  */
 102 int
 103 pith_ucs4width(UCS ucs)
 104 {
 105   return (ucs >= 0x2100) ? 2 : 1;
 106 #if !defined(_WINDOWS) && HAVE_WCWIDTH
 107   return wcwidth((wchar_t) ucs);
 108 #else
 109   return (ucs >= 0x2100) ? 2 : 1;
 110 #endif /* _WINDOWS */
 111 }
 112
 113 /*
 114  * Argument is a UCS-4 wide character.
 115  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 116  * Dest is a buffer at least xx chars wide where the multi-byte version
 117  * of the wide character will be written.
 118  * The returned value is the number of bytes written to dest or -1
 119  * if the conversion can't be done.
 120  */
 121 int
 122 wtomb(char *dest, UCS ucs)
 123 {
 124     int rv;
 125     /*
 126      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 127      * That's the assumption here.
 128      */
 129
 130     if(native_utf8){
 131         unsigned char *newdptr;
 132
 133         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 134         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 135     }
 136     else if(display_data){
 137         unsigned long ucs4;
 138         int           ret;
 139
 140         ucs4 = (unsigned long) ucs;
 141         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 142         if(ret >= 0)
 143           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 144         else
 145           ret = -1;
 146
 147         return(ret);
 148     }
 149     else
 150 #if defined(HAVE_WCRTOMB)
 151        rv = wcrtomb(dest, (wchar_t) ucs, NULL);
 152 #elif defined(HAVE_WCTOMB)
 153        rv = wctomb(dest, (wchar_t) ucs);
 154 #else
 155        rv = -1;
 156 #endif
 157    return rv;
 158 }
 159
 160
 161 /*
 162  * This function does not necessarily update inputp and remaining_octets, so
 163  * don't rely on that. The c-client version does but the other doesn't.
 164  */
 165 UCS
 166 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 167 {
 168     UCS ucs;
 169
 170     if(input_cs){
 171         CHARSET *cast_input_cs;
 172
 173         cast_input_cs = (CHARSET *) input_cs;
 174
 175         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 176           case U8G_ENDSTRG:
 177           case U8G_ENDSTRI:
 178             return(CCONV_NEEDMORE);
 179
 180           default:
 181             if(ucs & U8G_ERROR || ucs == UBOGON)
 182               return(CCONV_BADCHAR);
 183
 184             return(ucs);
 185         }
 186     }
 187     else{
 188         size_t ret;
 189         wchar_t w;
 190
 191         /*
 192          * Warning:  input_cs and remaining_octets are unused in this
 193          * half of the if/else.
 194          *
 195          * Unfortunately, we can't tell the difference between a source string
 196          * that is just not long enough and one that has characters that can't
 197          * be converted even though it is long enough. We return NEEDMORE in both cases.
 198          */
 199         ret = mbstowcs(&w, (char *) (*inputp), 1);
 200         if(ret == (size_t)(-1))
 201           return(CCONV_NEEDMORE);
 202         else{
 203           ucs = (UCS) w;
 204           return(ucs);
 205         }
 206     }
 207 }
 208
 209
 210 void
 211 set_locale_charmap(char *charmap)
 212 {
 213     if(charmap){
 214         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 215         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 216     }
 217     else
 218       locale_charmap[0] = '\0';
 219 }
 220
 221
 222 /*
 223  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 224  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 225  * The caller is responsible for freeing the returned value.
 226  *
 227  * Args  str     -- the string to convert
 228  */
 229 char *
 230 convert_to_utf8(char *str, char *fromcharset, int flags)
 231 {
 232     char          *ret = NULL;
 233     char          *fcharset;
 234     SIZEDTEXT      src, result;
 235     const CHARSET *cs = NULL;
 236     int            try;
 237
 238     src.data = (unsigned char *) str;
 239     src.size = strlen(str);
 240
 241     /* already UTF-8, return NULL */
 242     if(!(flags & CU8_NOINFER)
 243        && (cs = utf8_infercharset(&src))
 244        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 245       return(ret);
 246
 247     try = 1;
 248     while(try < 5){
 249         switch(try){
 250           case 1:
 251             fcharset = fromcharset;
 252             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 253               break;    /* give it a try */
 254             else
 255               try++;    /* fall through */
 256
 257           case 2:
 258             if(!(flags & CU8_NOINFER)){
 259                 fcharset = cs ? cs->name : NULL;
 260                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 261                   break;
 262                 else
 263                   try++;        /* fall through */
 264             }
 265             else
 266               try++;    /* fall through */
 267
 268           case 3:
 269             fcharset = locale_charmap;
 270             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 271               break;
 272             else
 273               try++;    /* fall through */
 274
 275           default:
 276             fcharset = "ISO-8859-1";            /* this will "work" */
 277             break;
 278         }
 279
 280         memset(&result, 0, sizeof(result));
 281
 282         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 283             if(!(result.size == src.size && result.data == src.data)){
 284                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 285                 strncpy(ret, (char *) result.data, result.size);
 286                 ret[result.size] = '\0';
 287             }
 288             /* else no conversion necessary */
 289
 290             if(result.data && result.data != src.data)
 291               fs_give((void **) &result.data);
 292             result.size = 0;
 293
 294             return(ret);
 295         }
 296
 297         try++;
 298     }
 299
 300     /* won't make it to here */
 301     return(ret);
 302 }
 303
 304
 305 /*
 306  * Convert from UTF-8 to user's locale charset.
 307  * This actually uses the wtomb routine to do the conversion, and that
 308  * relies on setup_for_input_output having been called.
 309  * If no conversion is necessary, NULL is returned, otherwise an allocated
 310  * string in the locale charset is returned and the caller is responsible
 311  * for freeing it.
 312  */
 313 char *
 314 convert_to_locale(char *utf8str)
 315 {
 316 #define CHNK 500
 317     char *inp, *ret = NULL;
 318     CBUF_S cb;
 319     int alloced;
 320     size_t i = 0;
 321
 322     if(native_utf8 || !utf8str || !utf8str[0])
 323       return(NULL);
 324
 325     cb.cbuf[0] = '\0';
 326     cb.cbufp = cb.cbufend = cb.cbuf;
 327     inp = utf8str;
 328
 329     alloced = CHNK;
 330     ret = (char *) fs_get(alloced * sizeof(char));
 331
 332     /*
 333      * There's gotta be a better way to do this but utf8_to_locale was
 334      * available and everything looks like a nail when all you have
 335      * is a hammer.
 336      */
 337     while(*inp){
 338         /*
 339          * We're placing the outgoing stream of characters in ret, a multi-byte
 340          * array of characters in the user's locale charset. See if there is
 341          * enough room for the next wide characters worth of output chars
 342          * and allocate more space if not.
 343          */
 344         if((alloced - i) < MAX(MB_LEN_MAX,32)){
 345             alloced += CHNK;
 346             fs_resize((void **) &ret, alloced * sizeof(char));
 347         }
 348
 349         i += utf8_to_locale((int) *inp++, &cb,
 350                            (unsigned char *) &ret[i], alloced - i);
 351     }
 352
 353     fs_resize((void **) &ret, i + 1);
 354
 355     ret[i] = '\0';
 356
 357     return(ret);
 358 }
 359
 360
 361 /*
 362  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 363  * filled in with multi-byte characters. The return value is the
 364  * number of valid characters in obuf to be used.
 365  */
 366 int
 367 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 368 {
 369     int outchars = 0;
 370
 371     if(!(cb && cb->cbufp))
 372       return(0);
 373
 374     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 375         unsigned char *inputp;
 376         unsigned long remaining_octets;
 377         UCS ucs;
 378
 379         *(cb->cbufp)++ = (unsigned char) c;
 380         inputp = cb->cbuf;
 381         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 382         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 383
 384         switch(ucs){
 385           case U8G_ENDSTRG:     /* incomplete character, wait */
 386           case U8G_ENDSTRI:     /* incomplete character, wait */
 387             break;
 388
 389           default:
 390             if(ucs & U8G_ERROR || ucs == UBOGON){
 391                 /*
 392                  * None of these cases is supposed to happen. If it
 393                  * does happen then the input stream isn't UTF-8
 394                  * so something is wrong. Treat each character in the
 395                  * input buffer as a separate error character and
 396                  * print a '?' for each.
 397                  */
 398                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 399                   obuf[outchars++] = '?';
 400
 401                 cb->cbufp = cb->cbuf;
 402             }
 403             else{
 404                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 405                     /*
 406                      * This happens when we have a UTF-8 character that
 407                      * we aren't able to print in our locale. For example,
 408                      * if the locale is setup with the terminal
 409                      * expecting ISO-8859-1 characters then there are
 410                      * lots of UTF-8 characters that can't be printed.
 411                      * Print a '?' instead.
 412                      */
 413                     obuf[outchars++] = '?';
 414                 }
 415                 else{
 416                     /*
 417                      * Convert the ucs into the multibyte
 418                      * character that corresponds to the
 419                      * ucs in the users locale.
 420                      */
 421                     outchars = wtomb((char *) obuf, ucs);
 422                     if(outchars < 0){
 423                         obuf[0] = '?';
 424                         outchars = 1;
 425                     }
 426                 }
 427
 428                 /* update the input buffer */
 429                 if(inputp >= cb->cbufp) /* this should be the case */
 430                   cb->cbufp = cb->cbuf;
 431                 else{           /* extra chars for some reason? */
 432                     unsigned char *q, *newcbufp;
 433
 434                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 435                     q = cb->cbuf;
 436                     while(inputp < cb->cbufp)
 437                       *q++ = *inputp++;
 438
 439                     cb->cbufp = newcbufp;
 440                 }
 441             }
 442
 443             break;
 444         }
 445     }
 446     else{                       /* error */
 447         obuf[0] = '?';
 448         outchars = 1;
 449         cb->cbufp = cb->cbuf;   /* start over */
 450     }
 451
 452     return(outchars);
 453 }
 454
 455
 456 /*
 457  * Returns the screen cells width of the UCS-4 string argument.
 458  * The source string is zero terminated.
 459  */
 460 unsigned
 461 ucs4_str_width(UCS *ucsstr)
 462 {
 463     unsigned width = 0;
 464     int w;
 465
 466     if(ucsstr)
 467       while(*ucsstr){
 468         w = wcellwidth(*ucsstr++);
 469         if(w != U4W_CTLSRGT)
 470           width += (w < 0 ? 1 : w);
 471       }
 472
 473     return width;
 474 }
 475
 476
 477 /*
 478  * Returns the screen cells width of the UCS-4 string argument
 479  * from ucsstr[a] through (inclusive) ucsstr[b].
 480  * No checking is done to make sure a starts in the middle
 481  * of a UCS-4 array.
 482  */
 483 unsigned
 484 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 485 {
 486     unsigned width = 0;
 487     int i, w;
 488
 489     if(ucsstr)
 490       for(i = a; i <= b && ucsstr[i]; i++){
 491         w = wcellwidth(ucsstr[i]);
 492         if(w != U4W_CTLSRGT)
 493           width += (w < 0 ? 1 : w);
 494       }
 495
 496     return width;
 497 }
 498
 499
 500 /*
 501  * Returns the screen cells width of the UCS-4 string argument
 502  * from ustart through (exclusive) uend.
 503  * No checking is done to make sure it starts in the middle
 504  * of a UCS-4 array.
 505  */
 506 unsigned
 507 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 508 {
 509     UCS *u;
 510     unsigned width = 0;
 511     int w;
 512
 513     if(!ustart)
 514       return width;
 515
 516     if(ustart)
 517       for(u = ustart; u < uend; u++){
 518         w = wcellwidth(*u);
 519         if(w != U4W_CTLSRGT)
 520           width += (w < 0 ? 1 : w);
 521       }
 522
 523     return(width);
 524 }
 525
 526
 527 /*
 528  * Return the largest possible pointer into ucs4str so that the width
 529  * of the string from ucs4str to the pointer (exclusive)
 530  * is maxwidth or less. Also stops at a null character.
 531  */
 532 UCS *
 533 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 534 {
 535     UCS *u;
 536     int w_consumed = 0, w, done = 0;
 537
 538     u = ucs4str;
 539
 540     if(u)
 541       while(!done && *u && w_consumed <= maxwidth){
 542         w = wcellwidth(*u);
 543         w = (w >= 0 ? w : 1);
 544         if(w_consumed + w <= maxwidth){
 545             w_consumed += w;
 546             ++u;
 547         }
 548         else
 549           ++done;
 550       }
 551
 552     return(u);
 553 }
 554
 555
 556 /*
 557  * Convert and copy a UTF-8 string into a UCS-4 NULL
 558  * terminated array. Just like cpystr only it converts
 559  * from UTF-8 to UCS-4.
 560  *
 561  * Returned UCS-4 string needs to be freed by caller.
 562  */
 563 UCS *
 564 utf8_to_ucs4_cpystr(char *utf8src)
 565 {
 566     size_t         retsize;
 567     UCS           *ret = NULL;
 568     UCS            ucs;
 569     unsigned long  remaining_octets;
 570     unsigned char *readptr;
 571     size_t         arrayindex;
 572
 573     /*
 574      * We don't know how big to allocate the return array
 575      * because variable numbers of octets in the src array
 576      * will combine to make UCS-4 characters. The number of
 577      * UCS-4 characters is less than or equal to the number
 578      * of src characters, though.
 579      */
 580
 581     if(!utf8src)
 582       return NULL;
 583
 584     retsize = strlen(utf8src) + 1;
 585
 586     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 587     memset(ret, 0, retsize * sizeof(*ret));
 588
 589     readptr = (unsigned char *) utf8src;
 590     remaining_octets = retsize-1;
 591     arrayindex = 0;
 592
 593     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 594         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 595
 596         if(ucs & U8G_ERROR || ucs == UBOGON)
 597           remaining_octets = 0;
 598         else
 599           ret[arrayindex++] = ucs;
 600     }
 601
 602     ret[arrayindex] = '\0';
 603
 604     /* get rid of excess size */
 605     if(arrayindex+1 < retsize)
 606       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 607
 608     return ret;
 609 }
 610
 611
 612 /*
 613  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 614  * terminated string. Just like cpystr only it converts
 615  * from UCS-4 to UTF-8.
 616  *
 617  * Returned UTF-8 string needs to be freed by caller.
 618  */
 619 char *
 620 ucs4_to_utf8_cpystr(UCS *ucs4src)
 621 {
 622     unsigned char *ret = NULL;
 623     unsigned char *writeptr;
 624     int            i;
 625
 626     if(!ucs4src)
 627       return NULL;
 628
 629     /*
 630      * Over-allocate and then resize at the end.
 631      */
 632
 633     /* count characters in source */
 634     for(i = 0; ucs4src[i]; i++)
 635       ;
 636
 637     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 638     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 639
 640     writeptr = ret;
 641     for(i = 0; ucs4src[i]; i++)
 642       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 643
 644     /* get rid of excess size */
 645     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 646
 647     return ((char *) ret);
 648 }
 649
 650
 651 /*
 652  * Similar to above but copy a fixed number of source
 653  * characters instead of going until null terminator.
 654  */
 655 char *
 656 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 657 {
 658     unsigned char *ret = NULL;
 659     unsigned char *writeptr;
 660     int            i;
 661
 662     if(!ucs4src)
 663       return NULL;
 664
 665     /*
 666      * Over-allocate and then resize at the end.
 667      */
 668
 669     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 670     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 671
 672     writeptr = ret;
 673     for(i = 0; i < ucs4src_len; i++)
 674       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 675
 676     /* get rid of excess size */
 677     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 678
 679     return ((char *) ret);
 680 }
 681
 682 /*
 683  * Similar to above but copy what is possible to a
 684  * string of a size at most the given retlen.
 685  */
 686 char *
 687 ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
 688 {
 689     unsigned char *ret = NULL;
 690     unsigned char *writeptr;
 691     int            i, oldlen, len;
 692
 693     if(!ucs4src)
 694       return NULL;
 695
 696     /*
 697      * Over-allocate and then resize at the end.
 698      */
 699
 700     /* count characters in source */
 701     for(i = 0; ucs4src[i]; i++)
 702       ;
 703
 704     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
 705     memset(ret, 0, (6*i + 1) * sizeof(unsigned char));
 706
 707     writeptr = ret;
 708     oldlen = len = 0;
 709     for(i = 0; ucs4src[i] && (len < retlen); i++){
 710       oldlen = len;
 711       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 712       len = strlen((char *)ret);
 713     }
 714     if(len > retlen){
 715       ret[oldlen] = '\0';
 716       len = oldlen;
 717     }
 718
 719     /* get rid of excess size */
 720     fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));
 721
 722     return ((char *) ret);
 723 }
 724
 725
 726 #ifdef _WINDOWS
 727 /*
 728  * Convert a UTF-8 argument into an LPTSTR version
 729  * of that argument. The result is allocated here
 730  * and should be freed by the caller.
 731  */
 732 LPTSTR
 733 utf8_to_lptstr(LPSTR arg_utf8)
 734 {
 735      int lptstr_len;
 736      LPTSTR lptstr_ret = NULL;
 737
 738      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 739      if(lptstr_len > 0)
 740      {
 741          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 742          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 743              arg_utf8, -1, lptstr_ret, lptstr_len );
 744      }
 745
 746      if(!lptstr_len)
 747      {
 748          /* check GetLastError()? */
 749          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 750          lptstr_ret[0] = 0;
 751      }
 752
 753      return lptstr_ret;
 754 }
 755
 756
 757 /*
 758  * Convert an LPTSTR argument into a UTF-8 version
 759  * of that argument. The result is allocated here
 760  * and should be freed by the caller.
 761  */
 762 LPSTR
 763 lptstr_to_utf8(LPTSTR arg_lptstr)
 764 {
 765      int utf8str_len;
 766      LPSTR utf8str_ret = NULL;
 767
 768      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 769      if(utf8str_len > 0)
 770      {
 771          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 772          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 773              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 774      }
 775
 776      if(!utf8str_len)
 777      {
 778          /* check GetLastError()? */
 779          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 780          utf8str_ret[0] = 0;
 781      }
 782
 783      return utf8str_ret;
 784 }
 785
 786
 787 /*
 788  * Convert a UCS4 argument into an LPTSTR version
 789  * of that argument. The result is allocated here
 790  * and should be freed by the caller.
 791  */
 792 LPTSTR
 793 ucs4_to_lptstr(UCS *arg_ucs4)
 794 {
 795     LPTSTR ret_lptstr = NULL;
 796     size_t len;
 797     size_t i;
 798
 799     if(arg_ucs4){
 800         len = ucs4_strlen(arg_ucs4);
 801         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 802         /* bogus conversion ignores UTF-16 */
 803         for(i = 0; i < len; i++)
 804           ret_lptstr[i] = arg_ucs4[i];
 805
 806         ret_lptstr[len] = '\0';
 807     }
 808
 809     return(ret_lptstr);
 810 }
 811
 812
 813 /*
 814  * Convert an LPTSTR argument into a UCS4 version
 815  * of that argument. The result is MemAlloc'd here
 816  * and should be freed by the caller.
 817  */
 818 UCS *
 819 lptstr_to_ucs4(LPTSTR arg_lptstr)
 820 {
 821     UCS *ret_ucs4 = NULL;
 822     size_t len;
 823     size_t i;
 824
 825     if(arg_lptstr){
 826         len = _tcslen(arg_lptstr);
 827         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 828         /* bogus conversion ignores UTF-16 */
 829         for(i = 0; i < len; i++)
 830           ret_ucs4[i] = arg_lptstr[i];
 831
 832         ret_ucs4[len] = '\0';
 833     }
 834
 835     return(ret_ucs4);
 836 }
 837
 838 #endif /* _WINDOWS */
 839
 840
 841 /*
 842  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 843  * 1-at-a-time filled in with UCS characters. The return value is the
 844  * number of valid characters in obuf to be used. It can only
 845  * be 1 or 0 characters since we're only getting one UTF-8 character
 846  * at a time.
 847  */
 848 int
 849 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 850 {
 851     int  width = 0, outchars = 0;
 852
 853     if(!(cb && cb->cbufp))
 854       return(0);
 855
 856     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 857         unsigned char *inputp;
 858         unsigned long remaining_octets;
 859         UCS ucs;
 860
 861         *cb->cbufp++ = (unsigned char) c;
 862         inputp = cb->cbuf;
 863         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 864         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 865
 866         switch(ucs){
 867           case U8G_ENDSTRG:     /* incomplete character, wait */
 868           case U8G_ENDSTRI:     /* incomplete character, wait */
 869             break;
 870
 871           default:
 872             if(ucs & U8G_ERROR || ucs == UBOGON){
 873                 /*
 874                  * None of these cases is supposed to happen. If it
 875                  * does happen then the input stream isn't UTF-8
 876                  * so something is wrong.
 877                  */
 878                 outchars++;
 879                 *obuf = '?';
 880                 cb->cbufp = cb->cbuf;
 881                 width = 1;
 882             }
 883             else{
 884                 outchars++;
 885                 if(ucs < 0x80 && ucs >= 0x20)
 886                   width = 1;
 887
 888                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 889                     /*
 890                      * This happens when we have a UTF-8 character that
 891                      * we aren't able to print in our locale. For example,
 892                      * if the locale is setup with the terminal
 893                      * expecting ISO-8859-1 characters then there are
 894                      * lots of UTF-8 characters that can't be printed.
 895                      * Print a '?' instead.
 896                      * Don't think this should happen in Windows.
 897                      */
 898                     *obuf = '?';
 899                 }
 900                 else{
 901                     *obuf = ucs;
 902                 }
 903
 904                 /* update the input buffer */
 905                 if(inputp >= cb->cbufp) /* this should be the case */
 906                   cb->cbufp = cb->cbuf;
 907                 else{           /* extra chars for some reason? */
 908                     unsigned char *q, *newcbufp;
 909
 910                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 911                     q = cb->cbuf;
 912                     while(inputp < cb->cbufp)
 913                       *q++ = *inputp++;
 914
 915                     cb->cbufp = newcbufp;
 916                 }
 917             }
 918
 919             break;
 920         }
 921     }
 922     else{                       /* error */
 923         *obuf = '?';
 924         outchars = 1;
 925         width = 1;
 926         cb->cbufp = cb->cbuf;   /* start over */
 927     }
 928
 929     if(obufwidth)
 930       *obufwidth = width;
 931
 932     return(outchars);
 933 }
 934
 935
 936 /*
 937  * Return an allocated copy of a zero-terminated UCS-4 string.
 938  */
 939 UCS *
 940 ucs4_cpystr(UCS *ucs4src)
 941 {
 942     size_t         arraysize;
 943     UCS           *ret = NULL;
 944     size_t         i;
 945
 946     if(!ucs4src)
 947       return NULL;
 948
 949     arraysize = ucs4_strlen(ucs4src);
 950
 951     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 952     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 953
 954     for(i = 0; i < arraysize; i++)
 955       ret[i] = ucs4src[i];
 956
 957     return ret;
 958 }
 959
 960
 961 UCS *
 962 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 963 {
 964     size_t i;
 965
 966     if(ucs4src && ucs4dst){
 967         for(i = 0; i < n; i++){
 968             ucs4dst[i] = ucs4src[i];
 969             if(ucs4dst[i] == '\0')
 970               break;
 971         }
 972     }
 973
 974     return ucs4dst;
 975 }
 976
 977
 978 UCS *
 979 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 980 {
 981     size_t i;
 982     UCS *u;
 983
 984     if(ucs4src && ucs4dst){
 985         for(u = ucs4dst; *u; u++)
 986           ;
 987
 988         for(i = 0; i < n; i++){
 989             u[i] = ucs4src[i];
 990             if(u[i] == '\0')
 991               break;
 992         }
 993
 994         if(i == n)
 995           u[i] = '\0';
 996     }
 997
 998     return ucs4dst;
 999 }
1000
1001
1002 /*
1003  * Like strlen only this returns the number of non-zero characters
1004  * in a zero-terminated UCS-4 array.
1005  */
1006 size_t
1007 ucs4_strlen(UCS *ucs4str)
1008 {
1009     size_t i = 0;
1010
1011     if(ucs4str)
1012       while(ucs4str[i])
1013         i++;
1014
1015     return(i);
1016 }
1017
1018
1019 int
1020 ucs4_strcmp(UCS *s1, UCS *s2)
1021 {
1022     for(; *s1 == *s2; s1++, s2++)
1023       if(*s1 == '\0')
1024         return 0;
1025
1026     return((*s1 < *s2) ? -1 : 1);
1027 }
1028
1029
1030 UCS *
1031 ucs4_strchr(UCS *s, UCS c)
1032 {
1033     if(!s)
1034       return NULL;
1035
1036     while(*s && *s != c)
1037       s++;
1038
1039     if(*s || !c)
1040       return s;
1041     else
1042       return NULL;
1043 }
1044
1045
1046 UCS *
1047 ucs4_strrchr(UCS *s, UCS c)
1048 {
1049     UCS *ret = NULL;
1050
1051     if(!s)
1052       return ret;
1053
1054     while(*s){
1055         if(*s == c)
1056           ret = s;
1057
1058         s++;
1059     }
1060
1061     return ret;
1062 }
1063
1064
1065 /*
1066  * Returns the screen cells width of the UTF-8 string argument.
1067  */
1068 unsigned
1069 utf8_width(char *str)
1070 {
1071     unsigned width = 0;
1072     int this_width;
1073     UCS ucs;
1074     unsigned long remaining_octets;
1075     char *readptr;
1076
1077     if(!(str && *str))
1078       return(width);
1079
1080     readptr = str;
1081     remaining_octets = readptr ? strlen(readptr) : 0;
1082
1083     while(remaining_octets > 0 && *readptr){
1084
1085         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1086
1087         if(ucs & U8G_ERROR || ucs == UBOGON){
1088             /*
1089              * This should not happen, but do something to handle it anyway.
1090              * Treat each character as a single width character, which is what should
1091              * probably happen when we actually go to write it out.
1092              */
1093             remaining_octets--;
1094             readptr++;
1095             this_width = 1;
1096         }
1097         else{
1098             this_width = wcellwidth(ucs);
1099
1100             /*
1101              * If this_width is -1 that means we can't print this character
1102              * with our current locale. Writechar will print a '?'.
1103              */
1104             if(this_width < 0)
1105               this_width = 1;
1106         }
1107
1108         width += (unsigned) this_width;
1109     }
1110
1111     return(width);
1112 }
1113
1114
1115 /*
1116  * Copy UTF-8 characters from src into dst.
1117  * This is intended to be used if you want to truncate a string at
1118  * the start instead of the end. For example, you have a long string
1119  * like
1120  *       this_is_a_long_string
1121  * but not enough space to fit it into a particular field. You want to
1122  * end up with
1123  *             s_a_long_string
1124  * where that fits in a particular width. Perhaps you'd use this with ...
1125  * to get
1126  *          ...s_a_long_string
1127  * This right adjusts the end of the string in the width space and
1128  * cuts it off at the start. If there is enough width for the whole
1129  * string it will copy the string into dst with no padding.
1130  *
1131  * Copy enough characters so that the result will have screen width of
1132  * want_width screen cells in current locale.
1133  *
1134  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1135  *   to dst. This is just for protection, it shouldn't be relied on to
1136  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1137  *   characters truncated in the middle or something like that.
1138  *
1139  * Returned value is the number of bytes written to dst, not including
1140  *   the possible terminating null.
1141  *
1142  * If we can't hit want_width exactly because of double width characters
1143  *   then we will pad the end of the string with space in order to make
1144  *   the width exact.
1145  */
1146 size_t
1147 utf8_to_width_rhs(char *dst,            /* destination buffer */
1148                   char *src,            /* source string */
1149                   size_t dstlen,        /* space in dest */
1150                   unsigned want_width)  /* desired screen width */
1151 {
1152     int this_width;
1153     unsigned width_consumed = 0;
1154     UCS ucs;
1155     unsigned long remaining_octets;
1156     char *readptr, *goodreadptr, *savereadptr, *endptr;
1157     size_t nb = 0;
1158
1159     if(!src){
1160         if(dstlen > 0)
1161           dst[0] = '\0';
1162
1163         return nb;
1164     }
1165
1166     /*
1167      * Start at the end of the source string and go backwards until we
1168      * get to the desired width, but not more than the width.
1169      */
1170     readptr = src + strlen(src);
1171     endptr = readptr;
1172     goodreadptr = readptr;
1173     width_consumed = 0;
1174     savereadptr = readptr;
1175
1176     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1177         readptr = savereadptr-1){
1178
1179         savereadptr = readptr;
1180         remaining_octets = goodreadptr - readptr;
1181         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1182
1183         /*
1184          * Handling the error case is tough because an error will be the normal thing that
1185          * happens as we back through the string. So we're just going to punt on the
1186          * error for now.
1187          */
1188         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1189             if(remaining_octets > 0){
1190                 /*
1191                  * This means there are some bad octets after this good
1192                  * character so things are not going to work out well.
1193                  * Bail out.
1194                  */
1195                 savereadptr = src;      /* we're done */
1196             }
1197             else{
1198                 this_width = wcellwidth(ucs);
1199
1200                 if(this_width < 0)
1201                   this_width = 1;
1202
1203                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1204                     width_consumed += (unsigned) this_width;
1205                     goodreadptr = savereadptr;
1206                 }
1207                 else
1208                   savereadptr = src;    /* we're done */
1209             }
1210         }
1211     }
1212
1213     /*
1214      * Copy characters from goodreadptr to endptr into dst.
1215      */
1216     nb = MIN(endptr-goodreadptr, dstlen-1);
1217     strncpy(dst, goodreadptr, nb);
1218     dst[nb] = '\0';
1219
1220     /*
1221      * Pad out with spaces in order to hit width exactly.
1222      */
1223     while(width_consumed < want_width && nb < dstlen-1){
1224         dst[nb++] = ' ';
1225         dst[nb] = '\0';
1226         width_consumed++;
1227     }
1228
1229     return nb;
1230 }
1231
1232
/*
 * Helper for utf8_snprintf.
 * Assemble a single printf conversion specification in newfmt from its
 * already parsed pieces: '%', the flag characters, the minimum field
 * width and precision (each left out when negative), an optional
 * length modifier (left out when zero) and the conversion character.
 */
static void
utf8_snprintf_build_fmt(char *newfmt, size_t newfmtlen, const char *flags,
                        int min_field_width, int field_precision,
                        int modifier, int conversion)
{
    char width[16], precision[16], tail[3];
    int  n = 0;

    width[0] = precision[0] = '\0';

    if(min_field_width >= 0)
      snprintf(width, sizeof(width), "%d", min_field_width);

    if(field_precision >= 0)
      snprintf(precision, sizeof(precision), ".%d", field_precision);

    if(modifier)
      tail[n++] = (char) modifier;

    tail[n++] = (char) conversion;
    tail[n] = '\0';

    snprintf(newfmt, newfmtlen, "%%%s%s%s%s", flags, width, precision, tail);
}


/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a trailing space character if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed. We'll see how
 * that works for us.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Besides %w the conversions d i o x X u c s f e E g G p and %% are
 * handled. The single-character length modifiers are honored where
 * they change the argument type: l (and h) with the integer
 * conversions, L with the floating point conversions. In all other
 * combinations the modifier is ignored. A negative width or precision
 * given through '*' is treated as if it had not been given.
 * Any other conversion character trips an assert.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char    newfmt[100], flags[6], *pf, *pdest, *end;
    char   *input_str;
    int     int_arg;
    long    long_arg;
    double  double_arg;
    long double long_double_arg;
    void   *ptr_arg;
    unsigned got_width = 0;
    int     more_flags, ret, w, nbytes;
    int     min_field_width, field_precision, modifier;
    int     flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    va_list args;

    newfmt[0] = '\0';

    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)                   \
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: It would be nice to replace the %w's in the format
     * string with %s's that have adjusted width and precision and then
     * hand the whole thing to the system vsnprintf. That doesn't work
     * because a %*w needs the integer argument the '*' refers to
     * changed, and there is no way to alter or remove an argument
     * before calling printf.
     *
     * So instead the result is constructed one piece at a time. Each
     * conversion is rewritten into newfmt with any '*' replaced by its
     * value, formatted on its own with snprintf, and pasted on to the
     * end of dest.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){
        if(*fmt == '%'){
            fmt++;

            min_field_width = field_precision = -1;
            modifier = 0;
            flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;

            /* flags */
            more_flags = 1;
            while(more_flags){
                switch(*fmt){
                  case '-':
                    flags_minus++;
                    fmt++;
                    break;

                  case '+':
                    flags_plus++;
                    fmt++;
                    break;

                  case ' ':
                    flags_space++;
                    fmt++;
                    break;

                  case '0':
                    flags_zero++;
                    fmt++;
                    break;

                  case '#':
                    flags_pound++;
                    fmt++;
                    break;

                  default:
                    more_flags = 0;
                    break;
                }
            }

            /* each flag that was seen goes into the new format once */
            pf = flags;
            if(flags_minus)
              *pf++ = '-';
            if(flags_plus)
              *pf++ = '+';
            if(flags_space)
              *pf++ = ' ';
            if(flags_zero)
              *pf++ = '0';
            if(flags_pound)
              *pf++ = '#';

            *pf = '\0';

            /* minimum field width */
            if(*fmt == '*'){
                min_field_width = va_arg(args, int);
                fmt++;
            }
            else if(*fmt >= '0' && *fmt <= '9'){
                /* atoi stops by itself at the first non-digit */
                min_field_width = atoi(fmt);
                while(*fmt >= '0' && *fmt <= '9')
                  fmt++;
            }

            /* field precision */
            if(*fmt == '.'){
                fmt++;
                if(*fmt == '*'){
                    field_precision = va_arg(args, int);
                    fmt++;
                }
                else if(*fmt >= '0' && *fmt <= '9'){
                    field_precision = atoi(fmt);
                    while(*fmt >= '0' && *fmt <= '9')
                      fmt++;
                }
            }

            /* length modifier */
            if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
              modifier = *fmt++;

            /* conversion character */
            switch(*fmt){
              case 'w':
                /*
                 * Work out the byte width and byte precision that
                 * produce the screen width and precision asked for,
                 * then print the string with a plain %s.
                 */
                input_str = va_arg(args, char *);

                w = 0;
                if(field_precision >= 0 || min_field_width >= 0)
                  w = (int) utf8_width(input_str);

                if(field_precision >= 0){
                    if(w <= field_precision)
                      field_precision = -1;  /* print it all */
                    else{
                        /*
                         * We need to cut off some of the input_str
                         * in this case.
                         */
                        end = utf8_count_forw_width(input_str, (unsigned) field_precision, &got_width);
                        field_precision = (int) (end - input_str);
                        /* new w with this field_precision */
                        w = (int) got_width;
                    }
                }

                /*
                 * Need some padding. The byte width is the number of
                 * bytes that will be printed plus one byte for each
                 * missing screen cell.
                 */
                if(min_field_width >= 0){
                    if(field_precision >= 0)
                      nbytes = field_precision;
                    else
                      nbytes = input_str ? (int) strlen(input_str) : 0;

                    min_field_width = nbytes + ((min_field_width > w) ? (min_field_width - w) : 0);
                }

                utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                        min_field_width, field_precision, 0, 's');

                snprintf(pdest, size - (pdest-dest), newfmt, input_str);
                pdest += strlen(pdest);

                break;

              case '\0':
                /* format ended inside a specifier, back up so the loop sees the end */
                fmt--;
                break;

              case 'd': case 'i': case 'o':
              case 'x': case 'X': case 'u':
                if(modifier == 'l'){
                    /* the argument really is a long, fetch and print it as one */
                    utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                            min_field_width, field_precision, 'l', *fmt);
                    long_arg = va_arg(args, long);
                    snprintf(pdest, size - (pdest-dest), newfmt, long_arg);
                }
                else{
                    /* short arguments are promoted to int when passed */
                    utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                            min_field_width, field_precision,
                                            (modifier == 'h') ? 'h' : 0, *fmt);
                    int_arg = va_arg(args, int);
                    snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
                }

                pdest += strlen(pdest);
                break;

              case 'c':
                utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                        min_field_width, field_precision, 0, *fmt);
                int_arg = va_arg(args, int);
                snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
                pdest += strlen(pdest);
                break;

              case 's':
                utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                        min_field_width, field_precision, 0, *fmt);
                input_str = va_arg(args, char *);
                snprintf(pdest, size - (pdest-dest), newfmt, input_str);
                pdest += strlen(pdest);
                break;

              case 'f': case 'e': case 'E':
              case 'g': case 'G':
                if(modifier == 'L'){
                    utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                            min_field_width, field_precision, 'L', *fmt);
                    long_double_arg = va_arg(args, long double);
                    snprintf(pdest, size - (pdest-dest), newfmt, long_double_arg);
                }
                else{
                    utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                            min_field_width, field_precision, 0, *fmt);
                    double_arg = va_arg(args, double);
                    snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
                }

                pdest += strlen(pdest);
                break;

              case 'p':
                utf8_snprintf_build_fmt(newfmt, sizeof(newfmt), flags,
                                        min_field_width, field_precision, 0, *fmt);
                ptr_arg = va_arg(args, void *);
                snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
                pdest += strlen(pdest);
                break;

              case '%':
                if(IS_ROOM_IN_DEST(1))
                  *pdest++ =  '%';

                break;

              default:
                /* didn't think of this type */
                assert(0);
                break;
            }

            fmt++;
        }
        else{
            if(IS_ROOM_IN_DEST(1))
              *pdest++ = *fmt++;
        }
    }

    ret = pdest - dest;

    /* terminate only if there is room, a result of exactly size bytes is not */
    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1579
1580
1581 /*
1582  * Copy UTF-8 characters from src into dst.
1583  * Copy enough characters so that the result will have (<=) screen width of
1584  * want_width screen cells in current locale.
1585  *
1586  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1587  *   to dst.
1588  *
1589  * Returned value is the number of bytes written to dst, not including
1590  *   the possible terminating null.
1591  * Got_width is another returned value. It is the width in screen cells of
1592  *   the string placed in dst. It will be the same as want_width if there
1593  *   are enough characters in the src to do that and if the character widths
1594  *   hit the width exactly. It will be less than want_width if we run out
1595  *   of src characters or if the next character width would skip over the
1596  *   width we want, because it is double width.
1597  *
1598  * Zero width characters are collected and included at the end of the string.
1599  *   That is, if we make it to want_width but there is still a zero length
1600  *   character sitting in src, we add that to dst. This might be an accent
1601  *   or something like that.
1602  */
1603 size_t
1604 utf8_to_width(char *dst,                /* destination buffer */
1605               char *src,                /* source string */
1606               size_t dstlen,            /* space in dst */
1607               unsigned want_width,      /* desired screen width */
1608               unsigned *got_width)      /* returned screen width in dst */
1609 {
1610     int this_width;
1611     unsigned width_consumed = 0;
1612     UCS ucs;
1613     unsigned long remaining_octets;
1614     char *writeptr, *readptr, *savereadptr, *endptr;
1615     int ran_out_of_space = 0;
1616
1617     readptr = src;
1618
1619     remaining_octets = readptr ? strlen(readptr) : 0;
1620
1621     writeptr = dst;
1622     endptr = writeptr + dstlen;
1623
1624     if(readptr && writeptr){
1625       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1626         savereadptr = readptr;
1627         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1628
1629         if(ucs & U8G_ERROR || ucs == UBOGON)
1630           remaining_octets = 0;
1631         else{
1632           this_width = wcellwidth(ucs);
1633
1634           /*
1635            * If this_width is -1 that means we can't print this character
1636            * with our current locale. Writechar will print a '?'.
1637            */
1638           if(this_width < 0)
1639             this_width = 1;
1640
1641           if(width_consumed + (unsigned) this_width <= want_width){
1642             /* append this utf8 character to dst if it will fit */
1643             if(writeptr + (readptr - savereadptr) < endptr){
1644               width_consumed += this_width;
1645               while(savereadptr < readptr)
1646                 *writeptr++ = *savereadptr++;
1647             }
1648             else
1649               ran_out_of_space++;       /* no more utf8 to dst */
1650           }
1651           else
1652             remaining_octets = 0;       /* we're done */
1653         }
1654       }
1655
1656       if(writeptr < endptr)
1657         *writeptr = '\0';
1658     }
1659
1660     if(got_width)
1661       *got_width = width_consumed;
1662
1663     return(writeptr ? (writeptr - dst) : 0);
1664 }
1665
1666
1667 /*
1668  * Str is a UTF-8 string.
1669  * Count forward width screencell positions and return a pointer to the
1670  * end of the string that is width wide.
1671  * The returned pointer points at the next character (where the null would
1672  * be placed).
1673  *
1674  * Got_width is another returned value. It is the width in screen cells of
1675  *   the string from str to the returned pointer. It will be the same as
1676  *   want_width if there are enough characters in the str to do that
1677  *   and if the character widths hit the width exactly. It will be less
1678  *   than want_width if we run out of characters or if the next character
1679  *   width would skip over the width we want, because it is double width.
1680  */
1681 char *
1682 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1683 {
1684     int this_width;
1685     unsigned width_consumed = 0;
1686     UCS ucs;
1687     unsigned long remaining_octets;
1688     char *readptr;
1689     char *retptr;
1690
1691     retptr = readptr = str;
1692
1693     remaining_octets = readptr ? strlen(readptr) : 0;
1694
1695     while(width_consumed <= want_width && remaining_octets > 0){
1696
1697         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1698
1699         if(ucs & U8G_ERROR || ucs == UBOGON){
1700             /*
1701              * This should not happen, but do something to handle it anyway.
1702              * Treat each character as a single width character, which is what should
1703              * probably happen when we actually go to write it out.
1704              */
1705             remaining_octets--;
1706             readptr++;
1707             this_width = 1;
1708         }
1709         else{
1710             this_width = wcellwidth(ucs);
1711
1712             /*
1713              * If this_width is -1 that means we can't print this character
1714              * with our current locale. Writechar will print a '?'.
1715              */
1716             if(this_width < 0)
1717               this_width = 1;
1718         }
1719
1720         if(width_consumed + (unsigned) this_width <= want_width){
1721             width_consumed += (unsigned) this_width;
1722             retptr = readptr;
1723         }
1724         else
1725           remaining_octets = 0; /* we're done */
1726     }
1727
1728     if(got_width)
1729       *got_width = width_consumed;
1730
1731     return(retptr);
1732 }
1733
1734
1735 /*
1736  * Copy a null terminator into a UTF-8 string in place so that the string is
1737  * no more than a certain screen width wide. If the string is already less
1738  * than or equal in width to the requested width, no change is made.
1739  *
1740  * The actual width accomplished is returned. Note that it may be less than
1741  * max_width due to double width characters as well as due to the fact that
1742  * it fits wholly in the max_width.
1743  *
1744  * Returned value is the actual screen width of str when done.
1745  *
1746  * A side effect is that a terminating null may have been written into
1747  * the passed in string.
1748  */
1749 unsigned
1750 utf8_truncate(char *str, unsigned max_width)
1751 {
1752     int this_width;
1753     unsigned width_consumed = 0;
1754     UCS ucs;
1755     unsigned long remaining_octets;
1756     char *readptr, *savereadptr;
1757
1758     readptr = str;
1759
1760     remaining_octets = readptr ? strlen(readptr) : 0;
1761
1762     if(readptr){
1763       while(width_consumed <= max_width && remaining_octets > 0){
1764
1765         savereadptr = readptr;
1766         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1767
1768         if(ucs & U8G_ERROR || ucs == UBOGON){
1769             /*
1770              * This should not happen, but do something to handle it anyway.
1771              * Treat each character as a single width character, which is what should
1772              * probably happen when we actually go to write it out.
1773              */
1774             remaining_octets--;
1775             readptr++;
1776             this_width = 1;
1777         }
1778         else{
1779             this_width = wcellwidth(ucs);
1780
1781             /*
1782              * If this_width is -1 that means we can't print this character
1783              * with our current locale. Writechar will print a '?'.
1784              */
1785             if(this_width < 0)
1786               this_width = 1;
1787         }
1788
1789         if(width_consumed + (unsigned) this_width <= max_width){
1790             width_consumed += (unsigned) this_width;
1791         }
1792         else{
1793             remaining_octets = 0;       /* we're done */
1794             *savereadptr = '\0';
1795         }
1796       }
1797     }
1798
1799     return(width_consumed);
1800 }
1801
1802
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad
 * with spaces. A non-zero left_adjust puts the padding after the text,
 * zero puts it in front of the text.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will
 *   be written to dst. If there isn't room for all of the padding only
 *   as much as fits is added. Dst will be null terminated if there is
 *   enough room, but not if that would overflow dst's len.
 *   If dst is NULL nothing is written and zero is returned.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,            /* destination buffer */
                  char *src,            /* source string */
                  size_t dstlen,        /* space in dst */
                  unsigned want_width,  /* desired screen width */
                  int left_adjust)      /* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   need_more, howmany;
    size_t   len_left, bytes_used;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* one space for every screen cell still missing, if it fits */
    need_more = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    howmany = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string. Slide the existing
             * string over (the regions overlap, hence memmove) and
             * fill the gap that opens up with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1870
1871
1872 /*
1873  * Str is a UTF-8 string.
1874  * Start_here is a pointer into the string. It points one position past
1875  * the last byte that should be considered a part of the length string.
1876  * Count back want_width screencell positions and return a pointer to the
1877  * start of the string that is want_width wide and ends with start_here.
1878  *
1879  * Since characters may be more than one cell width wide we may end up
1880  * skipping over the exact width. That is, if we need to we'll go back
1881  * too far (by one cell width). Account for that in the call by looking
1882  * at got_width.
1883  *
1884  * Note that this call gives a possible got_width == want_width+1 as
1885  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1886  * That was just what was needed at the time, maybe it needs to be
1887  * optional.
1888  */
1889 char *
1890 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1891 {
1892     unsigned width_consumed = 0;
1893     int this_width;
1894     UCS ucs;
1895     unsigned long remaining_octets;
1896     char *ptr, *savereadptr, *goodreadptr;
1897
1898     savereadptr = start_here;
1899     goodreadptr = start_here;
1900
1901     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1902
1903         savereadptr = ptr;
1904         remaining_octets = goodreadptr - ptr;
1905         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1906
1907         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1908           if(remaining_octets > 0){
1909               /*
1910                * This means there are some bad octets after this good
1911                * character so things are not going to work out well.
1912                * Bail out.
1913                */
1914               savereadptr = str;        /* we're done */
1915           }
1916           else{
1917             this_width = wcellwidth(ucs);
1918
1919             /*
1920              * If this_width is -1 that means we can't print this character
1921              * with our current locale. Writechar will print a '?'.
1922              */
1923             if(this_width < 0)
1924               this_width = 1;
1925
1926             width_consumed += (unsigned) this_width;
1927             goodreadptr = savereadptr;
1928           }
1929         }
1930     }
1931
1932     if(got_width)
1933       *got_width = width_consumed;
1934
1935     return(savereadptr);
1936 }
1937
1938
/*----------------------------------------------------------------------
  Copy at most n characters of the source string s onto the destination
  *d, leaving *d pointing at the end of the copied text.

  The point of this is to avoid passing over a string twice when it is
  being appended to (i.e., strcpy(t, x); t += strlen(t)).

  If the end of s is reached within n characters the terminating null
  is copied too and *d is left pointing at it. If n runs out first, no
  null is written and *d points just past the last character copied.

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int left;

    for(left = n; left > 0; left--){
        **d = *s++;

        /* the null has been copied, stay pointing at it */
        if(**d == '\0')
          return;

        (*d)++;
    }
}
1954
1955
1956 /*
1957  * If use_system_routines is set then NULL is the return value and it is
1958  * not an error. Display_charmap and keyboard_charmap should come over as
1959  * malloced strings and will be filled in with the result.
1960  *
1961  * Returns a void pointer to the input_cs CHARSET which is
1962  * passed to mbtow via kbseq().
1963  * If !use_system_routines && NULL is returned, that is an error and err should
1964  * have a message.
1965  * display_charmap and keyboard_charmap should be malloced data and may be
1966  * realloced and changed here.
1967  */
1968 int
1969 setup_for_input_output(int use_system_routines, char **display_charmap,
1970                        char **keyboard_charmap, void **input_cs_arg, char **err)
1971 {
1972     const CHARSET *cs;
1973     const CHARSET *input_cs = NULL;
1974     int already_tried = 0;
1975     int supported = 0;
1976     char buf[1000];
1977
1978 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1979
1980     if(err)
1981       *err = NULL;
1982
1983     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1984         *err = cpstr("Bad call to setup_for_input_output");
1985         return(-1);
1986     }
1987
1988     if(use_system_routines){
1989 #if     PREREQ_FOR_SYS_TRANSLATION
1990         char *dcm;
1991
1992         dcm = nl_langinfo_codeset_wrapper();
1993         dcm = dcm ? dcm : "US-ASCII";
1994
1995         init_utf8_display(0, NULL);
1996         if(*display_charmap){
1997             if(dcm && strucmp(*display_charmap, dcm)){
1998                 snprintf(buf, sizeof(buf),
1999                  _("Display character set \"%s\" is ignored when using system translation"),
2000                      *display_charmap);
2001
2002                 *err = cpstr(buf);
2003             }
2004
2005             fs_give((void **) display_charmap);
2006         }
2007
2008         if(*keyboard_charmap){
2009             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
2010                 snprintf(buf, sizeof(buf),
2011                  _("Keyboard character set \"%s\" is ignored when using system translation"),
2012                      *keyboard_charmap);
2013
2014                 *err = cpstr(buf);
2015             }
2016
2017             fs_give((void **) keyboard_charmap);
2018         }
2019
2020         *display_charmap = cpstr(dcm);
2021         *keyboard_charmap = cpstr(dcm);
2022 #else
2023         *err = cpstr("Bad call to setup_for_input_output");
2024 #endif
2025
2026         *input_cs_arg = NULL;
2027         return(0);
2028     }
2029
2030
2031 try_again1:
2032     if(!(*display_charmap))
2033       *display_charmap = cpstr("US-ASCII");
2034
2035     if(!(*keyboard_charmap))
2036       *keyboard_charmap = cpstr(*display_charmap);
2037
2038     if(*keyboard_charmap){
2039         supported = input_charset_is_supported(*keyboard_charmap);
2040
2041         if(supported){
2042             if(!strucmp(*keyboard_charmap, "utf-8"))
2043               input_cs = utf8_charset(*keyboard_charmap);
2044             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2045               input_cs = cs;
2046         }
2047         else{
2048             if(err && !*err){
2049                 int iso2022jp = 0;
2050
2051                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2052                   iso2022jp = 1;
2053
2054                 snprintf(buf, sizeof(buf),
2055                      /* TRANSLATORS: The first argument is the name of the character
2056                         set the user is trying to use (which is unsupported by alpine).
2057                         The second argument is " (except for posting)" if they are
2058                         trying to use ISO-2022-JP for something other than posting. */
2059                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2060                      *keyboard_charmap,
2061                      iso2022jp ? _(" (except for posting)") : "");
2062
2063                 *err = cpstr(buf);
2064             }
2065
2066             input_cs = NULL;
2067             fs_give((void **) keyboard_charmap);
2068             *keyboard_charmap = cpstr("US-ASCII");
2069             if(!already_tried){
2070                 already_tried++;
2071                 goto try_again1;
2072             }
2073         }
2074     }
2075
2076
2077 try_again2:
2078     if(!(*display_charmap))
2079       *display_charmap = cpstr("US-ASCII");
2080
2081     if(*display_charmap){
2082         supported = output_charset_is_supported(*display_charmap);
2083         if(supported){
2084             if(!strucmp(*display_charmap, "utf-8"))
2085               init_utf8_display(1, NULL);
2086             else if((cs = utf8_charset(*display_charmap)) != NULL)
2087               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2088         }
2089         else{
2090             if(err && !*err){
2091                 int iso2022jp = 0;
2092
2093                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2094                   iso2022jp = 1;
2095
2096                 snprintf(buf, sizeof(buf),
2097                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2098                      *display_charmap,
2099                      iso2022jp ? _(" (except for posting)") : "");
2100
2101                 *err = cpstr(buf);
2102             }
2103
2104             fs_give((void **) display_charmap);
2105             if(!already_tried){
2106                 already_tried++;
2107                 goto try_again2;
2108             }
2109         }
2110     }
2111     else{
2112         if(err && !*err)
2113           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2114     }
2115
2116 #undef cpstr
2117
2118     *input_cs_arg = (void *) input_cs;
2119
2120     return(0);
2121 }
2122
2123
2124 int
2125 input_charset_is_supported(char *input_charset)
2126 {
2127     const CHARSET *cs;
2128
2129     if(!(input_charset && *input_charset))
2130       return 0;
2131
2132     if(!strucmp(input_charset, "utf-8"))
2133       return 1;
2134
2135     if((cs = utf8_charset(input_charset)) != NULL){
2136
2137         /*
2138          * This was true 2006-09-25.
2139          */
2140         switch(cs->type){
2141           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2142           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2143           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2144           case CT_UCS4: case CT_UTF16:
2145             return 1;
2146             break;
2147
2148           default:
2149             break;
2150         }
2151     }
2152
2153     return 0;
2154 }
2155
2156
2157 int
2158 output_charset_is_supported(char *output_charset)
2159 {
2160     const CHARSET *cs;
2161
2162     if(!(output_charset && *output_charset))
2163       return 0;
2164
2165     if(!strucmp(output_charset, "utf-8"))
2166       return 1;
2167
2168     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2169       return 1;
2170
2171     return 0;
2172 }
2173
2174
/*
 * Can we post in the named character set?
 *
 * Args: posting_charset -- name of the character set, may be NULL or empty
 *
 * Result: 1 if the name is "ISO-2022-JP" (case independent) or names a
 *         charset that output_charset_is_supported() accepts, 0 otherwise.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
2182
2183
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * Result: name of a charset that output_charset_is_supported() accepts,
 *         or NULL if no such name could be worked out. The result is
 *         either what nl_langinfo() returned, a string constant, or a
 *         static buffer inside this function, so the caller must not
 *         free it, and a later call may overwrite it.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;
    char *p;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        /* NOTE(review): possibly EUCKR/EUC-KR was intended here -- confirm */
        else if(!strucmp(ret, "EUCKP"))
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if((p = strstr(ret, "8859")) != NULL){
            /* check for digits after 8859 */
            p += 4;

            /*
             * Allow one separator character between 8859 and the
             * digits, but don't step over the end of the string.
             */
            if(*p && !isdigit((unsigned char) *p))
              p++;

            if(isdigit((unsigned char) *p)){
                static char buf[12];    /* "ISO-8859-" + 2 digits + null */

                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    /* whatever we ended up with still has to be usable for output */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2249
2250
2251 /*
2252  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2253  * needed the return value will point to orig. If a conversion is done,
2254  * the return string should be freed by the caller.
2255  * If not possible, returns NULL.
2256  */
2257 char *
2258 utf8_to_charset(char *orig, char *charset, int report_err)
2259 {
2260     SIZEDTEXT src, dst;
2261     char *ret = orig;
2262
2263     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2264       return ret;
2265
2266     src.size = strlen(orig);
2267     src.data = (unsigned char *) orig;
2268
2269     if(!strucmp(charset, "us-ascii")){
2270         size_t i;
2271
2272         for(i = 0; i < src.size; i++)
2273           if(src.data[i] & 0x80)
2274             return NULL;
2275
2276         return ret;
2277     }
2278
2279     /*
2280      * This works for ISO-2022-JP because of special code in utf8_cstext
2281      * but not for other 2022 charsets.
2282      */
2283     memset(&dst, 0, sizeof(dst));
2284     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2285       ret = (char *) dst.data;          /* c-client already null terminates it */
2286     else
2287       ret = NULL;
2288
2289     if((unsigned char *) ret != dst.data && dst.data)
2290       fs_give((void **) &dst.data);
2291
2292     return ret;
2293 }
2294
2295
/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas
 *         between each group of three digits. A negative number gets
 *         a single leading minus sign.
 * Can use up to 3 comatose results at once.
 */
char *
comatose(long int number)
{
    static char buf[3][50];
    static int  whichbuf = 0;
    char        digits[32];
    char       *out, *limit;
    const char *in;
    size_t      left;

    whichbuf = (whichbuf + 1) % 3;
    out   = buf[whichbuf];
    limit = out + sizeof(buf[0]) - 1;           /* leave room for the null */

    /*
     * Let snprintf produce the digits and then copy them across,
     * adding commas. Working from the digit string means that the
     * sign and any size of long are taken care of.
     */
    snprintf(digits, sizeof(digits), "%ld", number);
    in = digits;
    if(*in == '-')
      *out++ = *in++;

    for(left = strlen(in); left > 0 && out < limit; ){
        *out++ = *in++;

        /* comma whenever a multiple of three digits is still to come */
        if(--left > 0 && left % 3 == 0 && out < limit)
          *out++ = ',';
    }

    *out = '\0';

    return(buf[whichbuf]);
}
2340
2341
/*
 *      Turn a number into a string, leaving out the commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number.
 * Three buffers are used in turn, so up to 3 results can be in use
 * at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    slot = (slot + 1) % 3;
    out  = ring[slot];

    snprintf(out, sizeof(ring[slot]), "%ld", number);

    return(out);
}
2355
2356
2357 /*
2358  * line_paint - where the real work of managing what is displayed gets done.
2359  */
2360 void
2361 line_paint(int offset,                  /* current dot offset into vl */
2362            struct display_line *displ,
2363            int *passwd)                 /* flag to hide display of chars */
2364 {
2365     int i, w, w2, already_got_one = 0;
2366     int vfirst, vlast, dfirst, dlast, vi, di;
2367     int new_vbase;
2368     unsigned (*width_a_to_b)(UCS *, int, int);
2369
2370     /*
2371      * Set passwd to 10 in caller if you want to conceal the
2372      * password but not print asterisks for feedback.
2373      *
2374      * Set passwd to 1 in caller to conceal by printing asterisks.
2375      */
2376     if(passwd && *passwd >= 10){        /* don't show asterisks */
2377         if(*passwd > 10)
2378           return;
2379         else
2380           *passwd = 11;         /* only blat once */
2381
2382         i = 0;
2383         (*displ->movecursor)(displ->row, displ->col);
2384         while(i++ <= displ->dwid)
2385           (*displ->writechar)(' ');
2386
2387         (*displ->movecursor)(displ->row, displ->col);
2388         return;
2389     }
2390
2391     if(passwd && *passwd)
2392       width_a_to_b = single_width_chars_a_to_b;
2393     else
2394       width_a_to_b = ucs4_str_width_a_to_b;
2395
2396     /*
2397      * vl is the virtual line (the actual data). We operate on it by typing
2398      * characters to be added and deleting and so forth. In this routine we
2399      * copy a subset of those UCS-4 characters in vl into dl, the display
2400      * array, and show that subset on the screen.
2401      *
2402      * Offset is the location of the cursor in vl.
2403      *
2404      * We will display the string starting from vbase.
2405      * We have dwid screen cells to work in.
2406      * We may have to adjust vbase in order to display the
2407      * part of the string that contains the cursor.
2408      *
2409      * We'll make the display look like
2410      *   vl    a b c d e f g h i j k l m
2411      *             xxxxxxxxxxxxx  <- width dwid window
2412      *             < d e f g h >
2413      *               |
2414      *             vbase
2415      * The < will be there if vbase > 0.
2416      * The > will be there if the string from vbase to the
2417      * end can't all fit in the window.
2418      */
2419
2420     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2421
2422     /*
2423      * Adjust vbase so offset is not out of the window to the right.
2424      * (The +2 in w + 2 is for a possible " >" if the string goes past
2425      *  the right hand edge of the window and if the last visible character
2426      * is double wide. We don't want the offset to be under that > character.)
2427      */
2428     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2429         displ->dwid > 1 &&
2430         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2431         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2432         /*
2433          * offset is off the window to the right
2434          * It looks like   a b c d e f g h
2435          *                   |         |
2436          *               vbase         offset
2437          * and offset is either past the right edge,
2438          * or right at the right edge (and maybe under >),
2439          * or one before right at the edge (and maybe on space
2440          * for half a character).
2441          *
2442          * Since the characters may be double width it is slightly
2443          * complicated to figure out how far to increase vbase.
2444          * We're going to scoot over past width w/2 characters and
2445          * then see if that's sufficient.
2446          */
2447         new_vbase = displ->vbase + 1;
2448         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2449             w2 < displ->dwid/2;
2450             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2451           new_vbase++;
2452
2453         displ->vbase = new_vbase;
2454     }
2455
2456     /* adjust so offset is not out of the window to the left */
2457     while(displ->vbase > 0 && displ->vbase >= offset){
2458         /* add about dwid/2 more width */
2459         new_vbase = displ->vbase - 1;
2460         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2461             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2462             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2463           new_vbase--;
2464
2465         /* but don't let it get too small, recheck off right end */
2466         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2467             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2468             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2469           new_vbase++;
2470
2471         displ->vbase = MAX(new_vbase, 0);
2472     }
2473
2474     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2475       displ->vbase = 0;
2476
2477     vfirst = displ->vbase;
2478     dfirst = 0;
2479     if(displ->vbase > 0){                       /* off screen cue left */
2480         dfirst = 1;                             /* index which matches vfirst */
2481         displ->dl[0] = '<';
2482     }
2483
2484     vlast = displ->vused-1;                     /* end */
2485     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2486
2487     if(displ->dwid > 0 && w + dfirst > displ->dwid){                    /* off window right */
2488
2489         /* find last ucs character to be printed */
2490         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2491           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2492
2493         /* worry about double-width characters */
2494         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2495             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2496             displ->dl[dlast] = '>';
2497         }
2498         else{
2499             dlast = dfirst + vlast - vfirst + 1;
2500             displ->dl[dlast++] = ' ';
2501             displ->dl[dlast] = '>';
2502         }
2503     }
2504     else
2505       dlast = dfirst + vlast - vfirst;
2506
2507     /*
2508      * Copy the relevant part of the virtual line into the display line.
2509      */
2510     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2511       if(passwd && *passwd)
2512         displ->dl[di] = '*';            /* to conceal password */
2513       else
2514         displ->dl[di] = displ->vl[vi];
2515
2516     /*
2517      * Add spaces to clear the rest of the line.
2518      * We have dwid total space to fill.
2519      */
2520     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2521     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2522       displ->dl[di++] = ' ';
2523
2524     /*
2525      * Draw from left to right, skipping until we get to
2526      * something that is different. Characters may be different
2527      * widths than they were initially so paint from there the
2528      * rest of the way.
2529      */
2530     for(di = 0; displ->dl[di]; di++){
2531         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2532             /* move cursor first time */
2533             if(!already_got_one++){
2534                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2535                 (*displ->movecursor)(displ->row, displ->col + w);
2536             }
2537
2538             (*displ->writechar)(displ->dl[di]);
2539             displ->olddl[di] = displ->dl[di];
2540         }
2541     }
2542
2543     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2544
2545     /*
2546      * Move the cursor to the offset.
2547      *
2548      * The offset is relative to the start of the virtual array. We need
2549      * to find the location on the screen. The offset into the display array
2550      * will be offset-vbase+dfirst. We want to be at the start of that
2551      * character, so we need to find the width of all the characters up
2552      * to that point.
2553      */
2554     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2555
2556     (*displ->movecursor)(displ->row, displ->col + w);
2557 }
2558
2559
2560 /*
2561  * This is just like ucs4_str_width_a_to_b() except all of the characters
2562  * are assumed to be of width 1. This is for printing out *'s when user
2563  * enters a password, while still managing to use the same code to do the
2564  * display.
2565  */
2566 unsigned
2567 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2568 {
2569     unsigned width = 0;
2570     int i;
2571
2572     if(ucsstr)
2573       for(i = a; i <= b && ucsstr[i]; i++)
2574         width++;
2575
2576     return width;
2577 }