pith/charconv/utf8.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2013-2018 Eduardo Chappa
   8  * Copyright 2006-2008 University of Washington
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19
  20 /* includable WITHOUT dependency on c-client */
  21 #include "../../c-client/mail.h"
  22 #include "../../c-client/utf8.h"
  23
  24 #ifdef _WINDOWS
  25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  26 #undef ERROR
  27 #else
  28 #define _XOPEN_SOURCE
  29 #endif
  30
  31 #include <system.h>
  32
  33 #include "../../c-client/fs.h"
  34
  35 /* includable WITHOUT dependency on pico */
  36 #include "../../pico/keydefs.h"
  37
  38 #include "../osdep/collate.h"
  39 #include "../filttype.h"
  40
  41 #include "utf8.h"
  42
  43 #include <stdarg.h>
  44
  45
  46 unsigned single_width_chars_a_to_b(UCS *, int, int);
  47
  48
/*
 * Character map name recorded by set_locale_charmap(). It is kept
 * NUL-terminated and is the empty string when no charmap has been set.
 * convert_to_utf8() tries it as one of its candidate source charsets.
 */
static char locale_charmap[50];

/*
 * Display conversion state, both set by init_utf8_display().
 * native_utf8 nonzero means the display is UTF-8 capable. Otherwise, a
 * non-NULL display_data is handed (cast to unsigned short *) to the
 * c-client ucs4_rmap routines in wtomb().
 */
static int   native_utf8;
static void *display_data;
  53
  54 void
  55 init_utf8_display(int utf8, void *rmap)
  56 {
  57     native_utf8 = utf8;
  58     display_data = rmap;
  59 }
  60
  61
  62 /*
  63  * Argument is a UCS-4 wide character.
  64  * Returns the environment dependent cell width of the
  65  * character when printed to the screen.
  66  * This will be -1 if the character is not printable.
  67  * It will be >= zero if it is printable.
  68  *
  69  * Note that in the case it is not printable but it is still sent to
  70  * Writechar, Writechar will print a '?' with width 1.
  71  */
  72 int
  73 wcellwidth(UCS ucs)
  74 {
  75     char dummy[32];
  76     long w;
  77
  78     /*
  79      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  80      * That's the assumption here.
  81      */
  82
  83     if(native_utf8){                    /* display is UTF-8 capable */
  84         w = ucs4_width((unsigned long) ucs);
  85         return((w & U4W_ERROR) ? -1 : w);
  86     }
  87     else if(display_data){
  88         if(wtomb(dummy, ucs) < 0)
  89           return(-1);
  90         else{
  91             w = ucs4_width((unsigned long) ucs);
  92             return((w & U4W_ERROR) ? -1 : w);
  93         }
  94     }
  95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
  96     else
  97       return(wcwidth((wchar_t) ucs));
  98 #else
  99     return(0);
 100 #endif
 101 }
 102
 103 /* ambiguous width zone character function. We use the Windows code until
 104  * we find a better way to do it in general.
 105  */
 106 int
 107 pith_ucs4width(UCS ucs)
 108 {
 109   return (ucs >= 0x2100) ? 2 : 1;
 110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
 111   return wcwidth((wchar_t) ucs);
 112 #else
 113   return (ucs >= 0x2100) ? 2 : 1;
 114 #endif /* _WINDOWS */
 115 }
 116
 117 /*
 118  * Argument is a UCS-4 wide character.
 119  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 120  * Dest is a buffer at least xx chars wide where the multi-byte version
 121  * of the wide character will be written.
 122  * The returned value is the number of bytes written to dest or -1
 123  * if the conversion can't be done.
 124  */
 125 int
 126 wtomb(char *dest, UCS ucs)
 127 {
 128     /*
 129      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 130      * That's the assumption here.
 131      */
 132
 133     if(native_utf8){
 134         unsigned char *newdptr;
 135
 136         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 137         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 138     }
 139     else if(display_data){
 140         unsigned long ucs4;
 141         int           ret;
 142
 143         ucs4 = (unsigned long) ucs;
 144         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 145         if(ret >= 0)
 146           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 147         else
 148           ret = -1;
 149
 150         return(ret);
 151     }
 152     else
 153       return(wcrtomb(dest, (wchar_t) ucs, NULL));
 154 }
 155
 156
 157 /*
 158  * This function does not necessarily update inputp and remaining_octets, so
 159  * don't rely on that. The c-client version does but the other doesn't.
 160  */
 161 UCS
 162 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 163 {
 164     UCS ucs;
 165
 166     if(input_cs){
 167         CHARSET *cast_input_cs;
 168
 169         cast_input_cs = (CHARSET *) input_cs;
 170
 171         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 172           case U8G_ENDSTRG:
 173           case U8G_ENDSTRI:
 174             return(CCONV_NEEDMORE);
 175
 176           default:
 177             if(ucs & U8G_ERROR || ucs == UBOGON)
 178               return(CCONV_BADCHAR);
 179
 180             return(ucs);
 181         }
 182     }
 183     else{
 184         size_t ret;
 185         wchar_t w;
 186
 187         /*
 188          * Warning:  input_cs and remaining_octets are unused in this
 189          * half of the if/else.
 190          *
 191          * Unfortunately, we can't tell the difference between a source string
 192          * that is just not long enough and one that has characters that can't
 193          * be converted even though it is long enough. We return NEEDMORE in both cases.
 194          */
 195         ret = mbstowcs(&w, (char *) (*inputp), 1);
 196         if(ret == (size_t)(-1))
 197           return(CCONV_NEEDMORE);
 198         else{
 199           ucs = (UCS) w;
 200           return(ucs);
 201         }
 202     }
 203 }
 204
 205
 206 void
 207 set_locale_charmap(char *charmap)
 208 {
 209     if(charmap){
 210         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 211         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 212     }
 213     else
 214       locale_charmap[0] = '\0';
 215 }
 216
 217
 218 /*
 219  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 220  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 221  * The caller is responsible for freeing the returned value.
 222  *
 223  * Args  str     -- the string to convert
 224  */
 225 char *
 226 convert_to_utf8(char *str, char *fromcharset, int flags)
 227 {
 228     char          *ret = NULL;
 229     char          *fcharset;
 230     SIZEDTEXT      src, result;
 231     const CHARSET *cs;
 232     int            try;
 233
 234     src.data = (unsigned char *) str;
 235     src.size = strlen(str);
 236
 237     /* already UTF-8, return NULL */
 238     if(!(flags & CU8_NOINFER)
 239        && (cs = utf8_infercharset(&src))
 240        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 241       return(ret);
 242
 243     try = 1;
 244     while(try < 5){
 245         switch(try){
 246           case 1:
 247             fcharset = fromcharset;
 248             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 249               break;    /* give it a try */
 250             else
 251               try++;    /* fall through */
 252
 253           case 2:
 254             if(!(flags & CU8_NOINFER)){
 255                 fcharset = cs ? cs->name : NULL;
 256                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 257                   break;
 258                 else
 259                   try++;        /* fall through */
 260             }
 261             else
 262               try++;    /* fall through */
 263
 264           case 3:
 265             fcharset = locale_charmap;
 266             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 267               break;
 268             else
 269               try++;    /* fall through */
 270
 271           default:
 272             fcharset = "ISO-8859-1";            /* this will "work" */
 273             break;
 274         }
 275
 276         memset(&result, 0, sizeof(result));
 277
 278         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 279             if(!(result.size == src.size && result.data == src.data)){
 280                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 281                 strncpy(ret, (char *) result.data, result.size);
 282                 ret[result.size] = '\0';
 283             }
 284             /* else no conversion necessary */
 285
 286             if(result.data)
 287               fs_give((void **) &result.data);
 288             result.size = 0;
 289
 290             return(ret);
 291         }
 292
 293         try++;
 294     }
 295
 296     /* won't make it to here */
 297     return(ret);
 298 }
 299
 300
 301 /*
 302  * Convert from UTF-8 to user's locale charset.
 303  * This actually uses the wtomb routine to do the conversion, and that
 304  * relies on setup_for_input_output having been called.
 305  * If no conversion is necessary, NULL is returned, otherwise an allocated
 306  * string in the locale charset is returned and the caller is responsible
 307  * for freeing it.
 308  */
 309 char *
 310 convert_to_locale(char *utf8str)
 311 {
 312 #define CHNK 500
 313     char *inp, *retp, *ret = NULL;
 314     CBUF_S cb;
 315     int r, alloced;
 316
 317     if(native_utf8 || !utf8str || !utf8str[0])
 318       return(NULL);
 319
 320     cb.cbuf[0] = '\0';
 321     cb.cbufp = cb.cbufend = cb.cbuf;
 322     inp = utf8str;
 323
 324     alloced = CHNK;
 325     ret = (char *) fs_get(alloced * sizeof(char));
 326     retp = ret;
 327
 328     /*
 329      * There's gotta be a better way to do this but utf8_to_locale was
 330      * available and everything looks like a nail when all you have
 331      * is a hammer.
 332      */
 333     while(*inp){
 334         /*
 335          * We're placing the outgoing stream of characters in ret, a multi-byte
 336          * array of characters in the user's locale charset. See if there is
 337          * enough room for the next wide characters worth of output chars
 338          * and allocate more space if not.
 339          */
 340         if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
 341             alloced += CHNK;
 342             fs_resize((void **) &ret, alloced * sizeof(char));
 343         }
 344
 345         r = utf8_to_locale((int) *inp++, &cb,
 346                            (unsigned char *) retp, alloced-(retp-ret));
 347
 348         retp += r;
 349     }
 350
 351     *retp = '\0';
 352
 353     fs_resize((void **) &ret, strlen(ret)+1);
 354
 355     return(ret);
 356 }
 357
 358
 359 /*
 360  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 361  * filled in with multi-byte characters. The return value is the
 362  * number of valid characters in obuf to be used.
 363  */
 364 int
 365 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 366 {
 367     int outchars = 0;
 368
 369     if(!(cb && cb->cbufp))
 370       return(0);
 371
 372     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 373         unsigned char *inputp;
 374         unsigned long remaining_octets;
 375         UCS ucs;
 376
 377         *(cb->cbufp)++ = (unsigned char) c;
 378         inputp = cb->cbuf;
 379         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 380         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 381
 382         switch(ucs){
 383           case U8G_ENDSTRG:     /* incomplete character, wait */
 384           case U8G_ENDSTRI:     /* incomplete character, wait */
 385             break;
 386
 387           default:
 388             if(ucs & U8G_ERROR || ucs == UBOGON){
 389                 /*
 390                  * None of these cases is supposed to happen. If it
 391                  * does happen then the input stream isn't UTF-8
 392                  * so something is wrong. Treat each character in the
 393                  * input buffer as a separate error character and
 394                  * print a '?' for each.
 395                  */
 396                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 397                   obuf[outchars++] = '?';
 398
 399                 cb->cbufp = cb->cbuf;
 400             }
 401             else{
 402                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 403                     /*
 404                      * This happens when we have a UTF-8 character that
 405                      * we aren't able to print in our locale. For example,
 406                      * if the locale is setup with the terminal
 407                      * expecting ISO-8859-1 characters then there are
 408                      * lots of UTF-8 characters that can't be printed.
 409                      * Print a '?' instead.
 410                      */
 411                     obuf[outchars++] = '?';
 412                 }
 413                 else{
 414                     /*
 415                      * Convert the ucs into the multibyte
 416                      * character that corresponds to the
 417                      * ucs in the users locale.
 418                      */
 419                     outchars = wtomb((char *) obuf, ucs);
 420                     if(outchars < 0){
 421                         obuf[0] = '?';
 422                         outchars = 1;
 423                     }
 424                 }
 425
 426                 /* update the input buffer */
 427                 if(inputp >= cb->cbufp) /* this should be the case */
 428                   cb->cbufp = cb->cbuf;
 429                 else{           /* extra chars for some reason? */
 430                     unsigned char *q, *newcbufp;
 431
 432                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 433                     q = cb->cbuf;
 434                     while(inputp < cb->cbufp)
 435                       *q++ = *inputp++;
 436
 437                     cb->cbufp = newcbufp;
 438                 }
 439             }
 440
 441             break;
 442         }
 443     }
 444     else{                       /* error */
 445         obuf[0] = '?';
 446         outchars = 1;
 447         cb->cbufp = cb->cbuf;   /* start over */
 448     }
 449
 450     return(outchars);
 451 }
 452
 453
 454 /*
 455  * Returns the screen cells width of the UCS-4 string argument.
 456  * The source string is zero terminated.
 457  */
 458 unsigned
 459 ucs4_str_width(UCS *ucsstr)
 460 {
 461     unsigned width = 0;
 462     int w;
 463
 464     if(ucsstr)
 465       while(*ucsstr){
 466         w = wcellwidth(*ucsstr++);
 467         if(w != U4W_CTLSRGT)
 468           width += (w < 0 ? 1 : w);
 469       }
 470
 471     return width;
 472 }
 473
 474
 475 /*
 476  * Returns the screen cells width of the UCS-4 string argument
 477  * from ucsstr[a] through (inclusive) ucsstr[b].
 478  * No checking is done to make sure a starts in the middle
 479  * of a UCS-4 array.
 480  */
 481 unsigned
 482 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 483 {
 484     unsigned width = 0;
 485     int i, w;
 486
 487     if(ucsstr)
 488       for(i = a; i <= b && ucsstr[i]; i++){
 489         w = wcellwidth(ucsstr[i]);
 490         if(w != U4W_CTLSRGT)
 491           width += (w < 0 ? 1 : w);
 492       }
 493
 494     return width;
 495 }
 496
 497
 498 /*
 499  * Returns the screen cells width of the UCS-4 string argument
 500  * from ustart through (exclusive) uend.
 501  * No checking is done to make sure it starts in the middle
 502  * of a UCS-4 array.
 503  */
 504 unsigned
 505 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 506 {
 507     UCS *u;
 508     unsigned width = 0;
 509     int w;
 510
 511     if(!ustart)
 512       return width;
 513
 514     if(ustart)
 515       for(u = ustart; u < uend; u++){
 516         w = wcellwidth(*u);
 517         if(w != U4W_CTLSRGT)
 518           width += (w < 0 ? 1 : w);
 519       }
 520
 521     return(width);
 522 }
 523
 524
 525 /*
 526  * Return the largest possible pointer into ucs4str so that the width
 527  * of the string from ucs4str to the pointer (exclusive)
 528  * is maxwidth or less. Also stops at a null character.
 529  */
 530 UCS *
 531 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 532 {
 533     UCS *u;
 534     int w_consumed = 0, w, done = 0;
 535
 536     u = ucs4str;
 537
 538     if(u)
 539       while(!done && *u && w_consumed <= maxwidth){
 540         w = wcellwidth(*u);
 541         w = (w >= 0 ? w : 1);
 542         if(w_consumed + w <= maxwidth){
 543             w_consumed += w;
 544             ++u;
 545         }
 546         else
 547           ++done;
 548       }
 549
 550     return(u);
 551 }
 552
 553
 554 /*
 555  * Convert and copy a UTF-8 string into a UCS-4 NULL
 556  * terminated array. Just like cpystr only it converts
 557  * from UTF-8 to UCS-4.
 558  *
 559  * Returned UCS-4 string needs to be freed by caller.
 560  */
 561 UCS *
 562 utf8_to_ucs4_cpystr(char *utf8src)
 563 {
 564     size_t         retsize;
 565     UCS           *ret = NULL;
 566     UCS            ucs;
 567     unsigned long  remaining_octets;
 568     unsigned char *readptr;
 569     size_t         arrayindex;
 570
 571     /*
 572      * We don't know how big to allocate the return array
 573      * because variable numbers of octets in the src array
 574      * will combine to make UCS-4 characters. The number of
 575      * UCS-4 characters is less than or equal to the number
 576      * of src characters, though.
 577      */
 578
 579     if(!utf8src)
 580       return NULL;
 581
 582     retsize = strlen(utf8src) + 1;
 583
 584     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 585     memset(ret, 0, retsize * sizeof(*ret));
 586
 587     readptr = (unsigned char *) utf8src;
 588     remaining_octets = retsize-1;
 589     arrayindex = 0;
 590
 591     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 592         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 593
 594         if(ucs & U8G_ERROR || ucs == UBOGON)
 595           remaining_octets = 0;
 596         else
 597           ret[arrayindex++] = ucs;
 598     }
 599
 600     ret[arrayindex] = '\0';
 601
 602     /* get rid of excess size */
 603     if(arrayindex+1 < retsize)
 604       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 605
 606     return ret;
 607 }
 608
 609
 610 /*
 611  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 612  * terminated string. Just like cpystr only it converts
 613  * from UCS-4 to UTF-8.
 614  *
 615  * Returned UTF-8 string needs to be freed by caller.
 616  */
 617 char *
 618 ucs4_to_utf8_cpystr(UCS *ucs4src)
 619 {
 620     unsigned char *ret = NULL;
 621     unsigned char *writeptr;
 622     int            i;
 623
 624     if(!ucs4src)
 625       return NULL;
 626
 627     /*
 628      * Over-allocate and then resize at the end.
 629      */
 630
 631     /* count characters in source */
 632     for(i = 0; ucs4src[i]; i++)
 633       ;
 634
 635     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 636     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 637
 638     writeptr = ret;
 639     for(i = 0; ucs4src[i]; i++)
 640       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 641
 642     /* get rid of excess size */
 643     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 644
 645     return ((char *) ret);
 646 }
 647
 648
 649 /*
 650  * Similar to above but copy a fixed number of source
 651  * characters instead of going until null terminator.
 652  */
 653 char *
 654 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 655 {
 656     unsigned char *ret = NULL;
 657     unsigned char *writeptr;
 658     int            i;
 659
 660     if(!ucs4src)
 661       return NULL;
 662
 663     /*
 664      * Over-allocate and then resize at the end.
 665      */
 666
 667     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 668     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 669
 670     writeptr = ret;
 671     for(i = 0; i < ucs4src_len; i++)
 672       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 673
 674     /* get rid of excess size */
 675     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 676
 677     return ((char *) ret);
 678 }
 679
 680
 681 #ifdef _WINDOWS
 682 /*
 683  * Convert a UTF-8 argument into an LPTSTR version
 684  * of that argument. The result is allocated here
 685  * and should be freed by the caller.
 686  */
 687 LPTSTR
 688 utf8_to_lptstr(LPSTR arg_utf8)
 689 {
 690      int lptstr_len;
 691      LPTSTR lptstr_ret = NULL;
 692
 693      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 694      if(lptstr_len > 0)
 695      {
 696          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 697          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 698              arg_utf8, -1, lptstr_ret, lptstr_len );
 699      }
 700
 701      if(!lptstr_len)
 702      {
 703          /* check GetLastError()? */
 704          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 705          lptstr_ret[0] = 0;
 706      }
 707
 708      return lptstr_ret;
 709 }
 710
 711
 712 /*
 713  * Convert an LPTSTR argument into a UTF-8 version
 714  * of that argument. The result is allocated here
 715  * and should be freed by the caller.
 716  */
 717 LPSTR
 718 lptstr_to_utf8(LPTSTR arg_lptstr)
 719 {
 720      int utf8str_len;
 721      LPSTR utf8str_ret = NULL;
 722
 723      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 724      if(utf8str_len > 0)
 725      {
 726          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 727          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 728              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 729      }
 730
 731      if(!utf8str_len)
 732      {
 733          /* check GetLastError()? */
 734          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 735          utf8str_ret[0] = 0;
 736      }
 737
 738      return utf8str_ret;
 739 }
 740
 741
 742 /*
 743  * Convert a UCS4 argument into an LPTSTR version
 744  * of that argument. The result is allocated here
 745  * and should be freed by the caller.
 746  */
 747 LPTSTR
 748 ucs4_to_lptstr(UCS *arg_ucs4)
 749 {
 750     LPTSTR ret_lptstr = NULL;
 751     size_t len;
 752     size_t i;
 753
 754     if(arg_ucs4){
 755         len = ucs4_strlen(arg_ucs4);
 756         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 757         /* bogus conversion ignores UTF-16 */
 758         for(i = 0; i < len; i++)
 759           ret_lptstr[i] = arg_ucs4[i];
 760
 761         ret_lptstr[len] = '\0';
 762     }
 763
 764     return(ret_lptstr);
 765 }
 766
 767
 768 /*
 769  * Convert an LPTSTR argument into a UCS4 version
 770  * of that argument. The result is MemAlloc'd here
 771  * and should be freed by the caller.
 772  */
 773 UCS *
 774 lptstr_to_ucs4(LPTSTR arg_lptstr)
 775 {
 776     UCS *ret_ucs4 = NULL;
 777     size_t len;
 778     size_t i;
 779
 780     if(arg_lptstr){
 781         len = _tcslen(arg_lptstr);
 782         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 783         /* bogus conversion ignores UTF-16 */
 784         for(i = 0; i < len; i++)
 785           ret_ucs4[i] = arg_lptstr[i];
 786
 787         ret_ucs4[len] = '\0';
 788     }
 789
 790     return(ret_ucs4);
 791 }
 792
 793 #endif /* _WINDOWS */
 794
 795
 796 /*
 797  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 798  * 1-at-a-time filled in with UCS characters. The return value is the
 799  * number of valid characters in obuf to be used. It can only
 800  * be 1 or 0 characters since we're only getting one UTF-8 character
 801  * at a time.
 802  */
 803 int
 804 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 805 {
 806     int  width = 0, outchars = 0;
 807
 808     if(!(cb && cb->cbufp))
 809       return(0);
 810
 811     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 812         unsigned char *inputp;
 813         unsigned long remaining_octets;
 814         UCS ucs;
 815
 816         *cb->cbufp++ = (unsigned char) c;
 817         inputp = cb->cbuf;
 818         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 819         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 820
 821         switch(ucs){
 822           case U8G_ENDSTRG:     /* incomplete character, wait */
 823           case U8G_ENDSTRI:     /* incomplete character, wait */
 824             break;
 825
 826           default:
 827             if(ucs & U8G_ERROR || ucs == UBOGON){
 828                 /*
 829                  * None of these cases is supposed to happen. If it
 830                  * does happen then the input stream isn't UTF-8
 831                  * so something is wrong.
 832                  */
 833                 outchars++;
 834                 *obuf = '?';
 835                 cb->cbufp = cb->cbuf;
 836                 width = 1;
 837             }
 838             else{
 839                 outchars++;
 840                 if(ucs < 0x80 && ucs >= 0x20)
 841                   width = 1;
 842
 843                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 844                     /*
 845                      * This happens when we have a UTF-8 character that
 846                      * we aren't able to print in our locale. For example,
 847                      * if the locale is setup with the terminal
 848                      * expecting ISO-8859-1 characters then there are
 849                      * lots of UTF-8 characters that can't be printed.
 850                      * Print a '?' instead.
 851                      * Don't think this should happen in Windows.
 852                      */
 853                     *obuf = '?';
 854                 }
 855                 else{
 856                     *obuf = ucs;
 857                 }
 858
 859                 /* update the input buffer */
 860                 if(inputp >= cb->cbufp) /* this should be the case */
 861                   cb->cbufp = cb->cbuf;
 862                 else{           /* extra chars for some reason? */
 863                     unsigned char *q, *newcbufp;
 864
 865                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 866                     q = cb->cbuf;
 867                     while(inputp < cb->cbufp)
 868                       *q++ = *inputp++;
 869
 870                     cb->cbufp = newcbufp;
 871                 }
 872             }
 873
 874             break;
 875         }
 876     }
 877     else{                       /* error */
 878         *obuf = '?';
 879         outchars = 1;
 880         width = 1;
 881         cb->cbufp = cb->cbuf;   /* start over */
 882     }
 883
 884     if(obufwidth)
 885       *obufwidth = width;
 886
 887     return(outchars);
 888 }
 889
 890
 891 /*
 892  * Return an allocated copy of a zero-terminated UCS-4 string.
 893  */
 894 UCS *
 895 ucs4_cpystr(UCS *ucs4src)
 896 {
 897     size_t         arraysize;
 898     UCS           *ret = NULL;
 899     size_t         i;
 900
 901     if(!ucs4src)
 902       return NULL;
 903
 904     arraysize = ucs4_strlen(ucs4src);
 905
 906     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 907     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 908
 909     for(i = 0; i < arraysize; i++)
 910       ret[i] = ucs4src[i];
 911
 912     return ret;
 913 }
 914
 915
 916 UCS *
 917 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 918 {
 919     size_t i;
 920
 921     if(ucs4src && ucs4dst){
 922         for(i = 0; i < n; i++){
 923             ucs4dst[i] = ucs4src[i];
 924             if(ucs4dst[i] == '\0')
 925               break;
 926         }
 927     }
 928
 929     return ucs4dst;
 930 }
 931
 932
 933 UCS *
 934 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 935 {
 936     size_t i;
 937     UCS *u;
 938
 939     if(ucs4src && ucs4dst){
 940         for(u = ucs4dst; *u; u++)
 941           ;
 942
 943         for(i = 0; i < n; i++){
 944             u[i] = ucs4src[i];
 945             if(u[i] == '\0')
 946               break;
 947         }
 948
 949         if(i == n)
 950           u[i] = '\0';
 951     }
 952
 953     return ucs4dst;
 954 }
 955
 956
 957 /*
 958  * Like strlen only this returns the number of non-zero characters
 959  * in a zero-terminated UCS-4 array.
 960  */
 961 size_t
 962 ucs4_strlen(UCS *ucs4str)
 963 {
 964     size_t i = 0;
 965
 966     if(ucs4str)
 967       while(ucs4str[i])
 968         i++;
 969
 970     return(i);
 971 }
 972
 973
 974 int
 975 ucs4_strcmp(UCS *s1, UCS *s2)
 976 {
 977     for(; *s1 == *s2; s1++, s2++)
 978       if(*s1 == '\0')
 979         return 0;
 980
 981     return((*s1 < *s2) ? -1 : 1);
 982 }
 983
 984
 985 UCS *
 986 ucs4_strchr(UCS *s, UCS c)
 987 {
 988     if(!s)
 989       return NULL;
 990
 991     while(*s && *s != c)
 992       s++;
 993
 994     if(*s || !c)
 995       return s;
 996     else
 997       return NULL;
 998 }
 999
1000
1001 UCS *
1002 ucs4_strrchr(UCS *s, UCS c)
1003 {
1004     UCS *ret = NULL;
1005
1006     if(!s)
1007       return ret;
1008
1009     while(*s){
1010         if(*s == c)
1011           ret = s;
1012
1013         s++;
1014     }
1015
1016     return ret;
1017 }
1018
1019
1020 /*
1021  * Returns the screen cells width of the UTF-8 string argument.
1022  */
1023 unsigned
1024 utf8_width(char *str)
1025 {
1026     unsigned width = 0;
1027     int this_width;
1028     UCS ucs;
1029     unsigned long remaining_octets;
1030     char *readptr;
1031
1032     if(!(str && *str))
1033       return(width);
1034
1035     readptr = str;
1036     remaining_octets = readptr ? strlen(readptr) : 0;
1037
1038     while(remaining_octets > 0 && *readptr){
1039
1040         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1041
1042         if(ucs & U8G_ERROR || ucs == UBOGON){
1043             /*
1044              * This should not happen, but do something to handle it anyway.
1045              * Treat each character as a single width character, which is what should
1046              * probably happen when we actually go to write it out.
1047              */
1048             remaining_octets--;
1049             readptr++;
1050             this_width = 1;
1051         }
1052         else{
1053             this_width = wcellwidth(ucs);
1054
1055             /*
1056              * If this_width is -1 that means we can't print this character
1057              * with our current locale. Writechar will print a '?'.
1058              */
1059             if(this_width < 0)
1060               this_width = 1;
1061         }
1062
1063         width += (unsigned) this_width;
1064     }
1065
1066     return(width);
1067 }
1068
1069
1070 /*
1071  * Copy UTF-8 characters from src into dst.
1072  * This is intended to be used if you want to truncate a string at
1073  * the start instead of the end. For example, you have a long string
1074  * like
1075  *       this_is_a_long_string
1076  * but not enough space to fit it into a particular field. You want to
1077  * end up with
1078  *             s_a_long_string
1079  * where that fits in a particular width. Perhaps you'd use this with ...
1080  * to get
1081  *          ...s_a_long_string
1082  * This right adjusts the end of the string in the width space and
1083  * cuts it off at the start. If there is enough width for the whole
1084  * string it will copy the string into dst with no padding.
1085  *
1086  * Copy enough characters so that the result will have screen width of
1087  * want_width screen cells in current locale.
1088  *
1089  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1090  *   to dst. This is just for protection, it shouldn't be relied on to
1091  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1092  *   characters truncated in the middle or something like that.
1093  *
1094  * Returned value is the number of bytes written to dst, not including
1095  *   the possible terminating null.
1096  *
1097  * If we can't hit want_width exactly because of double width characters
1098  *   then we will pad the end of the string with space in order to make
1099  *   the width exact.
1100  */
1101 size_t
1102 utf8_to_width_rhs(char *dst,            /* destination buffer */
1103                   char *src,            /* source string */
1104                   size_t dstlen,        /* space in dest */
1105                   unsigned want_width)  /* desired screen width */
1106 {
1107     int this_width;
1108     unsigned width_consumed = 0;
1109     UCS ucs;
1110     unsigned long remaining_octets;
1111     char *readptr, *goodreadptr, *savereadptr, *endptr;
1112     size_t nb = 0;
1113
1114     if(!src){
1115         if(dstlen > 0)
1116           dst[0] = '\0';
1117
1118         return nb;
1119     }
1120
1121     /*
1122      * Start at the end of the source string and go backwards until we
1123      * get to the desired width, but not more than the width.
1124      */
1125     readptr = src + strlen(src);
1126     endptr = readptr;
1127     goodreadptr = readptr;
1128     width_consumed = 0;
1129     savereadptr = readptr;
1130
1131     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1132         readptr = savereadptr-1){
1133
1134         savereadptr = readptr;
1135         remaining_octets = goodreadptr - readptr;
1136         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1137
1138         /*
1139          * Handling the error case is tough because an error will be the normal thing that
1140          * happens as we back through the string. So we're just going to punt on the
1141          * error for now.
1142          */
1143         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1144             if(remaining_octets > 0){
1145                 /*
1146                  * This means there are some bad octets after this good
1147                  * character so things are not going to work out well.
1148                  * Bail out.
1149                  */
1150                 savereadptr = src;      /* we're done */
1151             }
1152             else{
1153                 this_width = wcellwidth(ucs);
1154
1155                 if(this_width < 0)
1156                   this_width = 1;
1157
1158                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1159                     width_consumed += (unsigned) this_width;
1160                     goodreadptr = savereadptr;
1161                 }
1162                 else
1163                   savereadptr = src;    /* we're done */
1164             }
1165         }
1166     }
1167
1168     /*
1169      * Copy characters from goodreadptr to endptr into dst.
1170      */
1171     nb = MIN(endptr-goodreadptr, dstlen-1);
1172     strncpy(dst, goodreadptr, nb);
1173     dst[nb] = '\0';
1174
1175     /*
1176      * Pad out with spaces in order to hit width exactly.
1177      */
1178     while(width_consumed < want_width && nb < dstlen-1){
1179         dst[nb++] = ' ';
1180         dst[nb] = '\0';
1181         width_consumed++;
1182     }
1183
1184     return nb;
1185 }
1186
1187
/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a trailing space character if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Besides %w the conversions understood are d i o x X u c s f e E g G p
 * and %%. Anything else trips an assert. The h and l length modifiers
 * are honored for the integer conversions (an l argument is fetched as
 * a long or unsigned long) and L is honored for the floating point
 * conversions (the argument is fetched as a long double). A length
 * modifier on any other conversion is ignored. A negative width or
 * precision passed through '*' is treated as if it were absent.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written.
 *
 * Caution: when literal text from fmt fills dest completely (the
 * returned value equals size) there is no room left for the trailing
 * null and none is written.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char    newfmt[100], *q, *pdest, *width_str, *end;
    char   *input_str;
    int     int_arg;
    long    long_arg;
    unsigned long ulong_arg;
    double  double_arg;
    long double ldouble_arg;
    void   *ptr_arg;
    unsigned got_width;
    int     more_flags, ret, w;
    int     min_field_width, field_precision, modifier, conv;
    int     is_int_conv, is_float_conv;
    int     flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    size_t  room;
    va_list args;

    newfmt[0] = '\0';
    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)                   \
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: It would be nice to just rewrite the %w's in fmt as
     * %s's with adjusted width and precision and hand the whole thing
     * to the system vsnprintf, but that doesn't work for %*w because
     * the integer argument the * refers to can't be modified or removed
     * from the argument list.
     *
     * So instead the result is constructed one piece at a time. Each
     * conversion specification is parsed here, its '*' arguments are
     * fetched here, and then a private format (newfmt) holding literal
     * numbers in place of the '*'s is used to convert that one argument
     * with the system snprintf.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){

        /* ordinary character, the loop test guarantees there is room */
        if(*fmt != '%'){
            *pdest++ = *fmt++;
            continue;
        }

        fmt++;                          /* skip over the '%' */

        min_field_width = field_precision = -1;
        flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
        modifier = 0;
        w = 0;

        /* flags */
        for(more_flags = 1; more_flags; ){
            switch(*fmt){
              case '-':
                flags_minus++;
                fmt++;
                break;

              case '+':
                flags_plus++;
                fmt++;
                break;

              case ' ':
                flags_space++;
                fmt++;
                break;

              case '0':
                flags_zero++;
                fmt++;
                break;

              case '#':
                flags_pound++;
                fmt++;
                break;

              default:
                more_flags = 0;
                break;
            }
        }

        /* minimum field width */
        if(*fmt == '*'){
            min_field_width = va_arg(args, int);
            fmt++;
        }
        else if(*fmt >= '0' && *fmt <= '9'){
            width_str = fmt;
            while(*fmt >= '0' && *fmt <= '9')
              fmt++;

            /* atoi stops by itself at the first non-digit */
            min_field_width = atoi(width_str);
        }

        /* field precision */
        if(*fmt == '.'){
            fmt++;
            if(*fmt == '*'){
                field_precision = va_arg(args, int);
                fmt++;
            }
            else if(*fmt >= '0' && *fmt <= '9'){
                width_str = fmt;
                while(*fmt >= '0' && *fmt <= '9')
                  fmt++;

                field_precision = atoi(width_str);
            }
        }

        /* length modifier */
        if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
          modifier = *fmt++;

        /* conversion character */
        conv = *fmt;
        if(conv == '\0')
          break;                        /* fmt ended inside a specifier */

        if(conv == 'w'){
            /*
             * work with va_arg(char *) to figure out width
             * and precision needed to produce the screen width
             * and precision asked for in %w using some of the
             * utf8 width routines we have.
             */
            input_str = va_arg(args, char *);
            if(field_precision >=0 || min_field_width >= 0)
              w = utf8_width(input_str);

            if(field_precision >= 0){
                if(w <= field_precision)
                  field_precision = -1;  /* print it all */
                else{
                    /*
                     * We need to cut off some of the input_str
                     * in this case. The byte count up to the cut
                     * becomes the precision handed to %s.
                     */
                    end = utf8_count_forw_width(input_str, field_precision, &got_width);
                    field_precision = (int) (end - input_str);
                    /* new w with this field_precision */
                    w = got_width;
                }
            }

            /*
             * need some padding: the byte width for %s is the number of
             * bytes that will be printed plus the number of cells we're short
             */
            if(min_field_width >= 0)
              min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
                                  MAX(0, min_field_width - w);
        }

        /*
         * Only keep a length modifier where we know how to fetch the
         * matching argument type. Everywhere else it is dropped.
         */
        is_int_conv = (conv == 'd' || conv == 'i' || conv == 'o'
                       || conv == 'x' || conv == 'X' || conv == 'u');
        is_float_conv = (conv == 'f' || conv == 'e' || conv == 'E'
                         || conv == 'g' || conv == 'G');
        if(!((is_int_conv && (modifier == 'h' || modifier == 'l'))
             || (is_float_conv && modifier == 'L')))
          modifier = 0;

        /*
         * Make a new format which leaves out the dynamic '*' arguments.
         * Worst case is '%', five flags, two ints of at most 11 characters
         * each, '.', modifier, conversion and null, which is far less
         * than sizeof(newfmt).
         */
        q = newfmt;
        *q++ = '%';

        if(flags_minus)
          *q++ = '-';
        if(flags_plus)
          *q++ = '+';
        if(flags_space)
          *q++ = ' ';
        if(flags_zero)
          *q++ = '0';
        if(flags_pound)
          *q++ = '#';

        if(min_field_width >= 0)
          q += snprintf(q, sizeof(newfmt)-(q-newfmt), "%d", min_field_width);

        if(field_precision >= 0)
          q += snprintf(q, sizeof(newfmt)-(q-newfmt), ".%d", field_precision);

        if(modifier)
          *q++ = (char) modifier;

        *q++ = (conv == 'w') ? 's' : (char) conv;
        *q = '\0';

        /* the loop test guarantees at least one byte of room is left */
        room = size - (pdest-dest);

        switch(conv){
          case 'w':
            snprintf(pdest, room, newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case 'd': case 'i':
            if(modifier == 'l'){
                long_arg = va_arg(args, long);
                snprintf(pdest, room, newfmt, long_arg);
            }
            else{
                int_arg = va_arg(args, int);
                snprintf(pdest, room, newfmt, int_arg);
            }

            pdest += strlen(pdest);
            break;

          case 'o': case 'x': case 'X': case 'u':
            if(modifier == 'l'){
                ulong_arg = va_arg(args, unsigned long);
                snprintf(pdest, room, newfmt, ulong_arg);
            }
            else{
                int_arg = va_arg(args, int);
                snprintf(pdest, room, newfmt, int_arg);
            }

            pdest += strlen(pdest);
            break;

          case 'c':
            int_arg = va_arg(args, int);
            snprintf(pdest, room, newfmt, int_arg);
            pdest += strlen(pdest);
            break;

          case 's':
            input_str = va_arg(args, char *);
            snprintf(pdest, room, newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case 'f': case 'e': case 'E':
          case 'g': case 'G':
            if(modifier == 'L'){
                ldouble_arg = va_arg(args, long double);
                snprintf(pdest, room, newfmt, ldouble_arg);
            }
            else{
                double_arg = va_arg(args, double);
                snprintf(pdest, room, newfmt, double_arg);
            }

            pdest += strlen(pdest);
            break;

          case 'p':
            ptr_arg = va_arg(args, void *);
            snprintf(pdest, room, newfmt, ptr_arg);
            pdest += strlen(pdest);
            break;

          case '%':
            if(IS_ROOM_IN_DEST(1))
              *pdest++ =  '%';

            break;

          default:
            /* didn't think of this type */
            assert(0);
            break;
        }

        fmt++;                          /* skip over the conversion character */
    }

    ret = pdest - dest;

    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1534
1535
1536 /*
1537  * Copy UTF-8 characters from src into dst.
1538  * Copy enough characters so that the result will have (<=) screen width of
1539  * want_width screen cells in current locale.
1540  *
1541  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1542  *   to dst.
1543  *
1544  * Returned value is the number of bytes written to dst, not including
1545  *   the possible terminating null.
1546  * Got_width is another returned value. It is the width in screen cells of
1547  *   the string placed in dst. It will be the same as want_width if there
1548  *   are enough characters in the src to do that and if the character widths
1549  *   hit the width exactly. It will be less than want_width if we run out
1550  *   of src characters or if the next character width would skip over the
1551  *   width we want, because it is double width.
1552  *
1553  * Zero width characters are collected and included at the end of the string.
1554  *   That is, if we make it to want_width but there is still a zero length
1555  *   character sitting in src, we add that to dst. This might be an accent
1556  *   or something like that.
1557  */
1558 size_t
1559 utf8_to_width(char *dst,                /* destination buffer */
1560               char *src,                /* source string */
1561               size_t dstlen,            /* space in dst */
1562               unsigned want_width,      /* desired screen width */
1563               unsigned *got_width)      /* returned screen width in dst */
1564 {
1565     int this_width;
1566     unsigned width_consumed = 0;
1567     UCS ucs;
1568     unsigned long remaining_octets;
1569     char *writeptr, *readptr, *savereadptr, *endptr;
1570     int ran_out_of_space = 0;
1571
1572     readptr = src;
1573
1574     remaining_octets = readptr ? strlen(readptr) : 0;
1575
1576     writeptr = dst;
1577     endptr = writeptr + dstlen;
1578
1579     if(readptr && writeptr){
1580       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1581         savereadptr = readptr;
1582         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1583
1584         if(ucs & U8G_ERROR || ucs == UBOGON)
1585           remaining_octets = 0;
1586         else{
1587           this_width = wcellwidth(ucs);
1588
1589           /*
1590            * If this_width is -1 that means we can't print this character
1591            * with our current locale. Writechar will print a '?'.
1592            */
1593           if(this_width < 0)
1594             this_width = 1;
1595
1596           if(width_consumed + (unsigned) this_width <= want_width){
1597             /* append this utf8 character to dst if it will fit */
1598             if(writeptr + (readptr - savereadptr) < endptr){
1599               width_consumed += this_width;
1600               while(savereadptr < readptr)
1601                 *writeptr++ = *savereadptr++;
1602             }
1603             else
1604               ran_out_of_space++;       /* no more utf8 to dst */
1605           }
1606           else
1607             remaining_octets = 0;       /* we're done */
1608         }
1609       }
1610
1611       if(writeptr < endptr)
1612         *writeptr = '\0';
1613     }
1614
1615     if(got_width)
1616       *got_width = width_consumed;
1617
1618     return(writeptr ? (writeptr - dst) : 0);
1619 }
1620
1621
1622 /*
1623  * Str is a UTF-8 string.
1624  * Count forward width screencell positions and return a pointer to the
1625  * end of the string that is width wide.
1626  * The returned pointer points at the next character (where the null would
1627  * be placed).
1628  *
1629  * Got_width is another returned value. It is the width in screen cells of
1630  *   the string from str to the returned pointer. It will be the same as
1631  *   want_width if there are enough characters in the str to do that
1632  *   and if the character widths hit the width exactly. It will be less
1633  *   than want_width if we run out of characters or if the next character
1634  *   width would skip over the width we want, because it is double width.
1635  */
1636 char *
1637 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1638 {
1639     int this_width;
1640     unsigned width_consumed = 0;
1641     UCS ucs;
1642     unsigned long remaining_octets;
1643     char *readptr;
1644     char *retptr;
1645
1646     retptr = readptr = str;
1647
1648     remaining_octets = readptr ? strlen(readptr) : 0;
1649
1650     while(width_consumed <= want_width && remaining_octets > 0){
1651
1652         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1653
1654         if(ucs & U8G_ERROR || ucs == UBOGON){
1655             /*
1656              * This should not happen, but do something to handle it anyway.
1657              * Treat each character as a single width character, which is what should
1658              * probably happen when we actually go to write it out.
1659              */
1660             remaining_octets--;
1661             readptr++;
1662             this_width = 1;
1663         }
1664         else{
1665             this_width = wcellwidth(ucs);
1666
1667             /*
1668              * If this_width is -1 that means we can't print this character
1669              * with our current locale. Writechar will print a '?'.
1670              */
1671             if(this_width < 0)
1672               this_width = 1;
1673         }
1674
1675         if(width_consumed + (unsigned) this_width <= want_width){
1676             width_consumed += (unsigned) this_width;
1677             retptr = readptr;
1678         }
1679         else
1680           remaining_octets = 0; /* we're done */
1681     }
1682
1683     if(got_width)
1684       *got_width = width_consumed;
1685
1686     return(retptr);
1687 }
1688
1689
1690 /*
1691  * Copy a null terminator into a UTF-8 string in place so that the string is
1692  * no more than a certain screen width wide. If the string is already less
1693  * than or equal in width to the requested width, no change is made.
1694  *
1695  * The actual width accomplished is returned. Note that it may be less than
1696  * max_width due to double width characters as well as due to the fact that
1697  * it fits wholly in the max_width.
1698  *
1699  * Returned value is the actual screen width of str when done.
1700  *
1701  * A side effect is that a terminating null may have been written into
1702  * the passed in string.
1703  */
1704 unsigned
1705 utf8_truncate(char *str, unsigned max_width)
1706 {
1707     int this_width;
1708     unsigned width_consumed = 0;
1709     UCS ucs;
1710     unsigned long remaining_octets;
1711     char *readptr, *savereadptr;
1712
1713     readptr = str;
1714
1715     remaining_octets = readptr ? strlen(readptr) : 0;
1716
1717     if(readptr){
1718       while(width_consumed <= max_width && remaining_octets > 0){
1719
1720         savereadptr = readptr;
1721         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1722
1723         if(ucs & U8G_ERROR || ucs == UBOGON){
1724             /*
1725              * This should not happen, but do something to handle it anyway.
1726              * Treat each character as a single width character, which is what should
1727              * probably happen when we actually go to write it out.
1728              */
1729             remaining_octets--;
1730             readptr++;
1731             this_width = 1;
1732         }
1733         else{
1734             this_width = wcellwidth(ucs);
1735
1736             /*
1737              * If this_width is -1 that means we can't print this character
1738              * with our current locale. Writechar will print a '?'.
1739              */
1740             if(this_width < 0)
1741               this_width = 1;
1742         }
1743
1744         if(width_consumed + (unsigned) this_width <= max_width){
1745             width_consumed += (unsigned) this_width;
1746         }
1747         else{
1748             remaining_octets = 0;       /* we're done */
1749             *savereadptr = '\0';
1750         }
1751       }
1752     }
1753
1754     return(width_consumed);
1755 }
1756
1757
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad
 * with spaces. The spaces go on the right (after the text) if
 * left_adjust is set and on the left (before the text) otherwise.
 * If there isn't room in dst for all of the padding, as much as will
 * fit is used.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. Dst will be null terminated if there is enough room, but not
 *   if that would overflow dst's len.
 *   If dst is NULL nothing is written and zero is returned.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,            /* destination buffer */
                  char *src,            /* source string */
                  size_t dstlen,        /* space in dst */
                  unsigned want_width,  /* desired screen width */
                  int left_adjust)      /* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   need_more, howmany;
    size_t   len_left, bytes_used;

    if(!dst)
      return(0);

    /* first the text itself, cut off at want_width if need be */
    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* each pad space is one byte and one screen cell */
    need_more = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    howmany = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string. Slide the existing
             * string over (the two regions overlap, so memmove)
             * and fill the gap with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1825
1826
1827 /*
1828  * Str is a UTF-8 string.
1829  * Start_here is a pointer into the string. It points one position past
1830  * the last byte that should be considered a part of the length string.
1831  * Count back want_width screencell positions and return a pointer to the
1832  * start of the string that is want_width wide and ends with start_here.
1833  *
1834  * Since characters may be more than one cell width wide we may end up
1835  * skipping over the exact width. That is, if we need to we'll go back
1836  * too far (by one cell width). Account for that in the call by looking
1837  * at got_width.
1838  *
1839  * Note that this call gives a possible got_width == want_width+1 as
1840  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1841  * That was just what was needed at the time, maybe it needs to be
1842  * optional.
1843  */
1844 char *
1845 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1846 {
1847     unsigned width_consumed = 0;
1848     int this_width;
1849     UCS ucs;
1850     unsigned long remaining_octets;
1851     char *ptr, *savereadptr, *goodreadptr;
1852
1853     savereadptr = start_here;
1854     goodreadptr = start_here;
1855
1856     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1857
1858         savereadptr = ptr;
1859         remaining_octets = goodreadptr - ptr;
1860         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1861
1862         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1863           if(remaining_octets > 0){
1864               /*
1865                * This means there are some bad octets after this good
1866                * character so things are not going to work out well.
1867                * Bail out.
1868                */
1869               savereadptr = str;        /* we're done */
1870           }
1871           else{
1872             this_width = wcellwidth(ucs);
1873
1874             /*
1875              * If this_width is -1 that means we can't print this character
1876              * with our current locale. Writechar will print a '?'.
1877              */
1878             if(this_width < 0)
1879               this_width = 1;
1880
1881             width_consumed += (unsigned) this_width;
1882             goodreadptr = savereadptr;
1883           }
1884         }
1885     }
1886
1887     if(got_width)
1888       *got_width = width_consumed;
1889
1890     return(savereadptr);
1891 }
1892
1893
/*----------------------------------------------------------------------
  Append the string s at *d, copying no more than n characters, and
  leave *d pointing at the end of the copied text.

  If the end of s is reached within n characters the terminating null
  is copied too and *d is left pointing at that null, ready for the
  next append. If n runs out first no null is written. Nothing at all
  happens if n is not positive.

  motivation for this is to avoid twice passing over a string that's
  being appended to twice (i.e., strcpy(t, x); t += strlen(t))

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char *out = *d;

    for(; n > 0; n--){
        *out = *s++;
        if(*out == '\0')
          break;                /* stay on the null so it gets overwritten next time */

        out++;
    }

    *d = out;
}
1909
1910
1911 /*
1912  * If use_system_routines is set then NULL is the return value and it is
1913  * not an error. Display_charmap and keyboard_charmap should come over as
1914  * malloced strings and will be filled in with the result.
1915  *
1916  * Returns a void pointer to the input_cs CHARSET which is
1917  * passed to mbtow via kbseq().
1918  * If !use_system_routines && NULL is returned, that is an error and err should
1919  * have a message.
1920  * display_charmap and keyboard_charmap should be malloced data and may be
1921  * realloced and changed here.
1922  */
1923 int
1924 setup_for_input_output(int use_system_routines, char **display_charmap,
1925                        char **keyboard_charmap, void **input_cs_arg, char **err)
1926 {
1927     const CHARSET *cs;
1928     const CHARSET *input_cs = NULL;
1929     int already_tried = 0;
1930     int supported = 0;
1931     char buf[1000];
1932
1933 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1934
1935     if(err)
1936       *err = NULL;
1937
1938     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1939         *err = cpstr("Bad call to setup_for_input_output");
1940         return(-1);
1941     }
1942
1943     if(use_system_routines){
1944 #if     PREREQ_FOR_SYS_TRANSLATION
1945         char *dcm;
1946
1947         dcm = nl_langinfo_codeset_wrapper();
1948         dcm = dcm ? dcm : "US-ASCII";
1949
1950         init_utf8_display(0, NULL);
1951         if(*display_charmap){
1952             if(dcm && strucmp(*display_charmap, dcm)){
1953                 snprintf(buf, sizeof(buf),
1954                  _("Display character set \"%s\" is ignored when using system translation"),
1955                      *display_charmap);
1956
1957                 *err = cpstr(buf);
1958             }
1959
1960             fs_give((void **) display_charmap);
1961         }
1962
1963         if(*keyboard_charmap){
1964             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1965                 snprintf(buf, sizeof(buf),
1966                  _("Keyboard character set \"%s\" is ignored when using system translation"),
1967                      *keyboard_charmap);
1968
1969                 *err = cpstr(buf);
1970             }
1971
1972             fs_give((void **) keyboard_charmap);
1973         }
1974
1975         *display_charmap = cpstr(dcm);
1976         *keyboard_charmap = cpstr(dcm);
1977 #else
1978         *err = cpstr("Bad call to setup_for_input_output");
1979 #endif
1980
1981         *input_cs_arg = NULL;
1982         return(0);
1983     }
1984
1985
1986 try_again1:
1987     if(!(*display_charmap))
1988       *display_charmap = cpstr("US-ASCII");
1989
1990     if(!(*keyboard_charmap))
1991       *keyboard_charmap = cpstr(*display_charmap);
1992
1993     if(*keyboard_charmap){
1994         supported = input_charset_is_supported(*keyboard_charmap);
1995
1996         if(supported){
1997             if(!strucmp(*keyboard_charmap, "utf-8"))
1998               input_cs = utf8_charset(*keyboard_charmap);
1999             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2000               input_cs = cs;
2001         }
2002         else{
2003             if(err && !*err){
2004                 int iso2022jp = 0;
2005
2006                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2007                   iso2022jp = 1;
2008
2009                 snprintf(buf, sizeof(buf),
2010                      /* TRANSLATORS: The first argument is the name of the character
2011                         set the user is trying to use (which is unsupported by alpine).
2012                         The second argument is " (except for posting)" if they are
2013                         trying to use ISO-2022-JP for something other than posting. */
2014                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2015                      *keyboard_charmap,
2016                      iso2022jp ? _(" (except for posting)") : "");
2017
2018                 *err = cpstr(buf);
2019             }
2020
2021             input_cs = NULL;
2022             fs_give((void **) keyboard_charmap);
2023             *keyboard_charmap = cpstr("US-ASCII");
2024             if(!already_tried){
2025                 already_tried++;
2026                 goto try_again1;
2027             }
2028         }
2029     }
2030
2031
2032 try_again2:
2033     if(!(*display_charmap))
2034       *display_charmap = cpstr("US-ASCII");
2035
2036     if(*display_charmap){
2037         supported = output_charset_is_supported(*display_charmap);
2038         if(supported){
2039             if(!strucmp(*display_charmap, "utf-8"))
2040               init_utf8_display(1, NULL);
2041             else if((cs = utf8_charset(*display_charmap)) != NULL)
2042               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2043         }
2044         else{
2045             if(err && !*err){
2046                 int iso2022jp = 0;
2047
2048                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2049                   iso2022jp = 1;
2050
2051                 snprintf(buf, sizeof(buf),
2052                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2053                      *display_charmap,
2054                      iso2022jp ? _(" (except for posting)") : "");
2055
2056                 *err = cpstr(buf);
2057             }
2058
2059             fs_give((void **) display_charmap);
2060             if(!already_tried){
2061                 already_tried++;
2062                 goto try_again2;
2063             }
2064         }
2065     }
2066     else{
2067         if(err && !*err)
2068           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2069     }
2070
2071 #undef cpstr
2072
2073     *input_cs_arg = (void *) input_cs;
2074
2075     return(0);
2076 }
2077
2078
2079 int
2080 input_charset_is_supported(char *input_charset)
2081 {
2082     const CHARSET *cs;
2083
2084     if(!(input_charset && *input_charset))
2085       return 0;
2086
2087     if(!strucmp(input_charset, "utf-8"))
2088       return 1;
2089
2090     if((cs = utf8_charset(input_charset)) != NULL){
2091
2092         /*
2093          * This was true 2006-09-25.
2094          */
2095         switch(cs->type){
2096           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2097           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2098           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2099           case CT_UCS4: case CT_UTF16:
2100             return 1;
2101             break;
2102
2103           default:
2104             break;
2105         }
2106     }
2107
2108     return 0;
2109 }
2110
2111
2112 int
2113 output_charset_is_supported(char *output_charset)
2114 {
2115     const CHARSET *cs;
2116
2117     if(!(output_charset && *output_charset))
2118       return 0;
2119
2120     if(!strucmp(output_charset, "utf-8"))
2121       return 1;
2122
2123     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2124       return 1;
2125
2126     return 0;
2127 }
2128
2129
/*
 * Can we post in the named character set?
 *
 * ISO-2022-JP is accepted for posting even though it is not checked as
 * an output character set; everything else is whatever
 * output_charset_is_supported() accepts. Returns 1 or 0; a NULL or empty
 * name gives 0.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(posting_charset == NULL || *posting_charset == '\0')
      return 0;

    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return 1;

    return output_charset_is_supported(posting_charset) ? 1 : 0;
}
2137
2138
2139 /*
2140  * This function is only defined in this special case and so calls
2141  * to it should be wrapped in the same macro conditionals.
2142  *
2143  * Returns the default display charset for a UNIX terminal emulator,
2144  * it is what nl_langinfo(CODESET) should return but we need to
2145  * wrap nl_langinfo because we know of strange behaving implementations.
2146  */
2147 #if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    /*
     * Returns the name of a supported display charset, or NULL if the
     * value from nl_langinfo(CODESET) is unsupported and cannot be mapped
     * to something that output_charset_is_supported() accepts.
     *
     * The result is not malloced: it is either the pointer handed back by
     * nl_langinfo(), a string constant, or a static buffer in this
     * function which the next call may overwrite.
     */
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        else if(!strucmp(ret, "EUCKR"))
          ret = "EUC-KR";
        /* NOTE(review): presumably a typo for the EUCKR case above; kept as it was */
        else if(!strucmp(ret, "EUCKP"))
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else{
            char *p = strstr(ret, "8859");

            if(p){
                /* check for digits after 8859 */
                p += 4;

                /*
                 * Skip a single separator (as in "8859-1"), but never
                 * step over the terminating NUL of a name that simply
                 * ends in "8859".
                 */
                if(*p && !isdigit((unsigned char) *p))
                  p++;

                if(isdigit((unsigned char) *p)){
                    static char buf[12];

                    /* "ISO-8859-" plus one or two digits */
                    memset(buf, 0, sizeof(buf));
                    strncpy(buf, "ISO-8859-", sizeof(buf));
                    buf[9] = *p++;
                    if(isdigit((unsigned char) *p))
                      buf[10] = *p;

                    ret = buf;
                }
            }
        }
    }

    /* whatever we ended up with still has to be something we can display */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
2203 #endif
2204
2205
2206 /*
2207  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2208  * needed the return value will point to orig. If a conversion is done,
2209  * the return string should be freed by the caller.
2210  * If not possible, returns NULL.
2211  */
2212 char *
2213 utf8_to_charset(char *orig, char *charset, int report_err)
2214 {
2215     SIZEDTEXT src, dst;
2216     char *ret = orig;
2217
2218     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2219       return ret;
2220
2221     src.size = strlen(orig);
2222     src.data = (unsigned char *) orig;
2223
2224     if(!strucmp(charset, "us-ascii")){
2225         size_t i;
2226
2227         for(i = 0; i < src.size; i++)
2228           if(src.data[i] & 0x80)
2229             return NULL;
2230
2231         return ret;
2232     }
2233
2234     /*
2235      * This works for ISO-2022-JP because of special code in utf8_cstext
2236      * but not for other 2022 charsets.
2237      */
2238     memset(&dst, 0, sizeof(dst));
2239     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2240       ret = (char *) dst.data;          /* c-client already null terminates it */
2241     else
2242       ret = NULL;
2243
2244     if((unsigned char *) ret != dst.data && dst.data)
2245       fs_give((void **) &dst.data);
2246
2247     return ret;
2248 }
2249
2250
/*
 *      Turn a number into a string with comma's
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas
 * between each group of three digits, e.g. 1234567 gives "1,234,567".
 * A negative number gets a single leading '-'.
 *
 * Can use up to 3 comatose results at once; the fourth call reuses the
 * buffer handed out by the first.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          tmp[sizeof(buf[0])];
    char         *p = tmp + sizeof(tmp);
    unsigned long mag;
    int           ndigits = 0;

    whichbuf = (whichbuf + 1) % 3;

    /*
     * Work on the magnitude as an unsigned value so that the most
     * negative long does not overflow when its sign is dropped.
     */
    mag = (number < 0) ? 0UL - (unsigned long) number : (unsigned long) number;

    /*
     * Build the text backwards from the end of tmp, one digit at a time,
     * so the grouping is right however wide a long is. Even a 64-bit
     * value needs only 27 bytes including sign and terminator.
     */
    *--p = '\0';
    do{
        if(ndigits && ndigits % 3 == 0)
          *--p = ',';

        *--p = (char) ('0' + (int) (mag % 10));
        mag /= 10;
        ndigits++;
    }while(mag != 0);

    if(number < 0)
      *--p = '-';

    memcpy(buf[whichbuf], p, (size_t) (tmp + sizeof(tmp) - p));

    return(buf[whichbuf]);
}
2295
2296
/*
 * Like comatose() but leave out the commas.
 *
 * Result: pointer to a static string holding the decimal text of number.
 * Three buffers are used in rotation, so up to 3 results can be in use
 * at once.
 */
char *
tose(long int number)
{
    static char buf[3][50];
    static int whichbuf = 0;
    char *slot;

    whichbuf = (whichbuf + 1) % 3;
    slot = buf[whichbuf];

    snprintf(slot, sizeof(buf[0]), "%ld", number);

    return(slot);
}
2310
2311
2312 /*
2313  * line_paint - where the real work of managing what is displayed gets done.
2314  */
2315 void
2316 line_paint(int offset,                  /* current dot offset into vl */
2317            struct display_line *displ,
2318            int *passwd)                 /* flag to hide display of chars */
2319 {
2320     int i, w, w2, already_got_one = 0;
2321     int vfirst, vlast, dfirst, dlast, vi, di;
2322     int new_vbase;
2323     unsigned (*width_a_to_b)(UCS *, int, int);
2324
2325     /*
2326      * Set passwd to 10 in caller if you want to conceal the
2327      * password but not print asterisks for feedback.
2328      *
2329      * Set passwd to 1 in caller to conceal by printing asterisks.
2330      */
2331     if(passwd && *passwd >= 10){        /* don't show asterisks */
2332         if(*passwd > 10)
2333           return;
2334         else
2335           *passwd = 11;         /* only blat once */
2336
2337         i = 0;
2338         (*displ->movecursor)(displ->row, displ->col);
2339         while(i++ <= displ->dwid)
2340           (*displ->writechar)(' ');
2341
2342         (*displ->movecursor)(displ->row, displ->col);
2343         return;
2344     }
2345
2346     if(passwd && *passwd)
2347       width_a_to_b = single_width_chars_a_to_b;
2348     else
2349       width_a_to_b = ucs4_str_width_a_to_b;
2350
2351     /*
2352      * vl is the virtual line (the actual data). We operate on it by typing
2353      * characters to be added and deleting and so forth. In this routine we
2354      * copy a subset of those UCS-4 characters in vl into dl, the display
2355      * array, and show that subset on the screen.
2356      *
2357      * Offset is the location of the cursor in vl.
2358      *
2359      * We will display the string starting from vbase.
2360      * We have dwid screen cells to work in.
2361      * We may have to adjust vbase in order to display the
2362      * part of the string that contains the cursor.
2363      *
2364      * We'll make the display look like
2365      *   vl    a b c d e f g h i j k l m
2366      *             xxxxxxxxxxxxx  <- width dwid window
2367      *             < d e f g h >
2368      *               |
2369      *             vbase
2370      * The < will be there if vbase > 0.
2371      * The > will be there if the string from vbase to the
2372      * end can't all fit in the window.
2373      */
2374
2375     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2376
2377     /*
2378      * Adjust vbase so offset is not out of the window to the right.
2379      * (The +2 in w + 2 is for a possible " >" if the string goes past
2380      *  the right hand edge of the window and if the last visible character
2381      * is double wide. We don't want the offset to be under that > character.)
2382      */
2383     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2384         displ->dwid > 1 &&
2385         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2386         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2387         /*
2388          * offset is off the window to the right
2389          * It looks like   a b c d e f g h
2390          *                   |         |
2391          *               vbase         offset
2392          * and offset is either past the right edge,
2393          * or right at the right edge (and maybe under >),
2394          * or one before right at the edge (and maybe on space
2395          * for half a character).
2396          *
2397          * Since the characters may be double width it is slightly
2398          * complicated to figure out how far to increase vbase.
2399          * We're going to scoot over past width w/2 characters and
2400          * then see if that's sufficient.
2401          */
2402         new_vbase = displ->vbase + 1;
2403         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2404             w2 < displ->dwid/2;
2405             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2406           new_vbase++;
2407
2408         displ->vbase = new_vbase;
2409     }
2410
2411     /* adjust so offset is not out of the window to the left */
2412     while(displ->vbase > 0 && displ->vbase >= offset){
2413         /* add about dwid/2 more width */
2414         new_vbase = displ->vbase - 1;
2415         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2416             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2417             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2418           new_vbase--;
2419
2420         /* but don't let it get too small, recheck off right end */
2421         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2422             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2423             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2424           new_vbase++;
2425
2426         displ->vbase = MAX(new_vbase, 0);
2427     }
2428
2429     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2430       displ->vbase = 0;
2431
2432     vfirst = displ->vbase;
2433     dfirst = 0;
2434     if(displ->vbase > 0){                       /* off screen cue left */
2435         dfirst = 1;                             /* index which matches vfirst */
2436         displ->dl[0] = '<';
2437     }
2438
2439     vlast = displ->vused-1;                     /* end */
2440     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2441
2442     if(displ->dwid > 0 && w + dfirst > displ->dwid){                    /* off window right */
2443
2444         /* find last ucs character to be printed */
2445         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2446           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2447
2448         /* worry about double-width characters */
2449         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2450             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2451             displ->dl[dlast] = '>';
2452         }
2453         else{
2454             dlast = dfirst + vlast - vfirst + 1;
2455             displ->dl[dlast++] = ' ';
2456             displ->dl[dlast] = '>';
2457         }
2458     }
2459     else
2460       dlast = dfirst + vlast - vfirst;
2461
2462     /*
2463      * Copy the relevant part of the virtual line into the display line.
2464      */
2465     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2466       if(passwd && *passwd)
2467         displ->dl[di] = '*';            /* to conceal password */
2468       else
2469         displ->dl[di] = displ->vl[vi];
2470
2471     /*
2472      * Add spaces to clear the rest of the line.
2473      * We have dwid total space to fill.
2474      */
2475     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2476     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2477       displ->dl[di++] = ' ';
2478
2479     /*
2480      * Draw from left to right, skipping until we get to
2481      * something that is different. Characters may be different
2482      * widths than they were initially so paint from there the
2483      * rest of the way.
2484      */
2485     for(di = 0; displ->dl[di]; di++){
2486         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2487             /* move cursor first time */
2488             if(!already_got_one++){
2489                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2490                 (*displ->movecursor)(displ->row, displ->col + w);
2491             }
2492
2493             (*displ->writechar)(displ->dl[di]);
2494             displ->olddl[di] = displ->dl[di];
2495         }
2496     }
2497
2498     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2499
2500     /*
2501      * Move the cursor to the offset.
2502      *
2503      * The offset is relative to the start of the virtual array. We need
2504      * to find the location on the screen. The offset into the display array
2505      * will be offset-vbase+dfirst. We want to be at the start of that
2506      * character, so we need to find the width of all the characters up
2507      * to that point.
2508      */
2509     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2510
2511     (*displ->movecursor)(displ->row, displ->col + w);
2512 }
2513
2514
2515 /*
2516  * This is just like ucs4_str_width_a_to_b() except all of the characters
2517  * are assumed to be of width 1. This is for printing out *'s when user
2518  * enters a password, while still managing to use the same code to do the
2519  * display.
2520  */
2521 unsigned
2522 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2523 {
2524     unsigned width = 0;
2525     int i;
2526
2527     if(ucsstr)
2528       for(i = a; i <= b && ucsstr[i]; i++)
2529         width++;
2530
2531     return width;
2532 }