pith/charconv/utf8.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2006-2008 University of Washington
   8  * Copyright 2013-2015 Eduardo Chappa
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19
  20 /* includable WITHOUT dependency on c-client */
  21 #include "../../c-client/mail.h"
  22 #include "../../c-client/utf8.h"
  23
  24 #ifdef _WINDOWS
  25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  26 #undef ERROR
  27 #endif
  28
  29 #include <system.h>
  30
  31 #include "../../c-client/fs.h"
  32
  33 /* includable WITHOUT dependency on pico */
  34 #include "../../pico/keydefs.h"
  35
  36 #include "../osdep/collate.h"
  37 #include "../filttype.h"
  38
  39 #include "utf8.h"
  40
  41 #include <stdarg.h>
  42
  43
  44 unsigned single_width_chars_a_to_b(UCS *, int, int);
  45
  46
  47 static char locale_charmap[50];
  48
  49 static int   native_utf8;
  50 static void *display_data;
  51
  52 void
  53 init_utf8_display(int utf8, void *rmap)
  54 {
  55     native_utf8 = utf8;
  56     display_data = rmap;
  57 }
  58
  59
  60 /*
  61  * Argument is a UCS-4 wide character.
  62  * Returns the environment dependent cell width of the
  63  * character when printed to the screen.
  64  * This will be -1 if the character is not printable.
  65  * It will be >= zero if it is printable.
  66  *
  67  * Note that in the case it is not printable but it is still sent to
  68  * Writechar, Writechar will print a '?' with width 1.
  69  */
  70 int
  71 wcellwidth(UCS ucs)
  72 {
  73     char dummy[32];
  74     long w;
  75
  76     /*
  77      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  78      * That's the assumption here.
  79      */
  80
  81     if(native_utf8){                    /* display is UTF-8 capable */
  82         w = ucs4_width((unsigned long) ucs);
  83         return((w & U4W_ERROR) ? -1 : w);
  84     }
  85     else if(display_data){
  86         if(wtomb(dummy, ucs) < 0)
  87           return(-1);
  88         else{
  89             w = ucs4_width((unsigned long) ucs);
  90             return((w & U4W_ERROR) ? -1 : w);
  91         }
  92     }
  93 #ifndef _WINDOWS
  94     else
  95       return(wcwidth((wchar_t) ucs));
  96 #else
  97     return(0);
  98 #endif
  99 }
 100
 101
 102 /*
 103  * Argument is a UCS-4 wide character.
 104  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 105  * Dest is a buffer at least xx chars wide where the multi-byte version
 106  * of the wide character will be written.
 107  * The returned value is the number of bytes written to dest or -1
 108  * if the conversion can't be done.
 109  */
 110 int
 111 wtomb(char *dest, UCS ucs)
 112 {
 113     /*
 114      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 115      * That's the assumption here.
 116      */
 117
 118     if(native_utf8){
 119         unsigned char *newdptr;
 120
 121         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 122         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 123     }
 124     else if(display_data){
 125         unsigned long ucs4;
 126         int           ret;
 127
 128         ucs4 = (unsigned long) ucs;
 129         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 130         if(ret >= 0)
 131           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 132         else
 133           ret = -1;
 134
 135         return(ret);
 136     }
 137     else
 138       return(wcrtomb(dest, (wchar_t) ucs, NULL));
 139 }
 140
 141
 142 /*
 143  * This function does not necessarily update inputp and remaining_octets, so
 144  * don't rely on that. The c-client version does but the other doesn't.
 145  */
 146 UCS
 147 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 148 {
 149     UCS ucs;
 150
 151     if(input_cs){
 152         CHARSET *cast_input_cs;
 153
 154         cast_input_cs = (CHARSET *) input_cs;
 155
 156         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 157           case U8G_ENDSTRG:
 158           case U8G_ENDSTRI:
 159             return(CCONV_NEEDMORE);
 160
 161           default:
 162             if(ucs & U8G_ERROR || ucs == UBOGON)
 163               return(CCONV_BADCHAR);
 164
 165             return(ucs);
 166         }
 167     }
 168     else{
 169         size_t ret;
 170         wchar_t w;
 171
 172         /*
 173          * Warning:  input_cs and remaining_octets are unused in this
 174          * half of the if/else.
 175          *
 176          * Unfortunately, we can't tell the difference between a source string
 177          * that is just not long enough and one that has characters that can't
 178          * be converted even though it is long enough. We return NEEDMORE in both cases.
 179          */
 180         ret = mbstowcs(&w, (char *) (*inputp), 1);
 181         if(ret == (size_t)(-1))
 182           return(CCONV_NEEDMORE);
 183         else{
 184           ucs = (UCS) w;
 185           return(ucs);
 186         }
 187     }
 188 }
 189
 190
 191 void
 192 set_locale_charmap(char *charmap)
 193 {
 194     if(charmap){
 195         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 196         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 197     }
 198     else
 199       locale_charmap[0] = '\0';
 200 }
 201
 202
 203 /*
 204  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 205  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 206  * The caller is responsible for freeing the returned value.
 207  *
 208  * Args  str     -- the string to convert
 209  */
 210 char *
 211 convert_to_utf8(char *str, char *fromcharset, int flags)
 212 {
 213     char          *ret = NULL;
 214     char          *fcharset;
 215     SIZEDTEXT      src, result;
 216     const CHARSET *cs;
 217     int            try;
 218
 219     src.data = (unsigned char *) str;
 220     src.size = strlen(str);
 221
 222     /* already UTF-8, return NULL */
 223     if(!(flags & CU8_NOINFER)
 224        && (cs = utf8_infercharset(&src))
 225        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 226       return(ret);
 227
 228     try = 1;
 229     while(try < 5){
 230         switch(try){
 231           case 1:
 232             fcharset = fromcharset;
 233             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 234               break;    /* give it a try */
 235             else
 236               try++;    /* fall through */
 237
 238           case 2:
 239             if(!(flags & CU8_NOINFER)){
 240                 fcharset = cs ? cs->name : NULL;
 241                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 242                   break;
 243                 else
 244                   try++;        /* fall through */
 245             }
 246             else
 247               try++;    /* fall through */
 248
 249           case 3:
 250             fcharset = locale_charmap;
 251             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 252               break;
 253             else
 254               try++;    /* fall through */
 255
 256           default:
 257             fcharset = "ISO-8859-1";            /* this will "work" */
 258             break;
 259         }
 260
 261         memset(&result, 0, sizeof(result));
 262
 263         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 264             if(!(result.size == src.size && result.data == src.data)){
 265                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 266                 strncpy(ret, (char *) result.data, result.size);
 267                 ret[result.size] = '\0';
 268             }
 269             /* else no conversion necessary */
 270
 271             return(ret);
 272         }
 273
 274         try++;
 275     }
 276
 277     /* won't make it to here */
 278     return(ret);
 279 }
 280
 281
 282 /*
 283  * Convert from UTF-8 to user's locale charset.
 284  * This actually uses the wtomb routine to do the conversion, and that
 285  * relies on setup_for_input_output having been called.
 286  * If no conversion is necessary, NULL is returned, otherwise an allocated
 287  * string in the locale charset is returned and the caller is responsible
 288  * for freeing it.
 289  */
 290 char *
 291 convert_to_locale(char *utf8str)
 292 {
 293 #define CHNK 500
 294     char *inp, *retp, *ret = NULL;
 295     CBUF_S cb;
 296     int r, alloced;
 297
 298     if(native_utf8 || !utf8str || !utf8str[0])
 299       return(NULL);
 300
 301     cb.cbuf[0] = '\0';
 302     cb.cbufp = cb.cbufend = cb.cbuf;
 303     inp = utf8str;
 304
 305     alloced = CHNK;
 306     ret = (char *) fs_get(alloced * sizeof(char));
 307     retp = ret;
 308
 309     /*
 310      * There's gotta be a better way to do this but utf8_to_locale was
 311      * available and everything looks like a nail when all you have
 312      * is a hammer.
 313      */
 314     while(*inp){
 315         /*
 316          * We're placing the outgoing stream of characters in ret, a multi-byte
 317          * array of characters in the user's locale charset. See if there is
 318          * enough room for the next wide characters worth of output chars
 319          * and allocate more space if not.
 320          */
 321         if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
 322             alloced += CHNK;
 323             fs_resize((void **) &ret, alloced * sizeof(char));
 324         }
 325
 326         r = utf8_to_locale((int) *inp++, &cb,
 327                            (unsigned char *) retp, alloced-(retp-ret));
 328
 329         retp += r;
 330     }
 331
 332     *retp = '\0';
 333
 334     fs_resize((void **) &ret, strlen(ret)+1);
 335
 336     return(ret);
 337 }
 338
 339
 340 /*
 341  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 342  * filled in with multi-byte characters. The return value is the
 343  * number of valid characters in obuf to be used.
 344  */
 345 int
 346 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 347 {
 348     int outchars = 0;
 349
 350     if(!(cb && cb->cbufp))
 351       return(0);
 352
 353     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 354         unsigned char *inputp;
 355         unsigned long remaining_octets;
 356         UCS ucs;
 357
 358         *(cb->cbufp)++ = (unsigned char) c;
 359         inputp = cb->cbuf;
 360         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 361         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 362
 363         switch(ucs){
 364           case U8G_ENDSTRG:     /* incomplete character, wait */
 365           case U8G_ENDSTRI:     /* incomplete character, wait */
 366             break;
 367
 368           default:
 369             if(ucs & U8G_ERROR || ucs == UBOGON){
 370                 /*
 371                  * None of these cases is supposed to happen. If it
 372                  * does happen then the input stream isn't UTF-8
 373                  * so something is wrong. Treat each character in the
 374                  * input buffer as a separate error character and
 375                  * print a '?' for each.
 376                  */
 377                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 378                   obuf[outchars++] = '?';
 379
 380                 cb->cbufp = cb->cbuf;
 381             }
 382             else{
 383                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 384                     /*
 385                      * This happens when we have a UTF-8 character that
 386                      * we aren't able to print in our locale. For example,
 387                      * if the locale is setup with the terminal
 388                      * expecting ISO-8859-1 characters then there are
 389                      * lots of UTF-8 characters that can't be printed.
 390                      * Print a '?' instead.
 391                      */
 392                     obuf[outchars++] = '?';
 393                 }
 394                 else{
 395                     /*
 396                      * Convert the ucs into the multibyte
 397                      * character that corresponds to the
 398                      * ucs in the users locale.
 399                      */
 400                     outchars = wtomb((char *) obuf, ucs);
 401                     if(outchars < 0){
 402                         obuf[0] = '?';
 403                         outchars = 1;
 404                     }
 405                 }
 406
 407                 /* update the input buffer */
 408                 if(inputp >= cb->cbufp) /* this should be the case */
 409                   cb->cbufp = cb->cbuf;
 410                 else{           /* extra chars for some reason? */
 411                     unsigned char *q, *newcbufp;
 412
 413                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 414                     q = cb->cbuf;
 415                     while(inputp < cb->cbufp)
 416                       *q++ = *inputp++;
 417
 418                     cb->cbufp = newcbufp;
 419                 }
 420             }
 421
 422             break;
 423         }
 424     }
 425     else{                       /* error */
 426         obuf[0] = '?';
 427         outchars = 1;
 428         cb->cbufp = cb->cbuf;   /* start over */
 429     }
 430
 431     return(outchars);
 432 }
 433
 434
 435 /*
 436  * Returns the screen cells width of the UCS-4 string argument.
 437  * The source string is zero terminated.
 438  */
 439 unsigned
 440 ucs4_str_width(UCS *ucsstr)
 441 {
 442     unsigned width = 0;
 443     int w;
 444
 445     if(ucsstr)
 446       while(*ucsstr){
 447         w = wcellwidth(*ucsstr++);
 448         if(w != U4W_CTLSRGT)
 449           width += (w < 0 ? 1 : w);
 450       }
 451
 452     return width;
 453 }
 454
 455
 456 /*
 457  * Returns the screen cells width of the UCS-4 string argument
 458  * from ucsstr[a] through (inclusive) ucsstr[b].
 459  * No checking is done to make sure a starts in the middle
 460  * of a UCS-4 array.
 461  */
 462 unsigned
 463 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 464 {
 465     unsigned width = 0;
 466     int i, w;
 467
 468     if(ucsstr)
 469       for(i = a; i <= b && ucsstr[i]; i++){
 470         w = wcellwidth(ucsstr[i]);
 471         if(w != U4W_CTLSRGT)
 472           width += (w < 0 ? 1 : w);
 473       }
 474
 475     return width;
 476 }
 477
 478
 479 /*
 480  * Returns the screen cells width of the UCS-4 string argument
 481  * from ustart through (exclusive) uend.
 482  * No checking is done to make sure it starts in the middle
 483  * of a UCS-4 array.
 484  */
 485 unsigned
 486 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 487 {
 488     UCS *u;
 489     unsigned width = 0;
 490     int w;
 491
 492     if(!ustart)
 493       return width;
 494
 495     if(ustart)
 496       for(u = ustart; u < uend; u++){
 497         w = wcellwidth(*u);
 498         if(w != U4W_CTLSRGT)
 499           width += (w < 0 ? 1 : w);
 500       }
 501
 502     return(width);
 503 }
 504
 505
 506 /*
 507  * Return the largest possible pointer into ucs4str so that the width
 508  * of the string from ucs4str to the pointer (exclusive)
 509  * is maxwidth or less. Also stops at a null character.
 510  */
 511 UCS *
 512 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 513 {
 514     UCS *u;
 515     int w_consumed = 0, w, done = 0;
 516
 517     u = ucs4str;
 518
 519     if(u)
 520       while(!done && *u && w_consumed <= maxwidth){
 521         w = wcellwidth(*u);
 522         w = (w >= 0 ? w : 1);
 523         if(w_consumed + w <= maxwidth){
 524             w_consumed += w;
 525             ++u;
 526         }
 527         else
 528           ++done;
 529       }
 530
 531     return(u);
 532 }
 533
 534
 535 /*
 536  * Convert and copy a UTF-8 string into a UCS-4 NULL
 537  * terminated array. Just like cpystr only it converts
 538  * from UTF-8 to UCS-4.
 539  *
 540  * Returned UCS-4 string needs to be freed by caller.
 541  */
 542 UCS *
 543 utf8_to_ucs4_cpystr(char *utf8src)
 544 {
 545     size_t         retsize;
 546     UCS           *ret = NULL;
 547     UCS            ucs;
 548     unsigned long  remaining_octets;
 549     unsigned char *readptr;
 550     size_t         arrayindex;
 551
 552     /*
 553      * We don't know how big to allocate the return array
 554      * because variable numbers of octets in the src array
 555      * will combine to make UCS-4 characters. The number of
 556      * UCS-4 characters is less than or equal to the number
 557      * of src characters, though.
 558      */
 559
 560     if(!utf8src)
 561       return NULL;
 562
 563     retsize = strlen(utf8src) + 1;
 564
 565     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 566     memset(ret, 0, retsize * sizeof(*ret));
 567
 568     readptr = (unsigned char *) utf8src;
 569     remaining_octets = retsize-1;
 570     arrayindex = 0;
 571
 572     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 573         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 574
 575         if(ucs & U8G_ERROR || ucs == UBOGON)
 576           remaining_octets = 0;
 577         else
 578           ret[arrayindex++] = ucs;
 579     }
 580
 581     ret[arrayindex] = '\0';
 582
 583     /* get rid of excess size */
 584     if(arrayindex+1 < retsize)
 585       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 586
 587     return ret;
 588 }
 589
 590
 591 /*
 592  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 593  * terminated string. Just like cpystr only it converts
 594  * from UCS-4 to UTF-8.
 595  *
 596  * Returned UTF-8 string needs to be freed by caller.
 597  */
 598 char *
 599 ucs4_to_utf8_cpystr(UCS *ucs4src)
 600 {
 601     unsigned char *ret = NULL;
 602     unsigned char *writeptr;
 603     int            i;
 604
 605     if(!ucs4src)
 606       return NULL;
 607
 608     /*
 609      * Over-allocate and then resize at the end.
 610      */
 611
 612     /* count characters in source */
 613     for(i = 0; ucs4src[i]; i++)
 614       ;
 615
 616     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 617     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 618
 619     writeptr = ret;
 620     for(i = 0; ucs4src[i]; i++)
 621       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 622
 623     /* get rid of excess size */
 624     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 625
 626     return ((char *) ret);
 627 }
 628
 629
 630 /*
 631  * Similar to above but copy a fixed number of source
 632  * characters instead of going until null terminator.
 633  */
 634 char *
 635 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 636 {
 637     unsigned char *ret = NULL;
 638     unsigned char *writeptr;
 639     int            i;
 640
 641     if(!ucs4src)
 642       return NULL;
 643
 644     /*
 645      * Over-allocate and then resize at the end.
 646      */
 647
 648     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 649     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 650
 651     writeptr = ret;
 652     for(i = 0; i < ucs4src_len; i++)
 653       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 654
 655     /* get rid of excess size */
 656     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 657
 658     return ((char *) ret);
 659 }
 660
 661
 662 #ifdef _WINDOWS
 663 /*
 664  * Convert a UTF-8 argument into an LPTSTR version
 665  * of that argument. The result is allocated here
 666  * and should be freed by the caller.
 667  */
 668 LPTSTR
 669 utf8_to_lptstr(LPSTR arg_utf8)
 670 {
 671      int lptstr_len;
 672      LPTSTR lptstr_ret = NULL;
 673
 674      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 675      if(lptstr_len > 0)
 676      {
 677          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 678          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 679              arg_utf8, -1, lptstr_ret, lptstr_len );
 680      }
 681
 682      if(!lptstr_len)
 683      {
 684          /* check GetLastError()? */
 685          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 686          lptstr_ret[0] = 0;
 687      }
 688
 689      return lptstr_ret;
 690 }
 691
 692
 693 /*
 694  * Convert an LPTSTR argument into a UTF-8 version
 695  * of that argument. The result is allocated here
 696  * and should be freed by the caller.
 697  */
 698 LPSTR
 699 lptstr_to_utf8(LPTSTR arg_lptstr)
 700 {
 701      int utf8str_len;
 702      LPSTR utf8str_ret = NULL;
 703
 704      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 705      if(utf8str_len > 0)
 706      {
 707          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 708          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 709              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 710      }
 711
 712      if(!utf8str_len)
 713      {
 714          /* check GetLastError()? */
 715          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 716          utf8str_ret[0] = 0;
 717      }
 718
 719      return utf8str_ret;
 720 }
 721
 722
 723 /*
 724  * Convert a UCS4 argument into an LPTSTR version
 725  * of that argument. The result is allocated here
 726  * and should be freed by the caller.
 727  */
 728 LPTSTR
 729 ucs4_to_lptstr(UCS *arg_ucs4)
 730 {
 731     LPTSTR ret_lptstr = NULL;
 732     size_t len;
 733     size_t i;
 734
 735     if(arg_ucs4){
 736         len = ucs4_strlen(arg_ucs4);
 737         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 738         /* bogus conversion ignores UTF-16 */
 739         for(i = 0; i < len; i++)
 740           ret_lptstr[i] = arg_ucs4[i];
 741
 742         ret_lptstr[len] = '\0';
 743     }
 744
 745     return(ret_lptstr);
 746 }
 747
 748
 749 /*
 750  * Convert an LPTSTR argument into a UCS4 version
 751  * of that argument. The result is MemAlloc'd here
 752  * and should be freed by the caller.
 753  */
 754 UCS *
 755 lptstr_to_ucs4(LPTSTR arg_lptstr)
 756 {
 757     UCS *ret_ucs4 = NULL;
 758     size_t len;
 759     size_t i;
 760
 761     if(arg_lptstr){
 762         len = _tcslen(arg_lptstr);
 763         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 764         /* bogus conversion ignores UTF-16 */
 765         for(i = 0; i < len; i++)
 766           ret_ucs4[i] = arg_lptstr[i];
 767
 768         ret_ucs4[len] = '\0';
 769     }
 770
 771     return(ret_ucs4);
 772 }
 773
 774 #endif /* _WINDOWS */
 775
 776
 777 /*
 778  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 779  * 1-at-a-time filled in with UCS characters. The return value is the
 780  * number of valid characters in obuf to be used. It can only
 781  * be 1 or 0 characters since we're only getting one UTF-8 character
 782  * at a time.
 783  */
 784 int
 785 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 786 {
 787     int  width = 0, outchars = 0;
 788
 789     if(!(cb && cb->cbufp))
 790       return(0);
 791
 792     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 793         unsigned char *inputp;
 794         unsigned long remaining_octets;
 795         UCS ucs;
 796
 797         *cb->cbufp++ = (unsigned char) c;
 798         inputp = cb->cbuf;
 799         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 800         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 801
 802         switch(ucs){
 803           case U8G_ENDSTRG:     /* incomplete character, wait */
 804           case U8G_ENDSTRI:     /* incomplete character, wait */
 805             break;
 806
 807           default:
 808             if(ucs & U8G_ERROR || ucs == UBOGON){
 809                 /*
 810                  * None of these cases is supposed to happen. If it
 811                  * does happen then the input stream isn't UTF-8
 812                  * so something is wrong.
 813                  */
 814                 outchars++;
 815                 *obuf = '?';
 816                 cb->cbufp = cb->cbuf;
 817                 width = 1;
 818             }
 819             else{
 820                 outchars++;
 821                 if(ucs < 0x80 && ucs >= 0x20)
 822                   width = 1;
 823
 824                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 825                     /*
 826                      * This happens when we have a UTF-8 character that
 827                      * we aren't able to print in our locale. For example,
 828                      * if the locale is setup with the terminal
 829                      * expecting ISO-8859-1 characters then there are
 830                      * lots of UTF-8 characters that can't be printed.
 831                      * Print a '?' instead.
 832                      * Don't think this should happen in Windows.
 833                      */
 834                     *obuf = '?';
 835                 }
 836                 else{
 837                     *obuf = ucs;
 838                 }
 839
 840                 /* update the input buffer */
 841                 if(inputp >= cb->cbufp) /* this should be the case */
 842                   cb->cbufp = cb->cbuf;
 843                 else{           /* extra chars for some reason? */
 844                     unsigned char *q, *newcbufp;
 845
 846                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 847                     q = cb->cbuf;
 848                     while(inputp < cb->cbufp)
 849                       *q++ = *inputp++;
 850
 851                     cb->cbufp = newcbufp;
 852                 }
 853             }
 854
 855             break;
 856         }
 857     }
 858     else{                       /* error */
 859         *obuf = '?';
 860         outchars = 1;
 861         width = 1;
 862         cb->cbufp = cb->cbuf;   /* start over */
 863     }
 864
 865     if(obufwidth)
 866       *obufwidth = width;
 867
 868     return(outchars);
 869 }
 870
 871
 872 /*
 873  * Return an allocated copy of a zero-terminated UCS-4 string.
 874  */
 875 UCS *
 876 ucs4_cpystr(UCS *ucs4src)
 877 {
 878     size_t         arraysize;
 879     UCS           *ret = NULL;
 880     size_t         i;
 881
 882     if(!ucs4src)
 883       return NULL;
 884
 885     arraysize = ucs4_strlen(ucs4src);
 886
 887     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 888     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 889
 890     for(i = 0; i < arraysize; i++)
 891       ret[i] = ucs4src[i];
 892
 893     return ret;
 894 }
 895
 896
 897 UCS *
 898 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 899 {
 900     size_t i;
 901
 902     if(ucs4src && ucs4dst){
 903         for(i = 0; i < n; i++){
 904             ucs4dst[i] = ucs4src[i];
 905             if(ucs4dst[i] == '\0')
 906               break;
 907         }
 908     }
 909
 910     return ucs4dst;
 911 }
 912
 913
 914 UCS *
 915 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 916 {
 917     size_t i;
 918     UCS *u;
 919
 920     if(ucs4src && ucs4dst){
 921         for(u = ucs4dst; *u; u++)
 922           ;
 923
 924         for(i = 0; i < n; i++){
 925             u[i] = ucs4src[i];
 926             if(u[i] == '\0')
 927               break;
 928         }
 929
 930         if(i == n)
 931           u[i] = '\0';
 932     }
 933
 934     return ucs4dst;
 935 }
 936
 937
 938 /*
 939  * Like strlen only this returns the number of non-zero characters
 940  * in a zero-terminated UCS-4 array.
 941  */
 942 size_t
 943 ucs4_strlen(UCS *ucs4str)
 944 {
 945     size_t i = 0;
 946
 947     if(ucs4str)
 948       while(ucs4str[i])
 949         i++;
 950
 951     return(i);
 952 }
 953
 954
 955 int
 956 ucs4_strcmp(UCS *s1, UCS *s2)
 957 {
 958     for(; *s1 == *s2; s1++, s2++)
 959       if(*s1 == '\0')
 960         return 0;
 961
 962     return((*s1 < *s2) ? -1 : 1);
 963 }
 964
 965
 966 UCS *
 967 ucs4_strchr(UCS *s, UCS c)
 968 {
 969     if(!s)
 970       return NULL;
 971
 972     while(*s && *s != c)
 973       s++;
 974
 975     if(*s || !c)
 976       return s;
 977     else
 978       return NULL;
 979 }
 980
 981
 982 UCS *
 983 ucs4_strrchr(UCS *s, UCS c)
 984 {
 985     UCS *ret = NULL;
 986
 987     if(!s)
 988       return ret;
 989
 990     while(*s){
 991         if(*s == c)
 992           ret = s;
 993
 994         s++;
 995     }
 996
 997     return ret;
 998 }
 999
1000
1001 /*
1002  * Returns the screen cells width of the UTF-8 string argument.
1003  */
1004 unsigned
1005 utf8_width(char *str)
1006 {
1007     unsigned width = 0;
1008     int this_width;
1009     UCS ucs;
1010     unsigned long remaining_octets;
1011     char *readptr;
1012
1013     if(!(str && *str))
1014       return(width);
1015
1016     readptr = str;
1017     remaining_octets = readptr ? strlen(readptr) : 0;
1018
1019     while(remaining_octets > 0 && *readptr){
1020
1021         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1022
1023         if(ucs & U8G_ERROR || ucs == UBOGON){
1024             /*
1025              * This should not happen, but do something to handle it anyway.
1026              * Treat each character as a single width character, which is what should
1027              * probably happen when we actually go to write it out.
1028              */
1029             remaining_octets--;
1030             readptr++;
1031             this_width = 1;
1032         }
1033         else{
1034             this_width = wcellwidth(ucs);
1035
1036             /*
1037              * If this_width is -1 that means we can't print this character
1038              * with our current locale. Writechar will print a '?'.
1039              */
1040             if(this_width < 0)
1041               this_width = 1;
1042         }
1043
1044         width += (unsigned) this_width;
1045     }
1046
1047     return(width);
1048 }
1049
1050
1051 /*
1052  * Copy UTF-8 characters from src into dst.
1053  * This is intended to be used if you want to truncate a string at
1054  * the start instead of the end. For example, you have a long string
1055  * like
1056  *       this_is_a_long_string
1057  * but not enough space to fit it into a particular field. You want to
1058  * end up with
1059  *             s_a_long_string
1060  * where that fits in a particular width. Perhaps you'd use this with ...
1061  * to get
1062  *          ...s_a_long_string
1063  * This right adjusts the end of the string in the width space and
1064  * cuts it off at the start. If there is enough width for the whole
1065  * string it will copy the string into dst with no padding.
1066  *
1067  * Copy enough characters so that the result will have screen width of
1068  * want_width screen cells in current locale.
1069  *
1070  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1071  *   to dst. This is just for protection, it shouldn't be relied on to
1072  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1073  *   characters truncated in the middle or something like that.
1074  *
1075  * Returned value is the number of bytes written to dst, not including
1076  *   the possible terminating null.
1077  *
1078  * If we can't hit want_width exactly because of double width characters
1079  *   then we will pad the end of the string with space in order to make
1080  *   the width exact.
1081  */
1082 size_t
1083 utf8_to_width_rhs(char *dst,            /* destination buffer */
1084                   char *src,            /* source string */
1085                   size_t dstlen,        /* space in dest */
1086                   unsigned want_width)  /* desired screen width */
1087 {
1088     int this_width;
1089     unsigned width_consumed = 0;
1090     UCS ucs;
1091     unsigned long remaining_octets;
1092     char *readptr, *goodreadptr, *savereadptr, *endptr;
1093     size_t nb = 0;
1094
1095     if(!src){
1096         if(dstlen > 0)
1097           dst[0] = '\0';
1098
1099         return nb;
1100     }
1101
1102     /*
1103      * Start at the end of the source string and go backwards until we
1104      * get to the desired width, but not more than the width.
1105      */
1106     readptr = src + strlen(src);
1107     endptr = readptr;
1108     goodreadptr = readptr;
1109     width_consumed = 0;
1110     savereadptr = readptr;
1111
1112     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1113         readptr = savereadptr-1){
1114
1115         savereadptr = readptr;
1116         remaining_octets = goodreadptr - readptr;
1117         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1118
1119         /*
1120          * Handling the error case is tough because an error will be the normal thing that
1121          * happens as we back through the string. So we're just going to punt on the
1122          * error for now.
1123          */
1124         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1125             if(remaining_octets > 0){
1126                 /*
1127                  * This means there are some bad octets after this good
1128                  * character so things are not going to work out well.
1129                  * Bail out.
1130                  */
1131                 savereadptr = src;      /* we're done */
1132             }
1133             else{
1134                 this_width = wcellwidth(ucs);
1135
1136                 if(this_width < 0)
1137                   this_width = 1;
1138
1139                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1140                     width_consumed += (unsigned) this_width;
1141                     goodreadptr = savereadptr;
1142                 }
1143                 else
1144                   savereadptr = src;    /* we're done */
1145             }
1146         }
1147     }
1148
1149     /*
1150      * Copy characters from goodreadptr to endptr into dst.
1151      */
1152     nb = MIN(endptr-goodreadptr, dstlen-1);
1153     strncpy(dst, goodreadptr, nb);
1154     dst[nb] = '\0';
1155
1156     /*
1157      * Pad out with spaces in order to hit width exactly.
1158      */
1159     while(width_consumed < want_width && nb < dstlen-1){
1160         dst[nb++] = ' ';
1161         dst[nb] = '\0';
1162         width_consumed++;
1163     }
1164
1165     return nb;
1166 }
1167
1168
/*
 * A restricted snprintf which adds the conversion %w.
 *
 * The arguments being converted by %w are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * Padding is added where half of a double-width character would
 * have been needed to reach the minimum field width.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Besides %w the conversions d i o x X u c s f e E g G p and %% are
 * handled. The length modifiers h and l are honored for the integer
 * conversions (d i o x X u) and L for the floating point conversions;
 * in any other combination the modifier is ignored. Any other
 * conversion character trips an assert.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation. Note also that the terminating null is only written
 * if there is room for it: when literal characters fill all size
 * bytes of dest the result is not null terminated.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, it never
 * exceeds size; it is not the number of characters that would
 * have been written had there been enough room.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char    newfmt[100], flagbuf[6], widthbuf[20], precbuf[24], modbuf[2];
    char   *q, *pdest, *end, *input_str;
    char    conv;
    unsigned got_width;
    int     more_flags, ret, w, nbytes;
    int     min_field_width, field_precision, modifier;
    int     flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    va_list args;

    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)                   \
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: It would be nice to replace the %w's in the format
     * string with %s's with adjusted width and precision and then call
     * the system vsnprintf once. That doesn't work because a %*w would
     * need the integer argument the * refers to modified, and there is
     * no way to do that to a va_list.
     *
     * So instead the result is constructed one piece at a time. Each
     * conversion specifier is parsed here, its '*' arguments are
     * consumed here, and a private format (newfmt) with literal width
     * and precision is handed to snprintf together with the one argument
     * the conversion needs.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){

        /* ordinary character, the loop test guarantees room for it */
        if(*fmt != '%'){
            *pdest++ = *fmt++;
            continue;
        }

        fmt++;                          /* step over the '%' */

        min_field_width = field_precision = -1;
        modifier = 0;
        flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;

        /* flags */
        more_flags = 1;
        while(more_flags){
            switch(*fmt){
              case '-':
                flags_minus = 1;
                fmt++;
                break;

              case '+':
                flags_plus = 1;
                fmt++;
                break;

              case ' ':
                flags_space = 1;
                fmt++;
                break;

              case '0':
                flags_zero = 1;
                fmt++;
                break;

              case '#':
                flags_pound = 1;
                fmt++;
                break;

              default:
                more_flags = 0;
                break;
            }
        }

        /* minimum field width, a negative '*' value counts as not given */
        if(*fmt == '*'){
            min_field_width = va_arg(args, int);
            fmt++;
        }
        else if(*fmt >= '0' && *fmt <= '9'){
            min_field_width = atoi(fmt);        /* stops at first non-digit */
            while(*fmt >= '0' && *fmt <= '9')
              fmt++;
        }

        /* field precision, a '.' without a number leaves it unset */
        if(*fmt == '.'){
            fmt++;
            if(*fmt == '*'){
                field_precision = va_arg(args, int);
                fmt++;
            }
            else if(*fmt >= '0' && *fmt <= '9'){
                field_precision = atoi(fmt);
                while(*fmt >= '0' && *fmt <= '9')
                  fmt++;
            }
        }

        /* length modifier */
        if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
          modifier = *fmt++;

        /* conversion character */
        conv = *fmt;

        /* format ended in the middle of a specifier, emit nothing for it */
        if(conv == '\0')
          break;

        input_str = NULL;
        w = 0;

        if(conv == 'w'){
            /*
             * Work with the string argument to figure out the byte width
             * and byte precision needed to produce the screen width
             * and precision asked for in %w, using some of the
             * utf8 width routines we have.
             */
            input_str = va_arg(args, char *);
            if(field_precision >= 0 || min_field_width >= 0)
              w = utf8_width(input_str);

            if(field_precision >= 0){
                if(w <= field_precision)
                  field_precision = -1;  /* print it all */
                else{
                    /*
                     * We need to cut off some of the input_str
                     * in this case. The precision becomes the number
                     * of bytes that fit in the requested screen width.
                     */
                    end = utf8_count_forw_width(input_str, field_precision, &got_width);
                    field_precision = (int) (end - input_str);
                    /* new w with this field_precision */
                    w = got_width;
                }
            }

            /*
             * Need some padding. The byte field width is the number of
             * bytes that will be printed plus the number of screen cells
             * we are short of the requested width.
             */
            if(min_field_width >= 0){
                nbytes = (field_precision >= 0) ? field_precision
                                                : (int) strlen(input_str);
                min_field_width = nbytes + MAX(0, min_field_width - w);
            }
        }

        /*
         * Now build the format for this one conversion in newfmt, with
         * the dynamic '*' arguments replaced by their values. The longest
         * possible result ('%', five flags, two ints, a '.', a modifier
         * and the conversion) is well below sizeof(newfmt).
         */
        q = flagbuf;
        if(flags_minus)
          *q++ = '-';
        if(flags_plus)
          *q++ = '+';
        if(flags_space)
          *q++ = ' ';
        if(flags_zero)
          *q++ = '0';
        if(flags_pound)
          *q++ = '#';

        *q = '\0';

        widthbuf[0] = precbuf[0] = modbuf[0] = modbuf[1] = '\0';

        if(min_field_width >= 0)
          snprintf(widthbuf, sizeof(widthbuf), "%d", min_field_width);

        if(field_precision >= 0)
          snprintf(precbuf, sizeof(precbuf), ".%d", field_precision);

        /*
         * Keep the length modifier only where we also fetch an argument
         * of the matching type below.
         */
        switch(conv){
          case 'd': case 'i': case 'o':
          case 'x': case 'X': case 'u':
            if(modifier == 'h' || modifier == 'l')
              modbuf[0] = (char) modifier;

            break;

          case 'f': case 'e': case 'E':
          case 'g': case 'G':
            if(modifier == 'L')
              modbuf[0] = 'L';

            break;

          default:
            break;
        }

        snprintf(newfmt, sizeof(newfmt), "%%%s%s%s%s%c",
                 flagbuf, widthbuf, precbuf, modbuf,
                 (conv == 'w') ? 's' : conv);

        /*
         * Format the single argument at the current end of dest.
         * snprintf always null terminates within the space it is given,
         * so strlen finds the new end.
         */
        switch(conv){
          case 'w':
            snprintf(pdest, size - (pdest-dest), newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case 's':
            input_str = va_arg(args, char *);
            snprintf(pdest, size - (pdest-dest), newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case 'd': case 'i':
            if(modifier == 'l')
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long));
            else        /* short is promoted to int when passed */
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));

            pdest += strlen(pdest);
            break;

          case 'o': case 'x': case 'X': case 'u':
            if(modifier == 'l')
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, unsigned long));
            else
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, unsigned int));

            pdest += strlen(pdest);
            break;

          case 'c':
            snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));
            pdest += strlen(pdest);
            break;

          case 'f': case 'e': case 'E':
          case 'g': case 'G':
            if(modifier == 'L')
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long double));
            else
              snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, double));

            pdest += strlen(pdest);
            break;

          case 'p':
            snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, void *));
            pdest += strlen(pdest);
            break;

          case '%':
            if(IS_ROOM_IN_DEST(1))
              *pdest++ =  '%';

            break;

          default:
            /* didn't think of this type */
            assert(0);
            break;
        }

        fmt++;                          /* step over the conversion character */
    }

    ret = (int) (pdest - dest);

    /* terminate only if that doesn't overflow dest */
    if(IS_ROOM_IN_DEST(1))
      *pdest = '\0';

    va_end(args);

    return ret;
}
1515
1516
1517 /*
1518  * Copy UTF-8 characters from src into dst.
1519  * Copy enough characters so that the result will have (<=) screen width of
1520  * want_width screen cells in current locale.
1521  *
1522  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1523  *   to dst.
1524  *
1525  * Returned value is the number of bytes written to dst, not including
1526  *   the possible terminating null.
1527  * Got_width is another returned value. It is the width in screen cells of
1528  *   the string placed in dst. It will be the same as want_width if there
1529  *   are enough characters in the src to do that and if the character widths
1530  *   hit the width exactly. It will be less than want_width if we run out
1531  *   of src characters or if the next character width would skip over the
1532  *   width we want, because it is double width.
1533  *
1534  * Zero width characters are collected and included at the end of the string.
1535  *   That is, if we make it to want_width but there is still a zero length
1536  *   character sitting in src, we add that to dst. This might be an accent
1537  *   or something like that.
1538  */
1539 size_t
1540 utf8_to_width(char *dst,                /* destination buffer */
1541               char *src,                /* source string */
1542               size_t dstlen,            /* space in dst */
1543               unsigned want_width,      /* desired screen width */
1544               unsigned *got_width)      /* returned screen width in dst */
1545 {
1546     int this_width;
1547     unsigned width_consumed = 0;
1548     UCS ucs;
1549     unsigned long remaining_octets;
1550     char *writeptr, *readptr, *savereadptr, *endptr;
1551     int ran_out_of_space = 0;
1552
1553     readptr = src;
1554
1555     remaining_octets = readptr ? strlen(readptr) : 0;
1556
1557     writeptr = dst;
1558     endptr = writeptr + dstlen;
1559
1560     if(readptr && writeptr){
1561       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1562         savereadptr = readptr;
1563         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1564
1565         if(ucs & U8G_ERROR || ucs == UBOGON)
1566           remaining_octets = 0;
1567         else{
1568           this_width = wcellwidth(ucs);
1569
1570           /*
1571            * If this_width is -1 that means we can't print this character
1572            * with our current locale. Writechar will print a '?'.
1573            */
1574           if(this_width < 0)
1575             this_width = 1;
1576
1577           if(width_consumed + (unsigned) this_width <= want_width){
1578             /* append this utf8 character to dst if it will fit */
1579             if(writeptr + (readptr - savereadptr) < endptr){
1580               width_consumed += this_width;
1581               while(savereadptr < readptr)
1582                 *writeptr++ = *savereadptr++;
1583             }
1584             else
1585               ran_out_of_space++;       /* no more utf8 to dst */
1586           }
1587           else
1588             remaining_octets = 0;       /* we're done */
1589         }
1590       }
1591
1592       if(writeptr < endptr)
1593         *writeptr = '\0';
1594     }
1595
1596     if(got_width)
1597       *got_width = width_consumed;
1598
1599     return(writeptr ? (writeptr - dst) : 0);
1600 }
1601
1602
1603 /*
1604  * Str is a UTF-8 string.
1605  * Count forward width screencell positions and return a pointer to the
1606  * end of the string that is width wide.
1607  * The returned pointer points at the next character (where the null would
1608  * be placed).
1609  *
1610  * Got_width is another returned value. It is the width in screen cells of
1611  *   the string from str to the returned pointer. It will be the same as
1612  *   want_width if there are enough characters in the str to do that
1613  *   and if the character widths hit the width exactly. It will be less
1614  *   than want_width if we run out of characters or if the next character
1615  *   width would skip over the width we want, because it is double width.
1616  */
1617 char *
1618 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1619 {
1620     int this_width;
1621     unsigned width_consumed = 0;
1622     UCS ucs;
1623     unsigned long remaining_octets;
1624     char *readptr;
1625     char *retptr;
1626
1627     retptr = readptr = str;
1628
1629     remaining_octets = readptr ? strlen(readptr) : 0;
1630
1631     while(width_consumed <= want_width && remaining_octets > 0){
1632
1633         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1634
1635         if(ucs & U8G_ERROR || ucs == UBOGON){
1636             /*
1637              * This should not happen, but do something to handle it anyway.
1638              * Treat each character as a single width character, which is what should
1639              * probably happen when we actually go to write it out.
1640              */
1641             remaining_octets--;
1642             readptr++;
1643             this_width = 1;
1644         }
1645         else{
1646             this_width = wcellwidth(ucs);
1647
1648             /*
1649              * If this_width is -1 that means we can't print this character
1650              * with our current locale. Writechar will print a '?'.
1651              */
1652             if(this_width < 0)
1653               this_width = 1;
1654         }
1655
1656         if(width_consumed + (unsigned) this_width <= want_width){
1657             width_consumed += (unsigned) this_width;
1658             retptr = readptr;
1659         }
1660         else
1661           remaining_octets = 0; /* we're done */
1662     }
1663
1664     if(got_width)
1665       *got_width = width_consumed;
1666
1667     return(retptr);
1668 }
1669
1670
1671 /*
1672  * Copy a null terminator into a UTF-8 string in place so that the string is
1673  * no more than a certain screen width wide. If the string is already less
1674  * than or equal in width to the requested width, no change is made.
1675  *
1676  * The actual width accomplished is returned. Note that it may be less than
1677  * max_width due to double width characters as well as due to the fact that
1678  * it fits wholly in the max_width.
1679  *
1680  * Returned value is the actual screen width of str when done.
1681  *
1682  * A side effect is that a terminating null may have been written into
1683  * the passed in string.
1684  */
1685 unsigned
1686 utf8_truncate(char *str, unsigned max_width)
1687 {
1688     int this_width;
1689     unsigned width_consumed = 0;
1690     UCS ucs;
1691     unsigned long remaining_octets;
1692     char *readptr, *savereadptr;
1693
1694     readptr = str;
1695
1696     remaining_octets = readptr ? strlen(readptr) : 0;
1697
1698     if(readptr){
1699       while(width_consumed <= max_width && remaining_octets > 0){
1700
1701         savereadptr = readptr;
1702         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1703
1704         if(ucs & U8G_ERROR || ucs == UBOGON){
1705             /*
1706              * This should not happen, but do something to handle it anyway.
1707              * Treat each character as a single width character, which is what should
1708              * probably happen when we actually go to write it out.
1709              */
1710             remaining_octets--;
1711             readptr++;
1712             this_width = 1;
1713         }
1714         else{
1715             this_width = wcellwidth(ucs);
1716
1717             /*
1718              * If this_width is -1 that means we can't print this character
1719              * with our current locale. Writechar will print a '?'.
1720              */
1721             if(this_width < 0)
1722               this_width = 1;
1723         }
1724
1725         if(width_consumed + (unsigned) this_width <= max_width){
1726             width_consumed += (unsigned) this_width;
1727         }
1728         else{
1729             remaining_octets = 0;       /* we're done */
1730             *savereadptr = '\0';
1731         }
1732       }
1733     }
1734
1735     return(width_consumed);
1736 }
1737
1738
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad with
 * spaces. The padding goes on the right (text is left adjusted) if
 * left_adjust is set and on the left otherwise.
 *
 * Args:  dst         -- destination buffer, may be NULL (nothing is written)
 *        src         -- source string
 *        dstlen      -- space available in dst
 *        want_width  -- desired screen width
 *        left_adjust -- adjust left or right in want_width columns
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. If there isn't room for all of the padding, as much as fits is
 *   added. Dst will be null terminated if there is enough room, but not
 *   if that would overflow dst's len.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,            /* destination buffer */
                  char *src,            /* source string */
                  size_t dstlen,        /* space in dst */
                  unsigned want_width,  /* desired screen width */
                  int left_adjust)      /* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, len_left, need_more, howmany;

    /* nowhere to put either the text or the padding */
    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* one space is needed for each screen cell we are short */
    need_more = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    howmany = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string. Slide the existing
             * string over (the regions overlap, so memmove) and
             * fill the gap that opens up in front of it with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1806
1807
1808 /*
1809  * Str is a UTF-8 string.
1810  * Start_here is a pointer into the string. It points one position past
1811  * the last byte that should be considered a part of the length string.
1812  * Count back want_width screencell positions and return a pointer to the
1813  * start of the string that is want_width wide and ends with start_here.
1814  *
1815  * Since characters may be more than one cell width wide we may end up
1816  * skipping over the exact width. That is, if we need to we'll go back
1817  * too far (by one cell width). Account for that in the call by looking
1818  * at got_width.
1819  *
1820  * Note that this call gives a possible got_width == want_width+1 as
1821  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1822  * That was just what was needed at the time, maybe it needs to be
1823  * optional.
1824  */
1825 char *
1826 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1827 {
1828     unsigned width_consumed = 0;
1829     int this_width;
1830     UCS ucs;
1831     unsigned long remaining_octets;
1832     char *ptr, *savereadptr, *goodreadptr;
1833
1834     savereadptr = start_here;
1835     goodreadptr = start_here;
1836
1837     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1838
1839         savereadptr = ptr;
1840         remaining_octets = goodreadptr - ptr;
1841         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1842
1843         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1844           if(remaining_octets > 0){
1845               /*
1846                * This means there are some bad octets after this good
1847                * character so things are not going to work out well.
1848                * Bail out.
1849                */
1850               savereadptr = str;        /* we're done */
1851           }
1852           else{
1853             this_width = wcellwidth(ucs);
1854
1855             /*
1856              * If this_width is -1 that means we can't print this character
1857              * with our current locale. Writechar will print a '?'.
1858              */
1859             if(this_width < 0)
1860               this_width = 1;
1861
1862             width_consumed += (unsigned) this_width;
1863             goodreadptr = savereadptr;
1864           }
1865         }
1866     }
1867
1868     if(got_width)
1869       *got_width = width_consumed;
1870
1871     return(savereadptr);
1872 }
1873
1874
/*----------------------------------------------------------------------
  Copy the source string s onto the destination *d, writing at most n
  bytes, and leave *d pointing at the end of the copied text.

  If the terminating null of s is reached within n bytes it is copied
  too and *d is left pointing at it, so that a following call appends.
  If n bytes are used up first the destination is NOT null terminated
  and *d points just past the last byte written. Nothing is written
  when n is zero or negative.

  motivation for this is to avoid twice passing over a string that's
  being appended to twice (i.e., strcpy(t, x); t += strlen(t))

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char *out = *d;

    for(; n > 0; n--){
        /* the null is stored but not stepped over */
        if((*out = *s++) == '\0')
          break;

        out++;
    }

    *d = out;
}
1890
1891
1892 /*
1893  * If use_system_routines is set then NULL is the return value and it is
1894  * not an error. Display_charmap and keyboard_charmap should come over as
1895  * malloced strings and will be filled in with the result.
1896  *
1897  * Returns a void pointer to the input_cs CHARSET which is
1898  * passed to mbtow via kbseq().
1899  * If !use_system_routines && NULL is returned, that is an error and err should
1900  * have a message.
1901  * display_charmap and keyboard_charmap should be malloced data and may be
1902  * realloced and changed here.
1903  */
1904 int
1905 setup_for_input_output(int use_system_routines, char **display_charmap,
1906                        char **keyboard_charmap, void **input_cs_arg, char **err)
1907 {
1908     const CHARSET *cs;
1909     const CHARSET *input_cs = NULL;
1910     int already_tried = 0;
1911     int supported = 0;
1912     char buf[1000];
1913
1914 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1915
1916     if(err)
1917       *err = NULL;
1918
1919     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1920         *err = cpstr("Bad call to setup_for_input_output");
1921         return(-1);
1922     }
1923
1924     if(use_system_routines){
1925 #if     PREREQ_FOR_SYS_TRANSLATION
1926         char *dcm;
1927
1928         dcm = nl_langinfo_codeset_wrapper();
1929         dcm = dcm ? dcm : "US-ASCII";
1930
1931         init_utf8_display(0, NULL);
1932         if(*display_charmap){
1933             if(dcm && strucmp(*display_charmap, dcm)){
1934                 snprintf(buf, sizeof(buf),
1935                  _("Display character set \"%s\" is ignored when using system translation"),
1936                      *display_charmap);
1937
1938                 *err = cpstr(buf);
1939             }
1940
1941             fs_give((void **) display_charmap);
1942         }
1943
1944         if(*keyboard_charmap){
1945             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1946                 snprintf(buf, sizeof(buf),
1947                  _("Keyboard character set \"%s\" is ignored when using system translation"),
1948                      *keyboard_charmap);
1949
1950                 *err = cpstr(buf);
1951             }
1952
1953             fs_give((void **) keyboard_charmap);
1954         }
1955
1956         *display_charmap = cpstr(dcm);
1957         *keyboard_charmap = cpstr(dcm);
1958 #else
1959         *err = cpstr("Bad call to setup_for_input_output");
1960 #endif
1961
1962         *input_cs_arg = NULL;
1963         return(0);
1964     }
1965
1966
1967 try_again1:
1968     if(!(*display_charmap))
1969       *display_charmap = cpstr("US-ASCII");
1970
1971     if(!(*keyboard_charmap))
1972       *keyboard_charmap = cpstr(*display_charmap);
1973
1974     if(*keyboard_charmap){
1975         supported = input_charset_is_supported(*keyboard_charmap);
1976
1977         if(supported){
1978             if(!strucmp(*keyboard_charmap, "utf-8"))
1979               input_cs = utf8_charset(*keyboard_charmap);
1980             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1981               input_cs = cs;
1982         }
1983         else{
1984             if(err && !*err){
1985                 int iso2022jp = 0;
1986
1987                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
1988                   iso2022jp = 1;
1989
1990                 snprintf(buf, sizeof(buf),
1991                      /* TRANSLATORS: The first argument is the name of the character
1992                         set the user is trying to use (which is unsupported by alpine).
1993                         The second argument is " (except for posting)" if they are
1994                         trying to use ISO-2022-JP for something other than posting. */
1995                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
1996                      *keyboard_charmap,
1997                      iso2022jp ? _(" (except for posting)") : "");
1998
1999                 *err = cpstr(buf);
2000             }
2001
2002             input_cs = NULL;
2003             fs_give((void **) keyboard_charmap);
2004             *keyboard_charmap = cpstr("US-ASCII");
2005             if(!already_tried){
2006                 already_tried++;
2007                 goto try_again1;
2008             }
2009         }
2010     }
2011
2012
2013 try_again2:
2014     if(!(*display_charmap))
2015       *display_charmap = cpstr("US-ASCII");
2016
2017     if(*display_charmap){
2018         supported = output_charset_is_supported(*display_charmap);
2019         if(supported){
2020             if(!strucmp(*display_charmap, "utf-8"))
2021               init_utf8_display(1, NULL);
2022             else if((cs = utf8_charset(*display_charmap)) != NULL)
2023               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2024         }
2025         else{
2026             if(err && !*err){
2027                 int iso2022jp = 0;
2028
2029                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2030                   iso2022jp = 1;
2031
2032                 snprintf(buf, sizeof(buf),
2033                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2034                      *display_charmap,
2035                      iso2022jp ? _(" (except for posting)") : "");
2036
2037                 *err = cpstr(buf);
2038             }
2039
2040             fs_give((void **) display_charmap);
2041             if(!already_tried){
2042                 already_tried++;
2043                 goto try_again2;
2044             }
2045         }
2046     }
2047     else{
2048         if(err && !*err)
2049           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2050     }
2051
2052 #undef cpstr
2053
2054     *input_cs_arg = (void *) input_cs;
2055
2056     return(0);
2057 }
2058
2059
2060 int
2061 input_charset_is_supported(char *input_charset)
2062 {
2063     const CHARSET *cs;
2064
2065     if(!(input_charset && *input_charset))
2066       return 0;
2067
2068     if(!strucmp(input_charset, "utf-8"))
2069       return 1;
2070
2071     if((cs = utf8_charset(input_charset)) != NULL){
2072
2073         /*
2074          * This was true 2006-09-25.
2075          */
2076         switch(cs->type){
2077           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2078           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2079           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2080           case CT_UCS4: case CT_UTF16:
2081             return 1;
2082             break;
2083
2084           default:
2085             break;
2086         }
2087     }
2088
2089     return 0;
2090 }
2091
2092
2093 int
2094 output_charset_is_supported(char *output_charset)
2095 {
2096     const CHARSET *cs;
2097
2098     if(!(output_charset && *output_charset))
2099       return 0;
2100
2101     if(!strucmp(output_charset, "utf-8"))
2102       return 1;
2103
2104     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2105       return 1;
2106
2107     return 0;
2108 }
2109
2110
/*
 * Returns 1 if posting_charset can be used for posting, else 0.
 * That is the case for ISO-2022-JP and for anything that
 * output_charset_is_supported() accepts.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
2118
2119
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * The return value is either NULL (no usable charset could be worked
 * out, the caller decides what to do) or a name accepted by
 * output_charset_is_supported(). It points at the nl_langinfo() result,
 * a string constant or a static buffer, so the caller must not free it.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        char *p;

        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        else if(!strucmp(ret, "EUCKR"))
          ret = "EUC-KR";
        else if(!strucmp(ret, "EUCKP"))         /* kept for compatibility */
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if((p = strstr(ret, "8859")) != NULL){
            /*
             * Look for the part number after 8859, allowing for one
             * separator character in between (as in 8859-1 or 8859_1).
             * Don't step over the end of the string if nothing follows.
             */
            p += 4;
            if(*p && !isdigit((unsigned char) *p))
              p++;

            if(isdigit((unsigned char) *p)){
                static char buf[12];

                /* room for "ISO-8859-", two digits and the null */
                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    /* whatever we ended up with still has to be something we can use */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2185
2186
2187 /*
2188  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2189  * needed the return value will point to orig. If a conversion is done,
2190  * the return string should be freed by the caller.
2191  * If not possible, returns NULL.
2192  */
2193 char *
2194 utf8_to_charset(char *orig, char *charset, int report_err)
2195 {
2196     SIZEDTEXT src, dst;
2197     char *ret = orig;
2198
2199     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2200       return ret;
2201
2202     src.size = strlen(orig);
2203     src.data = (unsigned char *) orig;
2204
2205     if(!strucmp(charset, "us-ascii")){
2206         size_t i;
2207
2208         for(i = 0; i < src.size; i++)
2209           if(src.data[i] & 0x80)
2210             return NULL;
2211
2212         return ret;
2213     }
2214
2215     /*
2216      * This works for ISO-2022-JP because of special code in utf8_cstext
2217      * but not for other 2022 charsets.
2218      */
2219     memset(&dst, 0, sizeof(dst));
2220     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2221       ret = (char *) dst.data;          /* c-client already null terminates it */
2222     else
2223       ret = NULL;
2224
2225     if((unsigned char *) ret != dst.data && dst.data)
2226       fs_give((void **) &dst.data);
2227
2228     return ret;
2229 }
2230
2231
/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string. May be negative,
 *                 in which case the result starts with a '-'.
 *
 * Result: pointer to static string representing number with a comma
 * between each group of three digits, e.g. 1234567 gives "1,234,567".
 * Can use up to 3 comatose results at once, the fourth call reuses the
 * buffer returned by the first.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          digits[3 * sizeof(unsigned long) + 1];
    unsigned long magnitude;
    size_t        ndigits, i;
    char         *b, *end;

    whichbuf = (whichbuf + 1) % 3;

    b   = buf[whichbuf];
    end = b + sizeof(buf[0]) - 1;       /* always leave room for the null */

    /* negate as unsigned so that the most negative long doesn't overflow */
    if(number < 0){
        magnitude = 0UL - (unsigned long) number;
        *b++ = '-';
    }
    else
      magnitude = (unsigned long) number;

    snprintf(digits, sizeof(digits), "%lu", magnitude);
    ndigits = strlen(digits);

    for(i = 0; i < ndigits; i++){
        /* a comma goes wherever a multiple of three digits remains */
        if(i > 0 && (ndigits - i) % 3 == 0 && b < end)
          *b++ = ',';

        if(b < end)
          *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
2276
2277
/*
 * Like comatose() but leave out the commas.
 *
 * Result: pointer to static string holding the decimal form of number.
 * Three buffers are used in rotation so up to 3 results can be in use
 * at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    slot = (slot + 1) % 3;
    out  = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2291
2292
2293 /*
2294  * line_paint - where the real work of managing what is displayed gets done.
2295  */
2296 void
2297 line_paint(int offset,                  /* current dot offset into vl */
2298            struct display_line *displ,
2299            int *passwd)                 /* flag to hide display of chars */
2300 {
2301     int i, w, w2, already_got_one = 0;
2302     int vfirst, vlast, dfirst, dlast, vi, di;
2303     int new_vbase;
2304     unsigned (*width_a_to_b)(UCS *, int, int);
2305
2306     /*
2307      * Set passwd to 10 in caller if you want to conceal the
2308      * password but not print asterisks for feedback.
2309      *
2310      * Set passwd to 1 in caller to conceal by printing asterisks.
2311      */
2312     if(passwd && *passwd >= 10){        /* don't show asterisks */
2313         if(*passwd > 10)
2314           return;
2315         else
2316           *passwd = 11;         /* only blat once */
2317
2318         i = 0;
2319         (*displ->movecursor)(displ->row, displ->col);
2320         while(i++ <= displ->dwid)
2321           (*displ->writechar)(' ');
2322
2323         (*displ->movecursor)(displ->row, displ->col);
2324         return;
2325     }
2326
2327     if(passwd && *passwd)
2328       width_a_to_b = single_width_chars_a_to_b;
2329     else
2330       width_a_to_b = ucs4_str_width_a_to_b;
2331
2332     /*
2333      * vl is the virtual line (the actual data). We operate on it by typing
2334      * characters to be added and deleting and so forth. In this routine we
2335      * copy a subset of those UCS-4 characters in vl into dl, the display
2336      * array, and show that subset on the screen.
2337      *
2338      * Offset is the location of the cursor in vl.
2339      *
2340      * We will display the string starting from vbase.
2341      * We have dwid screen cells to work in.
2342      * We may have to adjust vbase in order to display the
2343      * part of the string that contains the cursor.
2344      *
2345      * We'll make the display look like
2346      *   vl    a b c d e f g h i j k l m
2347      *             xxxxxxxxxxxxx  <- width dwid window
2348      *             < d e f g h >
2349      *               |
2350      *             vbase
2351      * The < will be there if vbase > 0.
2352      * The > will be there if the string from vbase to the
2353      * end can't all fit in the window.
2354      */
2355
2356     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2357
2358     /*
2359      * Adjust vbase so offset is not out of the window to the right.
2360      * (The +2 in w + 2 is for a possible " >" if the string goes past
2361      *  the right hand edge of the window and if the last visible character
2362      * is double wide. We don't want the offset to be under that > character.)
2363      */
2364     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2365         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2366         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2367         /*
2368          * offset is off the window to the right
2369          * It looks like   a b c d e f g h
2370          *                   |         |
2371          *               vbase         offset
2372          * and offset is either past the right edge,
2373          * or right at the right edge (and maybe under >),
2374          * or one before right at the edge (and maybe on space
2375          * for half a character).
2376          *
2377          * Since the characters may be double width it is slightly
2378          * complicated to figure out how far to increase vbase.
2379          * We're going to scoot over past width w/2 characters and
2380          * then see if that's sufficient.
2381          */
2382         new_vbase = displ->vbase + 1;
2383         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2384             w2 < displ->dwid/2;
2385             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2386           new_vbase++;
2387
2388         displ->vbase = new_vbase;
2389     }
2390
2391     /* adjust so offset is not out of the window to the left */
2392     while(displ->vbase > 0 && displ->vbase >= offset){
2393         /* add about dwid/2 more width */
2394         new_vbase = displ->vbase - 1;
2395         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2396             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2397             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2398           new_vbase--;
2399
2400         /* but don't let it get too small, recheck off right end */
2401         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2402             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2403             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2404           new_vbase++;
2405
2406         displ->vbase = MAX(new_vbase, 0);
2407     }
2408
2409     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2410       displ->vbase = 0;
2411
2412     vfirst = displ->vbase;
2413     dfirst = 0;
2414     if(displ->vbase > 0){                       /* off screen cue left */
2415         dfirst = 1;                             /* index which matches vfirst */
2416         displ->dl[0] = '<';
2417     }
2418
2419     vlast = displ->vused-1;                     /* end */
2420     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2421
2422     if(w + dfirst > displ->dwid){                       /* off window right */
2423
2424         /* find last ucs character to be printed */
2425         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2426           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2427
2428         /* worry about double-width characters */
2429         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2430             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2431             displ->dl[dlast] = '>';
2432         }
2433         else{
2434             dlast = dfirst + vlast - vfirst + 1;
2435             displ->dl[dlast++] = ' ';
2436             displ->dl[dlast] = '>';
2437         }
2438     }
2439     else
2440       dlast = dfirst + vlast - vfirst;
2441
2442     /*
2443      * Copy the relevant part of the virtual line into the display line.
2444      */
2445     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2446       if(passwd && *passwd)
2447         displ->dl[di] = '*';            /* to conceal password */
2448       else
2449         displ->dl[di] = displ->vl[vi];
2450
2451     /*
2452      * Add spaces to clear the rest of the line.
2453      * We have dwid total space to fill.
2454      */
2455     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2456     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2457       displ->dl[di++] = ' ';
2458
2459     /*
2460      * Draw from left to right, skipping until we get to
2461      * something that is different. Characters may be different
2462      * widths than they were initially so paint from there the
2463      * rest of the way.
2464      */
2465     for(di = 0; displ->dl[di]; di++){
2466         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2467             /* move cursor first time */
2468             if(!already_got_one++){
2469                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2470                 (*displ->movecursor)(displ->row, displ->col + w);
2471             }
2472
2473             (*displ->writechar)(displ->dl[di]);
2474             displ->olddl[di] = displ->dl[di];
2475         }
2476     }
2477
2478     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2479
2480     /*
2481      * Move the cursor to the offset.
2482      *
2483      * The offset is relative to the start of the virtual array. We need
2484      * to find the location on the screen. The offset into the display array
2485      * will be offset-vbase+dfirst. We want to be at the start of that
2486      * character, so we need to find the width of all the characters up
2487      * to that point.
2488      */
2489     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2490
2491     (*displ->movecursor)(displ->row, displ->col + w);
2492 }
2493
2494
2495 /*
2496  * This is just like ucs4_str_width_a_to_b() except all of the characters
2497  * are assumed to be of width 1. This is for printing out *'s when user
2498  * enters a password, while still managing to use the same code to do the
2499  * display.
2500  */
2501 unsigned
2502 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2503 {
2504     unsigned width = 0;
2505     int i;
2506
2507     if(ucsstr)
2508       for(i = a; i <= b && ucsstr[i]; i++)
2509         width++;
2510
2511     return width;
2512 }