pith/charconv/utf8.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2013-2017 Eduardo Chappa
   8  * Copyright 2006-2008 University of Washington
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19
  20 /* includable WITHOUT dependency on c-client */
  21 #include "../../c-client/mail.h"
  22 #include "../../c-client/utf8.h"
  23
  24 #ifdef _WINDOWS
  25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  26 #undef ERROR
  27 #else
  28 #define _XOPEN_SOURCE
  29 #endif
  30
  31 #include <system.h>
  32
  33 #include "../../c-client/fs.h"
  34
  35 /* includable WITHOUT dependency on pico */
  36 #include "../../pico/keydefs.h"
  37
  38 #include "../osdep/collate.h"
  39 #include "../filttype.h"
  40
  41 #include "utf8.h"
  42
  43 #include <stdarg.h>
  44
  45
  46 unsigned single_width_chars_a_to_b(UCS *, int, int);
  47
  48
  49 static char locale_charmap[50];
  50
  51 static int   native_utf8;
  52 static void *display_data;
  53
  54 void
  55 init_utf8_display(int utf8, void *rmap)
  56 {
  57     native_utf8 = utf8;
  58     display_data = rmap;
  59 }
  60
  61
  62 /*
  63  * Argument is a UCS-4 wide character.
  64  * Returns the environment dependent cell width of the
  65  * character when printed to the screen.
  66  * This will be -1 if the character is not printable.
  67  * It will be >= zero if it is printable.
  68  *
  69  * Note that in the case it is not printable but it is still sent to
  70  * Writechar, Writechar will print a '?' with width 1.
  71  */
  72 int
  73 wcellwidth(UCS ucs)
  74 {
  75     char dummy[32];
  76     long w;
  77
  78     /*
  79      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  80      * That's the assumption here.
  81      */
  82
  83     if(native_utf8){                    /* display is UTF-8 capable */
  84         w = ucs4_width((unsigned long) ucs);
  85         return((w & U4W_ERROR) ? -1 : w);
  86     }
  87     else if(display_data){
  88         if(wtomb(dummy, ucs) < 0)
  89           return(-1);
  90         else{
  91             w = ucs4_width((unsigned long) ucs);
  92             return((w & U4W_ERROR) ? -1 : w);
  93         }
  94     }
  95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
  96     else
  97       return(wcwidth((wchar_t) ucs));
  98 #else
  99     return(0);
 100 #endif
 101 }
 102
 103 /* ambiguous width zone character function */
 104 int
 105 pith_ucs4width(UCS ucs)
 106 {
 107 #if !defined(_WINDOWS) && HAVE_WCWIDTH
 108   return wcwidth((wchar_t) ucs);
 109 #else
 110   return (ucs >= 0x2100) ? 2 : 1;
 111 #endif /* _WINDOWS */
 112 }
 113
 114 /*
 115  * Argument is a UCS-4 wide character.
 116  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 117  * Dest is a buffer at least xx chars wide where the multi-byte version
 118  * of the wide character will be written.
 119  * The returned value is the number of bytes written to dest or -1
 120  * if the conversion can't be done.
 121  */
 122 int
 123 wtomb(char *dest, UCS ucs)
 124 {
 125     /*
 126      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 127      * That's the assumption here.
 128      */
 129
 130     if(native_utf8){
 131         unsigned char *newdptr;
 132
 133         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 134         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 135     }
 136     else if(display_data){
 137         unsigned long ucs4;
 138         int           ret;
 139
 140         ucs4 = (unsigned long) ucs;
 141         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 142         if(ret >= 0)
 143           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 144         else
 145           ret = -1;
 146
 147         return(ret);
 148     }
 149     else
 150       return(wcrtomb(dest, (wchar_t) ucs, NULL));
 151 }
 152
 153
 154 /*
 155  * This function does not necessarily update inputp and remaining_octets, so
 156  * don't rely on that. The c-client version does but the other doesn't.
 157  */
 158 UCS
 159 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 160 {
 161     UCS ucs;
 162
 163     if(input_cs){
 164         CHARSET *cast_input_cs;
 165
 166         cast_input_cs = (CHARSET *) input_cs;
 167
 168         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 169           case U8G_ENDSTRG:
 170           case U8G_ENDSTRI:
 171             return(CCONV_NEEDMORE);
 172
 173           default:
 174             if(ucs & U8G_ERROR || ucs == UBOGON)
 175               return(CCONV_BADCHAR);
 176
 177             return(ucs);
 178         }
 179     }
 180     else{
 181         size_t ret;
 182         wchar_t w;
 183
 184         /*
 185          * Warning:  input_cs and remaining_octets are unused in this
 186          * half of the if/else.
 187          *
 188          * Unfortunately, we can't tell the difference between a source string
 189          * that is just not long enough and one that has characters that can't
 190          * be converted even though it is long enough. We return NEEDMORE in both cases.
 191          */
 192         ret = mbstowcs(&w, (char *) (*inputp), 1);
 193         if(ret == (size_t)(-1))
 194           return(CCONV_NEEDMORE);
 195         else{
 196           ucs = (UCS) w;
 197           return(ucs);
 198         }
 199     }
 200 }
 201
 202
 203 void
 204 set_locale_charmap(char *charmap)
 205 {
 206     if(charmap){
 207         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 208         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 209     }
 210     else
 211       locale_charmap[0] = '\0';
 212 }
 213
 214
 215 /*
 216  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 217  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 218  * The caller is responsible for freeing the returned value.
 219  *
 220  * Args  str     -- the string to convert
 221  */
 222 char *
 223 convert_to_utf8(char *str, char *fromcharset, int flags)
 224 {
 225     char          *ret = NULL;
 226     char          *fcharset;
 227     SIZEDTEXT      src, result;
 228     const CHARSET *cs;
 229     int            try;
 230
 231     src.data = (unsigned char *) str;
 232     src.size = strlen(str);
 233
 234     /* already UTF-8, return NULL */
 235     if(!(flags & CU8_NOINFER)
 236        && (cs = utf8_infercharset(&src))
 237        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 238       return(ret);
 239
 240     try = 1;
 241     while(try < 5){
 242         switch(try){
 243           case 1:
 244             fcharset = fromcharset;
 245             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 246               break;    /* give it a try */
 247             else
 248               try++;    /* fall through */
 249
 250           case 2:
 251             if(!(flags & CU8_NOINFER)){
 252                 fcharset = cs ? cs->name : NULL;
 253                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 254                   break;
 255                 else
 256                   try++;        /* fall through */
 257             }
 258             else
 259               try++;    /* fall through */
 260
 261           case 3:
 262             fcharset = locale_charmap;
 263             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 264               break;
 265             else
 266               try++;    /* fall through */
 267
 268           default:
 269             fcharset = "ISO-8859-1";            /* this will "work" */
 270             break;
 271         }
 272
 273         memset(&result, 0, sizeof(result));
 274
 275         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 276             if(!(result.size == src.size && result.data == src.data)){
 277                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 278                 strncpy(ret, (char *) result.data, result.size);
 279                 ret[result.size] = '\0';
 280             }
 281             /* else no conversion necessary */
 282
 283             return(ret);
 284         }
 285
 286         try++;
 287     }
 288
 289     /* won't make it to here */
 290     return(ret);
 291 }
 292
 293
 294 /*
 295  * Convert from UTF-8 to user's locale charset.
 296  * This actually uses the wtomb routine to do the conversion, and that
 297  * relies on setup_for_input_output having been called.
 298  * If no conversion is necessary, NULL is returned, otherwise an allocated
 299  * string in the locale charset is returned and the caller is responsible
 300  * for freeing it.
 301  */
 302 char *
 303 convert_to_locale(char *utf8str)
 304 {
 305 #define CHNK 500
 306     char *inp, *retp, *ret = NULL;
 307     CBUF_S cb;
 308     int r, alloced;
 309
 310     if(native_utf8 || !utf8str || !utf8str[0])
 311       return(NULL);
 312
 313     cb.cbuf[0] = '\0';
 314     cb.cbufp = cb.cbufend = cb.cbuf;
 315     inp = utf8str;
 316
 317     alloced = CHNK;
 318     ret = (char *) fs_get(alloced * sizeof(char));
 319     retp = ret;
 320
 321     /*
 322      * There's gotta be a better way to do this but utf8_to_locale was
 323      * available and everything looks like a nail when all you have
 324      * is a hammer.
 325      */
 326     while(*inp){
 327         /*
 328          * We're placing the outgoing stream of characters in ret, a multi-byte
 329          * array of characters in the user's locale charset. See if there is
 330          * enough room for the next wide characters worth of output chars
 331          * and allocate more space if not.
 332          */
 333         if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
 334             alloced += CHNK;
 335             fs_resize((void **) &ret, alloced * sizeof(char));
 336         }
 337
 338         r = utf8_to_locale((int) *inp++, &cb,
 339                            (unsigned char *) retp, alloced-(retp-ret));
 340
 341         retp += r;
 342     }
 343
 344     *retp = '\0';
 345
 346     fs_resize((void **) &ret, strlen(ret)+1);
 347
 348     return(ret);
 349 }
 350
 351
 352 /*
 353  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 354  * filled in with multi-byte characters. The return value is the
 355  * number of valid characters in obuf to be used.
 356  */
 357 int
 358 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 359 {
 360     int outchars = 0;
 361
 362     if(!(cb && cb->cbufp))
 363       return(0);
 364
 365     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 366         unsigned char *inputp;
 367         unsigned long remaining_octets;
 368         UCS ucs;
 369
 370         *(cb->cbufp)++ = (unsigned char) c;
 371         inputp = cb->cbuf;
 372         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 373         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 374
 375         switch(ucs){
 376           case U8G_ENDSTRG:     /* incomplete character, wait */
 377           case U8G_ENDSTRI:     /* incomplete character, wait */
 378             break;
 379
 380           default:
 381             if(ucs & U8G_ERROR || ucs == UBOGON){
 382                 /*
 383                  * None of these cases is supposed to happen. If it
 384                  * does happen then the input stream isn't UTF-8
 385                  * so something is wrong. Treat each character in the
 386                  * input buffer as a separate error character and
 387                  * print a '?' for each.
 388                  */
 389                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 390                   obuf[outchars++] = '?';
 391
 392                 cb->cbufp = cb->cbuf;
 393             }
 394             else{
 395                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 396                     /*
 397                      * This happens when we have a UTF-8 character that
 398                      * we aren't able to print in our locale. For example,
 399                      * if the locale is setup with the terminal
 400                      * expecting ISO-8859-1 characters then there are
 401                      * lots of UTF-8 characters that can't be printed.
 402                      * Print a '?' instead.
 403                      */
 404                     obuf[outchars++] = '?';
 405                 }
 406                 else{
 407                     /*
 408                      * Convert the ucs into the multibyte
 409                      * character that corresponds to the
 410                      * ucs in the users locale.
 411                      */
 412                     outchars = wtomb((char *) obuf, ucs);
 413                     if(outchars < 0){
 414                         obuf[0] = '?';
 415                         outchars = 1;
 416                     }
 417                 }
 418
 419                 /* update the input buffer */
 420                 if(inputp >= cb->cbufp) /* this should be the case */
 421                   cb->cbufp = cb->cbuf;
 422                 else{           /* extra chars for some reason? */
 423                     unsigned char *q, *newcbufp;
 424
 425                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 426                     q = cb->cbuf;
 427                     while(inputp < cb->cbufp)
 428                       *q++ = *inputp++;
 429
 430                     cb->cbufp = newcbufp;
 431                 }
 432             }
 433
 434             break;
 435         }
 436     }
 437     else{                       /* error */
 438         obuf[0] = '?';
 439         outchars = 1;
 440         cb->cbufp = cb->cbuf;   /* start over */
 441     }
 442
 443     return(outchars);
 444 }
 445
 446
 447 /*
 448  * Returns the screen cells width of the UCS-4 string argument.
 449  * The source string is zero terminated.
 450  */
 451 unsigned
 452 ucs4_str_width(UCS *ucsstr)
 453 {
 454     unsigned width = 0;
 455     int w;
 456
 457     if(ucsstr)
 458       while(*ucsstr){
 459         w = wcellwidth(*ucsstr++);
 460         if(w != U4W_CTLSRGT)
 461           width += (w < 0 ? 1 : w);
 462       }
 463
 464     return width;
 465 }
 466
 467
 468 /*
 469  * Returns the screen cells width of the UCS-4 string argument
 470  * from ucsstr[a] through (inclusive) ucsstr[b].
 471  * No checking is done to make sure a starts in the middle
 472  * of a UCS-4 array.
 473  */
 474 unsigned
 475 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 476 {
 477     unsigned width = 0;
 478     int i, w;
 479
 480     if(ucsstr)
 481       for(i = a; i <= b && ucsstr[i]; i++){
 482         w = wcellwidth(ucsstr[i]);
 483         if(w != U4W_CTLSRGT)
 484           width += (w < 0 ? 1 : w);
 485       }
 486
 487     return width;
 488 }
 489
 490
 491 /*
 492  * Returns the screen cells width of the UCS-4 string argument
 493  * from ustart through (exclusive) uend.
 494  * No checking is done to make sure it starts in the middle
 495  * of a UCS-4 array.
 496  */
 497 unsigned
 498 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 499 {
 500     UCS *u;
 501     unsigned width = 0;
 502     int w;
 503
 504     if(!ustart)
 505       return width;
 506
 507     if(ustart)
 508       for(u = ustart; u < uend; u++){
 509         w = wcellwidth(*u);
 510         if(w != U4W_CTLSRGT)
 511           width += (w < 0 ? 1 : w);
 512       }
 513
 514     return(width);
 515 }
 516
 517
 518 /*
 519  * Return the largest possible pointer into ucs4str so that the width
 520  * of the string from ucs4str to the pointer (exclusive)
 521  * is maxwidth or less. Also stops at a null character.
 522  */
 523 UCS *
 524 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 525 {
 526     UCS *u;
 527     int w_consumed = 0, w, done = 0;
 528
 529     u = ucs4str;
 530
 531     if(u)
 532       while(!done && *u && w_consumed <= maxwidth){
 533         w = wcellwidth(*u);
 534         w = (w >= 0 ? w : 1);
 535         if(w_consumed + w <= maxwidth){
 536             w_consumed += w;
 537             ++u;
 538         }
 539         else
 540           ++done;
 541       }
 542
 543     return(u);
 544 }
 545
 546
 547 /*
 548  * Convert and copy a UTF-8 string into a UCS-4 NULL
 549  * terminated array. Just like cpystr only it converts
 550  * from UTF-8 to UCS-4.
 551  *
 552  * Returned UCS-4 string needs to be freed by caller.
 553  */
 554 UCS *
 555 utf8_to_ucs4_cpystr(char *utf8src)
 556 {
 557     size_t         retsize;
 558     UCS           *ret = NULL;
 559     UCS            ucs;
 560     unsigned long  remaining_octets;
 561     unsigned char *readptr;
 562     size_t         arrayindex;
 563
 564     /*
 565      * We don't know how big to allocate the return array
 566      * because variable numbers of octets in the src array
 567      * will combine to make UCS-4 characters. The number of
 568      * UCS-4 characters is less than or equal to the number
 569      * of src characters, though.
 570      */
 571
 572     if(!utf8src)
 573       return NULL;
 574
 575     retsize = strlen(utf8src) + 1;
 576
 577     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 578     memset(ret, 0, retsize * sizeof(*ret));
 579
 580     readptr = (unsigned char *) utf8src;
 581     remaining_octets = retsize-1;
 582     arrayindex = 0;
 583
 584     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 585         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 586
 587         if(ucs & U8G_ERROR || ucs == UBOGON)
 588           remaining_octets = 0;
 589         else
 590           ret[arrayindex++] = ucs;
 591     }
 592
 593     ret[arrayindex] = '\0';
 594
 595     /* get rid of excess size */
 596     if(arrayindex+1 < retsize)
 597       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 598
 599     return ret;
 600 }
 601
 602
 603 /*
 604  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 605  * terminated string. Just like cpystr only it converts
 606  * from UCS-4 to UTF-8.
 607  *
 608  * Returned UTF-8 string needs to be freed by caller.
 609  */
 610 char *
 611 ucs4_to_utf8_cpystr(UCS *ucs4src)
 612 {
 613     unsigned char *ret = NULL;
 614     unsigned char *writeptr;
 615     int            i;
 616
 617     if(!ucs4src)
 618       return NULL;
 619
 620     /*
 621      * Over-allocate and then resize at the end.
 622      */
 623
 624     /* count characters in source */
 625     for(i = 0; ucs4src[i]; i++)
 626       ;
 627
 628     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 629     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 630
 631     writeptr = ret;
 632     for(i = 0; ucs4src[i]; i++)
 633       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 634
 635     /* get rid of excess size */
 636     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 637
 638     return ((char *) ret);
 639 }
 640
 641
 642 /*
 643  * Similar to above but copy a fixed number of source
 644  * characters instead of going until null terminator.
 645  */
 646 char *
 647 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 648 {
 649     unsigned char *ret = NULL;
 650     unsigned char *writeptr;
 651     int            i;
 652
 653     if(!ucs4src)
 654       return NULL;
 655
 656     /*
 657      * Over-allocate and then resize at the end.
 658      */
 659
 660     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 661     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 662
 663     writeptr = ret;
 664     for(i = 0; i < ucs4src_len; i++)
 665       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 666
 667     /* get rid of excess size */
 668     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 669
 670     return ((char *) ret);
 671 }
 672
 673
 674 #ifdef _WINDOWS
 675 /*
 676  * Convert a UTF-8 argument into an LPTSTR version
 677  * of that argument. The result is allocated here
 678  * and should be freed by the caller.
 679  */
 680 LPTSTR
 681 utf8_to_lptstr(LPSTR arg_utf8)
 682 {
 683      int lptstr_len;
 684      LPTSTR lptstr_ret = NULL;
 685
 686      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 687      if(lptstr_len > 0)
 688      {
 689          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 690          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 691              arg_utf8, -1, lptstr_ret, lptstr_len );
 692      }
 693
 694      if(!lptstr_len)
 695      {
 696          /* check GetLastError()? */
 697          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 698          lptstr_ret[0] = 0;
 699      }
 700
 701      return lptstr_ret;
 702 }
 703
 704
 705 /*
 706  * Convert an LPTSTR argument into a UTF-8 version
 707  * of that argument. The result is allocated here
 708  * and should be freed by the caller.
 709  */
 710 LPSTR
 711 lptstr_to_utf8(LPTSTR arg_lptstr)
 712 {
 713      int utf8str_len;
 714      LPSTR utf8str_ret = NULL;
 715
 716      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 717      if(utf8str_len > 0)
 718      {
 719          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 720          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 721              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 722      }
 723
 724      if(!utf8str_len)
 725      {
 726          /* check GetLastError()? */
 727          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 728          utf8str_ret[0] = 0;
 729      }
 730
 731      return utf8str_ret;
 732 }
 733
 734
 735 /*
 736  * Convert a UCS4 argument into an LPTSTR version
 737  * of that argument. The result is allocated here
 738  * and should be freed by the caller.
 739  */
 740 LPTSTR
 741 ucs4_to_lptstr(UCS *arg_ucs4)
 742 {
 743     LPTSTR ret_lptstr = NULL;
 744     size_t len;
 745     size_t i;
 746
 747     if(arg_ucs4){
 748         len = ucs4_strlen(arg_ucs4);
 749         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 750         /* bogus conversion ignores UTF-16 */
 751         for(i = 0; i < len; i++)
 752           ret_lptstr[i] = arg_ucs4[i];
 753
 754         ret_lptstr[len] = '\0';
 755     }
 756
 757     return(ret_lptstr);
 758 }
 759
 760
 761 /*
 762  * Convert an LPTSTR argument into a UCS4 version
 763  * of that argument. The result is MemAlloc'd here
 764  * and should be freed by the caller.
 765  */
 766 UCS *
 767 lptstr_to_ucs4(LPTSTR arg_lptstr)
 768 {
 769     UCS *ret_ucs4 = NULL;
 770     size_t len;
 771     size_t i;
 772
 773     if(arg_lptstr){
 774         len = _tcslen(arg_lptstr);
 775         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 776         /* bogus conversion ignores UTF-16 */
 777         for(i = 0; i < len; i++)
 778           ret_ucs4[i] = arg_lptstr[i];
 779
 780         ret_ucs4[len] = '\0';
 781     }
 782
 783     return(ret_ucs4);
 784 }
 785
 786 #endif /* _WINDOWS */
 787
 788
 789 /*
 790  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 791  * 1-at-a-time filled in with UCS characters. The return value is the
 792  * number of valid characters in obuf to be used. It can only
 793  * be 1 or 0 characters since we're only getting one UTF-8 character
 794  * at a time.
 795  */
 796 int
 797 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 798 {
 799     int  width = 0, outchars = 0;
 800
 801     if(!(cb && cb->cbufp))
 802       return(0);
 803
 804     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 805         unsigned char *inputp;
 806         unsigned long remaining_octets;
 807         UCS ucs;
 808
 809         *cb->cbufp++ = (unsigned char) c;
 810         inputp = cb->cbuf;
 811         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 812         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 813
 814         switch(ucs){
 815           case U8G_ENDSTRG:     /* incomplete character, wait */
 816           case U8G_ENDSTRI:     /* incomplete character, wait */
 817             break;
 818
 819           default:
 820             if(ucs & U8G_ERROR || ucs == UBOGON){
 821                 /*
 822                  * None of these cases is supposed to happen. If it
 823                  * does happen then the input stream isn't UTF-8
 824                  * so something is wrong.
 825                  */
 826                 outchars++;
 827                 *obuf = '?';
 828                 cb->cbufp = cb->cbuf;
 829                 width = 1;
 830             }
 831             else{
 832                 outchars++;
 833                 if(ucs < 0x80 && ucs >= 0x20)
 834                   width = 1;
 835
 836                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 837                     /*
 838                      * This happens when we have a UTF-8 character that
 839                      * we aren't able to print in our locale. For example,
 840                      * if the locale is setup with the terminal
 841                      * expecting ISO-8859-1 characters then there are
 842                      * lots of UTF-8 characters that can't be printed.
 843                      * Print a '?' instead.
 844                      * Don't think this should happen in Windows.
 845                      */
 846                     *obuf = '?';
 847                 }
 848                 else{
 849                     *obuf = ucs;
 850                 }
 851
 852                 /* update the input buffer */
 853                 if(inputp >= cb->cbufp) /* this should be the case */
 854                   cb->cbufp = cb->cbuf;
 855                 else{           /* extra chars for some reason? */
 856                     unsigned char *q, *newcbufp;
 857
 858                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 859                     q = cb->cbuf;
 860                     while(inputp < cb->cbufp)
 861                       *q++ = *inputp++;
 862
 863                     cb->cbufp = newcbufp;
 864                 }
 865             }
 866
 867             break;
 868         }
 869     }
 870     else{                       /* error */
 871         *obuf = '?';
 872         outchars = 1;
 873         width = 1;
 874         cb->cbufp = cb->cbuf;   /* start over */
 875     }
 876
 877     if(obufwidth)
 878       *obufwidth = width;
 879
 880     return(outchars);
 881 }
 882
 883
 884 /*
 885  * Return an allocated copy of a zero-terminated UCS-4 string.
 886  */
 887 UCS *
 888 ucs4_cpystr(UCS *ucs4src)
 889 {
 890     size_t         arraysize;
 891     UCS           *ret = NULL;
 892     size_t         i;
 893
 894     if(!ucs4src)
 895       return NULL;
 896
 897     arraysize = ucs4_strlen(ucs4src);
 898
 899     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 900     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 901
 902     for(i = 0; i < arraysize; i++)
 903       ret[i] = ucs4src[i];
 904
 905     return ret;
 906 }
 907
 908
 909 UCS *
 910 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 911 {
 912     size_t i;
 913
 914     if(ucs4src && ucs4dst){
 915         for(i = 0; i < n; i++){
 916             ucs4dst[i] = ucs4src[i];
 917             if(ucs4dst[i] == '\0')
 918               break;
 919         }
 920     }
 921
 922     return ucs4dst;
 923 }
 924
 925
 926 UCS *
 927 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 928 {
 929     size_t i;
 930     UCS *u;
 931
 932     if(ucs4src && ucs4dst){
 933         for(u = ucs4dst; *u; u++)
 934           ;
 935
 936         for(i = 0; i < n; i++){
 937             u[i] = ucs4src[i];
 938             if(u[i] == '\0')
 939               break;
 940         }
 941
 942         if(i == n)
 943           u[i] = '\0';
 944     }
 945
 946     return ucs4dst;
 947 }
 948
 949
 950 /*
 951  * Like strlen only this returns the number of non-zero characters
 952  * in a zero-terminated UCS-4 array.
 953  */
 954 size_t
 955 ucs4_strlen(UCS *ucs4str)
 956 {
 957     size_t i = 0;
 958
 959     if(ucs4str)
 960       while(ucs4str[i])
 961         i++;
 962
 963     return(i);
 964 }
 965
 966
 967 int
 968 ucs4_strcmp(UCS *s1, UCS *s2)
 969 {
 970     for(; *s1 == *s2; s1++, s2++)
 971       if(*s1 == '\0')
 972         return 0;
 973
 974     return((*s1 < *s2) ? -1 : 1);
 975 }
 976
 977
 978 UCS *
 979 ucs4_strchr(UCS *s, UCS c)
 980 {
 981     if(!s)
 982       return NULL;
 983
 984     while(*s && *s != c)
 985       s++;
 986
 987     if(*s || !c)
 988       return s;
 989     else
 990       return NULL;
 991 }
 992
 993
 994 UCS *
 995 ucs4_strrchr(UCS *s, UCS c)
 996 {
 997     UCS *ret = NULL;
 998
 999     if(!s)
1000       return ret;
1001
1002     while(*s){
1003         if(*s == c)
1004           ret = s;
1005
1006         s++;
1007     }
1008
1009     return ret;
1010 }
1011
1012
1013 /*
1014  * Returns the screen cells width of the UTF-8 string argument.
1015  */
1016 unsigned
1017 utf8_width(char *str)
1018 {
1019     unsigned width = 0;
1020     int this_width;
1021     UCS ucs;
1022     unsigned long remaining_octets;
1023     char *readptr;
1024
1025     if(!(str && *str))
1026       return(width);
1027
1028     readptr = str;
1029     remaining_octets = readptr ? strlen(readptr) : 0;
1030
1031     while(remaining_octets > 0 && *readptr){
1032
1033         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1034
1035         if(ucs & U8G_ERROR || ucs == UBOGON){
1036             /*
1037              * This should not happen, but do something to handle it anyway.
1038              * Treat each character as a single width character, which is what should
1039              * probably happen when we actually go to write it out.
1040              */
1041             remaining_octets--;
1042             readptr++;
1043             this_width = 1;
1044         }
1045         else{
1046             this_width = wcellwidth(ucs);
1047
1048             /*
1049              * If this_width is -1 that means we can't print this character
1050              * with our current locale. Writechar will print a '?'.
1051              */
1052             if(this_width < 0)
1053               this_width = 1;
1054         }
1055
1056         width += (unsigned) this_width;
1057     }
1058
1059     return(width);
1060 }
1061
1062
1063 /*
1064  * Copy UTF-8 characters from src into dst.
1065  * This is intended to be used if you want to truncate a string at
1066  * the start instead of the end. For example, you have a long string
1067  * like
1068  *       this_is_a_long_string
1069  * but not enough space to fit it into a particular field. You want to
1070  * end up with
1071  *             s_a_long_string
1072  * where that fits in a particular width. Perhaps you'd use this with ...
1073  * to get
1074  *          ...s_a_long_string
1075  * This right adjusts the end of the string in the width space and
1076  * cuts it off at the start. If there is enough width for the whole
1077  * string it will copy the string into dst with no padding.
1078  *
1079  * Copy enough characters so that the result will have screen width of
1080  * want_width screen cells in current locale.
1081  *
1082  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1083  *   to dst. This is just for protection, it shouldn't be relied on to
1084  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1085  *   characters truncated in the middle or something like that.
1086  *
1087  * Returned value is the number of bytes written to dst, not including
1088  *   the possible terminating null.
1089  *
1090  * If we can't hit want_width exactly because of double width characters
1091  *   then we will pad the end of the string with space in order to make
1092  *   the width exact.
1093  */
1094 size_t
1095 utf8_to_width_rhs(char *dst,            /* destination buffer */
1096                   char *src,            /* source string */
1097                   size_t dstlen,        /* space in dest */
1098                   unsigned want_width)  /* desired screen width */
1099 {
1100     int this_width;
1101     unsigned width_consumed = 0;
1102     UCS ucs;
1103     unsigned long remaining_octets;
1104     char *readptr, *goodreadptr, *savereadptr, *endptr;
1105     size_t nb = 0;
1106
1107     if(!src){
1108         if(dstlen > 0)
1109           dst[0] = '\0';
1110
1111         return nb;
1112     }
1113
1114     /*
1115      * Start at the end of the source string and go backwards until we
1116      * get to the desired width, but not more than the width.
1117      */
1118     readptr = src + strlen(src);
1119     endptr = readptr;
1120     goodreadptr = readptr;
1121     width_consumed = 0;
1122     savereadptr = readptr;
1123
1124     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1125         readptr = savereadptr-1){
1126
1127         savereadptr = readptr;
1128         remaining_octets = goodreadptr - readptr;
1129         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1130
1131         /*
1132          * Handling the error case is tough because an error will be the normal thing that
1133          * happens as we back through the string. So we're just going to punt on the
1134          * error for now.
1135          */
1136         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1137             if(remaining_octets > 0){
1138                 /*
1139                  * This means there are some bad octets after this good
1140                  * character so things are not going to work out well.
1141                  * Bail out.
1142                  */
1143                 savereadptr = src;      /* we're done */
1144             }
1145             else{
1146                 this_width = wcellwidth(ucs);
1147
1148                 if(this_width < 0)
1149                   this_width = 1;
1150
1151                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1152                     width_consumed += (unsigned) this_width;
1153                     goodreadptr = savereadptr;
1154                 }
1155                 else
1156                   savereadptr = src;    /* we're done */
1157             }
1158         }
1159     }
1160
1161     /*
1162      * Copy characters from goodreadptr to endptr into dst.
1163      */
1164     nb = MIN(endptr-goodreadptr, dstlen-1);
1165     strncpy(dst, goodreadptr, nb);
1166     dst[nb] = '\0';
1167
1168     /*
1169      * Pad out with spaces in order to hit width exactly.
1170      */
1171     while(width_consumed < want_width && nb < dstlen-1){
1172         dst[nb++] = ' ';
1173         dst[nb] = '\0';
1174         width_consumed++;
1175     }
1176
1177     return nb;
1178 }
1179
1180
/*
 * Helper for utf8_snprintf.
 * Assemble one printf conversion specification in spec, for example
 * "%-12.7ld", from its already parsed parts.
 *
 * Args:  spec      -- where the specification is written
 *        speclen   -- size of spec
 *        flags     -- string of flag characters, possibly empty
 *        width     -- minimum field width, left out if negative
 *        precision -- precision, left out if negative
 *        modifier  -- length modifier character, left out if 0
 *        conv      -- the conversion character
 */
static void
utf8_snprintf_make_spec(char *spec, size_t speclen, const char *flags,
                        int width, int precision, int modifier, int conv)
{
    char wbuf[24], pbuf[24], mbuf[2];

    wbuf[0] = pbuf[0] = mbuf[0] = mbuf[1] = '\0';

    if(width >= 0)
      snprintf(wbuf, sizeof(wbuf), "%d", width);

    if(precision >= 0)
      snprintf(pbuf, sizeof(pbuf), ".%d", precision);

    if(modifier)
      mbuf[0] = (char) modifier;

    snprintf(spec, speclen, "%%%s%s%s%s%c", flags, wbuf, pbuf, mbuf, conv);
}


/*
 * A version of snprintf which understands the extra conversion %w.
 *
 * The argument for %w is a UTF-8 string. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision of %s count
 * bytes instead of screen positions that won't work with UTF-8 input.
 * %m.nw means to use m and n as screen cell widths instead of byte
 * counts.
 *
 * There is no reason to use this routine unless you want to use a
 * min field width or a precision with the specifier. A plain %w
 * without widths behaves like a plain %s, except that a NULL string
 * argument for %w is treated as an empty string.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters. When a min
 * field width is given, the cell that half of a double-width character
 * would have needed is made up with padding.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * The other conversions handled are d i o x X u c s f e E g G p and %%.
 * The length modifiers h and l are honored for the integer conversions
 * (the argument is fetched as int/unsigned for none or h, and as
 * long/unsigned long for l) and L is honored for the floating point
 * conversions (long double). A modifier on any other conversion is
 * dropped. A negative width or precision passed through '*' is
 * treated as if it had not been given. Any other conversion character
 * trips an assert.
 *
 * Strategy: it would be nice to rewrite the %w's in fmt as %s's with
 * adjusted width and precision and hand everything to vsnprintf, but
 * a %*w would need the integer argument the * refers to changed, and
 * there is no way to do that. So instead the result is constructed one
 * piece at a time, formatting each conversion separately and pasting
 * the pieces together.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation.
 *
 * The return value is the number of bytes written, not counting the
 * trailing null. Unlike snprintf, when the output is cut short because
 * of size the value returned is what was actually written, not what
 * would have been written. Note that when literal text from fmt fills
 * dest completely (the return value equals size) there is no room left
 * for the null and dest is NOT null terminated.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char     newfmt[100], flagbuf[6], *f, *pdest, *end;
    char    *input_str;
    unsigned got_width;
    int      more_flags, ret, w = 0;
    int      min_field_width, field_precision, modifier;
    int      flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    size_t   room;
    va_list  args;

    newfmt[0] = '\0';
    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)                   \
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){

        /* ordinary text is copied straight through */
        if(*fmt != '%'){
            *pdest++ = *fmt++;
            continue;
        }

        fmt++;                          /* skip over the '%' */

        min_field_width = field_precision = -1;
        modifier = 0;
        flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;

        /* flags */
        for(more_flags = 1; more_flags; ){
            switch(*fmt){
              case '-':
                flags_minus = 1;
                fmt++;
                break;

              case '+':
                flags_plus = 1;
                fmt++;
                break;

              case ' ':
                flags_space = 1;
                fmt++;
                break;

              case '0':
                flags_zero = 1;
                fmt++;
                break;

              case '#':
                flags_pound = 1;
                fmt++;
                break;

              default:
                more_flags = 0;
                break;
            }
        }

        /* each flag that was seen is passed along once, in this order */
        f = flagbuf;
        if(flags_minus)
          *f++ = '-';
        if(flags_plus)
          *f++ = '+';
        if(flags_space)
          *f++ = ' ';
        if(flags_zero)
          *f++ = '0';
        if(flags_pound)
          *f++ = '#';

        *f = '\0';

        /* minimum field width */
        if(*fmt == '*'){
            min_field_width = va_arg(args, int);
            fmt++;
        }
        else if(*fmt >= '0' && *fmt <= '9'){
            min_field_width = atoi(fmt);        /* stops at first non-digit */
            while(*fmt >= '0' && *fmt <= '9')
              fmt++;
        }

        /* field precision */
        if(*fmt == '.'){
            fmt++;
            if(*fmt == '*'){
                field_precision = va_arg(args, int);
                fmt++;
            }
            else if(*fmt >= '0' && *fmt <= '9'){
                field_precision = atoi(fmt);
                while(*fmt >= '0' && *fmt <= '9')
                  fmt++;
            }
        }

        /* length modifier */
        if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
          modifier = *fmt++;

        /* the loop condition guarantees that room is at least 1 */
        room = size - (pdest - dest);

        /* conversion character */
        switch(*fmt){
          case 'w':
            /*
             * Figure out the byte width and byte precision which
             * produce the screen width and screen precision asked
             * for, then print the string with an ordinary %s.
             */
            input_str = va_arg(args, char *);
            if(!input_str)
              input_str = "";

            if(field_precision >= 0 || min_field_width >= 0)
              w = utf8_width(input_str);

            if(field_precision >= 0){
                if(w <= field_precision)
                  field_precision = -1;  /* print it all */
                else{
                    /*
                     * We need to cut off some of the input_str
                     * in this case.
                     */
                    end = utf8_count_forw_width(input_str, field_precision, &got_width);
                    field_precision = (int) (end - input_str);
                    /* new w with this field_precision */
                    w = got_width;
                }
            }

            /* bytes to be printed plus however many cells of padding */
            if(min_field_width >= 0)
              min_field_width = ((field_precision >= 0) ? field_precision
                                                        : (int) strlen(input_str))
                                 + MAX(0, min_field_width - w);

            utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                    min_field_width, field_precision, 0, 's');
            snprintf(pdest, room, newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case '\0':
            /* fmt ended inside a specifier, back up so the loop sees the null */
            fmt--;
            break;

          case 'd': case 'i':
            if(modifier == 'l'){
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision, 'l', *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, long));
            }
            else{
                /* a short argument has been promoted to int by the caller */
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision,
                                        (modifier == 'h') ? 'h' : 0, *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, int));
            }

            pdest += strlen(pdest);
            break;

          case 'o': case 'x': case 'X': case 'u':
            if(modifier == 'l'){
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision, 'l', *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, unsigned long));
            }
            else{
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision,
                                        (modifier == 'h') ? 'h' : 0, *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, unsigned int));
            }

            pdest += strlen(pdest);
            break;

          case 'c':
            utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                    min_field_width, field_precision, 0, *fmt);
            snprintf(pdest, room, newfmt, va_arg(args, int));
            pdest += strlen(pdest);
            break;

          case 's':
            input_str = va_arg(args, char *);
            utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                    min_field_width, field_precision, 0, *fmt);
            snprintf(pdest, room, newfmt, input_str);
            pdest += strlen(pdest);
            break;

          case 'f': case 'e': case 'E':
          case 'g': case 'G':
            if(modifier == 'L'){
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision, 'L', *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, long double));
            }
            else{
                utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                        min_field_width, field_precision, 0, *fmt);
                snprintf(pdest, room, newfmt, va_arg(args, double));
            }

            pdest += strlen(pdest);
            break;

          case 'p':
            utf8_snprintf_make_spec(newfmt, sizeof(newfmt), flagbuf,
                                    min_field_width, field_precision, 0, *fmt);
            snprintf(pdest, room, newfmt, va_arg(args, void *));
            pdest += strlen(pdest);
            break;

          case '%':
            if(IS_ROOM_IN_DEST(1))
              *pdest++ =  '%';

            break;

          default:
            /* didn't think of this type */
            assert(0);
            break;
        }

        fmt++;
    }

    ret = pdest - dest;

    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1527
1528
1529 /*
1530  * Copy UTF-8 characters from src into dst.
1531  * Copy enough characters so that the result will have (<=) screen width of
1532  * want_width screen cells in current locale.
1533  *
1534  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1535  *   to dst.
1536  *
1537  * Returned value is the number of bytes written to dst, not including
1538  *   the possible terminating null.
1539  * Got_width is another returned value. It is the width in screen cells of
1540  *   the string placed in dst. It will be the same as want_width if there
1541  *   are enough characters in the src to do that and if the character widths
1542  *   hit the width exactly. It will be less than want_width if we run out
1543  *   of src characters or if the next character width would skip over the
1544  *   width we want, because it is double width.
1545  *
1546  * Zero width characters are collected and included at the end of the string.
1547  *   That is, if we make it to want_width but there is still a zero length
1548  *   character sitting in src, we add that to dst. This might be an accent
1549  *   or something like that.
1550  */
1551 size_t
1552 utf8_to_width(char *dst,                /* destination buffer */
1553               char *src,                /* source string */
1554               size_t dstlen,            /* space in dst */
1555               unsigned want_width,      /* desired screen width */
1556               unsigned *got_width)      /* returned screen width in dst */
1557 {
1558     int this_width;
1559     unsigned width_consumed = 0;
1560     UCS ucs;
1561     unsigned long remaining_octets;
1562     char *writeptr, *readptr, *savereadptr, *endptr;
1563     int ran_out_of_space = 0;
1564
1565     readptr = src;
1566
1567     remaining_octets = readptr ? strlen(readptr) : 0;
1568
1569     writeptr = dst;
1570     endptr = writeptr + dstlen;
1571
1572     if(readptr && writeptr){
1573       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1574         savereadptr = readptr;
1575         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1576
1577         if(ucs & U8G_ERROR || ucs == UBOGON)
1578           remaining_octets = 0;
1579         else{
1580           this_width = wcellwidth(ucs);
1581
1582           /*
1583            * If this_width is -1 that means we can't print this character
1584            * with our current locale. Writechar will print a '?'.
1585            */
1586           if(this_width < 0)
1587             this_width = 1;
1588
1589           if(width_consumed + (unsigned) this_width <= want_width){
1590             /* append this utf8 character to dst if it will fit */
1591             if(writeptr + (readptr - savereadptr) < endptr){
1592               width_consumed += this_width;
1593               while(savereadptr < readptr)
1594                 *writeptr++ = *savereadptr++;
1595             }
1596             else
1597               ran_out_of_space++;       /* no more utf8 to dst */
1598           }
1599           else
1600             remaining_octets = 0;       /* we're done */
1601         }
1602       }
1603
1604       if(writeptr < endptr)
1605         *writeptr = '\0';
1606     }
1607
1608     if(got_width)
1609       *got_width = width_consumed;
1610
1611     return(writeptr ? (writeptr - dst) : 0);
1612 }
1613
1614
1615 /*
1616  * Str is a UTF-8 string.
1617  * Count forward width screencell positions and return a pointer to the
1618  * end of the string that is width wide.
1619  * The returned pointer points at the next character (where the null would
1620  * be placed).
1621  *
1622  * Got_width is another returned value. It is the width in screen cells of
1623  *   the string from str to the returned pointer. It will be the same as
1624  *   want_width if there are enough characters in the str to do that
1625  *   and if the character widths hit the width exactly. It will be less
1626  *   than want_width if we run out of characters or if the next character
1627  *   width would skip over the width we want, because it is double width.
1628  */
1629 char *
1630 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1631 {
1632     int this_width;
1633     unsigned width_consumed = 0;
1634     UCS ucs;
1635     unsigned long remaining_octets;
1636     char *readptr;
1637     char *retptr;
1638
1639     retptr = readptr = str;
1640
1641     remaining_octets = readptr ? strlen(readptr) : 0;
1642
1643     while(width_consumed <= want_width && remaining_octets > 0){
1644
1645         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1646
1647         if(ucs & U8G_ERROR || ucs == UBOGON){
1648             /*
1649              * This should not happen, but do something to handle it anyway.
1650              * Treat each character as a single width character, which is what should
1651              * probably happen when we actually go to write it out.
1652              */
1653             remaining_octets--;
1654             readptr++;
1655             this_width = 1;
1656         }
1657         else{
1658             this_width = wcellwidth(ucs);
1659
1660             /*
1661              * If this_width is -1 that means we can't print this character
1662              * with our current locale. Writechar will print a '?'.
1663              */
1664             if(this_width < 0)
1665               this_width = 1;
1666         }
1667
1668         if(width_consumed + (unsigned) this_width <= want_width){
1669             width_consumed += (unsigned) this_width;
1670             retptr = readptr;
1671         }
1672         else
1673           remaining_octets = 0; /* we're done */
1674     }
1675
1676     if(got_width)
1677       *got_width = width_consumed;
1678
1679     return(retptr);
1680 }
1681
1682
1683 /*
1684  * Copy a null terminator into a UTF-8 string in place so that the string is
1685  * no more than a certain screen width wide. If the string is already less
1686  * than or equal in width to the requested width, no change is made.
1687  *
1688  * The actual width accomplished is returned. Note that it may be less than
1689  * max_width due to double width characters as well as due to the fact that
1690  * it fits wholly in the max_width.
1691  *
1692  * Returned value is the actual screen width of str when done.
1693  *
1694  * A side effect is that a terminating null may have been written into
1695  * the passed in string.
1696  */
1697 unsigned
1698 utf8_truncate(char *str, unsigned max_width)
1699 {
1700     int this_width;
1701     unsigned width_consumed = 0;
1702     UCS ucs;
1703     unsigned long remaining_octets;
1704     char *readptr, *savereadptr;
1705
1706     readptr = str;
1707
1708     remaining_octets = readptr ? strlen(readptr) : 0;
1709
1710     if(readptr){
1711       while(width_consumed <= max_width && remaining_octets > 0){
1712
1713         savereadptr = readptr;
1714         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1715
1716         if(ucs & U8G_ERROR || ucs == UBOGON){
1717             /*
1718              * This should not happen, but do something to handle it anyway.
1719              * Treat each character as a single width character, which is what should
1720              * probably happen when we actually go to write it out.
1721              */
1722             remaining_octets--;
1723             readptr++;
1724             this_width = 1;
1725         }
1726         else{
1727             this_width = wcellwidth(ucs);
1728
1729             /*
1730              * If this_width is -1 that means we can't print this character
1731              * with our current locale. Writechar will print a '?'.
1732              */
1733             if(this_width < 0)
1734               this_width = 1;
1735         }
1736
1737         if(width_consumed + (unsigned) this_width <= max_width){
1738             width_consumed += (unsigned) this_width;
1739         }
1740         else{
1741             remaining_octets = 0;       /* we're done */
1742             *savereadptr = '\0';
1743         }
1744       }
1745     }
1746
1747     return(width_consumed);
1748 }
1749
1750
/*
 * Copy UTF-8 characters from src into dst so that the result has a
 * screen width of want_width screen cells in the current locale.
 * The characters are copied with utf8_to_width. If that produces
 * fewer than want_width cells the result is padded with spaces.
 *
 * Args:  dst         -- destination buffer
 *        src         -- source string
 *        dstlen      -- space available in dst
 *        want_width  -- desired screen width
 *        left_adjust -- if set the text is put at the left and the
 *                       spaces are added after it, otherwise the
 *                       spaces are added in front of the text
 *
 * No more than dstlen bytes will be written to dst. If there isn't
 * room for all of the padding, as much as fits is added. Dst will be
 * null terminated if there is enough room, but not if that would
 * overflow dstlen. If dst is NULL nothing is written and 0 is returned.
 *
 * Returns the number of bytes written to dst, not including the
 * possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,            /* destination buffer */
                  char *src,            /* source string */
                  size_t dstlen,        /* space in dst */
                  unsigned want_width,  /* desired screen width */
                  int left_adjust)      /* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, len_left, need_more, howmany;

    /* utf8_to_width copes with a NULL dst, the padding below would not */
    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* one space is needed for each screen cell we are short */
    need_more = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    howmany = MIN(need_more, len_left);

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string. Slide the existing
             * string over (the regions overlap, so memmove) and
             * fill the hole with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1818
1819
1820 /*
1821  * Str is a UTF-8 string.
1822  * Start_here is a pointer into the string. It points one position past
1823  * the last byte that should be considered a part of the length string.
1824  * Count back want_width screencell positions and return a pointer to the
1825  * start of the string that is want_width wide and ends with start_here.
1826  *
1827  * Since characters may be more than one cell width wide we may end up
1828  * skipping over the exact width. That is, if we need to we'll go back
1829  * too far (by one cell width). Account for that in the call by looking
1830  * at got_width.
1831  *
1832  * Note that this call gives a possible got_width == want_width+1 as
1833  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1834  * That was just what was needed at the time, maybe it needs to be
1835  * optional.
1836  */
1837 char *
1838 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1839 {
1840     unsigned width_consumed = 0;
1841     int this_width;
1842     UCS ucs;
1843     unsigned long remaining_octets;
1844     char *ptr, *savereadptr, *goodreadptr;
1845
1846     savereadptr = start_here;
1847     goodreadptr = start_here;
1848
1849     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1850
1851         savereadptr = ptr;
1852         remaining_octets = goodreadptr - ptr;
1853         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1854
1855         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1856           if(remaining_octets > 0){
1857               /*
1858                * This means there are some bad octets after this good
1859                * character so things are not going to work out well.
1860                * Bail out.
1861                */
1862               savereadptr = str;        /* we're done */
1863           }
1864           else{
1865             this_width = wcellwidth(ucs);
1866
1867             /*
1868              * If this_width is -1 that means we can't print this character
1869              * with our current locale. Writechar will print a '?'.
1870              */
1871             if(this_width < 0)
1872               this_width = 1;
1873
1874             width_consumed += (unsigned) this_width;
1875             goodreadptr = savereadptr;
1876           }
1877         }
1878     }
1879
1880     if(got_width)
1881       *got_width = width_consumed;
1882
1883     return(savereadptr);
1884 }
1885
1886
/*----------------------------------------------------------------------
  Copy at most n bytes of the string s to the place *d points to and
  leave *d pointing at the end of the text that was copied.

  Args:  d -- address of the destination pointer, advanced by the copy
         s -- source string
         n -- maximum number of bytes to write, counting a terminating
              null if one is written

  The motivation for this is to avoid passing over a string twice
  when appending to it (i.e., strcpy(t, x); t += strlen(t)).

  If the end of s is reached within n bytes the null is copied too and
  *d is left pointing at that null. If n bytes are used up first, no
  null is written and *d is left pointing just past the last byte
  copied. Nothing happens if n is zero or less.

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char *out = *d;
    int   left;

    for(left = n; left > 0; left--){
        *out = *s++;
        if(*out == '\0')
          break;                /* stay on the null we just wrote */

        out++;
    }

    *d = out;
}
1902
1903
1904 /*
1905  * If use_system_routines is set then NULL is the return value and it is
1906  * not an error. Display_charmap and keyboard_charmap should come over as
1907  * malloced strings and will be filled in with the result.
1908  *
1909  * Returns a void pointer to the input_cs CHARSET which is
1910  * passed to mbtow via kbseq().
1911  * If !use_system_routines && NULL is returned, that is an error and err should
1912  * have a message.
1913  * display_charmap and keyboard_charmap should be malloced data and may be
1914  * realloced and changed here.
1915  */
1916 int
1917 setup_for_input_output(int use_system_routines, char **display_charmap,
1918                        char **keyboard_charmap, void **input_cs_arg, char **err)
1919 {
1920     const CHARSET *cs;
1921     const CHARSET *input_cs = NULL;
1922     int already_tried = 0;
1923     int supported = 0;
1924     char buf[1000];
1925
1926 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1927
1928     if(err)
1929       *err = NULL;
1930
1931     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1932         *err = cpstr("Bad call to setup_for_input_output");
1933         return(-1);
1934     }
1935
1936     if(use_system_routines){
1937 #if     PREREQ_FOR_SYS_TRANSLATION
1938         char *dcm;
1939
1940         dcm = nl_langinfo_codeset_wrapper();
1941         dcm = dcm ? dcm : "US-ASCII";
1942
1943         init_utf8_display(0, NULL);
1944         if(*display_charmap){
1945             if(dcm && strucmp(*display_charmap, dcm)){
1946                 snprintf(buf, sizeof(buf),
1947                  _("Display character set \"%s\" is ignored when using system translation"),
1948                      *display_charmap);
1949
1950                 *err = cpstr(buf);
1951             }
1952
1953             fs_give((void **) display_charmap);
1954         }
1955
1956         if(*keyboard_charmap){
1957             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1958                 snprintf(buf, sizeof(buf),
1959                  _("Keyboard character set \"%s\" is ignored when using system translation"),
1960                      *keyboard_charmap);
1961
1962                 *err = cpstr(buf);
1963             }
1964
1965             fs_give((void **) keyboard_charmap);
1966         }
1967
1968         *display_charmap = cpstr(dcm);
1969         *keyboard_charmap = cpstr(dcm);
1970 #else
1971         *err = cpstr("Bad call to setup_for_input_output");
1972 #endif
1973
1974         *input_cs_arg = NULL;
1975         return(0);
1976     }
1977
1978
1979 try_again1:
1980     if(!(*display_charmap))
1981       *display_charmap = cpstr("US-ASCII");
1982
1983     if(!(*keyboard_charmap))
1984       *keyboard_charmap = cpstr(*display_charmap);
1985
1986     if(*keyboard_charmap){
1987         supported = input_charset_is_supported(*keyboard_charmap);
1988
1989         if(supported){
1990             if(!strucmp(*keyboard_charmap, "utf-8"))
1991               input_cs = utf8_charset(*keyboard_charmap);
1992             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1993               input_cs = cs;
1994         }
1995         else{
1996             if(err && !*err){
1997                 int iso2022jp = 0;
1998
1999                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2000                   iso2022jp = 1;
2001
2002                 snprintf(buf, sizeof(buf),
2003                      /* TRANSLATORS: The first argument is the name of the character
2004                         set the user is trying to use (which is unsupported by alpine).
2005                         The second argument is " (except for posting)" if they are
2006                         trying to use ISO-2022-JP for something other than posting. */
2007                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2008                      *keyboard_charmap,
2009                      iso2022jp ? _(" (except for posting)") : "");
2010
2011                 *err = cpstr(buf);
2012             }
2013
2014             input_cs = NULL;
2015             fs_give((void **) keyboard_charmap);
2016             *keyboard_charmap = cpstr("US-ASCII");
2017             if(!already_tried){
2018                 already_tried++;
2019                 goto try_again1;
2020             }
2021         }
2022     }
2023
2024
2025 try_again2:
2026     if(!(*display_charmap))
2027       *display_charmap = cpstr("US-ASCII");
2028
2029     if(*display_charmap){
2030         supported = output_charset_is_supported(*display_charmap);
2031         if(supported){
2032             if(!strucmp(*display_charmap, "utf-8"))
2033               init_utf8_display(1, NULL);
2034             else if((cs = utf8_charset(*display_charmap)) != NULL)
2035               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2036         }
2037         else{
2038             if(err && !*err){
2039                 int iso2022jp = 0;
2040
2041                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2042                   iso2022jp = 1;
2043
2044                 snprintf(buf, sizeof(buf),
2045                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2046                      *display_charmap,
2047                      iso2022jp ? _(" (except for posting)") : "");
2048
2049                 *err = cpstr(buf);
2050             }
2051
2052             fs_give((void **) display_charmap);
2053             if(!already_tried){
2054                 already_tried++;
2055                 goto try_again2;
2056             }
2057         }
2058     }
2059     else{
2060         if(err && !*err)
2061           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2062     }
2063
2064 #undef cpstr
2065
2066     *input_cs_arg = (void *) input_cs;
2067
2068     return(0);
2069 }
2070
2071
2072 int
2073 input_charset_is_supported(char *input_charset)
2074 {
2075     const CHARSET *cs;
2076
2077     if(!(input_charset && *input_charset))
2078       return 0;
2079
2080     if(!strucmp(input_charset, "utf-8"))
2081       return 1;
2082
2083     if((cs = utf8_charset(input_charset)) != NULL){
2084
2085         /*
2086          * This was true 2006-09-25.
2087          */
2088         switch(cs->type){
2089           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2090           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2091           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2092           case CT_UCS4: case CT_UTF16:
2093             return 1;
2094             break;
2095
2096           default:
2097             break;
2098         }
2099     }
2100
2101     return 0;
2102 }
2103
2104
2105 int
2106 output_charset_is_supported(char *output_charset)
2107 {
2108     const CHARSET *cs;
2109
2110     if(!(output_charset && *output_charset))
2111       return 0;
2112
2113     if(!strucmp(output_charset, "utf-8"))
2114       return 1;
2115
2116     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2117       return 1;
2118
2119     return 0;
2120 }
2121
2122
2123 int
2124 posting_charset_is_supported(char *posting_charset)
2125 {
2126     return(posting_charset && *posting_charset
2127            && (!strucmp(posting_charset, "ISO-2022-JP")
2128                || output_charset_is_supported(posting_charset)));
2129 }
2130
2131
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * The result is either NULL (nothing usable could be worked out, the
 * caller decides what to do), the string nl_langinfo() returned, a
 * string constant, or a pointer to a static buffer that is overwritten
 * by the next call. The caller must not free or modify it.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;
    char *p;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        else if(!strucmp(ret, "EUCKR"))         /* Korean EUC is EUC-KR */
          ret = "EUC-KR";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if((p = strstr(ret, "8859")) != NULL){
            /* check for digits after 8859 */
            p += 4;

            /*
             * Step over a single separator (as in "ISO8859-1"), but
             * never over the terminating NUL of a name ending in "8859".
             */
            if(*p && !isdigit((unsigned char) *p))
              p++;

            if(isdigit((unsigned char) *p)){
                static char buf[12];

                /* "ISO-8859-" plus one or two digits, zero filled so terminated */
                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    /* whatever we ended up with still has to be something we can display */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2197
2198
2199 /*
2200  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2201  * needed the return value will point to orig. If a conversion is done,
2202  * the return string should be freed by the caller.
2203  * If not possible, returns NULL.
2204  */
2205 char *
2206 utf8_to_charset(char *orig, char *charset, int report_err)
2207 {
2208     SIZEDTEXT src, dst;
2209     char *ret = orig;
2210
2211     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2212       return ret;
2213
2214     src.size = strlen(orig);
2215     src.data = (unsigned char *) orig;
2216
2217     if(!strucmp(charset, "us-ascii")){
2218         size_t i;
2219
2220         for(i = 0; i < src.size; i++)
2221           if(src.data[i] & 0x80)
2222             return NULL;
2223
2224         return ret;
2225     }
2226
2227     /*
2228      * This works for ISO-2022-JP because of special code in utf8_cstext
2229      * but not for other 2022 charsets.
2230      */
2231     memset(&dst, 0, sizeof(dst));
2232     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2233       ret = (char *) dst.data;          /* c-client already null terminates it */
2234     else
2235       ret = NULL;
2236
2237     if((unsigned char *) ret != dst.data && dst.data)
2238       fs_give((void **) &dst.data);
2239
2240     return ret;
2241 }
2242
2243
/*
 *      Turn a number into a string with comma's
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas
 * Can use up to 3 comatose results at once.
 *
 * Digits are grouped in threes from the right whatever the size of the
 * number, and a negative number gets a single leading minus sign
 * (e.g. -1234 gives "-1,234").
 */
char *
comatose(long int number)
{
    static char buf[3][50];
    static int whichbuf = 0;
    char          digits[32];           /* plenty for a 64 bit value */
    unsigned long magnitude;
    int           ndigits, i;
    char         *b;

    whichbuf = (whichbuf + 1) % 3;
    b = buf[whichbuf];

    /* negate in unsigned arithmetic so the most negative long is safe */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
                             : (unsigned long) number;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0 || ndigits >= (int) sizeof(digits))
      ndigits = 0;                      /* can't happen, but stay in bounds */

    if(number < 0)
      *b++ = '-';

    /*
     * Worst case is 20 digits, 6 commas, a sign and the terminator,
     * well inside the 50 byte result buffer.
     */
    for(i = 0; i < ndigits; i++){
        /* comma before every digit that starts a complete group of three */
        if(i > 0 && (ndigits - i) % 3 == 0)
          *b++ = ',';

        *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
2288
2289
/*
 * Like comatose() but leave out the commas: format number as plain
 * decimal text.
 *
 * Result: pointer to one of three static buffers used in rotation, so
 * up to 3 results can be in use at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2303
2304
2305 /*
2306  * line_paint - where the real work of managing what is displayed gets done.
2307  */
2308 void
2309 line_paint(int offset,                  /* current dot offset into vl */
2310            struct display_line *displ,
2311            int *passwd)                 /* flag to hide display of chars */
2312 {
2313     int i, w, w2, already_got_one = 0;
2314     int vfirst, vlast, dfirst, dlast, vi, di;
2315     int new_vbase;
2316     unsigned (*width_a_to_b)(UCS *, int, int);
2317
2318     /*
2319      * Set passwd to 10 in caller if you want to conceal the
2320      * password but not print asterisks for feedback.
2321      *
2322      * Set passwd to 1 in caller to conceal by printing asterisks.
2323      */
2324     if(passwd && *passwd >= 10){        /* don't show asterisks */
2325         if(*passwd > 10)
2326           return;
2327         else
2328           *passwd = 11;         /* only blat once */
2329
2330         i = 0;
2331         (*displ->movecursor)(displ->row, displ->col);
2332         while(i++ <= displ->dwid)
2333           (*displ->writechar)(' ');
2334
2335         (*displ->movecursor)(displ->row, displ->col);
2336         return;
2337     }
2338
2339     if(passwd && *passwd)
2340       width_a_to_b = single_width_chars_a_to_b;
2341     else
2342       width_a_to_b = ucs4_str_width_a_to_b;
2343
2344     /*
2345      * vl is the virtual line (the actual data). We operate on it by typing
2346      * characters to be added and deleting and so forth. In this routine we
2347      * copy a subset of those UCS-4 characters in vl into dl, the display
2348      * array, and show that subset on the screen.
2349      *
2350      * Offset is the location of the cursor in vl.
2351      *
2352      * We will display the string starting from vbase.
2353      * We have dwid screen cells to work in.
2354      * We may have to adjust vbase in order to display the
2355      * part of the string that contains the cursor.
2356      *
2357      * We'll make the display look like
2358      *   vl    a b c d e f g h i j k l m
2359      *             xxxxxxxxxxxxx  <- width dwid window
2360      *             < d e f g h >
2361      *               |
2362      *             vbase
2363      * The < will be there if vbase > 0.
2364      * The > will be there if the string from vbase to the
2365      * end can't all fit in the window.
2366      */
2367
2368     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2369
2370     /*
2371      * Adjust vbase so offset is not out of the window to the right.
2372      * (The +2 in w + 2 is for a possible " >" if the string goes past
2373      *  the right hand edge of the window and if the last visible character
2374      * is double wide. We don't want the offset to be under that > character.)
2375      */
2376     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2377         displ->dwid > 1 &&
2378         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2379         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2380         /*
2381          * offset is off the window to the right
2382          * It looks like   a b c d e f g h
2383          *                   |         |
2384          *               vbase         offset
2385          * and offset is either past the right edge,
2386          * or right at the right edge (and maybe under >),
2387          * or one before right at the edge (and maybe on space
2388          * for half a character).
2389          *
2390          * Since the characters may be double width it is slightly
2391          * complicated to figure out how far to increase vbase.
2392          * We're going to scoot over past width w/2 characters and
2393          * then see if that's sufficient.
2394          */
2395         new_vbase = displ->vbase + 1;
2396         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2397             w2 < displ->dwid/2;
2398             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2399           new_vbase++;
2400
2401         displ->vbase = new_vbase;
2402     }
2403
2404     /* adjust so offset is not out of the window to the left */
2405     while(displ->vbase > 0 && displ->vbase >= offset){
2406         /* add about dwid/2 more width */
2407         new_vbase = displ->vbase - 1;
2408         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2409             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2410             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2411           new_vbase--;
2412
2413         /* but don't let it get too small, recheck off right end */
2414         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2415             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2416             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2417           new_vbase++;
2418
2419         displ->vbase = MAX(new_vbase, 0);
2420     }
2421
2422     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2423       displ->vbase = 0;
2424
2425     vfirst = displ->vbase;
2426     dfirst = 0;
2427     if(displ->vbase > 0){                       /* off screen cue left */
2428         dfirst = 1;                             /* index which matches vfirst */
2429         displ->dl[0] = '<';
2430     }
2431
2432     vlast = displ->vused-1;                     /* end */
2433     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2434
2435     if(displ->dwid > 0 && w + dfirst > displ->dwid){                    /* off window right */
2436
2437         /* find last ucs character to be printed */
2438         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2439           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2440
2441         /* worry about double-width characters */
2442         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2443             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2444             displ->dl[dlast] = '>';
2445         }
2446         else{
2447             dlast = dfirst + vlast - vfirst + 1;
2448             displ->dl[dlast++] = ' ';
2449             displ->dl[dlast] = '>';
2450         }
2451     }
2452     else
2453       dlast = dfirst + vlast - vfirst;
2454
2455     /*
2456      * Copy the relevant part of the virtual line into the display line.
2457      */
2458     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2459       if(passwd && *passwd)
2460         displ->dl[di] = '*';            /* to conceal password */
2461       else
2462         displ->dl[di] = displ->vl[vi];
2463
2464     /*
2465      * Add spaces to clear the rest of the line.
2466      * We have dwid total space to fill.
2467      */
2468     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2469     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2470       displ->dl[di++] = ' ';
2471
2472     /*
2473      * Draw from left to right, skipping until we get to
2474      * something that is different. Characters may be different
2475      * widths than they were initially so paint from there the
2476      * rest of the way.
2477      */
2478     for(di = 0; displ->dl[di]; di++){
2479         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2480             /* move cursor first time */
2481             if(!already_got_one++){
2482                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2483                 (*displ->movecursor)(displ->row, displ->col + w);
2484             }
2485
2486             (*displ->writechar)(displ->dl[di]);
2487             displ->olddl[di] = displ->dl[di];
2488         }
2489     }
2490
2491     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2492
2493     /*
2494      * Move the cursor to the offset.
2495      *
2496      * The offset is relative to the start of the virtual array. We need
2497      * to find the location on the screen. The offset into the display array
2498      * will be offset-vbase+dfirst. We want to be at the start of that
2499      * character, so we need to find the width of all the characters up
2500      * to that point.
2501      */
2502     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2503
2504     (*displ->movecursor)(displ->row, displ->col + w);
2505 }
2506
2507
2508 /*
2509  * This is just like ucs4_str_width_a_to_b() except all of the characters
2510  * are assumed to be of width 1. This is for printing out *'s when user
2511  * enters a password, while still managing to use the same code to do the
2512  * display.
2513  */
2514 unsigned
2515 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2516 {
2517     unsigned width = 0;
2518     int i;
2519
2520     if(ucsstr)
2521       for(i = a; i <= b && ucsstr[i]; i++)
2522         width++;
2523
2524     return width;
2525 }