pith/charconv/utf8.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2013-2021 Eduardo Chappa
   8  * Copyright 2006-2008 University of Washington
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19
  20 /* includable WITHOUT dependency on c-client */
  21 #include "../../c-client/mail.h"
  22 #include "../../c-client/utf8.h"
  23
  24 #ifdef _WINDOWS
  25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
  26 #undef ERROR
  27 #else
  28 #define _XOPEN_SOURCE
  29 #endif
  30
  31 #include <system.h>
  32
  33 #include "../../c-client/fs.h"
  34
  35 /* includable WITHOUT dependency on pico */
  36 #include "../../pico/keydefs.h"
  37
  38 #include "../osdep/collate.h"
  39 #include "../filttype.h"
  40
  41 #include "utf8.h"
  42
  43 #include <stdarg.h>
  44
  45
/* Forward declaration. */
unsigned single_width_chars_a_to_b(UCS *, int, int);


/*
 * Character set name stored by set_locale_charmap(). convert_to_utf8()
 * tries it as a source character set when earlier guesses fail.
 */
static char locale_charmap[50];

/*
 * Display settings stored by init_utf8_display().
 * native_utf8  -- non-zero when the display is UTF-8 capable
 * display_data -- when non-NULL (and native_utf8 is zero), wtomb() casts
 *                 this to (unsigned short *) and passes it to
 *                 ucs4_rmaplen()/ucs4_rmapbuf() as the conversion map
 */
static int   native_utf8;
static void *display_data;
  53
  54 void
  55 init_utf8_display(int utf8, void *rmap)
  56 {
  57     native_utf8 = utf8;
  58     display_data = rmap;
  59 }
  60
  61
  62 /*
  63  * Argument is a UCS-4 wide character.
  64  * Returns the environment dependent cell width of the
  65  * character when printed to the screen.
  66  * This will be -1 if the character is not printable.
  67  * It will be >= zero if it is printable.
  68  *
  69  * Note that in the case it is not printable but it is still sent to
  70  * Writechar, Writechar will print a '?' with width 1.
  71  */
  72 int
  73 wcellwidth(UCS ucs)
  74 {
  75     char dummy[32];
  76     long w;
  77
  78     /*
  79      * We believe that on modern unix systems wchar_t is a UCS-4 character.
  80      * That's the assumption here.
  81      */
  82
  83     if(native_utf8){                    /* display is UTF-8 capable */
  84         w = ucs4_width((unsigned long) ucs);
  85         return((w & U4W_ERROR) ? -1 : w);
  86     }
  87     else if(display_data){
  88         if(wtomb(dummy, ucs) < 0)
  89           return(-1);
  90         else{
  91             w = ucs4_width((unsigned long) ucs);
  92             return((w & U4W_ERROR) ? -1 : w);
  93         }
  94     }
  95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
  96     else
  97       return(wcwidth((wchar_t) ucs));
  98 #else
  99     return(0);
 100 #endif
 101 }
 102
 103 /* ambiguous width zone character function. We use the Windows code until
 104  * we find a better way to do it in general.
 105  */
 106 int
 107 pith_ucs4width(UCS ucs)
 108 {
 109   return (ucs >= 0x2100) ? 2 : 1;
 110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
 111   return wcwidth((wchar_t) ucs);
 112 #else
 113   return (ucs >= 0x2100) ? 2 : 1;
 114 #endif /* _WINDOWS */
 115 }
 116
 117 /*
 118  * Argument is a UCS-4 wide character.
 119  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
 120  * Dest is a buffer at least xx chars wide where the multi-byte version
 121  * of the wide character will be written.
 122  * The returned value is the number of bytes written to dest or -1
 123  * if the conversion can't be done.
 124  */
 125 int
 126 wtomb(char *dest, UCS ucs)
 127 {
 128     /*
 129      * We believe that on modern unix systems wchar_t is a UCS-4 character.
 130      * That's the assumption here.
 131      */
 132
 133     if(native_utf8){
 134         unsigned char *newdptr;
 135
 136         newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
 137         return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
 138     }
 139     else if(display_data){
 140         unsigned long ucs4;
 141         int           ret;
 142
 143         ucs4 = (unsigned long) ucs;
 144         ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
 145         if(ret >= 0)
 146           ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
 147         else
 148           ret = -1;
 149
 150         return(ret);
 151     }
 152     else
 153       return(wcrtomb(dest, (wchar_t) ucs, NULL));
 154 }
 155
 156
 157 /*
 158  * This function does not necessarily update inputp and remaining_octets, so
 159  * don't rely on that. The c-client version does but the other doesn't.
 160  */
 161 UCS
 162 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
 163 {
 164     UCS ucs;
 165
 166     if(input_cs){
 167         CHARSET *cast_input_cs;
 168
 169         cast_input_cs = (CHARSET *) input_cs;
 170
 171         switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
 172           case U8G_ENDSTRG:
 173           case U8G_ENDSTRI:
 174             return(CCONV_NEEDMORE);
 175
 176           default:
 177             if(ucs & U8G_ERROR || ucs == UBOGON)
 178               return(CCONV_BADCHAR);
 179
 180             return(ucs);
 181         }
 182     }
 183     else{
 184         size_t ret;
 185         wchar_t w;
 186
 187         /*
 188          * Warning:  input_cs and remaining_octets are unused in this
 189          * half of the if/else.
 190          *
 191          * Unfortunately, we can't tell the difference between a source string
 192          * that is just not long enough and one that has characters that can't
 193          * be converted even though it is long enough. We return NEEDMORE in both cases.
 194          */
 195         ret = mbstowcs(&w, (char *) (*inputp), 1);
 196         if(ret == (size_t)(-1))
 197           return(CCONV_NEEDMORE);
 198         else{
 199           ucs = (UCS) w;
 200           return(ucs);
 201         }
 202     }
 203 }
 204
 205
 206 void
 207 set_locale_charmap(char *charmap)
 208 {
 209     if(charmap){
 210         strncpy(locale_charmap, charmap, sizeof(locale_charmap));
 211         locale_charmap[sizeof(locale_charmap)-1] = '\0';
 212     }
 213     else
 214       locale_charmap[0] = '\0';
 215 }
 216
 217
 218 /*
 219  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
 220  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
 221  * The caller is responsible for freeing the returned value.
 222  *
 223  * Args  str     -- the string to convert
 224  */
 225 char *
 226 convert_to_utf8(char *str, char *fromcharset, int flags)
 227 {
 228     char          *ret = NULL;
 229     char          *fcharset;
 230     SIZEDTEXT      src, result;
 231     const CHARSET *cs;
 232     int            try;
 233
 234     src.data = (unsigned char *) str;
 235     src.size = strlen(str);
 236
 237     /* already UTF-8, return NULL */
 238     if(!(flags & CU8_NOINFER)
 239        && (cs = utf8_infercharset(&src))
 240        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
 241       return(ret);
 242
 243     try = 1;
 244     while(try < 5){
 245         switch(try){
 246           case 1:
 247             fcharset = fromcharset;
 248             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 249               break;    /* give it a try */
 250             else
 251               try++;    /* fall through */
 252
 253           case 2:
 254             if(!(flags & CU8_NOINFER)){
 255                 fcharset = cs ? cs->name : NULL;
 256                 if(fcharset && strucmp("UTF-8", fcharset) != 0)
 257                   break;
 258                 else
 259                   try++;        /* fall through */
 260             }
 261             else
 262               try++;    /* fall through */
 263
 264           case 3:
 265             fcharset = locale_charmap;
 266             if(fcharset && strucmp("UTF-8", fcharset) != 0)
 267               break;
 268             else
 269               try++;    /* fall through */
 270
 271           default:
 272             fcharset = "ISO-8859-1";            /* this will "work" */
 273             break;
 274         }
 275
 276         memset(&result, 0, sizeof(result));
 277
 278         if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
 279             if(!(result.size == src.size && result.data == src.data)){
 280                 ret = (char *) fs_get((result.size+1) * sizeof(char));
 281                 strncpy(ret, (char *) result.data, result.size);
 282                 ret[result.size] = '\0';
 283             }
 284             /* else no conversion necessary */
 285
 286             if(result.data && result.data != src.data)
 287               fs_give((void **) &result.data);
 288             result.size = 0;
 289
 290             return(ret);
 291         }
 292
 293         try++;
 294     }
 295
 296     /* won't make it to here */
 297     return(ret);
 298 }
 299
 300
 301 /*
 302  * Convert from UTF-8 to user's locale charset.
 303  * This actually uses the wtomb routine to do the conversion, and that
 304  * relies on setup_for_input_output having been called.
 305  * If no conversion is necessary, NULL is returned, otherwise an allocated
 306  * string in the locale charset is returned and the caller is responsible
 307  * for freeing it.
 308  */
 309 char *
 310 convert_to_locale(char *utf8str)
 311 {
 312 #define CHNK 500
 313     char *inp, *ret = NULL;
 314     CBUF_S cb;
 315     int alloced;
 316     size_t i = 0;
 317
 318     if(native_utf8 || !utf8str || !utf8str[0])
 319       return(NULL);
 320
 321     cb.cbuf[0] = '\0';
 322     cb.cbufp = cb.cbufend = cb.cbuf;
 323     inp = utf8str;
 324
 325     alloced = CHNK;
 326     ret = (char *) fs_get(alloced * sizeof(char));
 327
 328     /*
 329      * There's gotta be a better way to do this but utf8_to_locale was
 330      * available and everything looks like a nail when all you have
 331      * is a hammer.
 332      */
 333     while(*inp){
 334         /*
 335          * We're placing the outgoing stream of characters in ret, a multi-byte
 336          * array of characters in the user's locale charset. See if there is
 337          * enough room for the next wide characters worth of output chars
 338          * and allocate more space if not.
 339          */
 340         if((alloced - i) < MAX(MB_LEN_MAX,32)){
 341             alloced += CHNK;
 342             fs_resize((void **) &ret, alloced * sizeof(char));
 343         }
 344
 345         i += utf8_to_locale((int) *inp++, &cb,
 346                            (unsigned char *) &ret[i], alloced - i);
 347     }
 348
 349     fs_resize((void **) &ret, i + 1);
 350
 351     ret[i] = '\0';
 352
 353     return(ret);
 354 }
 355
 356
 357 /*
 358  * Pass in a stream of UTF-8 characters in 'c' and return obuf
 359  * filled in with multi-byte characters. The return value is the
 360  * number of valid characters in obuf to be used.
 361  */
 362 int
 363 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
 364 {
 365     int outchars = 0;
 366
 367     if(!(cb && cb->cbufp))
 368       return(0);
 369
 370     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 371         unsigned char *inputp;
 372         unsigned long remaining_octets;
 373         UCS ucs;
 374
 375         *(cb->cbufp)++ = (unsigned char) c;
 376         inputp = cb->cbuf;
 377         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 378         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 379
 380         switch(ucs){
 381           case U8G_ENDSTRG:     /* incomplete character, wait */
 382           case U8G_ENDSTRI:     /* incomplete character, wait */
 383             break;
 384
 385           default:
 386             if(ucs & U8G_ERROR || ucs == UBOGON){
 387                 /*
 388                  * None of these cases is supposed to happen. If it
 389                  * does happen then the input stream isn't UTF-8
 390                  * so something is wrong. Treat each character in the
 391                  * input buffer as a separate error character and
 392                  * print a '?' for each.
 393                  */
 394                 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
 395                   obuf[outchars++] = '?';
 396
 397                 cb->cbufp = cb->cbuf;
 398             }
 399             else{
 400                 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
 401                     /*
 402                      * This happens when we have a UTF-8 character that
 403                      * we aren't able to print in our locale. For example,
 404                      * if the locale is setup with the terminal
 405                      * expecting ISO-8859-1 characters then there are
 406                      * lots of UTF-8 characters that can't be printed.
 407                      * Print a '?' instead.
 408                      */
 409                     obuf[outchars++] = '?';
 410                 }
 411                 else{
 412                     /*
 413                      * Convert the ucs into the multibyte
 414                      * character that corresponds to the
 415                      * ucs in the users locale.
 416                      */
 417                     outchars = wtomb((char *) obuf, ucs);
 418                     if(outchars < 0){
 419                         obuf[0] = '?';
 420                         outchars = 1;
 421                     }
 422                 }
 423
 424                 /* update the input buffer */
 425                 if(inputp >= cb->cbufp) /* this should be the case */
 426                   cb->cbufp = cb->cbuf;
 427                 else{           /* extra chars for some reason? */
 428                     unsigned char *q, *newcbufp;
 429
 430                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 431                     q = cb->cbuf;
 432                     while(inputp < cb->cbufp)
 433                       *q++ = *inputp++;
 434
 435                     cb->cbufp = newcbufp;
 436                 }
 437             }
 438
 439             break;
 440         }
 441     }
 442     else{                       /* error */
 443         obuf[0] = '?';
 444         outchars = 1;
 445         cb->cbufp = cb->cbuf;   /* start over */
 446     }
 447
 448     return(outchars);
 449 }
 450
 451
 452 /*
 453  * Returns the screen cells width of the UCS-4 string argument.
 454  * The source string is zero terminated.
 455  */
 456 unsigned
 457 ucs4_str_width(UCS *ucsstr)
 458 {
 459     unsigned width = 0;
 460     int w;
 461
 462     if(ucsstr)
 463       while(*ucsstr){
 464         w = wcellwidth(*ucsstr++);
 465         if(w != U4W_CTLSRGT)
 466           width += (w < 0 ? 1 : w);
 467       }
 468
 469     return width;
 470 }
 471
 472
 473 /*
 474  * Returns the screen cells width of the UCS-4 string argument
 475  * from ucsstr[a] through (inclusive) ucsstr[b].
 476  * No checking is done to make sure a starts in the middle
 477  * of a UCS-4 array.
 478  */
 479 unsigned
 480 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
 481 {
 482     unsigned width = 0;
 483     int i, w;
 484
 485     if(ucsstr)
 486       for(i = a; i <= b && ucsstr[i]; i++){
 487         w = wcellwidth(ucsstr[i]);
 488         if(w != U4W_CTLSRGT)
 489           width += (w < 0 ? 1 : w);
 490       }
 491
 492     return width;
 493 }
 494
 495
 496 /*
 497  * Returns the screen cells width of the UCS-4 string argument
 498  * from ustart through (exclusive) uend.
 499  * No checking is done to make sure it starts in the middle
 500  * of a UCS-4 array.
 501  */
 502 unsigned
 503 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
 504 {
 505     UCS *u;
 506     unsigned width = 0;
 507     int w;
 508
 509     if(!ustart)
 510       return width;
 511
 512     if(ustart)
 513       for(u = ustart; u < uend; u++){
 514         w = wcellwidth(*u);
 515         if(w != U4W_CTLSRGT)
 516           width += (w < 0 ? 1 : w);
 517       }
 518
 519     return(width);
 520 }
 521
 522
 523 /*
 524  * Return the largest possible pointer into ucs4str so that the width
 525  * of the string from ucs4str to the pointer (exclusive)
 526  * is maxwidth or less. Also stops at a null character.
 527  */
 528 UCS *
 529 ucs4_particular_width(UCS *ucs4str, int maxwidth)
 530 {
 531     UCS *u;
 532     int w_consumed = 0, w, done = 0;
 533
 534     u = ucs4str;
 535
 536     if(u)
 537       while(!done && *u && w_consumed <= maxwidth){
 538         w = wcellwidth(*u);
 539         w = (w >= 0 ? w : 1);
 540         if(w_consumed + w <= maxwidth){
 541             w_consumed += w;
 542             ++u;
 543         }
 544         else
 545           ++done;
 546       }
 547
 548     return(u);
 549 }
 550
 551
 552 /*
 553  * Convert and copy a UTF-8 string into a UCS-4 NULL
 554  * terminated array. Just like cpystr only it converts
 555  * from UTF-8 to UCS-4.
 556  *
 557  * Returned UCS-4 string needs to be freed by caller.
 558  */
 559 UCS *
 560 utf8_to_ucs4_cpystr(char *utf8src)
 561 {
 562     size_t         retsize;
 563     UCS           *ret = NULL;
 564     UCS            ucs;
 565     unsigned long  remaining_octets;
 566     unsigned char *readptr;
 567     size_t         arrayindex;
 568
 569     /*
 570      * We don't know how big to allocate the return array
 571      * because variable numbers of octets in the src array
 572      * will combine to make UCS-4 characters. The number of
 573      * UCS-4 characters is less than or equal to the number
 574      * of src characters, though.
 575      */
 576
 577     if(!utf8src)
 578       return NULL;
 579
 580     retsize = strlen(utf8src) + 1;
 581
 582     ret = (UCS *) fs_get(retsize * sizeof(*ret));
 583     memset(ret, 0, retsize * sizeof(*ret));
 584
 585     readptr = (unsigned char *) utf8src;
 586     remaining_octets = retsize-1;
 587     arrayindex = 0;
 588
 589     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
 590         ucs = (UCS) utf8_get(&readptr, &remaining_octets);
 591
 592         if(ucs & U8G_ERROR || ucs == UBOGON)
 593           remaining_octets = 0;
 594         else
 595           ret[arrayindex++] = ucs;
 596     }
 597
 598     ret[arrayindex] = '\0';
 599
 600     /* get rid of excess size */
 601     if(arrayindex+1 < retsize)
 602       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
 603
 604     return ret;
 605 }
 606
 607
 608 /*
 609  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
 610  * terminated string. Just like cpystr only it converts
 611  * from UCS-4 to UTF-8.
 612  *
 613  * Returned UTF-8 string needs to be freed by caller.
 614  */
 615 char *
 616 ucs4_to_utf8_cpystr(UCS *ucs4src)
 617 {
 618     unsigned char *ret = NULL;
 619     unsigned char *writeptr;
 620     int            i;
 621
 622     if(!ucs4src)
 623       return NULL;
 624
 625     /*
 626      * Over-allocate and then resize at the end.
 627      */
 628
 629     /* count characters in source */
 630     for(i = 0; ucs4src[i]; i++)
 631       ;
 632
 633     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
 634     memset(ret, 0, (6*i + 1) * sizeof(*ret));
 635
 636     writeptr = ret;
 637     for(i = 0; ucs4src[i]; i++)
 638       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 639
 640     /* get rid of excess size */
 641     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 642
 643     return ((char *) ret);
 644 }
 645
 646
 647 /*
 648  * Similar to above but copy a fixed number of source
 649  * characters instead of going until null terminator.
 650  */
 651 char *
 652 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
 653 {
 654     unsigned char *ret = NULL;
 655     unsigned char *writeptr;
 656     int            i;
 657
 658     if(!ucs4src)
 659       return NULL;
 660
 661     /*
 662      * Over-allocate and then resize at the end.
 663      */
 664
 665     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
 666     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
 667
 668     writeptr = ret;
 669     for(i = 0; i < ucs4src_len; i++)
 670       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
 671
 672     /* get rid of excess size */
 673     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
 674
 675     return ((char *) ret);
 676 }
 677
 678
 679 #ifdef _WINDOWS
 680 /*
 681  * Convert a UTF-8 argument into an LPTSTR version
 682  * of that argument. The result is allocated here
 683  * and should be freed by the caller.
 684  */
 685 LPTSTR
 686 utf8_to_lptstr(LPSTR arg_utf8)
 687 {
 688      int lptstr_len;
 689      LPTSTR lptstr_ret = NULL;
 690
 691      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
 692      if(lptstr_len > 0)
 693      {
 694          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
 695          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
 696              arg_utf8, -1, lptstr_ret, lptstr_len );
 697      }
 698
 699      if(!lptstr_len)
 700      {
 701          /* check GetLastError()? */
 702          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
 703          lptstr_ret[0] = 0;
 704      }
 705
 706      return lptstr_ret;
 707 }
 708
 709
 710 /*
 711  * Convert an LPTSTR argument into a UTF-8 version
 712  * of that argument. The result is allocated here
 713  * and should be freed by the caller.
 714  */
 715 LPSTR
 716 lptstr_to_utf8(LPTSTR arg_lptstr)
 717 {
 718      int utf8str_len;
 719      LPSTR utf8str_ret = NULL;
 720
 721      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
 722      if(utf8str_len > 0)
 723      {
 724          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
 725          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
 726              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
 727      }
 728
 729      if(!utf8str_len)
 730      {
 731          /* check GetLastError()? */
 732          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
 733          utf8str_ret[0] = 0;
 734      }
 735
 736      return utf8str_ret;
 737 }
 738
 739
 740 /*
 741  * Convert a UCS4 argument into an LPTSTR version
 742  * of that argument. The result is allocated here
 743  * and should be freed by the caller.
 744  */
 745 LPTSTR
 746 ucs4_to_lptstr(UCS *arg_ucs4)
 747 {
 748     LPTSTR ret_lptstr = NULL;
 749     size_t len;
 750     size_t i;
 751
 752     if(arg_ucs4){
 753         len = ucs4_strlen(arg_ucs4);
 754         ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
 755         /* bogus conversion ignores UTF-16 */
 756         for(i = 0; i < len; i++)
 757           ret_lptstr[i] = arg_ucs4[i];
 758
 759         ret_lptstr[len] = '\0';
 760     }
 761
 762     return(ret_lptstr);
 763 }
 764
 765
 766 /*
 767  * Convert an LPTSTR argument into a UCS4 version
 768  * of that argument. The result is MemAlloc'd here
 769  * and should be freed by the caller.
 770  */
 771 UCS *
 772 lptstr_to_ucs4(LPTSTR arg_lptstr)
 773 {
 774     UCS *ret_ucs4 = NULL;
 775     size_t len;
 776     size_t i;
 777
 778     if(arg_lptstr){
 779         len = _tcslen(arg_lptstr);
 780         ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
 781         /* bogus conversion ignores UTF-16 */
 782         for(i = 0; i < len; i++)
 783           ret_ucs4[i] = arg_lptstr[i];
 784
 785         ret_ucs4[len] = '\0';
 786     }
 787
 788     return(ret_ucs4);
 789 }
 790
 791 #endif /* _WINDOWS */
 792
 793
 794 /*
 795  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 796  * 1-at-a-time filled in with UCS characters. The return value is the
 797  * number of valid characters in obuf to be used. It can only
 798  * be 1 or 0 characters since we're only getting one UTF-8 character
 799  * at a time.
 800  */
 801 int
 802 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
 803 {
 804     int  width = 0, outchars = 0;
 805
 806     if(!(cb && cb->cbufp))
 807       return(0);
 808
 809     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
 810         unsigned char *inputp;
 811         unsigned long remaining_octets;
 812         UCS ucs;
 813
 814         *cb->cbufp++ = (unsigned char) c;
 815         inputp = cb->cbuf;
 816         remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
 817         ucs = (UCS) utf8_get(&inputp, &remaining_octets);
 818
 819         switch(ucs){
 820           case U8G_ENDSTRG:     /* incomplete character, wait */
 821           case U8G_ENDSTRI:     /* incomplete character, wait */
 822             break;
 823
 824           default:
 825             if(ucs & U8G_ERROR || ucs == UBOGON){
 826                 /*
 827                  * None of these cases is supposed to happen. If it
 828                  * does happen then the input stream isn't UTF-8
 829                  * so something is wrong.
 830                  */
 831                 outchars++;
 832                 *obuf = '?';
 833                 cb->cbufp = cb->cbuf;
 834                 width = 1;
 835             }
 836             else{
 837                 outchars++;
 838                 if(ucs < 0x80 && ucs >= 0x20)
 839                   width = 1;
 840
 841                 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
 842                     /*
 843                      * This happens when we have a UTF-8 character that
 844                      * we aren't able to print in our locale. For example,
 845                      * if the locale is setup with the terminal
 846                      * expecting ISO-8859-1 characters then there are
 847                      * lots of UTF-8 characters that can't be printed.
 848                      * Print a '?' instead.
 849                      * Don't think this should happen in Windows.
 850                      */
 851                     *obuf = '?';
 852                 }
 853                 else{
 854                     *obuf = ucs;
 855                 }
 856
 857                 /* update the input buffer */
 858                 if(inputp >= cb->cbufp) /* this should be the case */
 859                   cb->cbufp = cb->cbuf;
 860                 else{           /* extra chars for some reason? */
 861                     unsigned char *q, *newcbufp;
 862
 863                     newcbufp = (cb->cbufp - inputp) + cb->cbuf;
 864                     q = cb->cbuf;
 865                     while(inputp < cb->cbufp)
 866                       *q++ = *inputp++;
 867
 868                     cb->cbufp = newcbufp;
 869                 }
 870             }
 871
 872             break;
 873         }
 874     }
 875     else{                       /* error */
 876         *obuf = '?';
 877         outchars = 1;
 878         width = 1;
 879         cb->cbufp = cb->cbuf;   /* start over */
 880     }
 881
 882     if(obufwidth)
 883       *obufwidth = width;
 884
 885     return(outchars);
 886 }
 887
 888
 889 /*
 890  * Return an allocated copy of a zero-terminated UCS-4 string.
 891  */
 892 UCS *
 893 ucs4_cpystr(UCS *ucs4src)
 894 {
 895     size_t         arraysize;
 896     UCS           *ret = NULL;
 897     size_t         i;
 898
 899     if(!ucs4src)
 900       return NULL;
 901
 902     arraysize = ucs4_strlen(ucs4src);
 903
 904     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
 905     memset(ret, 0, (arraysize+1) * sizeof(*ret));
 906
 907     for(i = 0; i < arraysize; i++)
 908       ret[i] = ucs4src[i];
 909
 910     return ret;
 911 }
 912
 913
 914 UCS *
 915 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
 916 {
 917     size_t i;
 918
 919     if(ucs4src && ucs4dst){
 920         for(i = 0; i < n; i++){
 921             ucs4dst[i] = ucs4src[i];
 922             if(ucs4dst[i] == '\0')
 923               break;
 924         }
 925     }
 926
 927     return ucs4dst;
 928 }
 929
 930
 931 UCS *
 932 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
 933 {
 934     size_t i;
 935     UCS *u;
 936
 937     if(ucs4src && ucs4dst){
 938         for(u = ucs4dst; *u; u++)
 939           ;
 940
 941         for(i = 0; i < n; i++){
 942             u[i] = ucs4src[i];
 943             if(u[i] == '\0')
 944               break;
 945         }
 946
 947         if(i == n)
 948           u[i] = '\0';
 949     }
 950
 951     return ucs4dst;
 952 }
 953
 954
 955 /*
 956  * Like strlen only this returns the number of non-zero characters
 957  * in a zero-terminated UCS-4 array.
 958  */
 959 size_t
 960 ucs4_strlen(UCS *ucs4str)
 961 {
 962     size_t i = 0;
 963
 964     if(ucs4str)
 965       while(ucs4str[i])
 966         i++;
 967
 968     return(i);
 969 }
 970
 971
 972 int
 973 ucs4_strcmp(UCS *s1, UCS *s2)
 974 {
 975     for(; *s1 == *s2; s1++, s2++)
 976       if(*s1 == '\0')
 977         return 0;
 978
 979     return((*s1 < *s2) ? -1 : 1);
 980 }
 981
 982
 983 UCS *
 984 ucs4_strchr(UCS *s, UCS c)
 985 {
 986     if(!s)
 987       return NULL;
 988
 989     while(*s && *s != c)
 990       s++;
 991
 992     if(*s || !c)
 993       return s;
 994     else
 995       return NULL;
 996 }
 997
 998
 999 UCS *
1000 ucs4_strrchr(UCS *s, UCS c)
1001 {
1002     UCS *ret = NULL;
1003
1004     if(!s)
1005       return ret;
1006
1007     while(*s){
1008         if(*s == c)
1009           ret = s;
1010
1011         s++;
1012     }
1013
1014     return ret;
1015 }
1016
1017
1018 /*
1019  * Returns the screen cells width of the UTF-8 string argument.
1020  */
1021 unsigned
1022 utf8_width(char *str)
1023 {
1024     unsigned width = 0;
1025     int this_width;
1026     UCS ucs;
1027     unsigned long remaining_octets;
1028     char *readptr;
1029
1030     if(!(str && *str))
1031       return(width);
1032
1033     readptr = str;
1034     remaining_octets = readptr ? strlen(readptr) : 0;
1035
1036     while(remaining_octets > 0 && *readptr){
1037
1038         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1039
1040         if(ucs & U8G_ERROR || ucs == UBOGON){
1041             /*
1042              * This should not happen, but do something to handle it anyway.
1043              * Treat each character as a single width character, which is what should
1044              * probably happen when we actually go to write it out.
1045              */
1046             remaining_octets--;
1047             readptr++;
1048             this_width = 1;
1049         }
1050         else{
1051             this_width = wcellwidth(ucs);
1052
1053             /*
1054              * If this_width is -1 that means we can't print this character
1055              * with our current locale. Writechar will print a '?'.
1056              */
1057             if(this_width < 0)
1058               this_width = 1;
1059         }
1060
1061         width += (unsigned) this_width;
1062     }
1063
1064     return(width);
1065 }
1066
1067
1068 /*
1069  * Copy UTF-8 characters from src into dst.
1070  * This is intended to be used if you want to truncate a string at
1071  * the start instead of the end. For example, you have a long string
1072  * like
1073  *       this_is_a_long_string
1074  * but not enough space to fit it into a particular field. You want to
1075  * end up with
1076  *             s_a_long_string
1077  * where that fits in a particular width. Perhaps you'd use this with ...
1078  * to get
1079  *          ...s_a_long_string
1080  * This right adjusts the end of the string in the width space and
1081  * cuts it off at the start. If there is enough width for the whole
1082  * string it will copy the string into dst with no padding.
1083  *
1084  * Copy enough characters so that the result will have screen width of
1085  * want_width screen cells in current locale.
1086  *
1087  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1088  *   to dst. This is just for protection, it shouldn't be relied on to
1089  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1090  *   characters truncated in the middle or something like that.
1091  *
1092  * Returned value is the number of bytes written to dst, not including
1093  *   the possible terminating null.
1094  *
1095  * If we can't hit want_width exactly because of double width characters
1096  *   then we will pad the end of the string with space in order to make
1097  *   the width exact.
1098  */
1099 size_t
1100 utf8_to_width_rhs(char *dst,            /* destination buffer */
1101                   char *src,            /* source string */
1102                   size_t dstlen,        /* space in dest */
1103                   unsigned want_width)  /* desired screen width */
1104 {
1105     int this_width;
1106     unsigned width_consumed = 0;
1107     UCS ucs;
1108     unsigned long remaining_octets;
1109     char *readptr, *goodreadptr, *savereadptr, *endptr;
1110     size_t nb = 0;
1111
1112     if(!src){
1113         if(dstlen > 0)
1114           dst[0] = '\0';
1115
1116         return nb;
1117     }
1118
1119     /*
1120      * Start at the end of the source string and go backwards until we
1121      * get to the desired width, but not more than the width.
1122      */
1123     readptr = src + strlen(src);
1124     endptr = readptr;
1125     goodreadptr = readptr;
1126     width_consumed = 0;
1127     savereadptr = readptr;
1128
1129     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1130         readptr = savereadptr-1){
1131
1132         savereadptr = readptr;
1133         remaining_octets = goodreadptr - readptr;
1134         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1135
1136         /*
1137          * Handling the error case is tough because an error will be the normal thing that
1138          * happens as we back through the string. So we're just going to punt on the
1139          * error for now.
1140          */
1141         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1142             if(remaining_octets > 0){
1143                 /*
1144                  * This means there are some bad octets after this good
1145                  * character so things are not going to work out well.
1146                  * Bail out.
1147                  */
1148                 savereadptr = src;      /* we're done */
1149             }
1150             else{
1151                 this_width = wcellwidth(ucs);
1152
1153                 if(this_width < 0)
1154                   this_width = 1;
1155
1156                 if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1157                     width_consumed += (unsigned) this_width;
1158                     goodreadptr = savereadptr;
1159                 }
1160                 else
1161                   savereadptr = src;    /* we're done */
1162             }
1163         }
1164     }
1165
1166     /*
1167      * Copy characters from goodreadptr to endptr into dst.
1168      */
1169     nb = MIN(endptr-goodreadptr, dstlen-1);
1170     strncpy(dst, goodreadptr, nb);
1171     dst[nb] = '\0';
1172
1173     /*
1174      * Pad out with spaces in order to hit width exactly.
1175      */
1176     while(width_consumed < want_width && nb < dstlen-1){
1177         dst[nb++] = ' ';
1178         dst[nb] = '\0';
1179         width_consumed++;
1180     }
1181
1182     return nb;
1183 }
1184
1185
/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a space character if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed. We'll see how
 * that works for us.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Besides %w the conversions d i o x X u c s f e E g G p and %% are
 * understood, with the flags - + space 0 #, a width and a precision
 * (either may be given as *). A negative width or precision supplied
 * through * is treated as if none had been given.
 *
 * Length modifiers h, l and L are recognized. An l on an integer
 * conversion reads a long (or unsigned long) argument and an L on a
 * floating point conversion reads a long double argument. In every
 * other combination the modifier is dropped and the argument is read
 * with its default-promoted type (int, double, char *, void *).
 * Any other conversion character trips an assert.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation. When literal characters from fmt fill dest
 * completely there is no room left for a terminating null and
 * none is written.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written.
 */

/* The pieces of one parsed conversion specification. */
struct utf8_snprintf_spec {
    int minus, plus, space, zero, pound;	/* non-zero if that flag was seen */
    int width;					/* minimum field width, -1 if none */
    int precision;				/* precision, -1 if none */
    int modifier;				/* length modifier to emit, 0 if none */
};


/*
 * Rebuild an ordinary printf conversion specification from its parsed
 * pieces. The '*' forms have already been turned into numbers by the
 * caller so the result never consumes extra arguments.
 * The result is always null terminated.
 */
static void
utf8_snprintf_make_fmt(char *newfmt, size_t newfmtlen,
		       struct utf8_snprintf_spec *spec, int conversion)
{
    char widthbuf[20], precbuf[24], modbuf[2];

    widthbuf[0] = precbuf[0] = modbuf[0] = '\0';

    if(spec->width >= 0)
      snprintf(widthbuf, sizeof(widthbuf), "%d", spec->width);

    if(spec->precision >= 0)
      snprintf(precbuf, sizeof(precbuf), ".%d", spec->precision);

    if(spec->modifier){
	modbuf[0] = (char) spec->modifier;
	modbuf[1] = '\0';
    }

    snprintf(newfmt, newfmtlen, "%%%s%s%s%s%s%s%s%s%c",
	     spec->minus ? "-" : "",
	     spec->plus  ? "+" : "",
	     spec->space ? " " : "",
	     spec->zero  ? "0" : "",
	     spec->pound ? "#" : "",
	     widthbuf, precbuf, modbuf, conversion);
}


int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    struct utf8_snprintf_spec spec;
    char     newfmt[100], *pdest, *end;
    char    *input_str;
    unsigned got_width;
    size_t   room;
    int      more_flags, ret, w, pad, parsed_modifier;
    va_list  args;

    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)			\
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: Look through the fmt string for %w's. Replace the
     * %w's in the format string with %s's but with possibly different
     * width and precision arguments which will make it come out right.
     *
     * We can't simply hand the altered format to the system vsnprintf
     * because a %*w would need the integer argument the * refers to
     * modified, and there is no way to do that. So the result is
     * constructed one piece at a time, pasting together the
     * pieces from the different conversions.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){
	/* ordinary character, copy it straight across */
	if(*fmt != '%'){
	    *pdest++ = *fmt++;
	    continue;
	}

	fmt++;				/* step over the '%' */

	spec.minus = spec.plus = spec.space = spec.zero = spec.pound = 0;
	spec.width = spec.precision = -1;
	spec.modifier = 0;
	parsed_modifier = 0;

	/* flags */
	more_flags = 1;
	while(more_flags){
	    switch(*fmt){
	      case '-':
		spec.minus++;
		fmt++;
		break;

	      case '+':
		spec.plus++;
		fmt++;
		break;

	      case ' ':
		spec.space++;
		fmt++;
		break;

	      case '0':
		spec.zero++;
		fmt++;
		break;

	      case '#':
		spec.pound++;
		fmt++;
		break;

	      default:
		more_flags = 0;
		break;
	    }
	}

	/* minimum field width */
	if(*fmt == '*'){
	    spec.width = va_arg(args, int);
	    fmt++;
	}
	else if(*fmt >= '0' && *fmt <= '9'){
	    /* atoi stops at the first non-digit by itself */
	    spec.width = atoi(fmt);
	    while(*fmt >= '0' && *fmt <= '9')
	      fmt++;
	}

	/* field precision */
	if(*fmt == '.'){
	    fmt++;
	    if(*fmt == '*'){
		spec.precision = va_arg(args, int);
		fmt++;
	    }
	    else if(*fmt >= '0' && *fmt <= '9'){
		spec.precision = atoi(fmt);
		while(*fmt >= '0' && *fmt <= '9')
		  fmt++;
	    }
	}

	/* length modifier */
	if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
	  parsed_modifier = *fmt++;

	/* the loop condition guarantees this is at least 1 */
	room = size - (size_t) (pdest - dest);

	/* conversion character */
	switch(*fmt){
	  case 'w':
	    /*
	     * work with va_arg(char *) to figure out width
	     * and precision needed to produce the screen width
	     * and precision asked for in %w using some of the
	     * utf8 width routines we have.
	     */
	    input_str = va_arg(args, char *);

	    w = 0;
	    if(spec.precision >= 0 || spec.width >= 0)
	      w = utf8_width(input_str);

	    if(spec.precision >= 0){
		if(w <= spec.precision)
		  spec.precision = -1;	/* print it all */
		else{
		    /*
		     * We need to cut off some of the input_str
		     * in this case. The precision becomes a byte count.
		     */
		    end = utf8_count_forw_width(input_str, spec.precision, &got_width);
		    spec.precision = (int) (end - input_str);
		    /* new w with this field_precision */
		    w = got_width;
		}
	    }

	    /*
	     * Turn the cell width into a byte width: the bytes that
	     * will be printed plus however many cells of padding are
	     * still needed.
	     */
	    if(spec.width >= 0){
		pad = spec.width - w;
		if(pad < 0)
		  pad = 0;

		spec.width = ((spec.precision >= 0) ? spec.precision
						    : (int) strlen(input_str)) + pad;
	    }

	    utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, 's');
	    snprintf(pdest, room, newfmt, input_str);
	    pdest += strlen(pdest);
	    break;

	  case '\0':
	    /* fmt ended inside a specifier; undo the fmt++ done below */
	    fmt--;
	    break;

	  case 'd': case 'i':
	    if(parsed_modifier == 'l'){
		spec.modifier = 'l';
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, long));
	    }
	    else{
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, int));
	    }

	    pdest += strlen(pdest);
	    break;

	  case 'o': case 'x': case 'X': case 'u':
	    if(parsed_modifier == 'l'){
		spec.modifier = 'l';
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, unsigned long));
	    }
	    else{
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, int));
	    }

	    pdest += strlen(pdest);
	    break;

	  case 'c':
	    utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
	    snprintf(pdest, room, newfmt, va_arg(args, int));
	    pdest += strlen(pdest);
	    break;

	  case 's':
	    input_str = va_arg(args, char *);
	    utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
	    snprintf(pdest, room, newfmt, input_str);
	    pdest += strlen(pdest);
	    break;

	  case 'f': case 'e': case 'E':
	  case 'g': case 'G':
	    if(parsed_modifier == 'L'){
		spec.modifier = 'L';
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, long double));
	    }
	    else{
		utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, double));
	    }

	    pdest += strlen(pdest);
	    break;

	  case 'p':
	    utf8_snprintf_make_fmt(newfmt, sizeof(newfmt), &spec, *fmt);
	    snprintf(pdest, room, newfmt, va_arg(args, void *));
	    pdest += strlen(pdest);
	    break;

	  case '%':
	    if(IS_ROOM_IN_DEST(1))
	      *pdest++ =  '%';

	    break;

	  default:
	    /* didn't think of this type */
	    assert(0);
	    break;
	}

	fmt++;
    }

    ret = (int) (pdest - dest);

    /* terminate only if there is still a byte left for it */
    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1532
1533
1534 /*
1535  * Copy UTF-8 characters from src into dst.
1536  * Copy enough characters so that the result will have (<=) screen width of
1537  * want_width screen cells in current locale.
1538  *
1539  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1540  *   to dst.
1541  *
1542  * Returned value is the number of bytes written to dst, not including
1543  *   the possible terminating null.
1544  * Got_width is another returned value. It is the width in screen cells of
1545  *   the string placed in dst. It will be the same as want_width if there
1546  *   are enough characters in the src to do that and if the character widths
1547  *   hit the width exactly. It will be less than want_width if we run out
1548  *   of src characters or if the next character width would skip over the
1549  *   width we want, because it is double width.
1550  *
1551  * Zero width characters are collected and included at the end of the string.
1552  *   That is, if we make it to want_width but there is still a zero length
1553  *   character sitting in src, we add that to dst. This might be an accent
1554  *   or something like that.
1555  */
1556 size_t
1557 utf8_to_width(char *dst,                /* destination buffer */
1558               char *src,                /* source string */
1559               size_t dstlen,            /* space in dst */
1560               unsigned want_width,      /* desired screen width */
1561               unsigned *got_width)      /* returned screen width in dst */
1562 {
1563     int this_width;
1564     unsigned width_consumed = 0;
1565     UCS ucs;
1566     unsigned long remaining_octets;
1567     char *writeptr, *readptr, *savereadptr, *endptr;
1568     int ran_out_of_space = 0;
1569
1570     readptr = src;
1571
1572     remaining_octets = readptr ? strlen(readptr) : 0;
1573
1574     writeptr = dst;
1575     endptr = writeptr + dstlen;
1576
1577     if(readptr && writeptr){
1578       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1579         savereadptr = readptr;
1580         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1581
1582         if(ucs & U8G_ERROR || ucs == UBOGON)
1583           remaining_octets = 0;
1584         else{
1585           this_width = wcellwidth(ucs);
1586
1587           /*
1588            * If this_width is -1 that means we can't print this character
1589            * with our current locale. Writechar will print a '?'.
1590            */
1591           if(this_width < 0)
1592             this_width = 1;
1593
1594           if(width_consumed + (unsigned) this_width <= want_width){
1595             /* append this utf8 character to dst if it will fit */
1596             if(writeptr + (readptr - savereadptr) < endptr){
1597               width_consumed += this_width;
1598               while(savereadptr < readptr)
1599                 *writeptr++ = *savereadptr++;
1600             }
1601             else
1602               ran_out_of_space++;       /* no more utf8 to dst */
1603           }
1604           else
1605             remaining_octets = 0;       /* we're done */
1606         }
1607       }
1608
1609       if(writeptr < endptr)
1610         *writeptr = '\0';
1611     }
1612
1613     if(got_width)
1614       *got_width = width_consumed;
1615
1616     return(writeptr ? (writeptr - dst) : 0);
1617 }
1618
1619
1620 /*
1621  * Str is a UTF-8 string.
1622  * Count forward width screencell positions and return a pointer to the
1623  * end of the string that is width wide.
1624  * The returned pointer points at the next character (where the null would
1625  * be placed).
1626  *
1627  * Got_width is another returned value. It is the width in screen cells of
1628  *   the string from str to the returned pointer. It will be the same as
1629  *   want_width if there are enough characters in the str to do that
1630  *   and if the character widths hit the width exactly. It will be less
1631  *   than want_width if we run out of characters or if the next character
1632  *   width would skip over the width we want, because it is double width.
1633  */
1634 char *
1635 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1636 {
1637     int this_width;
1638     unsigned width_consumed = 0;
1639     UCS ucs;
1640     unsigned long remaining_octets;
1641     char *readptr;
1642     char *retptr;
1643
1644     retptr = readptr = str;
1645
1646     remaining_octets = readptr ? strlen(readptr) : 0;
1647
1648     while(width_consumed <= want_width && remaining_octets > 0){
1649
1650         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1651
1652         if(ucs & U8G_ERROR || ucs == UBOGON){
1653             /*
1654              * This should not happen, but do something to handle it anyway.
1655              * Treat each character as a single width character, which is what should
1656              * probably happen when we actually go to write it out.
1657              */
1658             remaining_octets--;
1659             readptr++;
1660             this_width = 1;
1661         }
1662         else{
1663             this_width = wcellwidth(ucs);
1664
1665             /*
1666              * If this_width is -1 that means we can't print this character
1667              * with our current locale. Writechar will print a '?'.
1668              */
1669             if(this_width < 0)
1670               this_width = 1;
1671         }
1672
1673         if(width_consumed + (unsigned) this_width <= want_width){
1674             width_consumed += (unsigned) this_width;
1675             retptr = readptr;
1676         }
1677         else
1678           remaining_octets = 0; /* we're done */
1679     }
1680
1681     if(got_width)
1682       *got_width = width_consumed;
1683
1684     return(retptr);
1685 }
1686
1687
1688 /*
1689  * Copy a null terminator into a UTF-8 string in place so that the string is
1690  * no more than a certain screen width wide. If the string is already less
1691  * than or equal in width to the requested width, no change is made.
1692  *
1693  * The actual width accomplished is returned. Note that it may be less than
1694  * max_width due to double width characters as well as due to the fact that
1695  * it fits wholly in the max_width.
1696  *
1697  * Returned value is the actual screen width of str when done.
1698  *
1699  * A side effect is that a terminating null may have been written into
1700  * the passed in string.
1701  */
1702 unsigned
1703 utf8_truncate(char *str, unsigned max_width)
1704 {
1705     int this_width;
1706     unsigned width_consumed = 0;
1707     UCS ucs;
1708     unsigned long remaining_octets;
1709     char *readptr, *savereadptr;
1710
1711     readptr = str;
1712
1713     remaining_octets = readptr ? strlen(readptr) : 0;
1714
1715     if(readptr){
1716       while(width_consumed <= max_width && remaining_octets > 0){
1717
1718         savereadptr = readptr;
1719         ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1720
1721         if(ucs & U8G_ERROR || ucs == UBOGON){
1722             /*
1723              * This should not happen, but do something to handle it anyway.
1724              * Treat each character as a single width character, which is what should
1725              * probably happen when we actually go to write it out.
1726              */
1727             remaining_octets--;
1728             readptr++;
1729             this_width = 1;
1730         }
1731         else{
1732             this_width = wcellwidth(ucs);
1733
1734             /*
1735              * If this_width is -1 that means we can't print this character
1736              * with our current locale. Writechar will print a '?'.
1737              */
1738             if(this_width < 0)
1739               this_width = 1;
1740         }
1741
1742         if(width_consumed + (unsigned) this_width <= max_width){
1743             width_consumed += (unsigned) this_width;
1744         }
1745         else{
1746             remaining_octets = 0;       /* we're done */
1747             *savereadptr = '\0';
1748         }
1749       }
1750     }
1751
1752     return(width_consumed);
1753 }
1754
1755
/*
 * Fill dst with the UTF-8 text of src, cut or padded to want_width
 * screen cells in the current locale.
 *
 * utf8_to_width does the copying and cutting. If what it copied is
 * narrower than want_width, spaces make up the difference: after the
 * text when left_adjust is non-zero, in front of it otherwise.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be
 *   written to dst; if there isn't room for all of the padding only as
 *   many spaces as fit are added. Dst is null terminated when a byte is
 *   left over for that, but not if it would overflow dst's len.
 *
 * Returned value is the number of bytes written to dst (text plus
 *   padding), not including the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned width_copied = 0;
    int      shortfall, npad;
    size_t   room, used;

    used = utf8_to_width(dst, src, dstlen, want_width, &width_copied);
    room = dstlen - used;

    /* cells still missing, limited by the bytes still free */
    shortfall = want_width - width_copied;
    npad = MIN(shortfall, room);

    if(npad > 0){
	char *text_end   = dst + used;
	char *padded_end = text_end + npad;
	char *from, *to;

	if(!left_adjust){
	    /*
	     * Right adjust: move the text up by npad bytes, working from
	     * its last byte down so nothing is overwritten before it has
	     * been moved, then blank the gap that opens up in front.
	     */
	    from = text_end;
	    to   = padded_end;

	    while(from > dst)
	      *--to = *--from;

	    while(to > dst)
	      *--to = ' ';
	}
	else{
	    /* Left adjust: the spaces simply follow the text. */
	    for(to = text_end; to != padded_end; to++)
	      *to = ' ';
	}

	used += npad;
    }

    if(used < dstlen)
      dst[used] = '\0';

    return(used);
}
1823
1824
1825 /*
1826  * Str is a UTF-8 string.
1827  * Start_here is a pointer into the string. It points one position past
1828  * the last byte that should be considered a part of the length string.
1829  * Count back want_width screencell positions and return a pointer to the
1830  * start of the string that is want_width wide and ends with start_here.
1831  *
1832  * Since characters may be more than one cell width wide we may end up
1833  * skipping over the exact width. That is, if we need to we'll go back
1834  * too far (by one cell width). Account for that in the call by looking
1835  * at got_width.
1836  *
1837  * Note that this call gives a possible got_width == want_width+1 as
1838  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1839  * That was just what was needed at the time, maybe it needs to be
1840  * optional.
1841  */
1842 char *
1843 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1844 {
1845     unsigned width_consumed = 0;
1846     int this_width;
1847     UCS ucs;
1848     unsigned long remaining_octets;
1849     char *ptr, *savereadptr, *goodreadptr;
1850
1851     savereadptr = start_here;
1852     goodreadptr = start_here;
1853
1854     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1855
1856         savereadptr = ptr;
1857         remaining_octets = goodreadptr - ptr;
1858         ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1859
1860         if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1861           if(remaining_octets > 0){
1862               /*
1863                * This means there are some bad octets after this good
1864                * character so things are not going to work out well.
1865                * Bail out.
1866                */
1867               savereadptr = str;        /* we're done */
1868           }
1869           else{
1870             this_width = wcellwidth(ucs);
1871
1872             /*
1873              * If this_width is -1 that means we can't print this character
1874              * with our current locale. Writechar will print a '?'.
1875              */
1876             if(this_width < 0)
1877               this_width = 1;
1878
1879             width_consumed += (unsigned) this_width;
1880             goodreadptr = savereadptr;
1881           }
1882         }
1883     }
1884
1885     if(got_width)
1886       *got_width = width_consumed;
1887
1888     return(savereadptr);
1889 }
1890
1891
/*----------------------------------------------------------------------
  Append string s at *d, writing at most n bytes, and leave *d pointing
  at the end of the text that was copied.

  If the terminating null of s is reached within n bytes it is copied
  too and *d is left pointing AT it, ready for the next append. If n
  bytes are used up first, *d points just past the last byte written and
  nothing has been null terminated. Nothing happens when n <= 0.

  motivation for this is to avoid twice passing over a string that's
  being appended to twice (i.e., strcpy(t, x); t += strlen(t))

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char c;

    for(; n > 0; n--){
	c = *s++;
	**d = c;

	/* the null is stored but not stepped over */
	if(c == '\0')
	  return;

	(*d)++;
    }
}
1907
1908
1909 /*
1910  * If use_system_routines is set then NULL is the return value and it is
1911  * not an error. Display_charmap and keyboard_charmap should come over as
1912  * malloced strings and will be filled in with the result.
1913  *
1914  * Returns a void pointer to the input_cs CHARSET which is
1915  * passed to mbtow via kbseq().
1916  * If !use_system_routines && NULL is returned, that is an error and err should
1917  * have a message.
1918  * display_charmap and keyboard_charmap should be malloced data and may be
1919  * realloced and changed here.
1920  */
1921 int
1922 setup_for_input_output(int use_system_routines, char **display_charmap,
1923                        char **keyboard_charmap, void **input_cs_arg, char **err)
1924 {
1925     const CHARSET *cs;
1926     const CHARSET *input_cs = NULL;
1927     int already_tried = 0;
1928     int supported = 0;
1929     char buf[1000];
1930
1931 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1932
1933     if(err)
1934       *err = NULL;
1935
1936     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1937         *err = cpstr("Bad call to setup_for_input_output");
1938         return(-1);
1939     }
1940
1941     if(use_system_routines){
1942 #if     PREREQ_FOR_SYS_TRANSLATION
1943         char *dcm;
1944
1945         dcm = nl_langinfo_codeset_wrapper();
1946         dcm = dcm ? dcm : "US-ASCII";
1947
1948         init_utf8_display(0, NULL);
1949         if(*display_charmap){
1950             if(dcm && strucmp(*display_charmap, dcm)){
1951                 snprintf(buf, sizeof(buf),
1952                  _("Display character set \"%s\" is ignored when using system translation"),
1953                      *display_charmap);
1954
1955                 *err = cpstr(buf);
1956             }
1957
1958             fs_give((void **) display_charmap);
1959         }
1960
1961         if(*keyboard_charmap){
1962             if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1963                 snprintf(buf, sizeof(buf),
1964                  _("Keyboard character set \"%s\" is ignored when using system translation"),
1965                      *keyboard_charmap);
1966
1967                 *err = cpstr(buf);
1968             }
1969
1970             fs_give((void **) keyboard_charmap);
1971         }
1972
1973         *display_charmap = cpstr(dcm);
1974         *keyboard_charmap = cpstr(dcm);
1975 #else
1976         *err = cpstr("Bad call to setup_for_input_output");
1977 #endif
1978
1979         *input_cs_arg = NULL;
1980         return(0);
1981     }
1982
1983
1984 try_again1:
1985     if(!(*display_charmap))
1986       *display_charmap = cpstr("US-ASCII");
1987
1988     if(!(*keyboard_charmap))
1989       *keyboard_charmap = cpstr(*display_charmap);
1990
1991     if(*keyboard_charmap){
1992         supported = input_charset_is_supported(*keyboard_charmap);
1993
1994         if(supported){
1995             if(!strucmp(*keyboard_charmap, "utf-8"))
1996               input_cs = utf8_charset(*keyboard_charmap);
1997             else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1998               input_cs = cs;
1999         }
2000         else{
2001             if(err && !*err){
2002                 int iso2022jp = 0;
2003
2004                 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2005                   iso2022jp = 1;
2006
2007                 snprintf(buf, sizeof(buf),
2008                      /* TRANSLATORS: The first argument is the name of the character
2009                         set the user is trying to use (which is unsupported by alpine).
2010                         The second argument is " (except for posting)" if they are
2011                         trying to use ISO-2022-JP for something other than posting. */
2012                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2013                      *keyboard_charmap,
2014                      iso2022jp ? _(" (except for posting)") : "");
2015
2016                 *err = cpstr(buf);
2017             }
2018
2019             input_cs = NULL;
2020             fs_give((void **) keyboard_charmap);
2021             *keyboard_charmap = cpstr("US-ASCII");
2022             if(!already_tried){
2023                 already_tried++;
2024                 goto try_again1;
2025             }
2026         }
2027     }
2028
2029
2030 try_again2:
2031     if(!(*display_charmap))
2032       *display_charmap = cpstr("US-ASCII");
2033
2034     if(*display_charmap){
2035         supported = output_charset_is_supported(*display_charmap);
2036         if(supported){
2037             if(!strucmp(*display_charmap, "utf-8"))
2038               init_utf8_display(1, NULL);
2039             else if((cs = utf8_charset(*display_charmap)) != NULL)
2040               init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2041         }
2042         else{
2043             if(err && !*err){
2044                 int iso2022jp = 0;
2045
2046                 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2047                   iso2022jp = 1;
2048
2049                 snprintf(buf, sizeof(buf),
2050                      _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2051                      *display_charmap,
2052                      iso2022jp ? _(" (except for posting)") : "");
2053
2054                 *err = cpstr(buf);
2055             }
2056
2057             fs_give((void **) display_charmap);
2058             if(!already_tried){
2059                 already_tried++;
2060                 goto try_again2;
2061             }
2062         }
2063     }
2064     else{
2065         if(err && !*err)
2066           *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2067     }
2068
2069 #undef cpstr
2070
2071     *input_cs_arg = (void *) input_cs;
2072
2073     return(0);
2074 }
2075
2076
2077 int
2078 input_charset_is_supported(char *input_charset)
2079 {
2080     const CHARSET *cs;
2081
2082     if(!(input_charset && *input_charset))
2083       return 0;
2084
2085     if(!strucmp(input_charset, "utf-8"))
2086       return 1;
2087
2088     if((cs = utf8_charset(input_charset)) != NULL){
2089
2090         /*
2091          * This was true 2006-09-25.
2092          */
2093         switch(cs->type){
2094           case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2095           case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2096           case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2097           case CT_UCS4: case CT_UTF16:
2098             return 1;
2099             break;
2100
2101           default:
2102             break;
2103         }
2104     }
2105
2106     return 0;
2107 }
2108
2109
2110 int
2111 output_charset_is_supported(char *output_charset)
2112 {
2113     const CHARSET *cs;
2114
2115     if(!(output_charset && *output_charset))
2116       return 0;
2117
2118     if(!strucmp(output_charset, "utf-8"))
2119       return 1;
2120
2121     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2122       return 1;
2123
2124     return 0;
2125 }
2126
2127
/*
 * Can we post in this character set?
 *
 * ISO-2022-JP is accepted for posting as a special case. Anything else
 * has to pass output_charset_is_supported().
 * Returns 1 if so, 0 if not (including for a NULL or empty name).
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return 0;

    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return 1;

    return output_charset_is_supported(posting_charset) ? 1 : 0;
}
2135
2136
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator.
 * It is what nl_langinfo(CODESET) should return, but we need to
 * wrap nl_langinfo because we know of strangely behaving implementations
 * which use names we have to translate into ones we recognize.
 *
 * Returns NULL if nl_langinfo() gives us nothing, or if we can't come up
 * with a name that output_charset_is_supported() accepts. The caller
 * decides what to do then.
 *
 * The result is never allocated here so it must not be freed. It is
 * either what nl_langinfo() returned, a string constant, or a static
 * buffer which is overwritten the next time an 8859 name is rebuilt.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret, *p;

    ret = nl_langinfo(CODESET);
    if(!ret)
      return(NULL);

    /* a name we can already use goes back untouched */
    if(*ret && output_charset_is_supported(ret))
      return(ret);

    /*
     * The value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant.
     */
    if(!strcmp("ANSI_X3.4-1968", ret)
       || !strcmp("646", ret)
       || !strcmp("ASCII", ret)
       || !strcmp("C", ret)
       || !strcmp("POSIX", ret))
      ret = "US-ASCII";
    else if(!strucmp(ret, "UTF8"))
      ret = "UTF-8";
    else if(!strucmp(ret, "EUCJP"))
      ret = "EUC-JP";
    else if(!strucmp(ret, "EUCKR"))     /* Korean EUC, registered as EUC-KR */
      ret = "EUC-KR";
    else if(!strucmp(ret, "SJIS"))
      ret = "SHIFT-JIS";
    else if((p = strstr(ret, "8859")) != NULL){
        /* check for digits after 8859 */
        p += 4;

        /*
         * Skip over a single separator between 8859 and the part number,
         * but never step past the end of the string.
         */
        if(*p && !isdigit((unsigned char) *p))
          p++;

        if(isdigit((unsigned char) *p)){
            static char buf[12];        /* "ISO-8859-" + 2 digits + nul */

            memset(buf, 0, sizeof(buf));
            strncpy(buf, "ISO-8859-", sizeof(buf));
            buf[9] = *p++;
            if(isdigit((unsigned char) *p))
              buf[10] = *p;

            ret = buf;
        }
    }

    /* whatever we came up with still has to be something we support */
    if(!output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2202
2203
2204 /*
2205  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2206  * needed the return value will point to orig. If a conversion is done,
2207  * the return string should be freed by the caller.
2208  * If not possible, returns NULL.
2209  */
2210 char *
2211 utf8_to_charset(char *orig, char *charset, int report_err)
2212 {
2213     SIZEDTEXT src, dst;
2214     char *ret = orig;
2215
2216     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2217       return ret;
2218
2219     src.size = strlen(orig);
2220     src.data = (unsigned char *) orig;
2221
2222     if(!strucmp(charset, "us-ascii")){
2223         size_t i;
2224
2225         for(i = 0; i < src.size; i++)
2226           if(src.data[i] & 0x80)
2227             return NULL;
2228
2229         return ret;
2230     }
2231
2232     /*
2233      * This works for ISO-2022-JP because of special code in utf8_cstext
2234      * but not for other 2022 charsets.
2235      */
2236     memset(&dst, 0, sizeof(dst));
2237     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2238       ret = (char *) dst.data;          /* c-client already null terminates it */
2239     else
2240       ret = NULL;
2241
2242     if((unsigned char *) ret != dst.data && dst.data)
2243       fs_give((void **) &dst.data);
2244
2245     return ret;
2246 }
2247
2248
/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with a comma
 * between every group of three digits, e.g. 1234567 gives "1,234,567".
 * A negative number gets a single leading minus sign.
 * Can use up to 3 comatose results at once, the fourth call reuses the
 * buffer handed out by the first.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          digits[32];
    char         *b;
    unsigned long magnitude;
    int           ndigits, i;

    whichbuf = (whichbuf + 1) % 3;
    b = buf[whichbuf];

    /*
     * Work on the magnitude so the sign is only written once. The
     * negation is done in unsigned arithmetic so that even the most
     * negative long is safe.
     */
    if(number < 0){
        *b++ = '-';
        magnitude = 0UL - (unsigned long) number;
    }
    else
      magnitude = (unsigned long) number;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if(ndigits >= (int) sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    /*
     * Copy the digits, putting a comma in front of each one that has a
     * multiple of three digits left from there on. Worst case is 31
     * digits, 10 commas, the sign and the nul, which fits in buf.
     */
    for(i = 0; i < ndigits; i++){
        if(i > 0 && (ndigits - i) % 3 == 0)
          *b++ = ',';

        *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
2293
2294
/*
 * Like comatose() but leave out the commas.
 *
 * Result: pointer to static string with the decimal representation of
 * number. Three buffers are used in rotation, so up to 3 results can be
 * in use at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2308
2309
2310 /*
2311  * line_paint - where the real work of managing what is displayed gets done.
2312  */
2313 void
2314 line_paint(int offset,                  /* current dot offset into vl */
2315            struct display_line *displ,
2316            int *passwd)                 /* flag to hide display of chars */
2317 {
2318     int i, w, w2, already_got_one = 0;
2319     int vfirst, vlast, dfirst, dlast, vi, di;
2320     int new_vbase;
2321     unsigned (*width_a_to_b)(UCS *, int, int);
2322
2323     /*
2324      * Set passwd to 10 in caller if you want to conceal the
2325      * password but not print asterisks for feedback.
2326      *
2327      * Set passwd to 1 in caller to conceal by printing asterisks.
2328      */
2329     if(passwd && *passwd >= 10){        /* don't show asterisks */
2330         if(*passwd > 10)
2331           return;
2332         else
2333           *passwd = 11;         /* only blat once */
2334
2335         i = 0;
2336         (*displ->movecursor)(displ->row, displ->col);
2337         while(i++ <= displ->dwid)
2338           (*displ->writechar)(' ');
2339
2340         (*displ->movecursor)(displ->row, displ->col);
2341         return;
2342     }
2343
2344     if(passwd && *passwd)
2345       width_a_to_b = single_width_chars_a_to_b;
2346     else
2347       width_a_to_b = ucs4_str_width_a_to_b;
2348
2349     /*
2350      * vl is the virtual line (the actual data). We operate on it by typing
2351      * characters to be added and deleting and so forth. In this routine we
2352      * copy a subset of those UCS-4 characters in vl into dl, the display
2353      * array, and show that subset on the screen.
2354      *
2355      * Offset is the location of the cursor in vl.
2356      *
2357      * We will display the string starting from vbase.
2358      * We have dwid screen cells to work in.
2359      * We may have to adjust vbase in order to display the
2360      * part of the string that contains the cursor.
2361      *
2362      * We'll make the display look like
2363      *   vl    a b c d e f g h i j k l m
2364      *             xxxxxxxxxxxxx  <- width dwid window
2365      *             < d e f g h >
2366      *               |
2367      *             vbase
2368      * The < will be there if vbase > 0.
2369      * The > will be there if the string from vbase to the
2370      * end can't all fit in the window.
2371      */
2372
2373     memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2374
2375     /*
2376      * Adjust vbase so offset is not out of the window to the right.
2377      * (The +2 in w + 2 is for a possible " >" if the string goes past
2378      *  the right hand edge of the window and if the last visible character
2379      * is double wide. We don't want the offset to be under that > character.)
2380      */
2381     for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2382         displ->dwid > 1 &&
2383         w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2384         w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2385         /*
2386          * offset is off the window to the right
2387          * It looks like   a b c d e f g h
2388          *                   |         |
2389          *               vbase         offset
2390          * and offset is either past the right edge,
2391          * or right at the right edge (and maybe under >),
2392          * or one before right at the edge (and maybe on space
2393          * for half a character).
2394          *
2395          * Since the characters may be double width it is slightly
2396          * complicated to figure out how far to increase vbase.
2397          * We're going to scoot over past width w/2 characters and
2398          * then see if that's sufficient.
2399          */
2400         new_vbase = displ->vbase + 1;
2401         for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2402             w2 < displ->dwid/2;
2403             w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2404           new_vbase++;
2405
2406         displ->vbase = new_vbase;
2407     }
2408
2409     /* adjust so offset is not out of the window to the left */
2410     while(displ->vbase > 0 && displ->vbase >= offset){
2411         /* add about dwid/2 more width */
2412         new_vbase = displ->vbase - 1;
2413         for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2414             w2 < (displ->dwid+1)/2 && new_vbase > 0;
2415             w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2416           new_vbase--;
2417
2418         /* but don't let it get too small, recheck off right end */
2419         for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2420             w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2421             w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2422           new_vbase++;
2423
2424         displ->vbase = MAX(new_vbase, 0);
2425     }
2426
2427     if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2428       displ->vbase = 0;
2429
2430     vfirst = displ->vbase;
2431     dfirst = 0;
2432     if(displ->vbase > 0){                       /* off screen cue left */
2433         dfirst = 1;                             /* index which matches vfirst */
2434         displ->dl[0] = '<';
2435     }
2436
2437     vlast = displ->vused-1;                     /* end */
2438     w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2439
2440     if(displ->dwid > 0 && w + dfirst > displ->dwid){                    /* off window right */
2441
2442         /* find last ucs character to be printed */
2443         while(w + dfirst > displ->dwid - 1)     /* -1 for > */
2444           w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2445
2446         /* worry about double-width characters */
2447         if(w + dfirst == displ->dwid - 1){      /* no prob, hit it exactly */
2448             dlast = dfirst + vlast - vfirst + 1;        /* +1 for > */
2449             displ->dl[dlast] = '>';
2450         }
2451         else{
2452             dlast = dfirst + vlast - vfirst + 1;
2453             displ->dl[dlast++] = ' ';
2454             displ->dl[dlast] = '>';
2455         }
2456     }
2457     else
2458       dlast = dfirst + vlast - vfirst;
2459
2460     /*
2461      * Copy the relevant part of the virtual line into the display line.
2462      */
2463     for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2464       if(passwd && *passwd)
2465         displ->dl[di] = '*';            /* to conceal password */
2466       else
2467         displ->dl[di] = displ->vl[vi];
2468
2469     /*
2470      * Add spaces to clear the rest of the line.
2471      * We have dwid total space to fill.
2472      */
2473     w = (*width_a_to_b)(displ->dl, 0, dlast);   /* width through dlast */
2474     for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2475       displ->dl[di++] = ' ';
2476
2477     /*
2478      * Draw from left to right, skipping until we get to
2479      * something that is different. Characters may be different
2480      * widths than they were initially so paint from there the
2481      * rest of the way.
2482      */
2483     for(di = 0; displ->dl[di]; di++){
2484         if(already_got_one || displ->dl[di] != displ->olddl[di]){
2485             /* move cursor first time */
2486             if(!already_got_one++){
2487                 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2488                 (*displ->movecursor)(displ->row, displ->col + w);
2489             }
2490
2491             (*displ->writechar)(displ->dl[di]);
2492             displ->olddl[di] = displ->dl[di];
2493         }
2494     }
2495
2496     memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2497
2498     /*
2499      * Move the cursor to the offset.
2500      *
2501      * The offset is relative to the start of the virtual array. We need
2502      * to find the location on the screen. The offset into the display array
2503      * will be offset-vbase+dfirst. We want to be at the start of that
2504      * character, so we need to find the width of all the characters up
2505      * to that point.
2506      */
2507     w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2508
2509     (*displ->movecursor)(displ->row, displ->col + w);
2510 }
2511
2512
2513 /*
2514  * This is just like ucs4_str_width_a_to_b() except all of the characters
2515  * are assumed to be of width 1. This is for printing out *'s when user
2516  * enters a password, while still managing to use the same code to do the
2517  * display.
2518  */
2519 unsigned
2520 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2521 {
2522     unsigned width = 0;
2523     int i;
2524
2525     if(ucsstr)
2526       for(i = a; i <= b && ucsstr[i]; i++)
2527         width++;
2528
2529     return width;
2530 }