regina/regutil/regunicode.c

   1 /* Unicode conversion functions for regutil
   2  *
   3  * The contents of this file are subject to the Mozilla Public License
   4  * Version 1.0 (the "License"); you may not use this file except in
   5  * compliance with the License. You may obtain a copy of the License at
   6  * http://www.mozilla.org/MPL/
   7  *
   8  * Software distributed under the License is distributed on an "AS IS"
   9  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
  10  * License for the specific language governing rights and limitations
  11  * under the License.
  12  *
  13  * The Original Code is regutil.
  14  *
  15  * The Initial Developer of the Original Code is Patrick TJ McPhee.
  16  * Portions created by Patrick McPhee are Copyright © 2003
  17  * Patrick TJ McPhee. All Rights Reserved.
  18  *
  19  * Contributors:
  20  *
  21  * $Header: /opt/cvs/Regina/regutil/regunicode.c,v 1.2 2009/11/23 23:24:35 mark Exp $
  22  */
  23 #include "rxproto.h"
  24
  25 #ifdef _WIN32
  26 #include <windows.h>
  27 #include <winnls.h>
  28 #else
  29
  30 # ifdef HAVE_ICONV_H
  31 #  include <iconv.h>
  32
  33    /* this needs to be redefined if your system uses a different name for
  34     * the base unicode code page */
  35 #  ifndef ICONV_UTF16
  36 #   define ICONV_UTF16 "UTF-16LE"
  37 #  endif
  38
  39 # endif
  40
  41 #define CP_ACP 0
  42 #define CP_OEMCP 0
  43 #define CP_MACCP 1
  44 #define CP_UTF7 7
  45 #define CP_UTF8 8
  46
  47 #endif
  48
  49 /* code pages can be a numeric value, one of the strings defined by IBM, or
  50  * MAC (which I include because everything else in this API follows the
  51  * windows function so closely, we might as well include it) */
  52 static int cvtcp(const char * s)
  53 {
  54    static const struct {
  55       const char * s;
  56       int cp;
  57    } cpgs[] = {
  58       {"ACP", CP_ACP},
  59 #ifndef _WIN32
  60       {"THREAD_ACP", CP_ACP},
  61 #endif
  62       {"OEMCP", CP_OEMCP},
  63       {"MAC", CP_MACCP },
  64       {"UTF7", CP_UTF7 },
  65       {"UTF8", CP_UTF8 }
  66    };
  67    int cp = -1;
  68    register int i;
  69
  70    for (i = 0; i < DIM(cpgs); i++) {
  71       if (!strcasecmp(cpgs[i].s, s)) {
  72          cp = cpgs[i].cp;
  73          break;
  74       }
  75    }
  76
  77 #ifdef _WIN32
  78    /* query system to find the current thread's ACP (thread's ACP? Windows!) */
  79    if (cp == -1) {
  80       if (!strcasecmp(s, "THREAD_ACP")) {
  81          cp = GetACP();
  82       }
  83    }
  84 #endif
  85
  86    if (cp == -1) {
  87       cp = atoi(s);
  88       if (!cp)
  89          cp = -1;
  90    }
  91
  92    return cp;
  93 }
  94
  95 #ifdef _WIN32
  96 struct mapping_flags_T {
  97    RXSTRING str;
  98    int flg;
  99 };
 100
 101 /* find the flags in string s */
 102 static int getflags(RXSTRING rxs, const struct mapping_flags_T * flgs,
 103                    int dim_flgs)
 104 {
 105    int flags = 0;
 106    register int i;
 107
 108    while (rxs.strlength > 0) {
 109       for (i = 0; i < dim_flgs; i++) {
 110          if (rxs.strlength >= flgs[i].str.strlength &&
 111              !memcmp(rxs.strptr, flgs[i].str.strptr, flgs[i].str.strlength)) {
 112             flags |= flgs[i].flg;
 113             rxs.strptr += flgs[i].str.strlength;
 114             rxs.strlength -= flgs[i].str.strlength;
 115          }
 116
 117          /* skip non-spaces -- strictly, I'm supposed to return rc 1004 */
 118          i = strcspn(rxs.strptr, " ");
 119          rxs.strlength -= i;
 120          rxs.strptr += i;
 121
 122          /* skip spaces */
 123          i = strspn(rxs.strptr, " ");
 124          rxs.strlength -= i;
 125          rxs.strptr += i;
 126       }
 127    }
 128
 129    return flags;
 130 }
 131 #elif !defined(HAVE_MBSTOWCS)
 132
 133 /* trivial conversions between unicode and latin 1 */
 134 static int utol1(unsigned char * l1s, const unsigned short * us, int ul)
 135 {
 136    register int i;
 137    for (i = 0; i < ul; i++) {
 138       l1s[i] = (unsigned char)us[i];
 139    }
 140
 141    return ul;
 142 }
 143
 144 static int l1tou(unsigned short * us, const unsigned char * l1s, int ul)
 145 {
 146    register int i;
 147    for (i = 0; i < ul; i++) {
 148
 149       us[i] = l1s[i];
 150    }
 151
 152    return ul;
 153 }
 154
 155 #endif
 156
 157 /* similarly trivial conversions between unicode and utf8. utf8 is just
 158  * a different way of representing the same numbers as ucs-2. The first
 159  * bit determines how many bytes are used to represent a character. If it
 160  * is 0, 1 byte is used. Otherwise, the number of non-zero most significant
 161  * bits determines the number of bytes used to represent the character (if
 162  * the first two bits are set and the third is clear, two characters are
 163  * used, and so forth). For whatever reason, every byte in a multi-byte
 164  * sequence has as many most significant bits set as there are bytes
 165  * remaining in the sequence. */
 166 static int utou8(unsigned char *u8s, const unsigned short * us, int ul)
 167 {
 168    register int i, j;
 169
 170    for (i = j = 0; i < ul; i++, j++) {
 171       /* the number of characters needed depends on character values in
 172        * the unicode string. Values up to 2^7-1 (0x80) fit in one byte.
 173        * Values up to 2^11-1 (2047) fit in two bytes. Values up to 2^15-1
 174        * (32767) fit in three bytes. Values up to 2^18-1 fit in four bytes.
 175        * Luckily, we never need more than 2^16-1.
 176        * Note that  0xf0 == (1 << 8) | (1 << 7) | (1 << 6) | (1 << 5)
 177        *            0xe0 == (1 << 8) | (1 << 7) | (1 << 6)
 178        *            0xc0 == (1 << 8) | (1 << 7)
 179        *            0x80 == (1 << 8)
 180        *   which are the significant bits for a multi-byte character, and
 181        *            0x3f == (1 << 6) - 1
 182        *            0x1f == (1 << 5) - 1
 183        *            0x0f == (1 << 4) - 1
 184        *            0x07 == (1 << 3) - 1
 185        *   which are the corresponding masks giving the numeric values
 186        */
 187       if (us[i] < 128) {
 188          u8s[j] = us[i];
 189       }
 190       else if (us[i] < 2047) {
 191          u8s[j++] = (us[i] >> 6) | 0xc0;
 192          u8s[j] = (us[i] & 0x3f) | 0x80;
 193       }
 194       else if (us[i] < 32767) {
 195          u8s[j++] = (us[i] >> 11) | 0xe0;
 196          u8s[j++] = ((us[i] >> 6) & 0x1f) | 0xc0;
 197          u8s[j] = (us[i] & 0x3f) | 0x80;
 198       }
 199       else {
 200          u8s[j++] = (us[i] >> 15) | 0xf0;
 201          u8s[j++] = ((us[i] >> 11) & 0x0f) | 0xe0;
 202          u8s[j++] = ((us[i] >> 6) & 0x1f) | 0xc0;
 203          u8s[j] = (us[i] & 0x3f) | 0x80;
 204       }
 205    }
 206    return j;
 207 }
 208
 209 static int u8tou(unsigned short *us, const unsigned char * u8s, int ul)
 210 {
 211    register int i, j;
 212
 213    for (i = j = 0; i < ul; i++, j++) {
 214       if (u8s[i] & 0x80) {
 215          us[j] = 0;
 216          /* 0xf8 == 11111000 */
 217          if ((u8s[i] & 0xf8) == 0xf0) {
 218             us[j] = u8s[i++] & 0x7;
 219          }
 220          /* fall through ... */
 221          if ((u8s[i] & 0xf0) == 0xe0) {
 222             us[j] <<= 4;
 223             us[j] |= u8s[i++] & 0xf;
 224          }
 225          /* the last two must be true */
 226          us[j] <<= 5;
 227          us[j] |= u8s[i++] & 0x1f;
 228          us[j] <<= 6;
 229          us[j] |= u8s[i] & 0x3f;
 230       }
 231       else {
 232          us[j] = u8s[i];
 233       }
 234    }
 235    return j;
 236 }
 237
 238 /* similarly trivial conversions between unicode and utf7.
 239  * rfc 2152 describes a set of `directly encoded' characters and a set
 240  * of `optional direct characters'. This code directly encodes all of
 241  * them.
 242  * Everything else is converted to big-endian, and the resulting byte
 243  * stream is converted to base64. + is used to shift into base64,
 244  * and any non-base64 character can be used to shift out, however
 245  * - is handled specially as a shift-out character: - is absorbed.
 246  * I expect some converters think - is in fact the only shift-out
 247  * character, so this converter always uses it when going to utf-7.
 248  */
 249
 250 /* conversion to `modified base64' */
 251 static int utomb64(unsigned char * mb64s, const unsigned short * us, int ul)
 252 {
 253    register int i, j, nb, r;
 254    static const char coderange[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 255    unsigned char uc[2];
 256
 257    nb = 6;      /* nb is the number of bits needed to complete the current character.
 258                  * since we're working with 16 bits at a time, it can be 6, 2, or 4 */
 259    r = 0;       /* r is the remainder from the previously encoded character */
 260
 261    for (i = j = 0; i < ul; i++) {
 262       /* make sure it's big-endian */
 263       uc[0] = us[i] >> 8;
 264       uc[1] = us[i] & 0xff;
 265
 266       if (nb == 6) {
 267          mb64s[j++] = coderange[uc[0] >> 2];
 268          mb64s[j++] = coderange[((uc[0]&3) << 4)|(uc[1]>>4)];
 269          r = uc[1] & 0xf;
 270          nb = 2;
 271       }
 272       else if (nb == 2) {
 273          mb64s[j++] = coderange[(r << 2) | (uc[0] >> 6)];
 274          mb64s[j++] = coderange[uc[0]&0x3f];
 275          mb64s[j++] = coderange[uc[1]>>2];
 276          r = uc[1] & 0x3;
 277          nb = 4;
 278       }
 279       else if (nb == 4) {
 280          mb64s[j++] = coderange[(r << 4) | (uc[0] >> 4)];
 281          mb64s[j++] = coderange[((uc[0]&0xf) << 2)|(uc[1]>>6)];
 282          mb64s[j++] = coderange[uc[1]&0x3f];
 283          r = 0;
 284          nb = 6;
 285       }
 286    }
 287
 288    /* if we're have a partial character, need to pad it out appropriately
 289     * with 0s -- this is where base 64 is modified  */
 290    if (nb != 6) {
 291       mb64s[j++] = coderange[r << nb];
 292    }
 293
 294    mb64s[j] = 0;
 295
 296    return j;
 297 }
 298
 299 static int utou7(unsigned char *u7s, const unsigned short * us, int ul)
 300 {
 301    static unsigned char enc[127];
 302    register int i, j;
 303
 304    if (!enc['A']) {
 305       static const unsigned char directs[] = "\t\n\r !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}";
 306       for (i = 0; i < sizeof(directs)-1; i++)
 307          enc[directs[i]] = 1;
 308    }
 309
 310    for (i = j = 0; i < ul; i++, j++) {
 311       if (us[i] < 126 && enc[us[i]]) {
 312          u7s[j] = us[i];
 313       }
 314       else {
 315          register int l;
 316
 317          u7s[j++] = '+';
 318
 319          /* determine where the set of shift characters ends */
 320          for (l = i; l < ul && (us[l] >= 126 || !enc[us[l]]); l++)
 321             ;
 322          j += utomb64(u7s+j, us+i, l - i);
 323          u7s[j] = '-';
 324          i += l - 1;
 325       }
 326    }
 327
 328    u7s[j] = 0;
 329    return j;
 330 }
 331
 332 static int decode(unsigned char c)
 333 {
 334     if (c == '+') return 62;
 335     else if (c == '/') return 63;
 336     else if ('0' <= c && c <= '9') return c - '0' + 52;
 337     else if ('A' <= c && c <= 'Z') return c - 'A';
 338     else if ('a' <= c && c <= 'z') return c - 'a' + 26;
 339     else return -1;
 340 }
 341
 342 /* decodes a base-64 string until it encounters a character which isn't
 343  * allowed in base-64. The index of that character is returned in *pnul,
 344  * unless the character is -, in which case the index of the next character
 345  * is returned in *pnul. */
 346 static int mb64tou(unsigned short * us, const unsigned char * mb64s, int ul, int * pnul)
 347 {
 348     register int i, j, c, r, nb;
 349     unsigned char uc[2];
 350
 351     nb = 16;    /* start needing all 16 bits */
 352     r = 0;
 353     c = 0;
 354
 355     /* yes, this should be i++ */
 356     for (i = j = 0; i < ul; i++) {
 357        c = decode(mb64s[i]);
 358
 359        if (c == -1) {
 360           if (mb64s[i] == '-')
 361              i++;
 362           break;
 363        }
 364
 365        if (nb == 16) {
 366           uc[0] = c << 2;
 367           nb = 10;
 368         }
 369         else if (nb == 10) {
 370            uc[0] |= c >> 4;
 371            uc[1] = (c & 0xf) << 4;
 372            nb = 4;
 373         }
 374         else if (nb == 4) {
 375            uc[1] |= c >> 2;
 376            us[j++] = ((unsigned short)uc[0]) << 8 | uc[1];
 377
 378            uc[0] = (c & 3) << 6;
 379            nb = 14;
 380         }
 381         else if (nb == 14) {
 382            uc[0] |= c;
 383            nb = 8;
 384         }
 385         else if (nb == 8) {
 386            uc[1] = c << 2;
 387            nb = 2;
 388         }
 389         else if (nb == 2) {
 390            uc[1] |= c >> 4;
 391            us[j++] = ((unsigned short)uc[0]) << 8 | uc[1];
 392
 393            uc[0] = (c & 0xf) << 4;
 394            nb = 12;
 395         }
 396         else if (nb == 12) {
 397            uc[0] |= c >> 2;
 398            uc[1] = (c & 3) << 6;
 399
 400            nb = 6;
 401         }
 402         else if (nb == 6) {
 403            uc[1] |= c;
 404            us[j++] = ((unsigned short)uc[0]) << 8 | uc[1];
 405            nb = 16;
 406         }
 407     }
 408
 409     /* ignore any left-over bits. If they're not 0, the string is not
 410      * well-formed */
 411
 412     *pnul = i;
 413
 414     return j;
 415 }
 416
 417 static int u7tou(unsigned short *us, const unsigned char * u7s, int ul)
 418 {
 419    register int i, j;
 420    int nul;
 421
 422    for (i = j = 0; i < ul; i++, j++) {
 423       if (u7s[i] == '+') {
 424          i++;
 425          j += mb64tou(us+j, u7s+i, ul - i, &nul) - 1;
 426          i += nul-1;
 427       }
 428       else {
 429          us[j] = u7s[i];
 430       }
 431    }
 432
 433    us[j] = 0;
 434    return j;
 435 }
 436
 437 /* SysToUnicode(string, [codepage], [mappingflags], outstem) */
 438 rxfunc(systounicode)
 439 {
 440    int cp;
 441    int flags = 0;
 442    RXSTRING outs;
 443    RXSTRING stemv;
 444    static const char text[] = "!TEXT";
 445
 446    checkparam(4, 4);
 447
 448    /* default code page is the `oem' code page. On most systems, this is
 449     * 437, while the `windows' code page is iso-latin 1. It's not a bad idea
 450     * to make the two be the same, though. */
 451    if (argv[1].strptr == NULL || argv[1].strlength == 0)
 452       cp = CP_OEMCP;
 453    else {
 454       char * s;
 455       rxstrdup(s, argv[1]);
 456       cp = cvtcp(s);
 457    }
 458
 459 #ifdef _WIN32
 460    if (argv[2].strptr && argv[2].strlength) {
 461       static const struct mapping_flags_T flgs[] = {
 462          {{11, "PRECOMPOSED"}, MB_PRECOMPOSED},
 463          {{ 9, "COMPOSITE"}, MB_COMPOSITE},
 464          {{17, "ERR_INVALID_CHARS"}, MB_ERR_INVALID_CHARS},
 465          {{13, "USEGLYPHCHARS"}, MB_USEGLYPHCHARS}
 466       };
 467
 468       flags = getflags(argv[2], flgs, DIM(flgs));
 469    }
 470 #endif
 471
 472    outs.strptr = malloc(argv[0].strlength*2);
 473
 474 #ifdef _WIN32
 475    outs.strlength = MultiByteToWideChar(cp, flags, argv[0].strptr, argv[0].strlength, (LPWSTR)outs.strptr, argv[0].strlength);
 476
 477    if (outs.strlength == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 478       outs.strlength = MultiByteToWideChar(cp, flags, argv[0].strptr, argv[0].strlength, NULL, 0);
 479       outs.strptr = realloc(outs.strptr, outs.strlength*2);
 480       MultiByteToWideChar(cp, flags, argv[0].strptr, argv[0].strlength, (LPWSTR)outs.strptr, outs.strlength);
 481    }
 482
 483    if (outs.strlength == 0) {
 484       switch (GetLastError()) {
 485          /* these are my fault -- they shouldn't happen */
 486          case ERROR_INSUFFICIENT_BUFFER:
 487          case ERROR_INVALID_FLAGS:
 488          case ERROR_INVALID_PARAMETER:
 489             result->strlength = 2;
 490             result->strptr[0] = '4';
 491             result->strptr[1] = '0';
 492             break;
 493          case ERROR_NO_UNICODE_TRANSLATION:
 494             result->strlength = 2;
 495             result->strptr[0] = '8';
 496             result->strptr[1] = '7';
 497             break;
 498          /* this is their fault */
 499          default:
 500             result->strlength = 2;
 501             result->strptr[0] = '4';
 502             result->strptr[1] = '7';
 503             break;
 504       }
 505    }
 506    else {
 507       result_zero();
 508    }
 509 #else
 510
 511    /* perform default conversion using mbsrtowcs(), since it can be set up
 512     * portably, in contrast to iconv(). */
 513    if (cp == CP_ACP) {
 514 # ifdef HAVE_MBSTOWCS
 515         wchar_t * output;
 516
 517         if (sizeof(wchar_t) != sizeof(short)) {
 518            output = alloca(sizeof(wchar_t)*argv[0].strlength);
 519         }
 520         else
 521            output = (wchar_t *) outs.strptr;
 522
 523         outs.strlength = mbstowcs(output, argv[0].strptr, argv[0].strlength);
 524
 525         /* well, mostly portably -- we want 16 byte output, which might
 526          * not be the case for wchar_t */
 527         if (sizeof(wchar_t) != 2) {
 528            register int i;
 529
 530            for (i = 0; i < outs.strlength; i++) {
 531            ((short *)outs.strptr)[i] = output[i];
 532            }
 533         }
 534
 535 # else
 536         /* unless, of course, the system doesn't support it -- in this case
 537          * assume the trivial conversion */
 538         outs.strlength = l1tou((unsigned short *)outs.strptr, argv[0].strptr, argv[0].strlength);
 539 # endif
 540    }
 541
 542    /* for the same reason, perform utf-7 and utf-8 conversions here, rather
 543     * than using iconv() */
 544    else if (cp == CP_UTF7) {
 545       outs.strlength = u7tou((unsigned short *)outs.strptr,  argv[0].strptr, argv[0].strlength);
 546    }
 547    else if (cp == CP_UTF8) {
 548       outs.strlength = u8tou((unsigned short *) outs.strptr, argv[0].strptr, argv[0].strlength);
 549    }
 550
 551    /* if requesting a specific code page, we need iconv, or we return an
 552     * error */
 553    else {
 554 # ifdef HAVE_ICONV
 555       iconv_t ic;
 556       char * cps;
 557
 558       rxstrdup(cps, argv[1]);
 559
 560       ic = iconv_open(ICONV_UTF16, cps);
 561
 562       if (ic == (iconv_t)-1) {
 563          result->strlength = 2;
 564          result->strptr[0] = '8';
 565          result->strptr[1] = '7';
 566          outs.strlength = 0;
 567       }
 568       else {
 569          char * inbuf = argv[0].strptr, * outbuf = outs.strptr;
 570          size_t inlen = argv[0].strlength, outlen = argv[0].strlength*2;
 571
 572          iconv(ic, &inbuf, &inlen, &outbuf, &outlen);
 573
 574          if (inlen) {
 575             outs.strlength = 0;
 576             result->strlength = 4;
 577             memcpy(result->strptr, "1113", 4);
 578          }
 579          else {
 580             outs.strlength = (argv[0].strlength * 2 - outlen)/2;
 581             result_zero();
 582          }
 583
 584          iconv_close(ic);
 585       }
 586 # else
 587       result->strlength = 2;
 588       result->strptr[0] = '8';
 589       result->strptr[1] = '7';
 590       outs.strlength = 0;
 591 # endif
 592    }
 593 #endif
 594
 595    /* outstem.!TEXT is the return value */
 596
 597    stemv.strptr = alloca(argv[3].strlength + sizeof(text) + 1);
 598    memcpy(stemv.strptr, argv[3].strptr, argv[3].strlength);
 599    if (stemv.strptr[argv[3].strlength-1] == '.') {
 600       stemv.strlength = argv[3].strlength;
 601    }
 602    else {
 603       stemv.strptr[argv[3].strlength] = '.';
 604       stemv.strlength = argv[3].strlength + 1;
 605    }
 606
 607    memcpy(stemv.strptr+stemv.strlength, text, sizeof(text)-1);
 608    stemv.strlength += sizeof(text) - 1;
 609    setavar(&stemv, outs.strptr, outs.strlength*2);
 610
 611    free(outs.strptr);
 612
 613    return 0;
 614 }
 615
 616 rxfunc(sysfromunicode)
 617 {
 618    int cp;
 619    int flags = 0;
 620    RXSTRING outs;
 621    RXSTRING stemv;
 622    static const char text[] = "!TEXT", usedd[] = "!USEDDEFAULTCHAR";
 623    int usedDefaultChar = 0;
 624    char * defchar = NULL;
 625    int * pusedDefaultChar = NULL;
 626
 627    checkparam(5, 5);
 628
 629    if (argv[1].strptr == NULL || argv[1].strlength == 0)
 630       cp = CP_OEMCP;
 631    else {
 632       char * s;
 633       rxstrdup(s, argv[1]);
 634       cp = cvtcp(s);
 635    }
 636
 637 #ifdef _WIN32
 638    if (argv[2].strptr && argv[2].strlength) {
 639       static const struct mapping_flags_T flgs[] = {
 640          {{14, "COMPOSITECHECK"}, WC_COMPOSITECHECK},
 641          {{ 8, "SEPCHARS"}, WC_SEPCHARS},
 642          {{ 9, "DISCARDNS"}, WC_DISCARDNS},
 643          {{11, "DEFAULTCHAR"}, WC_DEFAULTCHAR}
 644       };
 645
 646       flags = getflags(argv[2], flgs, DIM(flgs));
 647       if (flags & WC_DEFAULTCHAR) {
 648          pusedDefaultChar = &usedDefaultChar;
 649       }
 650    }
 651
 652    if (argv[3].strlength && argv[3].strptr) {
 653       defchar = (char *)argv[3].strptr;
 654    }
 655 #endif
 656
 657    outs.strptr = malloc(argv[0].strlength*2);
 658
 659 #ifdef _WIN32
 660    outs.strlength = WideCharToMultiByte(cp, flags, (LPWSTR)argv[0].strptr, argv[0].strlength/2, outs.strptr, argv[0].strlength, defchar, pusedDefaultChar);
 661
 662    if (outs.strlength == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 663       outs.strlength = WideCharToMultiByte(cp, flags, (LPWSTR)argv[0].strptr, argv[0].strlength/2, NULL, 0, NULL, NULL);
 664       outs.strptr = realloc(outs.strptr, outs.strlength);
 665       WideCharToMultiByte(cp, flags, (LPWSTR)argv[0].strptr, argv[0].strlength/2, outs.strptr, outs.strlength, defchar, &usedDefaultChar);
 666    }
 667
 668    if (outs.strlength == 0) {
 669       switch (GetLastError()) {
 670          /* these are my fault -- they shouldn't happen */
 671          case ERROR_INSUFFICIENT_BUFFER:
 672          case ERROR_INVALID_FLAGS:
 673          case ERROR_INVALID_PARAMETER:
 674             result->strlength = 2;
 675             result->strptr[0] = '4';
 676             result->strptr[1] = '0';
 677             break;
 678          case ERROR_NO_UNICODE_TRANSLATION:
 679             result->strlength = 2;
 680             result->strptr[0] = '8';
 681             result->strptr[1] = '7';
 682             break;
 683          /* this is their fault */
 684          default:
 685             result->strlength = 2;
 686             result->strptr[0] = '4';
 687             result->strptr[1] = '7';
 688             break;
 689       }
 690    }
 691    else {
 692       result_zero();
 693    }
 694
 695 #else
 696    /* perform default conversion using wcsrtombs(), since it can be set up
 697     * portably, in contrast to iconv(). */
 698    if (cp == CP_ACP) {
 699 # ifdef HAVE_MBSTOWCS
 700         wchar_t * s;
 701
 702         if (sizeof(wchar_t) != 2) {
 703            register int i;
 704            s = alloca(argv[0].strlength*sizeof(wchar_t));
 705            for (i = 0; i < argv[0].strlength/2; i++)
 706               s[i] = ((short *)argv[0].strptr)[i];
 707         }
 708         else
 709            s = (wchar_t *)argv[0].strptr;
 710
 711         outs.strlength = wcstombs(outs.strptr, s, argv[0].strlength/2);
 712 # else
 713         /* unless, of course, the system doesn't support it -- in this case
 714          * assume the trivial conversion */
 715         outs.strlength = utol1(outs.strptr, (unsigned short *)argv[0].strptr, argv[0].strlength/2);
 716 # endif
 717    }
 718
 719    /* for the same reason, perform utf-7 and utf-8 conversions here, rather
 720     * than using iconv() */
 721    else if (cp == CP_UTF7) {
 722       outs.strlength = utou7(outs.strptr,  (unsigned short *)argv[0].strptr, argv[0].strlength/2);
 723    }
 724    else if (cp == CP_UTF8) {
 725       outs.strlength = utou8(outs.strptr, (unsigned short *)argv[0].strptr, argv[0].strlength/2);
 726    }
 727
 728    /* if requesting a specific code page, we need iconv, or we return an
 729     * error */
 730    else {
 731 # ifdef HAVE_ICONV
 732       iconv_t ic;
 733       char * cps;
 734
 735       rxstrdup(cps, argv[1]);
 736
 737       ic = iconv_open(cps, ICONV_UTF16);
 738
 739       if (ic == (iconv_t)-1) {
 740          result->strlength = 2;
 741          result->strptr[0] = '8';
 742          result->strptr[1] = '7';
 743          outs.strlength = 0;
 744       }
 745       else {
 746          char * inbuf = argv[0].strptr, * outbuf = outs.strptr;
 747          size_t inlen = argv[0].strlength, outlen = argv[0].strlength*2;
 748
 749          iconv(ic, &inbuf, &inlen, &outbuf, &outlen);
 750
 751          if (inlen) {
 752             outs.strlength = 0;
 753             result->strlength = 4;
 754             memcpy(result->strptr, "1113", 4);
 755          }
 756          else {
 757             outs.strlength = argv[0].strlength * 2 - outlen;
 758             result_zero();
 759          }
 760
 761          iconv_close(ic);
 762          result_zero();
 763       }
 764 # else
 765       result->strlength = 2;
 766       result->strptr[0] = '8';
 767       result->strptr[1] = '7';
 768       outs.strlength = 0;
 769 # endif
 770    }
 771 #endif
 772
 773    /* outstem.!TEXT is the return value
 774     * outstem.!USEDDEFAULTCHAR is the value of the default character if
 775     *  applicable */
 776
 777    stemv.strptr = alloca(argv[4].strlength + sizeof(usedd) + 1);
 778    memcpy(stemv.strptr, argv[4].strptr, argv[4].strlength);
 779    if (stemv.strptr[argv[4].strlength-1] == '.') {
 780       stemv.strlength = argv[4].strlength;
 781    }
 782    else {
 783       stemv.strptr[argv[4].strlength] = '.';
 784       stemv.strlength = argv[4].strlength + 1;
 785    }
 786
 787    memcpy(stemv.strptr+stemv.strlength, text, sizeof(text)-1);
 788    stemv.strlength += sizeof(text) - 1;
 789    setavar(&stemv, outs.strptr, outs.strlength);
 790
 791    memcpy(stemv.strptr+stemv.strlength - (sizeof(text) - 1), usedd, sizeof(usedd)-1);
 792    stemv.strlength += sizeof(usedd) - sizeof(text);
 793    if (usedDefaultChar) {
 794       setavar(&stemv, "-", 1);
 795    }
 796    else {
 797       setavar(&stemv, "", 0);
 798    }
 799
 800    return 0;
 801 }
 802