lib/util/charset/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "../lib/util/dlinklist.h"
  23 #include "system/iconv.h"
  24 #include "system/filesys.h"
  25
  26
  27 /**
  28  * @file
  29  *
  30  * @brief Samba wrapper/stub for iconv character set conversion.
  31  *
  32  * iconv is the XPG2 interface for converting between character
  33  * encodings.  This file provides a Samba wrapper around it, and also
  34  * a simple reimplementation that is used if the system does not
  35  * implement iconv.
  36  *
  37  * Samba only works with encodings that are supersets of ASCII: ascii
  38  * characters like whitespace can be tested for directly, multibyte
  39  * sequences start with a byte with the high bit set, and strings are
  40  * terminated by a nul byte.
  41  *
  42  * Note that the only function provided by iconv is conversion between
  43  * characters.  It doesn't directly support operations like
  44  * uppercasing or comparison.  We have to convert to UTF-16LE and
  45  * compare there.
  46  *
  47  * @sa Samba Developers Guide
  48  **/
  49
  50 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  51 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  52 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  53 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  54 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
  55 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  56 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  57 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  58 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  59
  60 static const struct charset_functions builtin_functions[] = {
  61         /* windows is closest to UTF-16 */
  62         {"UCS-2LE",  iconv_copy, iconv_copy},
  63         {"UTF-16LE",  iconv_copy, iconv_copy},
  64         {"UCS-2BE",  iconv_swab, iconv_swab},
  65         {"UTF-16BE",  iconv_swab, iconv_swab},
  66
  67         /* we include the UTF-8 alias to cope with differing locale settings */
  68         {"UTF8",   utf8_pull,  utf8_push},
  69         {"UTF-8",   utf8_pull,  utf8_push},
  70
  71         /* this handles the munging needed for String2Key */
  72         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
  73
  74         {"ASCII", ascii_pull, ascii_push},
  75         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
  76 };
  77
  78 static struct charset_functions *charsets = NULL;
  79
  80 bool charset_register_backend(const void *_funcs)
  81 {
  82         struct charset_functions *funcs = (struct charset_functions *)memdup(_funcs,sizeof(struct charset_functions));
  83         struct charset_functions *c;
  84
  85         /* Check whether we already have this charset... */
  86         for (c = charsets; c != NULL; c = c->next) {
  87                 if(!strcasecmp(c->name, funcs->name)) {
  88                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
  89                         return false;
  90                 }
  91         }
  92
  93         funcs->next = funcs->prev = NULL;
  94         DLIST_ADD(charsets, funcs);
  95         return true;
  96 }
  97
  98 #ifdef HAVE_NATIVE_ICONV
  99 /* if there was an error then reset the internal state,
 100    this ensures that we don't have a shift state remaining for
 101    character sets like SJIS */
 102 static size_t sys_iconv(void *cd,
 103                         const char **inbuf, size_t *inbytesleft,
 104                         char **outbuf, size_t *outbytesleft)
 105 {
 106         size_t ret = iconv((iconv_t)cd,
 107                            discard_const_p(char *, inbuf), inbytesleft,
 108                            outbuf, outbytesleft);
 109         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 110         return ret;
 111 }
 112 #endif
 113
 114 /**
 115  * This is a simple portable iconv() implementaion.
 116  *
 117  * It only knows about a very small number of character sets - just
 118  * enough that Samba works on systems that don't have iconv.
 119  **/
 120 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
 121                  const char **inbuf, size_t *inbytesleft,
 122                  char **outbuf, size_t *outbytesleft)
 123 {
 124         char cvtbuf[2048];
 125         size_t bufsize;
 126
 127         /* in many cases we can go direct */
 128         if (cd->direct) {
 129                 return cd->direct(cd->cd_direct,
 130                                   inbuf, inbytesleft, outbuf, outbytesleft);
 131         }
 132
 133
 134         /* otherwise we have to do it chunks at a time */
 135         while (*inbytesleft > 0) {
 136                 char *bufp1 = cvtbuf;
 137                 const char *bufp2 = cvtbuf;
 138
 139                 bufsize = sizeof(cvtbuf);
 140
 141                 if (cd->pull(cd->cd_pull,
 142                              inbuf, inbytesleft, &bufp1, &bufsize) == -1
 143                     && errno != E2BIG) return -1;
 144
 145                 bufsize = sizeof(cvtbuf) - bufsize;
 146
 147                 if (cd->push(cd->cd_push,
 148                              &bufp2, &bufsize,
 149                              outbuf, outbytesleft) == -1) return -1;
 150         }
 151
 152         return 0;
 153 }
 154
 155 static bool is_utf16(const char *name)
 156 {
 157         return strcasecmp(name, "UCS-2LE") == 0 ||
 158                 strcasecmp(name, "UTF-16LE") == 0;
 159 }
 160
 161 int smb_iconv_t_destructor(smb_iconv_t hwd)
 162 {
 163 #ifdef HAVE_NATIVE_ICONV
 164         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 165                 iconv_close(hwd->cd_pull);
 166         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
 167                 iconv_close(hwd->cd_push);
 168         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
 169                 iconv_close(hwd->cd_direct);
 170 #endif
 171
 172         return 0;
 173 }
 174
 175 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 176                               const char *fromcode, bool native_iconv)
 177 {
 178         smb_iconv_t ret;
 179         const struct charset_functions *from=NULL, *to=NULL;
 180         int i;
 181
 182         ret = (smb_iconv_t)talloc_named(mem_ctx,
 183                                         sizeof(*ret),
 184                                         "iconv(%s,%s)", tocode, fromcode);
 185         if (!ret) {
 186                 errno = ENOMEM;
 187                 return (smb_iconv_t)-1;
 188         }
 189         memset(ret, 0, sizeof(*ret));
 190         talloc_set_destructor(ret, smb_iconv_t_destructor);
 191
 192         /* check for the simplest null conversion */
 193         if (strcmp(fromcode, tocode) == 0) {
 194                 ret->direct = iconv_copy;
 195                 return ret;
 196         }
 197
 198         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 199                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 200                         from = &builtin_functions[i];
 201                 }
 202                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 203                         to = &builtin_functions[i];
 204                 }
 205         }
 206
 207         if (from == NULL) {
 208                 for (from=charsets; from; from=from->next) {
 209                         if (strcasecmp(from->name, fromcode) == 0) break;
 210                 }
 211         }
 212
 213         if (to == NULL) {
 214                 for (to=charsets; to; to=to->next) {
 215                         if (strcasecmp(to->name, tocode) == 0) break;
 216                 }
 217         }
 218
 219 #ifdef HAVE_NATIVE_ICONV
 220         if ((!from || !to) && !native_iconv) {
 221                 goto failed;
 222         }
 223         if (!from) {
 224                 ret->pull = sys_iconv;
 225                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 226                 if (ret->cd_pull == (iconv_t)-1)
 227                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 228                 if (ret->cd_pull == (iconv_t)-1) goto failed;
 229         }
 230
 231         if (!to) {
 232                 ret->push = sys_iconv;
 233                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 234                 if (ret->cd_push == (iconv_t)-1)
 235                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 236                 if (ret->cd_push == (iconv_t)-1) goto failed;
 237         }
 238 #else
 239         if (!from || !to) {
 240                 goto failed;
 241         }
 242 #endif
 243
 244         /* check for conversion to/from ucs2 */
 245         if (is_utf16(fromcode) && to) {
 246                 ret->direct = to->push;
 247                 return ret;
 248         }
 249         if (is_utf16(tocode) && from) {
 250                 ret->direct = from->pull;
 251                 return ret;
 252         }
 253
 254 #ifdef HAVE_NATIVE_ICONV
 255         if (is_utf16(fromcode)) {
 256                 ret->direct = sys_iconv;
 257                 ret->cd_direct = ret->cd_push;
 258                 ret->cd_push = NULL;
 259                 return ret;
 260         }
 261         if (is_utf16(tocode)) {
 262                 ret->direct = sys_iconv;
 263                 ret->cd_direct = ret->cd_pull;
 264                 ret->cd_pull = NULL;
 265                 return ret;
 266         }
 267 #endif
 268
 269         /* the general case has to go via a buffer */
 270         if (!ret->pull) ret->pull = from->pull;
 271         if (!ret->push) ret->push = to->push;
 272         return ret;
 273
 274 failed:
 275         talloc_free(ret);
 276         errno = EINVAL;
 277         return (smb_iconv_t)-1;
 278 }
 279
 280 /*
 281   simple iconv_open() wrapper
 282  */
 283 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
 284 {
 285         return smb_iconv_open_ex(talloc_autofree_context(), tocode, fromcode, true);
 286 }
 287
 288 /*
 289   simple iconv_close() wrapper
 290 */
 291 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
 292 {
 293         talloc_free(cd);
 294         return 0;
 295 }
 296
 297
 298 /**********************************************************************
 299  the following functions implement the builtin character sets in Samba
 300  and also the "test" character sets that are designed to test
 301  multi-byte character set support for english users
 302 ***********************************************************************/
 303 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 304                          char **outbuf, size_t *outbytesleft)
 305 {
 306         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 307                 (*outbuf)[0] = (*inbuf)[0];
 308                 (*outbuf)[1] = 0;
 309                 (*inbytesleft)  -= 1;
 310                 (*outbytesleft) -= 2;
 311                 (*inbuf)  += 1;
 312                 (*outbuf) += 2;
 313         }
 314
 315         if (*inbytesleft > 0) {
 316                 errno = E2BIG;
 317                 return -1;
 318         }
 319
 320         return 0;
 321 }
 322
 323 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 324                          char **outbuf, size_t *outbytesleft)
 325 {
 326         int ir_count=0;
 327
 328         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 329                 (*outbuf)[0] = (*inbuf)[0] & 0x7F;
 330                 if ((*inbuf)[1]) ir_count++;
 331                 (*inbytesleft)  -= 2;
 332                 (*outbytesleft) -= 1;
 333                 (*inbuf)  += 2;
 334                 (*outbuf) += 1;
 335         }
 336
 337         if (*inbytesleft == 1) {
 338                 errno = EINVAL;
 339                 return -1;
 340         }
 341
 342         if (*inbytesleft > 1) {
 343                 errno = E2BIG;
 344                 return -1;
 345         }
 346
 347         return ir_count;
 348 }
 349
 350
 351 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 352                          char **outbuf, size_t *outbytesleft)
 353 {
 354         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 355                 unsigned int v;
 356
 357                 if ((*inbuf)[0] != '@') {
 358                         /* seven bit ascii case */
 359                         (*outbuf)[0] = (*inbuf)[0];
 360                         (*outbuf)[1] = 0;
 361                         (*inbytesleft)  -= 1;
 362                         (*outbytesleft) -= 2;
 363                         (*inbuf)  += 1;
 364                         (*outbuf) += 2;
 365                         continue;
 366                 }
 367                 /* it's a hex character */
 368                 if (*inbytesleft < 5) {
 369                         errno = EINVAL;
 370                         return -1;
 371                 }
 372
 373                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 374                         errno = EILSEQ;
 375                         return -1;
 376                 }
 377
 378                 (*outbuf)[0] = v&0xff;
 379                 (*outbuf)[1] = v>>8;
 380                 (*inbytesleft)  -= 5;
 381                 (*outbytesleft) -= 2;
 382                 (*inbuf)  += 5;
 383                 (*outbuf) += 2;
 384         }
 385
 386         if (*inbytesleft > 0) {
 387                 errno = E2BIG;
 388                 return -1;
 389         }
 390
 391         return 0;
 392 }
 393
 394 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 395                            char **outbuf, size_t *outbytesleft)
 396 {
 397         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 398                 char buf[6];
 399
 400                 if ((*inbuf)[1] == 0 &&
 401                     ((*inbuf)[0] & 0x80) == 0 &&
 402                     (*inbuf)[0] != '@') {
 403                         (*outbuf)[0] = (*inbuf)[0];
 404                         (*inbytesleft)  -= 2;
 405                         (*outbytesleft) -= 1;
 406                         (*inbuf)  += 2;
 407                         (*outbuf) += 1;
 408                         continue;
 409                 }
 410                 if (*outbytesleft < 5) {
 411                         errno = E2BIG;
 412                         return -1;
 413                 }
 414                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 415                 memcpy(*outbuf, buf, 5);
 416                 (*inbytesleft)  -= 2;
 417                 (*outbytesleft) -= 5;
 418                 (*inbuf)  += 2;
 419                 (*outbuf) += 5;
 420         }
 421
 422         if (*inbytesleft == 1) {
 423                 errno = EINVAL;
 424                 return -1;
 425         }
 426
 427         if (*inbytesleft > 1) {
 428                 errno = E2BIG;
 429                 return -1;
 430         }
 431
 432         return 0;
 433 }
 434
 435 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 436                          char **outbuf, size_t *outbytesleft)
 437 {
 438         int n;
 439
 440         n = MIN(*inbytesleft, *outbytesleft);
 441
 442         swab(*inbuf, *outbuf, (n&~1));
 443         if (n&1) {
 444                 (*outbuf)[n-1] = 0;
 445         }
 446
 447         (*inbytesleft) -= n;
 448         (*outbytesleft) -= n;
 449         (*inbuf) += n;
 450         (*outbuf) += n;
 451
 452         if (*inbytesleft > 0) {
 453                 errno = E2BIG;
 454                 return -1;
 455         }
 456
 457         return 0;
 458 }
 459
 460
 461 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 462                          char **outbuf, size_t *outbytesleft)
 463 {
 464         int n;
 465
 466         n = MIN(*inbytesleft, *outbytesleft);
 467
 468         memmove(*outbuf, *inbuf, n);
 469
 470         (*inbytesleft) -= n;
 471         (*outbytesleft) -= n;
 472         (*inbuf) += n;
 473         (*outbuf) += n;
 474
 475         if (*inbytesleft > 0) {
 476                 errno = E2BIG;
 477                 return -1;
 478         }
 479
 480         return 0;
 481 }
 482
 483 /*
 484   this takes a UTF8 sequence and produces a UTF16 sequence
 485  */
 486 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 487                          char **outbuf, size_t *outbytesleft)
 488 {
 489         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 490         const uint8_t *c = (const uint8_t *)*inbuf;
 491         uint8_t *uc = (uint8_t *)*outbuf;
 492
 493         while (in_left >= 1 && out_left >= 2) {
 494                 if ((c[0] & 0x80) == 0) {
 495                         uc[0] = c[0];
 496                         uc[1] = 0;
 497                         c  += 1;
 498                         in_left  -= 1;
 499                         out_left -= 2;
 500                         uc += 2;
 501                         continue;
 502                 }
 503
 504                 if ((c[0] & 0xe0) == 0xc0) {
 505                         if (in_left < 2 ||
 506                             (c[1] & 0xc0) != 0x80) {
 507                                 errno = EILSEQ;
 508                                 goto error;
 509                         }
 510                         uc[1] = (c[0]>>2) & 0x7;
 511                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 512                         c  += 2;
 513                         in_left  -= 2;
 514                         out_left -= 2;
 515                         uc += 2;
 516                         continue;
 517                 }
 518
 519                 if ((c[0] & 0xf0) == 0xe0) {
 520                         if (in_left < 3 ||
 521                             (c[1] & 0xc0) != 0x80 ||
 522                             (c[2] & 0xc0) != 0x80) {
 523                                 errno = EILSEQ;
 524                                 goto error;
 525                         }
 526                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
 527                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
 528                         c  += 3;
 529                         in_left  -= 3;
 530                         out_left -= 2;
 531                         uc += 2;
 532                         continue;
 533                 }
 534
 535                 if ((c[0] & 0xf8) == 0xf0) {
 536                         unsigned int codepoint;
 537                         if (in_left < 4 ||
 538                             (c[1] & 0xc0) != 0x80 ||
 539                             (c[2] & 0xc0) != 0x80 ||
 540                             (c[3] & 0xc0) != 0x80) {
 541                                 errno = EILSEQ;
 542                                 goto error;
 543                         }
 544                         codepoint =
 545                                 (c[3]&0x3f) |
 546                                 ((c[2]&0x3f)<<6) |
 547                                 ((c[1]&0x3f)<<12) |
 548                                 ((c[0]&0x7)<<18);
 549                         if (codepoint < 0x10000) {
 550                                 /* accept UTF-8 characters that are not
 551                                    minimally packed, but pack the result */
 552                                 uc[0] = (codepoint & 0xFF);
 553                                 uc[1] = (codepoint >> 8);
 554                                 c += 4;
 555                                 in_left -= 4;
 556                                 out_left -= 2;
 557                                 uc += 2;
 558                                 continue;
 559                         }
 560
 561                         codepoint -= 0x10000;
 562
 563                         if (out_left < 4) {
 564                                 errno = E2BIG;
 565                                 goto error;
 566                         }
 567
 568                         uc[0] = (codepoint>>10) & 0xFF;
 569                         uc[1] = (codepoint>>18) | 0xd8;
 570                         uc[2] = codepoint & 0xFF;
 571                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 572                         c  += 4;
 573                         in_left  -= 4;
 574                         out_left -= 4;
 575                         uc += 4;
 576                         continue;
 577                 }
 578
 579                 /* we don't handle 5 byte sequences */
 580                 errno = EINVAL;
 581                 goto error;
 582         }
 583
 584         if (in_left > 0) {
 585                 errno = E2BIG;
 586                 goto error;
 587         }
 588
 589         *inbytesleft = in_left;
 590         *outbytesleft = out_left;
 591         *inbuf = (const char *)c;
 592         *outbuf = (char *)uc;
 593         return 0;
 594
 595 error:
 596         *inbytesleft = in_left;
 597         *outbytesleft = out_left;
 598         *inbuf = (const char *)c;
 599         *outbuf = (char *)uc;
 600         return -1;
 601 }
 602
 603
 604 /*
 605   this takes a UTF16 sequence and produces a UTF8 sequence
 606  */
 607 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 608                         char **outbuf, size_t *outbytesleft)
 609 {
 610         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 611         uint8_t *c = (uint8_t *)*outbuf;
 612         const uint8_t *uc = (const uint8_t *)*inbuf;
 613
 614         while (in_left >= 2 && out_left >= 1) {
 615                 unsigned int codepoint;
 616
 617                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 618                         /* simplest case */
 619                         c[0] = uc[0];
 620                         in_left  -= 2;
 621                         out_left -= 1;
 622                         uc += 2;
 623                         c  += 1;
 624                         continue;
 625                 }
 626
 627                 if ((uc[1]&0xf8) == 0) {
 628                         /* next simplest case */
 629                         if (out_left < 2) {
 630                                 errno = E2BIG;
 631                                 goto error;
 632                         }
 633                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 634                         c[1] = 0x80 | (uc[0] & 0x3f);
 635                         in_left  -= 2;
 636                         out_left -= 2;
 637                         uc += 2;
 638                         c  += 2;
 639                         continue;
 640                 }
 641
 642                 if ((uc[1] & 0xfc) == 0xdc) {
 643                         /* its the second part of a 4 byte sequence. Illegal */
 644                         if (in_left < 4) {
 645                                 errno = EINVAL;
 646                         } else {
 647                                 errno = EILSEQ;
 648                         }
 649                         goto error;
 650                 }
 651
 652                 if ((uc[1] & 0xfc) != 0xd8) {
 653                         codepoint = uc[0] | (uc[1]<<8);
 654                         if (out_left < 3) {
 655                                 errno = E2BIG;
 656                                 goto error;
 657                         }
 658                         c[0] = 0xe0 | (codepoint >> 12);
 659                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 660                         c[2] = 0x80 | (codepoint & 0x3f);
 661
 662                         in_left  -= 2;
 663                         out_left -= 3;
 664                         uc  += 2;
 665                         c   += 3;
 666                         continue;
 667                 }
 668
 669                 /* its the first part of a 4 byte sequence */
 670                 if (in_left < 4) {
 671                         errno = EINVAL;
 672                         goto error;
 673                 }
 674                 if ((uc[3] & 0xfc) != 0xdc) {
 675                         errno = EILSEQ;
 676                         goto error;
 677                 }
 678                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
 679                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 680
 681                 if (out_left < 4) {
 682                         errno = E2BIG;
 683                         goto error;
 684                 }
 685                 c[0] = 0xf0 | (codepoint >> 18);
 686                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 687                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 688                 c[3] = 0x80 | (codepoint & 0x3f);
 689
 690                 in_left  -= 4;
 691                 out_left -= 4;
 692                 uc       += 4;
 693                 c        += 4;
 694         }
 695
 696         if (in_left == 1) {
 697                 errno = EINVAL;
 698                 goto error;
 699         }
 700
 701         if (in_left > 1) {
 702                 errno = E2BIG;
 703                 goto error;
 704         }
 705
 706         *inbytesleft = in_left;
 707         *outbytesleft = out_left;
 708         *inbuf  = (const char *)uc;
 709         *outbuf = (char *)c;
 710
 711         return 0;
 712
 713 error:
 714         *inbytesleft = in_left;
 715         *outbytesleft = out_left;
 716         *inbuf  = (const char *)uc;
 717         *outbuf = (char *)c;
 718         return -1;
 719 }
 720
 721
 722 /*
 723   this takes a UTF16 munged sequence, modifies it according to the
 724   string2key rules, and produces a UTF16 sequence
 725
 726 The rules are:
 727
 728     1) any 0x0000 characters are mapped to 0x0001
 729
 730     2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
 731        without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to
 732        U+FFFD (OBJECT REPLACEMENT CHARACTER).
 733
 734     3) the same for any low surrogate that was not preceded by a high surrogate.
 735
 736  */
 737 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 738                                char **outbuf, size_t *outbytesleft)
 739 {
 740         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 741         uint8_t *c = (uint8_t *)*outbuf;
 742         const uint8_t *uc = (const uint8_t *)*inbuf;
 743
 744         while (in_left >= 2 && out_left >= 2) {
 745                 unsigned int codepoint = uc[0] | (uc[1]<<8);
 746
 747                 if (codepoint == 0) {
 748                         codepoint = 1;
 749                 }
 750
 751                 if ((codepoint & 0xfc00) == 0xd800) {
 752                         /* a high surrogate */
 753                         unsigned int codepoint2;
 754                         if (in_left < 4) {
 755                                 codepoint = 0xfffd;
 756                                 goto codepoint16;
 757                         }
 758                         codepoint2 = uc[2] | (uc[3]<<8);
 759                         if ((codepoint2 & 0xfc00) != 0xdc00) {
 760                                 /* high surrogate not followed by low
 761                                    surrogate: convert to 0xfffd */
 762                                 codepoint = 0xfffd;
 763                                 goto codepoint16;
 764                         }
 765                         if (out_left < 4) {
 766                                 errno = E2BIG;
 767                                 goto error;
 768                         }
 769                         memcpy(c, uc, 4);
 770                         in_left  -= 4;
 771                         out_left -= 4;
 772                         uc       += 4;
 773                         c        += 4;
 774                         continue;
 775                 }
 776
 777                 if ((codepoint & 0xfc00) == 0xdc00) {
 778                         /* low surrogate not preceded by high
 779                            surrogate: convert to 0xfffd */
 780                         codepoint = 0xfffd;
 781                 }
 782
 783         codepoint16:
 784                 c[0] = codepoint & 0xFF;
 785                 c[1] = (codepoint>>8) & 0xFF;
 786
 787                 in_left  -= 2;
 788                 out_left -= 2;
 789                 uc  += 2;
 790                 c   += 2;
 791                 continue;
 792         }
 793
 794         if (in_left == 1) {
 795                 errno = EINVAL;
 796                 goto error;
 797         }
 798
 799         if (in_left > 1) {
 800                 errno = E2BIG;
 801                 goto error;
 802         }
 803
 804         *inbytesleft = in_left;
 805         *outbytesleft = out_left;
 806         *inbuf  = (const char *)uc;
 807         *outbuf = (char *)c;
 808
 809         return 0;
 810
 811 error:
 812         *inbytesleft = in_left;
 813         *outbytesleft = out_left;
 814         *inbuf  = (const char *)uc;
 815         *outbuf = (char *)c;
 816         return -1;
 817 }
 818
 819
 820