lib/util/charset/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "../lib/util/dlinklist.h"
  23 #include "system/iconv.h"
  24 #include "system/filesys.h"
  25 #include "charset_proto.h"
  26
  27 #ifdef strcasecmp
  28 #undef strcasecmp
  29 #endif
  30
  31 /**
  32  * @file
  33  *
  34  * @brief Samba wrapper/stub for iconv character set conversion.
  35  *
  36  * iconv is the XPG2 interface for converting between character
  37  * encodings.  This file provides a Samba wrapper around it, and also
  38  * a simple reimplementation that is used if the system does not
  39  * implement iconv.
  40  *
  41  * Samba only works with encodings that are supersets of ASCII: ascii
  42  * characters like whitespace can be tested for directly, multibyte
  43  * sequences start with a byte with the high bit set, and strings are
  44  * terminated by a nul byte.
  45  *
  46  * Note that the only function provided by iconv is conversion between
  47  * characters.  It doesn't directly support operations like
  48  * uppercasing or comparison.  We have to convert to UTF-16LE and
  49  * compare there.
  50  *
  51  * @sa Samba Developers Guide
  52  **/
  53
  54 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  55 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  56 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
  57 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
  58 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  59 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  60 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
  61 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  62 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  63 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  64 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  65
  66 static const struct charset_functions builtin_functions[] = {
  67         /* windows is closest to UTF-16 */
  68         {"UCS-2LE",  iconv_copy, iconv_copy},
  69         {"UTF-16LE",  iconv_copy, iconv_copy},
  70         {"UCS-2BE",  iconv_swab, iconv_swab},
  71         {"UTF-16BE",  iconv_swab, iconv_swab},
  72
  73         /* we include the UTF-8 alias to cope with differing locale settings */
  74         {"UTF8",   utf8_pull,  utf8_push},
  75         {"UTF-8",   utf8_pull,  utf8_push},
  76
  77         /* this handles the munging needed for String2Key */
  78         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy, true},
  79
  80         {"ASCII", ascii_pull, ascii_push},
  81         {"646", ascii_pull, ascii_push},
  82         {"ISO-8859-1", latin1_pull, latin1_push},
  83 #ifdef DEVELOPER
  84         {"WEIRD", weird_pull, weird_push, true},
  85 #endif
  86 #ifdef DARWINOS
  87         {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
  88 #endif
  89         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
  90
  91 };
  92
  93 #ifdef HAVE_NATIVE_ICONV
  94 /* if there was an error then reset the internal state,
  95    this ensures that we don't have a shift state remaining for
  96    character sets like SJIS */
  97 static size_t sys_iconv(void *cd,
  98                         const char **inbuf, size_t *inbytesleft,
  99                         char **outbuf, size_t *outbytesleft)
 100 {
 101         size_t ret = iconv((iconv_t)cd,
 102                            discard_const_p(char *, inbuf), inbytesleft,
 103                            outbuf, outbytesleft);
 104         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 105         return ret;
 106 }
 107 #endif
 108
 109 /**
 110  * This is a simple portable iconv() implementaion.
 111  *
 112  * It only knows about a very small number of character sets - just
 113  * enough that Samba works on systems that don't have iconv.
 114  **/
 115 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
 116                  const char **inbuf, size_t *inbytesleft,
 117                  char **outbuf, size_t *outbytesleft)
 118 {
 119         /* in many cases we can go direct */
 120         if (cd->direct) {
 121                 return cd->direct(cd->cd_direct,
 122                                   inbuf, inbytesleft, outbuf, outbytesleft);
 123         }
 124
 125         /* otherwise we have to do it chunks at a time */
 126         {
 127 #ifndef SMB_ICONV_BUFSIZE
 128 #define SMB_ICONV_BUFSIZE 2048
 129 #endif
 130                 TALLOC_CTX *mem_ctx;
 131                 size_t bufsize;
 132                 char *cvtbuf;
 133
 134 #if _SAMBA_BUILD_ == 3
 135                 mem_ctx = talloc_tos();
 136 #else
 137                 mem_ctx = cd;
 138 #endif
 139                 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
 140
 141                 if (!cvtbuf) {
 142                         return (size_t)-1;
 143                 }
 144
 145                 while (*inbytesleft > 0) {
 146                         char *bufp1 = cvtbuf;
 147                         const char *bufp2 = cvtbuf;
 148                         int saved_errno = errno;
 149                         bool pull_failed = false;
 150                         bufsize = SMB_ICONV_BUFSIZE;
 151
 152                         if (cd->pull(cd->cd_pull,
 153                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
 154                             && errno != E2BIG) {
 155                                 saved_errno = errno;
 156                                 pull_failed = true;
 157                         }
 158
 159                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
 160
 161                         if (cd->push(cd->cd_push,
 162                                      &bufp2, &bufsize,
 163                                      outbuf, outbytesleft) == -1) {
 164                                 talloc_free(cvtbuf);
 165                                 return -1;
 166                         } else if (pull_failed) {
 167                                 /* We want the pull errno if possible */
 168                                 errno = saved_errno;
 169                                 return -1;
 170                         }
 171                 }
 172                 talloc_free(cvtbuf);
 173         }
 174
 175         return 0;
 176 }
 177
 178 static bool is_utf16(const char *name)
 179 {
 180         return strcasecmp(name, "UCS-2LE") == 0 ||
 181                 strcasecmp(name, "UTF-16LE") == 0;
 182 }
 183
 184 static int smb_iconv_t_destructor(smb_iconv_t hwd)
 185 {
 186 #ifdef HAVE_NATIVE_ICONV
 187         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 188                 iconv_close(hwd->cd_pull);
 189         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
 190                 iconv_close(hwd->cd_push);
 191         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
 192                 iconv_close(hwd->cd_direct);
 193 #endif
 194
 195         return 0;
 196 }
 197
 198 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 199                               const char *fromcode, bool use_builtin_handlers)
 200 {
 201         smb_iconv_t ret;
 202         const struct charset_functions *from=NULL, *to=NULL;
 203         int i;
 204
 205         ret = (smb_iconv_t)talloc_named(mem_ctx,
 206                                         sizeof(*ret),
 207                                         "iconv(%s,%s)", tocode, fromcode);
 208         if (!ret) {
 209                 errno = ENOMEM;
 210                 return (smb_iconv_t)-1;
 211         }
 212         memset(ret, 0, sizeof(*ret));
 213         talloc_set_destructor(ret, smb_iconv_t_destructor);
 214
 215         /* check for the simplest null conversion */
 216         if (strcmp(fromcode, tocode) == 0) {
 217                 ret->direct = iconv_copy;
 218                 return ret;
 219         }
 220
 221         /* check if we have a builtin function for this conversion */
 222         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 223                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 224                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 225                                 from = &builtin_functions[i];
 226                         }
 227                 }
 228                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 229                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 230                                 to = &builtin_functions[i];
 231                         }
 232                 }
 233         }
 234
 235 #ifdef HAVE_NATIVE_ICONV
 236         /* the from and to varaibles indicate a samba module or
 237          * internal conversion, ret->pull and ret->push are
 238          * initialised only in this block for iconv based
 239          * conversions */
 240
 241         if (from == NULL) {
 242                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 243                 if (ret->cd_pull == (iconv_t)-1)
 244                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 245                 if (ret->cd_pull != (iconv_t)-1) {
 246                         ret->pull = sys_iconv;
 247                 }
 248         }
 249
 250         if (to == NULL) {
 251                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 252                 if (ret->cd_push == (iconv_t)-1)
 253                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 254                 if (ret->cd_push != (iconv_t)-1) {
 255                         ret->push = sys_iconv;
 256                 }
 257         }
 258 #endif
 259
 260         if (ret->pull == NULL && from == NULL) {
 261                 goto failed;
 262         }
 263
 264         if (ret->push == NULL && to == NULL) {
 265                 goto failed;
 266         }
 267
 268         /* check for conversion to/from ucs2 */
 269         if (is_utf16(fromcode) && to) {
 270                 ret->direct = to->push;
 271                 return ret;
 272         }
 273         if (is_utf16(tocode) && from) {
 274                 ret->direct = from->pull;
 275                 return ret;
 276         }
 277
 278 #ifdef HAVE_NATIVE_ICONV
 279         if (is_utf16(fromcode)) {
 280                 ret->direct = sys_iconv;
 281                 ret->cd_direct = ret->cd_push;
 282                 ret->cd_push = NULL;
 283                 return ret;
 284         }
 285         if (is_utf16(tocode)) {
 286                 ret->direct = sys_iconv;
 287                 ret->cd_direct = ret->cd_pull;
 288                 ret->cd_pull = NULL;
 289                 return ret;
 290         }
 291 #endif
 292
 293         /* the general case has to go via a buffer */
 294         if (!ret->pull) ret->pull = from->pull;
 295         if (!ret->push) ret->push = to->push;
 296         return ret;
 297
 298 failed:
 299         talloc_free(ret);
 300         errno = EINVAL;
 301         return (smb_iconv_t)-1;
 302 }
 303
 304 /*
 305   simple iconv_open() wrapper
 306  */
 307 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
 308 {
 309         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
 310 }
 311
 312 /*
 313   simple iconv_close() wrapper
 314 */
 315 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
 316 {
 317         talloc_free(cd);
 318         return 0;
 319 }
 320
 321
 322 /**********************************************************************
 323  the following functions implement the builtin character sets in Samba
 324  and also the "test" character sets that are designed to test
 325  multi-byte character set support for english users
 326 ***********************************************************************/
 327
 328 /*
 329   this takes an ASCII sequence and produces a UTF16 sequence
 330
 331   The first 127 codepoints of latin1 matches the first 127 codepoints
 332   of unicode, and so can be put into the first byte of UTF16LE
 333
 334  */
 335
 336 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 337                          char **outbuf, size_t *outbytesleft)
 338 {
 339         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 340                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
 341                         /* If this is multi-byte, then it isn't legal ASCII */
 342                         errno = EILSEQ;
 343                         return -1;
 344                 }
 345                 (*outbuf)[0] = (*inbuf)[0];
 346                 (*outbuf)[1] = 0;
 347                 (*inbytesleft)  -= 1;
 348                 (*outbytesleft) -= 2;
 349                 (*inbuf)  += 1;
 350                 (*outbuf) += 2;
 351         }
 352
 353         if (*inbytesleft > 0) {
 354                 errno = E2BIG;
 355                 return -1;
 356         }
 357
 358         return 0;
 359 }
 360
 361 /*
 362   this takes a UTF16 sequence and produces an ASCII sequence
 363
 364   The first 127 codepoints of ASCII matches the first 127 codepoints
 365   of unicode, and so can be read directly from the first byte of UTF16LE
 366
 367  */
 368 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 369                          char **outbuf, size_t *outbytesleft)
 370 {
 371         int ir_count=0;
 372
 373         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 374                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
 375                         (*inbuf)[1] != 0) {
 376                         /* If this is multi-byte, then it isn't legal ASCII */
 377                         errno = EILSEQ;
 378                         return -1;
 379                 }
 380                 (*outbuf)[0] = (*inbuf)[0];
 381                 (*inbytesleft)  -= 2;
 382                 (*outbytesleft) -= 1;
 383                 (*inbuf)  += 2;
 384                 (*outbuf) += 1;
 385         }
 386
 387         if (*inbytesleft == 1) {
 388                 errno = EINVAL;
 389                 return -1;
 390         }
 391
 392         if (*inbytesleft > 1) {
 393                 errno = E2BIG;
 394                 return -1;
 395         }
 396
 397         return ir_count;
 398 }
 399
 400 /*
 401   this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
 402
 403   The first 256 codepoints of latin1 matches the first 256 codepoints
 404   of unicode, and so can be put into the first byte of UTF16LE
 405
 406  */
 407 static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 408                           char **outbuf, size_t *outbytesleft)
 409 {
 410         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 411                 (*outbuf)[0] = (*inbuf)[0];
 412                 (*outbuf)[1] = 0;
 413                 (*inbytesleft)  -= 1;
 414                 (*outbytesleft) -= 2;
 415                 (*inbuf)  += 1;
 416                 (*outbuf) += 2;
 417         }
 418
 419         if (*inbytesleft > 0) {
 420                 errno = E2BIG;
 421                 return -1;
 422         }
 423
 424         return 0;
 425 }
 426
 427 /*
 428   this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
 429
 430   The first 256 codepoints of latin1 matches the first 256 codepoints
 431   of unicode, and so can be read directly from the first byte of UTF16LE
 432
 433  */
 434 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
 435                          char **outbuf, size_t *outbytesleft)
 436 {
 437         int ir_count=0;
 438
 439         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 440                 (*outbuf)[0] = (*inbuf)[0];
 441                 if ((*inbuf)[1] != 0) {
 442                         /* If this is multi-byte, then it isn't legal latin1 */
 443                         errno = EILSEQ;
 444                         return -1;
 445                 }
 446                 (*inbytesleft)  -= 2;
 447                 (*outbytesleft) -= 1;
 448                 (*inbuf)  += 2;
 449                 (*outbuf) += 1;
 450         }
 451
 452         if (*inbytesleft == 1) {
 453                 errno = EINVAL;
 454                 return -1;
 455         }
 456
 457         if (*inbytesleft > 1) {
 458                 errno = E2BIG;
 459                 return -1;
 460         }
 461
 462         return ir_count;
 463 }
 464
 465 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 466                          char **outbuf, size_t *outbytesleft)
 467 {
 468         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 469                 unsigned int v;
 470
 471                 if ((*inbuf)[0] != '@') {
 472                         /* seven bit ascii case */
 473                         (*outbuf)[0] = (*inbuf)[0];
 474                         (*outbuf)[1] = 0;
 475                         (*inbytesleft)  -= 1;
 476                         (*outbytesleft) -= 2;
 477                         (*inbuf)  += 1;
 478                         (*outbuf) += 2;
 479                         continue;
 480                 }
 481                 /* it's a hex character */
 482                 if (*inbytesleft < 5) {
 483                         errno = EINVAL;
 484                         return -1;
 485                 }
 486
 487                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 488                         errno = EILSEQ;
 489                         return -1;
 490                 }
 491
 492                 (*outbuf)[0] = v&0xff;
 493                 (*outbuf)[1] = v>>8;
 494                 (*inbytesleft)  -= 5;
 495                 (*outbytesleft) -= 2;
 496                 (*inbuf)  += 5;
 497                 (*outbuf) += 2;
 498         }
 499
 500         if (*inbytesleft > 0) {
 501                 errno = E2BIG;
 502                 return -1;
 503         }
 504
 505         return 0;
 506 }
 507
 508 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 509                            char **outbuf, size_t *outbytesleft)
 510 {
 511         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 512                 char buf[6];
 513
 514                 if ((*inbuf)[1] == 0 &&
 515                     ((*inbuf)[0] & 0x80) == 0 &&
 516                     (*inbuf)[0] != '@') {
 517                         (*outbuf)[0] = (*inbuf)[0];
 518                         (*inbytesleft)  -= 2;
 519                         (*outbytesleft) -= 1;
 520                         (*inbuf)  += 2;
 521                         (*outbuf) += 1;
 522                         continue;
 523                 }
 524                 if (*outbytesleft < 5) {
 525                         errno = E2BIG;
 526                         return -1;
 527                 }
 528                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 529                 memcpy(*outbuf, buf, 5);
 530                 (*inbytesleft)  -= 2;
 531                 (*outbytesleft) -= 5;
 532                 (*inbuf)  += 2;
 533                 (*outbuf) += 5;
 534         }
 535
 536         if (*inbytesleft == 1) {
 537                 errno = EINVAL;
 538                 return -1;
 539         }
 540
 541         if (*inbytesleft > 1) {
 542                 errno = E2BIG;
 543                 return -1;
 544         }
 545
 546         return 0;
 547 }
 548
 549 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 550                          char **outbuf, size_t *outbytesleft)
 551 {
 552         int n;
 553
 554         n = MIN(*inbytesleft, *outbytesleft);
 555
 556         swab(*inbuf, *outbuf, (n&~1));
 557         if (n&1) {
 558                 (*outbuf)[n-1] = 0;
 559         }
 560
 561         (*inbytesleft) -= n;
 562         (*outbytesleft) -= n;
 563         (*inbuf) += n;
 564         (*outbuf) += n;
 565
 566         if (*inbytesleft > 0) {
 567                 errno = E2BIG;
 568                 return -1;
 569         }
 570
 571         return 0;
 572 }
 573
 574
 575 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 576                          char **outbuf, size_t *outbytesleft)
 577 {
 578         int n;
 579
 580         n = MIN(*inbytesleft, *outbytesleft);
 581
 582         memmove(*outbuf, *inbuf, n);
 583
 584         (*inbytesleft) -= n;
 585         (*outbytesleft) -= n;
 586         (*inbuf) += n;
 587         (*outbuf) += n;
 588
 589         if (*inbytesleft > 0) {
 590                 errno = E2BIG;
 591                 return -1;
 592         }
 593
 594         return 0;
 595 }
 596
 597 /*
 598   this takes a UTF8 sequence and produces a UTF16 sequence
 599  */
 600 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 601                          char **outbuf, size_t *outbytesleft)
 602 {
 603         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 604         const uint8_t *c = (const uint8_t *)*inbuf;
 605         uint8_t *uc = (uint8_t *)*outbuf;
 606
 607         while (in_left >= 1 && out_left >= 2) {
 608                 if ((c[0] & 0x80) == 0) {
 609                         uc[0] = c[0];
 610                         uc[1] = 0;
 611                         c  += 1;
 612                         in_left  -= 1;
 613                         out_left -= 2;
 614                         uc += 2;
 615                         continue;
 616                 }
 617
 618                 if ((c[0] & 0xe0) == 0xc0) {
 619                         if (in_left < 2 ||
 620                             (c[1] & 0xc0) != 0x80) {
 621                                 errno = EILSEQ;
 622                                 goto error;
 623                         }
 624                         uc[1] = (c[0]>>2) & 0x7;
 625                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 626                         c  += 2;
 627                         in_left  -= 2;
 628                         out_left -= 2;
 629                         uc += 2;
 630                         continue;
 631                 }
 632
 633                 if ((c[0] & 0xf0) == 0xe0) {
 634                         if (in_left < 3 ||
 635                             (c[1] & 0xc0) != 0x80 ||
 636                             (c[2] & 0xc0) != 0x80) {
 637                                 errno = EILSEQ;
 638                                 goto error;
 639                         }
 640                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
 641                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
 642                         c  += 3;
 643                         in_left  -= 3;
 644                         out_left -= 2;
 645                         uc += 2;
 646                         continue;
 647                 }
 648
 649                 if ((c[0] & 0xf8) == 0xf0) {
 650                         unsigned int codepoint;
 651                         if (in_left < 4 ||
 652                             (c[1] & 0xc0) != 0x80 ||
 653                             (c[2] & 0xc0) != 0x80 ||
 654                             (c[3] & 0xc0) != 0x80) {
 655                                 errno = EILSEQ;
 656                                 goto error;
 657                         }
 658                         codepoint =
 659                                 (c[3]&0x3f) |
 660                                 ((c[2]&0x3f)<<6) |
 661                                 ((c[1]&0x3f)<<12) |
 662                                 ((c[0]&0x7)<<18);
 663                         if (codepoint < 0x10000) {
 664                                 /* accept UTF-8 characters that are not
 665                                    minimally packed, but pack the result */
 666                                 uc[0] = (codepoint & 0xFF);
 667                                 uc[1] = (codepoint >> 8);
 668                                 c += 4;
 669                                 in_left -= 4;
 670                                 out_left -= 2;
 671                                 uc += 2;
 672                                 continue;
 673                         }
 674
 675                         codepoint -= 0x10000;
 676
 677                         if (out_left < 4) {
 678                                 errno = E2BIG;
 679                                 goto error;
 680                         }
 681
 682                         uc[0] = (codepoint>>10) & 0xFF;
 683                         uc[1] = (codepoint>>18) | 0xd8;
 684                         uc[2] = codepoint & 0xFF;
 685                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 686                         c  += 4;
 687                         in_left  -= 4;
 688                         out_left -= 4;
 689                         uc += 4;
 690                         continue;
 691                 }
 692
 693                 /* we don't handle 5 byte sequences */
 694                 errno = EINVAL;
 695                 goto error;
 696         }
 697
 698         if (in_left > 0) {
 699                 errno = E2BIG;
 700                 goto error;
 701         }
 702
 703         *inbytesleft = in_left;
 704         *outbytesleft = out_left;
 705         *inbuf = (const char *)c;
 706         *outbuf = (char *)uc;
 707         return 0;
 708
 709 error:
 710         *inbytesleft = in_left;
 711         *outbytesleft = out_left;
 712         *inbuf = (const char *)c;
 713         *outbuf = (char *)uc;
 714         return -1;
 715 }
 716
 717
 718 /*
 719   this takes a UTF16 sequence and produces a UTF8 sequence
 720  */
 721 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 722                         char **outbuf, size_t *outbytesleft)
 723 {
 724         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 725         uint8_t *c = (uint8_t *)*outbuf;
 726         const uint8_t *uc = (const uint8_t *)*inbuf;
 727
 728         while (in_left >= 2 && out_left >= 1) {
 729                 unsigned int codepoint;
 730
 731                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 732                         /* simplest case */
 733                         c[0] = uc[0];
 734                         in_left  -= 2;
 735                         out_left -= 1;
 736                         uc += 2;
 737                         c  += 1;
 738                         continue;
 739                 }
 740
 741                 if ((uc[1]&0xf8) == 0) {
 742                         /* next simplest case */
 743                         if (out_left < 2) {
 744                                 errno = E2BIG;
 745                                 goto error;
 746                         }
 747                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 748                         c[1] = 0x80 | (uc[0] & 0x3f);
 749                         in_left  -= 2;
 750                         out_left -= 2;
 751                         uc += 2;
 752                         c  += 2;
 753                         continue;
 754                 }
 755
 756                 if ((uc[1] & 0xfc) == 0xdc) {
 757                         /* its the second part of a 4 byte sequence. Illegal */
 758                         if (in_left < 4) {
 759                                 errno = EINVAL;
 760                         } else {
 761                                 errno = EILSEQ;
 762                         }
 763                         goto error;
 764                 }
 765
 766                 if ((uc[1] & 0xfc) != 0xd8) {
 767                         codepoint = uc[0] | (uc[1]<<8);
 768                         if (out_left < 3) {
 769                                 errno = E2BIG;
 770                                 goto error;
 771                         }
 772                         c[0] = 0xe0 | (codepoint >> 12);
 773                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 774                         c[2] = 0x80 | (codepoint & 0x3f);
 775
 776                         in_left  -= 2;
 777                         out_left -= 3;
 778                         uc  += 2;
 779                         c   += 3;
 780                         continue;
 781                 }
 782
 783                 /* its the first part of a 4 byte sequence */
 784                 if (in_left < 4) {
 785                         errno = EINVAL;
 786                         goto error;
 787                 }
 788                 if ((uc[3] & 0xfc) != 0xdc) {
 789                         errno = EILSEQ;
 790                         goto error;
 791                 }
 792                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
 793                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 794
 795                 if (out_left < 4) {
 796                         errno = E2BIG;
 797                         goto error;
 798                 }
 799                 c[0] = 0xf0 | (codepoint >> 18);
 800                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 801                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 802                 c[3] = 0x80 | (codepoint & 0x3f);
 803
 804                 in_left  -= 4;
 805                 out_left -= 4;
 806                 uc       += 4;
 807                 c        += 4;
 808         }
 809
 810         if (in_left == 1) {
 811                 errno = EINVAL;
 812                 goto error;
 813         }
 814
 815         if (in_left > 1) {
 816                 errno = E2BIG;
 817                 goto error;
 818         }
 819
 820         *inbytesleft = in_left;
 821         *outbytesleft = out_left;
 822         *inbuf  = (const char *)uc;
 823         *outbuf = (char *)c;
 824
 825         return 0;
 826
 827 error:
 828         *inbytesleft = in_left;
 829         *outbytesleft = out_left;
 830         *inbuf  = (const char *)uc;
 831         *outbuf = (char *)c;
 832         return -1;
 833 }
 834
 835
 836 /*
 837   this takes a UTF16 munged sequence, modifies it according to the
 838   string2key rules, and produces a UTF16 sequence
 839
 840 The rules are:
 841
 842     1) any 0x0000 characters are mapped to 0x0001
 843
 844     2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
 845        without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to
 846        U+FFFD (OBJECT REPLACEMENT CHARACTER).
 847
 848     3) the same for any low surrogate that was not preceded by a high surrogate.
 849
 850  */
 851 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 852                                char **outbuf, size_t *outbytesleft)
 853 {
 854         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 855         uint8_t *c = (uint8_t *)*outbuf;
 856         const uint8_t *uc = (const uint8_t *)*inbuf;
 857
 858         while (in_left >= 2 && out_left >= 2) {
 859                 unsigned int codepoint = uc[0] | (uc[1]<<8);
 860
 861                 if (codepoint == 0) {
 862                         codepoint = 1;
 863                 }
 864
 865                 if ((codepoint & 0xfc00) == 0xd800) {
 866                         /* a high surrogate */
 867                         unsigned int codepoint2;
 868                         if (in_left < 4) {
 869                                 codepoint = 0xfffd;
 870                                 goto codepoint16;
 871                         }
 872                         codepoint2 = uc[2] | (uc[3]<<8);
 873                         if ((codepoint2 & 0xfc00) != 0xdc00) {
 874                                 /* high surrogate not followed by low
 875                                    surrogate: convert to 0xfffd */
 876                                 codepoint = 0xfffd;
 877                                 goto codepoint16;
 878                         }
 879                         if (out_left < 4) {
 880                                 errno = E2BIG;
 881                                 goto error;
 882                         }
 883                         memcpy(c, uc, 4);
 884                         in_left  -= 4;
 885                         out_left -= 4;
 886                         uc       += 4;
 887                         c        += 4;
 888                         continue;
 889                 }
 890
 891                 if ((codepoint & 0xfc00) == 0xdc00) {
 892                         /* low surrogate not preceded by high
 893                            surrogate: convert to 0xfffd */
 894                         codepoint = 0xfffd;
 895                 }
 896
 897         codepoint16:
 898                 c[0] = codepoint & 0xFF;
 899                 c[1] = (codepoint>>8) & 0xFF;
 900
 901                 in_left  -= 2;
 902                 out_left -= 2;
 903                 uc  += 2;
 904                 c   += 2;
 905                 continue;
 906         }
 907
 908         if (in_left == 1) {
 909                 errno = EINVAL;
 910                 goto error;
 911         }
 912
 913         if (in_left > 1) {
 914                 errno = E2BIG;
 915                 goto error;
 916         }
 917
 918         *inbytesleft = in_left;
 919         *outbytesleft = out_left;
 920         *inbuf  = (const char *)uc;
 921         *outbuf = (char *)c;
 922
 923         return 0;
 924
 925 error:
 926         *inbytesleft = in_left;
 927         *outbytesleft = out_left;
 928         *inbuf  = (const char *)uc;
 929         *outbuf = (char *)c;
 930         return -1;
 931 }
 932
 933
 934