lib/util/charset/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "../lib/util/dlinklist.h"
  23 #include "system/iconv.h"
  24 #include "system/filesys.h"
  25 #include "charset_proto.h"
  26
  27 #ifdef strcasecmp
  28 #undef strcasecmp
  29 #endif
  30
  31 /**
  32  * @file
  33  *
  34  * @brief Samba wrapper/stub for iconv character set conversion.
  35  *
  36  * iconv is the XPG2 interface for converting between character
  37  * encodings.  This file provides a Samba wrapper around it, and also
  38  * a simple reimplementation that is used if the system does not
  39  * implement iconv.
  40  *
  41  * Samba only works with encodings that are supersets of ASCII: ascii
  42  * characters like whitespace can be tested for directly, multibyte
  43  * sequences start with a byte with the high bit set, and strings are
  44  * terminated by a nul byte.
  45  *
  46  * Note that the only function provided by iconv is conversion between
  47  * characters.  It doesn't directly support operations like
  48  * uppercasing or comparison.  We have to convert to UTF-16LE and
  49  * compare there.
  50  *
  51  * @sa Samba Developers Guide
  52  **/
  53
  54 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  55 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  56 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
  57 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
  58 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  59 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  60 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
  61 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  62 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  63 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  64 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  65
  66 static const struct charset_functions builtin_functions[] = {
  67         /* windows is closest to UTF-16 */
  68         {"UCS-2LE",  iconv_copy, iconv_copy},
  69         {"UTF-16LE",  iconv_copy, iconv_copy},
  70         {"UCS-2BE",  iconv_swab, iconv_swab},
  71         {"UTF-16BE",  iconv_swab, iconv_swab},
  72
  73         /* we include the UTF-8 alias to cope with differing locale settings */
  74         {"UTF8",   utf8_pull,  utf8_push},
  75         {"UTF-8",   utf8_pull,  utf8_push},
  76
  77         /* this handles the munging needed for String2Key */
  78         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy, true},
  79
  80         {"ASCII", ascii_pull, ascii_push},
  81         {"646", ascii_pull, ascii_push},
  82         {"ISO-8859-1", latin1_pull, latin1_push},
  83 #ifdef DEVELOPER
  84         {"WEIRD", weird_pull, weird_push, true},
  85 #endif
  86 #ifdef DARWINOS
  87         {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
  88 #endif
  89         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
  90
  91 };
  92
  93 #ifdef HAVE_NATIVE_ICONV
  94 /* if there was an error then reset the internal state,
  95    this ensures that we don't have a shift state remaining for
  96    character sets like SJIS */
  97 static size_t sys_iconv(void *cd,
  98                         const char **inbuf, size_t *inbytesleft,
  99                         char **outbuf, size_t *outbytesleft)
 100 {
 101         size_t ret = iconv((iconv_t)cd,
 102                            discard_const_p(char *, inbuf), inbytesleft,
 103                            outbuf, outbytesleft);
 104         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 105         return ret;
 106 }
 107 #endif
 108
 109 /**
 110  * This is a simple portable iconv() implementaion.
 111  *
 112  * It only knows about a very small number of character sets - just
 113  * enough that Samba works on systems that don't have iconv.
 114  **/
 115 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
 116                  const char **inbuf, size_t *inbytesleft,
 117                  char **outbuf, size_t *outbytesleft)
 118 {
 119         /* in many cases we can go direct */
 120         if (cd->direct) {
 121                 return cd->direct(cd->cd_direct,
 122                                   inbuf, inbytesleft, outbuf, outbytesleft);
 123         }
 124
 125         /* otherwise we have to do it chunks at a time */
 126         {
 127 #ifndef SMB_ICONV_BUFSIZE
 128 #define SMB_ICONV_BUFSIZE 2048
 129 #endif
 130                 size_t bufsize;
 131                 char cvtbuf[SMB_ICONV_BUFSIZE];
 132
 133                 while (*inbytesleft > 0) {
 134                         char *bufp1 = cvtbuf;
 135                         const char *bufp2 = cvtbuf;
 136                         int saved_errno = errno;
 137                         bool pull_failed = false;
 138                         bufsize = SMB_ICONV_BUFSIZE;
 139
 140                         if (cd->pull(cd->cd_pull,
 141                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
 142                             && errno != E2BIG) {
 143                                 saved_errno = errno;
 144                                 pull_failed = true;
 145                         }
 146
 147                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
 148
 149                         if (cd->push(cd->cd_push,
 150                                      &bufp2, &bufsize,
 151                                      outbuf, outbytesleft) == -1) {
 152                                 return -1;
 153                         } else if (pull_failed) {
 154                                 /* We want the pull errno if possible */
 155                                 errno = saved_errno;
 156                                 return -1;
 157                         }
 158                 }
 159         }
 160
 161         return 0;
 162 }
 163
 164 static bool is_utf16(const char *name)
 165 {
 166         return strcasecmp(name, "UCS-2LE") == 0 ||
 167                 strcasecmp(name, "UTF-16LE") == 0;
 168 }
 169
 170 static int smb_iconv_t_destructor(smb_iconv_t hwd)
 171 {
 172 #ifdef HAVE_NATIVE_ICONV
 173         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 174                 iconv_close(hwd->cd_pull);
 175         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
 176                 iconv_close(hwd->cd_push);
 177         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
 178                 iconv_close(hwd->cd_direct);
 179 #endif
 180
 181         return 0;
 182 }
 183
 184 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 185                               const char *fromcode, bool use_builtin_handlers)
 186 {
 187         smb_iconv_t ret;
 188         const struct charset_functions *from=NULL, *to=NULL;
 189         int i;
 190
 191         ret = (smb_iconv_t)talloc_named(mem_ctx,
 192                                         sizeof(*ret),
 193                                         "iconv(%s,%s)", tocode, fromcode);
 194         if (!ret) {
 195                 errno = ENOMEM;
 196                 return (smb_iconv_t)-1;
 197         }
 198         memset(ret, 0, sizeof(*ret));
 199         talloc_set_destructor(ret, smb_iconv_t_destructor);
 200
 201         /* check for the simplest null conversion */
 202         if (strcmp(fromcode, tocode) == 0) {
 203                 ret->direct = iconv_copy;
 204                 return ret;
 205         }
 206
 207         /* check if we have a builtin function for this conversion */
 208         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 209                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 210                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 211                                 from = &builtin_functions[i];
 212                         }
 213                 }
 214                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 215                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 216                                 to = &builtin_functions[i];
 217                         }
 218                 }
 219         }
 220
 221 #ifdef HAVE_NATIVE_ICONV
 222         /* the from and to varaibles indicate a samba module or
 223          * internal conversion, ret->pull and ret->push are
 224          * initialised only in this block for iconv based
 225          * conversions */
 226
 227         if (from == NULL) {
 228                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 229                 if (ret->cd_pull == (iconv_t)-1)
 230                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 231                 if (ret->cd_pull != (iconv_t)-1) {
 232                         ret->pull = sys_iconv;
 233                 }
 234         }
 235
 236         if (to == NULL) {
 237                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 238                 if (ret->cd_push == (iconv_t)-1)
 239                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 240                 if (ret->cd_push != (iconv_t)-1) {
 241                         ret->push = sys_iconv;
 242                 }
 243         }
 244 #endif
 245
 246         if (ret->pull == NULL && from == NULL) {
 247                 goto failed;
 248         }
 249
 250         if (ret->push == NULL && to == NULL) {
 251                 goto failed;
 252         }
 253
 254         /* check for conversion to/from ucs2 */
 255         if (is_utf16(fromcode) && to) {
 256                 ret->direct = to->push;
 257                 return ret;
 258         }
 259         if (is_utf16(tocode) && from) {
 260                 ret->direct = from->pull;
 261                 return ret;
 262         }
 263
 264 #ifdef HAVE_NATIVE_ICONV
 265         if (is_utf16(fromcode)) {
 266                 ret->direct = sys_iconv;
 267                 ret->cd_direct = ret->cd_push;
 268                 ret->cd_push = NULL;
 269                 return ret;
 270         }
 271         if (is_utf16(tocode)) {
 272                 ret->direct = sys_iconv;
 273                 ret->cd_direct = ret->cd_pull;
 274                 ret->cd_pull = NULL;
 275                 return ret;
 276         }
 277 #endif
 278
 279         /* the general case has to go via a buffer */
 280         if (!ret->pull) ret->pull = from->pull;
 281         if (!ret->push) ret->push = to->push;
 282         return ret;
 283
 284 failed:
 285         talloc_free(ret);
 286         errno = EINVAL;
 287         return (smb_iconv_t)-1;
 288 }
 289
 290 /*
 291   simple iconv_open() wrapper
 292  */
 293 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
 294 {
 295         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
 296 }
 297
 298 /*
 299   simple iconv_close() wrapper
 300 */
 301 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
 302 {
 303         talloc_free(cd);
 304         return 0;
 305 }
 306
 307
 308 /**********************************************************************
 309  the following functions implement the builtin character sets in Samba
 310  and also the "test" character sets that are designed to test
 311  multi-byte character set support for english users
 312 ***********************************************************************/
 313
 314 /*
 315   this takes an ASCII sequence and produces a UTF16 sequence
 316
 317   The first 127 codepoints of latin1 matches the first 127 codepoints
 318   of unicode, and so can be put into the first byte of UTF16LE
 319
 320  */
 321
 322 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 323                          char **outbuf, size_t *outbytesleft)
 324 {
 325         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 326                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
 327                         /* If this is multi-byte, then it isn't legal ASCII */
 328                         errno = EILSEQ;
 329                         return -1;
 330                 }
 331                 (*outbuf)[0] = (*inbuf)[0];
 332                 (*outbuf)[1] = 0;
 333                 (*inbytesleft)  -= 1;
 334                 (*outbytesleft) -= 2;
 335                 (*inbuf)  += 1;
 336                 (*outbuf) += 2;
 337         }
 338
 339         if (*inbytesleft > 0) {
 340                 errno = E2BIG;
 341                 return -1;
 342         }
 343
 344         return 0;
 345 }
 346
 347 /*
 348   this takes a UTF16 sequence and produces an ASCII sequence
 349
 350   The first 127 codepoints of ASCII matches the first 127 codepoints
 351   of unicode, and so can be read directly from the first byte of UTF16LE
 352
 353  */
 354 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 355                          char **outbuf, size_t *outbytesleft)
 356 {
 357         int ir_count=0;
 358
 359         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 360                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
 361                         (*inbuf)[1] != 0) {
 362                         /* If this is multi-byte, then it isn't legal ASCII */
 363                         errno = EILSEQ;
 364                         return -1;
 365                 }
 366                 (*outbuf)[0] = (*inbuf)[0];
 367                 (*inbytesleft)  -= 2;
 368                 (*outbytesleft) -= 1;
 369                 (*inbuf)  += 2;
 370                 (*outbuf) += 1;
 371         }
 372
 373         if (*inbytesleft == 1) {
 374                 errno = EINVAL;
 375                 return -1;
 376         }
 377
 378         if (*inbytesleft > 1) {
 379                 errno = E2BIG;
 380                 return -1;
 381         }
 382
 383         return ir_count;
 384 }
 385
 386 /*
 387   this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
 388
 389   The first 256 codepoints of latin1 matches the first 256 codepoints
 390   of unicode, and so can be put into the first byte of UTF16LE
 391
 392  */
 393 static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 394                           char **outbuf, size_t *outbytesleft)
 395 {
 396         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 397                 (*outbuf)[0] = (*inbuf)[0];
 398                 (*outbuf)[1] = 0;
 399                 (*inbytesleft)  -= 1;
 400                 (*outbytesleft) -= 2;
 401                 (*inbuf)  += 1;
 402                 (*outbuf) += 2;
 403         }
 404
 405         if (*inbytesleft > 0) {
 406                 errno = E2BIG;
 407                 return -1;
 408         }
 409
 410         return 0;
 411 }
 412
 413 /*
 414   this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
 415
 416   The first 256 codepoints of latin1 matches the first 256 codepoints
 417   of unicode, and so can be read directly from the first byte of UTF16LE
 418
 419  */
 420 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
 421                          char **outbuf, size_t *outbytesleft)
 422 {
 423         int ir_count=0;
 424
 425         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 426                 (*outbuf)[0] = (*inbuf)[0];
 427                 if ((*inbuf)[1] != 0) {
 428                         /* If this is multi-byte, then it isn't legal latin1 */
 429                         errno = EILSEQ;
 430                         return -1;
 431                 }
 432                 (*inbytesleft)  -= 2;
 433                 (*outbytesleft) -= 1;
 434                 (*inbuf)  += 2;
 435                 (*outbuf) += 1;
 436         }
 437
 438         if (*inbytesleft == 1) {
 439                 errno = EINVAL;
 440                 return -1;
 441         }
 442
 443         if (*inbytesleft > 1) {
 444                 errno = E2BIG;
 445                 return -1;
 446         }
 447
 448         return ir_count;
 449 }
 450
 451 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 452                          char **outbuf, size_t *outbytesleft)
 453 {
 454         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 455                 unsigned int v;
 456
 457                 if ((*inbuf)[0] != '@') {
 458                         /* seven bit ascii case */
 459                         (*outbuf)[0] = (*inbuf)[0];
 460                         (*outbuf)[1] = 0;
 461                         (*inbytesleft)  -= 1;
 462                         (*outbytesleft) -= 2;
 463                         (*inbuf)  += 1;
 464                         (*outbuf) += 2;
 465                         continue;
 466                 }
 467                 /* it's a hex character */
 468                 if (*inbytesleft < 5) {
 469                         errno = EINVAL;
 470                         return -1;
 471                 }
 472
 473                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 474                         errno = EILSEQ;
 475                         return -1;
 476                 }
 477
 478                 (*outbuf)[0] = v&0xff;
 479                 (*outbuf)[1] = v>>8;
 480                 (*inbytesleft)  -= 5;
 481                 (*outbytesleft) -= 2;
 482                 (*inbuf)  += 5;
 483                 (*outbuf) += 2;
 484         }
 485
 486         if (*inbytesleft > 0) {
 487                 errno = E2BIG;
 488                 return -1;
 489         }
 490
 491         return 0;
 492 }
 493
 494 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 495                            char **outbuf, size_t *outbytesleft)
 496 {
 497         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 498                 char buf[6];
 499
 500                 if ((*inbuf)[1] == 0 &&
 501                     ((*inbuf)[0] & 0x80) == 0 &&
 502                     (*inbuf)[0] != '@') {
 503                         (*outbuf)[0] = (*inbuf)[0];
 504                         (*inbytesleft)  -= 2;
 505                         (*outbytesleft) -= 1;
 506                         (*inbuf)  += 2;
 507                         (*outbuf) += 1;
 508                         continue;
 509                 }
 510                 if (*outbytesleft < 5) {
 511                         errno = E2BIG;
 512                         return -1;
 513                 }
 514                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 515                 memcpy(*outbuf, buf, 5);
 516                 (*inbytesleft)  -= 2;
 517                 (*outbytesleft) -= 5;
 518                 (*inbuf)  += 2;
 519                 (*outbuf) += 5;
 520         }
 521
 522         if (*inbytesleft == 1) {
 523                 errno = EINVAL;
 524                 return -1;
 525         }
 526
 527         if (*inbytesleft > 1) {
 528                 errno = E2BIG;
 529                 return -1;
 530         }
 531
 532         return 0;
 533 }
 534
 535 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 536                          char **outbuf, size_t *outbytesleft)
 537 {
 538         int n;
 539
 540         n = MIN(*inbytesleft, *outbytesleft);
 541
 542         swab(*inbuf, *outbuf, (n&~1));
 543         if (n&1) {
 544                 (*outbuf)[n-1] = 0;
 545         }
 546
 547         (*inbytesleft) -= n;
 548         (*outbytesleft) -= n;
 549         (*inbuf) += n;
 550         (*outbuf) += n;
 551
 552         if (*inbytesleft > 0) {
 553                 errno = E2BIG;
 554                 return -1;
 555         }
 556
 557         return 0;
 558 }
 559
 560
 561 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 562                          char **outbuf, size_t *outbytesleft)
 563 {
 564         int n;
 565
 566         n = MIN(*inbytesleft, *outbytesleft);
 567
 568         memmove(*outbuf, *inbuf, n);
 569
 570         (*inbytesleft) -= n;
 571         (*outbytesleft) -= n;
 572         (*inbuf) += n;
 573         (*outbuf) += n;
 574
 575         if (*inbytesleft > 0) {
 576                 errno = E2BIG;
 577                 return -1;
 578         }
 579
 580         return 0;
 581 }
 582
 583 /*
 584   this takes a UTF8 sequence and produces a UTF16 sequence
 585  */
 586 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 587                          char **outbuf, size_t *outbytesleft)
 588 {
 589         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 590         const uint8_t *c = (const uint8_t *)*inbuf;
 591         uint8_t *uc = (uint8_t *)*outbuf;
 592
 593         while (in_left >= 1 && out_left >= 2) {
 594                 if ((c[0] & 0x80) == 0) {
 595                         uc[0] = c[0];
 596                         uc[1] = 0;
 597                         c  += 1;
 598                         in_left  -= 1;
 599                         out_left -= 2;
 600                         uc += 2;
 601                         continue;
 602                 }
 603
 604                 if ((c[0] & 0xe0) == 0xc0) {
 605                         if (in_left < 2 ||
 606                             (c[1] & 0xc0) != 0x80) {
 607                                 errno = EILSEQ;
 608                                 goto error;
 609                         }
 610                         uc[1] = (c[0]>>2) & 0x7;
 611                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 612                         c  += 2;
 613                         in_left  -= 2;
 614                         out_left -= 2;
 615                         uc += 2;
 616                         continue;
 617                 }
 618
 619                 if ((c[0] & 0xf0) == 0xe0) {
 620                         if (in_left < 3 ||
 621                             (c[1] & 0xc0) != 0x80 ||
 622                             (c[2] & 0xc0) != 0x80) {
 623                                 errno = EILSEQ;
 624                                 goto error;
 625                         }
 626                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
 627                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
 628                         c  += 3;
 629                         in_left  -= 3;
 630                         out_left -= 2;
 631                         uc += 2;
 632                         continue;
 633                 }
 634
 635                 if ((c[0] & 0xf8) == 0xf0) {
 636                         unsigned int codepoint;
 637                         if (in_left < 4 ||
 638                             (c[1] & 0xc0) != 0x80 ||
 639                             (c[2] & 0xc0) != 0x80 ||
 640                             (c[3] & 0xc0) != 0x80) {
 641                                 errno = EILSEQ;
 642                                 goto error;
 643                         }
 644                         codepoint =
 645                                 (c[3]&0x3f) |
 646                                 ((c[2]&0x3f)<<6) |
 647                                 ((c[1]&0x3f)<<12) |
 648                                 ((c[0]&0x7)<<18);
 649                         if (codepoint < 0x10000) {
 650                                 /* accept UTF-8 characters that are not
 651                                    minimally packed, but pack the result */
 652                                 uc[0] = (codepoint & 0xFF);
 653                                 uc[1] = (codepoint >> 8);
 654                                 c += 4;
 655                                 in_left -= 4;
 656                                 out_left -= 2;
 657                                 uc += 2;
 658                                 continue;
 659                         }
 660
 661                         codepoint -= 0x10000;
 662
 663                         if (out_left < 4) {
 664                                 errno = E2BIG;
 665                                 goto error;
 666                         }
 667
 668                         uc[0] = (codepoint>>10) & 0xFF;
 669                         uc[1] = (codepoint>>18) | 0xd8;
 670                         uc[2] = codepoint & 0xFF;
 671                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 672                         c  += 4;
 673                         in_left  -= 4;
 674                         out_left -= 4;
 675                         uc += 4;
 676                         continue;
 677                 }
 678
 679                 /* we don't handle 5 byte sequences */
 680                 errno = EINVAL;
 681                 goto error;
 682         }
 683
 684         if (in_left > 0) {
 685                 errno = E2BIG;
 686                 goto error;
 687         }
 688
 689         *inbytesleft = in_left;
 690         *outbytesleft = out_left;
 691         *inbuf = (const char *)c;
 692         *outbuf = (char *)uc;
 693         return 0;
 694
 695 error:
 696         *inbytesleft = in_left;
 697         *outbytesleft = out_left;
 698         *inbuf = (const char *)c;
 699         *outbuf = (char *)uc;
 700         return -1;
 701 }
 702
 703
 704 /*
 705   this takes a UTF16 sequence and produces a UTF8 sequence
 706  */
 707 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 708                         char **outbuf, size_t *outbytesleft)
 709 {
 710         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 711         uint8_t *c = (uint8_t *)*outbuf;
 712         const uint8_t *uc = (const uint8_t *)*inbuf;
 713
 714         while (in_left >= 2 && out_left >= 1) {
 715                 unsigned int codepoint;
 716
 717                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 718                         /* simplest case */
 719                         c[0] = uc[0];
 720                         in_left  -= 2;
 721                         out_left -= 1;
 722                         uc += 2;
 723                         c  += 1;
 724                         continue;
 725                 }
 726
 727                 if ((uc[1]&0xf8) == 0) {
 728                         /* next simplest case */
 729                         if (out_left < 2) {
 730                                 errno = E2BIG;
 731                                 goto error;
 732                         }
 733                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 734                         c[1] = 0x80 | (uc[0] & 0x3f);
 735                         in_left  -= 2;
 736                         out_left -= 2;
 737                         uc += 2;
 738                         c  += 2;
 739                         continue;
 740                 }
 741
 742                 if ((uc[1] & 0xfc) == 0xdc) {
 743                         /* its the second part of a 4 byte sequence. Illegal */
 744                         if (in_left < 4) {
 745                                 errno = EINVAL;
 746                         } else {
 747                                 errno = EILSEQ;
 748                         }
 749                         goto error;
 750                 }
 751
 752                 if ((uc[1] & 0xfc) != 0xd8) {
 753                         codepoint = uc[0] | (uc[1]<<8);
 754                         if (out_left < 3) {
 755                                 errno = E2BIG;
 756                                 goto error;
 757                         }
 758                         c[0] = 0xe0 | (codepoint >> 12);
 759                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 760                         c[2] = 0x80 | (codepoint & 0x3f);
 761
 762                         in_left  -= 2;
 763                         out_left -= 3;
 764                         uc  += 2;
 765                         c   += 3;
 766                         continue;
 767                 }
 768
 769                 /* its the first part of a 4 byte sequence */
 770                 if (in_left < 4) {
 771                         errno = EINVAL;
 772                         goto error;
 773                 }
 774                 if ((uc[3] & 0xfc) != 0xdc) {
 775                         errno = EILSEQ;
 776                         goto error;
 777                 }
 778                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
 779                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 780
 781                 if (out_left < 4) {
 782                         errno = E2BIG;
 783                         goto error;
 784                 }
 785                 c[0] = 0xf0 | (codepoint >> 18);
 786                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 787                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 788                 c[3] = 0x80 | (codepoint & 0x3f);
 789
 790                 in_left  -= 4;
 791                 out_left -= 4;
 792                 uc       += 4;
 793                 c        += 4;
 794         }
 795
 796         if (in_left == 1) {
 797                 errno = EINVAL;
 798                 goto error;
 799         }
 800
 801         if (in_left > 1) {
 802                 errno = E2BIG;
 803                 goto error;
 804         }
 805
 806         *inbytesleft = in_left;
 807         *outbytesleft = out_left;
 808         *inbuf  = (const char *)uc;
 809         *outbuf = (char *)c;
 810
 811         return 0;
 812
 813 error:
 814         *inbytesleft = in_left;
 815         *outbytesleft = out_left;
 816         *inbuf  = (const char *)uc;
 817         *outbuf = (char *)c;
 818         return -1;
 819 }
 820
 821
 822 /*
 823   this takes a UTF16 munged sequence, modifies it according to the
 824   string2key rules, and produces a UTF16 sequence
 825
 826 The rules are:
 827
 828     1) any 0x0000 characters are mapped to 0x0001
 829
 830     2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
 831        without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to
 832        U+FFFD (OBJECT REPLACEMENT CHARACTER).
 833
 834     3) the same for any low surrogate that was not preceded by a high surrogate.
 835
 836  */
 837 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 838                                char **outbuf, size_t *outbytesleft)
 839 {
 840         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 841         uint8_t *c = (uint8_t *)*outbuf;
 842         const uint8_t *uc = (const uint8_t *)*inbuf;
 843
 844         while (in_left >= 2 && out_left >= 2) {
 845                 unsigned int codepoint = uc[0] | (uc[1]<<8);
 846
 847                 if (codepoint == 0) {
 848                         codepoint = 1;
 849                 }
 850
 851                 if ((codepoint & 0xfc00) == 0xd800) {
 852                         /* a high surrogate */
 853                         unsigned int codepoint2;
 854                         if (in_left < 4) {
 855                                 codepoint = 0xfffd;
 856                                 goto codepoint16;
 857                         }
 858                         codepoint2 = uc[2] | (uc[3]<<8);
 859                         if ((codepoint2 & 0xfc00) != 0xdc00) {
 860                                 /* high surrogate not followed by low
 861                                    surrogate: convert to 0xfffd */
 862                                 codepoint = 0xfffd;
 863                                 goto codepoint16;
 864                         }
 865                         if (out_left < 4) {
 866                                 errno = E2BIG;
 867                                 goto error;
 868                         }
 869                         memcpy(c, uc, 4);
 870                         in_left  -= 4;
 871                         out_left -= 4;
 872                         uc       += 4;
 873                         c        += 4;
 874                         continue;
 875                 }
 876
 877                 if ((codepoint & 0xfc00) == 0xdc00) {
 878                         /* low surrogate not preceded by high
 879                            surrogate: convert to 0xfffd */
 880                         codepoint = 0xfffd;
 881                 }
 882
 883         codepoint16:
 884                 c[0] = codepoint & 0xFF;
 885                 c[1] = (codepoint>>8) & 0xFF;
 886
 887                 in_left  -= 2;
 888                 out_left -= 2;
 889                 uc  += 2;
 890                 c   += 2;
 891                 continue;
 892         }
 893
 894         if (in_left == 1) {
 895                 errno = EINVAL;
 896                 goto error;
 897         }
 898
 899         if (in_left > 1) {
 900                 errno = E2BIG;
 901                 goto error;
 902         }
 903
 904         *inbytesleft = in_left;
 905         *outbytesleft = out_left;
 906         *inbuf  = (const char *)uc;
 907         *outbuf = (char *)c;
 908
 909         return 0;
 910
 911 error:
 912         *inbytesleft = in_left;
 913         *outbytesleft = out_left;
 914         *inbuf  = (const char *)uc;
 915         *outbuf = (char *)c;
 916         return -1;
 917 }
 918
 919
 920