lib/util/charset/charset_macosxfs.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba charset module for Mac OS X/Darwin
   4    Copyright (C) Benjamin Riefenstahl 2003
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 /*
  21  * modules/charset_macosxfs.c
  22  *
  23  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
  24  * and display encoding.
  25  *
  26  * Actually two implementations are provided here.  The default
  27  * implementation is based on the official CFString API.  The other is
  28  * based on internal CFString APIs as defined in the OpenDarwin
  29  * source.
  30  */
  31
  32 #include "replace.h"
  33 #include "charset.h"
  34 #include "charset_proto.h"
  35 #include "lib/util/debug.h"
  36 #undef realloc
  37
  38 #ifdef DARWINOS
  39
  40 /*
  41  * Include OS frameworks.  These are only needed in this module.
  42  */
  43 #include <CoreFoundation/CFString.h>
  44
  45 /*
  46  * See if autoconf has found us the internal headers in some form.
  47  */
  48 #if defined(HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H)
  49 #       include <CoreFoundation/CFStringEncodingConverter.h>
  50 #       include <CoreFoundation/CFUnicodePrecomposition.h>
  51 #       define USE_INTERNAL_API 1
  52 #elif defined(HAVE_CFSTRINGENCODINGCONVERTER_H)
  53 #       include <CFStringEncodingConverter.h>
  54 #       include <CFUnicodePrecomposition.h>
  55 #       define USE_INTERNAL_API 1
  56 #endif
  57
  58 /*
  59  * Compile time configuration: Do we want debug output?
  60  */
  61 /* #define DEBUG_STRINGS 1 */
  62
  63 /*
  64  * A simple, but efficient memory provider for our buffers.
  65  */
  66 static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
  67 {
  68         if (newsize > *size) {
  69                 *size = newsize + 128;
  70                 buffer = realloc(buffer, *size);
  71         }
  72         return buffer;
  73 }
  74
  75 /*
  76  * While there is a version of OpenDarwin for intel, the usual case is
  77  * big-endian PPC.  So we need byte swapping to handle the
  78  * little-endian byte order of the network protocol.  We also need an
  79  * additional dynamic buffer to do this work for incoming data blocks,
  80  * because we have to consider the original data as constant.
  81  *
  82  * We abstract the differences away by providing a simple facade with
  83  * these functions/macros:
  84  *
  85  *      le_to_native(dst,src,len)
  86  *      native_to_le(cp,len)
  87  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
  88  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
  89  */
  90 #ifdef WORDS_BIGENDIAN
  91
  92 static inline void swap_bytes (char * dst, const char * src, size_t len)
  93 {
  94         const char *srcend = src + len;
  95         while (src < srcend) {
  96                 dst[0] = src[1];
  97                 dst[1] = src[0];
  98                 dst += 2;
  99                 src += 2;
 100         }
 101 }
 102 static inline void swap_bytes_inplace (char * cp, size_t len)
 103 {
 104         char temp;
 105         char *end = cp + len;
 106         while (cp  < end) {
 107                 temp = cp[1];
 108                 cp[1] = cp[0];
 109                 cp[0] = temp;
 110                 cp += 2;
 111         }
 112 }
 113
 114 #define le_to_native(dst,src,len)       swap_bytes(dst,src,len)
 115 #define native_to_le(cp,len)            swap_bytes_inplace(cp,len)
 116 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 117         set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
 118
 119 #else   /* ! WORDS_BIGENDIAN */
 120
 121 #define le_to_native(dst,src,len)       memcpy(dst,src,len)
 122 #define native_to_le(cp,len)            /* nothing */
 123 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 124         (((void)(bufsize)),(UniChar*)(data))
 125
 126 #endif
 127
 128 static inline UniChar *set_ucbuffer_with_le_copy (
 129         UniChar *buffer, size_t *bufsize,
 130         const void *data, size_t size, size_t reserve)
 131 {
 132         buffer = resize_buffer(buffer, bufsize, size+reserve);
 133         le_to_native((char*)buffer,data,size);
 134         return buffer;
 135 }
 136
 137
 138 /*
 139  * A simple hexdump function for debugging error conditions.
 140  */
 141 #define debug_out(s)    DEBUG(0,(s))
 142
 143 #ifdef DEBUG_STRINGS
 144
 145 static void hexdump( const char * label, const char * s, size_t len )
 146 {
 147         size_t restlen = len;
 148         debug_out("<<<<<<<\n");
 149         debug_out(label);
 150         debug_out("\n");
 151         while (restlen > 0) {
 152                 char line[100];
 153                 size_t i, j;
 154                 char * d = line;
 155 #undef sprintf
 156                 d += sprintf(d, "%04X ", (unsigned)(len-restlen));
 157                 *d++ = ' ';
 158                 for( i = 0; i<restlen && i<8; ++i ) {
 159                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 160                 }
 161                 for( j = i; j<8; ++j ) {
 162                         d += sprintf(d, "   ");
 163                 }
 164                 *d++ = ' ';
 165                 for( i = 8; i<restlen && i<16; ++i ) {
 166                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 167                 }
 168                 for( j = i; j<16; ++j ) {
 169                         d += sprintf(d, "   ");
 170                 }
 171                 *d++ = ' ';
 172                 for( i = 0; i<restlen && i<16; ++i ) {
 173                         if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
 174                                 *d++ = '.';
 175                         else
 176                                 *d++ = s[i];
 177                 }
 178                 *d++ = '\n';
 179                 *d = 0;
 180                 restlen -= i;
 181                 s += i;
 182                 debug_out(line);
 183         }
 184         debug_out(">>>>>>>\n");
 185 }
 186
 187 #else   /* !DEBUG_STRINGS */
 188
 189 #define hexdump(label,s,len) /* nothing */
 190
 191 #endif
 192
 193
 194 #if !USE_INTERNAL_API
 195
 196 /*
 197  * An implementation based on documented Mac OS X APIs.
 198  *
 199  * This does a certain amount of memory management, creating and
 200  * manipulating CFString objects.  We try to minimize the impact by
 201  * keeping those objects around and re-using them.  We also use
 202  * external backing store for the CFStrings where this is possible and
  203  * beneficial.
 204  *
 205  * The Unicode normalizations forms available at this level are
 206  * generic, not specifically for the file system.  So they may not be
 207  * perfect fits.
 208  */
 209 size_t macosxfs_encoding_pull(
 210         void *cd,                               /* Encoder handle */
 211         const char **inbuf, size_t *inbytesleft, /* Script string */
 212         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 213 {
 214         static const int script_code = kCFStringEncodingUTF8;
 215         static CFMutableStringRef cfstring = NULL;
 216         size_t outsize;
 217         CFRange range;
 218
 219         (void) cd; /* UNUSED */
 220
 221         if (0 == *inbytesleft) {
 222                 return 0;
 223         }
 224
 225         if (NULL == cfstring) {
 226                 /*
 227                  * A version with an external backing store as in the
 228                  * push function should have been more efficient, but
 229                  * testing shows, that it is actually slower (!).
 230                  * Maybe kCFAllocatorDefault gets shortcut evaluation
 231                  * internally, while kCFAllocatorNull doesn't.
 232                  */
 233                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
 234         }
 235
 236         /*
 237          * Three methods of appending to a CFString, choose the most
 238          * efficient.
 239          */
 240         if (0 == (*inbuf)[*inbytesleft-1]) {
 241                 CFStringAppendCString(cfstring, *inbuf, script_code);
 242         } else if (*inbytesleft <= 255) {
 243                 Str255 buffer;
 244                 buffer[0] = *inbytesleft;
 245                 memcpy(buffer+1, *inbuf, buffer[0]);
 246                 CFStringAppendPascalString(cfstring, buffer, script_code);
 247         } else {
 248                 /*
 249                  * We would like to use a fixed buffer and a loop
 250                  * here, but then we can't guarantee that the input is
 251                  * well-formed UTF-8, as we are supposed to do.
 252                  */
 253                 static char *buffer = NULL;
 254                 static size_t buflen = 0;
 255                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
 256                 memcpy(buffer, *inbuf, *inbytesleft);
 257                 buffer[*inbytesleft] = 0;
 258                 CFStringAppendCString(cfstring, *inbuf, script_code);
 259         }
 260
 261         /*
 262          * Compose characters, using the non-canonical composition
 263          * form.
 264          */
 265         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
 266
 267         outsize = CFStringGetLength(cfstring);
 268         range = CFRangeMake(0,outsize);
 269
 270         if (outsize == 0) {
 271                 /*
 272                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 273                  * errors here.  That function will always pass 2
 274                  * characters.  smbd/open.c:check_for_pipe() cuts a
 275                  * patchname to 10 characters blindly.  Suppress the
 276                  * debug output in those cases.
 277                  */
 278                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 279                         debug_out("String conversion: "
 280                                   "An unknown error occurred\n");
 281                         hexdump("UTF8->UTF16LE (old) input",
 282                                 *inbuf, *inbytesleft);
 283                 }
 284                 errno = EILSEQ; /* Not sure, but this is what we have
 285                                  * actually seen. */
 286                 return -1;
 287         }
 288         if (outsize*2 > *outbytesleft) {
 289                 CFStringDelete(cfstring, range);
 290                 debug_out("String conversion: "
 291                           "Output buffer too small\n");
 292                 hexdump("UTF8->UTF16LE (old) input",
 293                         *inbuf, *inbytesleft);
 294                 errno = E2BIG;
 295                 return -1;
 296         }
 297
 298         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
 299         CFStringDelete(cfstring, range);
 300
 301         native_to_le(*outbuf, outsize*2);
 302
 303         /*
 304          * Add a converted null byte, if the CFString conversions
 305          * prevented that until now.
 306          */
 307         if (0 == (*inbuf)[*inbytesleft-1] &&
 308             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
 309
 310                 if ((outsize*2+2) > *outbytesleft) {
 311                         debug_out("String conversion: "
 312                                   "Output buffer too small\n");
 313                         hexdump("UTF8->UTF16LE (old) input",
 314                                 *inbuf, *inbytesleft);
 315                         errno = E2BIG;
 316                         return -1;
 317                 }
 318
 319                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
 320                 outsize += 2;
 321         }
 322
 323         *inbuf += *inbytesleft;
 324         *inbytesleft = 0;
 325         *outbuf += outsize*2;
 326         *outbytesleft -= outsize*2;
 327
 328         return 0;
 329 }
 330
 331 size_t macosxfs_encoding_push(
 332         void *cd,                               /* Encoder handle */
 333         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
 334         char **outbuf, size_t *outbytesleft)    /* Script string */
 335 {
 336         static const int script_code = kCFStringEncodingUTF8;
 337         static CFMutableStringRef cfstring = NULL;
 338         static UniChar *buffer = NULL;
 339         static size_t buflen = 0;
 340         CFIndex outsize, cfsize, charsconverted;
 341
 342         (void) cd; /* UNUSED */
 343
 344         if (0 == *inbytesleft) {
 345                 return 0;
 346         }
 347
 348         /*
 349          * We need a buffer that can hold 4 times the original data,
 350          * because that is the theoretical maximum that decomposition
 351          * can create currently (in Unicode 4.0).
 352          */
 353         buffer = set_ucbuffer_with_le_copy(
 354                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
 355
 356         if (NULL == cfstring) {
 357                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
 358                         kCFAllocatorDefault,
 359                         buffer, *inbytesleft/2, buflen/2,
 360                         kCFAllocatorNull);
 361         } else {
 362                 CFStringSetExternalCharactersNoCopy(
 363                         cfstring,
 364                         buffer, *inbytesleft/2, buflen/2);
 365         }
 366
 367         /*
 368          * Decompose characters, using the non-canonical decomposition
 369          * form.
 370          *
 371          * NB: This isn't exactly what HFS+ wants (see note on
 372          * kCFStringEncodingUseHFSPlusCanonical in
 373          * CFStringEncodingConverter.h), but AFAIK it's the best that
 374          * the official API can do.
 375          */
 376         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
 377
 378         cfsize = CFStringGetLength(cfstring);
 379         charsconverted = CFStringGetBytes(
 380                 cfstring, CFRangeMake(0,cfsize),
 381                 script_code, 0, false,
 382                 *(UInt8 **)outbuf, *outbytesleft, &outsize);
 383
 384         if (0 == charsconverted) {
 385                 debug_out("String conversion: "
 386                           "Buffer too small or not convertible\n");
 387                 hexdump("UTF16LE->UTF8 (old) input",
 388                         *inbuf, *inbytesleft);
 389                 errno = EILSEQ; /* Probably more likely. */
 390                 return -1;
 391         }
 392
 393         /*
 394          * Add a converted null byte, if the CFString conversions
 395          * prevented that until now.
 396          */
 397         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
 398             (0 != (*outbuf)[outsize-1])) {
 399
 400                 if (((size_t)outsize+1) > *outbytesleft) {
 401                         debug_out("String conversion: "
 402                                   "Output buffer too small\n");
 403                         hexdump("UTF16LE->UTF8 (old) input",
 404                                 *inbuf, *inbytesleft);
 405                         errno = E2BIG;
 406                         return -1;
 407                 }
 408
 409                 (*outbuf)[outsize] = 0;
 410                 ++outsize;
 411         }
 412
 413         *inbuf += *inbytesleft;
 414         *inbytesleft = 0;
 415         *outbuf += outsize;
 416         *outbytesleft -= outsize;
 417
 418         return 0;
 419 }
 420
 421 #else /* USE_INTERNAL_API */
 422
 423 /*
 424  * An implementation based on internal code as known from the
 425  * OpenDarwin CVS.
 426  *
 427  * This code doesn't need much memory management because it uses
 428  * functions that operate on the raw memory directly.
 429  *
 430  * The push routine here is faster and more compatible with HFS+ than
 431  * the other implementation above.  The pull routine is only faster
 432  * for some strings, slightly slower for others.  The pull routine
  433  * loses because it has to iterate over the data twice, once to
  434  * decode UTF-8 and then to do the character composition required by
 435  * Windows.
 436  */
 437 static size_t macosxfs_encoding_pull(
 438         void *cd,                               /* Encoder handle */
 439         const char **inbuf, size_t *inbytesleft, /* Script string */
 440         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 441 {
 442         static const int script_code = kCFStringEncodingUTF8;
 443         UInt32 srcCharsUsed = 0;
 444         UInt32 dstCharsUsed = 0;
 445         UInt32 result;
 446         uint32_t dstDecomposedUsed = 0;
 447         uint32_t dstPrecomposedUsed = 0;
 448
 449         (void) cd; /* UNUSED */
 450
 451         if (0 == *inbytesleft) {
 452                 return 0;
 453         }
 454
 455         result = CFStringEncodingBytesToUnicode(
 456                 script_code, kCFStringEncodingComposeCombinings,
 457                 *inbuf, *inbytesleft, &srcCharsUsed,
 458                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
 459
 460         switch(result) {
 461         case kCFStringEncodingConversionSuccess:
 462                 if (*inbytesleft == srcCharsUsed) {
 463                         break;
 464                 }
 465
 466                 FALL_THROUGH;
 467         case kCFStringEncodingInsufficientOutputBufferLength:
 468                 debug_out("String conversion: "
 469                           "Output buffer too small\n");
 470                 hexdump("UTF8->UTF16LE (new) input",
 471                         *inbuf, *inbytesleft);
 472                 errno = E2BIG;
 473                 return -1;
 474         case kCFStringEncodingInvalidInputStream:
 475                 /*
 476                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 477                  * errors here.  That function will always pass 2
 478                  * characters.  smbd/open.c:check_for_pipe() cuts a
 479                  * patchname to 10 characters blindly.  Suppress the
 480                  * debug output in those cases.
 481                  */
 482                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 483                         debug_out("String conversion: "
 484                                   "Invalid input sequence\n");
 485                         hexdump("UTF8->UTF16LE (new) input",
 486                                 *inbuf, *inbytesleft);
 487                 }
 488                 errno = EILSEQ;
 489                 return -1;
 490         case kCFStringEncodingConverterUnavailable:
 491                 debug_out("String conversion: "
 492                           "Unknown encoding\n");
 493                 hexdump("UTF8->UTF16LE (new) input",
 494                         *inbuf, *inbytesleft);
 495                 errno = EINVAL;
 496                 return -1;
 497         }
 498
 499         /*
 500          * It doesn't look like CFStringEncodingBytesToUnicode() can
 501          * produce precomposed characters (flags=ComposeCombinings
 502          * doesn't do it), so we need another pass over the data here.
 503          * We can do this in-place, as the string can only get
 504          * shorter.
 505          *
 506          * (Actually in theory there should be an internal
 507          * decomposition and reordering before the actual composition
 508          * step.  But we should be able to rely on that we always get
 509          * fully decomposed strings for input, so this can't create
 510          * problems in reality.)
 511          */
 512         CFUniCharPrecompose(
 513                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
 514                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
 515
 516         native_to_le(*outbuf, dstPrecomposedUsed*2);
 517
 518         *inbuf += srcCharsUsed;
 519         *inbytesleft -= srcCharsUsed;
 520         *outbuf += dstPrecomposedUsed*2;
 521         *outbytesleft -= dstPrecomposedUsed*2;
 522
 523         return 0;
 524 }
 525
 526 static size_t macosxfs_encoding_push(
 527         void *cd,                               /* Encoder handle */
 528         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
 529         char **outbuf, size_t *outbytesleft)    /* Script string */
 530 {
 531         static const int script_code = kCFStringEncodingUTF8;
 532         static UniChar *buffer = NULL;
 533         static size_t buflen = 0;
 534         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
 535
 536         (void) cd; /* UNUSED */
 537
 538         if (0 == *inbytesleft) {
 539                 return 0;
 540         }
 541
 542         buffer = set_ucbuffer_with_le(
 543                 buffer, &buflen, *inbuf, *inbytesleft);
 544
 545         result = CFStringEncodingUnicodeToBytes(
 546                 script_code, kCFStringEncodingUseHFSPlusCanonical,
 547                 buffer, *inbytesleft/2, &srcCharsUsed,
 548                 *outbuf, *outbytesleft, &dstCharsUsed);
 549
 550         switch(result) {
 551         case kCFStringEncodingConversionSuccess:
 552                 if (*inbytesleft/2 == srcCharsUsed) {
 553                         break;
 554                 }
 555
 556                 FALL_THROUGH;
 557         case kCFStringEncodingInsufficientOutputBufferLength:
 558                 debug_out("String conversion: "
 559                           "Output buffer too small\n");
 560                 hexdump("UTF16LE->UTF8 (new) input",
 561                         *inbuf, *inbytesleft);
 562                 errno = E2BIG;
 563                 return -1;
 564         case kCFStringEncodingInvalidInputStream:
 565                 /*
 566                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
 567                  * cuts a pathname to 10 characters blindly.  Suppress
 568                  * the debug output in those cases.
 569                  */
 570                 if(10 != *inbytesleft) {
 571                         debug_out("String conversion: "
 572                                   "Invalid input sequence\n");
 573                         hexdump("UTF16LE->UTF8 (new) input",
 574                                 *inbuf, *inbytesleft);
 575                 }
 576                 errno = EILSEQ;
 577                 return -1;
 578         case kCFStringEncodingConverterUnavailable:
 579                 debug_out("String conversion: "
 580                           "Unknown encoding\n");
 581                 hexdump("UTF16LE->UTF8 (new) input",
 582                         *inbuf, *inbytesleft);
 583                 errno = EINVAL;
 584                 return -1;
 585         }
 586
 587         *inbuf += srcCharsUsed*2;
 588         *inbytesleft -= srcCharsUsed*2;
 589         *outbuf += dstCharsUsed;
 590         *outbytesleft -= dstCharsUsed;
 591
 592         return 0;
 593 }
 594
 595 #endif /* USE_INTERNAL_API */
 596
 597 #else /* DARWIN */
 598
 599 void charset_macosfs_dummy(void);
 600 void charset_macosfs_dummy(void)
 601 {
 602         return;
 603 }
 604
 605 #endif /* DARWIN */