source/modules/charset_macosxfs.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba charset module for Mac OS X/Darwin
   4    Copyright (C) Benjamin Riefenstahl 2003
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 /*
  21  * modules/charset_macosxfs.c
  22  *
  23  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
  24  * and display encoding.
  25  *
  26  * Actually two implementations are provided here.  The default
  27  * implementation is based on the official CFString API.  The other is
  28  * based on internal CFString APIs as defined in the OpenDarwin
  29  * source.
  30  */
  31
  32 #include "includes.h"
  33
  34 /*
  35  * Include OS frameworks.  These are only needed in this module.
  36  */
  37 #include <CoreFoundation/CFString.h>
  38
  39 /*
  40  * See if autoconf has found us the internal headers in some form.
  41  */
  42 #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
  43 #       include <CoreFoundation/CFStringEncodingConverter.h>
  44 #       include <CoreFoundation/CFUnicodePrecomposition.h>
  45 #       define USE_INTERNAL_API 1
  46 #elif HAVE_CFSTRINGENCODINGCONVERTER_H
  47 #       include <CFStringEncodingConverter.h>
  48 #       include <CFUnicodePrecomposition.h>
  49 #       define USE_INTERNAL_API 1
  50 #endif
  51
  52 /*
  53  * Compile time configuration: Do we want debug output?
  54  */
  55 /* #define DEBUG_STRINGS 1 */
  56
  57 /*
  58  * A simple, but efficient memory provider for our buffers.
  59  */
  60 static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
  61 {
  62         if (newsize > *size) {
  63                 *size = newsize + 128;
  64                 buffer = SMB_REALLOC(buffer, *size);
  65         }
  66         return buffer;
  67 }
  68
  69 /*
  70  * While there is a version of OpenDarwin for intel, the usual case is
  71  * big-endian PPC.  So we need byte swapping to handle the
  72  * little-endian byte order of the network protocol.  We also need an
  73  * additional dynamic buffer to do this work for incoming data blocks,
  74  * because we have to consider the original data as constant.
  75  *
  76  * We abstract the differences away by providing a simple facade with
  77  * these functions/macros:
  78  *
  79  *      le_to_native(dst,src,len)
  80  *      native_to_le(cp,len)
  81  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
  82  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
  83  */
  84 #ifdef WORDS_BIGENDIAN
  85
  86 static inline void swap_bytes (char * dst, const char * src, size_t len)
  87 {
  88         const char *srcend = src + len;
  89         while (src < srcend) {
  90                 dst[0] = src[1];
  91                 dst[1] = src[0];
  92                 dst += 2;
  93                 src += 2;
  94         }
  95 }
  96 static inline void swap_bytes_inplace (char * cp, size_t len)
  97 {
  98         char temp;
  99         char *end = cp + len;
 100         while (cp  < end) {
 101                 temp = cp[1];
 102                 cp[1] = cp[0];
 103                 cp[0] = temp;
 104                 cp += 2;
 105         }
 106 }
 107
 108 #define le_to_native(dst,src,len)       swap_bytes(dst,src,len)
 109 #define native_to_le(cp,len)            swap_bytes_inplace(cp,len)
 110 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 111         set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
 112
 113 #else   /* ! WORDS_BIGENDIAN */
 114
 115 #define le_to_native(dst,src,len)       memcpy(dst,src,len)
 116 #define native_to_le(cp,len)            /* nothing */
 117 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 118         (((void)(bufsize)),(UniChar*)(data))
 119
 120 #endif
 121
 122 static inline UniChar *set_ucbuffer_with_le_copy (
 123         UniChar *buffer, size_t *bufsize,
 124         const void *data, size_t size, size_t reserve)
 125 {
 126         buffer = resize_buffer(buffer, bufsize, size+reserve);
 127         le_to_native((char*)buffer,data,size);
 128         return buffer;
 129 }
 130
 131
 132 /*
 133  * A simple hexdump function for debugging error conditions.
 134  */
 135 #define debug_out(s)    DEBUG(0,(s))
 136
 137 #ifdef DEBUG_STRINGS
 138
 139 static void hexdump( const char * label, const char * s, size_t len )
 140 {
 141         size_t restlen = len;
 142         debug_out("<<<<<<<\n");
 143         debug_out(label);
 144         debug_out("\n");
 145         while (restlen > 0) {
 146                 char line[100];
 147                 size_t i, j;
 148                 char * d = line;
 149 #undef sprintf
 150                 d += sprintf(d, "%04X ", (unsigned)(len-restlen));
 151                 *d++ = ' ';
 152                 for( i = 0; i<restlen && i<8; ++i ) {
 153                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 154                 }
 155                 for( j = i; j<8; ++j ) {
 156                         d += sprintf(d, "   ");
 157                 }
 158                 *d++ = ' ';
 159                 for( i = 8; i<restlen && i<16; ++i ) {
 160                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 161                 }
 162                 for( j = i; j<16; ++j ) {
 163                         d += sprintf(d, "   ");
 164                 }
 165                 *d++ = ' ';
 166                 for( i = 0; i<restlen && i<16; ++i ) {
 167                         if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
 168                                 *d++ = '.';
 169                         else
 170                                 *d++ = s[i];
 171                 }
 172                 *d++ = '\n';
 173                 *d = 0;
 174                 restlen -= i;
 175                 s += i;
 176                 debug_out(line);
 177         }
 178         debug_out(">>>>>>>\n");
 179 }
 180
 181 #else   /* !DEBUG_STRINGS */
 182
 183 #define hexdump(label,s,len) /* nothing */
 184
 185 #endif
 186
 187
 188 #if !USE_INTERNAL_API
 189
 190 /*
 191  * An implementation based on documented Mac OS X APIs.
 192  *
 193  * This does a certain amount of memory management, creating and
 194  * manipulating CFString objects.  We try to minimize the impact by
 195  * keeping those objects around and re-using them.  We also use
 196  * external backing store for the CFStrings where this is possible and
  197  * beneficial.
 198  *
 199  * The Unicode normalizations forms available at this level are
 200  * generic, not specifically for the file system.  So they may not be
 201  * perfect fits.
 202  */
 203 static size_t macosxfs_encoding_pull(
 204         void *cd,                                   /* Encoder handle */
 205         const char **inbuf, size_t *inbytesleft,    /* Script string */
 206         char **outbuf, size_t *outbytesleft)        /* UTF-16-LE string */
 207 {
 208         static const int script_code = kCFStringEncodingUTF8;
 209         static CFMutableStringRef cfstring = NULL;
 210         size_t outsize;
 211         CFRange range;
 212
 213         (void) cd; /* UNUSED */
 214
 215         if (0 == *inbytesleft) {
 216                 return 0;
 217         }
 218
 219         if (NULL == cfstring) {
 220                 /*
 221                  * A version with an external backing store as in the
 222                  * push function should have been more efficient, but
 223                  * testing shows, that it is actually slower (!).
 224                  * Maybe kCFAllocatorDefault gets shortcut evaluation
 225                  * internally, while kCFAllocatorNull doesn't.
 226                  */
 227                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
 228         }
 229
 230         /*
 231          * Three methods of appending to a CFString, choose the most
 232          * efficient.
 233          */
 234         if (0 == (*inbuf)[*inbytesleft-1]) {
 235                 CFStringAppendCString(cfstring, *inbuf, script_code);
 236         } else if (*inbytesleft <= 255) {
 237                 Str255 buffer;
 238                 buffer[0] = *inbytesleft;
 239                 memcpy(buffer+1, *inbuf, buffer[0]);
 240                 CFStringAppendPascalString(cfstring, buffer, script_code);
 241         } else {
 242                 /*
 243                  * We would like to use a fixed buffer and a loop
 244                  * here, but than we can't garantee that the input is
 245                  * well-formed UTF-8, as we are supposed to do.
 246                  */
 247                 static char *buffer = NULL;
 248                 static size_t buflen = 0;
 249                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
 250                 memcpy(buffer, *inbuf, *inbytesleft);
 251                 buffer[*inbytesleft] = 0;
 252                 CFStringAppendCString(cfstring, *inbuf, script_code);
 253         }
 254
 255         /*
 256          * Compose characters, using the non-canonical composition
 257          * form.
 258          */
 259         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
 260
 261         outsize = CFStringGetLength(cfstring);
 262         range = CFRangeMake(0,outsize);
 263
 264         if (outsize == 0) {
 265                 /*
 266                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 267                  * errors here.  That function will always pass 2
 268                  * characters.  smbd/open.c:check_for_pipe() cuts a
 269                  * patchname to 10 characters blindly.  Suppress the
 270                  * debug output in those cases.
 271                  */
 272                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 273                         debug_out("String conversion: "
 274                                   "An unknown error occurred\n");
 275                         hexdump("UTF8->UTF16LE (old) input",
 276                                 *inbuf, *inbytesleft);
 277                 }
 278                 errno = EILSEQ; /* Not sure, but this is what we have
 279                                  * actually seen. */
 280                 return -1;
 281         }
 282         if (outsize*2 > *outbytesleft) {
 283                 CFStringDelete(cfstring, range);
 284                 debug_out("String conversion: "
 285                           "Output buffer too small\n");
 286                 hexdump("UTF8->UTF16LE (old) input",
 287                         *inbuf, *inbytesleft);
 288                 errno = E2BIG;
 289                 return -1;
 290         }
 291
 292         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
 293         CFStringDelete(cfstring, range);
 294
 295         native_to_le(*outbuf, outsize*2);
 296
 297         /*
 298          * Add a converted null byte, if the CFString conversions
 299          * prevented that until now.
 300          */
 301         if (0 == (*inbuf)[*inbytesleft-1] &&
 302             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
 303
 304                 if ((outsize*2+2) > *outbytesleft) {
 305                         debug_out("String conversion: "
 306                                   "Output buffer too small\n");
 307                         hexdump("UTF8->UTF16LE (old) input",
 308                                 *inbuf, *inbytesleft);
 309                         errno = E2BIG;
 310                         return -1;
 311                 }
 312
 313                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
 314                 outsize += 2;
 315         }
 316
 317         *inbuf += *inbytesleft;
 318         *inbytesleft = 0;
 319         *outbuf += outsize*2;
 320         *outbytesleft -= outsize*2;
 321
 322         return 0;
 323 }
 324
 325 static size_t macosxfs_encoding_push(
 326         void *cd,                                   /* Encoder handle */
 327         const char **inbuf, size_t *inbytesleft,    /* UTF-16-LE string */
 328         char **outbuf, size_t *outbytesleft)        /* Script string */
 329 {
 330         static const int script_code = kCFStringEncodingUTF8;
 331         static CFMutableStringRef cfstring = NULL;
 332         static UniChar *buffer = NULL;
 333         static size_t buflen = 0;
 334         CFIndex outsize, cfsize, charsconverted;
 335
 336         (void) cd; /* UNUSED */
 337
 338         if (0 == *inbytesleft) {
 339                 return 0;
 340         }
 341
 342         /*
 343          * We need a buffer that can hold 4 times the original data,
 344          * because that is the theoretical maximum that decomposition
 345          * can create currently (in Unicode 4.0).
 346          */
 347         buffer = set_ucbuffer_with_le_copy(
 348                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
 349
 350         if (NULL == cfstring) {
 351                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
 352                         kCFAllocatorDefault,
 353                         buffer, *inbytesleft/2, buflen/2,
 354                         kCFAllocatorNull);
 355         } else {
 356                 CFStringSetExternalCharactersNoCopy(
 357                         cfstring,
 358                         buffer, *inbytesleft/2, buflen/2);
 359         }
 360
 361         /*
 362          * Decompose characters, using the non-canonical decomposition
 363          * form.
 364          *
 365          * NB: This isn't exactly what HFS+ wants (see note on
 366          * kCFStringEncodingUseHFSPlusCanonical in
 367          * CFStringEncodingConverter.h), but AFAIK it's the best that
 368          * the official API can do.
 369          */
 370         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
 371
 372         cfsize = CFStringGetLength(cfstring);
 373         charsconverted = CFStringGetBytes(
 374                 cfstring, CFRangeMake(0,cfsize),
 375                 script_code, 0, False,
 376                 (uint8_t *)(*outbuf), *outbytesleft, &outsize);
 377
 378         if (0 == charsconverted) {
 379                 debug_out("String conversion: "
 380                           "Buffer too small or not convertable\n");
 381                 hexdump("UTF16LE->UTF8 (old) input",
 382                         *inbuf, *inbytesleft);
 383                 errno = EILSEQ; /* Probably more likely. */
 384                 return -1;
 385         }
 386
 387         /*
 388          * Add a converted null byte, if the CFString conversions
 389          * prevented that until now.
 390          */
 391         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
 392             (0 != (*outbuf)[outsize-1])) {
 393
 394                 if (((size_t)outsize+1) > *outbytesleft) {
 395                         debug_out("String conversion: "
 396                                   "Output buffer too small\n");
 397                         hexdump("UTF16LE->UTF8 (old) input",
 398                                 *inbuf, *inbytesleft);
 399                         errno = E2BIG;
 400                         return -1;
 401                 }
 402
 403                 (*outbuf)[outsize] = 0;
 404                 ++outsize;
 405         }
 406
 407         *inbuf += *inbytesleft;
 408         *inbytesleft = 0;
 409         *outbuf += outsize;
 410         *outbytesleft -= outsize;
 411
 412         return 0;
 413 }
 414
 415 #else /* USE_INTERNAL_API */
 416
 417 /*
 418  * An implementation based on internal code as known from the
 419  * OpenDarwin CVS.
 420  *
 421  * This code doesn't need much memory management because it uses
 422  * functions that operate on the raw memory directly.
 423  *
 424  * The push routine here is faster and more compatible with HFS+ than
 425  * the other implementation above.  The pull routine is only faster
 426  * for some strings, slightly slower for others.  The pull routine
  427  * loses because it has to iterate over the data twice, once to
  428  * decode UTF-8 and then to do the character composition required by
 429  * Windows.
 430  */
 431 static size_t macosxfs_encoding_pull(
 432         void *cd,                               /* Encoder handle */
 433         char **inbuf, size_t *inbytesleft,      /* Script string */
 434         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 435 {
 436         static const int script_code = kCFStringEncodingUTF8;
 437         UInt32 srcCharsUsed = 0;
 438         UInt32 dstCharsUsed = 0;
 439         UInt32 result;
 440         uint32_t dstDecomposedUsed = 0;
 441         uint32_t dstPrecomposedUsed = 0;
 442
 443         (void) cd; /* UNUSED */
 444
 445         if (0 == *inbytesleft) {
 446                 return 0;
 447         }
 448
 449         result = CFStringEncodingBytesToUnicode(
 450                 script_code, kCFStringEncodingComposeCombinings,
 451                 *inbuf, *inbytesleft, &srcCharsUsed,
 452                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
 453
 454         switch(result) {
 455         case kCFStringEncodingConversionSuccess:
 456                 if (*inbytesleft == srcCharsUsed)
 457                         break;
 458                 else
 459                         ; /*fall through*/
 460         case kCFStringEncodingInsufficientOutputBufferLength:
 461                 debug_out("String conversion: "
 462                           "Output buffer too small\n");
 463                 hexdump("UTF8->UTF16LE (new) input",
 464                         *inbuf, *inbytesleft);
 465                 errno = E2BIG;
 466                 return -1;
 467         case kCFStringEncodingInvalidInputStream:
 468                 /*
 469                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 470                  * errors here.  That function will always pass 2
 471                  * characters.  smbd/open.c:check_for_pipe() cuts a
 472                  * patchname to 10 characters blindly.  Suppress the
 473                  * debug output in those cases.
 474                  */
 475                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 476                         debug_out("String conversion: "
 477                                   "Invalid input sequence\n");
 478                         hexdump("UTF8->UTF16LE (new) input",
 479                                 *inbuf, *inbytesleft);
 480                 }
 481                 errno = EILSEQ;
 482                 return -1;
 483         case kCFStringEncodingConverterUnavailable:
 484                 debug_out("String conversion: "
 485                           "Unknown encoding\n");
 486                 hexdump("UTF8->UTF16LE (new) input",
 487                         *inbuf, *inbytesleft);
 488                 errno = EINVAL;
 489                 return -1;
 490         }
 491
 492         /*
 493          * It doesn't look like CFStringEncodingBytesToUnicode() can
 494          * produce precomposed characters (flags=ComposeCombinings
 495          * doesn't do it), so we need another pass over the data here.
 496          * We can do this in-place, as the string can only get
 497          * shorter.
 498          *
 499          * (Actually in theory there should be an internal
 500          * decomposition and reordering before the actual composition
 501          * step.  But we should be able to rely on that we always get
 502          * fully decomposed strings for input, so this can't create
 503          * problems in reality.)
 504          */
 505         CFUniCharPrecompose(
 506                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
 507                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
 508
 509         native_to_le(*outbuf, dstPrecomposedUsed*2);
 510
 511         *inbuf += srcCharsUsed;
 512         *inbytesleft -= srcCharsUsed;
 513         *outbuf += dstPrecomposedUsed*2;
 514         *outbytesleft -= dstPrecomposedUsed*2;
 515
 516         return 0;
 517 }
 518
 519 static size_t macosxfs_encoding_push(
 520         void *cd,                               /* Encoder handle */
 521         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
 522         char **outbuf, size_t *outbytesleft)    /* Script string */
 523 {
 524         static const int script_code = kCFStringEncodingUTF8;
 525         static UniChar *buffer = NULL;
 526         static size_t buflen = 0;
 527         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
 528
 529         (void) cd; /* UNUSED */
 530
 531         if (0 == *inbytesleft) {
 532                 return 0;
 533         }
 534
 535         buffer = set_ucbuffer_with_le(
 536                 buffer, &buflen, *inbuf, *inbytesleft);
 537
 538         result = CFStringEncodingUnicodeToBytes(
 539                 script_code, kCFStringEncodingUseHFSPlusCanonical,
 540                 buffer, *inbytesleft/2, &srcCharsUsed,
 541                 *outbuf, *outbytesleft, &dstCharsUsed);
 542
 543         switch(result) {
 544         case kCFStringEncodingConversionSuccess:
 545                 if (*inbytesleft/2 == srcCharsUsed)
 546                         break;
 547                 else
 548                         ; /*fall through*/
 549         case kCFStringEncodingInsufficientOutputBufferLength:
 550                 debug_out("String conversion: "
 551                           "Output buffer too small\n");
 552                 hexdump("UTF16LE->UTF8 (new) input",
 553                         *inbuf, *inbytesleft);
 554                 errno = E2BIG;
 555                 return -1;
 556         case kCFStringEncodingInvalidInputStream:
 557                 /*
 558                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
 559                  * cuts a pathname to 10 characters blindly.  Suppress
 560                  * the debug output in those cases.
 561                  */
 562                 if(10 != *inbytesleft) {
 563                         debug_out("String conversion: "
 564                                   "Invalid input sequence\n");
 565                         hexdump("UTF16LE->UTF8 (new) input",
 566                                 *inbuf, *inbytesleft);
 567                 }
 568                 errno = EILSEQ;
 569                 return -1;
 570         case kCFStringEncodingConverterUnavailable:
 571                 debug_out("String conversion: "
 572                           "Unknown encoding\n");
 573                 hexdump("UTF16LE->UTF8 (new) input",
 574                         *inbuf, *inbytesleft);
 575                 errno = EINVAL;
 576                 return -1;
 577         }
 578
 579         *inbuf += srcCharsUsed*2;
 580         *inbytesleft -= srcCharsUsed*2;
 581         *outbuf += dstCharsUsed;
 582         *outbytesleft -= dstCharsUsed;
 583
 584         return 0;
 585 }
 586
 587 #endif /* USE_INTERNAL_API */
 588
 589 /*
 590  * For initialization, actually install the encoding as "macosxfs".
 591  */
 592 static struct charset_functions macosxfs_encoding_functions = {
 593         "MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
 594 };
 595
 596 NTSTATUS charset_macosxfs_init(void)
 597 {
 598         return smb_register_charset(&macosxfs_encoding_functions);
 599 }
 600
 601 /* eof */