lib/util/charset/charset_macosxfs.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba charset module for Mac OS X/Darwin
   4    Copyright (C) Benjamin Riefenstahl 2003
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 /*
  21  * modules/charset_macosxfs.c
  22  *
  23  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
  24  * and display encoding.
  25  *
  26  * Actually two implementations are provided here.  The default
  27  * implementation is based on the official CFString API.  The other is
  28  * based on internal CFString APIs as defined in the OpenDarwin
  29  * source.
  30  */
  31
  32 #include "includes.h"
  33 #include "charset_proto.h"
  34 #undef realloc
  35
  36 #ifdef DARWINOS
  37
  38 /*
  39  * Include OS frameworks.  These are only needed in this module.
  40  */
  41 #include <CoreFoundation/CFString.h>
  42
  43 /*
  44  * See if autoconf has found us the internal headers in some form.
  45  */
  46 #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
  47 #       include <CoreFoundation/CFStringEncodingConverter.h>
  48 #       include <CoreFoundation/CFUnicodePrecomposition.h>
  49 #       define USE_INTERNAL_API 1
  50 #elif HAVE_CFSTRINGENCODINGCONVERTER_H
  51 #       include <CFStringEncodingConverter.h>
  52 #       include <CFUnicodePrecomposition.h>
  53 #       define USE_INTERNAL_API 1
  54 #endif
  55
  56 /*
  57  * Compile time configuration: Do we want debug output?
  58  */
  59 /* #define DEBUG_STRINGS 1 */
  60
  61 /*
  62  * A simple, but efficient memory provider for our buffers.
  63  */
  64 static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
  65 {
  66         if (newsize > *size) {
  67                 *size = newsize + 128;
  68                 buffer = realloc(buffer, *size);
  69         }
  70         return buffer;
  71 }
  72
  73 /*
  74  * While there is a version of OpenDarwin for intel, the usual case is
  75  * big-endian PPC.  So we need byte swapping to handle the
  76  * little-endian byte order of the network protocol.  We also need an
  77  * additional dynamic buffer to do this work for incoming data blocks,
  78  * because we have to consider the original data as constant.
  79  *
  80  * We abstract the differences away by providing a simple facade with
  81  * these functions/macros:
  82  *
  83  *      le_to_native(dst,src,len)
  84  *      native_to_le(cp,len)
  85  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
  86  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
  87  */
  88 #ifdef WORDS_BIGENDIAN
  89
  90 static inline void swap_bytes (char * dst, const char * src, size_t len)
  91 {
  92         const char *srcend = src + len;
  93         while (src < srcend) {
  94                 dst[0] = src[1];
  95                 dst[1] = src[0];
  96                 dst += 2;
  97                 src += 2;
  98         }
  99 }
 100 static inline void swap_bytes_inplace (char * cp, size_t len)
 101 {
 102         char temp;
 103         char *end = cp + len;
 104         while (cp  < end) {
 105                 temp = cp[1];
 106                 cp[1] = cp[0];
 107                 cp[0] = temp;
 108                 cp += 2;
 109         }
 110 }
 111
 112 #define le_to_native(dst,src,len)       swap_bytes(dst,src,len)
 113 #define native_to_le(cp,len)            swap_bytes_inplace(cp,len)
 114 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 115         set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
 116
 117 #else   /* ! WORDS_BIGENDIAN */
 118
 119 #define le_to_native(dst,src,len)       memcpy(dst,src,len)
 120 #define native_to_le(cp,len)            /* nothing */
 121 #define set_ucbuffer_with_le(buffer,bufsize,data,size) \
 122         (((void)(bufsize)),(UniChar*)(data))
 123
 124 #endif
 125
 126 static inline UniChar *set_ucbuffer_with_le_copy (
 127         UniChar *buffer, size_t *bufsize,
 128         const void *data, size_t size, size_t reserve)
 129 {
 130         buffer = resize_buffer(buffer, bufsize, size+reserve);
 131         le_to_native((char*)buffer,data,size);
 132         return buffer;
 133 }
 134
 135
 136 /*
 137  * A simple hexdump function for debugging error conditions.
 138  */
 139 #define debug_out(s)    DEBUG(0,(s))
 140
 141 #ifdef DEBUG_STRINGS
 142
 143 static void hexdump( const char * label, const char * s, size_t len )
 144 {
 145         size_t restlen = len;
 146         debug_out("<<<<<<<\n");
 147         debug_out(label);
 148         debug_out("\n");
 149         while (restlen > 0) {
 150                 char line[100];
 151                 size_t i, j;
 152                 char * d = line;
 153 #undef sprintf
 154                 d += sprintf(d, "%04X ", (unsigned)(len-restlen));
 155                 *d++ = ' ';
 156                 for( i = 0; i<restlen && i<8; ++i ) {
 157                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 158                 }
 159                 for( j = i; j<8; ++j ) {
 160                         d += sprintf(d, "   ");
 161                 }
 162                 *d++ = ' ';
 163                 for( i = 8; i<restlen && i<16; ++i ) {
 164                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 165                 }
 166                 for( j = i; j<16; ++j ) {
 167                         d += sprintf(d, "   ");
 168                 }
 169                 *d++ = ' ';
 170                 for( i = 0; i<restlen && i<16; ++i ) {
 171                         if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
 172                                 *d++ = '.';
 173                         else
 174                                 *d++ = s[i];
 175                 }
 176                 *d++ = '\n';
 177                 *d = 0;
 178                 restlen -= i;
 179                 s += i;
 180                 debug_out(line);
 181         }
 182         debug_out(">>>>>>>\n");
 183 }
 184
 185 #else   /* !DEBUG_STRINGS */
 186
 187 #define hexdump(label,s,len) /* nothing */
 188
 189 #endif
 190
 191
 192 #if !USE_INTERNAL_API
 193
 194 /*
 195  * An implementation based on documented Mac OS X APIs.
 196  *
 197  * This does a certain amount of memory management, creating and
 198  * manipulating CFString objects.  We try to minimize the impact by
 199  * keeping those objects around and re-using them.  We also use
 200  * external backing store for the CFStrings where this is possible and
  201  * beneficial.
 202  *
 203  * The Unicode normalizations forms available at this level are
 204  * generic, not specifically for the file system.  So they may not be
 205  * perfect fits.
 206  */
 207 size_t macosxfs_encoding_pull(
 208         void *cd,                               /* Encoder handle */
 209         const char **inbuf, size_t *inbytesleft, /* Script string */
 210         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 211 {
 212         static const int script_code = kCFStringEncodingUTF8;
 213         static CFMutableStringRef cfstring = NULL;
 214         size_t outsize;
 215         CFRange range;
 216
 217         (void) cd; /* UNUSED */
 218
 219         if (0 == *inbytesleft) {
 220                 return 0;
 221         }
 222
 223         if (NULL == cfstring) {
 224                 /*
 225                  * A version with an external backing store as in the
 226                  * push function should have been more efficient, but
 227                  * testing shows, that it is actually slower (!).
 228                  * Maybe kCFAllocatorDefault gets shortcut evaluation
 229                  * internally, while kCFAllocatorNull doesn't.
 230                  */
 231                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
 232         }
 233
 234         /*
 235          * Three methods of appending to a CFString, choose the most
 236          * efficient.
 237          */
 238         if (0 == (*inbuf)[*inbytesleft-1]) {
 239                 CFStringAppendCString(cfstring, *inbuf, script_code);
 240         } else if (*inbytesleft <= 255) {
 241                 Str255 buffer;
 242                 buffer[0] = *inbytesleft;
 243                 memcpy(buffer+1, *inbuf, buffer[0]);
 244                 CFStringAppendPascalString(cfstring, buffer, script_code);
 245         } else {
 246                 /*
 247                  * We would like to use a fixed buffer and a loop
 248                  * here, but than we can't garantee that the input is
 249                  * well-formed UTF-8, as we are supposed to do.
 250                  */
 251                 static char *buffer = NULL;
 252                 static size_t buflen = 0;
 253                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
 254                 memcpy(buffer, *inbuf, *inbytesleft);
 255                 buffer[*inbytesleft] = 0;
 256                 CFStringAppendCString(cfstring, *inbuf, script_code);
 257         }
 258
 259         /*
 260          * Compose characters, using the non-canonical composition
 261          * form.
 262          */
 263         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
 264
 265         outsize = CFStringGetLength(cfstring);
 266         range = CFRangeMake(0,outsize);
 267
 268         if (outsize == 0) {
 269                 /*
 270                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 271                  * errors here.  That function will always pass 2
 272                  * characters.  smbd/open.c:check_for_pipe() cuts a
 273                  * patchname to 10 characters blindly.  Suppress the
 274                  * debug output in those cases.
 275                  */
 276                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 277                         debug_out("String conversion: "
 278                                   "An unknown error occurred\n");
 279                         hexdump("UTF8->UTF16LE (old) input",
 280                                 *inbuf, *inbytesleft);
 281                 }
 282                 errno = EILSEQ; /* Not sure, but this is what we have
 283                                  * actually seen. */
 284                 return -1;
 285         }
 286         if (outsize*2 > *outbytesleft) {
 287                 CFStringDelete(cfstring, range);
 288                 debug_out("String conversion: "
 289                           "Output buffer too small\n");
 290                 hexdump("UTF8->UTF16LE (old) input",
 291                         *inbuf, *inbytesleft);
 292                 errno = E2BIG;
 293                 return -1;
 294         }
 295
 296         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
 297         CFStringDelete(cfstring, range);
 298
 299         native_to_le(*outbuf, outsize*2);
 300
 301         /*
 302          * Add a converted null byte, if the CFString conversions
 303          * prevented that until now.
 304          */
 305         if (0 == (*inbuf)[*inbytesleft-1] &&
 306             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
 307
 308                 if ((outsize*2+2) > *outbytesleft) {
 309                         debug_out("String conversion: "
 310                                   "Output buffer too small\n");
 311                         hexdump("UTF8->UTF16LE (old) input",
 312                                 *inbuf, *inbytesleft);
 313                         errno = E2BIG;
 314                         return -1;
 315                 }
 316
 317                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
 318                 outsize += 2;
 319         }
 320
 321         *inbuf += *inbytesleft;
 322         *inbytesleft = 0;
 323         *outbuf += outsize*2;
 324         *outbytesleft -= outsize*2;
 325
 326         return 0;
 327 }
 328
 329 size_t macosxfs_encoding_push(
 330         void *cd,                               /* Encoder handle */
 331         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
 332         char **outbuf, size_t *outbytesleft)    /* Script string */
 333 {
 334         static const int script_code = kCFStringEncodingUTF8;
 335         static CFMutableStringRef cfstring = NULL;
 336         static UniChar *buffer = NULL;
 337         static size_t buflen = 0;
 338         CFIndex outsize, cfsize, charsconverted;
 339
 340         (void) cd; /* UNUSED */
 341
 342         if (0 == *inbytesleft) {
 343                 return 0;
 344         }
 345
 346         /*
 347          * We need a buffer that can hold 4 times the original data,
 348          * because that is the theoretical maximum that decomposition
 349          * can create currently (in Unicode 4.0).
 350          */
 351         buffer = set_ucbuffer_with_le_copy(
 352                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
 353
 354         if (NULL == cfstring) {
 355                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
 356                         kCFAllocatorDefault,
 357                         buffer, *inbytesleft/2, buflen/2,
 358                         kCFAllocatorNull);
 359         } else {
 360                 CFStringSetExternalCharactersNoCopy(
 361                         cfstring,
 362                         buffer, *inbytesleft/2, buflen/2);
 363         }
 364
 365         /*
 366          * Decompose characters, using the non-canonical decomposition
 367          * form.
 368          *
 369          * NB: This isn't exactly what HFS+ wants (see note on
 370          * kCFStringEncodingUseHFSPlusCanonical in
 371          * CFStringEncodingConverter.h), but AFAIK it's the best that
 372          * the official API can do.
 373          */
 374         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
 375
 376         cfsize = CFStringGetLength(cfstring);
 377         charsconverted = CFStringGetBytes(
 378                 cfstring, CFRangeMake(0,cfsize),
 379                 script_code, 0, false,
 380                 *outbuf, *outbytesleft, &outsize);
 381
 382         if (0 == charsconverted) {
 383                 debug_out("String conversion: "
 384                           "Buffer too small or not convertable\n");
 385                 hexdump("UTF16LE->UTF8 (old) input",
 386                         *inbuf, *inbytesleft);
 387                 errno = EILSEQ; /* Probably more likely. */
 388                 return -1;
 389         }
 390
 391         /*
 392          * Add a converted null byte, if the CFString conversions
 393          * prevented that until now.
 394          */
 395         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
 396             (0 != (*outbuf)[outsize-1])) {
 397
 398                 if (((size_t)outsize+1) > *outbytesleft) {
 399                         debug_out("String conversion: "
 400                                   "Output buffer too small\n");
 401                         hexdump("UTF16LE->UTF8 (old) input",
 402                                 *inbuf, *inbytesleft);
 403                         errno = E2BIG;
 404                         return -1;
 405                 }
 406
 407                 (*outbuf)[outsize] = 0;
 408                 ++outsize;
 409         }
 410
 411         *inbuf += *inbytesleft;
 412         *inbytesleft = 0;
 413         *outbuf += outsize;
 414         *outbytesleft -= outsize;
 415
 416         return 0;
 417 }
 418
 419 #else /* USE_INTERNAL_API */
 420
 421 /*
 422  * An implementation based on internal code as known from the
 423  * OpenDarwin CVS.
 424  *
 425  * This code doesn't need much memory management because it uses
 426  * functions that operate on the raw memory directly.
 427  *
 428  * The push routine here is faster and more compatible with HFS+ than
 429  * the other implementation above.  The pull routine is only faster
 430  * for some strings, slightly slower for others.  The pull routine
  431  * loses because it has to iterate over the data twice, once to
  432  * decode UTF-8 and then to do the character composition required by
 433  * Windows.
 434  */
 435 static size_t macosxfs_encoding_pull(
 436         void *cd,                               /* Encoder handle */
 437         const char **inbuf, size_t *inbytesleft, /* Script string */
 438         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 439 {
 440         static const int script_code = kCFStringEncodingUTF8;
 441         UInt32 srcCharsUsed = 0;
 442         UInt32 dstCharsUsed = 0;
 443         UInt32 result;
 444         uint32_t dstDecomposedUsed = 0;
 445         uint32_t dstPrecomposedUsed = 0;
 446
 447         (void) cd; /* UNUSED */
 448
 449         if (0 == *inbytesleft) {
 450                 return 0;
 451         }
 452
 453         result = CFStringEncodingBytesToUnicode(
 454                 script_code, kCFStringEncodingComposeCombinings,
 455                 *inbuf, *inbytesleft, &srcCharsUsed,
 456                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
 457
 458         switch(result) {
 459         case kCFStringEncodingConversionSuccess:
 460                 if (*inbytesleft == srcCharsUsed)
 461                         break;
 462                 else
 463                         ; /*fall through*/
 464         case kCFStringEncodingInsufficientOutputBufferLength:
 465                 debug_out("String conversion: "
 466                           "Output buffer too small\n");
 467                 hexdump("UTF8->UTF16LE (new) input",
 468                         *inbuf, *inbytesleft);
 469                 errno = E2BIG;
 470                 return -1;
 471         case kCFStringEncodingInvalidInputStream:
 472                 /*
 473                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 474                  * errors here.  That function will always pass 2
 475                  * characters.  smbd/open.c:check_for_pipe() cuts a
 476                  * patchname to 10 characters blindly.  Suppress the
 477                  * debug output in those cases.
 478                  */
 479                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 480                         debug_out("String conversion: "
 481                                   "Invalid input sequence\n");
 482                         hexdump("UTF8->UTF16LE (new) input",
 483                                 *inbuf, *inbytesleft);
 484                 }
 485                 errno = EILSEQ;
 486                 return -1;
 487         case kCFStringEncodingConverterUnavailable:
 488                 debug_out("String conversion: "
 489                           "Unknown encoding\n");
 490                 hexdump("UTF8->UTF16LE (new) input",
 491                         *inbuf, *inbytesleft);
 492                 errno = EINVAL;
 493                 return -1;
 494         }
 495
 496         /*
 497          * It doesn't look like CFStringEncodingBytesToUnicode() can
 498          * produce precomposed characters (flags=ComposeCombinings
 499          * doesn't do it), so we need another pass over the data here.
 500          * We can do this in-place, as the string can only get
 501          * shorter.
 502          *
 503          * (Actually in theory there should be an internal
 504          * decomposition and reordering before the actual composition
 505          * step.  But we should be able to rely on that we always get
 506          * fully decomposed strings for input, so this can't create
 507          * problems in reality.)
 508          */
 509         CFUniCharPrecompose(
 510                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
 511                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
 512
 513         native_to_le(*outbuf, dstPrecomposedUsed*2);
 514
 515         *inbuf += srcCharsUsed;
 516         *inbytesleft -= srcCharsUsed;
 517         *outbuf += dstPrecomposedUsed*2;
 518         *outbytesleft -= dstPrecomposedUsed*2;
 519
 520         return 0;
 521 }
 522
 523 static size_t macosxfs_encoding_push(
 524         void *cd,                               /* Encoder handle */
 525         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
 526         char **outbuf, size_t *outbytesleft)    /* Script string */
 527 {
 528         static const int script_code = kCFStringEncodingUTF8;
 529         static UniChar *buffer = NULL;
 530         static size_t buflen = 0;
 531         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
 532
 533         (void) cd; /* UNUSED */
 534
 535         if (0 == *inbytesleft) {
 536                 return 0;
 537         }
 538
 539         buffer = set_ucbuffer_with_le(
 540                 buffer, &buflen, *inbuf, *inbytesleft);
 541
 542         result = CFStringEncodingUnicodeToBytes(
 543                 script_code, kCFStringEncodingUseHFSPlusCanonical,
 544                 buffer, *inbytesleft/2, &srcCharsUsed,
 545                 *outbuf, *outbytesleft, &dstCharsUsed);
 546
 547         switch(result) {
 548         case kCFStringEncodingConversionSuccess:
 549                 if (*inbytesleft/2 == srcCharsUsed)
 550                         break;
 551                 else
 552                         ; /*fall through*/
 553         case kCFStringEncodingInsufficientOutputBufferLength:
 554                 debug_out("String conversion: "
 555                           "Output buffer too small\n");
 556                 hexdump("UTF16LE->UTF8 (new) input",
 557                         *inbuf, *inbytesleft);
 558                 errno = E2BIG;
 559                 return -1;
 560         case kCFStringEncodingInvalidInputStream:
 561                 /*
 562                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
 563                  * cuts a pathname to 10 characters blindly.  Suppress
 564                  * the debug output in those cases.
 565                  */
 566                 if(10 != *inbytesleft) {
 567                         debug_out("String conversion: "
 568                                   "Invalid input sequence\n");
 569                         hexdump("UTF16LE->UTF8 (new) input",
 570                                 *inbuf, *inbytesleft);
 571                 }
 572                 errno = EILSEQ;
 573                 return -1;
 574         case kCFStringEncodingConverterUnavailable:
 575                 debug_out("String conversion: "
 576                           "Unknown encoding\n");
 577                 hexdump("UTF16LE->UTF8 (new) input",
 578                         *inbuf, *inbytesleft);
 579                 errno = EINVAL;
 580                 return -1;
 581         }
 582
 583         *inbuf += srcCharsUsed*2;
 584         *inbytesleft -= srcCharsUsed*2;
 585         *outbuf += dstCharsUsed;
 586         *outbytesleft -= dstCharsUsed;
 587
 588         return 0;
 589 }
 590
 591 #endif /* USE_INTERNAL_API */
 592
 593 #else /* DARWIN */
 594
 595 void charset_macosfs_dummy(void);
 596 void charset_macosfs_dummy(void)
 597 {
 598         return;
 599 }
 600
 601 #endif /* DARWIN */