src/os_mac_conv.c

   1 /* vi:set ts=8 sts=4 sw=4:
   2  *
   3  * VIM - Vi IMproved    by Bram Moolenaar
   4  *
   5  * Do ":help uganda"  in Vim to read copying and usage conditions.
   6  * Do ":help credits" in Vim to see a list of people who contributed.
   7  * See README.txt for an overview of the Vim source code.
   8  */
   9 /*
  10  * os_mac_conv.c: Code specifically for Mac string conversions.
  11  *
  12  * This code has been put in a separate file to avoid the conflicts that are
  13  * caused by including both the X11 and Carbon header files.
  14  */
  15
  16 #define NO_X11_INCLUDES
  17 #include "vim.h"
  18
  19 #if defined(MACOS_CONVERT) || defined(PROTO)
  20 # ifdef PROTO
  21 /* A few dummy types to be able to generate function prototypes. */
  22 typedef int UniChar;
  23 typedef int *TECObjectRef;
  24 typedef int CFStringRef;
  25 # else
  26 typedef unsigned short UniChar;
  27 #include <CoreServices/CoreServices.h>
  28 # endif
  29
  30 static char_u       *mac_utf16_to_utf8 __ARGS((UniChar *from, size_t fromLen, size_t *actualLen));
  31 static UniChar      *mac_utf8_to_utf16 __ARGS((char_u *from, size_t fromLen, size_t *actualLen));
  32
  33 /* Converter for composing decomposed HFS+ file paths */
  34 static TECObjectRef gPathConverter;
  35 /* Converter used by mac_utf16_to_utf8 */
  36 static TECObjectRef gUTF16ToUTF8Converter;
  37
  38 /*
  39  * A Mac version of string_convert_ext() for special cases.
  40  */
  41     char_u *
  42 mac_string_convert(ptr, len, lenp, fail_on_error, from_enc, to_enc, unconvlenp)
  43     char_u              *ptr;
  44     int                 len;
  45     int                 *lenp;
  46     int                 fail_on_error;
  47     int                 from_enc;
  48     int                 to_enc;
  49     int                 *unconvlenp;
  50 {
  51     char_u              *retval, *d;
  52     CFStringRef         cfstr;
  53     int                 buflen, in, out, l, i;
  54     CFStringEncoding    from;
  55     CFStringEncoding    to;
  56
  57     switch (from_enc)
  58     {
  59         case 'l':   from = kCFStringEncodingISOLatin1; break;
  60         case 'm':   from = kCFStringEncodingMacRoman; break;
  61         case 'u':   from = kCFStringEncodingUTF8; break;
  62         default:    return NULL;
  63     }
  64     switch (to_enc)
  65     {
  66         case 'l':   to = kCFStringEncodingISOLatin1; break;
  67         case 'm':   to = kCFStringEncodingMacRoman; break;
  68         case 'u':   to = kCFStringEncodingUTF8; break;
  69         default:    return NULL;
  70     }
  71
  72     if (unconvlenp != NULL)
  73         *unconvlenp = 0;
  74     cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
  75
  76     if(cfstr == NULL)
  77         fprintf(stderr, "Encoding failed\n");
  78     /* When conversion failed, try excluding bytes from the end, helps when
  79      * there is an incomplete byte sequence.  Only do up to 6 bytes to avoid
  80      * looping a long time when there really is something unconvertible. */
  81     while (cfstr == NULL && unconvlenp != NULL && len > 1 && *unconvlenp < 6)
  82     {
  83         --len;
  84         ++*unconvlenp;
  85         cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
  86     }
  87     if (cfstr == NULL)
  88         return NULL;
  89
  90     if (to == kCFStringEncodingUTF8)
  91         buflen = len * 6 + 1;
  92     else
  93         buflen = len + 1;
  94     retval = alloc(buflen);
  95     if (retval == NULL)
  96     {
  97         CFRelease(cfstr);
  98         return NULL;
  99     }
 100
 101 #if 0
 102     CFRange convertRange = CFRangeMake(0, CFStringGetLength(cfstr));
 103     /*  Determine output buffer size */
 104     CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, NULL, 0, (CFIndex *)&buflen);
 105     retval = (buflen > 0) ? alloc(buflen) : NULL;
 106     if (retval == NULL) {
 107         CFRelease(cfstr);
 108         return NULL;
 109     }
 110
 111     if (lenp)
 112         *lenp = buflen / sizeof(char_u);
 113
 114     if (!CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, retval, buflen, NULL))
 115 #endif
 116     if (!CFStringGetCString(cfstr, (char *)retval, buflen, to))
 117     {
 118         CFRelease(cfstr);
 119         if (fail_on_error)
 120         {
 121             vim_free(retval);
 122             return NULL;
 123         }
 124
 125         fprintf(stderr, "Trying char-by-char conversion...\n");
 126         /* conversion failed for the whole string, but maybe it will work
 127          * for each character */
 128         for (d = retval, in = 0, out = 0; in < len && out < buflen - 1;)
 129         {
 130             if (from == kCFStringEncodingUTF8)
 131                 l = utf_ptr2len(ptr + in);
 132             else
 133                 l = 1;
 134             cfstr = CFStringCreateWithBytes(NULL, ptr + in, l, from, 0);
 135             if (cfstr == NULL)
 136             {
 137                 *d++ = '?';
 138                 out++;
 139             }
 140             else
 141             {
 142                 if (!CFStringGetCString(cfstr, (char *)d, buflen - out, to))
 143                 {
 144                     *d++ = '?';
 145                     out++;
 146                 }
 147                 else
 148                 {
 149                     i = STRLEN(d);
 150                     d += i;
 151                     out += i;
 152                 }
 153                 CFRelease(cfstr);
 154             }
 155             in += l;
 156         }
 157         *d = NUL;
 158         if (lenp != NULL)
 159             *lenp = out;
 160         return retval;
 161     }
 162     CFRelease(cfstr);
 163     if (lenp != NULL)
 164         *lenp = STRLEN(retval);
 165
 166     return retval;
 167 }
 168
 169 /*
 170  * Conversion from Apple MacRoman char encoding to UTF-8 or latin1, using
 171  * standard Carbon framework.
 172  * Input: "ptr[*sizep]".
 173  * "real_size" is the size of the buffer that "ptr" points to.
 174  * output is in-place, "sizep" is adjusted.
 175  * Returns OK or FAIL.
 176  */
 177     int
 178 macroman2enc(ptr, sizep, real_size)
 179     char_u      *ptr;
 180     long        *sizep;
 181     long        real_size;
 182 {
 183     CFStringRef         cfstr;
 184     CFRange             r;
 185     CFIndex             len = *sizep;
 186
 187     /* MacRoman is an 8-bit encoding, no need to move bytes to
 188      * conv_rest[]. */
 189     cfstr = CFStringCreateWithBytes(NULL, ptr, len,
 190                                                 kCFStringEncodingMacRoman, 0);
 191     /*
 192      * If there is a conversion error, try using another
 193      * conversion.
 194      */
 195     if (cfstr == NULL)
 196         return FAIL;
 197
 198     r.location = 0;
 199     r.length = CFStringGetLength(cfstr);
 200     if (r.length != CFStringGetBytes(cfstr, r,
 201             (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
 202             0, /* no lossy conversion */
 203             0, /* not external representation */
 204             ptr + *sizep, real_size - *sizep, &len))
 205     {
 206         CFRelease(cfstr);
 207         return FAIL;
 208     }
 209     CFRelease(cfstr);
 210     mch_memmove(ptr, ptr + *sizep, len);
 211     *sizep = len;
 212
 213     return OK;
 214 }
 215
 216 /*
 217  * Conversion from UTF-8 or latin1 to MacRoman.
 218  * Input: "from[fromlen]"
 219  * Output: "to[maxtolen]" length in "*tolenp"
 220  * Unconverted rest in rest[*restlenp].
 221  * Returns OK or FAIL.
 222  */
 223     int
 224 enc2macroman(from, fromlen, to, tolenp, maxtolen, rest, restlenp)
 225     char_u      *from;
 226     size_t      fromlen;
 227     char_u      *to;
 228     int         *tolenp;
 229     int         maxtolen;
 230     char_u      *rest;
 231     int         *restlenp;
 232 {
 233     CFStringRef cfstr;
 234     CFRange     r;
 235     CFIndex     l;
 236
 237     *restlenp = 0;
 238     cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
 239             (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
 240             0);
 241     while (cfstr == NULL && *restlenp < 3 && fromlen > 1)
 242     {
 243         rest[*restlenp++] = from[--fromlen];
 244         cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
 245                 (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
 246                 0);
 247     }
 248     if (cfstr == NULL)
 249         return FAIL;
 250
 251     r.location = 0;
 252     r.length = CFStringGetLength(cfstr);
 253     if (r.length != CFStringGetBytes(cfstr, r,
 254                 kCFStringEncodingMacRoman,
 255                 0, /* no lossy conversion */
 256                 0, /* not external representation (since vim
 257                     * handles this internally */
 258                 to, maxtolen, &l))
 259     {
 260         CFRelease(cfstr);
 261         return FAIL;
 262     }
 263     CFRelease(cfstr);
 264     *tolenp = l;
 265     return OK;
 266 }
 267
 268 /*
 269  * Initializes text converters
 270  */
 271     void
 272 mac_conv_init()
 273 {
 274     TextEncoding    utf8_encoding;
 275     TextEncoding    utf8_hfsplus_encoding;
 276     TextEncoding    utf8_canon_encoding;
 277     TextEncoding    utf16_encoding;
 278
 279     utf8_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
 280             kTextEncodingDefaultVariant, kUnicodeUTF8Format);
 281     utf8_hfsplus_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
 282             kUnicodeHFSPlusCompVariant, kUnicodeUTF8Format);
 283     utf8_canon_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
 284             kUnicodeCanonicalCompVariant, kUnicodeUTF8Format);
 285     utf16_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
 286             kTextEncodingDefaultVariant, kUnicode16BitFormat);
 287
 288     if (TECCreateConverter(&gPathConverter, utf8_encoding,
 289                 utf8_hfsplus_encoding) != noErr)
 290         gPathConverter = NULL;
 291
 292     if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
 293                 utf8_canon_encoding) != noErr)
 294     {
 295         /* On pre-10.3, Unicode normalization is not available so
 296          * fall back to non-normalizing converter */
 297         if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
 298                     utf8_encoding) != noErr)
 299             gUTF16ToUTF8Converter = NULL;
 300     }
 301 }
 302
 303 /*
 304  * Destroys text converters
 305  */
 306     void
 307 mac_conv_cleanup()
 308 {
 309     if (gUTF16ToUTF8Converter)
 310     {
 311         TECDisposeConverter(gUTF16ToUTF8Converter);
 312         gUTF16ToUTF8Converter = NULL;
 313     }
 314
 315     if (gPathConverter)
 316     {
 317         TECDisposeConverter(gPathConverter);
 318         gPathConverter = NULL;
 319     }
 320 }
 321
 322 /*
 323  * Conversion from UTF-16 UniChars to 'encoding'
 324  * The function signature uses the real type of UniChar (as typedef'ed in
 325  * CFBase.h) to avoid clashes with X11 header files in the .pro file
 326  */
 327     char_u *
 328 mac_utf16_to_enc(from, fromLen, actualLen)
 329     unsigned short *from;
 330     size_t fromLen;
 331     size_t *actualLen;
 332 {
 333     /* Following code borrows somewhat from os_mswin.c */
 334     vimconv_T   conv;
 335     size_t      utf8_len;
 336     char_u      *utf8_str;
 337     char_u      *result = NULL;
 338
 339     /* Convert to utf-8 first, works better with iconv */
 340     utf8_len = 0;
 341     utf8_str = mac_utf16_to_utf8(from, fromLen, &utf8_len);
 342
 343     if (utf8_str)
 344     {
 345         /* We might be called before we have p_enc set up. */
 346         conv.vc_type = CONV_NONE;
 347
 348         /* If encoding (p_enc) is any unicode, it is actually in utf-8 (vim
 349          * internal unicode is always utf-8) so don't convert in such cases */
 350
 351         if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0)
 352             convert_setup(&conv, (char_u *)"utf-8",
 353                     p_enc? p_enc: (char_u *)"macroman");
 354         if (conv.vc_type == CONV_NONE)
 355         {
 356             /* p_enc is utf-8, so we're done. */
 357             result = utf8_str;
 358         }
 359         else
 360         {
 361             result = string_convert(&conv, utf8_str, (int *)&utf8_len);
 362             vim_free(utf8_str);
 363         }
 364
 365         convert_setup(&conv, NULL, NULL);
 366
 367         if (actualLen)
 368             *actualLen = utf8_len;
 369     }
 370     else if (actualLen)
 371         *actualLen = 0;
 372
 373     return result;
 374 }
 375
 376 /*
 377  * Conversion from 'encoding' to UTF-16 UniChars
 378  * The function return uses the real type of UniChar (as typedef'ed in
 379  * CFBase.h) to avoid clashes with X11 header files in the .pro file
 380  */
 381     unsigned short *
 382 mac_enc_to_utf16(from, fromLen, actualLen)
 383     char_u *from;
 384     size_t fromLen;
 385     size_t *actualLen;
 386 {
 387     /* Following code borrows somewhat from os_mswin.c */
 388     vimconv_T   conv;
 389     size_t      utf8_len;
 390     char_u      *utf8_str;
 391     UniChar     *result = NULL;
 392     Boolean     should_free_utf8 = FALSE;
 393
 394     do
 395     {
 396         /* Use MacRoman by default, we might be called before we have p_enc
 397          * set up.  Convert to utf-8 first, works better with iconv().  Does
 398          * nothing if 'encoding' is "utf-8". */
 399         conv.vc_type = CONV_NONE;
 400         if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0 &&
 401                 convert_setup(&conv, p_enc ? p_enc : (char_u *)"macroman",
 402                     (char_u *)"utf-8") == FAIL)
 403             break;
 404
 405         if (conv.vc_type != CONV_NONE)
 406         {
 407             utf8_len = fromLen;
 408             utf8_str = string_convert(&conv, from, (int *)&utf8_len);
 409             should_free_utf8 = TRUE;
 410         }
 411         else
 412         {
 413             utf8_str = from;
 414             utf8_len = fromLen;
 415         }
 416
 417         if (utf8_str == NULL)
 418             break;
 419
 420         convert_setup(&conv, NULL, NULL);
 421
 422         result = mac_utf8_to_utf16(utf8_str, utf8_len, actualLen);
 423
 424         if (should_free_utf8)
 425             vim_free(utf8_str);
 426         return result;
 427     }
 428     while (0);
 429
 430     if (actualLen)
 431         *actualLen = 0;
 432
 433     return result;
 434 }
 435
 436 /*
 437  * Converts from UTF-16 UniChars to CFString
 438  * The void * return type is actually a CFStringRef
 439  */
 440     void *
 441 mac_enc_to_cfstring(from, fromLen)
 442     char_u  *from;
 443     size_t  fromLen;
 444 {
 445     UniChar     *utf16_str;
 446     size_t      utf16_len;
 447     CFStringRef result = NULL;
 448
 449     utf16_str = mac_enc_to_utf16(from, fromLen, &utf16_len);
 450     if (utf16_str)
 451     {
 452         result = CFStringCreateWithCharacters(NULL, utf16_str, utf16_len/sizeof(UniChar));
 453         vim_free(utf16_str);
 454     }
 455
 456     return (void *)result;
 457 }
 458
 459 /*
 460  * Converts a decomposed HFS+ UTF-8 path to precomposed UTF-8
 461  */
 462     char_u *
 463 mac_precompose_path(decompPath, decompLen, precompLen)
 464     char_u  *decompPath;
 465     size_t  decompLen;
 466     size_t  *precompLen;
 467 {
 468     char_u  *result = NULL;
 469     size_t  actualLen = 0;
 470
 471     if (gPathConverter)
 472     {
 473         result = alloc(decompLen);
 474         if (result)
 475         {
 476             if (TECConvertText(gPathConverter, decompPath,
 477                         decompLen, &decompLen, result,
 478                         decompLen, &actualLen) != noErr)
 479             {
 480                 vim_free(result);
 481                 result = NULL;
 482             }
 483         }
 484     }
 485
 486     if (precompLen)
 487         *precompLen = actualLen;
 488
 489     return result;
 490 }
 491
 492 /*
 493  * Converts from UTF-16 UniChars to precomposed UTF-8
 494  */
 495     static char_u *
 496 mac_utf16_to_utf8(from, fromLen, actualLen)
 497     UniChar *from;
 498     size_t fromLen;
 499     size_t *actualLen;
 500 {
 501     ByteCount           utf8_len;
 502     ByteCount           inputRead;
 503     char_u              *result;
 504
 505     if (gUTF16ToUTF8Converter)
 506     {
 507         result = alloc(fromLen * 6 + 1);
 508         if (result && TECConvertText(gUTF16ToUTF8Converter, (ConstTextPtr)from,
 509                     fromLen, &inputRead, result,
 510                     (fromLen*6+1)*sizeof(char_u), &utf8_len) == noErr)
 511         {
 512             TECFlushText(gUTF16ToUTF8Converter, result, (fromLen*6+1)*sizeof(char_u), &inputRead);
 513             utf8_len += inputRead;
 514         }
 515         else
 516         {
 517             vim_free(result);
 518             result = NULL;
 519         }
 520     }
 521     else
 522     {
 523         result = NULL;
 524     }
 525
 526     if (actualLen)
 527         *actualLen = result ? utf8_len : 0;
 528
 529     return result;
 530 }
 531
 532 /*
 533  * Converts from UTF-8 to UTF-16 UniChars
 534  */
 535     static UniChar *
 536 mac_utf8_to_utf16(from, fromLen, actualLen)
 537     char_u *from;
 538     size_t fromLen;
 539     size_t *actualLen;
 540 {
 541     CFStringRef  utf8_str;
 542     CFRange      convertRange;
 543     UniChar      *result = NULL;
 544
 545     utf8_str = CFStringCreateWithBytes(NULL, from, fromLen,
 546             kCFStringEncodingUTF8, FALSE);
 547
 548     if (utf8_str == NULL) {
 549         if (actualLen)
 550             *actualLen = 0;
 551         return NULL;
 552     }
 553
 554     convertRange = CFRangeMake(0, CFStringGetLength(utf8_str));
 555     result = (UniChar *)alloc(convertRange.length * sizeof(UniChar));
 556
 557     CFStringGetCharacters(utf8_str, convertRange, result);
 558
 559     CFRelease(utf8_str);
 560
 561     if (actualLen)
 562         *actualLen = convertRange.length * sizeof(UniChar);
 563
 564     return result;
 565 }
 566
 567 /*
 568  * Sets LANG environment variable in Vim from Mac locale
 569  */
 570     void
 571 mac_lang_init() {
 572     if (mch_getenv((char_u *)"LANG") == NULL)
 573     {
 574         char    buf[20];
 575         if (LocaleRefGetPartString(NULL,
 576                     kLocaleLanguageMask | kLocaleLanguageVariantMask |
 577                     kLocaleRegionMask | kLocaleRegionVariantMask,
 578                     sizeof buf, buf) == noErr && *buf)
 579         {
 580             vim_setenv((char_u *)"LANG", (char_u *)buf);
 581 #   ifdef HAVE_LOCALE_H
 582             setlocale(LC_ALL, "");
 583 #   endif
 584         }
 585     }
 586 }
 587 #endif /* MACOS_CONVERT */