release/src/router/ntfs-3g/libntfs-3g/unistr.c

   1 /**
   2  * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
   3  *
   4  * Copyright (c) 2000-2004 Anton Altaparmakov
   5  * Copyright (c) 2002-2009 Szabolcs Szakacsits
   6  * Copyright (c) 2008-2009 Jean-Pierre Andre
   7  * Copyright (c) 2008      Bernhard Kaindl
   8  *
   9  * This program/include file is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as published
  11  * by the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program/include file is distributed in the hope that it will be
  15  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  16  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program (in the main directory of the NTFS-3G
  21  * distribution in the file COPYING); if not, write to the Free Software
  22  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  23  */
  24
  25 #ifdef HAVE_CONFIG_H
  26 #include "config.h"
  27 #endif
  28
  29 #ifdef HAVE_STDIO_H
  30 #include <stdio.h>
  31 #endif
  32 #ifdef HAVE_STDLIB_H
  33 #include <stdlib.h>
  34 #endif
  35 #ifdef HAVE_WCHAR_H
  36 #include <wchar.h>
  37 #endif
  38 #ifdef HAVE_STRING_H
  39 #include <string.h>
  40 #endif
  41 #ifdef HAVE_ERRNO_H
  42 #include <errno.h>
  43 #endif
  44 #ifdef HAVE_LOCALE_H
  45 #include <locale.h>
  46 #endif
  47
  48 #if defined(__APPLE__) || defined(__DARWIN__)
  49 #ifdef ENABLE_NFCONV
  50 #include <CoreFoundation/CoreFoundation.h>
  51 #endif /* ENABLE_NFCONV */
  52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
  53
  54 #include "compat.h"
  55 #include "attrib.h"
  56 #include "types.h"
  57 #include "unistr.h"
  58 #include "debug.h"
  59 #include "logging.h"
  60 #include "misc.h"
  61
  62 #define NOREVBOM 0  /* set to 1 to make the converters reject U+FFFE and U+FFFF (JPA, open to debate) */
  63
  64 /*
  65  * IMPORTANT
  66  * =========
  67  *
  68  * All these routines assume that the Unicode characters are in little endian
  69  * encoding inside the strings!!!
  70  */
  71
  72 static int use_utf8 = 1; /* non-zero: ntfs_ucstombs() uses the built-in UTF-16LE to UTF-8 converter */
  73
  74 #if defined(__APPLE__) || defined(__DARWIN__)
  75 #ifdef ENABLE_NFCONV
  76 /**
  77  * This variable controls whether or not automatic normalization form conversion
  78  * should be performed when translating NTFS unicode file names to UTF-8.
  79  * Defaults to on, but can be controlled from the outside using the function
  80  *   int ntfs_macosx_normalize_filenames(int normalize);
  81  */
  82 static int nfconvert_utf8 = 1;
  83 #endif /* ENABLE_NFCONV */
  84 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
  85
  86 /*
  87  * Table meant for the name collation functions to quickly determine what
  88  * characters are (in)valid. It is currently compiled out by the "#if 0".
  89  */
  90 #if 0
  91 static const u8 legal_ansi_char_array[0x40] = {
  92         0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  93         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  94
  95         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  96         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  97
  98         0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
  99         0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
 100
 101         0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
 102         0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
 103 };
 104 #endif
 105
 106 /**
 107  * ntfs_names_are_equal - compare two Unicode names for equality
 108  * @s1:                 name to compare to @s2
 109  * @s1_len:             length in Unicode characters of @s1
 110  * @s2:                 name to compare to @s1
 111  * @s2_len:             length in Unicode characters of @s2
 112  * @ic:                 ignore case bool
 113  * @upcase:             upcase table (only if @ic == IGNORE_CASE)
 114  * @upcase_size:        length in Unicode characters of @upcase (if present)
 115  *
 116  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
 117  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
 118  * the @upcase table is used to perform a case insensitive comparison.
 119  */
 120 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
 121                 const ntfschar *s2, size_t s2_len,
 122                 const IGNORE_CASE_BOOL ic,
 123                 const ntfschar *upcase, const u32 upcase_size)
 124 {
 125         if (s1_len != s2_len)
 126                 return FALSE;
 127         if (!s1_len)
 128                 return TRUE;
 129         if (ic == CASE_SENSITIVE)
 130                 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
 131         return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
 132                                                                        TRUE;
 133 }
 134
 135 /**
 136  * ntfs_names_collate - collate two Unicode names
 137  * @name1:      first Unicode name to compare
 138  * @name1_len:  length of first Unicode name to compare
 139  * @name2:      second Unicode name to compare
 140  * @name2_len:  length of second Unicode name to compare
 141  * @err_val:    if @name1 contains an invalid character return this value
 142  * @ic:         either CASE_SENSITIVE or IGNORE_CASE
 143  * @upcase:     upcase table (ignored if @ic is CASE_SENSITIVE)
 144  * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
 145  *
 146  * ntfs_names_collate() collates two Unicode names and returns:
 147  *
 148  *  -1 if the first name collates before the second one,
 149  *   0 if the names match,
 150  *   1 if the second name collates before the first one, or
 151  * @err_val if an invalid character is found in @name1 during the comparison.
 152  *
 153  * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
 154  *
 155  * A few optimizations made by JPA
 156  */
 157
 158 int ntfs_names_collate(const ntfschar *name1, const u32 name1_len,
 159                 const ntfschar *name2, const u32 name2_len,
 160                 const int err_val __attribute__((unused)),
 161                 const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
 162                 const u32 upcase_len)
 163 {
 164         u32 cnt;
 165         ntfschar c1, c2;
 166
 167 #ifdef DEBUG
 168         if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
 169                 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
 170                 exit(1);
 171         }
 172 #endif
 173         cnt = min(name1_len, name2_len);
 174                 /* JPA average loop count is 8 */
 175         if (cnt > 0) {
 176                 if (ic)
 177                                 /* JPA this loop in 76% cases */
 178                         do {
 179                                 c1 = le16_to_cpu(*name1);
 180                                 name1++;
 181                                 c2 = le16_to_cpu(*name2);
 182                                 name2++;
 183                                 if (c1 < upcase_len)
 184                                         c1 = le16_to_cpu(upcase[c1]);
 185                                 if (c2 < upcase_len)
 186                                         c2 = le16_to_cpu(upcase[c2]);
 187                         } while ((c1 == c2) && --cnt);
 188                 else
 189                         do {
 190                                 /* JPA this loop in 24% cases */
 191                                 c1 = le16_to_cpu(*name1);
 192                                 name1++;
 193                                 c2 = le16_to_cpu(*name2);
 194                                 name2++;
 195                         } while ((c1 == c2) && --cnt);
 196                 if (c1 < c2)
 197                         return -1;
 198                 if (c1 > c2)
 199                         return 1;
 200         }
 201         if (name1_len < name2_len)
 202                 return -1;
 203         if (name1_len == name2_len)
 204                 return 0;
 205         return 1;
 206 }
 207
 208 /**
 209  * ntfs_ucsncmp - compare two little endian Unicode strings
 210  * @s1:         first string
 211  * @s2:         second string
 212  * @n:          maximum unicode characters to compare
 213  *
 214  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 215  * The strings in little endian format and appropriate le16_to_cpu()
 216  * conversion is performed on non-little endian machines.
 217  *
 218  * The function returns an integer less than, equal to, or greater than zero
 219  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 220  * to be less than, to match, or be greater than @s2.
 221  */
 222 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
 223 {
 224         ntfschar c1, c2;
 225         size_t i;
 226
 227 #ifdef DEBUG
 228         if (!s1 || !s2) {
 229                 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
 230                 exit(1);
 231         }
 232 #endif
 233         for (i = 0; i < n; ++i) {
 234                 c1 = le16_to_cpu(s1[i]);
 235                 c2 = le16_to_cpu(s2[i]);
 236                 if (c1 < c2)
 237                         return -1;
 238                 if (c1 > c2)
 239                         return 1;
 240                 if (!c1)
 241                         break;
 242         }
 243         return 0;
 244 }
 245
 246 /**
 247  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
 248  * @s1:                 first string
 249  * @s2:                 second string
 250  * @n:                  maximum unicode characters to compare
 251  * @upcase:             upcase table
 252  * @upcase_size:        upcase table size in Unicode characters
 253  *
 254  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 255  * ignoring case. The strings in little endian format and appropriate
 256  * le16_to_cpu() conversion is performed on non-little endian machines.
 257  *
 258  * Each character is uppercased using the @upcase table before the comparison.
 259  *
 260  * The function returns an integer less than, equal to, or greater than zero
 261  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 262  * to be less than, to match, or be greater than @s2.
 263  */
 264 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
 265                 const ntfschar *upcase, const u32 upcase_size)
 266 {
 267         ntfschar c1, c2;
 268         size_t i;
 269
 270 #ifdef DEBUG
 271         if (!s1 || !s2 || !upcase) {
 272                 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
 273                 exit(1);
 274         }
 275 #endif
 276         for (i = 0; i < n; ++i) {
 277                 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
 278                         c1 = le16_to_cpu(upcase[c1]);
 279                 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
 280                         c2 = le16_to_cpu(upcase[c2]);
 281                 if (c1 < c2)
 282                         return -1;
 283                 if (c1 > c2)
 284                         return 1;
 285                 if (!c1)
 286                         break;
 287         }
 288         return 0;
 289 }
 290
 291 /**
 292  * ntfs_ucsnlen - determine the length of a little endian Unicode string
 293  * @s:          pointer to Unicode string
 294  * @maxlen:     maximum length of string @s
 295  *
 296  * Return the number of Unicode characters in the little endian Unicode
 297  * string @s up to a maximum of maxlen Unicode characters, not including
 298  * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
 299  * and @s + @maxlen, @maxlen is returned.
 300  *
 301  * This function never looks beyond @s + @maxlen.
 302  */
 303 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
 304 {
 305         u32 i;
 306
 307         for (i = 0; i < maxlen; i++) {
 308                 if (!le16_to_cpu(s[i]))
 309                         break;
 310         }
 311         return i;
 312 }
 313
 314 /**
 315  * ntfs_ucsndup - duplicate little endian Unicode string
 316  * @s:          pointer to Unicode string
 317  * @maxlen:     maximum length of string @s
 318  *
 319  * Return a pointer to a new little endian Unicode string which is a duplicate
 320  * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
 321  * and can be freed with free(3).
 322  *
 323  * A maximum of @maxlen Unicode characters are copied and a terminating
 324  * (ntfschar)'\0' little endian Unicode character is added.
 325  *
 326  * This function never looks beyond @s + @maxlen.
 327  *
 328  * Return a pointer to the new little endian Unicode string on success and NULL
 329  * on failure with errno set to the error code.
 330  */
 331 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
 332 {
 333         ntfschar *dst;
 334         u32 len;
 335
 336         len = ntfs_ucsnlen(s, maxlen);
 337         dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
 338         if (dst) {
 339                 memcpy(dst, s, len * sizeof(ntfschar));
 340                 dst[len] = cpu_to_le16(L'\0');
 341         }
 342         return dst;
 343 }
 344
 345 /**
 346  * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
 347  * @name:
 348  * @name_len:
 349  * @upcase:
 350  * @upcase_len:
 351  *
 352  * Description...
 353  *
 354  * Returns:
 355  */
 356 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
 357                 const u32 upcase_len)
 358 {
 359         u32 i;
 360         ntfschar u;
 361
 362         for (i = 0; i < name_len; i++)
 363                 if ((u = le16_to_cpu(name[i])) < upcase_len)
 364                         name[i] = upcase[u];
 365 }
 366
 367 /**
 368  * ntfs_file_value_upcase - Convert a filename to upper case
 369  * @file_name_attr:
 370  * @upcase:
 371  * @upcase_len:
 372  *
 373  * Description...
 374  *
 375  * Returns:
 376  */
 377 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
 378                 const ntfschar *upcase, const u32 upcase_len)
 379 {
 380         ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
 381                         file_name_attr->file_name_length, upcase, upcase_len);
 382 }
 383
 384 /**
 385  * ntfs_file_values_compare - Which of two filenames should be listed first
 386  * @file_name_attr1:
 387  * @file_name_attr2:
 388  * @err_val:
 389  * @ic:
 390  * @upcase:
 391  * @upcase_len:
 392  *
 393  * Description...
 394  *
 395  * Returns:
 396  */
 397 int ntfs_file_values_compare(const FILE_NAME_ATTR *file_name_attr1,
 398                 const FILE_NAME_ATTR *file_name_attr2,
 399                 const int err_val, const IGNORE_CASE_BOOL ic,
 400                 const ntfschar *upcase, const u32 upcase_len)
 401 {
 402         return ntfs_names_collate((ntfschar*)&file_name_attr1->file_name,
 403                         file_name_attr1->file_name_length,
 404                         (ntfschar*)&file_name_attr2->file_name,
 405                         file_name_attr2->file_name_length,
 406                         err_val, ic, upcase, upcase_len);
 407 }
 408
 409 /*
 410    NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
 411    for now]) for path names, but the Unicode code points need to be
 412    converted before a path can be accessed under NTFS. For 7-bit ASCII/ANSI,
 413    glibc does this in a hard-coded fashion even without a locale, which
 414    is easy because the low 7-bit ASCII range appears to be available in
 415    all charsets. It does not convert anything else, however, if there was
 416    an error in the locale setup or if no locale is set up at all, as
 417    when mount is called during early boot, where locales are (by policy)
 418    not used (and may not be available if /usr is not yet mounted).
 419    This patch fixes the resulting issues for systems which use UTF-8;
 420    on other systems, specifying the locale in fstab selects the
 421    encoding which they want.
 422
 423    If no locale is defined or there was a problem with setting one
 424    up, and whenever nl_langinfo(CODESET) returns a string starting with
 425    "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
 426    the bug where NTFS-3G does not show any path names which include
 427    international characters (and also fails to create them).
 428
 429    Author: Bernhard Kaindl <bk@suse.de>
 430    Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
 431 */
 432
 433 /*
 434  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
 435  * null) to store a given UTF-16LE string.
 436  *
 437  * Return -1 with errno set if string has invalid byte sequence or too long.
 438  */
 439 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
 440 {
 441         int i, ret = -1;
 442         int count = 0;
 443         BOOL surrog;
 444
 445         surrog = FALSE;
 446         for (i = 0; i < ins_len && ins[i]; i++) {
 447                 unsigned short c = le16_to_cpu(ins[i]);
 448                 if (surrog) {
 449                         if ((c >= 0xdc00) && (c < 0xe000)) {
 450                                 surrog = FALSE;
 451                                 count += 4;
 452                         } else
 453                                 goto fail;
 454                 } else
 455                         if (c < 0x80)
 456                                 count++;
 457                         else if (c < 0x800)
 458                                 count += 2;
 459                         else if (c < 0xd800)
 460                                 count += 3;
 461                         else if (c < 0xdc00)
 462                                 surrog = TRUE;
 463 #if NOREVBOM
 464                         else if ((c >= 0xe000) && (c < 0xfffe))
 465 #else
 466                         else if (c >= 0xe000)
 467 #endif
 468                                 count += 3;
 469                         else
 470                                 goto fail;
 471                 if (count > outs_len) {
 472                         errno = ENAMETOOLONG;
 473                         goto out;
 474                 }
 475         }
 476         if (surrog)
 477                 goto fail;
 478
 479         ret = count;
 480 out:
 481         return ret;
 482 fail:
 483         errno = EILSEQ;
 484         goto out;
 485 }
 486
 487 /*
 488  * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
 489  * @ins:        input utf16 string buffer
 490  * @ins_len:    length of input string in utf16 characters
 491  * @outs:       on return contains the (allocated) output multibyte string
 492  * @outs_len:   length of output buffer in bytes
 493  *
 494  * Return -1 with errno set if string has invalid byte sequence or too long.
 495  */
 496 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
 497                               char **outs, int outs_len)
 498 {
 499 #if defined(__APPLE__) || defined(__DARWIN__)
 500 #ifdef ENABLE_NFCONV
 501         char *original_outs_value = *outs;
 502         int original_outs_len = outs_len;
 503 #endif /* ENABLE_NFCONV */
 504 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 505
 506         char *t;
 507         int i, size, ret = -1;
 508         ntfschar halfpair;
 509
 510         halfpair = 0;
 511         if (!*outs)
 512                 outs_len = PATH_MAX;
 513
 514         size = utf16_to_utf8_size(ins, ins_len, outs_len);
 515
 516         if (size < 0)
 517                 goto out;
 518
 519         if (!*outs) {
 520                 outs_len = size + 1;
 521                 *outs = ntfs_malloc(outs_len);
 522                 if (!*outs)
 523                         goto out;
 524         }
 525
 526         t = *outs;
 527
 528         for (i = 0; i < ins_len && ins[i]; i++) {
 529             unsigned short c = le16_to_cpu(ins[i]);
 530                         /* size not double-checked */
 531                 if (halfpair) {
 532                         if ((c >= 0xdc00) && (c < 0xe000)) {
 533                                 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
 534                                 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
 535                                 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
 536                                 *t++ = 0x80 + (c & 63);
 537                                 halfpair = 0;
 538                         } else
 539                                 goto fail;
 540                 } else if (c < 0x80) {
 541                         *t++ = c;
 542                 } else {
 543                         if (c < 0x800) {
 544                                 *t++ = (0xc0 | ((c >> 6) & 0x3f));
 545                                 *t++ = 0x80 | (c & 0x3f);
 546                         } else if (c < 0xd800) {
 547                                 *t++ = 0xe0 | (c >> 12);
 548                                 *t++ = 0x80 | ((c >> 6) & 0x3f);
 549                                 *t++ = 0x80 | (c & 0x3f);
 550                         } else if (c < 0xdc00)
 551                                 halfpair = c;
 552                         else if (c >= 0xe000) {
 553                                 *t++ = 0xe0 | (c >> 12);
 554                                 *t++ = 0x80 | ((c >> 6) & 0x3f);
 555                                 *t++ = 0x80 | (c & 0x3f);
 556                         } else
 557                                 goto fail;
 558                 }
 559         }
 560         *t = '\0';
 561
 562 #if defined(__APPLE__) || defined(__DARWIN__)
 563 #ifdef ENABLE_NFCONV
 564         if(nfconvert_utf8 && (t - *outs) > 0) {
 565                 char *new_outs = NULL;
 566                 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
 567                 if(new_outs_len >= 0 && new_outs != NULL) {
 568                         if(original_outs_value != *outs) {
 569                                 // We have allocated outs ourselves.
 570                                 free(*outs);
 571                                 *outs = new_outs;
 572                                 t = *outs + new_outs_len;
 573                         }
 574                         else {
 575                                 // We need to copy new_outs into the fixed outs buffer.
 576                                 memset(*outs, 0, original_outs_len);
 577                                 strncpy(*outs, new_outs, original_outs_len-1);
 578                                 t = *outs + original_outs_len;
 579                                 free(new_outs);
 580                         }
 581                 }
 582                 else {
 583                         ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
 584                         ntfs_log_error("  new_outs=0x%p\n", new_outs);
 585                         ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
 586                 }
 587         }
 588 #endif /* ENABLE_NFCONV */
 589 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 590
 591         ret = t - *outs;
 592 out:
 593         return ret;
 594 fail:
 595         errno = EILSEQ;
 596         goto out;
 597 }
 598
 599 /*
 600  * Return the amount of 16-bit elements in UTF-16LE needed
 601  * (without the terminating null) to store given UTF-8 string.
 602  *
 603  * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
 604  *
 605  * Note: This does not check whether the input sequence is a valid utf8 string,
 606  *       and should be used only in context where such check is made!
 607  */
 608 static int utf8_to_utf16_size(const char *s)
 609 {
 610         int ret = -1;
 611         unsigned int byte;
 612         size_t count = 0;
 613
 614         while ((byte = *((const unsigned char *)s++))) {
 615                 if (++count >= PATH_MAX)
 616                         goto fail;
 617                 if (byte >= 0xF5) {
 618                         errno = EILSEQ;
 619                         goto out;
 620                 }
 621                 if (!*s)
 622                         break;
 623                 if (byte >= 0xC0)
 624                         s++;
 625                 if (!*s)
 626                         break;
 627                 if (byte >= 0xE0)
 628                         s++;
 629                 if (!*s)
 630                         break;
 631                 if (byte >= 0xF0) {
 632                         s++;
 633                         if (++count >= PATH_MAX)
 634                                 goto fail;
 635                 }
 636         }
 637         ret = count;
 638 out:
 639         return ret;
 640 fail:
 641         errno = ENAMETOOLONG;
 642         goto out;
 643 }
 644 /*
 645  * This converts one UTF-8 sequence to cpu-endian Unicode value
 646  * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
 647  *
 648  * Return the number of used utf8 bytes or -1 with errno set
 649  * if sequence is invalid.
 650  */
 651 static int utf8_to_unicode(u32 *wc, const char *s)
 652 {
 653         unsigned int byte = *((const unsigned char *)s);
 654
 655                                         /* single byte */
 656         if (byte == 0) {
 657                 *wc = (u32) 0;
 658                 return 0;
 659         } else if (byte < 0x80) {
 660                 *wc = (u32) byte;
 661                 return 1;
 662                                         /* double byte */
 663         } else if (byte < 0xc2) {
 664                 goto fail;
 665         } else if (byte < 0xE0) {
 666                 if (strlen(s) < 2)
 667                         goto fail;
 668                 if ((s[1] & 0xC0) == 0x80) {
 669                         *wc = ((u32)(byte & 0x1F) << 6)
 670                             | ((u32)(s[1] & 0x3F));
 671                         return 2;
 672                 } else
 673                         goto fail;
 674                                         /* three-byte */
 675         } else if (byte < 0xF0) {
 676                 if (strlen(s) < 3)
 677                         goto fail;
 678                 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
 679                         *wc = ((u32)(byte & 0x0F) << 12)
 680                             | ((u32)(s[1] & 0x3F) << 6)
 681                             | ((u32)(s[2] & 0x3F));
 682                         /* Check valid ranges */
 683 #if NOREVBOM
 684                         if (((*wc >= 0x800) && (*wc <= 0xD7FF))
 685                           || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
 686                                 return 3;
 687 #else
 688                         if (((*wc >= 0x800) && (*wc <= 0xD7FF))
 689                           || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
 690                                 return 3;
 691 #endif
 692                 }
 693                 goto fail;
 694                                         /* four-byte */
 695         } else if (byte < 0xF5) {
 696                 if (strlen(s) < 4)
 697                         goto fail;
 698                 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
 699                   && ((s[3] & 0xC0) == 0x80)) {
 700                         *wc = ((u32)(byte & 0x07) << 18)
 701                             | ((u32)(s[1] & 0x3F) << 12)
 702                             | ((u32)(s[2] & 0x3F) << 6)
 703                             | ((u32)(s[3] & 0x3F));
 704                 /* Check valid ranges */
 705                 if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
 706                         return 4;
 707                 }
 708                 goto fail;
 709         }
 710 fail:
 711         errno = EILSEQ;
 712         return -1;
 713 }
 714
 715 /**
 716  * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
 717  * @ins:        input multibyte string buffer
 718  * @outs:       on return contains the (allocated) output utf16 string
 719  * @outs_len:   length of output buffer in utf16 characters
 720  *
 721  * Return -1 with errno set.
 722  */
 723 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
 724 {
 725 #if defined(__APPLE__) || defined(__DARWIN__)
 726 #ifdef ENABLE_NFCONV
 727         char *new_ins = NULL;
 728         if(nfconvert_utf8) {
 729                 int new_ins_len;
 730                 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
 731                 if(new_ins_len >= 0)
 732                         ins = new_ins;
 733                 else
 734                         ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
 735         }
 736 #endif /* ENABLE_NFCONV */
 737 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 738         const char *t = ins;
 739         u32 wc;
 740         ntfschar *outpos;
 741         int shorts, ret = -1;
 742
 743         shorts = utf8_to_utf16_size(ins);
 744         if (shorts < 0)
 745                 goto fail;
 746
 747         if (!*outs) {
 748                 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
 749                 if (!*outs)
 750                         goto fail;
 751         }
 752
 753         outpos = *outs;
 754
 755         while(1) {
 756                 int m  = utf8_to_unicode(&wc, t);
 757                 if (m < 0)
 758                         goto fail;
 759                 if (wc < 0x10000)
 760                         *outpos++ = cpu_to_le16(wc);
 761                 else {
 762                         wc -= 0x10000;
 763                         *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
 764                         *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
 765                 }
 766                 if (m == 0)
 767                         break;
 768                 t += m;
 769         }
 770
 771         ret = --outpos - *outs;
 772 fail:
 773 #if defined(__APPLE__) || defined(__DARWIN__)
 774 #ifdef ENABLE_NFCONV
 775         if(new_ins != NULL)
 776                 free(new_ins);
 777 #endif /* ENABLE_NFCONV */
 778 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
 779         return ret;
 780 }
 781
 782 /**
 783  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
 784  * @ins:        input Unicode string buffer
 785  * @ins_len:    length of input string in Unicode characters
 786  * @outs:       on return contains the (allocated) output multibyte string
 787  * @outs_len:   length of output buffer in bytes
 788  *
 789  * Convert the input little endian, 2-byte Unicode string @ins, of length
 790  * @ins_len into the multibyte string format dictated by the current locale.
 791  *
 792  * If *@outs is NULL, the function allocates the string and the caller is
 793  * responsible for calling free(*@outs); when finished with it.
 794  *
 795  * On success the function returns the number of bytes written to the output
 796  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
 797  * string buffer was allocated, *@outs is set to it.
 798  *
 799  * On error, -1 is returned, and errno is set to the error code. The following
 800  * error codes can be expected:
 801  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 802  *      EILSEQ          The input string cannot be represented as a multibyte
 803  *                      sequence according to the current locale.
 804  *      ENAMETOOLONG    Destination buffer is too small for input string.
 805  *      ENOMEM          Not enough memory to allocate destination buffer.
 806  */
 807 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
 808                 int outs_len)
 809 {
 810         char *mbs;
 811         wchar_t wc;
 812         int i, o, mbs_len;
 813         int cnt = 0;
 814 #ifdef HAVE_MBSINIT
 815         mbstate_t mbstate;
 816 #endif
 817
 818         if (!ins || !outs) {
 819                 errno = EINVAL;
 820                 return -1;
 821         }
 822         mbs = *outs;
 823         mbs_len = outs_len;
 824         if (mbs && !mbs_len) {
 825                 errno = ENAMETOOLONG;
 826                 return -1;
 827         }
 828         if (use_utf8)
 829                 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
 830         if (!mbs) {
 831                 mbs_len = (ins_len + 1) * MB_CUR_MAX;
 832                 mbs = ntfs_malloc(mbs_len);
 833                 if (!mbs)
 834                         return -1;
 835         }
 836 #ifdef HAVE_MBSINIT
 837         memset(&mbstate, 0, sizeof(mbstate));
 838 #else
 839         wctomb(NULL, 0);
 840 #endif
 841         for (i = o = 0; i < ins_len; i++) {
 842                 /* Reallocate memory if necessary or abort. */
 843                 if ((int)(o + MB_CUR_MAX) > mbs_len) {
 844                         char *tc;
 845                         if (mbs == *outs) {
 846                                 errno = ENAMETOOLONG;
 847                                 return -1;
 848                         }
 849                         tc = ntfs_malloc((mbs_len + 64) & ~63);
 850                         if (!tc)
 851                                 goto err_out;
 852                         memcpy(tc, mbs, mbs_len);
 853                         mbs_len = (mbs_len + 64) & ~63;
 854                         free(mbs);
 855                         mbs = tc;
 856                 }
 857                 /* Convert the LE Unicode character to a CPU wide character. */
 858                 wc = (wchar_t)le16_to_cpu(ins[i]);
 859                 if (!wc)
 860                         break;
 861                 /* Convert the CPU endian wide character to multibyte. */
 862 #ifdef HAVE_MBSINIT
 863                 cnt = wcrtomb(mbs + o, wc, &mbstate);
 864 #else
 865                 cnt = wctomb(mbs + o, wc);
 866 #endif
 867                 if (cnt == -1)
 868                         goto err_out;
 869                 if (cnt <= 0) {
 870                         ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
 871                         errno = EINVAL;
 872                         goto err_out;
 873                 }
 874                 o += cnt;
 875         }
 876 #ifdef HAVE_MBSINIT
 877         /* Make sure we are back in the initial state. */
 878         if (!mbsinit(&mbstate)) {
 879                 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
 880                 errno = EILSEQ;
 881                 goto err_out;
 882         }
 883 #endif
 884         /* Now write the NULL character. */
 885         mbs[o] = '\0';
 886         if (*outs != mbs)
 887                 *outs = mbs;
 888         return o;
 889 err_out:
 890         if (mbs != *outs) {
 891                 int eo = errno;
 892                 free(mbs);
 893                 errno = eo;
 894         }
 895         return -1;
 896 }
 897
 898 /**
 899  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 900  * @ins:        input multibyte string buffer
 901  * @outs:       on return contains the (allocated) output Unicode string
 902  *
 903  * Convert the input multibyte string @ins, from the current locale into the
 904  * corresponding little endian, 2-byte Unicode string.
 905  *
 906  * The function allocates the string and the caller is responsible for calling
 907  * free(*@outs); when finished with it.
 908  *
 909  * On success the function returns the number of Unicode characters written to
 910  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 911  * character.
 912  *
 913  * On error, -1 is returned, and errno is set to the error code. The following
 914  * error codes can be expected:
 915  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 916  *      EILSEQ          The input string cannot be represented as a Unicode
 917  *                      string according to the current locale.
 918  *      ENAMETOOLONG    Destination buffer is too small for input string.
 919  *      ENOMEM          Not enough memory to allocate destination buffer.
 920  */
 921 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
 922 {
 923         ntfschar *ucs;
 924         const char *s;
 925         wchar_t wc;
 926         int i, o, cnt, ins_len, ucs_len, ins_size;
 927 #ifdef HAVE_MBSINIT
 928         mbstate_t mbstate;
 929 #endif
 930
 931         if (!ins || !outs) {
 932                 errno = EINVAL;
 933                 return -1;
 934         }
 935
 936         if (use_utf8)
 937                 return ntfs_utf8_to_utf16(ins, outs);
 938
 939         /* Determine the size of the multi-byte string in bytes. */
 940         ins_size = strlen(ins);
 941         /* Determine the length of the multi-byte string. */
 942         s = ins;
 943 #if defined(HAVE_MBSINIT)
 944         memset(&mbstate, 0, sizeof(mbstate));
 945         ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
 946 #ifdef __CYGWIN32__
 947         if (!ins_len && *ins) {
 948                 /* Older Cygwin had broken mbsrtowcs() implementation. */
 949                 ins_len = strlen(ins);
 950         }
 951 #endif
 952 #elif !defined(DJGPP)
 953         ins_len = mbstowcs(NULL, s, 0);
 954 #else
 955         /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
 956         ins_len = strlen(ins);
 957 #endif
 958         if (ins_len == -1)
 959                 return ins_len;
 960 #ifdef HAVE_MBSINIT
 961         if ((s != ins) || !mbsinit(&mbstate)) {
 962 #else
 963         if (s != ins) {
 964 #endif
 965                 errno = EILSEQ;
 966                 return -1;
 967         }
 968         /* Add the NULL terminator. */
 969         ins_len++;
 970         ucs_len = ins_len;
 971         ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
 972         if (!ucs)
 973                 return -1;
 974 #ifdef HAVE_MBSINIT
 975         memset(&mbstate, 0, sizeof(mbstate));
 976 #else
 977         mbtowc(NULL, NULL, 0);
 978 #endif
 979         for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
 980                 /* Reallocate memory if necessary. */
 981                 if (o >= ucs_len) {
 982                         ntfschar *tc;
 983                         ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
 984                         tc = realloc(ucs, ucs_len);
 985                         if (!tc)
 986                                 goto err_out;
 987                         ucs = tc;
 988                         ucs_len /= sizeof(ntfschar);
 989                 }
 990                 /* Convert the multibyte character to a wide character. */
 991 #ifdef HAVE_MBSINIT
 992                 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
 993 #else
 994                 cnt = mbtowc(&wc, ins + i, ins_size - i);
 995 #endif
 996                 if (!cnt)
 997                         break;
 998                 if (cnt == -1)
 999                         goto err_out;
1000                 if (cnt < -1) {
1001                         ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1002                         errno = EINVAL;
1003                         goto err_out;
1004                 }
1005                 /* Make sure we are not overflowing the NTFS Unicode set. */
1006                 if ((unsigned long)wc >= (unsigned long)(1 <<
1007                                 (8 * sizeof(ntfschar)))) {
1008                         errno = EILSEQ;
1009                         goto err_out;
1010                 }
1011                 /* Convert the CPU wide character to a LE Unicode character. */
1012                 ucs[o] = cpu_to_le16(wc);
1013         }
1014 #ifdef HAVE_MBSINIT
1015         /* Make sure we are back in the initial state. */
1016         if (!mbsinit(&mbstate)) {
1017                 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1018                 errno = EILSEQ;
1019                 goto err_out;
1020         }
1021 #endif
1022         /* Now write the NULL character. */
1023         ucs[o] = cpu_to_le16(L'\0');
1024         *outs = ucs;
1025         return o;
1026 err_out:
1027         free(ucs);
1028         return -1;
1029 }
1030
1031 /**
1032  * ntfs_upcase_table_build - build the default upcase table for NTFS
1033  * @uc:         destination buffer where to store the built table
1034  * @uc_len:     size of destination buffer in bytes
1035  *
1036  * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1037  * stores it in the caller supplied buffer @uc of size @uc_len.
1038  *
1039  * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1040  */
1041 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1042 {
1043         static int uc_run_table[][3] = { /* Start, End, Add */
1044         {0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1045         {0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1046         {0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1047         {0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1048         {0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1049         {0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1050         {0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1051         {0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1052         {0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1053         {0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1054         {0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1055         {0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1056         {0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1057         {0}
1058         };
1059         static int uc_dup_table[][2] = { /* Start, End */
1060         {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1061         {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1062         {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1063         {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1064         {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1065         {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1066         {0}
1067         };
1068         static int uc_byte_table[][2] = { /* Offset, Value */
1069         {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1070         {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1071         {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1072         {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1073         {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1074         {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1075         {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1076         {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1077         {0}
1078         };
1079         int i, r;
1080         int k, off;
1081
1082         memset((char*)uc, 0, uc_len);
1083         uc_len >>= 1;
1084         if (uc_len > 65536)
1085                 uc_len = 65536;
1086         for (i = 0; (u32)i < uc_len; i++)
1087                 uc[i] = cpu_to_le16(i);
1088         for (r = 0; uc_run_table[r][0]; r++) {
1089                 off = uc_run_table[r][2];
1090                 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1091                         uc[i] = cpu_to_le16(i + off);
1092         }
1093         for (r = 0; uc_dup_table[r][0]; r++)
1094                 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1095                         uc[i + 1] = cpu_to_le16(i);
1096         for (r = 0; uc_byte_table[r][0]; r++) {
1097                 k = uc_byte_table[r][1];
1098                 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1099         }
1100 }
1101
1102 /**
1103  * ntfs_str2ucs - convert a string to a valid NTFS file name
1104  * @s:          input string
1105  * @len:        length of output buffer in Unicode characters
1106  *
1107  * Convert the input @s string into the corresponding little endian,
1108  * 2-byte Unicode string. The length of the converted string is less
1109  * or equal to the maximum length allowed by the NTFS format (255).
1110  *
1111  * If @s is NULL then return AT_UNNAMED.
1112  *
1113  * On success the function returns the Unicode string in an allocated
1114  * buffer and the caller is responsible to free it when it's not needed
1115  * anymore.
1116  *
1117  * On error NULL is returned and errno is set to the error code.
1118  */
1119 ntfschar *ntfs_str2ucs(const char *s, int *len)
1120 {
1121         ntfschar *ucs = NULL;
1122
1123         if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1124                 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1125                 return NULL;
1126         }
1127         if (*len > NTFS_MAX_NAME_LEN) {
1128                 free(ucs);
1129                 errno = ENAMETOOLONG;
1130                 return NULL;
1131         }
1132         if (!ucs || !*len) {
1133                 ucs  = AT_UNNAMED;
1134                 *len = 0;
1135         }
1136         return ucs;
1137 }
1138
1139 /**
1140  * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1141  * @ucs         input string to be freed
1142  *
1143  * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1144  *
1145  * Return value: none.
1146  */
1147 void ntfs_ucsfree(ntfschar *ucs)
1148 {
1149         if (ucs && (ucs != AT_UNNAMED))
1150                 free(ucs);
1151 }
1152
1153 /*
1154  *              Check whether a name contains no chars forbidden
1155  *      for DOS or Win32 use
1156  *
1157  *      If there is a bad char, errno is set to EINVAL
1158  */
1159
1160 BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1161 {
1162         BOOL forbidden;
1163         int ch;
1164         int i;
1165         u32 mainset =     (1L << ('\"' - 0x20))
1166                         | (1L << ('*' - 0x20))
1167                         | (1L << ('/' - 0x20))
1168                         | (1L << (':' - 0x20))
1169                         | (1L << ('<' - 0x20))
1170                         | (1L << ('>' - 0x20))
1171                         | (1L << ('?' - 0x20));
1172
1173         forbidden = (len == 0) || (le16_to_cpu(name[len-1]) == ' ');
1174         for (i=0; i<len; i++) {
1175                 ch = le16_to_cpu(name[i]);
1176                 if ((ch < 0x20)
1177                     || ((ch < 0x40)
1178                         && ((1L << (ch - 0x20)) & mainset))
1179                     || (ch == '\\')
1180                     || (ch == '|'))
1181                         forbidden = TRUE;
1182         }
1183         if (forbidden)
1184                 errno = EINVAL;
1185         return (forbidden);
1186 }
1187
1188 /*
1189  *              Check whether the same name can be used as a DOS and
1190  *      a Win32 name
1191  *
1192  *      The names must be the same, or the short name the uppercase
1193  *      variant of the long name
1194  */
1195
1196 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1197                         const ntfschar *shortname, int shortlen,
1198                         const ntfschar *longname, int longlen)
1199 {
1200         BOOL collapsible;
1201         unsigned int ch;
1202         int i;
1203
1204         collapsible = shortlen == longlen;
1205         if (collapsible)
1206                 for (i=0; i<shortlen; i++) {
1207                         ch = le16_to_cpu(longname[i]);
1208                         if ((ch >= vol->upcase_len)
1209                          || ((shortname[i] != longname[i])
1210                                 && (shortname[i] != vol->upcase[ch])))
1211                                         collapsible = FALSE;
1212         }
1213         return (collapsible);
1214 }
1215
1216 /*
1217  * Define the character encoding to be used.
1218  * Use UTF-8 unless specified otherwise.
1219  */
1220
1221 int ntfs_set_char_encoding(const char *locale)
1222 {
1223         use_utf8 = 0;
1224         if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1225             || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1226                 use_utf8 = 1;
1227         else
1228                 if (setlocale(LC_ALL, locale))
1229                         use_utf8 = 0;
1230                 else {
1231                         ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1232                         use_utf8 = 1;
1233                 }
1234         return 0; /* always successful */
1235 }
1236
1237 #if defined(__APPLE__) || defined(__DARWIN__)
1238
/*
 * Enable (1) or disable (0) the normalization of file names.
 *
 * Returns 0 if the setting was recorded, or -1 if @normalize is neither
 * 0 nor 1 or if the library was built without ENABLE_NFCONV.
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	switch (normalize) {
	case 0:
	case 1:
		nfconvert_utf8 = normalize;
		return 0;
	default:
		return -1;
	}
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1251
/*
 * Normalize a UTF-8 string to the composed (@composed != 0) or the
 * decomposed (@composed == 0) Unicode normalization form.
 *
 * On success a newly allocated, '\0'-terminated UTF-8 string is stored in
 * *@target (to be released by the caller with free()) and its length in
 * bytes, not counting the terminator, is returned.
 *
 * On error *@target is left untouched and a negative value is returned :
 *	-1 if the conversion back to UTF-8 or the allocation failed, or if
 *	   the library was built without ENABLE_NFCONV
 *	-2 if @utf8_string could not be turned into a CFString
 *	-3 if the mutable copy of the string could not be created
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
 int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buflen = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Normalization is done in place, so a mutable copy is needed. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source); /* The source is not needed any more. */
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	if (composed != 0)
		CFStringNormalize(work, kCFStringNormalizationFormC);
	else
		CFStringNormalize(work, kCFStringNormalizationFormD);

	/*
	 * Convert back to UTF-8 in two passes : the first one only
	 * computes the needed size, the second one fills the buffer.
	 */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0)
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
	else {
		/* One more byte for the terminating '\0'. */
		buflen = sizeof(char)*(needed + 1);
		buf = ntfs_calloc(buflen);
		if (buf == NULL)
			ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", buflen);
		else if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
				0, false, (UInt8*)buf, buflen-1, &needed) <= 0) {
			ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
			free(buf);
			buf = NULL;
		}
	}

	CFRelease(work);

	if (buf == NULL)
		return -1;
	*target = buf;
	return buflen - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1314 #endif /* defined(__APPLE__) || defined(__DARWIN__) */