fs/udf/unicode.c

   1 /*
   2  * unicode.c
   3  *
   4  * PURPOSE
   5  *      Routines for converting between UTF-8 and OSTA Compressed Unicode.
   6  *      Also handles filename mangling
   7  *
   8  * DESCRIPTION
   9  *      OSTA Compressed Unicode is explained in the OSTA UDF specification.
  10  *              http://www.osta.org/
   11  *      UTF-8 is explained in IETF RFC 3629 (which obsoletes RFC 2279).
   12  *              https://www.rfc-editor.org/rfc/rfc3629
  13  *
  14  * CONTACTS
  15  *      E-mail regarding any portion of the Linux UDF file system should be
  16  *      directed to the development team's mailing list (run by majordomo):
  17  *              linux_udf@hpesjro.fc.hp.com
  18  *
  19  * COPYRIGHT
  20  *      This file is distributed under the terms of the GNU General Public
  21  *      License (GPL). Copies of the GPL can be obtained from:
  22  *              ftp://prep.ai.mit.edu/pub/gnu/GPL
  23  *      Each contributing author retains all rights to their own work.
  24  */
  25
  26 #include "udfdecl.h"
  27
  28 #include <linux/kernel.h>
  29 #include <linux/string.h>       /* for memset */
  30 #include <linux/nls.h>
  31 #include <linux/udf_fs.h>
  32
  33 #include "udf_sb.h"
  34
  35 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
  36
  37 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
  38 {
  39         if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) )
  40                 return 0;
  41         memset(dest, 0, sizeof(struct ustr));
  42         memcpy(dest->u_name, src, strlen);
  43         dest->u_cmpID = 0x08;
  44         dest->u_len = strlen;
  45         return strlen;
  46 }
  47
  48 /*
  49  * udf_build_ustr
  50  */
  51 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
  52 {
  53         int usesize;
  54
  55         if ( (!dest) || (!ptr) || (!size) )
  56                 return -1;
  57
  58         memset(dest, 0, sizeof(struct ustr));
  59         usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
  60         dest->u_cmpID=ptr[0];
  61         dest->u_len=ptr[size-1];
  62         memcpy(dest->u_name, ptr+1, usesize-1);
  63         return 0;
  64 }
  65
  66 /*
  67  * udf_build_ustr_exact
  68  */
  69 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  70 {
  71         if ( (!dest) || (!ptr) || (!exactsize) )
  72                 return -1;
  73
  74         memset(dest, 0, sizeof(struct ustr));
  75         dest->u_cmpID=ptr[0];
  76         dest->u_len=exactsize-1;
  77         memcpy(dest->u_name, ptr+1, exactsize-1);
  78         return 0;
  79 }
  80
  81 /*
  82  * udf_ocu_to_utf8
  83  *
  84  * PURPOSE
  85  *      Convert OSTA Compressed Unicode to the UTF-8 equivalent.
  86  *
  87  * DESCRIPTION
  88  *      This routine is only called by udf_filldir().
  89  *
  90  * PRE-CONDITIONS
  91  *      utf                     Pointer to UTF-8 output buffer.
  92  *      ocu                     Pointer to OSTA Compressed Unicode input buffer
  93  *                              of size UDF_NAME_LEN bytes.
  94  *                              both of type "struct ustr *"
  95  *
  96  * POST-CONDITIONS
  97  *      <return>                Zero on success.
  98  *
  99  * HISTORY
 100  *      November 12, 1997 - Andrew E. Mileski
 101  *      Written, tested, and released.
 102  */
 103 int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
 104 {
 105         uint8_t *ocu;
 106         uint32_t c;
 107         uint8_t cmp_id, ocu_len;
 108         int i;
 109
 110         ocu = ocu_i->u_name;
 111
 112         ocu_len = ocu_i->u_len;
 113         cmp_id = ocu_i->u_cmpID;
 114         utf_o->u_len = 0;
 115
 116         if (ocu_len == 0)
 117         {
 118                 memset(utf_o, 0, sizeof(struct ustr));
 119                 utf_o->u_cmpID = 0;
 120                 utf_o->u_len = 0;
 121                 return 0;
 122         }
 123
 124         if ((cmp_id != 8) && (cmp_id != 16))
 125         {
 126                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
 127                 return 0;
 128         }
 129
 130         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
 131         {
 132
 133                 /* Expand OSTA compressed Unicode to Unicode */
 134                 c = ocu[i++];
 135                 if (cmp_id == 16)
 136                         c = (c << 8) | ocu[i++];
 137
 138                 /* Compress Unicode to UTF-8 */
 139                 if (c < 0x80U)
 140                         utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
 141                 else if (c < 0x800U)
 142                 {
 143                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6));
 144                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
 145                 }
 146                 else
 147                 {
 148                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12));
 149                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f));
 150                         utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f));
 151                 }
 152         }
 153         utf_o->u_cmpID=8;
 154
 155         return utf_o->u_len;
 156 }
 157
 158 /*
 159  *
 160  * udf_utf8_to_ocu
 161  *
 162  * PURPOSE
 163  *      Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 164  *
 165  * DESCRIPTION
 166  *      This routine is only called by udf_lookup().
 167  *
 168  * PRE-CONDITIONS
 169  *      ocu                     Pointer to OSTA Compressed Unicode output
 170  *                              buffer of size UDF_NAME_LEN bytes.
 171  *      utf                     Pointer to UTF-8 input buffer.
 172  *      utf_len                 Length of UTF-8 input buffer in bytes.
 173  *
 174  * POST-CONDITIONS
 175  *      <return>                Zero on success.
 176  *
 177  * HISTORY
 178  *      November 12, 1997 - Andrew E. Mileski
 179  *      Written, tested, and released.
 180  */
 181 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
 182 {
 183         unsigned c, i, max_val, utf_char;
 184         int utf_cnt, u_len;
 185
 186         memset(ocu, 0, sizeof(dstring) * length);
 187         ocu[0] = 8;
 188         max_val = 0xffU;
 189
 190 try_again:
 191         u_len = 0U;
 192         utf_char = 0U;
 193         utf_cnt = 0U;
 194         for (i = 0U; i < utf->u_len; i++)
 195         {
 196                 c = (uint8_t)utf->u_name[i];
 197
 198                 /* Complete a multi-byte UTF-8 character */
 199                 if (utf_cnt)
 200                 {
 201                         utf_char = (utf_char << 6) | (c & 0x3fU);
 202                         if (--utf_cnt)
 203                                 continue;
 204                 }
 205                 else
 206                 {
 207                         /* Check for a multi-byte UTF-8 character */
 208                         if (c & 0x80U)
 209                         {
 210                                 /* Start a multi-byte UTF-8 character */
 211                                 if ((c & 0xe0U) == 0xc0U)
 212                                 {
 213                                         utf_char = c & 0x1fU;
 214                                         utf_cnt = 1;
 215                                 }
 216                                 else if ((c & 0xf0U) == 0xe0U)
 217                                 {
 218                                         utf_char = c & 0x0fU;
 219                                         utf_cnt = 2;
 220                                 }
 221                                 else if ((c & 0xf8U) == 0xf0U)
 222                                 {
 223                                         utf_char = c & 0x07U;
 224                                         utf_cnt = 3;
 225                                 }
 226                                 else if ((c & 0xfcU) == 0xf8U)
 227                                 {
 228                                         utf_char = c & 0x03U;
 229                                         utf_cnt = 4;
 230                                 }
 231                                 else if ((c & 0xfeU) == 0xfcU)
 232                                 {
 233                                         utf_char = c & 0x01U;
 234                                         utf_cnt = 5;
 235                                 }
 236                                 else
 237                                         goto error_out;
 238                                 continue;
 239                         } else
 240                                 /* Single byte UTF-8 character (most common) */
 241                                 utf_char = c;
 242                 }
 243
 244                 /* Choose no compression if necessary */
 245                 if (utf_char > max_val)
 246                 {
 247                         if ( 0xffU == max_val )
 248                         {
 249                                 max_val = 0xffffU;
 250                                 ocu[0] = (uint8_t)0x10U;
 251                                 goto try_again;
 252                         }
 253                         goto error_out;
 254                 }
 255
 256                 if (max_val == 0xffffU)
 257                 {
 258                         ocu[++u_len] = (uint8_t)(utf_char >> 8);
 259                 }
 260                 ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
 261         }
 262
 263
 264         if (utf_cnt)
 265         {
 266 error_out:
 267                 ocu[++u_len] = '?';
 268                 printk(KERN_DEBUG "udf: bad UTF-8 character\n");
 269         }
 270
 271         ocu[length - 1] = (uint8_t)u_len + 1;
 272         return u_len + 1;
 273 }
 274
 275 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i)
 276 {
 277         uint8_t *ocu;
 278         uint32_t c;
 279         uint8_t cmp_id, ocu_len;
 280         int i;
 281
 282         ocu = ocu_i->u_name;
 283
 284         ocu_len = ocu_i->u_len;
 285         cmp_id = ocu_i->u_cmpID;
 286         utf_o->u_len = 0;
 287
 288         if (ocu_len == 0)
 289         {
 290                 memset(utf_o, 0, sizeof(struct ustr));
 291                 utf_o->u_cmpID = 0;
 292                 utf_o->u_len = 0;
 293                 return 0;
 294         }
 295
 296         if ((cmp_id != 8) && (cmp_id != 16))
 297         {
 298                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name);
 299                 return 0;
 300         }
 301
 302         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;)
 303         {
 304                 /* Expand OSTA compressed Unicode to Unicode */
 305                 c = ocu[i++];
 306                 if (cmp_id == 16)
 307                         c = (c << 8) | ocu[i++];
 308
 309                 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
 310                         UDF_NAME_LEN - utf_o->u_len);
 311         }
 312         utf_o->u_cmpID=8;
 313
 314         return utf_o->u_len;
 315 }
 316
 317 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length)
 318 {
 319         unsigned len, i, max_val;
 320         uint16_t uni_char;
 321         int u_len;
 322
 323         memset(ocu, 0, sizeof(dstring) * length);
 324         ocu[0] = 8;
 325         max_val = 0xffU;
 326
 327 try_again:
 328         u_len = 0U;
 329         for (i = 0U; i < uni->u_len; i++)
 330         {
 331                 len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char);
 332                 if (len <= 0)
 333                         continue;
 334
 335                 if (uni_char > max_val)
 336                 {
 337                         max_val = 0xffffU;
 338                         ocu[0] = (uint8_t)0x10U;
 339                         goto try_again;
 340                 }
 341
 342                 if (max_val == 0xffffU)
 343                         ocu[++u_len] = (uint8_t)(uni_char >> 8);
 344                 ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
 345                 i += len - 1;
 346         }
 347
 348         ocu[length - 1] = (uint8_t)u_len + 1;
 349         return u_len + 1;
 350 }
 351
 352 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen)
 353 {
 354         struct ustr filename, unifilename;
 355         int len;
 356
 357         if (udf_build_ustr_exact(&unifilename, sname, flen))
 358         {
 359                 return 0;
 360         }
 361
 362         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
 363         {
 364                 if (!udf_CS0toUTF8(&filename, &unifilename) )
 365                 {
 366                         udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
 367                         return 0;
 368                 }
 369         }
 370         else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 371         {
 372                 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) )
 373                 {
 374                         udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
 375                         return 0;
 376                 }
 377         }
 378         else
 379                 return 0;
 380
 381         if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
 382                 unifilename.u_name, unifilename.u_len)))
 383         {
 384                 return len;
 385         }
 386         return 0;
 387 }
 388
 389 int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen)
 390 {
 391         struct ustr unifilename;
 392         int namelen;
 393
 394         if ( !(udf_char_to_ustr(&unifilename, sname, flen)) )
 395         {
 396                 return 0;
 397         }
 398
 399         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
 400         {
 401                 if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) )
 402                 {
 403                         return 0;
 404                 }
 405         }
 406         else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 407         {
 408                 if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) )
 409                 {
 410                         return 0;
 411                 }
 412         }
 413         else
 414                 return 0;
 415
 416         return namelen;
 417 }
 418
 419 #define ILLEGAL_CHAR_MARK       '_'
 420 #define EXT_MARK                        '.'
 421 #define CRC_MARK                        '#'
 422 #define EXT_SIZE                        5
 423
 424 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
 425 {
 426         int index, newIndex = 0, needsCRC = 0;
 427         int extIndex = 0, newExtIndex = 0, hasExt = 0;
 428         unsigned short valueCRC;
 429         uint8_t curr;
 430         const uint8_t hexChar[] = "0123456789ABCDEF";
 431
 432         if (udfName[0] == '.' && (udfLen == 1 ||
 433                 (udfLen == 2 && udfName[1] == '.')))
 434         {
 435                 needsCRC = 1;
 436                 newIndex = udfLen;
 437                 memcpy(newName, udfName, udfLen);
 438         }
 439         else
 440         {
 441                 for (index = 0; index < udfLen; index++)
 442                 {
 443                         curr = udfName[index];
 444                         if (curr == '/' || curr == 0)
 445                         {
 446                                 needsCRC = 1;
 447                                 curr = ILLEGAL_CHAR_MARK;
 448                                 while (index+1 < udfLen && (udfName[index+1] == '/' ||
 449                                         udfName[index+1] == 0))
 450                                         index++;
 451                         }
 452                         if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE)
 453                         {
 454                                 if (udfLen == index + 1)
 455                                         hasExt = 0;
 456                                 else
 457                                 {
 458                                         hasExt = 1;
 459                                         extIndex = index;
 460                                         newExtIndex = newIndex;
 461                                 }
 462                         }
 463                         if (newIndex < 256)
 464                                 newName[newIndex++] = curr;
 465                         else
 466                                 needsCRC = 1;
 467                 }
 468         }
 469         if (needsCRC)
 470         {
 471                 uint8_t ext[EXT_SIZE];
 472                 int localExtIndex = 0;
 473
 474                 if (hasExt)
 475                 {
 476                         int maxFilenameLen;
 477                         for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen;
 478                                 index++ )
 479                         {
 480                                 curr = udfName[extIndex + index + 1];
 481
 482                                 if (curr == '/' || curr == 0)
 483                                 {
 484                                         needsCRC = 1;
 485                                         curr = ILLEGAL_CHAR_MARK;
 486                                         while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE
 487                                                 && (udfName[extIndex + index + 2] == '/' ||
 488                                                         udfName[extIndex + index + 2] == 0)))
 489                                                 index++;
 490                                 }
 491                                 ext[localExtIndex++] = curr;
 492                         }
 493                         maxFilenameLen = 250 - localExtIndex;
 494                         if (newIndex > maxFilenameLen)
 495                                 newIndex = maxFilenameLen;
 496                         else
 497                                 newIndex = newExtIndex;
 498                 }
 499                 else if (newIndex > 250)
 500                         newIndex = 250;
 501                 newName[newIndex++] = CRC_MARK;
 502                 valueCRC = udf_crc(fidName, fidNameLen, 0);
 503                 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
 504                 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
 505                 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
 506                 newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
 507
 508                 if (hasExt)
 509                 {
 510                         newName[newIndex++] = EXT_MARK;
 511                         for (index = 0;index < localExtIndex ;index++ )
 512                                 newName[newIndex++] = ext[index];
 513                 }
 514         }
 515         return newIndex;
 516 }