lib/util/charset/codepoints.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Character set conversion Extensions
   4    Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
   5    Copyright (C) Andrew Tridgell 2001
   6    Copyright (C) Simo Sorce 2001
   7    Copyright (C) Jelmer Vernooij 2007
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 */
  23 #include "includes.h"
  24 #include "lib/util/charset/charset.h"
  25 #include "system/locale.h"
  26 #include "dynconfig.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  * @file
  34  * @brief Unicode string manipulation
  35  */
  36
/* these 2 tables define the unicode case handling.  They are loaded
   at startup either via mmap() or read() from the lib directory.

   Each pointer is in one of three states, as used by the functions in
   this file:
     NULL        - no load has been attempted yet
     (void *)-1  - loading failed; callers fall back to ASCII-only rules
     other       - points at the loaded table data */
static void *upcase_table;
static void *lowcase_table;
  41
  42
  43 /*******************************************************************
  44 load the case handling tables
  45
  46 This is the function that should be called from library code.
  47 ********************************************************************/
  48 void load_case_tables_library(void)
  49 {
  50         TALLOC_CTX *mem_ctx;
  51
  52         mem_ctx = talloc_init("load_case_tables");
  53         if (!mem_ctx) {
  54                 smb_panic("No memory for case_tables");
  55         }
  56         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  57         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  58         talloc_free(mem_ctx);
  59         if (upcase_table == NULL) {
  60                 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  61                 upcase_table = (void *)-1;
  62         }
  63         if (lowcase_table == NULL) {
  64                 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  65                 lowcase_table = (void *)-1;
  66         }
  67 }
  68
  69 /*******************************************************************
  70 load the case handling tables
  71
  72 This MUST only be called from main() in application code, never from a
  73 library.  We don't know if the calling program has already done
  74 setlocale() to another value, and can't tell if they have.
  75 ********************************************************************/
  76 void load_case_tables(void)
  77 {
  78         /* This is a useful global hook where we can ensure that the
  79          * locale is set from the environment.  This is needed so that
  80          * we can use LOCALE as a codepage */
  81 #ifdef HAVE_SETLOCALE
  82         setlocale(LC_ALL, "");
  83 #endif
  84         load_case_tables_library();
  85 }
  86
  87 /**
  88  Convert a codepoint_t to upper case.
  89 **/
  90 _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return toupper(val);
  94         }
  95         if (upcase_table == NULL) {
  96                 load_case_tables_library();
  97         }
  98         if (upcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(upcase_table, val*2);
 105 }
 106
 107 /**
 108  Convert a codepoint_t to lower case.
 109 **/
 110 _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
 111 {
 112         if (val < 128) {
 113                 return tolower(val);
 114         }
 115         if (lowcase_table == NULL) {
 116                 load_case_tables_library();
 117         }
 118         if (lowcase_table == (void *)-1) {
 119                 return val;
 120         }
 121         if (val & 0xFFFF0000) {
 122                 return val;
 123         }
 124         return SVAL(lowcase_table, val*2);
 125 }
 126
 127 /**
 128  If we upper cased this character, would we get the same character?
 129 **/
 130 _PUBLIC_ bool islower_m(codepoint_t val)
 131 {
 132         return (toupper_m(val) != val);
 133 }
 134
 135 /**
 136  If we lower cased this character, would we get the same character?
 137 **/
 138 _PUBLIC_ bool isupper_m(codepoint_t val)
 139 {
 140         return (tolower_m(val) != val);
 141 }
 142
 143 /**
 144   compare two codepoints case insensitively
 145 */
 146 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 147 {
 148         if (c1 == c2 ||
 149             toupper_m(c1) == toupper_m(c2)) {
 150                 return 0;
 151         }
 152         return c1 - c2;
 153 }
 154
 155
/*
 * Bundle of charset names plus a cache of iconv handles for converting
 * between them.
 */
struct smb_iconv_convenience {
        /* talloc parent of the charset name strings below, so they can be
           freed without freeing the structure itself */
        TALLOC_CTX *child_ctx;
        /* charset names reported by charset_name() for CH_UNIX, CH_DOS
           and CH_DISPLAY respectively */
        const char *unix_charset;
        const char *dos_charset;
        const char *display_charset;
        /* passed through to smb_iconv_open_ex() when a handle is opened */
        bool native_iconv;
        /* handle cache indexed as [from][to]: NULL means not opened yet,
           (smb_iconv_t)-1 means an open attempt failed */
        smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
};
 164
 165 struct smb_iconv_convenience *global_iconv_convenience = NULL;
 166
 167 struct smb_iconv_convenience *get_iconv_convenience(void)
 168 {
 169         if (global_iconv_convenience == NULL)
 170                 global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(),
 171                                                                         "ASCII", "UTF-8", "ASCII", true, NULL);
 172         return global_iconv_convenience;
 173 }
 174
 175 /**
 176  * Return the name of a charset to give to iconv().
 177  **/
 178 const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
 179 {
 180         switch (ch) {
 181         case CH_UTF16: return "UTF-16LE";
 182         case CH_UNIX: return ic->unix_charset;
 183         case CH_DOS: return ic->dos_charset;
 184         case CH_DISPLAY: return ic->display_charset;
 185         case CH_UTF8: return "UTF8";
 186         case CH_UTF16BE: return "UTF-16BE";
 187         case CH_UTF16MUNGED: return "UTF16_MUNGED";
 188         default:
 189         return "ASCII";
 190         }
 191 }
 192
 193 /**
 194  re-initialize iconv conversion descriptors
 195 **/
 196 static int close_iconv_convenience(struct smb_iconv_convenience *data)
 197 {
 198         unsigned c1, c2;
 199         for (c1=0;c1<NUM_CHARSETS;c1++) {
 200                 for (c2=0;c2<NUM_CHARSETS;c2++) {
 201                         if (data->conv_handles[c1][c2] != NULL) {
 202                                 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
 203                                         smb_iconv_close(data->conv_handles[c1][c2]);
 204                                 }
 205                                 data->conv_handles[c1][c2] = NULL;
 206                         }
 207                 }
 208         }
 209
 210         return 0;
 211 }
 212
 213 static const char *map_locale(const char *charset)
 214 {
 215         if (strcmp(charset, "LOCALE") != 0) {
 216                 return charset;
 217         }
 218 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
 219         {
 220                 const char *ln;
 221                 smb_iconv_t handle;
 222
 223                 ln = nl_langinfo(CODESET);
 224                 if (ln == NULL) {
 225                         DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
 226                         return "ASCII";
 227                 }
 228                 /* Check whether the charset name is supported
 229                    by iconv */
 230                 handle = smb_iconv_open(ln, "UCS-2LE");
 231                 if (handle == (smb_iconv_t) -1) {
 232                         DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
 233                         return "ASCII";
 234                 } else {
 235                         DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
 236                         smb_iconv_close(handle);
 237                 }
 238                 return ln;
 239         }
 240 #endif
 241         return "ASCII";
 242 }
 243
 244 /*
 245   the old_ic is passed in here as the smb_iconv_convenience structure
 246   is used as a global pointer in some places (eg. python modules). We
 247   don't want to invalidate those global pointers, but we do want to
 248   update them with the right charset information when loadparm
 249   runs. To do that we need to re-use the structure pointer, but
 250   re-fill the elements in the structure with the updated values
 251  */
 252 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx,
 253                                                                     const char *dos_charset,
 254                                                                     const char *unix_charset,
 255                                                                     const char *display_charset,
 256                                                                     bool native_iconv,
 257                                                                     struct smb_iconv_convenience *old_ic)
 258 {
 259         struct smb_iconv_convenience *ret;
 260
 261         display_charset = map_locale(display_charset);
 262
 263         if (old_ic != NULL) {
 264                 ret = old_ic;
 265                 close_iconv_convenience(ret);
 266                 talloc_free(ret->child_ctx);
 267                 ZERO_STRUCTP(ret);
 268         } else {
 269                 ret = talloc_zero(mem_ctx, struct smb_iconv_convenience);
 270         }
 271         if (ret == NULL) {
 272                 return NULL;
 273         }
 274
 275         /* we use a child context to allow us to free all ptrs without
 276            freeing the structure itself */
 277         ret->child_ctx = talloc_new(ret);
 278         if (ret->child_ctx == NULL) {
 279                 return NULL;
 280         }
 281
 282         talloc_set_destructor(ret, close_iconv_convenience);
 283
 284         ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
 285         ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
 286         ret->display_charset = talloc_strdup(ret->child_ctx, display_charset);
 287         ret->native_iconv = native_iconv;
 288
 289         return ret;
 290 }
 291
 292 /*
 293   on-demand initialisation of conversion handles
 294 */
 295 smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
 296                             charset_t from, charset_t to)
 297 {
 298         const char *n1, *n2;
 299         static bool initialised;
 300
 301         if (initialised == false) {
 302                 initialised = true;
 303         }
 304
 305         if (ic->conv_handles[from][to]) {
 306                 return ic->conv_handles[from][to];
 307         }
 308
 309         n1 = charset_name(ic, from);
 310         n2 = charset_name(ic, to);
 311
 312         ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
 313                                                        ic->native_iconv);
 314
 315         if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
 316                 if ((from == CH_DOS || to == CH_DOS) &&
 317                     strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
 318                         DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
 319                                  charset_name(ic, CH_DOS)));
 320                         ic->dos_charset = "ASCII";
 321
 322                         n1 = charset_name(ic, from);
 323                         n2 = charset_name(ic, to);
 324
 325                         ic->conv_handles[from][to] =
 326                                 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
 327                 }
 328         }
 329
 330         return ic->conv_handles[from][to];
 331 }
 332
/**
 * Return the unicode codepoint for the next character in the input
 * string in the given src_charset.
 * The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
 *
 * Also return the number of bytes consumed (which tells the caller
 * how many bytes to skip to get to the next src_charset-character).
 *
 * This is implemented (in the non-ascii-case) by first converting the
 * next character in the input string to UTF16_LE and then calculating
 * the unicode codepoint from that.
 *
 * Return INVALID_CODEPOINT if the next character cannot be converted.
 * When no conversion handle is available, or no bytes could be
 * converted, *bytes_consumed is set to 1.
 */
_PUBLIC_ codepoint_t next_codepoint_convenience_ext(
                        struct smb_iconv_convenience *ic,
                        const char *str, charset_t src_charset,
                        size_t *bytes_consumed)
{
        /* it cannot occupy more than 4 bytes in UTF16 format */
        uint8_t buf[4];
        smb_iconv_t descriptor;
        size_t ilen_orig;
        size_t ilen;
        size_t olen;
        char *outbuf;

        /* fast path: a byte with the top bit clear is returned as-is */
        if ((str[0] & 0x80) == 0) {
                *bytes_consumed = 1;
                return (codepoint_t)str[0];
        }

        /*
         * we assume that no multi-byte character can take more than 5 bytes.
         * This is OK as we only support codepoints up to 1M (U+100000)
         */
        ilen_orig = strnlen(str, 5);
        ilen = ilen_orig;

        descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
        if (descriptor == (smb_iconv_t)-1) {
                *bytes_consumed = 1;
                return INVALID_CODEPOINT;
        }

        /*
         * this looks a little strange, but it is needed to cope with
         * codepoints above 64k (U+10000) which are encoded as per RFC2781.
         *
         * First offer only 2 bytes of output space; if nothing was
         * written, retry with room for 4 bytes (a surrogate pair).
         */
        olen = 2;
        outbuf = (char *)buf;
        smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
        if (olen == 2) {
                olen = 4;
                outbuf = (char *)buf;
                smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
                if (olen == 4) {
                        /* we didn't convert any bytes */
                        *bytes_consumed = 1;
                        return INVALID_CODEPOINT;
                }
                /* from here on olen is the number of bytes written */
                olen = 4 - olen;
        } else {
                olen = 2 - olen;
        }

        /* smb_iconv() decremented ilen by the input bytes it used */
        *bytes_consumed = ilen_orig - ilen;

        if (olen == 2) {
                return (codepoint_t)SVAL(buf, 0);
        }
        if (olen == 4) {
                /* decode a 4 byte UTF16 character manually */
                return (codepoint_t)0x10000 +
                        (buf[2] | ((buf[3] & 0x3)<<8) |
                         (buf[0]<<10) | ((buf[1] & 0x3)<<18));
        }

        /* no other length is valid */
        return INVALID_CODEPOINT;
}
 414
 415 /*
 416   return the unicode codepoint for the next multi-byte CH_UNIX character
 417   in the string
 418
 419   also return the number of bytes consumed (which tells the caller
 420   how many bytes to skip to get to the next CH_UNIX character)
 421
 422   return INVALID_CODEPOINT if the next character cannot be converted
 423 */
 424 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
 425                                     const char *str, size_t *size)
 426 {
 427         return next_codepoint_convenience_ext(ic, str, CH_UNIX, size);
 428 }
 429
 430 /*
 431   push a single codepoint into a CH_UNIX string the target string must
 432   be able to hold the full character, which is guaranteed if it is at
 433   least 5 bytes in size. The caller may pass less than 5 bytes if they
 434   are sure the character will fit (for example, you can assume that
 435   uppercase/lowercase of a character will not add more than 1 byte)
 436
 437   return the number of bytes occupied by the CH_UNIX character, or
 438   -1 on failure
 439 */
 440 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
 441                                 char *str, codepoint_t c)
 442 {
 443         smb_iconv_t descriptor;
 444         uint8_t buf[4];
 445         size_t ilen, olen;
 446         const char *inbuf;
 447
 448         if (c < 128) {
 449                 *str = c;
 450                 return 1;
 451         }
 452
 453         descriptor = get_conv_handle(ic,
 454                                      CH_UTF16, CH_UNIX);
 455         if (descriptor == (smb_iconv_t)-1) {
 456                 return -1;
 457         }
 458
 459         if (c < 0x10000) {
 460                 ilen = 2;
 461                 olen = 5;
 462                 inbuf = (char *)buf;
 463                 SSVAL(buf, 0, c);
 464                 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 465                 if (ilen != 0) {
 466                         return -1;
 467                 }
 468                 return 5 - olen;
 469         }
 470
 471         c -= 0x10000;
 472
 473         buf[0] = (c>>10) & 0xFF;
 474         buf[1] = (c>>18) | 0xd8;
 475         buf[2] = c & 0xFF;
 476         buf[3] = ((c>>8) & 0x3) | 0xdc;
 477
 478         ilen = 4;
 479         olen = 5;
 480         inbuf = (char *)buf;
 481
 482         smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 483         if (ilen != 0) {
 484                 return -1;
 485         }
 486         return 5 - olen;
 487 }
 488
 489 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
 490                                         size_t *size)
 491 {
 492         return next_codepoint_convenience_ext(get_iconv_convenience(), str,
 493                                               src_charset, size);
 494 }
 495
 496 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
 497 {
 498         return next_codepoint_convenience(get_iconv_convenience(), str, size);
 499 }
 500
 501 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
 502 {
 503         return push_codepoint_convenience(get_iconv_convenience(), str, c);
 504 }