lib/util/charset/util_str.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6    Copyright (C) Andrew Bartlett 2011
   7    Copyright (C) Jeremy Allison  1992-2007
   8    Copyright (C) Martin Pool     2003
   9    Copyright (C) James Peach     2006
  10
  11    This program is free software; you can redistribute it and/or modify
  12    it under the terms of the GNU General Public License as published by
  13    the Free Software Foundation; either version 3 of the License, or
  14    (at your option) any later version.
  15
  16    This program is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19    GNU General Public License for more details.
  20
  21    You should have received a copy of the GNU General Public License
  22    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23 */
  24
  25 #include "includes.h"
  26 #include "system/locale.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  Case insensitive string compararison, handle specified for testing
  34 **/
  35 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  36                                  const char *s1, const char *s2)
  37 {
  38         codepoint_t c1=0, c2=0;
  39         size_t size1, size2;
  40
  41         /* handle null ptr comparisons to simplify the use in qsort */
  42         if (s1 == s2) return 0;
  43         if (s1 == NULL) return -1;
  44         if (s2 == NULL) return 1;
  45
  46         while (*s1 && *s2) {
  47                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  48                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  49
  50                 s1 += size1;
  51                 s2 += size2;
  52
  53                 if (c1 == c2) {
  54                         continue;
  55                 }
  56
  57                 if (c1 == INVALID_CODEPOINT ||
  58                     c2 == INVALID_CODEPOINT) {
  59                         /* what else can we do?? */
  60                         return strcasecmp(s1, s2);
  61                 }
  62
  63                 if (toupper_m(c1) != toupper_m(c2)) {
  64                         return c1 - c2;
  65                 }
  66         }
  67
  68         return *s1 - *s2;
  69 }
  70
  71 /**
  72  Case insensitive string compararison
  73 **/
  74 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
  75 {
  76         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  77         return strcasecmp_m_handle(iconv_handle, s1, s2);
  78 }
  79
  80 /**
  81  Case insensitive string compararison, length limited, handle specified for testing
  82 **/
  83 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  84                                   const char *s1, const char *s2, size_t n)
  85 {
  86         codepoint_t c1=0, c2=0;
  87         size_t size1, size2;
  88
  89         /* handle null ptr comparisons to simplify the use in qsort */
  90         if (s1 == s2) return 0;
  91         if (s1 == NULL) return -1;
  92         if (s2 == NULL) return 1;
  93
  94         while (*s1 && *s2 && n) {
  95                 n--;
  96
  97                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  98                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  99
 100                 s1 += size1;
 101                 s2 += size2;
 102
 103                 if (c1 == c2) {
 104                         continue;
 105                 }
 106
 107                 if (c1 == INVALID_CODEPOINT ||
 108                     c2 == INVALID_CODEPOINT) {
 109                         /* what else can we do?? */
 110                         return strcasecmp(s1, s2);
 111                 }
 112
 113                 if (toupper_m(c1) != toupper_m(c2)) {
 114                         return c1 - c2;
 115                 }
 116         }
 117
 118         if (n == 0) {
 119                 return 0;
 120         }
 121
 122         return *s1 - *s2;
 123 }
 124
 125 /**
 126  Case insensitive string compararison, length limited
 127 **/
 128 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 129 {
 130         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
 131         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
 132 }
 133
 134 /**
 135  * Compare 2 strings.
 136  *
 137  * @note The comparison is case-insensitive.
 138  **/
 139 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
 140 {
 141         return strcasecmp_m(s1,s2) == 0;
 142 }
 143
 144 /**
 145  Compare 2 strings (case sensitive).
 146 **/
 147 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
 148 {
 149         if (s1 == s2)
 150                 return true;
 151         if (!s1 || !s2)
 152                 return false;
 153
 154         return strcmp(s1,s2) == 0;
 155 }
 156
 157 /**
 158  * Calculate the number of units (8 or 16-bit, depending on the
 159  * destination charset), that would be needed to convert the input
 160  * string which is expected to be in in src_charset encoding to the
 161  * destination charset (which should be a unicode charset).
 162  */
 163 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 164                                     const char *s, charset_t src_charset, charset_t dst_charset)
 165 {
 166         size_t count = 0;
 167
 168 #ifdef DEVELOPER
 169         switch (dst_charset) {
 170         case CH_DOS:
 171         case CH_UNIX:
 172                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
 173         default:
 174                 break;
 175         }
 176
 177         switch (src_charset) {
 178         case CH_UTF16LE:
 179         case CH_UTF16BE:
 180                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
 181         default:
 182                 break;
 183         }
 184 #endif
 185         if (!s) {
 186                 return 0;
 187         }
 188
 189         while (*s && !(((uint8_t)*s) & 0x80)) {
 190                 s++;
 191                 count++;
 192         }
 193
 194         if (!*s) {
 195                 return count;
 196         }
 197
 198         while (*s) {
 199                 size_t c_size;
 200                 codepoint_t c = next_codepoint_handle_ext(ic, s, src_charset, &c_size);
 201                 s += c_size;
 202
 203                 switch (dst_charset) {
 204                 case CH_UTF16LE:
 205                 case CH_UTF16BE:
 206                 case CH_UTF16MUNGED:
 207                         if (c < 0x10000) {
 208                                 /* Unicode char fits into 16 bits. */
 209                                 count += 1;
 210                         } else {
 211                                 /* Double-width unicode char - 32 bits. */
 212                                 count += 2;
 213                         }
 214                         break;
 215                 case CH_UTF8:
 216                         /*
 217                          * this only checks ranges, and does not
 218                          * check for invalid codepoints
 219                          */
 220                         if (c < 0x80) {
 221                                 count += 1;
 222                         } else if (c < 0x800) {
 223                                 count += 2;
 224                         } else if (c < 0x10000) {
 225                                 count += 3;
 226                         } else {
 227                                 count += 4;
 228                         }
 229                         break;
 230                 default:
 231                         /*
 232                          * non-unicode encoding:
 233                          * assume that each codepoint fits into
 234                          * one unit in the destination encoding.
 235                          */
 236                         count += 1;
 237                 }
 238         }
 239
 240         return count;
 241 }
 242
 243 /**
 244  * Calculate the number of units (8 or 16-bit, depending on the
 245  * destination charset), that would be needed to convert the input
 246  * string which is expected to be in in src_charset encoding to the
 247  * destination charset (which should be a unicode charset).
 248  */
 249 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
 250 {
 251         struct smb_iconv_handle *ic = get_iconv_handle();
 252         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
 253 }
 254
 255 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
 256                                   const charset_t dst_charset)
 257 {
 258         if (!s) {
 259                 return 0;
 260         }
 261         return strlen_m_ext(s, src_charset, dst_charset) + 1;
 262 }
 263
 264 /**
 265  * Calculate the number of 16-bit units that would be needed to convert
 266  * the input string which is expected to be in CH_UNIX encoding to UTF16.
 267  *
 268  * This will be the same as the number of bytes in a string for single
 269  * byte strings, but will be different for multibyte.
 270  */
 271 _PUBLIC_ size_t strlen_m(const char *s)
 272 {
 273         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
 274 }
 275
 276 /**
 277    Work out the number of multibyte chars in a string, including the NULL
 278    terminator.
 279 **/
 280 _PUBLIC_ size_t strlen_m_term(const char *s)
 281 {
 282         if (!s) {
 283                 return 0;
 284         }
 285
 286         return strlen_m(s) + 1;
 287 }
 288
 289 /*
 290  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
 291  * if a string is there, include the terminator.
 292  */
 293
 294 _PUBLIC_ size_t strlen_m_term_null(const char *s)
 295 {
 296         size_t len;
 297         if (!s) {
 298                 return 0;
 299         }
 300         len = strlen_m(s);
 301         if (len == 0) {
 302                 return 0;
 303         }
 304
 305         return len+1;
 306 }
 307
 308 /**
 309  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 310 **/
 311 _PUBLIC_ char *strchr_m(const char *src, char c)
 312 {
 313         const char *s;
 314         struct smb_iconv_handle *ic = get_iconv_handle();
 315         if (src == NULL) {
 316                 return NULL;
 317         }
 318         /* characters below 0x3F are guaranteed to not appear in
 319            non-initial position in multi-byte charsets */
 320         if ((c & 0xC0) == 0) {
 321                 return strchr(src, c);
 322         }
 323
 324         /* this is quite a common operation, so we want it to be
 325            fast. We optimise for the ascii case, knowing that all our
 326            supported multi-byte character sets are ascii-compatible
 327            (ie. they match for the first 128 chars) */
 328
 329         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 330                 if (*s == c)
 331                         return discard_const_p(char, s);
 332         }
 333
 334         if (!*s)
 335                 return NULL;
 336
 337 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
 338         /* With compose characters we must restart from the beginning. JRA. */
 339         s = src;
 340 #endif
 341
 342         while (*s) {
 343                 size_t size;
 344                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 345                 if (c2 == c) {
 346                         return discard_const_p(char, s);
 347                 }
 348                 s += size;
 349         }
 350
 351         return NULL;
 352 }
 353
 354 /**
 355  * Multibyte-character version of strrchr
 356  */
 357 _PUBLIC_ char *strrchr_m(const char *s, char c)
 358 {
 359         struct smb_iconv_handle *ic = get_iconv_handle();
 360         char *ret = NULL;
 361
 362         if (s == NULL) {
 363                 return NULL;
 364         }
 365
 366         /* characters below 0x3F are guaranteed to not appear in
 367            non-initial position in multi-byte charsets */
 368         if ((c & 0xC0) == 0) {
 369                 return strrchr(s, c);
 370         }
 371
 372         /* this is quite a common operation, so we want it to be
 373            fast. We optimise for the ascii case, knowing that all our
 374            supported multi-byte character sets are ascii-compatible
 375            (ie. they match for the first 128 chars). Also, in Samba
 376            we only search for ascii characters in 'c' and that
 377            in all mb character sets with a compound character
 378            containing c, if 'c' is not a match at position
 379            p, then p[-1] > 0x7f. JRA. */
 380
 381         {
 382                 size_t len = strlen(s);
 383                 const char *cp = s;
 384                 bool got_mb = false;
 385
 386                 if (len == 0)
 387                         return NULL;
 388                 cp += (len - 1);
 389                 do {
 390                         if (c == *cp) {
 391                                 /* Could be a match. Part of a multibyte ? */
 392                                 if ((cp > s) &&
 393                                         (((unsigned char)cp[-1]) & 0x80)) {
 394                                         /* Yep - go slow :-( */
 395                                         got_mb = true;
 396                                         break;
 397                                 }
 398                                 /* No - we have a match ! */
 399                                 return discard_const_p(char , cp);
 400                         }
 401                 } while (cp-- != s);
 402                 if (!got_mb)
 403                         return NULL;
 404         }
 405
 406         while (*s) {
 407                 size_t size;
 408                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 409                 if (c2 == c) {
 410                         ret = discard_const_p(char, s);
 411                 }
 412                 s += size;
 413         }
 414
 415         return ret;
 416 }
 417
 418 /**
 419   return True if any (multi-byte) character is lower case
 420 */
 421 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
 422                                  const char *string)
 423 {
 424         while (*string) {
 425                 size_t c_size;
 426                 codepoint_t s;
 427                 codepoint_t t;
 428
 429                 s = next_codepoint_handle(ic, string, &c_size);
 430                 string += c_size;
 431
 432                 t = toupper_m(s);
 433
 434                 if (s != t) {
 435                         return true; /* that means it has lower case chars */
 436                 }
 437         }
 438
 439         return false;
 440 }
 441
 442 _PUBLIC_ bool strhaslower(const char *string)
 443 {
 444         struct smb_iconv_handle *ic = get_iconv_handle();
 445         return strhaslower_handle(ic, string);
 446 }
 447
 448 /**
 449   return True if any (multi-byte) character is upper case
 450 */
 451 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
 452                                  const char *string)
 453 {
 454         while (*string) {
 455                 size_t c_size;
 456                 codepoint_t s;
 457                 codepoint_t t;
 458
 459                 s = next_codepoint_handle(ic, string, &c_size);
 460                 string += c_size;
 461
 462                 t = tolower_m(s);
 463
 464                 if (s != t) {
 465                         return true; /* that means it has upper case chars */
 466                 }
 467         }
 468
 469         return false;
 470 }
 471
 472 _PUBLIC_ bool strhasupper(const char *string)
 473 {
 474         struct smb_iconv_handle *ic = get_iconv_handle();
 475         return strhasupper_handle(ic, string);
 476 }
 477
 478 /***********************************************************************
 479  strstr_m - We convert via ucs2 for now.
 480 ***********************************************************************/
 481
 482 char *strstr_m(const char *src, const char *findstr)
 483 {
 484         smb_ucs2_t *p;
 485         smb_ucs2_t *src_w, *find_w;
 486         const char *s;
 487         char *s2;
 488         char *retp;
 489         size_t converted_size, findstr_len = 0;
 490
 491         TALLOC_CTX *frame; /* Only set up in the iconv case */
 492
 493         /* for correctness */
 494         if (!findstr[0]) {
 495                 return discard_const_p(char, src);
 496         }
 497
 498         /* Samba does single character findstr calls a *lot*. */
 499         if (findstr[1] == '\0')
 500                 return strchr_m(src, *findstr);
 501
 502         /* We optimise for the ascii case, knowing that all our
 503            supported multi-byte character sets are ascii-compatible
 504            (ie. they match for the first 128 chars) */
 505
 506         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 507                 if (*s == *findstr) {
 508                         if (!findstr_len)
 509                                 findstr_len = strlen(findstr);
 510
 511                         if (strncmp(s, findstr, findstr_len) == 0) {
 512                                 return discard_const_p(char, s);
 513                         }
 514                 }
 515         }
 516
 517         if (!*s)
 518                 return NULL;
 519
 520 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
 521         /* 'make check' fails unless we do this */
 522
 523         /* With compose characters we must restart from the beginning. JRA. */
 524         s = src;
 525 #endif
 526
 527         frame = talloc_stackframe();
 528
 529         if (!push_ucs2_talloc(frame, &src_w, src, &converted_size)) {
 530                 DEBUG(0,("strstr_m: src malloc fail\n"));
 531                 TALLOC_FREE(frame);
 532                 return NULL;
 533         }
 534
 535         if (!push_ucs2_talloc(frame, &find_w, findstr, &converted_size)) {
 536                 DEBUG(0,("strstr_m: find malloc fail\n"));
 537                 TALLOC_FREE(frame);
 538                 return NULL;
 539         }
 540
 541         p = strstr_w(src_w, find_w);
 542
 543         if (!p) {
 544                 TALLOC_FREE(frame);
 545                 return NULL;
 546         }
 547
 548         *p = 0;
 549         if (!pull_ucs2_talloc(frame, &s2, src_w, &converted_size)) {
 550                 TALLOC_FREE(frame);
 551                 DEBUG(0,("strstr_m: dest malloc fail\n"));
 552                 return NULL;
 553         }
 554         retp = discard_const_p(char, (s+strlen(s2)));
 555         TALLOC_FREE(frame);
 556         return retp;
 557 }