lib/util/charset/util_str.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6    Copyright (C) Andrew Bartlett 2011
   7    Copyright (C) Jeremy Allison  1992-2007
   8    Copyright (C) Martin Pool     2003
   9    Copyright (C) James Peach     2006
  10
  11    This program is free software; you can redistribute it and/or modify
  12    it under the terms of the GNU General Public License as published by
  13    the Free Software Foundation; either version 3 of the License, or
  14    (at your option) any later version.
  15
  16    This program is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19    GNU General Public License for more details.
  20
  21    You should have received a copy of the GNU General Public License
  22    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23 */
  24
  25 #include "includes.h"
  26 #include "system/locale.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  Case insensitive string compararison, handle specified for testing
  34 **/
  35 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  36                                  const char *s1, const char *s2)
  37 {
  38         codepoint_t c1=0, c2=0;
  39         size_t size1, size2;
  40
  41         /* handle null ptr comparisons to simplify the use in qsort */
  42         if (s1 == s2) return 0;
  43         if (s1 == NULL) return -1;
  44         if (s2 == NULL) return 1;
  45
  46         while (*s1 && *s2) {
  47                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  48                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  49
  50                 if (c1 == INVALID_CODEPOINT ||
  51                     c2 == INVALID_CODEPOINT) {
  52                         return strcasecmp(s1, s2);
  53                 }
  54
  55                 s1 += size1;
  56                 s2 += size2;
  57
  58                 if (c1 == c2) {
  59                         continue;
  60                 }
  61
  62                 if (toupper_m(c1) != toupper_m(c2)) {
  63                         return c1 - c2;
  64                 }
  65         }
  66
  67         return *s1 - *s2;
  68 }
  69
  70 /**
  71  Case insensitive string compararison
  72 **/
  73 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
  74 {
  75         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  76         return strcasecmp_m_handle(iconv_handle, s1, s2);
  77 }
  78
  79 /**
  80  Case insensitive string compararison, length limited, handle specified for testing
  81 **/
  82 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  83                                   const char *s1, const char *s2, size_t n)
  84 {
  85         codepoint_t c1=0, c2=0;
  86         size_t size1, size2;
  87
  88         /* handle null ptr comparisons to simplify the use in qsort */
  89         if (s1 == s2) return 0;
  90         if (s1 == NULL) return -1;
  91         if (s2 == NULL) return 1;
  92
  93         while (*s1 && *s2 && n) {
  94                 n--;
  95
  96                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  97                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  98
  99                 if (c1 == INVALID_CODEPOINT ||
 100                     c2 == INVALID_CODEPOINT) {
 101                         /*
 102                          * n was specified in characters,
 103                          * now we must convert it to bytes.
 104                          * As bytes are the smallest
 105                          * character unit, the following
 106                          * increment and strncasecmp is always
 107                          * safe.
 108                          *
 109                          * The source string was already known
 110                          * to be n characters long, so we are
 111                          * guaranteed to be able to look at the
 112                          * (n remaining + size1) bytes from the
 113                          * s1 position).
 114                          */
 115                         n += size1;
 116                         return strncasecmp(s1, s2, n);
 117                 }
 118
 119                 s1 += size1;
 120                 s2 += size2;
 121
 122                 if (c1 == c2) {
 123                         continue;
 124                 }
 125
 126                 if (toupper_m(c1) != toupper_m(c2)) {
 127                         return c1 - c2;
 128                 }
 129         }
 130
 131         if (n == 0) {
 132                 return 0;
 133         }
 134
 135         return *s1 - *s2;
 136 }
 137
 138 /**
 139  Case insensitive string compararison, length limited
 140 **/
 141 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 142 {
 143         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
 144         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
 145 }
 146
 147 /**
 148  * Compare 2 strings.
 149  *
 150  * @note The comparison is case-insensitive.
 151  **/
 152 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
 153 {
 154         return strcasecmp_m(s1,s2) == 0;
 155 }
 156
 157 /**
 158  Compare 2 strings (case sensitive).
 159 **/
 160 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
 161 {
 162         if (s1 == s2)
 163                 return true;
 164         if (!s1 || !s2)
 165                 return false;
 166
 167         return strcmp(s1,s2) == 0;
 168 }
 169
 170 /**
 171  * Calculate the number of units (8 or 16-bit, depending on the
 172  * destination charset), that would be needed to convert the input
 173  * string which is expected to be in in src_charset encoding to the
 174  * destination charset (which should be a unicode charset).
 175  */
 176 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 177                                     const char *s, charset_t src_charset, charset_t dst_charset)
 178 {
 179         size_t count = 0;
 180
 181 #ifdef DEVELOPER
 182         switch (dst_charset) {
 183         case CH_DOS:
 184         case CH_UNIX:
 185                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
 186         default:
 187                 break;
 188         }
 189
 190         switch (src_charset) {
 191         case CH_UTF16LE:
 192         case CH_UTF16BE:
 193                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
 194         default:
 195                 break;
 196         }
 197 #endif
 198         if (!s) {
 199                 return 0;
 200         }
 201
 202         while (*s && !(((uint8_t)*s) & 0x80)) {
 203                 s++;
 204                 count++;
 205         }
 206
 207         if (!*s) {
 208                 return count;
 209         }
 210
 211         while (*s) {
 212                 size_t c_size;
 213                 codepoint_t c = next_codepoint_handle_ext(ic, s, src_charset, &c_size);
 214                 s += c_size;
 215
 216                 switch (dst_charset) {
 217                 case CH_UTF16LE:
 218                 case CH_UTF16BE:
 219                 case CH_UTF16MUNGED:
 220                         if (c < 0x10000) {
 221                                 /* Unicode char fits into 16 bits. */
 222                                 count += 1;
 223                         } else {
 224                                 /* Double-width unicode char - 32 bits. */
 225                                 count += 2;
 226                         }
 227                         break;
 228                 case CH_UTF8:
 229                         /*
 230                          * this only checks ranges, and does not
 231                          * check for invalid codepoints
 232                          */
 233                         if (c < 0x80) {
 234                                 count += 1;
 235                         } else if (c < 0x800) {
 236                                 count += 2;
 237                         } else if (c < 0x10000) {
 238                                 count += 3;
 239                         } else {
 240                                 count += 4;
 241                         }
 242                         break;
 243                 default:
 244                         /*
 245                          * non-unicode encoding:
 246                          * assume that each codepoint fits into
 247                          * one unit in the destination encoding.
 248                          */
 249                         count += 1;
 250                 }
 251         }
 252
 253         return count;
 254 }
 255
 256 /**
 257  * Calculate the number of units (8 or 16-bit, depending on the
 258  * destination charset), that would be needed to convert the input
 259  * string which is expected to be in in src_charset encoding to the
 260  * destination charset (which should be a unicode charset).
 261  */
 262 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
 263 {
 264         struct smb_iconv_handle *ic = get_iconv_handle();
 265         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
 266 }
 267
 268 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
 269                                   const charset_t dst_charset)
 270 {
 271         if (!s) {
 272                 return 0;
 273         }
 274         return strlen_m_ext(s, src_charset, dst_charset) + 1;
 275 }
 276
 277 /**
 278  * Calculate the number of 16-bit units that would be needed to convert
 279  * the input string which is expected to be in CH_UNIX encoding to UTF16.
 280  *
 281  * This will be the same as the number of bytes in a string for single
 282  * byte strings, but will be different for multibyte.
 283  */
 284 _PUBLIC_ size_t strlen_m(const char *s)
 285 {
 286         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
 287 }
 288
 289 /**
 290    Work out the number of multibyte chars in a string, including the NULL
 291    terminator.
 292 **/
 293 _PUBLIC_ size_t strlen_m_term(const char *s)
 294 {
 295         if (!s) {
 296                 return 0;
 297         }
 298
 299         return strlen_m(s) + 1;
 300 }
 301
 302 /*
 303  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
 304  * if a string is there, include the terminator.
 305  */
 306
 307 _PUBLIC_ size_t strlen_m_term_null(const char *s)
 308 {
 309         size_t len;
 310         if (!s) {
 311                 return 0;
 312         }
 313         len = strlen_m(s);
 314         if (len == 0) {
 315                 return 0;
 316         }
 317
 318         return len+1;
 319 }
 320
 321 /**
 322  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 323 **/
 324 _PUBLIC_ char *strchr_m(const char *src, char c)
 325 {
 326         const char *s;
 327         struct smb_iconv_handle *ic = get_iconv_handle();
 328         if (src == NULL) {
 329                 return NULL;
 330         }
 331         /* characters below 0x3F are guaranteed to not appear in
 332            non-initial position in multi-byte charsets */
 333         if ((c & 0xC0) == 0) {
 334                 return strchr(src, c);
 335         }
 336
 337         /* this is quite a common operation, so we want it to be
 338            fast. We optimise for the ascii case, knowing that all our
 339            supported multi-byte character sets are ascii-compatible
 340            (ie. they match for the first 128 chars) */
 341
 342         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 343                 if (*s == c)
 344                         return discard_const_p(char, s);
 345         }
 346
 347         if (!*s)
 348                 return NULL;
 349
 350 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
 351         /* With compose characters we must restart from the beginning. JRA. */
 352         s = src;
 353 #endif
 354
 355         while (*s) {
 356                 size_t size;
 357                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 358                 if (c2 == c) {
 359                         return discard_const_p(char, s);
 360                 }
 361                 s += size;
 362         }
 363
 364         return NULL;
 365 }
 366
 367 /**
 368  * Multibyte-character version of strrchr
 369  */
 370 _PUBLIC_ char *strrchr_m(const char *s, char c)
 371 {
 372         struct smb_iconv_handle *ic = get_iconv_handle();
 373         char *ret = NULL;
 374
 375         if (s == NULL) {
 376                 return NULL;
 377         }
 378
 379         /* characters below 0x3F are guaranteed to not appear in
 380            non-initial position in multi-byte charsets */
 381         if ((c & 0xC0) == 0) {
 382                 return strrchr(s, c);
 383         }
 384
 385         /* this is quite a common operation, so we want it to be
 386            fast. We optimise for the ascii case, knowing that all our
 387            supported multi-byte character sets are ascii-compatible
 388            (ie. they match for the first 128 chars). Also, in Samba
 389            we only search for ascii characters in 'c' and that
 390            in all mb character sets with a compound character
 391            containing c, if 'c' is not a match at position
 392            p, then p[-1] > 0x7f. JRA. */
 393
 394         {
 395                 size_t len = strlen(s);
 396                 const char *cp = s;
 397                 bool got_mb = false;
 398
 399                 if (len == 0)
 400                         return NULL;
 401                 cp += (len - 1);
 402                 do {
 403                         if (c == *cp) {
 404                                 /* Could be a match. Part of a multibyte ? */
 405                                 if ((cp > s) &&
 406                                         (((unsigned char)cp[-1]) & 0x80)) {
 407                                         /* Yep - go slow :-( */
 408                                         got_mb = true;
 409                                         break;
 410                                 }
 411                                 /* No - we have a match ! */
 412                                 return discard_const_p(char , cp);
 413                         }
 414                 } while (cp-- != s);
 415                 if (!got_mb)
 416                         return NULL;
 417         }
 418
 419         while (*s) {
 420                 size_t size;
 421                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 422                 if (c2 == c) {
 423                         ret = discard_const_p(char, s);
 424                 }
 425                 s += size;
 426         }
 427
 428         return ret;
 429 }
 430
 431 /**
 432   return True if any (multi-byte) character is lower case
 433 */
 434 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
 435                                  const char *string)
 436 {
 437         while (*string) {
 438                 size_t c_size;
 439                 codepoint_t s;
 440                 codepoint_t t;
 441
 442                 s = next_codepoint_handle(ic, string, &c_size);
 443                 string += c_size;
 444
 445                 t = toupper_m(s);
 446
 447                 if (s != t) {
 448                         return true; /* that means it has lower case chars */
 449                 }
 450         }
 451
 452         return false;
 453 }
 454
 455 _PUBLIC_ bool strhaslower(const char *string)
 456 {
 457         struct smb_iconv_handle *ic = get_iconv_handle();
 458         return strhaslower_handle(ic, string);
 459 }
 460
 461 /**
 462   return True if any (multi-byte) character is upper case
 463 */
 464 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
 465                                  const char *string)
 466 {
 467         while (*string) {
 468                 size_t c_size;
 469                 codepoint_t s;
 470                 codepoint_t t;
 471
 472                 s = next_codepoint_handle(ic, string, &c_size);
 473                 string += c_size;
 474
 475                 t = tolower_m(s);
 476
 477                 if (s != t) {
 478                         return true; /* that means it has upper case chars */
 479                 }
 480         }
 481
 482         return false;
 483 }
 484
 485 _PUBLIC_ bool strhasupper(const char *string)
 486 {
 487         struct smb_iconv_handle *ic = get_iconv_handle();
 488         return strhasupper_handle(ic, string);
 489 }
 490
 491 /***********************************************************************
 492  strstr_m - We convert via ucs2 for now.
 493 ***********************************************************************/
 494
 495 char *strstr_m(const char *src, const char *findstr)
 496 {
 497         smb_ucs2_t *p;
 498         smb_ucs2_t *src_w, *find_w;
 499         const char *s;
 500         char *s2;
 501         char *retp;
 502         size_t converted_size, findstr_len = 0;
 503
 504         TALLOC_CTX *frame; /* Only set up in the iconv case */
 505
 506         /* for correctness */
 507         if (!findstr[0]) {
 508                 return discard_const_p(char, src);
 509         }
 510
 511         /* Samba does single character findstr calls a *lot*. */
 512         if (findstr[1] == '\0')
 513                 return strchr_m(src, *findstr);
 514
 515         /* We optimise for the ascii case, knowing that all our
 516            supported multi-byte character sets are ascii-compatible
 517            (ie. they match for the first 128 chars) */
 518
 519         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 520                 if (*s == *findstr) {
 521                         if (!findstr_len)
 522                                 findstr_len = strlen(findstr);
 523
 524                         if (strncmp(s, findstr, findstr_len) == 0) {
 525                                 return discard_const_p(char, s);
 526                         }
 527                 }
 528         }
 529
 530         if (!*s)
 531                 return NULL;
 532
 533 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
 534         /* 'make check' fails unless we do this */
 535
 536         /* With compose characters we must restart from the beginning. JRA. */
 537         s = src;
 538 #endif
 539
 540         frame = talloc_stackframe();
 541
 542         if (!push_ucs2_talloc(frame, &src_w, src, &converted_size)) {
 543                 DEBUG(0,("strstr_m: src malloc fail\n"));
 544                 TALLOC_FREE(frame);
 545                 return NULL;
 546         }
 547
 548         if (!push_ucs2_talloc(frame, &find_w, findstr, &converted_size)) {
 549                 DEBUG(0,("strstr_m: find malloc fail\n"));
 550                 TALLOC_FREE(frame);
 551                 return NULL;
 552         }
 553
 554         p = strstr_w(src_w, find_w);
 555
 556         if (!p) {
 557                 TALLOC_FREE(frame);
 558                 return NULL;
 559         }
 560
 561         *p = 0;
 562         if (!pull_ucs2_talloc(frame, &s2, src_w, &converted_size)) {
 563                 TALLOC_FREE(frame);
 564                 DEBUG(0,("strstr_m: dest malloc fail\n"));
 565                 return NULL;
 566         }
 567         retp = discard_const_p(char, (s+strlen(s2)));
 568         TALLOC_FREE(frame);
 569         return retp;
 570 }