lib/util/charset/util_str.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6    Copyright (C) Andrew Bartlett 2011
   7    Copyright (C) Jeremy Allison  1992-2007
   8    Copyright (C) Martin Pool     2003
   9    Copyright (C) James Peach     2006
  10
  11    This program is free software; you can redistribute it and/or modify
  12    it under the terms of the GNU General Public License as published by
  13    the Free Software Foundation; either version 3 of the License, or
  14    (at your option) any later version.
  15
  16    This program is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19    GNU General Public License for more details.
  20
  21    You should have received a copy of the GNU General Public License
  22    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23 */
  24
  25 #include "replace.h"
  26 #include "system/locale.h"
  27 #include "charset.h"
  28 #include "lib/util/fault.h"
  29 #include "lib/util/tsort.h"
  30
  31 #ifdef strcasecmp
  32 #undef strcasecmp
  33 #endif
  34 #ifdef strncasecmp
  35 #undef strncasecmp
  36 #endif
  37
  38
  39 /**
  40  Case insensitive string comparison, handle specified for testing
  41 **/
  42 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  43                                  const char *s1, const char *s2)
  44 {
  45         codepoint_t c1=0, c2=0;
  46         codepoint_t u1=0, u2=0;
  47         codepoint_t l1=0, l2=0;
  48         size_t size1, size2;
  49
  50         /* handle null ptr comparisons to simplify the use in qsort */
  51         if (s1 == s2) return 0;
  52         if (s1 == NULL) return -1;
  53         if (s2 == NULL) return 1;
  54
  55         while (*s1 && *s2) {
  56                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  57                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  58
  59                 if (c1 == INVALID_CODEPOINT ||
  60                     c2 == INVALID_CODEPOINT) {
  61                         return strcasecmp(s1, s2);
  62                 }
  63
  64                 s1 += size1;
  65                 s2 += size2;
  66
  67                 if (c1 == c2) {
  68                         continue;
  69                 }
  70
  71                 u1 = toupper_m(c1);
  72                 u2 = toupper_m(c2);
  73                 if (u1 == u2) {
  74                         continue;
  75                 }
  76
  77                 l1 = tolower_m(c1);
  78                 l2 = tolower_m(c2);
  79                 if (l1 == l2) {
  80                         continue;
  81                 }
  82
  83                 return NUMERIC_CMP(l1, l2);
  84         }
  85
  86         return NUMERIC_CMP(*s1, *s2);
  87 }
  88
  89 /**
  90  Case insensitive string comparison
  91 **/
  92 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
  93 {
  94         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  95         return strcasecmp_m_handle(iconv_handle, s1, s2);
  96 }
  97
  98 /**
  99  Case insensitive string comparison, length limited, handle specified for
 100  testing
 101 **/
 102 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
 103                                   const char *s1, const char *s2, size_t n)
 104 {
 105         codepoint_t c1=0, c2=0;
 106         codepoint_t u1=0, u2=0;
 107         codepoint_t l1=0, l2=0;
 108         size_t size1, size2;
 109
 110         /* handle null ptr comparisons to simplify the use in qsort */
 111         if (s1 == s2) return 0;
 112         if (s1 == NULL) return -1;
 113         if (s2 == NULL) return 1;
 114
 115         while (*s1 && *s2 && n) {
 116                 n--;
 117
 118                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
 119                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
 120
 121                 if (c1 == INVALID_CODEPOINT ||
 122                     c2 == INVALID_CODEPOINT) {
 123                         /*
 124                          * n was specified in characters,
 125                          * now we must convert it to bytes.
 126                          * As bytes are the smallest
 127                          * character unit, the following
 128                          * increment and strncasecmp is always
 129                          * safe.
 130                          *
 131                          * The source string was already known
 132                          * to be n characters long, so we are
 133                          * guaranteed to be able to look at the
 134                          * (n remaining + size1) bytes from the
 135                          * s1 position).
 136                          */
 137                         n += size1;
 138                         return strncasecmp(s1, s2, n);
 139                 }
 140
 141                 s1 += size1;
 142                 s2 += size2;
 143
 144                 if (c1 == c2) {
 145                         continue;
 146                 }
 147
 148                 u1 = toupper_m(c1);
 149                 u2 = toupper_m(c2);
 150                 if (u1 == u2) {
 151                         continue;
 152                 }
 153
 154                 l1 = tolower_m(c1);
 155                 l2 = tolower_m(c2);
 156                 if (l1 == l2) {
 157                         continue;
 158                 }
 159
 160                 return NUMERIC_CMP(l1, l2);
 161         }
 162
 163         if (n == 0) {
 164                 return 0;
 165         }
 166
 167         return NUMERIC_CMP(*s1, *s2);
 168 }
 169
 170 /**
 171  Case insensitive string comparison, length limited
 172 **/
 173 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 174 {
 175         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
 176         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
 177 }
 178
 179 /**
 180  * Compare 2 strings.
 181  *
 182  * @note The comparison is case-insensitive.
 183  **/
 184 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
 185 {
 186         return strcasecmp_m(s1,s2) == 0;
 187 }
 188
 189 /**
 190  Compare 2 strings (case sensitive).
 191 **/
 192 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
 193 {
 194         if (s1 == s2)
 195                 return true;
 196         if (!s1 || !s2)
 197                 return false;
 198
 199         return strcmp(s1,s2) == 0;
 200 }
 201
 202 /**
 203  * Calculate the number of units (8 or 16-bit, depending on the
 204  * destination charset) that would be needed to convert the input
 205  * string, which is expected to be in src_charset encoding, to the
 206  * destination charset (which should be a unicode charset).
 207  */
 208 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 209                                     const char *s, charset_t src_charset, charset_t dst_charset)
 210 {
 211         size_t count = 0;
 212
 213 #ifdef DEVELOPER
 214         switch (dst_charset) {
 215         case CH_DOS:
 216         case CH_UNIX:
 217                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
 218         default:
 219                 break;
 220         }
 221
 222         switch (src_charset) {
 223         case CH_UTF16LE:
 224         case CH_UTF16BE:
 225                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
 226         default:
 227                 break;
 228         }
 229 #endif
 230         if (!s) {
 231                 return 0;
 232         }
 233
 234         while (*s && !(((uint8_t)*s) & 0x80)) {
 235                 s++;
 236                 count++;
 237         }
 238
 239         if (!*s) {
 240                 return count;
 241         }
 242
 243         while (*s) {
 244                 size_t c_size;
 245                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
 246                                                           src_charset, &c_size);
 247                 s += c_size;
 248
 249                 switch (dst_charset) {
 250                 case CH_UTF16LE:
 251                 case CH_UTF16BE:
 252                 case CH_UTF16MUNGED:
 253                         if (c < 0x10000) {
 254                                 /* Unicode char fits into 16 bits. */
 255                                 count += 1;
 256                         } else {
 257                                 /* Double-width unicode char - 32 bits. */
 258                                 count += 2;
 259                         }
 260                         break;
 261                 case CH_UTF8:
 262                         /*
 263                          * this only checks ranges, and does not
 264                          * check for invalid codepoints
 265                          */
 266                         if (c < 0x80) {
 267                                 count += 1;
 268                         } else if (c < 0x800) {
 269                                 count += 2;
 270                         } else if (c < 0x10000) {
 271                                 count += 3;
 272                         } else {
 273                                 count += 4;
 274                         }
 275                         break;
 276                 default:
 277                         /*
 278                          * non-unicode encoding:
 279                          * assume that each codepoint fits into
 280                          * one unit in the destination encoding.
 281                          */
 282                         count += 1;
 283                 }
 284         }
 285
 286         return count;
 287 }
 288
 289 /**
 290  * Calculate the number of units (8 or 16-bit, depending on the
 291  * destination charset) that would be needed to convert the input
 292  * string, which is expected to be in src_charset encoding, to the
 293  * destination charset (which should be a unicode charset).
 294  */
 295 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
 296 {
 297         struct smb_iconv_handle *ic = get_iconv_handle();
 298         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
 299 }
 300
 301 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
 302                                   const charset_t dst_charset)
 303 {
 304         if (!s) {
 305                 return 0;
 306         }
 307         return strlen_m_ext(s, src_charset, dst_charset) + 1;
 308 }
 309
 310 _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
 311                                        const charset_t src_charset,
 312                                        const charset_t dst_charset)
 313 {
 314         size_t len;
 315         if (!s) {
 316                 return 0;
 317         }
 318         len = strlen_m_ext(s, src_charset, dst_charset);
 319         if (len == 0) {
 320                 return 0;
 321         }
 322
 323         return len+1;
 324 }
 325
 326 /**
 327  * Calculate the number of 16-bit units that would be needed to convert
 328  * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
 329  *
 330  * This will be the same as the number of bytes in a string for single
 331  * byte strings, but will be different for multibyte.
 332  */
 333 _PUBLIC_ size_t strlen_m(const char *s)
 334 {
 335         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
 336 }
 337
 338 /**
 339    Work out the number of multibyte chars in a string, including the NULL
 340    terminator.
 341 **/
 342 _PUBLIC_ size_t strlen_m_term(const char *s)
 343 {
 344         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
 345 }
 346
 347 /*
 348  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
 349  * if a string is there, include the terminator.
 350  */
 351
 352 _PUBLIC_ size_t strlen_m_term_null(const char *s)
 353 {
 354         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
 355 }
 356
 357 /**
 358  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 359 **/
 360 _PUBLIC_ char *strchr_m(const char *src, char c)
 361 {
 362         const char *s;
 363         struct smb_iconv_handle *ic = get_iconv_handle();
 364         if (src == NULL) {
 365                 return NULL;
 366         }
 367         /* characters below 0x3F are guaranteed to not appear in
 368            non-initial position in multi-byte charsets */
 369         if ((c & 0xC0) == 0) {
 370                 return strchr(src, c);
 371         }
 372
 373         /* this is quite a common operation, so we want it to be
 374            fast. We optimise for the ascii case, knowing that all our
 375            supported multi-byte character sets are ascii-compatible
 376            (ie. they match for the first 128 chars) */
 377
 378         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 379                 if (*s == c)
 380                         return discard_const_p(char, s);
 381         }
 382
 383         if (!*s)
 384                 return NULL;
 385
 386 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
 387         /* With compose characters we must restart from the beginning. JRA. */
 388         s = src;
 389 #endif
 390
 391         while (*s) {
 392                 size_t size;
 393                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 394                 if (c2 == c) {
 395                         return discard_const_p(char, s);
 396                 }
 397                 s += size;
 398         }
 399
 400         return NULL;
 401 }
 402
 403 /**
 404  * Multibyte-character version of strrchr
 405  */
 406 _PUBLIC_ char *strrchr_m(const char *s, char c)
 407 {
 408         struct smb_iconv_handle *ic;
 409         char *ret = NULL;
 410
 411         if (s == NULL) {
 412                 return NULL;
 413         }
 414
 415         /* characters below 0x3F are guaranteed to not appear in
 416            non-initial position in multi-byte charsets */
 417         if ((c & 0xC0) == 0) {
 418                 return strrchr(s, c);
 419         }
 420
 421         /* this is quite a common operation, so we want it to be
 422            fast. We optimise for the ascii case, knowing that all our
 423            supported multi-byte character sets are ascii-compatible
 424            (ie. they match for the first 128 chars). Also, in Samba
 425            we only search for ascii characters in 'c' and that
 426            in all mb character sets with a compound character
 427            containing c, if 'c' is not a match at position
 428            p, then p[-1] > 0x7f. JRA. */
 429
 430         {
 431                 size_t len = strlen(s);
 432                 const char *cp = s;
 433                 bool got_mb = false;
 434
 435                 if (len == 0)
 436                         return NULL;
 437                 cp += (len - 1);
 438                 do {
 439                         if (c == *cp) {
 440                                 /* Could be a match. Part of a multibyte ? */
 441                                 if ((cp > s) &&
 442                                         (((unsigned char)cp[-1]) & 0x80)) {
 443                                         /* Yep - go slow :-( */
 444                                         got_mb = true;
 445                                         break;
 446                                 }
 447                                 /* No - we have a match ! */
 448                                 return discard_const_p(char , cp);
 449                         }
 450                 } while (cp-- != s);
 451                 if (!got_mb)
 452                         return NULL;
 453         }
 454
 455         ic = get_iconv_handle();
 456
 457         while (*s) {
 458                 size_t size;
 459                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 460                 if (c2 == c) {
 461                         ret = discard_const_p(char, s);
 462                 }
 463                 s += size;
 464         }
 465
 466         return ret;
 467 }
 468
 469 /**
 470   return True if any (multi-byte) character is lower case
 471 */
 472 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
 473                                  const char *string)
 474 {
 475         while (*string) {
 476                 size_t c_size;
 477                 codepoint_t s;
 478                 codepoint_t t;
 479
 480                 s = next_codepoint_handle(ic, string, &c_size);
 481                 string += c_size;
 482
 483                 t = toupper_m(s);
 484
 485                 if (s != t) {
 486                         return true; /* that means it has lower case chars */
 487                 }
 488         }
 489
 490         return false;
 491 }
 492
 493 _PUBLIC_ bool strhaslower(const char *string)
 494 {
 495         struct smb_iconv_handle *ic = get_iconv_handle();
 496         return strhaslower_handle(ic, string);
 497 }
 498
 499 /**
 500   return True if any (multi-byte) character is upper case
 501 */
 502 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
 503                                  const char *string)
 504 {
 505         while (*string) {
 506                 size_t c_size;
 507                 codepoint_t s;
 508                 codepoint_t t;
 509
 510                 s = next_codepoint_handle(ic, string, &c_size);
 511                 string += c_size;
 512
 513                 t = tolower_m(s);
 514
 515                 if (s != t) {
 516                         return true; /* that means it has upper case chars */
 517                 }
 518         }
 519
 520         return false;
 521 }
 522
 523 _PUBLIC_ bool strhasupper(const char *string)
 524 {
 525         struct smb_iconv_handle *ic = get_iconv_handle();
 526         return strhasupper_handle(ic, string);
 527 }
 528
 529 /***********************************************************************
 530  strstr_m - We convert via ucs2 for now.
 531 ***********************************************************************/
 532
 533 char *strstr_m(const char *src, const char *findstr)
 534 {
 535         TALLOC_CTX *mem_ctx = NULL;
 536         smb_ucs2_t *p;
 537         smb_ucs2_t *src_w, *find_w;
 538         const char *s;
 539         char *s2;
 540         char *retp = NULL;
 541         size_t converted_size, findstr_len = 0;
 542
 543         /* for correctness */
 544         if (!findstr[0]) {
 545                 return discard_const_p(char, src);
 546         }
 547
 548         /* Samba does single character findstr calls a *lot*. */
 549         if (findstr[1] == '\0')
 550                 return strchr_m(src, *findstr);
 551
 552         /* We optimise for the ascii case, knowing that all our
 553            supported multi-byte character sets are ascii-compatible
 554            (ie. they match for the first 128 chars) */
 555
 556         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 557                 if (*s == *findstr) {
 558                         if (!findstr_len)
 559                                 findstr_len = strlen(findstr);
 560
 561                         if (strncmp(s, findstr, findstr_len) == 0) {
 562                                 return discard_const_p(char, s);
 563                         }
 564                 }
 565         }
 566
 567         if (!*s)
 568                 return NULL;
 569
 570 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
 571         /* 'make check' fails unless we do this */
 572
 573         /* With compose characters we must restart from the beginning. JRA. */
 574         s = src;
 575 #endif
 576
 577         /*
 578          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
 579          * case we leak memory, this should then be more obvious in
 580          * the talloc report.
 581          */
 582         mem_ctx = talloc_new(get_iconv_handle());
 583         if (mem_ctx == NULL) {
 584                 return NULL;
 585         }
 586
 587         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
 588                 goto done;
 589         }
 590
 591         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
 592                 goto done;
 593         }
 594
 595         p = strstr_w(src_w, find_w);
 596
 597         if (!p) {
 598                 goto done;
 599         }
 600
 601         *p = 0;
 602         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
 603                 goto done;
 604         }
 605         retp = discard_const_p(char, (s+strlen(s2)));
 606 done:
 607         TALLOC_FREE(mem_ctx);
 608         return retp;
 609 }