lib/util/charset/util_str.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6    Copyright (C) Andrew Bartlett 2011
   7    Copyright (C) Jeremy Allison  1992-2007
   8    Copyright (C) Martin Pool     2003
   9    Copyright (C) James Peach     2006
  10
  11    This program is free software; you can redistribute it and/or modify
  12    it under the terms of the GNU General Public License as published by
  13    the Free Software Foundation; either version 3 of the License, or
  14    (at your option) any later version.
  15
  16    This program is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19    GNU General Public License for more details.
  20
  21    You should have received a copy of the GNU General Public License
  22    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23 */
  24
  25 #include "includes.h"
  26 #include "system/locale.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  Case insensitive string compararison, handle specified for testing
  34 **/
  35 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  36                                  const char *s1, const char *s2)
  37 {
  38         codepoint_t c1=0, c2=0;
  39         codepoint_t u1=0, u2=0;
  40         codepoint_t l1=0, l2=0;
  41         size_t size1, size2;
  42
  43         /* handle null ptr comparisons to simplify the use in qsort */
  44         if (s1 == s2) return 0;
  45         if (s1 == NULL) return -1;
  46         if (s2 == NULL) return 1;
  47
  48         while (*s1 && *s2) {
  49                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  50                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  51
  52                 if (c1 == INVALID_CODEPOINT ||
  53                     c2 == INVALID_CODEPOINT) {
  54                         return strcasecmp(s1, s2);
  55                 }
  56
  57                 s1 += size1;
  58                 s2 += size2;
  59
  60                 if (c1 == c2) {
  61                         continue;
  62                 }
  63
  64                 u1 = toupper_m(c1);
  65                 u2 = toupper_m(c2);
  66                 if (u1 == u2) {
  67                         continue;
  68                 }
  69
  70                 l1 = tolower_m(c1);
  71                 l2 = tolower_m(c2);
  72                 if (l1 == l2) {
  73                         continue;
  74                 }
  75
  76                 return l1 - l2;
  77         }
  78
  79         return *s1 - *s2;
  80 }
  81
  82 /**
  83  Case insensitive string compararison
  84 **/
  85 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
  86 {
  87         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  88         return strcasecmp_m_handle(iconv_handle, s1, s2);
  89 }
  90
  91 /**
  92  Case insensitive string compararison, length limited, handle specified for testing
  93 **/
  94 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
  95                                   const char *s1, const char *s2, size_t n)
  96 {
  97         codepoint_t c1=0, c2=0;
  98         codepoint_t u1=0, u2=0;
  99         codepoint_t l1=0, l2=0;
 100         size_t size1, size2;
 101
 102         /* handle null ptr comparisons to simplify the use in qsort */
 103         if (s1 == s2) return 0;
 104         if (s1 == NULL) return -1;
 105         if (s2 == NULL) return 1;
 106
 107         while (*s1 && *s2 && n) {
 108                 n--;
 109
 110                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
 111                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
 112
 113                 if (c1 == INVALID_CODEPOINT ||
 114                     c2 == INVALID_CODEPOINT) {
 115                         /*
 116                          * n was specified in characters,
 117                          * now we must convert it to bytes.
 118                          * As bytes are the smallest
 119                          * character unit, the following
 120                          * increment and strncasecmp is always
 121                          * safe.
 122                          *
 123                          * The source string was already known
 124                          * to be n characters long, so we are
 125                          * guaranteed to be able to look at the
 126                          * (n remaining + size1) bytes from the
 127                          * s1 position).
 128                          */
 129                         n += size1;
 130                         return strncasecmp(s1, s2, n);
 131                 }
 132
 133                 s1 += size1;
 134                 s2 += size2;
 135
 136                 if (c1 == c2) {
 137                         continue;
 138                 }
 139
 140                 u1 = toupper_m(c1);
 141                 u2 = toupper_m(c2);
 142                 if (u1 == u2) {
 143                         continue;
 144                 }
 145
 146                 l1 = tolower_m(c1);
 147                 l2 = tolower_m(c2);
 148                 if (l1 == l2) {
 149                         continue;
 150                 }
 151
 152                 return l1 - l2;
 153         }
 154
 155         if (n == 0) {
 156                 return 0;
 157         }
 158
 159         return *s1 - *s2;
 160 }
 161
 162 /**
 163  Case insensitive string compararison, length limited
 164 **/
 165 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 166 {
 167         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
 168         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
 169 }
 170
 171 /**
 172  * Compare 2 strings.
 173  *
 174  * @note The comparison is case-insensitive.
 175  **/
 176 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
 177 {
 178         return strcasecmp_m(s1,s2) == 0;
 179 }
 180
 181 /**
 182  Compare 2 strings (case sensitive).
 183 **/
 184 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
 185 {
 186         if (s1 == s2)
 187                 return true;
 188         if (!s1 || !s2)
 189                 return false;
 190
 191         return strcmp(s1,s2) == 0;
 192 }
 193
 194 /**
 195  * Calculate the number of units (8 or 16-bit, depending on the
 196  * destination charset), that would be needed to convert the input
 197  * string which is expected to be in in src_charset encoding to the
 198  * destination charset (which should be a unicode charset).
 199  */
 200 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 201                                     const char *s, charset_t src_charset, charset_t dst_charset)
 202 {
 203         size_t count = 0;
 204
 205 #ifdef DEVELOPER
 206         switch (dst_charset) {
 207         case CH_DOS:
 208         case CH_UNIX:
 209                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
 210         default:
 211                 break;
 212         }
 213
 214         switch (src_charset) {
 215         case CH_UTF16LE:
 216         case CH_UTF16BE:
 217                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
 218         default:
 219                 break;
 220         }
 221 #endif
 222         if (!s) {
 223                 return 0;
 224         }
 225
 226         while (*s && !(((uint8_t)*s) & 0x80)) {
 227                 s++;
 228                 count++;
 229         }
 230
 231         if (!*s) {
 232                 return count;
 233         }
 234
 235         while (*s) {
 236                 size_t c_size;
 237                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
 238                                                           src_charset, &c_size);
 239                 s += c_size;
 240
 241                 switch (dst_charset) {
 242                 case CH_UTF16LE:
 243                 case CH_UTF16BE:
 244                 case CH_UTF16MUNGED:
 245                         if (c < 0x10000) {
 246                                 /* Unicode char fits into 16 bits. */
 247                                 count += 1;
 248                         } else {
 249                                 /* Double-width unicode char - 32 bits. */
 250                                 count += 2;
 251                         }
 252                         break;
 253                 case CH_UTF8:
 254                         /*
 255                          * this only checks ranges, and does not
 256                          * check for invalid codepoints
 257                          */
 258                         if (c < 0x80) {
 259                                 count += 1;
 260                         } else if (c < 0x800) {
 261                                 count += 2;
 262                         } else if (c < 0x10000) {
 263                                 count += 3;
 264                         } else {
 265                                 count += 4;
 266                         }
 267                         break;
 268                 default:
 269                         /*
 270                          * non-unicode encoding:
 271                          * assume that each codepoint fits into
 272                          * one unit in the destination encoding.
 273                          */
 274                         count += 1;
 275                 }
 276         }
 277
 278         return count;
 279 }
 280
 281 /**
 282  * Calculate the number of units (8 or 16-bit, depending on the
 283  * destination charset), that would be needed to convert the input
 284  * string which is expected to be in in src_charset encoding to the
 285  * destination charset (which should be a unicode charset).
 286  */
 287 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
 288 {
 289         struct smb_iconv_handle *ic = get_iconv_handle();
 290         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
 291 }
 292
 293 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
 294                                   const charset_t dst_charset)
 295 {
 296         if (!s) {
 297                 return 0;
 298         }
 299         return strlen_m_ext(s, src_charset, dst_charset) + 1;
 300 }
 301
 302 _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
 303                                        const charset_t src_charset,
 304                                        const charset_t dst_charset)
 305 {
 306         size_t len;
 307         if (!s) {
 308                 return 0;
 309         }
 310         len = strlen_m_ext(s, src_charset, dst_charset);
 311         if (len == 0) {
 312                 return 0;
 313         }
 314
 315         return len+1;
 316 }
 317
 318 /**
 319  * Calculate the number of 16-bit units that would be needed to convert
 320  * the input string which is expected to be in CH_UNIX encoding to UTF16.
 321  *
 322  * This will be the same as the number of bytes in a string for single
 323  * byte strings, but will be different for multibyte.
 324  */
 325 _PUBLIC_ size_t strlen_m(const char *s)
 326 {
 327         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
 328 }
 329
 330 /**
 331    Work out the number of multibyte chars in a string, including the NULL
 332    terminator.
 333 **/
 334 _PUBLIC_ size_t strlen_m_term(const char *s)
 335 {
 336         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
 337 }
 338
 339 /*
 340  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
 341  * if a string is there, include the terminator.
 342  */
 343
 344 _PUBLIC_ size_t strlen_m_term_null(const char *s)
 345 {
 346         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
 347 }
 348
 349 /**
 350  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 351 **/
 352 _PUBLIC_ char *strchr_m(const char *src, char c)
 353 {
 354         const char *s;
 355         struct smb_iconv_handle *ic = get_iconv_handle();
 356         if (src == NULL) {
 357                 return NULL;
 358         }
 359         /* characters below 0x3F are guaranteed to not appear in
 360            non-initial position in multi-byte charsets */
 361         if ((c & 0xC0) == 0) {
 362                 return strchr(src, c);
 363         }
 364
 365         /* this is quite a common operation, so we want it to be
 366            fast. We optimise for the ascii case, knowing that all our
 367            supported multi-byte character sets are ascii-compatible
 368            (ie. they match for the first 128 chars) */
 369
 370         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 371                 if (*s == c)
 372                         return discard_const_p(char, s);
 373         }
 374
 375         if (!*s)
 376                 return NULL;
 377
 378 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
 379         /* With compose characters we must restart from the beginning. JRA. */
 380         s = src;
 381 #endif
 382
 383         while (*s) {
 384                 size_t size;
 385                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 386                 if (c2 == c) {
 387                         return discard_const_p(char, s);
 388                 }
 389                 s += size;
 390         }
 391
 392         return NULL;
 393 }
 394
 395 /**
 396  * Multibyte-character version of strrchr
 397  */
 398 _PUBLIC_ char *strrchr_m(const char *s, char c)
 399 {
 400         struct smb_iconv_handle *ic;
 401         char *ret = NULL;
 402
 403         if (s == NULL) {
 404                 return NULL;
 405         }
 406
 407         /* characters below 0x3F are guaranteed to not appear in
 408            non-initial position in multi-byte charsets */
 409         if ((c & 0xC0) == 0) {
 410                 return strrchr(s, c);
 411         }
 412
 413         /* this is quite a common operation, so we want it to be
 414            fast. We optimise for the ascii case, knowing that all our
 415            supported multi-byte character sets are ascii-compatible
 416            (ie. they match for the first 128 chars). Also, in Samba
 417            we only search for ascii characters in 'c' and that
 418            in all mb character sets with a compound character
 419            containing c, if 'c' is not a match at position
 420            p, then p[-1] > 0x7f. JRA. */
 421
 422         {
 423                 size_t len = strlen(s);
 424                 const char *cp = s;
 425                 bool got_mb = false;
 426
 427                 if (len == 0)
 428                         return NULL;
 429                 cp += (len - 1);
 430                 do {
 431                         if (c == *cp) {
 432                                 /* Could be a match. Part of a multibyte ? */
 433                                 if ((cp > s) &&
 434                                         (((unsigned char)cp[-1]) & 0x80)) {
 435                                         /* Yep - go slow :-( */
 436                                         got_mb = true;
 437                                         break;
 438                                 }
 439                                 /* No - we have a match ! */
 440                                 return discard_const_p(char , cp);
 441                         }
 442                 } while (cp-- != s);
 443                 if (!got_mb)
 444                         return NULL;
 445         }
 446
 447         ic = get_iconv_handle();
 448
 449         while (*s) {
 450                 size_t size;
 451                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 452                 if (c2 == c) {
 453                         ret = discard_const_p(char, s);
 454                 }
 455                 s += size;
 456         }
 457
 458         return ret;
 459 }
 460
 461 /**
 462   return True if any (multi-byte) character is lower case
 463 */
 464 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
 465                                  const char *string)
 466 {
 467         while (*string) {
 468                 size_t c_size;
 469                 codepoint_t s;
 470                 codepoint_t t;
 471
 472                 s = next_codepoint_handle(ic, string, &c_size);
 473                 string += c_size;
 474
 475                 t = toupper_m(s);
 476
 477                 if (s != t) {
 478                         return true; /* that means it has lower case chars */
 479                 }
 480         }
 481
 482         return false;
 483 }
 484
 485 _PUBLIC_ bool strhaslower(const char *string)
 486 {
 487         struct smb_iconv_handle *ic = get_iconv_handle();
 488         return strhaslower_handle(ic, string);
 489 }
 490
 491 /**
 492   return True if any (multi-byte) character is upper case
 493 */
 494 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
 495                                  const char *string)
 496 {
 497         while (*string) {
 498                 size_t c_size;
 499                 codepoint_t s;
 500                 codepoint_t t;
 501
 502                 s = next_codepoint_handle(ic, string, &c_size);
 503                 string += c_size;
 504
 505                 t = tolower_m(s);
 506
 507                 if (s != t) {
 508                         return true; /* that means it has upper case chars */
 509                 }
 510         }
 511
 512         return false;
 513 }
 514
 515 _PUBLIC_ bool strhasupper(const char *string)
 516 {
 517         struct smb_iconv_handle *ic = get_iconv_handle();
 518         return strhasupper_handle(ic, string);
 519 }
 520
 521 /***********************************************************************
 522  strstr_m - We convert via ucs2 for now.
 523 ***********************************************************************/
 524
 525 char *strstr_m(const char *src, const char *findstr)
 526 {
 527         smb_ucs2_t *p;
 528         smb_ucs2_t *src_w, *find_w;
 529         const char *s;
 530         char *s2;
 531         char *retp;
 532         size_t converted_size, findstr_len = 0;
 533
 534         TALLOC_CTX *frame; /* Only set up in the iconv case */
 535
 536         /* for correctness */
 537         if (!findstr[0]) {
 538                 return discard_const_p(char, src);
 539         }
 540
 541         /* Samba does single character findstr calls a *lot*. */
 542         if (findstr[1] == '\0')
 543                 return strchr_m(src, *findstr);
 544
 545         /* We optimise for the ascii case, knowing that all our
 546            supported multi-byte character sets are ascii-compatible
 547            (ie. they match for the first 128 chars) */
 548
 549         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 550                 if (*s == *findstr) {
 551                         if (!findstr_len)
 552                                 findstr_len = strlen(findstr);
 553
 554                         if (strncmp(s, findstr, findstr_len) == 0) {
 555                                 return discard_const_p(char, s);
 556                         }
 557                 }
 558         }
 559
 560         if (!*s)
 561                 return NULL;
 562
 563 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
 564         /* 'make check' fails unless we do this */
 565
 566         /* With compose characters we must restart from the beginning. JRA. */
 567         s = src;
 568 #endif
 569
 570         frame = talloc_stackframe();
 571
 572         if (!push_ucs2_talloc(frame, &src_w, src, &converted_size)) {
 573                 TALLOC_FREE(frame);
 574                 return NULL;
 575         }
 576
 577         if (!push_ucs2_talloc(frame, &find_w, findstr, &converted_size)) {
 578                 TALLOC_FREE(frame);
 579                 return NULL;
 580         }
 581
 582         p = strstr_w(src_w, find_w);
 583
 584         if (!p) {
 585                 TALLOC_FREE(frame);
 586                 return NULL;
 587         }
 588
 589         *p = 0;
 590         if (!pull_ucs2_talloc(frame, &s2, src_w, &converted_size)) {
 591                 TALLOC_FREE(frame);
 592                 return NULL;
 593         }
 594         retp = discard_const_p(char, (s+strlen(s2)));
 595         TALLOC_FREE(frame);
 596         return retp;
 597 }