charset: fix str[n]casecmp_m() by comparing lower case values
[Samba.git] / lib / util / charset / util_str.c
blob6feed1742ac48b3177ee3d87596eac5007fca42e
1 /*
2 Unix SMB/CIFS implementation.
3 Samba utility functions
4 Copyright (C) Andrew Tridgell 1992-2001
5 Copyright (C) Simo Sorce 2001
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Jeremy Allison 1992-2007
8 Copyright (C) Martin Pool 2003
9 Copyright (C) James Peach 2006
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or
14 (at your option) any later version.
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program. If not, see <http://www.gnu.org/licenses/>.
25 #include "includes.h"
26 #include "system/locale.h"
28 #ifdef strcasecmp
29 #undef strcasecmp
30 #endif
32 /**
33 Case insensitive string compararison, handle specified for testing
34 **/
35 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
36 const char *s1, const char *s2)
38 codepoint_t c1=0, c2=0;
39 codepoint_t u1=0, u2=0;
40 codepoint_t l1=0, l2=0;
41 size_t size1, size2;
43 /* handle null ptr comparisons to simplify the use in qsort */
44 if (s1 == s2) return 0;
45 if (s1 == NULL) return -1;
46 if (s2 == NULL) return 1;
48 while (*s1 && *s2) {
49 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
50 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
52 if (c1 == INVALID_CODEPOINT ||
53 c2 == INVALID_CODEPOINT) {
54 return strcasecmp(s1, s2);
57 s1 += size1;
58 s2 += size2;
60 if (c1 == c2) {
61 continue;
64 u1 = toupper_m(c1);
65 u2 = toupper_m(c2);
66 if (u1 == u2) {
67 continue;
70 l1 = tolower_m(c1);
71 l2 = tolower_m(c2);
72 if (l1 == l2) {
73 continue;
76 return l1 - l2;
79 return *s1 - *s2;
82 /**
83 Case insensitive string compararison
84 **/
85 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
87 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
88 return strcasecmp_m_handle(iconv_handle, s1, s2);
91 /**
92 Case insensitive string compararison, length limited, handle specified for testing
93 **/
94 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
95 const char *s1, const char *s2, size_t n)
97 codepoint_t c1=0, c2=0;
98 codepoint_t u1=0, u2=0;
99 codepoint_t l1=0, l2=0;
100 size_t size1, size2;
102 /* handle null ptr comparisons to simplify the use in qsort */
103 if (s1 == s2) return 0;
104 if (s1 == NULL) return -1;
105 if (s2 == NULL) return 1;
107 while (*s1 && *s2 && n) {
108 n--;
110 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
111 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
113 if (c1 == INVALID_CODEPOINT ||
114 c2 == INVALID_CODEPOINT) {
116 * n was specified in characters,
117 * now we must convert it to bytes.
118 * As bytes are the smallest
119 * character unit, the following
120 * increment and strncasecmp is always
121 * safe.
123 * The source string was already known
124 * to be n characters long, so we are
125 * guaranteed to be able to look at the
126 * (n remaining + size1) bytes from the
127 * s1 position).
129 n += size1;
130 return strncasecmp(s1, s2, n);
133 s1 += size1;
134 s2 += size2;
136 if (c1 == c2) {
137 continue;
140 u1 = toupper_m(c1);
141 u2 = toupper_m(c2);
142 if (u1 == u2) {
143 continue;
146 l1 = tolower_m(c1);
147 l2 = tolower_m(c2);
148 if (l1 == l2) {
149 continue;
152 return l1 - l2;
155 if (n == 0) {
156 return 0;
159 return *s1 - *s2;
163 Case insensitive string compararison, length limited
165 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
167 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
168 return strncasecmp_m_handle(iconv_handle, s1, s2, n);
172 * Compare 2 strings.
174 * @note The comparison is case-insensitive.
176 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
178 return strcasecmp_m(s1,s2) == 0;
/**
 * Test two strings for equality (case sensitive).
 *
 * Identical pointers (including two NULLs) are equal; a single NULL is
 * never equal to a non-NULL string.  Otherwise the result is that of
 * strcmp().
 **/
_PUBLIC_ bool strcsequal(const char *s1,const char *s2)
{
	if (s1 != s2) {
		if (s1 == NULL || s2 == NULL) {
			return false;
		}
		return strcmp(s1, s2) == 0;
	}

	return true;
}
195 * Calculate the number of units (8 or 16-bit, depending on the
196 * destination charset), that would be needed to convert the input
197 * string which is expected to be in in src_charset encoding to the
198 * destination charset (which should be a unicode charset).
200 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
201 const char *s, charset_t src_charset, charset_t dst_charset)
203 size_t count = 0;
205 #ifdef DEVELOPER
206 switch (dst_charset) {
207 case CH_DOS:
208 case CH_UNIX:
209 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
210 default:
211 break;
214 switch (src_charset) {
215 case CH_UTF16LE:
216 case CH_UTF16BE:
217 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
218 default:
219 break;
221 #endif
222 if (!s) {
223 return 0;
226 while (*s && !(((uint8_t)*s) & 0x80)) {
227 s++;
228 count++;
231 if (!*s) {
232 return count;
235 while (*s) {
236 size_t c_size;
237 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
238 src_charset, &c_size);
239 s += c_size;
241 switch (dst_charset) {
242 case CH_UTF16LE:
243 case CH_UTF16BE:
244 case CH_UTF16MUNGED:
245 if (c < 0x10000) {
246 /* Unicode char fits into 16 bits. */
247 count += 1;
248 } else {
249 /* Double-width unicode char - 32 bits. */
250 count += 2;
252 break;
253 case CH_UTF8:
255 * this only checks ranges, and does not
256 * check for invalid codepoints
258 if (c < 0x80) {
259 count += 1;
260 } else if (c < 0x800) {
261 count += 2;
262 } else if (c < 0x10000) {
263 count += 3;
264 } else {
265 count += 4;
267 break;
268 default:
270 * non-unicode encoding:
271 * assume that each codepoint fits into
272 * one unit in the destination encoding.
274 count += 1;
278 return count;
282 * Calculate the number of units (8 or 16-bit, depending on the
283 * destination charset), that would be needed to convert the input
284 * string which is expected to be in in src_charset encoding to the
285 * destination charset (which should be a unicode charset).
287 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
289 struct smb_iconv_handle *ic = get_iconv_handle();
290 return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
293 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
294 const charset_t dst_charset)
296 if (!s) {
297 return 0;
299 return strlen_m_ext(s, src_charset, dst_charset) + 1;
302 _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
303 const charset_t src_charset,
304 const charset_t dst_charset)
306 size_t len;
307 if (!s) {
308 return 0;
310 len = strlen_m_ext(s, src_charset, dst_charset);
311 if (len == 0) {
312 return 0;
315 return len+1;
319 * Calculate the number of 16-bit units that would be needed to convert
320 * the input string which is expected to be in CH_UNIX encoding to UTF16.
322 * This will be the same as the number of bytes in a string for single
323 * byte strings, but will be different for multibyte.
325 _PUBLIC_ size_t strlen_m(const char *s)
327 return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
331 Work out the number of multibyte chars in a string, including the NULL
332 terminator.
334 _PUBLIC_ size_t strlen_m_term(const char *s)
336 return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
340 * Weird helper routine for the winreg pipe: If nothing is around, return 0,
341 * if a string is there, include the terminator.
344 _PUBLIC_ size_t strlen_m_term_null(const char *s)
346 return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
350 Strchr and strrchr_m are a bit complex on general multi-byte strings.
352 _PUBLIC_ char *strchr_m(const char *src, char c)
354 const char *s;
355 struct smb_iconv_handle *ic = get_iconv_handle();
356 if (src == NULL) {
357 return NULL;
359 /* characters below 0x3F are guaranteed to not appear in
360 non-initial position in multi-byte charsets */
361 if ((c & 0xC0) == 0) {
362 return strchr(src, c);
365 /* this is quite a common operation, so we want it to be
366 fast. We optimise for the ascii case, knowing that all our
367 supported multi-byte character sets are ascii-compatible
368 (ie. they match for the first 128 chars) */
370 for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
371 if (*s == c)
372 return discard_const_p(char, s);
375 if (!*s)
376 return NULL;
378 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
379 /* With compose characters we must restart from the beginning. JRA. */
380 s = src;
381 #endif
383 while (*s) {
384 size_t size;
385 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
386 if (c2 == c) {
387 return discard_const_p(char, s);
389 s += size;
392 return NULL;
396 * Multibyte-character version of strrchr
398 _PUBLIC_ char *strrchr_m(const char *s, char c)
400 struct smb_iconv_handle *ic;
401 char *ret = NULL;
403 if (s == NULL) {
404 return NULL;
407 /* characters below 0x3F are guaranteed to not appear in
408 non-initial position in multi-byte charsets */
409 if ((c & 0xC0) == 0) {
410 return strrchr(s, c);
413 /* this is quite a common operation, so we want it to be
414 fast. We optimise for the ascii case, knowing that all our
415 supported multi-byte character sets are ascii-compatible
416 (ie. they match for the first 128 chars). Also, in Samba
417 we only search for ascii characters in 'c' and that
418 in all mb character sets with a compound character
419 containing c, if 'c' is not a match at position
420 p, then p[-1] > 0x7f. JRA. */
423 size_t len = strlen(s);
424 const char *cp = s;
425 bool got_mb = false;
427 if (len == 0)
428 return NULL;
429 cp += (len - 1);
430 do {
431 if (c == *cp) {
432 /* Could be a match. Part of a multibyte ? */
433 if ((cp > s) &&
434 (((unsigned char)cp[-1]) & 0x80)) {
435 /* Yep - go slow :-( */
436 got_mb = true;
437 break;
439 /* No - we have a match ! */
440 return discard_const_p(char , cp);
442 } while (cp-- != s);
443 if (!got_mb)
444 return NULL;
447 ic = get_iconv_handle();
449 while (*s) {
450 size_t size;
451 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
452 if (c2 == c) {
453 ret = discard_const_p(char, s);
455 s += size;
458 return ret;
462 return True if any (multi-byte) character is lower case
464 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
465 const char *string)
467 while (*string) {
468 size_t c_size;
469 codepoint_t s;
470 codepoint_t t;
472 s = next_codepoint_handle(ic, string, &c_size);
473 string += c_size;
475 t = toupper_m(s);
477 if (s != t) {
478 return true; /* that means it has lower case chars */
482 return false;
485 _PUBLIC_ bool strhaslower(const char *string)
487 struct smb_iconv_handle *ic = get_iconv_handle();
488 return strhaslower_handle(ic, string);
492 return True if any (multi-byte) character is upper case
494 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
495 const char *string)
497 while (*string) {
498 size_t c_size;
499 codepoint_t s;
500 codepoint_t t;
502 s = next_codepoint_handle(ic, string, &c_size);
503 string += c_size;
505 t = tolower_m(s);
507 if (s != t) {
508 return true; /* that means it has upper case chars */
512 return false;
515 _PUBLIC_ bool strhasupper(const char *string)
517 struct smb_iconv_handle *ic = get_iconv_handle();
518 return strhasupper_handle(ic, string);
521 /***********************************************************************
522 strstr_m - We convert via ucs2 for now.
523 ***********************************************************************/
525 char *strstr_m(const char *src, const char *findstr)
527 smb_ucs2_t *p;
528 smb_ucs2_t *src_w, *find_w;
529 const char *s;
530 char *s2;
531 char *retp;
532 size_t converted_size, findstr_len = 0;
534 TALLOC_CTX *frame; /* Only set up in the iconv case */
536 /* for correctness */
537 if (!findstr[0]) {
538 return discard_const_p(char, src);
541 /* Samba does single character findstr calls a *lot*. */
542 if (findstr[1] == '\0')
543 return strchr_m(src, *findstr);
545 /* We optimise for the ascii case, knowing that all our
546 supported multi-byte character sets are ascii-compatible
547 (ie. they match for the first 128 chars) */
549 for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
550 if (*s == *findstr) {
551 if (!findstr_len)
552 findstr_len = strlen(findstr);
554 if (strncmp(s, findstr, findstr_len) == 0) {
555 return discard_const_p(char, s);
560 if (!*s)
561 return NULL;
563 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
564 /* 'make check' fails unless we do this */
566 /* With compose characters we must restart from the beginning. JRA. */
567 s = src;
568 #endif
570 frame = talloc_stackframe();
572 if (!push_ucs2_talloc(frame, &src_w, src, &converted_size)) {
573 DBG_WARNING("src malloc fail\n");
574 TALLOC_FREE(frame);
575 return NULL;
578 if (!push_ucs2_talloc(frame, &find_w, findstr, &converted_size)) {
579 DBG_WARNING("find malloc fail\n");
580 TALLOC_FREE(frame);
581 return NULL;
584 p = strstr_w(src_w, find_w);
586 if (!p) {
587 TALLOC_FREE(frame);
588 return NULL;
591 *p = 0;
592 if (!pull_ucs2_talloc(frame, &s2, src_w, &converted_size)) {
593 TALLOC_FREE(frame);
594 DEBUG(0,("strstr_m: dest malloc fail\n"));
595 return NULL;
597 retp = discard_const_p(char, (s+strlen(s2)));
598 TALLOC_FREE(frame);
599 return retp;