s3:rpc_server: Initialize array
[Samba.git] / lib / util / charset / util_str.c
blobc52b77384ceedbeb953de0aa085ede3842fbca4d
1 /*
2 Unix SMB/CIFS implementation.
3 Samba utility functions
4 Copyright (C) Andrew Tridgell 1992-2001
5 Copyright (C) Simo Sorce 2001
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Jeremy Allison 1992-2007
8 Copyright (C) Martin Pool 2003
9 Copyright (C) James Peach 2006
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or
14 (at your option) any later version.
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program. If not, see <http://www.gnu.org/licenses/>.
25 #include "replace.h"
26 #include "system/locale.h"
27 #include "charset.h"
28 #include "lib/util/fault.h"
29 #include "lib/util/tsort.h"
31 #ifdef strcasecmp
32 #undef strcasecmp
33 #endif
34 #ifdef strncasecmp
35 #undef strncasecmp
36 #endif
39 /**
40 Case insensitive string comparison, handle specified for testing
41 **/
42 _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
43 const char *s1, const char *s2)
45 codepoint_t c1=0, c2=0;
46 codepoint_t u1=0, u2=0;
47 codepoint_t l1=0, l2=0;
48 size_t size1, size2;
50 /* handle null ptr comparisons to simplify the use in qsort */
51 if (s1 == s2) return 0;
52 if (s1 == NULL) return -1;
53 if (s2 == NULL) return 1;
55 while (*s1 && *s2) {
56 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
57 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
59 if (c1 == INVALID_CODEPOINT ||
60 c2 == INVALID_CODEPOINT) {
61 return strcasecmp(s1, s2);
64 s1 += size1;
65 s2 += size2;
67 if (c1 == c2) {
68 continue;
71 u1 = toupper_m(c1);
72 u2 = toupper_m(c2);
73 if (u1 == u2) {
74 continue;
77 l1 = tolower_m(c1);
78 l2 = tolower_m(c2);
79 if (l1 == l2) {
80 continue;
83 return NUMERIC_CMP(l1, l2);
86 return NUMERIC_CMP(*s1, *s2);
89 /**
90 Case insensitive string comparison
91 **/
92 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
94 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
95 return strcasecmp_m_handle(iconv_handle, s1, s2);
98 /**
99 Case insensitive string comparison, length limited, handle specified for
100 testing
102 _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
103 const char *s1, const char *s2, size_t n)
105 codepoint_t c1=0, c2=0;
106 codepoint_t u1=0, u2=0;
107 codepoint_t l1=0, l2=0;
108 size_t size1, size2;
110 /* handle null ptr comparisons to simplify the use in qsort */
111 if (s1 == s2) return 0;
112 if (s1 == NULL) return -1;
113 if (s2 == NULL) return 1;
115 while (*s1 && *s2 && n) {
116 n--;
118 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
119 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
121 if (c1 == INVALID_CODEPOINT ||
122 c2 == INVALID_CODEPOINT) {
124 * n was specified in characters,
125 * now we must convert it to bytes.
126 * As bytes are the smallest
127 * character unit, the following
128 * increment and strncasecmp is always
129 * safe.
131 * The source string was already known
132 * to be n characters long, so we are
133 * guaranteed to be able to look at the
134 * (n remaining + size1) bytes from the
135 * s1 position).
137 n += size1;
138 return strncasecmp(s1, s2, n);
141 s1 += size1;
142 s2 += size2;
144 if (c1 == c2) {
145 continue;
148 u1 = toupper_m(c1);
149 u2 = toupper_m(c2);
150 if (u1 == u2) {
151 continue;
154 l1 = tolower_m(c1);
155 l2 = tolower_m(c2);
156 if (l1 == l2) {
157 continue;
160 return NUMERIC_CMP(l1, l2);
163 if (n == 0) {
164 return 0;
167 return NUMERIC_CMP(*s1, *s2);
171 Case insensitive string comparison, length limited
173 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
175 struct smb_iconv_handle *iconv_handle = get_iconv_handle();
176 return strncasecmp_m_handle(iconv_handle, s1, s2, n);
180 * Compare 2 strings.
182 * @note The comparison is case-insensitive.
184 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
186 return strcasecmp_m(s1,s2) == 0;
190 Compare 2 strings (case sensitive).
192 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
194 if (s1 == s2)
195 return true;
196 if (!s1 || !s2)
197 return false;
199 return strcmp(s1,s2) == 0;
203 * Calculate the number of units (8 or 16-bit, depending on the
204 * destination charset) that would be needed to convert the input
205 * string, which is expected to be in src_charset encoding, to the
206 * destination charset (which should be a unicode charset).
208 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
209 const char *s, charset_t src_charset, charset_t dst_charset)
211 size_t count = 0;
213 #ifdef DEVELOPER
214 switch (dst_charset) {
215 case CH_DOS:
216 case CH_UNIX:
217 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
218 default:
219 break;
222 switch (src_charset) {
223 case CH_UTF16LE:
224 case CH_UTF16BE:
225 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
226 default:
227 break;
229 #endif
230 if (!s) {
231 return 0;
234 while (*s && !(((uint8_t)*s) & 0x80)) {
235 s++;
236 count++;
239 if (!*s) {
240 return count;
243 while (*s) {
244 size_t c_size;
245 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
246 src_charset, &c_size);
247 s += c_size;
249 switch (dst_charset) {
250 case CH_UTF16LE:
251 case CH_UTF16BE:
252 case CH_UTF16MUNGED:
253 if (c < 0x10000) {
254 /* Unicode char fits into 16 bits. */
255 count += 1;
256 } else {
257 /* Double-width unicode char - 32 bits. */
258 count += 2;
260 break;
261 case CH_UTF8:
263 * this only checks ranges, and does not
264 * check for invalid codepoints
266 if (c < 0x80) {
267 count += 1;
268 } else if (c < 0x800) {
269 count += 2;
270 } else if (c < 0x10000) {
271 count += 3;
272 } else {
273 count += 4;
275 break;
276 default:
278 * non-unicode encoding:
279 * assume that each codepoint fits into
280 * one unit in the destination encoding.
282 count += 1;
286 return count;
290 * Calculate the number of units (8 or 16-bit, depending on the
291 * destination charset) that would be needed to convert the input
292 * string, which is expected to be in src_charset encoding, to the
293 * destination charset (which should be a unicode charset).
295 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
297 struct smb_iconv_handle *ic = get_iconv_handle();
298 return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
301 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
302 const charset_t dst_charset)
304 if (!s) {
305 return 0;
307 return strlen_m_ext(s, src_charset, dst_charset) + 1;
310 _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
311 const charset_t src_charset,
312 const charset_t dst_charset)
314 size_t len;
315 if (!s) {
316 return 0;
318 len = strlen_m_ext(s, src_charset, dst_charset);
319 if (len == 0) {
320 return 0;
323 return len+1;
327 * Calculate the number of 16-bit units that would be needed to convert
328 * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
330 * This will be the same as the number of bytes in a string for single
331 * byte strings, but will be different for multibyte.
333 _PUBLIC_ size_t strlen_m(const char *s)
335 return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
339 Work out the number of multibyte chars in a string, including the NULL
340 terminator.
342 _PUBLIC_ size_t strlen_m_term(const char *s)
344 return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
348 * Weird helper routine for the winreg pipe: If nothing is around, return 0,
349 * if a string is there, include the terminator.
352 _PUBLIC_ size_t strlen_m_term_null(const char *s)
354 return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
358 Strchr and strrchr_m are a bit complex on general multi-byte strings.
360 _PUBLIC_ char *strchr_m(const char *src, char c)
362 const char *s;
363 struct smb_iconv_handle *ic = get_iconv_handle();
364 if (src == NULL) {
365 return NULL;
367 /* characters below 0x3F are guaranteed to not appear in
368 non-initial position in multi-byte charsets */
369 if ((c & 0xC0) == 0) {
370 return strchr(src, c);
373 /* this is quite a common operation, so we want it to be
374 fast. We optimise for the ascii case, knowing that all our
375 supported multi-byte character sets are ascii-compatible
376 (ie. they match for the first 128 chars) */
378 for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
379 if (*s == c)
380 return discard_const_p(char, s);
383 if (!*s)
384 return NULL;
386 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
387 /* With compose characters we must restart from the beginning. JRA. */
388 s = src;
389 #endif
391 while (*s) {
392 size_t size;
393 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
394 if (c2 == c) {
395 return discard_const_p(char, s);
397 s += size;
400 return NULL;
404 * Multibyte-character version of strrchr
406 _PUBLIC_ char *strrchr_m(const char *s, char c)
408 struct smb_iconv_handle *ic;
409 char *ret = NULL;
411 if (s == NULL) {
412 return NULL;
415 /* characters below 0x3F are guaranteed to not appear in
416 non-initial position in multi-byte charsets */
417 if ((c & 0xC0) == 0) {
418 return strrchr(s, c);
421 /* this is quite a common operation, so we want it to be
422 fast. We optimise for the ascii case, knowing that all our
423 supported multi-byte character sets are ascii-compatible
424 (ie. they match for the first 128 chars). Also, in Samba
425 we only search for ascii characters in 'c' and that
426 in all mb character sets with a compound character
427 containing c, if 'c' is not a match at position
428 p, then p[-1] > 0x7f. JRA. */
431 size_t len = strlen(s);
432 const char *cp = s;
433 bool got_mb = false;
435 if (len == 0)
436 return NULL;
437 cp += (len - 1);
438 do {
439 if (c == *cp) {
440 /* Could be a match. Part of a multibyte ? */
441 if ((cp > s) &&
442 (((unsigned char)cp[-1]) & 0x80)) {
443 /* Yep - go slow :-( */
444 got_mb = true;
445 break;
447 /* No - we have a match ! */
448 return discard_const_p(char , cp);
450 } while (cp-- != s);
451 if (!got_mb)
452 return NULL;
455 ic = get_iconv_handle();
457 while (*s) {
458 size_t size;
459 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
460 if (c2 == c) {
461 ret = discard_const_p(char, s);
463 s += size;
466 return ret;
470 return True if any (multi-byte) character is lower case
472 _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
473 const char *string)
475 while (*string) {
476 size_t c_size;
477 codepoint_t s;
478 codepoint_t t;
480 s = next_codepoint_handle(ic, string, &c_size);
481 string += c_size;
483 t = toupper_m(s);
485 if (s != t) {
486 return true; /* that means it has lower case chars */
490 return false;
493 _PUBLIC_ bool strhaslower(const char *string)
495 struct smb_iconv_handle *ic = get_iconv_handle();
496 return strhaslower_handle(ic, string);
500 return True if any (multi-byte) character is upper case
502 _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
503 const char *string)
505 while (*string) {
506 size_t c_size;
507 codepoint_t s;
508 codepoint_t t;
510 s = next_codepoint_handle(ic, string, &c_size);
511 string += c_size;
513 t = tolower_m(s);
515 if (s != t) {
516 return true; /* that means it has upper case chars */
520 return false;
523 _PUBLIC_ bool strhasupper(const char *string)
525 struct smb_iconv_handle *ic = get_iconv_handle();
526 return strhasupper_handle(ic, string);
529 /***********************************************************************
530 strstr_m - We convert via ucs2 for now.
531 ***********************************************************************/
533 char *strstr_m(const char *src, const char *findstr)
535 TALLOC_CTX *mem_ctx = NULL;
536 smb_ucs2_t *p;
537 smb_ucs2_t *src_w, *find_w;
538 const char *s;
539 char *s2;
540 char *retp = NULL;
541 size_t converted_size, findstr_len = 0;
543 /* for correctness */
544 if (!findstr[0]) {
545 return discard_const_p(char, src);
548 /* Samba does single character findstr calls a *lot*. */
549 if (findstr[1] == '\0')
550 return strchr_m(src, *findstr);
552 /* We optimise for the ascii case, knowing that all our
553 supported multi-byte character sets are ascii-compatible
554 (ie. they match for the first 128 chars) */
556 for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
557 if (*s == *findstr) {
558 if (!findstr_len)
559 findstr_len = strlen(findstr);
561 if (strncmp(s, findstr, findstr_len) == 0) {
562 return discard_const_p(char, s);
567 if (!*s)
568 return NULL;
570 #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
571 /* 'make check' fails unless we do this */
573 /* With compose characters we must restart from the beginning. JRA. */
574 s = src;
575 #endif
578 * Use get_iconv_handle() just as a non-NULL talloc ctx. In
579 * case we leak memory, this should then be more obvious in
580 * the talloc report.
582 mem_ctx = talloc_new(get_iconv_handle());
583 if (mem_ctx == NULL) {
584 return NULL;
587 if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
588 goto done;
591 if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
592 goto done;
595 p = strstr_w(src_w, find_w);
597 if (!p) {
598 goto done;
601 *p = 0;
602 if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
603 goto done;
605 retp = discard_const_p(char, (s+strlen(s2)));
606 done:
607 TALLOC_FREE(mem_ctx);
608 return retp;