r21691: Add testsuite for lib/charset
[Samba/ekacnet.git] / source4 / lib / charset / util_unistr.c
blobca65d1fa002f5888367210d2813266573970f889
1 /*
2 Unix SMB/CIFS implementation.
3 Samba utility functions
4 Copyright (C) Andrew Tridgell 1992-2001
5 Copyright (C) Simo Sorce 2001
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #include "includes.h"
23 #include "system/locale.h"
24 #include "dynconfig.h"
26 /**
27 * @file
28 * @brief Unicode string manipulation
/*
 * These 2 tables define the unicode case handling. They are loaded
 * either via mmap() or read() from the lib directory (see
 * load_case_tables()).
 *
 * NULL means "not loaded yet"; load_case_tables() stores (void *)-1 in a
 * table that could not be loaded.
 */
static void *upcase_table = NULL;
static void *lowcase_table = NULL;
37 /*******************************************************************
38 load the case handling tables
39 ********************************************************************/
40 static void load_case_tables(void)
42 TALLOC_CTX *mem_ctx;
44 mem_ctx = talloc_init("load_case_tables");
45 if (!mem_ctx) {
46 smb_panic("No memory for case_tables");
48 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
49 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
50 talloc_free(mem_ctx);
51 if (upcase_table == NULL) {
52 /* try also under codepages for testing purposes */
53 upcase_table = map_file("codepages/upcase.dat", 0x20000);
54 if (upcase_table == NULL) {
55 upcase_table = (void *)-1;
58 if (lowcase_table == NULL) {
59 /* try also under codepages for testing purposes */
60 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
61 if (lowcase_table == NULL) {
62 lowcase_table = (void *)-1;
67 /**
68 Convert a codepoint_t to upper case.
69 **/
70 codepoint_t toupper_w(codepoint_t val)
72 if (val < 128) {
73 return toupper(val);
75 if (upcase_table == NULL) {
76 load_case_tables();
78 if (upcase_table == (void *)-1) {
79 return val;
81 if (val & 0xFFFF0000) {
82 return val;
84 return SVAL(upcase_table, val*2);
87 /**
88 Convert a codepoint_t to lower case.
89 **/
90 codepoint_t tolower_w(codepoint_t val)
92 if (val < 128) {
93 return tolower(val);
95 if (lowcase_table == NULL) {
96 load_case_tables();
98 if (lowcase_table == (void *)-1) {
99 return val;
101 if (val & 0xFFFF0000) {
102 return val;
104 return SVAL(lowcase_table, val*2);
108 compare two codepoints case insensitively
110 int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
112 if (c1 == c2 ||
113 toupper_w(c1) == toupper_w(c2)) {
114 return 0;
116 return c1 - c2;
120 Case insensitive string compararison
122 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
124 codepoint_t c1=0, c2=0;
125 size_t size1, size2;
127 /* handle null ptr comparisons to simplify the use in qsort */
128 if (s1 == s2) return 0;
129 if (s1 == NULL) return -1;
130 if (s2 == NULL) return 1;
132 while (*s1 && *s2) {
133 c1 = next_codepoint(s1, &size1);
134 c2 = next_codepoint(s2, &size2);
136 s1 += size1;
137 s2 += size2;
139 if (c1 == c2) {
140 continue;
143 if (c1 == INVALID_CODEPOINT ||
144 c2 == INVALID_CODEPOINT) {
145 /* what else can we do?? */
146 return strcasecmp(s1, s2);
149 if (toupper_w(c1) != toupper_w(c2)) {
150 return c1 - c2;
154 return *s1 - *s2;
158 * Get the next token from a string, return False if none found.
159 * Handles double-quotes.
161 * Based on a routine by GJC@VILLAGE.COM.
162 * Extensively modified by Andrew.Tridgell@anu.edu.au
164 _PUBLIC_ BOOL next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
166 const char *s;
167 BOOL quoted;
168 size_t len=1;
170 if (!ptr)
171 return(False);
173 s = *ptr;
175 /* default to simple separators */
176 if (!sep)
177 sep = " \t\n\r";
179 /* find the first non sep char */
180 while (*s && strchr_m(sep,*s))
181 s++;
183 /* nothing left? */
184 if (! *s)
185 return(False);
187 /* copy over the token */
188 for (quoted = False; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
189 if (*s == '\"') {
190 quoted = !quoted;
191 } else {
192 len++;
193 *buff++ = *s;
197 *ptr = (*s) ? s+1 : s;
198 *buff = 0;
200 return(True);
204 Case insensitive string compararison, length limited
206 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
208 codepoint_t c1=0, c2=0;
209 size_t size1, size2;
211 /* handle null ptr comparisons to simplify the use in qsort */
212 if (s1 == s2) return 0;
213 if (s1 == NULL) return -1;
214 if (s2 == NULL) return 1;
216 while (*s1 && *s2 && n) {
217 n--;
219 c1 = next_codepoint(s1, &size1);
220 c2 = next_codepoint(s2, &size2);
222 s1 += size1;
223 s2 += size2;
225 if (c1 == c2) {
226 continue;
229 if (c1 == INVALID_CODEPOINT ||
230 c2 == INVALID_CODEPOINT) {
231 /* what else can we do?? */
232 return strcasecmp(s1, s2);
235 if (toupper_w(c1) != toupper_w(c2)) {
236 return c1 - c2;
240 if (n == 0) {
241 return 0;
244 return *s1 - *s2;
248 * Compare 2 strings.
250 * @note The comparison is case-insensitive.
252 _PUBLIC_ BOOL strequal_w(const char *s1, const char *s2)
254 return strcasecmp_m(s1,s2) == 0;
258 Compare 2 strings (case sensitive).
260 _PUBLIC_ BOOL strcsequal_w(const char *s1,const char *s2)
262 if (s1 == s2)
263 return(True);
264 if (!s1 || !s2)
265 return(False);
267 return strcmp(s1,s2) == 0;
272 String replace.
273 NOTE: oldc and newc must be 7 bit characters
275 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
277 while (s && *s) {
278 size_t size;
279 codepoint_t c = next_codepoint(s, &size);
280 if (c == oldc) {
281 *s = newc;
283 s += size;
288 Paranoid strcpy into a buffer of given length (includes terminating
289 zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
290 and replaces with '_'. Deliberately does *NOT* check for multibyte
291 characters. Don't change it !
294 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
296 size_t len, i;
298 if (maxlength == 0) {
299 /* can't fit any bytes at all! */
300 return NULL;
303 if (!dest) {
304 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
305 return NULL;
308 if (!src) {
309 *dest = 0;
310 return dest;
313 len = strlen(src);
314 if (len >= maxlength)
315 len = maxlength - 1;
317 if (!other_safe_chars)
318 other_safe_chars = "";
320 for(i = 0; i < len; i++) {
321 int val = (src[i] & 0xff);
322 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
323 dest[i] = src[i];
324 else
325 dest[i] = '_';
328 dest[i] = '\0';
330 return dest;
334 Count the number of UCS2 characters in a string. Normally this will
335 be the same as the number of bytes in a string for single byte strings,
336 but will be different for multibyte.
338 _PUBLIC_ size_t strlen_m(const char *s)
340 size_t count = 0;
342 if (!s) {
343 return 0;
346 while (*s && !(((uint8_t)*s) & 0x80)) {
347 s++;
348 count++;
351 if (!*s) {
352 return count;
355 while (*s) {
356 size_t c_size;
357 codepoint_t c = next_codepoint(s, &c_size);
358 if (c < 0x10000) {
359 count += 1;
360 } else {
361 count += 2;
363 s += c_size;
366 return count;
370 Work out the number of multibyte chars in a string, including the NULL
371 terminator.
373 _PUBLIC_ size_t strlen_m_term(const char *s)
375 if (!s) {
376 return 0;
379 return strlen_m(s) + 1;
383 Strchr and strrchr_m are a bit complex on general multi-byte strings.
385 _PUBLIC_ char *strchr_m(const char *s, char c)
387 /* characters below 0x3F are guaranteed to not appear in
388 non-initial position in multi-byte charsets */
389 if ((c & 0xC0) == 0) {
390 return strchr(s, c);
393 while (*s) {
394 size_t size;
395 codepoint_t c2 = next_codepoint(s, &size);
396 if (c2 == c) {
397 return discard_const(s);
399 s += size;
402 return NULL;
406 * Multibyte-character version of strrchr
408 _PUBLIC_ char *strrchr_m(const char *s, char c)
410 char *ret = NULL;
412 /* characters below 0x3F are guaranteed to not appear in
413 non-initial position in multi-byte charsets */
414 if ((c & 0xC0) == 0) {
415 return strrchr(s, c);
418 while (*s) {
419 size_t size;
420 codepoint_t c2 = next_codepoint(s, &size);
421 if (c2 == c) {
422 ret = discard_const(s);
424 s += size;
427 return ret;
431 return True if any (multi-byte) character is lower case
433 _PUBLIC_ BOOL strhaslower(const char *string)
435 while (*string) {
436 size_t c_size;
437 codepoint_t s;
438 codepoint_t t;
440 s = next_codepoint(string, &c_size);
441 string += c_size;
443 t = toupper_w(s);
445 if (s != t) {
446 return True; /* that means it has lower case chars */
450 return False;
454 return True if any (multi-byte) character is upper case
456 _PUBLIC_ BOOL strhasupper(const char *string)
458 while (*string) {
459 size_t c_size;
460 codepoint_t s;
461 codepoint_t t;
463 s = next_codepoint(string, &c_size);
464 string += c_size;
466 t = tolower_w(s);
468 if (s != t) {
469 return True; /* that means it has upper case chars */
473 return False;
477 Convert a string to lower case, allocated with talloc
479 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
481 size_t size=0;
482 char *dest;
484 /* this takes advantage of the fact that upper/lower can't
485 change the length of a character by more than 1 byte */
486 dest = talloc_size(ctx, 2*(strlen(src))+1);
487 if (dest == NULL) {
488 return NULL;
491 while (*src) {
492 size_t c_size;
493 codepoint_t c = next_codepoint(src, &c_size);
494 src += c_size;
496 c = tolower_w(c);
498 c_size = push_codepoint(dest+size, c);
499 if (c_size == -1) {
500 talloc_free(dest);
501 return NULL;
503 size += c_size;
506 dest[size] = 0;
508 /* trim it so talloc_append_string() works */
509 dest = talloc_realloc_size(ctx, dest, size+1);
511 return dest;
515 Convert a string to UPPER case, allocated with talloc
517 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
519 size_t size=0;
520 char *dest;
522 if (!src) {
523 return NULL;
526 /* this takes advantage of the fact that upper/lower can't
527 change the length of a character by more than 1 byte */
528 dest = talloc_size(ctx, 2*(strlen(src))+1);
529 if (dest == NULL) {
530 return NULL;
533 while (*src) {
534 size_t c_size;
535 codepoint_t c = next_codepoint(src, &c_size);
536 src += c_size;
538 c = toupper_w(c);
540 c_size = push_codepoint(dest+size, c);
541 if (c_size == -1) {
542 talloc_free(dest);
543 return NULL;
545 size += c_size;
548 dest[size] = 0;
550 /* trim it so talloc_append_string() works */
551 dest = talloc_realloc_size(ctx, dest, size+1);
553 return dest;
557 Convert a string to lower case.
559 _PUBLIC_ void strlower_m(char *s)
561 char *d;
563 /* this is quite a common operation, so we want it to be
564 fast. We optimise for the ascii case, knowing that all our
565 supported multi-byte character sets are ascii-compatible
566 (ie. they match for the first 128 chars) */
567 while (*s && !(((uint8_t)*s) & 0x80)) {
568 *s = tolower((uint8_t)*s);
569 s++;
572 if (!*s)
573 return;
575 d = s;
577 while (*s) {
578 size_t c_size, c_size2;
579 codepoint_t c = next_codepoint(s, &c_size);
580 c_size2 = push_codepoint(d, tolower_w(c));
581 if (c_size2 > c_size) {
582 DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
583 c, tolower_w(c), (int)c_size, (int)c_size2));
584 smb_panic("codepoint expansion in strlower_m\n");
586 s += c_size;
587 d += c_size2;
589 *d = 0;
593 Convert a string to UPPER case.
595 _PUBLIC_ void strupper_m(char *s)
597 char *d;
599 /* this is quite a common operation, so we want it to be
600 fast. We optimise for the ascii case, knowing that all our
601 supported multi-byte character sets are ascii-compatible
602 (ie. they match for the first 128 chars) */
603 while (*s && !(((uint8_t)*s) & 0x80)) {
604 *s = toupper((uint8_t)*s);
605 s++;
608 if (!*s)
609 return;
611 d = s;
613 while (*s) {
614 size_t c_size, c_size2;
615 codepoint_t c = next_codepoint(s, &c_size);
616 c_size2 = push_codepoint(d, toupper_w(c));
617 if (c_size2 > c_size) {
618 DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
619 c, toupper_w(c), (int)c_size, (int)c_size2));
620 smb_panic("codepoint expansion in strupper_m\n");
622 s += c_size;
623 d += c_size2;
625 *d = 0;
630 Find the number of 'c' chars in a string
632 _PUBLIC_ size_t count_chars_w(const char *s, char c)
634 size_t count = 0;
636 while (*s) {
637 size_t size;
638 codepoint_t c2 = next_codepoint(s, &size);
639 if (c2 == c) count++;
640 s += size;
643 return count;