1 /* Elementary Unicode string functions.
2 Copyright (C) 2001-2002, 2005-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
22 /* Get common macros for C. */
23 #include "unused-parameter.h"
38 /* All functions prefixed with u8_ operate on UTF-8 encoded strings.
39 Their unit is a uint8_t (1 byte).
41 All functions prefixed with u16_ operate on UTF-16 encoded strings.
42 Their unit is a uint16_t (a 2-byte word).
44 All functions prefixed with u32_ operate on UCS-4 encoded strings.
45 Their unit is a uint32_t (a 4-byte word).
47 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly n units.
50 All arguments starting with "str" and the arguments of functions starting
51 with u8_str/u16_str/u32_str denote a NUL terminated string, i.e. a string
52 which terminates at the first NUL unit. This termination unit is
53 considered part of the string for all memory allocation purposes, but
54 is not considered part of the string for all other logical purposes.
56 Functions returning a string result take a (resultbuf, lengthp) argument
57 pair. If resultbuf is not NULL and the result fits into *lengthp units,
58 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
59 allocated string is returned. In both cases, *lengthp is set to the
60 length (number of units) of the returned string. In case of error,
61 NULL is returned and errno is set. */
64 /* Elementary string checks. */
66 /* Check whether a UTF-8 string is well-formed.
67 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
68 extern const uint8_t *
69 u8_check (const uint8_t *s
, size_t n
)
72 /* Check whether a UTF-16 string is well-formed.
73 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
74 extern const uint16_t *
75 u16_check (const uint16_t *s
, size_t n
)
78 /* Check whether a UCS-4 string is well-formed.
79 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
80 extern const uint32_t *
81 u32_check (const uint32_t *s
, size_t n
)
85 /* Elementary string conversions. */
87 /* Convert a UTF-8 string to a UTF-16 string. */
89 u8_to_u16 (const uint8_t *s
, size_t n
, uint16_t *resultbuf
,
92 /* Convert a UTF-8 string to a UCS-4 string. */
94 u8_to_u32 (const uint8_t *s
, size_t n
, uint32_t *resultbuf
,
97 /* Convert a UTF-16 string to a UTF-8 string. */
99 u16_to_u8 (const uint16_t *s
, size_t n
, uint8_t *resultbuf
,
102 /* Convert a UTF-16 string to a UCS-4 string. */
104 u16_to_u32 (const uint16_t *s
, size_t n
, uint32_t *resultbuf
,
107 /* Convert a UCS-4 string to a UTF-8 string. */
109 u32_to_u8 (const uint32_t *s
, size_t n
, uint8_t *resultbuf
,
112 /* Convert a UCS-4 string to a UTF-16 string. */
114 u32_to_u16 (const uint32_t *s
, size_t n
, uint16_t *resultbuf
,
118 /* Elementary string functions. */
120 /* Return the length (number of units) of the first character in S, which is
121 no longer than N. Return 0 if it is the NUL character. Return -1 upon failure. */
123 /* Similar to mblen(), except that s must not be NULL. */
125 u8_mblen (const uint8_t *s
, size_t n
)
128 u16_mblen (const uint16_t *s
, size_t n
)
131 u32_mblen (const uint32_t *s
, size_t n
)
134 /* Return the length (number of units) of the first character in S, putting
135 its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd,
136 and an appropriate number of units is returned.
137 The number of available units, N, must be > 0. */
138 /* Similar to mbtowc(), except that puc and s must not be NULL, n must be > 0,
139 and the NUL character is not treated specially. */
140 /* The variants with _unsafe suffix are for backward compatibility with
141 libunistring versions < 0.9.7. */
143 #if GNULIB_UNISTR_U8_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
146 u8_mbtouc_unsafe (ucs4_t
*puc
, const uint8_t *s
, size_t n
);
149 u8_mbtouc_unsafe_aux (ucs4_t
*puc
, const uint8_t *s
, size_t n
);
151 u8_mbtouc_unsafe (ucs4_t
*puc
, const uint8_t *s
, size_t n
)
161 return u8_mbtouc_unsafe_aux (puc
, s
, n
);
166 #if GNULIB_UNISTR_U16_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
169 u16_mbtouc_unsafe (ucs4_t
*puc
, const uint16_t *s
, size_t n
);
172 u16_mbtouc_unsafe_aux (ucs4_t
*puc
, const uint16_t *s
, size_t n
);
174 u16_mbtouc_unsafe (ucs4_t
*puc
, const uint16_t *s
, size_t n
)
178 if (c
< 0xd800 || c
>= 0xe000)
184 return u16_mbtouc_unsafe_aux (puc
, s
, n
);
189 #if GNULIB_UNISTR_U32_MBTOUC_UNSAFE || HAVE_LIBUNISTRING
192 u32_mbtouc_unsafe (ucs4_t
*puc
, const uint32_t *s
, size_t n
);
195 u32_mbtouc_unsafe (ucs4_t
*puc
,
196 const uint32_t *s
, size_t n _GL_UNUSED_PARAMETER
)
200 if (c
< 0xd800 || (c
>= 0xe000 && c
< 0x110000))
203 /* invalid multibyte character */
210 #if GNULIB_UNISTR_U8_MBTOUC || HAVE_LIBUNISTRING
213 u8_mbtouc (ucs4_t
*puc
, const uint8_t *s
, size_t n
);
216 u8_mbtouc_aux (ucs4_t
*puc
, const uint8_t *s
, size_t n
);
218 u8_mbtouc (ucs4_t
*puc
, const uint8_t *s
, size_t n
)
228 return u8_mbtouc_aux (puc
, s
, n
);
233 #if GNULIB_UNISTR_U16_MBTOUC || HAVE_LIBUNISTRING
236 u16_mbtouc (ucs4_t
*puc
, const uint16_t *s
, size_t n
);
239 u16_mbtouc_aux (ucs4_t
*puc
, const uint16_t *s
, size_t n
);
241 u16_mbtouc (ucs4_t
*puc
, const uint16_t *s
, size_t n
)
245 if (c
< 0xd800 || c
>= 0xe000)
251 return u16_mbtouc_aux (puc
, s
, n
);
256 #if GNULIB_UNISTR_U32_MBTOUC || HAVE_LIBUNISTRING
259 u32_mbtouc (ucs4_t
*puc
, const uint32_t *s
, size_t n
);
262 u32_mbtouc (ucs4_t
*puc
, const uint32_t *s
, size_t n _GL_UNUSED_PARAMETER
)
266 if (c
< 0xd800 || (c
>= 0xe000 && c
< 0x110000))
269 /* invalid multibyte character */
276 /* Return the length (number of units) of the first character in S, putting
277 its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd,
278 and -1 is returned for an invalid sequence of units, -2 is returned for an
279 incomplete sequence of units.
280 The number of available units, N, must be > 0. */
281 /* Similar to u*_mbtouc(), except that the return value gives more details
282 about the failure, similar to mbrtowc(). */
284 #if GNULIB_UNISTR_U8_MBTOUCR || HAVE_LIBUNISTRING
286 u8_mbtoucr (ucs4_t
*puc
, const uint8_t *s
, size_t n
);
289 #if GNULIB_UNISTR_U16_MBTOUCR || HAVE_LIBUNISTRING
291 u16_mbtoucr (ucs4_t
*puc
, const uint16_t *s
, size_t n
);
294 #if GNULIB_UNISTR_U32_MBTOUCR || HAVE_LIBUNISTRING
296 u32_mbtoucr (ucs4_t
*puc
, const uint32_t *s
, size_t n
);
299 /* Put the multibyte character represented by UC in S, returning its
300 length. Return -1 upon failure, -2 if the number of available units, N,
301 is too small. The latter case cannot occur if N >= 6, 2, or 1 for the u8_, u16_, and u32_ variants, respectively. */
302 /* Similar to wctomb(), except that s must not be NULL, and the argument n
303 must be specified. */
305 #if GNULIB_UNISTR_U8_UCTOMB || HAVE_LIBUNISTRING
306 /* Auxiliary function, also used by u8_chr, u8_strchr, u8_strrchr. */
308 u8_uctomb_aux (uint8_t *s
, ucs4_t uc
, int n
);
311 u8_uctomb (uint8_t *s
, ucs4_t uc
, int n
);
314 u8_uctomb (uint8_t *s
, ucs4_t uc
, int n
)
316 if (uc
< 0x80 && n
> 0)
322 return u8_uctomb_aux (s
, uc
, n
);
327 #if GNULIB_UNISTR_U16_UCTOMB || HAVE_LIBUNISTRING
328 /* Auxiliary function, also used by u16_chr, u16_strchr, u16_strrchr. */
330 u16_uctomb_aux (uint16_t *s
, ucs4_t uc
, int n
);
333 u16_uctomb (uint16_t *s
, ucs4_t uc
, int n
);
336 u16_uctomb (uint16_t *s
, ucs4_t uc
, int n
)
338 if (uc
< 0xd800 && n
> 0)
344 return u16_uctomb_aux (s
, uc
, n
);
349 #if GNULIB_UNISTR_U32_UCTOMB || HAVE_LIBUNISTRING
352 u32_uctomb (uint32_t *s
, ucs4_t uc
, int n
);
355 u32_uctomb (uint32_t *s
, ucs4_t uc
, int n
)
357 if (uc
< 0xd800 || (uc
>= 0xe000 && uc
< 0x110000))
373 /* Copy N units from SRC to DEST. */
374 /* Similar to memcpy(). */
376 u8_cpy (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
, size_t n
);
378 u16_cpy (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
, size_t n
);
380 u32_cpy (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
, size_t n
);
382 /* Copy N units from SRC to DEST, guaranteeing correct behavior for
383 overlapping memory areas. */
384 /* Similar to memmove(). */
386 u8_move (uint8_t *dest
, const uint8_t *src
, size_t n
);
388 u16_move (uint16_t *dest
, const uint16_t *src
, size_t n
);
390 u32_move (uint32_t *dest
, const uint32_t *src
, size_t n
);
392 /* Set the first N characters of S to UC. UC should be a character that
393 occupies only 1 unit. */
394 /* Similar to memset(). */
396 u8_set (uint8_t *s
, ucs4_t uc
, size_t n
);
398 u16_set (uint16_t *s
, ucs4_t uc
, size_t n
);
400 u32_set (uint32_t *s
, ucs4_t uc
, size_t n
);
402 /* Compare S1 and S2, each of length N. */
403 /* Similar to memcmp(). */
405 u8_cmp (const uint8_t *s1
, const uint8_t *s2
, size_t n
)
408 u16_cmp (const uint16_t *s1
, const uint16_t *s2
, size_t n
)
411 u32_cmp (const uint32_t *s1
, const uint32_t *s2
, size_t n
)
414 /* Compare S1 and S2. */
415 /* Similar to the gnulib function memcmp2(). */
417 u8_cmp2 (const uint8_t *s1
, size_t n1
, const uint8_t *s2
, size_t n2
)
420 u16_cmp2 (const uint16_t *s1
, size_t n1
, const uint16_t *s2
, size_t n2
)
423 u32_cmp2 (const uint32_t *s1
, size_t n1
, const uint32_t *s2
, size_t n2
)
426 /* Search the string at S for UC. */
427 /* Similar to memchr(). */
429 u8_chr (const uint8_t *s
, size_t n
, ucs4_t uc
)
432 u16_chr (const uint16_t *s
, size_t n
, ucs4_t uc
)
435 u32_chr (const uint32_t *s
, size_t n
, ucs4_t uc
)
438 /* Count the number of Unicode characters in the N units from S. */
439 /* Similar to mbsnlen(). */
441 u8_mbsnlen (const uint8_t *s
, size_t n
)
444 u16_mbsnlen (const uint16_t *s
, size_t n
)
447 u32_mbsnlen (const uint32_t *s
, size_t n
)
450 /* Elementary string functions with memory allocation. */
452 /* Make a freshly allocated copy of S, of length N. */
454 u8_cpy_alloc (const uint8_t *s
, size_t n
);
456 u16_cpy_alloc (const uint16_t *s
, size_t n
);
458 u32_cpy_alloc (const uint32_t *s
, size_t n
);
460 /* Elementary string functions on NUL terminated strings. */
462 /* Return the length (number of units) of the first character in S.
463 Return 0 if it is the NUL character. Return -1 upon failure. */
465 u8_strmblen (const uint8_t *s
)
468 u16_strmblen (const uint16_t *s
)
471 u32_strmblen (const uint32_t *s
)
474 /* Return the length (number of units) of the first character in S, putting
475 its 'ucs4_t' representation in *PUC. Return 0 if it is the NUL
476 character. Return -1 upon failure. */
478 u8_strmbtouc (ucs4_t
*puc
, const uint8_t *s
);
480 u16_strmbtouc (ucs4_t
*puc
, const uint16_t *s
);
482 u32_strmbtouc (ucs4_t
*puc
, const uint32_t *s
);
484 /* Forward iteration step.  Advances the pointer past the next character,
485 or returns NULL if the end of the string has been reached.  Puts the
486 character's 'ucs4_t' representation in *PUC.
   UTF-8 variant: S and the returned pointer address uint8_t units.
   NOTE(review): no length argument is taken, so S is presumably a NUL
   terminated string -- confirm against the implementation.  */
487 extern const uint8_t *
488 u8_next (ucs4_t
*puc
, const uint8_t *s
);
/* UTF-16 variant of the forward iteration step: S and the returned pointer
   address uint16_t units, and the character found is written to *PUC.
   NOTE(review): no length argument is taken, so S is presumably a NUL
   terminated string -- confirm against the implementation.  */
489 extern const uint16_t *
490 u16_next (ucs4_t
*puc
, const uint16_t *s
);
/* UCS-4 variant of the forward iteration step: S and the returned pointer
   address uint32_t units, and the character found is written to *PUC.
   NOTE(review): no length argument is taken, so S is presumably a NUL
   terminated string -- confirm against the implementation.  */
491 extern const uint32_t *
492 u32_next (ucs4_t
*puc
, const uint32_t *s
);
494 /* Backward iteration step.  Advances the pointer to point to the previous
495 character, or returns NULL if the beginning of the string has been reached.
496 Puts the character's 'ucs4_t' representation in *PUC.
   UTF-8 variant: S, START and the returned pointer address uint8_t units.
   NOTE(review): START is presumably the beginning of the string, i.e. the
   position at which backward iteration stops -- confirm against the
   implementation.  */
497 extern const uint8_t *
498 u8_prev (ucs4_t
*puc
, const uint8_t *s
, const uint8_t *start
);
/* UTF-16 variant of the backward iteration step: S, START and the returned
   pointer address uint16_t units, and the character found is written to
   *PUC.
   NOTE(review): START is presumably the beginning of the string -- confirm
   against the implementation.  */
499 extern const uint16_t *
500 u16_prev (ucs4_t
*puc
, const uint16_t *s
, const uint16_t *start
);
/* UCS-4 variant of the backward iteration step: S, START and the returned
   pointer address uint32_t units, and the character found is written to
   *PUC.
   NOTE(review): START is presumably the beginning of the string -- confirm
   against the implementation.  */
501 extern const uint32_t *
502 u32_prev (ucs4_t
*puc
, const uint32_t *s
, const uint32_t *start
);
504 /* Return the number of units in S. */
505 /* Similar to strlen(), wcslen(). */
507 u8_strlen (const uint8_t *s
)
510 u16_strlen (const uint16_t *s
)
513 u32_strlen (const uint32_t *s
)
516 /* Return the number of units in S, but at most MAXLEN. */
517 /* Similar to strnlen(), wcsnlen(). */
519 u8_strnlen (const uint8_t *s
, size_t maxlen
)
522 u16_strnlen (const uint16_t *s
, size_t maxlen
)
525 u32_strnlen (const uint32_t *s
, size_t maxlen
)
528 /* Copy SRC to DEST. */
529 /* Similar to strcpy(), wcscpy(). */
531 u8_strcpy (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
);
533 u16_strcpy (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
);
535 u32_strcpy (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
);
537 /* Copy SRC to DEST, returning the address of the terminating NUL in DEST. */
538 /* Similar to stpcpy(). */
540 u8_stpcpy (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
);
542 u16_stpcpy (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
);
544 u32_stpcpy (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
);
546 /* Copy no more than N units of SRC to DEST. */
547 /* Similar to strncpy(), wcsncpy(). */
549 u8_strncpy (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
, size_t n
);
551 u16_strncpy (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
, size_t n
);
553 u32_strncpy (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
, size_t n
);
555 /* Copy no more than N units of SRC to DEST. Return a pointer past the last
556 non-NUL unit written into DEST. */
557 /* Similar to stpncpy(). */
559 u8_stpncpy (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
, size_t n
);
561 u16_stpncpy (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
, size_t n
);
563 u32_stpncpy (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
, size_t n
);
565 /* Append SRC onto DEST. */
566 /* Similar to strcat(), wcscat(). */
568 u8_strcat (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
);
570 u16_strcat (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
);
572 u32_strcat (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
);
574 /* Append no more than N units of SRC onto DEST. */
575 /* Similar to strncat(), wcsncat(). */
577 u8_strncat (uint8_t *_UC_RESTRICT dest
, const uint8_t *src
, size_t n
);
579 u16_strncat (uint16_t *_UC_RESTRICT dest
, const uint16_t *src
, size_t n
);
581 u32_strncat (uint32_t *_UC_RESTRICT dest
, const uint32_t *src
, size_t n
);
583 /* Compare S1 and S2. */
584 /* Similar to strcmp(), wcscmp(). */
586 /* Avoid a collision with the u8_strcmp() function in Solaris 11 libc. */
588 u8_strcmp_gnu (const uint8_t *s1
, const uint8_t *s2
)
590 # define u8_strcmp u8_strcmp_gnu
593 u8_strcmp (const uint8_t *s1
, const uint8_t *s2
)
597 u16_strcmp (const uint16_t *s1
, const uint16_t *s2
)
600 u32_strcmp (const uint32_t *s1
, const uint32_t *s2
)
603 /* Compare S1 and S2 using the collation rules of the current locale.
604 Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
605 Upon failure, set errno and return any value. */
606 /* Similar to strcoll(), wcscoll(). */
608 u8_strcoll (const uint8_t *s1
, const uint8_t *s2
);
610 u16_strcoll (const uint16_t *s1
, const uint16_t *s2
);
612 u32_strcoll (const uint32_t *s1
, const uint32_t *s2
);
614 /* Compare no more than N units of S1 and S2. */
615 /* Similar to strncmp(), wcsncmp(). */
617 u8_strncmp (const uint8_t *s1
, const uint8_t *s2
, size_t n
)
620 u16_strncmp (const uint16_t *s1
, const uint16_t *s2
, size_t n
)
623 u32_strncmp (const uint32_t *s1
, const uint32_t *s2
, size_t n
)
626 /* Duplicate S, returning an identical malloc'd string. */
627 /* Similar to strdup(), wcsdup(). */
629 u8_strdup (const uint8_t *s
);
631 u16_strdup (const uint16_t *s
);
633 u32_strdup (const uint32_t *s
);
635 /* Find the first occurrence of UC in STR. */
636 /* Similar to strchr(), wcschr(). */
638 u8_strchr (const uint8_t *str
, ucs4_t uc
)
641 u16_strchr (const uint16_t *str
, ucs4_t uc
)
644 u32_strchr (const uint32_t *str
, ucs4_t uc
)
647 /* Find the last occurrence of UC in STR. */
648 /* Similar to strrchr(), wcsrchr(). */
650 u8_strrchr (const uint8_t *str
, ucs4_t uc
)
653 u16_strrchr (const uint16_t *str
, ucs4_t uc
)
656 u32_strrchr (const uint32_t *str
, ucs4_t uc
)
659 /* Return the length of the initial segment of STR which consists entirely
660 of Unicode characters not in REJECT. */
661 /* Similar to strcspn(), wcscspn(). */
663 u8_strcspn (const uint8_t *str
, const uint8_t *reject
)
666 u16_strcspn (const uint16_t *str
, const uint16_t *reject
)
669 u32_strcspn (const uint32_t *str
, const uint32_t *reject
)
672 /* Return the length of the initial segment of STR which consists entirely
673 of Unicode characters in ACCEPT. */
674 /* Similar to strspn(), wcsspn(). */
676 u8_strspn (const uint8_t *str
, const uint8_t *accept
)
679 u16_strspn (const uint16_t *str
, const uint16_t *accept
)
682 u32_strspn (const uint32_t *str
, const uint32_t *accept
)
685 /* Find the first occurrence in STR of any character in ACCEPT. */
686 /* Similar to strpbrk(), wcspbrk(). */
688 u8_strpbrk (const uint8_t *str
, const uint8_t *accept
)
691 u16_strpbrk (const uint16_t *str
, const uint16_t *accept
)
694 u32_strpbrk (const uint32_t *str
, const uint32_t *accept
)
697 /* Find the first occurrence of NEEDLE in HAYSTACK. */
698 /* Similar to strstr(), wcsstr(). */
700 u8_strstr (const uint8_t *haystack
, const uint8_t *needle
)
703 u16_strstr (const uint16_t *haystack
, const uint16_t *needle
)
706 u32_strstr (const uint32_t *haystack
, const uint32_t *needle
)
709 /* Test whether STR starts with PREFIX. */
711 u8_startswith (const uint8_t *str
, const uint8_t *prefix
)
714 u16_startswith (const uint16_t *str
, const uint16_t *prefix
)
717 u32_startswith (const uint32_t *str
, const uint32_t *prefix
)
720 /* Test whether STR ends with SUFFIX. */
722 u8_endswith (const uint8_t *str
, const uint8_t *suffix
)
725 u16_endswith (const uint16_t *str
, const uint16_t *suffix
)
728 u32_endswith (const uint32_t *str
, const uint32_t *suffix
)
731 /* Divide STR into tokens separated by characters in DELIM.
732 This interface is actually more similar to wcstok than to strtok. */
733 /* Similar to strtok_r(), wcstok(). */
735 u8_strtok (uint8_t *_UC_RESTRICT str
, const uint8_t *delim
,
738 u16_strtok (uint16_t *_UC_RESTRICT str
, const uint16_t *delim
,
741 u32_strtok (uint32_t *_UC_RESTRICT str
, const uint32_t *delim
,
749 #endif /* _UNISTR_H */