exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / mbuiterf.h
blob22e84627d7b3932d3bff1f934705aecd12415f2d
1 /* Iterating through multibyte strings, faster: macros for multi-byte encodings.
2 Copyright (C) 2001, 2005, 2007, 2009-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation, either version 3 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>,
18 with insights from Paul Eggert. */
20 /* The macros in this file implement forward iteration through a
21 multi-byte string, without knowing its length a-priori.
23 With these macros, an iteration loop that looks like
25 char *iter;
26 for (iter = buf; *iter != '\0'; iter++)
28 do_something (*iter);
31 becomes
33 mbuif_state_t state;
34 [const] char *iter;
35 for (mbuif_init (state), iter = buf; mbuif_avail (state, iter); )
37 mbchar_t cur = mbuif_next (state, iter);
38 // Note: Here always mb_ptr (cur) == iter.
39 do_something (iter, mb_len (cur));
40 iter += mb_len (cur);
43 The benefit of these macros over plain use of mbrtowc or mbrtoc32 is:
44 - Handling of invalid multibyte sequences is possible without
45 making the code more complicated, while still preserving the
46 invalid multibyte sequences.
48 Compared to mbiterf.h, the macros here don't need to know the string's
49 length a-priori. The downside is that at each step, the look-ahead
50 that guards against overrunning the terminating '\0' is more expensive.
51 The mbuif_* macros are therefore suitable when there is a high probability
52 that only the first few multibyte characters need to be inspected.
53 Whereas the mbif_* macros are better if usually the iteration runs
54 through the entire string.
56 The benefit of these macros over those from mbuiter.h is that they
57 produce faster code with today's optimizing compilers (because mbuif_next
58 returns its result by value).
60 mbuif_state_t
61 is a type usable for variable declarations.
63 mbuif_init (state)
64 initializes the state.
66 mbuif_avail (state, iter)
67 returns true if another loop round is needed.
69 mbuif_next (state, iter)
70 returns the next multibyte character.
71 It assumes that the state is initialized and that *iter != '\0'.
73 Here are the function prototypes of the macros.
75 extern void mbuif_init (mbuif_state_t state);
76 extern bool mbuif_avail (mbuif_state_t state, const char *iter);
77 extern mbchar_t mbuif_next (mbuif_state_t state, const char *iter);
80 #ifndef _MBUITERF_H
81 #define _MBUITERF_H 1
83 /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
84 _GL_ATTRIBUTE_ALWAYS_INLINE. */
85 #if !_GL_CONFIG_H_INCLUDED
86 #error "Please include config.h first."
87 #endif
89 #include <assert.h>
90 #include <stddef.h>
91 #include <stdlib.h>
92 #include <string.h>
93 #include <uchar.h>
94 #include <wchar.h>
96 #include "mbchar.h"
97 #include "strnlen1.h"
99 _GL_INLINE_HEADER_BEGIN
100 #ifndef MBUITERF_INLINE
101 # define MBUITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE
102 #endif
/* State carried from one mbuif_next invocation to the next.
   Declare variables of this type through mbuif_state_t and initialize them
   with mbuif_init before the first use.  */
struct mbuif_state
{
  #if !GNULIB_MBRTOC32_REGULAR
  bool in_shift;        /* true if next byte may not be interpreted as ASCII */
                        /* If GNULIB_MBRTOC32_REGULAR, it is always false,
                           so optimize it away.  */
  #endif
  mbstate_t state;      /* if in_shift: current shift state */
                        /* If GNULIB_MBRTOC32_REGULAR, it is in an initial state
                           before and after every mbuiterf_next invocation.
                         */
  unsigned int cur_max; /* A cache of MB_CUR_MAX.  */
};
118 MBUITERF_INLINE mbchar_t
119 mbuiterf_next (struct mbuif_state *ps, const char *iter)
121 #if !GNULIB_MBRTOC32_REGULAR
122 if (ps->in_shift)
123 goto with_shift;
124 #endif
125 /* Handle most ASCII characters quickly, without calling mbrtowc(). */
126 if (is_basic (*iter))
128 /* These characters are part of the POSIX portable character set.
129 For most of them, namely those in the ISO C basic character set,
130 ISO C 99 guarantees that their wide character code is identical to
131 their char code. For the few other ones, this is the case as well,
132 in all locale encodings that are in use. The 32-bit wide character
133 code is the same as well. */
134 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter };
136 else
138 assert (mbsinit (&ps->state));
139 #if !GNULIB_MBRTOC32_REGULAR
140 ps->in_shift = true;
141 with_shift:;
142 #endif
143 size_t bytes;
144 char32_t wc;
145 bytes = mbrtoc32 (&wc, iter, strnlen1 (iter, ps->cur_max), &ps->state);
146 if (bytes == (size_t) -1)
148 /* An invalid multibyte sequence was encountered. */
149 /* Allow the next invocation to continue from a sane state. */
150 #if !GNULIB_MBRTOC32_REGULAR
151 ps->in_shift = false;
152 #endif
153 mbszero (&ps->state);
154 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
156 else if (bytes == (size_t) -2)
158 /* An incomplete multibyte character at the end. */
159 /* Whether to set ps->in_shift = false and reset ps->state or not is
160 not important; the string end is reached anyway. */
161 return (mbchar_t) { .ptr = iter, .bytes = strlen (iter), .wc_valid = false };
163 else
165 if (bytes == 0)
167 /* A null wide character was encountered. */
168 bytes = 1;
169 assert (*iter == '\0');
170 assert (wc == 0);
172 #if !GNULIB_MBRTOC32_REGULAR
173 else if (bytes == (size_t) -3)
174 /* The previous multibyte sequence produced an additional 32-bit
175 wide character. */
176 bytes = 0;
177 #endif
179 /* When in an initial state, we can go back treating ASCII
180 characters more quickly. */
181 #if !GNULIB_MBRTOC32_REGULAR
182 if (mbsinit (&ps->state))
183 ps->in_shift = false;
184 #endif
185 return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc };
/* Iteration macros.  */

/* The type to use for declaring an iteration state variable.  */
typedef struct mbuif_state mbuif_state_t;

#if GNULIB_MBRTOC32_REGULAR

/* Optimized: the state has no in_shift member in this configuration.  */

/* Resets the conversion state and caches MB_CUR_MAX.  */
# define mbuif_init(st) \
    (mbszero (&(st).state), \
     (st).cur_max = MB_CUR_MAX)

/* Another loop round is needed as long as the terminating NUL is not
   reached.  */
# define mbuif_avail(st, iter) (*(iter) != '\0')

#else

/* Clears in_shift, resets the conversion state and caches MB_CUR_MAX.  */
# define mbuif_init(st) \
    ((st).in_shift = false, mbszero (&(st).state), \
     (st).cur_max = MB_CUR_MAX)

/* Another loop round is needed while in_shift is set or as long as the
   terminating NUL is not reached.  */
# define mbuif_avail(st, iter) ((st).in_shift || (*(iter) != '\0'))

#endif

/* Returns the multibyte character at ITER, by value.  */
#define mbuif_next(st, iter) \
  mbuiterf_next (&(st), (iter))
211 _GL_INLINE_HEADER_END
213 #endif /* _MBUITERF_H */