exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / mbiterf.h
blob28d2f8ce0255d0c95daa58da5a75bf55c96141b6
1 /* Iterating through multibyte strings, faster: macros for multi-byte encodings.
2 Copyright (C) 2001, 2005, 2007, 2009-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>,
18 with insights from Paul Eggert. */
20 /* The macros in this file implement forward iteration through a
21 multi-byte string.
23 With these macros, an iteration loop that looks like
25 char *iter;
26 for (iter = buf; iter < buf + buflen; iter++)
28 do_something (*iter);
31 becomes
33 const char *buf_end = buf + buflen;
34 mbif_state_t state;
35 [const] char *iter;
36 for (mbif_init (state), iter = buf; mbif_avail (state, iter, buf_end); )
38 mbchar_t cur = mbif_next (state, iter, buf_end);
39 // Note: Here always mb_ptr (cur) == iter.
40 do_something (iter, mb_len (cur));
41 iter += mb_len (cur);
44 The benefit of these macros over plain use of mbrtowc or mbrtoc32 is:
45 - Handling of invalid multibyte sequences is possible without
46 making the code more complicated, while still preserving the
47 invalid multibyte sequences.
49 The benefit of these macros over those from mbiter.h is that they
50 produce faster code with today's optimizing compilers (because mbif_next
51 returns its result by value).
53 mbif_state_t
54 is a type usable for variable declarations.
56 mbif_init (state)
57 initializes the state.
59 mbif_avail (state, iter, endptr)
60 returns true if another loop round is needed.
62 mbif_next (state, iter, endptr)
63 returns the next multibyte character.
64 It assumes that the state is initialized and that iter < endptr.
66 Here are the function prototypes of the macros.
68 extern void mbif_init (mbif_state_t state);
69 extern bool mbif_avail (mbif_state_t state, const char *iter, const char *endptr);
70 extern mbchar_t mbif_next (mbif_state_t state, const char *iter, const char *endptr);
73 #ifndef _MBITERF_H
74 #define _MBITERF_H 1
76 /* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
77 _GL_ATTRIBUTE_ALWAYS_INLINE. */
78 #if !_GL_CONFIG_H_INCLUDED
79 #error "Please include config.h first."
80 #endif
82 #include <assert.h>
83 #include <stddef.h>
84 #include <string.h>
85 #include <uchar.h>
86 #include <wchar.h>
88 #include "mbchar.h"
90 _GL_INLINE_HEADER_BEGIN
91 #ifndef MBITERF_INLINE
92 # define MBITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE
93 #endif
/* Iteration state carried from one mbiterf_next call to the next.  */
struct mbif_state
{
#if !GNULIB_MBRTOC32_REGULAR
  /* True if the next byte may not be interpreted as ASCII.
     When GNULIB_MBRTOC32_REGULAR is set this would always be false,
     so the member is optimized away.  */
  bool in_shift;
#endif
  /* While in_shift is true: the current shift state.
     When GNULIB_MBRTOC32_REGULAR is set, this is in an initial state
     before and after every mbiterf_next invocation.  */
  mbstate_t state;
};
108 MBITERF_INLINE mbchar_t
109 mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr)
111 #if !GNULIB_MBRTOC32_REGULAR
112 if (ps->in_shift)
113 goto with_shift;
114 #endif
115 /* Handle most ASCII characters quickly, without calling mbrtowc(). */
116 if (is_basic (*iter))
118 /* These characters are part of the POSIX portable character set.
119 For most of them, namely those in the ISO C basic character set,
120 ISO C 99 guarantees that their wide character code is identical to
121 their char code. For the few other ones, this is the case as well,
122 in all locale encodings that are in use. The 32-bit wide character
123 code is the same as well. */
124 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter };
126 else
128 assert (mbsinit (&ps->state));
129 #if !GNULIB_MBRTOC32_REGULAR
130 ps->in_shift = true;
131 with_shift:;
132 #endif
133 size_t bytes;
134 char32_t wc;
135 bytes = mbrtoc32 (&wc, iter, endptr - iter, &ps->state);
136 if (bytes == (size_t) -1)
138 /* An invalid multibyte sequence was encountered. */
139 /* Allow the next invocation to continue from a sane state. */
140 #if !GNULIB_MBRTOC32_REGULAR
141 ps->in_shift = false;
142 #endif
143 mbszero (&ps->state);
144 return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
146 else if (bytes == (size_t) -2)
148 /* An incomplete multibyte character at the end. */
149 #if !GNULIB_MBRTOC32_REGULAR
150 ps->in_shift = false;
151 #endif
152 /* Whether to reset ps->state or not is not important; the string end
153 is reached anyway. */
154 return (mbchar_t) { .ptr = iter, .bytes = endptr - iter, .wc_valid = false };
156 else
158 if (bytes == 0)
160 /* A null wide character was encountered. */
161 bytes = 1;
162 assert (*iter == '\0');
163 assert (wc == 0);
165 #if !GNULIB_MBRTOC32_REGULAR
166 else if (bytes == (size_t) -3)
167 /* The previous multibyte sequence produced an additional 32-bit
168 wide character. */
169 bytes = 0;
170 #endif
172 /* When in an initial state, we can go back treating ASCII
173 characters more quickly. */
174 #if !GNULIB_MBRTOC32_REGULAR
175 if (mbsinit (&ps->state))
176 ps->in_shift = false;
177 #endif
178 return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc };
/* Iteration macros.
   mbif_state_t   type of the iteration state variable.
   mbif_init      puts the state into its initial condition.
   mbif_avail     tells whether another loop round is needed.
   mbif_next      returns the next multibyte character, via mbiterf_next.  */
typedef struct mbif_state mbif_state_t;

#if !GNULIB_MBRTOC32_REGULAR
/* General case: the in_shift flag exists and takes part in both macros.  */
# define mbif_init(st) \
    ((st).in_shift = false, mbszero (&(st).state))
# define mbif_avail(st, iter, endptr) \
    ((st).in_shift || ((iter) < (endptr)))
#else
/* Optimized case: no in_shift member.  */
# define mbif_init(st) \
    (mbszero (&(st).state))
# define mbif_avail(st, iter, endptr) \
    ((iter) < (endptr))
#endif

#define mbif_next(st, iter, endptr) \
  mbiterf_next (&(st), (iter), (endptr))
202 _GL_INLINE_HEADER_END
204 #endif /* _MBITERF_H */