exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / mcel.h
blobf6f006c90097c0cd2ddd7cf7025c7ee27dc66e8b
1 /* Multi-byte characters, Error encodings, and Lengths (MCELs)
2 Copyright 2023-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Paul Eggert. */
19 /* The macros in this file implement multi-byte character representation
20 and forward iteration through a multi-byte string.
21 They are simpler and can be faster than the mbiter family.
22 However, they do not support obsolescent encodings like CP864,
23 EBCDIC, Johab, and Shift JIS that glibc also does not support,
24 and it is up to the caller to coalesce encoding-error bytes if desired.
26 The mcel_scan function lets code iterate through an array of bytes,
27 supporting character encodings in practical use
28 more simply than using plain mbrtoc32.
30 Instead of this single-byte code:
32 char *p = ..., *lim = ...;
33 for (; p < lim; p++)
34 process (*p);
36 You can use this multi-byte code:
38 char *p = ..., *lim = ...;
39 for (mcel_t g; p < lim; p += g.len)
40 {
41 g = mcel_scan (p, lim);
42 process (g);
43 }
45 You can select from G using G.ch, G.err, and G.len.
46 G is an encoding error if G.err is nonzero, a character otherwise.
48 The mcel_scanz function is similar except it works with a
49 string of unknown but positive length that is terminated with '\0'.
50 Instead of this single-byte code:
52 char *p = ...;
53 for (; *p; p++)
54 process (*p);
56 You can use this multi-byte code:
58 char *p = ...;
59 for (mcel_t g; *p; p += g.len)
60 {
61 g = mcel_scanz (p);
62 process (g);
63 }
65 mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
66 string is terminated by TERMINATOR. The C standard says that the
67 TERMINATORs '\0', '\r', '\n', '.', '/' are safe, as they cannot be
68 a part (even a trailing byte) of a multi-byte character.
69 In practice TERMINATOR is safe if 0 <= TERMINATOR <= 0x2f (ASCII '/').
71 mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.
73 mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
74 character or by encoding byte value, with encoding bytes sorting
75 after characters.
77 Calls like c32isalpha (G.ch) test G; they return false for encoding
78 errors since calls like c32isalpha (0) return false. Calls like
79 mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
80 but transliterate first.
82 Although ISO C and POSIX allow encodings that have shift states or
83 that can produce multiple characters from an indivisible byte sequence,
84 POSIX does not require support for these encodings,
85 they are not in practical use on GNUish platforms,
86 and omitting support for them simplifies the API. */
88 #ifndef _MCEL_H
89 #define _MCEL_H 1
91 #if !_GL_CONFIG_H_INCLUDED
92 #error "Please include config.h first."
93 #endif
95 #include <verify.h>
97 #include <limits.h>
98 #include <stddef.h>
99 #include <uchar.h>
101 /* Pacify GCC re type limits. */
102 #if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
103 # pragma GCC diagnostic ignored "-Wtype-limits"
104 #endif
/* The maximum multi-byte character length supported on any platform.
   This can be less than MB_LEN_MAX because many platforms have a
   large MB_LEN_MAX to allow for stateful encodings, and mcel does not
   support these encodings.  MCEL_LEN_MAX is enough for UTF-8, EUC,
   Shift-JIS, GB18030, etc.  In all multi-byte encodings supported by glibc,
   0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };

/* Bounds for mcel_t members: the largest value of the CH member,
   and the smallest nonzero value of the ERR member.  */
enum { MCEL_CHAR_MAX = 0x10FFFF };
enum { MCEL_ERR_MIN = 0x80 };
/* mcel_t is a type representing a character CH or an encoding error byte ERR,
   along with a count of the LEN bytes that represent CH or ERR.
   If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
   otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR,
   CH == 0, and LEN == 1.  */
typedef struct
{
  char32_t ch;        /* The character, or 0 for an encoding error.  */
  unsigned char err;  /* The encoding error byte, or 0 for a character.  */
  unsigned char len;  /* Number of bytes representing CH or ERR.  */
} mcel_t;
130 /* Every multi-byte character length fits in mcel_t's LEN. */
131 static_assert (MB_LEN_MAX <= UCHAR_MAX);
133 /* Shifting an encoding error byte left by this value
134 suffices to sort encoding errors after characters. */
135 enum { MCEL_ERR_SHIFT = 14 };
136 static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
138 /* Unsigned char promotes to int. */
139 static_assert (UCHAR_MAX <= INT_MAX);
141 /* Bytes have 8 bits, as POSIX requires. */
142 static_assert (CHAR_BIT == 8);
144 #ifndef _GL_LIKELY
145 /* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
146 # define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
147 # define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
148 #endif
150 _GL_INLINE_HEADER_BEGIN
151 #ifndef MCEL_INLINE
152 # define MCEL_INLINE _GL_INLINE
153 #endif
155 /* mcel_t constructors. */
156 MCEL_INLINE mcel_t
157 mcel_ch (char32_t ch, size_t len)
159 assume (0 < len);
160 assume (len <= MCEL_LEN_MAX);
161 assume (ch <= MCEL_CHAR_MAX);
162 return (mcel_t) {.ch = ch, .len = len};
164 MCEL_INLINE mcel_t
165 mcel_err (unsigned char err)
167 assume (MCEL_ERR_MIN <= err);
168 return (mcel_t) {.err = err, .len = 1};
171 /* Compare C1 and C2, with encoding errors sorting after characters.
172 Return <0, 0, >0 for <, =, >. */
173 MCEL_INLINE int
174 mcel_cmp (mcel_t c1, mcel_t c2)
176 int ch1 = c1.ch, ch2 = c2.ch;
177 return ((c1.err - c2.err) * (1 << MCEL_ERR_SHIFT)) + (ch1 - ch2);
180 /* Apply the uchar translator TO to C1 and C2 and compare the results,
181 with encoding errors sorting after characters,
182 Return <0, 0, >0 for <, =, >. */
183 MCEL_INLINE int
184 mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
186 int cmp = mcel_cmp (c1, c2);
187 if (_GL_LIKELY ((c1.err - c2.err) | !cmp))
188 return cmp;
189 int ch1 = to (c1.ch), ch2 = to (c2.ch);
190 return ch1 - ch2;
193 /* Whether C represents itself as a Unicode character
194 when it is the first byte of a single- or multi-byte character.
195 These days it is safe to assume ASCII, so do not support
196 obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS. */
197 MCEL_INLINE bool
198 mcel_isbasic (char c)
200 return _GL_LIKELY (0 <= c && c < MCEL_ERR_MIN);
203 /* With mcel there should be no need for the performance overhead of
204 replacing glibc mbrtoc32, as callers shouldn't care whether the
205 C locale treats a byte with the high bit set as an encoding error. */
206 #ifdef __GLIBC__
207 # undef mbrtoc32
208 #endif
210 /* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM.
211 Return the character or encoding error starting at P. */
212 MCEL_INLINE mcel_t
213 mcel_scan (char const *p, char const *lim)
215 /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
216 In supported encodings, the first byte of a multi-byte character
217 cannot be an ASCII byte. */
218 char c = *p;
219 if (mcel_isbasic (c))
220 return mcel_ch (c, 1);
222 /* An initial mbstate_t; initialization optimized for some platforms.
223 For details about these and other platforms, see wchar.in.h. */
224 #if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
225 /* Although only a trivial optimization, it's worth it for GNU. */
226 mbstate_t mbs; mbs.__count = 0;
227 #elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
228 || (defined __APPLE__ && defined __MACH__))
229 /* These platforms have 128-byte mbstate_t. What were they thinking?
230 Initialize just for supported encodings (UTF-8, EUC, etc.).
231 Avoid memset because some compilers generate function call code. */
232 struct mbhidden { char32_t ch; int utf8_want, euc_want; }
233 _GL_ATTRIBUTE_MAY_ALIAS;
234 union { mbstate_t m; struct mbhidden s; } u;
235 u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
236 # define mbs u.m
237 #elif defined __NetBSD__
238 /* Experiments on both 32- and 64-bit NetBSD platforms have
239 shown that it doesn't work to clear fewer than 24 bytes. */
240 struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
241 union { mbstate_t m; struct mbhidden s; } u;
242 u.s.a = u.s.b = u.s.c = 0;
243 # define mbs u.m
244 #else
245 /* mbstate_t has unknown structure or is not worth optimizing. */
246 mbstate_t mbs = {0};
247 #endif
249 char32_t ch;
250 size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
252 #undef mbs
254 /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
255 is not supported and MB_LEN_MAX is small. */
256 if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
257 return mcel_err (c);
259 /* A multi-byte character. LEN must be positive,
260 as *P != '\0' and shift sequences are not supported. */
261 return mcel_ch (ch, len);
264 /* Scan bytes from P, a byte sequence terminated by TERMINATOR.
265 If *P == TERMINATOR, scan just that byte; otherwise scan
266 bytes up to but not including TERMINATOR.
267 TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
268 Return the character or encoding error starting at P. */
269 MCEL_INLINE mcel_t
270 mcel_scant (char const *p, char terminator)
272 /* Handle ASCII quickly for speed. */
273 if (mcel_isbasic (*p))
274 return mcel_ch (*p, 1);
276 /* Defer to mcel_scan for non-ASCII. Compute length with code that
277 is typically faster than strnlen. */
278 char const *lim = p + 1;
279 for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
280 lim += *lim != terminator;
281 return mcel_scan (p, lim);
284 /* Scan bytes from P, a byte sequence terminated by '\0'.
285 If *P == '\0', scan just that byte; otherwise scan
286 bytes up to but not including '\0'.
287 Return the character or encoding error starting at P. */
288 MCEL_INLINE mcel_t
289 mcel_scanz (char const *p)
291 return mcel_scant (p, '\0');
294 _GL_INLINE_HEADER_END
296 #endif /* _MCEL_H */