lib/mcel.h

   1 /* Multi-byte characters, Error encodings, and Lengths (MCELs)
   2    Copyright 2023-2024 Free Software Foundation, Inc.
   3
   4    This file is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU Lesser General Public License as
   6    published by the Free Software Foundation; either version 2.1 of the
   7    License, or (at your option) any later version.
   8
   9    This file is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Paul Eggert.  */
  18
  19 /* The macros in this file implement multi-byte character representation
  20    and forward iteration through a multi-byte string.
  21    They are simpler and can be faster than the mbiter family.
  22    However, they do not support obsolescent encodings like CP864,
  23    EBCDIC, Johab, and Shift JIS that glibc also does not support,
  24    and it is up to the caller to coalesce encoding-error bytes if desired.
  25
  26    The mcel_scan function lets code iterate through an array of bytes,
  27    supporting character encodings in practical use
  28    more simply than using plain mbrtoc32.
  29
  30    Instead of this single-byte code:
  31
  32       char *p = ..., *lim = ...;
  33       for (; p < lim; p++)
  34         process (*p);
  35
  36    You can use this multi-byte code:
  37
  38       char *p = ..., *lim = ...;
  39       for (mcel_t g; p < lim; p += g.len)
  40         {
  41           g = mcel_scan (p, lim);
  42           process (g);
  43         }
  44
  45    You can select from G using G.ch, G.err, and G.len.
  46    G is an encoding error if G.err is nonzero, a character otherwise.
  47
  48    The mcel_scanz function is similar except it works with a
  49    string of unknown but positive length that is terminated with '\0'.
  50    Instead of this single-byte code:
  51
  52       char *p = ...;
  53       for (; *p; p++)
  54         process (*p);
  55
  56    You can use this multi-byte code:
  57
  58       char *p = ...;
  59       for (mcel_t g; *p; p += g.len)
  60         {
  61           g = mcel_scanz (p);
  62           process (g);
  63         }
  64
  65    mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
  66    string is terminated by TERMINATOR.  The C standard says that the
  67    TERMINATORs '\0', '\r', '\n', '.', '/' are safe, as they cannot be
  68    a part (even a trailing byte) of a multi-byte character.
  69    In practice TERMINATOR is safe if 0 <= TERMINATOR <= 0x2f (ASCII '/').
  70
  71    mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.
  72
  73    mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
  74    character or by encoding byte value, with encoding bytes sorting
  75    after characters.
  76
  77    Calls like c32isalpha (G.ch) test G; they return false for encoding
  78    errors since calls like c32isalpha (0) return false.  Calls like
  79    mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
  80    but transliterate first.
  81
  82    Although ISO C and POSIX allow encodings that have shift states or
  83    that can produce multiple characters from an indivisible byte sequence,
  84    POSIX does not require support for these encodings,
  85    they are not in practical use on GNUish platforms,
  86    and omitting support for them simplifies the API.  */
  87
  88 #ifndef _MCEL_H
  89 #define _MCEL_H 1
  90
  91 #if !_GL_CONFIG_H_INCLUDED
  92  #error "Please include config.h first."
  93 #endif
  94
  95 #include <verify.h>
  96
  97 #include <limits.h>
  98 #include <stddef.h>
  99 #include <uchar.h>
 100
 101 /* Pacify GCC re type limits.  */
 102 #if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
 103 # pragma GCC diagnostic ignored "-Wtype-limits"
 104 #endif
 105
 106 /* The maximum multi-byte character length supported on any platform.
 107    This can be less than MB_LEN_MAX because many platforms have a
 108    large MB_LEN_MAX to allow for stateful encodings, and mcel does not
 109    support these encodings.  MCEL_LEN_MAX is enough for UTF-8, EUC,
 110    Shift-JIS, GB18030, etc.  In all multi-byte encodings supported by glibc,
 111    0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
 112 enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
 113
 114 /* Bounds for mcel_t members.  */
 115 enum { MCEL_CHAR_MAX = 0x10FFFF };
 116 enum { MCEL_ERR_MIN = 0x80 };
 117
 118 /* mcel_t is a type representing a character CH or an encoding error byte ERR,
 119    along with a count of the LEN bytes that represent CH or ERR.
 120    If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
 121    otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR,
 122    CH == 0, and LEN == 1.  */
 123 typedef struct
 124 {
 125   char32_t ch;
 126   unsigned char err;
 127   unsigned char len;
 128 } mcel_t;
 129
 130 /* Every multi-byte character length fits in mcel_t's LEN.  */
 131 static_assert (MB_LEN_MAX <= UCHAR_MAX);
 132
 133 /* Shifting an encoding error byte left by this value
 134    suffices to sort encoding errors after characters.  */
 135 enum { MCEL_ERR_SHIFT = 14 };
 136 static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
 137
 138 /* Unsigned char promotes to int.  */
 139 static_assert (UCHAR_MAX <= INT_MAX);
 140
 141 /* Bytes have 8 bits, as POSIX requires.  */
 142 static_assert (CHAR_BIT == 8);
 143
 144 #ifndef _GL_LIKELY
 145 /* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
 146 # define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
 147 # define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
 148 #endif
 149
 150 _GL_INLINE_HEADER_BEGIN
 151 #ifndef MCEL_INLINE
 152 # define MCEL_INLINE _GL_INLINE
 153 #endif
 154
 155 /* mcel_t constructors.  */
 156 MCEL_INLINE mcel_t
 157 mcel_ch (char32_t ch, size_t len)
 158 {
 159   assume (0 < len);
 160   assume (len <= MCEL_LEN_MAX);
 161   assume (ch <= MCEL_CHAR_MAX);
 162   return (mcel_t) {.ch = ch, .len = len};
 163 }
 164 MCEL_INLINE mcel_t
 165 mcel_err (unsigned char err)
 166 {
 167   assume (MCEL_ERR_MIN <= err);
 168   return (mcel_t) {.err = err, .len = 1};
 169 }
 170
 171 /* Compare C1 and C2, with encoding errors sorting after characters.
 172    Return <0, 0, >0 for <, =, >.  */
 173 MCEL_INLINE int
 174 mcel_cmp (mcel_t c1, mcel_t c2)
 175 {
 176   int ch1 = c1.ch, ch2 = c2.ch;
 177   return ((c1.err - c2.err) * (1 << MCEL_ERR_SHIFT)) + (ch1 - ch2);
 178 }
 179
 180 /* Apply the uchar translator TO to C1 and C2 and compare the results,
 181    with encoding errors sorting after characters,
 182    Return <0, 0, >0 for <, =, >.  */
 183 MCEL_INLINE int
 184 mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
 185 {
 186   int cmp = mcel_cmp (c1, c2);
 187   if (_GL_LIKELY ((c1.err - c2.err) | !cmp))
 188     return cmp;
 189   int ch1 = to (c1.ch), ch2 = to (c2.ch);
 190   return ch1 - ch2;
 191 }
 192
 193 /* Whether C represents itself as a Unicode character
 194    when it is the first byte of a single- or multi-byte character.
 195    These days it is safe to assume ASCII, so do not support
 196    obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS.  */
 197 MCEL_INLINE bool
 198 mcel_isbasic (char c)
 199 {
 200   return _GL_LIKELY (0 <= c && c < MCEL_ERR_MIN);
 201 }
 202
 203 /* With mcel there should be no need for the performance overhead of
 204    replacing glibc mbrtoc32, as callers shouldn't care whether the
 205    C locale treats a byte with the high bit set as an encoding error.  */
 206 #ifdef __GLIBC__
 207 # undef mbrtoc32
 208 #endif
 209
 210 /* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
 211    Return the character or encoding error starting at P.  */
 212 MCEL_INLINE mcel_t
 213 mcel_scan (char const *p, char const *lim)
 214 {
 215   /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
 216      In supported encodings, the first byte of a multi-byte character
 217      cannot be an ASCII byte.  */
 218   char c = *p;
 219   if (mcel_isbasic (c))
 220     return mcel_ch (c, 1);
 221
 222   /* An initial mbstate_t; initialization optimized for some platforms.
 223      For details about these and other platforms, see wchar.in.h.  */
 224 #if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
 225   /* Although only a trivial optimization, it's worth it for GNU.  */
 226   mbstate_t mbs; mbs.__count = 0;
 227 #elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
 228        || (defined __APPLE__ && defined __MACH__))
 229   /* These platforms have 128-byte mbstate_t.  What were they thinking?
 230      Initialize just for supported encodings (UTF-8, EUC, etc.).
 231      Avoid memset because some compilers generate function call code.  */
 232   struct mbhidden { char32_t ch; int utf8_want, euc_want; }
 233     _GL_ATTRIBUTE_MAY_ALIAS;
 234   union { mbstate_t m; struct mbhidden s; } u;
 235   u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
 236 # define mbs u.m
 237 #elif defined __NetBSD__
 238   /* Experiments on both 32- and 64-bit NetBSD platforms have
 239      shown that it doesn't work to clear fewer than 24 bytes.  */
 240   struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
 241   union { mbstate_t m; struct mbhidden s; } u;
 242   u.s.a = u.s.b = u.s.c = 0;
 243 # define mbs u.m
 244 #else
 245   /* mbstate_t has unknown structure or is not worth optimizing.  */
 246   mbstate_t mbs = {0};
 247 #endif
 248
 249   char32_t ch;
 250   size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
 251
 252 #undef mbs
 253
 254   /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
 255      is not supported and MB_LEN_MAX is small.  */
 256   if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
 257     return mcel_err (c);
 258
 259   /* A multi-byte character.  LEN must be positive,
 260      as *P != '\0' and shift sequences are not supported.  */
 261   return mcel_ch (ch, len);
 262 }
 263
 264 /* Scan bytes from P, a byte sequence terminated by TERMINATOR.
 265    If *P == TERMINATOR, scan just that byte; otherwise scan
 266    bytes up to but not including TERMINATOR.
 267    TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
 268    Return the character or encoding error starting at P.  */
 269 MCEL_INLINE mcel_t
 270 mcel_scant (char const *p, char terminator)
 271 {
 272   /* Handle ASCII quickly for speed.  */
 273   if (mcel_isbasic (*p))
 274     return mcel_ch (*p, 1);
 275
 276   /* Defer to mcel_scan for non-ASCII.  Compute length with code that
 277      is typically faster than strnlen.  */
 278   char const *lim = p + 1;
 279   for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
 280     lim += *lim != terminator;
 281   return mcel_scan (p, lim);
 282 }
 283
 284 /* Scan bytes from P, a byte sequence terminated by '\0'.
 285    If *P == '\0', scan just that byte; otherwise scan
 286    bytes up to but not including '\0'.
 287    Return the character or encoding error starting at P.  */
 288 MCEL_INLINE mcel_t
 289 mcel_scanz (char const *p)
 290 {
 291   return mcel_scant (p, '\0');
 292 }
 293
 294 _GL_INLINE_HEADER_END
 295
 296 #endif /* _MCEL_H */