lib/uninorm.in.h

   1 /* Normalization forms (composition and decomposition) of Unicode strings.
   2    Copyright (C) 2001-2002, 2009-2024 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2009.
   4
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 #ifndef _UNINORM_H
  19 #define _UNINORM_H
  20
  21 /* Get size_t.  */
  22 #include <stddef.h>
  23
  24 #include "unitypes.h"
  25
  26 #if @HAVE_UNISTRING_WOE32DLL_H@
  27 # include <unistring/woe32dll.h>
  28 #else
  29 # define LIBUNISTRING_DLL_VARIABLE
  30 #endif
  31
  32
  33 #ifdef __cplusplus
  34 extern "C" {
  35 #endif
  36
  37
  38 /* Conventions:
  39
  40    All functions prefixed with u8_ operate on UTF-8 encoded strings.
  41    Their unit is a uint8_t (1 byte).
  42
  43    All functions prefixed with u16_ operate on UTF-16 encoded strings.
  44    Their unit is a uint16_t (a 2-byte word).
  45
  46    All functions prefixed with u32_ operate on UCS-4 encoded strings.
  47    Their unit is a uint32_t (a 4-byte word).
  48
  49    All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
  50    n units.
  51
  52    Functions returning a string result take a (resultbuf, lengthp) argument
  53    pair.  If resultbuf is not NULL and the result fits into *lengthp units,
  54    it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
  55    allocated string is returned.  In both cases, *lengthp is set to the
  56    length (number of units) of the returned string.  In case of error,
  57    NULL is returned and errno is set.  */
  58
  59
  60 enum /* Kinds of character decomposition; each comment below gives the <tag> name, then its meaning.  */
  61 { /* NOTE(review): presumably the values uc_decomposition stores in *DECOMP_TAG -- confirm.  */
  62   UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
  63   UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
  64   UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
  65   UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
  66   UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
  67   UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
  68   UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
  69   UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
  70   UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
  71   UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
  72   UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
  73   UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
  74   UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
  75   UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
  76   UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
  77   UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
  78   UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
  79 };
  80
  81 /* Maximum length, in ucs4_t elements, of the decomposition of a single Unicode character.  */
  82 #define UC_DECOMPOSITION_MAX_LENGTH 32
  83
  84 /* Return the character decomposition mapping of the Unicode character UC.
  85    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  86    ucs4_t elements.
  87    When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
  88    filled and N is returned.  Otherwise -1 is returned.  */
  89 extern int
  90        uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
  91
  92 /* Return the canonical character decomposition mapping of the Unicode character UC.
  93    DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
  94    ucs4_t elements.
  95    When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
  96    returned.  Otherwise -1 is returned.  */
  97 extern int
  98        uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
  99
 100
 101 /* Attempt to combine the Unicode characters uc1, uc2.
 102    uc1 is known to have canonical combining class 0 (presumably a precondition the caller must ensure).
 103    Return the combination of uc1 and uc2, if it exists.
 104    Return 0 otherwise.
 105    Not all decompositions can be recombined using this function.  See the
 106    Unicode file CompositionExclusions.txt for details.  */
 107 extern ucs4_t
 108        uc_composition (ucs4_t uc1, ucs4_t uc2)
 109        _UC_ATTRIBUTE_CONST; /* Attribute macro; its definition is not in this file.  */
 110
 111
 112 /* An object of type uninorm_t denotes a Unicode normalization form.  */
 113 struct unicode_normalization_form; /* Forward declaration only; the members are not visible here.  */
 114 typedef const struct unicode_normalization_form *uninorm_t; /* Pointer to a const form object.  */
 115
 116 /* UNINORM_NFD: Normalization form D: canonical decomposition.  */
 117 extern @GNULIB_UNINORM_NFD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfd;
 118 #define UNINORM_NFD (&uninorm_nfd) /* Address of the object above, usable as a uninorm_t.  */
 119
 120 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
 121    canonical composition.  */
 122 extern @GNULIB_UNINORM_NFC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfc;
 123 #define UNINORM_NFC (&uninorm_nfc) /* Address of the object above, usable as a uninorm_t.  */
 124
 125 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
 126 extern @GNULIB_UNINORM_NFKD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkd;
 127 #define UNINORM_NFKD (&uninorm_nfkd) /* Address of the object above, usable as a uninorm_t.  */
 128
 129 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
 130    canonical composition.  */
 131 extern @GNULIB_UNINORM_NFKC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkc;
 132 #define UNINORM_NFKC (&uninorm_nfkc) /* Address of the object above, usable as a uninorm_t.  */
 133
 134 /* Test whether the normalization form NF does compatibility decomposition.  */
 135 #define uninorm_is_compat_decomposing(nf) \
 136   ((* (const unsigned int *) (nf) >> 0) & 1) /* Bit 0 of the unsigned int read through NF; yields 0 or 1.  */
 137
 138 /* Test whether the normalization form NF includes canonical composition.  */
 139 #define uninorm_is_composing(nf) \
 140   ((* (const unsigned int *) (nf) >> 1) & 1) /* Bit 1 of the unsigned int read through NF; yields 0 or 1.  */
 141
 142 /* Return the decomposing variant of the normalization form NF.
 143    This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
 144 extern uninorm_t
 145        uninorm_decomposing_form (uninorm_t nf)
 146        _UC_ATTRIBUTE_PURE; /* Attribute macro; its definition is not in this file.  */
 147
 148
 149 /* Return the normalization form NF of the string S[0..N-1]; (RESULTBUF, LENGTHP) follow the conventions above.  */
 150 extern uint8_t *
 151        u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
 152                      uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
 153 extern uint16_t *
 154        u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
 155                       uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
 156 extern uint32_t *
 157        u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
 158                       uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
 159
 160
 161 /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization.
 162    NF must be either UNINORM_NFD or UNINORM_NFKD.
 163    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 164    return 0.  Upon failure, return -1 with errno set.  */
 165 extern int
 166        u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 167                    uninorm_t nf, int *resultp);
 168 extern int
 169        u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 170                     uninorm_t nf, int *resultp);
 171 extern int
 172        u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 173                     uninorm_t nf, int *resultp);
 174
 175
 176 /* Convert the string S of length N to a NUL-terminated byte sequence, in such
 177    a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
 178    equivalent to comparing S1 and S2 with uN_normcoll().
 179    NF must be either UNINORM_NFC or UNINORM_NFKC.  (RESULTBUF, LENGTHP) follow the conventions above.  */
 180 extern char *
 181        u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
 182                     char *resultbuf, size_t *lengthp);
 183 extern char *
 184        u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
 185                      char *resultbuf, size_t *lengthp);
 186 extern char *
 187        u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
 188                      char *resultbuf, size_t *lengthp);
 189
 190
 191 /* Compare S1[0..N1-1] and S2[0..N2-1], ignoring differences in normalization, using the
 192    collation rules of the current locale.
 193    NF must be either UNINORM_NFC or UNINORM_NFKC.
 194    If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
 195    return 0.  Upon failure, return -1 with errno set.  */
 196 extern int
 197        u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
 198                     uninorm_t nf, int *resultp);
 199 extern int
 200        u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
 201                      uninorm_t nf, int *resultp);
 202 extern int
 203        u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
 204                      uninorm_t nf, int *resultp);
 205
 206
 207 /* Normalization of a stream of Unicode characters.
 208
 209    A "stream of Unicode characters" is essentially a function that accepts a
 210    ucs4_t argument repeatedly, optionally combined with a function that
 211    "flushes" the stream.  */
 212
 213 /* Data type of a stream of Unicode characters that normalizes its input
 214    according to a given normalization form and passes the normalized character
 215    sequence to the encapsulated stream of Unicode characters.  */
 216 struct uninorm_filter; /* Forward declaration only; the members are not visible here.  */
 217
 218 /* Bring data buffered in FILTER to its destination, the encapsulated
 219    stream, then close and free the filter.
 220    Return 0 if successful, or -1 with errno set upon failure.  */
 221 extern int /* Declared ahead of uninorm_filter_create, whose _GL_ATTRIBUTE_DEALLOC annotation names this function.  */
 222        uninorm_filter_free (struct uninorm_filter *filter);
 223
 224 /* Create and return a normalization filter for Unicode characters, using the normalization form NF.
 225    The pair (stream_func, stream_data) is the encapsulated stream.
 226    stream_func (stream_data, uc) receives the Unicode character uc
 227    and returns 0 if successful, or -1 with errno set upon failure.
 228    Return the new filter, or NULL with errno set upon failure.  */
 229 extern struct uninorm_filter *
 230        uninorm_filter_create (uninorm_t nf,
 231                               int (*stream_func) (void *stream_data, ucs4_t uc),
 232                               void *stream_data)
 233        _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1); /* NOTE(review): presumably marks uninorm_filter_free as the matching deallocator -- confirm.  */
 234
 235 /* Stuff the Unicode character UC into the normalizing filter FILTER.
 236    Return 0 if successful, or -1 with errno set upon failure.  */
 237 extern int
 238        uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
 239
 240 /* Bring data buffered in FILTER to its destination, the encapsulated
 241    stream.  Unlike uninorm_filter_free, this is not documented to close or free the filter.
 242    Return 0 if successful, or -1 with errno set upon failure.
 243    Note! If after calling this function, additional characters are written
 244    into the filter, the resulting character sequence in the encapsulated stream
 245    will not necessarily be normalized.  */
 246 extern int
 247        uninorm_filter_flush (struct uninorm_filter *filter);
 248
 249
 250 #ifdef __cplusplus
 251 }
 252 #endif
 253
 254
 255 #endif /* _UNINORM_H */