src/common/unicode_case.c

   1 /*-------------------------------------------------------------------------
   2  * unicode_case.c
   3  *              Unicode case mapping and case conversion.
   4  *
   5  * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
   6  *
   7  * IDENTIFICATION
   8  *        src/common/unicode_case.c
   9  *
  10  *-------------------------------------------------------------------------
  11  */
  12 #ifndef FRONTEND
  13 #include "postgres.h"
  14 #else
  15 #include "postgres_fe.h"
  16 #endif
  17
  18 #include "common/unicode_case.h"
  19 #include "common/unicode_case_table.h"
  20 #include "common/unicode_category.h"
  21 #include "mb/pg_wchar.h"
  22
/* Look up the case_map entry for a code point; returns NULL if there is none. */
static const pg_case_map *find_case_map(pg_wchar ucs);

/*
 * Common implementation behind unicode_strlower(), unicode_strtitle() and
 * unicode_strupper(); wbnext/wbstate are only used when titlecasing.
 */
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                                                   CaseKind str_casekind, WordBoundaryNext wbnext,
                                                   void *wbstate);
  27
  28 pg_wchar
  29 unicode_lowercase_simple(pg_wchar code)
  30 {
  31         const           pg_case_map *map = find_case_map(code);
  32
  33         return map ? map->simplemap[CaseLower] : code;
  34 }
  35
  36 pg_wchar
  37 unicode_titlecase_simple(pg_wchar code)
  38 {
  39         const           pg_case_map *map = find_case_map(code);
  40
  41         return map ? map->simplemap[CaseTitle] : code;
  42 }
  43
  44 pg_wchar
  45 unicode_uppercase_simple(pg_wchar code)
  46 {
  47         const           pg_case_map *map = find_case_map(code);
  48
  49         return map ? map->simplemap[CaseUpper] : code;
  50 }
  51
  52 /*
  53  * unicode_strlower()
  54  *
  55  * Convert src to lowercase, and return the result length (not including
  56  * terminating NUL).
  57  *
  58  * String src must be encoded in UTF-8. If srclen < 0, src must be
  59  * NUL-terminated.
  60  *
  61  * Result string is stored in dst, truncating if larger than dstsize. If
  62  * dstsize is greater than the result length, dst will be NUL-terminated;
  63  * otherwise not.
  64  *
  65  * If dstsize is zero, dst may be NULL. This is useful for calculating the
  66  * required buffer size before allocating.
  67  */
  68 size_t
  69 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
  70 {
  71         return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
  72 }
  73
  74 /*
  75  * unicode_strtitle()
  76  *
  77  * Convert src to titlecase, and return the result length (not including
  78  * terminating NUL).
  79  *
  80  * String src must be encoded in UTF-8. If srclen < 0, src must be
  81  * NUL-terminated.
  82  *
  83  * Result string is stored in dst, truncating if larger than dstsize. If
  84  * dstsize is greater than the result length, dst will be NUL-terminated;
  85  * otherwise not.
  86  *
  87  * If dstsize is zero, dst may be NULL. This is useful for calculating the
  88  * required buffer size before allocating.
  89  *
  90  * Titlecasing requires knowledge about word boundaries, which is provided by
  91  * the callback wbnext. A word boundary is the offset of the start of a word
  92  * or the offset of the character immediately following a word.
  93  *
  94  * The caller is expected to initialize and free the callback state
  95  * wbstate. The callback should first return offset 0 for the first boundary;
  96  * then the offset of each subsequent word boundary; then the total length of
  97  * the string to indicate the final boundary.
  98  */
  99 size_t
 100 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 101                                  WordBoundaryNext wbnext, void *wbstate)
 102 {
 103         return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
 104                                                 wbstate);
 105 }
 106
 107 /*
 108  * unicode_strupper()
 109  *
 110  * Convert src to uppercase, and return the result length (not including
 111  * terminating NUL).
 112  *
 113  * String src must be encoded in UTF-8. If srclen < 0, src must be
 114  * NUL-terminated.
 115  *
 116  * Result string is stored in dst, truncating if larger than dstsize. If
 117  * dstsize is greater than the result length, dst will be NUL-terminated;
 118  * otherwise not.
 119  *
 120  * If dstsize is zero, dst may be NULL. This is useful for calculating the
 121  * required buffer size before allocating.
 122  */
 123 size_t
 124 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 125 {
 126         return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 127 }
 128
 129 /*
 130  * If str_casekind is CaseLower or CaseUpper, map each character in the string
 131  * for which a mapping is available.
 132  *
 133  * If str_casekind is CaseTitle, maps characters found on a word boundary to
 134  * uppercase and other characters to lowercase.
 135  */
 136 static size_t
 137 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 138                          CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 139 {
 140         /* character CaseKind varies while titlecasing */
 141         CaseKind        chr_casekind = str_casekind;
 142         size_t          srcoff = 0;
 143         size_t          result_len = 0;
 144         size_t          boundary = 0;
 145
 146         Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 147                    (str_casekind != CaseTitle && !wbnext && !wbstate));
 148
 149         if (str_casekind == CaseTitle)
 150         {
 151                 boundary = wbnext(wbstate);
 152                 Assert(boundary == 0);  /* start of text is always a boundary */
 153         }
 154
 155         while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 156         {
 157                 pg_wchar        u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 158                 int                     u1len = unicode_utf8len(u1);
 159                 const           pg_case_map *casemap = find_case_map(u1);
 160
 161                 if (str_casekind == CaseTitle)
 162                 {
 163                         if (srcoff == boundary)
 164                         {
 165                                 chr_casekind = CaseUpper;
 166                                 boundary = wbnext(wbstate);
 167                         }
 168                         else
 169                                 chr_casekind = CaseLower;
 170                 }
 171
 172                 /* perform mapping, update result_len, and write to dst */
 173                 if (casemap)
 174                 {
 175                         pg_wchar        u2 = casemap->simplemap[chr_casekind];
 176                         pg_wchar        u2len = unicode_utf8len(u2);
 177
 178                         if (result_len + u2len <= dstsize)
 179                                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
 180
 181                         result_len += u2len;
 182                 }
 183                 else
 184                 {
 185                         /* no mapping; copy bytes from src */
 186                         if (result_len + u1len <= dstsize)
 187                                 memcpy(dst + result_len, src + srcoff, u1len);
 188
 189                         result_len += u1len;
 190                 }
 191
 192                 srcoff += u1len;
 193         }
 194
 195         if (result_len < dstsize)
 196                 dst[result_len] = '\0';
 197
 198         return result_len;
 199 }
 200
 201 /* find entry in simple case map, if any */
 202 static const pg_case_map *
 203 find_case_map(pg_wchar ucs)
 204 {
 205         int                     min;
 206         int                     mid;
 207         int                     max;
 208
 209         /* all chars <= 0x80 are stored in array for fast lookup */
 210         Assert(lengthof(case_map) >= 0x80);
 211         if (ucs < 0x80)
 212         {
 213                 const           pg_case_map *map = &case_map[ucs];
 214
 215                 Assert(map->codepoint == ucs);
 216                 return map;
 217         }
 218
 219         /* otherwise, binary search */
 220         min = 0x80;
 221         max = lengthof(case_map) - 1;
 222         while (max >= min)
 223         {
 224                 mid = (min + max) / 2;
 225                 if (ucs > case_map[mid].codepoint)
 226                         min = mid + 1;
 227                 else if (ucs < case_map[mid].codepoint)
 228                         max = mid - 1;
 229                 else
 230                         return &case_map[mid];
 231         }
 232
 233         return NULL;
 234 }