doc PG 17 relnotes: add FETCH_COUNT item
[pgsql.git] / src / common / unicode_case.c
blobbc423b0890c4d7d6a3a9954878139a3da6d2ca87
/*-------------------------------------------------------------------------
 * unicode_case.c
 *	  Unicode case mapping and case conversion.
 *
 * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */
12 #ifndef FRONTEND
13 #include "postgres.h"
14 #else
15 #include "postgres_fe.h"
16 #endif
18 #include "common/unicode_case.h"
19 #include "common/unicode_case_table.h"
20 #include "common/unicode_category.h"
21 #include "mb/pg_wchar.h"
23 static const pg_case_map *find_case_map(pg_wchar ucs);
24 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25 CaseKind str_casekind, WordBoundaryNext wbnext,
26 void *wbstate);
28 pg_wchar
29 unicode_lowercase_simple(pg_wchar code)
31 const pg_case_map *map = find_case_map(code);
33 return map ? map->simplemap[CaseLower] : code;
36 pg_wchar
37 unicode_titlecase_simple(pg_wchar code)
39 const pg_case_map *map = find_case_map(code);
41 return map ? map->simplemap[CaseTitle] : code;
44 pg_wchar
45 unicode_uppercase_simple(pg_wchar code)
47 const pg_case_map *map = find_case_map(code);
49 return map ? map->simplemap[CaseUpper] : code;
53 * unicode_strlower()
55 * Convert src to lowercase, and return the result length (not including
56 * terminating NUL).
58 * String src must be encoded in UTF-8. If srclen < 0, src must be
59 * NUL-terminated.
61 * Result string is stored in dst, truncating if larger than dstsize. If
62 * dstsize is greater than the result length, dst will be NUL-terminated;
63 * otherwise not.
65 * If dstsize is zero, dst may be NULL. This is useful for calculating the
66 * required buffer size before allocating.
68 size_t
69 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
71 return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
75 * unicode_strtitle()
77 * Convert src to titlecase, and return the result length (not including
78 * terminating NUL).
80 * String src must be encoded in UTF-8. If srclen < 0, src must be
81 * NUL-terminated.
83 * Result string is stored in dst, truncating if larger than dstsize. If
84 * dstsize is greater than the result length, dst will be NUL-terminated;
85 * otherwise not.
87 * If dstsize is zero, dst may be NULL. This is useful for calculating the
88 * required buffer size before allocating.
90 * Titlecasing requires knowledge about word boundaries, which is provided by
91 * the callback wbnext. A word boundary is the offset of the start of a word
92 * or the offset of the character immediately following a word.
94 * The caller is expected to initialize and free the callback state
95 * wbstate. The callback should first return offset 0 for the first boundary;
96 * then the offset of each subsequent word boundary; then the total length of
97 * the string to indicate the final boundary.
99 size_t
100 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
101 WordBoundaryNext wbnext, void *wbstate)
103 return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
104 wbstate);
108 * unicode_strupper()
110 * Convert src to uppercase, and return the result length (not including
111 * terminating NUL).
113 * String src must be encoded in UTF-8. If srclen < 0, src must be
114 * NUL-terminated.
116 * Result string is stored in dst, truncating if larger than dstsize. If
117 * dstsize is greater than the result length, dst will be NUL-terminated;
118 * otherwise not.
120 * If dstsize is zero, dst may be NULL. This is useful for calculating the
121 * required buffer size before allocating.
123 size_t
124 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
126 return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
130 * If str_casekind is CaseLower or CaseUpper, map each character in the string
131 * for which a mapping is available.
133 * If str_casekind is CaseTitle, maps characters found on a word boundary to
134 * uppercase and other characters to lowercase.
136 static size_t
137 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
138 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
140 /* character CaseKind varies while titlecasing */
141 CaseKind chr_casekind = str_casekind;
142 size_t srcoff = 0;
143 size_t result_len = 0;
144 size_t boundary = 0;
146 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
147 (str_casekind != CaseTitle && !wbnext && !wbstate));
149 if (str_casekind == CaseTitle)
151 boundary = wbnext(wbstate);
152 Assert(boundary == 0); /* start of text is always a boundary */
155 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
157 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
158 int u1len = unicode_utf8len(u1);
159 const pg_case_map *casemap = find_case_map(u1);
161 if (str_casekind == CaseTitle)
163 if (srcoff == boundary)
165 chr_casekind = CaseUpper;
166 boundary = wbnext(wbstate);
168 else
169 chr_casekind = CaseLower;
172 /* perform mapping, update result_len, and write to dst */
173 if (casemap)
175 pg_wchar u2 = casemap->simplemap[chr_casekind];
176 pg_wchar u2len = unicode_utf8len(u2);
178 if (result_len + u2len <= dstsize)
179 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
181 result_len += u2len;
183 else
185 /* no mapping; copy bytes from src */
186 if (result_len + u1len <= dstsize)
187 memcpy(dst + result_len, src + srcoff, u1len);
189 result_len += u1len;
192 srcoff += u1len;
195 if (result_len < dstsize)
196 dst[result_len] = '\0';
198 return result_len;
201 /* find entry in simple case map, if any */
202 static const pg_case_map *
203 find_case_map(pg_wchar ucs)
205 int min;
206 int mid;
207 int max;
209 /* all chars <= 0x80 are stored in array for fast lookup */
210 Assert(lengthof(case_map) >= 0x80);
211 if (ucs < 0x80)
213 const pg_case_map *map = &case_map[ucs];
215 Assert(map->codepoint == ucs);
216 return map;
219 /* otherwise, binary search */
220 min = 0x80;
221 max = lengthof(case_map) - 1;
222 while (max >= min)
224 mid = (min + max) / 2;
225 if (ucs > case_map[mid].codepoint)
226 min = mid + 1;
227 else if (ucs < case_map[mid].codepoint)
228 max = mid - 1;
229 else
230 return &case_map[mid];
233 return NULL;