/*-------------------------------------------------------------------------
 *
 * Unicode case mapping and case conversion.
 *
 * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
 *
 * src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */
15 #include "postgres_fe.h"
18 #include "common/unicode_case.h"
19 #include "common/unicode_case_table.h"
20 #include "common/unicode_category.h"
21 #include "mb/pg_wchar.h"
23 static const pg_case_map
*find_case_map(pg_wchar ucs
);
24 static size_t convert_case(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
25 CaseKind str_casekind
, WordBoundaryNext wbnext
,
29 unicode_lowercase_simple(pg_wchar code
)
31 const pg_case_map
*map
= find_case_map(code
);
33 return map
? map
->simplemap
[CaseLower
] : code
;
37 unicode_titlecase_simple(pg_wchar code
)
39 const pg_case_map
*map
= find_case_map(code
);
41 return map
? map
->simplemap
[CaseTitle
] : code
;
45 unicode_uppercase_simple(pg_wchar code
)
47 const pg_case_map
*map
= find_case_map(code
);
49 return map
? map
->simplemap
[CaseUpper
] : code
;
55 * Convert src to lowercase, and return the result length (not including
58 * String src must be encoded in UTF-8. If srclen < 0, src must be
61 * Result string is stored in dst, truncating if larger than dstsize. If
62 * dstsize is greater than the result length, dst will be NUL-terminated;
65 * If dstsize is zero, dst may be NULL. This is useful for calculating the
66 * required buffer size before allocating.
69 unicode_strlower(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
)
71 return convert_case(dst
, dstsize
, src
, srclen
, CaseLower
, NULL
, NULL
);
77 * Convert src to titlecase, and return the result length (not including
80 * String src must be encoded in UTF-8. If srclen < 0, src must be
83 * Result string is stored in dst, truncating if larger than dstsize. If
84 * dstsize is greater than the result length, dst will be NUL-terminated;
87 * If dstsize is zero, dst may be NULL. This is useful for calculating the
88 * required buffer size before allocating.
90 * Titlecasing requires knowledge about word boundaries, which is provided by
91 * the callback wbnext. A word boundary is the offset of the start of a word
92 * or the offset of the character immediately following a word.
94 * The caller is expected to initialize and free the callback state
95 * wbstate. The callback should first return offset 0 for the first boundary;
96 * then the offset of each subsequent word boundary; then the total length of
97 * the string to indicate the final boundary.
100 unicode_strtitle(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
101 WordBoundaryNext wbnext
, void *wbstate
)
103 return convert_case(dst
, dstsize
, src
, srclen
, CaseTitle
, wbnext
,
110 * Convert src to uppercase, and return the result length (not including
113 * String src must be encoded in UTF-8. If srclen < 0, src must be
116 * Result string is stored in dst, truncating if larger than dstsize. If
117 * dstsize is greater than the result length, dst will be NUL-terminated;
120 * If dstsize is zero, dst may be NULL. This is useful for calculating the
121 * required buffer size before allocating.
124 unicode_strupper(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
)
126 return convert_case(dst
, dstsize
, src
, srclen
, CaseUpper
, NULL
, NULL
);
130 * If str_casekind is CaseLower or CaseUpper, map each character in the string
131 * for which a mapping is available.
133 * If str_casekind is CaseTitle, maps characters found on a word boundary to
134 * uppercase and other characters to lowercase.
137 convert_case(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
138 CaseKind str_casekind
, WordBoundaryNext wbnext
, void *wbstate
)
140 /* character CaseKind varies while titlecasing */
141 CaseKind chr_casekind
= str_casekind
;
143 size_t result_len
= 0;
146 Assert((str_casekind
== CaseTitle
&& wbnext
&& wbstate
) ||
147 (str_casekind
!= CaseTitle
&& !wbnext
&& !wbstate
));
149 if (str_casekind
== CaseTitle
)
151 boundary
= wbnext(wbstate
);
152 Assert(boundary
== 0); /* start of text is always a boundary */
155 while ((srclen
< 0 || srcoff
< srclen
) && src
[srcoff
] != '\0')
157 pg_wchar u1
= utf8_to_unicode((unsigned char *) src
+ srcoff
);
158 int u1len
= unicode_utf8len(u1
);
159 const pg_case_map
*casemap
= find_case_map(u1
);
161 if (str_casekind
== CaseTitle
)
163 if (srcoff
== boundary
)
165 chr_casekind
= CaseUpper
;
166 boundary
= wbnext(wbstate
);
169 chr_casekind
= CaseLower
;
172 /* perform mapping, update result_len, and write to dst */
175 pg_wchar u2
= casemap
->simplemap
[chr_casekind
];
176 pg_wchar u2len
= unicode_utf8len(u2
);
178 if (result_len
+ u2len
<= dstsize
)
179 unicode_to_utf8(u2
, (unsigned char *) dst
+ result_len
);
185 /* no mapping; copy bytes from src */
186 if (result_len
+ u1len
<= dstsize
)
187 memcpy(dst
+ result_len
, src
+ srcoff
, u1len
);
195 if (result_len
< dstsize
)
196 dst
[result_len
] = '\0';
201 /* find entry in simple case map, if any */
202 static const pg_case_map
*
203 find_case_map(pg_wchar ucs
)
209 /* all chars <= 0x80 are stored in array for fast lookup */
210 Assert(lengthof(case_map
) >= 0x80);
213 const pg_case_map
*map
= &case_map
[ucs
];
215 Assert(map
->codepoint
== ucs
);
219 /* otherwise, binary search */
221 max
= lengthof(case_map
) - 1;
224 mid
= (min
+ max
) / 2;
225 if (ucs
> case_map
[mid
].codepoint
)
227 else if (ucs
< case_map
[mid
].codepoint
)
230 return &case_map
[mid
];