1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #ifndef intl_components_String_h_
6 #define intl_components_String_h_
8 #include "mozilla/Assertions.h"
9 #include "mozilla/Casting.h"
10 #include "mozilla/intl/ICU4CGlue.h"
11 #include "mozilla/intl/ICUError.h"
12 #include "mozilla/PodOperations.h"
13 #include "mozilla/Result.h"
14 #include "mozilla/Span.h"
16 #include "unicode/uchar.h"
17 #include "unicode/unorm2.h"
18 #include "unicode/ustring.h"
19 #include "unicode/utext.h"
20 #include "unicode/utypes.h"
22 namespace mozilla::intl
{
25 * This component is a Mozilla-focused API for working with strings in
26 * internationalization code.
33 * Return the locale-sensitive lower case string of the input.
// ToLocaleLowerCase: lower-cases aString through ICU's u_strToLower and
// reports the outcome as Result<Ok, ICUError>.
//
// Visible steps:
//   1. Reserve aString.size() units in aBuffer; a failed reservation yields
//      Err(ICUError::OutOfMemory).
//   2. Pass aBuffer to FillBufferWithICUCall together with a lambda that
//      forwards the (target, length) pair it is given, plus aString's data
//      and size, to u_strToLower.
//
// NOTE(review): lines are missing from this view -- the declaration of
// aBuffer, the trailing u_strToLower arguments and the closing braces are
// not shown. Presumably aLocale and status are forwarded to u_strToLower;
// confirm against the intact header.
36 static Result
<Ok
, ICUError
> ToLocaleLowerCase(const char* aLocale
,
37 Span
<const char16_t
> aString
,
// Make room for an output as long as the input before calling into ICU.
39 if (!aBuffer
.reserve(aString
.size())) {
40 return Err(ICUError::OutOfMemory
);
// The lambda captures by reference ([&]) and is handed to
// FillBufferWithICUCall along with the buffer.
42 return FillBufferWithICUCall(
43 aBuffer
, [&](UChar
* target
, int32_t length
, UErrorCode
* status
) {
44 return u_strToLower(target
, length
, aString
.data(), aString
.size(),
50 * Return the locale-sensitive upper case string of the input.
// ToLocaleUpperCase: upper-cases aString through ICU's u_strToUpper and
// reports the outcome as Result<Ok, ICUError>.
//
// Visible steps:
//   1. Reserve aString.size() units in aBuffer; a failed reservation yields
//      Err(ICUError::OutOfMemory).
//   2. Pass aBuffer to FillBufferWithICUCall together with a lambda that
//      forwards the (target, length) pair it is given, plus aString's data
//      and size, to u_strToUpper.
//
// NOTE(review): lines are missing from this view -- the declaration of
// aBuffer, the trailing u_strToUpper arguments and the closing braces are
// not shown. Presumably aLocale and status are forwarded to u_strToUpper;
// confirm against the intact header.
53 static Result
<Ok
, ICUError
> ToLocaleUpperCase(const char* aLocale
,
54 Span
<const char16_t
> aString
,
// Make room for an output as long as the input before calling into ICU.
56 if (!aBuffer
.reserve(aString
.size())) {
57 return Err(ICUError::OutOfMemory
);
// The lambda captures by reference ([&]) and is handed to
// FillBufferWithICUCall along with the buffer.
59 return FillBufferWithICUCall(
60 aBuffer
, [&](UChar
* target
, int32_t length
, UErrorCode
* status
) {
61 return u_strToUpper(target
, length
, aString
.data(), aString
.size(),
67 * Normalization form constants to describe which normalization algorithm
68 * should be performed.
71 * - Unicode Standard, §2.12 Equivalent Sequences
72 * - Unicode Standard, §3.11 Normalization Forms
73 * - https://unicode.org/reports/tr15/
// NormalizationForm: scoped enum selecting which normalization algorithm
// Normalize() applies.
//
// NOTE(review): only the per-enumerator descriptions (Form C, D, KC, KD) are
// visible here; the enumerator names themselves and the closing brace are not
// shown. The case labels used in Normalize() are NFC, NFD, NFKC and NFKD.
75 enum class NormalizationForm
{
77 * Normalization Form C
82 * Normalization Form D
87 * Normalization Form KC
92 * Normalization Form KD
// AlreadyNormalized: bool-backed scoped enum carried in the success value of
// Normalize(). Normalize() returns Yes when the quick-check span covers the
// whole input (nothing is written to the output buffer on that path) and No
// after it has produced the normalized text in the output buffer.
97 enum class AlreadyNormalized
: bool { No
, Yes
};
100 * Normalize the input string according to requested normalization form.
102 * Returns `AlreadyNormalized::Yes` when the string is already in normalized
103 * form. The output buffer is unchanged in this case. Otherwise returns
104 * `AlreadyNormalized::No` and places the normalized string into the output buffer.
// Normalize: brings aString into the normalization form selected by aForm.
//
// Template parameter B is the output buffer type; this function calls
// reserve(), data() and written() on it and passes it to
// FillBufferWithICUCall.
//
// Returns Result<AlreadyNormalized, ICUError>:
//   - AlreadyNormalized::Yes when the span reported by
//     unorm2_spanQuickCheckYes covers the whole input. aBuffer is not touched
//     on this path.
//   - AlreadyNormalized::No once the normalized text has been produced in
//     aBuffer.
//   - Err(ToICUError(status)) when fetching the normalizer or the quick check
//     fails, Err(ICUError::OutOfMemory) when reserve() fails, and any failure
//     surfaced through MOZ_TRY(FillBufferWithICUCall(...)).
//
// NOTE(review): lines are missing from this view -- there is no visible
// switch header, nothing is shown between one case's assignment and the next
// case label, and no closing braces appear. Do not read the case labels below
// as falling through without checking the intact header.
107 template <typename B
>
108 static Result
<AlreadyNormalized
, ICUError
> Normalize(
109 NormalizationForm aForm
, Span
<const char16_t
> aString
, B
& aBuffer
) {
110 // The unorm2_getXXXInstance() methods return a shared instance which must
// NOTE(review): the sentence above is cut off in this view.
112 UErrorCode status
= U_ZERO_ERROR
;
113 const UNormalizer2
* normalizer
;
// Pick the ICU normalizer instance that matches the requested form.
115 case NormalizationForm::NFC
:
116 normalizer
= unorm2_getNFCInstance(&status
);
118 case NormalizationForm::NFD
:
119 normalizer
= unorm2_getNFDInstance(&status
);
121 case NormalizationForm::NFKC
:
122 normalizer
= unorm2_getNFKCInstance(&status
);
124 case NormalizationForm::NFKD
:
125 normalizer
= unorm2_getNFKDInstance(&status
);
128 if (U_FAILURE(status
)) {
129 return Err(ToICUError(status
));
// Ask ICU how much of the input, counted from the start, already passes the
// quick check for this form.
132 int32_t spanLengthInt
= unorm2_spanQuickCheckYes(normalizer
, aString
.data(),
133 aString
.size(), &status
);
134 if (U_FAILURE(status
)) {
135 return Err(ToICUError(status
));
// Convert to size_t so the span length can be compared with aString.size().
138 size_t spanLength
= AssertedCast
<size_t>(spanLengthInt
);
139 MOZ_ASSERT(spanLength
<= aString
.size());
141 // Return if the input string is already normalized.
142 if (spanLength
== aString
.size()) {
143 return AlreadyNormalized::Yes
;
// From here on the output buffer is used: reserve room for at least the
// input length first.
146 if (!aBuffer
.reserve(aString
.size())) {
147 return Err(ICUError::OutOfMemory
);
150 // Copy the already normalized prefix.
151 if (spanLength
> 0) {
152 PodCopy(aBuffer
.data(), aString
.data(), spanLength
);
// Tell the buffer how many units were just copied into it.
154 aBuffer
.written(spanLength
);
// Normalize the rest of the input (everything from spanLength onwards) and
// append it after the copied prefix; spanLength is passed to ICU as the
// current length of the text already in target.
157 MOZ_TRY(FillBufferWithICUCall(
158 aBuffer
, [&](UChar
* target
, int32_t length
, UErrorCode
* status
) {
159 Span
<const char16_t
> remaining
= aString
.From(spanLength
);
160 return unorm2_normalizeSecondAndAppend(normalizer
, target
, spanLength
,
161 length
, remaining
.data(),
162 remaining
.size(), status
);
165 return AlreadyNormalized::No
;
169 * Return true if the code point has the binary property "Cased".
// IsCased: returns what ICU's u_hasBinaryProperty reports for codePoint
// (cast to UChar32) and the UCHAR_CASED property.
// NOTE(review): the closing brace is not visible in this view.
171 static bool IsCased(char32_t codePoint
) {
172 return u_hasBinaryProperty(static_cast<UChar32
>(codePoint
), UCHAR_CASED
);
176 * Return true if the code point has the binary property "Case_Ignorable".
// IsCaseIgnorable: returns what ICU's u_hasBinaryProperty reports for
// codePoint (cast to UChar32) and the UCHAR_CASE_IGNORABLE property.
// NOTE(review): the closing brace is not visible in this view.
178 static bool IsCaseIgnorable(char32_t codePoint
) {
179 return u_hasBinaryProperty(static_cast<UChar32
>(codePoint
),
180 UCHAR_CASE_IGNORABLE
);
184 * Return the NFC pairwise composition of the two input characters, if any;
185 * returns 0 (which we know is not a composed char!) if none exists.
// ComposePairNFC: asks the NFC normalizer, via unorm2_composePair, for the
// composition of a followed by b.
//
// Returns the composed code point as char32_t; a negative result from
// unorm2_composePair is mapped to 0.
//
// status and normalizer are function-local statics, so unorm2_getNFCInstance
// is evaluated on the first call only and its outcome is reused afterwards.
//
// NOTE(review): the body of the U_FAILURE branch and the closing braces are
// not visible in this view; going by the comment inside the function, the
// branch presumably returns 0 -- confirm against the intact header.
187 static char32_t
ComposePairNFC(char32_t a
, char32_t b
) {
188 // unorm2_getNFCInstance returns a static instance that does not have to be
189 // released here. If it fails, we just return 0 (no composition) always.
190 static UErrorCode status
= U_ZERO_ERROR
;
191 static const UNormalizer2
* normalizer
= unorm2_getNFCInstance(&status
);
192 if (U_FAILURE(status
)) {
// Both inputs are cast to ICU's UChar32 for the call.
195 UChar32 ch
= unorm2_composePair(normalizer
, static_cast<UChar32
>(a
),
196 static_cast<UChar32
>(b
));
// Negative means no composition was returned; report that as 0.
197 return ch
< 0 ? 0 : static_cast<char32_t
>(ch
);
201 * Put the "raw" (single-level) canonical decomposition of the input char, if
202 * any, into the provided buffer. Canonical decomps are never more than two
203 * chars in length (although full normalization may result in longer output).
205 * Returns the length of the decomposition (0 if none, else 1 or 2).
// DecomposeRawNFD: writes the raw canonical decomposition of ab into decomp,
// which has room for two code points.
//
// Visible steps:
//   1. Fetch the shared NFC normalizer into function-local statics
//      (evaluated on the first call only).
//   2. Call unorm2_getRawDecomposition into a 4-unit UTF-16 scratch array.
//   3. Open a UText over that array and read up to two code points with
//      UTEXT_NEXT32, storing each one that is not U_SENTINEL into decomp[0]
//      and then decomp[1].
//
// NOTE(review): lines are missing from this view -- `len` is used below but
// its declaration is not shown, and neither are the bodies of the two failure
// branches, the length bookkeeping, any cleanup of `text`, the final return
// or the closing braces. Confirm all of these against the intact header.
207 static int DecomposeRawNFD(char32_t ab
, char32_t decomp
[2]) {
208 // unorm2_getNFCInstance returns a static instance that does not have to be
209 // released here. If it fails, we just return 0 (no decomposition) always.
210 // Although we are using it to query for a decomposition, the mode of the
211 // Normalizer2 is irrelevant here, so we may as well use the same singleton
212 // instance as ComposePairNFC.
213 static UErrorCode status
= U_ZERO_ERROR
;
214 static const UNormalizer2
* normalizer
= unorm2_getNFCInstance(&status
);
215 if (U_FAILURE(status
)) {
219 // Canonical decompositions are never more than two Unicode characters,
220 // or a maximum of 4 utf-16 code units.
221 const unsigned MAX_DECOMP_LENGTH
= 4;
// `error` is a per-call status, separate from the static `status` above.
222 UErrorCode error
= U_ZERO_ERROR
;
223 UChar decompUtf16
[MAX_DECOMP_LENGTH
];
225 unorm2_getRawDecomposition(normalizer
, static_cast<UChar32
>(ab
),
226 decompUtf16
, MAX_DECOMP_LENGTH
, &error
);
227 if (U_FAILURE(error
) || len
< 0) {
// Read the UTF-16 scratch array back as code points through a UText.
230 UText text
= UTEXT_INITIALIZER
;
231 utext_openUChars(&text
, decompUtf16
, len
, &error
);
232 MOZ_ASSERT(U_SUCCESS(error
));
233 UChar32 ch
= UTEXT_NEXT32(&text
);
// U_SENTINEL is treated as "no further code point".
235 if (ch
!= U_SENTINEL
) {
236 decomp
[0] = static_cast<char32_t
>(ch
);
238 ch
= UTEXT_NEXT32(&text
);
239 if (ch
!= U_SENTINEL
) {
240 decomp
[1] = static_cast<char32_t
>(ch
);
249 * Return the Unicode version, for example "13.0".
// GetUnicodeVersion: declaration only; the definition is not part of this
// view. Takes no arguments and returns a Span<const char>, described by the
// accompanying doc comment as the Unicode version, for example "13.0".
251 static Span
<const char> GetUnicodeVersion();
254 } // namespace mozilla::intl