intl/components/src/String.h

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this
   3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 #ifndef intl_components_String_h_
   6 #define intl_components_String_h_
   7
   8 #include "mozilla/Assertions.h"
   9 #include "mozilla/Casting.h"
  10 #include "mozilla/intl/ICU4CGlue.h"
  11 #include "mozilla/intl/ICUError.h"
  12 #include "mozilla/PodOperations.h"
  13 #include "mozilla/Result.h"
  14 #include "mozilla/Span.h"
  15
  16 #include "unicode/uchar.h"
  17 #include "unicode/unorm2.h"
  18 #include "unicode/ustring.h"
  19 #include "unicode/utext.h"
  20 #include "unicode/utypes.h"
  21
  22 namespace mozilla::intl {
  23
  24 /**
  25  * This component is a Mozilla-focused API for working with strings in
  26  * internationalization code.
  27  */
  28 class String final {
  29  public:
  30   String() = delete;
  31
  32   /**
  33    * Return the locale-sensitive lower case string of the input.
  34    */
  35   template <typename B>
  36   static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
  37                                                 Span<const char16_t> aString,
  38                                                 B& aBuffer) {
  39     if (!aBuffer.reserve(aString.size())) {
  40       return Err(ICUError::OutOfMemory);
  41     }
  42     return FillBufferWithICUCall(
  43         aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
  44           return u_strToLower(target, length, aString.data(), aString.size(),
  45                               aLocale, status);
  46         });
  47   }
  48
  49   /**
  50    * Return the locale-sensitive upper case string of the input.
  51    */
  52   template <typename B>
  53   static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
  54                                                 Span<const char16_t> aString,
  55                                                 B& aBuffer) {
  56     if (!aBuffer.reserve(aString.size())) {
  57       return Err(ICUError::OutOfMemory);
  58     }
  59     return FillBufferWithICUCall(
  60         aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
  61           return u_strToUpper(target, length, aString.data(), aString.size(),
  62                               aLocale, status);
  63         });
  64   }
  65
  66   /**
  67    * Normalization form constants to describe which normalization algorithm
  68    * should be performed.
  69    *
  70    * Also see:
  71    * - Unicode Standard, §2.12 Equivalent Sequences
  72    * - Unicode Standard, §3.11 Normalization Forms
  73    * - https://unicode.org/reports/tr15/
  74    */
  75   enum class NormalizationForm {
  76     /**
  77      * Normalization Form C
  78      */
  79     NFC,
  80
  81     /**
  82      * Normalization Form D
  83      */
  84     NFD,
  85
  86     /**
  87      * Normalization Form KC
  88      */
  89     NFKC,
  90
  91     /**
  92      * Normalization Form KD
  93      */
  94     NFKD,
  95   };
  96
  97   enum class AlreadyNormalized : bool { No, Yes };
  98
  99   /**
 100    * Normalize the input string according to requested normalization form.
 101    *
 102    * Returns `AlreadyNormalized::Yes` when the string is already in normalized
 103    * form. The output buffer is unchanged in this case. Otherwise returns
 104    * `AlreadyNormalized::No` and places the normalized string into the output
 105    * buffer.
 106    */
 107   template <typename B>
 108   static Result<AlreadyNormalized, ICUError> Normalize(
 109       NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
 110     // The unorm2_getXXXInstance() methods return a shared instance which must
 111     // not be deleted.
 112     UErrorCode status = U_ZERO_ERROR;
 113     const UNormalizer2* normalizer;
 114     switch (aForm) {
 115       case NormalizationForm::NFC:
 116         normalizer = unorm2_getNFCInstance(&status);
 117         break;
 118       case NormalizationForm::NFD:
 119         normalizer = unorm2_getNFDInstance(&status);
 120         break;
 121       case NormalizationForm::NFKC:
 122         normalizer = unorm2_getNFKCInstance(&status);
 123         break;
 124       case NormalizationForm::NFKD:
 125         normalizer = unorm2_getNFKDInstance(&status);
 126         break;
 127     }
 128     if (U_FAILURE(status)) {
 129       return Err(ToICUError(status));
 130     }
 131
 132     int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
 133                                                      aString.size(), &status);
 134     if (U_FAILURE(status)) {
 135       return Err(ToICUError(status));
 136     }
 137
 138     size_t spanLength = AssertedCast<size_t>(spanLengthInt);
 139     MOZ_ASSERT(spanLength <= aString.size());
 140
 141     // Return if the input string is already normalized.
 142     if (spanLength == aString.size()) {
 143       return AlreadyNormalized::Yes;
 144     }
 145
 146     if (!aBuffer.reserve(aString.size())) {
 147       return Err(ICUError::OutOfMemory);
 148     }
 149
 150     // Copy the already normalized prefix.
 151     if (spanLength > 0) {
 152       PodCopy(aBuffer.data(), aString.data(), spanLength);
 153
 154       aBuffer.written(spanLength);
 155     }
 156
 157     MOZ_TRY(FillBufferWithICUCall(
 158         aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
 159           Span<const char16_t> remaining = aString.From(spanLength);
 160           return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
 161                                                  length, remaining.data(),
 162                                                  remaining.size(), status);
 163         }));
 164
 165     return AlreadyNormalized::No;
 166   }
 167
 168   /**
 169    * Return true if the code point has the binary property "Cased".
 170    */
 171   static bool IsCased(char32_t codePoint) {
 172     return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
 173   }
 174
 175   /**
 176    * Return true if the code point has the binary property "Case_Ignorable".
 177    */
 178   static bool IsCaseIgnorable(char32_t codePoint) {
 179     return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
 180                                UCHAR_CASE_IGNORABLE);
 181   }
 182
 183   /**
 184    * Return the NFC pairwise composition of the two input characters, if any;
 185    * returns 0 (which we know is not a composed char!) if none exists.
 186    */
 187   static char32_t ComposePairNFC(char32_t a, char32_t b) {
 188     // unorm2_getNFCInstance returns a static instance that does not have to be
 189     // released here. If it fails, we just return 0 (no composition) always.
 190     static UErrorCode status = U_ZERO_ERROR;
 191     static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
 192     if (U_FAILURE(status)) {
 193       return 0;
 194     }
 195     UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
 196                                     static_cast<UChar32>(b));
 197     return ch < 0 ? 0 : static_cast<char32_t>(ch);
 198   }
 199
 200   /**
 201    * Put the "raw" (single-level) canonical decomposition of the input char, if
 202    * any, into the provided buffer. Canonical decomps are never more than two
 203    * chars in length (although full normalization may result in longer output
 204    * due to recursion).
 205    * Returns the length of the decomposition (0 if none, else 1 or 2).
 206    */
 207   static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
 208     // unorm2_getNFCInstance returns a static instance that does not have to be
 209     // released here. If it fails, we just return 0 (no decomposition) always.
 210     // Although we are using it to query for a decomposition, the mode of the
 211     // Normalizer2 is irrelevant here, so we may as well use the same singleton
 212     // instance as ComposePairNFC.
 213     static UErrorCode status = U_ZERO_ERROR;
 214     static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
 215     if (U_FAILURE(status)) {
 216       return 0;
 217     }
 218
 219     // Canonical decompositions are never more than two Unicode characters,
 220     // or a maximum of 4 utf-16 code units.
 221     const unsigned MAX_DECOMP_LENGTH = 4;
 222     UErrorCode error = U_ZERO_ERROR;
 223     UChar decompUtf16[MAX_DECOMP_LENGTH];
 224     int32_t len =
 225         unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
 226                                    decompUtf16, MAX_DECOMP_LENGTH, &error);
 227     if (U_FAILURE(error) || len < 0) {
 228       return 0;
 229     }
 230     UText text = UTEXT_INITIALIZER;
 231     utext_openUChars(&text, decompUtf16, len, &error);
 232     MOZ_ASSERT(U_SUCCESS(error));
 233     UChar32 ch = UTEXT_NEXT32(&text);
 234     len = 0;
 235     if (ch != U_SENTINEL) {
 236       decomp[0] = static_cast<char32_t>(ch);
 237       ++len;
 238       ch = UTEXT_NEXT32(&text);
 239       if (ch != U_SENTINEL) {
 240         decomp[1] = static_cast<char32_t>(ch);
 241         ++len;
 242       }
 243     }
 244     utext_close(&text);
 245     return len;
 246   }
 247
 248   /**
 249    * Return the Unicode version, for example "13.0".
 250    */
 251   static Span<const char> GetUnicodeVersion();
 252 };
 253
 254 }  // namespace mozilla::intl
 255
 256 #endif