mfbt/tests/TestUtf8.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #define MOZ_PRETEND_NO_JSRUST 1
   8
   9 #include "mozilla/Utf8.h"
  10
  11 #include "mozilla/ArrayUtils.h"
  12 #include "mozilla/Assertions.h"
  13 #include "mozilla/EnumSet.h"
  14 #include "mozilla/IntegerRange.h"
  15 #include "mozilla/Span.h"
  16
  17 using mozilla::ArrayLength;
  18 using mozilla::AsChars;
  19 using mozilla::DecodeOneUtf8CodePoint;
  20 using mozilla::EnumSet;
  21 using mozilla::IntegerRange;
  22 using mozilla::IsAscii;
  23 using mozilla::IsUtf8;
  24 using mozilla::Span;
  25 using mozilla::Utf8Unit;
  26
  27 // Disable the C++ 2a warning. See bug #1509926
  28 #if defined(__clang__) && (__clang_major__ >= 6)
  29 #  pragma clang diagnostic push
  30 #  pragma clang diagnostic ignored "-Wc++2a-compat"
  31 #endif
  32
  33 static void TestUtf8Unit() {
  34   Utf8Unit c('A');
  35   MOZ_RELEASE_ASSERT(c.toChar() == 'A');
  36   MOZ_RELEASE_ASSERT(c == Utf8Unit('A'));
  37   MOZ_RELEASE_ASSERT(c != Utf8Unit('B'));
  38   MOZ_RELEASE_ASSERT(c.toUint8() == 0x41);
  39
  40   unsigned char asUnsigned = 'A';
  41   MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned);
  42   MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned);
  43
  44   Utf8Unit first('@');
  45   Utf8Unit second('#');
  46
  47   MOZ_RELEASE_ASSERT(first != second);
  48
  49   first = second;
  50   MOZ_RELEASE_ASSERT(first == second);
  51 }
  52
  53 template <typename Char>
  54 struct ToUtf8Units {
  55  public:
  56   explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
  57       : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) {
  58     MOZ_RELEASE_ASSERT(!IsAscii(aStart[0]));
  59   }
  60
  61   const Utf8Unit lead;
  62   const Char* iter;
  63   const Char* const end;
  64 };
  65
  66 class AssertIfCalled {
  67  public:
  68   template <typename... Args>
  69   void operator()(Args&&... aArgs) {
  70     MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
  71   }
  72 };
  73
// NOTE: For simplicity in treating |aCharN| identically regardless of whether
//       it's a string literal or a more-generalized array, we require that
//       |aCharN| be null-terminated.
  77
  78 template <typename Char, size_t N>
  79 static void ExpectValidCodePoint(const Char (&aCharN)[N],
  80                                  char32_t aExpectedCodePoint) {
  81   MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
  82                      "array must be null-terminated for |aCharN + N - 1| to "
  83                      "compute the value of |aIter| as altered by "
  84                      "DecodeOneUtf8CodePoint");
  85
  86   ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
  87   auto simple =
  88       DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
  89   MOZ_RELEASE_ASSERT(simple.isSome());
  90   MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint);
  91   MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end);
  92
  93   ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
  94   auto complex = DecodeOneUtf8CodePoint(
  95       complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(),
  96       AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled());
  97   MOZ_RELEASE_ASSERT(complex.isSome());
  98   MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint);
  99   MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end);
 100 }
 101
 102 enum class InvalidUtf8Reason {
 103   BadLeadUnit,
 104   NotEnoughUnits,
 105   BadTrailingUnit,
 106   BadCodePoint,
 107   NotShortestForm,
 108 };
 109
 110 template <typename Char, size_t N>
 111 static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
 112                                          InvalidUtf8Reason aExpectedReason,
 113                                          uint8_t aExpectedUnitsAvailable,
 114                                          uint8_t aExpectedUnitsNeeded,
 115                                          char32_t aExpectedBadCodePoint,
 116                                          uint8_t aExpectedUnitsObserved) {
 117   MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
 118                      "array must be null-terminated for |aCharN + N - 1| to "
 119                      "compute the value of |aIter| as altered by "
 120                      "DecodeOneUtf8CodePoint");
 121
 122   ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
 123   auto simple =
 124       DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
 125   MOZ_RELEASE_ASSERT(simple.isNothing());
 126   MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);
 127
 128   EnumSet<InvalidUtf8Reason> reasons;
 129   uint8_t unitsAvailable;
 130   uint8_t unitsNeeded;
 131   char32_t badCodePoint;
 132   uint8_t unitsObserved;
 133
 134   struct OnNotShortestForm {
 135     EnumSet<InvalidUtf8Reason>& reasons;
 136     char32_t& badCodePoint;
 137     uint8_t& unitsObserved;
 138
 139     void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) {
 140       reasons += InvalidUtf8Reason::NotShortestForm;
 141       badCodePoint = aBadCodePoint;
 142       unitsObserved = aUnitsObserved;
 143     }
 144   };
 145
 146   ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
 147   auto complex = DecodeOneUtf8CodePoint(
 148       complexUnit.lead, &complexUnit.iter, complexUnit.end,
 149       [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; },
 150       [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
 151                                                 uint8_t aUnitsNeeded) {
 152         reasons += InvalidUtf8Reason::NotEnoughUnits;
 153         unitsAvailable = aUnitsAvailable;
 154         unitsNeeded = aUnitsNeeded;
 155       },
 156       [&reasons, &unitsObserved](uint8_t aUnitsObserved) {
 157         reasons += InvalidUtf8Reason::BadTrailingUnit;
 158         unitsObserved = aUnitsObserved;
 159       },
 160       [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
 161                                                 uint8_t aUnitsObserved) {
 162         reasons += InvalidUtf8Reason::BadCodePoint;
 163         badCodePoint = aBadCodePoint;
 164         unitsObserved = aUnitsObserved;
 165       },
 166       [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
 167                                                 uint8_t aUnitsObserved) {
 168         reasons += InvalidUtf8Reason::NotShortestForm;
 169         badCodePoint = aBadCodePoint;
 170         unitsObserved = aUnitsObserved;
 171       });
 172   MOZ_RELEASE_ASSERT(complex.isNothing());
 173   MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);
 174
 175   bool alreadyIterated = false;
 176   for (InvalidUtf8Reason reason : reasons) {
 177     MOZ_RELEASE_ASSERT(!alreadyIterated);
 178     alreadyIterated = true;
 179
 180     switch (reason) {
 181       case InvalidUtf8Reason::BadLeadUnit:
 182         break;
 183
 184       case InvalidUtf8Reason::NotEnoughUnits:
 185         MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
 186         MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
 187         break;
 188
 189       case InvalidUtf8Reason::BadTrailingUnit:
 190         MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
 191         break;
 192
 193       case InvalidUtf8Reason::BadCodePoint:
 194         MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
 195         MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
 196         break;
 197
 198       case InvalidUtf8Reason::NotShortestForm:
 199         MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
 200         MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
 201         break;
 202     }
 203   }
 204 }
 205
// NOTE: For simplicity in treating |aCharN| identically regardless of whether
//       it's a string literal or a more-generalized array, we require that
//       |aCharN| be null-terminated in all these functions.
 209
 210 template <typename Char, size_t N>
 211 static void ExpectBadLeadUnit(const Char (&aCharN)[N]) {
 212   ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF,
 213                                0xFF, 0xFFFFFFFF, 0xFF);
 214 }
 215
 216 template <typename Char, size_t N>
 217 static void ExpectNotEnoughUnits(const Char (&aCharN)[N],
 218                                  uint8_t aExpectedUnitsAvailable,
 219                                  uint8_t aExpectedUnitsNeeded) {
 220   ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits,
 221                                aExpectedUnitsAvailable, aExpectedUnitsNeeded,
 222                                0xFFFFFFFF, 0xFF);
 223 }
 224
 225 template <typename Char, size_t N>
 226 static void ExpectBadTrailingUnit(const Char (&aCharN)[N],
 227                                   uint8_t aExpectedUnitsObserved) {
 228   ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF,
 229                                0xFF, 0xFFFFFFFF, aExpectedUnitsObserved);
 230 }
 231
 232 template <typename Char, size_t N>
 233 static void ExpectNotShortestForm(const Char (&aCharN)[N],
 234                                   char32_t aExpectedBadCodePoint,
 235                                   uint8_t aExpectedUnitsObserved) {
 236   ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF,
 237                                0xFF, aExpectedBadCodePoint,
 238                                aExpectedUnitsObserved);
 239 }
 240
 241 template <typename Char, size_t N>
 242 static void ExpectBadCodePoint(const Char (&aCharN)[N],
 243                                char32_t aExpectedBadCodePoint,
 244                                uint8_t aExpectedUnitsObserved) {
 245   ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF,
 246                                0xFF, aExpectedBadCodePoint,
 247                                aExpectedUnitsObserved);
 248 }
 249
 250 static void TestIsUtf8() {
 251   // Note we include the U+0000 NULL in this one -- and that's fine.
 252   static const char asciiBytes[] = u8"How about a nice game of chess?";
 253   MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, ArrayLength(asciiBytes))));
 254
 255   static const char endNonAsciiBytes[] = u8"Life is like a 🌯";
 256   MOZ_RELEASE_ASSERT(
 257       IsUtf8(Span(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1)));
 258
 259   static const unsigned char badLeading[] = {0x80};
 260   MOZ_RELEASE_ASSERT(
 261       !IsUtf8(AsChars(Span(badLeading, ArrayLength(badLeading)))));
 262
 263   // Byte-counts
 264
 265   // 1
 266   static const char oneBytes[] = u8"A";  // U+0041 LATIN CAPITAL LETTER A
 267   constexpr size_t oneBytesLen = ArrayLength(oneBytes);
 268   static_assert(oneBytesLen == 2, "U+0041 plus nul");
 269   MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen)));
 270
 271   // 2
 272   static const char twoBytes[] = u8"؆";  // U+0606 ARABIC-INDIC CUBE ROOT
 273   constexpr size_t twoBytesLen = ArrayLength(twoBytes);
 274   static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
 275   MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen)));
 276
 277   ExpectValidCodePoint(twoBytes, 0x0606);
 278
 279   // 3
 280   static const char threeBytes[] = u8"᨞";  // U+1A1E BUGINESE PALLAWA
 281   constexpr size_t threeBytesLen = ArrayLength(threeBytes);
 282   static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
 283   MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen)));
 284
 285   ExpectValidCodePoint(threeBytes, 0x1A1E);
 286
 287   // 4
 288   static const char fourBytes[] =
 289       u8"🁡";  // U+1F061 DOMINO TILE HORIZONTAL-06-06
 290   constexpr size_t fourBytesLen = ArrayLength(fourBytes);
 291   static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
 292   MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen)));
 293
 294   ExpectValidCodePoint(fourBytes, 0x1F061);
 295
 296   // Max code point
 297   static const char maxCodePoint[] = u8"􏿿";  // U+10FFFF
 298   constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
 299   static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
 300   MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen)));
 301
 302   ExpectValidCodePoint(maxCodePoint, 0x10FFFF);
 303
 304   // One past max code point
 305   static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80,
 306                                                       0x0};
 307   constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint);
 308   MOZ_RELEASE_ASSERT(
 309       !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen))));
 310
 311   ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);
 312
 313   // Surrogate-related testing
 314
 315   // (Note that the various code unit sequences here are null-terminated to
 316   // simplify life for ExpectValidCodePoint, which presumes null termination.)
 317
 318   static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0};
 319   constexpr size_t justBeforeSurrogatesLen =
 320       ArrayLength(justBeforeSurrogates) - 1;
 321   MOZ_RELEASE_ASSERT(
 322       IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen))));
 323
 324   ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);
 325
 326   static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0};
 327   constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1;
 328   MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen))));
 329
 330   ExpectBadCodePoint(leastSurrogate, 0xD800, 3);
 331
 332   static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0};
 333   constexpr size_t arbitraryHighSurrogateLen =
 334       ArrayLength(arbitraryHighSurrogate) - 1;
 335   MOZ_RELEASE_ASSERT(!IsUtf8(
 336       AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen))));
 337
 338   ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);
 339
 340   static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0};
 341   constexpr size_t arbitraryLowSurrogateLen =
 342       ArrayLength(arbitraryLowSurrogate) - 1;
 343   MOZ_RELEASE_ASSERT(
 344       !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen))));
 345
 346   ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);
 347
 348   static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0};
 349   constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1;
 350   MOZ_RELEASE_ASSERT(
 351       !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen))));
 352
 353   ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);
 354
 355   static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0};
 356   constexpr size_t justAfterSurrogatesLen =
 357       ArrayLength(justAfterSurrogates) - 1;
 358   MOZ_RELEASE_ASSERT(
 359       IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen))));
 360
 361   ExpectValidCodePoint(justAfterSurrogates, 0xE000);
 362 }
 363
 364 static void TestDecodeOneValidUtf8CodePoint() {
 365   // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
 366   //       consist of multiple code units, so there are no ASCII tests below.
 367
 368   // Length two.
 369
 370   ExpectValidCodePoint(u8"", 0x80);  // <control>
 371   ExpectValidCodePoint(u8"©", 0xA9);   // COPYRIGHT SIGN
 372   ExpectValidCodePoint(u8"¶", 0xB6);   // PILCROW SIGN
 373   ExpectValidCodePoint(u8"¾", 0xBE);   // VULGAR FRACTION THREE QUARTERS
 374   ExpectValidCodePoint(u8"÷", 0xF7);   // DIVISION SIGN
 375   ExpectValidCodePoint(u8"ÿ", 0xFF);   // LATIN SMALL LETTER Y WITH DIAERESIS
 376   ExpectValidCodePoint(u8"Ā", 0x100);  // LATIN CAPITAL LETTER A WITH MACRON
 377   ExpectValidCodePoint(u8"Ĳ", 0x132);  // LATIN CAPITAL LETTER LIGATURE IJ
 378   ExpectValidCodePoint(u8"ͼ", 0x37C);  // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
 379   ExpectValidCodePoint(u8"Ӝ",
 380                        0x4DC);  // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS
 381   ExpectValidCodePoint(u8"۩", 0x6E9);  // ARABIC PLACE OF SAJDAH
 382   ExpectValidCodePoint(u8"߿", 0x7FF);  // <not assigned>
 383
 384   // Length three.
 385
 386   ExpectValidCodePoint(u8"ࠀ", 0x800);  // SAMARITAN LETTER ALAF
 387   ExpectValidCodePoint(u8"ࡁ", 0x841);  // MANDAIC LETTER AB
 388   ExpectValidCodePoint(u8"ࣿ", 0x8FF);   // ARABIC MARK SIDEWAYS NOON GHUNNA
 389   ExpectValidCodePoint(u8"ஆ", 0xB86);  // TAMIL LETTER AA
 390   ExpectValidCodePoint(u8"༃",
 391                        0xF03);  // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
 392   ExpectValidCodePoint(
 393       u8"࿉",
 394       0xFC9);  // TIBETAN SYMBOL NOR BU (but on my system it really looks like
 395                // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
 396   ExpectValidCodePoint(u8"ဪ", 0x102A);           // MYANMAR LETTER AU
 397   ExpectValidCodePoint(u8"ᚏ", 0x168F);           // OGHAM LETTER RUIS
 398   ExpectValidCodePoint("\xE2\x80\xA8", 0x2028);  // (the hated) LINE SEPARATOR
 399   ExpectValidCodePoint("\xE2\x80\xA9",
 400                        0x2029);           // (the hated) PARAGRAPH SEPARATOR
 401   ExpectValidCodePoint(u8"☬", 0x262C);    // ADI SHAKTI
 402   ExpectValidCodePoint(u8"㊮", 0x32AE);   // CIRCLED IDEOGRAPH RESOURCE
 403   ExpectValidCodePoint(u8"㏖", 0x33D6);   // SQUARE MOL
 404   ExpectValidCodePoint(u8"ꔄ", 0xA504);    // VAI SYLLABLE WEEN
 405   ExpectValidCodePoint(u8"ퟕ", 0xD7D5);    // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
 406   ExpectValidCodePoint(u8"퟿", 0xD7FF);  // <not assigned>
 407   ExpectValidCodePoint(u8"", 0xE000);  // <Private Use>
 408   ExpectValidCodePoint(u8"鱗", 0xF9F2);   // CJK COMPATIBILITY IDEOGRAPH-F9F
 409   ExpectValidCodePoint(
 410       u8"﷽", 0xFDFD);  // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
 411   ExpectValidCodePoint(u8"", 0xFFFF);  // <not assigned>
 412
 413   // Length four.
 414   ExpectValidCodePoint(u8"𐀀", 0x10000);      // LINEAR B SYLLABLE B008 A
 415   ExpectValidCodePoint(u8"𔑀", 0x14440);      // ANATOLIAN HIEROGLYPH A058
 416   ExpectValidCodePoint(u8"𝛗", 0x1D6D7);      // MATHEMATICAL BOLD SMALL PHI
 417   ExpectValidCodePoint(u8"💩", 0x1F4A9);     // PILE OF POO
 418   ExpectValidCodePoint(u8"🔫", 0x1F52B);     // PISTOL
 419   ExpectValidCodePoint(u8"🥌", 0x1F94C);     // CURLING STONE
 420   ExpectValidCodePoint(u8"🥏", 0x1F94F);     // FLYING DISC
 421   ExpectValidCodePoint(u8"𠍆", 0x20346);     // CJK UNIFIED IDEOGRAPH-20346
 422   ExpectValidCodePoint(u8"𡠺", 0x2183A);     // CJK UNIFIED IDEOGRAPH-2183A
 423   ExpectValidCodePoint(u8"񁟶", 0x417F6);   // <not assigned>
 424   ExpectValidCodePoint(u8"񾠶", 0x7E836);   // <not assigned>
 425   ExpectValidCodePoint(u8"󾽧", 0xFEF67);   // <Plane 15 Private Use>
 426   ExpectValidCodePoint(u8"􏿿", 0x10FFFF);  //
 427 }
 428
 429 static void TestDecodeBadLeadUnit() {
 430   // These tests are actually exhaustive.
 431
 432   unsigned char badLead[] = {'\0', '\0'};
 433
 434   for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) {
 435     badLead[0] = lead;
 436     ExpectBadLeadUnit(badLead);
 437   }
 438
 439   {
 440     uint8_t lead = 0b1111'1000;
 441     do {
 442       badLead[0] = lead;
 443       ExpectBadLeadUnit(badLead);
 444       if (lead == 0b1111'1111) {
 445         break;
 446       }
 447
 448       lead++;
 449     } while (true);
 450   }
 451 }
 452
 453 static void TestTooFewOrBadTrailingUnits() {
 454   // Lead unit indicates a two-byte code point.
 455
 456   char truncatedTwo[] = {'\0', '\0'};
 457   char badTrailTwo[] = {'\0', '\0', '\0'};
 458
 459   for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
 460     truncatedTwo[0] = lead;
 461     ExpectNotEnoughUnits(truncatedTwo, 1, 2);
 462
 463     badTrailTwo[0] = lead;
 464     for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
 465       badTrailTwo[1] = trail;
 466       ExpectBadTrailingUnit(badTrailTwo, 2);
 467     }
 468
 469     for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) {
 470       badTrailTwo[1] = trail;
 471       ExpectBadTrailingUnit(badTrailTwo, 2);
 472     }
 473   }
 474
 475   // Lead unit indicates a three-byte code point.
 476
 477   char truncatedThreeOne[] = {'\0', '\0'};
 478   char truncatedThreeTwo[] = {'\0', '\0', '\0'};
 479   unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'};
 480
 481   for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
 482     truncatedThreeOne[0] = lead;
 483     ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);
 484
 485     truncatedThreeTwo[0] = lead;
 486     ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);
 487
 488     badTrailThree[0] = lead;
 489     badTrailThree[2] = 0b1011'1111;  // make valid to test overreads
 490     for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
 491       badTrailThree[1] = mid;
 492       ExpectBadTrailingUnit(badTrailThree, 2);
 493     }
 494     {
 495       uint8_t mid = 0b1100'0000;
 496       do {
 497         badTrailThree[1] = mid;
 498         ExpectBadTrailingUnit(badTrailThree, 2);
 499         if (mid == 0b1111'1111) {
 500           break;
 501         }
 502
 503         mid++;
 504       } while (true);
 505     }
 506
 507     badTrailThree[1] = 0b1011'1111;
 508     for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
 509       badTrailThree[2] = last;
 510       ExpectBadTrailingUnit(badTrailThree, 3);
 511     }
 512     {
 513       uint8_t last = 0b1100'0000;
 514       do {
 515         badTrailThree[2] = last;
 516         ExpectBadTrailingUnit(badTrailThree, 3);
 517         if (last == 0b1111'1111) {
 518           break;
 519         }
 520
 521         last++;
 522       } while (true);
 523     }
 524   }
 525
 526   // Lead unit indicates a four-byte code point.
 527
 528   char truncatedFourOne[] = {'\0', '\0'};
 529   char truncatedFourTwo[] = {'\0', '\0', '\0'};
 530   char truncatedFourThree[] = {'\0', '\0', '\0', '\0'};
 531
 532   unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'};
 533
 534   for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
 535     truncatedFourOne[0] = lead;
 536     ExpectNotEnoughUnits(truncatedFourOne, 1, 4);
 537
 538     truncatedFourTwo[0] = lead;
 539     ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);
 540
 541     truncatedFourThree[0] = lead;
 542     ExpectNotEnoughUnits(truncatedFourThree, 3, 4);
 543
 544     badTrailFour[0] = lead;
 545     badTrailFour[2] = badTrailFour[3] = 0b1011'1111;  // test for overreads
 546     for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
 547       badTrailFour[1] = second;
 548       ExpectBadTrailingUnit(badTrailFour, 2);
 549     }
 550     {
 551       uint8_t second = 0b1100'0000;
 552       do {
 553         badTrailFour[1] = second;
 554         ExpectBadTrailingUnit(badTrailFour, 2);
 555         if (second == 0b1111'1111) {
 556           break;
 557         }
 558
 559         second++;
 560       } while (true);
 561     }
 562
 563     badTrailFour[1] = badTrailFour[3] = 0b1011'1111;  // test for overreads
 564     for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
 565       badTrailFour[2] = third;
 566       ExpectBadTrailingUnit(badTrailFour, 3);
 567     }
 568     {
 569       uint8_t third = 0b1100'0000;
 570       do {
 571         badTrailFour[2] = third;
 572         ExpectBadTrailingUnit(badTrailFour, 3);
 573         if (third == 0b1111'1111) {
 574           break;
 575         }
 576
 577         third++;
 578       } while (true);
 579     }
 580
 581     badTrailFour[2] = 0b1011'1111;
 582     for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
 583       badTrailFour[3] = fourth;
 584       ExpectBadTrailingUnit(badTrailFour, 4);
 585     }
 586     {
 587       uint8_t fourth = 0b1100'0000;
 588       do {
 589         badTrailFour[3] = fourth;
 590         ExpectBadTrailingUnit(badTrailFour, 4);
 591         if (fourth == 0b1111'1111) {
 592           break;
 593         }
 594
 595         fourth++;
 596       } while (true);
 597     }
 598   }
 599 }
 600
 601 static void TestBadSurrogate() {
 602   // These tests are actually exhaustive.
 603
 604   ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF);  // last before surrogates
 605   ExpectValidCodePoint("\xEE\x80\x80", 0xE000);  // first after surrogates
 606
 607   // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }.  Last invalid
 608   // surrogate encoding is { 0xED, 0xBF, 0xBF }.
 609
 610   char badSurrogate[] = {'\xED', '\0', '\0', '\0'};
 611
 612   for (char32_t c = 0xD800; c < 0xE000; c++) {
 613     badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6);
 614     badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
 615
 616     ExpectBadCodePoint(badSurrogate, c, 3);
 617   }
 618 }
 619
 620 static void TestBadTooBig() {
 621   // These tests are actually exhaustive.
 622
 623   ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF);  // last code point
 624
 625   // Four-byte code points are
 626   //
 627   //   0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
 628   //
 629   // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally
 630   // representable limit (exclusive) is 2**21 - 1 == 2097152.
 631
 632   char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'};
 633
 634   for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
 635     tooLargeCodePoint[0] =
 636         0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
 637     tooLargeCodePoint[1] =
 638         0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
 639     tooLargeCodePoint[2] =
 640         0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
 641     tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
 642
 643     ExpectBadCodePoint(tooLargeCodePoint, c, 4);
 644   }
 645 }
 646
 647 static void TestBadCodePoint() {
 648   TestBadSurrogate();
 649   TestBadTooBig();
 650 }
 651
 652 static void TestNotShortestForm() {
 653   {
 654     // One-byte in two-byte.
 655
 656     char oneInTwo[] = {'\0', '\0', '\0'};
 657
 658     for (char32_t c = '\0'; c < 0x80; c++) {
 659       oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6);
 660       oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
 661
 662       ExpectNotShortestForm(oneInTwo, c, 2);
 663     }
 664
 665     // One-byte in three-byte.
 666
 667     char oneInThree[] = {'\0', '\0', '\0', '\0'};
 668
 669     for (char32_t c = '\0'; c < 0x80; c++) {
 670       oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
 671       oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
 672       oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
 673
 674       ExpectNotShortestForm(oneInThree, c, 3);
 675     }
 676
 677     // One-byte in four-byte.
 678
 679     char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'};
 680
 681     for (char32_t c = '\0'; c < 0x80; c++) {
 682       oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
 683       oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
 684       oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
 685       oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
 686
 687       ExpectNotShortestForm(oneInFour, c, 4);
 688     }
 689   }
 690
 691   {
 692     // Two-byte in three-byte.
 693
 694     char twoInThree[] = {'\0', '\0', '\0', '\0'};
 695
 696     for (char32_t c = 0x80; c < 0x800; c++) {
 697       twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
 698       twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
 699       twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
 700
 701       ExpectNotShortestForm(twoInThree, c, 3);
 702     }
 703
 704     // Two-byte in four-byte.
 705
 706     char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'};
 707
 708     for (char32_t c = 0x80; c < 0x800; c++) {
 709       twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
 710       twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
 711       twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
 712       twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
 713
 714       ExpectNotShortestForm(twoInFour, c, 4);
 715     }
 716   }
 717
 718   {
 719     // Three-byte in four-byte.
 720
 721     char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'};
 722
 723     for (char32_t c = 0x800; c < 0x1'0000; c++) {
 724       threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
 725       threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
 726       threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
 727       threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
 728
 729       ExpectNotShortestForm(threeInFour, c, 4);
 730     }
 731   }
 732 }
 733
 734 static void TestDecodeOneInvalidUtf8CodePoint() {
 735   TestDecodeBadLeadUnit();
 736   TestTooFewOrBadTrailingUnits();
 737   TestBadCodePoint();
 738   TestNotShortestForm();
 739 }
 740
 741 static void TestDecodeOneUtf8CodePoint() {
 742   TestDecodeOneValidUtf8CodePoint();
 743   TestDecodeOneInvalidUtf8CodePoint();
 744 }
 745
 746 int main() {
 747   TestUtf8Unit();
 748   TestIsUtf8();
 749   TestDecodeOneUtf8CodePoint();
 750   return 0;
 751 }
 752
 753 #if defined(__clang__) && (__clang_major__ >= 6)
 754 #  pragma clang diagnostic pop
 755 #endif