no bug - Import translations from android-l10n r=release a=l10n CLOSED TREE
[gecko.git] / mfbt / tests / TestUtf8.cpp
blobb3ff9e9ee88edef87141cb2ad4e02d8ce39d1955
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #define MOZ_PRETEND_NO_JSRUST 1
9 #include "mozilla/Utf8.h"
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/Assertions.h"
13 #include "mozilla/EnumSet.h"
14 #include "mozilla/IntegerRange.h"
15 #include "mozilla/Span.h"
17 using mozilla::ArrayLength;
18 using mozilla::AsChars;
19 using mozilla::DecodeOneUtf8CodePoint;
20 using mozilla::EnumSet;
21 using mozilla::IntegerRange;
22 using mozilla::IsAscii;
23 using mozilla::IsUtf8;
24 using mozilla::Span;
25 using mozilla::Utf8Unit;
27 // Disable the C++ 2a warning. See bug #1509926
28 #if defined(__clang__) && (__clang_major__ >= 6)
29 # pragma clang diagnostic push
30 # pragma clang diagnostic ignored "-Wc++2a-compat"
31 #endif
33 static void TestUtf8Unit() {
34 Utf8Unit c('A');
35 MOZ_RELEASE_ASSERT(c.toChar() == 'A');
36 MOZ_RELEASE_ASSERT(c == Utf8Unit('A'));
37 MOZ_RELEASE_ASSERT(c != Utf8Unit('B'));
38 MOZ_RELEASE_ASSERT(c.toUint8() == 0x41);
40 unsigned char asUnsigned = 'A';
41 MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned);
42 MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned);
44 Utf8Unit first('@');
45 Utf8Unit second('#');
47 MOZ_RELEASE_ASSERT(first != second);
49 first = second;
50 MOZ_RELEASE_ASSERT(first == second);
53 template <typename Char>
54 struct ToUtf8Units {
55 public:
56 explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
57 : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) {
58 MOZ_RELEASE_ASSERT(!IsAscii(aStart[0]));
61 const Utf8Unit lead;
62 const Char* iter;
63 const Char* const end;
66 class AssertIfCalled {
67 public:
68 template <typename... Args>
69 void operator()(Args&&... aArgs) {
70 MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
74 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's
75 // a string literal or a more-generalized array, we require |aCharN| be
76 // null-terminated.
78 template <typename Char, size_t N>
79 static void ExpectValidCodePoint(const Char (&aCharN)[N],
80 char32_t aExpectedCodePoint) {
81 MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
82 "array must be null-terminated for |aCharN + N - 1| to "
83 "compute the value of |aIter| as altered by "
84 "DecodeOneUtf8CodePoint");
86 ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
87 auto simple =
88 DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
89 MOZ_RELEASE_ASSERT(simple.isSome());
90 MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint);
91 MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end);
93 ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
94 auto complex = DecodeOneUtf8CodePoint(
95 complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(),
96 AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled());
97 MOZ_RELEASE_ASSERT(complex.isSome());
98 MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint);
99 MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end);
// The distinct error callbacks that the tests pass to DecodeOneUtf8CodePoint;
// each enumerator records that the corresponding callback was invoked.
enum class InvalidUtf8Reason {
  // The lead-unit callback (takes no arguments) fired.
  BadLeadUnit,
  // The callback receiving (units available, units needed) fired.
  NotEnoughUnits,
  // The callback receiving (units observed) fired.
  BadTrailingUnit,
  // The first callback receiving (bad code point, units observed) fired.
  BadCodePoint,
  // The second callback receiving (bad code point, units observed) fired.
  NotShortestForm,
};
110 template <typename Char, size_t N>
111 static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
112 InvalidUtf8Reason aExpectedReason,
113 uint8_t aExpectedUnitsAvailable,
114 uint8_t aExpectedUnitsNeeded,
115 char32_t aExpectedBadCodePoint,
116 uint8_t aExpectedUnitsObserved) {
117 MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
118 "array must be null-terminated for |aCharN + N - 1| to "
119 "compute the value of |aIter| as altered by "
120 "DecodeOneUtf8CodePoint");
122 ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
123 auto simple =
124 DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
125 MOZ_RELEASE_ASSERT(simple.isNothing());
126 MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);
128 EnumSet<InvalidUtf8Reason> reasons;
129 uint8_t unitsAvailable;
130 uint8_t unitsNeeded;
131 char32_t badCodePoint;
132 uint8_t unitsObserved;
134 struct OnNotShortestForm {
135 EnumSet<InvalidUtf8Reason>& reasons;
136 char32_t& badCodePoint;
137 uint8_t& unitsObserved;
139 void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) {
140 reasons += InvalidUtf8Reason::NotShortestForm;
141 badCodePoint = aBadCodePoint;
142 unitsObserved = aUnitsObserved;
146 ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
147 auto complex = DecodeOneUtf8CodePoint(
148 complexUnit.lead, &complexUnit.iter, complexUnit.end,
149 [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; },
150 [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
151 uint8_t aUnitsNeeded) {
152 reasons += InvalidUtf8Reason::NotEnoughUnits;
153 unitsAvailable = aUnitsAvailable;
154 unitsNeeded = aUnitsNeeded;
156 [&reasons, &unitsObserved](uint8_t aUnitsObserved) {
157 reasons += InvalidUtf8Reason::BadTrailingUnit;
158 unitsObserved = aUnitsObserved;
160 [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
161 uint8_t aUnitsObserved) {
162 reasons += InvalidUtf8Reason::BadCodePoint;
163 badCodePoint = aBadCodePoint;
164 unitsObserved = aUnitsObserved;
166 [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
167 uint8_t aUnitsObserved) {
168 reasons += InvalidUtf8Reason::NotShortestForm;
169 badCodePoint = aBadCodePoint;
170 unitsObserved = aUnitsObserved;
172 MOZ_RELEASE_ASSERT(complex.isNothing());
173 MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);
175 bool alreadyIterated = false;
176 for (InvalidUtf8Reason reason : reasons) {
177 MOZ_RELEASE_ASSERT(!alreadyIterated);
178 alreadyIterated = true;
180 switch (reason) {
181 case InvalidUtf8Reason::BadLeadUnit:
182 break;
184 case InvalidUtf8Reason::NotEnoughUnits:
185 MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
186 MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
187 break;
189 case InvalidUtf8Reason::BadTrailingUnit:
190 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
191 break;
193 case InvalidUtf8Reason::BadCodePoint:
194 MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
195 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
196 break;
198 case InvalidUtf8Reason::NotShortestForm:
199 MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
200 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
201 break;
206 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's
207 // a string literal or a more-generalized array, we require |aCharN| be
208 // null-terminated in all these functions.
210 template <typename Char, size_t N>
211 static void ExpectBadLeadUnit(const Char (&aCharN)[N]) {
212 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF,
213 0xFF, 0xFFFFFFFF, 0xFF);
216 template <typename Char, size_t N>
217 static void ExpectNotEnoughUnits(const Char (&aCharN)[N],
218 uint8_t aExpectedUnitsAvailable,
219 uint8_t aExpectedUnitsNeeded) {
220 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits,
221 aExpectedUnitsAvailable, aExpectedUnitsNeeded,
222 0xFFFFFFFF, 0xFF);
225 template <typename Char, size_t N>
226 static void ExpectBadTrailingUnit(const Char (&aCharN)[N],
227 uint8_t aExpectedUnitsObserved) {
228 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF,
229 0xFF, 0xFFFFFFFF, aExpectedUnitsObserved);
232 template <typename Char, size_t N>
233 static void ExpectNotShortestForm(const Char (&aCharN)[N],
234 char32_t aExpectedBadCodePoint,
235 uint8_t aExpectedUnitsObserved) {
236 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF,
237 0xFF, aExpectedBadCodePoint,
238 aExpectedUnitsObserved);
241 template <typename Char, size_t N>
242 static void ExpectBadCodePoint(const Char (&aCharN)[N],
243 char32_t aExpectedBadCodePoint,
244 uint8_t aExpectedUnitsObserved) {
245 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF,
246 0xFF, aExpectedBadCodePoint,
247 aExpectedUnitsObserved);
250 static void TestIsUtf8() {
251 // Note we include the U+0000 NULL in this one -- and that's fine.
252 static const char asciiBytes[] = u8"How about a nice game of chess?";
253 MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, ArrayLength(asciiBytes))));
255 static const char endNonAsciiBytes[] = u8"Life is like a 🌯";
256 MOZ_RELEASE_ASSERT(
257 IsUtf8(Span(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1)));
259 static const unsigned char badLeading[] = {0x80};
260 MOZ_RELEASE_ASSERT(
261 !IsUtf8(AsChars(Span(badLeading, ArrayLength(badLeading)))));
263 // Byte-counts
265 // 1
266 static const char oneBytes[] = u8"A"; // U+0041 LATIN CAPITAL LETTER A
267 constexpr size_t oneBytesLen = ArrayLength(oneBytes);
268 static_assert(oneBytesLen == 2, "U+0041 plus nul");
269 MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen)));
271 // 2
272 static const char twoBytes[] = u8"؆"; // U+0606 ARABIC-INDIC CUBE ROOT
273 constexpr size_t twoBytesLen = ArrayLength(twoBytes);
274 static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
275 MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen)));
277 ExpectValidCodePoint(twoBytes, 0x0606);
279 // 3
280 static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA
281 constexpr size_t threeBytesLen = ArrayLength(threeBytes);
282 static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
283 MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen)));
285 ExpectValidCodePoint(threeBytes, 0x1A1E);
287 // 4
288 static const char fourBytes[] =
289 u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06
290 constexpr size_t fourBytesLen = ArrayLength(fourBytes);
291 static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
292 MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen)));
294 ExpectValidCodePoint(fourBytes, 0x1F061);
296 // Max code point
297 static const char maxCodePoint[] = u8"􏿿"; // U+10FFFF
298 constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
299 static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
300 MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen)));
302 ExpectValidCodePoint(maxCodePoint, 0x10FFFF);
304 // One past max code point
305 static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80,
306 0x0};
307 constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint);
308 MOZ_RELEASE_ASSERT(
309 !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen))));
311 ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);
313 // Surrogate-related testing
315 // (Note that the various code unit sequences here are null-terminated to
316 // simplify life for ExpectValidCodePoint, which presumes null termination.)
318 static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0};
319 constexpr size_t justBeforeSurrogatesLen =
320 ArrayLength(justBeforeSurrogates) - 1;
321 MOZ_RELEASE_ASSERT(
322 IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen))));
324 ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);
326 static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0};
327 constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1;
328 MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen))));
330 ExpectBadCodePoint(leastSurrogate, 0xD800, 3);
332 static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0};
333 constexpr size_t arbitraryHighSurrogateLen =
334 ArrayLength(arbitraryHighSurrogate) - 1;
335 MOZ_RELEASE_ASSERT(!IsUtf8(
336 AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen))));
338 ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);
340 static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0};
341 constexpr size_t arbitraryLowSurrogateLen =
342 ArrayLength(arbitraryLowSurrogate) - 1;
343 MOZ_RELEASE_ASSERT(
344 !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen))));
346 ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);
348 static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0};
349 constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1;
350 MOZ_RELEASE_ASSERT(
351 !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen))));
353 ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);
355 static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0};
356 constexpr size_t justAfterSurrogatesLen =
357 ArrayLength(justAfterSurrogates) - 1;
358 MOZ_RELEASE_ASSERT(
359 IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen))));
361 ExpectValidCodePoint(justAfterSurrogates, 0xE000);
364 static void TestDecodeOneValidUtf8CodePoint() {
365 // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
366 // consist of multiple code units, so there are no ASCII tests below.
368 // Length two.
370 ExpectValidCodePoint(u8"€", 0x80); // <control>
371 ExpectValidCodePoint(u8"©", 0xA9); // COPYRIGHT SIGN
372 ExpectValidCodePoint(u8"¶", 0xB6); // PILCROW SIGN
373 ExpectValidCodePoint(u8"¾", 0xBE); // VULGAR FRACTION THREE QUARTERS
374 ExpectValidCodePoint(u8"÷", 0xF7); // DIVISION SIGN
375 ExpectValidCodePoint(u8"ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS
376 ExpectValidCodePoint(u8"Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON
377 ExpectValidCodePoint(u8"IJ", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ
378 ExpectValidCodePoint(u8"ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
379 ExpectValidCodePoint(u8"Ӝ",
380 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS
381 ExpectValidCodePoint(u8"۩", 0x6E9); // ARABIC PLACE OF SAJDAH
382 ExpectValidCodePoint(u8"߿", 0x7FF); // <not assigned>
384 // Length three.
386 ExpectValidCodePoint(u8"ࠀ", 0x800); // SAMARITAN LETTER ALAF
387 ExpectValidCodePoint(u8"ࡁ", 0x841); // MANDAIC LETTER AB
388 ExpectValidCodePoint(u8"ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA
389 ExpectValidCodePoint(u8"ஆ", 0xB86); // TAMIL LETTER AA
390 ExpectValidCodePoint(u8"༃",
391 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
392 ExpectValidCodePoint(
393 u8"࿉",
394 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like
395 // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
396 ExpectValidCodePoint(u8"ဪ", 0x102A); // MYANMAR LETTER AU
397 ExpectValidCodePoint(u8"ᚏ", 0x168F); // OGHAM LETTER RUIS
398 ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR
399 ExpectValidCodePoint("\xE2\x80\xA9",
400 0x2029); // (the hated) PARAGRAPH SEPARATOR
401 ExpectValidCodePoint(u8"☬", 0x262C); // ADI SHAKTI
402 ExpectValidCodePoint(u8"㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE
403 ExpectValidCodePoint(u8"㏖", 0x33D6); // SQUARE MOL
404 ExpectValidCodePoint(u8"ꔄ", 0xA504); // VAI SYLLABLE WEEN
405 ExpectValidCodePoint(u8"ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
406 ExpectValidCodePoint(u8"퟿", 0xD7FF); // <not assigned>
407 ExpectValidCodePoint(u8"", 0xE000); // <Private Use>
408 ExpectValidCodePoint(u8"鱗", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F
409 ExpectValidCodePoint(
410 u8"﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
411 ExpectValidCodePoint(u8"￿", 0xFFFF); // <not assigned>
413 // Length four.
414 ExpectValidCodePoint(u8"𐀀", 0x10000); // LINEAR B SYLLABLE B008 A
415 ExpectValidCodePoint(u8"𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058
416 ExpectValidCodePoint(u8"𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI
417 ExpectValidCodePoint(u8"💩", 0x1F4A9); // PILE OF POO
418 ExpectValidCodePoint(u8"🔫", 0x1F52B); // PISTOL
419 ExpectValidCodePoint(u8"🥌", 0x1F94C); // CURLING STONE
420 ExpectValidCodePoint(u8"🥏", 0x1F94F); // FLYING DISC
421 ExpectValidCodePoint(u8"𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346
422 ExpectValidCodePoint(u8"𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A
423 ExpectValidCodePoint(u8"񁟶", 0x417F6); // <not assigned>
424 ExpectValidCodePoint(u8"񾠶", 0x7E836); // <not assigned>
425 ExpectValidCodePoint(u8"󾽧", 0xFEF67); // <Plane 15 Private Use>
426 ExpectValidCodePoint(u8"􏿿", 0x10FFFF); //
429 static void TestDecodeBadLeadUnit() {
430 // These tests are actually exhaustive.
432 unsigned char badLead[] = {'\0', '\0'};
434 for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) {
435 badLead[0] = lead;
436 ExpectBadLeadUnit(badLead);
440 uint8_t lead = 0b1111'1000;
441 do {
442 badLead[0] = lead;
443 ExpectBadLeadUnit(badLead);
444 if (lead == 0b1111'1111) {
445 break;
448 lead++;
449 } while (true);
453 static void TestTooFewOrBadTrailingUnits() {
454 // Lead unit indicates a two-byte code point.
456 char truncatedTwo[] = {'\0', '\0'};
457 char badTrailTwo[] = {'\0', '\0', '\0'};
459 for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
460 truncatedTwo[0] = lead;
461 ExpectNotEnoughUnits(truncatedTwo, 1, 2);
463 badTrailTwo[0] = lead;
464 for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
465 badTrailTwo[1] = trail;
466 ExpectBadTrailingUnit(badTrailTwo, 2);
469 for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) {
470 badTrailTwo[1] = trail;
471 ExpectBadTrailingUnit(badTrailTwo, 2);
475 // Lead unit indicates a three-byte code point.
477 char truncatedThreeOne[] = {'\0', '\0'};
478 char truncatedThreeTwo[] = {'\0', '\0', '\0'};
479 unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'};
481 for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
482 truncatedThreeOne[0] = lead;
483 ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);
485 truncatedThreeTwo[0] = lead;
486 ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);
488 badTrailThree[0] = lead;
489 badTrailThree[2] = 0b1011'1111; // make valid to test overreads
490 for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
491 badTrailThree[1] = mid;
492 ExpectBadTrailingUnit(badTrailThree, 2);
495 uint8_t mid = 0b1100'0000;
496 do {
497 badTrailThree[1] = mid;
498 ExpectBadTrailingUnit(badTrailThree, 2);
499 if (mid == 0b1111'1111) {
500 break;
503 mid++;
504 } while (true);
507 badTrailThree[1] = 0b1011'1111;
508 for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
509 badTrailThree[2] = last;
510 ExpectBadTrailingUnit(badTrailThree, 3);
513 uint8_t last = 0b1100'0000;
514 do {
515 badTrailThree[2] = last;
516 ExpectBadTrailingUnit(badTrailThree, 3);
517 if (last == 0b1111'1111) {
518 break;
521 last++;
522 } while (true);
526 // Lead unit indicates a four-byte code point.
528 char truncatedFourOne[] = {'\0', '\0'};
529 char truncatedFourTwo[] = {'\0', '\0', '\0'};
530 char truncatedFourThree[] = {'\0', '\0', '\0', '\0'};
532 unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'};
534 for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
535 truncatedFourOne[0] = lead;
536 ExpectNotEnoughUnits(truncatedFourOne, 1, 4);
538 truncatedFourTwo[0] = lead;
539 ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);
541 truncatedFourThree[0] = lead;
542 ExpectNotEnoughUnits(truncatedFourThree, 3, 4);
544 badTrailFour[0] = lead;
545 badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads
546 for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
547 badTrailFour[1] = second;
548 ExpectBadTrailingUnit(badTrailFour, 2);
551 uint8_t second = 0b1100'0000;
552 do {
553 badTrailFour[1] = second;
554 ExpectBadTrailingUnit(badTrailFour, 2);
555 if (second == 0b1111'1111) {
556 break;
559 second++;
560 } while (true);
563 badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads
564 for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
565 badTrailFour[2] = third;
566 ExpectBadTrailingUnit(badTrailFour, 3);
569 uint8_t third = 0b1100'0000;
570 do {
571 badTrailFour[2] = third;
572 ExpectBadTrailingUnit(badTrailFour, 3);
573 if (third == 0b1111'1111) {
574 break;
577 third++;
578 } while (true);
581 badTrailFour[2] = 0b1011'1111;
582 for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
583 badTrailFour[3] = fourth;
584 ExpectBadTrailingUnit(badTrailFour, 4);
587 uint8_t fourth = 0b1100'0000;
588 do {
589 badTrailFour[3] = fourth;
590 ExpectBadTrailingUnit(badTrailFour, 4);
591 if (fourth == 0b1111'1111) {
592 break;
595 fourth++;
596 } while (true);
601 static void TestBadSurrogate() {
602 // These tests are actually exhaustive.
604 ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates
605 ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates
607 // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }. Last invalid
608 // surrogate encoding is { 0xED, 0xBF, 0xBF }.
610 char badSurrogate[] = {'\xED', '\0', '\0', '\0'};
612 for (char32_t c = 0xD800; c < 0xE000; c++) {
613 badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6);
614 badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
616 ExpectBadCodePoint(badSurrogate, c, 3);
620 static void TestBadTooBig() {
621 // These tests are actually exhaustive.
623 ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point
625 // Four-byte code points are
627 // 0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
629 // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally
630 // representable limit (exclusive) is 2**21 - 1 == 2097152.
632 char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'};
634 for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
635 tooLargeCodePoint[0] =
636 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
637 tooLargeCodePoint[1] =
638 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
639 tooLargeCodePoint[2] =
640 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
641 tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
643 ExpectBadCodePoint(tooLargeCodePoint, c, 4);
647 static void TestBadCodePoint() {
648 TestBadSurrogate();
649 TestBadTooBig();
652 static void TestNotShortestForm() {
654 // One-byte in two-byte.
656 char oneInTwo[] = {'\0', '\0', '\0'};
658 for (char32_t c = '\0'; c < 0x80; c++) {
659 oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6);
660 oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
662 ExpectNotShortestForm(oneInTwo, c, 2);
665 // One-byte in three-byte.
667 char oneInThree[] = {'\0', '\0', '\0', '\0'};
669 for (char32_t c = '\0'; c < 0x80; c++) {
670 oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
671 oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
672 oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
674 ExpectNotShortestForm(oneInThree, c, 3);
677 // One-byte in four-byte.
679 char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'};
681 for (char32_t c = '\0'; c < 0x80; c++) {
682 oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
683 oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
684 oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
685 oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
687 ExpectNotShortestForm(oneInFour, c, 4);
692 // Two-byte in three-byte.
694 char twoInThree[] = {'\0', '\0', '\0', '\0'};
696 for (char32_t c = 0x80; c < 0x800; c++) {
697 twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
698 twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
699 twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
701 ExpectNotShortestForm(twoInThree, c, 3);
704 // Two-byte in four-byte.
706 char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'};
708 for (char32_t c = 0x80; c < 0x800; c++) {
709 twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
710 twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
711 twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
712 twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
714 ExpectNotShortestForm(twoInFour, c, 4);
719 // Three-byte in four-byte.
721 char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'};
723 for (char32_t c = 0x800; c < 0x1'0000; c++) {
724 threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
725 threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
726 threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
727 threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
729 ExpectNotShortestForm(threeInFour, c, 4);
734 static void TestDecodeOneInvalidUtf8CodePoint() {
735 TestDecodeBadLeadUnit();
736 TestTooFewOrBadTrailingUnits();
737 TestBadCodePoint();
738 TestNotShortestForm();
741 static void TestDecodeOneUtf8CodePoint() {
742 TestDecodeOneValidUtf8CodePoint();
743 TestDecodeOneInvalidUtf8CodePoint();
746 int main() {
747 TestUtf8Unit();
748 TestIsUtf8();
749 TestDecodeOneUtf8CodePoint();
750 return 0;
753 #if defined(__clang__) && (__clang_major__ >= 6)
754 # pragma clang diagnostic pop
755 #endif