mfbt/Utf8.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 /*
   8  * UTF-8-related functionality, including a type-safe structure representing a
   9  * UTF-8 code unit.
  10  */
  11
  12 #ifndef mozilla_Utf8_h
  13 #define mozilla_Utf8_h
  14
  15 #include "mozilla/Casting.h"    // for mozilla::AssertedCast
  16 #include "mozilla/Likely.h"     // for MOZ_UNLIKELY
  17 #include "mozilla/Maybe.h"      // for mozilla::Maybe
  18 #include "mozilla/Span.h"       // for mozilla::Span
  19 #include "mozilla/TextUtils.h"  // for mozilla::IsAscii and via Latin1.h for
  20                                 // encoding_rs_mem.h and MOZ_HAS_JSRUST.
  21 #include "mozilla/Types.h"      // for MFBT_API
  22
  23 #include <limits>    // for std::numeric_limits
  24 #include <limits.h>  // for CHAR_BIT
  25 #include <stddef.h>  // for size_t
  26 #include <stdint.h>  // for uint8_t
  27
  28 #if MOZ_HAS_JSRUST()
  29 // Can't include mozilla/Encoding.h here.
  30 extern "C" {
  31 // Declared as uint8_t instead of char to match declaration in another header.
  32 size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
  33 }
  34 #else
  35 namespace mozilla {
  36 namespace detail {
  37 extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount);
  38 };  // namespace detail
  39 };  // namespace mozilla
  40 #endif  // MOZ_HAS_JSRUST
  41
  42 namespace mozilla {
  43
  44 union Utf8Unit;
  45
  46 static_assert(CHAR_BIT == 8,
  47               "Utf8Unit won't work so well with non-octet chars");
  48
  49 /**
  50  * A code unit within a UTF-8 encoded string.  (A code unit is the smallest
  51  * unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
  52  * number; for UTF-16 it would be a 16-bit number.)
  53  *
  54  * This is *not* the same as a single code point: in UTF-8, non-ASCII code
  55  * points are constituted by multiple code units.
  56  */
  57 union Utf8Unit {
  58  private:
  59   // Utf8Unit is a union wrapping a raw |char|.  The C++ object model and C++
  60   // requirements as to how objects may be accessed with respect to their actual
  61   // types (almost?) uniquely compel this choice.
  62   //
  63   // Our requirements for a UTF-8 code unit representation are:
  64   //
  65   //   1. It must be "compatible" with C++ character/string literals that use
  66   //      the UTF-8 encoding.  Given a properly encoded C++ literal, you should
  67   //      be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
  68   //      and friends (particularly UnicodeData), you should be able to access
  69   //      C++ character types for their contents.
  70   //   2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
  71   //      explicit operation.
  72   //   3. |Utf8Unit| must participate in overload resolution and template type
  73   //      equivalence (that is, given |template<class> class X|, when |X<T>| and
  74   //      |X<U>| are the same type) distinctly from the C++ character types.
  75   //
  76   // And a few nice-to-haves (at least for the moment):
  77   //
  78   //   4. The representation should use unsigned numbers, to avoid undefined
  79   //      behavior that can arise with signed types, and because Unicode code
  80   //      points and code units are unsigned.
  81   //   5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
  82   //      and |unsigned char*|, for APIs that (because of #4 above) use those
  83   //      types as the "natural" choice for UTF-8 data.
  84   //
  85   // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
  86   // |{,{un,}signed} char|.[0]  |uint8_t| won't work because it might not be a
  87   // C++ character type.
  88   //
  89   // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
  90   // typedefs don't generate *new* types, just type aliases).  This requires a
  91   // compound type.
  92   //
  93   // The ultimate representation (and character type in it) is constrained by
  94   // C++14 [basic.lval]p10 that defines how objects may be accessed, with
  95   // respect to the dynamic type in memory and the actual type used to access
  96   // them.  It reads:
  97   //
  98   //     If a program attempts to access the stored value of an object
  99   //     through a glvalue of other than one of the following types the
 100   //     behavior is undefined:
 101   //
 102   //       1. the dynamic type of the object,
 103   //       2. a cv-qualified version of the dynamic type of the object,
 104   //       ...other types irrelevant here...
 105   //       3. an aggregate or union type that includes one of the
 106   //          aforementioned types among its elements or non-static data
 107   //          members (including, recursively, an element or non-static
 108   //          data member of a subaggregate or contained union),
 109   //       ...more irrelevant types...
 110   //       4. a char or unsigned char type.
 111   //
 112   // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
 113   // matter the representation by #4.  (Briefly set aside what values are seen.)
 114   // (And #2 allows |const| on either the dynamic type or the accessing type.)
 115   // (|signed char| is really only useful for small signed numbers, not
 116   // characters, so we ignore it.)
 117   //
 118   // If we interpret contents as |char|/|unsigned char| contrary to the actual
 119   // type stored there, what happens?  C++14 [basic.fundamental]p1 requires
 120   // character types be identically aligned/sized; C++14 [basic.fundamental]p3
 121   // requires |signed char| and |unsigned char| have the same value
 122   // representation.  C++ doesn't require identical bitwise representation, tho.
 123   // Practically we could assume it, but this verges on C++ spec bits best not
 124   // *relied* on for correctness, if possible.
 125   //
 126   // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
 127   // and |char*|.  Instead we safely expose |unsigned char| by fully-defined
 128   // *integral conversion* (C++14 [conv.integral]p2).  Integral conversion from
 129   // |unsigned char| → |char| has only implementation-defined behavior.  It'd be
 130   // better not to depend on that, but given twos-complement won, it should be
 131   // okay.  (Also |unsigned char*| is awkward enough to work with for strings
 132   // that it probably doesn't appear in string manipulation much anyway, only in
 133   // places that should really use |Utf8Unit| directly.)
 134   //
 135   // The opposite direction -- interpreting |char| or |char*| data through
 136   // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
 137   // decided above, using #3.  An "aggregate or union" will work that contains a
 138   // |char|.  Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
 139   // aggregates must have "no private or protected non-static data members", and
 140   // we want to keep the inner |char| hidden.  So a |struct| is out, and only
 141   // |union| remains.
 142   //
 143   // (Enums are not "an aggregate or union type", so [maybe surprisingly] we
 144   // can't make |Utf8Unit| an enum class with |char| underlying type, because we
 145   // are given no license to treat |char| memory as such an |enum|'s memory.)
 146   //
 147   // Therefore |Utf8Unit| is a union type with a |char| non-static data member.
 148   // This satisfies all our requirements.  It also supports the nice-to-haves of
 149   // creating a |Utf8Unit| from an |unsigned char|, and being convertible to
 150   // |unsigned char|.  It doesn't satisfy the nice-to-haves of using an
 151   // |unsigned char| internally, nor of letting us wrap an existing
 152   // |unsigned char| or pointer to one.  We probably *could* do these, if we
 153   // were willing to rely harder on implementation-defined behaviors, but for
 154   // now we privilege C++'s main character type over some conceptual purity.
 155   //
 156   // 0. There's a proposal for a UTF-8 character type distinct from the existing
 157   //    C++ narrow character types:
 158   //
 159   //      http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
 160   //
 161   //    but it hasn't been standardized (and might never be), and none of the
 162   //    compilers we really care about have implemented it.  Maybe someday we
 163   //    can change our implementation to it without too much trouble, if we're
 164   //    lucky...
 165   char mValue = '\0';
 166
 167  public:
 168   Utf8Unit() = default;
 169
 170   explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}
 171
 172   explicit constexpr Utf8Unit(unsigned char aUnit)
 173       : mValue(static_cast<char>(aUnit)) {
 174     // Per the above comment, the prior cast is integral conversion with
 175     // implementation-defined semantics, and we regretfully but unavoidably
 176     // assume the conversion does what we want it to.
 177   }
 178
 179 #ifdef __cpp_char8_t
 180   explicit constexpr Utf8Unit(char8_t aUnit)
 181       : mValue(static_cast<char>(aUnit)) {}
 182 #endif
 183
 184   constexpr bool operator==(const Utf8Unit& aOther) const {
 185     return mValue == aOther.mValue;
 186   }
 187
 188   constexpr bool operator!=(const Utf8Unit& aOther) const {
 189     return !(*this == aOther);
 190   }
 191
 192   /** Convert a UTF-8 code unit to a raw char. */
 193   constexpr char toChar() const {
 194     // Only a |char| is ever permitted to be written into this location, so this
 195     // is both permissible and returns the desired value.
 196     return mValue;
 197   }
 198
 199   /** Convert a UTF-8 code unit to a raw unsigned char. */
 200   constexpr unsigned char toUnsignedChar() const {
 201     // Per the above comment, this is well-defined integral conversion.
 202     return static_cast<unsigned char>(mValue);
 203   }
 204
 205   /** Convert a UTF-8 code unit to a uint8_t. */
 206   constexpr uint8_t toUint8() const {
 207     // Per the above comment, this is well-defined integral conversion.
 208     return static_cast<uint8_t>(mValue);
 209   }
 210
 211   // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
 212   // that's a somewhat separate concern, justified in different comments in
 213   // that other code.
 214 };
 215
 216 /**
 217  * Reinterpret the address of a UTF-8 code unit as |const unsigned char*|.
 218  *
 219  * Assuming proper backing has been set up, the resulting |const unsigned char*|
 220  * may validly be dereferenced.
 221  *
 222  * No access is provided to mutate this underlying memory as |unsigned char|.
 223  * Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are
 224  * loath to offer a way to write non-|char| data until absolutely necessary.
 225  */
 226 inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) {
 227   static_assert(sizeof(Utf8Unit) == sizeof(unsigned char),
 228                 "sizes must match to permissibly reinterpret_cast<>");
 229   static_assert(alignof(Utf8Unit) == alignof(unsigned char),
 230                 "alignment must match to permissibly reinterpret_cast<>");
 231
 232   // The static_asserts above only enable the reinterpret_cast<> to occur.
 233   //
 234   // Dereferencing the resulting pointer is a separate question.  Any object's
 235   // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
 236   // this doesn't guarantee what values will be observed.  If |char| is
 237   // implemented to act like |unsigned char|, we're good to go: memory for the
 238   // |char| in |Utf8Unit| acts as we need.  But if |char| is implemented to act
 239   // like |signed char|, dereferencing produces the right value only if the
 240   // |char| types all use two's-complement representation.  Every modern
 241   // compiler does this, and there's a C++ proposal to standardize it.
 242   // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html   So
 243   // *technically* this is implementation-defined -- but everyone does it and
 244   // this behavior is being standardized.
 245   return reinterpret_cast<const unsigned char*>(aUnits);
 246 }
 247
 248 /** Returns true iff |aUnit| is an ASCII value. */
 249 constexpr bool IsAscii(Utf8Unit aUnit) {
 250   return IsAscii(aUnit.toUnsignedChar());
 251 }
 252
 253 /**
 254  * Return true if the given span of memory consists of a valid UTF-8
 255  * string and false otherwise.
 256  *
 257  * The string *may* contain U+0000 NULL code points.
 258  */
 259 inline bool IsUtf8(mozilla::Span<const char> aString) {
 260 #if MOZ_HAS_JSRUST()
 261   size_t length = aString.Length();
 262   const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
 263   // For short strings, the function call is a pessimization, and the SIMD
 264   // code won't have a chance to kick in anyway.
 265   if (length < 16) {
 266     for (size_t i = 0; i < length; i++) {
 267       if (ptr[i] >= 0x80U) {
 268         ptr += i;
 269         length -= i;
 270         goto end;
 271       }
 272     }
 273     return true;
 274   }
 275 end:
 276   return length == encoding_utf8_valid_up_to(ptr, length);
 277 #else
 278   return detail::IsValidUtf8(aString.Elements(), aString.Length());
 279 #endif
 280 }
 281
 282 #if MOZ_HAS_JSRUST()
 283
 284 // See Latin1.h for conversions between Latin1 and UTF-8.
 285
 286 /**
 287  * Returns the index of the start of the first malformed byte
 288  * sequence or the length of the string if there are none.
 289  */
 290 inline size_t Utf8ValidUpTo(mozilla::Span<const char> aString) {
 291   return encoding_utf8_valid_up_to(
 292       reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
 293 }
 294
 295 /**
 296  * Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates
 297  * with the REPLACEMENT CHARACTER.
 298  *
 299  * The length of aDest must be at least the length of aSource times three.
 300  *
 301  * Returns the number of code units written.
 302  */
 303 inline size_t ConvertUtf16toUtf8(mozilla::Span<const char16_t> aSource,
 304                                  mozilla::Span<char> aDest) {
 305   return encoding_mem_convert_utf16_to_utf8(
 306       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 307 }
 308
 309 /**
 310  * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
 311  * sequences with the REPLACEMENT CHARACTER with potentially insufficient
 312  * output space.
 313  *
 314  * Returns the number of code units read and the number of bytes written.
 315  *
 316  * If the output isn't large enough, not all input is consumed.
 317  *
 318  * The conversion is guaranteed to be complete if the length of aDest is
 319  * at least the length of aSource times three.
 320  *
 321  * The output is always valid UTF-8 ending on scalar value boundary
 322  * even in the case of partial conversion.
 323  *
 324  * The semantics of this function match the semantics of
 325  * TextEncoder.encodeInto.
 326  * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
 327  */
 328 inline std::tuple<size_t, size_t> ConvertUtf16toUtf8Partial(
 329     mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest) {
 330   size_t srcLen = aSource.Length();
 331   size_t dstLen = aDest.Length();
 332   encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen,
 333                                              aDest.Elements(), &dstLen);
 334   return std::make_tuple(srcLen, dstLen);
 335 }
 336
 337 /**
 338  * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
 339  * sequences with the REPLACEMENT CHARACTER.
 340  *
 341  * Returns the number of code units written.
 342  *
 343  * The length of aDest must be at least one greater than the length of aSource
 344  * even though the last slot isn't written to.
 345  *
 346  * If you know that the input is valid for sure, use
 347  * UnsafeConvertValidUtf8toUtf16() instead.
 348  */
 349 inline size_t ConvertUtf8toUtf16(mozilla::Span<const char> aSource,
 350                                  mozilla::Span<char16_t> aDest) {
 351   return encoding_mem_convert_utf8_to_utf16(
 352       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 353 }
 354
 355 /**
 356  * Converts known-valid UTF-8 to UTF-16. If the input might be invalid,
 357  * use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead.
 358  *
 359  * Returns the number of code units written.
 360  *
 361  * The length of aDest must be at least the length of aSource.
 362  */
 363 inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span<const char> aSource,
 364                                             mozilla::Span<char16_t> aDest) {
 365   return encoding_mem_convert_str_to_utf16(aSource.Elements(), aSource.Length(),
 366                                            aDest.Elements(), aDest.Length());
 367 }
 368
 369 /**
 370  * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
 371  *
 372  * Returns the number of code units written or `mozilla::Nothing` if the
 373  * input was invalid.
 374  *
 375  * The length of the destination buffer must be at least the length of the
 376  * source buffer.
 377  *
 378  * When the input was invalid, some output may have been written.
 379  *
 380  * If you know that the input is valid for sure, use
 381  * UnsafeConvertValidUtf8toUtf16() instead.
 382  */
 383 inline mozilla::Maybe<size_t> ConvertUtf8toUtf16WithoutReplacement(
 384     mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest) {
 385   size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement(
 386       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 387   if (MOZ_UNLIKELY(written == std::numeric_limits<size_t>::max())) {
 388     return mozilla::Nothing();
 389   }
 390   return mozilla::Some(written);
 391 }
 392
 393 #endif  // MOZ_HAS_JSRUST
 394
 395 /**
 396  * Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
 397  * 0b10xx'xxxx.
 398  */
 399 inline bool IsTrailingUnit(Utf8Unit aUnit) {
 400   return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
 401 }
 402
 403 /**
 404  * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
 405  * that (initially) itself points one unit past |aLeadUnit|, and
 406  * |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared
 407  * against |*aIter| using |aEnd - *aIter|:
 408  *
 409  * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
 410  * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
 411  * surrogate, in shortest form -- then return Some(that code point) and advance
 412  * |*aIter| past those code units.
 413  *
 414  * Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return
 415  * Nothing().
 416  *
 417  * |Iter| and |EndIter| are generalized concepts most easily understood as if
 418  * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
 419  * iterators that when dereferenced can be used to construct a |Utf8Unit| and
 420  * that can be compared and modified in certain limited ways.  (Carefully note
 421  * that this function mutates |*aIter|.)  |Iter| and |EndIter| are template
 422  * parameters to support more-complicated adaptor iterators.
 423  *
 424  * The template parameters after |Iter| allow users to implement custom handling
 425  * for various forms of invalid UTF-8.  A version of this function that defaults
 426  * all such handling to no-ops is defined below this function.  To learn how to
 427  * define your own custom handling, consult the implementation of that function,
 428  * which documents exactly how custom handler functors are invoked.
 429  *
 430  * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
 431  * of this function without the "Inline" suffix on the name.
 432  */
 433 template <typename Iter, typename EndIter, class OnBadLeadUnit,
 434           class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
 435           class OnNotShortestForm>
 436 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
 437     const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
 438     OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
 439     OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
 440     OnNotShortestForm aOnNotShortestForm) {
 441   MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);
 442
 443   char32_t n = aLeadUnit.toUint8();
 444   MOZ_ASSERT(!IsAscii(n));
 445
 446   // |aLeadUnit| determines the number of trailing code units in the code point
 447   // and the bits of |aLeadUnit| that contribute to the code point's value.
 448   uint8_t remaining;
 449   uint32_t min;
 450   if ((n & 0b1110'0000) == 0b1100'0000) {
 451     remaining = 1;
 452     min = 0x80;
 453     n &= 0b0001'1111;
 454   } else if ((n & 0b1111'0000) == 0b1110'0000) {
 455     remaining = 2;
 456     min = 0x800;
 457     n &= 0b0000'1111;
 458   } else if ((n & 0b1111'1000) == 0b1111'0000) {
 459     remaining = 3;
 460     min = 0x10000;
 461     n &= 0b0000'0111;
 462   } else {
 463     *aIter -= 1;
 464     aOnBadLeadUnit();
 465     return Nothing();
 466   }
 467
 468   // If the code point would require more code units than remain, the encoding
 469   // is invalid.
 470   auto actual = aEnd - *aIter;
 471   if (MOZ_UNLIKELY(actual < remaining)) {
 472     *aIter -= 1;
 473     aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1);
 474     return Nothing();
 475   }
 476
 477   for (uint8_t i = 0; i < remaining; i++) {
 478     const Utf8Unit unit(*(*aIter)++);
 479
 480     // Every non-leading code unit in properly encoded UTF-8 has its high
 481     // bit set and the next-highest bit unset.
 482     if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
 483       uint8_t unitsObserved = i + 1 + 1;
 484       *aIter -= unitsObserved;
 485       aOnBadTrailingUnit(unitsObserved);
 486       return Nothing();
 487     }
 488
 489     // The code point being encoded is the concatenation of all the
 490     // unconstrained bits.
 491     n = (n << 6) | (unit.toUint8() & 0b0011'1111);
 492   }
 493
 494   // UTF-16 surrogates and values outside the Unicode range are invalid.
 495   if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) {
 496     uint8_t unitsObserved = remaining + 1;
 497     *aIter -= unitsObserved;
 498     aOnBadCodePoint(n, unitsObserved);
 499     return Nothing();
 500   }
 501
 502   // Overlong code points are also invalid.
 503   if (MOZ_UNLIKELY(n < min)) {
 504     uint8_t unitsObserved = remaining + 1;
 505     *aIter -= unitsObserved;
 506     aOnNotShortestForm(n, unitsObserved);
 507     return Nothing();
 508   }
 509
 510   return Some(n);
 511 }
 512
 513 /**
 514  * Identical to the above function, but not forced to be instantiated inline --
 515  * the compiler is permitted to common up separate invocations if it chooses.
 516  */
 517 template <typename Iter, typename EndIter, class OnBadLeadUnit,
 518           class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
 519           class OnNotShortestForm>
 520 inline Maybe<char32_t> DecodeOneUtf8CodePoint(
 521     const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
 522     OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
 523     OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
 524     OnNotShortestForm aOnNotShortestForm) {
 525   return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit,
 526                                       aOnNotEnoughUnits, aOnBadTrailingUnit,
 527                                       aOnBadCodePoint, aOnNotShortestForm);
 528 }
 529
 530 /**
 531  * Like the always-inlined function above, but with no-op behavior from all
 532  * trailing if-invalid notifier functors.
 533  *
 534  * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
 535  * of this function without the "Inline" suffix on the name.
 536  */
 537 template <typename Iter, typename EndIter>
 538 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
 539     const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) {
 540   // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
 541   // a multi-unit code point.  It is passed no arguments: the caller already has
 542   // |aLeadUnit| on hand, so no need to provide it again.
 543   auto onBadLeadUnit = []() {};
 544
 545   // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
 546   // point length, but there aren't enough units from |*aIter| to |aEnd| to
 547   // satisfy that length.  It is passed the number of code units actually
 548   // available (according to |aEnd - *aIter|) and the number of code units that
 549   // |aLeadUnit| indicates are needed.  Both numbers include the contribution
 550   // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
 551   // |aUnitsAvailable < aUnitsNeeded|.  As above, it also is not passed the lead
 552   // code unit.
 553   auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {};
 554
 555   // aOnBadTrailingUnit is called when one of the trailing code units implied by
 556   // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
 557   // trailing code units must satisfy.  It is passed the total count of units
 558   // observed (including |aLeadUnit|).  The bad trailing code unit will
 559   // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
 560   // called, and so |aUnitsObserved <= 4|.
 561   auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {};
 562
 563   // aOnBadCodePoint is called when a structurally-correct code point encoding
 564   // is found, but the *value* that is encoded is not a valid code point: either
 565   // because it exceeded the U+10FFFF Unicode maximum code point, or because it
 566   // was a UTF-16 surrogate.  It is passed the non-code point value and the
 567   // number of code units used to encode it.
 568   auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};
 569
 570   // aOnNotShortestForm is called when structurally-correct encoding is found,
 571   // but the encoded value should have been encoded in fewer code units (e.g.
 572   // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
 573   // as 0b0000'0000).  It is passed the mis-encoded code point (which will be
 574   // valid and not a surrogate) and the count of code units that mis-encoded it.
 575   auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {
 576   };
 577
 578   return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit,
 579                                       onNotEnoughUnits, onBadTrailingUnit,
 580                                       onBadCodePoint, onNotShortestForm);
 581 }
 582
 583 /**
 584  * Identical to the above function, but not forced to be instantiated inline --
 585  * the compiler/linker are allowed to common up separate invocations.
 586  */
 587 template <typename Iter, typename EndIter>
 588 inline Maybe<char32_t> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
 589                                               Iter* aIter,
 590                                               const EndIter& aEnd) {
 591   return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd);
 592 }
 593
 594 }  // namespace mozilla
 595
 596 #endif /* mozilla_Utf8_h */