1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
 * UTF-8-related functionality, including a type-safe structure representing a
 * UTF-8 code unit.
 */
12 #ifndef mozilla_Utf8_h
13 #define mozilla_Utf8_h
15 #include "mozilla/Casting.h" // for mozilla::AssertedCast
16 #include "mozilla/Likely.h" // for MOZ_UNLIKELY
17 #include "mozilla/Maybe.h" // for mozilla::Maybe
18 #include "mozilla/TextUtils.h" // for mozilla::IsAscii
19 #include "mozilla/Types.h" // for MFBT_API
21 #include <limits.h> // for CHAR_BIT
22 #include <stddef.h> // for size_t
23 #include <stdint.h> // for uint8_t
// Everything in this header treats one |char| as exactly one 8-bit UTF-8 code
// unit, so refuse to compile on platforms where that does not hold.
static_assert(CHAR_BIT == 8, "Utf8Unit won't work so well with non-octet chars");
/**
 * A code unit within a UTF-8 encoded string.  (A code unit is the smallest
 * unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
 * number; for UTF-16 it would be a 16-bit number.)
 *
 * This is *not* the same as a single code point: in UTF-8, non-ASCII code
 * points are constituted by multiple code units.
 */
42 // Utf8Unit is a union wrapping a raw |char|. The C++ object model and C++
43 // requirements as to how objects may be accessed with respect to their actual
44 // types (almost?) uniquely compel this choice.
46 // Our requirements for a UTF-8 code unit representation are:
48 // 1. It must be "compatible" with C++ character/string literals that use
49 // the UTF-8 encoding. Given a properly encoded C++ literal, you should
50 // be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
51 // and friends (particularly UnicodeData), you should be able to access
52 // C++ character types for their contents.
53 // 2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
54 // explicit operation.
55 // 3. |Utf8Unit| must participate in overload resolution and template type
56 // equivalence (that is, given |template<class> class X|, when |X<T>| and
57 // |X<U>| are the same type) distinctly from the C++ character types.
59 // And a few nice-to-haves (at least for the moment):
61 // 4. The representation should use unsigned numbers, to avoid undefined
62 // behavior that can arise with signed types, and because Unicode code
63 // points and code units are unsigned.
64 // 5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
65 // and |unsigned char*|, for APIs that (because of #4 above) use those
66 // types as the "natural" choice for UTF-8 data.
68 // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
69 // |{,{un,}signed} char|.[0] |uint8_t| won't work because it might not be a
70 // C++ character type.
// #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
// typedefs don't generate *new* types, just type aliases).  This requires a
// compound type.
// The ultimate representation (and character type in it) is constrained by
// C++14 [basic.lval]p10 that defines how objects may be accessed, with
// respect to the dynamic type in memory and the actual type used to access
// it.  Quoting its relevant part:
81 // If a program attempts to access the stored value of an object
82 // through a glvalue of other than one of the following types the
83 // behavior is undefined:
85 // 1. the dynamic type of the object,
86 // 2. a cv-qualified version of the dynamic type of the object,
87 // ...other types irrelevant here...
88 // 3. an aggregate or union type that includes one of the
89 // aforementioned types among its elements or non-static data
90 // members (including, recursively, an element or non-static
91 // data member of a subaggregate or contained union),
92 // ...more irrelevant types...
93 // 4. a char or unsigned char type.
95 // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
96 // matter the representation by #4. (Briefly set aside what values are seen.)
97 // (And #2 allows |const| on either the dynamic type or the accessing type.)
98 // (|signed char| is really only useful for small signed numbers, not
99 // characters, so we ignore it.)
101 // If we interpret contents as |char|/|unsigned char| contrary to the actual
102 // type stored there, what happens? C++14 [basic.fundamental]p1 requires
103 // character types be identically aligned/sized; C++14 [basic.fundamental]p3
104 // requires |signed char| and |unsigned char| have the same value
105 // representation. C++ doesn't require identical bitwise representation, tho.
106 // Practically we could assume it, but this verges on C++ spec bits best not
107 // *relied* on for correctness, if possible.
109 // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
110 // and |char*|. Instead we safely expose |unsigned char| by fully-defined
111 // *integral conversion* (C++14 [conv.integral]p2). Integral conversion from
112 // |unsigned char| → |char| has only implementation-defined behavior. It'd be
113 // better not to depend on that, but given twos-complement won, it should be
114 // okay. (Also |unsigned char*| is awkward enough to work with for strings
115 // that it probably doesn't appear in string manipulation much anyway, only in
116 // places that should really use |Utf8Unit| directly.)
118 // The opposite direction -- interpreting |char| or |char*| data through
119 // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
120 // decided above, using #3. An "aggregate or union" will work that contains a
// |char|.  Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
// aggregates must have "no private or protected non-static data members", and
// we want to keep the inner |char| hidden.  So a |struct| is out, and only
// |union| remains.
126 // (Enums are not "an aggregate or union type", so [maybe surprisingly] we
127 // can't make |Utf8Unit| an enum class with |char| underlying type, because we
128 // are given no license to treat |char| memory as such an |enum|'s memory.)
130 // Therefore |Utf8Unit| is a union type with a |char| non-static data member.
131 // This satisfies all our requirements. It also supports the nice-to-haves of
132 // creating a |Utf8Unit| from an |unsigned char|, and being convertible to
133 // |unsigned char|. It doesn't satisfy the nice-to-haves of using an
134 // |unsigned char| internally, nor of letting us wrap an existing
135 // |unsigned char| or pointer to one. We probably *could* do these, if we
136 // were willing to rely harder on implementation-defined behaviors, but for
137 // now we privilege C++'s main character type over some conceptual purity.
139 // 0. There's a proposal for a UTF-8 character type distinct from the existing
140 // C++ narrow character types:
142 // http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
//    but it hasn't been standardized (and might never be), and none of the
//    compilers we really care about have implemented it.  Maybe someday we
//    can change our implementation to it without too much trouble, if we're
//    lucky...
union Utf8Unit {
 private:
  // The wrapped code unit.  Both constructors store a |char| here, so this
  // location only ever holds a |char| value.
  char mValue;

 public:
  /** Wrap a raw |char| as a UTF-8 code unit. */
  explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}

  /** Wrap a raw |unsigned char| as a UTF-8 code unit. */
  explicit constexpr Utf8Unit(unsigned char aUnit)
      : mValue(static_cast<char>(aUnit)) {
    // Per the above comment, the prior cast is integral conversion with
    // implementation-defined semantics, and we regretfully but unavoidably
    // assume the conversion does what we want it to.
  }

  /** Two code units are equal iff their wrapped |char| values are equal. */
  constexpr bool operator==(const Utf8Unit& aOther) const {
    return mValue == aOther.mValue;
  }

  /** The negation of |operator==|. */
  constexpr bool operator!=(const Utf8Unit& aOther) const {
    return !(*this == aOther);
  }

  /** Convert a UTF-8 code unit to a raw char. */
  constexpr char toChar() const {
    // Only a |char| is ever permitted to be written into this location, so this
    // is both permissible and returns the desired value.
    return mValue;
  }

  /** Convert a UTF-8 code unit to a raw unsigned char. */
  constexpr unsigned char toUnsignedChar() const {
    // Per the above comment, this is well-defined integral conversion.
    return static_cast<unsigned char>(mValue);
  }

  /** Convert a UTF-8 code unit to a uint8_t. */
  constexpr uint8_t toUint8() const {
    // Per the above comment, this is well-defined integral conversion.
    return static_cast<uint8_t>(mValue);
  }

  // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
  // that's a somewhat separate concern, justified in different comments in
  // that one location.
};
193 * Reinterpret the address of a UTF-8 code unit as |const unsigned char*|.
195 * Assuming proper backing has been set up, the resulting |const unsigned char*|
196 * may validly be dereferenced.
198 * No access is provided to mutate this underlying memory as |unsigned char|.
199 * Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are
200 * loath to offer a way to write non-|char| data until absolutely necessary.
202 inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit
* aUnits
) {
203 static_assert(sizeof(Utf8Unit
) == sizeof(unsigned char),
204 "sizes must match to permissibly reinterpret_cast<>");
205 static_assert(alignof(Utf8Unit
) == alignof(unsigned char),
206 "alignment must match to permissibly reinterpret_cast<>");
208 // The static_asserts above only enable the reinterpret_cast<> to occur.
210 // Dereferencing the resulting pointer is a separate question. Any object's
211 // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
212 // this doesn't guarantee what values will be observed. If |char| is
213 // implemented to act like |unsigned char|, we're good to go: memory for the
214 // |char| in |Utf8Unit| acts as we need. But if |char| is implemented to act
215 // like |signed char|, dereferencing produces the right value only if the
216 // |char| types all use two's-complement representation. Every modern
217 // compiler does this, and there's a C++ proposal to standardize it.
218 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html So
219 // *technically* this is implementation-defined -- but everyone does it and
220 // this behavior is being standardized.
221 return reinterpret_cast<const unsigned char*>(aUnits
);
224 /** Returns true iff |aUnit| is an ASCII value. */
226 inline bool IsAscii
<Utf8Unit
>(Utf8Unit aUnit
) {
227 return IsAscii(aUnit
.toUint8());
/**
 * Returns true if the given length-delimited memory consists of a valid UTF-8
 * string, false otherwise.
 *
 * A valid UTF-8 string contains no overlong-encoded code points (as one would
 * expect) and contains no code unit sequence encoding a UTF-16 surrogate.  The
 * string *may* contain U+0000 NULL code points.
 *
 * Only the declaration lives in this header; the definition is elsewhere.
 */
extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount);
241 * Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
244 inline bool IsTrailingUnit(Utf8Unit aUnit
) {
245 return (aUnit
.toUint8() & 0b1100'0000) == 0b1000'0000;
249 * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
250 * that (initially) itself points one unit past |aLeadUnit|, and
251 * |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared
252 * against |*aIter| using |aEnd - *aIter|:
254 * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
255 * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
256 * surrogate, in shortest form -- then return Some(that code point) and advance
257 * |*aIter| past those code units.
259 * Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return
262 * |Iter| and |EndIter| are generalized concepts most easily understood as if
263 * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
264 * iterators that when dereferenced can be used to construct a |Utf8Unit| and
265 * that can be compared and modified in certain limited ways. (Carefully note
266 * that this function mutates |*aIter|.) |Iter| and |EndIter| are template
267 * parameters to support more-complicated adaptor iterators.
269 * The template parameters after |Iter| allow users to implement custom handling
270 * for various forms of invalid UTF-8. A version of this function that defaults
271 * all such handling to no-ops is defined below this function. To learn how to
272 * define your own custom handling, consult the implementation of that function,
273 * which documents exactly how custom handler functors are invoked.
275 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
276 * of this function without the "Inline" suffix on the name.
278 template <typename Iter
, typename EndIter
, class OnBadLeadUnit
,
279 class OnNotEnoughUnits
, class OnBadTrailingUnit
, class OnBadCodePoint
,
280 class OnNotShortestForm
>
281 MOZ_ALWAYS_INLINE Maybe
<char32_t
> DecodeOneUtf8CodePointInline(
282 const Utf8Unit aLeadUnit
, Iter
* aIter
, const EndIter
& aEnd
,
283 OnBadLeadUnit aOnBadLeadUnit
, OnNotEnoughUnits aOnNotEnoughUnits
,
284 OnBadTrailingUnit aOnBadTrailingUnit
, OnBadCodePoint aOnBadCodePoint
,
285 OnNotShortestForm aOnNotShortestForm
) {
286 MOZ_ASSERT(Utf8Unit((*aIter
)[-1]) == aLeadUnit
);
288 char32_t n
= aLeadUnit
.toUint8();
289 MOZ_ASSERT(!IsAscii(n
));
291 // |aLeadUnit| determines the number of trailing code units in the code point
292 // and the bits of |aLeadUnit| that contribute to the code point's value.
295 if ((n
& 0b1110'0000) == 0b1100'0000) {
299 } else if ((n
& 0b1111'0000) == 0b1110'0000) {
303 } else if ((n
& 0b1111'1000) == 0b1111'0000) {
313 // If the code point would require more code units than remain, the encoding
315 auto actual
= aEnd
- *aIter
;
316 if (MOZ_UNLIKELY(actual
< remaining
)) {
318 aOnNotEnoughUnits(AssertedCast
<uint8_t>(actual
+ 1), remaining
+ 1);
322 for (uint8_t i
= 0; i
< remaining
; i
++) {
323 const Utf8Unit
unit(*(*aIter
)++);
325 // Every non-leading code unit in properly encoded UTF-8 has its high
326 // bit set and the next-highest bit unset.
327 if (MOZ_UNLIKELY(!IsTrailingUnit(unit
))) {
328 uint8_t unitsObserved
= i
+ 1 + 1;
329 *aIter
-= unitsObserved
;
330 aOnBadTrailingUnit(unitsObserved
);
334 // The code point being encoded is the concatenation of all the
335 // unconstrained bits.
336 n
= (n
<< 6) | (unit
.toUint8() & 0b0011'1111);
339 // UTF-16 surrogates and values outside the Unicode range are invalid.
340 if (MOZ_UNLIKELY(n
> 0x10FFFF || (0xD800 <= n
&& n
<= 0xDFFF))) {
341 uint8_t unitsObserved
= remaining
+ 1;
342 *aIter
-= unitsObserved
;
343 aOnBadCodePoint(n
, unitsObserved
);
347 // Overlong code points are also invalid.
348 if (MOZ_UNLIKELY(n
< min
)) {
349 uint8_t unitsObserved
= remaining
+ 1;
350 *aIter
-= unitsObserved
;
351 aOnNotShortestForm(n
, unitsObserved
);
359 * Identical to the above function, but not forced to be instantiated inline --
360 * the compiler is permitted to common up separate invocations if it chooses.
362 template <typename Iter
, typename EndIter
, class OnBadLeadUnit
,
363 class OnNotEnoughUnits
, class OnBadTrailingUnit
, class OnBadCodePoint
,
364 class OnNotShortestForm
>
365 inline Maybe
<char32_t
> DecodeOneUtf8CodePoint(
366 const Utf8Unit aLeadUnit
, Iter
* aIter
, const EndIter
& aEnd
,
367 OnBadLeadUnit aOnBadLeadUnit
, OnNotEnoughUnits aOnNotEnoughUnits
,
368 OnBadTrailingUnit aOnBadTrailingUnit
, OnBadCodePoint aOnBadCodePoint
,
369 OnNotShortestForm aOnNotShortestForm
) {
370 return DecodeOneUtf8CodePointInline(aLeadUnit
, aIter
, aEnd
, aOnBadLeadUnit
,
371 aOnNotEnoughUnits
, aOnBadTrailingUnit
,
372 aOnBadCodePoint
, aOnNotShortestForm
);
376 * Like the always-inlined function above, but with no-op behavior from all
377 * trailing if-invalid notifier functors.
379 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
380 * of this function without the "Inline" suffix on the name.
382 template <typename Iter
, typename EndIter
>
383 MOZ_ALWAYS_INLINE Maybe
<char32_t
> DecodeOneUtf8CodePointInline(
384 const Utf8Unit aLeadUnit
, Iter
* aIter
, const EndIter
& aEnd
) {
385 // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
386 // a multi-unit code point. It is passed no arguments: the caller already has
387 // |aLeadUnit| on hand, so no need to provide it again.
388 auto onBadLeadUnit
= []() {};
390 // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
391 // point length, but there aren't enough units from |*aIter| to |aEnd| to
392 // satisfy that length. It is passed the number of code units actually
393 // available (according to |aEnd - *aIter|) and the number of code units that
394 // |aLeadUnit| indicates are needed. Both numbers include the contribution
395 // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
396 // |aUnitsAvailable < aUnitsNeeded|. As above, it also is not passed the lead
398 auto onNotEnoughUnits
= [](uint8_t aUnitsAvailable
, uint8_t aUnitsNeeded
) {};
400 // aOnBadTrailingUnit is called when one of the trailing code units implied by
401 // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
402 // trailing code units must satisfy. It is passed the total count of units
403 // observed (including |aLeadUnit|). The bad trailing code unit will
404 // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
405 // called, and so |aUnitsObserved <= 4|.
406 auto onBadTrailingUnit
= [](uint8_t aUnitsObserved
) {};
408 // aOnBadCodePoint is called when a structurally-correct code point encoding
409 // is found, but the *value* that is encoded is not a valid code point: either
410 // because it exceeded the U+10FFFF Unicode maximum code point, or because it
411 // was a UTF-16 surrogate. It is passed the non-code point value and the
412 // number of code units used to encode it.
413 auto onBadCodePoint
= [](char32_t aBadCodePoint
, uint8_t aUnitsObserved
) {};
415 // aOnNotShortestForm is called when structurally-correct encoding is found,
416 // but the encoded value should have been encoded in fewer code units (e.g.
417 // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
418 // as 0b0000'0000). It is passed the mis-encoded code point (which will be
419 // valid and not a surrogate) and the count of code units that mis-encoded it.
420 auto onNotShortestForm
= [](char32_t aBadCodePoint
, uint8_t aUnitsObserved
) {
423 return DecodeOneUtf8CodePointInline(aLeadUnit
, aIter
, aEnd
, onBadLeadUnit
,
424 onNotEnoughUnits
, onBadTrailingUnit
,
425 onBadCodePoint
, onNotShortestForm
);
429 * Identical to the above function, but not forced to be instantiated inline --
430 * the compiler/linker are allowed to common up separate invocations.
432 template <typename Iter
, typename EndIter
>
433 inline Maybe
<char32_t
> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit
,
435 const EndIter
& aEnd
) {
436 return DecodeOneUtf8CodePointInline(aLeadUnit
, aIter
, aEnd
);
439 } // namespace mozilla
441 #endif /* mozilla_Utf8_h */