1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
9 // NB: This code may be used from non-XPCOM code, in particular, the
10 // standalone updater executable. That is, this file may be used in
11 // two ways: if MOZILLA_INTERNAL_API is defined, this file will
12 // provide signatures for the Mozilla abstract string types. It will
13 // use XPCOM assertion/debugging macros, etc.
15 #include <type_traits>
17 #include "mozilla/Assertions.h"
18 #include "mozilla/EndianUtils.h"
20 #include "nsCharTraits.h"
22 #ifdef MOZILLA_INTERNAL_API
23 # define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
25 # define UTF8UTILS_WARNING(msg)
// True when the high bit of aChar is clear, i.e. the code unit is a
// single-byte (ASCII) character.
static bool isASCII(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return unit < 0x80;
}
// True when aChar has the bit pattern 10xxxxxx, i.e. it is a byte that sits
// inside a multi-byte sequence rather than starting one.
static bool isInSeq(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 6) == 0x02;
}
// True when aChar has the bit pattern 110xxxxx (top three bits are 110).
static bool is2byte(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 5) == 0x06;
}
// True when aChar has the bit pattern 1110xxxx (top four bits are 1110).
static bool is3byte(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 4) == 0x0E;
}
// True when aChar has the bit pattern 11110xxx (top five bits are 11110).
static bool is4byte(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 3) == 0x1E;
}
// True when aChar has the bit pattern 111110xx (top six bits are 111110).
static bool is5byte(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 2) == 0x3E;
}
// True when aChar has the bit pattern 1111110x (top seven bits are 1111110).
static bool is6byte(char aChar) {
  const unsigned char unit = static_cast<unsigned char>(aChar);
  return (unit >> 1) == 0x7E;
}
37 // Returns the number of bytes in a sequence beginning with aChar.
// aChar must not be an in-sequence character: per the message on the
// MOZ_ASSERT_UNREACHABLE below, that case is treated as a caller error.
38 static int bytes(char aChar
) {
51 MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
57 * Extract the next Unicode scalar value from the buffer and return it. The
58 * pointer passed in is advanced to the start of the next character in the
59 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
60 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
63 * Note: This method never sets *aErr to false to allow error accumulation
64 * across multiple calls.
66 * Precondition: *aBuffer < aEnd
// Holder for the static NextChar() UTF-8 decoder, which reads one code point
// from the byte range [*aBuffer, aEnd) and writes the advanced cursor back
// through aBuffer.
68 class UTF8CharEnumerator
{
// aBuffer: in/out pointer to the read cursor; must be non-null and point at
//          a non-null position strictly before aEnd (asserted below).
// aEnd:    one-past-the-end of the readable bytes; must be non-null.
// aErr:    optional error flag; defaults to nullptr.
70 static inline char32_t
NextChar(const char** aBuffer
, const char* aEnd
,
71 bool* aErr
= nullptr) {
72 MOZ_ASSERT(aBuffer
, "null buffer pointer pointer");
73 MOZ_ASSERT(aEnd
, "null end pointer");
// View the range as unsigned bytes for the numeric comparisons and masks
// used below.
75 const unsigned char* p
= reinterpret_cast<const unsigned char*>(*aBuffer
);
76 const unsigned char* end
= reinterpret_cast<const unsigned char*>(aEnd
);
78 MOZ_ASSERT(p
, "null buffer");
79 MOZ_ASSERT(p
< end
, "Bogus range");
// The lead byte decides which of the branches below is taken.
81 unsigned char first
= *p
;
// Likely path: a lead byte below 0x80; the cursor is written back to
// *aBuffer.
84 if (MOZ_LIKELY(first
< 0x80U
)) {
85 *aBuffer
= reinterpret_cast<const char*>(p
);
// Unlikely path: either the input is exhausted (p == end) or the lead byte
// is outside [0xC2, 0xF4]. For first < 0xC2 the subtraction wraps around to
// a large unsigned value, so the single >= comparison covers both bounds.
89 // Unsigned underflow is defined behavior
90 if (MOZ_UNLIKELY((p
== end
) || ((first
- 0xC2U
) >= (0xF5U
- 0xC2U
)))) {
91 *aBuffer
= reinterpret_cast<const char*>(p
);
98 unsigned char second
= *p
;
// Checks that the second byte has the bit pattern 10xxxxxx.
102 if (MOZ_LIKELY((second
& 0xC0U
) == 0x80U
)) {
104 *aBuffer
= reinterpret_cast<const char*>(p
);
// Two-byte assembly: low 5 bits of the lead byte followed by the low 6 bits
// of the second byte.
105 return ((uint32_t(first
) & 0x1FU
) << 6) | (uint32_t(second
) & 0x3FU
);
107 *aBuffer
= reinterpret_cast<const char*>(p
);
// Lead bytes below 0xF0 take this likely branch, which reads up to a third
// byte; the remaining lead bytes are handled further down and read up to a
// fourth byte.
114 if (MOZ_LIKELY(first
< 0xF0U
)) {
// Default inclusive range accepted for the second byte. The lead bytes 0xE0
// and 0xED are special-cased by the branches that follow.
116 unsigned char lower
= 0x80U
;
117 unsigned char upper
= 0xBFU
;
118 if (first
== 0xE0U
) {
120 } else if (first
== 0xEDU
) {
123 if (MOZ_LIKELY(second
>= lower
&& second
<= upper
)) {
// A third byte is only read if the input has not run out.
125 if (MOZ_LIKELY(p
!= end
)) {
126 unsigned char third
= *p
;
127 if (MOZ_LIKELY((third
& 0xC0U
) == 0x80U
)) {
129 *aBuffer
= reinterpret_cast<const char*>(p
);
// Three-byte assembly: 4 bits from the lead byte, then 6 bits from each of
// the second and third bytes.
130 return ((uint32_t(first
) & 0xFU
) << 12) |
131 ((uint32_t(second
) & 0x3FU
) << 6) |
132 (uint32_t(third
) & 0x3FU
);
136 *aBuffer
= reinterpret_cast<const char*>(p
);
// Default inclusive range accepted for the second byte. The lead bytes 0xF0
// and 0xF4 are special-cased by the branches that follow.
144 unsigned char lower
= 0x80U
;
145 unsigned char upper
= 0xBFU
;
146 if (first
== 0xF0U
) {
148 } else if (first
== 0xF4U
) {
151 if (MOZ_LIKELY(second
>= lower
&& second
<= upper
)) {
// The third and fourth bytes are each read only if the input has not run
// out, and each must have the bit pattern 10xxxxxx.
153 if (MOZ_LIKELY(p
!= end
)) {
154 unsigned char third
= *p
;
155 if (MOZ_LIKELY((third
& 0xC0U
) == 0x80U
)) {
157 if (MOZ_LIKELY(p
!= end
)) {
158 unsigned char fourth
= *p
;
159 if (MOZ_LIKELY((fourth
& 0xC0U
) == 0x80U
)) {
161 *aBuffer
= reinterpret_cast<const char*>(p
);
// Four-byte assembly: 3 bits from the lead byte, then 6 bits from each of
// the second, third and fourth bytes.
162 return ((uint32_t(first
) & 0x7U
) << 18) |
163 ((uint32_t(second
) & 0x3FU
) << 12) |
164 ((uint32_t(third
) & 0x3FU
) << 6) |
165 (uint32_t(fourth
) & 0x3FU
);
171 *aBuffer
= reinterpret_cast<const char*>(p
);
180 * Extract the next Unicode scalar value from the buffer and return it. The
181 * pointer passed in is advanced to the start of the next character in the
182 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
183 * the unpaired surrogate and *aErr is set to true (if aErr is not null).
185 * Note: This method never sets *aErr to false to allow error accumulation
186 * across multiple calls.
188 * Precondition: *aBuffer < aEnd
// Holder for the static NextChar() UTF-16 decoder, which reads one code point
// from the code-unit range [*aBuffer, aEnd).
190 class UTF16CharEnumerator
{
// aBuffer: in/out pointer to the read cursor; must be non-null and point at
//          a non-null position strictly before aEnd (asserted below).
// aEnd:    one-past-the-end of the readable code units; must be non-null.
// aErr:    optional error flag; defaults to nullptr.
192 static inline char32_t
NextChar(const char16_t
** aBuffer
,
193 const char16_t
* aEnd
, bool* aErr
= nullptr) {
194 MOZ_ASSERT(aBuffer
, "null buffer pointer pointer");
195 MOZ_ASSERT(aEnd
, "null end pointer");
197 const char16_t
* p
= *aBuffer
;
199 MOZ_ASSERT(p
, "null buffer");
200 MOZ_ASSERT(p
< aEnd
, "Bogus range");
// Rebasing c so that 0xD800 maps to 0 lets each range test below be a single
// unsigned comparison: values of c below 0xD800 wrap around to large numbers.
204 // Let's use encoding_rs-style code golf here.
205 // Unsigned underflow is defined behavior
206 char16_t cMinusSurrogateStart
= c
- 0xD800U
;
// Likely path: c lies outside [0xD800, 0xDFFF], i.e. it is not a surrogate.
207 if (MOZ_LIKELY(cMinusSurrogateStart
> (0xDFFFU
- 0xD800U
))) {
// c lies in [0xD800, 0xDBFF]: the first half of a surrogate pair.
211 if (MOZ_LIKELY(cMinusSurrogateStart
<= (0xDBFFU
- 0xD800U
))) {
// A second code unit is only read if the input has not run out.
213 if (MOZ_LIKELY(p
!= aEnd
)) {
214 char16_t second
= *p
;
// Accepts second only when it lies in [0xDC00, 0xDFFF].
215 // Unsigned underflow is defined behavior
216 if (MOZ_LIKELY((second
- 0xDC00U
) <= (0xDFFFU
- 0xDC00U
))) {
// Combines the pair. Algebraically this equals
//   0x10000 + ((c - 0xD800) << 10) + (second - 0xDC00)
// with the constant terms folded into one subtraction.
218 return (uint32_t(c
) << 10) + uint32_t(second
) -
219 (((0xD800U
<< 10) - 0x10000U
) + 0xDC00U
);
223 // Unpaired surrogate
// Steps `index` backwards through utf8Chars for as long as it is above zero
// and utf8Chars[index] has the bit pattern 10xxxxxx, so that it stops on a
// byte that does not have that pattern (or at index 0).
// Char must be char, unsigned char or signed char; UnsignedT must be an
// unsigned type (both enforced by the static_asserts below).
// NOTE(review): presumably the adjusted index is the value returned -- confirm.
232 template <typename Char
, typename UnsignedT
>
233 inline UnsignedT
RewindToPriorUTF8Codepoint(const Char
* utf8Chars
,
235 static_assert(std::is_same_v
<Char
, char> ||
236 std::is_same_v
<Char
, unsigned char> ||
237 std::is_same_v
<Char
, signed char>,
238 "UTF-8 data must be in 8-bit units");
239 static_assert(std::is_unsigned_v
<UnsignedT
>, "index type must be unsigned");
// The `index > 0` guard keeps the unsigned index from wrapping below zero.
240 while (index
> 0 && (utf8Chars
[index
] & 0xC0) == 0x80) --index
;
245 #undef UTF8UTILS_WARNING
247 #endif /* !defined(nsUTF8Utils_h_) */