no bug - Bumping Firefox l10n changesets r=release a=l10n-bump DONTBUILD CLOSED TREE
[gecko.git] / xpcom / string / nsUTF8Utils.h
blob0145011ec152fa4c8adc5dfe8b8688eb190390e2
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsUTF8Utils_h_
7 #define nsUTF8Utils_h_
// NB: This code may be used from non-XPCOM code, in particular, the
// standalone updater executable. That is, this file may be used in
// two ways: if MOZILLA_INTERNAL_API is defined, this file will
// provide signatures for the Mozilla abstract string types and will
// use XPCOM assertion/debugging macros, etc.; if it is not defined,
// the UTF8UTILS_WARNING macro below expands to nothing.
15 #include <type_traits>
17 #include "mozilla/Assertions.h"
18 #include "mozilla/EndianUtils.h"
20 #include "nsCharTraits.h"
// UTF8UTILS_WARNING(msg): forwards to NS_WARNING(msg) when this header is
// compiled with MOZILLA_INTERNAL_API defined; otherwise it expands to nothing,
// so the message argument is discarded by the preprocessor.
#ifdef MOZILLA_INTERNAL_API
#  define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
#else
#  define UTF8UTILS_WARNING(msg)
#endif
/**
 * Bit-pattern classification of a single UTF-8 code unit.
 *
 * Each predicate masks the high bits of aChar and compares them with the
 * marker for one byte role. Only the bit pattern is inspected: no check is
 * made that the byte can actually occur in well-formed UTF-8 (for example,
 * is2byte() accepts every byte of the form 110xxxxx).
 */
class UTF8traits {
 public:
  // 0xxxxxxx: single-byte (ASCII) code unit.
  static bool isASCII(char aChar) { return (aChar & 0x80) == 0x00; }
  // 10xxxxxx: continuation byte inside a multi-byte sequence.
  static bool isInSeq(char aChar) { return (aChar & 0xC0) == 0x80; }
  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char aChar) { return (aChar & 0xE0) == 0xC0; }
  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char aChar) { return (aChar & 0xF0) == 0xE0; }
  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char aChar) { return (aChar & 0xF8) == 0xF0; }
  // 111110xx: five-byte lead pattern. Note that bytes() does not handle it.
  static bool is5byte(char aChar) { return (aChar & 0xFC) == 0xF8; }
  // 1111110x: six-byte lead pattern. Note that bytes() does not handle it.
  static bool is6byte(char aChar) { return (aChar & 0xFE) == 0xFC; }

  /**
   * Return the number of bytes in a sequence beginning with aChar.
   *
   * Yields 1, 2, 3 or 4 for ASCII, two-, three- and four-byte lead patterns
   * respectively. Any other byte (a continuation byte, or a five-/six-byte
   * lead pattern) trips MOZ_ASSERT_UNREACHABLE and, where that assertion does
   * not stop execution, the function falls back to returning 1.
   */
  static int bytes(char aChar) {
    if (isASCII(aChar)) {
      return 1;
    }
    if (is2byte(aChar)) {
      return 2;
    }
    if (is3byte(aChar)) {
      return 3;
    }
    if (is4byte(aChar)) {
      return 4;
    }
    MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
    return 1;
  }
};
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator {
 public:
  /**
   * @param aBuffer in/out: address of the read cursor; always advanced by at
   *                least one byte.
   * @param aEnd    one past the last readable byte.
   * @param aErr    optional; set to true on malformed input, never cleared.
   * @return the decoded scalar value, or 0xFFFD on malformed input.
   */
  static inline char32_t NextChar(const char** aBuffer, const char* aEnd,
                                  bool* aErr = nullptr) {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    // Work on unsigned bytes so that the range comparisons below are not
    // affected by the signedness of plain char.
    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p;
    ++p;

    if (MOZ_LIKELY(first < 0x80U)) {
      // ASCII fast path.
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // Reject a lead byte with nothing after it, and any lead byte outside
    // 0xC2..0xF4 (continuation bytes, 0xC0/0xC1 and 0xF5..0xFF). The single
    // comparison covers both ends of the range:
    // Unsigned underflow is defined behavior
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        ++p;
        *aBuffer = reinterpret_cast<const char*>(p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      // The offending second byte is not consumed; it is left for the next
      // call.
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        // A smaller second byte would decode to a value below 0x800.
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        // A larger second byte would decode into 0xD800..0xDFFF.
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        ++p;
        if (MOZ_LIKELY(p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            ++p;
            *aBuffer = reinterpret_cast<const char*>(p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      // p rests on the first byte that could not be accepted (or on end).
      *aBuffer = reinterpret_cast<const char*>(p);
      if (aErr) {
        *aErr = true;
      }
      return 0xFFFDU;
    }

    // Four-byte
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      // A smaller second byte would decode to a value below 0x10000.
      lower = 0x90U;
    } else if (first == 0xF4U) {
      // A larger second byte would decode to a value above 0x10FFFF.
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      ++p;
      if (MOZ_LIKELY(p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          ++p;
          if (MOZ_LIKELY(p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              ++p;
              *aBuffer = reinterpret_cast<const char*>(p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    // p rests on the first byte that could not be accepted (or on end).
    *aBuffer = reinterpret_cast<const char*>(p);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
 * the unpaired surrogate and *aErr is set to true (if aErr is not null).
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator {
 public:
  /**
   * @param aBuffer in/out: address of the read cursor; advanced by one code
   *                unit, or by two when a surrogate pair is consumed.
   * @param aEnd    one past the last readable code unit.
   * @param aErr    optional; set to true on an unpaired surrogate, never
   *                cleared.
   * @return the decoded scalar value, or 0xFFFD for an unpaired surrogate.
   */
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd, bool* aErr = nullptr) {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* p = *aBuffer;

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < aEnd, "Bogus range");

    char16_t c = *p++;

    // Let's use encoding_rs-style code golf here.
    // Subtracting the start of the surrogate range maps 0xD800..0xDFFF onto
    // 0..0x7FF and wraps everything below 0xD800 to a larger value, so one
    // comparison tells surrogates from non-surrogates.
    // Unsigned underflow is defined behavior
    char16_t cMinusSurrogateStart = c - 0xD800U;
    if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
      // Not a surrogate: the code unit is the scalar value.
      *aBuffer = p;
      return c;
    }
    if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
      // High surrogate
      if (MOZ_LIKELY(p != aEnd)) {
        char16_t second = *p;
        // Unsigned underflow is defined behavior
        if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
          *aBuffer = ++p;
          // Combine the pair; the subtracted constant folds the removal of
          // both surrogate offsets and the addition of 0x10000 into one term.
          return (uint32_t(c) << 10) + uint32_t(second) -
                 (((0xD800U << 10) - 0x10000U) + 0xDC00U);
        }
      }
    }
    // Unpaired surrogate
    // Only the first code unit is consumed; whatever followed it is left for
    // the next call.
    *aBuffer = p;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * Step backwards from utf8Chars[index] over UTF-8 continuation bytes
 * (10xxxxxx) and return the index of the first byte reached that is not one.
 *
 * The scan stops at index 0 regardless of what byte is stored there, so the
 * result is 0 if no non-continuation byte is found.
 *
 * @param utf8Chars buffer of 8-bit code units; utf8Chars[index] is read, so
 *                  that element must be readable.
 * @param index     starting position; must be of an unsigned type.
 * @return an index in [0, index].
 */
template <typename Char, typename UnsignedT>
inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars,
                                            UnsignedT index) {
  static_assert(std::is_same_v<Char, char> ||
                    std::is_same_v<Char, unsigned char> ||
                    std::is_same_v<Char, signed char>,
                "UTF-8 data must be in 8-bit units");
  static_assert(std::is_unsigned_v<UnsignedT>, "index type must be unsigned");
  while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) {
    --index;
  }

  return index;
}
245 #undef UTF8UTILS_WARNING
247 #endif /* !defined(nsUTF8Utils_h_) */