Bug 1824490 - Use the end page value rather than the start page value of the previous...
[gecko.git] / xpcom / string / nsCharTraits.h
blobc81c2f5b2d3b6cafead8c67ab5df2f7da1fdf28b
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef nsCharTraits_h___
8 #define nsCharTraits_h___
10 #include <ctype.h> // for |EOF|, |WEOF|
11 #include <stdint.h> // for |uint32_t|
12 #include <string.h> // for |memcpy|, et al
13 #include "mozilla/MemoryChecking.h"
// This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
// particular the standalone software updater. In that case stub out
// the macros provided by nsDebug.h, which are only usable when linking XPCOM.
19 #ifdef NS_NO_XPCOM
20 # define NS_WARNING(msg)
21 # define NS_ASSERTION(cond, msg)
22 # define NS_ERROR(msg)
23 #else
24 # include "nsDebug.h" // for NS_ASSERTION
25 #endif
/**
 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
 * values.
 *
 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
 * using "surrogate pairs". These consist of a high surrogate, i.e. a code
 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
 * in the range U+DC00 - U+DFFF, like this:
 *
 *  U+D800 U+DC00 =  U+10000
 *  U+D800 U+DC01 =  U+10001
 *  ...
 *  U+DBFF U+DFFE = U+10FFFE
 *  U+DBFF U+DFFF = U+10FFFF
 *
 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
 * scalar values and are not well-formed UTF-16 except as high-surrogate /
 * low-surrogate pairs.
 */

// First code point outside the Basic Multilingual Plane.
#define PLANE1_BASE uint32_t(0x00010000)
// High surrogates are in the range 0xD800 -- 0xDBFF
#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)
// Low surrogates are in the range 0xDC00 -- 0xDFFF
#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)
// Easier to type than NS_IS_HIGH_SURROGATE && NS_IS_LOW_SURROGATE
#define NS_IS_SURROGATE_PAIR(h, l) \
  (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l))
// Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE
#define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)

// Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF

// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// I wonder whether we could somehow assert that H is a high surrogate
// and L is a low surrogate
#define SURROGATE_TO_UCS4(h, l) \
  (((uint32_t(h) & 0x03FF) << 10) + (uint32_t(l) & 0x03FF) + PLANE1_BASE)

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and
// 0xD7C0 == 0xD800 - 0x0080,
// ((c - 0x10000) >> 10) + 0xD800 can be simplified to
#define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + char16_t(0xD7C0))
// where it's to be noted that 0xD7C0 is not bitwise-OR'd
// but added.

// Since 0x10000 & 0x03FF == 0,
// (c - 0x10000) & 0x03FF == c & 0x03FF so that
// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
#define L_SURROGATE(c) \
  char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | char16_t(0xDC00))

// True for code points that fit in a single UTF-16 code unit.
#define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

// One past the last Unicode code point (U+10FFFF).
#define UCS_END uint32_t(0x00110000)
#define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
// Note: the argument is expanded more than once, so it must not have side
// effects.
#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
/**
 * Primary template: intentionally empty. The usable traits live in the
 * explicit specializations for the supported character types.
 */
template <class CharT>
struct nsCharTraits {};
91 template <>
92 struct nsCharTraits<char16_t> {
93 typedef char16_t char_type;
94 typedef uint16_t unsigned_char_type;
95 typedef char incompatible_char_type;
97 static char_type* const sEmptyBuffer;
99 // integer representation of characters:
100 typedef int int_type;
102 static char_type to_char_type(int_type aChar) { return char_type(aChar); }
104 static int_type to_int_type(char_type aChar) {
105 return int_type(static_cast<unsigned_char_type>(aChar));
108 static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
110 // |char_type| comparisons:
112 static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
114 static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
116 // operations on s[n] arrays:
118 static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
119 return static_cast<char_type*>(
120 memmove(aStr1, aStr2, aN * sizeof(char_type)));
123 static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
124 return static_cast<char_type*>(
125 memcpy(aStr1, aStr2, aN * sizeof(char_type)));
128 static void uninitialize(char_type* aStr, size_t aN) {
129 #ifdef DEBUG
130 memset(aStr, 0xE4, aN * sizeof(char_type));
131 #endif
132 MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
135 static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
136 for (char_type* s = aStr1; aN--; ++s, ++aStr2) {
137 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
138 *s = static_cast<char_type>(*aStr2);
140 return aStr1;
143 static int compare(const char_type* aStr1, const char_type* aStr2,
144 size_t aN) {
145 for (; aN--; ++aStr1, ++aStr2) {
146 if (!eq(*aStr1, *aStr2)) {
147 return to_int_type(*aStr1) - to_int_type(*aStr2);
151 return 0;
154 static int compareASCII(const char_type* aStr1, const char* aStr2,
155 size_t aN) {
156 for (; aN--; ++aStr1, ++aStr2) {
157 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
158 if (!eq_int_type(to_int_type(*aStr1),
159 to_int_type(static_cast<char_type>(*aStr2)))) {
160 return to_int_type(*aStr1) -
161 to_int_type(static_cast<char_type>(*aStr2));
165 return 0;
168 static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
169 const size_t aN) {
170 for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) {
171 if (*aStr1 != static_cast<char_type>(*aStr2)) {
172 return false;
176 return true;
179 // this version assumes that s2 is null-terminated and s1 has length n.
180 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
181 // we return 1.
182 static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
183 const char* aStr2) {
184 for (; aN--; ++aStr1, ++aStr2) {
185 if (!*aStr2) {
186 return 1;
188 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
189 if (!eq_int_type(to_int_type(*aStr1),
190 to_int_type(static_cast<char_type>(*aStr2)))) {
191 return to_int_type(*aStr1) -
192 to_int_type(static_cast<char_type>(*aStr2));
196 if (*aStr2) {
197 return -1;
200 return 0;
204 * Convert c to its lower-case form, but only if c is in the ASCII
205 * range. Otherwise leave it alone.
207 static char_type ASCIIToLower(char_type aChar) {
208 if (aChar >= 'A' && aChar <= 'Z') {
209 return char_type(aChar + ('a' - 'A'));
212 return aChar;
215 static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
216 size_t aN) {
217 for (; aN--; ++aStr1, ++aStr2) {
218 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
219 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
220 "Unexpected uppercase character");
221 char_type lower_s1 = ASCIIToLower(*aStr1);
222 if (lower_s1 != static_cast<char_type>(*aStr2)) {
223 return to_int_type(lower_s1) -
224 to_int_type(static_cast<char_type>(*aStr2));
228 return 0;
231 // this version assumes that s2 is null-terminated and s1 has length n.
232 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
233 // we return 1.
234 static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
235 size_t aN,
236 const char* aStr2) {
237 for (; aN--; ++aStr1, ++aStr2) {
238 if (!*aStr2) {
239 return 1;
241 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
242 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
243 "Unexpected uppercase character");
244 char_type lower_s1 = ASCIIToLower(*aStr1);
245 if (lower_s1 != static_cast<char_type>(*aStr2)) {
246 return to_int_type(lower_s1) -
247 to_int_type(static_cast<char_type>(*aStr2));
251 if (*aStr2) {
252 return -1;
255 return 0;
258 static size_t length(const char_type* aStr) {
259 size_t result = 0;
260 while (!eq(*aStr++, char_type(0))) {
261 ++result;
263 return result;
266 static const char_type* find(const char_type* aStr, size_t aN,
267 char_type aChar) {
268 while (aN--) {
269 if (eq(*aStr, aChar)) {
270 return aStr;
272 ++aStr;
275 return 0;
279 template <>
280 struct nsCharTraits<char> {
281 typedef char char_type;
282 typedef unsigned char unsigned_char_type;
283 typedef char16_t incompatible_char_type;
285 static char_type* const sEmptyBuffer;
287 // integer representation of characters:
289 typedef int int_type;
291 static char_type to_char_type(int_type aChar) { return char_type(aChar); }
293 static int_type to_int_type(char_type aChar) {
294 return int_type(static_cast<unsigned_char_type>(aChar));
297 static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
299 // |char_type| comparisons:
301 static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
303 static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
305 // operations on s[n] arrays:
307 static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
308 return static_cast<char_type*>(
309 memmove(aStr1, aStr2, aN * sizeof(char_type)));
312 static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
313 return static_cast<char_type*>(
314 memcpy(aStr1, aStr2, aN * sizeof(char_type)));
317 static void uninitialize(char_type* aStr, size_t aN) {
318 #ifdef DEBUG
319 memset(aStr, 0xE4, aN * sizeof(char_type));
320 #endif
321 MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
324 static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
325 return copy(aStr1, aStr2, aN);
328 static int compare(const char_type* aStr1, const char_type* aStr2,
329 size_t aN) {
330 return memcmp(aStr1, aStr2, aN);
333 static int compareASCII(const char_type* aStr1, const char* aStr2,
334 size_t aN) {
335 #ifdef DEBUG
336 for (size_t i = 0; i < aN; ++i) {
337 NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character");
339 #endif
340 return compare(aStr1, aStr2, aN);
343 static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
344 size_t aN) {
345 return memcmp(aStr1, aStr2, aN) == 0;
348 // this version assumes that s2 is null-terminated and s1 has length n.
349 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
350 // we return 1.
351 static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
352 const char* aStr2) {
353 // can't use strcmp here because we don't want to stop when aStr1
354 // contains a null
355 for (; aN--; ++aStr1, ++aStr2) {
356 if (!*aStr2) {
357 return 1;
359 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
360 if (*aStr1 != *aStr2) {
361 return to_int_type(*aStr1) - to_int_type(*aStr2);
365 if (*aStr2) {
366 return -1;
369 return 0;
373 * Convert c to its lower-case form, but only if c is ASCII.
375 static char_type ASCIIToLower(char_type aChar) {
376 if (aChar >= 'A' && aChar <= 'Z') {
377 return char_type(aChar + ('a' - 'A'));
380 return aChar;
383 static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
384 size_t aN) {
385 for (; aN--; ++aStr1, ++aStr2) {
386 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
387 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
388 "Unexpected uppercase character");
389 char_type lower_s1 = ASCIIToLower(*aStr1);
390 if (lower_s1 != *aStr2) {
391 return to_int_type(lower_s1) - to_int_type(*aStr2);
394 return 0;
397 // this version assumes that s2 is null-terminated and s1 has length n.
398 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
399 // we return 1.
400 static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
401 size_t aN,
402 const char* aStr2) {
403 for (; aN--; ++aStr1, ++aStr2) {
404 if (!*aStr2) {
405 return 1;
407 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
408 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
409 "Unexpected uppercase character");
410 char_type lower_s1 = ASCIIToLower(*aStr1);
411 if (lower_s1 != *aStr2) {
412 return to_int_type(lower_s1) - to_int_type(*aStr2);
416 if (*aStr2) {
417 return -1;
420 return 0;
423 static size_t length(const char_type* aStr) { return strlen(aStr); }
425 static const char_type* find(const char_type* aStr, size_t aN,
426 char_type aChar) {
427 return reinterpret_cast<const char_type*>(
428 memchr(aStr, to_int_type(aChar), aN));
/**
 * Generic read-side adapter for iterator classes. InputIterator must provide
 * |difference_type|, |value_type|, a const |get()| returning a pointer to the
 * current position, and |advance(n)|.
 */
template <class InputIterator>
struct nsCharSourceTraits {
  typedef typename InputIterator::difference_type difference_type;

  // Number of elements between the positions of aFirst and aLast.
  static difference_type readable_distance(const InputIterator& aFirst,
                                           const InputIterator& aLast) {
    // assumes single fragment
    return aLast.get() - aFirst.get();
  }

  // Pointer to the element at the iterator's current position.
  static const typename InputIterator::value_type* read(
      const InputIterator& aIter) {
    return aIter.get();
  }

  // Moves the iterator by aN elements via its own advance().
  static void advance(InputIterator& aStr, difference_type aN) {
    aStr.advance(aN);
  }
};
452 template <class CharT>
453 struct nsCharSourceTraits<CharT*> {
454 typedef ptrdiff_t difference_type;
456 static difference_type readable_distance(CharT* aStr) {
457 return nsCharTraits<CharT>::length(aStr);
460 static difference_type readable_distance(CharT* aFirst, CharT* aLast) {
461 return aLast - aFirst;
464 static const CharT* read(CharT* aStr) { return aStr; }
466 static void advance(CharT*& aStr, difference_type aN) { aStr += aN; }
/**
 * Generic write-side adapter for iterator classes. OutputIterator must provide
 * |value_type| and a |write(ptr, n)| member.
 */
template <class OutputIterator>
struct nsCharSinkTraits {
  // Forwards the aN elements starting at aStr to the iterator's own write().
  static void write(OutputIterator& aIter,
                    const typename OutputIterator::value_type* aStr,
                    size_t aN) {
    aIter.write(aStr, aN);
  }
};
478 template <class CharT>
479 struct nsCharSinkTraits<CharT*> {
480 static void write(CharT*& aIter, const CharT* aStr, size_t aN) {
481 nsCharTraits<CharT>::move(aIter, aStr, aN);
482 aIter += aN;
486 #endif // !defined(nsCharTraits_h___)