Bumping manifests a=b2g-bump
[gecko.git] / xpcom / string / nsCharTraits.h
blobd93e1f5dcac8ff3d3da8aed472ab3a8fd3aca117
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef nsCharTraits_h___
8 #define nsCharTraits_h___
10 #include <ctype.h> // for |EOF|, |WEOF|
11 #include <string.h> // for |memcpy|, et al
13 #include "nscore.h" // for |char16_t|
15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
16 // particular the standalone software updater. In that case stub out
17 // the macros provided by nsDebug.h which are only usable when linking XPCOM
19 #ifdef NS_NO_XPCOM
20 #define NS_WARNING(msg)
21 #define NS_ASSERTION(cond, msg)
22 #define NS_ERROR(msg)
23 #else
24 #include "nsDebug.h" // for NS_ASSERTION
25 #endif
28 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
29 * values.
31 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
32 * using "surrogate pairs". These consist of a high surrogate, i.e. a code
33 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
34 * in the range U+DC00 - U+DFFF, like this:
36 * U+D800 U+DC00 = U+10000
37 * U+D800 U+DC01 = U+10001
38 * ...
39 * U+DBFF U+DFFE = U+10FFFE
40 * U+DBFF U+DFFF = U+10FFFF
42 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
43 * scalar values and are not well-formed UTF-16 except as high-surrogate /
44 * low-surrogate pairs.
// First code point above the Basic Multilingual Plane; added back in when a
// surrogate pair is decoded (see SURROGATE_TO_UCS4).
47 #define PLANE1_BASE uint32_t(0x00010000)
48 // High surrogates are in the range 0xD800 -- 0xDBFF
49 #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)
50 // Low surrogates are in the range 0xDC00 -- 0xDFFF
51 #define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)
52 // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE:
// masking off the low 11 bits covers the whole 0xD800 -- 0xDFFF range at once.
53 #define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)
55 // Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF
57 // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
58 // I wonder whether we could somehow assert that H is a high surrogate
59 // and L is a low surrogate
// Neither argument is validated: only the low 10 bits of each are used.
60 #define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \
61 (uint32_t(l) & 0x03FF) + PLANE1_BASE)
63 // Extract surrogates from a UCS4 char
64 // Reference: the Unicode standard 4.0, section 3.9
65 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and
66 // 0xD7C0 == 0xD800 - 0x0080,
67 // ((c - 0x10000) >> 10) + 0xD800 can be simplified to
68 #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \
69 char16_t(0xD7C0))
70 // where it's to be noted that 0xD7C0 is not bitwise-OR'd
71 // but added.
73 // Since 0x10000 & 0x03FF == 0,
74 // (c - 0x10000) & 0x03FF == c & 0x03FF so that
75 // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
76 #define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \
77 char16_t(0xDC00))
// True when ucs is below PLANE1_BASE, i.e. fits in a single 16-bit code unit.
79 #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
// U+FFFD, substituted by ENSURE_VALID_CHAR for invalid input.
80 #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)
// One past the largest code point (U+10FFFF).
82 #define UCS_END uint32_t(0x00110000)
// Valid means: below UCS_END and not a surrogate code point.
83 #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
// Yields c when it is valid, otherwise UCS2_REPLACEMENT_CHAR.
// NOTE: c is expanded more than once, so avoid arguments with side effects.
84 #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
// Primary character-traits template, parameterized on the character type.
// Explicit specializations for char16_t and char are provided in this file.
86 template <class CharT>
87 struct nsCharTraits
91 template <>
92 struct nsCharTraits<char16_t>
94 typedef char16_t char_type;
95 typedef uint16_t unsigned_char_type;
96 typedef char incompatible_char_type;
98 static char_type* const sEmptyBuffer;
100 static void
101 assign(char_type& aLhs, char_type aRhs)
103 aLhs = aRhs;
107 // integer representation of characters:
108 typedef int int_type;
110 static char_type
111 to_char_type(int_type aChar)
113 return char_type(aChar);
116 static int_type
117 to_int_type(char_type aChar)
119 return int_type(static_cast<unsigned_char_type>(aChar));
122 static bool
123 eq_int_type(int_type aLhs, int_type aRhs)
125 return aLhs == aRhs;
129 // |char_type| comparisons:
131 static bool
132 eq(char_type aLhs, char_type aRhs)
134 return aLhs == aRhs;
137 static bool
138 lt(char_type aLhs, char_type aRhs)
140 return aLhs < aRhs;
144 // operations on s[n] arrays:
146 static char_type*
147 move(char_type* aStr1, const char_type* aStr2, size_t aN)
149 return static_cast<char_type*>(memmove(aStr1, aStr2,
150 aN * sizeof(char_type)));
153 static char_type*
154 copy(char_type* aStr1, const char_type* aStr2, size_t aN)
156 return static_cast<char_type*>(memcpy(aStr1, aStr2,
157 aN * sizeof(char_type)));
160 static char_type*
161 copyASCII(char_type* aStr1, const char* aStr2, size_t aN)
163 for (char_type* s = aStr1; aN--; ++s, ++aStr2) {
164 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
165 *s = static_cast<char_type>(*aStr2);
167 return aStr1;
170 static char_type*
171 assign(char_type* aStr, size_t aN, char_type aChar)
173 char_type* result = aStr;
174 while (aN--) {
175 assign(*aStr++, aChar);
177 return result;
180 static int
181 compare(const char_type* aStr1, const char_type* aStr2, size_t aN)
183 for (; aN--; ++aStr1, ++aStr2) {
184 if (!eq(*aStr1, *aStr2)) {
185 return to_int_type(*aStr1) - to_int_type(*aStr2);
189 return 0;
192 static int
193 compareASCII(const char_type* aStr1, const char* aStr2, size_t aN)
195 for (; aN--; ++aStr1, ++aStr2) {
196 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
197 if (!eq_int_type(to_int_type(*aStr1),
198 to_int_type(static_cast<char_type>(*aStr2)))) {
199 return to_int_type(*aStr1) -
200 to_int_type(static_cast<char_type>(*aStr2));
204 return 0;
207 // this version assumes that s2 is null-terminated and s1 has length n.
208 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
209 // we return 1.
210 static int
211 compareASCIINullTerminated(const char_type* aStr1, size_t aN,
212 const char* aStr2)
214 for (; aN--; ++aStr1, ++aStr2) {
215 if (!*aStr2) {
216 return 1;
218 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
219 if (!eq_int_type(to_int_type(*aStr1),
220 to_int_type(static_cast<char_type>(*aStr2)))) {
221 return to_int_type(*aStr1) -
222 to_int_type(static_cast<char_type>(*aStr2));
226 if (*aStr2) {
227 return -1;
230 return 0;
234 * Convert c to its lower-case form, but only if c is in the ASCII
235 * range. Otherwise leave it alone.
237 static char_type
238 ASCIIToLower(char_type aChar)
240 if (aChar >= 'A' && aChar <= 'Z') {
241 return char_type(aChar + ('a' - 'A'));
244 return aChar;
247 static int
248 compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, size_t aN)
250 for (; aN--; ++aStr1, ++aStr2) {
251 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
252 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
253 "Unexpected uppercase character");
254 char_type lower_s1 = ASCIIToLower(*aStr1);
255 if (lower_s1 != static_cast<char_type>(*aStr2)) {
256 return to_int_type(lower_s1) -
257 to_int_type(static_cast<char_type>(*aStr2));
261 return 0;
264 // this version assumes that s2 is null-terminated and s1 has length n.
265 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
266 // we return 1.
267 static int
268 compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
269 size_t aN, const char* aStr2)
271 for (; aN--; ++aStr1, ++aStr2) {
272 if (!*aStr2) {
273 return 1;
275 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
276 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
277 "Unexpected uppercase character");
278 char_type lower_s1 = ASCIIToLower(*aStr1);
279 if (lower_s1 != static_cast<char_type>(*aStr2)) {
280 return to_int_type(lower_s1) -
281 to_int_type(static_cast<char_type>(*aStr2));
285 if (*aStr2) {
286 return -1;
289 return 0;
292 static size_t
293 length(const char_type* aStr)
295 size_t result = 0;
296 while (!eq(*aStr++, char_type(0))) {
297 ++result;
299 return result;
302 static const char_type*
303 find(const char_type* aStr, size_t aN, char_type aChar)
305 while (aN--) {
306 if (eq(*aStr, aChar)) {
307 return aStr;
309 ++aStr;
312 return 0;
316 template <>
317 struct nsCharTraits<char>
319 typedef char char_type;
320 typedef unsigned char unsigned_char_type;
321 typedef char16_t incompatible_char_type;
323 static char_type* const sEmptyBuffer;
325 static void
326 assign(char_type& aLhs, char_type aRhs)
328 aLhs = aRhs;
332 // integer representation of characters:
334 typedef int int_type;
336 static char_type
337 to_char_type(int_type aChar)
339 return char_type(aChar);
342 static int_type
343 to_int_type(char_type aChar)
345 return int_type(static_cast<unsigned_char_type>(aChar));
348 static bool
349 eq_int_type(int_type aLhs, int_type aRhs)
351 return aLhs == aRhs;
355 // |char_type| comparisons:
357 static bool eq(char_type aLhs, char_type aRhs)
359 return aLhs == aRhs;
362 static bool
363 lt(char_type aLhs, char_type aRhs)
365 return aLhs < aRhs;
369 // operations on s[n] arrays:
371 static char_type*
372 move(char_type* aStr1, const char_type* aStr2, size_t aN)
374 return static_cast<char_type*>(memmove(aStr1, aStr2,
375 aN * sizeof(char_type)));
378 static char_type*
379 copy(char_type* aStr1, const char_type* aStr2, size_t aN)
381 return static_cast<char_type*>(memcpy(aStr1, aStr2,
382 aN * sizeof(char_type)));
385 static char_type*
386 copyASCII(char_type* aStr1, const char* aStr2, size_t aN)
388 return copy(aStr1, aStr2, aN);
391 static char_type*
392 assign(char_type* aStr, size_t aN, char_type aChar)
394 return static_cast<char_type*>(memset(aStr, to_int_type(aChar), aN));
397 static int
398 compare(const char_type* aStr1, const char_type* aStr2, size_t aN)
400 return memcmp(aStr1, aStr2, aN);
403 static int
404 compareASCII(const char_type* aStr1, const char* aStr2, size_t aN)
406 #ifdef DEBUG
407 for (size_t i = 0; i < aN; ++i) {
408 NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character");
410 #endif
411 return compare(aStr1, aStr2, aN);
414 // this version assumes that s2 is null-terminated and s1 has length n.
415 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
416 // we return 1.
417 static int
418 compareASCIINullTerminated(const char_type* aStr1, size_t aN,
419 const char* aStr2)
421 // can't use strcmp here because we don't want to stop when aStr1
422 // contains a null
423 for (; aN--; ++aStr1, ++aStr2) {
424 if (!*aStr2) {
425 return 1;
427 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
428 if (*aStr1 != *aStr2) {
429 return to_int_type(*aStr1) - to_int_type(*aStr2);
433 if (*aStr2) {
434 return -1;
437 return 0;
441 * Convert c to its lower-case form, but only if c is ASCII.
443 static char_type
444 ASCIIToLower(char_type aChar)
446 if (aChar >= 'A' && aChar <= 'Z') {
447 return char_type(aChar + ('a' - 'A'));
450 return aChar;
453 static int
454 compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, size_t aN)
456 for (; aN--; ++aStr1, ++aStr2) {
457 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
458 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
459 "Unexpected uppercase character");
460 char_type lower_s1 = ASCIIToLower(*aStr1);
461 if (lower_s1 != *aStr2) {
462 return to_int_type(lower_s1) - to_int_type(*aStr2);
465 return 0;
468 // this version assumes that s2 is null-terminated and s1 has length n.
469 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
470 // we return 1.
471 static int
472 compareLowerCaseToASCIINullTerminated(const char_type* aStr1, size_t aN,
473 const char* aStr2)
475 for (; aN--; ++aStr1, ++aStr2) {
476 if (!*aStr2) {
477 return 1;
479 NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
480 NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
481 "Unexpected uppercase character");
482 char_type lower_s1 = ASCIIToLower(*aStr1);
483 if (lower_s1 != *aStr2) {
484 return to_int_type(lower_s1) - to_int_type(*aStr2);
488 if (*aStr2) {
489 return -1;
492 return 0;
495 static size_t
496 length(const char_type* aStr)
498 return strlen(aStr);
501 static const char_type*
502 find(const char_type* aStr, size_t aN, char_type aChar)
504 return reinterpret_cast<const char_type*>(memchr(aStr, to_int_type(aChar),
505 aN));
/**
 * Read-side adapter for iterator classes.
 *
 * InputIterator must provide |difference_type| and |value_type| typedefs
 * plus get() and advance() members; every operation forwards to those.
 */
template <class InputIterator>
struct nsCharSourceTraits
{
  typedef typename InputIterator::difference_type difference_type;

  // Distance between the two iterators' get() pointers, narrowed to 32 bits.
  static uint32_t
  readable_distance(const InputIterator& aFirst, const InputIterator& aLast)
  {
    // assumes single fragment
    return static_cast<uint32_t>(aLast.get() - aFirst.get());
  }

  // Pointer to the data at the iterator's current position.
  static const typename InputIterator::value_type*
  read(const InputIterator& aIter)
  {
    const typename InputIterator::value_type* here = aIter.get();
    return here;
  }

  // Moves the iterator by aN through its own advance() member.
  static void
  advance(InputIterator& aStr, difference_type aN)
  {
    aStr.advance(aN);
  }
};
534 template <class CharT>
535 struct nsCharSourceTraits<CharT*>
537 typedef ptrdiff_t difference_type;
539 static uint32_t
540 readable_distance(CharT* aStr)
542 return uint32_t(nsCharTraits<CharT>::length(aStr));
543 // return numeric_limits<uint32_t>::max();
546 static uint32_t
547 readable_distance(CharT* aFirst, CharT* aLast)
549 return uint32_t(aLast - aFirst);
552 static const CharT*
553 read(CharT* aStr)
555 return aStr;
558 static void
559 advance(CharT*& aStr, difference_type aN)
561 aStr += aN;
/**
 * Write-side adapter for iterator classes.
 *
 * OutputIterator must provide a |value_type| typedef and a write() member.
 */
template <class OutputIterator>
struct nsCharSinkTraits
{
  typedef typename OutputIterator::value_type sink_value_type;

  // Hands aN characters starting at aStr to the iterator's write() member.
  static void
  write(OutputIterator& aIter, const sink_value_type* aStr, uint32_t aN)
  {
    aIter.write(aStr, aN);
  }
};
576 template <class CharT>
577 struct nsCharSinkTraits<CharT*>
579 static void
580 write(CharT*& aIter, const CharT* aStr, uint32_t aN)
582 nsCharTraits<CharT>::move(aIter, aStr, aN);
583 aIter += aN;
587 #endif // !defined(nsCharTraits_h___)