1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef nsCharTraits_h___
8 #define nsCharTraits_h___
10 #include <ctype.h> // for |EOF|, |WEOF|
11 #include <string.h> // for |memcpy|, et al
13 #include "nscore.h" // for |char16_t|
15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
16 // particular the standalone software updater. In that case stub out
17 // the macros provided by nsDebug.h which are only usable when linking XPCOM
// Stub for NS_WARNING: expands to nothing, so |msg| is never evaluated.
// NOTE(review): the preprocessor condition selecting these stubs over
// nsDebug.h is not visible in this excerpt -- confirm against the full file.
20 #define NS_WARNING(msg)
// Stub for NS_ASSERTION: expands to nothing, so neither |cond| nor |msg| is
// evaluated.
21 #define NS_ASSERTION(cond, msg)
24 #include "nsDebug.h" // for NS_ASSERTION
28 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
31 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
32 * using "surrogate pairs". These consist of a high surrogate, i.e. a code
33 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
34 * in the range U+DC00 - U+DFFF, like this:
36 * U+D800 U+DC00 = U+10000
37 * U+D800 U+DC01 = U+10001
39 * U+DBFF U+DFFE = U+10FFFE
40 * U+DBFF U+DFFF = U+10FFFF
42 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
43 * scalar values and are not well-formed UTF-16 except as high-surrogate /
44 * low-surrogate pairs.
// First value outside the Basic Multilingual Plane (U+10000); values encoded
// by surrogate pairs start here.
47 #define PLANE1_BASE uint32_t(0x00010000)
48 // High surrogates are in the range 0xD800 -- 0xDBFF
// (clearing the low 10 bits of any such value leaves exactly 0xD800).
49 #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)
50 // Low surrogates are in the range 0xDC00 -- 0xDFFF
// (clearing the low 10 bits of any such value leaves exactly 0xDC00).
51 #define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)
52 // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE
// (one mask of the low 11 bits covers the whole range 0xD800 -- 0xDFFF).
53 #define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)
55 // Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF
57 // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
58 // I wonder whether we could somehow assert that H is a high surrogate
59 // and L is a low surrogate
// As written no checking is done: each argument is simply masked to its low
// 10 bits, so non-surrogate inputs still produce a value instead of an error.
60 #define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \
61 (uint32_t(l) & 0x03FF) + PLANE1_BASE)
63 // Extract surrogates from a UCS4 char
64 // Reference: the Unicode standard 4.0, section 3.9
65 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
66 // 0xD7C0 == 0xD800 - 0x0040,
67 // ((c - 0x10000) >> 10) + 0xD800 can be simplified to
68 #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \
70 // where it's to be noted that 0xD7C0 is not bitwise-OR'd
73 // Since 0x10000 & 0x03FF == 0,
74 // (c - 0x10000) & 0x03FF == c & 0x03FF so that
75 // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
76 #define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \
// True when |ucs| is below PLANE1_BASE, i.e. fits in a single 16-bit unit.
79 #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
// U+FFFD, the value substituted by ENSURE_VALID_CHAR for invalid input.
80 #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)
// One past the largest value representable with surrogate pairs (U+10FFFF).
82 #define UCS_END uint32_t(0x00110000)
// True when |c| is below UCS_END and is not a surrogate code point.
// |c| appears twice in the expansion.
83 #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
// Yields |c| unchanged when IS_VALID_CHAR(c) holds, otherwise
// UCS2_REPLACEMENT_CHAR. |c| appears more than once in the expansion, so pass
// an expression without side effects.
84 #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
86 template <class CharT
>
92 struct nsCharTraits
<char16_t
>
94 typedef char16_t char_type
;
95 typedef uint16_t unsigned_char_type
;
96 typedef char incompatible_char_type
;
98 static char_type
* const sEmptyBuffer
;
101 assign(char_type
& aLhs
, char_type aRhs
)
107 // integer representation of characters:
108 typedef int int_type
;
111 to_char_type(int_type aChar
)
113 return char_type(aChar
);
117 to_int_type(char_type aChar
)
119 return int_type(static_cast<unsigned_char_type
>(aChar
));
123 eq_int_type(int_type aLhs
, int_type aRhs
)
129 // |char_type| comparisons:
132 eq(char_type aLhs
, char_type aRhs
)
138 lt(char_type aLhs
, char_type aRhs
)
144 // operations on s[n] arrays:
147 move(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
149 return static_cast<char_type
*>(memmove(aStr1
, aStr2
,
150 aN
* sizeof(char_type
)));
154 copy(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
156 return static_cast<char_type
*>(memcpy(aStr1
, aStr2
,
157 aN
* sizeof(char_type
)));
161 copyASCII(char_type
* aStr1
, const char* aStr2
, size_t aN
)
163 for (char_type
* s
= aStr1
; aN
--; ++s
, ++aStr2
) {
164 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
165 *s
= static_cast<char_type
>(*aStr2
);
171 assign(char_type
* aStr
, size_t aN
, char_type aChar
)
173 char_type
* result
= aStr
;
175 assign(*aStr
++, aChar
);
181 compare(const char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
183 for (; aN
--; ++aStr1
, ++aStr2
) {
184 if (!eq(*aStr1
, *aStr2
)) {
185 return to_int_type(*aStr1
) - to_int_type(*aStr2
);
193 compareASCII(const char_type
* aStr1
, const char* aStr2
, size_t aN
)
195 for (; aN
--; ++aStr1
, ++aStr2
) {
196 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
197 if (!eq_int_type(to_int_type(*aStr1
),
198 to_int_type(static_cast<char_type
>(*aStr2
)))) {
199 return to_int_type(*aStr1
) -
200 to_int_type(static_cast<char_type
>(*aStr2
));
207 // this version assumes that s2 is null-terminated and s1 has length n.
208 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
211 compareASCIINullTerminated(const char_type
* aStr1
, size_t aN
,
214 for (; aN
--; ++aStr1
, ++aStr2
) {
218 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
219 if (!eq_int_type(to_int_type(*aStr1
),
220 to_int_type(static_cast<char_type
>(*aStr2
)))) {
221 return to_int_type(*aStr1
) -
222 to_int_type(static_cast<char_type
>(*aStr2
));
234 * Convert aChar to its lower-case form, but only if aChar is in the ASCII
235 * range. Otherwise leave it alone.
238 ASCIIToLower(char_type aChar
)
240 if (aChar
>= 'A' && aChar
<= 'Z') {
241 return char_type(aChar
+ ('a' - 'A'));
248 compareLowerCaseToASCII(const char_type
* aStr1
, const char* aStr2
, size_t aN
)
250 for (; aN
--; ++aStr1
, ++aStr2
) {
251 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
252 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
253 "Unexpected uppercase character");
254 char_type lower_s1
= ASCIIToLower(*aStr1
);
255 if (lower_s1
!= static_cast<char_type
>(*aStr2
)) {
256 return to_int_type(lower_s1
) -
257 to_int_type(static_cast<char_type
>(*aStr2
));
264 // this version assumes that s2 is null-terminated and s1 has length n.
265 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
268 compareLowerCaseToASCIINullTerminated(const char_type
* aStr1
,
269 size_t aN
, const char* aStr2
)
271 for (; aN
--; ++aStr1
, ++aStr2
) {
275 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
276 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
277 "Unexpected uppercase character");
278 char_type lower_s1
= ASCIIToLower(*aStr1
);
279 if (lower_s1
!= static_cast<char_type
>(*aStr2
)) {
280 return to_int_type(lower_s1
) -
281 to_int_type(static_cast<char_type
>(*aStr2
));
293 length(const char_type
* aStr
)
296 while (!eq(*aStr
++, char_type(0))) {
302 static const char_type
*
303 find(const char_type
* aStr
, size_t aN
, char_type aChar
)
306 if (eq(*aStr
, aChar
)) {
317 struct nsCharTraits
<char>
319 typedef char char_type
;
320 typedef unsigned char unsigned_char_type
;
321 typedef char16_t incompatible_char_type
;
323 static char_type
* const sEmptyBuffer
;
326 assign(char_type
& aLhs
, char_type aRhs
)
332 // integer representation of characters:
334 typedef int int_type
;
337 to_char_type(int_type aChar
)
339 return char_type(aChar
);
343 to_int_type(char_type aChar
)
345 return int_type(static_cast<unsigned_char_type
>(aChar
));
349 eq_int_type(int_type aLhs
, int_type aRhs
)
355 // |char_type| comparisons:
357 static bool eq(char_type aLhs
, char_type aRhs
)
363 lt(char_type aLhs
, char_type aRhs
)
369 // operations on s[n] arrays:
372 move(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
374 return static_cast<char_type
*>(memmove(aStr1
, aStr2
,
375 aN
* sizeof(char_type
)));
379 copy(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
381 return static_cast<char_type
*>(memcpy(aStr1
, aStr2
,
382 aN
* sizeof(char_type
)));
386 copyASCII(char_type
* aStr1
, const char* aStr2
, size_t aN
)
388 return copy(aStr1
, aStr2
, aN
);
392 assign(char_type
* aStr
, size_t aN
, char_type aChar
)
394 return static_cast<char_type
*>(memset(aStr
, to_int_type(aChar
), aN
));
398 compare(const char_type
* aStr1
, const char_type
* aStr2
, size_t aN
)
400 return memcmp(aStr1
, aStr2
, aN
);
404 compareASCII(const char_type
* aStr1
, const char* aStr2
, size_t aN
)
407 for (size_t i
= 0; i
< aN
; ++i
) {
408 NS_ASSERTION(!(aStr2
[i
] & ~0x7F), "Unexpected non-ASCII character");
411 return compare(aStr1
, aStr2
, aN
);
414 // this version assumes that s2 is null-terminated and s1 has length n.
415 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
418 compareASCIINullTerminated(const char_type
* aStr1
, size_t aN
,
421 // can't use strcmp here because we don't want to stop when aStr1
423 for (; aN
--; ++aStr1
, ++aStr2
) {
427 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
428 if (*aStr1
!= *aStr2
) {
429 return to_int_type(*aStr1
) - to_int_type(*aStr2
);
441 * Convert aChar to its lower-case form, but only if aChar is ASCII.
444 ASCIIToLower(char_type aChar
)
446 if (aChar
>= 'A' && aChar
<= 'Z') {
447 return char_type(aChar
+ ('a' - 'A'));
454 compareLowerCaseToASCII(const char_type
* aStr1
, const char* aStr2
, size_t aN
)
456 for (; aN
--; ++aStr1
, ++aStr2
) {
457 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
458 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
459 "Unexpected uppercase character");
460 char_type lower_s1
= ASCIIToLower(*aStr1
);
461 if (lower_s1
!= *aStr2
) {
462 return to_int_type(lower_s1
) - to_int_type(*aStr2
);
468 // this version assumes that s2 is null-terminated and s1 has length n.
469 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
472 compareLowerCaseToASCIINullTerminated(const char_type
* aStr1
, size_t aN
,
475 for (; aN
--; ++aStr1
, ++aStr2
) {
479 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
480 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
481 "Unexpected uppercase character");
482 char_type lower_s1
= ASCIIToLower(*aStr1
);
483 if (lower_s1
!= *aStr2
) {
484 return to_int_type(lower_s1
) - to_int_type(*aStr2
);
496 length(const char_type
* aStr
)
501 static const char_type
*
502 find(const char_type
* aStr
, size_t aN
, char_type aChar
)
504 return reinterpret_cast<const char_type
*>(memchr(aStr
, to_int_type(aChar
),
509 template <class InputIterator
>
510 struct nsCharSourceTraits
512 typedef typename
InputIterator::difference_type difference_type
;
515 readable_distance(const InputIterator
& aFirst
, const InputIterator
& aLast
)
517 // assumes single fragment
518 return uint32_t(aLast
.get() - aFirst
.get());
521 static const typename
InputIterator::value_type
*
522 read(const InputIterator
& aIter
)
528 advance(InputIterator
& aStr
, difference_type aN
)
534 template <class CharT
>
535 struct nsCharSourceTraits
<CharT
*>
537 typedef ptrdiff_t difference_type
;
540 readable_distance(CharT
* aStr
)
542 return uint32_t(nsCharTraits
<CharT
>::length(aStr
));
543 // return numeric_limits<uint32_t>::max();
547 readable_distance(CharT
* aFirst
, CharT
* aLast
)
549 return uint32_t(aLast
- aFirst
);
559 advance(CharT
*& aStr
, difference_type aN
)
565 template <class OutputIterator
>
566 struct nsCharSinkTraits
569 write(OutputIterator
& aIter
, const typename
OutputIterator::value_type
* aStr
,
572 aIter
.write(aStr
, aN
);
576 template <class CharT
>
577 struct nsCharSinkTraits
<CharT
*>
580 write(CharT
*& aIter
, const CharT
* aStr
, uint32_t aN
)
582 nsCharTraits
<CharT
>::move(aIter
, aStr
, aN
);
587 #endif // !defined(nsCharTraits_h___)