1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef nsCharTraits_h___
8 #define nsCharTraits_h___
10 #include <ctype.h> // for |EOF|, |WEOF|
11 #include <stdint.h> // for |uint32_t|
12 #include <string.h> // for |memcpy|, et al
13 #include "mozilla/MemoryChecking.h"
// This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
// particular the standalone software updater. In that case, stub out the
// macros provided by nsDebug.h, which are only usable when linking against
// XPCOM.
20 # define NS_WARNING(msg)
21 # define NS_ASSERTION(cond, msg)
22 # define NS_ERROR(msg)
24 # include "nsDebug.h" // for NS_ASSERTION
/*
 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
 * values.
 *
 * Note that UTF-16 represents the Unicode scalar values above U+FFFF (up to
 * U+10FFFF) by using "surrogate pairs". These consist of a high surrogate,
 * i.e. a code point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a
 * code point in the range U+DC00 - U+DFFF, like this:
 *
 *  U+D800 U+DC00 =  U+10000
 *  U+D800 U+DC01 =  U+10001
 *  ...
 *  U+DBFF U+DFFE = U+10FFFE
 *  U+DBFF U+DFFF = U+10FFFF
 *
 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
 * scalar values and are not well-formed UTF-16 except as high-surrogate /
 * low-surrogate pairs.
 */
// First code point beyond the Basic Multilingual Plane (U+10000).
#define PLANE1_BASE static_cast<uint32_t>(0x00010000)

// High (leading) surrogates occupy the range 0xD800 -- 0xDBFF.
#define NS_IS_HIGH_SURROGATE(u) \
  ((static_cast<uint32_t>(u) & 0xFFFFFC00) == 0xD800)

// Low (trailing) surrogates occupy the range 0xDC00 -- 0xDFFF.
#define NS_IS_LOW_SURROGATE(u) \
  ((static_cast<uint32_t>(u) & 0xFFFFFC00) == 0xDC00)

// Shorthand for NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l).
#define NS_IS_SURROGATE_PAIR(h, l) \
  (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l))

// One mask test covering both surrogate ranges (0xD800 -- 0xDFFF); faster
// than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE.
#define IS_SURROGATE(u) ((static_cast<uint32_t>(u) & 0xFFFFF800) == 0xD800)

// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

// Combine a surrogate pair into the code point it encodes:
//   N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// The arguments are only masked, never validated, so the caller must make
// sure that H is a high surrogate and L is a low surrogate.
#define SURROGATE_TO_UCS4(h, l)                                 \
  (PLANE1_BASE + ((static_cast<uint32_t>(h) & 0x03FF) << 10) + \
   (static_cast<uint32_t>(l) & 0x03FF))

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and
// 0xD7C0 == 0xD800 - 0x0080,
// ((c - 0x10000) >> 10) + 0xD800 can be simplified to (c >> 10) + 0xD7C0.
// Note that 0xD7C0 is added, not bitwise-OR'd.
#define H_SURROGATE(c) \
  static_cast<char16_t>((static_cast<uint32_t>(c) >> 10) + 0xD7C0)

// Since 0x10000 & 0x03FF == 0,
// (c - 0x10000) & 0x03FF == c & 0x03FF so that
// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to (c & 0x03FF) | 0xDC00.
#define L_SURROGATE(c) \
  static_cast<char16_t>((static_cast<uint32_t>(c) & 0x03FF) | 0xDC00)

// True when |ucs| lies in the Basic Multilingual Plane (below U+10000).
#define IS_IN_BMP(ucs) (static_cast<uint32_t>(ucs) < PLANE1_BASE)

// U+FFFD REPLACEMENT CHARACTER as a single UTF-16 code unit.
#define UCS2_REPLACEMENT_CHAR static_cast<char16_t>(0xFFFD)

// One past the largest Unicode code point (U+10FFFF).
#define UCS_END static_cast<uint32_t>(0x00110000)

// A valid character is below UCS_END and is not a surrogate code point.
#define IS_VALID_CHAR(c) \
  ((static_cast<uint32_t>(c) < UCS_END) && !IS_SURROGATE(c))

// Yields |c| when it is valid and U+FFFD otherwise. |c| is evaluated more
// than once, so it must not have side effects.
#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
// Primary template: declares no members. The character operations are
// supplied by the specializations for |char16_t| and |char| in this file.
template <typename CharT>
struct nsCharTraits {
};
92 struct nsCharTraits
<char16_t
> {
93 typedef char16_t char_type
;
94 typedef uint16_t unsigned_char_type
;
95 typedef char incompatible_char_type
;
97 static char_type
* const sEmptyBuffer
;
99 // integer representation of characters:
100 typedef int int_type
;
102 static char_type
to_char_type(int_type aChar
) { return char_type(aChar
); }
104 static int_type
to_int_type(char_type aChar
) {
105 return int_type(static_cast<unsigned_char_type
>(aChar
));
108 static bool eq_int_type(int_type aLhs
, int_type aRhs
) { return aLhs
== aRhs
; }
110 // |char_type| comparisons:
112 static bool eq(char_type aLhs
, char_type aRhs
) { return aLhs
== aRhs
; }
114 static bool lt(char_type aLhs
, char_type aRhs
) { return aLhs
< aRhs
; }
116 // operations on s[n] arrays:
118 static char_type
* move(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
) {
119 return static_cast<char_type
*>(
120 memmove(aStr1
, aStr2
, aN
* sizeof(char_type
)));
123 static char_type
* copy(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
) {
124 return static_cast<char_type
*>(
125 memcpy(aStr1
, aStr2
, aN
* sizeof(char_type
)));
128 static void uninitialize(char_type
* aStr
, size_t aN
) {
130 memset(aStr
, 0xE4, aN
* sizeof(char_type
));
132 MOZ_MAKE_MEM_UNDEFINED(aStr
, aN
* sizeof(char_type
));
135 static char_type
* copyASCII(char_type
* aStr1
, const char* aStr2
, size_t aN
) {
136 for (char_type
* s
= aStr1
; aN
--; ++s
, ++aStr2
) {
137 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
138 *s
= static_cast<char_type
>(*aStr2
);
143 static int compare(const char_type
* aStr1
, const char_type
* aStr2
,
145 for (; aN
--; ++aStr1
, ++aStr2
) {
146 if (!eq(*aStr1
, *aStr2
)) {
147 return to_int_type(*aStr1
) - to_int_type(*aStr2
);
154 static int compareASCII(const char_type
* aStr1
, const char* aStr2
,
156 for (; aN
--; ++aStr1
, ++aStr2
) {
157 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
158 if (!eq_int_type(to_int_type(*aStr1
),
159 to_int_type(static_cast<char_type
>(*aStr2
)))) {
160 return to_int_type(*aStr1
) -
161 to_int_type(static_cast<char_type
>(*aStr2
));
168 static bool equalsLatin1(const char_type
* aStr1
, const char* aStr2
,
170 for (size_t i
= aN
; i
> 0; --i
, ++aStr1
, ++aStr2
) {
171 if (*aStr1
!= static_cast<char_type
>(*aStr2
)) {
179 // this version assumes that s2 is null-terminated and s1 has length n.
180 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
182 static int compareASCIINullTerminated(const char_type
* aStr1
, size_t aN
,
184 for (; aN
--; ++aStr1
, ++aStr2
) {
188 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
189 if (!eq_int_type(to_int_type(*aStr1
),
190 to_int_type(static_cast<char_type
>(*aStr2
)))) {
191 return to_int_type(*aStr1
) -
192 to_int_type(static_cast<char_type
>(*aStr2
));
204 * Convert c to its lower-case form, but only if c is in the ASCII
205 * range. Otherwise leave it alone.
207 static char_type
ASCIIToLower(char_type aChar
) {
208 if (aChar
>= 'A' && aChar
<= 'Z') {
209 return char_type(aChar
+ ('a' - 'A'));
215 static int compareLowerCaseToASCII(const char_type
* aStr1
, const char* aStr2
,
217 for (; aN
--; ++aStr1
, ++aStr2
) {
218 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
219 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
220 "Unexpected uppercase character");
221 char_type lower_s1
= ASCIIToLower(*aStr1
);
222 if (lower_s1
!= static_cast<char_type
>(*aStr2
)) {
223 return to_int_type(lower_s1
) -
224 to_int_type(static_cast<char_type
>(*aStr2
));
231 // this version assumes that s2 is null-terminated and s1 has length n.
232 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
234 static int compareLowerCaseToASCIINullTerminated(const char_type
* aStr1
,
237 for (; aN
--; ++aStr1
, ++aStr2
) {
241 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
242 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
243 "Unexpected uppercase character");
244 char_type lower_s1
= ASCIIToLower(*aStr1
);
245 if (lower_s1
!= static_cast<char_type
>(*aStr2
)) {
246 return to_int_type(lower_s1
) -
247 to_int_type(static_cast<char_type
>(*aStr2
));
258 static size_t length(const char_type
* aStr
) {
260 while (!eq(*aStr
++, char_type(0))) {
266 static const char_type
* find(const char_type
* aStr
, size_t aN
,
269 if (eq(*aStr
, aChar
)) {
280 struct nsCharTraits
<char> {
281 typedef char char_type
;
282 typedef unsigned char unsigned_char_type
;
283 typedef char16_t incompatible_char_type
;
285 static char_type
* const sEmptyBuffer
;
287 // integer representation of characters:
289 typedef int int_type
;
291 static char_type
to_char_type(int_type aChar
) { return char_type(aChar
); }
293 static int_type
to_int_type(char_type aChar
) {
294 return int_type(static_cast<unsigned_char_type
>(aChar
));
297 static bool eq_int_type(int_type aLhs
, int_type aRhs
) { return aLhs
== aRhs
; }
299 // |char_type| comparisons:
301 static bool eq(char_type aLhs
, char_type aRhs
) { return aLhs
== aRhs
; }
303 static bool lt(char_type aLhs
, char_type aRhs
) { return aLhs
< aRhs
; }
305 // operations on s[n] arrays:
307 static char_type
* move(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
) {
308 return static_cast<char_type
*>(
309 memmove(aStr1
, aStr2
, aN
* sizeof(char_type
)));
312 static char_type
* copy(char_type
* aStr1
, const char_type
* aStr2
, size_t aN
) {
313 return static_cast<char_type
*>(
314 memcpy(aStr1
, aStr2
, aN
* sizeof(char_type
)));
317 static void uninitialize(char_type
* aStr
, size_t aN
) {
319 memset(aStr
, 0xE4, aN
* sizeof(char_type
));
321 MOZ_MAKE_MEM_UNDEFINED(aStr
, aN
* sizeof(char_type
));
324 static char_type
* copyASCII(char_type
* aStr1
, const char* aStr2
, size_t aN
) {
325 return copy(aStr1
, aStr2
, aN
);
328 static int compare(const char_type
* aStr1
, const char_type
* aStr2
,
330 return memcmp(aStr1
, aStr2
, aN
);
333 static int compareASCII(const char_type
* aStr1
, const char* aStr2
,
336 for (size_t i
= 0; i
< aN
; ++i
) {
337 NS_ASSERTION(!(aStr2
[i
] & ~0x7F), "Unexpected non-ASCII character");
340 return compare(aStr1
, aStr2
, aN
);
343 static bool equalsLatin1(const char_type
* aStr1
, const char* aStr2
,
345 return memcmp(aStr1
, aStr2
, aN
) == 0;
348 // this version assumes that s2 is null-terminated and s1 has length n.
349 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
351 static int compareASCIINullTerminated(const char_type
* aStr1
, size_t aN
,
353 // can't use strcmp here because we don't want to stop when aStr1
355 for (; aN
--; ++aStr1
, ++aStr2
) {
359 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
360 if (*aStr1
!= *aStr2
) {
361 return to_int_type(*aStr1
) - to_int_type(*aStr2
);
373 * Convert c to its lower-case form, but only if c is ASCII.
375 static char_type
ASCIIToLower(char_type aChar
) {
376 if (aChar
>= 'A' && aChar
<= 'Z') {
377 return char_type(aChar
+ ('a' - 'A'));
383 static int compareLowerCaseToASCII(const char_type
* aStr1
, const char* aStr2
,
385 for (; aN
--; ++aStr1
, ++aStr2
) {
386 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
387 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
388 "Unexpected uppercase character");
389 char_type lower_s1
= ASCIIToLower(*aStr1
);
390 if (lower_s1
!= *aStr2
) {
391 return to_int_type(lower_s1
) - to_int_type(*aStr2
);
397 // this version assumes that s2 is null-terminated and s1 has length n.
398 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
400 static int compareLowerCaseToASCIINullTerminated(const char_type
* aStr1
,
403 for (; aN
--; ++aStr1
, ++aStr2
) {
407 NS_ASSERTION(!(*aStr2
& ~0x7F), "Unexpected non-ASCII character");
408 NS_ASSERTION(!(*aStr2
>= 'A' && *aStr2
<= 'Z'),
409 "Unexpected uppercase character");
410 char_type lower_s1
= ASCIIToLower(*aStr1
);
411 if (lower_s1
!= *aStr2
) {
412 return to_int_type(lower_s1
) - to_int_type(*aStr2
);
423 static size_t length(const char_type
* aStr
) { return strlen(aStr
); }
425 static const char_type
* find(const char_type
* aStr
, size_t aN
,
427 return reinterpret_cast<const char_type
*>(
428 memchr(aStr
, to_int_type(aChar
), aN
));
432 template <class InputIterator
>
433 struct nsCharSourceTraits
{
434 typedef typename
InputIterator::difference_type difference_type
;
436 static difference_type
readable_distance(const InputIterator
& aFirst
,
437 const InputIterator
& aLast
) {
438 // assumes single fragment
439 return aLast
.get() - aFirst
.get();
442 static const typename
InputIterator::value_type
* read(
443 const InputIterator
& aIter
) {
447 static void advance(InputIterator
& aStr
, difference_type aN
) {
452 template <class CharT
>
453 struct nsCharSourceTraits
<CharT
*> {
454 typedef ptrdiff_t difference_type
;
456 static difference_type
readable_distance(CharT
* aStr
) {
457 return nsCharTraits
<CharT
>::length(aStr
);
460 static difference_type
readable_distance(CharT
* aFirst
, CharT
* aLast
) {
461 return aLast
- aFirst
;
464 static const CharT
* read(CharT
* aStr
) { return aStr
; }
466 static void advance(CharT
*& aStr
, difference_type aN
) { aStr
+= aN
; }
469 template <class OutputIterator
>
470 struct nsCharSinkTraits
{
471 static void write(OutputIterator
& aIter
,
472 const typename
OutputIterator::value_type
* aStr
,
474 aIter
.write(aStr
, aN
);
478 template <class CharT
>
479 struct nsCharSinkTraits
<CharT
*> {
480 static void write(CharT
*& aIter
, const CharT
* aStr
, size_t aN
) {
481 nsCharTraits
<CharT
>::move(aIter
, aStr
, aN
);
486 #endif // !defined(nsCharTraits_h___)