1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 2000
20 * the Initial Developer. All Rights Reserved.
23 * Scott Collins <scc@mozilla.org> (original author)
25 * Alternatively, the contents of this file may be used under the terms of
26 * either of the GNU General Public License Version 2 or later (the "GPL"),
27 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #ifndef nsCharTraits_h___
40 #define nsCharTraits_h___
45 #define FORCED_CPP_2BYTE_WCHAR_T
46 // disable special optimizations for now through this hack
48 #if defined(HAVE_CPP_2BYTE_WCHAR_T) && !defined(FORCED_CPP_2BYTE_WCHAR_T)
49 #define USE_CPP_WCHAR_FUNCS
52 #ifdef USE_CPP_WCHAR_FUNCS
54 // for |wmemset|, et al
58 // for |memcpy|, et al
65 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
66 // particular the standalone software updater. In that case stub out
67 // the macros provided by nsDebug.h which are only usable when linking XPCOM
70 #define NS_WARNING(msg)
71 #define NS_ASSERTION(cond, msg)
81 typedef bool nsCharTraits_bool
;
83 typedef PRBool nsCharTraits_bool
;
87 * Some macros for converting PRUnichar (UTF-16) to and from Unicode scalar
90 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
91 * using "surrogate pairs". These consist of a high surrogate, i.e. a code
92 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
93 * in the range U+DC00 - U+DFFF, like this:
95 * U+D800 U+DC00 = U+10000
96 * U+D800 U+DC01 = U+10001
98 * U+DBFF U+DFFE = U+10FFFE
99 * U+DBFF U+DFFF = U+10FFFF
101 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
102 * scalar values and are not well-formed UTF-16 except as high-surrogate /
103 * low-surrogate pairs.
// First code point above the 16-bit range (U+10000); also the offset added
// back when a surrogate pair is combined in SURROGATE_TO_UCS4.
106 #define PLANE1_BASE PRUint32(0x00010000)
107 // High surrogates are in the range 0xD800 -- 0xDBFF
// (the mask discards the low 10 bits, so exactly that range compares equal).
108 #define NS_IS_HIGH_SURROGATE(u) ((PRUint32(u) & 0xFFFFFC00) == 0xD800)
109 // Low surrogates are in the range 0xDC00 -- 0xDFFF
110 #define NS_IS_LOW_SURROGATE(u) ((PRUint32(u) & 0xFFFFFC00) == 0xDC00)
111 // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE
// (one mask that discards the low 11 bits covers all of 0xD800 -- 0xDFFF).
112 #define IS_SURROGATE(u) ((PRUint32(u) & 0xFFFFF800) == 0xD800)
114 // Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF
// Combine a high surrogate h and a low surrogate l into one UCS4 value.
116 // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// Masking with 0x03FF keeps the low 10 bits of each argument, which equals
// the subtractions above when H and L really are high / low surrogates
// (0xD800 and 0xDC00 both have their low 10 bits clear). The macro does not
// verify that precondition itself.
117 // I wonder whether we could somehow assert that H is a high surrogate
118 // and L is a low surrogate
119 #define SURROGATE_TO_UCS4(h, l) (((PRUint32(h) & 0x03FF) << 10) + \
120 (PRUint32(l) & 0x03FF) + PLANE1_BASE)
122 // Extract surrogates from a UCS4 char
123 // Reference: the Unicode standard 4.0, section 3.9
124 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
125 // 0xD7C0 == 0xD800 - 0x0040,
126 // ((c - 0x10000) >> 10) + 0xD800 can be simplified to
127 #define H_SURROGATE(c) PRUnichar(PRUnichar(PRUint32(c) >> 10) + \
129 // where it's to be noted that 0xD7C0 is not bitwise-OR'd
132 // Since 0x10000 & 0x03FF == 0,
133 // (c - 0x10000) & 0x03FF == c & 0x03FF so that
134 // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
135 #define L_SURROGATE(c) PRUnichar(PRUnichar(PRUint32(c) & PRUint32(0x03FF)) | \
// True when ucs is below PLANE1_BASE, i.e. inside the Basic Multilingual Plane.
138 #define IS_IN_BMP(ucs) (PRUint32(ucs) < PLANE1_BASE)
// U+FFFD; the value ENSURE_VALID_CHAR substitutes for invalid input.
139 #define UCS2_REPLACEMENT_CHAR PRUnichar(0xFFFD)
// One past U+10FFFF, the largest value representable with surrogate pairs.
141 #define UCS_END PRUint32(0x00110000)
// Valid means: below UCS_END and not a surrogate code point.
142 #define IS_VALID_CHAR(c) ((PRUint32(c) < UCS_END) && !IS_SURROGATE(c))
// Yields c when it is valid, otherwise UCS2_REPLACEMENT_CHAR.
// NOTE: c is expanded more than once, so pass an expression without side
// effects.
143 #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
// Primary template: empty. All members are supplied by the explicit
// specializations for PRUnichar and char declared below.
145 template <class CharT
> struct nsCharTraits
{};
147 NS_SPECIALIZE_TEMPLATE
148 struct nsCharTraits
<PRUnichar
>
150 typedef PRUnichar char_type
;
151 typedef PRUint16 unsigned_char_type
;
152 typedef char incompatible_char_type
;
154 NS_COM
static char_type
*sEmptyBuffer
;
158 assign( char_type
& lhs
, char_type rhs
)
164 // integer representation of characters:
166 #ifdef USE_CPP_WCHAR_FUNCS
167 typedef wint_t int_type
;
169 typedef int int_type
;
174 to_char_type( int_type c
)
181 to_int_type( char_type c
)
183 return int_type( static_cast<unsigned_char_type
>(c
) );
188 eq_int_type( int_type lhs
, int_type rhs
)
194 // |char_type| comparisons:
198 eq( char_type lhs
, char_type rhs
)
205 lt( char_type lhs
, char_type rhs
)
211 // operations on s[n] arrays:
215 move( char_type
* s1
, const char_type
* s2
, size_t n
)
217 return static_cast<char_type
*>(memmove(s1
, s2
, n
* sizeof(char_type
)));
222 copy( char_type
* s1
, const char_type
* s2
, size_t n
)
224 return static_cast<char_type
*>(memcpy(s1
, s2
, n
* sizeof(char_type
)));
229 copyASCII( char_type
* s1
, const char* s2
, size_t n
)
231 for (char_type
* s
= s1
; n
--; ++s
, ++s2
) {
232 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
240 assign( char_type
* s
, size_t n
, char_type c
)
242 #ifdef USE_CPP_WCHAR_FUNCS
243 return static_cast<char_type
*>(wmemset(s
, to_int_type(c
), n
));
245 char_type
* result
= s
;
254 compare( const char_type
* s1
, const char_type
* s2
, size_t n
)
256 #ifdef USE_CPP_WCHAR_FUNCS
257 return wmemcmp(s1
, s2
, n
);
259 for ( ; n
--; ++s1
, ++s2
)
262 return to_int_type(*s1
) - to_int_type(*s2
);
271 compareASCII( const char_type
* s1
, const char* s2
, size_t n
)
273 for ( ; n
--; ++s1
, ++s2
)
275 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
276 if ( !eq_int_type(to_int_type(*s1
), to_int_type(*s2
)) )
277 return to_int_type(*s1
) - to_int_type(*s2
);
283 // this version assumes that s2 is null-terminated and s1 has length n.
284 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
288 compareASCIINullTerminated( const char_type
* s1
, size_t n
, const char* s2
)
290 for ( ; n
--; ++s1
, ++s2
)
294 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
295 if ( !eq_int_type(to_int_type(*s1
), to_int_type(*s2
)) )
296 return to_int_type(*s1
) - to_int_type(*s2
);
306 * Convert c to its lower-case form, but only if the lower-case form is
307 * ASCII. Otherwise leave it alone.
309 * There are only two non-ASCII Unicode characters whose lowercase
310 * equivalents are ASCII: KELVIN SIGN and LATIN CAPITAL LETTER I WITH
311 * DOT ABOVE. So it's a simple matter to handle those explicitly.
315 ASCIIToLower( char_type c
)
319 if (c
>= 'A' && c
<= 'Z')
320 return char_type(c
+ ('a' - 'A'));
326 if (c
== 0x212A) // KELVIN SIGN
328 if (c
== 0x0130) // LATIN CAPITAL LETTER I WITH DOT ABOVE
336 compareLowerCaseToASCII( const char_type
* s1
, const char* s2
, size_t n
)
338 for ( ; n
--; ++s1
, ++s2
)
340 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
341 NS_ASSERTION(!(*s2
>= 'A' && *s2
<= 'Z'),
342 "Unexpected uppercase character");
343 char_type lower_s1
= ASCIIToLower(*s1
);
344 if ( lower_s1
!= to_char_type(*s2
) )
345 return to_int_type(lower_s1
) - to_int_type(*s2
);
351 // this version assumes that s2 is null-terminated and s1 has length n.
352 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
356 compareLowerCaseToASCIINullTerminated( const char_type
* s1
, size_t n
, const char* s2
)
358 for ( ; n
--; ++s1
, ++s2
)
362 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
363 NS_ASSERTION(!(*s2
>= 'A' && *s2
<= 'Z'),
364 "Unexpected uppercase character");
365 char_type lower_s1
= ASCIIToLower(*s1
);
366 if ( lower_s1
!= to_char_type(*s2
) )
367 return to_int_type(lower_s1
) - to_int_type(*s2
);
378 length( const char_type
* s
)
380 #ifdef USE_CPP_WCHAR_FUNCS
384 while ( !eq(*s
++, char_type(0)) )
392 find( const char_type
* s
, size_t n
, char_type c
)
394 #ifdef USE_CPP_WCHAR_FUNCS
395 return reinterpret_cast<const char_type
*>(wmemchr(s
, to_int_type(c
), n
));
411 typedef streamoff off_type
;
412 typedef streampos pos_type
;
413 typedef mbstate_t state_type
;
419 #ifdef USE_CPP_WCHAR_FUNCS
428 not_eof( int_type c
)
430 return eq_int_type(c
, eof()) ? ~eof() : c
;
433 // static state_type get_state( pos_type );
437 NS_SPECIALIZE_TEMPLATE
438 struct nsCharTraits
<char>
440 typedef char char_type
;
441 typedef unsigned char unsigned_char_type
;
442 typedef PRUnichar incompatible_char_type
;
444 NS_COM
static char_type
*sEmptyBuffer
;
448 assign( char_type
& lhs
, char_type rhs
)
454 // integer representation of characters:
456 typedef int int_type
;
460 to_char_type( int_type c
)
467 to_int_type( char_type c
)
469 return int_type( static_cast<unsigned_char_type
>(c
) );
474 eq_int_type( int_type lhs
, int_type rhs
)
480 // |char_type| comparisons:
484 eq( char_type lhs
, char_type rhs
)
491 lt( char_type lhs
, char_type rhs
)
497 // operations on s[n] arrays:
501 move( char_type
* s1
, const char_type
* s2
, size_t n
)
503 return static_cast<char_type
*>(memmove(s1
, s2
, n
* sizeof(char_type
)));
508 copy( char_type
* s1
, const char_type
* s2
, size_t n
)
510 return static_cast<char_type
*>(memcpy(s1
, s2
, n
* sizeof(char_type
)));
515 copyASCII( char_type
* s1
, const char* s2
, size_t n
)
517 return copy(s1
, s2
, n
);
522 assign( char_type
* s
, size_t n
, char_type c
)
524 return static_cast<char_type
*>(memset(s
, to_int_type(c
), n
));
529 compare( const char_type
* s1
, const char_type
* s2
, size_t n
)
531 return memcmp(s1
, s2
, n
);
536 compareASCII( const char_type
* s1
, const char* s2
, size_t n
)
539 for (size_t i
= 0; i
< n
; ++i
)
541 NS_ASSERTION(!(s2
[i
] & ~0x7F), "Unexpected non-ASCII character");
544 return compare(s1
, s2
, n
);
547 // this version assumes that s2 is null-terminated and s1 has length n.
548 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
552 compareASCIINullTerminated( const char_type
* s1
, size_t n
, const char* s2
)
554 // can't use strcmp here because we don't want to stop when s1
556 for ( ; n
--; ++s1
, ++s2
)
560 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
562 return to_int_type(*s1
) - to_int_type(*s2
);
572 * Convert c to its lower-case form, but only if c is ASCII.
576 ASCIIToLower( char_type c
)
578 if (c
>= 'A' && c
<= 'Z')
579 return char_type(c
+ ('a' - 'A'));
586 compareLowerCaseToASCII( const char_type
* s1
, const char* s2
, size_t n
)
588 for ( ; n
--; ++s1
, ++s2
)
590 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
591 NS_ASSERTION(!(*s2
>= 'A' && *s2
<= 'Z'),
592 "Unexpected uppercase character");
593 char_type lower_s1
= ASCIIToLower(*s1
);
594 if ( lower_s1
!= *s2
)
595 return to_int_type(lower_s1
) - to_int_type(*s2
);
600 // this version assumes that s2 is null-terminated and s1 has length n.
601 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
605 compareLowerCaseToASCIINullTerminated( const char_type
* s1
, size_t n
, const char* s2
)
607 for ( ; n
--; ++s1
, ++s2
)
611 NS_ASSERTION(!(*s2
& ~0x7F), "Unexpected non-ASCII character");
612 NS_ASSERTION(!(*s2
>= 'A' && *s2
<= 'Z'),
613 "Unexpected uppercase character");
614 char_type lower_s1
= ASCIIToLower(*s1
);
615 if ( lower_s1
!= *s2
)
616 return to_int_type(lower_s1
) - to_int_type(*s2
);
627 length( const char_type
* s
)
634 find( const char_type
* s
, size_t n
, char_type c
)
636 return reinterpret_cast<const char_type
*>(memchr(s
, to_int_type(c
), n
));
642 typedef streamoff off_type
;
643 typedef streampos pos_type
;
644 typedef mbstate_t state_type
;
655 not_eof( int_type c
)
657 return eq_int_type(c
, eof()) ? ~eof() : c
;
660 // static state_type get_state( pos_type );
664 template <class InputIterator
>
665 struct nsCharSourceTraits
667 typedef typename
InputIterator::difference_type difference_type
;
671 readable_distance( const InputIterator
& first
, const InputIterator
& last
)
673 // assumes single fragment
674 return PRUint32(last
.get() - first
.get());
678 const typename
InputIterator::value_type
*
679 read( const InputIterator
& iter
)
686 advance( InputIterator
& s
, difference_type n
)
692 #ifdef HAVE_CPP_PARTIAL_SPECIALIZATION
694 template <class CharT
>
695 struct nsCharSourceTraits
<CharT
*>
697 typedef ptrdiff_t difference_type
;
701 readable_distance( CharT
* s
)
703 return PRUint32(nsCharTraits
<CharT
>::length(s
));
704 // return numeric_limits<PRUint32>::max();
709 readable_distance( CharT
* first
, CharT
* last
)
711 return PRUint32(last
-first
);
723 advance( CharT
*& s
, difference_type n
)
731 NS_SPECIALIZE_TEMPLATE
732 struct nsCharSourceTraits
<const char*>
734 typedef ptrdiff_t difference_type
;
738 readable_distance( const char* s
)
740 return PRUint32(nsCharTraits
<char>::length(s
));
741 // return numeric_limits<PRUint32>::max();
746 readable_distance( const char* first
, const char* last
)
748 return PRUint32(last
-first
);
753 read( const char* s
)
760 advance( const char*& s
, difference_type n
)
767 NS_SPECIALIZE_TEMPLATE
768 struct nsCharSourceTraits
<const PRUnichar
*>
770 typedef ptrdiff_t difference_type
;
774 readable_distance( const PRUnichar
* s
)
776 return PRUint32(nsCharTraits
<PRUnichar
>::length(s
));
777 // return numeric_limits<PRUint32>::max();
782 readable_distance( const PRUnichar
* first
, const PRUnichar
* last
)
784 return PRUint32(last
-first
);
789 read( const PRUnichar
* s
)
796 advance( const PRUnichar
*& s
, difference_type n
)
805 template <class OutputIterator
>
806 struct nsCharSinkTraits
810 write( OutputIterator
& iter
, const typename
OutputIterator::value_type
* s
, PRUint32 n
)
816 #ifdef HAVE_CPP_PARTIAL_SPECIALIZATION
818 template <class CharT
>
819 struct nsCharSinkTraits
<CharT
*>
823 write( CharT
*& iter
, const CharT
* s
, PRUint32 n
)
825 nsCharTraits
<CharT
>::move(iter
, s
, n
);
832 NS_SPECIALIZE_TEMPLATE
833 struct nsCharSinkTraits
<char*>
837 write( char*& iter
, const char* s
, PRUint32 n
)
839 nsCharTraits
<char>::move(iter
, s
, n
);
844 NS_SPECIALIZE_TEMPLATE
845 struct nsCharSinkTraits
<PRUnichar
*>
849 write( PRUnichar
*& iter
, const PRUnichar
* s
, PRUint32 n
)
851 nsCharTraits
<PRUnichar
>::move(iter
, s
, n
);
858 #endif // !defined(nsCharTraits_h___)