xpcom/string/nsCharTraits.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #ifndef nsCharTraits_h___
   8 #define nsCharTraits_h___
   9
  10 #include <ctype.h>   // for |EOF|, |WEOF|
  11 #include <stdint.h>  // for |uint32_t|
  12 #include <string.h>  // for |memcpy|, et al
  13 #include "mozilla/MemoryChecking.h"
  14
  15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
  16 // particular the standalone software updater. In that case stub out
  17 // the macros provided by nsDebug.h which are only usable when linking XPCOM
  18
  19 #ifdef NS_NO_XPCOM
  20 #  define NS_WARNING(msg)
  21 #  define NS_ASSERTION(cond, msg)
  22 #  define NS_ERROR(msg)
  23 #else
  24 #  include "nsDebug.h"  // for NS_ASSERTION
  25 #endif
  26
  27 /*
  28  * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
  29  * values.
  30  *
  31  * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
  32  * using "surrogate pairs". These consist of a high surrogate, i.e. a code
  33  * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
  34  * in the range U+DC00 - U+DFFF, like this:
  35  *
  36  *  U+D800 U+DC00 =  U+10000
  37  *  U+D800 U+DC01 =  U+10001
  38  *  ...
  39  *  U+DBFF U+DFFE = U+10FFFE
  40  *  U+DBFF U+DFFF = U+10FFFF
  41  *
  42  * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
  43  * scalar values and are not well-formed UTF-16 except as high-surrogate /
  44  * low-surrogate pairs.
  45  */
  46
// First code point outside the Basic Multilingual Plane (U+10000); see
// IS_IN_BMP below.
#define PLANE1_BASE uint32_t(0x00010000)
// High surrogates are in the range 0xD800 -- 0xDBFF
#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)
// Low surrogates are in the range 0xDC00 -- 0xDFFF
#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)
// Easier to type than NS_IS_HIGH_SURROGATE && NS_IS_LOW_SURROGATE
#define NS_IS_SURROGATE_PAIR(h, l) \
  (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l))
// Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE
#define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)

// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// I wonder whether we could somehow assert that H is a high surrogate
// and L is a low surrogate
// Note: no validation is performed here; each argument is simply masked to
// its low 10 bits before being combined.
#define SURROGATE_TO_UCS4(h, l) \
  (((uint32_t(h) & 0x03FF) << 10) + (uint32_t(l) & 0x03FF) + PLANE1_BASE)

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
// 0xD7C0 == 0xD800 - 0x0040,
// ((c - 0x10000) >> 10) + 0xD800 can be simplified to
#define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + char16_t(0xD7C0))
// where it's to be noted that 0xD7C0 is not bitwise-OR'd
// but added.

// Since 0x10000 & 0x03FF == 0,
// (c - 0x10000) & 0x03FF == c & 0x03FF so that
// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
#define L_SURROGATE(c) \
  char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | char16_t(0xDC00))

// True when |ucs| is below PLANE1_BASE, i.e. fits in a single UTF-16 code
// unit.
#define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
// U+FFFD as a single char16_t; substituted for invalid values by
// ENSURE_VALID_CHAR.
#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

// One past the largest Unicode scalar value (U+10FFFF).
#define UCS_END uint32_t(0x00110000)
// A valid char is below UCS_END and is not a surrogate code point.
#define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
// Yields |c| when it is valid, UCS2_REPLACEMENT_CHAR otherwise. Like the
// other macros here, this expands its argument more than once, so avoid
// passing expressions with side effects.
#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
  87
  88 template <class CharT>
  89 struct nsCharTraits {};
  90
  91 template <>
  92 struct nsCharTraits<char16_t> {
  93   typedef char16_t char_type;
  94   typedef uint16_t unsigned_char_type;
  95   typedef char incompatible_char_type;
  96
  97   static char_type* const sEmptyBuffer;
  98
  99   // integer representation of characters:
 100   typedef int int_type;
 101
 102   static char_type to_char_type(int_type aChar) { return char_type(aChar); }
 103
 104   static int_type to_int_type(char_type aChar) {
 105     return int_type(static_cast<unsigned_char_type>(aChar));
 106   }
 107
 108   static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
 109
 110   // |char_type| comparisons:
 111
 112   static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
 113
 114   static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
 115
 116   // operations on s[n] arrays:
 117
 118   static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
 119     return static_cast<char_type*>(
 120         memmove(aStr1, aStr2, aN * sizeof(char_type)));
 121   }
 122
 123   static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
 124     return static_cast<char_type*>(
 125         memcpy(aStr1, aStr2, aN * sizeof(char_type)));
 126   }
 127
 128   static void uninitialize(char_type* aStr, size_t aN) {
 129 #ifdef DEBUG
 130     memset(aStr, 0xE4, aN * sizeof(char_type));
 131 #endif
 132     MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
 133   }
 134
 135   static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
 136     for (char_type* s = aStr1; aN--; ++s, ++aStr2) {
 137       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 138       *s = static_cast<char_type>(*aStr2);
 139     }
 140     return aStr1;
 141   }
 142
 143   static int compare(const char_type* aStr1, const char_type* aStr2,
 144                      size_t aN) {
 145     for (; aN--; ++aStr1, ++aStr2) {
 146       if (!eq(*aStr1, *aStr2)) {
 147         return to_int_type(*aStr1) - to_int_type(*aStr2);
 148       }
 149     }
 150
 151     return 0;
 152   }
 153
 154   static int compareASCII(const char_type* aStr1, const char* aStr2,
 155                           size_t aN) {
 156     for (; aN--; ++aStr1, ++aStr2) {
 157       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 158       if (!eq_int_type(to_int_type(*aStr1),
 159                        to_int_type(static_cast<char_type>(*aStr2)))) {
 160         return to_int_type(*aStr1) -
 161                to_int_type(static_cast<char_type>(*aStr2));
 162       }
 163     }
 164
 165     return 0;
 166   }
 167
 168   static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
 169                            const size_t aN) {
 170     for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) {
 171       if (*aStr1 != static_cast<char_type>(*aStr2)) {
 172         return false;
 173       }
 174     }
 175
 176     return true;
 177   }
 178
 179   // this version assumes that s2 is null-terminated and s1 has length n.
 180   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 181   // we return 1.
 182   static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
 183                                         const char* aStr2) {
 184     for (; aN--; ++aStr1, ++aStr2) {
 185       if (!*aStr2) {
 186         return 1;
 187       }
 188       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 189       if (!eq_int_type(to_int_type(*aStr1),
 190                        to_int_type(static_cast<char_type>(*aStr2)))) {
 191         return to_int_type(*aStr1) -
 192                to_int_type(static_cast<char_type>(*aStr2));
 193       }
 194     }
 195
 196     if (*aStr2) {
 197       return -1;
 198     }
 199
 200     return 0;
 201   }
 202
 203   /**
 204    * Convert c to its lower-case form, but only if c is in the ASCII
 205    * range. Otherwise leave it alone.
 206    */
 207   static char_type ASCIIToLower(char_type aChar) {
 208     if (aChar >= 'A' && aChar <= 'Z') {
 209       return char_type(aChar + ('a' - 'A'));
 210     }
 211
 212     return aChar;
 213   }
 214
 215   static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
 216                                      size_t aN) {
 217     for (; aN--; ++aStr1, ++aStr2) {
 218       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 219       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 220                    "Unexpected uppercase character");
 221       char_type lower_s1 = ASCIIToLower(*aStr1);
 222       if (lower_s1 != static_cast<char_type>(*aStr2)) {
 223         return to_int_type(lower_s1) -
 224                to_int_type(static_cast<char_type>(*aStr2));
 225       }
 226     }
 227
 228     return 0;
 229   }
 230
 231   // this version assumes that s2 is null-terminated and s1 has length n.
 232   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 233   // we return 1.
 234   static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
 235                                                    size_t aN,
 236                                                    const char* aStr2) {
 237     for (; aN--; ++aStr1, ++aStr2) {
 238       if (!*aStr2) {
 239         return 1;
 240       }
 241       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 242       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 243                    "Unexpected uppercase character");
 244       char_type lower_s1 = ASCIIToLower(*aStr1);
 245       if (lower_s1 != static_cast<char_type>(*aStr2)) {
 246         return to_int_type(lower_s1) -
 247                to_int_type(static_cast<char_type>(*aStr2));
 248       }
 249     }
 250
 251     if (*aStr2) {
 252       return -1;
 253     }
 254
 255     return 0;
 256   }
 257
 258   static size_t length(const char_type* aStr) {
 259     size_t result = 0;
 260     while (!eq(*aStr++, char_type(0))) {
 261       ++result;
 262     }
 263     return result;
 264   }
 265
 266   static const char_type* find(const char_type* aStr, size_t aN,
 267                                char_type aChar) {
 268     while (aN--) {
 269       if (eq(*aStr, aChar)) {
 270         return aStr;
 271       }
 272       ++aStr;
 273     }
 274
 275     return 0;
 276   }
 277 };
 278
 279 template <>
 280 struct nsCharTraits<char> {
 281   typedef char char_type;
 282   typedef unsigned char unsigned_char_type;
 283   typedef char16_t incompatible_char_type;
 284
 285   static char_type* const sEmptyBuffer;
 286
 287   // integer representation of characters:
 288
 289   typedef int int_type;
 290
 291   static char_type to_char_type(int_type aChar) { return char_type(aChar); }
 292
 293   static int_type to_int_type(char_type aChar) {
 294     return int_type(static_cast<unsigned_char_type>(aChar));
 295   }
 296
 297   static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; }
 298
 299   // |char_type| comparisons:
 300
 301   static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; }
 302
 303   static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; }
 304
 305   // operations on s[n] arrays:
 306
 307   static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) {
 308     return static_cast<char_type*>(
 309         memmove(aStr1, aStr2, aN * sizeof(char_type)));
 310   }
 311
 312   static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) {
 313     return static_cast<char_type*>(
 314         memcpy(aStr1, aStr2, aN * sizeof(char_type)));
 315   }
 316
 317   static void uninitialize(char_type* aStr, size_t aN) {
 318 #ifdef DEBUG
 319     memset(aStr, 0xE4, aN * sizeof(char_type));
 320 #endif
 321     MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type));
 322   }
 323
 324   static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) {
 325     return copy(aStr1, aStr2, aN);
 326   }
 327
 328   static int compare(const char_type* aStr1, const char_type* aStr2,
 329                      size_t aN) {
 330     return memcmp(aStr1, aStr2, aN);
 331   }
 332
 333   static int compareASCII(const char_type* aStr1, const char* aStr2,
 334                           size_t aN) {
 335 #ifdef DEBUG
 336     for (size_t i = 0; i < aN; ++i) {
 337       NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character");
 338     }
 339 #endif
 340     return compare(aStr1, aStr2, aN);
 341   }
 342
 343   static bool equalsLatin1(const char_type* aStr1, const char* aStr2,
 344                            size_t aN) {
 345     return memcmp(aStr1, aStr2, aN) == 0;
 346   }
 347
 348   // this version assumes that s2 is null-terminated and s1 has length n.
 349   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 350   // we return 1.
 351   static int compareASCIINullTerminated(const char_type* aStr1, size_t aN,
 352                                         const char* aStr2) {
 353     // can't use strcmp here because we don't want to stop when aStr1
 354     // contains a null
 355     for (; aN--; ++aStr1, ++aStr2) {
 356       if (!*aStr2) {
 357         return 1;
 358       }
 359       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 360       if (*aStr1 != *aStr2) {
 361         return to_int_type(*aStr1) - to_int_type(*aStr2);
 362       }
 363     }
 364
 365     if (*aStr2) {
 366       return -1;
 367     }
 368
 369     return 0;
 370   }
 371
 372   /**
 373    * Convert c to its lower-case form, but only if c is ASCII.
 374    */
 375   static char_type ASCIIToLower(char_type aChar) {
 376     if (aChar >= 'A' && aChar <= 'Z') {
 377       return char_type(aChar + ('a' - 'A'));
 378     }
 379
 380     return aChar;
 381   }
 382
 383   static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2,
 384                                      size_t aN) {
 385     for (; aN--; ++aStr1, ++aStr2) {
 386       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 387       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 388                    "Unexpected uppercase character");
 389       char_type lower_s1 = ASCIIToLower(*aStr1);
 390       if (lower_s1 != *aStr2) {
 391         return to_int_type(lower_s1) - to_int_type(*aStr2);
 392       }
 393     }
 394     return 0;
 395   }
 396
 397   // this version assumes that s2 is null-terminated and s1 has length n.
 398   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 399   // we return 1.
 400   static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
 401                                                    size_t aN,
 402                                                    const char* aStr2) {
 403     for (; aN--; ++aStr1, ++aStr2) {
 404       if (!*aStr2) {
 405         return 1;
 406       }
 407       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 408       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 409                    "Unexpected uppercase character");
 410       char_type lower_s1 = ASCIIToLower(*aStr1);
 411       if (lower_s1 != *aStr2) {
 412         return to_int_type(lower_s1) - to_int_type(*aStr2);
 413       }
 414     }
 415
 416     if (*aStr2) {
 417       return -1;
 418     }
 419
 420     return 0;
 421   }
 422
 423   static size_t length(const char_type* aStr) { return strlen(aStr); }
 424
 425   static const char_type* find(const char_type* aStr, size_t aN,
 426                                char_type aChar) {
 427     return reinterpret_cast<const char_type*>(
 428         memchr(aStr, to_int_type(aChar), aN));
 429   }
 430 };
 431
 432 template <class InputIterator>
 433 struct nsCharSourceTraits {
 434   typedef typename InputIterator::difference_type difference_type;
 435
 436   static difference_type readable_distance(const InputIterator& aFirst,
 437                                            const InputIterator& aLast) {
 438     // assumes single fragment
 439     return aLast.get() - aFirst.get();
 440   }
 441
 442   static const typename InputIterator::value_type* read(
 443       const InputIterator& aIter) {
 444     return aIter.get();
 445   }
 446
 447   static void advance(InputIterator& aStr, difference_type aN) {
 448     aStr.advance(aN);
 449   }
 450 };
 451
 452 template <class CharT>
 453 struct nsCharSourceTraits<CharT*> {
 454   typedef ptrdiff_t difference_type;
 455
 456   static difference_type readable_distance(CharT* aStr) {
 457     return nsCharTraits<CharT>::length(aStr);
 458   }
 459
 460   static difference_type readable_distance(CharT* aFirst, CharT* aLast) {
 461     return aLast - aFirst;
 462   }
 463
 464   static const CharT* read(CharT* aStr) { return aStr; }
 465
 466   static void advance(CharT*& aStr, difference_type aN) { aStr += aN; }
 467 };
 468
 469 template <class OutputIterator>
 470 struct nsCharSinkTraits {
 471   static void write(OutputIterator& aIter,
 472                     const typename OutputIterator::value_type* aStr,
 473                     size_t aN) {
 474     aIter.write(aStr, aN);
 475   }
 476 };
 477
 478 template <class CharT>
 479 struct nsCharSinkTraits<CharT*> {
 480   static void write(CharT*& aIter, const CharT* aStr, size_t aN) {
 481     nsCharTraits<CharT>::move(aIter, aStr, aN);
 482     aIter += aN;
 483   }
 484 };
 485
 486 #endif  // !defined(nsCharTraits_h___)