xpcom/string/nsCharTraits.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #ifndef nsCharTraits_h___
   8 #define nsCharTraits_h___
   9
  10 #include <ctype.h> // for |EOF|, |WEOF|
  11 #include <string.h> // for |memcpy|, et al
  12
  13 #include "nscore.h" // for |char16_t|
  14
  15 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
  16 // particular the standalone software updater. In that case stub out
  17 // the macros provided by nsDebug.h which are only usable when linking XPCOM
  18
  19 #ifdef NS_NO_XPCOM
  20 #define NS_WARNING(msg)
  21 #define NS_ASSERTION(cond, msg)
  22 #define NS_ERROR(msg)
  23 #else
  24 #include "nsDebug.h"  // for NS_ASSERTION
  25 #endif
  26
  27 /*
  28  * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
  29  * values.
  30  *
  31  * Note that UTF-16 represents the Unicode scalar values U+10000 through
  32  * U+10FFFF by using "surrogate pairs". These consist of a high surrogate,
  33  * i.e. a code point in the range U+D800 - U+DBFF, and a low surrogate, i.e.
  34  * a code point in the range U+DC00 - U+DFFF, like this:
  35  *
  36  *  U+D800 U+DC00 =  U+10000
  37  *  U+D800 U+DC01 =  U+10001
  38  *  ...
  39  *  U+DBFF U+DFFE = U+10FFFE
  40  *  U+DBFF U+DFFF = U+10FFFF
  41  *
  42  * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
  43  * scalar values and are not well-formed UTF-16 except as high-surrogate /
  44  * low-surrogate pairs.
  45  */
  46
  // First code point above the Basic Multilingual Plane.
  #define PLANE1_BASE          uint32_t(0x00010000)

  // High surrogates are in the range 0xD800 -- 0xDBFF: every bit above the
  // low ten has to match 0xD800.
  #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & ~uint32_t(0x03FF)) == 0xD800)
  // Low surrogates are in the range 0xDC00 -- 0xDFFF
  #define NS_IS_LOW_SURROGATE(u)  ((uint32_t(u) & ~uint32_t(0x03FF)) == 0xDC00)
  // Either kind of surrogate (0xD800 -- 0xDFFF) with a single masked compare;
  // faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE
  #define IS_SURROGATE(u)      ((uint32_t(u) & ~uint32_t(0x07FF)) == 0xD800)

  // Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

  // Combine a surrogate pair into the value it encodes:
  // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
  // Nothing here checks that h is a high surrogate and l is a low surrogate;
  // only the low ten bits of each argument take part in the result.
  #define SURROGATE_TO_UCS4(h, l) (PLANE1_BASE + \
                                   ((uint32_t(h) & 0x03FF) << 10) + \
                                   (uint32_t(l) & 0x03FF))

  // Extract surrogates from a UCS4 char
  // Reference: the Unicode standard 4.0, section 3.9
  // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
  // 0xD7C0 == 0xD800 - 0x0040,
  // ((c - 0x10000) >> 10) + 0xD800 can be simplified to (c >> 10) + 0xD7C0.
  // Note that 0xD7C0 is added, not bitwise-OR'd. The sum is formed in 32 bits
  // and then narrowed by the char16_t conversion.
  #define H_SURROGATE(c) char16_t((uint32_t(c) >> 10) + 0xD7C0)

  // Since 0x10000 & 0x03FF == 0,
  // (c - 0x10000) & 0x03FF == c & 0x03FF so that
  // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
  #define L_SURROGATE(c) char16_t((uint32_t(c) & 0x03FF) | 0xDC00)

  // True when the value fits in a single UTF-16 code unit.
  #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
  // U+FFFD, substituted by ENSURE_VALID_CHAR for values that are not valid.
  #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

  // One past the largest code point (U+10FFFF).
  #define UCS_END uint32_t(0x00110000)
  // Valid means below UCS_END and not a surrogate. The argument may be
  // evaluated more than once, so it must not have side effects.
  #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
  // Yields c when it is valid and UCS2_REPLACEMENT_CHAR otherwise. The
  // argument is evaluated several times, so it must not have side effects.
  #define ENSURE_VALID_CHAR(c) (!IS_VALID_CHAR(c) ? UCS2_REPLACEMENT_CHAR : (c))
  85
  /**
   * Primary template, deliberately left without members. Only the explicit
   * specializations for |char16_t| and |char| provide the traits interface,
   * so using any other character type fails at compile time as soon as a
   * member is named.
   */
  template <typename CharT>
  struct nsCharTraits {};
  90
  91 template <>
  92 struct nsCharTraits<char16_t>
  93 {
  94   typedef char16_t char_type;
  95   typedef uint16_t  unsigned_char_type;
  96   typedef char      incompatible_char_type;
  97
  98   static char_type* const sEmptyBuffer;
  99
 100   static void
 101   assign(char_type& aLhs, char_type aRhs)
 102   {
 103     aLhs = aRhs;
 104   }
 105
 106
 107   // integer representation of characters:
 108   typedef int int_type;
 109
 110   static char_type
 111   to_char_type(int_type aChar)
 112   {
 113     return char_type(aChar);
 114   }
 115
 116   static int_type
 117   to_int_type(char_type aChar)
 118   {
 119     return int_type(static_cast<unsigned_char_type>(aChar));
 120   }
 121
 122   static bool
 123   eq_int_type(int_type aLhs, int_type aRhs)
 124   {
 125     return aLhs == aRhs;
 126   }
 127
 128
 129   // |char_type| comparisons:
 130
 131   static bool
 132   eq(char_type aLhs, char_type aRhs)
 133   {
 134     return aLhs == aRhs;
 135   }
 136
 137   static bool
 138   lt(char_type aLhs, char_type aRhs)
 139   {
 140     return aLhs < aRhs;
 141   }
 142
 143
 144   // operations on s[n] arrays:
 145
 146   static char_type*
 147   move(char_type* aStr1, const char_type* aStr2, size_t aN)
 148   {
 149     return static_cast<char_type*>(memmove(aStr1, aStr2,
 150                                            aN * sizeof(char_type)));
 151   }
 152
 153   static char_type*
 154   copy(char_type* aStr1, const char_type* aStr2, size_t aN)
 155   {
 156     return static_cast<char_type*>(memcpy(aStr1, aStr2,
 157                                           aN * sizeof(char_type)));
 158   }
 159
 160   static char_type*
 161   copyASCII(char_type* aStr1, const char* aStr2, size_t aN)
 162   {
 163     for (char_type* s = aStr1; aN--; ++s, ++aStr2) {
 164       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 165       *s = static_cast<char_type>(*aStr2);
 166     }
 167     return aStr1;
 168   }
 169
 170   static char_type*
 171   assign(char_type* aStr, size_t aN, char_type aChar)
 172   {
 173     char_type* result = aStr;
 174     while (aN--) {
 175       assign(*aStr++, aChar);
 176     }
 177     return result;
 178   }
 179
 180   static int
 181   compare(const char_type* aStr1, const char_type* aStr2, size_t aN)
 182   {
 183     for (; aN--; ++aStr1, ++aStr2) {
 184       if (!eq(*aStr1, *aStr2)) {
 185         return to_int_type(*aStr1) - to_int_type(*aStr2);
 186       }
 187     }
 188
 189     return 0;
 190   }
 191
 192   static int
 193   compareASCII(const char_type* aStr1, const char* aStr2, size_t aN)
 194   {
 195     for (; aN--; ++aStr1, ++aStr2) {
 196       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 197       if (!eq_int_type(to_int_type(*aStr1),
 198                        to_int_type(static_cast<char_type>(*aStr2)))) {
 199         return to_int_type(*aStr1) -
 200                to_int_type(static_cast<char_type>(*aStr2));
 201       }
 202     }
 203
 204     return 0;
 205   }
 206
 207   // this version assumes that s2 is null-terminated and s1 has length n.
 208   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 209   // we return 1.
 210   static int
 211   compareASCIINullTerminated(const char_type* aStr1, size_t aN,
 212                              const char* aStr2)
 213   {
 214     for (; aN--; ++aStr1, ++aStr2) {
 215       if (!*aStr2) {
 216         return 1;
 217       }
 218       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 219       if (!eq_int_type(to_int_type(*aStr1),
 220                        to_int_type(static_cast<char_type>(*aStr2)))) {
 221         return to_int_type(*aStr1) -
 222                to_int_type(static_cast<char_type>(*aStr2));
 223       }
 224     }
 225
 226     if (*aStr2) {
 227       return -1;
 228     }
 229
 230     return 0;
 231   }
 232
 233   /**
 234    * Convert c to its lower-case form, but only if c is in the ASCII
 235    * range. Otherwise leave it alone.
 236    */
 237   static char_type
 238   ASCIIToLower(char_type aChar)
 239   {
 240     if (aChar >= 'A' && aChar <= 'Z') {
 241       return char_type(aChar + ('a' - 'A'));
 242     }
 243
 244     return aChar;
 245   }
 246
 247   static int
 248   compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, size_t aN)
 249   {
 250     for (; aN--; ++aStr1, ++aStr2) {
 251       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 252       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 253                    "Unexpected uppercase character");
 254       char_type lower_s1 = ASCIIToLower(*aStr1);
 255       if (lower_s1 != static_cast<char_type>(*aStr2)) {
 256         return to_int_type(lower_s1) -
 257                to_int_type(static_cast<char_type>(*aStr2));
 258       }
 259     }
 260
 261     return 0;
 262   }
 263
 264   // this version assumes that s2 is null-terminated and s1 has length n.
 265   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 266   // we return 1.
 267   static int
 268   compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
 269                                         size_t aN, const char* aStr2)
 270   {
 271     for (; aN--; ++aStr1, ++aStr2) {
 272       if (!*aStr2) {
 273         return 1;
 274       }
 275       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 276       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 277                    "Unexpected uppercase character");
 278       char_type lower_s1 = ASCIIToLower(*aStr1);
 279       if (lower_s1 != static_cast<char_type>(*aStr2)) {
 280         return to_int_type(lower_s1) -
 281                to_int_type(static_cast<char_type>(*aStr2));
 282       }
 283     }
 284
 285     if (*aStr2) {
 286       return -1;
 287     }
 288
 289     return 0;
 290   }
 291
 292   static size_t
 293   length(const char_type* aStr)
 294   {
 295     size_t result = 0;
 296     while (!eq(*aStr++, char_type(0))) {
 297       ++result;
 298     }
 299     return result;
 300   }
 301
 302   static const char_type*
 303   find(const char_type* aStr, size_t aN, char_type aChar)
 304   {
 305     while (aN--) {
 306       if (eq(*aStr, aChar)) {
 307         return aStr;
 308       }
 309       ++aStr;
 310     }
 311
 312     return 0;
 313   }
 314 };
 315
 316 template <>
 317 struct nsCharTraits<char>
 318 {
 319   typedef char           char_type;
 320   typedef unsigned char  unsigned_char_type;
 321   typedef char16_t      incompatible_char_type;
 322
 323   static char_type* const sEmptyBuffer;
 324
 325   static void
 326   assign(char_type& aLhs, char_type aRhs)
 327   {
 328     aLhs = aRhs;
 329   }
 330
 331
 332   // integer representation of characters:
 333
 334   typedef int int_type;
 335
 336   static char_type
 337   to_char_type(int_type aChar)
 338   {
 339     return char_type(aChar);
 340   }
 341
 342   static int_type
 343   to_int_type(char_type aChar)
 344   {
 345     return int_type(static_cast<unsigned_char_type>(aChar));
 346   }
 347
 348   static bool
 349   eq_int_type(int_type aLhs, int_type aRhs)
 350   {
 351     return aLhs == aRhs;
 352   }
 353
 354
 355   // |char_type| comparisons:
 356
 357   static bool eq(char_type aLhs, char_type aRhs)
 358   {
 359     return aLhs == aRhs;
 360   }
 361
 362   static bool
 363   lt(char_type aLhs, char_type aRhs)
 364   {
 365     return aLhs < aRhs;
 366   }
 367
 368
 369   // operations on s[n] arrays:
 370
 371   static char_type*
 372   move(char_type* aStr1, const char_type* aStr2, size_t aN)
 373   {
 374     return static_cast<char_type*>(memmove(aStr1, aStr2,
 375                                            aN * sizeof(char_type)));
 376   }
 377
 378   static char_type*
 379   copy(char_type* aStr1, const char_type* aStr2, size_t aN)
 380   {
 381     return static_cast<char_type*>(memcpy(aStr1, aStr2,
 382                                           aN * sizeof(char_type)));
 383   }
 384
 385   static char_type*
 386   copyASCII(char_type* aStr1, const char* aStr2, size_t aN)
 387   {
 388     return copy(aStr1, aStr2, aN);
 389   }
 390
 391   static char_type*
 392   assign(char_type* aStr, size_t aN, char_type aChar)
 393   {
 394     return static_cast<char_type*>(memset(aStr, to_int_type(aChar), aN));
 395   }
 396
 397   static int
 398   compare(const char_type* aStr1, const char_type* aStr2, size_t aN)
 399   {
 400     return memcmp(aStr1, aStr2, aN);
 401   }
 402
 403   static int
 404   compareASCII(const char_type* aStr1, const char* aStr2, size_t aN)
 405   {
 406 #ifdef DEBUG
 407     for (size_t i = 0; i < aN; ++i) {
 408       NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character");
 409     }
 410 #endif
 411     return compare(aStr1, aStr2, aN);
 412   }
 413
 414   // this version assumes that s2 is null-terminated and s1 has length n.
 415   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 416   // we return 1.
 417   static int
 418   compareASCIINullTerminated(const char_type* aStr1, size_t aN,
 419                              const char* aStr2)
 420   {
 421     // can't use strcmp here because we don't want to stop when aStr1
 422     // contains a null
 423     for (; aN--; ++aStr1, ++aStr2) {
 424       if (!*aStr2) {
 425         return 1;
 426       }
 427       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 428       if (*aStr1 != *aStr2) {
 429         return to_int_type(*aStr1) - to_int_type(*aStr2);
 430       }
 431     }
 432
 433     if (*aStr2) {
 434       return -1;
 435     }
 436
 437     return 0;
 438   }
 439
 440   /**
 441    * Convert c to its lower-case form, but only if c is ASCII.
 442    */
 443   static char_type
 444   ASCIIToLower(char_type aChar)
 445   {
 446     if (aChar >= 'A' && aChar <= 'Z') {
 447       return char_type(aChar + ('a' - 'A'));
 448     }
 449
 450     return aChar;
 451   }
 452
 453   static int
 454   compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, size_t aN)
 455   {
 456     for (; aN--; ++aStr1, ++aStr2) {
 457       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 458       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 459                    "Unexpected uppercase character");
 460       char_type lower_s1 = ASCIIToLower(*aStr1);
 461       if (lower_s1 != *aStr2) {
 462         return to_int_type(lower_s1) - to_int_type(*aStr2);
 463       }
 464     }
 465     return 0;
 466   }
 467
 468   // this version assumes that s2 is null-terminated and s1 has length n.
 469   // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
 470   // we return 1.
 471   static int
 472   compareLowerCaseToASCIINullTerminated(const char_type* aStr1, size_t aN,
 473                                         const char* aStr2)
 474   {
 475     for (; aN--; ++aStr1, ++aStr2) {
 476       if (!*aStr2) {
 477         return 1;
 478       }
 479       NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
 480       NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'),
 481                    "Unexpected uppercase character");
 482       char_type lower_s1 = ASCIIToLower(*aStr1);
 483       if (lower_s1 != *aStr2) {
 484         return to_int_type(lower_s1) - to_int_type(*aStr2);
 485       }
 486     }
 487
 488     if (*aStr2) {
 489       return -1;
 490     }
 491
 492     return 0;
 493   }
 494
 495   static size_t
 496   length(const char_type* aStr)
 497   {
 498     return strlen(aStr);
 499   }
 500
 501   static const char_type*
 502   find(const char_type* aStr, size_t aN, char_type aChar)
 503   {
 504     return reinterpret_cast<const char_type*>(memchr(aStr, to_int_type(aChar),
 505                                                      aN));
 506   }
 507 };
 508
 /**
  * Read-side adapter for iterator classes. It relies on the iterator
  * providing |difference_type|, |value_type|, |get()| and |advance()|.
  */
 template <typename InputIterator>
 struct nsCharSourceTraits
 {
   typedef typename InputIterator::difference_type difference_type;

   // Distance between the positions reported by |get()| on each iterator,
   // narrowed to 32 bits.
   static uint32_t
   readable_distance(const InputIterator& aFirst, const InputIterator& aLast)
   {
     // assumes single fragment
     return static_cast<uint32_t>(aLast.get() - aFirst.get());
   }

   // Pointer to the data at the iterator's current position.
   static const typename InputIterator::value_type*
   read(const InputIterator& aIter)
   {
     return aIter.get();
   }

   // Moves the iterator by aN, delegating to its own |advance()|.
   static void
   advance(InputIterator& aStr, difference_type aN)
   {
     aStr.advance(aN);
   }
 };
 533
 534 template <class CharT>
 535 struct nsCharSourceTraits<CharT*>
 536 {
 537   typedef ptrdiff_t difference_type;
 538
 539   static uint32_t
 540   readable_distance(CharT* aStr)
 541   {
 542     return uint32_t(nsCharTraits<CharT>::length(aStr));
 543     // return numeric_limits<uint32_t>::max();
 544   }
 545
 546   static uint32_t
 547   readable_distance(CharT* aFirst, CharT* aLast)
 548   {
 549     return uint32_t(aLast - aFirst);
 550   }
 551
 552   static const CharT*
 553   read(CharT* aStr)
 554   {
 555     return aStr;
 556   }
 557
 558   static void
 559   advance(CharT*& aStr, difference_type aN)
 560   {
 561     aStr += aN;
 562   }
 563 };
 564
 /**
  * Write-side adapter for iterator classes. It relies on the iterator
  * providing |value_type| and a |write(const value_type*, uint32_t)| member.
  */
 template <typename OutputIterator>
 struct nsCharSinkTraits
 {
   // Hands aN characters starting at aStr to the iterator's own |write()|.
   static void
   write(OutputIterator& aIter, const typename OutputIterator::value_type* aStr,
         uint32_t aN)
   {
     aIter.write(aStr, aN);
   }
 };
 575
 576 template <class CharT>
 577 struct nsCharSinkTraits<CharT*>
 578 {
 579   static void
 580   write(CharT*& aIter, const CharT* aStr, uint32_t aN)
 581   {
 582     nsCharTraits<CharT>::move(aIter, aStr, aN);
 583     aIter += aN;
 584   }
 585 };
 586
 587 #endif // !defined(nsCharTraits_h___)