js/public/CharacterEncoding.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
   2  * vim: set ts=8 sts=2 et sw=2 tw=80:
   3  * This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #ifndef js_CharacterEncoding_h
   8 #define js_CharacterEncoding_h
   9
  10 #include "mozilla/Range.h"
  11 #include "mozilla/Span.h"
  12
  13 #include "js/TypeDecls.h"
  14 #include "js/Utility.h"
  15
  16 class JSLinearString;
  17
  18 namespace mozilla {
  19 union Utf8Unit;
  20 }
  21
  22 namespace JS {
  23
  24 /*
  25  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
  26  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
  27  * byte is treated as a 2-byte character, and there is no way to pass in a
  28  * string containing characters beyond U+00FF.
  29  */
  30 class Latin1Chars : public mozilla::Range<Latin1Char> {
  31   typedef mozilla::Range<Latin1Char> Base;
  32
  33  public:
  34   using CharT = Latin1Char;
  35
  36   Latin1Chars() = default;
  37   Latin1Chars(char* aBytes, size_t aLength)
  38       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
  39   Latin1Chars(const Latin1Char* aBytes, size_t aLength)
  40       : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
  41   Latin1Chars(const char* aBytes, size_t aLength)
  42       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
  43              aLength) {}
  44 };
  45
  46 /*
  47  * Like Latin1Chars, but the chars are const.
  48  */
  49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
  50   typedef mozilla::Range<const Latin1Char> Base;
  51
  52  public:
  53   using CharT = Latin1Char;
  54
  55   ConstLatin1Chars() = default;
  56   ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
  57       : Base(aChars, aLength) {}
  58 };
  59
  60 /*
  61  * A Latin1Chars, but with \0 termination for C compatibility.
  62  */
  63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
  64   typedef mozilla::RangedPtr<Latin1Char> Base;
  65
  66  public:
  67   using CharT = Latin1Char;
  68
  69   Latin1CharsZ() : Base(nullptr, 0) {}  // NOLINT
  70
  71   Latin1CharsZ(char* aBytes, size_t aLength)
  72       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
  73     MOZ_ASSERT(aBytes[aLength] == '\0');
  74   }
  75
  76   Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
  77     MOZ_ASSERT(aBytes[aLength] == '\0');
  78   }
  79
  80   using Base::operator=;
  81
  82   char* c_str() { return reinterpret_cast<char*>(get()); }
  83 };
  84
  85 class UTF8Chars : public mozilla::Range<unsigned char> {
  86   typedef mozilla::Range<unsigned char> Base;
  87
  88  public:
  89   using CharT = unsigned char;
  90
  91   UTF8Chars() = default;
  92   UTF8Chars(char* aBytes, size_t aLength)
  93       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
  94   UTF8Chars(const char* aBytes, size_t aLength)
  95       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
  96              aLength) {}
  97   UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
  98       : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
  99   UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
 100       : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
 101 };
 102
 103 /*
 104  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
 105  */
 106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
 107   typedef mozilla::RangedPtr<unsigned char> Base;
 108
 109  public:
 110   using CharT = unsigned char;
 111
 112   UTF8CharsZ() : Base(nullptr, 0) {}  // NOLINT
 113
 114   UTF8CharsZ(char* aBytes, size_t aLength)
 115       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
 116     MOZ_ASSERT(aBytes[aLength] == '\0');
 117   }
 118
 119   UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
 120     MOZ_ASSERT(aBytes[aLength] == '\0');
 121   }
 122
 123   UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
 124       : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
 125
 126   using Base::operator=;
 127
 128   char* c_str() { return reinterpret_cast<char*>(get()); }
 129 };
 130
 131 /*
 132  * A wrapper for a "const char*" that is encoded using UTF-8.
 133  * This class does not manage ownership of the data; that is left
 134  * to others.  This differs from UTF8CharsZ in that the chars are
 135  * const and it disallows assignment.
 136  */
 137 class JS_PUBLIC_API ConstUTF8CharsZ {
 138   const char* data_;
 139
 140  public:
 141   using CharT = unsigned char;
 142
 143   ConstUTF8CharsZ() : data_(nullptr) {}
 144
 145   explicit ConstUTF8CharsZ(const char* aBytes) : data_(aBytes) {
 146 #ifdef DEBUG
 147     if (aBytes) {
 148       validateWithoutLength();
 149     }
 150 #endif
 151   }
 152
 153   ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
 154     MOZ_ASSERT(aBytes[aLength] == '\0');
 155 #ifdef DEBUG
 156     validate(aLength);
 157 #endif
 158   }
 159
 160   const void* get() const { return data_; }
 161
 162   const char* c_str() const { return data_; }
 163
 164   explicit operator bool() const { return data_ != nullptr; }
 165
 166  private:
 167 #ifdef DEBUG
 168   void validate(size_t aLength);
 169   void validateWithoutLength();
 170 #endif
 171 };
 172
 173 /*
 174  * SpiderMonkey uses a 2-byte character representation: it is a
 175  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
 176  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
 177  * sufficiently dedicated JavaScript program to be fully unicode-aware by
 178  * manually interpreting UTF-16 extension characters embedded in the JS
 179  * string.
 180  */
 181 class TwoByteChars : public mozilla::Range<char16_t> {
 182   typedef mozilla::Range<char16_t> Base;
 183
 184  public:
 185   using CharT = char16_t;
 186
 187   TwoByteChars() = default;
 188   TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
 189   TwoByteChars(const char16_t* aChars, size_t aLength)
 190       : Base(const_cast<char16_t*>(aChars), aLength) {}
 191 };
 192
 193 /*
 194  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
 195  */
 196 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
 197   typedef mozilla::RangedPtr<char16_t> Base;
 198
 199  public:
 200   using CharT = char16_t;
 201
 202   TwoByteCharsZ() : Base(nullptr, 0) {}  // NOLINT
 203
 204   TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
 205     MOZ_ASSERT(chars[length] == '\0');
 206   }
 207
 208   using Base::operator=;
 209 };
 210
 211 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
 212
 213 /*
 214  * Like TwoByteChars, but the chars are const.
 215  */
 216 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
 217   typedef mozilla::Range<const char16_t> Base;
 218
 219  public:
 220   using CharT = char16_t;
 221
 222   ConstTwoByteChars() = default;
 223   ConstTwoByteChars(const char16_t* aChars, size_t aLength)
 224       : Base(aChars, aLength) {}
 225 };
 226
 227 /*
 228  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
 229  * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
 230  * contains any UTF-16 extension characters, then this may give invalid Latin1
 231  * output. The returned string is zero terminated. The returned string or the
 232  * returned string's |start()| must be freed with JS_free or js_free,
 233  * respectively. If allocation fails, an OOM error will be set and the method
 234  * will return a nullptr chars (which can be tested for with the ! operator).
 235  * This method cannot trigger GC.
 236  */
 237 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
 238     JSContext* cx, const mozilla::Range<const char16_t>& tbchars);
 239
 240 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
 241                                                        const char16_t* begin,
 242                                                        size_t length) {
 243   const mozilla::Range<const char16_t> tbchars(begin, length);
 244   return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
 245 }
 246
 247 template <typename CharT, typename Allocator>
 248 extern UTF8CharsZ CharsToNewUTF8CharsZ(Allocator* alloc,
 249                                        const mozilla::Range<CharT>& chars);
 250
 251 JS_PUBLIC_API char32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
 252                                          int utf8Length);
 253
 254 /*
 255  * Inflate bytes in UTF-8 encoding to char16_t.
 256  * - On error, returns an empty TwoByteCharsZ.
 257  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
 258  *   its length;  the length value excludes the trailing null.
 259  */
 260 extern JS_PUBLIC_API TwoByteCharsZ
 261 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
 262                             size_t* outlen, arena_id_t destArenaId);
 263
 264 /*
 265  * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
 266  */
 267 extern JS_PUBLIC_API TwoByteCharsZ
 268 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
 269                             size_t* outlen, arena_id_t destArenaId);
 270
 271 /*
 272  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
 273  * characters will be replaced by \uFFFD. No exception will be thrown for
 274  * malformed UTF-8 input.
 275  */
 276 extern JS_PUBLIC_API TwoByteCharsZ
 277 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
 278                                  size_t* outlen, arena_id_t destArenaId);
 279
 280 extern JS_PUBLIC_API TwoByteCharsZ
 281 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
 282                                  size_t* outlen, arena_id_t destArenaId);
 283
 284 /*
 285  * Returns the length of the char buffer required to encode |s| as UTF8.
 286  * Does not include the null-terminator.
 287  */
 288 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
 289
 290 /*
 291  * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
 292  * exhausted or too little space is available in |dst| to fit the scalar
 293  * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
 294  * the number of bytes of |dst| that were filled.
 295  *
 296  * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
 297  * linear.
 298  *
 299  * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
 300  * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
 301  * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is
 302  * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|.
 303  *
 304  * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
 305  */
 306 JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
 307                                                mozilla::Span<char> dst);
 308
 309 /*
 310  * The smallest character encoding capable of fully representing a particular
 311  * string.
 312  */
 313 enum class SmallestEncoding { ASCII, Latin1, UTF16 };
 314
 315 /*
 316  * Returns the smallest encoding possible for the given string: if all
 317  * codepoints are <128 then ASCII, otherwise if all codepoints are <256
 318  * Latin-1, else UTF16.
 319  */
 320 JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(const UTF8Chars& utf8);
 321
 322 /*
 323  * Return a null-terminated Latin-1 string copied from the input string,
 324  * storing its length (excluding null terminator) in |*outlen|.  Fail and
 325  * report an error if the string contains non-Latin-1 codepoints.  Returns
 326  * Latin1CharsZ() on failure.
 327  */
 328 extern JS_PUBLIC_API Latin1CharsZ
 329 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, size_t* outlen,
 330                            arena_id_t destArenaId);
 331
 332 /*
 333  * Return a null-terminated Latin-1 string copied from the input string,
 334  * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
 335  * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
 336  */
 337 extern JS_PUBLIC_API Latin1CharsZ
 338 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8,
 339                                 size_t* outlen, arena_id_t destArenaId);
 340
 341 /*
 342  * Returns true if all characters in the given null-terminated string are
 343  * ASCII, i.e. < 0x80, false otherwise.
 344  */
 345 extern JS_PUBLIC_API bool StringIsASCII(const char* s);
 346
 347 /*
 348  * Returns true if all characters in the given span are ASCII,
 349  * i.e. < 0x80, false otherwise.
 350  */
 351 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
 352
 353 /**
 354  * Encode a narrow multibyte character string to a UTF-8 string.
 355  *
 356  * NOTE: Should only be used when interacting with POSIX/OS functions and not
 357  *       for encoding ASCII/Latin-1/etc. strings to UTF-8.
 358  */
 359 extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
 360                                                         const char* chars);
 361
 362 /**
 363  * Encode a wide string to a UTF-8 string.
 364  *
 365  * NOTE: Should only be used when interacting with Windows API functions.
 366  */
 367 extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
 368                                                       const wchar_t* chars);
 369
 370 /**
 371  * Encode a UTF-8 string to a narrow multibyte character string.
 372  *
 373  * NOTE: Should only be used when interacting with POSIX/OS functions and not
 374  *       for encoding UTF-8 to ASCII/Latin-1/etc. strings.
 375  */
 376 extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
 377                                                         const char* chars);
 378
 379 /**
 380  * Encode a UTF-8 string to a wide string.
 381  *
 382  * NOTE: Should only be used when interacting with Windows API functions.
 383  */
 384 extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
 385                                                           const char* chars);
 386
 387 }  // namespace JS
 388
 389 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
 390 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
 391
 392 /**
 393  * DEPRECATED
 394  *
 395  * Allocate memory sufficient to contain the characters of |str| truncated to
 396  * Latin-1 and a trailing null terminator, fill the memory with the characters
 397  * interpreted in that manner plus the null terminator, and return a pointer to
 398  * the memory.
 399  *
 400  * This function *loses information* when it copies the characters of |str| if
 401  * |str| contains code units greater than 0xFF.  Additionally, users that
 402  * depend on null-termination will misinterpret the copied characters if |str|
 403  * contains any nulls.  Avoid using this function if possible, because it will
 404  * eventually be removed.
 405  */
 406 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
 407                                                              JSString* str);
 408
 409 /**
 410  * DEPRECATED
 411  *
 412  * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
 413  *
 414  * This function *loses information* when it copies the characters of |str| if
 415  * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
 416  * instead.
 417  *
 418  * The returned string is also subject to misinterpretation if |str| contains
 419  * any nulls (which are faithfully transcribed into the returned string, but
 420  * which will implicitly truncate the string if it's passed to functions that
 421  * expect null-terminated strings).
 422  *
 423  * Avoid using this function if possible, because we'll remove it once we can
 424  * devise a better API for the task.
 425  */
 426 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
 427     JSContext* cx, JS::Handle<JSString*> str);
 428
 429 /**
 430  * DEPRECATED
 431  *
 432  * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
 433  *
 434  * This function asserts in debug mode that the input string contains only
 435  * ASCII characters.
 436  *
 437  * The returned string is also subject to misinterpretation if |str| contains
 438  * any nulls (which are faithfully transcribed into the returned string, but
 439  * which will implicitly truncate the string if it's passed to functions that
 440  * expect null-terminated strings).
 441  *
 442  * Avoid using this function if possible, because we'll remove it once we can
 443  * devise a better API for the task.
 444  */
 445 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
 446                                                             JSString* str);
 447
 448 #endif /* js_CharacterEncoding_h */