Merge mozilla-central to autoland. a=merge CLOSED TREE
[gecko.git] / js / public / CharacterEncoding.h
blob9d1df4664b459bb830ca3c0815b2da61fb1b8c16
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
13 #include "js/TypeDecls.h"
14 #include "js/Utility.h"
16 class JSLinearString;
// Forward declaration only: the UTF-8 wrapper constructors in this header
// accept pointers to mozilla::Utf8Unit without needing its full definition.
namespace mozilla {
union Utf8Unit;
}  // namespace mozilla
22 namespace JS {
25 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
26 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
27 * byte is treated as a 2-byte character, and there is no way to pass in a
28 * string containing characters beyond U+00FF.
30 class Latin1Chars : public mozilla::Range<Latin1Char> {
31 typedef mozilla::Range<Latin1Char> Base;
33 public:
34 using CharT = Latin1Char;
36 Latin1Chars() = default;
37 Latin1Chars(char* aBytes, size_t aLength)
38 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
39 Latin1Chars(const Latin1Char* aBytes, size_t aLength)
40 : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
41 Latin1Chars(const char* aBytes, size_t aLength)
42 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
43 aLength) {}
47 * Like Latin1Chars, but the chars are const.
49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
50 typedef mozilla::Range<const Latin1Char> Base;
52 public:
53 using CharT = Latin1Char;
55 ConstLatin1Chars() = default;
56 ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
57 : Base(aChars, aLength) {}
61 * A Latin1Chars, but with \0 termination for C compatibility.
63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
64 typedef mozilla::RangedPtr<Latin1Char> Base;
66 public:
67 using CharT = Latin1Char;
69 Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT
71 Latin1CharsZ(char* aBytes, size_t aLength)
72 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
73 MOZ_ASSERT(aBytes[aLength] == '\0');
76 Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
77 MOZ_ASSERT(aBytes[aLength] == '\0');
80 using Base::operator=;
82 char* c_str() { return reinterpret_cast<char*>(get()); }
85 class UTF8Chars : public mozilla::Range<unsigned char> {
86 typedef mozilla::Range<unsigned char> Base;
88 public:
89 using CharT = unsigned char;
91 UTF8Chars() = default;
92 UTF8Chars(char* aBytes, size_t aLength)
93 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
94 UTF8Chars(const char* aBytes, size_t aLength)
95 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
96 aLength) {}
97 UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
98 : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
99 UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
100 : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
104 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
107 typedef mozilla::RangedPtr<unsigned char> Base;
109 public:
110 using CharT = unsigned char;
112 UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT
114 UTF8CharsZ(char* aBytes, size_t aLength)
115 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
116 MOZ_ASSERT(aBytes[aLength] == '\0');
119 UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
120 MOZ_ASSERT(aBytes[aLength] == '\0');
123 UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
124 : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
126 using Base::operator=;
128 char* c_str() { return reinterpret_cast<char*>(get()); }
132 * A wrapper for a "const char*" that is encoded using UTF-8.
133 * This class does not manage ownership of the data; that is left
134 * to others. This differs from UTF8CharsZ in that the chars are
135 * const and it disallows assignment.
137 class JS_PUBLIC_API ConstUTF8CharsZ {
138 const char* data_;
140 public:
141 using CharT = unsigned char;
143 ConstUTF8CharsZ() : data_(nullptr) {}
145 explicit ConstUTF8CharsZ(const char* aBytes) : data_(aBytes) {
146 #ifdef DEBUG
147 if (aBytes) {
148 validateWithoutLength();
150 #endif
153 ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
154 MOZ_ASSERT(aBytes[aLength] == '\0');
155 #ifdef DEBUG
156 validate(aLength);
157 #endif
160 const void* get() const { return data_; }
162 const char* c_str() const { return data_; }
164 explicit operator bool() const { return data_ != nullptr; }
166 private:
167 #ifdef DEBUG
168 void validate(size_t aLength);
169 void validateWithoutLength();
170 #endif
174 * SpiderMonkey uses a 2-byte character representation: it is a
175 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
176 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
177 * sufficiently dedicated JavaScript program to be fully unicode-aware by
178 * manually interpreting UTF-16 extension characters embedded in the JS
179 * string.
181 class TwoByteChars : public mozilla::Range<char16_t> {
182 typedef mozilla::Range<char16_t> Base;
184 public:
185 using CharT = char16_t;
187 TwoByteChars() = default;
188 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
189 TwoByteChars(const char16_t* aChars, size_t aLength)
190 : Base(const_cast<char16_t*>(aChars), aLength) {}
194 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
196 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
197 typedef mozilla::RangedPtr<char16_t> Base;
199 public:
200 using CharT = char16_t;
202 TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT
204 TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
205 MOZ_ASSERT(chars[length] == '\0');
208 using Base::operator=;
211 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
214 * Like TwoByteChars, but the chars are const.
216 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
217 typedef mozilla::Range<const char16_t> Base;
219 public:
220 using CharT = char16_t;
222 ConstTwoByteChars() = default;
223 ConstTwoByteChars(const char16_t* aChars, size_t aLength)
224 : Base(aChars, aLength) {}
228 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
229 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
230 * contains any UTF-16 extension characters, then this may give invalid Latin1
231 * output. The returned string is zero terminated. The returned string or the
232 * returned string's |start()| must be freed with JS_free or js_free,
233 * respectively. If allocation fails, an OOM error will be set and the method
234 * will return a nullptr chars (which can be tested for with the ! operator).
235 * This method cannot trigger GC.
237 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
238 JSContext* cx, const mozilla::Range<const char16_t>& tbchars);
240 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
241 const char16_t* begin,
242 size_t length) {
243 const mozilla::Range<const char16_t> tbchars(begin, length);
244 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
247 template <typename CharT, typename Allocator>
248 extern UTF8CharsZ CharsToNewUTF8CharsZ(Allocator* alloc,
249 const mozilla::Range<CharT>& chars);
251 JS_PUBLIC_API char32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
252 int utf8Length);
255 * Inflate bytes in UTF-8 encoding to char16_t.
256 * - On error, returns an empty TwoByteCharsZ.
257 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
258 * its length; the length value excludes the trailing null.
260 extern JS_PUBLIC_API TwoByteCharsZ
261 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
262 size_t* outlen, arena_id_t destArenaId);
265 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
267 extern JS_PUBLIC_API TwoByteCharsZ
268 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
269 size_t* outlen, arena_id_t destArenaId);
272 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
273 * characters will be replaced by \uFFFD. No exception will be thrown for
274 * malformed UTF-8 input.
276 extern JS_PUBLIC_API TwoByteCharsZ
277 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
278 size_t* outlen, arena_id_t destArenaId);
280 extern JS_PUBLIC_API TwoByteCharsZ
281 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8,
282 size_t* outlen, arena_id_t destArenaId);
285 * Returns the length of the char buffer required to encode |s| as UTF8.
286 * Does not include the null-terminator.
288 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
291 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
292 * exhausted or too little space is available in |dst| to fit the scalar
293 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
294 * the number of bytes of |dst| that were filled.
296 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
297 * linear.
299 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
300 * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
301 * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is
302 * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|.
304 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
306 JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
307 mozilla::Span<char> dst);
/**
 * The smallest character encoding capable of fully representing a particular
 * string.
 *
 * Enumerators are listed from narrowest (ASCII) to widest (UTF16).
 */
enum class SmallestEncoding { ASCII, Latin1, UTF16 };
316 * Returns the smallest encoding possible for the given string: if all
317 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
318 * Latin-1, else UTF16.
320 JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(const UTF8Chars& utf8);
323 * Return a null-terminated Latin-1 string copied from the input string,
324 * storing its length (excluding null terminator) in |*outlen|. Fail and
325 * report an error if the string contains non-Latin-1 codepoints. Returns
326 * Latin1CharsZ() on failure.
328 extern JS_PUBLIC_API Latin1CharsZ
329 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, size_t* outlen,
330 arena_id_t destArenaId);
333 * Return a null-terminated Latin-1 string copied from the input string,
334 * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
335 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
337 extern JS_PUBLIC_API Latin1CharsZ
338 LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8,
339 size_t* outlen, arena_id_t destArenaId);
342 * Returns true if all characters in the given null-terminated string are
343 * ASCII, i.e. < 0x80, false otherwise.
345 extern JS_PUBLIC_API bool StringIsASCII(const char* s);
348 * Returns true if all characters in the given span are ASCII,
349 * i.e. < 0x80, false otherwise.
351 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
354 * Encode a narrow multibyte character string to a UTF-8 string.
356 * NOTE: Should only be used when interacting with POSIX/OS functions and not
357 * for encoding ASCII/Latin-1/etc. strings to UTF-8.
359 extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
360 const char* chars);
363 * Encode a wide string to a UTF-8 string.
365 * NOTE: Should only be used when interacting with Windows API functions.
367 extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
368 const wchar_t* chars);
371 * Encode a UTF-8 string to a narrow multibyte character string.
373 * NOTE: Should only be used when interacting with POSIX/OS functions and not
374 * for encoding UTF-8 to ASCII/Latin-1/etc. strings.
376 extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
377 const char* chars);
380 * Encode a UTF-8 string to a wide string.
382 * NOTE: Should only be used when interacting with Windows API functions.
384 extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
385 const char* chars);
387 } // namespace JS
389 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
390 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
393 * DEPRECATED
395 * Allocate memory sufficient to contain the characters of |str| truncated to
396 * Latin-1 and a trailing null terminator, fill the memory with the characters
397 * interpreted in that manner plus the null terminator, and return a pointer to
398 * the memory.
400 * This function *loses information* when it copies the characters of |str| if
401 * |str| contains code units greater than 0xFF. Additionally, users that
402 * depend on null-termination will misinterpret the copied characters if |str|
403 * contains any nulls. Avoid using this function if possible, because it will
404 * eventually be removed.
406 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
407 JSString* str);
410 * DEPRECATED
412 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
414 * This function *loses information* when it copies the characters of |str| if
415 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
416 * instead.
418 * The returned string is also subject to misinterpretation if |str| contains
419 * any nulls (which are faithfully transcribed into the returned string, but
420 * which will implicitly truncate the string if it's passed to functions that
421 * expect null-terminated strings).
423 * Avoid using this function if possible, because we'll remove it once we can
424 * devise a better API for the task.
426 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
427 JSContext* cx, JS::Handle<JSString*> str);
430 * DEPRECATED
432 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
434 * This function asserts in debug mode that the input string contains only
435 * ASCII characters.
437 * The returned string is also subject to misinterpretation if |str| contains
438 * any nulls (which are faithfully transcribed into the returned string, but
439 * which will implicitly truncate the string if it's passed to functions that
440 * expect null-terminated strings).
442 * Avoid using this function if possible, because we'll remove it once we can
443 * devise a better API for the task.
445 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
446 JSString* str);
448 #endif /* js_CharacterEncoding_h */