1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
10 #include "mozilla/Range.h"
11 #include "mozilla/Span.h"
13 #include "js/TypeDecls.h"
14 #include "js/Utility.h"
25 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
26 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
27 * byte is widened to the 2-byte character with the same value, and there is
28 * no way to pass in a string containing characters beyond U+00FF.
30 class Latin1Chars
: public mozilla::Range
<Latin1Char
> {
31 typedef mozilla::Range
<Latin1Char
> Base
;
34 using CharT
= Latin1Char
;
36 Latin1Chars() = default;
37 Latin1Chars(char* aBytes
, size_t aLength
)
38 : Base(reinterpret_cast<Latin1Char
*>(aBytes
), aLength
) {}
39 Latin1Chars(const Latin1Char
* aBytes
, size_t aLength
)
40 : Base(const_cast<Latin1Char
*>(aBytes
), aLength
) {}
41 Latin1Chars(const char* aBytes
, size_t aLength
)
42 : Base(reinterpret_cast<Latin1Char
*>(const_cast<char*>(aBytes
)),
47 * Like Latin1Chars, but the chars are const.
49 class ConstLatin1Chars
: public mozilla::Range
<const Latin1Char
> {
50 typedef mozilla::Range
<const Latin1Char
> Base
;
53 using CharT
= Latin1Char
;
55 ConstLatin1Chars() = default;
56 ConstLatin1Chars(const Latin1Char
* aChars
, size_t aLength
)
57 : Base(aChars
, aLength
) {}
61 * A Latin1Chars, but with \0 termination for C compatibility.
63 class Latin1CharsZ
: public mozilla::RangedPtr
<Latin1Char
> {
64 typedef mozilla::RangedPtr
<Latin1Char
> Base
;
67 using CharT
= Latin1Char
;
69 Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT
71 Latin1CharsZ(char* aBytes
, size_t aLength
)
72 : Base(reinterpret_cast<Latin1Char
*>(aBytes
), aLength
) {
73 MOZ_ASSERT(aBytes
[aLength
] == '\0');
76 Latin1CharsZ(Latin1Char
* aBytes
, size_t aLength
) : Base(aBytes
, aLength
) {
77 MOZ_ASSERT(aBytes
[aLength
] == '\0');
80 using Base::operator=;
82 char* c_str() { return reinterpret_cast<char*>(get()); }
85 class UTF8Chars
: public mozilla::Range
<unsigned char> {
86 typedef mozilla::Range
<unsigned char> Base
;
89 using CharT
= unsigned char;
91 UTF8Chars() = default;
92 UTF8Chars(char* aBytes
, size_t aLength
)
93 : Base(reinterpret_cast<unsigned char*>(aBytes
), aLength
) {}
94 UTF8Chars(const char* aBytes
, size_t aLength
)
95 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes
)),
97 UTF8Chars(mozilla::Utf8Unit
* aUnits
, size_t aLength
)
98 : UTF8Chars(reinterpret_cast<char*>(aUnits
), aLength
) {}
99 UTF8Chars(const mozilla::Utf8Unit
* aUnits
, size_t aLength
)
100 : UTF8Chars(reinterpret_cast<const char*>(aUnits
), aLength
) {}
104 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
106 class UTF8CharsZ
: public mozilla::RangedPtr
<unsigned char> {
107 typedef mozilla::RangedPtr
<unsigned char> Base
;
110 using CharT
= unsigned char;
112 UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT
114 UTF8CharsZ(char* aBytes
, size_t aLength
)
115 : Base(reinterpret_cast<unsigned char*>(aBytes
), aLength
) {
116 MOZ_ASSERT(aBytes
[aLength
] == '\0');
119 UTF8CharsZ(unsigned char* aBytes
, size_t aLength
) : Base(aBytes
, aLength
) {
120 MOZ_ASSERT(aBytes
[aLength
] == '\0');
123 UTF8CharsZ(mozilla::Utf8Unit
* aUnits
, size_t aLength
)
124 : UTF8CharsZ(reinterpret_cast<char*>(aUnits
), aLength
) {}
126 using Base::operator=;
128 char* c_str() { return reinterpret_cast<char*>(get()); }
132 * A wrapper for a "const char*" that is encoded using UTF-8.
133 * This class does not manage ownership of the data; that is left
134 * to others. This differs from UTF8CharsZ in that the chars are
135 * const and it disallows assignment.
137 class JS_PUBLIC_API ConstUTF8CharsZ
{
141 using CharT
= unsigned char;
143 ConstUTF8CharsZ() : data_(nullptr) {}
145 explicit ConstUTF8CharsZ(const char* aBytes
) : data_(aBytes
) {
148 validateWithoutLength();
153 ConstUTF8CharsZ(const char* aBytes
, size_t aLength
) : data_(aBytes
) {
154 MOZ_ASSERT(aBytes
[aLength
] == '\0');
160 const void* get() const { return data_
; }
162 const char* c_str() const { return data_
; }
164 explicit operator bool() const { return data_
!= nullptr; }
168 void validate(size_t aLength
);
169 void validateWithoutLength();
174 * SpiderMonkey uses a 2-byte character representation: it is a
175 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
176 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
177 * sufficiently dedicated JavaScript program to be fully unicode-aware by
178 * manually interpreting UTF-16 extension characters embedded in the JS string.
181 class TwoByteChars
: public mozilla::Range
<char16_t
> {
182 typedef mozilla::Range
<char16_t
> Base
;
185 using CharT
= char16_t
;
187 TwoByteChars() = default;
188 TwoByteChars(char16_t
* aChars
, size_t aLength
) : Base(aChars
, aLength
) {}
189 TwoByteChars(const char16_t
* aChars
, size_t aLength
)
190 : Base(const_cast<char16_t
*>(aChars
), aLength
) {}
194 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
196 class TwoByteCharsZ
: public mozilla::RangedPtr
<char16_t
> {
197 typedef mozilla::RangedPtr
<char16_t
> Base
;
200 using CharT
= char16_t
;
202 TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT
204 TwoByteCharsZ(char16_t
* chars
, size_t length
) : Base(chars
, length
) {
205 MOZ_ASSERT(chars
[length
] == '\0');
208 using Base::operator=;
211 typedef mozilla::RangedPtr
<const char16_t
> ConstCharPtr
;
214 * Like TwoByteChars, but the chars are const.
216 class ConstTwoByteChars
: public mozilla::Range
<const char16_t
> {
217 typedef mozilla::Range
<const char16_t
> Base
;
220 using CharT
= char16_t
;
222 ConstTwoByteChars() = default;
223 ConstTwoByteChars(const char16_t
* aChars
, size_t aLength
)
224 : Base(aChars
, aLength
) {}
228 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
229 * truncating each 2-byte code unit in the sequence to a single byte. If the
230 * source contains any UTF-16 extension characters, then this may give invalid
231 * Latin1 output. The returned string is zero terminated. The returned string
232 * or the returned string's |start()| must be freed with JS_free or js_free,
233 * respectively. If allocation fails, an OOM error will be set and the method
234 * will return a null Latin1CharsZ (which can be tested for with the !
235 * operator). This method cannot trigger GC.
237 extern Latin1CharsZ
LossyTwoByteCharsToNewLatin1CharsZ(
238 JSContext
* cx
, const mozilla::Range
<const char16_t
>& tbchars
);
240 inline Latin1CharsZ
LossyTwoByteCharsToNewLatin1CharsZ(JSContext
* cx
,
241 const char16_t
* begin
,
243 const mozilla::Range
<const char16_t
> tbchars(begin
, length
);
244 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx
, tbchars
);
247 template <typename CharT
, typename Allocator
>
248 extern UTF8CharsZ
CharsToNewUTF8CharsZ(Allocator
* alloc
,
249 const mozilla::Range
<CharT
>& chars
);
251 JS_PUBLIC_API char32_t
Utf8ToOneUcs4Char(const uint8_t* utf8Buffer
,
255 * Inflate bytes in UTF-8 encoding to char16_t.
256 * - On error, returns an empty TwoByteCharsZ.
257 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
258 * its length; the length value excludes the trailing null.
260 extern JS_PUBLIC_API TwoByteCharsZ
261 UTF8CharsToNewTwoByteCharsZ(JSContext
* cx
, const UTF8Chars
& utf8
,
262 size_t* outlen
, arena_id_t destArenaId
);
265 * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
267 extern JS_PUBLIC_API TwoByteCharsZ
268 UTF8CharsToNewTwoByteCharsZ(JSContext
* cx
, const ConstUTF8CharsZ
& utf8
,
269 size_t* outlen
, arena_id_t destArenaId
);
272 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
273 * characters will be replaced by \uFFFD. No exception will be thrown for
274 * malformed UTF-8 input.
276 extern JS_PUBLIC_API TwoByteCharsZ
277 LossyUTF8CharsToNewTwoByteCharsZ(JSContext
* cx
, const UTF8Chars
& utf8
,
278 size_t* outlen
, arena_id_t destArenaId
);
280 extern JS_PUBLIC_API TwoByteCharsZ
281 LossyUTF8CharsToNewTwoByteCharsZ(JSContext
* cx
, const ConstUTF8CharsZ
& utf8
,
282 size_t* outlen
, arena_id_t destArenaId
);
285 * Returns the length of the char buffer required to encode |s| as UTF8.
286 * Does not include the null-terminator.
288 JS_PUBLIC_API
size_t GetDeflatedUTF8StringLength(JSLinearString
* s
);
291 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
292 * exhausted or too little space is available in |dst| to fit the scalar
293 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
294 * the number of bytes of |dst| that were filled.
296 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already linear.
299 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
300 * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
301 * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is
302 * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|.
304 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
306 JS_PUBLIC_API
size_t DeflateStringToUTF8Buffer(JSLinearString
* src
,
307 mozilla::Span
<char> dst
);
310 * The smallest character encoding capable of fully representing a particular string.
// Result categories for FindSmallestEncoding.
enum class SmallestEncoding {
  ASCII,   // all code points are < 128
  Latin1,  // not ASCII, but all code points are < 256
  UTF16    // everything else
};
316 * Returns the smallest encoding possible for the given string: if all
317 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
318 * Latin-1, else UTF16.
320 JS_PUBLIC_API SmallestEncoding
FindSmallestEncoding(const UTF8Chars
& utf8
);
323 * Return a null-terminated Latin-1 string copied from the input string,
324 * storing its length (excluding null terminator) in |*outlen|. Fail and
325 * report an error if the string contains non-Latin-1 codepoints. Returns
326 * Latin1CharsZ() on failure.
328 extern JS_PUBLIC_API Latin1CharsZ
329 UTF8CharsToNewLatin1CharsZ(JSContext
* cx
, const UTF8Chars
& utf8
, size_t* outlen
,
330 arena_id_t destArenaId
);
333 * Return a null-terminated Latin-1 string copied from the input string,
334 * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
335 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
337 extern JS_PUBLIC_API Latin1CharsZ
338 LossyUTF8CharsToNewLatin1CharsZ(JSContext
* cx
, const UTF8Chars
& utf8
,
339 size_t* outlen
, arena_id_t destArenaId
);
342 * Returns true if all characters in the given null-terminated string are
343 * ASCII, i.e. < 0x80, false otherwise.
345 extern JS_PUBLIC_API
bool StringIsASCII(const char* s
);
348 * Returns true if all characters in the given span are ASCII,
349 * i.e. < 0x80, false otherwise.
351 extern JS_PUBLIC_API
bool StringIsASCII(mozilla::Span
<const char> s
);
354 * Encode a narrow multibyte character string to a UTF-8 string.
356 * NOTE: Should only be used when interacting with POSIX/OS functions and not
357 * for encoding ASCII/Latin-1/etc. strings to UTF-8.
359 extern JS_PUBLIC_API
JS::UniqueChars
EncodeNarrowToUtf8(JSContext
* cx
,
363 * Encode a wide string to a UTF-8 string.
365 * NOTE: Should only be used when interacting with Windows API functions.
367 extern JS_PUBLIC_API
JS::UniqueChars
EncodeWideToUtf8(JSContext
* cx
,
368 const wchar_t* chars
);
371 * Encode a UTF-8 string to a narrow multibyte character string.
373 * NOTE: Should only be used when interacting with POSIX/OS functions and not
374 * for encoding UTF-8 to ASCII/Latin-1/etc. strings.
376 extern JS_PUBLIC_API
JS::UniqueChars
EncodeUtf8ToNarrow(JSContext
* cx
,
380 * Encode a UTF-8 string to a wide string.
382 * NOTE: Should only be used when interacting with Windows API functions.
384 extern JS_PUBLIC_API
JS::UniqueWideChars
EncodeUtf8ToWide(JSContext
* cx
,
389 inline void JS_free(JS::Latin1CharsZ
& ptr
) { js_free((void*)ptr
.get()); }
390 inline void JS_free(JS::UTF8CharsZ
& ptr
) { js_free((void*)ptr
.get()); }
395 * Allocate memory sufficient to contain the characters of |str| truncated to
396 * Latin-1 and a trailing null terminator, fill the memory with the characters
397 * interpreted in that manner plus the null terminator, and return a pointer to that memory.
400 * This function *loses information* when it copies the characters of |str| if
401 * |str| contains code units greater than 0xFF. Additionally, users that
402 * depend on null-termination will misinterpret the copied characters if |str|
403 * contains any nulls. Avoid using this function if possible, because it will
404 * eventually be removed.
406 extern JS_PUBLIC_API
JS::UniqueChars
JS_EncodeStringToLatin1(JSContext
* cx
,
412 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
414 * This function *loses information* when it copies the characters of |str| if
415 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied instead.
418 * The returned string is also subject to misinterpretation if |str| contains
419 * any nulls (which are faithfully transcribed into the returned string, but
420 * which will implicitly truncate the string if it's passed to functions that
421 * expect null-terminated strings).
423 * Avoid using this function if possible, because we'll remove it once we can
424 * devise a better API for the task.
426 extern JS_PUBLIC_API
JS::UniqueChars
JS_EncodeStringToUTF8(
427 JSContext
* cx
, JS::Handle
<JSString
*> str
);
432 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
434 * This function asserts in debug mode that the input string contains only ASCII characters.
437 * The returned string is also subject to misinterpretation if |str| contains
438 * any nulls (which are faithfully transcribed into the returned string, but
439 * which will implicitly truncate the string if it's passed to functions that
440 * expect null-terminated strings).
442 * Avoid using this function if possible, because we'll remove it once we can
443 * devise a better API for the task.
445 extern JS_PUBLIC_API
JS::UniqueChars
JS_EncodeStringToASCII(JSContext
* cx
,
448 #endif /* js_CharacterEncoding_h */