1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/Maybe.h"
19 #include "mozilla/NotNull.h"
20 #include "mozilla/Span.h"
21 #include "mozilla/Tuple.h"
28 }; // namespace mozilla
// Type-name substitutions defined immediately before "encoding_rs.h" is
// included. They map the ENCODING_RS_* placeholders onto the C++ classes in
// namespace mozilla (presumably so that the C header's declarations use these
// types — confirm against encoding_rs.h).
#define ENCODING_RS_ENCODING mozilla::Encoding
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  mozilla::NotNull<const mozilla::Encoding*>
#define ENCODING_RS_ENCODER mozilla::Encoder
#define ENCODING_RS_DECODER mozilla::Decoder
36 #include "encoding_rs.h"
40 nsresult
mozilla_encoding_decode_to_nsstring(mozilla::Encoding
const** encoding
,
41 uint8_t const* src
, size_t src_len
,
44 nsresult
mozilla_encoding_decode_to_nsstring_with_bom_removal(
45 mozilla::Encoding
const* encoding
, uint8_t const* src
, size_t src_len
,
48 nsresult
mozilla_encoding_decode_to_nsstring_without_bom_handling(
49 mozilla::Encoding
const* encoding
, uint8_t const* src
, size_t src_len
,
53 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
54 mozilla::Encoding
const* encoding
, uint8_t const* src
, size_t src_len
,
57 nsresult
mozilla_encoding_encode_from_utf16(mozilla::Encoding
const** encoding
,
58 char16_t
const* src
, size_t src_len
,
61 nsresult
mozilla_encoding_decode_to_nscstring(
62 mozilla::Encoding
const** encoding
, nsACString
const* src
, nsACString
* dst
);
64 nsresult
mozilla_encoding_decode_to_nscstring_with_bom_removal(
65 mozilla::Encoding
const* encoding
, nsACString
const* src
, nsACString
* dst
);
67 nsresult
mozilla_encoding_decode_to_nscstring_without_bom_handling(
68 mozilla::Encoding
const* encoding
, nsACString
const* src
, nsACString
* dst
);
70 nsresult
mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
71 mozilla::Encoding
const* encoding
, uint8_t const* src
, size_t src_len
,
72 nsACString
* dst
, size_t already_validated
);
75 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
76 mozilla::Encoding
const* encoding
, nsACString
const* src
, nsACString
* dst
);
78 nsresult
mozilla_encoding_encode_from_nscstring(
79 mozilla::Encoding
const** encoding
, nsACString
const* src
, nsACString
* dst
);
86 * Return value from `Decoder`/`Encoder` to indicate that input was exhausted.
89 const uint32_t kInputEmpty
= INPUT_EMPTY
;
92 * Return value from `Decoder`/`Encoder` to indicate that output
93 * space was insufficient.
95 const uint32_t kOutputFull
= OUTPUT_FULL
;
98 * An encoding as defined in the Encoding Standard
99 * (https://encoding.spec.whatwg.org/).
101 * See https://docs.rs/encoding_rs/ for the Rust API docs.
103 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
104 * sequence and, in most cases, vice versa. Each encoding has a name, an output
105 * encoding, and one or more labels.
107 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
108 * encoding in formats and protocols. The _name_ of the encoding is the
109 * preferred label in the case appropriate for returning from the
110 * `characterSet` property of the `Document` DOM interface, except for
111 * the replacement encoding whose name is not one of its labels.
113 * The _output encoding_ is the encoding used for form submission and URL
114 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
115 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
118 * # Streaming vs. Non-Streaming
120 * When you have the entire input in a single buffer, you can use the
121 * methods `Decode()`, `DecodeWithBOMRemoval()`,
122 * `DecodeWithoutBOMHandling()`,
123 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
124 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
125 * `NewEncoder()` methods), these methods perform heap allocations. You should
126 * use the `Decoder` and `Encoder` objects when your input is split into multiple
127 * buffers or when you want to control the allocation of the output buffers.
131 * All instances of `Encoding` are statically allocated and have the process's
132 * lifetime. There is precisely one unique `Encoding` instance for each
133 * encoding defined in the Encoding Standard.
135 * To obtain a reference to a particular encoding whose identity you know at
136 * compile time, use a `static` that refers to encoding. There is a `static`
137 * for each encoding. The `static`s are named in all caps with hyphens
138 * replaced with underscores and with `_ENCODING` appended to the
139 * name. For example, if you know at compile time that you will want to
140 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
142 * If you don't know what encoding you need at compile time and need to
143 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
145 * Pointers to `Encoding` can be compared with `==` to check for the sameness
148 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
149 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
150 * `const mozilla::Encoding*` in the C signature and
151 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
153 class Encoding final
{
156 * Implements the _get an encoding_ algorithm
157 * (https://encoding.spec.whatwg.org/#concept-encoding-get).
159 * If, after ASCII-lowercasing and removing leading and trailing
160 * whitespace, the argument matches a label defined in the Encoding
161 * Standard, `const Encoding*` representing the corresponding
162 * encoding is returned. If there is no match, `nullptr` is returned.
164 * This is the right method to use if the action upon the method returning
165 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
166 * instead. When the action upon the method returning `nullptr` is not to
167 * proceed with a fallback but to refuse processing,
168 * `ForLabelNoReplacement()` is more appropriate.
170 static inline const Encoding
* ForLabel(Span
<const char> aLabel
) {
171 return encoding_for_label(
172 reinterpret_cast<const uint8_t*>(aLabel
.Elements()), aLabel
.Length());
176 * `nsAString` argument version. See above for docs.
178 static inline const Encoding
* ForLabel(const nsAString
& aLabel
) {
179 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel
));
183 * This method behaves the same as `ForLabel()`, except when `ForLabel()`
184 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
186 * This method is useful in scenarios where a fatal error is required
187 * upon invalid label, because in those cases the caller typically wishes
188 * to treat the labels that map to the replacement encoding as fatal
191 * It is not OK to use this method when the action upon the method returning
192 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
193 * such a case, the `ForLabel()` method should be used instead in order to
194 * avoid unsafe fallback for labels that `ForLabel()` maps to
195 * `REPLACEMENT_ENCODING`.
197 static inline const Encoding
* ForLabelNoReplacement(Span
<const char> aLabel
) {
198 return encoding_for_label_no_replacement(
199 reinterpret_cast<const uint8_t*>(aLabel
.Elements()), aLabel
.Length());
203 * `nsAString` argument version. See above for docs.
205 static inline const Encoding
* ForLabelNoReplacement(const nsAString
& aLabel
) {
206 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel
));
210 * Performs non-incremental BOM sniffing.
212 * The argument must either be a buffer representing the entire input
213 * stream (non-streaming case) or a buffer representing at least the first
214 * three bytes of the input stream (streaming case).
216 * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
217 * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
218 * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
220 static inline Tuple
<const Encoding
*, size_t> ForBOM(
221 Span
<const uint8_t> aBuffer
) {
222 size_t len
= aBuffer
.Length();
223 const Encoding
* encoding
= encoding_for_bom(aBuffer
.Elements(), &len
);
224 return MakeTuple(encoding
, len
);
228 * Writes the name of this encoding into `aName`.
230 * This name is appropriate to return as-is from the DOM
231 * `document.characterSet` property.
233 inline void Name(nsACString
& aName
) const {
234 aName
.SetLength(ENCODING_NAME_MAX_LENGTH
);
236 encoding_name(this, reinterpret_cast<uint8_t*>(aName
.BeginWriting()));
237 aName
.SetLength(length
); // truncation is the 64-bit case is OK
241 * Checks whether the _output encoding_ of this encoding can encode every
242 * Unicode code point. (Only true if the output encoding is UTF-8.)
244 inline bool CanEncodeEverything() const {
245 return encoding_can_encode_everything(this);
249 * Checks whether this encoding maps one byte to one Basic Multilingual
250 * Plane code point (i.e. byte length equals decoded UTF-16 length) and
251 * vice versa (for mappable characters).
253 * `true` iff this encoding is on the list of Legacy single-byte
254 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
255 * in the spec or x-user-defined.
257 inline bool IsSingleByte() const { return encoding_is_single_byte(this); }
260 * Checks whether the bytes 0x00...0x7F map exclusively to the characters
261 * U+0000...U+007F and vice versa.
263 inline bool IsAsciiCompatible() const {
264 return encoding_is_ascii_compatible(this);
268 * Checks whether this is a Japanese legacy encoding.
270 inline bool IsJapaneseLegacy() const {
271 return this == SHIFT_JIS_ENCODING
|| this == EUC_JP_ENCODING
||
272 this == ISO_2022_JP_ENCODING
;
276 * Returns the _output encoding_ of this encoding. This is UTF-8 for
277 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
279 inline NotNull
<const mozilla::Encoding
*> OutputEncoding() const {
280 return WrapNotNull(encoding_output_encoding(this));
284 * Decode complete input to `nsACString` _with BOM sniffing_ and with
285 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
286 * entire input is available as a single buffer (i.e. the end of the
287 * buffer marks the end of the stream).
289 * This method implements the (non-streaming version of) the
290 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
292 * The second item in the returned tuple is the encoding that was actually
293 * used (which may differ from this encoding thanks to BOM sniffing).
295 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
296 * if there were malformed sequences (that were replaced with the
297 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
300 * The backing buffer of the string isn't copied if the input buffer
301 * is heap-allocated and decoding from UTF-8 and the input is valid
302 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
303 * the input is valid ASCII or decoding from ISO-2022-JP and the
304 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
305 * the same string as both arguments.
307 * _Note:_ It is wrong to use this when the input buffer represents only
308 * a segment of the input instead of the whole input. Use `NewDecoder()`
309 * when decoding segmented input.
// Decodes `aBytes` into `aOut` via mozilla_encoding_decode_to_nscstring() and
// returns the status paired with the value left in `encoding` by that call
// (initialised to `this`, passed by address so the callee can replace it).
// NOTE(review): the embedded numbering skips 316-317, 320 and 322 and `rv` is
// never declared in the visible text — presumably its declaration and an
// if/else choosing between the two calls below are missing; confirm against
// the upstream header before editing this function.
311 inline Tuple
<nsresult
, NotNull
<const mozilla::Encoding
*>> Decode(
312 const nsACString
& aBytes
, nsACString
& aOut
) const {
313 const Encoding
* encoding
= this;
314 const nsACString
* bytes
= &aBytes
;
315 nsACString
* out
= &aOut
;
// Variant that decodes from a copy (`temp`) of the input rather than from
// `aBytes` itself.
318 nsAutoCString
temp(aBytes
);
319 rv
= mozilla_encoding_decode_to_nscstring(&encoding
, &temp
, out
);
// Variant that decodes directly from the caller's input.
321 rv
= mozilla_encoding_decode_to_nscstring(&encoding
, bytes
, out
);
323 return MakeTuple(rv
, WrapNotNull(encoding
));
327 * Decode complete input to `nsAString` _with BOM sniffing_ and with
328 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
329 * entire input is available as a single buffer (i.e. the end of the
330 * buffer marks the end of the stream).
332 * This method implements the (non-streaming version of) the
333 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
335 * The second item in the returned tuple is the encoding that was actually
336 * used (which may differ from this encoding thanks to BOM sniffing).
338 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
339 * if there were malformed sequences (that were replaced with the
340 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
343 * _Note:_ It is wrong to use this when the input buffer represents only
344 * a segment of the input instead of the whole input. Use `NewDecoder()`
345 * when decoding segmented input.
347 inline Tuple
<nsresult
, NotNull
<const mozilla::Encoding
*>> Decode(
348 Span
<const uint8_t> aBytes
, nsAString
& aOut
) const {
349 const Encoding
* encoding
= this;
350 nsresult rv
= mozilla_encoding_decode_to_nsstring(
351 &encoding
, aBytes
.Elements(), aBytes
.Length(), &aOut
);
352 return MakeTuple(rv
, WrapNotNull(encoding
));
356 * Decode complete input to `nsACString` _with BOM removal_ and with
357 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
358 * entire input is available as a single buffer (i.e. the end of the
359 * buffer marks the end of the stream).
361 * When invoked on `UTF_8`, this method implements the (non-streaming
362 * version of) the _UTF-8 decode_
363 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
365 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
366 * if there were malformed sequences (that were replaced with the
367 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
369 * The backing buffer of the string isn't copied if the input buffer
370 * is heap-allocated and decoding from UTF-8 and the input is valid
371 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
372 * the input is valid ASCII or decoding from ISO-2022-JP and the
373 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
374 * the same string as both arguments.
376 * _Note:_ It is wrong to use this when the input buffer represents only
377 * a segment of the input instead of the whole input. Use
378 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
// Decodes `aBytes` into `aOut` via
// mozilla_encoding_decode_to_nscstring_with_bom_removal().
// NOTE(review): the embedded numbering skips 384, 387-388 and 390 onwards;
// both calls below stop after their second argument — presumably a branch
// selecting between them and the trailing `out);` arguments are missing;
// confirm against the upstream header.
380 inline nsresult
DecodeWithBOMRemoval(const nsACString
& aBytes
,
381 nsACString
& aOut
) const {
382 const nsACString
* bytes
= &aBytes
;
383 nsACString
* out
= &aOut
;
// Variant that decodes from a copy (`temp`) of the input.
385 nsAutoCString
temp(aBytes
);
386 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp
,
// Variant that decodes directly from the caller's input.
389 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes
,
394 * Decode complete input to `nsAString` _with BOM removal_ and with
395 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
396 * entire input is available as a single buffer (i.e. the end of the
397 * buffer marks the end of the stream).
399 * When invoked on `UTF_8`, this method implements the (non-streaming
400 * version of) the _UTF-8 decode_
401 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
403 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
404 * if there were malformed sequences (that were replaced with the
405 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
407 * _Note:_ It is wrong to use this when the input buffer represents only
408 * a segment of the input instead of the whole input. Use
409 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
411 inline nsresult
DecodeWithBOMRemoval(Span
<const uint8_t> aBytes
,
412 nsAString
& aOut
) const {
413 return mozilla_encoding_decode_to_nsstring_with_bom_removal(
414 this, aBytes
.Elements(), aBytes
.Length(), &aOut
);
418 * Decode complete input to `nsACString` _without BOM handling_ and
419 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
420 * the entire input is available as a single buffer (i.e. the end of the
421 * buffer marks the end of the stream).
423 * When invoked on `UTF_8`, this method implements the (non-streaming
424 * version of) the _UTF-8 decode without BOM_
425 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
427 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
428 * if there were malformed sequences (that were replaced with the
429 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
431 * The backing buffer of the string isn't copied if the input buffer
432 * is heap-allocated and decoding from UTF-8 and the input is valid
433 * UTF-8, decoding from an ASCII-compatible encoding and the input
434 * is valid ASCII or decoding from ISO-2022-JP and the input stays
435 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
438 * _Note:_ It is wrong to use this when the input buffer represents only
439 * a segment of the input instead of the whole input. Use
440 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
// Decodes `aBytes` into `aOut` via
// mozilla_encoding_decode_to_nscstring_without_bom_handling().
// NOTE(review): the embedded numbering skips 446, 449-450 and 452 onwards;
// both calls below stop at the opening parenthesis — presumably a branch
// selecting between a call on `temp` and a call on `bytes`, plus the argument
// lists, are missing; confirm against the upstream header.
442 inline nsresult
DecodeWithoutBOMHandling(const nsACString
& aBytes
,
443 nsACString
& aOut
) const {
444 const nsACString
* bytes
= &aBytes
;
445 nsACString
* out
= &aOut
;
// Copy of the input, declared just before the first of the two calls.
447 nsAutoCString
temp(aBytes
);
448 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
451 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
456 * Decode complete input to `nsAString` _without BOM handling_ and
457 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
458 * the entire input is available as a single buffer (i.e. the end of the
459 * buffer marks the end of the stream).
461 * When invoked on `UTF_8`, this method implements the (non-streaming
462 * version of) the _UTF-8 decode without BOM_
463 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
465 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
466 * if there were malformed sequences (that were replaced with the
467 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
469 * _Note:_ It is wrong to use this when the input buffer represents only
470 * a segment of the input instead of the whole input. Use
471 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
473 inline nsresult
DecodeWithoutBOMHandling(Span
<const uint8_t> aBytes
,
474 nsAString
& aOut
) const {
475 return mozilla_encoding_decode_to_nsstring_without_bom_handling(
476 this, aBytes
.Elements(), aBytes
.Length(), &aOut
);
480 * Decode complete input to `nsACString` _without BOM handling_ and
481 * _with malformed sequences treated as fatal_ when the entire input is
482 * available as a single buffer (i.e. the end of the buffer marks the end
485 * When invoked on `UTF_8`, this method implements the (non-streaming
486 * version of) the _UTF-8 decode without BOM or fail_
487 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
490 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
491 * if a malformed sequence was encountered and `NS_OK` otherwise.
493 * The backing buffer of the string isn't copied if the input buffer
494 * is heap-allocated and decoding from UTF-8 and the input is valid
495 * UTF-8, decoding from an ASCII-compatible encoding and the input
496 * is valid ASCII or decoding from ISO-2022-JP and the input stays
497 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
500 * _Note:_ It is wrong to use this when the input buffer represents only
501 * a segment of the input instead of the whole input. Use
502 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
// Decodes `aBytes` into `aOut` via
// mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement().
// NOTE(review): the embedded numbering skips 508, 511-512 and 514 onwards;
// both calls below stop at the opening parenthesis — presumably a branch
// selecting between a call on `temp` and a call on `bytes`, plus the argument
// lists, are missing; confirm against the upstream header.
504 inline nsresult
DecodeWithoutBOMHandlingAndWithoutReplacement(
505 const nsACString
& aBytes
, nsACString
& aOut
) const {
506 const nsACString
* bytes
= &aBytes
;
507 nsACString
* out
= &aOut
;
// Copy of the input, declared just before the first of the two calls.
509 nsAutoCString
temp(aBytes
);
510 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
513 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
518 * Decode complete input to `nsACString` _without BOM handling_ and
519 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
520 * the entire input is available as a single buffer (i.e. the end of the
521 * buffer marks the end of the stream) _asserting that a number of bytes
522 * from the start are already known to be valid UTF-8_.
524 * The use case for this method is avoiding copying when dealing with
525 * input that has a UTF-8 BOM. _When in doubt, do not use this method._
527 * When invoked on `UTF_8`, this method implements the (non-streaming
528 * version of) the _UTF-8 decode without BOM_
529 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
531 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
532 * if there were malformed sequences (that were replaced with the
533 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
535 * _Note:_ It is wrong to use this when the input buffer represents only
536 * a segment of the input instead of the whole input. Use
537 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
541 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
542 * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
544 inline nsresult
DecodeWithoutBOMHandling(Span
<const uint8_t> aBytes
,
546 size_t aAlreadyValidated
) const {
547 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
548 this, aBytes
.Elements(), aBytes
.Length(), &aOut
, aAlreadyValidated
);
552 * Decode complete input to `nsAString` _without BOM handling_ and
553 * _with malformed sequences treated as fatal_ when the entire input is
554 * available as a single buffer (i.e. the end of the buffer marks the end
557 * When invoked on `UTF_8`, this method implements the (non-streaming
558 * version of) the _UTF-8 decode without BOM or fail_
559 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
562 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
563 * if a malformed sequence was encountered and `NS_OK` otherwise.
565 * _Note:_ It is wrong to use this when the input buffer represents only
566 * a segment of the input instead of the whole input. Use
567 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
569 inline nsresult
DecodeWithoutBOMHandlingAndWithoutReplacement(
570 Span
<const uint8_t> aBytes
, nsAString
& aOut
) const {
571 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
572 this, aBytes
.Elements(), aBytes
.Length(), &aOut
);
576 * Encode complete input to `nsACString` with unmappable characters
577 * replaced with decimal numeric character references when the entire input
578 * is available as a single buffer (i.e. the end of the buffer marks the
579 * end of the stream).
581 * This method implements the (non-streaming version of) the
582 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
584 * The second item in the returned tuple is the encoding that was actually
585 * used (which may differ from this encoding thanks to some encodings
586 * having UTF-8 as their output encoding).
588 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
589 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
590 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
591 * replaced with numeric character references) and `NS_OK` otherwise.
593 * The backing buffer of the string isn't copied if the input buffer
594 * is heap-allocated and encoding to UTF-8 and the input is valid
595 * UTF-8, encoding to an ASCII-compatible encoding and the input
596 * is valid ASCII or encoding to ISO-2022-JP and the input is representable
597 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string as both arguments.
600 * _Note:_ It is wrong to use this when the input buffer represents only
601 * a segment of the input instead of the whole input. Use `NewEncoder()`
602 * when encoding segmented input.
// Encodes `aString` into `aOut` via mozilla_encoding_encode_from_nscstring()
// and returns the status paired with the value left in `encoding` by that
// call (initialised to `this`, passed by address so the callee can replace it).
// NOTE(review): the embedded numbering skips 609-610, 613 and 615 and `rv` is
// never declared in the visible text — presumably its declaration and an
// if/else choosing between the two calls below are missing; confirm against
// the upstream header before editing this function.
604 inline Tuple
<nsresult
, NotNull
<const mozilla::Encoding
*>> Encode(
605 const nsACString
& aString
, nsACString
& aOut
) const {
606 const Encoding
* encoding
= this;
607 const nsACString
* string
= &aString
;
608 nsACString
* out
= &aOut
;
// Variant that encodes from a copy (`temp`) of the input rather than from
// `aString` itself.
611 nsAutoCString
temp(aString
);
612 rv
= mozilla_encoding_encode_from_nscstring(&encoding
, &temp
, out
);
// Variant that encodes directly from the caller's input.
614 rv
= mozilla_encoding_encode_from_nscstring(&encoding
, string
, out
);
616 return MakeTuple(rv
, WrapNotNull(encoding
));
620 * Encode complete input to `nsACString` with unmappable characters
621 * replaced with decimal numeric character references when the entire input
622 * is available as a single buffer (i.e. the end of the buffer marks the
623 * end of the stream).
625 * This method implements the (non-streaming version of) the
626 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
628 * The second item in the returned tuple is the encoding that was actually
629 * used (which may differ from this encoding thanks to some encodings
630 * having UTF-8 as their output encoding).
632 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
633 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
634 * were replaced with numeric character references) and `NS_OK` otherwise.
636 * _Note:_ It is wrong to use this when the input buffer represents only
637 * a segment of the input instead of the whole input. Use `NewEncoder()`
638 * when encoding segmented output.
640 inline Tuple
<nsresult
, NotNull
<const mozilla::Encoding
*>> Encode(
641 Span
<const char16_t
> aString
, nsACString
& aOut
) const {
642 const Encoding
* encoding
= this;
643 nsresult rv
= mozilla_encoding_encode_from_utf16(
644 &encoding
, aString
.Elements(), aString
.Length(), &aOut
);
645 return MakeTuple(rv
, WrapNotNull(encoding
));
649 * Instantiates a new decoder for this encoding with BOM sniffing enabled.
651 * BOM sniffing may cause the returned decoder to morph into a decoder
652 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
654 inline UniquePtr
<Decoder
> NewDecoder() const {
655 UniquePtr
<Decoder
> decoder(encoding_new_decoder(this));
660 * Instantiates a new decoder for this encoding with BOM sniffing enabled
661 * into memory occupied by a previously-instantiated decoder.
663 * BOM sniffing may cause the returned decoder to morph into a decoder
664 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
666 inline void NewDecoderInto(Decoder
& aDecoder
) const {
667 encoding_new_decoder_into(this, &aDecoder
);
671 * Instantiates a new decoder for this encoding with BOM removal.
673 * If the input starts with bytes that are the BOM for this encoding,
674 * those bytes are removed. However, the decoder never morphs into a
675 * decoder for another encoding: A BOM for another encoding is treated as
676 * (potentially malformed) input to the decoding algorithm for this
679 inline UniquePtr
<Decoder
> NewDecoderWithBOMRemoval() const {
680 UniquePtr
<Decoder
> decoder(encoding_new_decoder_with_bom_removal(this));
685 * Instantiates a new decoder for this encoding with BOM removal
686 * into memory occupied by a previously-instantiated decoder.
688 * If the input starts with bytes that are the BOM for this encoding,
689 * those bytes are removed. However, the decoder never morphs into a
690 * decoder for another encoding: A BOM for another encoding is treated as
691 * (potentially malformed) input to the decoding algorithm for this
694 inline void NewDecoderWithBOMRemovalInto(Decoder
& aDecoder
) const {
695 encoding_new_decoder_with_bom_removal_into(this, &aDecoder
);
699 * Instantiates a new decoder for this encoding with BOM handling disabled.
701 * If the input starts with bytes that look like a BOM, those bytes are
702 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
703 * for another encoding.)
705 * _Note:_ If the caller has performed BOM sniffing on its own but has not
706 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
707 * instead of this method to cause the BOM to be removed.
709 inline UniquePtr
<Decoder
> NewDecoderWithoutBOMHandling() const {
710 UniquePtr
<Decoder
> decoder(encoding_new_decoder_without_bom_handling(this));
715 * Instantiates a new decoder for this encoding with BOM handling disabled
716 * into memory occupied by a previously-instantiated decoder.
718 * If the input starts with bytes that look like a BOM, those bytes are
719 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
720 * for another encoding.)
722 * _Note:_ If the caller has performed BOM sniffing on its own but has not
723 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
724 * instead of this method to cause the BOM to be removed.
726 inline void NewDecoderWithoutBOMHandlingInto(Decoder
& aDecoder
) const {
727 encoding_new_decoder_without_bom_handling_into(this, &aDecoder
);
731 * Instantiates a new encoder for the output encoding of this encoding.
733 inline UniquePtr
<Encoder
> NewEncoder() const {
734 UniquePtr
<Encoder
> encoder(encoding_new_encoder(this));
739 * Instantiates a new encoder for the output encoding of this encoding
740 * into memory occupied by a previously-instantiated encoder.
742 inline void NewEncoderInto(Encoder
& aEncoder
) const {
743 encoding_new_encoder_into(this, &aEncoder
);
749 * Returns the index of the first byte that makes the input malformed as
750 * UTF-8 or the length of the input if the input is entirely valid.
752 static inline size_t UTF8ValidUpTo(Span
<const uint8_t> aBuffer
) {
753 return encoding_utf8_valid_up_to(aBuffer
.Elements(), aBuffer
.Length());
759 * Returns the index of the first byte that makes the input malformed as
760 * ASCII or the length of the input if the input is entirely valid.
762 static inline size_t ASCIIValidUpTo(Span
<const uint8_t> aBuffer
) {
763 return encoding_ascii_valid_up_to(aBuffer
.Elements(), aBuffer
.Length());
767 * Validates ISO-2022-JP ASCII-state data.
769 * Returns the index of the first byte that makes the input not
770 * representable in the ASCII state of ISO-2022-JP or the length of the
771 * input if the input is entirely representable in the ASCII state of
774 static inline size_t ISO2022JPASCIIValidUpTo(Span
<const uint8_t> aBuffer
) {
775 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer
.Elements(),
781 Encoding(const Encoding
&) = delete;
782 Encoding
& operator=(const Encoding
&) = delete;
783 ~Encoding() = delete;
787 * A converter that decodes a byte stream into Unicode according to a
788 * character encoding in a streaming (incremental) manner.
790 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
791 * buffer `aDst` both of which are caller-allocated. There are variants for
792 * both UTF-8 and UTF-16 output buffers.
794 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
795 * into `aDst` until one of the following three things happens:
797 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
800 * 2. The output buffer has been filled so near capacity that the decoder
801 * cannot be sure that processing an additional byte of input wouldn't
802 * cause so much output that the output buffer would overflow.
804 * 3. All the input bytes have been processed.
806 * The `Decode*` method then returns tuple of a status indicating which one
807 * of the three reasons to return happened, how many input bytes were read,
808 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
809 * when decoding to UTF-16) were written, and in the case of the
810 * variants performing replacement, a boolean indicating whether an error was
811 * replaced with the REPLACEMENT CHARACTER during the call.
813 * The number of bytes "written" is what's logically written. Garbage may be
814 * written in the output buffer beyond the point logically written to.
816 * In the case of the `*WithoutReplacement` variants, the status is a
817 * `uint32_t` whose possible values are packed info about a malformed byte
818 * sequence, `kOutputFull` and `kInputEmpty` corresponding to the three cases
821 * Packed info about malformed sequences has the following format:
822 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
823 * indicate the number of bytes that were consumed after the malformed
824 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
825 * the length of the malformed byte sequence (possible decimal values 1, 2,
826 * 3 or 4). The maximum possible sum of the two is 6.
828 * In the case of methods whose name does not end with
829 * `*WithoutReplacement`, malformed sequences are automatically replaced
830 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
833 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
834 * space. When decoding to UTF-16, the output buffer must have at least two
835 * UTF-16 code units (`char16_t`) of space.
837 * When decoding to UTF-8 without replacement, the methods are guaranteed
838 * not to return indicating that more output space is needed if the length
839 * of the output buffer is at least the length returned by
840 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
841 * with replacement, the length of the output buffer that guarantees the
842 * methods not to return indicating that more output space is needed is given
843 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
844 * or without replacement, the length of the output buffer that guarantees
845 * the methods not to return indicating that more output space is needed is
846 * given by `MaxUTF16BufferLength()`.
848 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
849 * and the output after each `Decode*` call is guaranteed to consist of
850 * complete characters. (I.e. the code unit sequence for the last character is
851 * guaranteed not to be split across output buffers.)
853 * The boolean argument `aLast` indicates that the end of the stream is reached
854 * when all the bytes in `aSrc` have been consumed.
856 * A `Decoder` object can be used to incrementally decode a byte stream.
858 * During the processing of a single stream, the caller must call `Decode*`
859 * zero or more times with `aLast` set to `false` and then call `Decode*` at
860 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
861 * the processing of the stream has ended. Otherwise, the caller must call
862 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
863 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
865 * Once the stream has ended, the `Decoder` object must not be used anymore.
866 * That is, you need to create another one to process another stream.
868 * When the decoder returns `kOutputFull` or the decoder returns a malformed
869 * result and the caller does not wish to treat it as a fatal error, the input
870 * buffer `aSrc` may not have been completely consumed. In that case, the caller
871 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
876 * When converting with a fixed-size output buffer whose size is too small to
877 * accommodate one character of output, an infinite loop ensues. When
878 * converting with a fixed-size output buffer, it generally makes sense to
879 * make the buffer fairly large (e.g. a couple of kilobytes).
881 class Decoder final
{
883 ~Decoder() = default;
884 static void operator delete(void* aDecoder
) {
885 decoder_free(reinterpret_cast<Decoder
*>(aDecoder
));
889 * The `Encoding` this `Decoder` is for.
891 * BOM sniffing can change the return value of this method during the life
894 inline NotNull
<const mozilla::Encoding
*> Encoding() const {
895 return WrapNotNull(decoder_encoding(this));
899 * Query the worst-case UTF-8 output size _with replacement_.
901 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
902 * that will not overflow given the current state of the decoder and
903 * `aByteLength` number of additional input bytes when decoding with
904 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
907 inline CheckedInt
<size_t> MaxUTF8BufferLength(size_t aByteLength
) const {
908 CheckedInt
<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength
));
909 if (max
.value() == std::numeric_limits
<size_t>::max()) {
910 // Mark invalid by overflowing
912 MOZ_ASSERT(!max
.isValid());
918 * Query the worst-case UTF-8 output size _without replacement_.
920 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
921 * that will not overflow given the current state of the decoder and
922 * `aByteLength` number of additional input bytes when decoding without
923 * replacement error handling.
925 * Note that this value may be too small for the `WithReplacement` case.
926 * Use `MaxUTF8BufferLength()` for that case.
928 inline CheckedInt
<size_t> MaxUTF8BufferLengthWithoutReplacement(
929 size_t aByteLength
) const {
930 CheckedInt
<size_t> max(
931 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength
));
932 if (max
.value() == std::numeric_limits
<size_t>::max()) {
933 // Mark invalid by overflowing
935 MOZ_ASSERT(!max
.isValid());
941 * Incrementally decode a byte stream into UTF-8 with malformed sequences
942 * replaced with the REPLACEMENT CHARACTER.
944 * See the documentation of the class for documentation for `Decode*`
945 * methods collectively.
947 inline Tuple
<uint32_t, size_t, size_t, bool> DecodeToUTF8(
948 Span
<const uint8_t> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
949 size_t srcRead
= aSrc
.Length();
950 size_t dstWritten
= aDst
.Length();
951 bool hadReplacements
;
953 decoder_decode_to_utf8(this, aSrc
.Elements(), &srcRead
, aDst
.Elements(),
954 &dstWritten
, aLast
, &hadReplacements
);
955 return MakeTuple(result
, srcRead
, dstWritten
, hadReplacements
);
959 * Incrementally decode a byte stream into UTF-8 _without replacement_.
961 * See the documentation of the class for documentation for `Decode*`
962 * methods collectively.
964 inline Tuple
<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
965 Span
<const uint8_t> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
966 size_t srcRead
= aSrc
.Length();
967 size_t dstWritten
= aDst
.Length();
968 uint32_t result
= decoder_decode_to_utf8_without_replacement(
969 this, aSrc
.Elements(), &srcRead
, aDst
.Elements(), &dstWritten
, aLast
);
970 return MakeTuple(result
, srcRead
, dstWritten
);
974 * Query the worst-case UTF-16 output size (with or without replacement).
976 * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
977 * that will not overflow given the current state of the decoder and
978 * `aByteLength` number of additional input bytes.
980 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
981 * return value of this method applies also in the
982 * `*WithoutReplacement` case.
984 inline CheckedInt
<size_t> MaxUTF16BufferLength(size_t aU16Length
) const {
985 CheckedInt
<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length
));
986 if (max
.value() == std::numeric_limits
<size_t>::max()) {
987 // Mark invalid by overflowing
989 MOZ_ASSERT(!max
.isValid());
995 * Incrementally decode a byte stream into UTF-16 with malformed sequences
996 * replaced with the REPLACEMENT CHARACTER.
998 * See the documentation of the class for documentation for `Decode*`
999 * methods collectively.
1001 inline Tuple
<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1002 Span
<const uint8_t> aSrc
, Span
<char16_t
> aDst
, bool aLast
) {
1003 size_t srcRead
= aSrc
.Length();
1004 size_t dstWritten
= aDst
.Length();
1005 bool hadReplacements
;
1006 uint32_t result
= decoder_decode_to_utf16(this, aSrc
.Elements(), &srcRead
,
1007 aDst
.Elements(), &dstWritten
,
1008 aLast
, &hadReplacements
);
1009 return MakeTuple(result
, srcRead
, dstWritten
, hadReplacements
);
1013 * Incrementally decode a byte stream into UTF-16 _without replacement_.
1015 * See the documentation of the class for documentation for `Decode*`
1016 * methods collectively.
1018 inline Tuple
<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1019 Span
<const uint8_t> aSrc
, Span
<char16_t
> aDst
, bool aLast
) {
1020 size_t srcRead
= aSrc
.Length();
1021 size_t dstWritten
= aDst
.Length();
1022 uint32_t result
= decoder_decode_to_utf16_without_replacement(
1023 this, aSrc
.Elements(), &srcRead
, aDst
.Elements(), &dstWritten
, aLast
);
1024 return MakeTuple(result
, srcRead
, dstWritten
);
1028 * Checks for compatibility with storing Unicode scalar values as unsigned
1029 * bytes taking into account the state of the decoder.
1031 * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1032 * including waiting for the BOM, or if the encoding is never
1033 * Latin1-byte-compatible.
1035 * Otherwise returns the index of the first byte whose unsigned value doesn't
1036 * directly correspond to the decoded Unicode scalar value, or the length
1037 * of the input if all bytes in the input decode directly to scalar values
1038 * corresponding to the unsigned byte values.
1040 * Does not change the state of the decoder.
1042 * Do not use this unless you are supporting SpiderMonkey-style string
1043 * storage optimizations.
1045 inline mozilla::Maybe
<size_t> Latin1ByteCompatibleUpTo(
1046 Span
<const uint8_t> aBuffer
) const {
1047 size_t upTo
= decoder_latin1_byte_compatible_up_to(this, aBuffer
.Elements(),
1049 if (upTo
== std::numeric_limits
<size_t>::max()) {
1050 return mozilla::Nothing();
1052 return mozilla::Some(upTo
);
1057 Decoder(const Decoder
&) = delete;
1058 Decoder
& operator=(const Decoder
&) = delete;
1062 * A converter that encodes a Unicode stream into bytes according to a
1063 * character encoding in a streaming (incremental) manner.
1065 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1066 * buffer `aDst` both of which are caller-allocated. There are variants for
1067 * both UTF-8 and UTF-16 input buffers.
1069 * An `Encode*` method encodes characters from `aSrc` into bytes
1070 * stored into `aDst` until one of the following three things happens:
1072 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1075 * 2. The output buffer has been filled so near capacity that the encoder
1076 * cannot be sure that processing an additional character of input wouldn't
1077 * cause so much output that the output buffer would overflow.
1079 * 3. All the input characters have been processed.
1081 * The `Encode*` method then returns a tuple of a status indicating which one
1082 * of the three reasons to return happened, how many input code units (`uint8_t`
1083 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1084 * how many output bytes were written, and in the case of the variants that
1085 * perform replacement, a boolean indicating whether an unmappable
1086 * character was replaced with a numeric character reference during the call.
1088 * The number of bytes "written" is what's logically written. Garbage may be
1089 * written in the output buffer beyond the point logically written to.
1091 * In the case of the methods whose name ends with
1092 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1093 * are an unmappable code point, `kOutputFull` and `kInputEmpty` corresponding
1094 * to the three cases listed above.
1096 * In the case of methods whose name does not end with
1097 * `*WithoutReplacement`, unmappable characters are automatically replaced
1098 * with the corresponding numeric character references and unmappable
1099 * characters do not cause the methods to return early.
1101 * When encoding from UTF-8 without replacement, the methods are guaranteed
1102 * not to return indicating that more output space is needed if the length
1103 * of the output buffer is at least the length returned by
1104 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1105 * UTF-8 with replacement, the length of the output buffer that guarantees the
1106 * methods not to return indicating that more output space is needed in the
1107 * absence of unmappable characters is given by
1108 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1109 * UTF-16 without replacement, the methods are guaranteed not to return
1110 * indicating that more output space is needed if the length of the output
1111 * buffer is at least the length returned by
1112 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1113 * from UTF-16 with replacement, the length of the output buffer that
1114 * guarantees the methods not to return indicating that more output space is
1115 * needed in the absence of unmappable characters is given by
1116 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1117 * When encoding with replacement, applications are not expected to size the
1118 * buffer for the worst case ahead of time but to resize the buffer if there
1119 * are unmappable characters. This is why max length queries are only available
1120 * for the case where there are no unmappable characters.
1122 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1123 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1124 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1125 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1126 * surrogate pairs are not split across input buffer boundaries.
1128 * After an `Encode*` call returns, the output produced so far, taken as a
1129 * whole from the start of the stream, is guaranteed to consist of a valid
1130 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1131 * character is guaranteed not to be split across output buffers. However, due
1132 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1133 * from the start for it to be valid. For other encodings, the validity holds
1134 * on a per-output buffer basis.)
1136 * The boolean argument `aLast` indicates that the end of the stream is reached
1137 * when all the characters in `aSrc` have been consumed. This argument is needed
1138 * for ISO-2022-JP and is ignored for other encodings.
1140 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1142 * During the processing of a single stream, the caller must call `Encode*`
1143 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1144 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1145 * the processing of the stream has ended. Otherwise, the caller must call
1146 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1147 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1149 * Once the stream has ended, the `Encoder` object must not be used anymore.
1150 * That is, you need to create another one to process another stream.
1152 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1153 * result and the caller does not wish to treat it as a fatal error, the input
1154 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1155 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1160 * When converting with a fixed-size output buffer whose size is too small to
1161 * accommodate one character of output, an infinite loop ensues. When
1162 * converting with a fixed-size output buffer, it generally makes sense to
1163 * make the buffer fairly large (e.g. a couple of kilobytes).
1165 class Encoder final
{
1167 ~Encoder() = default;
1169 static void operator delete(void* aEncoder
) {
1170 encoder_free(reinterpret_cast<Encoder
*>(aEncoder
));
1174 * The `Encoding` this `Encoder` is for.
1176 inline NotNull
<const mozilla::Encoding
*> Encoding() const {
1177 return WrapNotNull(encoder_encoding(this));
1181 * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1182 * ASCII state and `false` otherwise.
1184 inline bool HasPendingState() const {
1185 return encoder_has_pending_state(this);
1189 * Query the worst-case output size when encoding from UTF-8 with
1192 * Returns the size of the output buffer in bytes that will not overflow
1193 * given the current state of the encoder and `aByteLength` number of
1194 * additional input code units if there are no unmappable characters in
1197 inline CheckedInt
<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1198 size_t aByteLength
) const {
1199 CheckedInt
<size_t> max(
1200 encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1202 if (max
.value() == std::numeric_limits
<size_t>::max()) {
1203 // Mark invalid by overflowing
1205 MOZ_ASSERT(!max
.isValid());
1211 * Query the worst-case output size when encoding from UTF-8 without
1214 * Returns the size of the output buffer in bytes that will not overflow
1215 * given the current state of the encoder and `aByteLength` number of
1216 * additional input code units.
1218 inline CheckedInt
<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1219 size_t aByteLength
) const {
1220 CheckedInt
<size_t> max(
1221 encoder_max_buffer_length_from_utf8_without_replacement(this,
1223 if (max
.value() == std::numeric_limits
<size_t>::max()) {
1224 // Mark invalid by overflowing
1226 MOZ_ASSERT(!max
.isValid());
1232 * Incrementally encode into byte stream from UTF-8 with unmappable
1233 * characters replaced with HTML (decimal) numeric character references.
1235 * See the documentation of the class for documentation for `Encode*`
1236 * methods collectively.
1238 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1239 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1240 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1242 inline Tuple
<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1243 Span
<const uint8_t> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
1244 size_t srcRead
= aSrc
.Length();
1245 size_t dstWritten
= aDst
.Length();
1246 bool hadReplacements
;
1247 uint32_t result
= encoder_encode_from_utf8(this, aSrc
.Elements(), &srcRead
,
1248 aDst
.Elements(), &dstWritten
,
1249 aLast
, &hadReplacements
);
1250 return MakeTuple(result
, srcRead
, dstWritten
, hadReplacements
);
1254 * Incrementally encode into byte stream from UTF-8 _without replacement_.
1256 * See the documentation of the class for documentation for `Encode*`
1257 * methods collectively.
1259 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1260 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1261 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1263 inline Tuple
<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1264 Span
<const uint8_t> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
1265 size_t srcRead
= aSrc
.Length();
1266 size_t dstWritten
= aDst
.Length();
1267 uint32_t result
= encoder_encode_from_utf8_without_replacement(
1268 this, aSrc
.Elements(), &srcRead
, aDst
.Elements(), &dstWritten
, aLast
);
1269 return MakeTuple(result
, srcRead
, dstWritten
);
1273 * Query the worst-case output size when encoding from UTF-16 with
1276 * Returns the size of the output buffer in bytes that will not overflow
1277 * given the current state of the encoder and `aU16Length` number of
1278 * additional input code units if there are no unmappable characters in
1281 inline CheckedInt
<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1282 size_t aU16Length
) const {
1283 CheckedInt
<size_t> max(
1284 encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1286 if (max
.value() == std::numeric_limits
<size_t>::max()) {
1287 // Mark invalid by overflowing
1289 MOZ_ASSERT(!max
.isValid());
1295 * Query the worst-case output size when encoding from UTF-16 without
1298 * Returns the size of the output buffer in bytes that will not overflow
1299 * given the current state of the encoder and `aU16Length` number of
1300 * additional input code units.
1302 inline CheckedInt
<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1303 size_t aU16Length
) const {
1304 CheckedInt
<size_t> max(
1305 encoder_max_buffer_length_from_utf16_without_replacement(this,
1307 if (max
.value() == std::numeric_limits
<size_t>::max()) {
1308 // Mark invalid by overflowing
1310 MOZ_ASSERT(!max
.isValid());
1316 * Incrementally encode into byte stream from UTF-16 with unmappable
1317 * characters replaced with HTML (decimal) numeric character references.
1319 * See the documentation of the class for documentation for `Encode*`
1320 * methods collectively.
1322 inline Tuple
<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1323 Span
<const char16_t
> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
1324 size_t srcRead
= aSrc
.Length();
1325 size_t dstWritten
= aDst
.Length();
1326 bool hadReplacements
;
1327 uint32_t result
= encoder_encode_from_utf16(this, aSrc
.Elements(), &srcRead
,
1328 aDst
.Elements(), &dstWritten
,
1329 aLast
, &hadReplacements
);
1330 return MakeTuple(result
, srcRead
, dstWritten
, hadReplacements
);
1334 * Incrementally encode into byte stream from UTF-16 _without replacement_.
1336 * See the documentation of the class for documentation for `Encode*`
1337 * methods collectively.
1339 inline Tuple
<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1340 Span
<const char16_t
> aSrc
, Span
<uint8_t> aDst
, bool aLast
) {
1341 size_t srcRead
= aSrc
.Length();
1342 size_t dstWritten
= aDst
.Length();
1343 uint32_t result
= encoder_encode_from_utf16_without_replacement(
1344 this, aSrc
.Elements(), &srcRead
, aDst
.Elements(), &dstWritten
, aLast
);
1345 return MakeTuple(result
, srcRead
, dstWritten
);
1350 Encoder(const Encoder
&) = delete;
1351 Encoder
& operator=(const Encoder
&) = delete;
1354 }; // namespace mozilla
1356 #endif // mozilla_Encoding_h