Bug 1892041 - Part 3: Update test exclusions. r=spidermonkey-reviewers,dminor
[gecko.git] / intl / Encoding.h
blob3b5639e431d1d047061c6d695e28ef15638fdf9f
1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/Maybe.h"
19 #include "mozilla/NotNull.h"
20 #include "mozilla/Span.h"
21 #include "nsString.h"
23 #include <tuple>
25 namespace mozilla {
26 class Encoding;
27 class Decoder;
28 class Encoder;
29 }; // namespace mozilla
31 #define ENCODING_RS_ENCODING mozilla::Encoding
32 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
33 mozilla::NotNull<const mozilla::Encoding*>
34 #define ENCODING_RS_ENCODER mozilla::Encoder
35 #define ENCODING_RS_DECODER mozilla::Decoder
37 #include "encoding_rs.h"
39 extern "C" {
41 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
42 uint8_t const* src, size_t src_len,
43 nsAString* dst);
45 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
46 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
47 nsAString* dst);
49 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
50 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
51 nsAString* dst);
53 nsresult
54 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
55 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
56 nsAString* dst);
58 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
59 char16_t const* src, size_t src_len,
60 nsACString* dst);
62 nsresult mozilla_encoding_decode_to_nscstring(
63 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
65 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
66 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
68 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
69 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
71 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
72 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
73 nsACString* dst, size_t already_validated);
75 nsresult
76 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
77 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
79 nsresult mozilla_encoding_encode_from_nscstring(
80 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
82 } // extern "C"
84 namespace mozilla {
86 /**
87 * Return value from `Decoder`/`Encoder` to indicate that input
88 * was exhausted.
90 const uint32_t kInputEmpty = INPUT_EMPTY;
92 /**
93 * Return value from `Decoder`/`Encoder` to indicate that output
94 * space was insufficient.
96 const uint32_t kOutputFull = OUTPUT_FULL;
98 /**
99 * An encoding as defined in the Encoding Standard
100 * (https://encoding.spec.whatwg.org/).
102 * See https://docs.rs/encoding_rs/ for the Rust API docs.
104 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
105 * sequence and, in most cases, vice versa. Each encoding has a name, an output
106 * encoding, and one or more labels.
108 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
109 * encoding in formats and protocols. The _name_ of the encoding is the
110 * preferred label in the case appropriate for returning from the
111 * `characterSet` property of the `Document` DOM interface, except for
112 * the replacement encoding whose name is not one of its labels.
114 * The _output encoding_ is the encoding used for form submission and URL
115 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
116 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
117 * encodings.
119 * # Streaming vs. Non-Streaming
121 * When you have the entire input in a single buffer, you can use the
122 * methods `Decode()`, `DecodeWithBOMRemoval()`,
123 * `DecodeWithoutBOMHandling()`,
124 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
125 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
126 * `NewEncoder()` methods), these methods perform heap allocations. You should
127 * use the `Decoder` and `Encoder` objects when your input is split into multiple
128 * buffers or when you want to control the allocation of the output buffers.
130 * # Instances
132 * All instances of `Encoding` are statically allocated and have the process's
133 * lifetime. There is precisely one unique `Encoding` instance for each
134 * encoding defined in the Encoding Standard.
136 * To obtain a reference to a particular encoding whose identity you know at
137 * compile time, use a `static` that refers to encoding. There is a `static`
138 * for each encoding. The `static`s are named in all caps with hyphens
139 * replaced with underscores and with `_ENCODING` appended to the
140 * name. For example, if you know at compile time that you will want to
141 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
143 * If you don't know what encoding you need at compile time and need to
144 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
146 * Pointers to `Encoding` can be compared with `==` to check for the sameness
147 * of two encodings.
149 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
150 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
151 * `const mozilla::Encoding*` in the C signature and
152 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
154 class Encoding final {
155 public:
157 * Implements the _get an encoding_ algorithm
158 * (https://encoding.spec.whatwg.org/#concept-encoding-get).
160 * If, after ASCII-lowercasing and removing leading and trailing
161 * whitespace, the argument matches a label defined in the Encoding
162 * Standard, `const Encoding*` representing the corresponding
163 * encoding is returned. If there is no match, `nullptr` is returned.
165 * This is the right method to use if the action upon the method returning
166 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
167 * instead. When the action upon the method returning `nullptr` is not to
168 * proceed with a fallback but to refuse processing,
169 * `ForLabelNoReplacement()` is more appropriate.
171 static inline const Encoding* ForLabel(Span<const char> aLabel) {
172 return encoding_for_label(
173 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
177 * `nsAString` argument version. See above for docs.
179 static inline const Encoding* ForLabel(const nsAString& aLabel) {
180 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
184 * This method behaves the same as `ForLabel()`, except when `ForLabel()`
185 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
187 * This method is useful in scenarios where a fatal error is required
188 * upon invalid label, because in those cases the caller typically wishes
189 * to treat the labels that map to the replacement encoding as fatal
190 * errors, too.
192 * It is not OK to use this method when the action upon the method returning
193 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
194 * such a case, the `ForLabel()` method should be used instead in order to
195 * avoid unsafe fallback for labels that `ForLabel()` maps to
196 * `REPLACEMENT_ENCODING`.
198 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
199 return encoding_for_label_no_replacement(
200 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
204 * `nsAString` argument version. See above for docs.
206 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
207 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
211 * Performs non-incremental BOM sniffing.
213 * The argument must either be a buffer representing the entire input
214 * stream (non-streaming case) or a buffer representing at least the first
215 * three bytes of the input stream (streaming case).
217 * Returns `{UTF_8_ENCODING, 3}`,
218 * `{UTF_16LE_ENCODING, 2}` or
219 * `{UTF_16BE_ENCODING, 2}` if the argument starts with the
220 * UTF-8, UTF-16LE or UTF-16BE BOM or `{nullptr, 0}` otherwise.
222 static inline std::tuple<const Encoding*, size_t> ForBOM(
223 Span<const uint8_t> aBuffer) {
224 size_t len = aBuffer.Length();
225 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
226 return {encoding, len};
230 * Writes the name of this encoding into `aName`.
232 * This name is appropriate to return as-is from the DOM
233 * `document.characterSet` property.
235 inline void Name(nsACString& aName) const {
236 aName.SetLength(ENCODING_NAME_MAX_LENGTH);
237 size_t length =
238 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
239 aName.SetLength(length); // truncation in the 64-bit case is OK
243 * Checks whether the _output encoding_ of this encoding can encode every
244 * Unicode code point. (Only true if the output encoding is UTF-8.)
246 inline bool CanEncodeEverything() const {
247 return encoding_can_encode_everything(this);
251 * Checks whether this encoding maps one byte to one Basic Multilingual
252 * Plane code point (i.e. byte length equals decoded UTF-16 length) and
253 * vice versa (for mappable characters).
255 * `true` iff this encoding is on the list of Legacy single-byte
256 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
257 * in the spec or x-user-defined.
259 inline bool IsSingleByte() const { return encoding_is_single_byte(this); }
262 * Checks whether the bytes 0x00...0x7F map exclusively to the characters
263 * U+0000...U+007F and vice versa.
265 inline bool IsAsciiCompatible() const {
266 return encoding_is_ascii_compatible(this);
270 * Checks whether this is a Japanese legacy encoding.
272 inline bool IsJapaneseLegacy() const {
273 return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
274 this == ISO_2022_JP_ENCODING;
278 * Returns the _output encoding_ of this encoding. This is UTF-8 for
279 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
281 inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
282 return WrapNotNull(encoding_output_encoding(this));
286 * Decode complete input to `nsACString` _with BOM sniffing_ and with
287 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
288 * entire input is available as a single buffer (i.e. the end of the
289 * buffer marks the end of the stream).
291 * This method implements the (non-streaming version of) the
292 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
294 * The second item in the returned tuple is the encoding that was actually
295 * used (which may differ from this encoding thanks to BOM sniffing).
297 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
298 * if there were malformed sequences (that were replaced with the
299 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
300 * tuple.
302 * The backing buffer of the string isn't copied if the input buffer
303 * is heap-allocated and decoding from UTF-8 and the input is valid
304 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
305 * the input is valid ASCII or decoding from ISO-2022-JP and the
306 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
307 * the same string as both arguments.
309 * _Note:_ It is wrong to use this when the input buffer represents only
310 * a segment of the input instead of the whole input. Use `NewDecoder()`
311 * when decoding segmented input.
313 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
314 const nsACString& aBytes, nsACString& aOut) const {
315 const Encoding* encoding = this;
316 const nsACString* bytes = &aBytes;
317 nsACString* out = &aOut;
318 nsresult rv;
319 if (bytes == out) {
320 nsAutoCString temp(aBytes);
321 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
322 } else {
323 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
325 return {rv, WrapNotNull(encoding)};
329 * Decode complete input to `nsAString` _with BOM sniffing_ and with
330 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
331 * entire input is available as a single buffer (i.e. the end of the
332 * buffer marks the end of the stream).
334 * This method implements the (non-streaming version of) the
335 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
337 * The second item in the returned tuple is the encoding that was actually
338 * used (which may differ from this encoding thanks to BOM sniffing).
340 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
341 * if there were malformed sequences (that were replaced with the
342 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
343 * tuple.
345 * _Note:_ It is wrong to use this when the input buffer represents only
346 * a segment of the input instead of the whole input. Use `NewDecoder()`
347 * when decoding segmented input.
349 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
350 Span<const uint8_t> aBytes, nsAString& aOut) const {
351 const Encoding* encoding = this;
352 nsresult rv = mozilla_encoding_decode_to_nsstring(
353 &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
354 return {rv, WrapNotNull(encoding)};
358 * Decode complete input to `nsACString` _with BOM removal_ and with
359 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
360 * entire input is available as a single buffer (i.e. the end of the
361 * buffer marks the end of the stream).
363 * When invoked on `UTF_8`, this method implements the (non-streaming
364 * version of) the _UTF-8 decode_
365 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
367 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
368 * if there were malformed sequences (that were replaced with the
369 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
371 * The backing buffer of the string isn't copied if the input buffer
372 * is heap-allocated and decoding from UTF-8 and the input is valid
373 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
374 * the input is valid ASCII or decoding from ISO-2022-JP and the
375 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
376 * the same string as both arguments.
378 * _Note:_ It is wrong to use this when the input buffer represents only
379 * a segment of the input instead of the whole input. Use
380 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
382 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
383 nsACString& aOut) const {
384 const nsACString* bytes = &aBytes;
385 nsACString* out = &aOut;
386 if (bytes == out) {
387 nsAutoCString temp(aBytes);
388 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
389 out);
391 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
392 out);
396 * Decode complete input to `nsAString` _with BOM removal_ and with
397 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
398 * entire input is available as a single buffer (i.e. the end of the
399 * buffer marks the end of the stream).
401 * When invoked on `UTF_8`, this method implements the (non-streaming
402 * version of) the _UTF-8 decode_
403 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
405 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
406 * if there were malformed sequences (that were replaced with the
407 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
409 * _Note:_ It is wrong to use this when the input buffer represents only
410 * a segment of the input instead of the whole input. Use
411 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
413 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
414 nsAString& aOut) const {
415 return mozilla_encoding_decode_to_nsstring_with_bom_removal(
416 this, aBytes.Elements(), aBytes.Length(), &aOut);
420 * Decode complete input to `nsACString` _without BOM handling_ and
421 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
422 * the entire input is available as a single buffer (i.e. the end of the
423 * buffer marks the end of the stream).
425 * When invoked on `UTF_8`, this method implements the (non-streaming
426 * version of) the _UTF-8 decode without BOM_
427 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
429 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
430 * if there were malformed sequences (that were replaced with the
431 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
433 * The backing buffer of the string isn't copied if the input buffer
434 * is heap-allocated and decoding from UTF-8 and the input is valid
435 * UTF-8, decoding from an ASCII-compatible encoding and the input
436 * is valid ASCII or decoding from ISO-2022-JP and the input stays
437 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
438 * as both arguments.
440 * _Note:_ It is wrong to use this when the input buffer represents only
441 * a segment of the input instead of the whole input. Use
442 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
444 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
445 nsACString& aOut) const {
446 const nsACString* bytes = &aBytes;
447 nsACString* out = &aOut;
448 if (bytes == out) {
449 nsAutoCString temp(aBytes);
450 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
451 this, &temp, out);
453 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
454 this, bytes, out);
458 * Decode complete input to `nsAString` _without BOM handling_ and
459 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
460 * the entire input is available as a single buffer (i.e. the end of the
461 * buffer marks the end of the stream).
463 * When invoked on `UTF_8`, this method implements the (non-streaming
464 * version of) the _UTF-8 decode without BOM_
465 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
467 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
468 * if there were malformed sequences (that were replaced with the
469 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
471 * _Note:_ It is wrong to use this when the input buffer represents only
472 * a segment of the input instead of the whole input. Use
473 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
475 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
476 nsAString& aOut) const {
477 return mozilla_encoding_decode_to_nsstring_without_bom_handling(
478 this, aBytes.Elements(), aBytes.Length(), &aOut);
482 * Decode complete input to `nsACString` _without BOM handling_ and
483 * _with malformed sequences treated as fatal_ when the entire input is
484 * available as a single buffer (i.e. the end of the buffer marks the end
485 * of the stream).
487 * When invoked on `UTF_8`, this method implements the (non-streaming
488 * version of) the _UTF-8 decode without BOM or fail_
489 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
490 * spec concept.
492 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
493 * if a malformed sequence was encountered and `NS_OK` otherwise.
495 * The backing buffer of the string isn't copied if the input buffer
496 * is heap-allocated and decoding from UTF-8 and the input is valid
497 * UTF-8, decoding from an ASCII-compatible encoding and the input
498 * is valid ASCII or decoding from ISO-2022-JP and the input stays
499 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
500 * as both arguments.
502 * _Note:_ It is wrong to use this when the input buffer represents only
503 * a segment of the input instead of the whole input. Use
504 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
506 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
507 const nsACString& aBytes, nsACString& aOut) const {
508 const nsACString* bytes = &aBytes;
509 nsACString* out = &aOut;
510 if (bytes == out) {
511 nsAutoCString temp(aBytes);
512 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
513 this, &temp, out);
515 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
516 this, bytes, out);
520 * Decode complete input to `nsACString` _without BOM handling_ and
521 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
522 * the entire input is available as a single buffer (i.e. the end of the
523 * buffer marks the end of the stream) _asserting that a number of bytes
524 * from the start are already known to be valid UTF-8_.
526 * The use case for this method is avoiding copying when dealing with
527 * input that has a UTF-8 BOM. _When in doubt, do not use this method._
529 * When invoked on `UTF_8`, this method implements the (non-streaming
530 * version of) the _UTF-8 decode without BOM_
531 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
533 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
534 * if there were malformed sequences (that were replaced with the
535 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
537 * _Note:_ It is wrong to use this when the input buffer represents only
538 * a segment of the input instead of the whole input. Use
539 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
541 * # Safety
543 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
544 * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
546 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
547 nsACString& aOut,
548 size_t aAlreadyValidated) const {
549 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
550 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
554 * Decode complete input to `nsAString` _without BOM handling_ and
555 * _with malformed sequences treated as fatal_ when the entire input is
556 * available as a single buffer (i.e. the end of the buffer marks the end
557 * of the stream).
559 * When invoked on `UTF_8`, this method implements the (non-streaming
560 * version of) the _UTF-8 decode without BOM or fail_
561 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
562 * spec concept.
564 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
565 * if a malformed sequence was encountered and `NS_OK` otherwise.
567 * _Note:_ It is wrong to use this when the input buffer represents only
568 * a segment of the input instead of the whole input. Use
569 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
571 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
572 Span<const uint8_t> aBytes, nsAString& aOut) const {
573 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
574 this, aBytes.Elements(), aBytes.Length(), &aOut);
578 * Encode complete input to `nsACString` with unmappable characters
579 * replaced with decimal numeric character references when the entire input
580 * is available as a single buffer (i.e. the end of the buffer marks the
581 * end of the stream).
583 * This method implements the (non-streaming version of) the
584 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
586 * The second item in the returned tuple is the encoding that was actually
587 * used (which may differ from this encoding thanks to some encodings
588 * having UTF-8 as their output encoding).
590 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
591 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
592 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
593 * replaced with numeric character references) and `NS_OK` otherwise.
595 * The backing buffer of the string isn't copied if the input buffer
596 * is heap-allocated and encoding to UTF-8 and the input is valid
597 * UTF-8, encoding to an ASCII-compatible encoding and the input
598 * is valid ASCII or encoding to ISO-2022-JP and the input stays
599 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
600 * as both arguments.
602 * _Note:_ It is wrong to use this when the input buffer represents only
603 * a segment of the input instead of the whole input. Use `NewEncoder()`
604 * when encoding segmented input.
606 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
607 const nsACString& aString, nsACString& aOut) const {
608 const Encoding* encoding = this;
609 const nsACString* string = &aString;
610 nsACString* out = &aOut;
611 nsresult rv;
612 if (string == out) {
613 nsAutoCString temp(aString);
614 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
615 } else {
616 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
618 return {rv, WrapNotNull(encoding)};
622 * Encode complete input to `nsACString` with unmappable characters
623 * replaced with decimal numeric character references when the entire input
624 * is available as a single buffer (i.e. the end of the buffer marks the
625 * end of the stream).
627 * This method implements the (non-streaming version of) the
628 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
630 * The second item in the returned tuple is the encoding that was actually
631 * used (which may differ from this encoding thanks to some encodings
632 * having UTF-8 as their output encoding).
634 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
635 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
636 * were replaced with numeric character references) and `NS_OK` otherwise.
638 * _Note:_ It is wrong to use this when the input buffer represents only
639 * a segment of the input instead of the whole input. Use `NewEncoder()`
640 * when encoding segmented input.
642 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
643 Span<const char16_t> aString, nsACString& aOut) const {
644 const Encoding* encoding = this;
645 nsresult rv = mozilla_encoding_encode_from_utf16(
646 &encoding, aString.Elements(), aString.Length(), &aOut);
647 return {rv, WrapNotNull(encoding)};
651 * Instantiates a new decoder for this encoding with BOM sniffing enabled.
653 * BOM sniffing may cause the returned decoder to morph into a decoder
654 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
656 inline UniquePtr<Decoder> NewDecoder() const {
657 UniquePtr<Decoder> decoder(encoding_new_decoder(this));
658 return decoder;
662 * Instantiates a new decoder for this encoding with BOM sniffing enabled
663 * into memory occupied by a previously-instantiated decoder.
665 * BOM sniffing may cause the returned decoder to morph into a decoder
666 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
668 inline void NewDecoderInto(Decoder& aDecoder) const {
669 encoding_new_decoder_into(this, &aDecoder);
673 * Instantiates a new decoder for this encoding with BOM removal.
675 * If the input starts with bytes that are the BOM for this encoding,
676 * those bytes are removed. However, the decoder never morphs into a
677 * decoder for another encoding: A BOM for another encoding is treated as
678 * (potentially malformed) input to the decoding algorithm for this
679 * encoding.
681 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
682 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
683 return decoder;
687 * Instantiates a new decoder for this encoding with BOM removal
688 * into memory occupied by a previously-instantiated decoder.
690 * If the input starts with bytes that are the BOM for this encoding,
691 * those bytes are removed. However, the decoder never morphs into a
692 * decoder for another encoding: A BOM for another encoding is treated as
693 * (potentially malformed) input to the decoding algorithm for this
694 * encoding.
696 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
697 encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
701 * Instantiates a new decoder for this encoding with BOM handling disabled.
703 * If the input starts with bytes that look like a BOM, those bytes are
704 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
705 * for another encoding.)
707 * _Note:_ If the caller has performed BOM sniffing on its own but has not
708 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
709 * instead of this method to cause the BOM to be removed.
711 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
712 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
713 return decoder;
717 * Instantiates a new decoder for this encoding with BOM handling disabled
718 * into memory occupied by a previously-instantiated decoder.
720 * If the input starts with bytes that look like a BOM, those bytes are
721 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
722 * for another encoding.)
724 * _Note:_ If the caller has performed BOM sniffing on its own but has not
725 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
726 * instead of this method to cause the BOM to be removed.
728 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
729 encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
733 * Instantiates a new encoder for the output encoding of this encoding.
735 inline UniquePtr<Encoder> NewEncoder() const {
736 UniquePtr<Encoder> encoder(encoding_new_encoder(this));
737 return encoder;
741 * Instantiates a new encoder for the output encoding of this encoding
742 * into memory occupied by a previously-instantiated encoder.
744 inline void NewEncoderInto(Encoder& aEncoder) const {
745 encoding_new_encoder_into(this, &aEncoder);
749 * Validates UTF-8.
751 * Returns the index of the first byte that makes the input malformed as
752 * UTF-8 or the length of the input if the input is entirely valid.
754 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
755 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
759 * Validates ASCII.
761 * Returns the index of the first byte that makes the input malformed as
762 * ASCII or the length of the input if the input is entirely valid.
764 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
765 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
769 * Validates ISO-2022-JP ASCII-state data.
771 * Returns the index of the first byte that makes the input not
772 * representable in the ASCII state of ISO-2022-JP or the length of the
773 * input if the input is entirely representable in the ASCII state of
774 * ISO-2022-JP.
776 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
777 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
778 aBuffer.Length());
781 private:
782 Encoding() = delete;
783 Encoding(const Encoding&) = delete;
784 Encoding& operator=(const Encoding&) = delete;
785 ~Encoding() = delete;
789 * A converter that decodes a byte stream into Unicode according to a
790 * character encoding in a streaming (incremental) manner.
792 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
793 * buffer `aDst` both of which are caller-allocated. There are variants for
794 * both UTF-8 and UTF-16 output buffers.
796 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
797 * into `aDst` until one of the following three things happens:
799 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
800 * variants only).
802 * 2. The output buffer has been filled so near capacity that the decoder
803 * cannot be sure that processing an additional byte of input wouldn't
804 * cause so much output that the output buffer would overflow.
806 * 3. All the input bytes have been processed.
808 * The `Decode*` method then returns tuple of a status indicating which one
809 * of the three reasons to return happened, how many input bytes were read,
810 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
811 * when decoding to UTF-16) were written, and in the case of the
812 * variants performing replacement, a boolean indicating whether an error was
813 * replaced with the REPLACEMENT CHARACTER during the call.
815 * The number of bytes "written" is what's logically written. Garbage may be
816 * written in the output buffer beyond the point logically written to.
818 * In the case of the `*WithoutReplacement` variants, the status is a
819 * `uint32_t` whose possible values are packed info about a malformed byte
820 * sequence, `kOutputFull` and `kInputEmpty` corresponding to the three cases
821 * listed above).
823 * Packed info about malformed sequences has the following format:
824 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
825 * indicate the number of bytes that were consumed after the malformed
826 * sequence and whose next-lowest 8 bits, when shifted right by 8 indicate
827 * the length of the malformed byte sequence (possible decimal values 1, 2,
828 * 3 or 4). The maximum possible sum of the two is 6.
830 * In the case of methods whose name does not end with
831 * `*WithoutReplacement`, malformed sequences are automatically replaced
832 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
833 * return early.
835 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
836 * space. When decoding to UTF-16, the output buffer must have at least two
837 * UTF-16 code units (`char16_t`) of space.
839 * When decoding to UTF-8 without replacement, the methods are guaranteed
840 * not to return indicating that more output space is needed if the length
841 * of the output buffer is at least the length returned by
842 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
843 * with replacement, the length of the output buffer that guarantees the
844 * methods not to return indicating that more output space is needed is given
845 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
846 * or without replacement, the length of the output buffer that guarantees
847 * the methods not to return indicating that more output space is needed is
848 * given by `MaxUTF16BufferLength()`.
850 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
851 * and the output after each `Decode*` call is guaranteed to consist of
852 * complete characters. (I.e. the code unit sequence for the last character is
853 * guaranteed not to be split across output buffers.)
855 * The boolean argument `aLast` indicates that the end of the stream is reached
856 * when all the bytes in `aSrc` have been consumed.
858 * A `Decoder` object can be used to incrementally decode a byte stream.
860 * During the processing of a single stream, the caller must call `Decode*`
861 * zero or more times with `aLast` set to `false` and then call `Decode*` at
862 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
863 * the processing of the stream has ended. Otherwise, the caller must call
864 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
865 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
867 * Once the stream has ended, the `Decoder` object must not be used anymore.
868 * That is, you need to create another one to process another stream.
870 * When the decoder returns `kOutputFull` or the decoder returns a malformed
871 * result and the caller does not wish to treat it as a fatal error, the input
872 * buffer `aSrc` may not have been completely consumed. In that case, the caller
873 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
874 * call.
876 * # Infinite loops
878 * When converting with a fixed-size output buffer whose size is too small to
879 * accommodate one character of output, an infinite loop ensues. When
880 * converting with a fixed-size output buffer, it generally makes sense to
881 * make the buffer fairly large (e.g. couple of kilobytes).
883 class Decoder final {
884 public:
885 ~Decoder() = default;
886 static void operator delete(void* aDecoder) {
887 decoder_free(reinterpret_cast<Decoder*>(aDecoder));
891 * The `Encoding` this `Decoder` is for.
893 * BOM sniffing can change the return value of this method during the life
894 * of the decoder.
896 inline NotNull<const mozilla::Encoding*> Encoding() const {
897 return WrapNotNull(decoder_encoding(this));
901 * Query the worst-case UTF-8 output size _with replacement_.
903 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
904 * that will not overflow given the current state of the decoder and
905 * `aByteLength` number of additional input bytes when decoding with
906 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
907 * sequence.
909 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
910 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
911 if (max.value() == std::numeric_limits<size_t>::max()) {
912 // Mark invalid by overflowing
913 max++;
914 MOZ_ASSERT(!max.isValid());
916 return max;
920 * Query the worst-case UTF-8 output size _without replacement_.
922 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
923 * that will not overflow given the current state of the decoder and
924 * `aByteLength` number of additional input bytes when decoding without
925 * replacement error handling.
927 * Note that this value may be too small for the `WithReplacement` case.
928 * Use `MaxUTF8BufferLength()` for that case.
930 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
931 size_t aByteLength) const {
932 CheckedInt<size_t> max(
933 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
934 if (max.value() == std::numeric_limits<size_t>::max()) {
935 // Mark invalid by overflowing
936 max++;
937 MOZ_ASSERT(!max.isValid());
939 return max;
943 * Incrementally decode a byte stream into UTF-8 with malformed sequences
944 * replaced with the REPLACEMENT CHARACTER.
946 * See the documentation of the class for documentation for `Decode*`
947 * methods collectively.
949 inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
950 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
951 size_t srcRead = aSrc.Length();
952 size_t dstWritten = aDst.Length();
953 bool hadReplacements;
954 uint32_t result =
955 decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
956 &dstWritten, aLast, &hadReplacements);
957 return {result, srcRead, dstWritten, hadReplacements};
961 * Incrementally decode a byte stream into UTF-8 _without replacement_.
963 * See the documentation of the class for documentation for `Decode*`
964 * methods collectively.
966 inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
967 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
968 size_t srcRead = aSrc.Length();
969 size_t dstWritten = aDst.Length();
970 uint32_t result = decoder_decode_to_utf8_without_replacement(
971 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
972 return {result, srcRead, dstWritten};
976 * Query the worst-case UTF-16 output size (with or without replacement).
978 * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
979 * that will not overflow given the current state of the decoder and
980 * `aByteLength` number of additional input bytes.
982 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
983 * return value of this method applies also in the
984 * `_without_replacement` case.
986 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
987 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
988 if (max.value() == std::numeric_limits<size_t>::max()) {
989 // Mark invalid by overflowing
990 max++;
991 MOZ_ASSERT(!max.isValid());
993 return max;
997 * Incrementally decode a byte stream into UTF-16 with malformed sequences
998 * replaced with the REPLACEMENT CHARACTER.
1000 * See the documentation of the class for documentation for `Decode*`
1001 * methods collectively.
1003 inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1004 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1005 size_t srcRead = aSrc.Length();
1006 size_t dstWritten = aDst.Length();
1007 bool hadReplacements;
1008 uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1009 aDst.Elements(), &dstWritten,
1010 aLast, &hadReplacements);
1011 return {result, srcRead, dstWritten, hadReplacements};
1015 * Incrementally decode a byte stream into UTF-16 _without replacement_.
1017 * See the documentation of the class for documentation for `Decode*`
1018 * methods collectively.
1020 inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1021 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1022 size_t srcRead = aSrc.Length();
1023 size_t dstWritten = aDst.Length();
1024 uint32_t result = decoder_decode_to_utf16_without_replacement(
1025 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1026 return {result, srcRead, dstWritten};
1030 * Checks for compatibility with storing Unicode scalar values as unsigned
1031 * bytes taking into account the state of the decoder.
1033 * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1034 * including waiting for the BOM, or if the encoding is never
1035 * Latin1-byte-compatible.
1037 * Otherwise returns the index of the first byte whose unsigned value doesn't
1038 * directly correspond to the decoded Unicode scalar value, or the length
1039 * of the input if all bytes in the input decode directly to scalar values
1040 * corresponding to the unsigned byte values.
1042 * Does not change the state of the decoder.
1044 * Do not use this unless you are supporting SpiderMonkey-style string
1045 * storage optimizations.
1047 inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1048 Span<const uint8_t> aBuffer) const {
1049 size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1050 aBuffer.Length());
1051 if (upTo == std::numeric_limits<size_t>::max()) {
1052 return mozilla::Nothing();
1054 return mozilla::Some(upTo);
1057 private:
1058 Decoder() = delete;
1059 Decoder(const Decoder&) = delete;
1060 Decoder& operator=(const Decoder&) = delete;
1064 * A converter that encodes a Unicode stream into bytes according to a
1065 * character encoding in a streaming (incremental) manner.
1067 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1068 * buffer `aDst` both of which are caller-allocated. There are variants for
1069 * both UTF-8 and UTF-16 input buffers.
1071 * An `Encode*` method encode characters from `aSrc` into bytes characters
1072 * stored into `aDst` until one of the following three things happens:
1074 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1075 * only).
1077 * 2. The output buffer has been filled so near capacity that the decoder
1078 * cannot be sure that processing an additional character of input wouldn't
1079 * cause so much output that the output buffer would overflow.
1081 * 3. All the input characters have been processed.
1083 * The `Encode*` method then returns tuple of a status indicating which one
1084 * of the three reasons to return happened, how many input code units (`uint8_t`
1085 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1086 * how many output bytes were written, and in the case of the variants that
1087 * perform replacement, a boolean indicating whether an unmappable
1088 * character was replaced with a numeric character reference during the call.
1090 * The number of bytes "written" is what's logically written. Garbage may be
1091 * written in the output buffer beyond the point logically written to.
1093 * In the case of the methods whose name ends with
1094 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1095 * are an unmappable code point, `kOutputFull` and `kInputEmpty` corresponding
1096 * to the three cases listed above).
1098 * In the case of methods whose name does not end with
1099 * `*WithoutReplacement`, unmappable characters are automatically replaced
1100 * with the corresponding numeric character references and unmappable
1101 * characters do not cause the methods to return early.
1103 * When encoding from UTF-8 without replacement, the methods are guaranteed
1104 * not to return indicating that more output space is needed if the length
1105 * of the output buffer is at least the length returned by
1106 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1107 * UTF-8 with replacement, the length of the output buffer that guarantees the
1108 * methods not to return indicating that more output space is needed in the
1109 * absence of unmappable characters is given by
1110 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1111 * UTF-16 without replacement, the methods are guaranteed not to return
1112 * indicating that more output space is needed if the length of the output
1113 * buffer is at least the length returned by
1114 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1115 * from UTF-16 with replacement, the the length of the output buffer that
1116 * guarantees the methods not to return indicating that more output space is
1117 * needed in the absence of unmappable characters is given by
1118 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1119 * When encoding with replacement, applications are not expected to size the
1120 * buffer for the worst case ahead of time but to resize the buffer if there
1121 * are unmappable characters. This is why max length queries are only available
1122 * for the case where there are no unmappable characters.
1124 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1125 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1126 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1127 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1128 * surrogate pairs are not split across input buffer boundaries.
1130 * After an `Encode*` call returns, the output produced so far, taken as a
1131 * whole from the start of the stream, is guaranteed to consist of a valid
1132 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1133 * character is guaranteed not to be split across output buffers. However, due
1134 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1135 * from the start for it to be valid. For other encodings, the validity holds
1136 * on a per-output buffer basis.)
1138 * The boolean argument `aLast` indicates that the end of the stream is reached
1139 * when all the characters in `aSrc` have been consumed. This argument is needed
1140 * for ISO-2022-JP and is ignored for other encodings.
1142 * An `Encoder` object can be used to incrementally encode a byte stream.
1144 * During the processing of a single stream, the caller must call `Encode*`
1145 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1146 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1147 * the processing of the stream has ended. Otherwise, the caller must call
1148 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1149 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1151 * Once the stream has ended, the `Encoder` object must not be used anymore.
1152 * That is, you need to create another one to process another stream.
1154 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1155 * result and the caller does not wish to treat it as a fatal error, the input
1156 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1157 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1158 * call.
1160 * # Infinite loops
1162 * When converting with a fixed-size output buffer whose size is too small to
1163 * accommodate one character of output, an infinite loop ensues. When
1164 * converting with a fixed-size output buffer, it generally makes sense to
1165 * make the buffer fairly large (e.g. couple of kilobytes).
1167 class Encoder final {
1168 public:
1169 ~Encoder() = default;
1171 static void operator delete(void* aEncoder) {
1172 encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1176 * The `Encoding` this `Encoder` is for.
1178 inline NotNull<const mozilla::Encoding*> Encoding() const {
1179 return WrapNotNull(encoder_encoding(this));
1183 * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1184 * ASCII state and `false` otherwise.
1186 inline bool HasPendingState() const {
1187 return encoder_has_pending_state(this);
1191 * Query the worst-case output size when encoding from UTF-8 with
1192 * replacement.
1194 * Returns the size of the output buffer in bytes that will not overflow
1195 * given the current state of the encoder and `aByteLength` number of
1196 * additional input code units if there are no unmappable characters in
1197 * the input.
1199 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1200 size_t aByteLength) const {
1201 CheckedInt<size_t> max(
1202 encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1203 aByteLength));
1204 if (max.value() == std::numeric_limits<size_t>::max()) {
1205 // Mark invalid by overflowing
1206 max++;
1207 MOZ_ASSERT(!max.isValid());
1209 return max;
1213 * Query the worst-case output size when encoding from UTF-8 without
1214 * replacement.
1216 * Returns the size of the output buffer in bytes that will not overflow
1217 * given the current state of the encoder and `aByteLength` number of
1218 * additional input code units.
1220 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1221 size_t aByteLength) const {
1222 CheckedInt<size_t> max(
1223 encoder_max_buffer_length_from_utf8_without_replacement(this,
1224 aByteLength));
1225 if (max.value() == std::numeric_limits<size_t>::max()) {
1226 // Mark invalid by overflowing
1227 max++;
1228 MOZ_ASSERT(!max.isValid());
1230 return max;
1234 * Incrementally encode into byte stream from UTF-8 with unmappable
1235 * characters replaced with HTML (decimal) numeric character references.
1237 * See the documentation of the class for documentation for `Encode*`
1238 * methods collectively.
1240 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1241 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1242 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1244 inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1245 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1246 size_t srcRead = aSrc.Length();
1247 size_t dstWritten = aDst.Length();
1248 bool hadReplacements;
1249 uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1250 aDst.Elements(), &dstWritten,
1251 aLast, &hadReplacements);
1252 return {result, srcRead, dstWritten, hadReplacements};
1256 * Incrementally encode into byte stream from UTF-8 _without replacement_.
1258 * See the documentation of the class for documentation for `Encode*`
1259 * methods collectively.
1261 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1262 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1263 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1265 inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1266 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1267 size_t srcRead = aSrc.Length();
1268 size_t dstWritten = aDst.Length();
1269 uint32_t result = encoder_encode_from_utf8_without_replacement(
1270 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1271 return {result, srcRead, dstWritten};
1275 * Query the worst-case output size when encoding from UTF-16 with
1276 * replacement.
1278 * Returns the size of the output buffer in bytes that will not overflow
1279 * given the current state of the encoder and `aU16Length` number of
1280 * additional input code units if there are no unmappable characters in
1281 * the input.
1283 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1284 size_t aU16Length) const {
1285 CheckedInt<size_t> max(
1286 encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1287 aU16Length));
1288 if (max.value() == std::numeric_limits<size_t>::max()) {
1289 // Mark invalid by overflowing
1290 max++;
1291 MOZ_ASSERT(!max.isValid());
1293 return max;
1297 * Query the worst-case output size when encoding from UTF-16 without
1298 * replacement.
1300 * Returns the size of the output buffer in bytes that will not overflow
1301 * given the current state of the encoder and `aU16Length` number of
1302 * additional input code units.
1304 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1305 size_t aU16Length) const {
1306 CheckedInt<size_t> max(
1307 encoder_max_buffer_length_from_utf16_without_replacement(this,
1308 aU16Length));
1309 if (max.value() == std::numeric_limits<size_t>::max()) {
1310 // Mark invalid by overflowing
1311 max++;
1312 MOZ_ASSERT(!max.isValid());
1314 return max;
1318 * Incrementally encode into byte stream from UTF-16 with unmappable
1319 * characters replaced with HTML (decimal) numeric character references.
1321 * See the documentation of the class for documentation for `Encode*`
1322 * methods collectively.
1324 inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1325 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1326 size_t srcRead = aSrc.Length();
1327 size_t dstWritten = aDst.Length();
1328 bool hadReplacements;
1329 uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1330 aDst.Elements(), &dstWritten,
1331 aLast, &hadReplacements);
1332 return {result, srcRead, dstWritten, hadReplacements};
1336 * Incrementally encode into byte stream from UTF-16 _without replacement_.
1338 * See the documentation of the class for documentation for `Encode*`
1339 * methods collectively.
1341 inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1342 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1343 size_t srcRead = aSrc.Length();
1344 size_t dstWritten = aDst.Length();
1345 uint32_t result = encoder_encode_from_utf16_without_replacement(
1346 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1347 return {result, srcRead, dstWritten};
1350 private:
1351 Encoder() = delete;
1352 Encoder(const Encoder&) = delete;
1353 Encoder& operator=(const Encoder&) = delete;
1356 }; // namespace mozilla
1358 #endif // mozilla_Encoding_h