Bug 1702213 [wpt PR 28315] - Update wpt metadata, a=testonly
[gecko.git] / intl / Encoding.h
blobee57422e943b131e3eadb4e7e80d47330fc8fcaf
1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/Maybe.h"
19 #include "mozilla/NotNull.h"
20 #include "mozilla/Span.h"
21 #include "mozilla/Tuple.h"
22 #include "nsString.h"
// Forward declarations of the three classes named by the ENCODING_RS_* macros
// that follow, so those macros can refer to the types before encoding_rs.h is
// included.
24 namespace mozilla {
25 class Encoding;
26 class Decoder;
27 class Encoder;
28 }; // namespace mozilla
// Type-name macros defined immediately before encoding_rs.h is included.
// NOTE(review): presumably encoding_rs.h consults these so that its C
// declarations use the mozilla:: classes instead of its own default types;
// confirm against that header.
30 #define ENCODING_RS_ENCODING mozilla::Encoding
31 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
32 mozilla::NotNull<const mozilla::Encoding*>
33 #define ENCODING_RS_ENCODER mozilla::Encoder
34 #define ENCODING_RS_DECODER mozilla::Decoder
36 #include "encoding_rs.h"
// C-linkage declarations of the helpers that the inline `Encoding` methods in
// this header call to decode into / encode from `nsAString` / `nsACString`.
// The definitions are not in this file.
//
// The functions that take `mozilla::Encoding const**` are called by
// `Encoding::Decode()` / `Encoding::Encode()` with the address of a local that
// starts out as `this`; those callers hand the local's value after the call
// back to their own caller as the encoding that was actually used.
38 extern "C" {
40 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
41 uint8_t const* src, size_t src_len,
42 nsAString* dst);
44 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
45 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
46 nsAString* dst);
48 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
49 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
50 nsAString* dst);
52 nsresult
53 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
54 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
55 nsAString* dst);
57 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
58 char16_t const* src, size_t src_len,
59 nsACString* dst);
61 nsresult mozilla_encoding_decode_to_nscstring(
62 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
64 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
65 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
67 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
68 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
70 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
71 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
72 nsACString* dst, size_t already_validated);
74 nsresult
75 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
76 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
78 nsresult mozilla_encoding_encode_from_nscstring(
79 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
81 } // extern "C"
83 namespace mozilla {
85 /**
86 * Return value from `Decoder`/`Encoder` to indicate that input
87 * was exhausted.
// Typed C++ constant carrying the value of INPUT_EMPTY, which is not defined
// in this file (presumably supplied by encoding_rs.h; confirm).
89 const uint32_t kInputEmpty = INPUT_EMPTY;
91 /**
92 * Return value from `Decoder`/`Encoder` to indicate that output
93 * space was insufficient.
// Typed C++ constant carrying the value of OUTPUT_FULL, which is not defined
// in this file (presumably supplied by encoding_rs.h; confirm).
95 const uint32_t kOutputFull = OUTPUT_FULL;
97 /**
98 * An encoding as defined in the Encoding Standard
99 * (https://encoding.spec.whatwg.org/).
101 * See https://docs.rs/encoding_rs/ for the Rust API docs.
103 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
104 * sequence and, in most cases, vice versa. Each encoding has a name, an output
105 * encoding, and one or more labels.
107 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
108 * encoding in formats and protocols. The _name_ of the encoding is the
109 * preferred label in the case appropriate for returning from the
110 * `characterSet` property of the `Document` DOM interface, except for
111 * the replacement encoding whose name is not one of its labels.
113 * The _output encoding_ is the encoding used for form submission and URL
114 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
115 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
116 * encodings.
118 * # Streaming vs. Non-Streaming
120 * When you have the entire input in a single buffer, you can use the
121 * methods `Decode()`, `DecodeWithBOMRemoval()`,
122 * `DecodeWithoutBOMHandling()`,
123 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
124 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
125 * `NewEncoder()` methods), these methods perform heap allocations. You should
126 * use the `Decoder` and `Encoder` objects when your input is split into multiple
127 * buffers or when you want to control the allocation of the output buffers.
129 * # Instances
131 * All instances of `Encoding` are statically allocated and have the process's
132 * lifetime. There is precisely one unique `Encoding` instance for each
133 * encoding defined in the Encoding Standard.
135 * To obtain a reference to a particular encoding whose identity you know at
136 * compile time, use a `static` that refers to encoding. There is a `static`
137 * for each encoding. The `static`s are named in all caps with hyphens
138 * replaced with underscores and with `_ENCODING` appended to the
139 * name. For example, if you know at compile time that you will want to
140 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
142 * If you don't know what encoding you need at compile time and need to
143 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
145 * Pointers to `Encoding` can be compared with `==` to check for the sameness
146 * of two encodings.
148 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
149 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
150 * `const mozilla::Encoding*` in the C signature and
151 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
153 class Encoding final {
154 public:
156 * Implements the _get an encoding_ algorithm
157 * (https://encoding.spec.whatwg.org/#concept-encoding-get).
159 * If, after ASCII-lowercasing and removing leading and trailing
160 * whitespace, the argument matches a label defined in the Encoding
161 * Standard, `const Encoding*` representing the corresponding
162 * encoding is returned. If there is no match, `nullptr` is returned.
164 * This is the right method to use if the action upon the method returning
165 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
166 * instead. When the action upon the method returning `nullptr` is not to
167 * proceed with a fallback but to refuse processing,
168 * `ForLabelNoReplacement()` is more appropriate.
170 static inline const Encoding* ForLabel(Span<const char> aLabel) {
// Thin wrapper over `encoding_for_label`: the span's `char` data is passed as
// a `const uint8_t*` plus length (reinterpreted, not copied).
171 return encoding_for_label(
172 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
176 * `nsAString` argument version. See above for docs.
178 static inline const Encoding* ForLabel(const nsAString& aLabel) {
// Converts the UTF-16 label to UTF-8 with a temporary and delegates to
// another `ForLabel()` overload (presumably the `Span<const char>` one via
// implicit conversion; confirm).
179 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
183 * This method behaves the same as `ForLabel()`, except when `ForLabel()`
184 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
186 * This method is useful in scenarios where a fatal error is required
187 * upon invalid label, because in those cases the caller typically wishes
188 * to treat the labels that map to the replacement encoding as fatal
189 * errors, too.
191 * It is not OK to use this method when the action upon the method returning
192 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
193 * such a case, the `ForLabel()` method should be used instead in order to
194 * avoid unsafe fallback for labels that `ForLabel()` maps to
195 * `REPLACEMENT_ENCODING`.
197 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
// Thin wrapper over `encoding_for_label_no_replacement`: the span's `char`
// data is passed as a `const uint8_t*` plus length (reinterpreted, not copied).
198 return encoding_for_label_no_replacement(
199 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
203 * `nsAString` argument version. See above for docs.
205 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
// Converts the UTF-16 label to UTF-8 with a temporary and delegates to
// another `ForLabelNoReplacement()` overload (presumably the
// `Span<const char>` one via implicit conversion; confirm).
206 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
210 * Performs non-incremental BOM sniffing.
212 * The argument must either be a buffer representing the entire input
213 * stream (non-streaming case) or a buffer representing at least the first
214 * three bytes of the input stream (streaming case).
216 * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
217 * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
218 * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
220 static inline Tuple<const Encoding*, size_t> ForBOM(
221 Span<const uint8_t> aBuffer) {
// `len` is an in/out value: it is initialized to the buffer length, passed to
// `encoding_for_bom` by pointer, and whatever the callee leaves in it is
// returned as the second item of the tuple.
222 size_t len = aBuffer.Length();
223 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
224 return MakeTuple(encoding, len);
228 * Writes the name of this encoding into `aName`.
230 * This name is appropriate to return as-is from the DOM
231 * `document.characterSet` property.
233 inline void Name(nsACString& aName) const {
// Size the string to ENCODING_NAME_MAX_LENGTH before handing its writable
// buffer to `encoding_name`, then shrink it to the length the callee reports.
234 aName.SetLength(ENCODING_NAME_MAX_LENGTH);
235 size_t length =
236 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
237 aName.SetLength(length); // truncation in the 64-bit case is OK
241 * Checks whether the _output encoding_ of this encoding can encode every
242 * Unicode code point. (Only true if the output encoding is UTF-8.)
244 inline bool CanEncodeEverything() const {
// Thin wrapper: forwards `this` to `encoding_can_encode_everything`.
245 return encoding_can_encode_everything(this);
249 * Checks whether this encoding maps one byte to one Basic Multilingual
250 * Plane code point (i.e. byte length equals decoded UTF-16 length) and
251 * vice versa (for mappable characters).
253 * `true` iff this encoding is on the list of Legacy single-byte
254 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
255 * in the spec or x-user-defined.
// Thin wrapper: forwards `this` to `encoding_is_single_byte`.
257 inline bool IsSingleByte() const { return encoding_is_single_byte(this); }
260 * Checks whether the bytes 0x00...0x7F map exclusively to the characters
261 * U+0000...U+007F and vice versa.
263 inline bool IsAsciiCompatible() const {
// Thin wrapper: forwards `this` to `encoding_is_ascii_compatible`.
264 return encoding_is_ascii_compatible(this);
268 * Checks whether this is a Japanese legacy encoding.
270 inline bool IsJapaneseLegacy() const {
// Implemented purely as pointer-identity comparisons against the three
// Japanese legacy encoding constants; this relies on each encoding having
// exactly one instance, as stated in this class's documentation.
271 return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
272 this == ISO_2022_JP_ENCODING;
276 * Returns the _output encoding_ of this encoding. This is UTF-8 for
277 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
279 inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
// The result of `encoding_output_encoding` is wrapped as `NotNull`, i.e. the
// callee is expected never to return null (not verifiable from this file).
280 return WrapNotNull(encoding_output_encoding(this));
284 * Decode complete input to `nsACString` _with BOM sniffing_ and with
285 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
286 * entire input is available as a single buffer (i.e. the end of the
287 * buffer marks the end of the stream).
289 * This method implements the (non-streaming version of) the
290 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
292 * The second item in the returned tuple is the encoding that was actually
293 * used (which may differ from this encoding thanks to BOM sniffing).
295 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
296 * if there were malformed sequences (that were replaced with the
297 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
298 * tuple.
300 * The backing buffer of the string isn't copied if the input buffer
301 * is heap-allocated and decoding from UTF-8 and the input is valid
302 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
303 * the input is valid ASCII or decoding from ISO-2022-JP and the
304 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
305 * the same string as both arguments.
307 * _Note:_ It is wrong to use this when the input buffer represents only
308 * a segment of the input instead of the whole input. Use `NewDecoder()`
309 * when decoding segmented input.
311 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
312 const nsACString& aBytes, nsACString& aOut) const {
// `encoding` starts out as `this` and is passed by address, so the callee can
// replace it; its final value is returned as the encoding actually used.
313 const Encoding* encoding = this;
314 const nsACString* bytes = &aBytes;
315 nsACString* out = &aOut;
316 nsresult rv;
317 if (bytes == out) {
// Input and output are the same string object: decode from a temporary copy
// so that the source and destination passed to the callee are distinct.
318 nsAutoCString temp(aBytes);
319 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
320 } else {
321 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
323 return MakeTuple(rv, WrapNotNull(encoding));
327 * Decode complete input to `nsAString` _with BOM sniffing_ and with
328 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
329 * entire input is available as a single buffer (i.e. the end of the
330 * buffer marks the end of the stream).
332 * This method implements the (non-streaming version of) the
333 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
335 * The second item in the returned tuple is the encoding that was actually
336 * used (which may differ from this encoding thanks to BOM sniffing).
338 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
339 * if there were malformed sequences (that were replaced with the
340 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
341 * tuple.
343 * _Note:_ It is wrong to use this when the input buffer represents only
344 * a segment of the input instead of the whole input. Use `NewDecoder()`
345 * when decoding segmented input.
347 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
348 Span<const uint8_t> aBytes, nsAString& aOut) const {
// `encoding` starts out as `this` and is passed by address, so the callee can
// replace it; its final value is returned as the encoding actually used.
349 const Encoding* encoding = this;
350 nsresult rv = mozilla_encoding_decode_to_nsstring(
351 &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
352 return MakeTuple(rv, WrapNotNull(encoding));
356 * Decode complete input to `nsACString` _with BOM removal_ and with
357 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
358 * entire input is available as a single buffer (i.e. the end of the
359 * buffer marks the end of the stream).
361 * When invoked on `UTF_8`, this method implements the (non-streaming
362 * version of) the _UTF-8 decode_
363 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
365 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
366 * if there were malformed sequences (that were replaced with the
367 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
369 * The backing buffer of the string isn't copied if the input buffer
370 * is heap-allocated and decoding from UTF-8 and the input is valid
371 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
372 * the input is valid ASCII or decoding from ISO-2022-JP and the
373 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
374 * the same string as both arguments.
376 * _Note:_ It is wrong to use this when the input buffer represents only
377 * a segment of the input instead of the whole input. Use
378 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
380 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
381 nsACString& aOut) const {
382 const nsACString* bytes = &aBytes;
383 nsACString* out = &aOut;
384 if (bytes == out) {
// Input and output are the same string object: decode from a temporary copy
// so that the source and destination passed to the callee are distinct.
385 nsAutoCString temp(aBytes);
386 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
387 out);
// Distinct objects: pass the caller's input through directly.
389 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
390 out);
394 * Decode complete input to `nsAString` _with BOM removal_ and with
395 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
396 * entire input is available as a single buffer (i.e. the end of the
397 * buffer marks the end of the stream).
399 * When invoked on `UTF_8`, this method implements the (non-streaming
400 * version of) the _UTF-8 decode_
401 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
403 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
404 * if there were malformed sequences (that were replaced with the
405 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
407 * _Note:_ It is wrong to use this when the input buffer represents only
408 * a segment of the input instead of the whole input. Use
409 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
411 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
412 nsAString& aOut) const {
// Thin wrapper: unpacks the span into pointer + length and forwards the
// callee's `nsresult` unchanged.
413 return mozilla_encoding_decode_to_nsstring_with_bom_removal(
414 this, aBytes.Elements(), aBytes.Length(), &aOut);
418 * Decode complete input to `nsACString` _without BOM handling_ and
419 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
420 * the entire input is available as a single buffer (i.e. the end of the
421 * buffer marks the end of the stream).
423 * When invoked on `UTF_8`, this method implements the (non-streaming
424 * version of) the _UTF-8 decode without BOM_
425 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
427 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
428 * if there were malformed sequences (that were replaced with the
429 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
431 * The backing buffer of the string isn't copied if the input buffer
432 * is heap-allocated and decoding from UTF-8 and the input is valid
433 * UTF-8, decoding from an ASCII-compatible encoding and the input
434 * is valid ASCII or decoding from ISO-2022-JP and the input stays
435 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
436 * as both arguments.
438 * _Note:_ It is wrong to use this when the input buffer represents only
439 * a segment of the input instead of the whole input. Use
440 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
442 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
443 nsACString& aOut) const {
444 const nsACString* bytes = &aBytes;
445 nsACString* out = &aOut;
446 if (bytes == out) {
// Input and output are the same string object: decode from a temporary copy
// so that the source and destination passed to the callee are distinct.
447 nsAutoCString temp(aBytes);
448 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
449 this, &temp, out);
// Distinct objects: pass the caller's input through directly.
451 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
452 this, bytes, out);
456 * Decode complete input to `nsAString` _without BOM handling_ and
457 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
458 * the entire input is available as a single buffer (i.e. the end of the
459 * buffer marks the end of the stream).
461 * When invoked on `UTF_8`, this method implements the (non-streaming
462 * version of) the _UTF-8 decode without BOM_
463 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
465 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
466 * if there were malformed sequences (that were replaced with the
467 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
469 * _Note:_ It is wrong to use this when the input buffer represents only
470 * a segment of the input instead of the whole input. Use
471 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
473 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
474 nsAString& aOut) const {
// Thin wrapper: unpacks the span into pointer + length and forwards the
// callee's `nsresult` unchanged.
475 return mozilla_encoding_decode_to_nsstring_without_bom_handling(
476 this, aBytes.Elements(), aBytes.Length(), &aOut);
480 * Decode complete input to `nsACString` _without BOM handling_ and
481 * _with malformed sequences treated as fatal_ when the entire input is
482 * available as a single buffer (i.e. the end of the buffer marks the end
483 * of the stream).
485 * When invoked on `UTF_8`, this method implements the (non-streaming
486 * version of) the _UTF-8 decode without BOM or fail_
487 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
488 * spec concept.
490 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
491 * if a malformed sequence was encountered and `NS_OK` otherwise.
493 * The backing buffer of the string isn't copied if the input buffer
494 * is heap-allocated and decoding from UTF-8 and the input is valid
495 * UTF-8, decoding from an ASCII-compatible encoding and the input
496 * is valid ASCII or decoding from ISO-2022-JP and the input stays
497 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
498 * as both arguments.
500 * _Note:_ It is wrong to use this when the input buffer represents only
501 * a segment of the input instead of the whole input. Use
502 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
504 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
505 const nsACString& aBytes, nsACString& aOut) const {
506 const nsACString* bytes = &aBytes;
507 nsACString* out = &aOut;
508 if (bytes == out) {
// Input and output are the same string object: decode from a temporary copy
// so that the source and destination passed to the callee are distinct.
509 nsAutoCString temp(aBytes);
510 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
511 this, &temp, out);
// Distinct objects: pass the caller's input through directly.
513 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
514 this, bytes, out);
518 * Decode complete input to `nsACString` _without BOM handling_ and
519 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
520 * the entire input is available as a single buffer (i.e. the end of the
521 * buffer marks the end of the stream) _asserting that a number of bytes
522 * from the start are already known to be valid UTF-8_.
524 * The use case for this method is avoiding copying when dealing with
525 * input that has a UTF-8 BOM. _When in doubt, do not use this method._
527 * When invoked on `UTF_8`, this method implements the (non-streaming
528 * version of) the _UTF-8 decode without BOM_
529 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
531 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
532 * if there were malformed sequences (that were replaced with the
533 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
535 * _Note:_ It is wrong to use this when the input buffer represents only
536 * a segment of the input instead of the whole input. Use
537 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
539 * # Safety
541 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
542 * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
544 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
545 nsACString& aOut,
546 size_t aAlreadyValidated) const {
// Unlike the overloads that take an `nsACString` input, nothing here checks
// whether `aBytes` points into `aOut`, and `aAlreadyValidated` is forwarded
// without being checked; both are the caller's responsibility.
547 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
548 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
552 * Decode complete input to `nsAString` _without BOM handling_ and
553 * _with malformed sequences treated as fatal_ when the entire input is
554 * available as a single buffer (i.e. the end of the buffer marks the end
555 * of the stream).
557 * When invoked on `UTF_8`, this method implements the (non-streaming
558 * version of) the _UTF-8 decode without BOM or fail_
559 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
560 * spec concept.
562 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
563 * if a malformed sequence was encountered and `NS_OK` otherwise.
565 * _Note:_ It is wrong to use this when the input buffer represents only
566 * a segment of the input instead of the whole input. Use
567 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
569 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
570 Span<const uint8_t> aBytes, nsAString& aOut) const {
// Thin wrapper: unpacks the span into pointer + length and forwards the
// callee's `nsresult` unchanged.
571 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
572 this, aBytes.Elements(), aBytes.Length(), &aOut);
576 * Encode complete input to `nsACString` with unmappable characters
577 * replaced with decimal numeric character references when the entire input
578 * is available as a single buffer (i.e. the end of the buffer marks the
579 * end of the stream).
581 * This method implements the (non-streaming version of) the
582 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
584 * The second item in the returned tuple is the encoding that was actually
585 * used (which may differ from this encoding thanks to some encodings
586 * having UTF-8 as their output encoding).
588 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
589 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
590 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
591 * replaced with numeric character references) and `NS_OK` otherwise.
593 * The backing buffer of the string isn't copied if the input buffer
594 * is heap-allocated and encoding to UTF-8 and the input is valid
595 * UTF-8, encoding to an ASCII-compatible encoding and the input
596 * is valid ASCII or encoding to ISO-2022-JP and the input stays
597 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
598 * as both arguments.
600 * _Note:_ It is wrong to use this when the input buffer represents only
601 * a segment of the input instead of the whole input. Use `NewEncoder()`
602 * when encoding segmented output.
604 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
605 const nsACString& aString, nsACString& aOut) const {
// `encoding` starts out as `this` and is passed by address, so the callee can
// replace it; its final value is returned as the encoding actually used.
606 const Encoding* encoding = this;
607 const nsACString* string = &aString;
608 nsACString* out = &aOut;
609 nsresult rv;
610 if (string == out) {
// Input and output are the same string object: encode from a temporary copy
// so that the source and destination passed to the callee are distinct.
611 nsAutoCString temp(aString);
612 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
613 } else {
614 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
616 return MakeTuple(rv, WrapNotNull(encoding));
620 * Encode complete input to `nsACString` with unmappable characters
621 * replaced with decimal numeric character references when the entire input
622 * is available as a single buffer (i.e. the end of the buffer marks the
623 * end of the stream).
625 * This method implements the (non-streaming version of) the
626 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
628 * The second item in the returned tuple is the encoding that was actually
629 * used (which may differ from this encoding thanks to some encodings
630 * having UTF-8 as their output encoding).
632 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
633 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
634 * were replaced with numeric character references) and `NS_OK` otherwise.
636 * _Note:_ It is wrong to use this when the input buffer represents only
637 * a segment of the input instead of the whole input. Use `NewEncoder()`
638 * when encoding segmented output.
640 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
641 Span<const char16_t> aString, nsACString& aOut) const {
// `encoding` starts out as `this` and is passed by address, so the callee can
// replace it; its final value is returned as the encoding actually used.
642 const Encoding* encoding = this;
643 nsresult rv = mozilla_encoding_encode_from_utf16(
644 &encoding, aString.Elements(), aString.Length(), &aOut);
645 return MakeTuple(rv, WrapNotNull(encoding));
649 * Instantiates a new decoder for this encoding with BOM sniffing enabled.
651 * BOM sniffing may cause the returned decoder to morph into a decoder
652 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
654 inline UniquePtr<Decoder> NewDecoder() const {
// The raw pointer returned by `encoding_new_decoder` is placed in a
// `UniquePtr`, which is returned to the caller.
655 UniquePtr<Decoder> decoder(encoding_new_decoder(this));
656 return decoder;
660 * Instantiates a new decoder for this encoding with BOM sniffing enabled
661 * into memory occupied by a previously-instantiated decoder.
663 * BOM sniffing may cause the returned decoder to morph into a decoder
664 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
666 inline void NewDecoderInto(Decoder& aDecoder) const {
// Passes the address of the caller-provided decoder to
// `encoding_new_decoder_into`; nothing is returned.
667 encoding_new_decoder_into(this, &aDecoder);
671 * Instantiates a new decoder for this encoding with BOM removal.
673 * If the input starts with bytes that are the BOM for this encoding,
674 * those bytes are removed. However, the decoder never morphs into a
675 * decoder for another encoding: A BOM for another encoding is treated as
676 * (potentially malformed) input to the decoding algorithm for this
677 * encoding.
679 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
// The raw pointer returned by `encoding_new_decoder_with_bom_removal` is
// placed in a `UniquePtr`, which is returned to the caller.
680 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
681 return decoder;
685 * Instantiates a new decoder for this encoding with BOM removal
686 * into memory occupied by a previously-instantiated decoder.
688 * If the input starts with bytes that are the BOM for this encoding,
689 * those bytes are removed. However, the decoder never morphs into a
690 * decoder for another encoding: A BOM for another encoding is treated as
691 * (potentially malformed) input to the decoding algorithm for this
692 * encoding.
694 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
// Passes the address of the caller-provided decoder to
// `encoding_new_decoder_with_bom_removal_into`; nothing is returned.
695 encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
699 * Instantiates a new decoder for this encoding with BOM handling disabled.
701 * If the input starts with bytes that look like a BOM, those bytes are
702 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
703 * for another encoding.)
705 * _Note:_ If the caller has performed BOM sniffing on its own but has not
706 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
707 * instead of this method to cause the BOM to be removed.
709 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
// The raw pointer returned by `encoding_new_decoder_without_bom_handling` is
// placed in a `UniquePtr`, which is returned to the caller.
710 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
711 return decoder;
715 * Instantiates a new decoder for this encoding with BOM handling disabled
716 * into memory occupied by a previously-instantiated decoder.
718 * If the input starts with bytes that look like a BOM, those bytes are
719 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
720 * for another encoding.)
722 * _Note:_ If the caller has performed BOM sniffing on its own but has not
723 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
724 * instead of this method to cause the BOM to be removed.
726 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
// Passes the address of the caller-provided decoder to
// `encoding_new_decoder_without_bom_handling_into`; nothing is returned.
727 encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
731 * Instantiates a new encoder for the output encoding of this encoding.
733 inline UniquePtr<Encoder> NewEncoder() const {
// The raw pointer returned by `encoding_new_encoder` is placed in a
// `UniquePtr`, which is returned to the caller.
734 UniquePtr<Encoder> encoder(encoding_new_encoder(this));
735 return encoder;
739 * Instantiates a new encoder for the output encoding of this encoding
740 * into memory occupied by a previously-instantiated encoder.
742 inline void NewEncoderInto(Encoder& aEncoder) const {
// Passes the address of the caller-provided encoder to
// `encoding_new_encoder_into`; nothing is returned.
743 encoding_new_encoder_into(this, &aEncoder);
747 * Validates UTF-8.
749 * Returns the index of the first byte that makes the input malformed as
750 * UTF-8 or the length of the input if the input is entirely valid.
752 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
// Thin wrapper: unpacks the span into pointer + length for
// `encoding_utf8_valid_up_to` and returns its result unchanged.
753 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
757 * Validates ASCII.
759 * Returns the index of the first byte that makes the input malformed as
760 * ASCII or the length of the input if the input is entirely valid.
762 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
// Thin wrapper: unpacks the span into pointer + length for
// `encoding_ascii_valid_up_to` and returns its result unchanged.
763 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
767 * Validates ISO-2022-JP ASCII-state data.
769 * Returns the index of the first byte that makes the input not
770 * representable in the ASCII state of ISO-2022-JP or the length of the
771 * input if the input is entirely representable in the ASCII state of
772 * ISO-2022-JP.
774 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
// Thin wrapper: unpacks the span into pointer + length for
// `encoding_iso_2022_jp_ascii_valid_up_to` and returns its result unchanged.
775 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
776 aBuffer.Length());
779 private:
// Default construction, copy construction, copy assignment and destruction
// are all deleted, so C++ code cannot create, copy or destroy an `Encoding`;
// it can only hold pointers or references to existing instances.
780 Encoding() = delete;
781 Encoding(const Encoding&) = delete;
782 Encoding& operator=(const Encoding&) = delete;
783 ~Encoding() = delete;
787 * A converter that decodes a byte stream into Unicode according to a
788 * character encoding in a streaming (incremental) manner.
790 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
791 * buffer `aDst` both of which are caller-allocated. There are variants for
792 * both UTF-8 and UTF-16 output buffers.
794 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
795 * into `aDst` until one of the following three things happens:
797 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
798 * variants only).
800 * 2. The output buffer has been filled so near capacity that the decoder
801 * cannot be sure that processing an additional byte of input wouldn't
802 * cause so much output that the output buffer would overflow.
804 * 3. All the input bytes have been processed.
806 * The `Decode*` method then returns a tuple of a status indicating which one
807 * of the three reasons to return happened, how many input bytes were read,
808 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
809 * when decoding to UTF-16) were written, and in the case of the
810 * variants performing replacement, a boolean indicating whether an error was
811 * replaced with the REPLACEMENT CHARACTER during the call.
813 * The number of bytes "written" is what's logically written. Garbage may be
814 * written in the output buffer beyond the point logically written to.
816 * In the case of the `*WithoutReplacement` variants, the status is a
817 * `uint32_t` whose possible values are packed info about a malformed byte
818 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
819 * listed above).
821 * Packed info about malformed sequences has the following format:
822 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
823 * indicate the number of bytes that were consumed after the malformed
824 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
825 * the length of the malformed byte sequence (possible decimal values 1, 2,
826 * 3 or 4). The maximum possible sum of the two is 6.
828 * In the case of methods whose name does not end with
829 * `*WithoutReplacement`, malformed sequences are automatically replaced
830 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
831 * return early.
833 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
834 * space. When decoding to UTF-16, the output buffer must have at least two
835 * UTF-16 code units (`char16_t`) of space.
837 * When decoding to UTF-8 without replacement, the methods are guaranteed
838 * not to return indicating that more output space is needed if the length
839 * of the output buffer is at least the length returned by
840 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
841 * with replacement, the length of the output buffer that guarantees the
842 * methods not to return indicating that more output space is needed is given
843 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
844 * or without replacement, the length of the output buffer that guarantees
845 * the methods not to return indicating that more output space is needed is
846 * given by `MaxUTF16BufferLength()`.
848 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
849 * and the output after each `Decode*` call is guaranteed to consist of
850 * complete characters. (I.e. the code unit sequence for the last character is
851 * guaranteed not to be split across output buffers.)
853 * The boolean argument `aLast` indicates that the end of the stream is reached
854 * when all the bytes in `aSrc` have been consumed.
856 * A `Decoder` object can be used to incrementally decode a byte stream.
858 * During the processing of a single stream, the caller must call `Decode*`
859 * zero or more times with `aLast` set to `false` and then call `Decode*` at
860 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
861 * the processing of the stream has ended. Otherwise, the caller must call
862 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
863 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
865 * Once the stream has ended, the `Decoder` object must not be used anymore.
866 * That is, you need to create another one to process another stream.
868 * When the decoder returns `kOutputFull` or the decoder returns a malformed
869 * result and the caller does not wish to treat it as a fatal error, the input
870 * buffer `aSrc` may not have been completely consumed. In that case, the caller
871 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
872 * call.
874 * # Infinite loops
876 * When converting with a fixed-size output buffer whose size is too small to
877 * accommodate one character of output, an infinite loop ensues. When
878 * converting with a fixed-size output buffer, it generally makes sense to
879 * make the buffer fairly large (e.g. couple of kilobytes).
881 class Decoder final {
882 public:
883 ~Decoder() = default;
884 static void operator delete(void* aDecoder) {
885 decoder_free(reinterpret_cast<Decoder*>(aDecoder));
889 * The `Encoding` this `Decoder` is for.
891 * BOM sniffing can change the return value of this method during the life
892 * of the decoder.
894 inline NotNull<const mozilla::Encoding*> Encoding() const {
895 return WrapNotNull(decoder_encoding(this));
899 * Query the worst-case UTF-8 output size _with replacement_.
901 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
902 * that will not overflow given the current state of the decoder and
903 * `aByteLength` number of additional input bytes when decoding with
904 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
905 * sequence.
907 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
908 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
909 if (max.value() == std::numeric_limits<size_t>::max()) {
910 // Mark invalid by overflowing
911 max++;
912 MOZ_ASSERT(!max.isValid());
914 return max;
918 * Query the worst-case UTF-8 output size _without replacement_.
920 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
921 * that will not overflow given the current state of the decoder and
922 * `aByteLength` number of additional input bytes when decoding without
923 * replacement error handling.
925 * Note that this value may be too small for the `WithReplacement` case.
926 * Use `MaxUTF8BufferLength()` for that case.
928 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
929 size_t aByteLength) const {
930 CheckedInt<size_t> max(
931 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
932 if (max.value() == std::numeric_limits<size_t>::max()) {
933 // Mark invalid by overflowing
934 max++;
935 MOZ_ASSERT(!max.isValid());
937 return max;
941 * Incrementally decode a byte stream into UTF-8 with malformed sequences
942 * replaced with the REPLACEMENT CHARACTER.
944 * See the documentation of the class for documentation for `Decode*`
945 * methods collectively.
947 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
948 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
949 size_t srcRead = aSrc.Length();
950 size_t dstWritten = aDst.Length();
951 bool hadReplacements;
952 uint32_t result =
953 decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
954 &dstWritten, aLast, &hadReplacements);
955 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
959 * Incrementally decode a byte stream into UTF-8 _without replacement_.
961 * See the documentation of the class for documentation for `Decode*`
962 * methods collectively.
964 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
965 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
966 size_t srcRead = aSrc.Length();
967 size_t dstWritten = aDst.Length();
968 uint32_t result = decoder_decode_to_utf8_without_replacement(
969 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
970 return MakeTuple(result, srcRead, dstWritten);
974 * Query the worst-case UTF-16 output size (with or without replacement).
976 * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
977 * that will not overflow given the current state of the decoder and
978 * `aByteLength` number of additional input bytes.
980 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
981 * return value of this method applies also in the
982 * `_without_replacement` case.
984 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
985 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
986 if (max.value() == std::numeric_limits<size_t>::max()) {
987 // Mark invalid by overflowing
988 max++;
989 MOZ_ASSERT(!max.isValid());
991 return max;
995 * Incrementally decode a byte stream into UTF-16 with malformed sequences
996 * replaced with the REPLACEMENT CHARACTER.
998 * See the documentation of the class for documentation for `Decode*`
999 * methods collectively.
1001 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1002 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1003 size_t srcRead = aSrc.Length();
1004 size_t dstWritten = aDst.Length();
1005 bool hadReplacements;
1006 uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1007 aDst.Elements(), &dstWritten,
1008 aLast, &hadReplacements);
1009 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1013 * Incrementally decode a byte stream into UTF-16 _without replacement_.
1015 * See the documentation of the class for documentation for `Decode*`
1016 * methods collectively.
1018 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1019 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1020 size_t srcRead = aSrc.Length();
1021 size_t dstWritten = aDst.Length();
1022 uint32_t result = decoder_decode_to_utf16_without_replacement(
1023 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1024 return MakeTuple(result, srcRead, dstWritten);
1028 * Checks for compatibility with storing Unicode scalar values as unsigned
1029 * bytes taking into account the state of the decoder.
1031 * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1032 * including waiting for the BOM, or if the encoding is never
1033 * Latin1-byte-compatible.
1035 * Otherwise returns the index of the first byte whose unsigned value doesn't
1036 * directly correspond to the decoded Unicode scalar value, or the length
1037 * of the input if all bytes in the input decode directly to scalar values
1038 * corresponding to the unsigned byte values.
1040 * Does not change the state of the decoder.
1042 * Do not use this unless you are supporting SpiderMonkey-style string
1043 * storage optimizations.
1045 inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1046 Span<const uint8_t> aBuffer) const {
1047 size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1048 aBuffer.Length());
1049 if (upTo == std::numeric_limits<size_t>::max()) {
1050 return mozilla::Nothing();
1052 return mozilla::Some(upTo);
1055 private:
1056 Decoder() = delete;
1057 Decoder(const Decoder&) = delete;
1058 Decoder& operator=(const Decoder&) = delete;
1062 * A converter that encodes a Unicode stream into bytes according to a
1063 * character encoding in a streaming (incremental) manner.
1065 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1066 * buffer `aDst` both of which are caller-allocated. There are variants for
1067 * both UTF-8 and UTF-16 input buffers.
1069 * An `Encode*` method encodes characters from `aSrc` into bytes
1070 * stored into `aDst` until one of the following three things happens:
1072 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1073 * only).
1075 * 2. The output buffer has been filled so near capacity that the encoder
1076 * cannot be sure that processing an additional character of input wouldn't
1077 * cause so much output that the output buffer would overflow.
1079 * 3. All the input characters have been processed.
1081 * The `Encode*` method then returns a tuple of a status indicating which one
1082 * of the three reasons to return happened, how many input code units (`uint8_t`
1083 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1084 * how many output bytes were written, and in the case of the variants that
1085 * perform replacement, a boolean indicating whether an unmappable
1086 * character was replaced with a numeric character reference during the call.
1088 * The number of bytes "written" is what's logically written. Garbage may be
1089 * written in the output buffer beyond the point logically written to.
1091 * In the case of the methods whose name ends with
1092 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1093 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
1094 * to the three cases listed above).
1096 * In the case of methods whose name does not end with
1097 * `*WithoutReplacement`, unmappable characters are automatically replaced
1098 * with the corresponding numeric character references and unmappable
1099 * characters do not cause the methods to return early.
1101 * When encoding from UTF-8 without replacement, the methods are guaranteed
1102 * not to return indicating that more output space is needed if the length
1103 * of the output buffer is at least the length returned by
1104 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1105 * UTF-8 with replacement, the length of the output buffer that guarantees the
1106 * methods not to return indicating that more output space is needed in the
1107 * absence of unmappable characters is given by
1108 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1109 * UTF-16 without replacement, the methods are guaranteed not to return
1110 * indicating that more output space is needed if the length of the output
1111 * buffer is at least the length returned by
1112 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1113 * from UTF-16 with replacement, the length of the output buffer that
1114 * guarantees the methods not to return indicating that more output space is
1115 * needed in the absence of unmappable characters is given by
1116 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1117 * When encoding with replacement, applications are not expected to size the
1118 * buffer for the worst case ahead of time but to resize the buffer if there
1119 * are unmappable characters. This is why max length queries are only available
1120 * for the case where there are no unmappable characters.
1122 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1123 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1124 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1125 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1126 * surrogate pairs are not split across input buffer boundaries.
1128 * After an `Encode*` call returns, the output produced so far, taken as a
1129 * whole from the start of the stream, is guaranteed to consist of a valid
1130 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1131 * character is guaranteed not to be split across output buffers. However, due
1132 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1133 * from the start for it to be valid. For other encodings, the validity holds
1134 * on a per-output buffer basis.)
1136 * The boolean argument `aLast` indicates that the end of the stream is reached
1137 * when all the characters in `aSrc` have been consumed. This argument is needed
1138 * for ISO-2022-JP and is ignored for other encodings.
1140 * An `Encoder` object can be used to incrementally encode into a byte stream.
1142 * During the processing of a single stream, the caller must call `Encode*`
1143 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1144 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1145 * the processing of the stream has ended. Otherwise, the caller must call
1146 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1147 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1149 * Once the stream has ended, the `Encoder` object must not be used anymore.
1150 * That is, you need to create another one to process another stream.
1152 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1153 * result and the caller does not wish to treat it as a fatal error, the input
1154 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1155 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1156 * call.
1158 * # Infinite loops
1160 * When converting with a fixed-size output buffer whose size is too small to
1161 * accommodate one character of output, an infinite loop ensues. When
1162 * converting with a fixed-size output buffer, it generally makes sense to
1163 * make the buffer fairly large (e.g. couple of kilobytes).
1165 class Encoder final {
1166 public:
1167 ~Encoder() = default;
1169 static void operator delete(void* aEncoder) {
1170 encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1174 * The `Encoding` this `Encoder` is for.
1176 inline NotNull<const mozilla::Encoding*> Encoding() const {
1177 return WrapNotNull(encoder_encoding(this));
1181 * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1182 * ASCII state and `false` otherwise.
1184 inline bool HasPendingState() const {
1185 return encoder_has_pending_state(this);
1189 * Query the worst-case output size when encoding from UTF-8 with
1190 * replacement.
1192 * Returns the size of the output buffer in bytes that will not overflow
1193 * given the current state of the encoder and `aByteLength` number of
1194 * additional input code units if there are no unmappable characters in
1195 * the input.
1197 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1198 size_t aByteLength) const {
1199 CheckedInt<size_t> max(
1200 encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1201 aByteLength));
1202 if (max.value() == std::numeric_limits<size_t>::max()) {
1203 // Mark invalid by overflowing
1204 max++;
1205 MOZ_ASSERT(!max.isValid());
1207 return max;
1211 * Query the worst-case output size when encoding from UTF-8 without
1212 * replacement.
1214 * Returns the size of the output buffer in bytes that will not overflow
1215 * given the current state of the encoder and `aByteLength` number of
1216 * additional input code units.
1218 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1219 size_t aByteLength) const {
1220 CheckedInt<size_t> max(
1221 encoder_max_buffer_length_from_utf8_without_replacement(this,
1222 aByteLength));
1223 if (max.value() == std::numeric_limits<size_t>::max()) {
1224 // Mark invalid by overflowing
1225 max++;
1226 MOZ_ASSERT(!max.isValid());
1228 return max;
1232 * Incrementally encode into byte stream from UTF-8 with unmappable
1233 * characters replaced with HTML (decimal) numeric character references.
1235 * See the documentation of the class for documentation for `Encode*`
1236 * methods collectively.
1238 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1239 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1240 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1242 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1243 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1244 size_t srcRead = aSrc.Length();
1245 size_t dstWritten = aDst.Length();
1246 bool hadReplacements;
1247 uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1248 aDst.Elements(), &dstWritten,
1249 aLast, &hadReplacements);
1250 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1254 * Incrementally encode into byte stream from UTF-8 _without replacement_.
1256 * See the documentation of the class for documentation for `Encode*`
1257 * methods collectively.
1259 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1260 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1261 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1263 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1264 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1265 size_t srcRead = aSrc.Length();
1266 size_t dstWritten = aDst.Length();
1267 uint32_t result = encoder_encode_from_utf8_without_replacement(
1268 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1269 return MakeTuple(result, srcRead, dstWritten);
1273 * Query the worst-case output size when encoding from UTF-16 with
1274 * replacement.
1276 * Returns the size of the output buffer in bytes that will not overflow
1277 * given the current state of the encoder and `aU16Length` number of
1278 * additional input code units if there are no unmappable characters in
1279 * the input.
1281 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1282 size_t aU16Length) const {
1283 CheckedInt<size_t> max(
1284 encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1285 aU16Length));
1286 if (max.value() == std::numeric_limits<size_t>::max()) {
1287 // Mark invalid by overflowing
1288 max++;
1289 MOZ_ASSERT(!max.isValid());
1291 return max;
1295 * Query the worst-case output size when encoding from UTF-16 without
1296 * replacement.
1298 * Returns the size of the output buffer in bytes that will not overflow
1299 * given the current state of the encoder and `aU16Length` number of
1300 * additional input code units.
1302 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1303 size_t aU16Length) const {
1304 CheckedInt<size_t> max(
1305 encoder_max_buffer_length_from_utf16_without_replacement(this,
1306 aU16Length));
1307 if (max.value() == std::numeric_limits<size_t>::max()) {
1308 // Mark invalid by overflowing
1309 max++;
1310 MOZ_ASSERT(!max.isValid());
1312 return max;
1316 * Incrementally encode into byte stream from UTF-16 with unmappable
1317 * characters replaced with HTML (decimal) numeric character references.
1319 * See the documentation of the class for documentation for `Encode*`
1320 * methods collectively.
1322 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1323 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1324 size_t srcRead = aSrc.Length();
1325 size_t dstWritten = aDst.Length();
1326 bool hadReplacements;
1327 uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1328 aDst.Elements(), &dstWritten,
1329 aLast, &hadReplacements);
1330 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1334 * Incrementally encode into byte stream from UTF-16 _without replacement_.
1336 * See the documentation of the class for documentation for `Encode*`
1337 * methods collectively.
1339 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1340 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1341 size_t srcRead = aSrc.Length();
1342 size_t dstWritten = aDst.Length();
1343 uint32_t result = encoder_encode_from_utf16_without_replacement(
1344 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1345 return MakeTuple(result, srcRead, dstWritten);
1348 private:
1349 Encoder() = delete;
1350 Encoder(const Encoder&) = delete;
1351 Encoder& operator=(const Encoder&) = delete;
1354 }; // namespace mozilla
1356 #endif // mozilla_Encoding_h