Bug 1486350 [wpt PR 12684] - Remove languageCode tests, a=testonly
[gecko.git] / intl / Encoding.h
blob1e5d5006dae26bba53414d81e1d0e34717cf38f3
1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/NotNull.h"
19 #include "mozilla/Span.h"
20 #include "mozilla/Tuple.h"
21 #include "nsString.h"
namespace mozilla {
// Forward declarations only: the ENCODING_RS_* macros that follow name these
// types before "encoding_rs.h" is included.
class Encoding;
class Decoder;
class Encoder;
} // namespace mozilla
29 #define ENCODING_RS_ENCODING mozilla::Encoding
30 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*>
31 #define ENCODING_RS_ENCODER mozilla::Encoder
32 #define ENCODING_RS_DECODER mozilla::Decoder
34 #include "encoding_rs.h"
36 extern "C" {
38 nsresult
39 mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
40 uint8_t const* src,
41 size_t src_len,
42 nsAString* dst);
44 nsresult
45 mozilla_encoding_decode_to_nsstring_with_bom_removal(
46 mozilla::Encoding const* encoding,
47 uint8_t const* src,
48 size_t src_len,
49 nsAString* dst);
51 nsresult
52 mozilla_encoding_decode_to_nsstring_without_bom_handling(
53 mozilla::Encoding const* encoding,
54 uint8_t const* src,
55 size_t src_len,
56 nsAString* dst);
58 nsresult
59 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
60 mozilla::Encoding const* encoding,
61 uint8_t const* src,
62 size_t src_len,
63 nsAString* dst);
65 nsresult
66 mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
67 char16_t const* src,
68 size_t src_len,
69 nsACString* dst);
71 nsresult
72 mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding,
73 nsACString const* src,
74 nsACString* dst);
76 nsresult
77 mozilla_encoding_decode_to_nscstring_with_bom_removal(
78 mozilla::Encoding const* encoding,
79 nsACString const* src,
80 nsACString* dst);
82 nsresult
83 mozilla_encoding_decode_to_nscstring_without_bom_handling(
84 mozilla::Encoding const* encoding,
85 nsACString const* src,
86 nsACString* dst);
88 nsresult
89 mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
90 mozilla::Encoding const* encoding,
91 uint8_t const* src,
92 size_t src_len,
93 nsACString* dst,
94 size_t already_validated);
96 nsresult
97 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
98 mozilla::Encoding const* encoding,
99 nsACString const* src,
100 nsACString* dst);
102 nsresult
103 mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding,
104 nsACString const* src,
105 nsACString* dst);
107 } // extern "C"
109 namespace mozilla {
112 * Return value from `Decoder`/`Encoder` to indicate that input
113 * was exhausted.
115 const uint32_t kInputEmpty = INPUT_EMPTY;
118 * Return value from `Decoder`/`Encoder` to indicate that output
119 * space was insufficient.
121 const uint32_t kOutputFull = OUTPUT_FULL;
124 * An encoding as defined in the Encoding Standard
125 * (https://encoding.spec.whatwg.org/).
127 * See https://docs.rs/encoding_rs/ for the Rust API docs.
129 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
130 * sequence and, in most cases, vice versa. Each encoding has a name, an output
131 * encoding, and one or more labels.
133 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
134 * encoding in formats and protocols. The _name_ of the encoding is the
135 * preferred label in the case appropriate for returning from the
136 * `characterSet` property of the `Document` DOM interface, except for
137 * the replacement encoding whose name is not one of its labels.
139 * The _output encoding_ is the encoding used for form submission and URL
140 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
141 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
142 * encodings.
144 * # Streaming vs. Non-Streaming
146 * When you have the entire input in a single buffer, you can use the
147 * methods `Decode()`, `DecodeWithBOMRemoval()`,
148 * `DecodeWithoutBOMHandling()`,
149 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
150 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
151 * `NewEncoder()` methods), these methods perform heap allocations. You should
152 * use the `Decoder` and `Encoder` objects when your input is split into multiple
153 * buffers or when you want to control the allocation of the output buffers.
155 * # Instances
157 * All instances of `Encoding` are statically allocated and have the process's
158 * lifetime. There is precisely one unique `Encoding` instance for each
159 * encoding defined in the Encoding Standard.
161 * To obtain a reference to a particular encoding whose identity you know at
162 * compile time, use a `static` that refers to encoding. There is a `static`
163 * for each encoding. The `static`s are named in all caps with hyphens
164 * replaced with underscores and with `_ENCODING` appended to the
165 * name. For example, if you know at compile time that you will want to
166 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
168 * If you don't know what encoding you need at compile time and need to
169 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
171 * Pointers to `Encoding` can be compared with `==` to check for the sameness
172 * of two encodings.
174 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
175 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
176 * `const mozilla::Encoding*` in the C signature and
177 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
179 class Encoding final
181 public:
183 * Implements the _get an encoding_ algorithm
184 * (https://encoding.spec.whatwg.org/#concept-encoding-get).
186 * If, after ASCII-lowercasing and removing leading and trailing
187 * whitespace, the argument matches a label defined in the Encoding
188 * Standard, `const Encoding*` representing the corresponding
189 * encoding is returned. If there is no match, `nullptr` is returned.
191 * This is the right method to use if the action upon the method returning
192 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
193 * instead. When the action upon the method returning `nullptr` is not to
194 * proceed with a fallback but to refuse processing,
195 * `ForLabelNoReplacement()` is more appropriate.
197 static inline const Encoding* ForLabel(Span<const char> aLabel)
199 return encoding_for_label(
200 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
204 * `nsAString` argument version. See above for docs.
206 static inline const Encoding* ForLabel(const nsAString& aLabel)
208 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
212 * This method behaves the same as `ForLabel()`, except when `ForLabel()`
213 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
215 * This method is useful in scenarios where a fatal error is required
216 * upon invalid label, because in those cases the caller typically wishes
217 * to treat the labels that map to the replacement encoding as fatal
218 * errors, too.
220 * It is not OK to use this method when the action upon the method returning
221 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
222 * such a case, the `ForLabel()` method should be used instead in order to avoid
223 * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`.
225 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel)
227 return encoding_for_label_no_replacement(
228 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
232 * `nsAString` argument version. See above for docs.
234 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel)
236 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
240 * Performs non-incremental BOM sniffing.
242 * The argument must either be a buffer representing the entire input
243 * stream (non-streaming case) or a buffer representing at least the first
244 * three bytes of the input stream (streaming case).
246 * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
247 * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
248 * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
250 static inline Tuple<const Encoding*, size_t> ForBOM(
251 Span<const uint8_t> aBuffer)
253 size_t len = aBuffer.Length();
254 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
255 return MakeTuple(encoding, len);
259 * Writes the name of this encoding into `aName`.
261 * This name is appropriate to return as-is from the DOM
262 * `document.characterSet` property.
264 inline void Name(nsACString& aName) const
266 aName.SetLength(ENCODING_NAME_MAX_LENGTH);
267 size_t length =
268 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
269 aName.SetLength(length); // truncation is the 64-bit case is OK
273 * Checks whether the _output encoding_ of this encoding can encode every
274 * Unicode code point. (Only true if the output encoding is UTF-8.)
276 inline bool CanEncodeEverything() const
278 return encoding_can_encode_everything(this);
282 * Checks whether the bytes 0x00...0x7F map exclusively to the characters
283 * U+0000...U+007F and vice versa.
285 inline bool IsAsciiCompatible() const
287 return encoding_is_ascii_compatible(this);
291 * Returns the _output encoding_ of this encoding. This is UTF-8 for
292 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
294 inline NotNull<const mozilla::Encoding*> OutputEncoding() const
296 return WrapNotNull(encoding_output_encoding(this));
300 * Decode complete input to `nsACString` _with BOM sniffing_ and with
301 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
302 * entire input is available as a single buffer (i.e. the end of the
303 * buffer marks the end of the stream).
305 * This method implements the (non-streaming version of) the
306 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
308 * The second item in the returned tuple is the encoding that was actually
309 * used (which may differ from this encoding thanks to BOM sniffing).
311 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
312 * if there were malformed sequences (that were replaced with the
313 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
314 * tuple.
316 * The backing buffer of the string isn't copied if the input buffer
317 * is heap-allocated and decoding from UTF-8 and the input is valid
318 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
319 * the input is valid ASCII or decoding from ISO-2022-JP and the
320 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
321 * the same string as both arguments.
323 * _Note:_ It is wrong to use this when the input buffer represents only
324 * a segment of the input instead of the whole input. Use `NewDecoder()`
325 * when decoding segmented input.
327 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
328 const nsACString& aBytes,
329 nsACString& aOut) const
331 const Encoding* encoding = this;
332 const nsACString* bytes = &aBytes;
333 nsACString* out = &aOut;
334 nsresult rv;
335 if (bytes == out) {
336 nsAutoCString temp(aBytes);
337 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
338 } else {
339 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
341 return MakeTuple(rv, WrapNotNull(encoding));
345 * Decode complete input to `nsAString` _with BOM sniffing_ and with
346 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
347 * entire input is available as a single buffer (i.e. the end of the
348 * buffer marks the end of the stream).
350 * This method implements the (non-streaming version of) the
351 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
353 * The second item in the returned tuple is the encoding that was actually
354 * used (which may differ from this encoding thanks to BOM sniffing).
356 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
357 * if there were malformed sequences (that were replaced with the
358 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
359 * tuple.
361 * _Note:_ It is wrong to use this when the input buffer represents only
362 * a segment of the input instead of the whole input. Use `NewDecoder()`
363 * when decoding segmented input.
365 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
366 Span<const uint8_t> aBytes,
367 nsAString& aOut) const
369 const Encoding* encoding = this;
370 nsresult rv = mozilla_encoding_decode_to_nsstring(
371 &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
372 return MakeTuple(rv, WrapNotNull(encoding));
376 * Decode complete input to `nsACString` _with BOM removal_ and with
377 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
378 * entire input is available as a single buffer (i.e. the end of the
379 * buffer marks the end of the stream).
381 * When invoked on `UTF_8`, this method implements the (non-streaming
382 * version of) the _UTF-8 decode_
383 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
385 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
386 * if there were malformed sequences (that were replaced with the
387 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
389 * The backing buffer of the string isn't copied if the input buffer
390 * is heap-allocated and decoding from UTF-8 and the input is valid
391 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
392 * the input is valid ASCII or decoding from ISO-2022-JP and the
393 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
394 * the same string as both arguments.
396 * _Note:_ It is wrong to use this when the input buffer represents only
397 * a segment of the input instead of the whole input. Use
398 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
400 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
401 nsACString& aOut) const
403 const nsACString* bytes = &aBytes;
404 nsACString* out = &aOut;
405 if (bytes == out) {
406 nsAutoCString temp(aBytes);
407 return mozilla_encoding_decode_to_nscstring_with_bom_removal(
408 this, &temp, out);
410 return mozilla_encoding_decode_to_nscstring_with_bom_removal(
411 this, bytes, out);
415 * Decode complete input to `nsAString` _with BOM removal_ and with
416 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
417 * entire input is available as a single buffer (i.e. the end of the
418 * buffer marks the end of the stream).
420 * When invoked on `UTF_8`, this method implements the (non-streaming
421 * version of) the _UTF-8 decode_
422 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
424 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
425 * if there were malformed sequences (that were replaced with the
426 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
428 * _Note:_ It is wrong to use this when the input buffer represents only
429 * a segment of the input instead of the whole input. Use
430 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
432 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
433 nsAString& aOut) const
435 return mozilla_encoding_decode_to_nsstring_with_bom_removal(
436 this, aBytes.Elements(), aBytes.Length(), &aOut);
440 * Decode complete input to `nsACString` _without BOM handling_ and
441 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
442 * the entire input is available as a single buffer (i.e. the end of the
443 * buffer marks the end of the stream).
445 * When invoked on `UTF_8`, this method implements the (non-streaming
446 * version of) the _UTF-8 decode without BOM_
447 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
449 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
450 * if there were malformed sequences (that were replaced with the
451 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
453 * The backing buffer of the string isn't copied if the input buffer
454 * is heap-allocated and decoding from UTF-8 and the input is valid
455 * UTF-8, decoding from an ASCII-compatible encoding and the input
456 * is valid ASCII or decoding from ISO-2022-JP and the input stays
457 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
458 * as both arguments.
460 * _Note:_ It is wrong to use this when the input buffer represents only
461 * a segment of the input instead of the whole input. Use
462 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
464 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
465 nsACString& aOut) const
467 const nsACString* bytes = &aBytes;
468 nsACString* out = &aOut;
469 if (bytes == out) {
470 nsAutoCString temp(aBytes);
471 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
472 this, &temp, out);
474 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
475 this, bytes, out);
479 * Decode complete input to `nsAString` _without BOM handling_ and
480 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
481 * the entire input is available as a single buffer (i.e. the end of the
482 * buffer marks the end of the stream).
484 * When invoked on `UTF_8`, this method implements the (non-streaming
485 * version of) the _UTF-8 decode without BOM_
486 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
488 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
489 * if there were malformed sequences (that were replaced with the
490 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
492 * _Note:_ It is wrong to use this when the input buffer represents only
493 * a segment of the input instead of the whole input. Use
494 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
496 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
497 nsAString& aOut) const
499 return mozilla_encoding_decode_to_nsstring_without_bom_handling(
500 this, aBytes.Elements(), aBytes.Length(), &aOut);
504 * Decode complete input to `nsACString` _without BOM handling_ and
505 * _with malformed sequences treated as fatal_ when the entire input is
506 * available as a single buffer (i.e. the end of the buffer marks the end
507 * of the stream).
509 * When invoked on `UTF_8`, this method implements the (non-streaming
510 * version of) the _UTF-8 decode without BOM or fail_
511 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
512 * spec concept.
514 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
515 * if a malformed sequence was encountered and `NS_OK` otherwise.
517 * The backing buffer of the string isn't copied if the input buffer
518 * is heap-allocated and decoding from UTF-8 and the input is valid
519 * UTF-8, decoding from an ASCII-compatible encoding and the input
520 * is valid ASCII or decoding from ISO-2022-JP and the input stays
521 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
522 * as both arguments.
524 * _Note:_ It is wrong to use this when the input buffer represents only
525 * a segment of the input instead of the whole input. Use
526 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
528 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
529 const nsACString& aBytes,
530 nsACString& aOut) const
532 const nsACString* bytes = &aBytes;
533 nsACString* out = &aOut;
534 if (bytes == out) {
535 nsAutoCString temp(aBytes);
536 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
537 this, &temp, out);
539 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
540 this, bytes, out);
544 * Decode complete input to `nsACString` _without BOM handling_ and
545 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
546 * the entire input is available as a single buffer (i.e. the end of the
547 * buffer marks the end of the stream) _asserting that a number of bytes
548 * from the start are already known to be valid UTF-8_.
550 * The use case for this method is avoiding copying when dealing with
551 * input that has a UTF-8 BOM. _When in doubt, do not use this method._
553 * When invoked on `UTF_8`, this method implements the (non-streaming
554 * version of) the _UTF-8 decode without BOM_
555 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
557 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
558 * if there were malformed sequences (that were replaced with the
559 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
561 * _Note:_ It is wrong to use this when the input buffer represents only
562 * a segment of the input instead of the whole input. Use
563 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
565 * # Safety
567 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
568 * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
570 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
571 nsACString& aOut,
572 size_t aAlreadyValidated) const
574 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
575 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
579 * Decode complete input to `nsAString` _without BOM handling_ and
580 * _with malformed sequences treated as fatal_ when the entire input is
581 * available as a single buffer (i.e. the end of the buffer marks the end
582 * of the stream).
584 * When invoked on `UTF_8`, this method implements the (non-streaming
585 * version of) the _UTF-8 decode without BOM or fail_
586 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
587 * spec concept.
589 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
590 * if a malformed sequence was encountered and `NS_OK` otherwise.
592 * _Note:_ It is wrong to use this when the input buffer represents only
593 * a segment of the input instead of the whole input. Use
594 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
596 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
597 Span<const uint8_t> aBytes,
598 nsAString& aOut) const
600 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
601 this, aBytes.Elements(), aBytes.Length(), &aOut);
605 * Encode complete input to `nsACString` with unmappable characters
606 * replaced with decimal numeric character references when the entire input
607 * is available as a single buffer (i.e. the end of the buffer marks the
608 * end of the stream).
610 * This method implements the (non-streaming version of) the
611 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
613 * The second item in the returned tuple is the encoding that was actually
614 * used (which may differ from this encoding thanks to some encodings
615 * having UTF-8 as their output encoding).
617 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
618 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
619 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
620 * replaced with numeric character references) and `NS_OK` otherwise.
622 * The backing buffer of the string isn't copied if the input buffer
623 * is heap-allocated and encoding to UTF-8 and the input is valid
624 * UTF-8, encoding to an ASCII-compatible encoding and the input
625 * is valid ASCII or encoding to ISO-2022-JP and the input stays
626 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
627 * as both arguments.
629 * _Note:_ It is wrong to use this when the input buffer represents only
630 * a segment of the input instead of the whole input. Use `NewEncoder()`
631 * when encoding segmented output.
633 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
634 const nsACString& aString,
635 nsACString& aOut) const
637 const Encoding* encoding = this;
638 const nsACString* string = &aString;
639 nsACString* out = &aOut;
640 nsresult rv;
641 if (string == out) {
642 nsAutoCString temp(aString);
643 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
644 } else {
645 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
647 return MakeTuple(rv, WrapNotNull(encoding));
651 * Encode complete input to `nsACString` with unmappable characters
652 * replaced with decimal numeric character references when the entire input
653 * is available as a single buffer (i.e. the end of the buffer marks the
654 * end of the stream).
656 * This method implements the (non-streaming version of) the
657 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
659 * The second item in the returned tuple is the encoding that was actually
660 * used (which may differ from this encoding thanks to some encodings
661 * having UTF-8 as their output encoding).
663 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
664 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
665 * were replaced with numeric character references) and `NS_OK` otherwise.
667 * _Note:_ It is wrong to use this when the input buffer represents only
668 * a segment of the input instead of the whole input. Use `NewEncoder()`
669 * when encoding segmented output.
671 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
672 Span<const char16_t> aString,
673 nsACString& aOut) const
675 const Encoding* encoding = this;
676 nsresult rv = mozilla_encoding_encode_from_utf16(
677 &encoding, aString.Elements(), aString.Length(), &aOut);
678 return MakeTuple(rv, WrapNotNull(encoding));
682 * Instantiates a new decoder for this encoding with BOM sniffing enabled.
684 * BOM sniffing may cause the returned decoder to morph into a decoder
685 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
687 inline UniquePtr<Decoder> NewDecoder() const
689 UniquePtr<Decoder> decoder(encoding_new_decoder(this));
690 return decoder;
694 * Instantiates a new decoder for this encoding with BOM sniffing enabled
695 * into memory occupied by a previously-instantiated decoder.
697 * BOM sniffing may cause the returned decoder to morph into a decoder
698 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
700 inline void NewDecoderInto(Decoder& aDecoder) const
702 encoding_new_decoder_into(this, &aDecoder);
706 * Instantiates a new decoder for this encoding with BOM removal.
708 * If the input starts with bytes that are the BOM for this encoding,
709 * those bytes are removed. However, the decoder never morphs into a
710 * decoder for another encoding: A BOM for another encoding is treated as
711 * (potentially malformed) input to the decoding algorithm for this
712 * encoding.
714 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const
716 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
717 return decoder;
721 * Instantiates a new decoder for this encoding with BOM removal
722 * into memory occupied by a previously-instantiated decoder.
724 * If the input starts with bytes that are the BOM for this encoding,
725 * those bytes are removed. However, the decoder never morphs into a
726 * decoder for another encoding: A BOM for another encoding is treated as
727 * (potentially malformed) input to the decoding algorithm for this
728 * encoding.
730 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const
732 encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
736 * Instantiates a new decoder for this encoding with BOM handling disabled.
738 * If the input starts with bytes that look like a BOM, those bytes are
739 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
740 * for another encoding.)
742 * _Note:_ If the caller has performed BOM sniffing on its own but has not
743 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
744 * instead of this method to cause the BOM to be removed.
746 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const
748 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
749 return decoder;
753 * Instantiates a new decoder for this encoding with BOM handling disabled
754 * into memory occupied by a previously-instantiated decoder.
756 * If the input starts with bytes that look like a BOM, those bytes are
757 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
758 * for another encoding.)
760 * _Note:_ If the caller has performed BOM sniffing on its own but has not
761 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
762 * instead of this method to cause the BOM to be removed.
764 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const
766 encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
770 * Instantiates a new encoder for the output encoding of this encoding.
772 inline UniquePtr<Encoder> NewEncoder() const
774 UniquePtr<Encoder> encoder(encoding_new_encoder(this));
775 return encoder;
779 * Instantiates a new encoder for the output encoding of this encoding
780 * into memory occupied by a previously-instantiated encoder.
782 inline void NewEncoderInto(Encoder& aEncoder) const
784 encoding_new_encoder_into(this, &aEncoder);
788 * Validates UTF-8.
790 * Returns the index of the first byte that makes the input malformed as
791 * UTF-8 or the length of the input if the input is entirely valid.
793 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer)
795 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
799 * Validates ASCII.
801 * Returns the index of the first byte that makes the input malformed as
802 * ASCII or the length of the input if the input is entirely valid.
804 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer)
806 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
810 * Validates ISO-2022-JP ASCII-state data.
812 * Returns the index of the first byte that makes the input not
813 * representable in the ASCII state of ISO-2022-JP or the length of the
814 * input if the input is entirely representable in the ASCII state of
815 * ISO-2022-JP.
817 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)
819 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
820 aBuffer.Length());
823 private:
824 Encoding() = delete;
825 Encoding(const Encoding&) = delete;
826 Encoding& operator=(const Encoding&) = delete;
827 ~Encoding() = delete;
832 * A converter that decodes a byte stream into Unicode according to a
833 * character encoding in a streaming (incremental) manner.
835 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
836 * buffer `aDst` both of which are caller-allocated. There are variants for
837 * both UTF-8 and UTF-16 output buffers.
839 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
840 * into `aDst` until one of the following three things happens:
842 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
843 * variants only).
845 * 2. The output buffer has been filled so near capacity that the decoder
846 * cannot be sure that processing an additional byte of input wouldn't
847 * cause so much output that the output buffer would overflow.
849 * 3. All the input bytes have been processed.
851 * The `Decode*` method then returns a tuple of a status indicating which one
852 * of the three reasons to return happened, how many input bytes were read,
853 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
854 * when decoding to UTF-16) were written, and in the case of the
855 * variants performing replacement, a boolean indicating whether an error was
856 * replaced with the REPLACEMENT CHARACTER during the call.
858 * The number of bytes "written" is what's logically written. Garbage may be
859 * written in the output buffer beyond the point logically written to.
861 * In the case of the `*WithoutReplacement` variants, the status is a
862 * `uint32_t` whose possible values are packed info about a malformed byte
863 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
864 * listed above).
866 * Packed info about malformed sequences has the following format:
867 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
868 * indicate the number of bytes that were consumed after the malformed
869 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
870 * the length of the malformed byte sequence (possible decimal values 1, 2,
871 * 3 or 4). The maximum possible sum of the two is 6.
873 * In the case of methods whose name does not end with
874 * `*WithoutReplacement`, malformed sequences are automatically replaced
875 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
876 * return early.
878 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
879 * space. When decoding to UTF-16, the output buffer must have at least two
880 * UTF-16 code units (`char16_t`) of space.
882 * When decoding to UTF-8 without replacement, the methods are guaranteed
883 * not to return indicating that more output space is needed if the length
884 * of the output buffer is at least the length returned by
885 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
886 * with replacement, the length of the output buffer that guarantees the
887 * methods not to return indicating that more output space is needed is given
888 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
889 * or without replacement, the length of the output buffer that guarantees
890 * the methods not to return indicating that more output space is needed is
891 * given by `MaxUTF16BufferLength()`.
893 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
894 * and the output after each `Decode*` call is guaranteed to consist of
895 * complete characters. (I.e. the code unit sequence for the last character is
896 * guaranteed not to be split across output buffers.)
898 * The boolean argument `aLast` indicates that the end of the stream is reached
899 * when all the bytes in `aSrc` have been consumed.
901 * A `Decoder` object can be used to incrementally decode a byte stream.
903 * During the processing of a single stream, the caller must call `Decode*`
904 * zero or more times with `aLast` set to `false` and then call `Decode*` at
905 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
906 * the processing of the stream has ended. Otherwise, the caller must call
907 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
908 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
910 * Once the stream has ended, the `Decoder` object must not be used anymore.
911 * That is, you need to create another one to process another stream.
913 * When the decoder returns `kOutputFull` or the decoder returns a malformed
914 * result and the caller does not wish to treat it as a fatal error, the input
915 * buffer `aSrc` may not have been completely consumed. In that case, the caller
916 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
917 * call.
919 * # Infinite loops
921 * When converting with a fixed-size output buffer whose size is too small to
922 * accommodate one character of output, an infinite loop ensues. When
923 * converting with a fixed-size output buffer, it generally makes sense to
924 * make the buffer fairly large (e.g. a couple of kilobytes).
926 class Decoder final
928 public:
929 ~Decoder() {}
930 static void operator delete(void* aDecoder)
932 decoder_free(reinterpret_cast<Decoder*>(aDecoder));
936 * The `Encoding` this `Decoder` is for.
938 * BOM sniffing can change the return value of this method during the life
939 * of the decoder.
941 inline NotNull<const mozilla::Encoding*> Encoding() const
943 return WrapNotNull(decoder_encoding(this));
947 * Query the worst-case UTF-8 output size _with replacement_.
949 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
950 * that will not overflow given the current state of the decoder and
951 * `aByteLength` number of additional input bytes when decoding with
952 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
953 * sequence.
955 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const
957 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
958 if (max.value() == MaxValue<size_t>::value) {
959 // Mark invalid by overflowing
960 max++;
961 MOZ_ASSERT(!max.isValid());
963 return max;
967 * Query the worst-case UTF-8 output size _without replacement_.
969 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
970 * that will not overflow given the current state of the decoder and
971 * `aByteLength` number of additional input bytes when decoding without
972 * replacement error handling.
974 * Note that this value may be too small for the `WithReplacement` case.
975 * Use `MaxUTF8BufferLength()` for that case.
977 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
978 size_t aByteLength) const
980 CheckedInt<size_t> max(
981 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
982 if (max.value() == MaxValue<size_t>::value) {
983 // Mark invalid by overflowing
984 max++;
985 MOZ_ASSERT(!max.isValid());
987 return max;
991 * Incrementally decode a byte stream into UTF-8 with malformed sequences
992 * replaced with the REPLACEMENT CHARACTER.
994 * See the documentation of the class for documentation for `Decode*`
995 * methods collectively.
997 inline Tuple<uint32_t, size_t, size_t, bool>
998 DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
1000 size_t srcRead = aSrc.Length();
1001 size_t dstWritten = aDst.Length();
1002 bool hadReplacements;
1003 uint32_t result = decoder_decode_to_utf8(this,
1004 aSrc.Elements(),
1005 &srcRead,
1006 aDst.Elements(),
1007 &dstWritten,
1008 aLast,
1009 &hadReplacements);
1010 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1014 * Incrementally decode a byte stream into UTF-8 _without replacement_.
1016 * See the documentation of the class for documentation for `Decode*`
1017 * methods collectively.
1019 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
1020 Span<const uint8_t> aSrc,
1021 Span<uint8_t> aDst,
1022 bool aLast)
1024 size_t srcRead = aSrc.Length();
1025 size_t dstWritten = aDst.Length();
1026 uint32_t result = decoder_decode_to_utf8_without_replacement(
1027 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1028 return MakeTuple(result, srcRead, dstWritten);
1032 * Query the worst-case UTF-16 output size (with or without replacement).
1034 * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
1035 * that will not overflow given the current state of the decoder and
1036 * `aByteLength` number of additional input bytes.
1038 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
1039 * return value of this method applies also in the
1040 * `_without_replacement` case.
1042 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const
1044 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
1045 if (max.value() == MaxValue<size_t>::value) {
1046 // Mark invalid by overflowing
1047 max++;
1048 MOZ_ASSERT(!max.isValid());
1050 return max;
1054 * Incrementally decode a byte stream into UTF-16 with malformed sequences
1055 * replaced with the REPLACEMENT CHARACTER.
1057 * See the documentation of the class for documentation for `Decode*`
1058 * methods collectively.
1060 inline Tuple<uint32_t, size_t, size_t, bool>
1061 DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast)
1063 size_t srcRead = aSrc.Length();
1064 size_t dstWritten = aDst.Length();
1065 bool hadReplacements;
1066 uint32_t result = decoder_decode_to_utf16(this,
1067 aSrc.Elements(),
1068 &srcRead,
1069 aDst.Elements(),
1070 &dstWritten,
1071 aLast,
1072 &hadReplacements);
1073 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1077 * Incrementally decode a byte stream into UTF-16 _without replacement_.
1079 * See the documentation of the class for documentation for `Decode*`
1080 * methods collectively.
1082 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1083 Span<const uint8_t> aSrc,
1084 Span<char16_t> aDst,
1085 bool aLast)
1087 size_t srcRead = aSrc.Length();
1088 size_t dstWritten = aDst.Length();
1089 uint32_t result = decoder_decode_to_utf16_without_replacement(
1090 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1091 return MakeTuple(result, srcRead, dstWritten);
1094 private:
1095 Decoder() = delete;
1096 Decoder(const Decoder&) = delete;
1097 Decoder& operator=(const Decoder&) = delete;
1101 * A converter that encodes a Unicode stream into bytes according to a
1102 * character encoding in a streaming (incremental) manner.
1104 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1105 * buffer `aDst` both of which are caller-allocated. There are variants for
1106 * both UTF-8 and UTF-16 input buffers.
1108 * An `Encode*` method encodes characters from `aSrc` into bytes
1109 * stored into `aDst` until one of the following three things happens:
1111 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1112 * only).
1114 * 2. The output buffer has been filled so near capacity that the encoder
1115 * cannot be sure that processing an additional character of input wouldn't
1116 * cause so much output that the output buffer would overflow.
1118 * 3. All the input characters have been processed.
1120 * The `Encode*` method then returns a tuple of a status indicating which one
1121 * of the three reasons to return happened, how many input code units (`uint8_t`
1122 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1123 * how many output bytes were written, and in the case of the variants that
1124 * perform replacement, a boolean indicating whether an unmappable
1125 * character was replaced with a numeric character reference during the call.
1127 * The number of bytes "written" is what's logically written. Garbage may be
1128 * written in the output buffer beyond the point logically written to.
1130 * In the case of the methods whose name ends with
1131 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1132 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
1133 * to the three cases listed above).
1135 * In the case of methods whose name does not end with
1136 * `*WithoutReplacement`, unmappable characters are automatically replaced
1137 * with the corresponding numeric character references and unmappable
1138 * characters do not cause the methods to return early.
1140 * When encoding from UTF-8 without replacement, the methods are guaranteed
1141 * not to return indicating that more output space is needed if the length
1142 * of the output buffer is at least the length returned by
1143 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1144 * UTF-8 with replacement, the length of the output buffer that guarantees the
1145 * methods not to return indicating that more output space is needed in the
1146 * absence of unmappable characters is given by
1147 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1148 * UTF-16 without replacement, the methods are guaranteed not to return
1149 * indicating that more output space is needed if the length of the output
1150 * buffer is at least the length returned by
1151 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1152 * from UTF-16 with replacement, the length of the output buffer that
1153 * guarantees the methods not to return indicating that more output space is
1154 * needed in the absence of unmappable characters is given by
1155 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1156 * When encoding with replacement, applications are not expected to size the
1157 * buffer for the worst case ahead of time but to resize the buffer if there
1158 * are unmappable characters. This is why max length queries are only available
1159 * for the case where there are no unmappable characters.
1161 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1162 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1163 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1164 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1165 * surrogate pairs are not split across input buffer boundaries.
1167 * After an `Encode*` call returns, the output produced so far, taken as a
1168 * whole from the start of the stream, is guaranteed to consist of a valid
1169 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1170 * character is guaranteed not to be split across output buffers. However, due
1171 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1172 * from the start for it to be valid. For other encodings, the validity holds
1173 * on a per-output buffer basis.)
1175 * The boolean argument `aLast` indicates that the end of the stream is reached
1176 * when all the characters in `aSrc` have been consumed. This argument is needed
1177 * for ISO-2022-JP and is ignored for other encodings.
1179 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1181 * During the processing of a single stream, the caller must call `Encode*`
1182 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1183 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1184 * the processing of the stream has ended. Otherwise, the caller must call
1185 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1186 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1188 * Once the stream has ended, the `Encoder` object must not be used anymore.
1189 * That is, you need to create another one to process another stream.
1191 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1192 * result and the caller does not wish to treat it as a fatal error, the input
1193 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1194 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1195 * call.
1197 * # Infinite loops
1199 * When converting with a fixed-size output buffer whose size is too small to
1200 * accommodate one character of output, an infinite loop ensues. When
1201 * converting with a fixed-size output buffer, it generally makes sense to
1202 * make the buffer fairly large (e.g. a couple of kilobytes).
1204 class Encoder final
1206 public:
1207 ~Encoder() {}
1209 static void operator delete(void* aEncoder)
1211 encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1215 * The `Encoding` this `Encoder` is for.
1217 inline NotNull<const mozilla::Encoding*> Encoding() const
1219 return WrapNotNull(encoder_encoding(this));
1223 * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1224 * ASCII state and `false` otherwise.
1226 inline bool HasPendingState() const
1228 return encoder_has_pending_state(this);
1232 * Query the worst-case output size when encoding from UTF-8 with
1233 * replacement.
1235 * Returns the size of the output buffer in bytes that will not overflow
1236 * given the current state of the encoder and `aByteLength` number of
1237 * additional input code units if there are no unmappable characters in
1238 * the input.
1240 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1241 size_t aByteLength) const
1243 CheckedInt<size_t> max(
1244 encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength));
1245 if (max.value() == MaxValue<size_t>::value) {
1246 // Mark invalid by overflowing
1247 max++;
1248 MOZ_ASSERT(!max.isValid());
1250 return max;
1254 * Query the worst-case output size when encoding from UTF-8 without
1255 * replacement.
1257 * Returns the size of the output buffer in bytes that will not overflow
1258 * given the current state of the encoder and `aByteLength` number of
1259 * additional input code units.
1261 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1262 size_t aByteLength) const
1264 CheckedInt<size_t> max(
1265 encoder_max_buffer_length_from_utf8_without_replacement(this,
1266 aByteLength));
1267 if (max.value() == MaxValue<size_t>::value) {
1268 // Mark invalid by overflowing
1269 max++;
1270 MOZ_ASSERT(!max.isValid());
1272 return max;
1276 * Incrementally encode into byte stream from UTF-8 with unmappable
1277 * characters replaced with HTML (decimal) numeric character references.
1279 * See the documentation of the class for documentation for `Encode*`
1280 * methods collectively.
1282 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1283 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1284 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1286 inline Tuple<uint32_t, size_t, size_t, bool>
1287 EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
1289 size_t srcRead = aSrc.Length();
1290 size_t dstWritten = aDst.Length();
1291 bool hadReplacements;
1292 uint32_t result = encoder_encode_from_utf8(this,
1293 aSrc.Elements(),
1294 &srcRead,
1295 aDst.Elements(),
1296 &dstWritten,
1297 aLast,
1298 &hadReplacements);
1299 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1303 * Incrementally encode into byte stream from UTF-8 _without replacement_.
1305 * See the documentation of the class for documentation for `Encode*`
1306 * methods collectively.
1308 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1309 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1310 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1312 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1313 Span<const uint8_t> aSrc,
1314 Span<uint8_t> aDst,
1315 bool aLast)
1317 size_t srcRead = aSrc.Length();
1318 size_t dstWritten = aDst.Length();
1319 uint32_t result = encoder_encode_from_utf8_without_replacement(
1320 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1321 return MakeTuple(result, srcRead, dstWritten);
1325 * Query the worst-case output size when encoding from UTF-16 with
1326 * replacement.
1328 * Returns the size of the output buffer in bytes that will not overflow
1329 * given the current state of the encoder and `aU16Length` number of
1330 * additional input code units if there are no unmappable characters in
1331 * the input.
1333 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1334 size_t aU16Length) const
1336 CheckedInt<size_t> max(
1337 encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length));
1338 if (max.value() == MaxValue<size_t>::value) {
1339 // Mark invalid by overflowing
1340 max++;
1341 MOZ_ASSERT(!max.isValid());
1343 return max;
1347 * Query the worst-case output size when encoding from UTF-16 without
1348 * replacement.
1350 * Returns the size of the output buffer in bytes that will not overflow
1351 * given the current state of the encoder and `aU16Length` number of
1352 * additional input code units.
1354 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1355 size_t aU16Length) const
1357 CheckedInt<size_t> max(
1358 encoder_max_buffer_length_from_utf16_without_replacement(this,
1359 aU16Length));
1360 if (max.value() == MaxValue<size_t>::value) {
1361 // Mark invalid by overflowing
1362 max++;
1363 MOZ_ASSERT(!max.isValid());
1365 return max;
1369 * Incrementally encode into byte stream from UTF-16 with unmappable
1370 * characters replaced with HTML (decimal) numeric character references.
1372 * See the documentation of the class for documentation for `Encode*`
1373 * methods collectively.
1375 inline Tuple<uint32_t, size_t, size_t, bool>
1376 EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast)
1378 size_t srcRead = aSrc.Length();
1379 size_t dstWritten = aDst.Length();
1380 bool hadReplacements;
1381 uint32_t result = encoder_encode_from_utf16(this,
1382 aSrc.Elements(),
1383 &srcRead,
1384 aDst.Elements(),
1385 &dstWritten,
1386 aLast,
1387 &hadReplacements);
1388 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1392 * Incrementally encode into byte stream from UTF-16 _without replacement_.
1394 * See the documentation of the class for documentation for `Encode*`
1395 * methods collectively.
1397 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1398 Span<const char16_t> aSrc,
1399 Span<uint8_t> aDst,
1400 bool aLast)
1402 size_t srcRead = aSrc.Length();
1403 size_t dstWritten = aDst.Length();
1404 uint32_t result = encoder_encode_from_utf16_without_replacement(
1405 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1406 return MakeTuple(result, srcRead, dstWritten);
1409 private:
1410 Encoder() = delete;
1411 Encoder(const Encoder&) = delete;
1412 Encoder& operator=(const Encoder&) = delete;
1415 }; // namespace mozilla
1417 #endif // mozilla_Encoding_h