Bug 1550519 - Show a translucent parent highlight when a subgrid is highlighted....
[gecko.git] / intl / Encoding.h
bloba7bfd9151330002164a412b5e70989091fd7683d
1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 // "top-level directory" in the above notice refers to
12 // third_party/rust/encoding_c/.
14 #ifndef mozilla_Encoding_h
15 #define mozilla_Encoding_h
17 #include "mozilla/CheckedInt.h"
18 #include "mozilla/NotNull.h"
19 #include "mozilla/Span.h"
20 #include "mozilla/Tuple.h"
21 #include "nsString.h"
// Forward declarations of the three types that this header defines further
// down. They are declared here so that the ENCODING_RS_* macros, which name
// these types, can be defined before "encoding_rs.h" is included.
namespace mozilla {
class Encoding;
class Decoder;
class Encoder;
}  // namespace mozilla
29 #define ENCODING_RS_ENCODING mozilla::Encoding
30 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
31 mozilla::NotNull<const mozilla::Encoding*>
32 #define ENCODING_RS_ENCODER mozilla::Encoder
33 #define ENCODING_RS_DECODER mozilla::Decoder
35 #include "encoding_rs.h"
// C-linkage declarations only; no definitions appear in this header. These
// are the functions that the whole-buffer `Decode*()` / `Encode()` methods of
// `mozilla::Encoding` (below) forward to.
//
// Where `encoding` is a pointer to a pointer, the C++ callers pass the address
// of a local initialized to `this` and afterwards return that local as the
// encoding that was actually used.
37 extern "C" {
39 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
40 uint8_t const* src, size_t src_len,
41 nsAString* dst);
43 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
44 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
45 nsAString* dst);
47 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
48 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
49 nsAString* dst);
51 nsresult
52 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
53 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
54 nsAString* dst);
56 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
57 char16_t const* src, size_t src_len,
58 nsACString* dst);
// The `nsACString`-to-`nsACString` variants below take the source as a string
// object rather than as a pointer/length pair.
60 nsresult mozilla_encoding_decode_to_nscstring(
61 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
63 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
64 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
66 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
67 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
// `already_validated` is forwarded from the `aAlreadyValidated` argument of
// the three-argument `DecodeWithoutBOMHandling()` overload; see the safety
// note in that method's documentation.
69 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
70 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
71 nsACString* dst, size_t already_validated);
73 nsresult
74 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
75 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
77 nsresult mozilla_encoding_encode_from_nscstring(
78 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
80 } // extern "C"
82 namespace mozilla {
84 /**
85 * Return value from `Decoder`/`Encoder` to indicate that input
86 * was exhausted.
// Typed as `uint32_t` because the `Decoder` documentation in this file
// describes the `*WithoutReplacement` status as a `uint32_t` that may take
// this value. `INPUT_EMPTY` itself is declared outside this header.
88 const uint32_t kInputEmpty = INPUT_EMPTY;
90 /**
91 * Return value from `Decoder`/`Encoder` to indicate that output
92 * space was insufficient.
// Typed as `uint32_t` because the `Decoder` documentation in this file
// describes the `*WithoutReplacement` status as a `uint32_t` that may take
// this value. `OUTPUT_FULL` itself is declared outside this header.
94 const uint32_t kOutputFull = OUTPUT_FULL;
96 /**
97 * An encoding as defined in the Encoding Standard
98 * (https://encoding.spec.whatwg.org/).
100 * See https://docs.rs/encoding_rs/ for the Rust API docs.
102 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
103 * sequence and, in most cases, vice versa. Each encoding has a name, an output
104 * encoding, and one or more labels.
106 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
107 * encoding in formats and protocols. The _name_ of the encoding is the
108 * preferred label in the case appropriate for returning from the
109 * `characterSet` property of the `Document` DOM interface, except for
110 * the replacement encoding whose name is not one of its labels.
112 * The _output encoding_ is the encoding used for form submission and URL
113 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
114 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
115 * encodings.
117 * # Streaming vs. Non-Streaming
119 * When you have the entire input in a single buffer, you can use the
120 * methods `Decode()`, `DecodeWithBOMRemoval()`,
121 * `DecodeWithoutBOMHandling()`,
122 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
123 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
124 * `NewEncoder()` methods), these methods perform heap allocations. You should
125 * use the `Decoder` and `Encoder` objects when your input is split into multiple
126 * buffers or when you want to control the allocation of the output buffers.
128 * # Instances
130 * All instances of `Encoding` are statically allocated and have the process's
131 * lifetime. There is precisely one unique `Encoding` instance for each
132 * encoding defined in the Encoding Standard.
134 * To obtain a reference to a particular encoding whose identity you know at
135 * compile time, use a `static` that refers to encoding. There is a `static`
136 * for each encoding. The `static`s are named in all caps with hyphens
137 * replaced with underscores and with `_ENCODING` appended to the
138 * name. For example, if you know at compile time that you will want to
139 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
141 * If you don't know what encoding you need at compile time and need to
142 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
144 * Pointers to `Encoding` can be compared with `==` to check for the sameness
145 * of two encodings.
147 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
148 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
149 * `const mozilla::Encoding*` in the C signature and
150 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
152 class Encoding final {
153 public:
155 * Implements the _get an encoding_ algorithm
156 * (https://encoding.spec.whatwg.org/#concept-encoding-get).
158 * If, after ASCII-lowercasing and removing leading and trailing
159 * whitespace, the argument matches a label defined in the Encoding
160 * Standard, `const Encoding*` representing the corresponding
161 * encoding is returned. If there is no match, `nullptr` is returned.
163 * This is the right method to use if the action upon the method returning
164 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
165 * instead. When the action upon the method returning `nullptr` is not to
166 * proceed with a fallback but to refuse processing,
167 * `ForLabelNoReplacement()` is more appropriate.
169 static inline const Encoding* ForLabel(Span<const char> aLabel) {
170 return encoding_for_label(
171 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
175 * `nsAString` argument version. See above for docs.
177 static inline const Encoding* ForLabel(const nsAString& aLabel) {
178 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
182 * This method behaves the same as `ForLabel()`, except when `ForLabel()`
183 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
185 * This method is useful in scenarios where a fatal error is required
186 * upon invalid label, because in those cases the caller typically wishes
187 * to treat the labels that map to the replacement encoding as fatal
188 * errors, too.
190 * It is not OK to use this method when the action upon the method returning
191 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
192 * such a case, the `ForLabel()` method should be used instead in order to
193 * avoid unsafe fallback for labels that `ForLabel()` maps to
194 * `REPLACEMENT_ENCODING`.
196 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
197 return encoding_for_label_no_replacement(
198 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
202 * `nsAString` argument version. See above for docs.
204 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
205 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
209 * Performs non-incremental BOM sniffing.
211 * The argument must either be a buffer representing the entire input
212 * stream (non-streaming case) or a buffer representing at least the first
213 * three bytes of the input stream (streaming case).
215 * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
216 * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
217 * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
219 static inline Tuple<const Encoding*, size_t> ForBOM(
220 Span<const uint8_t> aBuffer) {
221 size_t len = aBuffer.Length();
222 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
223 return MakeTuple(encoding, len);
227 * Writes the name of this encoding into `aName`.
229 * This name is appropriate to return as-is from the DOM
230 * `document.characterSet` property.
232 inline void Name(nsACString& aName) const {
233 aName.SetLength(ENCODING_NAME_MAX_LENGTH);
234 size_t length =
235 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
236 aName.SetLength(length); // truncation is the 64-bit case is OK
240 * Checks whether the _output encoding_ of this encoding can encode every
241 * Unicode code point. (Only true if the output encoding is UTF-8.)
243 inline bool CanEncodeEverything() const {
244 return encoding_can_encode_everything(this);
248 * Checks whether the bytes 0x00...0x7F map exclusively to the characters
249 * U+0000...U+007F and vice versa.
251 inline bool IsAsciiCompatible() const {
252 return encoding_is_ascii_compatible(this);
256 * Checks whether this is a Japanese legacy encoding.
258 inline bool IsJapaneseLegacy() const {
259 return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
260 this == ISO_2022_JP_ENCODING;
264 * Returns the _output encoding_ of this encoding. This is UTF-8 for
265 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
267 inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
268 return WrapNotNull(encoding_output_encoding(this));
272 * Decode complete input to `nsACString` _with BOM sniffing_ and with
273 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
274 * entire input is available as a single buffer (i.e. the end of the
275 * buffer marks the end of the stream).
277 * This method implements the (non-streaming version of) the
278 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
280 * The second item in the returned tuple is the encoding that was actually
281 * used (which may differ from this encoding thanks to BOM sniffing).
283 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
284 * if there were malformed sequences (that were replaced with the
285 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
286 * tuple.
288 * The backing buffer of the string isn't copied if the input buffer
289 * is heap-allocated and decoding from UTF-8 and the input is valid
290 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
291 * the input is valid ASCII or decoding from ISO-2022-JP and the
292 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
293 * the same string as both arguments.
295 * _Note:_ It is wrong to use this when the input buffer represents only
296 * a segment of the input instead of the whole input. Use `NewDecoder()`
297 * when decoding segmented input.
299 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
300 const nsACString& aBytes, nsACString& aOut) const {
301 const Encoding* encoding = this;
302 const nsACString* bytes = &aBytes;
303 nsACString* out = &aOut;
304 nsresult rv;
305 if (bytes == out) {
306 nsAutoCString temp(aBytes);
307 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
308 } else {
309 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
311 return MakeTuple(rv, WrapNotNull(encoding));
315 * Decode complete input to `nsAString` _with BOM sniffing_ and with
316 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
317 * entire input is available as a single buffer (i.e. the end of the
318 * buffer marks the end of the stream).
320 * This method implements the (non-streaming version of) the
321 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
323 * The second item in the returned tuple is the encoding that was actually
324 * used (which may differ from this encoding thanks to BOM sniffing).
326 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
327 * if there were malformed sequences (that were replaced with the
328 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
329 * tuple.
331 * _Note:_ It is wrong to use this when the input buffer represents only
332 * a segment of the input instead of the whole input. Use `NewDecoder()`
333 * when decoding segmented input.
335 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
336 Span<const uint8_t> aBytes, nsAString& aOut) const {
337 const Encoding* encoding = this;
338 nsresult rv = mozilla_encoding_decode_to_nsstring(
339 &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
340 return MakeTuple(rv, WrapNotNull(encoding));
344 * Decode complete input to `nsACString` _with BOM removal_ and with
345 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
346 * entire input is available as a single buffer (i.e. the end of the
347 * buffer marks the end of the stream).
349 * When invoked on `UTF_8`, this method implements the (non-streaming
350 * version of) the _UTF-8 decode_
351 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
353 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
354 * if there were malformed sequences (that were replaced with the
355 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
357 * The backing buffer of the string isn't copied if the input buffer
358 * is heap-allocated and decoding from UTF-8 and the input is valid
359 * BOMless UTF-8, decoding from an ASCII-compatible encoding and
360 * the input is valid ASCII or decoding from ISO-2022-JP and the
361 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
362 * the same string as both arguments.
364 * _Note:_ It is wrong to use this when the input buffer represents only
365 * a segment of the input instead of the whole input. Use
366 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
368 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
369 nsACString& aOut) const {
370 const nsACString* bytes = &aBytes;
371 nsACString* out = &aOut;
372 if (bytes == out) {
373 nsAutoCString temp(aBytes);
374 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
375 out);
377 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
378 out);
382 * Decode complete input to `nsAString` _with BOM removal_ and with
383 * malformed sequences replaced with the REPLACEMENT CHARACTER when the
384 * entire input is available as a single buffer (i.e. the end of the
385 * buffer marks the end of the stream).
387 * When invoked on `UTF_8`, this method implements the (non-streaming
388 * version of) the _UTF-8 decode_
389 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
391 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
392 * if there were malformed sequences (that were replaced with the
393 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
395 * _Note:_ It is wrong to use this when the input buffer represents only
396 * a segment of the input instead of the whole input. Use
397 * `NewDecoderWithBOMRemoval()` when decoding segmented input.
399 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
400 nsAString& aOut) const {
401 return mozilla_encoding_decode_to_nsstring_with_bom_removal(
402 this, aBytes.Elements(), aBytes.Length(), &aOut);
406 * Decode complete input to `nsACString` _without BOM handling_ and
407 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
408 * the entire input is available as a single buffer (i.e. the end of the
409 * buffer marks the end of the stream).
411 * When invoked on `UTF_8`, this method implements the (non-streaming
412 * version of) the _UTF-8 decode without BOM_
413 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
415 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
416 * if there were malformed sequences (that were replaced with the
417 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
419 * The backing buffer of the string isn't copied if the input buffer
420 * is heap-allocated and decoding from UTF-8 and the input is valid
421 * UTF-8, decoding from an ASCII-compatible encoding and the input
422 * is valid ASCII or decoding from ISO-2022-JP and the input stays
423 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
424 * as both arguments.
426 * _Note:_ It is wrong to use this when the input buffer represents only
427 * a segment of the input instead of the whole input. Use
428 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
430 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
431 nsACString& aOut) const {
432 const nsACString* bytes = &aBytes;
433 nsACString* out = &aOut;
434 if (bytes == out) {
435 nsAutoCString temp(aBytes);
436 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
437 this, &temp, out);
439 return mozilla_encoding_decode_to_nscstring_without_bom_handling(
440 this, bytes, out);
444 * Decode complete input to `nsAString` _without BOM handling_ and
445 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
446 * the entire input is available as a single buffer (i.e. the end of the
447 * buffer marks the end of the stream).
449 * When invoked on `UTF_8`, this method implements the (non-streaming
450 * version of) the _UTF-8 decode without BOM_
451 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
453 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
454 * if there were malformed sequences (that were replaced with the
455 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
457 * _Note:_ It is wrong to use this when the input buffer represents only
458 * a segment of the input instead of the whole input. Use
459 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
461 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
462 nsAString& aOut) const {
463 return mozilla_encoding_decode_to_nsstring_without_bom_handling(
464 this, aBytes.Elements(), aBytes.Length(), &aOut);
468 * Decode complete input to `nsACString` _without BOM handling_ and
469 * _with malformed sequences treated as fatal_ when the entire input is
470 * available as a single buffer (i.e. the end of the buffer marks the end
471 * of the stream).
473 * When invoked on `UTF_8`, this method implements the (non-streaming
474 * version of) the _UTF-8 decode without BOM or fail_
475 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
476 * spec concept.
478 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
479 * if a malformed sequence was encountered and `NS_OK` otherwise.
481 * The backing buffer of the string isn't copied if the input buffer
482 * is heap-allocated and decoding from UTF-8 and the input is valid
483 * UTF-8, decoding from an ASCII-compatible encoding and the input
484 * is valid ASCII or decoding from ISO-2022-JP and the input stays
485 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
486 * as both arguments.
488 * _Note:_ It is wrong to use this when the input buffer represents only
489 * a segment of the input instead of the whole input. Use
490 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
492 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
493 const nsACString& aBytes, nsACString& aOut) const {
494 const nsACString* bytes = &aBytes;
495 nsACString* out = &aOut;
496 if (bytes == out) {
497 nsAutoCString temp(aBytes);
498 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
499 this, &temp, out);
501 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
502 this, bytes, out);
506 * Decode complete input to `nsACString` _without BOM handling_ and
507 * with malformed sequences replaced with the REPLACEMENT CHARACTER when
508 * the entire input is available as a single buffer (i.e. the end of the
509 * buffer marks the end of the stream) _asserting that a number of bytes
510 * from the start are already known to be valid UTF-8_.
512 * The use case for this method is avoiding copying when dealing with
513 * input that has a UTF-8 BOM. _When in doubt, do not use this method._
515 * When invoked on `UTF_8`, this method implements the (non-streaming
516 * version of) the _UTF-8 decode without BOM_
517 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
519 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
520 * if there were malformed sequences (that were replaced with the
521 * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
523 * _Note:_ It is wrong to use this when the input buffer represents only
524 * a segment of the input instead of the whole input. Use
525 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
527 * # Safety
529 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
530 * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
532 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
533 nsACString& aOut,
534 size_t aAlreadyValidated) const {
535 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
536 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
540 * Decode complete input to `nsAString` _without BOM handling_ and
541 * _with malformed sequences treated as fatal_ when the entire input is
542 * available as a single buffer (i.e. the end of the buffer marks the end
543 * of the stream).
545 * When invoked on `UTF_8`, this method implements the (non-streaming
546 * version of) the _UTF-8 decode without BOM or fail_
547 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
548 * spec concept.
550 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
551 * if a malformed sequence was encountered and `NS_OK` otherwise.
553 * _Note:_ It is wrong to use this when the input buffer represents only
554 * a segment of the input instead of the whole input. Use
555 * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
557 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
558 Span<const uint8_t> aBytes, nsAString& aOut) const {
559 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
560 this, aBytes.Elements(), aBytes.Length(), &aOut);
564 * Encode complete input to `nsACString` with unmappable characters
565 * replaced with decimal numeric character references when the entire input
566 * is available as a single buffer (i.e. the end of the buffer marks the
567 * end of the stream).
569 * This method implements the (non-streaming version of) the
570 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
572 * The second item in the returned tuple is the encoding that was actually
573 * used (which may differ from this encoding thanks to some encodings
574 * having UTF-8 as their output encoding).
576 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
577 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
578 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
579 * replaced with numeric character references) and `NS_OK` otherwise.
581 * The backing buffer of the string isn't copied if the input buffer
582 * is heap-allocated and encoding to UTF-8 and the input is valid
583 * UTF-8, encoding to an ASCII-compatible encoding and the input
584 * is valid ASCII or encoding to ISO-2022-JP and the input stays
585 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
586 * as both arguments.
588 * _Note:_ It is wrong to use this when the input buffer represents only
589 * a segment of the input instead of the whole input. Use `NewEncoder()`
590 * when encoding segmented output.
592 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
593 const nsACString& aString, nsACString& aOut) const {
594 const Encoding* encoding = this;
595 const nsACString* string = &aString;
596 nsACString* out = &aOut;
597 nsresult rv;
598 if (string == out) {
599 nsAutoCString temp(aString);
600 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
601 } else {
602 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
604 return MakeTuple(rv, WrapNotNull(encoding));
608 * Encode complete input to `nsACString` with unmappable characters
609 * replaced with decimal numeric character references when the entire input
610 * is available as a single buffer (i.e. the end of the buffer marks the
611 * end of the stream).
613 * This method implements the (non-streaming version of) the
614 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
616 * The second item in the returned tuple is the encoding that was actually
617 * used (which may differ from this encoding thanks to some encodings
618 * having UTF-8 as their output encoding).
620 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
621 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
622 * were replaced with numeric character references) and `NS_OK` otherwise.
624 * _Note:_ It is wrong to use this when the input buffer represents only
625 * a segment of the input instead of the whole input. Use `NewEncoder()`
626 * when encoding segmented output.
628 inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
629 Span<const char16_t> aString, nsACString& aOut) const {
630 const Encoding* encoding = this;
631 nsresult rv = mozilla_encoding_encode_from_utf16(
632 &encoding, aString.Elements(), aString.Length(), &aOut);
633 return MakeTuple(rv, WrapNotNull(encoding));
637 * Instantiates a new decoder for this encoding with BOM sniffing enabled.
639 * BOM sniffing may cause the returned decoder to morph into a decoder
640 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
642 inline UniquePtr<Decoder> NewDecoder() const {
643 UniquePtr<Decoder> decoder(encoding_new_decoder(this));
644 return decoder;
648 * Instantiates a new decoder for this encoding with BOM sniffing enabled
649 * into memory occupied by a previously-instantiated decoder.
651 * BOM sniffing may cause the returned decoder to morph into a decoder
652 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
654 inline void NewDecoderInto(Decoder& aDecoder) const {
655 encoding_new_decoder_into(this, &aDecoder);
659 * Instantiates a new decoder for this encoding with BOM removal.
661 * If the input starts with bytes that are the BOM for this encoding,
662 * those bytes are removed. However, the decoder never morphs into a
663 * decoder for another encoding: A BOM for another encoding is treated as
664 * (potentially malformed) input to the decoding algorithm for this
665 * encoding.
667 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
668 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
669 return decoder;
673 * Instantiates a new decoder for this encoding with BOM removal
674 * into memory occupied by a previously-instantiated decoder.
676 * If the input starts with bytes that are the BOM for this encoding,
677 * those bytes are removed. However, the decoder never morphs into a
678 * decoder for another encoding: A BOM for another encoding is treated as
679 * (potentially malformed) input to the decoding algorithm for this
680 * encoding.
682 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
683 encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
687 * Instantiates a new decoder for this encoding with BOM handling disabled.
689 * If the input starts with bytes that look like a BOM, those bytes are
690 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
691 * for another encoding.)
693 * _Note:_ If the caller has performed BOM sniffing on its own but has not
694 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
695 * instead of this method to cause the BOM to be removed.
697 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
698 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
699 return decoder;
703 * Instantiates a new decoder for this encoding with BOM handling disabled
704 * into memory occupied by a previously-instantiated decoder.
706 * If the input starts with bytes that look like a BOM, those bytes are
707 * not treated as a BOM. (Hence, the decoder never morphs into a decoder
708 * for another encoding.)
710 * _Note:_ If the caller has performed BOM sniffing on its own but has not
711 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
712 * instead of this method to cause the BOM to be removed.
714 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
715 encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
719 * Instantiates a new encoder for the output encoding of this encoding.
721 inline UniquePtr<Encoder> NewEncoder() const {
722 UniquePtr<Encoder> encoder(encoding_new_encoder(this));
723 return encoder;
727 * Instantiates a new encoder for the output encoding of this encoding
728 * into memory occupied by a previously-instantiated encoder.
730 inline void NewEncoderInto(Encoder& aEncoder) const {
731 encoding_new_encoder_into(this, &aEncoder);
735 * Validates UTF-8.
737 * Returns the index of the first byte that makes the input malformed as
738 * UTF-8 or the length of the input if the input is entirely valid.
740 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
741 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
745 * Validates ASCII.
747 * Returns the index of the first byte that makes the input malformed as
748 * ASCII or the length of the input if the input is entirely valid.
750 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
751 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
755 * Validates ISO-2022-JP ASCII-state data.
757 * Returns the index of the first byte that makes the input not
758 * representable in the ASCII state of ISO-2022-JP or the length of the
759 * input if the input is entirely representable in the ASCII state of
760 * ISO-2022-JP.
762 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
763 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
764 aBuffer.Length());
767 private:
768 Encoding() = delete;
769 Encoding(const Encoding&) = delete;
770 Encoding& operator=(const Encoding&) = delete;
771 ~Encoding() = delete;
775 * A converter that decodes a byte stream into Unicode according to a
776 * character encoding in a streaming (incremental) manner.
778 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
779 * buffer `aDst` both of which are caller-allocated. There are variants for
780 * both UTF-8 and UTF-16 output buffers.
782 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
783 * into `aDst` until one of the following three things happens:
785 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
786 * variants only).
788 * 2. The output buffer has been filled so near capacity that the decoder
789 * cannot be sure that processing an additional byte of input wouldn't
790 * cause so much output that the output buffer would overflow.
792 * 3. All the input bytes have been processed.
794 * The `Decode*` method then returns a tuple of a status indicating which one
795 * of the three reasons to return happened, how many input bytes were read,
796 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
797 * when decoding to UTF-16) were written, and in the case of the
798 * variants performing replacement, a boolean indicating whether an error was
799 * replaced with the REPLACEMENT CHARACTER during the call.
801 * The number of bytes "written" is what's logically written. Garbage may be
802 * written in the output buffer beyond the point logically written to.
804 * In the case of the `*WithoutReplacement` variants, the status is a
805 * `uint32_t` whose possible values are packed info about a malformed byte
806 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
807 * listed above).
809 * Packed info about malformed sequences has the following format:
810 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
811 * indicate the number of bytes that were consumed after the malformed
812 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
813 * the length of the malformed byte sequence (possible decimal values 1, 2,
814 * 3 or 4). The maximum possible sum of the two is 6.
816 * In the case of methods whose name does not end with
817 * `*WithoutReplacement`, malformed sequences are automatically replaced
818 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
819 * return early.
821 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
822 * space. When decoding to UTF-16, the output buffer must have at least two
823 * UTF-16 code units (`char16_t`) of space.
825 * When decoding to UTF-8 without replacement, the methods are guaranteed
826 * not to return indicating that more output space is needed if the length
827 * of the output buffer is at least the length returned by
828 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
829 * with replacement, the length of the output buffer that guarantees the
830 * methods not to return indicating that more output space is needed is given
831 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
832 * or without replacement, the length of the output buffer that guarantees
833 * the methods not to return indicating that more output space is needed is
834 * given by `MaxUTF16BufferLength()`.
836 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
837 * and the output after each `Decode*` call is guaranteed to consist of
838 * complete characters. (I.e. the code unit sequence for the last character is
839 * guaranteed not to be split across output buffers.)
841 * The boolean argument `aLast` indicates that the end of the stream is reached
842 * when all the bytes in `aSrc` have been consumed.
844 * A `Decoder` object can be used to incrementally decode a byte stream.
846 * During the processing of a single stream, the caller must call `Decode*`
847 * zero or more times with `aLast` set to `false` and then call `Decode*` at
848 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
849 * the processing of the stream has ended. Otherwise, the caller must call
850 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
851 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
853 * Once the stream has ended, the `Decoder` object must not be used anymore.
854 * That is, you need to create another one to process another stream.
856 * When the decoder returns `kOutputFull` or the decoder returns a malformed
857 * result and the caller does not wish to treat it as a fatal error, the input
858 * buffer `aSrc` may not have been completely consumed. In that case, the caller
859 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
860 * call.
862 * # Infinite loops
864 * When converting with a fixed-size output buffer whose size is too small to
865 * accommodate one character of output, an infinite loop ensues. When
866 * converting with a fixed-size output buffer, it generally makes sense to
867 * make the buffer fairly large (e.g. couple of kilobytes).
869 class Decoder final {
870 public:
871 ~Decoder() {}
872 static void operator delete(void* aDecoder) {
873 decoder_free(reinterpret_cast<Decoder*>(aDecoder));
877 * The `Encoding` this `Decoder` is for.
879 * BOM sniffing can change the return value of this method during the life
880 * of the decoder.
882 inline NotNull<const mozilla::Encoding*> Encoding() const {
883 return WrapNotNull(decoder_encoding(this));
887 * Query the worst-case UTF-8 output size _with replacement_.
889 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
890 * that will not overflow given the current state of the decoder and
891 * `aByteLength` number of additional input bytes when decoding with
892 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
893 * sequence.
895 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
896 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
897 if (max.value() == MaxValue<size_t>::value) {
898 // Mark invalid by overflowing
899 max++;
900 MOZ_ASSERT(!max.isValid());
902 return max;
906 * Query the worst-case UTF-8 output size _without replacement_.
908 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
909 * that will not overflow given the current state of the decoder and
910 * `aByteLength` number of additional input bytes when decoding without
911 * replacement error handling.
913 * Note that this value may be too small for the `WithReplacement` case.
914 * Use `MaxUTF8BufferLength()` for that case.
916 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
917 size_t aByteLength) const {
918 CheckedInt<size_t> max(
919 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
920 if (max.value() == MaxValue<size_t>::value) {
921 // Mark invalid by overflowing
922 max++;
923 MOZ_ASSERT(!max.isValid());
925 return max;
929 * Incrementally decode a byte stream into UTF-8 with malformed sequences
930 * replaced with the REPLACEMENT CHARACTER.
932 * See the documentation of the class for documentation for `Decode*`
933 * methods collectively.
935 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
936 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
937 size_t srcRead = aSrc.Length();
938 size_t dstWritten = aDst.Length();
939 bool hadReplacements;
940 uint32_t result =
941 decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
942 &dstWritten, aLast, &hadReplacements);
943 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
947 * Incrementally decode a byte stream into UTF-8 _without replacement_.
949 * See the documentation of the class for documentation for `Decode*`
950 * methods collectively.
952 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
953 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
954 size_t srcRead = aSrc.Length();
955 size_t dstWritten = aDst.Length();
956 uint32_t result = decoder_decode_to_utf8_without_replacement(
957 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
958 return MakeTuple(result, srcRead, dstWritten);
962 * Query the worst-case UTF-16 output size (with or without replacement).
964 * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
965 * that will not overflow given the current state of the decoder and
966 * `aByteLength` number of additional input bytes.
968 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
969 * return value of this method applies also in the
970 * `_without_replacement` case.
972 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
973 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
974 if (max.value() == MaxValue<size_t>::value) {
975 // Mark invalid by overflowing
976 max++;
977 MOZ_ASSERT(!max.isValid());
979 return max;
983 * Incrementally decode a byte stream into UTF-16 with malformed sequences
984 * replaced with the REPLACEMENT CHARACTER.
986 * See the documentation of the class for documentation for `Decode*`
987 * methods collectively.
989 inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
990 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
991 size_t srcRead = aSrc.Length();
992 size_t dstWritten = aDst.Length();
993 bool hadReplacements;
994 uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
995 aDst.Elements(), &dstWritten,
996 aLast, &hadReplacements);
997 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1001 * Incrementally decode a byte stream into UTF-16 _without replacement_.
1003 * See the documentation of the class for documentation for `Decode*`
1004 * methods collectively.
1006 inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1007 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1008 size_t srcRead = aSrc.Length();
1009 size_t dstWritten = aDst.Length();
1010 uint32_t result = decoder_decode_to_utf16_without_replacement(
1011 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1012 return MakeTuple(result, srcRead, dstWritten);
1015 private:
1016 Decoder() = delete;
1017 Decoder(const Decoder&) = delete;
1018 Decoder& operator=(const Decoder&) = delete;
1022 * A converter that encodes a Unicode stream into bytes according to a
1023 * character encoding in a streaming (incremental) manner.
1025 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1026 * buffer `aDst` both of which are caller-allocated. There are variants for
1027 * both UTF-8 and UTF-16 input buffers.
1029 * An `Encode*` method encodes characters from `aSrc` into bytes
1030 * stored into `aDst` until one of the following three things happens:
1032 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1033 * only).
1035 * 2. The output buffer has been filled so near capacity that the encoder
1036 * cannot be sure that processing an additional character of input wouldn't
1037 * cause so much output that the output buffer would overflow.
1039 * 3. All the input characters have been processed.
1041 * The `Encode*` method then returns a tuple of a status indicating which one
1042 * of the three reasons to return happened, how many input code units (`uint8_t`
1043 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1044 * how many output bytes were written, and in the case of the variants that
1045 * perform replacement, a boolean indicating whether an unmappable
1046 * character was replaced with a numeric character reference during the call.
1048 * The number of bytes "written" is what's logically written. Garbage may be
1049 * written in the output buffer beyond the point logically written to.
1051 * In the case of the methods whose name ends with
1052 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1053 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
1054 * to the three cases listed above).
1056 * In the case of methods whose name does not end with
1057 * `*WithoutReplacement`, unmappable characters are automatically replaced
1058 * with the corresponding numeric character references and unmappable
1059 * characters do not cause the methods to return early.
1061 * When encoding from UTF-8 without replacement, the methods are guaranteed
1062 * not to return indicating that more output space is needed if the length
1063 * of the output buffer is at least the length returned by
1064 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1065 * UTF-8 with replacement, the length of the output buffer that guarantees the
1066 * methods not to return indicating that more output space is needed in the
1067 * absence of unmappable characters is given by
1068 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1069 * UTF-16 without replacement, the methods are guaranteed not to return
1070 * indicating that more output space is needed if the length of the output
1071 * buffer is at least the length returned by
1072 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1073 * from UTF-16 with replacement, the length of the output buffer that
1074 * guarantees the methods not to return indicating that more output space is
1075 * needed in the absence of unmappable characters is given by
1076 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1077 * When encoding with replacement, applications are not expected to size the
1078 * buffer for the worst case ahead of time but to resize the buffer if there
1079 * are unmappable characters. This is why max length queries are only available
1080 * for the case where there are no unmappable characters.
1082 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1083 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1084 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1085 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1086 * surrogate pairs are not split across input buffer boundaries.
1088 * After an `Encode*` call returns, the output produced so far, taken as a
1089 * whole from the start of the stream, is guaranteed to consist of a valid
1090 * byte sequence in the target encoding. (I.e. the code unit sequence for a
1091 * character is guaranteed not to be split across output buffers. However, due
1092 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1093 * from the start for it to be valid. For other encodings, the validity holds
1094 * on a per-output buffer basis.)
1096 * The boolean argument `aLast` indicates that the end of the stream is reached
1097 * when all the characters in `aSrc` have been consumed. This argument is needed
1098 * for ISO-2022-JP and is ignored for other encodings.
1100 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1102 * During the processing of a single stream, the caller must call `Encode*`
1103 * zero or more times with `aLast` set to `false` and then call `Encode*` at
1104 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1105 * the processing of the stream has ended. Otherwise, the caller must call
1106 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1107 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1109 * Once the stream has ended, the `Encoder` object must not be used anymore.
1110 * That is, you need to create another one to process another stream.
1112 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1113 * result and the caller does not wish to treat it as a fatal error, the input
1114 * buffer `aSrc` may not have been completely consumed. In that case, the caller
1115 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1116 * call.
1118 * # Infinite loops
1120 * When converting with a fixed-size output buffer whose size is too small to
1121 * accommodate one character of output, an infinite loop ensues. When
1122 * converting with a fixed-size output buffer, it generally makes sense to
1123 * make the buffer fairly large (e.g. couple of kilobytes).
1125 class Encoder final {
1126 public:
1127 ~Encoder() {}
1129 static void operator delete(void* aEncoder) {
1130 encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1134 * The `Encoding` this `Encoder` is for.
1136 inline NotNull<const mozilla::Encoding*> Encoding() const {
1137 return WrapNotNull(encoder_encoding(this));
1141 * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1142 * ASCII state and `false` otherwise.
1144 inline bool HasPendingState() const {
1145 return encoder_has_pending_state(this);
1149 * Query the worst-case output size when encoding from UTF-8 with
1150 * replacement.
1152 * Returns the size of the output buffer in bytes that will not overflow
1153 * given the current state of the encoder and `aByteLength` number of
1154 * additional input code units if there are no unmappable characters in
1155 * the input.
1157 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1158 size_t aByteLength) const {
1159 CheckedInt<size_t> max(
1160 encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1161 aByteLength));
1162 if (max.value() == MaxValue<size_t>::value) {
1163 // Mark invalid by overflowing
1164 max++;
1165 MOZ_ASSERT(!max.isValid());
1167 return max;
1171 * Query the worst-case output size when encoding from UTF-8 without
1172 * replacement.
1174 * Returns the size of the output buffer in bytes that will not overflow
1175 * given the current state of the encoder and `aByteLength` number of
1176 * additional input code units.
1178 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1179 size_t aByteLength) const {
1180 CheckedInt<size_t> max(
1181 encoder_max_buffer_length_from_utf8_without_replacement(this,
1182 aByteLength));
1183 if (max.value() == MaxValue<size_t>::value) {
1184 // Mark invalid by overflowing
1185 max++;
1186 MOZ_ASSERT(!max.isValid());
1188 return max;
1192 * Incrementally encode into byte stream from UTF-8 with unmappable
1193 * characters replaced with HTML (decimal) numeric character references.
1195 * See the documentation of the class for documentation for `Encode*`
1196 * methods collectively.
1198 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1199 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1200 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1202 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1203 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1204 size_t srcRead = aSrc.Length();
1205 size_t dstWritten = aDst.Length();
1206 bool hadReplacements;
1207 uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1208 aDst.Elements(), &dstWritten,
1209 aLast, &hadReplacements);
1210 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1214 * Incrementally encode into byte stream from UTF-8 _without replacement_.
1216 * See the documentation of the class for documentation for `Encode*`
1217 * methods collectively.
1219 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1220 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1221 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1223 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1224 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1225 size_t srcRead = aSrc.Length();
1226 size_t dstWritten = aDst.Length();
1227 uint32_t result = encoder_encode_from_utf8_without_replacement(
1228 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1229 return MakeTuple(result, srcRead, dstWritten);
1233 * Query the worst-case output size when encoding from UTF-16 with
1234 * replacement.
1236 * Returns the size of the output buffer in bytes that will not overflow
1237 * given the current state of the encoder and `aU16Length` number of
1238 * additional input code units if there are no unmappable characters in
1239 * the input.
1241 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1242 size_t aU16Length) const {
1243 CheckedInt<size_t> max(
1244 encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1245 aU16Length));
1246 if (max.value() == MaxValue<size_t>::value) {
1247 // Mark invalid by overflowing
1248 max++;
1249 MOZ_ASSERT(!max.isValid());
1251 return max;
1255 * Query the worst-case output size when encoding from UTF-16 without
1256 * replacement.
1258 * Returns the size of the output buffer in bytes that will not overflow
1259 * given the current state of the encoder and `aU16Length` number of
1260 * additional input code units.
1262 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1263 size_t aU16Length) const {
1264 CheckedInt<size_t> max(
1265 encoder_max_buffer_length_from_utf16_without_replacement(this,
1266 aU16Length));
1267 if (max.value() == MaxValue<size_t>::value) {
1268 // Mark invalid by overflowing
1269 max++;
1270 MOZ_ASSERT(!max.isValid());
1272 return max;
1276 * Incrementally encode into byte stream from UTF-16 with unmappable
1277 * characters replaced with HTML (decimal) numeric character references.
1279 * See the documentation of the class for documentation for `Encode*`
1280 * methods collectively.
1282 inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1283 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1284 size_t srcRead = aSrc.Length();
1285 size_t dstWritten = aDst.Length();
1286 bool hadReplacements;
1287 uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1288 aDst.Elements(), &dstWritten,
1289 aLast, &hadReplacements);
1290 return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1294 * Incrementally encode into byte stream from UTF-16 _without replacement_.
1296 * See the documentation of the class for documentation for `Encode*`
1297 * methods collectively.
1299 inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1300 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1301 size_t srcRead = aSrc.Length();
1302 size_t dstWritten = aDst.Length();
1303 uint32_t result = encoder_encode_from_utf16_without_replacement(
1304 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1305 return MakeTuple(result, srcRead, dstWritten);
1308 private:
1309 Encoder() = delete;
1310 Encoder(const Encoder&) = delete;
1311 Encoder& operator=(const Encoder&) = delete;
1314 }; // namespace mozilla
1316 #endif // mozilla_Encoding_h