intl/Encoding.h

   1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
   2 // file at the top-level directory of this distribution.
   3 //
   4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   7 // option. This file may not be copied, modified, or distributed
   8 // except according to those terms.
   9
  10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
  11 // "top-level directory" in the above notice refers to
  12 // third_party/rust/encoding_c/.
  13
  14 #ifndef mozilla_Encoding_h
  15 #define mozilla_Encoding_h
  16
  17 #include "mozilla/CheckedInt.h"
  18 #include "mozilla/NotNull.h"
  19 #include "mozilla/Span.h"
  20 #include "mozilla/Tuple.h"
  21 #include "nsString.h"
  22
  23 namespace mozilla {
  24 class Encoding;
  25 class Decoder;
  26 class Encoder;
  27 };  // namespace mozilla
  28
  29 #define ENCODING_RS_ENCODING mozilla::Encoding
  30 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  31   mozilla::NotNull<const mozilla::Encoding*>
  32 #define ENCODING_RS_ENCODER mozilla::Encoder
  33 #define ENCODING_RS_DECODER mozilla::Decoder
  34
  35 #include "encoding_rs.h"
  36
  37 extern "C" {
  38
  39 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
  40                                              uint8_t const* src, size_t src_len,
  41                                              nsAString* dst);
  42
  43 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
  44     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  45     nsAString* dst);
  46
  47 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
  48     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  49     nsAString* dst);
  50
  51 nsresult
  52 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
  53     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  54     nsAString* dst);
  55
  56 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
  57                                             char16_t const* src, size_t src_len,
  58                                             nsACString* dst);
  59
  60 nsresult mozilla_encoding_decode_to_nscstring(
  61     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  62
  63 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
  64     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  65
  66 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
  67     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  68
  69 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
  70     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  71     nsACString* dst, size_t already_validated);
  72
  73 nsresult
  74 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
  75     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  76
  77 nsresult mozilla_encoding_encode_from_nscstring(
  78     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  79
  80 }  // extern "C"
  81
  82 namespace mozilla {
  83
  84 /**
  85  * Return value from `Decoder`/`Encoder` to indicate that input
  86  * was exhausted.
  87  */
  88 const uint32_t kInputEmpty = INPUT_EMPTY;
  89
  90 /**
  91  * Return value from `Decoder`/`Encoder` to indicate that output
  92  * space was insufficient.
  93  */
  94 const uint32_t kOutputFull = OUTPUT_FULL;
  95
  96 /**
  97  * An encoding as defined in the Encoding Standard
  98  * (https://encoding.spec.whatwg.org/).
  99  *
 100  * See https://docs.rs/encoding_rs/ for the Rust API docs.
 101  *
 102  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
 103  * sequence and, in most cases, vice versa. Each encoding has a name, an output
 104  * encoding, and one or more labels.
 105  *
 106  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
 107  * encoding in formats and protocols. The _name_ of the encoding is the
 108  * preferred label in the case appropriate for returning from the
 109  * `characterSet` property of the `Document` DOM interface, except for
 110  * the replacement encoding whose name is not one of its labels.
 111  *
 112  * The _output encoding_ is the encoding used for form submission and URL
 113  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 114  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 115  * encodings.
 116  *
 117  * # Streaming vs. Non-Streaming
 118  *
 119  * When you have the entire input in a single buffer, you can use the
 120  * methods `Decode()`, `DecodeWithBOMRemoval()`,
 121  * `DecodeWithoutBOMHandling()`,
 122  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 123  * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
  124  * `NewEncoder()` methods), these methods perform heap allocations. You should
  125  * use `Decoder` and `Encoder` objects when your input is split into multiple
 126  * buffers or when you want to control the allocation of the output buffers.
 127  *
 128  * # Instances
 129  *
 130  * All instances of `Encoding` are statically allocated and have the process's
 131  * lifetime. There is precisely one unique `Encoding` instance for each
 132  * encoding defined in the Encoding Standard.
 133  *
 134  * To obtain a reference to a particular encoding whose identity you know at
  135  * compile time, use a `static` that refers to the encoding. There is a `static`
 136  * for each encoding. The `static`s are named in all caps with hyphens
 137  * replaced with underscores and with `_ENCODING` appended to the
 138  * name. For example, if you know at compile time that you will want to
 139  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
 140  *
 141  * If you don't know what encoding you need at compile time and need to
  142  * dynamically get an encoding by label, use `Encoding::ForLabel()`.
 143  *
 144  * Pointers to `Encoding` can be compared with `==` to check for the sameness
 145  * of two encodings.
 146  *
 147  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
 148  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
 149  * `const mozilla::Encoding*` in the C signature and
  150  * `*const encoding_rs::Encoding` in the corresponding Rust signature.
 151  */
 152 class Encoding final {
 153  public:
 154   /**
 155    * Implements the _get an encoding_ algorithm
 156    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
 157    *
 158    * If, after ASCII-lowercasing and removing leading and trailing
 159    * whitespace, the argument matches a label defined in the Encoding
 160    * Standard, `const Encoding*` representing the corresponding
 161    * encoding is returned. If there is no match, `nullptr` is returned.
 162    *
 163    * This is the right method to use if the action upon the method returning
 164    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
 165    * instead. When the action upon the method returning `nullptr` is not to
 166    * proceed with a fallback but to refuse processing,
 167    * `ForLabelNoReplacement()` is more appropriate.
 168    */
 169   static inline const Encoding* ForLabel(Span<const char> aLabel) {
 170     return encoding_for_label(
 171         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 172   }
 173
 174   /**
 175    * `nsAString` argument version. See above for docs.
 176    */
 177   static inline const Encoding* ForLabel(const nsAString& aLabel) {
 178     return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
 179   }
 180
 181   /**
 182    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
 183    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
 184    *
 185    * This method is useful in scenarios where a fatal error is required
 186    * upon invalid label, because in those cases the caller typically wishes
 187    * to treat the labels that map to the replacement encoding as fatal
 188    * errors, too.
 189    *
 190    * It is not OK to use this method when the action upon the method returning
 191    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
 192    * such a case, the `ForLabel()` method should be used instead in order to
 193    * avoid unsafe fallback for labels that `ForLabel()` maps to
 194    * `REPLACEMENT_ENCODING`.
 195    */
 196   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
 197     return encoding_for_label_no_replacement(
 198         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 199   }
 200
 201   /**
 202    * `nsAString` argument version. See above for docs.
 203    */
 204   static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
 205     return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
 206   }
 207
 208   /**
 209    * Performs non-incremental BOM sniffing.
 210    *
 211    * The argument must either be a buffer representing the entire input
 212    * stream (non-streaming case) or a buffer representing at least the first
 213    * three bytes of the input stream (streaming case).
 214    *
 215    * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
  216    * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
 217    * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
 218    */
 219   static inline Tuple<const Encoding*, size_t> ForBOM(
 220       Span<const uint8_t> aBuffer) {
 221     size_t len = aBuffer.Length();
 222     const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
 223     return MakeTuple(encoding, len);
 224   }
 225
 226   /**
 227    * Writes the name of this encoding into `aName`.
 228    *
 229    * This name is appropriate to return as-is from the DOM
 230    * `document.characterSet` property.
 231    */
 232   inline void Name(nsACString& aName) const {
 233     aName.SetLength(ENCODING_NAME_MAX_LENGTH);
 234     size_t length =
 235         encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
 236     aName.SetLength(length);  // truncation is the 64-bit case is OK
 237   }
 238
 239   /**
 240    * Checks whether the _output encoding_ of this encoding can encode every
 241    * Unicode code point. (Only true if the output encoding is UTF-8.)
 242    */
 243   inline bool CanEncodeEverything() const {
 244     return encoding_can_encode_everything(this);
 245   }
 246
 247   /**
 248    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
 249    * U+0000...U+007F and vice versa.
 250    */
 251   inline bool IsAsciiCompatible() const {
 252     return encoding_is_ascii_compatible(this);
 253   }
 254
 255   /**
 256    * Checks whether this is a Japanese legacy encoding.
 257    */
 258   inline bool IsJapaneseLegacy() const {
 259     return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
 260            this == ISO_2022_JP_ENCODING;
 261   }
 262
 263   /**
 264    * Returns the _output encoding_ of this encoding. This is UTF-8 for
 265    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
 266    */
 267   inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
 268     return WrapNotNull(encoding_output_encoding(this));
 269   }
 270
 271   /**
 272    * Decode complete input to `nsACString` _with BOM sniffing_ and with
 273    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 274    * entire input is available as a single buffer (i.e. the end of the
 275    * buffer marks the end of the stream).
 276    *
 277    * This method implements the (non-streaming version of) the
 278    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 279    *
 280    * The second item in the returned tuple is the encoding that was actually
 281    * used (which may differ from this encoding thanks to BOM sniffing).
 282    *
 283    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 284    * if there were malformed sequences (that were replaced with the
 285    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 286    * tuple.
 287    *
 288    * The backing buffer of the string isn't copied if the input buffer
 289    * is heap-allocated and decoding from UTF-8 and the input is valid
 290    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 291    * the input is valid ASCII or decoding from ISO-2022-JP and the
 292    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 293    * the same string as both arguments.
 294    *
 295    * _Note:_ It is wrong to use this when the input buffer represents only
 296    * a segment of the input instead of the whole input. Use `NewDecoder()`
 297    * when decoding segmented input.
 298    */
 299   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 300       const nsACString& aBytes, nsACString& aOut) const {
 301     const Encoding* encoding = this;
 302     const nsACString* bytes = &aBytes;
 303     nsACString* out = &aOut;
 304     nsresult rv;
 305     if (bytes == out) {
 306       nsAutoCString temp(aBytes);
 307       rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
 308     } else {
 309       rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
 310     }
 311     return MakeTuple(rv, WrapNotNull(encoding));
 312   }
 313
 314   /**
 315    * Decode complete input to `nsAString` _with BOM sniffing_ and with
 316    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 317    * entire input is available as a single buffer (i.e. the end of the
 318    * buffer marks the end of the stream).
 319    *
 320    * This method implements the (non-streaming version of) the
 321    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 322    *
 323    * The second item in the returned tuple is the encoding that was actually
 324    * used (which may differ from this encoding thanks to BOM sniffing).
 325    *
 326    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 327    * if there were malformed sequences (that were replaced with the
 328    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 329    * tuple.
 330    *
 331    * _Note:_ It is wrong to use this when the input buffer represents only
 332    * a segment of the input instead of the whole input. Use `NewDecoder()`
 333    * when decoding segmented input.
 334    */
 335   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 336       Span<const uint8_t> aBytes, nsAString& aOut) const {
 337     const Encoding* encoding = this;
 338     nsresult rv = mozilla_encoding_decode_to_nsstring(
 339         &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
 340     return MakeTuple(rv, WrapNotNull(encoding));
 341   }
 342
 343   /**
 344    * Decode complete input to `nsACString` _with BOM removal_ and with
 345    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 346    * entire input is available as a single buffer (i.e. the end of the
 347    * buffer marks the end of the stream).
 348    *
 349    * When invoked on `UTF_8`, this method implements the (non-streaming
 350    * version of) the _UTF-8 decode_
 351    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 352    *
 353    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 354    * if there were malformed sequences (that were replaced with the
 355    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 356    *
 357    * The backing buffer of the string isn't copied if the input buffer
 358    * is heap-allocated and decoding from UTF-8 and the input is valid
 359    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 360    * the input is valid ASCII or decoding from ISO-2022-JP and the
 361    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 362    * the same string as both arguments.
 363    *
 364    * _Note:_ It is wrong to use this when the input buffer represents only
 365    * a segment of the input instead of the whole input. Use
 366    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 367    */
 368   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
 369                                        nsACString& aOut) const {
 370     const nsACString* bytes = &aBytes;
 371     nsACString* out = &aOut;
 372     if (bytes == out) {
 373       nsAutoCString temp(aBytes);
 374       return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
 375                                                                    out);
 376     }
 377     return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
 378                                                                  out);
 379   }
 380
 381   /**
 382    * Decode complete input to `nsAString` _with BOM removal_ and with
 383    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 384    * entire input is available as a single buffer (i.e. the end of the
 385    * buffer marks the end of the stream).
 386    *
 387    * When invoked on `UTF_8`, this method implements the (non-streaming
 388    * version of) the _UTF-8 decode_
 389    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 390    *
 391    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 392    * if there were malformed sequences (that were replaced with the
 393    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 394    *
 395    * _Note:_ It is wrong to use this when the input buffer represents only
 396    * a segment of the input instead of the whole input. Use
 397    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 398    */
 399   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
 400                                        nsAString& aOut) const {
 401     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
 402         this, aBytes.Elements(), aBytes.Length(), &aOut);
 403   }
 404
 405   /**
 406    * Decode complete input to `nsACString` _without BOM handling_ and
 407    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 408    * the entire input is available as a single buffer (i.e. the end of the
 409    * buffer marks the end of the stream).
 410    *
 411    * When invoked on `UTF_8`, this method implements the (non-streaming
 412    * version of) the _UTF-8 decode without BOM_
 413    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 414    *
 415    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 416    * if there were malformed sequences (that were replaced with the
 417    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 418    *
 419    * The backing buffer of the string isn't copied if the input buffer
 420    * is heap-allocated and decoding from UTF-8 and the input is valid
 421    * UTF-8, decoding from an ASCII-compatible encoding and the input
 422    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 423    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 424    * as both arguments.
 425    *
 426    * _Note:_ It is wrong to use this when the input buffer represents only
 427    * a segment of the input instead of the whole input. Use
 428    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 429    */
 430   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
 431                                            nsACString& aOut) const {
 432     const nsACString* bytes = &aBytes;
 433     nsACString* out = &aOut;
 434     if (bytes == out) {
 435       nsAutoCString temp(aBytes);
 436       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 437           this, &temp, out);
 438     }
 439     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 440         this, bytes, out);
 441   }
 442
 443   /**
 444    * Decode complete input to `nsAString` _without BOM handling_ and
 445    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 446    * the entire input is available as a single buffer (i.e. the end of the
 447    * buffer marks the end of the stream).
 448    *
 449    * When invoked on `UTF_8`, this method implements the (non-streaming
 450    * version of) the _UTF-8 decode without BOM_
 451    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 452    *
 453    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 454    * if there were malformed sequences (that were replaced with the
 455    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 456    *
 457    * _Note:_ It is wrong to use this when the input buffer represents only
 458    * a segment of the input instead of the whole input. Use
 459    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 460    */
 461   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 462                                            nsAString& aOut) const {
 463     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
 464         this, aBytes.Elements(), aBytes.Length(), &aOut);
 465   }
 466
 467   /**
 468    * Decode complete input to `nsACString` _without BOM handling_ and
 469    * _with malformed sequences treated as fatal_ when the entire input is
 470    * available as a single buffer (i.e. the end of the buffer marks the end
 471    * of the stream).
 472    *
 473    * When invoked on `UTF_8`, this method implements the (non-streaming
 474    * version of) the _UTF-8 decode without BOM or fail_
 475    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 476    * spec concept.
 477    *
 478    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 479    * if a malformed sequence was encountered and `NS_OK` otherwise.
 480    *
 481    * The backing buffer of the string isn't copied if the input buffer
 482    * is heap-allocated and decoding from UTF-8 and the input is valid
 483    * UTF-8, decoding from an ASCII-compatible encoding and the input
 484    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 485    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 486    * as both arguments.
 487    *
 488    * _Note:_ It is wrong to use this when the input buffer represents only
 489    * a segment of the input instead of the whole input. Use
 490    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 491    */
 492   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 493       const nsACString& aBytes, nsACString& aOut) const {
 494     const nsACString* bytes = &aBytes;
 495     nsACString* out = &aOut;
 496     if (bytes == out) {
 497       nsAutoCString temp(aBytes);
 498       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 499           this, &temp, out);
 500     }
 501     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 502         this, bytes, out);
 503   }
 504
 505   /**
 506    * Decode complete input to `nsACString` _without BOM handling_ and
 507    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 508    * the entire input is available as a single buffer (i.e. the end of the
 509    * buffer marks the end of the stream) _asserting that a number of bytes
 510    * from the start are already known to be valid UTF-8_.
 511    *
 512    * The use case for this method is avoiding copying when dealing with
 513    * input that has a UTF-8 BOM. _When in doubt, do not use this method._
 514    *
 515    * When invoked on `UTF_8`, this method implements the (non-streaming
 516    * version of) the _UTF-8 decode without BOM_
 517    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 518    *
 519    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 520    * if there were malformed sequences (that were replaced with the
 521    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 522    *
 523    * _Note:_ It is wrong to use this when the input buffer represents only
 524    * a segment of the input instead of the whole input. Use
 525    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 526    *
 527    * # Safety
 528    *
 529    * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
 530    * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
 531    */
 532   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 533                                            nsACString& aOut,
 534                                            size_t aAlreadyValidated) const {
 535     return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
 536         this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
 537   }
 538
 539   /**
 540    * Decode complete input to `nsAString` _without BOM handling_ and
 541    * _with malformed sequences treated as fatal_ when the entire input is
 542    * available as a single buffer (i.e. the end of the buffer marks the end
 543    * of the stream).
 544    *
 545    * When invoked on `UTF_8`, this method implements the (non-streaming
 546    * version of) the _UTF-8 decode without BOM or fail_
 547    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 548    * spec concept.
 549    *
 550    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 551    * if a malformed sequence was encountered and `NS_OK` otherwise.
 552    *
 553    * _Note:_ It is wrong to use this when the input buffer represents only
 554    * a segment of the input instead of the whole input. Use
 555    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 556    */
 557   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 558       Span<const uint8_t> aBytes, nsAString& aOut) const {
 559     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
 560         this, aBytes.Elements(), aBytes.Length(), &aOut);
 561   }
 562
 563   /**
 564    * Encode complete input to `nsACString` with unmappable characters
 565    * replaced with decimal numeric character references when the entire input
 566    * is available as a single buffer (i.e. the end of the buffer marks the
 567    * end of the stream).
 568    *
 569    * This method implements the (non-streaming version of) the
 570    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 571    *
 572    * The second item in the returned tuple is the encoding that was actually
 573    * used (which may differ from this encoding thanks to some encodings
 574    * having UTF-8 as their output encoding).
 575    *
 576    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
 577    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
 578    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
 579    * replaced with numeric character references) and `NS_OK` otherwise.
 580    *
 581    * The backing buffer of the string isn't copied if the input buffer
 582    * is heap-allocated and encoding to UTF-8 and the input is valid
 583    * UTF-8, encoding to an ASCII-compatible encoding and the input
  584    * is valid ASCII or encoding to ISO-2022-JP and the input stays
 585    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 586    * as both arguments.
 587    *
 588    * _Note:_ It is wrong to use this when the input buffer represents only
 589    * a segment of the input instead of the whole input. Use `NewEncoder()`
 590    * when encoding segmented output.
 591    */
 592   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 593       const nsACString& aString, nsACString& aOut) const {
 594     const Encoding* encoding = this;
 595     const nsACString* string = &aString;
 596     nsACString* out = &aOut;
 597     nsresult rv;
 598     if (string == out) {
 599       nsAutoCString temp(aString);
 600       rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
 601     } else {
 602       rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
 603     }
 604     return MakeTuple(rv, WrapNotNull(encoding));
 605   }
 606
 607   /**
 608    * Encode complete input to `nsACString` with unmappable characters
 609    * replaced with decimal numeric character references when the entire input
 610    * is available as a single buffer (i.e. the end of the buffer marks the
 611    * end of the stream).
 612    *
 613    * This method implements the (non-streaming version of) the
 614    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 615    *
 616    * The second item in the returned tuple is the encoding that was actually
 617    * used (which may differ from this encoding thanks to some encodings
 618    * having UTF-8 as their output encoding).
 619    *
 620    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
 621    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
 622    * were replaced with numeric character references) and `NS_OK` otherwise.
  623    *
 624    * _Note:_ It is wrong to use this when the input buffer represents only
 625    * a segment of the input instead of the whole input. Use `NewEncoder()`
 626    * when encoding segmented output.
 627    */
 628   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 629       Span<const char16_t> aString, nsACString& aOut) const {
 630     const Encoding* encoding = this;
 631     nsresult rv = mozilla_encoding_encode_from_utf16(
 632         &encoding, aString.Elements(), aString.Length(), &aOut);
 633     return MakeTuple(rv, WrapNotNull(encoding));
 634   }
 635
 636   /**
 637    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
 638    *
 639    * BOM sniffing may cause the returned decoder to morph into a decoder
 640    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 641    */
 642   inline UniquePtr<Decoder> NewDecoder() const {
 643     UniquePtr<Decoder> decoder(encoding_new_decoder(this));
 644     return decoder;
 645   }
 646
 647   /**
 648    * Instantiates a new decoder for this encoding with BOM sniffing enabled
 649    * into memory occupied by a previously-instantiated decoder.
 650    *
 651    * BOM sniffing may cause the returned decoder to morph into a decoder
 652    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 653    */
 654   inline void NewDecoderInto(Decoder& aDecoder) const {
 655     encoding_new_decoder_into(this, &aDecoder);
 656   }
 657
 658   /**
 659    * Instantiates a new decoder for this encoding with BOM removal.
 660    *
 661    * If the input starts with bytes that are the BOM for this encoding,
 662    * those bytes are removed. However, the decoder never morphs into a
 663    * decoder for another encoding: A BOM for another encoding is treated as
 664    * (potentially malformed) input to the decoding algorithm for this
 665    * encoding.
 666    */
 667   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
 668     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
 669     return decoder;
 670   }
 671
 672   /**
 673    * Instantiates a new decoder for this encoding with BOM removal
 674    * into memory occupied by a previously-instantiated decoder.
 675    *
 676    * If the input starts with bytes that are the BOM for this encoding,
 677    * those bytes are removed. However, the decoder never morphs into a
 678    * decoder for another encoding: A BOM for another encoding is treated as
 679    * (potentially malformed) input to the decoding algorithm for this
 680    * encoding.
 681    */
 682   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
 683     encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
 684   }
 685
 686   /**
 687    * Instantiates a new decoder for this encoding with BOM handling disabled.
 688    *
 689    * If the input starts with bytes that look like a BOM, those bytes are
 690    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 691    * for another encoding.)
 692    *
 693    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 694    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
 695    * instead of this method to cause the BOM to be removed.
 696    */
 697   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
 698     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
 699     return decoder;
 700   }
 701
 702   /**
 703    * Instantiates a new decoder for this encoding with BOM handling disabled
 704    * into memory occupied by a previously-instantiated decoder.
 705    *
 706    * If the input starts with bytes that look like a BOM, those bytes are
 707    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 708    * for another encoding.)
 709    *
 710    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 711    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
 712    * instead of this method to cause the BOM to be removed.
 713    */
 714   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
 715     encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
 716   }
 717
 718   /**
 719    * Instantiates a new encoder for the output encoding of this encoding.
 720    */
 721   inline UniquePtr<Encoder> NewEncoder() const {
 722     UniquePtr<Encoder> encoder(encoding_new_encoder(this));
 723     return encoder;
 724   }
 725
 726   /**
 727    * Instantiates a new encoder for the output encoding of this encoding
 728    * into memory occupied by a previously-instantiated encoder.
 729    */
 730   inline void NewEncoderInto(Encoder& aEncoder) const {
 731     encoding_new_encoder_into(this, &aEncoder);
 732   }
 733
 734   /**
 735    * Validates UTF-8.
 736    *
 737    * Returns the index of the first byte that makes the input malformed as
 738    * UTF-8 or the length of the input if the input is entirely valid.
 739    */
 740   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
 741     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 742   }
 743
 744   /**
 745    * Validates ASCII.
 746    *
 747    * Returns the index of the first byte that makes the input malformed as
 748    * ASCII or the length of the input if the input is entirely valid.
 749    */
 750   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 751     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 752   }
 753
 754   /**
 755    * Validates ISO-2022-JP ASCII-state data.
 756    *
 757    * Returns the index of the first byte that makes the input not
 758    * representable in the ASCII state of ISO-2022-JP or the length of the
 759    * input if the input is entirely representable in the ASCII state of
 760    * ISO-2022-JP.
 761    */
 762   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 763     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
 764                                                   aBuffer.Length());
 765   }
 766
 private:
  // Construction, copying, assignment and destruction are all deleted, so
  // C++ code can neither create nor destroy `Encoding` objects; it can only
  // work with pointers or references to objects that already exist.
  Encoding() = delete;
  Encoding(const Encoding&) = delete;
  Encoding& operator=(const Encoding&) = delete;
  ~Encoding() = delete;
 772 };
 773
 774 /**
 775  * A converter that decodes a byte stream into Unicode according to a
 776  * character encoding in a streaming (incremental) manner.
 777  *
 778  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
 779  * buffer `aDst` both of which are caller-allocated. There are variants for
 780  * both UTF-8 and UTF-16 output buffers.
 781  *
 782  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
 783  * into `aDst` until one of the following three things happens:
 784  *
 785  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
 786  *    variants only).
 787  *
 788  * 2. The output buffer has been filled so near capacity that the decoder
 789  *    cannot be sure that processing an additional byte of input wouldn't
 790  *    cause so much output that the output buffer would overflow.
 791  *
 792  * 3. All the input bytes have been processed.
 793  *
 * The `Decode*` method then returns a tuple of a status indicating which one
 795  * of the three reasons to return happened, how many input bytes were read,
 796  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 797  * when decoding to UTF-16) were written, and in the case of the
 798  * variants performing replacement, a boolean indicating whether an error was
 799  * replaced with the REPLACEMENT CHARACTER during the call.
 800  *
 801  * The number of bytes "written" is what's logically written. Garbage may be
 802  * written in the output buffer beyond the point logically written to.
 803  *
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 808  *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 815  *
 816  * In the case of methods whose name does not end with
 817  * `*WithoutReplacement`, malformed sequences are automatically replaced
 818  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 819  * return early.
 820  *
 821  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 822  * space. When decoding to UTF-16, the output buffer must have at least two
 823  * UTF-16 code units (`char16_t`) of space.
 824  *
 825  * When decoding to UTF-8 without replacement, the methods are guaranteed
 826  * not to return indicating that more output space is needed if the length
 827  * of the output buffer is at least the length returned by
 828  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 829  * with replacement, the length of the output buffer that guarantees the
 830  * methods not to return indicating that more output space is needed is given
 831  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 832  * or without replacement, the length of the output buffer that guarantees
 833  * the methods not to return indicating that more output space is needed is
 834  * given by `MaxUTF16BufferLength()`.
 835  *
 836  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 837  * and the output after each `Decode*` call is guaranteed to consist of
 838  * complete characters. (I.e. the code unit sequence for the last character is
 839  * guaranteed not to be split across output buffers.)
 840  *
 841  * The boolean argument `aLast` indicates that the end of the stream is reached
 842  * when all the bytes in `aSrc` have been consumed.
 843  *
 844  * A `Decoder` object can be used to incrementally decode a byte stream.
 845  *
 846  * During the processing of a single stream, the caller must call `Decode*`
 847  * zero or more times with `aLast` set to `false` and then call `Decode*` at
 848  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
 849  * the processing of the stream has ended. Otherwise, the caller must call
 850  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
 851  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 852  *
 853  * Once the stream has ended, the `Decoder` object must not be used anymore.
 854  * That is, you need to create another one to process another stream.
 855  *
 856  * When the decoder returns `kOutputFull` or the decoder returns a malformed
 857  * result and the caller does not wish to treat it as a fatal error, the input
 858  * buffer `aSrc` may not have been completely consumed. In that case, the caller
 859  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
 860  * call.
 861  *
 862  * # Infinite loops
 863  *
 864  * When converting with a fixed-size output buffer whose size is too small to
 865  * accommodate one character of output, an infinite loop ensues. When
 866  * converting with a fixed-size output buffer, it generally makes sense to
 867  * make the buffer fairly large (e.g. couple of kilobytes).
 868  */
 869 class Decoder final {
 870  public:
 871   ~Decoder() {}
 872   static void operator delete(void* aDecoder) {
 873     decoder_free(reinterpret_cast<Decoder*>(aDecoder));
 874   }
 875
 876   /**
 877    * The `Encoding` this `Decoder` is for.
 878    *
 879    * BOM sniffing can change the return value of this method during the life
 880    * of the decoder.
 881    */
 882   inline NotNull<const mozilla::Encoding*> Encoding() const {
 883     return WrapNotNull(decoder_encoding(this));
 884   }
 885
 886   /**
 887    * Query the worst-case UTF-8 output size _with replacement_.
 888    *
 889    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 890    * that will not overflow given the current state of the decoder and
 891    * `aByteLength` number of additional input bytes when decoding with
 892    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
 893    * sequence.
 894    */
 895   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
 896     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
 897     if (max.value() == MaxValue<size_t>::value) {
 898       // Mark invalid by overflowing
 899       max++;
 900       MOZ_ASSERT(!max.isValid());
 901     }
 902     return max;
 903   }
 904
 905   /**
 906    * Query the worst-case UTF-8 output size _without replacement_.
 907    *
 908    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 909    * that will not overflow given the current state of the decoder and
 910    * `aByteLength` number of additional input bytes when decoding without
 911    * replacement error handling.
 912    *
 913    * Note that this value may be too small for the `WithReplacement` case.
 914    * Use `MaxUTF8BufferLength()` for that case.
 915    */
 916   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
 917       size_t aByteLength) const {
 918     CheckedInt<size_t> max(
 919         decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
 920     if (max.value() == MaxValue<size_t>::value) {
 921       // Mark invalid by overflowing
 922       max++;
 923       MOZ_ASSERT(!max.isValid());
 924     }
 925     return max;
 926   }
 927
 928   /**
 929    * Incrementally decode a byte stream into UTF-8 with malformed sequences
 930    * replaced with the REPLACEMENT CHARACTER.
 931    *
 932    * See the documentation of the class for documentation for `Decode*`
 933    * methods collectively.
 934    */
 935   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
 936       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 937     size_t srcRead = aSrc.Length();
 938     size_t dstWritten = aDst.Length();
 939     bool hadReplacements;
 940     uint32_t result =
 941         decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
 942                                &dstWritten, aLast, &hadReplacements);
 943     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
 944   }
 945
 946   /**
 947    * Incrementally decode a byte stream into UTF-8 _without replacement_.
 948    *
 949    * See the documentation of the class for documentation for `Decode*`
 950    * methods collectively.
 951    */
 952   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
 953       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 954     size_t srcRead = aSrc.Length();
 955     size_t dstWritten = aDst.Length();
 956     uint32_t result = decoder_decode_to_utf8_without_replacement(
 957         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
 958     return MakeTuple(result, srcRead, dstWritten);
 959   }
 960
 961   /**
 962    * Query the worst-case UTF-16 output size (with or without replacement).
 963    *
 964    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
 965    * that will not overflow given the current state of the decoder and
 966    * `aByteLength` number of additional input bytes.
 967    *
 968    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
 969    * return value of this method applies also in the
 970    * `_without_replacement` case.
 971    */
 972   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
 973     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
 974     if (max.value() == MaxValue<size_t>::value) {
 975       // Mark invalid by overflowing
 976       max++;
 977       MOZ_ASSERT(!max.isValid());
 978     }
 979     return max;
 980   }
 981
 982   /**
 983    * Incrementally decode a byte stream into UTF-16 with malformed sequences
 984    * replaced with the REPLACEMENT CHARACTER.
 985    *
 986    * See the documentation of the class for documentation for `Decode*`
 987    * methods collectively.
 988    */
 989   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
 990       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
 991     size_t srcRead = aSrc.Length();
 992     size_t dstWritten = aDst.Length();
 993     bool hadReplacements;
 994     uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
 995                                               aDst.Elements(), &dstWritten,
 996                                               aLast, &hadReplacements);
 997     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
 998   }
 999
1000   /**
1001    * Incrementally decode a byte stream into UTF-16 _without replacement_.
1002    *
1003    * See the documentation of the class for documentation for `Decode*`
1004    * methods collectively.
1005    */
1006   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1007       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1008     size_t srcRead = aSrc.Length();
1009     size_t dstWritten = aDst.Length();
1010     uint32_t result = decoder_decode_to_utf16_without_replacement(
1011         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1012     return MakeTuple(result, srcRead, dstWritten);
1013   }
1014
1015  private:
1016   Decoder() = delete;
1017   Decoder(const Decoder&) = delete;
1018   Decoder& operator=(const Decoder&) = delete;
1019 };
1020
1021 /**
1022  * A converter that encodes a Unicode stream into bytes according to a
1023  * character encoding in a streaming (incremental) manner.
1024  *
1025  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1026  * buffer `aDst` both of which are caller-allocated. There are variants for
1027  * both UTF-8 and UTF-16 input buffers.
1028  *
 * An `Encode*` method encodes characters from `aSrc` into bytes stored into
 * `aDst` until one of the following three things happens:
1031  *
1032  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1033  *    only).
1034  *
 * 2. The output buffer has been filled so near capacity that the encoder
1036  *    cannot be sure that processing an additional character of input wouldn't
1037  *    cause so much output that the output buffer would overflow.
1038  *
1039  * 3. All the input characters have been processed.
1040  *
 * The `Encode*` method then returns a tuple of a status indicating which one
1042  * of the three reasons to return happened, how many input code units (`uint8_t`
1043  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1044  * how many output bytes were written, and in the case of the variants that
1045  * perform replacement, a boolean indicating whether an unmappable
1046  * character was replaced with a numeric character reference during the call.
1047  *
1048  * The number of bytes "written" is what's logically written. Garbage may be
1049  * written in the output buffer beyond the point logically written to.
1050  *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
1055  *
1056  * In the case of methods whose name does not end with
1057  * `*WithoutReplacement`, unmappable characters are automatically replaced
1058  * with the corresponding numeric character references and unmappable
1059  * characters do not cause the methods to return early.
1060  *
1061  * When encoding from UTF-8 without replacement, the methods are guaranteed
1062  * not to return indicating that more output space is needed if the length
1063  * of the output buffer is at least the length returned by
1064  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1065  * UTF-8 with replacement, the length of the output buffer that guarantees the
1066  * methods not to return indicating that more output space is needed in the
1067  * absence of unmappable characters is given by
1068  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1069  * UTF-16 without replacement, the methods are guaranteed not to return
1070  * indicating that more output space is needed if the length of the output
1071  * buffer is at least the length returned by
1072  * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
1074  * guarantees the methods not to return indicating that more output space is
1075  * needed in the absence of unmappable characters is given by
1076  * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1077  * When encoding with replacement, applications are not expected to size the
1078  * buffer for the worst case ahead of time but to resize the buffer if there
1079  * are unmappable characters. This is why max length queries are only available
1080  * for the case where there are no unmappable characters.
1081  *
1082  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1083  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1084  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1085  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1086  * surrogate pairs are not split across input buffer boundaries.
1087  *
1088  * After an `Encode*` call returns, the output produced so far, taken as a
1089  * whole from the start of the stream, is guaranteed to consist of a valid
1090  * byte sequence in the target encoding. (I.e. the code unit sequence for a
1091  * character is guaranteed not to be split across output buffers. However, due
1092  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1093  * from the start for it to be valid. For other encodings, the validity holds
1094  * on a per-output buffer basis.)
1095  *
1096  * The boolean argument `aLast` indicates that the end of the stream is reached
1097  * when all the characters in `aSrc` have been consumed. This argument is needed
1098  * for ISO-2022-JP and is ignored for other encodings.
1099  *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1101  *
1102  * During the processing of a single stream, the caller must call `Encode*`
1103  * zero or more times with `aLast` set to `false` and then call `Encode*` at
1104  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1105  * the processing of the stream has ended. Otherwise, the caller must call
1106  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1107  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1108  *
1109  * Once the stream has ended, the `Encoder` object must not be used anymore.
1110  * That is, you need to create another one to process another stream.
1111  *
1112  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1113  * result and the caller does not wish to treat it as a fatal error, the input
1114  * buffer `aSrc` may not have been completely consumed. In that case, the caller
1115  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1116  * call.
1117  *
1118  * # Infinite loops
1119  *
1120  * When converting with a fixed-size output buffer whose size is too small to
1121  * accommodate one character of output, an infinite loop ensues. When
1122  * converting with a fixed-size output buffer, it generally makes sense to
1123  * make the buffer fairly large (e.g. couple of kilobytes).
1124  */
1125 class Encoder final {
1126  public:
1127   ~Encoder() {}
1128
1129   static void operator delete(void* aEncoder) {
1130     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1131   }
1132
1133   /**
1134    * The `Encoding` this `Encoder` is for.
1135    */
1136   inline NotNull<const mozilla::Encoding*> Encoding() const {
1137     return WrapNotNull(encoder_encoding(this));
1138   }
1139
1140   /**
1141    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1142    * ASCII state and `false` otherwise.
1143    */
1144   inline bool HasPendingState() const {
1145     return encoder_has_pending_state(this);
1146   }
1147
1148   /**
1149    * Query the worst-case output size when encoding from UTF-8 with
1150    * replacement.
1151    *
1152    * Returns the size of the output buffer in bytes that will not overflow
1153    * given the current state of the encoder and `aByteLength` number of
1154    * additional input code units if there are no unmappable characters in
1155    * the input.
1156    */
1157   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1158       size_t aByteLength) const {
1159     CheckedInt<size_t> max(
1160         encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1161                                                               aByteLength));
1162     if (max.value() == MaxValue<size_t>::value) {
1163       // Mark invalid by overflowing
1164       max++;
1165       MOZ_ASSERT(!max.isValid());
1166     }
1167     return max;
1168   }
1169
1170   /**
1171    * Query the worst-case output size when encoding from UTF-8 without
1172    * replacement.
1173    *
1174    * Returns the size of the output buffer in bytes that will not overflow
1175    * given the current state of the encoder and `aByteLength` number of
1176    * additional input code units.
1177    */
1178   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1179       size_t aByteLength) const {
1180     CheckedInt<size_t> max(
1181         encoder_max_buffer_length_from_utf8_without_replacement(this,
1182                                                                 aByteLength));
1183     if (max.value() == MaxValue<size_t>::value) {
1184       // Mark invalid by overflowing
1185       max++;
1186       MOZ_ASSERT(!max.isValid());
1187     }
1188     return max;
1189   }
1190
1191   /**
1192    * Incrementally encode into byte stream from UTF-8 with unmappable
1193    * characters replaced with HTML (decimal) numeric character references.
1194    *
1195    * See the documentation of the class for documentation for `Encode*`
1196    * methods collectively.
1197    *
1198    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1199    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1200    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1201    */
1202   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1203       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1204     size_t srcRead = aSrc.Length();
1205     size_t dstWritten = aDst.Length();
1206     bool hadReplacements;
1207     uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1208                                                aDst.Elements(), &dstWritten,
1209                                                aLast, &hadReplacements);
1210     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1211   }
1212
1213   /**
1214    * Incrementally encode into byte stream from UTF-8 _without replacement_.
1215    *
1216    * See the documentation of the class for documentation for `Encode*`
1217    * methods collectively.
1218    *
1219    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1220    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1221    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1222    */
1223   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1224       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1225     size_t srcRead = aSrc.Length();
1226     size_t dstWritten = aDst.Length();
1227     uint32_t result = encoder_encode_from_utf8_without_replacement(
1228         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1229     return MakeTuple(result, srcRead, dstWritten);
1230   }
1231
1232   /**
1233    * Query the worst-case output size when encoding from UTF-16 with
1234    * replacement.
1235    *
1236    * Returns the size of the output buffer in bytes that will not overflow
1237    * given the current state of the encoder and `aU16Length` number of
1238    * additional input code units if there are no unmappable characters in
1239    * the input.
1240    */
1241   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1242       size_t aU16Length) const {
1243     CheckedInt<size_t> max(
1244         encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1245                                                                aU16Length));
1246     if (max.value() == MaxValue<size_t>::value) {
1247       // Mark invalid by overflowing
1248       max++;
1249       MOZ_ASSERT(!max.isValid());
1250     }
1251     return max;
1252   }
1253
1254   /**
1255    * Query the worst-case output size when encoding from UTF-16 without
1256    * replacement.
1257    *
1258    * Returns the size of the output buffer in bytes that will not overflow
1259    * given the current state of the encoder and `aU16Length` number of
1260    * additional input code units.
1261    */
1262   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1263       size_t aU16Length) const {
1264     CheckedInt<size_t> max(
1265         encoder_max_buffer_length_from_utf16_without_replacement(this,
1266                                                                  aU16Length));
1267     if (max.value() == MaxValue<size_t>::value) {
1268       // Mark invalid by overflowing
1269       max++;
1270       MOZ_ASSERT(!max.isValid());
1271     }
1272     return max;
1273   }
1274
1275   /**
1276    * Incrementally encode into byte stream from UTF-16 with unmappable
1277    * characters replaced with HTML (decimal) numeric character references.
1278    *
1279    * See the documentation of the class for documentation for `Encode*`
1280    * methods collectively.
1281    */
1282   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1283       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1284     size_t srcRead = aSrc.Length();
1285     size_t dstWritten = aDst.Length();
1286     bool hadReplacements;
1287     uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1288                                                 aDst.Elements(), &dstWritten,
1289                                                 aLast, &hadReplacements);
1290     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1291   }
1292
1293   /**
1294    * Incrementally encode into byte stream from UTF-16 _without replacement_.
1295    *
1296    * See the documentation of the class for documentation for `Encode*`
1297    * methods collectively.
1298    */
1299   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1300       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1301     size_t srcRead = aSrc.Length();
1302     size_t dstWritten = aDst.Length();
1303     uint32_t result = encoder_encode_from_utf16_without_replacement(
1304         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1305     return MakeTuple(result, srcRead, dstWritten);
1306   }
1307
1308  private:
1309   Encoder() = delete;
1310   Encoder(const Encoder&) = delete;
1311   Encoder& operator=(const Encoder&) = delete;
1312 };
1313
1314 };  // namespace mozilla
1315
1316 #endif  // mozilla_Encoding_h