intl/Encoding.h

   1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
   2 // file at the top-level directory of this distribution.
   3 //
   4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   7 // option. This file may not be copied, modified, or distributed
   8 // except according to those terms.
   9
  10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
  11 // "top-level directory" in the above notice refers to
  12 // third_party/rust/encoding_c/.
  13
  14 #ifndef mozilla_Encoding_h
  15 #define mozilla_Encoding_h
  16
  17 #include "mozilla/CheckedInt.h"
  18 #include "mozilla/NotNull.h"
  19 #include "mozilla/Span.h"
  20 #include "mozilla/Tuple.h"
  21 #include "nsString.h"
  22
  23 namespace mozilla {
  24 class Encoding;
  25 class Decoder;
  26 class Encoder;
  27 }; // namespace mozilla
   28 // Type mappings, presumably consumed by encoding_rs.h (included below) - confirm.
   29 #define ENCODING_RS_ENCODING mozilla::Encoding
   30 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*>
   31 #define ENCODING_RS_ENCODER mozilla::Encoder
   32 #define ENCODING_RS_DECODER mozilla::Decoder
  33
  34 #include "encoding_rs.h"
  35
   36 extern "C" {
   37 // Backs Encoding::Decode(Span, nsAString&); `*encoding` is read back afterwards.
   38 nsresult
   39 mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
   40                                     uint8_t const* src,
   41                                     size_t src_len,
   42                                     nsAString* dst);
   43 // Backs Encoding::DecodeWithBOMRemoval(Span, nsAString&).
   44 nsresult
   45 mozilla_encoding_decode_to_nsstring_with_bom_removal(
   46   mozilla::Encoding const* encoding,
   47   uint8_t const* src,
   48   size_t src_len,
   49   nsAString* dst);
   50 // Backs Encoding::DecodeWithoutBOMHandling(Span, nsAString&).
   51 nsresult
   52 mozilla_encoding_decode_to_nsstring_without_bom_handling(
   53   mozilla::Encoding const* encoding,
   54   uint8_t const* src,
   55   size_t src_len,
   56   nsAString* dst);
   57 // Backs Encoding::DecodeWithoutBOMHandlingAndWithoutReplacement(Span, nsAString&).
   58 nsresult
   59 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
   60   mozilla::Encoding const* encoding,
   61   uint8_t const* src,
   62   size_t src_len,
   63   nsAString* dst);
   64 // Backs Encoding::Encode(Span<const char16_t>, nsACString&); `*encoding` is read back.
   65 nsresult
   66 mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
   67                                    char16_t const* src,
   68                                    size_t src_len,
   69                                    nsACString* dst);
   70 // Backs Encoding::Decode(const nsACString&, nsACString&); `*encoding` is read back.
   71 nsresult
   72 mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding,
   73                                      nsACString const* src,
   74                                      nsACString* dst);
   75 // Backs Encoding::DecodeWithBOMRemoval(const nsACString&, nsACString&).
   76 nsresult
   77 mozilla_encoding_decode_to_nscstring_with_bom_removal(
   78   mozilla::Encoding const* encoding,
   79   nsACString const* src,
   80   nsACString* dst);
   81 // Backs Encoding::DecodeWithoutBOMHandling(const nsACString&, nsACString&).
   82 nsresult
   83 mozilla_encoding_decode_to_nscstring_without_bom_handling(
   84   mozilla::Encoding const* encoding,
   85   nsACString const* src,
   86   nsACString* dst);
   87 // Backs Encoding::DecodeWithoutBOMHandling(Span, nsACString&, size_t).
   88 nsresult
   89 mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
   90   mozilla::Encoding const* encoding,
   91   uint8_t const* src,
   92   size_t src_len,
   93   nsACString* dst,
   94   size_t already_validated);
   95 // Backs the nsACString overload of DecodeWithoutBOMHandlingAndWithoutReplacement().
   96 nsresult
   97 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
   98   mozilla::Encoding const* encoding,
   99   nsACString const* src,
  100   nsACString* dst);
  101 // Backs Encoding::Encode(const nsACString&, nsACString&); `*encoding` is read back.
  102 nsresult
  103 mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding,
  104                                        nsACString const* src,
  105                                        nsACString* dst);
  106
  107 } // extern "C"
 108
 109 namespace mozilla {
 110
 111 /**
 112  * Return value from `Decoder`/`Encoder` to indicate that input
 113  * was exhausted.
 114  */
  115 const uint32_t kInputEmpty = INPUT_EMPTY; // INPUT_EMPTY is defined outside this view
 116
 117 /**
 118  * Return value from `Decoder`/`Encoder` to indicate that output
 119  * space was insufficient.
 120  */
  121 const uint32_t kOutputFull = OUTPUT_FULL; // OUTPUT_FULL is defined outside this view
 122
 123 /**
 124  * An encoding as defined in the Encoding Standard
 125  * (https://encoding.spec.whatwg.org/).
 126  *
 127  * See https://docs.rs/encoding_rs/ for the Rust API docs.
 128  *
 129  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
 130  * sequence and, in most cases, vice versa. Each encoding has a name, an output
 131  * encoding, and one or more labels.
 132  *
 133  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
 134  * encoding in formats and protocols. The _name_ of the encoding is the
 135  * preferred label in the case appropriate for returning from the
 136  * `characterSet` property of the `Document` DOM interface, except for
 137  * the replacement encoding whose name is not one of its labels.
 138  *
 139  * The _output encoding_ is the encoding used for form submission and URL
 140  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 141  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 142  * encodings.
 143  *
 144  * # Streaming vs. Non-Streaming
 145  *
 146  * When you have the entire input in a single buffer, you can use the
 147  * methods `Decode()`, `DecodeWithBOMRemoval()`,
 148  * `DecodeWithoutBOMHandling()`,
 149  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 150  * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
  151  * `NewEncoder()` methods), these methods perform heap allocations. You should
  152  * use the `Decoder` and `Encoder` objects when your input is split into multiple
 153  * buffers or when you want to control the allocation of the output buffers.
 154  *
 155  * # Instances
 156  *
 157  * All instances of `Encoding` are statically allocated and have the process's
 158  * lifetime. There is precisely one unique `Encoding` instance for each
 159  * encoding defined in the Encoding Standard.
 160  *
 161  * To obtain a reference to a particular encoding whose identity you know at
  162  * compile time, use a `static` that refers to the encoding. There is a `static`
 163  * for each encoding. The `static`s are named in all caps with hyphens
 164  * replaced with underscores and with `_ENCODING` appended to the
 165  * name. For example, if you know at compile time that you will want to
 166  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
 167  *
 168  * If you don't know what encoding you need at compile time and need to
  169  * dynamically get an encoding by label, use `Encoding::ForLabel()`.
 170  *
 171  * Pointers to `Encoding` can be compared with `==` to check for the sameness
 172  * of two encodings.
 173  *
 174  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
 175  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
  176  * `const mozilla::Encoding*` in the C signature and
  177  * `*const encoding_rs::Encoding` in the corresponding Rust signature.
 178  */
 179 class Encoding final
 180 {
 181 public:
 182   /**
 183    * Implements the _get an encoding_ algorithm
 184    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
 185    *
 186    * If, after ASCII-lowercasing and removing leading and trailing
 187    * whitespace, the argument matches a label defined in the Encoding
 188    * Standard, `const Encoding*` representing the corresponding
 189    * encoding is returned. If there is no match, `nullptr` is returned.
 190    *
 191    * This is the right method to use if the action upon the method returning
 192    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
 193    * instead. When the action upon the method returning `nullptr` is not to
 194    * proceed with a fallback but to refuse processing,
 195    * `ForLabelNoReplacement()` is more appropriate.
 196   */
 197   static inline const Encoding* ForLabel(Span<const char> aLabel)
 198   {
 199     return encoding_for_label(
 200       reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 201   }
 202
 203   /**
 204    * `nsAString` argument version. See above for docs.
 205    */
  206   static inline const Encoding* ForLabel(const nsAString& aLabel)
  207   { // Converts the label with NS_ConvertUTF16toUTF8, then calls ForLabel() again.
  208     return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
  209   }
 210
 211   /**
 212    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
 213    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
 214    *
 215    * This method is useful in scenarios where a fatal error is required
 216    * upon invalid label, because in those cases the caller typically wishes
 217    * to treat the labels that map to the replacement encoding as fatal
 218    * errors, too.
 219    *
 220    * It is not OK to use this method when the action upon the method returning
 221    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
 222    * such a case, the `ForLabel()` method should be used instead in order to avoid
 223    * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`.
 224    */
 225   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel)
 226   {
 227     return encoding_for_label_no_replacement(
 228       reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 229   }
 230
 231   /**
 232    * `nsAString` argument version. See above for docs.
 233    */
  234   static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel)
  235   { // Converts the label with NS_ConvertUTF16toUTF8, then re-dispatches.
  236     return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
  237   }
 238
 239   /**
 240    * Performs non-incremental BOM sniffing.
 241    *
 242    * The argument must either be a buffer representing the entire input
 243    * stream (non-streaming case) or a buffer representing at least the first
 244    * three bytes of the input stream (streaming case).
 245    *
  246    * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
  247    * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
  248    * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
 249    */
 250   static inline Tuple<const Encoding*, size_t> ForBOM(
 251     Span<const uint8_t> aBuffer)
 252   {
 253     size_t len = aBuffer.Length();
 254     const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
 255     return MakeTuple(encoding, len);
 256   }
 257
 258   /**
 259    * Writes the name of this encoding into `aName`.
 260    *
 261    * This name is appropriate to return as-is from the DOM
 262    * `document.characterSet` property.
 263    */
 264   inline void Name(nsACString& aName) const
 265   {
 266     aName.SetLength(ENCODING_NAME_MAX_LENGTH);
 267     size_t length =
 268       encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
 269     aName.SetLength(length); // truncation is the 64-bit case is OK
 270   }
 271
 272   /**
 273    * Checks whether the _output encoding_ of this encoding can encode every
 274    * Unicode code point. (Only true if the output encoding is UTF-8.)
 275    */
  276   inline bool CanEncodeEverything() const
  277   { // Pure forwarder: `this` is handed over as the encoding pointer.
  278     return encoding_can_encode_everything(this);
  279   }
 280
 281   /**
 282    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
 283    * U+0000...U+007F and vice versa.
 284    */
  285   inline bool IsAsciiCompatible() const
  286   { // Pure forwarder: `this` is handed over as the encoding pointer.
  287     return encoding_is_ascii_compatible(this);
  288   }
 289
 290   /**
 291    * Returns the _output encoding_ of this encoding. This is UTF-8 for
 292    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
 293    */
  294   inline NotNull<const mozilla::Encoding*> OutputEncoding() const
  295   { // The looked-up pointer is wrapped as NotNull to match the return type.
  296     return WrapNotNull(encoding_output_encoding(this));
  297   }
 298
 299   /**
 300    * Decode complete input to `nsACString` _with BOM sniffing_ and with
 301    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 302    * entire input is available as a single buffer (i.e. the end of the
 303    * buffer marks the end of the stream).
 304    *
 305    * This method implements the (non-streaming version of) the
 306    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 307    *
 308    * The second item in the returned tuple is the encoding that was actually
 309    * used (which may differ from this encoding thanks to BOM sniffing).
 310    *
 311    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 312    * if there were malformed sequences (that were replaced with the
 313    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 314    * tuple.
 315    *
 316    * The backing buffer of the string isn't copied if the input buffer
 317    * is heap-allocated and decoding from UTF-8 and the input is valid
 318    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 319    * the input is valid ASCII or decoding from ISO-2022-JP and the
 320    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 321    * the same string as both arguments.
 322    *
 323    * _Note:_ It is wrong to use this when the input buffer represents only
 324    * a segment of the input instead of the whole input. Use `NewDecoder()`
 325    * when decoding segmented input.
 326    */
 327   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 328     const nsACString& aBytes,
 329     nsACString& aOut) const
 330   {
 331     const Encoding* encoding = this;
 332     const nsACString* bytes = &aBytes;
 333     nsACString* out = &aOut;
 334     nsresult rv;
 335     if (bytes == out) {
 336       nsAutoCString temp(aBytes);
 337       rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
 338     } else {
 339       rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
 340     }
 341     return MakeTuple(rv, WrapNotNull(encoding));
 342   }
 343
 344   /**
 345    * Decode complete input to `nsAString` _with BOM sniffing_ and with
 346    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 347    * entire input is available as a single buffer (i.e. the end of the
 348    * buffer marks the end of the stream).
 349    *
 350    * This method implements the (non-streaming version of) the
 351    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 352    *
 353    * The second item in the returned tuple is the encoding that was actually
 354    * used (which may differ from this encoding thanks to BOM sniffing).
 355    *
 356    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 357    * if there were malformed sequences (that were replaced with the
 358    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 359    * tuple.
 360    *
 361    * _Note:_ It is wrong to use this when the input buffer represents only
 362    * a segment of the input instead of the whole input. Use `NewDecoder()`
 363    * when decoding segmented input.
 364    */
 365   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 366     Span<const uint8_t> aBytes,
 367     nsAString& aOut) const
 368   {
 369     const Encoding* encoding = this;
 370     nsresult rv = mozilla_encoding_decode_to_nsstring(
 371       &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
 372     return MakeTuple(rv, WrapNotNull(encoding));
 373   }
 374
 375   /**
 376    * Decode complete input to `nsACString` _with BOM removal_ and with
 377    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 378    * entire input is available as a single buffer (i.e. the end of the
 379    * buffer marks the end of the stream).
 380    *
 381    * When invoked on `UTF_8`, this method implements the (non-streaming
 382    * version of) the _UTF-8 decode_
 383    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 384    *
 385    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 386    * if there were malformed sequences (that were replaced with the
 387    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 388    *
 389    * The backing buffer of the string isn't copied if the input buffer
 390    * is heap-allocated and decoding from UTF-8 and the input is valid
 391    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 392    * the input is valid ASCII or decoding from ISO-2022-JP and the
 393    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 394    * the same string as both arguments.
 395    *
 396    * _Note:_ It is wrong to use this when the input buffer represents only
 397    * a segment of the input instead of the whole input. Use
 398    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 399    */
 400   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
 401                                        nsACString& aOut) const
 402   {
 403     const nsACString* bytes = &aBytes;
 404     nsACString* out = &aOut;
 405     if (bytes == out) {
 406       nsAutoCString temp(aBytes);
 407       return mozilla_encoding_decode_to_nscstring_with_bom_removal(
 408         this, &temp, out);
 409     }
 410     return mozilla_encoding_decode_to_nscstring_with_bom_removal(
 411       this, bytes, out);
 412   }
 413
 414   /**
 415    * Decode complete input to `nsAString` _with BOM removal_ and with
 416    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 417    * entire input is available as a single buffer (i.e. the end of the
 418    * buffer marks the end of the stream).
 419    *
 420    * When invoked on `UTF_8`, this method implements the (non-streaming
 421    * version of) the _UTF-8 decode_
 422    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 423    *
 424    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 425    * if there were malformed sequences (that were replaced with the
 426    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 427    *
 428    * _Note:_ It is wrong to use this when the input buffer represents only
 429    * a segment of the input instead of the whole input. Use
 430    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 431    */
  432   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
  433                                        nsAString& aOut) const
  434   { // The Span is unpacked into the (pointer, length) pair the glue call takes.
  435     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
  436       this, aBytes.Elements(), aBytes.Length(), &aOut);
  437   }
 438
 439   /**
 440    * Decode complete input to `nsACString` _without BOM handling_ and
 441    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 442    * the entire input is available as a single buffer (i.e. the end of the
 443    * buffer marks the end of the stream).
 444    *
 445    * When invoked on `UTF_8`, this method implements the (non-streaming
 446    * version of) the _UTF-8 decode without BOM_
 447    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 448    *
 449    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 450    * if there were malformed sequences (that were replaced with the
 451    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 452    *
 453    * The backing buffer of the string isn't copied if the input buffer
 454    * is heap-allocated and decoding from UTF-8 and the input is valid
 455    * UTF-8, decoding from an ASCII-compatible encoding and the input
 456    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 457    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 458    * as both arguments.
 459    *
 460    * _Note:_ It is wrong to use this when the input buffer represents only
 461    * a segment of the input instead of the whole input. Use
 462    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 463    */
 464   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
 465                                            nsACString& aOut) const
 466   {
 467     const nsACString* bytes = &aBytes;
 468     nsACString* out = &aOut;
 469     if (bytes == out) {
 470       nsAutoCString temp(aBytes);
 471       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 472         this, &temp, out);
 473     }
 474     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 475       this, bytes, out);
 476   }
 477
 478   /**
 479    * Decode complete input to `nsAString` _without BOM handling_ and
 480    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 481    * the entire input is available as a single buffer (i.e. the end of the
 482    * buffer marks the end of the stream).
 483    *
 484    * When invoked on `UTF_8`, this method implements the (non-streaming
 485    * version of) the _UTF-8 decode without BOM_
 486    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 487    *
 488    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 489    * if there were malformed sequences (that were replaced with the
 490    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 491    *
 492    * _Note:_ It is wrong to use this when the input buffer represents only
 493    * a segment of the input instead of the whole input. Use
 494    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 495    */
  496   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
  497                                            nsAString& aOut) const
  498   { // The Span is unpacked into the (pointer, length) pair the glue call takes.
  499     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
  500       this, aBytes.Elements(), aBytes.Length(), &aOut);
  501   }
 502
 503   /**
 504    * Decode complete input to `nsACString` _without BOM handling_ and
 505    * _with malformed sequences treated as fatal_ when the entire input is
 506    * available as a single buffer (i.e. the end of the buffer marks the end
 507    * of the stream).
 508    *
 509    * When invoked on `UTF_8`, this method implements the (non-streaming
 510    * version of) the _UTF-8 decode without BOM or fail_
 511    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 512    * spec concept.
 513    *
 514    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 515    * if a malformed sequence was encountered and `NS_OK` otherwise.
 516    *
 517    * The backing buffer of the string isn't copied if the input buffer
 518    * is heap-allocated and decoding from UTF-8 and the input is valid
 519    * UTF-8, decoding from an ASCII-compatible encoding and the input
 520    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 521    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 522    * as both arguments.
 523    *
 524    * _Note:_ It is wrong to use this when the input buffer represents only
 525    * a segment of the input instead of the whole input. Use
 526    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 527    */
 528   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 529     const nsACString& aBytes,
 530     nsACString& aOut) const
 531   {
 532     const nsACString* bytes = &aBytes;
 533     nsACString* out = &aOut;
 534     if (bytes == out) {
 535       nsAutoCString temp(aBytes);
 536       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 537         this, &temp, out);
 538     }
 539     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 540       this, bytes, out);
 541   }
 542
 543   /**
 544    * Decode complete input to `nsACString` _without BOM handling_ and
 545    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 546    * the entire input is available as a single buffer (i.e. the end of the
 547    * buffer marks the end of the stream) _asserting that a number of bytes
 548    * from the start are already known to be valid UTF-8_.
 549    *
 550    * The use case for this method is avoiding copying when dealing with
 551    * input that has a UTF-8 BOM. _When in doubt, do not use this method._
 552    *
 553    * When invoked on `UTF_8`, this method implements the (non-streaming
 554    * version of) the _UTF-8 decode without BOM_
 555    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 556    *
 557    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 558    * if there were malformed sequences (that were replaced with the
 559    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 560    *
 561    * _Note:_ It is wrong to use this when the input buffer represents only
 562    * a segment of the input instead of the whole input. Use
 563    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 564    *
 565    * # Safety
 566    *
 567    * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
 568    * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
 569    */
  570   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
  571                                            nsACString& aOut,
  572                                            size_t aAlreadyValidated) const
  573   { // aAlreadyValidated is forwarded as-is; this wrapper does not check it.
  574     return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
  575       this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
  576   }
 577
 578   /**
 579    * Decode complete input to `nsAString` _without BOM handling_ and
 580    * _with malformed sequences treated as fatal_ when the entire input is
 581    * available as a single buffer (i.e. the end of the buffer marks the end
 582    * of the stream).
 583    *
 584    * When invoked on `UTF_8`, this method implements the (non-streaming
 585    * version of) the _UTF-8 decode without BOM or fail_
 586    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 587    * spec concept.
 588    *
 589    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 590    * if a malformed sequence was encountered and `NS_OK` otherwise.
 591    *
 592    * _Note:_ It is wrong to use this when the input buffer represents only
 593    * a segment of the input instead of the whole input. Use
 594    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 595    */
  596   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
  597     Span<const uint8_t> aBytes,
  598     nsAString& aOut) const
  599   { // The Span is unpacked into the (pointer, length) pair the glue call takes.
  600     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
  601       this, aBytes.Elements(), aBytes.Length(), &aOut);
  602   }
 603
 604   /**
 605    * Encode complete input to `nsACString` with unmappable characters
 606    * replaced with decimal numeric character references when the entire input
 607    * is available as a single buffer (i.e. the end of the buffer marks the
 608    * end of the stream).
 609    *
 610    * This method implements the (non-streaming version of) the
 611    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 612    *
 613    * The second item in the returned tuple is the encoding that was actually
 614    * used (which may differ from this encoding thanks to some encodings
 615    * having UTF-8 as their output encoding).
 616    *
 617    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
 618    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
 619    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
 620    * replaced with numeric character references) and `NS_OK` otherwise.
 621    *
 622    * The backing buffer of the string isn't copied if the input buffer
 623    * is heap-allocated and encoding to UTF-8 and the input is valid
 624    * UTF-8, encoding to an ASCII-compatible encoding and the input
 625    * is valid ASCII or encoding from ISO-2022-JP and the input stays
 626    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 627    * as both arguments.
 628    *
 629    * _Note:_ It is wrong to use this when the input buffer represents only
 630    * a segment of the input instead of the whole input. Use `NewEncoder()`
 631    * when encoding segmented output.
 632    */
 633   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 634     const nsACString& aString,
 635     nsACString& aOut) const
 636   {
 637     const Encoding* encoding = this;
 638     const nsACString* string = &aString;
 639     nsACString* out = &aOut;
 640     nsresult rv;
 641     if (string == out) {
 642       nsAutoCString temp(aString);
 643       rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
 644     } else {
 645       rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
 646     }
 647     return MakeTuple(rv, WrapNotNull(encoding));
 648   }
 649
 650   /**
 651    * Encode complete input to `nsACString` with unmappable characters
 652    * replaced with decimal numeric character references when the entire input
 653    * is available as a single buffer (i.e. the end of the buffer marks the
 654    * end of the stream).
 655    *
 656    * This method implements the (non-streaming version of) the
 657    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 658    *
 659    * The second item in the returned tuple is the encoding that was actually
 660    * used (which may differ from this encoding thanks to some encodings
 661    * having UTF-8 as their output encoding).
 662    *
 663    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
 664    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
 665    * were replaced with numeric character references) and `NS_OK` otherwise.
  666    *
 667    * _Note:_ It is wrong to use this when the input buffer represents only
 668    * a segment of the input instead of the whole input. Use `NewEncoder()`
 669    * when encoding segmented output.
 670    */
 671   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 672     Span<const char16_t> aString,
 673     nsACString& aOut) const
 674   {
 675     const Encoding* encoding = this;
 676     nsresult rv = mozilla_encoding_encode_from_utf16(
 677       &encoding, aString.Elements(), aString.Length(), &aOut);
 678     return MakeTuple(rv, WrapNotNull(encoding));
 679   }
 680
 681   /**
 682    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
 683    *
 684    * BOM sniffing may cause the returned decoder to morph into a decoder
 685    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 686    */
 687   inline UniquePtr<Decoder> NewDecoder() const
 688   {
 689     UniquePtr<Decoder> decoder(encoding_new_decoder(this));
 690     return decoder;
 691   }
 692
 693   /**
 694    * Instantiates a new decoder for this encoding with BOM sniffing enabled
 695    * into memory occupied by a previously-instantiated decoder.
 696    *
 697    * BOM sniffing may cause the returned decoder to morph into a decoder
 698    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 699    */
  700   inline void NewDecoderInto(Decoder& aDecoder) const
  701   { // Only the address of aDecoder is passed on; nothing is returned here.
  702     encoding_new_decoder_into(this, &aDecoder);
  703   }
 704
 705   /**
 706    * Instantiates a new decoder for this encoding with BOM removal.
 707    *
 708    * If the input starts with bytes that are the BOM for this encoding,
 709    * those bytes are removed. However, the decoder never morphs into a
 710    * decoder for another encoding: A BOM for another encoding is treated as
 711    * (potentially malformed) input to the decoding algorithm for this
 712    * encoding.
 713    */
 714   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const
 715   {
 716     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
 717     return decoder;
 718   }
 719
 720   /**
 721    * Instantiates a new decoder for this encoding with BOM removal
 722    * into memory occupied by a previously-instantiated decoder.
 723    *
 724    * If the input starts with bytes that are the BOM for this encoding,
 725    * those bytes are removed. However, the decoder never morphs into a
 726    * decoder for another encoding: A BOM for another encoding is treated as
 727    * (potentially malformed) input to the decoding algorithm for this
 728    * encoding.
 729    */
 730   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const
 731   {
 732     encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
 733   }
 734
 735   /**
 736    * Instantiates a new decoder for this encoding with BOM handling disabled.
 737    *
 738    * If the input starts with bytes that look like a BOM, those bytes are
 739    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 740    * for another encoding.)
 741    *
 742    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 743    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
 744    * instead of this method to cause the BOM to be removed.
 745    */
 746   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const
 747   {
 748     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
 749     return decoder;
 750   }
 751
 752   /**
 753    * Instantiates a new decoder for this encoding with BOM handling disabled
 754    * into memory occupied by a previously-instantiated decoder.
 755    *
 756    * If the input starts with bytes that look like a BOM, those bytes are
 757    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 758    * for another encoding.)
 759    *
 760    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 761    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
 762    * instead of this method to cause the BOM to be removed.
 763    */
 764   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const
 765   {
 766     encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
 767   }
 768
 769   /**
 770    * Instantiates a new encoder for the output encoding of this encoding.
 771    */
 772   inline UniquePtr<Encoder> NewEncoder() const
 773   {
 774     UniquePtr<Encoder> encoder(encoding_new_encoder(this));
 775     return encoder;
 776   }
 777
 778   /**
 779    * Instantiates a new encoder for the output encoding of this encoding
 780    * into memory occupied by a previously-instantiated encoder.
 781    */
 782   inline void NewEncoderInto(Encoder& aEncoder) const
 783   {
 784     encoding_new_encoder_into(this, &aEncoder);
 785   }
 786
 787   /**
 788    * Validates UTF-8.
 789    *
 790    * Returns the index of the first byte that makes the input malformed as
 791    * UTF-8 or the length of the input if the input is entirely valid.
 792    */
 793   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer)
 794   {
 795     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 796   }
 797
 798   /**
 799    * Validates ASCII.
 800    *
 801    * Returns the index of the first byte that makes the input malformed as
 802    * ASCII or the length of the input if the input is entirely valid.
 803    */
 804   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer)
 805   {
 806     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 807   }
 808
 809   /**
 810    * Validates ISO-2022-JP ASCII-state data.
 811    *
 812    * Returns the index of the first byte that makes the input not
 813    * representable in the ASCII state of ISO-2022-JP or the length of the
 814    * input if the input is entirely representable in the ASCII state of
 815    * ISO-2022-JP.
 816    */
 817   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)
 818   {
 819     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
 820                                                   aBuffer.Length());
 821   }
 822
 823 private:
 824   Encoding() = delete;
 825   Encoding(const Encoding&) = delete;
 826   Encoding& operator=(const Encoding&) = delete;
 827   ~Encoding() = delete;
 828
 829 };
 830
 831 /**
 832  * A converter that decodes a byte stream into Unicode according to a
 833  * character encoding in a streaming (incremental) manner.
 834  *
 835  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
 836  * buffer `aDst` both of which are caller-allocated. There are variants for
 837  * both UTF-8 and UTF-16 output buffers.
 838  *
 839  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
 840  * into `aDst` until one of the following three things happens:
 841  *
 842  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
 843  *    variants only).
 844  *
 845  * 2. The output buffer has been filled so near capacity that the decoder
 846  *    cannot be sure that processing an additional byte of input wouldn't
 847  *    cause so much output that the output buffer would overflow.
 848  *
 849  * 3. All the input bytes have been processed.
 850  *
 * The `Decode*` method then returns a tuple of a status indicating which one
 852  * of the three reasons to return happened, how many input bytes were read,
 853  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 854  * when decoding to UTF-16) were written, and in the case of the
 855  * variants performing replacement, a boolean indicating whether an error was
 856  * replaced with the REPLACEMENT CHARACTER during the call.
 857  *
 858  * The number of bytes "written" is what's logically written. Garbage may be
 859  * written in the output buffer beyond the point logically written to.
 860  *
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 865  *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 872  *
 873  * In the case of methods whose name does not end with
 874  * `*WithoutReplacement`, malformed sequences are automatically replaced
 875  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 876  * return early.
 877  *
 878  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 879  * space. When decoding to UTF-16, the output buffer must have at least two
 880  * UTF-16 code units (`char16_t`) of space.
 881  *
 882  * When decoding to UTF-8 without replacement, the methods are guaranteed
 883  * not to return indicating that more output space is needed if the length
 884  * of the output buffer is at least the length returned by
 885  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 886  * with replacement, the length of the output buffer that guarantees the
 887  * methods not to return indicating that more output space is needed is given
 888  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 889  * or without replacement, the length of the output buffer that guarantees
 890  * the methods not to return indicating that more output space is needed is
 891  * given by `MaxUTF16BufferLength()`.
 892  *
 893  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 894  * and the output after each `Decode*` call is guaranteed to consist of
 895  * complete characters. (I.e. the code unit sequence for the last character is
 896  * guaranteed not to be split across output buffers.)
 897  *
 898  * The boolean argument `aLast` indicates that the end of the stream is reached
 899  * when all the bytes in `aSrc` have been consumed.
 900  *
 901  * A `Decoder` object can be used to incrementally decode a byte stream.
 902  *
 903  * During the processing of a single stream, the caller must call `Decode*`
 904  * zero or more times with `aLast` set to `false` and then call `Decode*` at
 905  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
 906  * the processing of the stream has ended. Otherwise, the caller must call
 907  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
 908  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 909  *
 910  * Once the stream has ended, the `Decoder` object must not be used anymore.
 911  * That is, you need to create another one to process another stream.
 912  *
 913  * When the decoder returns `kOutputFull` or the decoder returns a malformed
 914  * result and the caller does not wish to treat it as a fatal error, the input
 915  * buffer `aSrc` may not have been completely consumed. In that case, the caller
 916  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
 917  * call.
 918  *
 919  * # Infinite loops
 920  *
 921  * When converting with a fixed-size output buffer whose size is too small to
 922  * accommodate one character of output, an infinite loop ensues. When
 923  * converting with a fixed-size output buffer, it generally makes sense to
 924  * make the buffer fairly large (e.g. couple of kilobytes).
 925  */
 926 class Decoder final
 927 {
 928 public:
 929   ~Decoder() {}
 930   static void operator delete(void* aDecoder)
 931   {
 932     decoder_free(reinterpret_cast<Decoder*>(aDecoder));
 933   }
 934
 935   /**
 936    * The `Encoding` this `Decoder` is for.
 937    *
 938    * BOM sniffing can change the return value of this method during the life
 939    * of the decoder.
 940    */
 941   inline NotNull<const mozilla::Encoding*> Encoding() const
 942   {
 943     return WrapNotNull(decoder_encoding(this));
 944   }
 945
 946   /**
 947    * Query the worst-case UTF-8 output size _with replacement_.
 948    *
 949    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 950    * that will not overflow given the current state of the decoder and
 951    * `aByteLength` number of additional input bytes when decoding with
 952    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
 953    * sequence.
 954    */
 955   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const
 956   {
 957     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
 958     if (max.value() == MaxValue<size_t>::value) {
 959       // Mark invalid by overflowing
 960       max++;
 961       MOZ_ASSERT(!max.isValid());
 962     }
 963     return max;
 964   }
 965
 966   /**
 967    * Query the worst-case UTF-8 output size _without replacement_.
 968    *
 969    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 970    * that will not overflow given the current state of the decoder and
 971    * `aByteLength` number of additional input bytes when decoding without
 972    * replacement error handling.
 973    *
 974    * Note that this value may be too small for the `WithReplacement` case.
 975    * Use `MaxUTF8BufferLength()` for that case.
 976    */
 977   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
 978     size_t aByteLength) const
 979   {
 980     CheckedInt<size_t> max(
 981       decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
 982     if (max.value() == MaxValue<size_t>::value) {
 983       // Mark invalid by overflowing
 984       max++;
 985       MOZ_ASSERT(!max.isValid());
 986     }
 987     return max;
 988   }
 989
 990   /**
 991    * Incrementally decode a byte stream into UTF-8 with malformed sequences
 992    * replaced with the REPLACEMENT CHARACTER.
 993    *
 994    * See the documentation of the class for documentation for `Decode*`
 995    * methods collectively.
 996    */
 997   inline Tuple<uint32_t, size_t, size_t, bool>
 998   DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
 999   {
1000     size_t srcRead = aSrc.Length();
1001     size_t dstWritten = aDst.Length();
1002     bool hadReplacements;
1003     uint32_t result = decoder_decode_to_utf8(this,
1004                                              aSrc.Elements(),
1005                                              &srcRead,
1006                                              aDst.Elements(),
1007                                              &dstWritten,
1008                                              aLast,
1009                                              &hadReplacements);
1010     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1011   }
1012
1013   /**
1014    * Incrementally decode a byte stream into UTF-8 _without replacement_.
1015    *
1016    * See the documentation of the class for documentation for `Decode*`
1017    * methods collectively.
1018    */
1019   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
1020     Span<const uint8_t> aSrc,
1021     Span<uint8_t> aDst,
1022     bool aLast)
1023   {
1024     size_t srcRead = aSrc.Length();
1025     size_t dstWritten = aDst.Length();
1026     uint32_t result = decoder_decode_to_utf8_without_replacement(
1027       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1028     return MakeTuple(result, srcRead, dstWritten);
1029   }
1030
1031   /**
1032    * Query the worst-case UTF-16 output size (with or without replacement).
1033    *
1034    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
1035    * that will not overflow given the current state of the decoder and
1036    * `aByteLength` number of additional input bytes.
1037    *
1038    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
1039    * return value of this method applies also in the
1040    * `_without_replacement` case.
1041    */
1042   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const
1043   {
1044     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
1045     if (max.value() == MaxValue<size_t>::value) {
1046       // Mark invalid by overflowing
1047       max++;
1048       MOZ_ASSERT(!max.isValid());
1049     }
1050     return max;
1051   }
1052
1053   /**
1054    * Incrementally decode a byte stream into UTF-16 with malformed sequences
1055    * replaced with the REPLACEMENT CHARACTER.
1056    *
1057    * See the documentation of the class for documentation for `Decode*`
1058    * methods collectively.
1059    */
1060   inline Tuple<uint32_t, size_t, size_t, bool>
1061   DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast)
1062   {
1063     size_t srcRead = aSrc.Length();
1064     size_t dstWritten = aDst.Length();
1065     bool hadReplacements;
1066     uint32_t result = decoder_decode_to_utf16(this,
1067                                               aSrc.Elements(),
1068                                               &srcRead,
1069                                               aDst.Elements(),
1070                                               &dstWritten,
1071                                               aLast,
1072                                               &hadReplacements);
1073     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1074   }
1075
1076   /**
1077    * Incrementally decode a byte stream into UTF-16 _without replacement_.
1078    *
1079    * See the documentation of the class for documentation for `Decode*`
1080    * methods collectively.
1081    */
1082   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1083     Span<const uint8_t> aSrc,
1084     Span<char16_t> aDst,
1085     bool aLast)
1086   {
1087     size_t srcRead = aSrc.Length();
1088     size_t dstWritten = aDst.Length();
1089     uint32_t result = decoder_decode_to_utf16_without_replacement(
1090       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1091     return MakeTuple(result, srcRead, dstWritten);
1092   }
1093
1094 private:
1095   Decoder() = delete;
1096   Decoder(const Decoder&) = delete;
1097   Decoder& operator=(const Decoder&) = delete;
1098 };
1099
1100 /**
1101  * A converter that encodes a Unicode stream into bytes according to a
1102  * character encoding in a streaming (incremental) manner.
1103  *
1104  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1105  * buffer `aDst` both of which are caller-allocated. There are variants for
1106  * both UTF-8 and UTF-16 input buffers.
1107  *
 * An `Encode*` method encodes characters from `aSrc` into bytes stored into
 * `aDst` until one of the following three things happens:
1110  *
1111  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1112  *    only).
1113  *
 * 2. The output buffer has been filled so near capacity that the encoder
1115  *    cannot be sure that processing an additional character of input wouldn't
1116  *    cause so much output that the output buffer would overflow.
1117  *
1118  * 3. All the input characters have been processed.
1119  *
 * The `Encode*` method then returns a tuple of a status indicating which one
1121  * of the three reasons to return happened, how many input code units (`uint8_t`
1122  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1123  * how many output bytes were written, and in the case of the variants that
1124  * perform replacement, a boolean indicating whether an unmappable
1125  * character was replaced with a numeric character reference during the call.
1126  *
1127  * The number of bytes "written" is what's logically written. Garbage may be
1128  * written in the output buffer beyond the point logically written to.
1129  *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
1134  *
1135  * In the case of methods whose name does not end with
1136  * `*WithoutReplacement`, unmappable characters are automatically replaced
1137  * with the corresponding numeric character references and unmappable
1138  * characters do not cause the methods to return early.
1139  *
1140  * When encoding from UTF-8 without replacement, the methods are guaranteed
1141  * not to return indicating that more output space is needed if the length
1142  * of the output buffer is at least the length returned by
1143  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1144  * UTF-8 with replacement, the length of the output buffer that guarantees the
1145  * methods not to return indicating that more output space is needed in the
1146  * absence of unmappable characters is given by
1147  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1148  * UTF-16 without replacement, the methods are guaranteed not to return
1149  * indicating that more output space is needed if the length of the output
1150  * buffer is at least the length returned by
1151  * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
1153  * guarantees the methods not to return indicating that more output space is
1154  * needed in the absence of unmappable characters is given by
1155  * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1156  * When encoding with replacement, applications are not expected to size the
1157  * buffer for the worst case ahead of time but to resize the buffer if there
1158  * are unmappable characters. This is why max length queries are only available
1159  * for the case where there are no unmappable characters.
1160  *
1161  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1162  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1163  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1164  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1165  * surrogate pairs are not split across input buffer boundaries.
1166  *
1167  * After an `Encode*` call returns, the output produced so far, taken as a
1168  * whole from the start of the stream, is guaranteed to consist of a valid
1169  * byte sequence in the target encoding. (I.e. the code unit sequence for a
1170  * character is guaranteed not to be split across output buffers. However, due
1171  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1172  * from the start for it to be valid. For other encodings, the validity holds
1173  * on a per-output buffer basis.)
1174  *
1175  * The boolean argument `aLast` indicates that the end of the stream is reached
1176  * when all the characters in `aSrc` have been consumed. This argument is needed
1177  * for ISO-2022-JP and is ignored for other encodings.
1178  *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1180  *
1181  * During the processing of a single stream, the caller must call `Encode*`
1182  * zero or more times with `aLast` set to `false` and then call `Encode*` at
1183  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1184  * the processing of the stream has ended. Otherwise, the caller must call
1185  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1186  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1187  *
1188  * Once the stream has ended, the `Encoder` object must not be used anymore.
1189  * That is, you need to create another one to process another stream.
1190  *
1191  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1192  * result and the caller does not wish to treat it as a fatal error, the input
1193  * buffer `aSrc` may not have been completely consumed. In that case, the caller
1194  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1195  * call.
1196  *
1197  * # Infinite loops
1198  *
1199  * When converting with a fixed-size output buffer whose size is too small to
1200  * accommodate one character of output, an infinite loop ensues. When
1201  * converting with a fixed-size output buffer, it generally makes sense to
1202  * make the buffer fairly large (e.g. couple of kilobytes).
1203  */
1204 class Encoder final
1205 {
1206 public:
1207   ~Encoder() {}
1208
1209   static void operator delete(void* aEncoder)
1210   {
1211     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1212   }
1213
1214   /**
1215    * The `Encoding` this `Encoder` is for.
1216    */
1217   inline NotNull<const mozilla::Encoding*> Encoding() const
1218   {
1219     return WrapNotNull(encoder_encoding(this));
1220   }
1221
1222   /**
1223    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1224    * ASCII state and `false` otherwise.
1225    */
1226   inline bool HasPendingState() const
1227   {
1228     return encoder_has_pending_state(this);
1229   }
1230
1231   /**
1232    * Query the worst-case output size when encoding from UTF-8 with
1233    * replacement.
1234    *
1235    * Returns the size of the output buffer in bytes that will not overflow
1236    * given the current state of the encoder and `aByteLength` number of
1237    * additional input code units if there are no unmappable characters in
1238    * the input.
1239    */
1240   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1241     size_t aByteLength) const
1242   {
1243     CheckedInt<size_t> max(
1244       encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength));
1245     if (max.value() == MaxValue<size_t>::value) {
1246       // Mark invalid by overflowing
1247       max++;
1248       MOZ_ASSERT(!max.isValid());
1249     }
1250     return max;
1251   }
1252
1253   /**
1254    * Query the worst-case output size when encoding from UTF-8 without
1255    * replacement.
1256    *
1257    * Returns the size of the output buffer in bytes that will not overflow
1258    * given the current state of the encoder and `aByteLength` number of
1259    * additional input code units.
1260    */
1261   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1262     size_t aByteLength) const
1263   {
1264     CheckedInt<size_t> max(
1265       encoder_max_buffer_length_from_utf8_without_replacement(this,
1266                                                               aByteLength));
1267     if (max.value() == MaxValue<size_t>::value) {
1268       // Mark invalid by overflowing
1269       max++;
1270       MOZ_ASSERT(!max.isValid());
1271     }
1272     return max;
1273   }
1274
1275   /**
1276    * Incrementally encode into byte stream from UTF-8 with unmappable
1277    * characters replaced with HTML (decimal) numeric character references.
1278    *
1279    * See the documentation of the class for documentation for `Encode*`
1280    * methods collectively.
1281    *
1282    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1283    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1284    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1285    */
1286   inline Tuple<uint32_t, size_t, size_t, bool>
1287   EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
1288   {
1289     size_t srcRead = aSrc.Length();
1290     size_t dstWritten = aDst.Length();
1291     bool hadReplacements;
1292     uint32_t result = encoder_encode_from_utf8(this,
1293                                                aSrc.Elements(),
1294                                                &srcRead,
1295                                                aDst.Elements(),
1296                                                &dstWritten,
1297                                                aLast,
1298                                                &hadReplacements);
1299     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1300   }
1301
1302   /**
1303    * Incrementally encode into byte stream from UTF-8 _without replacement_.
1304    *
1305    * See the documentation of the class for documentation for `Encode*`
1306    * methods collectively.
1307    *
1308    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1309    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1310    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1311    */
1312   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1313     Span<const uint8_t> aSrc,
1314     Span<uint8_t> aDst,
1315     bool aLast)
1316   {
1317     size_t srcRead = aSrc.Length();
1318     size_t dstWritten = aDst.Length();
1319     uint32_t result = encoder_encode_from_utf8_without_replacement(
1320       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1321     return MakeTuple(result, srcRead, dstWritten);
1322   }
1323
1324   /**
1325    * Query the worst-case output size when encoding from UTF-16 with
1326    * replacement.
1327    *
1328    * Returns the size of the output buffer in bytes that will not overflow
1329    * given the current state of the encoder and `aU16Length` number of
1330    * additional input code units if there are no unmappable characters in
1331    * the input.
1332    */
1333   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1334     size_t aU16Length) const
1335   {
1336     CheckedInt<size_t> max(
1337       encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length));
1338     if (max.value() == MaxValue<size_t>::value) {
1339       // Mark invalid by overflowing
1340       max++;
1341       MOZ_ASSERT(!max.isValid());
1342     }
1343     return max;
1344   }
1345
1346   /**
1347    * Query the worst-case output size when encoding from UTF-16 without
1348    * replacement.
1349    *
1350    * Returns the size of the output buffer in bytes that will not overflow
1351    * given the current state of the encoder and `aU16Length` number of
1352    * additional input code units.
1353    */
1354   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1355     size_t aU16Length) const
1356   {
1357     CheckedInt<size_t> max(
1358       encoder_max_buffer_length_from_utf16_without_replacement(this,
1359                                                                aU16Length));
1360     if (max.value() == MaxValue<size_t>::value) {
1361       // Mark invalid by overflowing
1362       max++;
1363       MOZ_ASSERT(!max.isValid());
1364     }
1365     return max;
1366   }
1367
1368   /**
1369    * Incrementally encode into byte stream from UTF-16 with unmappable
1370    * characters replaced with HTML (decimal) numeric character references.
1371    *
1372    * See the documentation of the class for documentation for `Encode*`
1373    * methods collectively.
1374    */
1375   inline Tuple<uint32_t, size_t, size_t, bool>
1376   EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast)
1377   {
1378     size_t srcRead = aSrc.Length();
1379     size_t dstWritten = aDst.Length();
1380     bool hadReplacements;
1381     uint32_t result = encoder_encode_from_utf16(this,
1382                                                 aSrc.Elements(),
1383                                                 &srcRead,
1384                                                 aDst.Elements(),
1385                                                 &dstWritten,
1386                                                 aLast,
1387                                                 &hadReplacements);
1388     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1389   }
1390
1391   /**
1392    * Incrementally encode into byte stream from UTF-16 _without replacement_.
1393    *
1394    * See the documentation of the class for documentation for `Encode*`
1395    * methods collectively.
1396    */
1397   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1398     Span<const char16_t> aSrc,
1399     Span<uint8_t> aDst,
1400     bool aLast)
1401   {
1402     size_t srcRead = aSrc.Length();
1403     size_t dstWritten = aDst.Length();
1404     uint32_t result = encoder_encode_from_utf16_without_replacement(
1405       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1406     return MakeTuple(result, srcRead, dstWritten);
1407   }
1408
1409 private:
1410   Encoder() = delete;
1411   Encoder(const Encoder&) = delete;
1412   Encoder& operator=(const Encoder&) = delete;
1413 };
1414
1415 }; // namespace mozilla
1416
1417 #endif // mozilla_Encoding_h