intl/Encoding.h

   1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
   2 // file at the top-level directory of this distribution.
   3 //
   4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   7 // option. This file may not be copied, modified, or distributed
   8 // except according to those terms.
   9
  10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
  11 // "top-level directory" in the above notice refers to
  12 // third_party/rust/encoding_c/.
  13
  14 #ifndef mozilla_Encoding_h
  15 #define mozilla_Encoding_h
  16
  17 #include "mozilla/CheckedInt.h"
  18 #include "mozilla/Maybe.h"
  19 #include "mozilla/NotNull.h"
  20 #include "mozilla/Span.h"
  21 #include "mozilla/Tuple.h"
  22 #include "nsString.h"
  23
  24 namespace mozilla {
  25 class Encoding;
  26 class Decoder;
  27 class Encoder;
  28 };  // namespace mozilla
  29
// Type-mapping macros defined ahead of the `encoding_rs.h` include that
// follows them.
// NOTE(review): presumably `encoding_rs.h` consults these so that its C
// declarations are expressed in terms of the `mozilla::` classes
// forward-declared above -- confirm against that header.
#define ENCODING_RS_ENCODING mozilla::Encoding
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  mozilla::NotNull<const mozilla::Encoding*>
#define ENCODING_RS_ENCODER mozilla::Encoder
#define ENCODING_RS_DECODER mozilla::Decoder
  35
  36 #include "encoding_rs.h"
  37
  38 extern "C" {
  39
  40 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
  41                                              uint8_t const* src, size_t src_len,
  42                                              nsAString* dst);
  43
  44 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
  45     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  46     nsAString* dst);
  47
  48 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
  49     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  50     nsAString* dst);
  51
  52 nsresult
  53 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
  54     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  55     nsAString* dst);
  56
  57 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
  58                                             char16_t const* src, size_t src_len,
  59                                             nsACString* dst);
  60
  61 nsresult mozilla_encoding_decode_to_nscstring(
  62     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  63
  64 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
  65     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  66
  67 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
  68     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  69
  70 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
  71     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  72     nsACString* dst, size_t already_validated);
  73
  74 nsresult
  75 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
  76     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  77
  78 nsresult mozilla_encoding_encode_from_nscstring(
  79     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  80
  81 }  // extern "C"
  82
  83 namespace mozilla {
  84
/**
 * Return value from `Decoder`/`Encoder` to indicate that input
 * was exhausted.
 *
 * Typed C++ constant carrying the value of the `INPUT_EMPTY` macro.
 */
const uint32_t kInputEmpty = INPUT_EMPTY;

/**
 * Return value from `Decoder`/`Encoder` to indicate that output
 * space was insufficient.
 *
 * Typed C++ constant carrying the value of the `OUTPUT_FULL` macro.
 */
const uint32_t kOutputFull = OUTPUT_FULL;
  96
  97 /**
  98  * An encoding as defined in the Encoding Standard
  99  * (https://encoding.spec.whatwg.org/).
 100  *
 101  * See https://docs.rs/encoding_rs/ for the Rust API docs.
 102  *
 103  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
 104  * sequence and, in most cases, vice versa. Each encoding has a name, an output
 105  * encoding, and one or more labels.
 106  *
 107  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
 108  * encoding in formats and protocols. The _name_ of the encoding is the
 109  * preferred label in the case appropriate for returning from the
 110  * `characterSet` property of the `Document` DOM interface, except for
 111  * the replacement encoding whose name is not one of its labels.
 112  *
 113  * The _output encoding_ is the encoding used for form submission and URL
 114  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 115  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 116  * encodings.
 117  *
 118  * # Streaming vs. Non-Streaming
 119  *
 120  * When you have the entire input in a single buffer, you can use the
 121  * methods `Decode()`, `DecodeWithBOMRemoval()`,
 122  * `DecodeWithoutBOMHandling()`,
 123  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 124  * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
 * `NewEncoder()` methods), these methods perform heap allocations. You should
 * use the `Decoder` and `Encoder` objects when your input is split into
 * multiple buffers or when you want to control the allocation of the output
 * buffers.
 128  *
 129  * # Instances
 130  *
 131  * All instances of `Encoding` are statically allocated and have the process's
 132  * lifetime. There is precisely one unique `Encoding` instance for each
 133  * encoding defined in the Encoding Standard.
 134  *
 135  * To obtain a reference to a particular encoding whose identity you know at
 * compile time, use a `static` that refers to the encoding. There is a `static`
 137  * for each encoding. The `static`s are named in all caps with hyphens
 138  * replaced with underscores and with `_ENCODING` appended to the
 139  * name. For example, if you know at compile time that you will want to
 140  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
 141  *
 142  * If you don't know what encoding you need at compile time and need to
 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
 144  *
 145  * Pointers to `Encoding` can be compared with `==` to check for the sameness
 146  * of two encodings.
 147  *
 148  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
 149  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
 * `const mozilla::Encoding*` in the C signature and
 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
 152  */
 153 class Encoding final {
 154  public:
 155   /**
 156    * Implements the _get an encoding_ algorithm
 157    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
 158    *
 159    * If, after ASCII-lowercasing and removing leading and trailing
 160    * whitespace, the argument matches a label defined in the Encoding
 161    * Standard, `const Encoding*` representing the corresponding
 162    * encoding is returned. If there is no match, `nullptr` is returned.
 163    *
 164    * This is the right method to use if the action upon the method returning
 165    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
 166    * instead. When the action upon the method returning `nullptr` is not to
 167    * proceed with a fallback but to refuse processing,
 168    * `ForLabelNoReplacement()` is more appropriate.
 169    */
 170   static inline const Encoding* ForLabel(Span<const char> aLabel) {
 171     return encoding_for_label(
 172         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 173   }
 174
 175   /**
 176    * `nsAString` argument version. See above for docs.
 177    */
 178   static inline const Encoding* ForLabel(const nsAString& aLabel) {
 179     return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
 180   }
 181
 182   /**
 183    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
 184    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
 185    *
 186    * This method is useful in scenarios where a fatal error is required
 187    * upon invalid label, because in those cases the caller typically wishes
 188    * to treat the labels that map to the replacement encoding as fatal
 189    * errors, too.
 190    *
 191    * It is not OK to use this method when the action upon the method returning
 192    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
 193    * such a case, the `ForLabel()` method should be used instead in order to
 194    * avoid unsafe fallback for labels that `ForLabel()` maps to
 195    * `REPLACEMENT_ENCODING`.
 196    */
 197   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
 198     return encoding_for_label_no_replacement(
 199         reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
 200   }
 201
 202   /**
 203    * `nsAString` argument version. See above for docs.
 204    */
 205   static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
 206     return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
 207   }
 208
 209   /**
 210    * Performs non-incremental BOM sniffing.
 211    *
 212    * The argument must either be a buffer representing the entire input
 213    * stream (non-streaming case) or a buffer representing at least the first
 214    * three bytes of the input stream (streaming case).
 215    *
 216    * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
   * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
 218    * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
 219    */
 220   static inline Tuple<const Encoding*, size_t> ForBOM(
 221       Span<const uint8_t> aBuffer) {
 222     size_t len = aBuffer.Length();
 223     const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
 224     return MakeTuple(encoding, len);
 225   }
 226
 227   /**
 228    * Writes the name of this encoding into `aName`.
 229    *
 230    * This name is appropriate to return as-is from the DOM
 231    * `document.characterSet` property.
 232    */
 233   inline void Name(nsACString& aName) const {
 234     aName.SetLength(ENCODING_NAME_MAX_LENGTH);
 235     size_t length =
 236         encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
 237     aName.SetLength(length);  // truncation is the 64-bit case is OK
 238   }
 239
 240   /**
 241    * Checks whether the _output encoding_ of this encoding can encode every
 242    * Unicode code point. (Only true if the output encoding is UTF-8.)
 243    */
 244   inline bool CanEncodeEverything() const {
 245     return encoding_can_encode_everything(this);
 246   }
 247
 248   /**
 249    * Checks whether this encoding maps one byte to one Basic Multilingual
 250    * Plane code point (i.e. byte length equals decoded UTF-16 length) and
 251    * vice versa (for mappable characters).
 252    *
 253    * `true` iff this encoding is on the list of Legacy single-byte
 254    * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
 255    * in the spec or x-user-defined.
 256    */
 257   inline bool IsSingleByte() const { return encoding_is_single_byte(this); }
 258
 259   /**
 260    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
 261    * U+0000...U+007F and vice versa.
 262    */
 263   inline bool IsAsciiCompatible() const {
 264     return encoding_is_ascii_compatible(this);
 265   }
 266
 267   /**
 268    * Checks whether this is a Japanese legacy encoding.
 269    */
 270   inline bool IsJapaneseLegacy() const {
 271     return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING ||
 272            this == ISO_2022_JP_ENCODING;
 273   }
 274
 275   /**
 276    * Returns the _output encoding_ of this encoding. This is UTF-8 for
 277    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
 278    */
 279   inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
 280     return WrapNotNull(encoding_output_encoding(this));
 281   }
 282
 283   /**
 284    * Decode complete input to `nsACString` _with BOM sniffing_ and with
 285    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 286    * entire input is available as a single buffer (i.e. the end of the
 287    * buffer marks the end of the stream).
 288    *
 289    * This method implements the (non-streaming version of) the
 290    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 291    *
 292    * The second item in the returned tuple is the encoding that was actually
 293    * used (which may differ from this encoding thanks to BOM sniffing).
 294    *
 295    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 296    * if there were malformed sequences (that were replaced with the
 297    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 298    * tuple.
 299    *
 300    * The backing buffer of the string isn't copied if the input buffer
 301    * is heap-allocated and decoding from UTF-8 and the input is valid
 302    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 303    * the input is valid ASCII or decoding from ISO-2022-JP and the
 304    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 305    * the same string as both arguments.
 306    *
 307    * _Note:_ It is wrong to use this when the input buffer represents only
 308    * a segment of the input instead of the whole input. Use `NewDecoder()`
 309    * when decoding segmented input.
 310    */
 311   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 312       const nsACString& aBytes, nsACString& aOut) const {
 313     const Encoding* encoding = this;
 314     const nsACString* bytes = &aBytes;
 315     nsACString* out = &aOut;
 316     nsresult rv;
 317     if (bytes == out) {
 318       nsAutoCString temp(aBytes);
 319       rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
 320     } else {
 321       rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
 322     }
 323     return MakeTuple(rv, WrapNotNull(encoding));
 324   }
 325
 326   /**
 327    * Decode complete input to `nsAString` _with BOM sniffing_ and with
 328    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 329    * entire input is available as a single buffer (i.e. the end of the
 330    * buffer marks the end of the stream).
 331    *
 332    * This method implements the (non-streaming version of) the
 333    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 334    *
 335    * The second item in the returned tuple is the encoding that was actually
 336    * used (which may differ from this encoding thanks to BOM sniffing).
 337    *
 338    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 339    * if there were malformed sequences (that were replaced with the
 340    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 341    * tuple.
 342    *
 343    * _Note:_ It is wrong to use this when the input buffer represents only
 344    * a segment of the input instead of the whole input. Use `NewDecoder()`
 345    * when decoding segmented input.
 346    */
 347   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 348       Span<const uint8_t> aBytes, nsAString& aOut) const {
 349     const Encoding* encoding = this;
 350     nsresult rv = mozilla_encoding_decode_to_nsstring(
 351         &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
 352     return MakeTuple(rv, WrapNotNull(encoding));
 353   }
 354
 355   /**
 356    * Decode complete input to `nsACString` _with BOM removal_ and with
 357    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 358    * entire input is available as a single buffer (i.e. the end of the
 359    * buffer marks the end of the stream).
 360    *
 361    * When invoked on `UTF_8`, this method implements the (non-streaming
 362    * version of) the _UTF-8 decode_
 363    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 364    *
 365    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 366    * if there were malformed sequences (that were replaced with the
 367    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 368    *
 369    * The backing buffer of the string isn't copied if the input buffer
 370    * is heap-allocated and decoding from UTF-8 and the input is valid
 371    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 372    * the input is valid ASCII or decoding from ISO-2022-JP and the
 373    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 374    * the same string as both arguments.
 375    *
 376    * _Note:_ It is wrong to use this when the input buffer represents only
 377    * a segment of the input instead of the whole input. Use
 378    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 379    */
 380   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
 381                                        nsACString& aOut) const {
 382     const nsACString* bytes = &aBytes;
 383     nsACString* out = &aOut;
 384     if (bytes == out) {
 385       nsAutoCString temp(aBytes);
 386       return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp,
 387                                                                    out);
 388     }
 389     return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes,
 390                                                                  out);
 391   }
 392
 393   /**
 394    * Decode complete input to `nsAString` _with BOM removal_ and with
 395    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 396    * entire input is available as a single buffer (i.e. the end of the
 397    * buffer marks the end of the stream).
 398    *
 399    * When invoked on `UTF_8`, this method implements the (non-streaming
 400    * version of) the _UTF-8 decode_
 401    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 402    *
 403    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 404    * if there were malformed sequences (that were replaced with the
 405    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 406    *
 407    * _Note:_ It is wrong to use this when the input buffer represents only
 408    * a segment of the input instead of the whole input. Use
 409    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 410    */
 411   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
 412                                        nsAString& aOut) const {
 413     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
 414         this, aBytes.Elements(), aBytes.Length(), &aOut);
 415   }
 416
 417   /**
 418    * Decode complete input to `nsACString` _without BOM handling_ and
 419    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 420    * the entire input is available as a single buffer (i.e. the end of the
 421    * buffer marks the end of the stream).
 422    *
 423    * When invoked on `UTF_8`, this method implements the (non-streaming
 424    * version of) the _UTF-8 decode without BOM_
 425    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 426    *
 427    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 428    * if there were malformed sequences (that were replaced with the
 429    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 430    *
 431    * The backing buffer of the string isn't copied if the input buffer
 432    * is heap-allocated and decoding from UTF-8 and the input is valid
 433    * UTF-8, decoding from an ASCII-compatible encoding and the input
 434    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 435    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 436    * as both arguments.
 437    *
 438    * _Note:_ It is wrong to use this when the input buffer represents only
 439    * a segment of the input instead of the whole input. Use
 440    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 441    */
 442   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
 443                                            nsACString& aOut) const {
 444     const nsACString* bytes = &aBytes;
 445     nsACString* out = &aOut;
 446     if (bytes == out) {
 447       nsAutoCString temp(aBytes);
 448       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 449           this, &temp, out);
 450     }
 451     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 452         this, bytes, out);
 453   }
 454
 455   /**
 456    * Decode complete input to `nsAString` _without BOM handling_ and
 457    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 458    * the entire input is available as a single buffer (i.e. the end of the
 459    * buffer marks the end of the stream).
 460    *
 461    * When invoked on `UTF_8`, this method implements the (non-streaming
 462    * version of) the _UTF-8 decode without BOM_
 463    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 464    *
 465    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 466    * if there were malformed sequences (that were replaced with the
 467    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 468    *
 469    * _Note:_ It is wrong to use this when the input buffer represents only
 470    * a segment of the input instead of the whole input. Use
 471    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 472    */
 473   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 474                                            nsAString& aOut) const {
 475     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
 476         this, aBytes.Elements(), aBytes.Length(), &aOut);
 477   }
 478
 479   /**
 480    * Decode complete input to `nsACString` _without BOM handling_ and
 481    * _with malformed sequences treated as fatal_ when the entire input is
 482    * available as a single buffer (i.e. the end of the buffer marks the end
 483    * of the stream).
 484    *
 485    * When invoked on `UTF_8`, this method implements the (non-streaming
 486    * version of) the _UTF-8 decode without BOM or fail_
 487    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 488    * spec concept.
 489    *
 490    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 491    * if a malformed sequence was encountered and `NS_OK` otherwise.
 492    *
 493    * The backing buffer of the string isn't copied if the input buffer
 494    * is heap-allocated and decoding from UTF-8 and the input is valid
 495    * UTF-8, decoding from an ASCII-compatible encoding and the input
 496    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 497    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 498    * as both arguments.
 499    *
 500    * _Note:_ It is wrong to use this when the input buffer represents only
 501    * a segment of the input instead of the whole input. Use
 502    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 503    */
 504   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 505       const nsACString& aBytes, nsACString& aOut) const {
 506     const nsACString* bytes = &aBytes;
 507     nsACString* out = &aOut;
 508     if (bytes == out) {
 509       nsAutoCString temp(aBytes);
 510       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 511           this, &temp, out);
 512     }
 513     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 514         this, bytes, out);
 515   }
 516
 517   /**
 518    * Decode complete input to `nsACString` _without BOM handling_ and
 519    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 520    * the entire input is available as a single buffer (i.e. the end of the
 521    * buffer marks the end of the stream) _asserting that a number of bytes
 522    * from the start are already known to be valid UTF-8_.
 523    *
 524    * The use case for this method is avoiding copying when dealing with
 525    * input that has a UTF-8 BOM. _When in doubt, do not use this method._
 526    *
 527    * When invoked on `UTF_8`, this method implements the (non-streaming
 528    * version of) the _UTF-8 decode without BOM_
 529    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 530    *
 531    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 532    * if there were malformed sequences (that were replaced with the
 533    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 534    *
 535    * _Note:_ It is wrong to use this when the input buffer represents only
 536    * a segment of the input instead of the whole input. Use
 537    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 538    *
 539    * # Safety
 540    *
 541    * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
 542    * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
 543    */
 544   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 545                                            nsACString& aOut,
 546                                            size_t aAlreadyValidated) const {
 547     return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
 548         this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);
 549   }
 550
 551   /**
 552    * Decode complete input to `nsAString` _without BOM handling_ and
 553    * _with malformed sequences treated as fatal_ when the entire input is
 554    * available as a single buffer (i.e. the end of the buffer marks the end
 555    * of the stream).
 556    *
 557    * When invoked on `UTF_8`, this method implements the (non-streaming
 558    * version of) the _UTF-8 decode without BOM or fail_
 559    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 560    * spec concept.
 561    *
 562    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 563    * if a malformed sequence was encountered and `NS_OK` otherwise.
 564    *
 565    * _Note:_ It is wrong to use this when the input buffer represents only
 566    * a segment of the input instead of the whole input. Use
 567    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 568    */
 569   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 570       Span<const uint8_t> aBytes, nsAString& aOut) const {
 571     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
 572         this, aBytes.Elements(), aBytes.Length(), &aOut);
 573   }
 574
 575   /**
 576    * Encode complete input to `nsACString` with unmappable characters
 577    * replaced with decimal numeric character references when the entire input
 578    * is available as a single buffer (i.e. the end of the buffer marks the
 579    * end of the stream).
 580    *
 581    * This method implements the (non-streaming version of) the
 582    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 583    *
 584    * The second item in the returned tuple is the encoding that was actually
 585    * used (which may differ from this encoding thanks to some encodings
 586    * having UTF-8 as their output encoding).
 587    *
 588    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
 589    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
 590    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
 591    * replaced with numeric character references) and `NS_OK` otherwise.
 592    *
 593    * The backing buffer of the string isn't copied if the input buffer
 594    * is heap-allocated and encoding to UTF-8 and the input is valid
 595    * UTF-8, encoding to an ASCII-compatible encoding and the input
   * is valid ASCII or encoding to ISO-2022-JP and the output stays
 597    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 598    * as both arguments.
 599    *
 600    * _Note:_ It is wrong to use this when the input buffer represents only
 601    * a segment of the input instead of the whole input. Use `NewEncoder()`
 602    * when encoding segmented output.
 603    */
 604   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 605       const nsACString& aString, nsACString& aOut) const {
 606     const Encoding* encoding = this;
 607     const nsACString* string = &aString;
 608     nsACString* out = &aOut;
 609     nsresult rv;
 610     if (string == out) {
 611       nsAutoCString temp(aString);
 612       rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
 613     } else {
 614       rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
 615     }
 616     return MakeTuple(rv, WrapNotNull(encoding));
 617   }
 618
 619   /**
 620    * Encode complete input to `nsACString` with unmappable characters
 621    * replaced with decimal numeric character references when the entire input
 622    * is available as a single buffer (i.e. the end of the buffer marks the
 623    * end of the stream).
 624    *
 625    * This method implements the (non-streaming version of) the
 626    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 627    *
 628    * The second item in the returned tuple is the encoding that was actually
 629    * used (which may differ from this encoding thanks to some encodings
 630    * having UTF-8 as their output encoding).
 631    *
 632    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
 633    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
 634    * were replaced with numeric character references) and `NS_OK` otherwise.
   *
 636    * _Note:_ It is wrong to use this when the input buffer represents only
 637    * a segment of the input instead of the whole input. Use `NewEncoder()`
 638    * when encoding segmented output.
 639    */
 640   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 641       Span<const char16_t> aString, nsACString& aOut) const {
 642     const Encoding* encoding = this;
 643     nsresult rv = mozilla_encoding_encode_from_utf16(
 644         &encoding, aString.Elements(), aString.Length(), &aOut);
 645     return MakeTuple(rv, WrapNotNull(encoding));
 646   }
 647
 648   /**
 649    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
 650    *
 651    * BOM sniffing may cause the returned decoder to morph into a decoder
 652    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 653    */
 654   inline UniquePtr<Decoder> NewDecoder() const {
 655     UniquePtr<Decoder> decoder(encoding_new_decoder(this));
 656     return decoder;
 657   }
 658
 659   /**
 660    * Instantiates a new decoder for this encoding with BOM sniffing enabled
 661    * into memory occupied by a previously-instantiated decoder.
 662    *
 663    * BOM sniffing may cause the returned decoder to morph into a decoder
 664    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 665    */
 666   inline void NewDecoderInto(Decoder& aDecoder) const {
 667     encoding_new_decoder_into(this, &aDecoder);
 668   }
 669
 670   /**
 671    * Instantiates a new decoder for this encoding with BOM removal.
 672    *
 673    * If the input starts with bytes that are the BOM for this encoding,
 674    * those bytes are removed. However, the decoder never morphs into a
 675    * decoder for another encoding: A BOM for another encoding is treated as
 676    * (potentially malformed) input to the decoding algorithm for this
 677    * encoding.
 678    */
 679   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
 680     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
 681     return decoder;
 682   }
 683
 684   /**
 685    * Instantiates a new decoder for this encoding with BOM removal
 686    * into memory occupied by a previously-instantiated decoder.
 687    *
 688    * If the input starts with bytes that are the BOM for this encoding,
 689    * those bytes are removed. However, the decoder never morphs into a
 690    * decoder for another encoding: A BOM for another encoding is treated as
 691    * (potentially malformed) input to the decoding algorithm for this
 692    * encoding.
 693    */
 694   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
 695     encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
 696   }
 697
 698   /**
 699    * Instantiates a new decoder for this encoding with BOM handling disabled.
 700    *
 701    * If the input starts with bytes that look like a BOM, those bytes are
 702    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 703    * for another encoding.)
 704    *
 705    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 706    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
 707    * instead of this method to cause the BOM to be removed.
 708    */
 709   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
 710     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
 711     return decoder;
 712   }
 713
 714   /**
 715    * Instantiates a new decoder for this encoding with BOM handling disabled
 716    * into memory occupied by a previously-instantiated decoder.
 717    *
 718    * If the input starts with bytes that look like a BOM, those bytes are
 719    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 720    * for another encoding.)
 721    *
 722    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 723    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
 724    * instead of this method to cause the BOM to be removed.
 725    */
 726   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
 727     encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
 728   }
 729
 730   /**
 731    * Instantiates a new encoder for the output encoding of this encoding.
 732    */
 733   inline UniquePtr<Encoder> NewEncoder() const {
 734     UniquePtr<Encoder> encoder(encoding_new_encoder(this));
 735     return encoder;
 736   }
 737
 738   /**
 739    * Instantiates a new encoder for the output encoding of this encoding
 740    * into memory occupied by a previously-instantiated encoder.
 741    */
 742   inline void NewEncoderInto(Encoder& aEncoder) const {
 743     encoding_new_encoder_into(this, &aEncoder);
 744   }
 745
 746   /**
 747    * Validates UTF-8.
 748    *
 749    * Returns the index of the first byte that makes the input malformed as
 750    * UTF-8 or the length of the input if the input is entirely valid.
 751    */
 752   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
 753     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 754   }
 755
 756   /**
 757    * Validates ASCII.
 758    *
 759    * Returns the index of the first byte that makes the input malformed as
 760    * ASCII or the length of the input if the input is entirely valid.
 761    */
 762   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 763     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 764   }
 765
 766   /**
 767    * Validates ISO-2022-JP ASCII-state data.
 768    *
 769    * Returns the index of the first byte that makes the input not
 770    * representable in the ASCII state of ISO-2022-JP or the length of the
 771    * input if the input is entirely representable in the ASCII state of
 772    * ISO-2022-JP.
 773    */
 774   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 775     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
 776                                                   aBuffer.Length());
 777   }
 778
 779  private:
  // All of construction, copying, copy-assignment and destruction are
  // deleted, so C++ code can neither create nor destroy objects of this type.
  // NOTE(review): presumably instances are only ever reached through
  // pointers handed out by the FFI layer -- confirm.
  Encoding() = delete;
  Encoding(const Encoding&) = delete;
  Encoding& operator=(const Encoding&) = delete;
  ~Encoding() = delete;
 784 };
 785
 786 /**
 787  * A converter that decodes a byte stream into Unicode according to a
 788  * character encoding in a streaming (incremental) manner.
 789  *
 790  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
 791  * buffer `aDst` both of which are caller-allocated. There are variants for
 792  * both UTF-8 and UTF-16 output buffers.
 793  *
 794  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
 795  * into `aDst` until one of the following three things happens:
 796  *
 797  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
 798  *    variants only).
 799  *
 800  * 2. The output buffer has been filled so near capacity that the decoder
 801  *    cannot be sure that processing an additional byte of input wouldn't
 802  *    cause so much output that the output buffer would overflow.
 803  *
 804  * 3. All the input bytes have been processed.
 805  *
 * The `Decode*` method then returns a tuple of a status indicating which one
 807  * of the three reasons to return happened, how many input bytes were read,
 808  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 809  * when decoding to UTF-16) were written, and in the case of the
 810  * variants performing replacement, a boolean indicating whether an error was
 811  * replaced with the REPLACEMENT CHARACTER during the call.
 812  *
 813  * The number of bytes "written" is what's logically written. Garbage may be
 814  * written in the output buffer beyond the point logically written to.
 815  *
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 820  *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 827  *
 828  * In the case of methods whose name does not end with
 829  * `*WithoutReplacement`, malformed sequences are automatically replaced
 830  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 831  * return early.
 832  *
 833  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 834  * space. When decoding to UTF-16, the output buffer must have at least two
 835  * UTF-16 code units (`char16_t`) of space.
 836  *
 837  * When decoding to UTF-8 without replacement, the methods are guaranteed
 838  * not to return indicating that more output space is needed if the length
 839  * of the output buffer is at least the length returned by
 840  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 841  * with replacement, the length of the output buffer that guarantees the
 842  * methods not to return indicating that more output space is needed is given
 843  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 844  * or without replacement, the length of the output buffer that guarantees
 845  * the methods not to return indicating that more output space is needed is
 846  * given by `MaxUTF16BufferLength()`.
 847  *
 848  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 849  * and the output after each `Decode*` call is guaranteed to consist of
 850  * complete characters. (I.e. the code unit sequence for the last character is
 851  * guaranteed not to be split across output buffers.)
 852  *
 853  * The boolean argument `aLast` indicates that the end of the stream is reached
 854  * when all the bytes in `aSrc` have been consumed.
 855  *
 856  * A `Decoder` object can be used to incrementally decode a byte stream.
 857  *
 858  * During the processing of a single stream, the caller must call `Decode*`
 859  * zero or more times with `aLast` set to `false` and then call `Decode*` at
 860  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
 861  * the processing of the stream has ended. Otherwise, the caller must call
 862  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
 863  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 864  *
 865  * Once the stream has ended, the `Decoder` object must not be used anymore.
 866  * That is, you need to create another one to process another stream.
 867  *
 868  * When the decoder returns `kOutputFull` or the decoder returns a malformed
 869  * result and the caller does not wish to treat it as a fatal error, the input
 870  * buffer `aSrc` may not have been completely consumed. In that case, the caller
 871  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
 872  * call.
 873  *
 874  * # Infinite loops
 875  *
 876  * When converting with a fixed-size output buffer whose size is too small to
 877  * accommodate one character of output, an infinite loop ensues. When
 878  * converting with a fixed-size output buffer, it generally makes sense to
 879  * make the buffer fairly large (e.g. couple of kilobytes).
 880  */
 881 class Decoder final {
 882  public:
 883   ~Decoder() = default;
 884   static void operator delete(void* aDecoder) {
 885     decoder_free(reinterpret_cast<Decoder*>(aDecoder));
 886   }
 887
 888   /**
 889    * The `Encoding` this `Decoder` is for.
 890    *
 891    * BOM sniffing can change the return value of this method during the life
 892    * of the decoder.
 893    */
 894   inline NotNull<const mozilla::Encoding*> Encoding() const {
 895     return WrapNotNull(decoder_encoding(this));
 896   }
 897
 898   /**
 899    * Query the worst-case UTF-8 output size _with replacement_.
 900    *
 901    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 902    * that will not overflow given the current state of the decoder and
 903    * `aByteLength` number of additional input bytes when decoding with
 904    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
 905    * sequence.
 906    */
 907   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
 908     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
 909     if (max.value() == std::numeric_limits<size_t>::max()) {
 910       // Mark invalid by overflowing
 911       max++;
 912       MOZ_ASSERT(!max.isValid());
 913     }
 914     return max;
 915   }
 916
 917   /**
 918    * Query the worst-case UTF-8 output size _without replacement_.
 919    *
 920    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 921    * that will not overflow given the current state of the decoder and
 922    * `aByteLength` number of additional input bytes when decoding without
 923    * replacement error handling.
 924    *
 925    * Note that this value may be too small for the `WithReplacement` case.
 926    * Use `MaxUTF8BufferLength()` for that case.
 927    */
 928   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
 929       size_t aByteLength) const {
 930     CheckedInt<size_t> max(
 931         decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
 932     if (max.value() == std::numeric_limits<size_t>::max()) {
 933       // Mark invalid by overflowing
 934       max++;
 935       MOZ_ASSERT(!max.isValid());
 936     }
 937     return max;
 938   }
 939
 940   /**
 941    * Incrementally decode a byte stream into UTF-8 with malformed sequences
 942    * replaced with the REPLACEMENT CHARACTER.
 943    *
 944    * See the documentation of the class for documentation for `Decode*`
 945    * methods collectively.
 946    */
 947   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
 948       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 949     size_t srcRead = aSrc.Length();
 950     size_t dstWritten = aDst.Length();
 951     bool hadReplacements;
 952     uint32_t result =
 953         decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
 954                                &dstWritten, aLast, &hadReplacements);
 955     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
 956   }
 957
 958   /**
 959    * Incrementally decode a byte stream into UTF-8 _without replacement_.
 960    *
 961    * See the documentation of the class for documentation for `Decode*`
 962    * methods collectively.
 963    */
 964   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
 965       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 966     size_t srcRead = aSrc.Length();
 967     size_t dstWritten = aDst.Length();
 968     uint32_t result = decoder_decode_to_utf8_without_replacement(
 969         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
 970     return MakeTuple(result, srcRead, dstWritten);
 971   }
 972
 973   /**
 974    * Query the worst-case UTF-16 output size (with or without replacement).
 975    *
 976    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
 977    * that will not overflow given the current state of the decoder and
 978    * `aByteLength` number of additional input bytes.
 979    *
 980    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
 981    * return value of this method applies also in the
 982    * `_without_replacement` case.
 983    */
 984   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
 985     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
 986     if (max.value() == std::numeric_limits<size_t>::max()) {
 987       // Mark invalid by overflowing
 988       max++;
 989       MOZ_ASSERT(!max.isValid());
 990     }
 991     return max;
 992   }
 993
 994   /**
 995    * Incrementally decode a byte stream into UTF-16 with malformed sequences
 996    * replaced with the REPLACEMENT CHARACTER.
 997    *
 998    * See the documentation of the class for documentation for `Decode*`
 999    * methods collectively.
1000    */
1001   inline Tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1002       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1003     size_t srcRead = aSrc.Length();
1004     size_t dstWritten = aDst.Length();
1005     bool hadReplacements;
1006     uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1007                                               aDst.Elements(), &dstWritten,
1008                                               aLast, &hadReplacements);
1009     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1010   }
1011
1012   /**
1013    * Incrementally decode a byte stream into UTF-16 _without replacement_.
1014    *
1015    * See the documentation of the class for documentation for `Decode*`
1016    * methods collectively.
1017    */
1018   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1019       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1020     size_t srcRead = aSrc.Length();
1021     size_t dstWritten = aDst.Length();
1022     uint32_t result = decoder_decode_to_utf16_without_replacement(
1023         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1024     return MakeTuple(result, srcRead, dstWritten);
1025   }
1026
1027   /**
1028    * Checks for compatibility with storing Unicode scalar values as unsigned
1029    * bytes taking into account the state of the decoder.
1030    *
1031    * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1032    * including waiting for the BOM, or if the encoding is never
1033    * Latin1-byte-compatible.
1034    *
1035    * Otherwise returns the index of the first byte whose unsigned value doesn't
1036    * directly correspond to the decoded Unicode scalar value, or the length
1037    * of the input if all bytes in the input decode directly to scalar values
1038    * corresponding to the unsigned byte values.
1039    *
1040    * Does not change the state of the decoder.
1041    *
1042    * Do not use this unless you are supporting SpiderMonkey-style string
1043    * storage optimizations.
1044    */
1045   inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1046       Span<const uint8_t> aBuffer) const {
1047     size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1048                                                        aBuffer.Length());
1049     if (upTo == std::numeric_limits<size_t>::max()) {
1050       return mozilla::Nothing();
1051     }
1052     return mozilla::Some(upTo);
1053   }
1054
1055  private:
1056   Decoder() = delete;
1057   Decoder(const Decoder&) = delete;
1058   Decoder& operator=(const Decoder&) = delete;
1059 };
1060
1061 /**
1062  * A converter that encodes a Unicode stream into bytes according to a
1063  * character encoding in a streaming (incremental) manner.
1064  *
1065  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1066  * buffer `aDst` both of which are caller-allocated. There are variants for
1067  * both UTF-8 and UTF-16 input buffers.
1068  *
 * An `Encode*` method encodes characters from `aSrc` into bytes stored
 * into `aDst` until one of the following three things happens:
1071  *
1072  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1073  *    only).
1074  *
 * 2. The output buffer has been filled so near capacity that the encoder
 *    cannot be sure that processing an additional character of input wouldn't
 *    cause so much output that the output buffer would overflow.
1078  *
1079  * 3. All the input characters have been processed.
1080  *
 * The `Encode*` method then returns a tuple of a status indicating which one
1082  * of the three reasons to return happened, how many input code units (`uint8_t`
1083  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1084  * how many output bytes were written, and in the case of the variants that
1085  * perform replacement, a boolean indicating whether an unmappable
1086  * character was replaced with a numeric character reference during the call.
1087  *
1088  * The number of bytes "written" is what's logically written. Garbage may be
1089  * written in the output buffer beyond the point logically written to.
1090  *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty`
 * (corresponding to the three cases listed above).
1095  *
1096  * In the case of methods whose name does not end with
1097  * `*WithoutReplacement`, unmappable characters are automatically replaced
1098  * with the corresponding numeric character references and unmappable
1099  * characters do not cause the methods to return early.
1100  *
1101  * When encoding from UTF-8 without replacement, the methods are guaranteed
1102  * not to return indicating that more output space is needed if the length
1103  * of the output buffer is at least the length returned by
1104  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1105  * UTF-8 with replacement, the length of the output buffer that guarantees the
1106  * methods not to return indicating that more output space is needed in the
1107  * absence of unmappable characters is given by
1108  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1109  * UTF-16 without replacement, the methods are guaranteed not to return
1110  * indicating that more output space is needed if the length of the output
 * buffer is at least the length returned by
 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
 * guarantees the methods not to return indicating that more output space is
 * needed in the absence of unmappable characters is given by
 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1117  * When encoding with replacement, applications are not expected to size the
1118  * buffer for the worst case ahead of time but to resize the buffer if there
1119  * are unmappable characters. This is why max length queries are only available
1120  * for the case where there are no unmappable characters.
1121  *
1122  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1123  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1124  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1125  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1126  * surrogate pairs are not split across input buffer boundaries.
1127  *
1128  * After an `Encode*` call returns, the output produced so far, taken as a
1129  * whole from the start of the stream, is guaranteed to consist of a valid
1130  * byte sequence in the target encoding. (I.e. the code unit sequence for a
1131  * character is guaranteed not to be split across output buffers. However, due
1132  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1133  * from the start for it to be valid. For other encodings, the validity holds
1134  * on a per-output buffer basis.)
1135  *
1136  * The boolean argument `aLast` indicates that the end of the stream is reached
1137  * when all the characters in `aSrc` have been consumed. This argument is needed
1138  * for ISO-2022-JP and is ignored for other encodings.
1139  *
 * An `Encoder` object can be used to incrementally encode a stream of
 * characters into bytes.
1141  *
1142  * During the processing of a single stream, the caller must call `Encode*`
1143  * zero or more times with `aLast` set to `false` and then call `Encode*` at
1144  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1145  * the processing of the stream has ended. Otherwise, the caller must call
1146  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1147  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1148  *
1149  * Once the stream has ended, the `Encoder` object must not be used anymore.
1150  * That is, you need to create another one to process another stream.
1151  *
1152  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1153  * result and the caller does not wish to treat it as a fatal error, the input
1154  * buffer `aSrc` may not have been completely consumed. In that case, the caller
1155  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1156  * call.
1157  *
1158  * # Infinite loops
1159  *
1160  * When converting with a fixed-size output buffer whose size is too small to
1161  * accommodate one character of output, an infinite loop ensues. When
1162  * converting with a fixed-size output buffer, it generally makes sense to
1163  * make the buffer fairly large (e.g. couple of kilobytes).
1164  */
1165 class Encoder final {
1166  public:
1167   ~Encoder() = default;
1168
1169   static void operator delete(void* aEncoder) {
1170     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1171   }
1172
1173   /**
1174    * The `Encoding` this `Encoder` is for.
1175    */
1176   inline NotNull<const mozilla::Encoding*> Encoding() const {
1177     return WrapNotNull(encoder_encoding(this));
1178   }
1179
1180   /**
1181    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1182    * ASCII state and `false` otherwise.
1183    */
1184   inline bool HasPendingState() const {
1185     return encoder_has_pending_state(this);
1186   }
1187
1188   /**
1189    * Query the worst-case output size when encoding from UTF-8 with
1190    * replacement.
1191    *
1192    * Returns the size of the output buffer in bytes that will not overflow
1193    * given the current state of the encoder and `aByteLength` number of
1194    * additional input code units if there are no unmappable characters in
1195    * the input.
1196    */
1197   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1198       size_t aByteLength) const {
1199     CheckedInt<size_t> max(
1200         encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1201                                                               aByteLength));
1202     if (max.value() == std::numeric_limits<size_t>::max()) {
1203       // Mark invalid by overflowing
1204       max++;
1205       MOZ_ASSERT(!max.isValid());
1206     }
1207     return max;
1208   }
1209
1210   /**
1211    * Query the worst-case output size when encoding from UTF-8 without
1212    * replacement.
1213    *
1214    * Returns the size of the output buffer in bytes that will not overflow
1215    * given the current state of the encoder and `aByteLength` number of
1216    * additional input code units.
1217    */
1218   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1219       size_t aByteLength) const {
1220     CheckedInt<size_t> max(
1221         encoder_max_buffer_length_from_utf8_without_replacement(this,
1222                                                                 aByteLength));
1223     if (max.value() == std::numeric_limits<size_t>::max()) {
1224       // Mark invalid by overflowing
1225       max++;
1226       MOZ_ASSERT(!max.isValid());
1227     }
1228     return max;
1229   }
1230
1231   /**
1232    * Incrementally encode into byte stream from UTF-8 with unmappable
1233    * characters replaced with HTML (decimal) numeric character references.
1234    *
1235    * See the documentation of the class for documentation for `Encode*`
1236    * methods collectively.
1237    *
1238    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1239    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1240    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1241    */
1242   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1243       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1244     size_t srcRead = aSrc.Length();
1245     size_t dstWritten = aDst.Length();
1246     bool hadReplacements;
1247     uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1248                                                aDst.Elements(), &dstWritten,
1249                                                aLast, &hadReplacements);
1250     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1251   }
1252
1253   /**
1254    * Incrementally encode into byte stream from UTF-8 _without replacement_.
1255    *
1256    * See the documentation of the class for documentation for `Encode*`
1257    * methods collectively.
1258    *
1259    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1260    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1261    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1262    */
1263   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1264       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1265     size_t srcRead = aSrc.Length();
1266     size_t dstWritten = aDst.Length();
1267     uint32_t result = encoder_encode_from_utf8_without_replacement(
1268         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1269     return MakeTuple(result, srcRead, dstWritten);
1270   }
1271
1272   /**
1273    * Query the worst-case output size when encoding from UTF-16 with
1274    * replacement.
1275    *
1276    * Returns the size of the output buffer in bytes that will not overflow
1277    * given the current state of the encoder and `aU16Length` number of
1278    * additional input code units if there are no unmappable characters in
1279    * the input.
1280    */
1281   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1282       size_t aU16Length) const {
1283     CheckedInt<size_t> max(
1284         encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1285                                                                aU16Length));
1286     if (max.value() == std::numeric_limits<size_t>::max()) {
1287       // Mark invalid by overflowing
1288       max++;
1289       MOZ_ASSERT(!max.isValid());
1290     }
1291     return max;
1292   }
1293
1294   /**
1295    * Query the worst-case output size when encoding from UTF-16 without
1296    * replacement.
1297    *
1298    * Returns the size of the output buffer in bytes that will not overflow
1299    * given the current state of the encoder and `aU16Length` number of
1300    * additional input code units.
1301    */
1302   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1303       size_t aU16Length) const {
1304     CheckedInt<size_t> max(
1305         encoder_max_buffer_length_from_utf16_without_replacement(this,
1306                                                                  aU16Length));
1307     if (max.value() == std::numeric_limits<size_t>::max()) {
1308       // Mark invalid by overflowing
1309       max++;
1310       MOZ_ASSERT(!max.isValid());
1311     }
1312     return max;
1313   }
1314
1315   /**
1316    * Incrementally encode into byte stream from UTF-16 with unmappable
1317    * characters replaced with HTML (decimal) numeric character references.
1318    *
1319    * See the documentation of the class for documentation for `Encode*`
1320    * methods collectively.
1321    */
1322   inline Tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1323       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1324     size_t srcRead = aSrc.Length();
1325     size_t dstWritten = aDst.Length();
1326     bool hadReplacements;
1327     uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1328                                                 aDst.Elements(), &dstWritten,
1329                                                 aLast, &hadReplacements);
1330     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1331   }
1332
1333   /**
1334    * Incrementally encode into byte stream from UTF-16 _without replacement_.
1335    *
1336    * See the documentation of the class for documentation for `Encode*`
1337    * methods collectively.
1338    */
1339   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1340       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1341     size_t srcRead = aSrc.Length();
1342     size_t dstWritten = aDst.Length();
1343     uint32_t result = encoder_encode_from_utf16_without_replacement(
1344         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1345     return MakeTuple(result, srcRead, dstWritten);
1346   }
1347
1348  private:
1349   Encoder() = delete;
1350   Encoder(const Encoder&) = delete;
1351   Encoder& operator=(const Encoder&) = delete;
1352 };
1353
1354 };  // namespace mozilla
1355
1356 #endif  // mozilla_Encoding_h