intl/Encoding.h

   1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
   2 // file at the top-level directory of this distribution.
   3 //
   4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   7 // option. This file may not be copied, modified, or distributed
   8 // except according to those terms.
   9
  10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
  11 // "top-level directory" in the above notice refers to
  12 // third_party/rust/encoding_c/.
  13
  14 #ifndef mozilla_Encoding_h
  15 #define mozilla_Encoding_h
  16
  17 #include "mozilla/CheckedInt.h"
  18 #include "mozilla/Maybe.h"
  19 #include "mozilla/NotNull.h"
  20 #include "mozilla/Span.h"
  21 #include "nsString.h"
  22
  23 #include <tuple>
  24
  25 namespace mozilla {
  26 class Encoding;
  27 class Decoder;
  28 class Encoder;
  29 };  // namespace mozilla
  30
  31 #define ENCODING_RS_ENCODING mozilla::Encoding
  32 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
  33   mozilla::NotNull<const mozilla::Encoding*>
  34 #define ENCODING_RS_ENCODER mozilla::Encoder
  35 #define ENCODING_RS_DECODER mozilla::Decoder
  36
  37 #include "encoding_rs.h"
  38
  39 extern "C" {
  40
  41 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
  42                                              uint8_t const* src, size_t src_len,
  43                                              nsAString* dst);
  44
  45 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
  46     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  47     nsAString* dst);
  48
  49 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
  50     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  51     nsAString* dst);
  52
  53 nsresult
  54 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
  55     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  56     nsAString* dst);
  57
  58 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
  59                                             char16_t const* src, size_t src_len,
  60                                             nsACString* dst);
  61
  62 nsresult mozilla_encoding_decode_to_nscstring(
  63     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  64
  65 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
  66     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  67
  68 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
  69     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  70
  71 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
  72     mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len,
  73     nsACString* dst, size_t already_validated);
  74
  75 nsresult
  76 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
  77     mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst);
  78
  79 nsresult mozilla_encoding_encode_from_nscstring(
  80     mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst);
  81
  82 }  // extern "C"
  83
  84 namespace mozilla {
  85
  86 /**
  87  * Return value from `Decoder`/`Encoder` to indicate that input
  88  * was exhausted.
  89  */
  90 constexpr uint32_t kInputEmpty = INPUT_EMPTY;
  91
  92 /**
  93  * Return value from `Decoder`/`Encoder` to indicate that output
  94  * space was insufficient.
  95  */
  96 constexpr uint32_t kOutputFull = OUTPUT_FULL;
  97
  98 /**
  99  * An encoding as defined in the Encoding Standard
 100  * (https://encoding.spec.whatwg.org/).
 101  *
 102  * See https://docs.rs/encoding_rs/ for the Rust API docs.
 103  *
 104  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
 105  * sequence and, in most cases, vice versa. Each encoding has a name, an output
 106  * encoding, and one or more labels.
 107  *
 108  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
 109  * encoding in formats and protocols. The _name_ of the encoding is the
 110  * preferred label in the case appropriate for returning from the
 111  * `characterSet` property of the `Document` DOM interface, except for
 112  * the replacement encoding whose name is not one of its labels.
 113  *
 114  * The _output encoding_ is the encoding used for form submission and URL
 115  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 116  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 117  * encodings.
 118  *
 119  * # Streaming vs. Non-Streaming
 120  *
 121  * When you have the entire input in a single buffer, you can use the
 122  * methods `Decode()`, `DecodeWithBOMRemoval()`,
 123  * `DecodeWithoutBOMHandling()`,
 124  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 125  * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
 126  * `NewEncoder()` methods), these methods perform heap allocations. Use the
 127  * `Decoder` and `Encoder` objects when your input is split into multiple
 128  * buffers or when you want to control the allocation of the output buffers.
 129  *
 130  * # Instances
 131  *
 132  * All instances of `Encoding` are statically allocated and have the process's
 133  * lifetime. There is precisely one unique `Encoding` instance for each
 134  * encoding defined in the Encoding Standard.
 135  *
 136  * To obtain a reference to a particular encoding whose identity you know at
 137  * compile time, use a `static` that refers to encoding. There is a `static`
 138  * for each encoding. The `static`s are named in all caps with hyphens
 139  * replaced with underscores and with `_ENCODING` appended to the
 140  * name. For example, if you know at compile time that you will want to
 141  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
 142  *
 143  * If you don't know what encoding you need at compile time and need to
 144  * dynamically get an encoding by label, use `Encoding::ForLabel()`.
 145  *
 146  * Pointers to `Encoding` can be compared with `==` to check for the sameness
 147  * of two encodings.
 148  *
 149  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
 150  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
 151  * `const mozilla::Encoding*` in the C signature and
 152  * `*const encoding_rs::Encoding` in the corresponding Rust signature.
 153  */
 154 class Encoding final {
 155  public:
 156   /**
 157    * Implements the _get an encoding_ algorithm
 158    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
 159    *
 160    * If, after ASCII-lowercasing and removing leading and trailing
 161    * whitespace, the argument matches a label defined in the Encoding
 162    * Standard, `const Encoding*` representing the corresponding
 163    * encoding is returned. If there is no match, `nullptr` is returned.
 164    *
 165    * This is the right method to use if the action upon the method returning
 166    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
 167    * instead. When the action upon the method returning `nullptr` is not to
 168    * proceed with a fallback but to refuse processing,
 169    * `ForLabelNoReplacement()` is more appropriate.
 170    */
 171   static inline const Encoding* ForLabel(Span<const char> aLabel) {
 172     const auto* bytes = reinterpret_cast<const uint8_t*>(aLabel.Elements());
 173     return encoding_for_label(bytes, aLabel.Length());
 174   }
 175
 176   /**
 177    * `nsAString` argument version. See above for docs.
 178    */
 179   static inline const Encoding* ForLabel(const nsAString& aLabel) {
 180     return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));  // UTF-8 temporary feeds the other ForLabel() overload
 181   }
 182
 183   /**
 184    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
 185    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
 186    *
 187    * This method is useful in scenarios where a fatal error is required
 188    * upon invalid label, because in those cases the caller typically wishes
 189    * to treat the labels that map to the replacement encoding as fatal
 190    * errors, too.
 191    *
 192    * It is not OK to use this method when the action upon the method returning
 193    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
 194    * such a case, the `ForLabel()` method should be used instead in order to
 195    * avoid unsafe fallback for labels that `ForLabel()` maps to
 196    * `REPLACEMENT_ENCODING`.
 197    */
 198   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
 199     const auto* bytes = reinterpret_cast<const uint8_t*>(aLabel.Elements());
 200     return encoding_for_label_no_replacement(bytes, aLabel.Length());
 201   }
 202
 203   /**
 204    * `nsAString` argument version. See above for docs.
 205    */
 206   static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
 207     return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));  // UTF-8 temporary feeds the other overload
 208   }
 209
 210   /**
 211    * Performs non-incremental BOM sniffing.
 212    *
 213    * The argument must either be a buffer representing the entire input
 214    * stream (non-streaming case) or a buffer representing at least the first
 215    * three bytes of the input stream (streaming case).
 216    *
 217    * Returns `{UTF_8_ENCODING, 3}`,
 218    * `{UTF_16LE_ENCODING, 2}` or
 219    * `{UTF_16BE_ENCODING, 2}` if the argument starts with the
 220    * UTF-8, UTF-16LE or UTF-16BE BOM or `{nullptr, 0}` otherwise.
 221    */
 222   static inline std::tuple<const Encoding*, size_t> ForBOM(
 223       Span<const uint8_t> aBuffer) {
 224     size_t bomLength = aBuffer.Length();  // in: buffer size; out: BOM size
 225     const Encoding* sniffed = encoding_for_bom(aBuffer.Elements(), &bomLength);
 226     return std::make_tuple(sniffed, bomLength);
 227   }
 228
 229   /**
 230    * Writes the name of this encoding into `aName`.
 231    *
 232    * This name is appropriate to return as-is from the DOM
 233    * `document.characterSet` property.
 234    */
 235   inline void Name(nsACString& aName) const {
 236     aName.SetLength(ENCODING_NAME_MAX_LENGTH);  // room for the longest name
 237     auto* nameBuffer = reinterpret_cast<uint8_t*>(aName.BeginWriting());
 238     const size_t written = encoding_name(this, nameBuffer);
 239     aName.SetLength(written);  // truncation in the 64-bit case is OK
 240   }
 241
 242   /**
 243    * Checks whether the _output encoding_ of this encoding can encode every
 244    * Unicode code point. (Only true if the output encoding is UTF-8.)
 245    */
 246   inline bool CanEncodeEverything() const {
 247     return encoding_can_encode_everything(this);  // callee not declared here; presumably from encoding_rs.h
 248   }
 249
 250   /**
 251    * Checks whether this encoding maps one byte to one Basic Multilingual
 252    * Plane code point (i.e. byte length equals decoded UTF-16 length) and
 253    * vice versa (for mappable characters).
 254    *
 255    * `true` iff this encoding is on the list of Legacy single-byte
 256    * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
 257    * in the spec or x-user-defined.
 258    */
 259   inline bool IsSingleByte() const { return encoding_is_single_byte(this); }  // thin forwarder; callee presumably from encoding_rs.h
 260
 261   /**
 262    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
 263    * U+0000...U+007F and vice versa.
 264    */
 265   inline bool IsAsciiCompatible() const {
 266     return encoding_is_ascii_compatible(this);  // `this` is handed over as the encoding pointer
 267   }
 268
 269   /**
 270    * Checks whether this is a Japanese legacy encoding.
 271    */
 272   inline bool IsJapaneseLegacy() const {
 273     if (this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING) return true;
 274     return this == ISO_2022_JP_ENCODING;
 275   }
 276
 277   /**
 278    * Returns the _output encoding_ of this encoding. This is UTF-8 for
 279    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
 280    */
 281   inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
 282     return WrapNotNull(encoding_output_encoding(this));  // wrap the callee's pointer to match the NotNull return type
 283   }
 284
 285   /**
 286    * Decode complete input to `nsACString` _with BOM sniffing_ and with
 287    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 288    * entire input is available as a single buffer (i.e. the end of the
 289    * buffer marks the end of the stream).
 290    *
 291    * This method implements the (non-streaming version of) the
 292    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 293    *
 294    * The second item in the returned tuple is the encoding that was actually
 295    * used (which may differ from this encoding thanks to BOM sniffing).
 296    *
 297    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 298    * if there were malformed sequences (that were replaced with the
 299    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 300    * tuple.
 301    *
 302    * The backing buffer of the string isn't copied if the input buffer
 303    * is heap-allocated and decoding from UTF-8 and the input is valid
 304    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 305    * the input is valid ASCII or decoding from ISO-2022-JP and the
 306    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 307    * the same string as both arguments.
 308    *
 309    * _Note:_ It is wrong to use this when the input buffer represents only
 310    * a segment of the input instead of the whole input. Use `NewDecoder()`
 311    * when decoding segmented input.
 312    */
 313   inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 314       const nsACString& aBytes, nsACString& aOut) const {
 315     const Encoding* used = this;  // in/out: updated by BOM sniffing
 316     if (&aBytes != &aOut) {
 317       const nsresult rv =
 318           mozilla_encoding_decode_to_nscstring(&used, &aBytes, &aOut);
 319       return {rv, WrapNotNull(used)};
 320     }
 321     // Aliased input/output: decode from a private copy of the input.
 322     nsAutoCString inputCopy(aBytes);
 323     const nsresult rv =
 324         mozilla_encoding_decode_to_nscstring(&used, &inputCopy, &aOut);
 325     return {rv, WrapNotNull(used)};
 326   }
 327
 328   /**
 329    * Decode complete input to `nsAString` _with BOM sniffing_ and with
 330    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 331    * entire input is available as a single buffer (i.e. the end of the
 332    * buffer marks the end of the stream).
 333    *
 334    * This method implements the (non-streaming version of) the
 335    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
 336    *
 337    * The second item in the returned tuple is the encoding that was actually
 338    * used (which may differ from this encoding thanks to BOM sniffing).
 339    *
 340    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 341    * if there were malformed sequences (that were replaced with the
 342    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
 343    * tuple.
 344    *
 345    * _Note:_ It is wrong to use this when the input buffer represents only
 346    * a segment of the input instead of the whole input. Use `NewDecoder()`
 347    * when decoding segmented input.
 348    */
 349   inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
 350       Span<const uint8_t> aBytes, nsAString& aOut) const {
 351     const Encoding* used = this;  // in/out: updated by BOM sniffing
 352     const nsresult status = mozilla_encoding_decode_to_nsstring(
 353         &used, aBytes.Elements(), aBytes.Length(), &aOut);
 354     return std::make_tuple(status, WrapNotNull(used));
 355   }
 356
 357   /**
 358    * Decode complete input to `nsACString` _with BOM removal_ and with
 359    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 360    * entire input is available as a single buffer (i.e. the end of the
 361    * buffer marks the end of the stream).
 362    *
 363    * When invoked on `UTF_8`, this method implements the (non-streaming
 364    * version of) the _UTF-8 decode_
 365    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 366    *
 367    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 368    * if there were malformed sequences (that were replaced with the
 369    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 370    *
 371    * The backing buffer of the string isn't copied if the input buffer
 372    * is heap-allocated and decoding from UTF-8 and the input is valid
 373    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
 374    * the input is valid ASCII or decoding from ISO-2022-JP and the
 375    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
 376    * the same string as both arguments.
 377    *
 378    * _Note:_ It is wrong to use this when the input buffer represents only
 379    * a segment of the input instead of the whole input. Use
 380    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 381    */
 382   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
 383                                        nsACString& aOut) const {
 384     if (&aBytes != &aOut) {
 385       // Distinct strings: decode straight from the caller's input.
 386       return mozilla_encoding_decode_to_nscstring_with_bom_removal(
 387           this, &aBytes, &aOut);
 388     }
 389     // Aliased input/output: decode from a private copy of the input.
 390     nsAutoCString inputCopy(aBytes);
 391     return mozilla_encoding_decode_to_nscstring_with_bom_removal(
 392         this, &inputCopy, &aOut);
 393   }
 394
 395   /**
 396    * Decode complete input to `nsAString` _with BOM removal_ and with
 397    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
 398    * entire input is available as a single buffer (i.e. the end of the
 399    * buffer marks the end of the stream).
 400    *
 401    * When invoked on `UTF_8`, this method implements the (non-streaming
 402    * version of) the _UTF-8 decode_
 403    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
 404    *
 405    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 406    * if there were malformed sequences (that were replaced with the
 407    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 408    *
 409    * _Note:_ It is wrong to use this when the input buffer represents only
 410    * a segment of the input instead of the whole input. Use
 411    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
 412    */
 413   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
 414                                        nsAString& aOut) const {
 415     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
 416         this, aBytes.Elements(), aBytes.Length(), &aOut);  // args: encoding, src, src_len, dst
 417   }
 418
 419   /**
 420    * Decode complete input to `nsACString` _without BOM handling_ and
 421    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 422    * the entire input is available as a single buffer (i.e. the end of the
 423    * buffer marks the end of the stream).
 424    *
 425    * When invoked on `UTF_8`, this method implements the (non-streaming
 426    * version of) the _UTF-8 decode without BOM_
 427    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 428    *
 429    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 430    * if there were malformed sequences (that were replaced with the
 431    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 432    *
 433    * The backing buffer of the string isn't copied if the input buffer
 434    * is heap-allocated and decoding from UTF-8 and the input is valid
 435    * UTF-8, decoding from an ASCII-compatible encoding and the input
 436    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 437    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 438    * as both arguments.
 439    *
 440    * _Note:_ It is wrong to use this when the input buffer represents only
 441    * a segment of the input instead of the whole input. Use
 442    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 443    */
 444   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
 445                                            nsACString& aOut) const {
 446     if (&aBytes != &aOut) {
 447       // Distinct strings: decode straight from the caller's input.
 448       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 449           this, &aBytes, &aOut);
 450     }
 451     // Aliased input/output: decode from a private copy of the input.
 452     nsAutoCString inputCopy(aBytes);
 453     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
 454         this, &inputCopy, &aOut);
 455   }
 456
 457   /**
 458    * Decode complete input to `nsAString` _without BOM handling_ and
 459    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 460    * the entire input is available as a single buffer (i.e. the end of the
 461    * buffer marks the end of the stream).
 462    *
 463    * When invoked on `UTF_8`, this method implements the (non-streaming
 464    * version of) the _UTF-8 decode without BOM_
 465    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 466    *
 467    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 468    * if there were malformed sequences (that were replaced with the
 469    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 470    *
 471    * _Note:_ It is wrong to use this when the input buffer represents only
 472    * a segment of the input instead of the whole input. Use
 473    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 474    */
 475   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 476                                            nsAString& aOut) const {
 477     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
 478         this, aBytes.Elements(), aBytes.Length(), &aOut);  // args: encoding, src, src_len, dst
 479   }
 480
 481   /**
 482    * Decode complete input to `nsACString` _without BOM handling_ and
 483    * _with malformed sequences treated as fatal_ when the entire input is
 484    * available as a single buffer (i.e. the end of the buffer marks the end
 485    * of the stream).
 486    *
 487    * When invoked on `UTF_8`, this method implements the (non-streaming
 488    * version of) the _UTF-8 decode without BOM or fail_
 489    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 490    * spec concept.
 491    *
 492    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 493    * if a malformed sequence was encountered and `NS_OK` otherwise.
 494    *
 495    * The backing buffer of the string isn't copied if the input buffer
 496    * is heap-allocated and decoding from UTF-8 and the input is valid
 497    * UTF-8, decoding from an ASCII-compatible encoding and the input
 498    * is valid ASCII or decoding from ISO-2022-JP and the input stays
 499    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 500    * as both arguments.
 501    *
 502    * _Note:_ It is wrong to use this when the input buffer represents only
 503    * a segment of the input instead of the whole input. Use
 504    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 505    */
 506   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 507       const nsACString& aBytes, nsACString& aOut) const {
 508     if (&aBytes != &aOut) {
 509       // Distinct strings: decode straight from the caller's input.
 510       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 511           this, &aBytes, &aOut);
 512     }
 513     // Aliased input/output: decode from a private copy of the input.
 514     nsAutoCString inputCopy(aBytes);
 515     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
 516         this, &inputCopy, &aOut);
 517   }
 518
 519   /**
 520    * Decode complete input to `nsACString` _without BOM handling_ and
 521    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
 522    * the entire input is available as a single buffer (i.e. the end of the
 523    * buffer marks the end of the stream) _asserting that a number of bytes
 524    * from the start are already known to be valid UTF-8_.
 525    *
 526    * The use case for this method is avoiding copying when dealing with
 527    * input that has a UTF-8 BOM. _When in doubt, do not use this method._
 528    *
 529    * When invoked on `UTF_8`, this method implements the (non-streaming
 530    * version of) the _UTF-8 decode without BOM_
 531    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
 532    *
 533    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
 534    * if there were malformed sequences (that were replaced with the
 535    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
 536    *
 537    * _Note:_ It is wrong to use this when the input buffer represents only
 538    * a segment of the input instead of the whole input. Use
 539    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 540    *
 541    * # Safety
 542    *
 543    * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
 544    * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
 545    */
 546   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
 547                                            nsACString& aOut,
 548                                            size_t aAlreadyValidated) const {
 549     return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
 550         this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);  // args: encoding, src, src_len, dst, already_validated
 551   }
 552
 553   /**
 554    * Decode complete input to `nsAString` _without BOM handling_ and
 555    * _with malformed sequences treated as fatal_ when the entire input is
 556    * available as a single buffer (i.e. the end of the buffer marks the end
 557    * of the stream).
 558    *
 559    * When invoked on `UTF_8`, this method implements the (non-streaming
 560    * version of) the _UTF-8 decode without BOM or fail_
 561    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
 562    * spec concept.
 563    *
 564    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
 565    * if a malformed sequence was encountered and `NS_OK` otherwise.
 566    *
 567    * _Note:_ It is wrong to use this when the input buffer represents only
 568    * a segment of the input instead of the whole input. Use
 569    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
 570    */
 571   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
 572       Span<const uint8_t> aBytes, nsAString& aOut) const {
 573     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
 574         this, aBytes.Elements(), aBytes.Length(), &aOut);  // args: encoding, src, src_len, dst
 575   }
 576
 577   /**
 578    * Encode complete input to `nsACString` with unmappable characters
 579    * replaced with decimal numeric character references when the entire input
 580    * is available as a single buffer (i.e. the end of the buffer marks the
 581    * end of the stream).
 582    *
 583    * This method implements the (non-streaming version of) the
 584    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 585    *
 586    * The second item in the returned tuple is the encoding that was actually
 587    * used (which may differ from this encoding thanks to some encodings
 588    * having UTF-8 as their output encoding).
 589    *
 590    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
 591    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
 592    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
 593    * replaced with numeric character references) and `NS_OK` otherwise.
 594    *
 595    * The backing buffer of the string isn't copied if the input buffer
 596    * is heap-allocated and encoding to UTF-8 and the input is valid
 597    * UTF-8, encoding to an ASCII-compatible encoding and the input
 598    * is valid ASCII or encoding to ISO-2022-JP and the input stays
 599    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
 600    * as both arguments.
 601    *
 602    * _Note:_ It is wrong to use this when the input buffer represents only
 603    * a segment of the input instead of the whole input. Use `NewEncoder()`
 604    * when encoding segmented output.
 605    */
 606   inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 607       const nsACString& aString, nsACString& aOut) const {
 608     const Encoding* used = this;  // in/out: the encoding actually used
 609     if (&aString != &aOut) {
 610       const nsresult rv =
 611           mozilla_encoding_encode_from_nscstring(&used, &aString, &aOut);
 612       return {rv, WrapNotNull(used)};
 613     }
 614     // Aliased input/output: encode from a private copy of the input.
 615     nsAutoCString inputCopy(aString);
 616     const nsresult rv =
 617         mozilla_encoding_encode_from_nscstring(&used, &inputCopy, &aOut);
 618     return {rv, WrapNotNull(used)};
 619   }
 620
 621   /**
 622    * Encode complete input to `nsACString` with unmappable characters
 623    * replaced with decimal numeric character references when the entire input
 624    * is available as a single buffer (i.e. the end of the buffer marks the
 625    * end of the stream).
 626    *
 627    * This method implements the (non-streaming version of) the
 628    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
 629    *
 630    * The second item in the returned tuple is the encoding that was actually
 631    * used (which may differ from this encoding thanks to some encodings
 632    * having UTF-8 as their output encoding).
 633    *
 634    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
 635    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
 636    * were replaced with numeric character references) and `NS_OK` otherwise.
 637    *
 638    * _Note:_ It is wrong to use this when the input buffer represents only
 639    * a segment of the input instead of the whole input. Use `NewEncoder()`
 640    * when encoding segmented output.
 641    */
 642   inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
 643       Span<const char16_t> aString, nsACString& aOut) const {
 644     const Encoding* used = this;  // in/out: the encoding actually used
 645     const nsresult status = mozilla_encoding_encode_from_utf16(
 646         &used, aString.Elements(), aString.Length(), &aOut);
 647     return std::make_tuple(status, WrapNotNull(used));
 648   }
 649
 650   /**
 651    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
 652    *
 653    * BOM sniffing may cause the returned decoder to morph into a decoder
 654    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 655    */
 656   inline UniquePtr<Decoder> NewDecoder() const {
 657     // Take ownership of the pointer returned by encoding_new_decoder().
 658     return UniquePtr<Decoder>(encoding_new_decoder(this));
 659   }
 660
 661   /**
 662    * Instantiates a new decoder for this encoding with BOM sniffing enabled
 663    * into memory occupied by a previously-instantiated decoder.
 664    *
 665    * BOM sniffing may cause the returned decoder to morph into a decoder
 666    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
 667    */
 668   inline void NewDecoderInto(Decoder& aDecoder) const {
 669     encoding_new_decoder_into(this, &aDecoder);  // hands over the caller's existing decoder object by address
 670   }
 671
 672   /**
 673    * Instantiates a new decoder for this encoding with BOM removal.
 674    *
 675    * If the input starts with bytes that are the BOM for this encoding,
 676    * those bytes are removed. However, the decoder never morphs into a
 677    * decoder for another encoding: A BOM for another encoding is treated as
 678    * (potentially malformed) input to the decoding algorithm for this
 679    * encoding.
 680    */
 681   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
 682     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
 683     return decoder;
 684   }
 685
 686   /**
 687    * Instantiates a new decoder for this encoding with BOM removal
 688    * into memory occupied by a previously-instantiated decoder.
 689    *
 690    * If the input starts with bytes that are the BOM for this encoding,
 691    * those bytes are removed. However, the decoder never morphs into a
 692    * decoder for another encoding: A BOM for another encoding is treated as
 693    * (potentially malformed) input to the decoding algorithm for this
 694    * encoding.
 695    */
 696   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
 697     encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
 698   }
 699
 700   /**
 701    * Instantiates a new decoder for this encoding with BOM handling disabled.
 702    *
 703    * If the input starts with bytes that look like a BOM, those bytes are
 704    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 705    * for another encoding.)
 706    *
 707    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 708    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
 709    * instead of this method to cause the BOM to be removed.
 710    */
 711   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
 712     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
 713     return decoder;
 714   }
 715
 716   /**
 717    * Instantiates a new decoder for this encoding with BOM handling disabled
 718    * into memory occupied by a previously-instantiated decoder.
 719    *
 720    * If the input starts with bytes that look like a BOM, those bytes are
 721    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
 722    * for another encoding.)
 723    *
 724    * _Note:_ If the caller has performed BOM sniffing on its own but has not
 725    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
 726    * instead of this method to cause the BOM to be removed.
 727    */
 728   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
 729     encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
 730   }
 731
 732   /**
 733    * Instantiates a new encoder for the output encoding of this encoding.
 734    */
 735   inline UniquePtr<Encoder> NewEncoder() const {
 736     UniquePtr<Encoder> encoder(encoding_new_encoder(this));
 737     return encoder;
 738   }
 739
 740   /**
 741    * Instantiates a new encoder for the output encoding of this encoding
 742    * into memory occupied by a previously-instantiated encoder.
 743    */
 744   inline void NewEncoderInto(Encoder& aEncoder) const {
 745     encoding_new_encoder_into(this, &aEncoder);
 746   }
 747
 748   /**
 749    * Validates UTF-8.
 750    *
 751    * Returns the index of the first byte that makes the input malformed as
 752    * UTF-8 or the length of the input if the input is entirely valid.
 753    */
 754   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
 755     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 756   }
 757
 758   /**
 759    * Validates ASCII.
 760    *
 761    * Returns the index of the first byte that makes the input malformed as
 762    * ASCII or the length of the input if the input is entirely valid.
 763    */
 764   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 765     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
 766   }
 767
 768   /**
 769    * Validates ISO-2022-JP ASCII-state data.
 770    *
 771    * Returns the index of the first byte that makes the input not
 772    * representable in the ASCII state of ISO-2022-JP or the length of the
 773    * input if the input is entirely representable in the ASCII state of
 774    * ISO-2022-JP.
 775    */
 776   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
 777     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
 778                                                   aBuffer.Length());
 779   }
 780
 781  private:
 782   Encoding() = delete;
 783   Encoding(const Encoding&) = delete;
 784   Encoding& operator=(const Encoding&) = delete;
 785   ~Encoding() = delete;
 786 };
 787
 788 /**
 789  * A converter that decodes a byte stream into Unicode according to a
 790  * character encoding in a streaming (incremental) manner.
 791  *
 792  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
 793  * buffer `aDst` both of which are caller-allocated. There are variants for
 794  * both UTF-8 and UTF-16 output buffers.
 795  *
 796  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
 797  * into `aDst` until one of the following three things happens:
 798  *
 799  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
 800  *    variants only).
 801  *
 802  * 2. The output buffer has been filled so near capacity that the decoder
 803  *    cannot be sure that processing an additional byte of input wouldn't
 804  *    cause so much output that the output buffer would overflow.
 805  *
 806  * 3. All the input bytes have been processed.
 807  *
 * The `Decode*` method then returns a tuple of a status indicating which one
 809  * of the three reasons to return happened, how many input bytes were read,
 810  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 811  * when decoding to UTF-16) were written, and in the case of the
 812  * variants performing replacement, a boolean indicating whether an error was
 813  * replaced with the REPLACEMENT CHARACTER during the call.
 814  *
 815  * The number of bytes "written" is what's logically written. Garbage may be
 816  * written in the output buffer beyond the point logically written to.
 817  *
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 822  *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 829  *
 830  * In the case of methods whose name does not end with
 831  * `*WithoutReplacement`, malformed sequences are automatically replaced
 832  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 833  * return early.
 834  *
 835  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 836  * space. When decoding to UTF-16, the output buffer must have at least two
 837  * UTF-16 code units (`char16_t`) of space.
 838  *
 839  * When decoding to UTF-8 without replacement, the methods are guaranteed
 840  * not to return indicating that more output space is needed if the length
 841  * of the output buffer is at least the length returned by
 842  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 843  * with replacement, the length of the output buffer that guarantees the
 844  * methods not to return indicating that more output space is needed is given
 845  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 846  * or without replacement, the length of the output buffer that guarantees
 847  * the methods not to return indicating that more output space is needed is
 848  * given by `MaxUTF16BufferLength()`.
 849  *
 850  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 851  * and the output after each `Decode*` call is guaranteed to consist of
 852  * complete characters. (I.e. the code unit sequence for the last character is
 853  * guaranteed not to be split across output buffers.)
 854  *
 855  * The boolean argument `aLast` indicates that the end of the stream is reached
 856  * when all the bytes in `aSrc` have been consumed.
 857  *
 858  * A `Decoder` object can be used to incrementally decode a byte stream.
 859  *
 860  * During the processing of a single stream, the caller must call `Decode*`
 861  * zero or more times with `aLast` set to `false` and then call `Decode*` at
 862  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
 863  * the processing of the stream has ended. Otherwise, the caller must call
 864  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
 865  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 866  *
 867  * Once the stream has ended, the `Decoder` object must not be used anymore.
 868  * That is, you need to create another one to process another stream.
 869  *
 870  * When the decoder returns `kOutputFull` or the decoder returns a malformed
 871  * result and the caller does not wish to treat it as a fatal error, the input
 872  * buffer `aSrc` may not have been completely consumed. In that case, the caller
 873  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
 874  * call.
 875  *
 876  * # Infinite loops
 877  *
 878  * When converting with a fixed-size output buffer whose size is too small to
 879  * accommodate one character of output, an infinite loop ensues. When
 880  * converting with a fixed-size output buffer, it generally makes sense to
 881  * make the buffer fairly large (e.g. couple of kilobytes).
 882  */
 883 class Decoder final {
 884  public:
 885   ~Decoder() = default;
 886   static void operator delete(void* aDecoder) {
 887     decoder_free(reinterpret_cast<Decoder*>(aDecoder));
 888   }
 889
 890   /**
 891    * The `Encoding` this `Decoder` is for.
 892    *
 893    * BOM sniffing can change the return value of this method during the life
 894    * of the decoder.
 895    */
 896   inline NotNull<const mozilla::Encoding*> Encoding() const {
 897     return WrapNotNull(decoder_encoding(this));
 898   }
 899
 900   /**
 901    * Query the worst-case UTF-8 output size _with replacement_.
 902    *
 903    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 904    * that will not overflow given the current state of the decoder and
 905    * `aByteLength` number of additional input bytes when decoding with
 906    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
 907    * sequence.
 908    */
 909   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
 910     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
 911     if (max.value() == std::numeric_limits<size_t>::max()) {
 912       // Mark invalid by overflowing
 913       max++;
 914       MOZ_ASSERT(!max.isValid());
 915     }
 916     return max;
 917   }
 918
 919   /**
 920    * Query the worst-case UTF-8 output size _without replacement_.
 921    *
 922    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
 923    * that will not overflow given the current state of the decoder and
 924    * `aByteLength` number of additional input bytes when decoding without
 925    * replacement error handling.
 926    *
 927    * Note that this value may be too small for the `WithReplacement` case.
 928    * Use `MaxUTF8BufferLength()` for that case.
 929    */
 930   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
 931       size_t aByteLength) const {
 932     CheckedInt<size_t> max(
 933         decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
 934     if (max.value() == std::numeric_limits<size_t>::max()) {
 935       // Mark invalid by overflowing
 936       max++;
 937       MOZ_ASSERT(!max.isValid());
 938     }
 939     return max;
 940   }
 941
 942   /**
 943    * Incrementally decode a byte stream into UTF-8 with malformed sequences
 944    * replaced with the REPLACEMENT CHARACTER.
 945    *
 946    * See the documentation of the class for documentation for `Decode*`
 947    * methods collectively.
 948    */
 949   inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
 950       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 951     size_t srcRead = aSrc.Length();
 952     size_t dstWritten = aDst.Length();
 953     bool hadReplacements;
 954     uint32_t result =
 955         decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
 956                                &dstWritten, aLast, &hadReplacements);
 957     return {result, srcRead, dstWritten, hadReplacements};
 958   }
 959
 960   /**
 961    * Incrementally decode a byte stream into UTF-8 _without replacement_.
 962    *
 963    * See the documentation of the class for documentation for `Decode*`
 964    * methods collectively.
 965    */
 966   inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
 967       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
 968     size_t srcRead = aSrc.Length();
 969     size_t dstWritten = aDst.Length();
 970     uint32_t result = decoder_decode_to_utf8_without_replacement(
 971         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
 972     return {result, srcRead, dstWritten};
 973   }
 974
 975   /**
 976    * Query the worst-case UTF-16 output size (with or without replacement).
 977    *
 978    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
 979    * that will not overflow given the current state of the decoder and
 980    * `aByteLength` number of additional input bytes.
 981    *
 982    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
 983    * return value of this method applies also in the
 984    * `_without_replacement` case.
 985    */
 986   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
 987     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
 988     if (max.value() == std::numeric_limits<size_t>::max()) {
 989       // Mark invalid by overflowing
 990       max++;
 991       MOZ_ASSERT(!max.isValid());
 992     }
 993     return max;
 994   }
 995
 996   /**
 997    * Incrementally decode a byte stream into UTF-16 with malformed sequences
 998    * replaced with the REPLACEMENT CHARACTER.
 999    *
1000    * See the documentation of the class for documentation for `Decode*`
1001    * methods collectively.
1002    */
1003   inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
1004       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1005     size_t srcRead = aSrc.Length();
1006     size_t dstWritten = aDst.Length();
1007     bool hadReplacements;
1008     uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
1009                                               aDst.Elements(), &dstWritten,
1010                                               aLast, &hadReplacements);
1011     return {result, srcRead, dstWritten, hadReplacements};
1012   }
1013
1014   /**
1015    * Incrementally decode a byte stream into UTF-16 _without replacement_.
1016    *
1017    * See the documentation of the class for documentation for `Decode*`
1018    * methods collectively.
1019    */
1020   inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1021       Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
1022     size_t srcRead = aSrc.Length();
1023     size_t dstWritten = aDst.Length();
1024     uint32_t result = decoder_decode_to_utf16_without_replacement(
1025         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1026     return {result, srcRead, dstWritten};
1027   }
1028
1029   /**
1030    * Checks for compatibility with storing Unicode scalar values as unsigned
1031    * bytes taking into account the state of the decoder.
1032    *
1033    * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
1034    * including waiting for the BOM, or if the encoding is never
1035    * Latin1-byte-compatible.
1036    *
1037    * Otherwise returns the index of the first byte whose unsigned value doesn't
1038    * directly correspond to the decoded Unicode scalar value, or the length
1039    * of the input if all bytes in the input decode directly to scalar values
1040    * corresponding to the unsigned byte values.
1041    *
1042    * Does not change the state of the decoder.
1043    *
1044    * Do not use this unless you are supporting SpiderMonkey-style string
1045    * storage optimizations.
1046    */
1047   inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
1048       Span<const uint8_t> aBuffer) const {
1049     size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
1050                                                        aBuffer.Length());
1051     if (upTo == std::numeric_limits<size_t>::max()) {
1052       return mozilla::Nothing();
1053     }
1054     return mozilla::Some(upTo);
1055   }
1056
1057  private:
1058   Decoder() = delete;
1059   Decoder(const Decoder&) = delete;
1060   Decoder& operator=(const Decoder&) = delete;
1061 };
1062
1063 /**
1064  * A converter that encodes a Unicode stream into bytes according to a
1065  * character encoding in a streaming (incremental) manner.
1066  *
1067  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1068  * buffer `aDst` both of which are caller-allocated. There are variants for
1069  * both UTF-8 and UTF-16 input buffers.
1070  *
 * An `Encode*` method encodes characters from `aSrc` into bytes
 * stored into `aDst` until one of the following three things happens:
1073  *
1074  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1075  *    only).
1076  *
 * 2. The output buffer has been filled so near capacity that the encoder
1078  *    cannot be sure that processing an additional character of input wouldn't
1079  *    cause so much output that the output buffer would overflow.
1080  *
1081  * 3. All the input characters have been processed.
1082  *
 * The `Encode*` method then returns a tuple of a status indicating which one
1084  * of the three reasons to return happened, how many input code units (`uint8_t`
1085  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1086  * how many output bytes were written, and in the case of the variants that
1087  * perform replacement, a boolean indicating whether an unmappable
1088  * character was replaced with a numeric character reference during the call.
1089  *
1090  * The number of bytes "written" is what's logically written. Garbage may be
1091  * written in the output buffer beyond the point logically written to.
1092  *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
1097  *
1098  * In the case of methods whose name does not end with
1099  * `*WithoutReplacement`, unmappable characters are automatically replaced
1100  * with the corresponding numeric character references and unmappable
1101  * characters do not cause the methods to return early.
1102  *
1103  * When encoding from UTF-8 without replacement, the methods are guaranteed
1104  * not to return indicating that more output space is needed if the length
1105  * of the output buffer is at least the length returned by
1106  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1107  * UTF-8 with replacement, the length of the output buffer that guarantees the
1108  * methods not to return indicating that more output space is needed in the
1109  * absence of unmappable characters is given by
1110  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1111  * UTF-16 without replacement, the methods are guaranteed not to return
1112  * indicating that more output space is needed if the length of the output
1113  * buffer is at least the length returned by
1114  * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1115  * from UTF-16 with replacement, the the length of the output buffer that
1116  * guarantees the methods not to return indicating that more output space is
1117  * needed in the absence of unmappable characters is given by
1118  * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1119  * When encoding with replacement, applications are not expected to size the
1120  * buffer for the worst case ahead of time but to resize the buffer if there
1121  * are unmappable characters. This is why max length queries are only available
1122  * for the case where there are no unmappable characters.
1123  *
1124  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1125  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1126  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1127  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1128  * surrogate pairs are not split across input buffer boundaries.
1129  *
1130  * After an `Encode*` call returns, the output produced so far, taken as a
1131  * whole from the start of the stream, is guaranteed to consist of a valid
1132  * byte sequence in the target encoding. (I.e. the code unit sequence for a
1133  * character is guaranteed not to be split across output buffers. However, due
1134  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1135  * from the start for it to be valid. For other encodings, the validity holds
1136  * on a per-output buffer basis.)
1137  *
1138  * The boolean argument `aLast` indicates that the end of the stream is reached
1139  * when all the characters in `aSrc` have been consumed. This argument is needed
1140  * for ISO-2022-JP and is ignored for other encodings.
1141  *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
1143  *
1144  * During the processing of a single stream, the caller must call `Encode*`
1145  * zero or more times with `aLast` set to `false` and then call `Encode*` at
1146  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1147  * the processing of the stream has ended. Otherwise, the caller must call
1148  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1149  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1150  *
1151  * Once the stream has ended, the `Encoder` object must not be used anymore.
1152  * That is, you need to create another one to process another stream.
1153  *
1154  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1155  * result and the caller does not wish to treat it as a fatal error, the input
1156  * buffer `aSrc` may not have been completely consumed. In that case, the caller
1157  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1158  * call.
1159  *
1160  * # Infinite loops
1161  *
1162  * When converting with a fixed-size output buffer whose size is too small to
1163  * accommodate one character of output, an infinite loop ensues. When
1164  * converting with a fixed-size output buffer, it generally makes sense to
1165  * make the buffer fairly large (e.g. couple of kilobytes).
1166  */
1167 class Encoder final {
1168  public:
1169   ~Encoder() = default;
1170
1171   static void operator delete(void* aEncoder) {
1172     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1173   }
1174
1175   /**
1176    * The `Encoding` this `Encoder` is for.
1177    */
1178   inline NotNull<const mozilla::Encoding*> Encoding() const {
1179     return WrapNotNull(encoder_encoding(this));
1180   }
1181
1182   /**
1183    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1184    * ASCII state and `false` otherwise.
1185    */
1186   inline bool HasPendingState() const {
1187     return encoder_has_pending_state(this);
1188   }
1189
1190   /**
1191    * Query the worst-case output size when encoding from UTF-8 with
1192    * replacement.
1193    *
1194    * Returns the size of the output buffer in bytes that will not overflow
1195    * given the current state of the encoder and `aByteLength` number of
1196    * additional input code units if there are no unmappable characters in
1197    * the input.
1198    */
1199   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1200       size_t aByteLength) const {
1201     CheckedInt<size_t> max(
1202         encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
1203                                                               aByteLength));
1204     if (max.value() == std::numeric_limits<size_t>::max()) {
1205       // Mark invalid by overflowing
1206       max++;
1207       MOZ_ASSERT(!max.isValid());
1208     }
1209     return max;
1210   }
1211
1212   /**
1213    * Query the worst-case output size when encoding from UTF-8 without
1214    * replacement.
1215    *
1216    * Returns the size of the output buffer in bytes that will not overflow
1217    * given the current state of the encoder and `aByteLength` number of
1218    * additional input code units.
1219    */
1220   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1221       size_t aByteLength) const {
1222     CheckedInt<size_t> max(
1223         encoder_max_buffer_length_from_utf8_without_replacement(this,
1224                                                                 aByteLength));
1225     if (max.value() == std::numeric_limits<size_t>::max()) {
1226       // Mark invalid by overflowing
1227       max++;
1228       MOZ_ASSERT(!max.isValid());
1229     }
1230     return max;
1231   }
1232
1233   /**
1234    * Incrementally encode into byte stream from UTF-8 with unmappable
1235    * characters replaced with HTML (decimal) numeric character references.
1236    *
1237    * See the documentation of the class for documentation for `Encode*`
1238    * methods collectively.
1239    *
1240    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1241    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1242    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1243    */
1244   inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
1245       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1246     size_t srcRead = aSrc.Length();
1247     size_t dstWritten = aDst.Length();
1248     bool hadReplacements;
1249     uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
1250                                                aDst.Elements(), &dstWritten,
1251                                                aLast, &hadReplacements);
1252     return {result, srcRead, dstWritten, hadReplacements};
1253   }
1254
1255   /**
1256    * Incrementally encode into byte stream from UTF-8 _without replacement_.
1257    *
1258    * See the documentation of the class for documentation for `Encode*`
1259    * methods collectively.
1260    *
1261    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1262    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1263    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1264    */
1265   inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1266       Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1267     size_t srcRead = aSrc.Length();
1268     size_t dstWritten = aDst.Length();
1269     uint32_t result = encoder_encode_from_utf8_without_replacement(
1270         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1271     return {result, srcRead, dstWritten};
1272   }
1273
1274   /**
1275    * Query the worst-case output size when encoding from UTF-16 with
1276    * replacement.
1277    *
1278    * Returns the size of the output buffer in bytes that will not overflow
1279    * given the current state of the encoder and `aU16Length` number of
1280    * additional input code units if there are no unmappable characters in
1281    * the input.
1282    */
1283   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1284       size_t aU16Length) const {
1285     CheckedInt<size_t> max(
1286         encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
1287                                                                aU16Length));
1288     if (max.value() == std::numeric_limits<size_t>::max()) {
1289       // Mark invalid by overflowing
1290       max++;
1291       MOZ_ASSERT(!max.isValid());
1292     }
1293     return max;
1294   }
1295
1296   /**
1297    * Query the worst-case output size when encoding from UTF-16 without
1298    * replacement.
1299    *
1300    * Returns the size of the output buffer in bytes that will not overflow
1301    * given the current state of the encoder and `aU16Length` number of
1302    * additional input code units.
1303    */
1304   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1305       size_t aU16Length) const {
1306     CheckedInt<size_t> max(
1307         encoder_max_buffer_length_from_utf16_without_replacement(this,
1308                                                                  aU16Length));
1309     if (max.value() == std::numeric_limits<size_t>::max()) {
1310       // Mark invalid by overflowing
1311       max++;
1312       MOZ_ASSERT(!max.isValid());
1313     }
1314     return max;
1315   }
1316
1317   /**
1318    * Incrementally encode into byte stream from UTF-16 with unmappable
1319    * characters replaced with HTML (decimal) numeric character references.
1320    *
1321    * See the documentation of the class for documentation for `Encode*`
1322    * methods collectively.
1323    */
1324   inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
1325       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1326     size_t srcRead = aSrc.Length();
1327     size_t dstWritten = aDst.Length();
1328     bool hadReplacements;
1329     uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
1330                                                 aDst.Elements(), &dstWritten,
1331                                                 aLast, &hadReplacements);
1332     return {result, srcRead, dstWritten, hadReplacements};
1333   }
1334
1335   /**
1336    * Incrementally encode into byte stream from UTF-16 _without replacement_.
1337    *
1338    * See the documentation of the class for documentation for `Encode*`
1339    * methods collectively.
1340    */
1341   inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1342       Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
1343     size_t srcRead = aSrc.Length();
1344     size_t dstWritten = aDst.Length();
1345     uint32_t result = encoder_encode_from_utf16_without_replacement(
1346         this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1347     return {result, srcRead, dstWritten};
1348   }
1349
1350  private:
1351   Encoder() = delete;
1352   Encoder(const Encoder&) = delete;
1353   Encoder& operator=(const Encoder&) = delete;
1354 };
1355
1356 };  // namespace mozilla
1357
1358 #endif  // mozilla_Encoding_h