Bug 1625482 [wpt PR 22496] - [ScrollTimeline] Do not show scrollbar to bypass flakine...
[gecko.git] / intl / EncodingDetector.h
blob6915c825912460852e97e2b07af17a85aea75d0b
1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Mostly copied and pasted from
11 // third_party/rust/chardetng/src/lib.rs , so
12 // "top-level directory of this distribution" above refers to
13 // third_party/rust/chardetng/
15 #ifndef mozilla_EncodingDetector_h
16 #define mozilla_EncodingDetector_h
18 #include "mozilla/Encoding.h"
20 namespace mozilla {
21 class EncodingDetector;
22 }; // namespace mozilla
24 #define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector
26 #include "chardetng.h"
28 namespace mozilla {
30 /**
31 * A Web browser-oriented detector for guessing what character
32 * encoding a stream of bytes is encoded in.
34 * The bytes are fed to the detector incrementally using the `feed`
35 * method. The current guess of the detector can be queried using
36 * the `guess` method. The guessing parameters are arguments to the
37 * `guess` method rather than arguments to the constructor in order
38 * to enable the application to check if the arguments affect the
39 * guessing outcome. (The specific use case is to disable UI for
40 * re-running the detector with UTF-8 allowed and the top-level
41 * domain name ignored if those arguments don't change the guess.)
43 class EncodingDetector final {
44 public:
45 ~EncodingDetector() = default;
47 static void operator delete(void* aDetector) {
48 chardetng_encoding_detector_free(
49 reinterpret_cast<EncodingDetector*>(aDetector));
52 /**
53 * Creates a new instance of the detector.
55 static inline UniquePtr<EncodingDetector> Create() {
56 UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
57 return detector;
60 /**
61 * Inform the detector of a chunk of input.
63 * The byte stream is represented as a sequence of calls to this
64 * method such that the concatenation of the arguments to this
65 * method forms the byte stream. It does not matter how the application
66 * chooses to chunk the stream. It is OK to call this method with
67 * a zero-length byte slice.
69 * The end of the stream is indicated by calling this method with
70 * `aLast` set to `true`. In that case, the end of the stream is
71 * considered to occur after the last byte of the `aBuffer` (which
72 * may be zero-length) passed in the same call. Once this method
73 * has been called with `aLast` set to `true` this method must not
74 * be called again.
76 * If you want to perform detection on just the prefix of a longer
77 * stream, do not pass `aLast=true` after the prefix if the stream
78 * actually still continues.
80 * Returns `true` if after processing `aBuffer` the stream has
81 * contained at least one non-ASCII byte and `false` if only
82 * ASCII has been seen so far.
84 * # Panics
86 * If this method has previously been called with `aLast` set to `true`.
88 inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
89 return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
90 aBuffer.Length(), aLast);
93 /**
94 * Guess the encoding given the bytes pushed to the detector so far
95 * (via `Feed()`), the top-level domain name from which the bytes were
96 * loaded, and an indication of whether to consider UTF-8 as a permissible
97 * guess.
99 * The `aTLD` argument takes the rightmost DNS label of the hostname of the
100 * host the stream was loaded from in lower-case ASCII form. That is, if
101 * the label is an internationalized top-level domain name, it must be
102 * provided in its Punycode form. If the TLD that the stream was loaded
103 * from is unavailable, an empty `Span` may be passed instead, which is
104 * equivalent to passing a `Span` for "com".
106 * If the `aAllowUTF8` argument is set to `false`, the return value of
107 * this method won't be `UTF_8_ENCODING`. When performing detection
108 * on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
109 * unless the user has taken a specific contextual action to request an
110 * override. This way, Web developers cannot start depending on UTF-8
111 * detection. Such reliance would make the Web Platform more brittle.
113 * Returns the guessed encoding.
115 * # Panics
117 * If `aTLD` contains non-ASCII, period, or upper-case letters. (The panic
118 * condition is intentionally limited to signs of failing to extract the
119 * label correctly, failing to provide it in its Punycode form, and failure
120 * to lower-case it. Full DNS label validation is intentionally not performed
121 * to avoid panics when the reality doesn't match the specs.)
123 inline mozilla::NotNull<const mozilla::Encoding*> Guess(
124 Span<const char> aTLD, bool aAllowUTF8) const {
125 return WrapNotNull(chardetng_encoding_detector_guess(
126 this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
129 private:
130 EncodingDetector() = delete;
131 EncodingDetector(const EncodingDetector&) = delete;
132 EncodingDetector& operator=(const EncodingDetector&) = delete;
135 }; // namespace mozilla
137 #endif // mozilla_EncodingDetector_h