1 // Copyright 2018 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 // Mostly copied and pasted from
11 // third_party/rust/shift_or_euc/src/lib.rs , so
12 // "top-level directory of this distribution" above refers to
13 // third_party/rust/shift_or_euc/
15 #ifndef mozilla_JapaneseDetector_h
16 #define mozilla_JapaneseDetector_h
18 #include "mozilla/Encoding.h"
21 class JapaneseDetector
;
22 }; // namespace mozilla
24 #define SHIFT_OR_EUC_DETECTOR mozilla::JapaneseDetector
26 #include "shift_or_euc.h"
31 * A Japanese legacy encoding detector for detecting between Shift_JIS,
32 * EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the
33 * encoding is one of those.
35 * # Principle of Operation
37 * The detector is based on two observations:
39 * 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or
40 * EUC-JP, so encountering such an escape sequence (before non-ASCII has been
41 * encountered) can be taken as indication of ISO-2022-JP.
42 * 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is
43 * decoded as EUC-JP, or vice versa, the result is either an error or
44 * half-width katakana, and it's very uncommon for Japanese HTML to have
45 * a half-width katakana character before a normal kana or common kanji
46 * character. Therefore, if decoding as Shift_JIS results in an error or
47 * half-width katakana, the detector decides that the content is EUC-JP, and
52 * The detector gives the wrong answer if the text has a half-width katakana
53 * character before normal kana or common kanji. Some uncommon kanji are
54 * undecidable. (All JIS X 0208 Level 1 kanji are decidable.)
56 * The half-width katakana issue is mainly relevant for old 8-bit JIS X
57 * 0201-only text files that would decode correctly as Shift_JIS but that the
58 * detector detects as EUC-JP.
60 * The undecidable kanji issue does not realistically show up when a full
61 * document is fed to the detector, because, realistically, in a full
62 * document, there is at least one kana or common kanji. It can occur,
63 * though, if the detector is only run on a prefix of a document and the
64 * prefix only contains the title of the document. It is possible for
65 * a document title to consist entirely of undecidable kanji. (Indeed,
66 * Japanese Wikipedia has articles with such titles.) If the detector is
67 * undecided, a fallback to Shift_JIS should be used.
// C++ face of the detector exposed by "shift_or_euc.h": the
// SHIFT_OR_EUC_DETECTOR macro maps the C API's detector type onto this class,
// and every operation below forwards to a shift_or_euc_detector_* function.
// The default constructor, copy constructor and copy assignment are deleted,
// so the only way shown here to obtain an instance is Create().
69 class JapaneseDetector final
{
// Intentionally empty: releasing the detector is done by the class-specific
// operator delete below, not by the destructor.
71 ~JapaneseDetector() {}
// Class-specific deallocation: hands the pointer to
// shift_or_euc_detector_free instead of the global deallocator.
// NOTE(review): presumably this is what UniquePtr's default deleter ends up
// invoking when a detector returned by Create() goes out of scope -- confirm.
73 static void operator delete(void* aDetector
) {
74 shift_or_euc_detector_free(reinterpret_cast<JapaneseDetector
*>(aDetector
));
78 * Instantiates the detector. If `aAllow2022` is `true` the possible
79 * guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If
80 * `aAllow2022` is `false`, the possible guesses are Shift_JIS, EUC-JP,
// Factory: wraps the pointer returned by
// shift_or_euc_detector_new(aAllow2022) in a UniquePtr so the caller owns it.
83 static inline UniquePtr
<JapaneseDetector
> Create(bool aAllow2022
) {
84 UniquePtr
<JapaneseDetector
> detector(shift_or_euc_detector_new(aAllow2022
));
89 * Feeds bytes to the detector. If `aLast` is `true` the end of the stream
90 * is considered to occur immediately after the end of `aBuffer`.
91 * Otherwise, the stream is expected to continue. `aBuffer` may be empty.
93 * If you're running the detector only on a prefix of a complete
94 * document, _do not_ pass `aLast` as `true` after the prefix if the
95 * stream as a whole still contains more content.
97 * Returns `SHIFT_JIS_ENCODING` if the detector guessed
98 * Shift_JIS. Returns `EUC_JP_ENCODING` if the detector
99 * guessed EUC-JP. Returns `ISO_2022_JP_ENCODING` if the
100 * detector guessed ISO-2022-JP (only possible if `true` was passed as
101 * `aAllow2022` when instantiating the detector). Returns `nullptr` if the
102 * detector is undecided. If `nullptr` is returned even when passing `true`
103 * as `aLast`, falling back to Shift_JIS is the best guess for Web
106 * Do not call again after the method has returned non-`nullptr` or after
107 * the method has been called with `true` as `aLast`. (Asserts if the
108 * previous sentence isn't adhered to.)
// Thin forwarder: passes `this`, the span's element pointer and length, and
// `aLast` straight through to shift_or_euc_detector_feed and returns its
// result unchanged.
110 inline const mozilla::Encoding
* Feed(Span
<const uint8_t> aBuffer
,
112 return shift_or_euc_detector_feed(this, aBuffer
.Elements(),
113 aBuffer
.Length(), aLast
);
// Direct construction and copying are disabled; see Create() above for the
// supported way to get an instance.
117 JapaneseDetector() = delete;
118 JapaneseDetector(const JapaneseDetector
&) = delete;
119 JapaneseDetector
& operator=(const JapaneseDetector
&) = delete;
122 }; // namespace mozilla
124 #endif // mozilla_JapaneseDetector_h