1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Latin-1 operations (i.e. a byte is the corresponding code point).
 * (Note: this is *not* the same as the encoding of windows-1252 or
 * latin1 content on the web. In Web terms, this encoding
 * corresponds to "isomorphic decode" / "isomorphic encode" from
 * the Infra Standard.)
 */
12 #ifndef mozilla_Latin1_h
13 #define mozilla_Latin1_h
15 #include "mozilla/JsRust.h"
16 #include "mozilla/Span.h"
17 #include "mozilla/Tuple.h"
18 #include "mozilla/TypeTraits.h"
21 # include "encoding_rs_mem.h"
// Strings shorter than this many code units are handled by the inline
// loops in this header instead of the out-of-line routines.
//
// The Latin1-ness checks and the inflation/deflation functions must all
// use the same limit, otherwise optimizations that rely on them agreeing
// stop working. The value is 16 because that is the shortest length that
// inflates/deflates using SIMD.
constexpr size_t kShortStringLimitForInlinePaths = 16;
34 template <typename Char
>
35 class MakeUnsignedChar
: public MakeUnsigned
<Char
> {};
38 class MakeUnsignedChar
<char16_t
> {
40 using Type
= char16_t
;
44 class MakeUnsignedChar
<char32_t
> {
46 using Type
= char32_t
;
52 * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
55 template <typename Char
>
56 constexpr bool IsNonAsciiLatin1(Char aChar
) {
57 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
58 auto uc
= static_cast<UnsignedChar
>(aChar
);
59 return uc
>= 0x80 && uc
<= 0xFF;
65 * Returns |true| iff |aString| contains only Latin1 characters, that is,
66 * characters in the range [U+0000, U+00FF].
68 * @param aString a potentially-invalid UTF-16 string to scan
70 inline bool IsUtf16Latin1(mozilla::Span
<const char16_t
> aString
) {
71 size_t length
= aString
.Length();
72 const char16_t
* ptr
= aString
.Elements();
73 // For short strings, calling into Rust is a pessimization, and the SIMD
74 // code won't have a chance to kick in anyway.
75 // 16 is a bit larger than logically necessary for this function alone,
76 // but it's important that the limit here matches the limit used in
77 // LossyConvertUtf16toLatin1!
78 if (length
< mozilla::detail::kShortStringLimitForInlinePaths
) {
80 for (size_t i
= 0; i
< length
; i
++) {
85 return encoding_mem_is_utf16_latin1(ptr
, length
);
89 * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
92 * If you know that the argument is always absolutely guaranteed to be valid
93 * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
95 * @param aString potentially-invalid UTF-8 string to scan
97 inline bool IsUtf8Latin1(mozilla::Span
<const char> aString
) {
98 return encoding_mem_is_utf8_latin1(aString
.Elements(), aString
.Length());
102 * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
103 * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
104 * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
106 * @param aString known-valid UTF-8 string to scan
108 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span
<const char> aString
) {
109 return encoding_mem_is_str_latin1(aString
.Elements(), aString
.Length());
113 * Returns the index of first byte that starts an invalid byte
114 * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
115 * string, or the length of the string if there are neither.
117 * If you know that the argument is always absolutely guaranteed to be valid
118 * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
120 * @param aString potentially-invalid UTF-8 string to scan
122 inline size_t Utf8Latin1UpTo(mozilla::Span
<const char> aString
) {
123 return encoding_mem_utf8_latin1_up_to(aString
.Elements(), aString
.Length());
127 * Returns the index of first byte that starts a non-Latin1 byte
128 * sequence in a known-valid UTF-8 string, or the length of the
129 * string if there are none. (If the string might not be valid
130 * UTF-8, use Utf8Latin1UpTo() instead.)
132 * @param aString known-valid UTF-8 string to scan
134 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span
<const char> aString
) {
135 return encoding_mem_str_latin1_up_to(aString
.Elements(), aString
.Length());
139 * If all the code points in the input are below U+0100, converts to Latin1,
140 * i.e. unsigned byte value is Unicode scalar value. If there are code points
141 * above U+00FF, produces unspecified garbage in a memory-safe way. The
142 * nature of the garbage must not be relied upon.
144 * The length of aDest must not be less than the length of aSource.
146 inline void LossyConvertUtf16toLatin1(mozilla::Span
<const char16_t
> aSource
,
147 mozilla::Span
<char> aDest
) {
148 const char16_t
* srcPtr
= aSource
.Elements();
149 size_t srcLen
= aSource
.Length();
150 char* dstPtr
= aDest
.Elements();
151 size_t dstLen
= aDest
.Length();
152 // Avoid function call overhead when SIMD isn't used anyway
153 // If you change the length limit here, be sure to change
154 // IsUtf16Latin1 and IsAscii to match so that optimizations don't
156 if (srcLen
< mozilla::detail::kShortStringLimitForInlinePaths
) {
157 MOZ_ASSERT(dstLen
>= srcLen
);
158 uint8_t* unsignedPtr
= reinterpret_cast<uint8_t*>(dstPtr
);
159 const char16_t
* end
= srcPtr
+ srcLen
;
160 while (srcPtr
< end
) {
161 *unsignedPtr
= static_cast<uint8_t>(*srcPtr
);
167 encoding_mem_convert_utf16_to_latin1_lossy(srcPtr
, srcLen
, dstPtr
, dstLen
);
171 * If all the code points in the input are below U+0100, converts to Latin1,
172 * i.e. unsigned byte value is Unicode scalar value. If there are code points
173 * above U+00FF, produces unspecified garbage in a memory-safe way. The
174 * nature of the garbage must not be relied upon.
176 * Returns the number of code units written.
178 * The length of aDest must not be less than the length of aSource.
180 inline size_t LossyConvertUtf8toLatin1(mozilla::Span
<const char> aSource
,
181 mozilla::Span
<char> aDest
) {
182 return encoding_mem_convert_utf8_to_latin1_lossy(
183 aSource
.Elements(), aSource
.Length(), aDest
.Elements(), aDest
.Length());
187 * Converts each byte of |aSource|, interpreted as a Unicode scalar value
188 * having that unsigned value, to its UTF-8 representation in |aDest|.
190 * Returns the number of code units written.
192 * The length of aDest must be at least twice the length of aSource.
194 inline size_t ConvertLatin1toUtf8(mozilla::Span
<const char> aSource
,
195 mozilla::Span
<char> aDest
) {
196 return encoding_mem_convert_latin1_to_utf8(
197 aSource
.Elements(), aSource
.Length(), aDest
.Elements(), aDest
.Length());
201 * Converts bytes whose unsigned value is interpreted as Unicode code point
202 * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
205 * Returns the number of bytes read and the number of bytes written.
207 * If the output isn't large enough, not all input is consumed.
209 * The conversion is guaranteed to be complete if the length of aDest is
210 * at least the length of aSource times two.
212 * The output is always valid UTF-8 ending on scalar value boundary
213 * even in the case of partial conversion.
215 * The semantics of this function match the semantics of
216 * TextEncoder.encodeInto.
217 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
219 inline mozilla::Tuple
<size_t, size_t> ConvertLatin1toUtf8Partial(
220 mozilla::Span
<const char> aSource
, mozilla::Span
<char> aDest
) {
221 size_t srcLen
= aSource
.Length();
222 size_t dstLen
= aDest
.Length();
223 encoding_mem_convert_latin1_to_utf8_partial(aSource
.Elements(), &srcLen
,
224 aDest
.Elements(), &dstLen
);
225 return mozilla::MakeTuple(srcLen
, dstLen
);
229 * Converts Latin-1 code points (i.e. each byte is the identical code
230 * point) from |aSource| to UTF-16 code points in |aDest|.
232 * The length of aDest must not be less than the length of aSource.
234 inline void ConvertLatin1toUtf16(mozilla::Span
<const char> aSource
,
235 mozilla::Span
<char16_t
> aDest
) {
236 const char* srcPtr
= aSource
.Elements();
237 size_t srcLen
= aSource
.Length();
238 char16_t
* dstPtr
= aDest
.Elements();
239 size_t dstLen
= aDest
.Length();
240 // Avoid function call overhead when SIMD isn't used anyway
241 if (srcLen
< mozilla::detail::kShortStringLimitForInlinePaths
) {
242 MOZ_ASSERT(dstLen
>= srcLen
);
243 const uint8_t* unsignedPtr
= reinterpret_cast<const uint8_t*>(srcPtr
);
244 const uint8_t* end
= unsignedPtr
+ srcLen
;
245 while (unsignedPtr
< end
) {
246 *dstPtr
= *unsignedPtr
;
252 encoding_mem_convert_latin1_to_utf16(srcPtr
, srcLen
, dstPtr
, dstLen
);
257 }; // namespace mozilla
259 #endif // mozilla_Latin1_h