Bug 1869043 assert that graph set access is main thread only r=padenot
[gecko.git] / mfbt / Latin1.h
bloba57d771b649b7572b98fd85a807550e38f54a87a
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Latin-1 operations (i.e. a byte is the corresponding code point).
 * (Note: this is *not* the same as the encoding of windows-1252 or
 * latin1 content on the web. In Web terms, this encoding
 * corresponds to "isomorphic decode" / "isomorphic encode" from
 * the Infra Standard.)
 */
12 #ifndef mozilla_Latin1_h
13 #define mozilla_Latin1_h
15 #include <type_traits>
17 #include "mozilla/JsRust.h"
18 #include "mozilla/Span.h"
20 #if MOZ_HAS_JSRUST()
21 # include "encoding_rs_mem.h"
22 #endif
24 namespace mozilla {
26 namespace detail {
// The Latin1ness checks and the inflation/deflation functions must all use
// the same short-string threshold; optimizations depend on them agreeing.
// The threshold is 16 because that is the shortest length that
// inflates/deflates using SIMD.
constexpr size_t kShortStringLimitForInlinePaths{16};
// Maps a character type to the type used when inspecting its numeric value.
// The generic case defers to std::make_unsigned_t.
template <typename Char>
struct MakeUnsignedChar {
  using Type = std::make_unsigned_t<Char>;
};

// char16_t maps to itself instead of going through std::make_unsigned_t.
template <>
struct MakeUnsignedChar<char16_t> {
  using Type = char16_t;
};

// char32_t maps to itself instead of going through std::make_unsigned_t.
template <>
struct MakeUnsignedChar<char32_t> {
  using Type = char32_t;
};
52 } // namespace detail
54 /**
55 * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
56 * [0x80, 0xFF].
58 template <typename Char>
59 constexpr bool IsNonAsciiLatin1(Char aChar) {
60 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
61 auto uc = static_cast<UnsignedChar>(aChar);
62 return uc >= 0x80 && uc <= 0xFF;
65 #if MOZ_HAS_JSRUST()
67 /**
68 * Returns |true| iff |aString| contains only Latin1 characters, that is,
69 * characters in the range [U+0000, U+00FF].
71 * @param aString a potentially-invalid UTF-16 string to scan
73 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) {
74 size_t length = aString.Length();
75 const char16_t* ptr = aString.Elements();
76 // For short strings, calling into Rust is a pessimization, and the SIMD
77 // code won't have a chance to kick in anyway.
78 // 16 is a bit larger than logically necessary for this function alone,
79 // but it's important that the limit here matches the limit used in
80 // LossyConvertUtf16toLatin1!
81 if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
82 char16_t accu = 0;
83 for (size_t i = 0; i < length; i++) {
84 accu |= ptr[i];
86 return accu < 0x100;
88 return encoding_mem_is_utf16_latin1(ptr, length);
91 /**
92 * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
93 * characters.
95 * If you know that the argument is always absolutely guaranteed to be valid
96 * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
98 * @param aString potentially-invalid UTF-8 string to scan
100 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) {
101 return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
105 * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
106 * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
107 * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
109 * @param aString known-valid UTF-8 string to scan
111 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) {
112 return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
116 * Returns the index of first byte that starts an invalid byte
117 * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
118 * string, or the length of the string if there are neither.
120 * If you know that the argument is always absolutely guaranteed to be valid
121 * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
123 * @param aString potentially-invalid UTF-8 string to scan
125 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) {
126 return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length());
130 * Returns the index of first byte that starts a non-Latin1 byte
131 * sequence in a known-valid UTF-8 string, or the length of the
132 * string if there are none. (If the string might not be valid
133 * UTF-8, use Utf8Latin1UpTo() instead.)
135 * @param aString known-valid UTF-8 string to scan
137 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) {
138 return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length());
142 * If all the code points in the input are below U+0100, converts to Latin1,
143 * i.e. unsigned byte value is Unicode scalar value. If there are code points
144 * above U+00FF, produces unspecified garbage in a memory-safe way. The
145 * nature of the garbage must not be relied upon.
147 * The length of aDest must not be less than the length of aSource.
149 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,
150 mozilla::Span<char> aDest) {
151 const char16_t* srcPtr = aSource.Elements();
152 size_t srcLen = aSource.Length();
153 char* dstPtr = aDest.Elements();
154 size_t dstLen = aDest.Length();
155 // Avoid function call overhead when SIMD isn't used anyway
156 // If you change the length limit here, be sure to change
157 // IsUtf16Latin1 and IsAscii to match so that optimizations don't
158 // fail!
159 if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
160 MOZ_ASSERT(dstLen >= srcLen);
161 uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr);
162 const char16_t* end = srcPtr + srcLen;
163 while (srcPtr < end) {
164 *unsignedPtr = static_cast<uint8_t>(*srcPtr);
165 ++srcPtr;
166 ++unsignedPtr;
168 return;
170 encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen);
174 * If all the code points in the input are below U+0100, converts to Latin1,
175 * i.e. unsigned byte value is Unicode scalar value. If there are code points
176 * above U+00FF, produces unspecified garbage in a memory-safe way. The
177 * nature of the garbage must not be relied upon.
179 * Returns the number of code units written.
181 * The length of aDest must not be less than the length of aSource.
183 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,
184 mozilla::Span<char> aDest) {
185 return encoding_mem_convert_utf8_to_latin1_lossy(
186 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
190 * Converts each byte of |aSource|, interpreted as a Unicode scalar value
191 * having that unsigned value, to its UTF-8 representation in |aDest|.
193 * Returns the number of code units written.
195 * The length of aDest must be at least twice the length of aSource.
197 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource,
198 mozilla::Span<char> aDest) {
199 return encoding_mem_convert_latin1_to_utf8(
200 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
204 * Converts bytes whose unsigned value is interpreted as Unicode code point
205 * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
206 * output space.
208 * Returns the number of bytes read and the number of bytes written.
210 * If the output isn't large enough, not all input is consumed.
212 * The conversion is guaranteed to be complete if the length of aDest is
213 * at least the length of aSource times two.
215 * The output is always valid UTF-8 ending on scalar value boundary
216 * even in the case of partial conversion.
218 * The semantics of this function match the semantics of
219 * TextEncoder.encodeInto.
220 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
222 inline std::tuple<size_t, size_t> ConvertLatin1toUtf8Partial(
223 mozilla::Span<const char> aSource, mozilla::Span<char> aDest) {
224 size_t srcLen = aSource.Length();
225 size_t dstLen = aDest.Length();
226 encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen,
227 aDest.Elements(), &dstLen);
228 return std::make_tuple(srcLen, dstLen);
232 * Converts Latin-1 code points (i.e. each byte is the identical code
233 * point) from |aSource| to UTF-16 code points in |aDest|.
235 * The length of aDest must not be less than the length of aSource.
237 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource,
238 mozilla::Span<char16_t> aDest) {
239 const char* srcPtr = aSource.Elements();
240 size_t srcLen = aSource.Length();
241 char16_t* dstPtr = aDest.Elements();
242 size_t dstLen = aDest.Length();
243 // Avoid function call overhead when SIMD isn't used anyway
244 if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
245 MOZ_ASSERT(dstLen >= srcLen);
246 const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr);
247 const uint8_t* end = unsignedPtr + srcLen;
248 while (unsignedPtr < end) {
249 *dstPtr = *unsignedPtr;
250 ++unsignedPtr;
251 ++dstPtr;
253 return;
255 encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen);
258 #endif
260 }; // namespace mozilla
262 #endif // mozilla_Latin1_h