1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /* Character/text operations. */
9 #ifndef mozilla_TextUtils_h
10 #define mozilla_TextUtils_h
12 #include "mozilla/Assertions.h"
13 #include "mozilla/Latin1.h"
16 // Can't include mozilla/Encoding.h here.
// The buffer is typed as uint8_t rather than char so that this declaration
// agrees with the declaration of the same function in another header.
size_t encoding_ascii_valid_up_to(const uint8_t* buffer, size_t buffer_len);
25 // See Utf8.h for IsUtf8() and conversions between UTF-8 and UTF-16.
26 // See Latin1.h for testing UTF-16 and UTF-8 for Latin1ness and
27 // for conversions to and from Latin1.
// The overloads below are not templated in order to make
// implicit conversions to span work as expected for the Span
// overloads.
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
inline constexpr bool IsAscii(unsigned char aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
/**
 * Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80).
 *
 * Negative values are never ASCII: the argument is reinterpreted as
 * |unsigned char| before the range check, so they land in [0x80, 0xFF].
 */
inline constexpr bool IsAscii(signed char aChar) {
  return static_cast<unsigned char>(aChar) < 0x80;
}
/**
 * Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80).
 *
 * The check is done on the |unsigned char| value of the argument, so the
 * result does not depend on whether plain |char| is signed on this platform.
 */
inline constexpr bool IsAscii(char aChar) {
  return static_cast<unsigned char>(aChar) < 0x80;
}
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
inline constexpr bool IsAscii(char16_t aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
inline constexpr bool IsAscii(char32_t aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
/**
 * Returns |true| iff |aString| contains only ASCII characters, that is,
 * characters in the range [0x00, 0x80).
 *
 * @param aString an 8-bit wide string to scan
 */
// Reports whether the 8-bit span |aString| consists solely of ASCII.
//
// NOTE(review): this copy of the definition is incomplete. The bodies of both
// loops, the value returned by the short-string path, whatever selects between
// the encoding_mem_is_ascii() call and the element-wise loop, and the closing
// braces are not visible here -- confirm against the complete header before
// changing any code in this function.
58 inline bool IsAscii(mozilla::Span
<const char> aString
) {
// Cache the span's length and data pointer; both paths below use them.
60 size_t length
= aString
.Length();
61 const char* ptr
= aString
.Elements();
62 // For short strings, avoid the function call, since the SIMD
63 // code won't have a chance to kick in anyway.
64 if (length
< mozilla::detail::kShortStringLimitForInlinePaths
) {
// View the data as unsigned bytes for the inline scan.
65 const uint8_t* uptr
= reinterpret_cast<const uint8_t*>(ptr
);
67 for (size_t i
= 0; i
< length
; i
++) {
// Presumably reached only when the short-string branch above is not taken:
// the whole buffer is handed to encoding_mem_is_ascii().
72 return encoding_mem_is_ascii(ptr
, length
);
// NOTE(review): element-wise scan; presumably the fallback used when the
// encoding_mem_* functions are unavailable -- its guard is not visible here.
74 for (char c
: aString
) {
/**
 * Returns |true| iff |aString| contains only ASCII characters, that is,
 * characters in the range [0x00, 0x80).
 *
 * @param aString a 16-bit wide string to scan
 */
// Reports whether the 16-bit span |aString| consists solely of ASCII.
//
// NOTE(review): this copy of the definition is incomplete. The bodies of both
// loops, the value returned by the short-string path, whatever selects between
// the encoding_mem_is_basic_latin() call and the element-wise loop, and the
// closing braces are not visible here -- confirm against the complete header
// before changing any code in this function.
89 inline bool IsAscii(mozilla::Span
<const char16_t
> aString
) {
// Cache the span's length and data pointer; both paths below use them.
91 size_t length
= aString
.Length();
92 const char16_t
* ptr
= aString
.Elements();
93 // For short strings, calling into Rust is a pessimization, and the SIMD
94 // code won't have a chance to kick in anyway.
95 // 16 is a bit larger than logically necessary for this function alone,
96 // but it's important that the limit here matches the limit used in
97 // LossyConvertUtf16toLatin1!
98 if (length
< mozilla::detail::kShortStringLimitForInlinePaths
) {
100 for (size_t i
= 0; i
< length
; i
++) {
// Presumably reached only when the short-string branch above is not taken:
// the whole buffer is handed to encoding_mem_is_basic_latin().
105 return encoding_mem_is_basic_latin(ptr
, length
);
// NOTE(review): element-wise scan; presumably the fallback used when the
// encoding_mem_* functions are unavailable -- its guard is not visible here.
107 for (char16_t c
: aString
) {
/**
 * Returns true iff every character in the null-terminated string pointed to by
 * |aChar| is ASCII, i.e. in the range [0, 0x80).
 *
 * The terminator itself is not inspected; an empty string yields true.
 * |aChar| must point to a null-terminated string.
 */
template <typename Char>
constexpr bool IsAsciiNullTerminated(const Char* aChar) {
  // The loop condition both reads the next character and stops on the
  // terminating zero.
  while (Char c = *aChar++) {
    if (!IsAscii(c)) {
      return false;
    }
  }
  return true;
}
132 * Returns the index of the first non-ASCII byte or
133 * the length of the string if there are none.
135 inline size_t AsciiValidUpTo(mozilla::Span
<const char> aString
) {
136 return encoding_ascii_valid_up_to(
137 reinterpret_cast<const uint8_t*>(aString
.Elements()), aString
.Length());
141 * Returns the index of the first unpaired surrogate or
142 * the length of the string if there are none.
144 inline size_t Utf16ValidUpTo(mozilla::Span
<const char16_t
> aString
) {
145 return encoding_mem_utf16_valid_up_to(aString
.Elements(), aString
.Length());
149 * Replaces unpaired surrogates with U+FFFD in the argument.
151 * Note: If you have an nsAString, use EnsureUTF16Validity() from
152 * nsReadableUtils.h instead to avoid unsharing a valid shared
155 inline void EnsureUtf16ValiditySpan(mozilla::Span
<char16_t
> aString
) {
156 encoding_mem_ensure_utf16_validity(aString
.Elements(), aString
.Length());
160 * Convert ASCII to UTF-16. In debug builds, assert that the input is
163 * The length of aDest must not be less than the length of aSource.
165 inline void ConvertAsciitoUtf16(mozilla::Span
<const char> aSource
,
166 mozilla::Span
<char16_t
> aDest
) {
167 MOZ_ASSERT(IsAscii(aSource
));
168 ConvertLatin1toUtf16(aSource
, aDest
);
171 #endif // MOZ_HAS_JSRUST
174 * Returns true iff |aChar| matches Ascii Whitespace.
176 * This function is intended to match the Infra standard
177 * (https://infra.spec.whatwg.org/#ascii-whitespace)
179 template <typename Char
>
180 constexpr bool IsAsciiWhitespace(Char aChar
) {
181 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
182 auto uc
= static_cast<UnsignedChar
>(aChar
);
183 return uc
== 0x9 || uc
== 0xA || uc
== 0xC || uc
== 0xD || uc
== 0x20;
187 * Returns true iff |aChar| matches [a-z].
189 * This function is basically what you thought islower was, except its behavior
190 * doesn't depend on the user's current locale.
192 template <typename Char
>
193 constexpr bool IsAsciiLowercaseAlpha(Char aChar
) {
194 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
195 auto uc
= static_cast<UnsignedChar
>(aChar
);
196 return 'a' <= uc
&& uc
<= 'z';
200 * Returns true iff |aChar| matches [A-Z].
202 * This function is basically what you thought isupper was, except its behavior
203 * doesn't depend on the user's current locale.
205 template <typename Char
>
206 constexpr bool IsAsciiUppercaseAlpha(Char aChar
) {
207 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
208 auto uc
= static_cast<UnsignedChar
>(aChar
);
209 return 'A' <= uc
&& uc
<= 'Z';
/**
 * Returns true iff |aChar| matches [a-zA-Z].
 *
 * This function is basically what you thought isalpha was, except its behavior
 * doesn't depend on the user's current locale.
 */
template <typename Char>
constexpr bool IsAsciiAlpha(Char aChar) {
  // A letter is either a lowercase or an uppercase ASCII letter.
  return IsAsciiLowercaseAlpha(aChar) || IsAsciiUppercaseAlpha(aChar);
}
224 * Returns true iff |aChar| matches [0-9].
226 * This function is basically what you thought isdigit was, except its behavior
227 * doesn't depend on the user's current locale.
229 template <typename Char
>
230 constexpr bool IsAsciiDigit(Char aChar
) {
231 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
232 auto uc
= static_cast<UnsignedChar
>(aChar
);
233 return '0' <= uc
&& uc
<= '9';
237 * Returns true iff |aChar| matches [0-9a-fA-F].
239 * This function is basically isxdigit, but guaranteed to be only for ASCII.
241 template <typename Char
>
242 constexpr bool IsAsciiHexDigit(Char aChar
) {
243 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
244 auto uc
= static_cast<UnsignedChar
>(aChar
);
245 return ('0' <= uc
&& uc
<= '9') || ('a' <= uc
&& uc
<= 'f') ||
246 ('A' <= uc
&& uc
<= 'F');
/**
 * Returns true iff |aChar| matches [a-zA-Z0-9].
 *
 * This function is basically what you thought isalnum was, except its behavior
 * doesn't depend on the user's current locale.
 */
template <typename Char>
constexpr bool IsAsciiAlphanumeric(Char aChar) {
  // Alphanumeric means an ASCII digit or an ASCII letter.
  return IsAsciiDigit(aChar) || IsAsciiAlpha(aChar);
}
261 * Converts an ASCII alphanumeric digit [0-9a-zA-Z] to number as if in base-36.
262 * (This function therefore works for decimal, hexadecimal, etc.).
264 template <typename Char
>
265 uint8_t AsciiAlphanumericToNumber(Char aChar
) {
266 using UnsignedChar
= typename
detail::MakeUnsignedChar
<Char
>::Type
;
267 auto uc
= static_cast<UnsignedChar
>(aChar
);
269 if ('0' <= uc
&& uc
<= '9') {
273 if ('A' <= uc
&& uc
<= 'Z') {
274 return uc
- 'A' + 10;
277 // Ideally this function would be constexpr, but unfortunately gcc at least as
278 // of 6.4 forbids non-constexpr function calls in unevaluated constexpr
279 // function calls. See bug 1453456. So for now, just assert and leave the
280 // entire function non-constexpr.
281 MOZ_ASSERT('a' <= uc
&& uc
<= 'z',
282 "non-ASCII alphanumeric character can't be converted to number");
283 return uc
- 'a' + 10;
286 } // namespace mozilla
288 #endif /* mozilla_TextUtils_h */