mfbt/Latin1.h

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this
   3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 /* Latin-1 operations (i.e. a byte is the corresponding code point).
   6  * (Note: this is *not* the same as the encoding of windows-1252 or
   7  * latin1 content on the web. In Web terms, this encoding
   8  * corresponds to "isomorphic decode" / "isomorphic encoding" from
   9  * the Infra Standard.)
  10  */
  11
  12 #ifndef mozilla_Latin1_h
  13 #define mozilla_Latin1_h
  14
  15 #include <type_traits>
  16
  17 #include "mozilla/JsRust.h"
  18 #include "mozilla/Span.h"
  19 #include "mozilla/Tuple.h"
  20
  21 #if MOZ_HAS_JSRUST()
  22 #  include "encoding_rs_mem.h"
  23 #endif
  24
  25 namespace mozilla {
  26
  27 namespace detail {
  28
  29 // It's important for optimizations that Latin1ness checks
  30 // and inflation/deflation function use the same short
  31 // string limit. The limit is 16, because that's the shortest
  32 // that inflates/deflates using SIMD.
  33 constexpr size_t kShortStringLimitForInlinePaths = 16;
  34
  35 template <typename Char>
  36 class MakeUnsignedChar {
  37  public:
  38   using Type = std::make_unsigned_t<Char>;
  39 };
  40
  41 template <>
  42 class MakeUnsignedChar<char16_t> {
  43  public:
  44   using Type = char16_t;
  45 };
  46
  47 template <>
  48 class MakeUnsignedChar<char32_t> {
  49  public:
  50   using Type = char32_t;
  51 };
  52
  53 }  // namespace detail
  54
  55 /**
  56  * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
  57  * [0x80, 0xFF].
  58  */
  59 template <typename Char>
  60 constexpr bool IsNonAsciiLatin1(Char aChar) {
  61   using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
  62   auto uc = static_cast<UnsignedChar>(aChar);
  63   return uc >= 0x80 && uc <= 0xFF;
  64 }
  65
  66 #if MOZ_HAS_JSRUST()
  67
  68 /**
  69  * Returns |true| iff |aString| contains only Latin1 characters, that is,
  70  * characters in the range [U+0000, U+00FF].
  71  *
  72  * @param aString a potentially-invalid UTF-16 string to scan
  73  */
  74 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) {
  75   size_t length = aString.Length();
  76   const char16_t* ptr = aString.Elements();
  77   // For short strings, calling into Rust is a pessimization, and the SIMD
  78   // code won't have a chance to kick in anyway.
  79   // 16 is a bit larger than logically necessary for this function alone,
  80   // but it's important that the limit here matches the limit used in
  81   // LossyConvertUtf16toLatin1!
  82   if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
  83     char16_t accu = 0;
  84     for (size_t i = 0; i < length; i++) {
  85       accu |= ptr[i];
  86     }
  87     return accu < 0x100;
  88   }
  89   return encoding_mem_is_utf16_latin1(ptr, length);
  90 }
  91
  92 /**
  93  * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
  94  * characters.
  95  *
  96  * If you know that the argument is always absolutely guaranteed to be valid
  97  * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
  98  *
  99  * @param aString potentially-invalid UTF-8 string to scan
 100  */
 101 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) {
 102   return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
 103 }
 104
 105 /**
 106  * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
 107  * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
 108  * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
 109  *
 110  * @param aString known-valid UTF-8 string to scan
 111  */
 112 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) {
 113   return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
 114 }
 115
 116 /**
 117  * Returns the index of first byte that starts an invalid byte
 118  * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
 119  * string, or the length of the string if there are neither.
 120  *
 121  * If you know that the argument is always absolutely guaranteed to be valid
 122  * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
 123  *
 124  * @param aString potentially-invalid UTF-8 string to scan
 125  */
 126 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) {
 127   return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length());
 128 }
 129
 130 /**
 131  * Returns the index of first byte that starts a non-Latin1 byte
 132  * sequence in a known-valid UTF-8 string, or the length of the
 133  * string if there are none. (If the string might not be valid
 134  * UTF-8, use Utf8Latin1UpTo() instead.)
 135  *
 136  * @param aString known-valid UTF-8 string to scan
 137  */
 138 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) {
 139   return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length());
 140 }
 141
 142 /**
 143  * If all the code points in the input are below U+0100, converts to Latin1,
 144  * i.e. unsigned byte value is Unicode scalar value. If there are code points
 145  * above U+00FF, produces unspecified garbage in a memory-safe way. The
 146  * nature of the garbage must not be relied upon.
 147  *
 148  * The length of aDest must not be less than the length of aSource.
 149  */
 150 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,
 151                                       mozilla::Span<char> aDest) {
 152   const char16_t* srcPtr = aSource.Elements();
 153   size_t srcLen = aSource.Length();
 154   char* dstPtr = aDest.Elements();
 155   size_t dstLen = aDest.Length();
 156   // Avoid function call overhead when SIMD isn't used anyway
 157   // If you change the length limit here, be sure to change
 158   // IsUtf16Latin1 and IsAscii to match so that optimizations don't
 159   // fail!
 160   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
 161     MOZ_ASSERT(dstLen >= srcLen);
 162     uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr);
 163     const char16_t* end = srcPtr + srcLen;
 164     while (srcPtr < end) {
 165       *unsignedPtr = static_cast<uint8_t>(*srcPtr);
 166       ++srcPtr;
 167       ++unsignedPtr;
 168     }
 169     return;
 170   }
 171   encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen);
 172 }
 173
 174 /**
 175  * If all the code points in the input are below U+0100, converts to Latin1,
 176  * i.e. unsigned byte value is Unicode scalar value. If there are code points
 177  * above U+00FF, produces unspecified garbage in a memory-safe way. The
 178  * nature of the garbage must not be relied upon.
 179  *
 180  * Returns the number of code units written.
 181  *
 182  * The length of aDest must not be less than the length of aSource.
 183  */
 184 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,
 185                                        mozilla::Span<char> aDest) {
 186   return encoding_mem_convert_utf8_to_latin1_lossy(
 187       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 188 }
 189
 190 /**
 191  * Converts each byte of |aSource|, interpreted as a Unicode scalar value
 192  * having that unsigned value, to its UTF-8 representation in |aDest|.
 193  *
 194  * Returns the number of code units written.
 195  *
 196  * The length of aDest must be at least twice the length of aSource.
 197  */
 198 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource,
 199                                   mozilla::Span<char> aDest) {
 200   return encoding_mem_convert_latin1_to_utf8(
 201       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 202 }
 203
 204 /**
 205  * Converts bytes whose unsigned value is interpreted as Unicode code point
 206  * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
 207  * output space.
 208  *
 209  * Returns the number of bytes read and the number of bytes written.
 210  *
 211  * If the output isn't large enough, not all input is consumed.
 212  *
 213  * The conversion is guaranteed to be complete if the length of aDest is
 214  * at least the length of aSource times two.
 215  *
 216  * The output is always valid UTF-8 ending on scalar value boundary
 217  * even in the case of partial conversion.
 218  *
 219  * The semantics of this function match the semantics of
 220  * TextEncoder.encodeInto.
 221  * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
 222  */
 223 inline mozilla::Tuple<size_t, size_t> ConvertLatin1toUtf8Partial(
 224     mozilla::Span<const char> aSource, mozilla::Span<char> aDest) {
 225   size_t srcLen = aSource.Length();
 226   size_t dstLen = aDest.Length();
 227   encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen,
 228                                               aDest.Elements(), &dstLen);
 229   return mozilla::MakeTuple(srcLen, dstLen);
 230 }
 231
 232 /**
 233  * Converts Latin-1 code points (i.e. each byte is the identical code
 234  * point) from |aSource| to UTF-16 code points in |aDest|.
 235  *
 236  * The length of aDest must not be less than the length of aSource.
 237  */
 238 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource,
 239                                  mozilla::Span<char16_t> aDest) {
 240   const char* srcPtr = aSource.Elements();
 241   size_t srcLen = aSource.Length();
 242   char16_t* dstPtr = aDest.Elements();
 243   size_t dstLen = aDest.Length();
 244   // Avoid function call overhead when SIMD isn't used anyway
 245   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
 246     MOZ_ASSERT(dstLen >= srcLen);
 247     const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr);
 248     const uint8_t* end = unsignedPtr + srcLen;
 249     while (unsignedPtr < end) {
 250       *dstPtr = *unsignedPtr;
 251       ++unsignedPtr;
 252       ++dstPtr;
 253     }
 254     return;
 255   }
 256   encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen);
 257 }
 258
 259 #endif
 260
 261 };  // namespace mozilla
 262
 263 #endif  // mozilla_Latin1_h