mfbt/Latin1.h

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this
   3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 /* Latin-1 operations (i.e. a byte is the corresponding code point).
   6  * (Note: this is *not* the same as the encoding of windows-1252 or
   7  * latin1 content on the web. In Web terms, this encoding
 * corresponds to "isomorphic decode" / "isomorphic encode" from
   9  * the Infra Standard.)
  10  */
  11
  12 #ifndef mozilla_Latin1_h
  13 #define mozilla_Latin1_h
  14
  15 #include "mozilla/JsRust.h"
  16 #include "mozilla/Span.h"
  17 #include "mozilla/Tuple.h"
  18 #include "mozilla/TypeTraits.h"
  19
  20 #if MOZ_HAS_JSRUST()
  21 #  include "encoding_rs_mem.h"
  22 #endif
  23
  24 namespace mozilla {
  25
  26 namespace detail {
  27
  28 // It's important for optimizations that Latin1ness checks
  29 // and inflation/deflation function use the same short
  30 // string limit. The limit is 16, because that's the shortest
  31 // that inflates/deflates using SIMD.
  32 constexpr size_t kShortStringLimitForInlinePaths = 16;
  33
  34 template <typename Char>
  35 class MakeUnsignedChar : public MakeUnsigned<Char> {};
  36
  37 template <>
  38 class MakeUnsignedChar<char16_t> {
  39  public:
  40   using Type = char16_t;
  41 };
  42
  43 template <>
  44 class MakeUnsignedChar<char32_t> {
  45  public:
  46   using Type = char32_t;
  47 };
  48
  49 }  // namespace detail
  50
  51 /**
  52  * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
  53  * [0x80, 0xFF].
  54  */
  55 template <typename Char>
  56 constexpr bool IsNonAsciiLatin1(Char aChar) {
  57   using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
  58   auto uc = static_cast<UnsignedChar>(aChar);
  59   return uc >= 0x80 && uc <= 0xFF;
  60 }
  61
  62 #if MOZ_HAS_JSRUST()
  63
  64 /**
  65  * Returns |true| iff |aString| contains only Latin1 characters, that is,
  66  * characters in the range [U+0000, U+00FF].
  67  *
  68  * @param aString a potentially-invalid UTF-16 string to scan
  69  */
  70 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) {
  71   size_t length = aString.Length();
  72   const char16_t* ptr = aString.Elements();
  73   // For short strings, calling into Rust is a pessimization, and the SIMD
  74   // code won't have a chance to kick in anyway.
  75   // 16 is a bit larger than logically necessary for this function alone,
  76   // but it's important that the limit here matches the limit used in
  77   // LossyConvertUtf16toLatin1!
  78   if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
  79     char16_t accu = 0;
  80     for (size_t i = 0; i < length; i++) {
  81       accu |= ptr[i];
  82     }
  83     return accu < 0x100;
  84   }
  85   return encoding_mem_is_utf16_latin1(ptr, length);
  86 }
  87
  88 /**
  89  * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
  90  * characters.
  91  *
  92  * If you know that the argument is always absolutely guaranteed to be valid
  93  * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
  94  *
  95  * @param aString potentially-invalid UTF-8 string to scan
  96  */
  97 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) {
  98   return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
  99 }
 100
 101 /**
 102  * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
 103  * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
 104  * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
 105  *
 106  * @param aString known-valid UTF-8 string to scan
 107  */
 108 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) {
 109   return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
 110 }
 111
 112 /**
 113  * Returns the index of first byte that starts an invalid byte
 114  * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
 115  * string, or the length of the string if there are neither.
 116  *
 117  * If you know that the argument is always absolutely guaranteed to be valid
 118  * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
 119  *
 120  * @param aString potentially-invalid UTF-8 string to scan
 121  */
 122 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) {
 123   return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length());
 124 }
 125
 126 /**
 127  * Returns the index of first byte that starts a non-Latin1 byte
 128  * sequence in a known-valid UTF-8 string, or the length of the
 129  * string if there are none. (If the string might not be valid
 130  * UTF-8, use Utf8Latin1UpTo() instead.)
 131  *
 132  * @param aString known-valid UTF-8 string to scan
 133  */
 134 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) {
 135   return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length());
 136 }
 137
 138 /**
 139  * If all the code points in the input are below U+0100, converts to Latin1,
 140  * i.e. unsigned byte value is Unicode scalar value. If there are code points
 141  * above U+00FF, produces unspecified garbage in a memory-safe way. The
 142  * nature of the garbage must not be relied upon.
 143  *
 144  * The length of aDest must not be less than the length of aSource.
 145  */
 146 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,
 147                                       mozilla::Span<char> aDest) {
 148   const char16_t* srcPtr = aSource.Elements();
 149   size_t srcLen = aSource.Length();
 150   char* dstPtr = aDest.Elements();
 151   size_t dstLen = aDest.Length();
 152   // Avoid function call overhead when SIMD isn't used anyway
 153   // If you change the length limit here, be sure to change
 154   // IsUtf16Latin1 and IsAscii to match so that optimizations don't
 155   // fail!
 156   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
 157     MOZ_ASSERT(dstLen >= srcLen);
 158     uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr);
 159     const char16_t* end = srcPtr + srcLen;
 160     while (srcPtr < end) {
 161       *unsignedPtr = static_cast<uint8_t>(*srcPtr);
 162       ++srcPtr;
 163       ++unsignedPtr;
 164     }
 165     return;
 166   }
 167   encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen);
 168 }
 169
 170 /**
 171  * If all the code points in the input are below U+0100, converts to Latin1,
 172  * i.e. unsigned byte value is Unicode scalar value. If there are code points
 173  * above U+00FF, produces unspecified garbage in a memory-safe way. The
 174  * nature of the garbage must not be relied upon.
 175  *
 176  * Returns the number of code units written.
 177  *
 178  * The length of aDest must not be less than the length of aSource.
 179  */
 180 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,
 181                                        mozilla::Span<char> aDest) {
 182   return encoding_mem_convert_utf8_to_latin1_lossy(
 183       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 184 }
 185
 186 /**
 187  * Converts each byte of |aSource|, interpreted as a Unicode scalar value
 188  * having that unsigned value, to its UTF-8 representation in |aDest|.
 189  *
 190  * Returns the number of code units written.
 191  *
 192  * The length of aDest must be at least twice the length of aSource.
 193  */
 194 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource,
 195                                   mozilla::Span<char> aDest) {
 196   return encoding_mem_convert_latin1_to_utf8(
 197       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 198 }
 199
 200 /**
 201  * Converts bytes whose unsigned value is interpreted as Unicode code point
 202  * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
 203  * output space.
 204  *
 205  * Returns the number of bytes read and the number of bytes written.
 206  *
 207  * If the output isn't large enough, not all input is consumed.
 208  *
 209  * The conversion is guaranteed to be complete if the length of aDest is
 210  * at least the length of aSource times two.
 211  *
 212  * The output is always valid UTF-8 ending on scalar value boundary
 213  * even in the case of partial conversion.
 214  *
 215  * The semantics of this function match the semantics of
 216  * TextEncoder.encodeInto.
 217  * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
 218  */
 219 inline mozilla::Tuple<size_t, size_t> ConvertLatin1toUtf8Partial(
 220     mozilla::Span<const char> aSource, mozilla::Span<char> aDest) {
 221   size_t srcLen = aSource.Length();
 222   size_t dstLen = aDest.Length();
 223   encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen,
 224                                               aDest.Elements(), &dstLen);
 225   return mozilla::MakeTuple(srcLen, dstLen);
 226 }
 227
 228 /**
 229  * Converts Latin-1 code points (i.e. each byte is the identical code
 230  * point) from |aSource| to UTF-16 code points in |aDest|.
 231  *
 232  * The length of aDest must not be less than the length of aSource.
 233  */
 234 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource,
 235                                  mozilla::Span<char16_t> aDest) {
 236   const char* srcPtr = aSource.Elements();
 237   size_t srcLen = aSource.Length();
 238   char16_t* dstPtr = aDest.Elements();
 239   size_t dstLen = aDest.Length();
 240   // Avoid function call overhead when SIMD isn't used anyway
 241   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
 242     MOZ_ASSERT(dstLen >= srcLen);
 243     const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr);
 244     const uint8_t* end = unsignedPtr + srcLen;
 245     while (unsignedPtr < end) {
 246       *dstPtr = *unsignedPtr;
 247       ++unsignedPtr;
 248       ++dstPtr;
 249     }
 250     return;
 251   }
 252   encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen);
 253 }
 254
 255 #endif
 256
 257 };  // namespace mozilla
 258
 259 #endif  // mozilla_Latin1_h