netcore/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 using System.Diagnostics;
   6 using System.Runtime.Intrinsics;
   7 using System.Runtime.Intrinsics.X86;
   8 using System.Numerics;
   9 using Internal.Runtime.CompilerServices;
  10
  11 #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
  12 #if BIT64
  13 using nint = System.Int64;
  14 using nuint = System.UInt64;
  15 #else // BIT64
  16 using nint = System.Int32;
  17 using nuint = System.UInt32;
  18 #endif // BIT64
  19
  20 namespace System.Text.Unicode
  21 {
  22     internal static unsafe partial class Utf16Utility
  23     {
  24 #if DEBUG
  25         static Utf16Utility()
  26         {
  27             Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
  28             Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
  29         }
  30 #endif // DEBUG
  31
  32         // Returns &inputBuffer[inputLength] if the input buffer is valid.
  33         /// <summary>
  34         /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
  35         /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
  36         /// </summary>
  37         /// <remarks>
  38         /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
  39         /// </remarks>
  40         public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
  41         {
  42             Debug.Assert(inputLength >= 0, "Input length must not be negative.");
  43             Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
  44
  45             // First, we'll handle the common case of all-ASCII. If this is able to
  46             // consume the entire buffer, we'll skip the remainder of this method's logic.
  47
  48             int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
  49             Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
  50
  51             pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
  52             inputLength -= numAsciiCharsConsumedJustNow;
  53
  54             if (inputLength == 0)
  55             {
  56                 utf8CodeUnitCountAdjustment = 0;
  57                 scalarCountAdjustment = 0;
  58                 return pInputBuffer;
  59             }
  60
  61             // If we got here, it means we saw some non-ASCII data, so within our
  62             // vectorized code paths below we'll handle all non-surrogate UTF-16
  63             // code points branchlessly. We'll only branch if we see surrogates.
  64             //
  65             // We still optimistically assume the data is mostly ASCII. This means that the
  66             // number of UTF-8 code units and the number of scalars almost matches the number
  67             // of UTF-16 code units. As we go through the input and find non-ASCII
  68             // characters, we'll keep track of these "adjustment" fixups. To get the
  69             // total number of UTF-8 code units required to encode the input data, add
  70             // the UTF-8 code unit count adjustment to the number of UTF-16 code units
  71             // seen.  To get the total number of scalars present in the input data,
  72             // add the scalar count adjustment to the number of UTF-16 code units seen.
  73
  74             long tempUtf8CodeUnitCountAdjustment = 0;
  75             int tempScalarCountAdjustment = 0;
  76
  77             if (Sse2.IsSupported)
  78             {
  79                 if (inputLength >= Vector128<ushort>.Count)
  80                 {
  81                     Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
  82                     Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
  83                     Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
  84                     Vector128<ushort> vectorZero = Vector128<ushort>.Zero;
  85
  86                     do
  87                     {
  88                         Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
  89                         uint mask;
  90
  91                         // The 'charIsNonAscii' vector we're about to build will have the 0x8000 or the 0x0080
  92                         // bit set (but not both!) only if the corresponding input char is non-ASCII. Which of
  93                         // the two bits is set doesn't matter, as will be explained in the diagram a few lines
  94                         // below.
  95
  96                         Vector128<ushort> charIsNonAscii;
  97                         if (Sse41.IsSupported)
  98                         {
  99                             // sets 0x0080 bit if corresponding char element is >= 0x0080
 100                             charIsNonAscii = Sse41.Min(utf16Data, vector0080);
 101                         }
 102                         else
 103                         {
 104                             // sets 0x8000 bit if corresponding char element is >= 0x0080
 105                             charIsNonAscii = Sse2.AndNot(vector0080, Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 7)));
 106                         }
 107
 108 #if DEBUG
 109                         // Quick check to ensure we didn't accidentally set both 0x8080 bits in any element.
 110                         uint debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
 111                         Debug.Assert((debugMask & (debugMask << 1)) == 0, "Two set bits shouldn't occur adjacent to each other in this mask.");
 112 #endif // DEBUG
 113
 114                         // sets 0x8080 bits if corresponding char element is >= 0x0800
 115                         Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
 116
 117                         mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
 118
 119                         // Each odd bit of mask will be 1 only if the char was >= 0x0080,
 120                         // and each even bit of mask will be 1 only if the char was >= 0x0800.
 121                         //
 122                         // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
 123                         //
 124                         //            ,-- set if char[1] is non-ASCII
 125                         //            |   ,-- set if char[0] is non-ASCII
 126                         //            v   v
 127                         // mask = ... 1 1 1 0
 128                         //              ^   ^-- set if char[0] is >= 0x0800
 129                         //              `-- set if char[1] is >= 0x0800
 130                         //
 131                         // (If the SSE4.1 code path is taken above, the meaning of the odd and even
 132                         // bits are swapped, but the logic below otherwise holds.)
 133                         //
 134                         // This means we can popcnt the number of set bits, and the result is the
 135                         // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
 136                         // it expands. This results in the wrong count for UTF-16 surrogate code
 137                         // units (we just counted that each individual code unit expands to 3 bytes,
 138                         // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
 139                         // We'll handle this in just a moment.
 140                         //
 141                         // For now, compute the popcnt but squirrel it away. We'll fold it in to the
 142                         // cumulative UTF-8 adjustment factor once we determine that there are no
 143                         // unpaired surrogates in our data. (Unpaired surrogates would invalidate
 144                         // our computed result and we'd have to throw it away.)
 145
 146                         uint popcnt = (uint)BitOperations.PopCount(mask);
 147
 148                         // Surrogates need to be special-cased for two reasons: (a) we need
 149                         // to account for the fact that we over-counted in the addition above;
 150                         // and (b) they require separate validation.
 151
 152                         utf16Data = Sse2.Add(utf16Data, vectorA800);
 153                         mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
 154
 155                         if (mask != 0)
 156                         {
 157                             // There's at least one UTF-16 surrogate code unit present.
 158                             // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
 159                             // the resulting bits of 'mask' will occur in pairs:
 160                             // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
 161                             // - 11 if the corresponding UTF-16 char was a surrogate code unit.
 162                             //
 163                             // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
 164                             // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
 165                             // a low surrogate. Since we added 0xA800 in the vectorized operation above,
 166                             // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
 167                             // If we logical right-shift each word by 3, we'll end up with the bit pattern
 168                             // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
 169                             // determine whether a given char was a high or a low surrogate.
 170                             //
 171                             // Therefore the resulting bits of 'mask2' will occur in pairs:
 172                             // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
 173                             // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
 174                             // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
 175                             //   Since 'mask' already has 00 in these positions (since the corresponding char
 176                             //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.
 177
 178                             uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
 179
 180                             // 'lowSurrogatesMask' has its bits occur in pairs:
 181                             // - 01 if the corresponding char was a low surrogate char,
 182                             // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.
 183
 184                             uint lowSurrogatesMask = mask2 & mask;
 185
 186                             // 'highSurrogatesMask' has its bits occur in pairs:
 187                             // - 01 if the corresponding char was a high surrogate char,
 188                             // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.
 189
 190                             uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;
 191
 192                             Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
 193                                 "A char cannot simultaneously be both a high and a low surrogate char.");
 194
 195                             Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
 196                                 "Only even bits (no odd bits) of the masks should be set.");
 197
 198                             // Now check that each high surrogate is followed by a low surrogate and that each
 199                             // low surrogate follows a high surrogate. We make an exception for the case where
 200                             // the final char of the vector is a high surrogate, since we can't perform validation
 201                             // on it until the next iteration of the loop when we hope to consume the matching
 202                             // low surrogate.
 203
 204                             highSurrogatesMask <<= 2;
 205                             if ((ushort)highSurrogatesMask != lowSurrogatesMask)
 206                             {
 207                                 goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
 208                             }
 209
 210                             if (highSurrogatesMask > ushort.MaxValue)
 211                             {
 212                                 // There was a standalone high surrogate at the end of the vector.
 213                                 // We'll adjust our counters so that we don't consider this char consumed.
 214
 215                                 highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
 216                                 popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
 217                                 pInputBuffer--;
 218                                 inputLength++;
 219                             }
 220
 221                             // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
 222                             // free right now, saving the extension step a few lines below. If we're 32-bit, the
 223                             // convertion to nuint immediately below is a no-op, and we'll pay the cost of the real
 224                             // 64 -bit extension a few lines below.
 225                             nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);
 226
 227                             // 2 UTF-16 chars become 1 Unicode scalar
 228
 229                             tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;
 230
 231                             // Since each surrogate code unit was >= 0x0800, we eagerly assumed
 232                             // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
 233                             // assumes that the pair is encoded as 6 UTF-8 code units. Since each
 234                             // pair is in reality only encoded as 4 UTF-8 code units, we need to
 235                             // perform this adjustment now.
 236
 237                             if (IntPtr.Size == 8)
 238                             {
 239                                 // Since we've already zero-extended surrogatePairsCountNuint, we can directly
 240                                 // sub + sub. It's more efficient than shl + sub.
 241                                 tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
 242                                 tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
 243                             }
 244                             else
 245                             {
 246                                 // Take the hit of the 64-bit extension now.
 247                                 tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
 248                             }
 249                         }
 250
 251                         tempUtf8CodeUnitCountAdjustment += popcnt;
 252                         pInputBuffer += Vector128<ushort>.Count;
 253                         inputLength -= Vector128<ushort>.Count;
 254                     } while (inputLength >= Vector128<ushort>.Count);
 255                 }
 256             }
 257             else if (Vector.IsHardwareAccelerated)
 258             {
 259                 if (inputLength >= Vector<ushort>.Count)
 260                 {
 261                     Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
 262                     Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
 263                     Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
 264                     Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
 265
 266                     do
 267                     {
 268                         // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
 269                         // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
 270                         // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
 271                         // vectors, each element of the sum will contain one of three values:
 272                         //
 273                         // 0x0000 ( 0) = original char was 0000..007F
 274                         // 0xFFFF (-1) = original char was 0080..07FF
 275                         // 0xFFFE (-2) = original char was 0800..FFFF
 276                         //
 277                         // We'll negate them to produce a value 0..2 for each element, then sum all the
 278                         // elements together to produce the number of *additional* UTF-8 code units
 279                         // required to represent this UTF-16 data. This is similar to the popcnt step
 280                         // performed by the SSE2 code path. This will overcount surrogates, but we'll
 281                         // handle that shortly.
 282
 283                         Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
 284                         Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
 285                         Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
 286                         Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
 287
 288                         // We'll try summing by a natural word (rather than a 16-bit word) at a time,
 289                         // which should halve the number of operations we must perform.
 290
 291                         nuint popcnt = 0;
 292                         for (int i = 0; i < Vector<nuint>.Count; i++)
 293                         {
 294                             popcnt += sumVector[i];
 295                         }
 296
 297                         uint popcnt32 = (uint)popcnt;
 298                         if (IntPtr.Size == 8)
 299                         {
 300                             popcnt32 += (uint)(popcnt >> 32);
 301                         }
 302
 303                         // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
 304                         // know there aren't any unpaired surrogates in the input data.
 305
 306                         popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
 307
 308                         // Now check for surrogates.
 309
 310                         utf16Data -= vectorD800;
 311                         Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
 312                         if (surrogateChars != Vector<ushort>.Zero)
 313                         {
 314                             // There's at least one surrogate (high or low) UTF-16 code unit in
 315                             // the vector. We'll build up additional vectors: 'highSurrogateChars'
 316                             // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
 317                             // UTF-16 code unit was a high or low surrogate, respectively.
 318
 319                             Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
 320                             Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
 321
 322                             // We want to make sure that each high surrogate code unit is followed by
 323                             // a low surrogate code unit and each low surrogate code unit follows a
 324                             // high surrogate code unit. Since we don't have an equivalent of pmovmskb
 325                             // or palignr available to us, we'll do this as a loop. We won't look at
 326                             // the very last high surrogate char element since we don't yet know if
 327                             // the next vector read will have a low surrogate char element.
 328
 329                             if (lowSurrogateChars[0] != 0)
 330                             {
 331                                 goto Error; // error: start of buffer contains standalone low surrogate char
 332                             }
 333
 334                             ushort surrogatePairsCount = 0;
 335                             for (int i = 0; i < Vector<ushort>.Count - 1; i++)
 336                             {
 337                                 surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
 338                                 if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
 339                                 {
 340                                     goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
 341                                 }
 342                             }
 343
 344                             if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
 345                             {
 346                                 // There was a standalone high surrogate at the end of the vector.
 347                                 // We'll adjust our counters so that we don't consider this char consumed.
 348
 349                                 pInputBuffer--;
 350                                 inputLength++;
 351                                 popcnt32 -= 2;
 352                             }
 353
 354                             nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
 355
 356                             // 2 UTF-16 chars become 1 Unicode scalar
 357
 358                             tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
 359
 360                             // Since each surrogate code unit was >= 0x0800, we eagerly assumed
 361                             // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
 362                             // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
 363                             // so we'll adjust this now.
 364
 365                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 366                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 367                         }
 368
 369                         tempUtf8CodeUnitCountAdjustment += popcnt32;
 370                         pInputBuffer += Vector<ushort>.Count;
 371                         inputLength -= Vector<ushort>.Count;
 372                     } while (inputLength >= Vector<ushort>.Count);
 373                 }
 374             }
 375
 376         NonVectorizedLoop:
 377
 378             // Vectorization isn't supported on our current platform, or the input was too small to benefit
 379             // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
 380             // drain remaining valid chars before we report failure.
 381
 382             for (; inputLength > 0; pInputBuffer++, inputLength--)
 383             {
 384                 uint thisChar = pInputBuffer[0];
 385                 if (thisChar <= 0x7F)
 386                 {
 387                     continue;
 388                 }
 389
 390                 // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
 391                 // This optimistically assumes no surrogates, which we'll handle shortly.
 392
 393                 tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
 394
 395                 if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
 396                 {
 397                     continue;
 398                 }
 399
 400                 // Found a surrogate char. Back out the adjustment we made above, then
 401                 // try to consume the entire surrogate pair all at once. We won't bother
 402                 // trying to interpret the surrogate pair as a scalar value; we'll only
 403                 // validate that its bit pattern matches what's expected for a surrogate pair.
 404
 405                 tempUtf8CodeUnitCountAdjustment -= 2;
 406
 407                 if (inputLength == 1)
 408                 {
 409                     goto Error; // input buffer too small to read a surrogate pair
 410                 }
 411
 412                 thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
 413                 if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
 414                 {
 415                     goto Error; // not a well-formed surrogate pair
 416                 }
 417
 418                 tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
 419                 tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
 420
 421                 pInputBuffer++; // consumed one extra char
 422                 inputLength--;
 423             }
 424
 425         Error:
 426
 427             // Also used for normal return.
 428
 429             utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
 430             scalarCountAdjustment = tempScalarCountAdjustment;
 431             return pInputBuffer;
 432         }
 433     }
 434 }