netcore/System.Private.CoreLib/shared/System/Text/Unicode/Utf16Utility.Validation.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 using System.Diagnostics;
   6 using System.Runtime.Intrinsics;
   7 using System.Runtime.Intrinsics.X86;
   8 using System.Numerics;
   9 using Internal.Runtime.CompilerServices;
  10
  11 #if BIT64
  12 using nint = System.Int64;
  13 using nuint = System.UInt64;
  14 #else // BIT64
  15 using nint = System.Int32;
  16 using nuint = System.UInt32;
  17 #endif // BIT64
  18
  19 namespace System.Text.Unicode
  20 {
  21     internal static unsafe partial class Utf16Utility
  22     {
  23         // Returns &inputBuffer[inputLength] if the input buffer is valid.
  24         /// <summary>
  25         /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
  26         /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
  27         /// </summary>
  28         /// <remarks>
  29         /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
  30         /// </remarks>
  31         public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
  32         {
  33             // First, we'll handle the common case of all-ASCII. If this is able to
  34             // consume the entire buffer, we'll skip the remainder of this method's logic.
  35
  36             int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
  37             Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
  38
  39             pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
  40             inputLength -= numAsciiCharsConsumedJustNow;
  41
  42             if (inputLength == 0)
  43             {
  44                 utf8CodeUnitCountAdjustment = 0;
  45                 scalarCountAdjustment = 0;
  46                 return pInputBuffer;
  47             }
  48
  49             // If we got here, it means we saw some non-ASCII data, so within our
  50             // vectorized code paths below we'll handle all non-surrogate UTF-16
  51             // code points branchlessly. We'll only branch if we see surrogates.
  52             //
  53             // We still optimistically assume the data is mostly ASCII. This means that the
  54             // number of UTF-8 code units and the number of scalars almost matches the number
  55             // of UTF-16 code units. As we go through the input and find non-ASCII
  56             // characters, we'll keep track of these "adjustment" fixups. To get the
  57             // total number of UTF-8 code units required to encode the input data, add
  58             // the UTF-8 code unit count adjustment to the number of UTF-16 code units
  59             // seen.  To get the total number of scalars present in the input data,
  60             // add the scalar count adjustment to the number of UTF-16 code units seen.
  61
  62             long tempUtf8CodeUnitCountAdjustment = 0;
  63             int tempScalarCountAdjustment = 0;
  64
  65             if (Sse41.IsSupported)
  66             {
  67                 if (inputLength >= Vector128<ushort>.Count)
  68                 {
  69                     Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
  70                     Vector128<ushort> vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800
  71                     Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
  72                     Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
  73
  74                     do
  75                     {
  76                         Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
  77
  78                         uint mask = (uint)Sse2.MoveMask(
  79                             Sse2.Or(
  80                                 Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8),
  81                                 Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte());
  82
  83                         // Each odd bit of mask will be 1 only if the char was >= 0x0080,
  84                         // and each even bit of mask will be 1 only if the char was >= 0x0800.
  85                         //
  86                         // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
  87                         //
  88                         //            ,-- set if char[1] is non-ASCII
  89                         //            |   ,-- set if char[0] is non-ASCII
  90                         //            v   v
  91                         // mask = ... 1 1 1 0
  92                         //              ^   ^-- set if char[0] is >= 0x800
  93                         //              `-- set if char[1] is >= 0x800
  94                         //
  95                         // This means we can popcnt the number of set bits, and the result is the
  96                         // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
  97                         // it expands. This results in the wrong count for UTF-16 surrogate code
  98                         // units (we just counted that each individual code unit expands to 3 bytes,
  99                         // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
 100                         // We'll handle this in just a moment.
 101                         //
 102                         // For now, compute the popcnt but squirrel it away. We'll fold it in to the
 103                         // cumulative UTF-8 adjustment factor once we determine that there are no
 104                         // unpaired surrogates in our data. (Unpaired surrogates would invalidate
 105                         // our computed result and we'd have to throw it away.)
 106
 107                         uint popcnt = (uint)BitOperations.PopCount(mask);
 108
 109                         // Surrogates need to be special-cased for two reasons: (a) we need
 110                         // to account for the fact that we over-counted in the addition above;
 111                         // and (b) they require separate validation.
 112
 113                         utf16Data = Sse2.Add(utf16Data, vectorA800);
 114                         mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
 115
 116                         if (mask != 0)
 117                         {
 118                             // There's at least one UTF-16 surrogate code unit present.
 119                             // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
 120                             // the resulting bits of 'mask' will occur in pairs:
 121                             // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
 122                             // - 11 if the corresponding UTF-16 char was a surrogate code unit.
 123                             //
 124                             // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
 125                             // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
 126                             // a low surrogate. Since we added 0xA800 in the vectorized operation above,
 127                             // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
 128                             // If we logical right-shift each word by 3, we'll end up with the bit pattern
 129                             // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
 130                             // determine whether a given char was a high or a low surrogate.
 131                             //
 132                             // Therefore the resulting bits of 'mask2' will occur in pairs:
 133                             // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
 134                             // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
 135                             // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
 136
 137                             uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
 138
 139                             uint lowSurrogatesMask = mask2 & mask; // 01 only if was a low surrogate char, else 00
 140                             uint highSurrogatesMask = (mask2 ^ mask) & 0x5555u; // 01 only if was a high surrogate char, else 00
 141
 142                             // Now check that each high surrogate is followed by a low surrogate and that each
 143                             // low surrogate follows a high surrogate. We make an exception for the case where
 144                             // the final char of the vector is a high surrogate, since we can't perform validation
 145                             // on it until the next iteration of the loop when we hope to consume the matching
 146                             // low surrogate.
 147
 148                             highSurrogatesMask <<= 2;
 149                             if ((ushort)highSurrogatesMask != lowSurrogatesMask)
 150                             {
 151                                 goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
 152                             }
 153
 154                             if (highSurrogatesMask > ushort.MaxValue)
 155                             {
 156                                 // There was a standalone high surrogate at the end of the vector.
 157                                 // We'll adjust our counters so that we don't consider this char consumed.
 158
 159                                 highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
 160                                 popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
 161                                 pInputBuffer--;
 162                                 inputLength++;
 163                             }
 164
 165                             int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);
 166
 167                             // 2 UTF-16 chars become 1 Unicode scalar
 168
 169                             tempScalarCountAdjustment -= surrogatePairsCount;
 170
 171                             // Since each surrogate code unit was >= 0x0800, we eagerly assumed
 172                             // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
 173                             // assumes that the pair is encoded as 6 UTF-8 code units. Since each
 174                             // pair is in reality only encoded as 4 UTF-8 code units, we need to
 175                             // perform this adjustment now.
 176
 177                             nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
 178                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 179                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 180                         }
 181
 182                         tempUtf8CodeUnitCountAdjustment += popcnt;
 183                         pInputBuffer += Vector128<ushort>.Count;
 184                         inputLength -= Vector128<ushort>.Count;
 185                     } while (inputLength >= Vector128<ushort>.Count);
 186                 }
 187             }
 188             else if (Vector.IsHardwareAccelerated)
 189             {
 190                 if (inputLength >= Vector<ushort>.Count)
 191                 {
 192                     Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
 193                     Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
 194                     Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
 195                     Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
 196
 197                     do
 198                     {
 199                         // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
 200                         // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
 201                         // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
 202                         // vectors, each element of the sum will contain one of three values:
 203                         //
 204                         // 0x0000 ( 0) = original char was 0000..007F
 205                         // 0xFFFF (-1) = original char was 0080..07FF
 206                         // 0xFFFE (-2) = original char was 0800..FFFF
 207                         //
 208                         // We'll negate them to produce a value 0..2 for each element, then sum all the
 209                         // elements together to produce the number of *additional* UTF-8 code units
 210                         // required to represent this UTF-16 data. This is similar to the popcnt step
 211                         // performed by the SSE41 code path. This will overcount surrogates, but we'll
 212                         // handle that shortly.
 213
 214                         Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
 215                         Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
 216                         Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
 217                         Vector<nuint> sumVector = (Vector<nuint>)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes));
 218
 219                         // We'll try summing by a natural word (rather than a 16-bit word) at a time,
 220                         // which should halve the number of operations we must perform.
 221
 222                         nuint popcnt = 0;
 223                         for (int i = 0; i < Vector<nuint>.Count; i++)
 224                         {
 225                             popcnt += sumVector[i];
 226                         }
 227
 228                         uint popcnt32 = (uint)popcnt;
 229                         if (IntPtr.Size == 8)
 230                         {
 231                             popcnt32 += (uint)(popcnt >> 32);
 232                         }
 233
 234                         // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
 235                         // know there aren't any unpaired surrogates in the input data.
 236
 237                         popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
 238
 239                         // Now check for surrogates.
 240
 241                         utf16Data -= vectorD800;
 242                         Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
 243                         if (surrogateChars != Vector<ushort>.Zero)
 244                         {
 245                             // There's at least one surrogate (high or low) UTF-16 code unit in
 246                             // the vector. We'll build up additional vectors: 'highSurrogateChars'
 247                             // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
 248                             // UTF-16 code unit was a high or low surrogate, respectively.
 249
 250                             Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
 251                             Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
 252
 253                             // We want to make sure that each high surrogate code unit is followed by
 254                             // a low surrogate code unit and each low surrogate code unit follows a
 255                             // high surrogate code unit. Since we don't have an equivalent of pmovmskb
 256                             // or palignr available to us, we'll do this as a loop. We won't look at
 257                             // the very last high surrogate char element since we don't yet know if
 258                             // the next vector read will have a low surrogate char element.
 259
 260                             ushort surrogatePairsCount = 0;
 261                             for (int i = 0; i < Vector<ushort>.Count - 1; i++)
 262                             {
 263                                 surrogatePairsCount -= highSurrogateChars[i];
 264                                 if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
 265                                 {
 266                                     goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
 267                                 }
 268                             }
 269
 270                             if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
 271                             {
 272                                 // There was a standalone high surrogate at the end of the vector.
 273                                 // We'll adjust our counters so that we don't consider this char consumed.
 274
 275                                 pInputBuffer--;
 276                                 inputLength++;
 277                                 popcnt32 -= 2;
 278                                 tempScalarCountAdjustment--;
 279                             }
 280
 281                             nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
 282
 283                             // 2 UTF-16 chars become 1 Unicode scalar
 284
 285                             tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
 286
 287                             // Since each surrogate code unit was >= 0x0800, we eagerly assumed
 288                             // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
 289                             // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
 290                             // so we'll adjust this now.
 291
 292                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 293                             tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
 294                         }
 295
 296                         tempUtf8CodeUnitCountAdjustment += popcnt32;
 297                         pInputBuffer += Vector<ushort>.Count;
 298                         inputLength -= Vector<ushort>.Count;
 299                     } while (inputLength >= Vector<ushort>.Count);
 300                 }
 301             }
 302
 303         NonVectorizedLoop:
 304
 305             // Vectorization isn't supported on our current platform, or the input was too small to benefit
 306             // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
 307             // drain remaining valid chars before we report failure.
 308
 309             for (; inputLength > 0; pInputBuffer++, inputLength--)
 310             {
 311                 uint thisChar = pInputBuffer[0];
 312                 if (thisChar <= 0x7F)
 313                 {
 314                     continue;
 315                 }
 316
 317                 // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
 318                 // This optimistically assumes no surrogates, which we'll handle shortly.
 319
 320                 tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
 321
 322                 if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
 323                 {
 324                     continue;
 325                 }
 326
 327                 // Found a surrogate char. Back out the adjustment we made above, then
 328                 // try to consume the entire surrogate pair all at once. We won't bother
 329                 // trying to interpret the surrogate pair as a scalar value; we'll only
 330                 // validate that its bit pattern matches what's expected for a surrogate pair.
 331
 332                 tempUtf8CodeUnitCountAdjustment -= 2;
 333
 334                 if (inputLength == 1)
 335                 {
 336                     goto Error; // input buffer too small to read a surrogate pair
 337                 }
 338
 339                 thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
 340                 if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
 341                 {
 342                     goto Error; // not a well-formed surrogate pair
 343                 }
 344
 345                 tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
 346                 tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
 347
 348                 pInputBuffer++; // consumed one extra char
 349                 inputLength--;
 350             }
 351
 352         Error:
 353
 354             // Also used for normal return.
 355
 356             utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
 357             scalarCountAdjustment = tempScalarCountAdjustment;
 358             return pInputBuffer;
 359         }
 360     }
 361 }