Clarify that vector read is unaligned
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / Unicode / Utf16Utility.Validation.cs
blob7f8e8c157db065207f6aa98c101a70eb952d375f
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System.Diagnostics;
6 using System.Runtime.Intrinsics;
7 using System.Runtime.Intrinsics.X86;
8 using System.Numerics;
9 using Internal.Runtime.CompilerServices;
11 #if BIT64
12 using nint = System.Int64;
13 using nuint = System.UInt64;
14 #else // BIT64
15 using nint = System.Int32;
16 using nuint = System.UInt32;
17 #endif // BIT64
namespace System.Text.Unicode
{
    internal static unsafe partial class Utf16Utility
    {
        // Returns &inputBuffer[inputLength] if the input buffer is valid.
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <param name="pInputBuffer">Pointer to the first UTF-16 code unit to validate.</param>
        /// <param name="inputLength">Number of UTF-16 code units available at <paramref name="pInputBuffer"/>.</param>
        /// <param name="utf8CodeUnitCountAdjustment">
        /// On return, the value to add to the number of UTF-16 code units consumed in order to obtain
        /// the number of UTF-8 code units needed to encode the consumed data.
        /// </param>
        /// <param name="scalarCountAdjustment">
        /// On return, the value to add to the number of UTF-16 code units consumed in order to obtain
        /// the number of Unicode scalar values present in the consumed data.
        /// </param>
        /// <returns>
        /// A pointer to the first code unit that is not part of well-formed UTF-16 data, or a pointer
        /// to the end of the buffer if no such code unit exists.
        /// </returns>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment = 0;
                return pInputBuffer;
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen. To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int tempScalarCountAdjustment = 0;

            if (Sse41.IsSupported)
            {
                if (inputLength >= Vector128<ushort>.Count)
                {
                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128<ushort> vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800
                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));

                    do
                    {
                        Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned

                        uint mask = (uint)Sse2.MoveMask(
                            Sse2.Or(
                                Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8),
                                Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte());

                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is non-ASCII
                        //            |   ,-- set if char[0] is non-ASCII
                        //            v   v
                        // mask = ... 1 1 1 0
                        //              ^   ^-- set if char[0] is >= 0x800
                        //              `-- set if char[1] is >= 0x800
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.

                        utf16Data = Sse2.Add(utf16Data, vectorA800);
                        mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.

                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());

                            uint lowSurrogatesMask = mask2 & mask; // 01 only if was a low surrogate char, else 00
                            uint highSurrogatesMask = (mask2 ^ mask) & 0x5555u; // 01 only if was a high surrogate char, else 00

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                                pInputBuffer--;
                                inputLength++;
                            }

                            int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= surrogatePairsCount;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128<ushort>.Count;
                        inputLength -= Vector128<ushort>.Count;
                    } while (inputLength >= Vector128<ushort>.Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector<ushort>.Count)
                {
                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE41 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector<nuint> sumVector = (Vector<nuint>)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes));

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector<nuint>.Count; i++)
                        {
                            popcnt += sumVector[i];
                        }

                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector<ushort>.Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            // The loop below only compares element i against element i + 1, so a low
                            // surrogate in element 0 would never be examined. A trailing high surrogate
                            // from the previous vector is always re-read as element 0 of this vector,
                            // so a low surrogate in element 0 can never have a matching high surrogate.

                            if (lowSurrogateChars[0] != 0)
                            {
                                goto Error; // error: start of buffer contains standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 (element is 0xFFFF) or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.
                                // The scalar count needs no fixup here: this char was not included in
                                // 'surrogatePairsCount' above and it will be accounted for when it is
                                // re-read as the first element of the next chunk.

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector<ushort>.Count;
                        inputLength -= Vector<ushort>.Count;
                    } while (inputLength >= Vector<ushort>.Count);
                }
            }

        NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                // Read both code units at once; the expected constant is chosen by endianness so
                // that the high surrogate is matched against the first code unit in memory.

                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++; // consumed one extra char
                inputLength--;
            }

        Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment = tempScalarCountAdjustment;
            return pInputBuffer;
        }
    }
}