1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System
.Diagnostics
;
6 using System
.Runtime
.Intrinsics
;
7 using System
.Runtime
.Intrinsics
.X86
;
9 using Internal
.Runtime
.CompilerServices
;
11 #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
13 using nint
= System
.Int64
;
14 using nuint
= System
.UInt64
;
16 using nint
= System
.Int32
;
17 using nuint
= System
.UInt32
;
20 namespace System
.Text
.Unicode
22 internal static unsafe partial class Utf16Utility
27 Debug
.Assert(sizeof(nint
) == IntPtr
.Size
&& nint
.MinValue
< 0, "nint is defined incorrectly.");
28 Debug
.Assert(sizeof(nuint
) == IntPtr
.Size
&& nuint
.MinValue
== 0, "nuint is defined incorrectly.");
32 // Returns &inputBuffer[inputLength] if the input buffer is valid.
34 /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
35 /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
38 /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
40 public static char* GetPointerToFirstInvalidChar(char* pInputBuffer
, int inputLength
, out long utf8CodeUnitCountAdjustment
, out int scalarCountAdjustment
)
42 Debug
.Assert(inputLength
>= 0, "Input length must not be negative.");
43 Debug
.Assert(pInputBuffer
!= null || inputLength
== 0, "Input length must be zero if input buffer pointer is null.");
45 // First, we'll handle the common case of all-ASCII. If this is able to
46 // consume the entire buffer, we'll skip the remainder of this method's logic.
48 int numAsciiCharsConsumedJustNow
= (int)ASCIIUtility
.GetIndexOfFirstNonAsciiChar(pInputBuffer
, (uint)inputLength
);
49 Debug
.Assert(0 <= numAsciiCharsConsumedJustNow
&& numAsciiCharsConsumedJustNow
<= inputLength
);
51 pInputBuffer
+= (uint)numAsciiCharsConsumedJustNow
;
52 inputLength
-= numAsciiCharsConsumedJustNow
;
56 utf8CodeUnitCountAdjustment
= 0;
57 scalarCountAdjustment
= 0;
61 // If we got here, it means we saw some non-ASCII data, so within our
62 // vectorized code paths below we'll handle all non-surrogate UTF-16
63 // code points branchlessly. We'll only branch if we see surrogates.
65 // We still optimistically assume the data is mostly ASCII. This means that the
66 // number of UTF-8 code units and the number of scalars almost matches the number
67 // of UTF-16 code units. As we go through the input and find non-ASCII
68 // characters, we'll keep track of these "adjustment" fixups. To get the
69 // total number of UTF-8 code units required to encode the input data, add
70 // the UTF-8 code unit count adjustment to the number of UTF-16 code units
71 // seen. To get the total number of scalars present in the input data,
72 // add the scalar count adjustment to the number of UTF-16 code units seen.
74 long tempUtf8CodeUnitCountAdjustment
= 0;
75 int tempScalarCountAdjustment
= 0;
79 if (inputLength
>= Vector128
<ushort>.Count
)
81 Vector128
<ushort> vector0080
= Vector128
.Create((ushort)0x80);
82 Vector128
<ushort> vectorA800
= Vector128
.Create((ushort)0xA800);
83 Vector128
<short> vector8800
= Vector128
.Create(unchecked((short)0x8800));
84 Vector128
<ushort> vectorZero
= Vector128
<ushort>.Zero
;
88 Vector128
<ushort> utf16Data
= Sse2
.LoadVector128((ushort*)pInputBuffer
); // unaligned
91 // The 'charIsNonAscii' vector we're about to build will have the 0x8000 or the 0x0080
92 // bit set (but not both!) only if the corresponding input char is non-ASCII. Which of
93 // the two bits is set doesn't matter, as will be explained in the diagram a few lines
96 Vector128
<ushort> charIsNonAscii
;
97 if (Sse41
.IsSupported
)
99 // sets 0x0080 bit if corresponding char element is >= 0x0080
100 charIsNonAscii
= Sse41
.Min(utf16Data
, vector0080
);
104 // sets 0x8000 bit if corresponding char element is >= 0x0080
105 charIsNonAscii
= Sse2
.AndNot(vector0080
, Sse2
.Subtract(vectorZero
, Sse2
.ShiftRightLogical(utf16Data
, 7)));
109 // Quick check to ensure we didn't accidentally set both 0x8080 bits in any element.
110 uint debugMask
= (uint)Sse2
.MoveMask(charIsNonAscii
.AsByte());
111 Debug
.Assert((debugMask
& (debugMask
<< 1)) == 0, "Two set bits shouldn't occur adjacent to each other in this mask.");
114 // sets 0x8080 bits if corresponding char element is >= 0x0800
115 Vector128
<ushort> charIsThreeByteUtf8Encoded
= Sse2
.Subtract(vectorZero
, Sse2
.ShiftRightLogical(utf16Data
, 11));
117 mask
= (uint)Sse2
.MoveMask(Sse2
.Or(charIsNonAscii
, charIsThreeByteUtf8Encoded
).AsByte());
119 // Each odd bit of mask will be 1 only if the char was >= 0x0080,
120 // and each even bit of mask will be 1 only if the char was >= 0x0800.
122 // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
124 // ,-- set if char[1] is non-ASCII
125 // | ,-- set if char[0] is non-ASCII
127 // mask = ... 1 1 1 0
128 // ^ ^-- set if char[0] is >= 0x0800
129 // `-- set if char[1] is >= 0x0800
131 // (If the SSE4.1 code path is taken above, the meaning of the odd and even
132 // bits are swapped, but the logic below otherwise holds.)
134 // This means we can popcnt the number of set bits, and the result is the
135 // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
136 // it expands. This results in the wrong count for UTF-16 surrogate code
137 // units (we just counted that each individual code unit expands to 3 bytes,
138 // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
139 // We'll handle this in just a moment.
141 // For now, compute the popcnt but squirrel it away. We'll fold it in to the
142 // cumulative UTF-8 adjustment factor once we determine that there are no
143 // unpaired surrogates in our data. (Unpaired surrogates would invalidate
144 // our computed result and we'd have to throw it away.)
146 uint popcnt
= (uint)BitOperations
.PopCount(mask
);
148 // Surrogates need to be special-cased for two reasons: (a) we need
149 // to account for the fact that we over-counted in the addition above;
150 // and (b) they require separate validation.
152 utf16Data
= Sse2
.Add(utf16Data
, vectorA800
);
153 mask
= (uint)Sse2
.MoveMask(Sse2
.CompareLessThan(utf16Data
.AsInt16(), vector8800
).AsByte());
157 // There's at least one UTF-16 surrogate code unit present.
158 // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
159 // the resulting bits of 'mask' will occur in pairs:
160 // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
161 // - 11 if the corresponding UTF-16 char was a surrogate code unit.
163 // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
164 // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
165 // a low surrogate. Since we added 0xA800 in the vectorized operation above,
166 // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
167 // If we logical right-shift each word by 3, we'll end up with the bit pattern
168 // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
169 // determine whether a given char was a high or a low surrogate.
171 // Therefore the resulting bits of 'mask2' will occur in pairs:
172 // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
173 // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
174 // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
175 // Since 'mask' already has 00 in these positions (since the corresponding char
176 // wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.
178 uint mask2
= (uint)Sse2
.MoveMask(Sse2
.ShiftRightLogical(utf16Data
, 3).AsByte());
180 // 'lowSurrogatesMask' has its bits occur in pairs:
181 // - 01 if the corresponding char was a low surrogate char,
182 // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.
184 uint lowSurrogatesMask
= mask2
& mask
;
186 // 'highSurrogatesMask' has its bits occur in pairs:
187 // - 01 if the corresponding char was a high surrogate char,
188 // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.
190 uint highSurrogatesMask
= (mask2 ^
0b_0101_0101_0101_0101u
/* flip all even-numbered bits 00 <-> 01 */) & mask
;
192 Debug
.Assert((highSurrogatesMask
& lowSurrogatesMask
) == 0,
193 "A char cannot simultaneously be both a high and a low surrogate char.");
195 Debug
.Assert(((highSurrogatesMask
| lowSurrogatesMask
) & 0b_1010_1010_1010_1010u
) == 0,
196 "Only even bits (no odd bits) of the masks should be set.");
198 // Now check that each high surrogate is followed by a low surrogate and that each
199 // low surrogate follows a high surrogate. We make an exception for the case where
200 // the final char of the vector is a high surrogate, since we can't perform validation
201 // on it until the next iteration of the loop when we hope to consume the matching
204 highSurrogatesMask
<<= 2;
205 if ((ushort)highSurrogatesMask
!= lowSurrogatesMask
)
207 goto NonVectorizedLoop
; // error: mismatched surrogate pair; break out of vectorized logic
210 if (highSurrogatesMask
> ushort.MaxValue
)
212 // There was a standalone high surrogate at the end of the vector.
213 // We'll adjust our counters so that we don't consider this char consumed.
215 highSurrogatesMask
= (ushort)highSurrogatesMask
; // don't allow stray high surrogate to be consumed by popcnt
216 popcnt
-= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
221 // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
222 // free right now, saving the extension step a few lines below. If we're 32-bit, the
223 // convertion to nuint immediately below is a no-op, and we'll pay the cost of the real
224 // 64 -bit extension a few lines below.
225 nuint surrogatePairsCountNuint
= (uint)BitOperations
.PopCount(highSurrogatesMask
);
227 // 2 UTF-16 chars become 1 Unicode scalar
229 tempScalarCountAdjustment
-= (int)surrogatePairsCountNuint
;
231 // Since each surrogate code unit was >= 0x0800, we eagerly assumed
232 // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
233 // assumes that the pair is encoded as 6 UTF-8 code units. Since each
234 // pair is in reality only encoded as 4 UTF-8 code units, we need to
235 // perform this adjustment now.
237 if (IntPtr
.Size
== 8)
239 // Since we've already zero-extended surrogatePairsCountNuint, we can directly
240 // sub + sub. It's more efficient than shl + sub.
241 tempUtf8CodeUnitCountAdjustment
-= (long)surrogatePairsCountNuint
;
242 tempUtf8CodeUnitCountAdjustment
-= (long)surrogatePairsCountNuint
;
246 // Take the hit of the 64-bit extension now.
247 tempUtf8CodeUnitCountAdjustment
-= 2 * (uint)surrogatePairsCountNuint
;
251 tempUtf8CodeUnitCountAdjustment
+= popcnt
;
252 pInputBuffer
+= Vector128
<ushort>.Count
;
253 inputLength
-= Vector128
<ushort>.Count
;
254 } while (inputLength
>= Vector128
<ushort>.Count
);
257 else if (Vector
.IsHardwareAccelerated
)
259 if (inputLength
>= Vector
<ushort>.Count
)
261 Vector
<ushort> vector0080
= new Vector
<ushort>(0x0080);
262 Vector
<ushort> vector0400
= new Vector
<ushort>(0x0400);
263 Vector
<ushort> vector0800
= new Vector
<ushort>(0x0800);
264 Vector
<ushort> vectorD800
= new Vector
<ushort>(0xD800);
268 // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
269 // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
270 // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
271 // vectors, each element of the sum will contain one of three values:
273 // 0x0000 ( 0) = original char was 0000..007F
274 // 0xFFFF (-1) = original char was 0080..07FF
275 // 0xFFFE (-2) = original char was 0800..FFFF
277 // We'll negate them to produce a value 0..2 for each element, then sum all the
278 // elements together to produce the number of *additional* UTF-8 code units
279 // required to represent this UTF-16 data. This is similar to the popcnt step
280 // performed by the SSE2 code path. This will overcount surrogates, but we'll
281 // handle that shortly.
283 Vector
<ushort> utf16Data
= Unsafe
.ReadUnaligned
<Vector
<ushort>>(pInputBuffer
);
284 Vector
<ushort> twoOrMoreUtf8Bytes
= Vector
.GreaterThanOrEqual(utf16Data
, vector0080
);
285 Vector
<ushort> threeOrMoreUtf8Bytes
= Vector
.GreaterThanOrEqual(utf16Data
, vector0800
);
286 Vector
<nuint
> sumVector
= (Vector
<nuint
>)(Vector
<ushort>.Zero
- twoOrMoreUtf8Bytes
- threeOrMoreUtf8Bytes
);
288 // We'll try summing by a natural word (rather than a 16-bit word) at a time,
289 // which should halve the number of operations we must perform.
292 for (int i
= 0; i
< Vector
<nuint
>.Count
; i
++)
294 popcnt
+= sumVector
[i
];
297 uint popcnt32
= (uint)popcnt
;
298 if (IntPtr
.Size
== 8)
300 popcnt32
+= (uint)(popcnt
>> 32);
303 // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
304 // know there aren't any unpaired surrogates in the input data.
306 popcnt32
= (ushort)popcnt32
+ (popcnt32
>> 16);
308 // Now check for surrogates.
310 utf16Data
-= vectorD800
;
311 Vector
<ushort> surrogateChars
= Vector
.LessThan(utf16Data
, vector0800
);
312 if (surrogateChars
!= Vector
<ushort>.Zero
)
314 // There's at least one surrogate (high or low) UTF-16 code unit in
315 // the vector. We'll build up additional vectors: 'highSurrogateChars'
316 // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
317 // UTF-16 code unit was a high or low surrogate, respectively.
319 Vector
<ushort> highSurrogateChars
= Vector
.LessThan(utf16Data
, vector0400
);
320 Vector
<ushort> lowSurrogateChars
= Vector
.AndNot(surrogateChars
, highSurrogateChars
);
322 // We want to make sure that each high surrogate code unit is followed by
323 // a low surrogate code unit and each low surrogate code unit follows a
324 // high surrogate code unit. Since we don't have an equivalent of pmovmskb
325 // or palignr available to us, we'll do this as a loop. We won't look at
326 // the very last high surrogate char element since we don't yet know if
327 // the next vector read will have a low surrogate char element.
329 if (lowSurrogateChars
[0] != 0)
331 goto Error
; // error: start of buffer contains standalone low surrogate char
334 ushort surrogatePairsCount
= 0;
335 for (int i
= 0; i
< Vector
<ushort>.Count
- 1; i
++)
337 surrogatePairsCount
-= highSurrogateChars
[i
]; // turns into +1 or +0
338 if (highSurrogateChars
[i
] != lowSurrogateChars
[i
+ 1])
340 goto NonVectorizedLoop
; // error: mismatched surrogate pair; break out of vectorized logic
344 if (highSurrogateChars
[Vector
<ushort>.Count
- 1] != 0)
346 // There was a standalone high surrogate at the end of the vector.
347 // We'll adjust our counters so that we don't consider this char consumed.
354 nint surrogatePairsCountNint
= (nint
)surrogatePairsCount
; // zero-extend to native int size
356 // 2 UTF-16 chars become 1 Unicode scalar
358 tempScalarCountAdjustment
-= (int)surrogatePairsCountNint
;
360 // Since each surrogate code unit was >= 0x0800, we eagerly assumed
361 // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
362 // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
363 // so we'll adjust this now.
365 tempUtf8CodeUnitCountAdjustment
-= surrogatePairsCountNint
;
366 tempUtf8CodeUnitCountAdjustment
-= surrogatePairsCountNint
;
369 tempUtf8CodeUnitCountAdjustment
+= popcnt32
;
370 pInputBuffer
+= Vector
<ushort>.Count
;
371 inputLength
-= Vector
<ushort>.Count
;
372 } while (inputLength
>= Vector
<ushort>.Count
);
378 // Vectorization isn't supported on our current platform, or the input was too small to benefit
379 // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
380 // drain remaining valid chars before we report failure.
382 for (; inputLength
> 0; pInputBuffer
++, inputLength
--)
384 uint thisChar
= pInputBuffer
[0];
385 if (thisChar
<= 0x7F)
390 // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
391 // This optimistically assumes no surrogates, which we'll handle shortly.
393 tempUtf8CodeUnitCountAdjustment
+= (thisChar
+ 0x0001_F
800u) >> 16;
395 if (!UnicodeUtility
.IsSurrogateCodePoint(thisChar
))
400 // Found a surrogate char. Back out the adjustment we made above, then
401 // try to consume the entire surrogate pair all at once. We won't bother
402 // trying to interpret the surrogate pair as a scalar value; we'll only
403 // validate that its bit pattern matches what's expected for a surrogate pair.
405 tempUtf8CodeUnitCountAdjustment
-= 2;
407 if (inputLength
== 1)
409 goto Error
; // input buffer too small to read a surrogate pair
412 thisChar
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
413 if (((thisChar
- (BitConverter
.IsLittleEndian
? 0xDC00_D
800u : 0xD800_DC
00u)) & 0xFC00_FC
00u) != 0)
415 goto Error
; // not a well-formed surrogate pair
418 tempScalarCountAdjustment
--; // 2 UTF-16 code units -> 1 scalar
419 tempUtf8CodeUnitCountAdjustment
+= 2; // 2 UTF-16 code units -> 4 UTF-8 code units
421 pInputBuffer
++; // consumed one extra char
427 // Also used for normal return.
429 utf8CodeUnitCountAdjustment
= tempUtf8CodeUnitCountAdjustment
;
430 scalarCountAdjustment
= tempScalarCountAdjustment
;