1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 using System
.Buffers
.Binary
;
7 using System
.Diagnostics
;
9 using System
.Runtime
.Intrinsics
.X86
;
10 using Internal
.Runtime
.CompilerServices
;
12 #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
14 using nint
= System
.Int64
;
15 using nuint
= System
.UInt64
;
17 using nint
= System
.Int32
;
18 using nuint
= System
.UInt32
;
21 namespace System
.Text
.Unicode
23 internal static unsafe partial class Utf8Utility
// Sanity checks on the nint / nuint aliases declared at the top of this file: each alias must be
// pointer-sized (sizeof == IntPtr.Size), nint must be signed (MinValue < 0) and nuint must be
// unsigned (MinValue == 0).
// NOTE(review): the header of the member that contains these statements is not visible in this
// extract (the embedded line numbers jump from 23 to 28); presumably a debug-only static
// constructor -- confirm against the authoritative copy.
28 Debug
.Assert(sizeof(nint
) == IntPtr
.Size
&& nint
.MinValue
< 0, "nint is defined incorrectly.");
29 Debug
.Assert(sizeof(nuint
) == IntPtr
.Size
&& nuint
.MinValue
== 0, "nuint is defined incorrectly.");
// Not defined in this view; presumably supplied by another part of this partial class -- confirm.
31 _ValidateAdditionalNIntDefinitions();
// Transcodes UTF-8 bytes into UTF-16 chars, checking the input for well-formedness along the way
// (overlong, surrogate and out-of-range sequences are routed to the Error exit).
//
//   pInputBuffer / inputLength            - source bytes and their count; the length is asserted to be
//                                           non-negative and may only be non-zero for a non-null pointer.
//   pOutputBuffer / outputCharsRemaining  - destination chars and capacity; same assertions.
//   pInputBufferRemaining / pOutputBufferRemaining
//                                         - receive the final values of the two working pointers.
//
// The status values assigned at the end of the method are Done, NeedMoreData, DestinationTooSmall
// and InvalidData; an all-ASCII input that fits the destination returns Done from the early exit.
//
// NOTE(review): the embedded original line numbers in this extract skip values (for example
// 104 -> 107 and 131 -> 136), and braces, several labels, local declarations, pointer increments
// and the final return statement are not visible. Presumably those lines were lost in extraction;
// confirm against the authoritative copy before relying on this text.
35 // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
36 // the next byte would have been consumed from / the next char would have been written to.
37 // inputLength in bytes, outputCharsRemaining in chars.
38 public static OperationStatus
TranscodeToUtf16(byte* pInputBuffer
, int inputLength
, char* pOutputBuffer
, int outputCharsRemaining
, out byte* pInputBufferRemaining
, out char* pOutputBufferRemaining
)
40 Debug
.Assert(inputLength
>= 0, "Input length must not be negative.");
41 Debug
.Assert(pInputBuffer
!= null || inputLength
== 0, "Input length must be zero if input buffer pointer is null.");
43 Debug
.Assert(outputCharsRemaining
>= 0, "Destination length must not be negative.");
44 Debug
.Assert(pOutputBuffer
!= null || outputCharsRemaining
== 0, "Destination length must be zero if destination buffer pointer is null.");
46 // First, try vectorized conversion.
// The helper is given min(inputLength, outputCharsRemaining) elements; its return value is the
// amount by which both working pointers (and, below, both remaining counts) are adjusted.
49 nuint numElementsConverted
= ASCIIUtility
.WidenAsciiToUtf16(pInputBuffer
, pOutputBuffer
, (uint)Math
.Min(inputLength
, outputCharsRemaining
));
51 pInputBuffer
+= numElementsConverted
;
52 pOutputBuffer
+= numElementsConverted
;
54 // Quick check - did we just end up consuming the entire input buffer?
55 // If so, short-circuit the remainder of the method.
57 if ((int)numElementsConverted
== inputLength
)
59 pInputBufferRemaining
= pInputBuffer
;
60 pOutputBufferRemaining
= pOutputBuffer
;
61 return OperationStatus
.Done
;
64 inputLength
-= (int)numElementsConverted
;
65 outputCharsRemaining
-= (int)numElementsConverted
;
// Fewer than 4 input bytes left: skip the DWORD-at-a-time loop and go to the byte-at-a-time loop.
68 if (inputLength
< sizeof(uint))
70 goto ProcessInputOfLessThanDWordSize
;
// Last address from which a full 4-byte read still lies inside the input: start + length - 4.
73 byte* pFinalPosWhereCanReadDWordFromInputBuffer
= pInputBuffer
+ (uint)inputLength
- 4;
75 // Begin the main loop.
78 byte* pLastBufferPosProcessed
= null; // used for invariant checking in debug builds
81 while (pInputBuffer
<= pFinalPosWhereCanReadDWordFromInputBuffer
)
83 // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
85 uint thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
// NOTE(review): later code jumps to an AfterReadDWord label that is not visible in this extract;
// presumably it sits around here -- confirm.
90 Debug
.Assert(pLastBufferPosProcessed
< pInputBuffer
, "Algorithm should've made forward progress since last read.");
91 pLastBufferPosProcessed
= pInputBuffer
;
93 // First, check for the common case of all-ASCII bytes.
95 if (ASCIIUtility
.AllBytesInUInt32AreAscii(thisDWord
))
97 // We read an all-ASCII sequence.
99 if (outputCharsRemaining
< sizeof(uint))
101 goto ProcessRemainingBytesSlow
; // running out of space, but may be able to write some data
104 Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer
, thisDWord
);
// NOTE(review): no advance of pInputBuffer / pOutputBuffer is visible after this write (embedded
// line numbers 105-106 are absent); presumably both move forward by 4 -- confirm.
107 outputCharsRemaining
-= 4;
109 // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
110 // Below is basically unrolled loops with poor man's vectorization.
// remainingInputBytes = (distance to the last DWORD-readable position) + 4.
// maxIters = how many 8-element iterations fit in both the remaining input and the remaining output.
112 uint remainingInputBytes
= (uint)(void*)Unsafe
.ByteOffset(ref *pInputBuffer
, ref *pFinalPosWhereCanReadDWordFromInputBuffer
) + 4;
113 uint maxIters
= Math
.Min(remainingInputBytes
, (uint)outputCharsRemaining
) / (2 * sizeof(uint));
// NOTE(review): the declarations of 'i' and 'secondDWord' used below are not visible in this extract.
116 for (i
= 0; (uint)i
< maxIters
; i
++)
118 // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
120 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
121 secondDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
+ sizeof(uint));
// OR-ing the two DWORDs lets a single test detect a non-ASCII byte in either of them.
123 if (!ASCIIUtility
.AllBytesInUInt32AreAscii(thisDWord
| secondDWord
))
125 goto LoopTerminatedEarlyDueToNonAsciiData
;
130 Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer
[0], thisDWord
);
131 Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer
[4], secondDWord
);
// The remaining-output count is settled once after the loop: 8 chars per completed iteration.
136 outputCharsRemaining
-= 8 * i
;
138 continue; // need to perform a bounds check because we might be running out of data
140 LoopTerminatedEarlyDueToNonAsciiData:
142 if (ASCIIUtility
.AllBytesInUInt32AreAscii(thisDWord
))
144 // The first DWORD contained all-ASCII bytes, so expand it.
146 Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer
, thisDWord
);
148 // continue the outer loop from the second DWORD
150 Debug
.Assert(!ASCIIUtility
.AllBytesInUInt32AreAscii(secondDWord
));
151 thisDWord
= secondDWord
;
155 outputCharsRemaining
-= 4;
158 outputCharsRemaining
-= 8 * i
;
160 // We know that there's *at least* one DWORD of data remaining in the buffer.
161 // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
163 goto AfterReadDWordSkipAllBytesAsciiCheck
;
166 AfterReadDWordSkipAllBytesAsciiCheck:
168 Debug
.Assert(!ASCIIUtility
.AllBytesInUInt32AreAscii(thisDWord
)); // this should have been handled earlier
170 // Next, try stripping off ASCII bytes one at a time.
171 // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
173 if (UInt32FirstByteIsAscii(thisDWord
))
175 if (outputCharsRemaining
>= 3)
177 // Fast-track: we don't need to check the destination length for subsequent
178 // ASCII bytes since we know we can write them all now.
180 uint thisDWordLittleEndian
= ToLittleEndian(thisDWord
);
// 'adjustment' is the amount applied to both pointers and to outputCharsRemaining further down.
// NOTE(review): it is only ever set to 1 in the visible text; presumably it is incremented for the
// second and third ASCII bytes on lines absent from this extract -- confirm.
182 nuint adjustment
= 1;
183 pOutputBuffer
[0] = (char)(byte)thisDWordLittleEndian
;
185 if (UInt32SecondByteIsAscii(thisDWord
))
188 thisDWordLittleEndian
>>= 8;
189 pOutputBuffer
[1] = (char)(byte)thisDWordLittleEndian
;
191 if (UInt32ThirdByteIsAscii(thisDWord
))
194 thisDWordLittleEndian
>>= 8;
195 pOutputBuffer
[2] = (char)(byte)thisDWordLittleEndian
;
199 pInputBuffer
+= adjustment
;
200 pOutputBuffer
+= adjustment
;
201 outputCharsRemaining
-= (int)adjustment
;
205 // Slow-track: we need to make sure each individual write has enough
206 // of a buffer so that we don't overrun the destination.
208 if (outputCharsRemaining
== 0)
210 goto OutputBufferTooSmall
;
213 uint thisDWordLittleEndian
= ToLittleEndian(thisDWord
);
216 *pOutputBuffer
++ = (char)(byte)thisDWordLittleEndian
;
217 outputCharsRemaining
--;
219 if (UInt32SecondByteIsAscii(thisDWord
))
221 if (outputCharsRemaining
== 0)
223 goto OutputBufferTooSmall
;
227 thisDWordLittleEndian
>>= 8;
228 *pOutputBuffer
++ = (char)(byte)thisDWordLittleEndian
;
230 // We can perform a small optimization here. We know at this point that
231 // the output buffer is fully consumed (we read two ASCII bytes and wrote
232 // two ASCII chars, and we checked earlier that the destination buffer
233 // can't store a third byte). If the next byte is ASCII, we can jump straight
234 // to the return statement since the end-of-method logic only relies on the
235 // destination buffer pointer -- NOT the output chars remaining count -- being
236 // correct. If the next byte is not ASCII, we'll need to continue with the
237 // rest of the main loop, but we can set the buffer length directly to zero
238 // rather than decrementing it from 1 to 0.
240 Debug
.Assert(outputCharsRemaining
== 1);
242 if (UInt32ThirdByteIsAscii(thisDWord
))
244 goto OutputBufferTooSmall
;
248 outputCharsRemaining
= 0;
253 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
255 goto ProcessRemainingBytesSlow
; // input buffer doesn't contain enough data to read a DWORD
259 // The input buffer at the current offset contains a non-ASCII byte.
260 // Read an entire DWORD and fall through to multi-byte consumption logic.
261 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
265 BeforeProcessTwoByteSequence:
267 // At this point, we know we're working with a multi-byte code unit,
268 // but we haven't yet validated it.
270 // The masks and comparands are derived from the Unicode Standard, Table 3-6.
271 // Additionally, we need to check for valid byte sequences per Table 3-7.
273 // Check the 2-byte case.
275 if (UInt32BeginsWithUtf8TwoByteMask(thisDWord
))
277 // Per Table 3-7, valid sequences are:
278 // [ C2..DF ] [ 80..BF ]
// NOTE(review): the statement guarded by the overlong check below is not visible in this extract
// (embedded line numbers 281-284 are absent); presumably a jump to the Error exit -- confirm.
280 if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord
))
285 ProcessTwoByteSequenceSkipOverlongFormCheck:
287 // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
288 // there's a good chance that if we see one two-byte run then there's another two-byte
289 // run immediately after. Let's check that now.
291 // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
292 // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
293 // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
295 if ((BitConverter
.IsLittleEndian
&& UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord
))
296 || (!BitConverter
.IsLittleEndian
&& (UInt32EndsWithUtf8TwoByteMask(thisDWord
) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord
))))
298 // We have two runs of two bytes each.
300 if (outputCharsRemaining
< 2)
302 goto ProcessRemainingBytesSlow
; // running out of output buffer
// Both decoded chars are stored with a single 32-bit write.
305 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord
));
309 outputCharsRemaining
-= 2;
311 if (pInputBuffer
<= pFinalPosWhereCanReadDWordFromInputBuffer
)
313 // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
314 // also two bytes. Check for that first before going back to the beginning of the loop.
316 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
318 if (BitConverter
.IsLittleEndian
)
320 if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord
))
322 // The next sequence is a valid two-byte sequence.
323 goto ProcessTwoByteSequenceSkipOverlongFormCheck
;
328 if (UInt32BeginsWithUtf8TwoByteMask(thisDWord
))
330 if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord
))
332 goto Error
; // The next sequence purports to be a 2-byte sequence but is overlong.
335 goto ProcessTwoByteSequenceSkipOverlongFormCheck
;
339 // If we reached this point, the next sequence is something other than a valid
340 // two-byte sequence, so go back to the beginning of the loop.
345 goto ProcessRemainingBytesSlow
; // Running out of data - go down slow path
349 // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
350 // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
// NOTE(review): the sentence above is cut off in this extract (embedded line 351 is absent);
// judging by the ASCII checks that follow, presumably it ends "... bytes are ASCII?" -- confirm.
353 uint charToWrite
= ExtractCharFromFirstTwoByteSequence(thisDWord
); // optimistically compute this now, but don't store until we know dest is large enough
355 if (UInt32ThirdByteIsAscii(thisDWord
))
357 if (UInt32FourthByteIsAscii(thisDWord
))
359 if (outputCharsRemaining
< 3)
361 goto ProcessRemainingBytesSlow
; // running out of output buffer
364 pOutputBuffer
[0] = (char)charToWrite
;
365 if (BitConverter
.IsLittleEndian
)
// NOTE(review): in the visible text thisDWord is not shifted between the two little-endian
// stores below (embedded lines 367 and 369 are absent); presumably shifts were lost -- confirm.
368 pOutputBuffer
[1] = (char)(byte)thisDWord
;
370 pOutputBuffer
[2] = (char)thisDWord
;
374 pOutputBuffer
[2] = (char)(byte)thisDWord
;
375 pOutputBuffer
[1] = (char)(byte)(thisDWord
>> 8);
379 outputCharsRemaining
-= 3;
381 continue; // go back to original bounds check and check for ASCII
385 if (outputCharsRemaining
< 2)
387 goto ProcessRemainingBytesSlow
; // running out of output buffer
390 pOutputBuffer
[0] = (char)charToWrite
;
// The third input byte is the high-order side of the low 16 bits on big-endian (shift 8) and the
// low byte of the high 16 bits on little-endian (shift 16).
391 pOutputBuffer
[1] = (char)(byte)(thisDWord
>> (BitConverter
.IsLittleEndian
? 16 : 8));
394 outputCharsRemaining
-= 2;
396 // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
397 // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
399 if (pFinalPosWhereCanReadDWordFromInputBuffer
< pInputBuffer
)
401 goto ProcessRemainingBytesSlow
; // Running out of data - go down slow path
405 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
406 goto BeforeProcessTwoByteSequence
;
412 if (outputCharsRemaining
== 0)
414 goto ProcessRemainingBytesSlow
; // running out of output buffer
417 pOutputBuffer
[0] = (char)charToWrite
;
420 outputCharsRemaining
--;
422 if (pFinalPosWhereCanReadDWordFromInputBuffer
< pInputBuffer
)
424 goto ProcessRemainingBytesSlow
; // Running out of data - go down slow path
428 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
429 goto BeforeProcessThreeByteSequence
; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
434 // Check the 3-byte case.
436 BeforeProcessThreeByteSequence:
438 if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord
))
440 ProcessThreeByteSequenceWithCheck:
442 // We need to check for overlong or surrogate three-byte sequences.
444 // Per Table 3-7, valid sequences are:
445 // [ E0 ] [ A0..BF ] [ 80..BF ]
446 // [ E1..EC ] [ 80..BF ] [ 80..BF ]
447 // [ ED ] [ 80..9F ] [ 80..BF ]
448 // [ EE..EF ] [ 80..BF ] [ 80..BF ]
450 // Big-endian examples of using the above validation table:
451 // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
452 // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
453 // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
454 // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
455 // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
457 if (BitConverter
.IsLittleEndian
)
459 // The "overlong or surrogate" check can be implemented using a single jump, but there's
460 // some overhead to moving the bits into the correct locations in order to perform the
461 // correct comparison, and in practice the processor's branch prediction capability is
462 // good enough that we shouldn't bother. So we'll use two jumps instead.
464 // Can't extract this check into its own helper method because JITter produces suboptimal
465 // assembly, even with aggressive inlining.
467 // Code below becomes 5 instructions: test, jz, lea, test, jz
// 0x200F / 0x200D are the byte-swapped (little-endian) forms of the 0F20 / 0D20 mask and
// comparand described in the table above.
469 if (((thisDWord
& 0x0000_
200Fu
) == 0) || (((thisDWord
- 0x0000_
200Du
) & 0x0000_
200Fu
) == 0))
471 goto Error
; // overlong or surrogate
// Big-endian form of the same check: the mask and comparand sit in the high 16 bits.
476 if (((thisDWord
& 0x0F20_
0000u) == 0) || (((thisDWord
- 0x0D20_
0000u) & 0x0F20_
0000u) == 0))
478 goto Error
; // overlong or surrogate
482 // At this point, we know the incoming scalar is well-formed.
484 if (outputCharsRemaining
== 0)
486 goto OutputBufferTooSmall
; // not enough space in the destination buffer to write
489 // As an optimization, on compatible platforms check if a second three-byte sequence immediately
490 // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
492 if (Bmi2
.X64
.IsSupported
)
494 Debug
.Assert(BitConverter
.IsLittleEndian
, "BMI2 requires little-endian.");
496 // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
497 // would indicate the potential start of a second three-byte sequence.
499 if (((thisDWord
- 0xE000_
0000u) & 0xF000_
0000u) == 0)
501 // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
502 // the final place where we can safely perform a DWORD read, and we want to probe whether it's
503 // safe to read a DWORD beginning at address &pInputBuffer[3].
505 if (outputCharsRemaining
> 1 && (nint
)(void*)Unsafe
.ByteOffset(ref *pInputBuffer
, ref *pFinalPosWhereCanReadDWordFromInputBuffer
) >= 3)
507 // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
508 // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
509 // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
510 // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
511 // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
512 // fail, and redirect control flow to the error handling logic at the very end of this method.
514 uint secondDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
+ 3);
516 if (UInt32BeginsWithUtf8ThreeByteMask(secondDWord
)
517 && ((secondDWord
& 0x0000_
200Fu
) != 0)
518 && (((secondDWord
- 0x0000_
200Du
) & 0x0000_
200Fu
) != 0))
520 // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
521 ulong combinedQWord
= ((ulong)BinaryPrimitives
.ReverseEndianness(secondDWord
) << 32) | BinaryPrimitives
.ReverseEndianness(thisDWord
);
522 thisDWord
= secondDWord
; // store this value in the correct local for the ASCII drain logic
524 // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
525 ulong extractedQWord
= Bmi2
.X64
.ParallelBitExtract(combinedQWord
, 0x0F3F3F00_
0F
3F
3F
00ul);
527 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, (uint)extractedQWord
);
530 outputCharsRemaining
-= 2;
532 // Drain any ASCII data following the second three-byte sequence.
534 goto CheckForAsciiByteAfterThreeByteSequence
;
540 // Couldn't extract 2x three-byte sequences together, just do this one by itself.
542 *pOutputBuffer
= (char)ExtractCharFromFirstThreeByteSequence(thisDWord
);
545 outputCharsRemaining
--;
547 CheckForAsciiByteAfterThreeByteSequence:
549 // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
550 // in to the text. If this happens strip it off now before seeing if the next character
551 // consists of three code units.
553 if (UInt32FourthByteIsAscii(thisDWord
))
555 if (outputCharsRemaining
== 0)
557 goto OutputBufferTooSmall
;
// The fourth input byte is the top byte of the DWORD on little-endian and the bottom byte on big-endian.
560 if (BitConverter
.IsLittleEndian
)
562 *pOutputBuffer
= (char)(thisDWord
>> 24);
566 *pOutputBuffer
= (char)(byte)thisDWord
;
571 outputCharsRemaining
--;
574 if (pInputBuffer
<= pFinalPosWhereCanReadDWordFromInputBuffer
)
576 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
578 // Optimization: A three-byte character could indicate CJK text, which makes it likely
579 // that the character following this one is also CJK. We'll check for a three-byte sequence
580 // marker now and jump directly to three-byte sequence processing if we see one, skipping
581 // all of the logic at the beginning of the loop.
583 if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord
))
585 goto ProcessThreeByteSequenceWithCheck
; // found a three-byte sequence marker; validate and consume
589 goto AfterReadDWord
; // probably ASCII punctuation or whitespace
594 goto ProcessRemainingBytesSlow
; // Running out of data - go down slow path
598 // Assume the 4-byte case, but we need to validate.
601 // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
603 // Per Table 3-7, valid sequences are:
604 // [ F0 ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
605 // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
606 // [ F4 ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
// NOTE(review): the statements guarded by the three validation checks below (the mask check and
// the two range checks) are not visible in this extract; presumably each jumps to the Error
// exit -- confirm.
608 if (!UInt32BeginsWithUtf8FourByteMask(thisDWord
))
613 // Now check for overlong / out-of-range sequences.
615 if (BitConverter
.IsLittleEndian
)
617 // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
618 // We want to get the 'w' byte in front of the 'z' byte so that we can perform
619 // a single range comparison. We'll take advantage of the fact that the JITter
620 // can detect a ROR / ROL operation, then we'll just zero out the bytes that
621 // aren't involved in the range check.
623 uint toCheck
= thisDWord
& 0x0000_FFFFu
;
625 // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
627 toCheck
= BitOperations
.RotateRight(toCheck
, 8);
629 // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
631 if (!UnicodeUtility
.IsInRangeInclusive(toCheck
, 0xF000_
0090u, 0xF400_
008Fu
))
638 if (!UnicodeUtility
.IsInRangeInclusive(thisDWord
, 0xF090_
0000u, 0xF48F_FFFFu
))
644 // Validation complete.
646 if (outputCharsRemaining
< 2)
648 // There's no point to falling back to the "drain the input buffer" logic, since we know
649 // we can't write anything to the destination. So we'll just exit immediately.
650 goto OutputBufferTooSmall
;
// A four-byte sequence is stored as two chars with a single 32-bit write.
653 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, ExtractCharsFromFourByteSequence(thisDWord
));
657 outputCharsRemaining
-= 2;
659 continue; // go back to beginning of loop for processing
// Recompute inputLength as the number of input bytes still unread:
// (last DWORD-readable position - current position) + 4.
663 ProcessRemainingBytesSlow:
664 inputLength
= (int)(void*)Unsafe
.ByteOffset(ref *pInputBuffer
, ref *pFinalPosWhereCanReadDWordFromInputBuffer
) + 4;
// Byte-at-a-time loop. Also entered directly from the top of the method when fewer than 4 input
// bytes remain after the vectorized ASCII pass.
666 ProcessInputOfLessThanDWordSize:
667 while (inputLength
> 0)
669 uint firstByte
= pInputBuffer
[0];
670 if (firstByte
<= 0x7Fu
)
672 if (outputCharsRemaining
== 0)
674 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
677 // 1-byte (ASCII) case
678 *pOutputBuffer
= (char)firstByte
;
683 outputCharsRemaining
--;
687 // Potentially the start of a multi-byte sequence?
// NOTE(review): the comparisons below are written relative to 0xC2 (e.g. 0xDF - 0xC2), which only
// makes sense if firstByte has had 0xC2 subtracted beforehand; that statement is not visible in
// this extract (embedded lines 688-689 are absent) -- confirm.
690 if ((byte)firstByte
<= (0xDFu
- 0xC2u
))
692 // Potentially a 2-byte sequence?
695 goto InputBufferTooSmall
; // out of data
698 uint secondByte
= pInputBuffer
[1];
699 if (!IsLowByteUtf8ContinuationByte(secondByte
))
701 goto Error
; // 2-byte marker not followed by continuation byte
704 if (outputCharsRemaining
== 0)
706 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
709 uint asChar
= (firstByte
<< 6) + secondByte
+ ((0xC2u
- 0xC0u
) << 6) - 0x80u
; // remove UTF-8 markers from scalar
710 *pOutputBuffer
= (char)asChar
;
715 outputCharsRemaining
--;
718 else if ((byte)firstByte
<= (0xEFu
- 0xC2u
))
720 // Potentially a 3-byte sequence?
721 if (inputLength
>= 3)
723 uint secondByte
= pInputBuffer
[1];
724 uint thirdByte
= pInputBuffer
[2];
725 if (!IsLowByteUtf8ContinuationByte(secondByte
) || !IsLowByteUtf8ContinuationByte(thirdByte
))
727 goto Error
; // 3-byte marker not followed by 2 continuation bytes
730 // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
731 // We account for this in the comparisons below.
733 uint partialChar
= (firstByte
<< 12) + (secondByte
<< 6);
734 if (partialChar
< ((0xE0u
- 0xC2u
) << 12) + (0xA0u
<< 6))
736 goto Error
; // this is an overlong encoding; fail
739 partialChar
-= ((0xEDu
- 0xC2u
) << 12) + (0xA0u
<< 6); // if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
740 if (partialChar
< 0x0800u
/* number of code points in UTF-16 surrogate code point range */)
742 goto Error
; // attempted to encode a UTF-16 surrogate code point; fail
745 if (outputCharsRemaining
== 0)
747 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
750 // Now restore the full scalar value.
752 partialChar
+= thirdByte
;
753 partialChar
+= 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
754 partialChar
-= 0x80u
; // remove third byte continuation marker
756 *pOutputBuffer
= (char)partialChar
;
761 outputCharsRemaining
--;
// Only two bytes of a three-byte sequence are available: validate what can be validated from the
// first two bytes so invalid data is reported in preference to running out of input.
764 else if (inputLength
>= 2)
766 uint secondByte
= pInputBuffer
[1];
767 if (!IsLowByteUtf8ContinuationByte(secondByte
))
769 goto Error
; // 3-byte marker not followed by continuation byte
772 // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
773 // from just the first two bytes.
775 uint partialChar
= (firstByte
<< 6) + secondByte
; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
776 if (partialChar
< ((0xE0u
- 0xC2u
) << 6) + 0xA0u
)
778 goto Error
; // failed overlong check
780 if (UnicodeUtility
.IsInRangeInclusive(partialChar
, ((0xEDu
- 0xC2u
) << 6) + 0xA0u
, ((0xEEu
- 0xC2u
) << 6) + 0x7Fu
))
782 goto Error
; // failed surrogate check
786 goto InputBufferTooSmall
; // out of data
788 else if ((byte)firstByte
<= (0xF4u
- 0xC2u
))
790 // Potentially a 4-byte sequence?
// NOTE(review): the length checks that guard the InputBufferTooSmall jumps in this branch are not
// visible in this extract; presumably each tests inputLength against the number of bytes about
// to be read -- confirm.
794 goto InputBufferTooSmall
; // ran out of data
797 uint nextByte
= pInputBuffer
[1];
798 if (!IsLowByteUtf8ContinuationByte(nextByte
))
800 goto Error
; // 4-byte marker not followed by a continuation byte
803 uint asPartialChar
= (firstByte
<< 6) + nextByte
; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
804 if (!UnicodeUtility
.IsInRangeInclusive(asPartialChar
, ((0xF0u
- 0xC2u
) << 6) + 0x90u
, ((0xF4u
- 0xC2u
) << 6) + 0x8Fu
))
806 goto Error
; // failed overlong / out-of-range check
811 goto InputBufferTooSmall
; // ran out of data
814 if (!IsLowByteUtf8ContinuationByte(pInputBuffer
[2]))
816 goto Error
; // third byte in 4-byte sequence not a continuation byte
821 goto InputBufferTooSmall
; // ran out of data
824 if (!IsLowByteUtf8ContinuationByte(pInputBuffer
[3]))
826 goto Error
; // fourth byte in 4-byte sequence not a continuation byte
829 // If we read a valid astral scalar value, the only way we could've fallen down this code path
830 // is that we didn't have enough output buffer to write the result.
832 goto OutputBufferTooSmall
;
836 goto Error
; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
// Exit paths: retVal starts as Done and is overwritten by whichever exit was taken.
// NOTE(review): only the OutputBufferTooSmall label is visible here; presumably the NeedMoreData
// assignment sits under InputBufferTooSmall and the InvalidData assignment under Error, with jumps
// to a common tail between them -- confirm.
840 OperationStatus retVal
= OperationStatus
.Done
;
844 retVal
= OperationStatus
.NeedMoreData
;
847 OutputBufferTooSmall:
848 retVal
= OperationStatus
.DestinationTooSmall
;
852 retVal
= OperationStatus
.InvalidData
;
// Publish the final working pointers to the caller.
856 pInputBufferRemaining
= pInputBuffer
;
857 pOutputBufferRemaining
= pOutputBuffer
;
861 // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
862 // the next char would have been consumed from / the next byte would have been written to.
863 // inputLength in chars, outputBytesRemaining in bytes.
864 public static OperationStatus
TranscodeToUtf8(char* pInputBuffer
, int inputLength
, byte* pOutputBuffer
, int outputBytesRemaining
, out char* pInputBufferRemaining
, out byte* pOutputBufferRemaining
)
866 const int CharsPerDWord
= sizeof(uint) / sizeof(char);
868 Debug
.Assert(inputLength
>= 0, "Input length must not be negative.");
869 Debug
.Assert(pInputBuffer
!= null || inputLength
== 0, "Input length must be zero if input buffer pointer is null.");
871 Debug
.Assert(outputBytesRemaining
>= 0, "Destination length must not be negative.");
872 Debug
.Assert(pOutputBuffer
!= null || outputBytesRemaining
== 0, "Destination length must be zero if destination buffer pointer is null.");
874 // First, try vectorized conversion.
877 nuint numElementsConverted
= ASCIIUtility
.NarrowUtf16ToAscii(pInputBuffer
, pOutputBuffer
, (uint)Math
.Min(inputLength
, outputBytesRemaining
));
879 pInputBuffer
+= numElementsConverted
;
880 pOutputBuffer
+= numElementsConverted
;
882 // Quick check - did we just end up consuming the entire input buffer?
883 // If so, short-circuit the remainder of the method.
885 if ((int)numElementsConverted
== inputLength
)
887 pInputBufferRemaining
= pInputBuffer
;
888 pOutputBufferRemaining
= pOutputBuffer
;
889 return OperationStatus
.Done
;
892 inputLength
-= (int)numElementsConverted
;
893 outputBytesRemaining
-= (int)numElementsConverted
;
896 if (inputLength
< CharsPerDWord
)
898 goto ProcessInputOfLessThanDWordSize
;
901 char* pFinalPosWhereCanReadDWordFromInputBuffer
= pInputBuffer
+ (uint)inputLength
- CharsPerDWord
;
903 // Begin the main loop.
906 char* pLastBufferPosProcessed
= null; // used for invariant checking in debug builds
911 while (pInputBuffer
<= pFinalPosWhereCanReadDWordFromInputBuffer
)
913 // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
915 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
920 Debug
.Assert(pLastBufferPosProcessed
< pInputBuffer
, "Algorithm should've made forward progress since last read.");
921 pLastBufferPosProcessed
= pInputBuffer
;
924 // First, check for the common case of all-ASCII chars.
926 if (Utf16Utility
.AllCharsInUInt32AreAscii(thisDWord
))
928 // We read an all-ASCII sequence (2 chars).
930 if (outputBytesRemaining
< 2)
932 goto ProcessOneCharFromCurrentDWordAndFinish
; // running out of space, but may be able to write some data
935 // The high WORD of the local declared below might be populated with garbage
936 // as a result of our shifts below, but that's ok since we're only going to
937 // write the low WORD.
939 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
940 // (Same logic works regardless of endianness.)
941 uint valueToWrite
= thisDWord
| (thisDWord
>> 8);
943 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
, (ushort)valueToWrite
);
947 outputBytesRemaining
-= 2;
949 // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
950 // Below is basically unrolled loops with poor man's vectorization.
952 uint inputCharsRemaining
= (uint)(pFinalPosWhereCanReadDWordFromInputBuffer
- pInputBuffer
) + 2;
953 uint minElementsRemaining
= (uint)Math
.Min(inputCharsRemaining
, outputBytesRemaining
);
955 if (Bmi2
.X64
.IsSupported
)
957 Debug
.Assert(BitConverter
.IsLittleEndian
, "BMI2 requires little-endian.");
958 const ulong PEXT_MASK
= 0x00FF00FF_
00FF
00FFul
;
960 // Try reading and writing 8 elements per iteration.
961 uint maxIters
= minElementsRemaining
/ 8;
962 ulong firstQWord
, secondQWord
;
964 for (i
= 0; (uint)i
< maxIters
; i
++)
966 firstQWord
= Unsafe
.ReadUnaligned
<ulong>(pInputBuffer
);
967 secondQWord
= Unsafe
.ReadUnaligned
<ulong>(pInputBuffer
+ 4);
969 if (!Utf16Utility
.AllCharsInUInt64AreAscii(firstQWord
| secondQWord
))
971 goto LoopTerminatedDueToNonAsciiData
;
974 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, (uint)Bmi2
.X64
.ParallelBitExtract(firstQWord
, PEXT_MASK
));
975 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
+ 4, (uint)Bmi2
.X64
.ParallelBitExtract(secondQWord
, PEXT_MASK
));
981 outputBytesRemaining
-= 8 * i
;
983 // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
985 if ((minElementsRemaining
& 4) != 0)
987 secondQWord
= Unsafe
.ReadUnaligned
<ulong>(pInputBuffer
);
989 if (!Utf16Utility
.AllCharsInUInt64AreAscii(secondQWord
))
991 goto LoopTerminatedDueToNonAsciiDataInSecondQWord
;
994 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, (uint)Bmi2
.X64
.ParallelBitExtract(secondQWord
, PEXT_MASK
));
998 outputBytesRemaining
-= 4;
1001 continue; // Go back to beginning of main loop, read data, check for ASCII
1003 LoopTerminatedDueToNonAsciiData:
1005 outputBytesRemaining
-= 8 * i
;
1007 // First, see if we can drain any ASCII data from the first QWORD.
1009 if (Utf16Utility
.AllCharsInUInt64AreAscii(firstQWord
))
1011 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, (uint)Bmi2
.X64
.ParallelBitExtract(firstQWord
, PEXT_MASK
));
1014 outputBytesRemaining
-= 4;
1018 secondQWord
= firstQWord
;
1021 LoopTerminatedDueToNonAsciiDataInSecondQWord:
1023 Debug
.Assert(!Utf16Utility
.AllCharsInUInt64AreAscii(secondQWord
)); // this condition should've been checked earlier
1025 thisDWord
= (uint)secondQWord
;
1026 if (Utf16Utility
.AllCharsInUInt32AreAscii(thisDWord
))
1028 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1029 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
, (ushort)(thisDWord
| (thisDWord
>> 8)));
1032 outputBytesRemaining
-= 2;
1033 thisDWord
= (uint)(secondQWord
>> 32);
1036 goto AfterReadDWordSkipAllCharsAsciiCheck
;
1040 // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
1041 uint maxIters
= minElementsRemaining
/ 4;
1044 for (i
= 0; (uint)i
< maxIters
; i
++)
1046 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1047 secondDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
+ 2);
1049 if (!Utf16Utility
.AllCharsInUInt32AreAscii(thisDWord
| secondDWord
))
1051 goto LoopTerminatedDueToNonAsciiData
;
1054 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1055 // (Same logic works regardless of endianness.)
1056 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
, (ushort)(thisDWord
| (thisDWord
>> 8)));
1057 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
+ 2, (ushort)(secondDWord
| (secondDWord
>> 8)));
1063 outputBytesRemaining
-= 4 * i
;
1065 continue; // Go back to beginning of main loop, read data, check for ASCII
1067 LoopTerminatedDueToNonAsciiData:
1069 outputBytesRemaining
-= 4 * i
;
1071 // First, see if we can drain any ASCII data from the first DWORD.
1073 if (Utf16Utility
.AllCharsInUInt32AreAscii(thisDWord
))
1075 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1076 // (Same logic works regardless of endianness.)
1077 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
, (ushort)(thisDWord
| (thisDWord
>> 8)));
1080 outputBytesRemaining
-= 2;
1081 thisDWord
= secondDWord
;
1084 goto AfterReadDWordSkipAllCharsAsciiCheck
;
1088 AfterReadDWordSkipAllCharsAsciiCheck:
1090 Debug
.Assert(!Utf16Utility
.AllCharsInUInt32AreAscii(thisDWord
)); // this should have been handled earlier
1092 // Next, try stripping off the first ASCII char if it exists.
1093 // We don't check for a second ASCII char since that should have been handled above.
1095 if (IsFirstCharAscii(thisDWord
))
1097 if (outputBytesRemaining
== 0)
1099 goto OutputBufferTooSmall
;
1102 if (BitConverter
.IsLittleEndian
)
1104 pOutputBuffer
[0] = (byte)thisDWord
; // extract [ ## ## 00 AA ]
1108 pOutputBuffer
[0] = (byte)(thisDWord
>> 24); // extract [ AA 00 ## ## ]
1113 outputBytesRemaining
--;
1115 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1117 goto ProcessNextCharAndFinish
; // input buffer doesn't contain enough data to read a DWORD
1121 // The input buffer at the current offset contains a non-ASCII char.
1122 // Read an entire DWORD and fall through to non-ASCII consumption logic.
1123 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1127 // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
1129 if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord
))
1131 TryConsumeMultipleTwoByteSequences:
1133 // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
1134 // a tight loop without falling back to the main loop.
1136 if (IsSecondCharTwoUtf8Bytes(thisDWord
))
1138 // We have two runs of two bytes each.
1140 if (outputBytesRemaining
< 4)
1142 goto ProcessOneCharFromCurrentDWordAndFinish
; // running out of output buffer
1145 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord
));
1149 outputBytesRemaining
-= 4;
1151 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1153 goto ProcessNextCharAndFinish
; // Running out of data - go down slow path
1157 // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
1158 // also two bytes. Check for that first before going back to the beginning of the loop.
1160 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1162 if (IsFirstCharTwoUtf8Bytes(thisDWord
))
1164 // Validated we have a two-byte sequence coming up
1165 goto TryConsumeMultipleTwoByteSequences
;
1168 // If we reached this point, the next sequence is something other than a valid
1169 // two-byte sequence, so go back to the beginning of the loop.
1170 goto AfterReadDWord
;
1174 if (outputBytesRemaining
< 2)
1176 goto OutputBufferTooSmall
;
1179 Unsafe
.WriteUnaligned
<ushort>(pOutputBuffer
, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord
));
1181 // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
1182 // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so check whether the remaining char is ASCII.
1185 if (IsSecondCharAscii(thisDWord
))
1187 if (outputBytesRemaining
>= 3)
1189 if (BitConverter
.IsLittleEndian
)
1193 pOutputBuffer
[2] = (byte)thisDWord
;
1197 outputBytesRemaining
-= 3;
1199 continue; // go back to original bounds check and check for ASCII
1205 goto OutputBufferTooSmall
;
1212 outputBytesRemaining
-= 2;
1214 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1216 goto ProcessNextCharAndFinish
; // Running out of data - go down slow path
1220 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1221 goto BeforeProcessThreeByteSequence
; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
1226 // Check the 3-byte case.
1228 BeforeProcessThreeByteSequence:
1230 if (!IsFirstCharSurrogate(thisDWord
))
1232 // Optimization: A three-byte character could indicate CJK text, which makes it likely
1233 // that the character following this one is also CJK. We'll perform the check now
1234 // rather than jumping to the beginning of the main loop.
1236 if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord
))
1238 if (!IsSecondCharSurrogate(thisDWord
))
1240 if (outputBytesRemaining
< 6)
1242 goto ConsumeSingleThreeByteRun
; // not enough space - try consuming as much as we can
1245 WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer
, thisDWord
);
1249 outputBytesRemaining
-= 6;
1251 // Try to remain in the 3-byte processing loop if at all possible.
1253 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1255 goto ProcessNextCharAndFinish
; // Running out of data - go down slow path
1259 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1261 if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord
))
1263 goto BeforeProcessThreeByteSequence
;
1267 // Fall back to standard processing loop since we don't know how to optimize this.
1268 goto AfterReadDWord
;
1274 ConsumeSingleThreeByteRun:
1276 if (outputBytesRemaining
< 3)
1278 goto OutputBufferTooSmall
;
1281 WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer
, thisDWord
);
1285 outputBytesRemaining
-= 3;
1287 // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
1288 // into the text. If this happens, strip it off now before seeing if the next character
1289 // consists of three code units.
1291 if (IsSecondCharAscii(thisDWord
))
1293 if (outputBytesRemaining
== 0)
1295 goto OutputBufferTooSmall
;
1298 if (BitConverter
.IsLittleEndian
)
1300 *pOutputBuffer
= (byte)(thisDWord
>> 16);
1304 *pOutputBuffer
= (byte)(thisDWord
);
1309 outputBytesRemaining
--;
1311 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1313 goto ProcessNextCharAndFinish
; // Running out of data - go down slow path
1317 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1319 if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord
))
1321 goto BeforeProcessThreeByteSequence
;
1325 // Fall back to standard processing loop since we don't know how to optimize this.
1326 goto AfterReadDWord
;
1331 if (pInputBuffer
> pFinalPosWhereCanReadDWordFromInputBuffer
)
1333 goto ProcessNextCharAndFinish
; // Running out of data - go down slow path
1337 thisDWord
= Unsafe
.ReadUnaligned
<uint>(pInputBuffer
);
1338 goto AfterReadDWordSkipAllCharsAsciiCheck
; // we just checked above that this value isn't ASCII
1342 // Four byte sequence processing
1344 if (IsWellFormedUtf16SurrogatePair(thisDWord
))
1346 if (outputBytesRemaining
< 4)
1348 goto OutputBufferTooSmall
;
1351 Unsafe
.WriteUnaligned
<uint>(pOutputBuffer
, ExtractFourUtf8BytesFromSurrogatePair(thisDWord
));
1355 outputBytesRemaining
-= 4;
1357 continue; // go back to beginning of loop for processing
1360 goto Error
; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
1363 ProcessNextCharAndFinish:
1364 inputLength
= (int)(pFinalPosWhereCanReadDWordFromInputBuffer
- pInputBuffer
) + CharsPerDWord
;
1366 ProcessInputOfLessThanDWordSize:
1367 Debug
.Assert(inputLength
< CharsPerDWord
);
1369 if (inputLength
== 0)
1371 goto InputBufferFullyConsumed
;
1374 uint thisChar
= *pInputBuffer
;
1375 goto ProcessFinalChar
;
1377 ProcessOneCharFromCurrentDWordAndFinish:
1378 if (BitConverter
.IsLittleEndian
)
1380 thisChar
= thisDWord
& 0xFFFFu
; // preserve only the first char
1384 thisChar
= thisDWord
>> 16; // preserve only the first char
1389 if (thisChar
<= 0x7Fu
)
1391 if (outputBytesRemaining
== 0)
1393 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
1396 // 1-byte (ASCII) case
1397 *pOutputBuffer
= (byte)thisChar
;
1402 else if (thisChar
< 0x0800u
)
1404 if (outputBytesRemaining
< 2)
1406 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
1410 pOutputBuffer
[1] = (byte)((thisChar
& 0x3Fu
) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
1411 pOutputBuffer
[0] = (byte)((thisChar
>> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
1416 else if (!UnicodeUtility
.IsSurrogateCodePoint(thisChar
))
1418 if (outputBytesRemaining
< 3)
1420 goto OutputBufferTooSmall
; // we have no hope of writing anything to the output
1424 pOutputBuffer
[2] = (byte)((thisChar
& 0x3Fu
) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
1425 pOutputBuffer
[1] = (byte)(((thisChar
>> 6) & 0x3Fu
) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
1426 pOutputBuffer
[0] = (byte)((thisChar
>> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
1431 else if (thisChar
<= 0xDBFFu
)
1433 // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
1434 goto InputBufferTooSmall
;
1438 // UTF-16 low surrogate code point with no leading data, report error
1443 // There are two ways we can end up here. Either we were running low on input data,
1444 // or we were running low on space in the destination buffer. If we're running low on
1445 // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
1446 // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
1447 // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
1448 // then we didn't modify inputLength since entering the main loop, which means it should
1449 // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
1450 // which of the two scenarios we're in.
1452 if (inputLength
> 1)
1454 goto OutputBufferTooSmall
;
1457 InputBufferFullyConsumed:
1458 OperationStatus retVal
= OperationStatus
.Done
;
1461 InputBufferTooSmall:
1462 retVal
= OperationStatus
.NeedMoreData
;
1465 OutputBufferTooSmall:
1466 retVal
= OperationStatus
.DestinationTooSmall
;
1470 retVal
= OperationStatus
.InvalidData
;
1474 pInputBufferRemaining
= pInputBuffer
;
1475 pOutputBufferRemaining
= pOutputBuffer
;