More Corelib cleanup (dotnet/coreclr#26872)
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / Unicode / Utf8Utility.Transcoding.cs
blob09a843c8123fd06f2eda41a2a4ffb1eaf4a5e4f4
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System.Buffers;
6 using System.Buffers.Binary;
7 using System.Diagnostics;
8 using System.Numerics;
9 using System.Runtime.Intrinsics.X86;
10 using Internal.Runtime.CompilerServices;
12 #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
13 #if BIT64
14 using nint = System.Int64;
15 using nuint = System.UInt64;
16 #else // BIT64
17 using nint = System.Int32;
18 using nuint = System.UInt32;
19 #endif // BIT64
21 namespace System.Text.Unicode
23 internal static unsafe partial class Utf8Utility
25 #if DEBUG
// Debug-only static constructor. It asserts that the nint / nuint aliases declared by the using
// directives at the top of this file are pointer-sized (sizeof == IntPtr.Size) and have the expected
// signedness (nint.MinValue is negative, nuint.MinValue is zero), then calls
// _ValidateAdditionalNIntDefinitions, which is defined outside this part of the file.
26 static Utf8Utility()
28 Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
29 Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
31 _ValidateAdditionalNIntDefinitions();
33 #endif // DEBUG
// Transcodes UTF-8 bytes from pInputBuffer into UTF-16 chars at pOutputBuffer, validating the input as it goes.
//
// The method runs in three stages: a bulk ASCII widening pass, a main loop that reads one DWORD (4 bytes)
// at a time, and a byte-at-a-time drain loop for whatever the main loop could not handle.
//
// Return value:
//   OperationStatus.Done                - all input bytes were consumed.
//   OperationStatus.NeedMoreData        - the input ended inside a multi-byte sequence that was valid so far.
//   OperationStatus.DestinationTooSmall - the output buffer cannot hold the next transcoded element.
//   OperationStatus.InvalidData         - an ill-formed sequence was found (bad marker or continuation byte,
//                                         overlong form, surrogate code point, or out-of-range value).
//
35 // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
36 // the next byte would have been consumed from / the next char would have been written to.
37 // inputLength in bytes, outputCharsRemaining in chars.
38 public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
40 Debug.Assert(inputLength >= 0, "Input length must not be negative.");
41 Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
43 Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
44 Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
46 // First, try vectorized conversion.
// The element count passed is the smaller of the two buffer lengths, so the call is never asked to
// process more elements than either buffer holds.
49 nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
51 pInputBuffer += numElementsConverted;
52 pOutputBuffer += numElementsConverted;
54 // Quick check - did we just end up consuming the entire input buffer?
55 // If so, short-circuit the remainder of the method.
57 if ((int)numElementsConverted == inputLength)
59 pInputBufferRemaining = pInputBuffer;
60 pOutputBufferRemaining = pOutputBuffer;
61 return OperationStatus.Done;
// Account for the ASCII prefix that was already transcoded above.
64 inputLength -= (int)numElementsConverted;
65 outputCharsRemaining -= (int)numElementsConverted;
68 if (inputLength < sizeof(uint))
70 goto ProcessInputOfLessThanDWordSize;
// Highest address from which a 4-byte read still stays inside the input buffer. From here on the main
// loop tracks remaining input through this pointer; inputLength is not updated again until
// ProcessRemainingBytesSlow recomputes it.
73 byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;
75 // Begin the main loop.
77 #if DEBUG
78 byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
79 #endif
81 while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
83 // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
85 uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
87 AfterReadDWord:
89 #if DEBUG
90 Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
91 pLastBufferPosProcessed = pInputBuffer;
92 #endif
93 // First, check for the common case of all-ASCII bytes.
95 if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
97 // We read an all-ASCII sequence.
99 if (outputCharsRemaining < sizeof(uint))
101 goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
104 Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
105 pInputBuffer += 4;
106 pOutputBuffer += 4;
107 outputCharsRemaining -= 4;
109 // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
110 // Below is basically unrolled loops with poor man's vectorization.
// maxIters is bounded by both the unread input bytes and the free output chars, so the 8-element
// iterations below need no per-iteration bounds checks.
112 uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
113 uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
114 uint secondDWord;
115 int i;
116 for (i = 0; (uint)i < maxIters; i++)
118 // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
120 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
121 secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));
123 if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
125 goto LoopTerminatedEarlyDueToNonAsciiData;
128 pInputBuffer += 8;
130 Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
131 Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
133 pOutputBuffer += 8;
// The loop advanced the pointers by 8 per completed iteration but left the count untouched;
// apply the matching adjustment for all i iterations in one step.
136 outputCharsRemaining -= 8 * i;
138 continue; // need to perform a bounds check because we might be running out of data
140 LoopTerminatedEarlyDueToNonAsciiData:
142 if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
144 // The first DWORD contained all-ASCII bytes, so expand it.
146 Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
148 // continue the outer loop from the second DWORD
150 Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
151 thisDWord = secondDWord;
153 pInputBuffer += 4;
154 pOutputBuffer += 4;
155 outputCharsRemaining -= 4;
// Same deferred count adjustment as on the normal loop exit: i iterations completed before the early exit.
158 outputCharsRemaining -= 8 * i;
160 // We know that there's *at least* one DWORD of data remaining in the buffer.
161 // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
163 goto AfterReadDWordSkipAllBytesAsciiCheck;
166 AfterReadDWordSkipAllBytesAsciiCheck:
168 Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
170 // Next, try stripping off ASCII bytes one at a time.
171 // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
173 if (UInt32FirstByteIsAscii(thisDWord))
175 if (outputCharsRemaining >= 3)
177 // Fast-track: we don't need to check the destination length for subsequent
178 // ASCII bytes since we know we can write them all now.
180 uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
182 nuint adjustment = 1;
183 pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
185 if (UInt32SecondByteIsAscii(thisDWord))
187 adjustment++;
188 thisDWordLittleEndian >>= 8;
189 pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
191 if (UInt32ThirdByteIsAscii(thisDWord))
193 adjustment++;
194 thisDWordLittleEndian >>= 8;
195 pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
199 pInputBuffer += adjustment;
200 pOutputBuffer += adjustment;
201 outputCharsRemaining -= (int)adjustment;
203 else
205 // Slow-track: we need to make sure each individual write has enough
206 // of a buffer so that we don't overrun the destination.
208 if (outputCharsRemaining == 0)
210 goto OutputBufferTooSmall;
213 uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
215 pInputBuffer++;
216 *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
217 outputCharsRemaining--;
219 if (UInt32SecondByteIsAscii(thisDWord))
221 if (outputCharsRemaining == 0)
223 goto OutputBufferTooSmall;
226 pInputBuffer++;
227 thisDWordLittleEndian >>= 8;
228 *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
230 // We can perform a small optimization here. We know at this point that
231 // the output buffer is fully consumed (we read two ASCII bytes and wrote
232 // two ASCII chars, and we checked earlier that the destination buffer
233 // can't store a third byte). If the next byte is ASCII, we can jump straight
234 // to the return statement since the end-of-method logic only relies on the
235 // destination buffer pointer -- NOT the output chars remaining count -- being
236 // correct. If the next byte is not ASCII, we'll need to continue with the
237 // rest of the main loop, but we can set the buffer length directly to zero
238 // rather than decrementing it from 1 to 0.
240 Debug.Assert(outputCharsRemaining == 1);
242 if (UInt32ThirdByteIsAscii(thisDWord))
244 goto OutputBufferTooSmall;
246 else
248 outputCharsRemaining = 0;
253 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
255 goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
257 else
259 // The input buffer at the current offset contains a non-ASCII byte.
260 // Read an entire DWORD and fall through to multi-byte consumption logic.
261 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
265 BeforeProcessTwoByteSequence:
267 // At this point, we know we're working with a multi-byte code unit,
268 // but we haven't yet validated it.
270 // The masks and comparands are derived from the Unicode Standard, Table 3-6.
271 // Additionally, we need to check for valid byte sequences per Table 3-7.
273 // Check the 2-byte case.
275 if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
277 // Per Table 3-7, valid sequences are:
278 // [ C2..DF ] [ 80..BF ]
280 if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
282 goto Error;
285 ProcessTwoByteSequenceSkipOverlongFormCheck:
287 // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
288 // there's a good chance that if we see one two-byte run then there's another two-byte
289 // run immediately after. Let's check that now.
291 // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
292 // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
293 // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
295 if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
296 || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
298 // We have two runs of two bytes each.
300 if (outputCharsRemaining < 2)
302 goto ProcessRemainingBytesSlow; // running out of output buffer
305 Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
307 pInputBuffer += 4;
308 pOutputBuffer += 2;
309 outputCharsRemaining -= 2;
311 if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
313 // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
314 // also two bytes. Check for that first before going back to the beginning of the loop.
316 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
318 if (BitConverter.IsLittleEndian)
320 if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
322 // The next sequence is a valid two-byte sequence.
323 goto ProcessTwoByteSequenceSkipOverlongFormCheck;
326 else
328 if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
330 if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
332 goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
335 goto ProcessTwoByteSequenceSkipOverlongFormCheck;
339 // If we reached this point, the next sequence is something other than a valid
340 // two-byte sequence, so go back to the beginning of the loop.
341 goto AfterReadDWord;
343 else
345 goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
349 // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
350 // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
351 // bytes are ASCII?
353 uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
355 if (UInt32ThirdByteIsAscii(thisDWord))
357 if (UInt32FourthByteIsAscii(thisDWord))
359 if (outputCharsRemaining < 3)
361 goto ProcessRemainingBytesSlow; // running out of output buffer
364 pOutputBuffer[0] = (char)charToWrite;
365 if (BitConverter.IsLittleEndian)
367 thisDWord >>= 16;
368 pOutputBuffer[1] = (char)(byte)thisDWord;
369 thisDWord >>= 8;
370 pOutputBuffer[2] = (char)thisDWord;
372 else
374 pOutputBuffer[2] = (char)(byte)thisDWord;
375 pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
377 pInputBuffer += 4;
378 pOutputBuffer += 3;
379 outputCharsRemaining -= 3;
381 continue; // go back to original bounds check and check for ASCII
383 else
385 if (outputCharsRemaining < 2)
387 goto ProcessRemainingBytesSlow; // running out of output buffer
390 pOutputBuffer[0] = (char)charToWrite;
391 pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
392 pInputBuffer += 3;
393 pOutputBuffer += 2;
394 outputCharsRemaining -= 2;
396 // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
397 // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
399 if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
401 goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
403 else
405 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
406 goto BeforeProcessTwoByteSequence;
410 else
412 if (outputCharsRemaining == 0)
414 goto ProcessRemainingBytesSlow; // running out of output buffer
417 pOutputBuffer[0] = (char)charToWrite;
418 pInputBuffer += 2;
419 pOutputBuffer++;
420 outputCharsRemaining--;
422 if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
424 goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
426 else
428 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
429 goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
434 // Check the 3-byte case.
436 BeforeProcessThreeByteSequence:
438 if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
440 ProcessThreeByteSequenceWithCheck:
442 // We need to check for overlong or surrogate three-byte sequences.
444 // Per Table 3-7, valid sequences are:
445 // [ E0 ] [ A0..BF ] [ 80..BF ]
446 // [ E1..EC ] [ 80..BF ] [ 80..BF ]
447 // [ ED ] [ 80..9F ] [ 80..BF ]
448 // [ EE..EF ] [ 80..BF ] [ 80..BF ]
450 // Big-endian examples of using the above validation table:
451 // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
452 // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
453 // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
454 // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
455 // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
457 if (BitConverter.IsLittleEndian)
459 // The "overlong or surrogate" check can be implemented using a single jump, but there's
460 // some overhead to moving the bits into the correct locations in order to perform the
461 // correct comparison, and in practice the processor's branch prediction capability is
462 // good enough that we shouldn't bother. So we'll use two jumps instead.
464 // Can't extract this check into its own helper method because JITter produces suboptimal
465 // assembly, even with aggressive inlining.
467 // Code below becomes 5 instructions: test, jz, lea, test, jz
469 if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
471 goto Error; // overlong or surrogate
474 else
476 if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
478 goto Error; // overlong or surrogate
482 // At this point, we know the incoming scalar is well-formed.
484 if (outputCharsRemaining == 0)
486 goto OutputBufferTooSmall; // not enough space in the destination buffer to write
489 // As an optimization, on compatible platforms check if a second three-byte sequence immediately
490 // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
492 if (Bmi2.X64.IsSupported)
494 Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
496 // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
497 // would indicate the potential start of a second three-byte sequence.
499 if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
501 // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
502 // the final place where we can safely perform a DWORD read, and we want to probe whether it's
503 // safe to read a DWORD beginning at address &pInputBuffer[3].
505 if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
507 // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
508 // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
509 // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
510 // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
511 // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
512 // fail, and redirect control flow to the error handling logic at the very end of this method.
514 uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
516 if (UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
517 && ((secondDWord & 0x0000_200Fu) != 0)
518 && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
520 // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
521 ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
522 thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
524 // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
525 ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
527 Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
528 pInputBuffer += 6;
529 pOutputBuffer += 2;
530 outputCharsRemaining -= 2;
532 // Drain any ASCII data following the second three-byte sequence.
534 goto CheckForAsciiByteAfterThreeByteSequence;
540 // Couldn't extract 2x three-byte sequences together, just do this one by itself.
542 *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
543 pInputBuffer += 3;
544 pOutputBuffer++;
545 outputCharsRemaining--;
547 CheckForAsciiByteAfterThreeByteSequence:
549 // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
550 // in to the text. If this happens strip it off now before seeing if the next character
551 // consists of three code units.
553 if (UInt32FourthByteIsAscii(thisDWord))
555 if (outputCharsRemaining == 0)
557 goto OutputBufferTooSmall;
560 if (BitConverter.IsLittleEndian)
562 *pOutputBuffer = (char)(thisDWord >> 24);
564 else
566 *pOutputBuffer = (char)(byte)thisDWord;
569 pInputBuffer++;
570 pOutputBuffer++;
571 outputCharsRemaining--;
574 if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
576 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
578 // Optimization: A three-byte character could indicate CJK text, which makes it likely
579 // that the character following this one is also CJK. We'll check for a three-byte sequence
580 // marker now and jump directly to three-byte sequence processing if we see one, skipping
581 // all of the logic at the beginning of the loop.
583 if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
585 goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
587 else
589 goto AfterReadDWord; // probably ASCII punctuation or whitespace
592 else
594 goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
598 // Assume the 4-byte case, but we need to validate.
601 // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
603 // Per Table 3-7, valid sequences are:
604 // [ F0 ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
605 // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
606 // [ F4 ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
608 if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
610 goto Error;
613 // Now check for overlong / out-of-range sequences.
615 if (BitConverter.IsLittleEndian)
617 // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
618 // We want to get the 'w' byte in front of the 'z' byte so that we can perform
619 // a single range comparison. We'll take advantage of the fact that the JITter
620 // can detect a ROR / ROL operation, then we'll just zero out the bytes that
621 // aren't involved in the range check.
623 uint toCheck = thisDWord & 0x0000_FFFFu;
625 // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
627 toCheck = BitOperations.RotateRight(toCheck, 8);
629 // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
631 if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
633 goto Error;
636 else
638 if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
640 goto Error;
644 // Validation complete.
646 if (outputCharsRemaining < 2)
648 // There's no point to falling back to the "drain the input buffer" logic, since we know
649 // we can't write anything to the destination. So we'll just exit immediately.
650 goto OutputBufferTooSmall;
653 Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));
655 pInputBuffer += 4;
656 pOutputBuffer += 2;
657 outputCharsRemaining -= 2;
659 continue; // go back to beginning of loop for processing
663 ProcessRemainingBytesSlow:
// The main loop tracked its position through pointers only, so rebuild the count of unread input
// bytes from the distance to the last DWORD-readable position (+4 for the DWORD itself).
664 inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
666 ProcessInputOfLessThanDWordSize:
// Byte-at-a-time drain loop. Each pass consumes one complete scalar or exits through one of the
// status labels below.
667 while (inputLength > 0)
669 uint firstByte = pInputBuffer[0];
670 if (firstByte <= 0x7Fu)
672 if (outputCharsRemaining == 0)
674 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
677 // 1-byte (ASCII) case
678 *pOutputBuffer = (char)firstByte;
680 pInputBuffer++;
681 pOutputBuffer++;
682 inputLength--;
683 outputCharsRemaining--;
684 continue;
687 // Potentially the start of a multi-byte sequence?
// Rebase the lead byte so that 0xC2 (the smallest valid multi-byte lead) maps to 0; the range checks
// below then compare the low byte of the rebased value against (upperBound - 0xC2).
689 firstByte -= 0xC2u;
690 if ((byte)firstByte <= (0xDFu - 0xC2u))
692 // Potentially a 2-byte sequence?
693 if (inputLength < 2)
695 goto InputBufferTooSmall; // out of data
698 uint secondByte = pInputBuffer[1];
699 if (!IsLowByteUtf8ContinuationByte(secondByte))
701 goto Error; // 2-byte marker not followed by continuation byte
704 if (outputCharsRemaining == 0)
706 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
709 uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
710 *pOutputBuffer = (char)asChar;
712 pInputBuffer += 2;
713 pOutputBuffer++;
714 inputLength -= 2;
715 outputCharsRemaining--;
716 continue;
718 else if ((byte)firstByte <= (0xEFu - 0xC2u))
720 // Potentially a 3-byte sequence?
721 if (inputLength >= 3)
723 uint secondByte = pInputBuffer[1];
724 uint thirdByte = pInputBuffer[2];
725 if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
727 goto Error; // 3-byte marker not followed by 2 continuation bytes
730 // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
731 // We account for this in the comparisons below.
733 uint partialChar = (firstByte << 12) + (secondByte << 6);
734 if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
736 goto Error; // this is an overlong encoding; fail
739 partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); // if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
740 if (partialChar < 0x0800u /* number of code points in UTF-16 surrogate code point range */)
742 goto Error; // attempted to encode a UTF-16 surrogate code point; fail
745 if (outputCharsRemaining == 0)
747 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
750 // Now restore the full scalar value.
752 partialChar += thirdByte;
753 partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
754 partialChar -= 0x80u; // remove third byte continuation marker
756 *pOutputBuffer = (char)partialChar;
758 pInputBuffer += 3;
759 pOutputBuffer++;
760 inputLength -= 3;
761 outputCharsRemaining--;
762 continue;
764 else if (inputLength >= 2)
766 uint secondByte = pInputBuffer[1];
767 if (!IsLowByteUtf8ContinuationByte(secondByte))
769 goto Error; // 3-byte marker not followed by continuation byte
772 // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
773 // from just the first two bytes.
775 uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
776 if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
778 goto Error; // failed overlong check
780 if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
782 goto Error; // failed surrogate check
786 goto InputBufferTooSmall; // out of data
788 else if ((byte)firstByte <= (0xF4u - 0xC2u))
790 // Potentially a 4-byte sequence?
// This branch never writes output: it validates as many bytes as are available, interleaving the
// length checks so that invalid data is reported in preference to "need more data" wherever the
// bytes already present are enough to prove the sequence ill-formed.
792 if (inputLength < 2)
794 goto InputBufferTooSmall; // ran out of data
797 uint nextByte = pInputBuffer[1];
798 if (!IsLowByteUtf8ContinuationByte(nextByte))
800 goto Error; // 4-byte marker not followed by a continuation byte
803 uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
804 if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
806 goto Error; // failed overlong / out-of-range check
809 if (inputLength < 3)
811 goto InputBufferTooSmall; // ran out of data
814 if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
816 goto Error; // third byte in 4-byte sequence not a continuation byte
819 if (inputLength < 4)
821 goto InputBufferTooSmall; // ran out of data
824 if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
826 goto Error; // fourth byte in 4-byte sequence not a continuation byte
829 // If we read a valid astral scalar value, the only way we could've fallen down this code path
830 // is that we didn't have enough output buffer to write the result.
832 goto OutputBufferTooSmall;
834 else
836 goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
// Reached only when the drain loop ran out of input (inputLength is no longer positive): success.
840 OperationStatus retVal = OperationStatus.Done;
841 goto ReturnCommon;
843 InputBufferTooSmall:
844 retVal = OperationStatus.NeedMoreData;
845 goto ReturnCommon;
847 OutputBufferTooSmall:
848 retVal = OperationStatus.DestinationTooSmall;
849 goto ReturnCommon;
851 Error:
852 retVal = OperationStatus.InvalidData;
853 goto ReturnCommon;
// Single exit point: publish the final pointer positions through the out parameters, whatever the status.
855 ReturnCommon:
856 pInputBufferRemaining = pInputBuffer;
857 pOutputBufferRemaining = pOutputBuffer;
858 return retVal;
861 // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
862 // the next char would have been consumed from / the next byte would have been written to.
863 // inputLength in chars, outputBytesRemaining in bytes.
864 public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
866 const int CharsPerDWord = sizeof(uint) / sizeof(char);
868 Debug.Assert(inputLength >= 0, "Input length must not be negative.");
869 Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
871 Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
872 Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
874 // First, try vectorized conversion.
877 nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
879 pInputBuffer += numElementsConverted;
880 pOutputBuffer += numElementsConverted;
882 // Quick check - did we just end up consuming the entire input buffer?
883 // If so, short-circuit the remainder of the method.
885 if ((int)numElementsConverted == inputLength)
887 pInputBufferRemaining = pInputBuffer;
888 pOutputBufferRemaining = pOutputBuffer;
889 return OperationStatus.Done;
892 inputLength -= (int)numElementsConverted;
893 outputBytesRemaining -= (int)numElementsConverted;
896 if (inputLength < CharsPerDWord)
898 goto ProcessInputOfLessThanDWordSize;
901 char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
903 // Begin the main loop.
905 #if DEBUG
906 char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
907 #endif
909 uint thisDWord;
911 while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
913 // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
915 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
917 AfterReadDWord:
919 #if DEBUG
920 Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
921 pLastBufferPosProcessed = pInputBuffer;
922 #endif
924 // First, check for the common case of all-ASCII chars.
926 if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
928 // We read an all-ASCII sequence (2 chars).
930 if (outputBytesRemaining < 2)
932 goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
935 // The high WORD of the local declared below might be populated with garbage
936 // as a result of our shifts below, but that's ok since we're only going to
937 // write the low WORD.
939 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
940 // (Same logic works regardless of endianness.)
941 uint valueToWrite = thisDWord | (thisDWord >> 8);
943 Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
945 pInputBuffer += 2;
946 pOutputBuffer += 2;
947 outputBytesRemaining -= 2;
949 // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
950 // Below is basically unrolled loops with poor man's vectorization.
952 uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
953 uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
955 if (Bmi2.X64.IsSupported)
957 Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
958 const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
960 // Try reading and writing 8 elements per iteration.
961 uint maxIters = minElementsRemaining / 8;
962 ulong firstQWord, secondQWord;
963 int i;
964 for (i = 0; (uint)i < maxIters; i++)
966 firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
967 secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
969 if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
971 goto LoopTerminatedDueToNonAsciiData;
974 Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
975 Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
977 pInputBuffer += 8;
978 pOutputBuffer += 8;
981 outputBytesRemaining -= 8 * i;
983 // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
985 if ((minElementsRemaining & 4) != 0)
987 secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
989 if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
991 goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
994 Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
996 pInputBuffer += 4;
997 pOutputBuffer += 4;
998 outputBytesRemaining -= 4;
1001 continue; // Go back to beginning of main loop, read data, check for ASCII
1003 LoopTerminatedDueToNonAsciiData:
1005 outputBytesRemaining -= 8 * i;
1007 // First, see if we can drain any ASCII data from the first QWORD.
1009 if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
1011 Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
1012 pInputBuffer += 4;
1013 pOutputBuffer += 4;
1014 outputBytesRemaining -= 4;
1016 else
1018 secondQWord = firstQWord;
1021 LoopTerminatedDueToNonAsciiDataInSecondQWord:
1023 Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
1025 thisDWord = (uint)secondQWord;
1026 if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
1028 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1029 Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
1030 pInputBuffer += 2;
1031 pOutputBuffer += 2;
1032 outputBytesRemaining -= 2;
1033 thisDWord = (uint)(secondQWord >> 32);
1036 goto AfterReadDWordSkipAllCharsAsciiCheck;
1038 else
1040 // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
1041 uint maxIters = minElementsRemaining / 4;
1042 uint secondDWord;
1043 int i;
1044 for (i = 0; (uint)i < maxIters; i++)
1046 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1047 secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
1049 if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
1051 goto LoopTerminatedDueToNonAsciiData;
1054 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1055 // (Same logic works regardless of endianness.)
1056 Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
1057 Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
1059 pInputBuffer += 4;
1060 pOutputBuffer += 4;
1063 outputBytesRemaining -= 4 * i;
1065 continue; // Go back to beginning of main loop, read data, check for ASCII
1067 LoopTerminatedDueToNonAsciiData:
1069 outputBytesRemaining -= 4 * i;
1071 // First, see if we can drain any ASCII data from the first DWORD.
1073 if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
1075 // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
1076 // (Same logic works regardless of endianness.)
1077 Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
1078 pInputBuffer += 2;
1079 pOutputBuffer += 2;
1080 outputBytesRemaining -= 2;
1081 thisDWord = secondDWord;
1084 goto AfterReadDWordSkipAllCharsAsciiCheck;
1088 AfterReadDWordSkipAllCharsAsciiCheck:
1090 Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
1092 // Next, try stripping off the first ASCII char if it exists.
1093 // We don't check for a second ASCII char since that should have been handled above.
1095 if (IsFirstCharAscii(thisDWord))
1097 if (outputBytesRemaining == 0)
1099 goto OutputBufferTooSmall;
1102 if (BitConverter.IsLittleEndian)
1104 pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
1106 else
1108 pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
1111 pInputBuffer++;
1112 pOutputBuffer++;
1113 outputBytesRemaining--;
1115 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1117 goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
1119 else
1121 // The input buffer at the current offset contains a non-ASCII char.
1122 // Read an entire DWORD and fall through to non-ASCII consumption logic.
1123 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1127 // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
1129 if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
1131 TryConsumeMultipleTwoByteSequences:
1133 // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
1134 // a tight loop without falling back to the main loop.
1136 if (IsSecondCharTwoUtf8Bytes(thisDWord))
1138 // We have two runs of two bytes each.
1140 if (outputBytesRemaining < 4)
1142 goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
1145 Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
1147 pInputBuffer += 2;
1148 pOutputBuffer += 4;
1149 outputBytesRemaining -= 4;
1151 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1153 goto ProcessNextCharAndFinish; // Running out of data - go down slow path
1155 else
1157 // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
1158 // also two bytes. Check for that first before going back to the beginning of the loop.
1160 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1162 if (IsFirstCharTwoUtf8Bytes(thisDWord))
1164 // Validated we have a two-byte sequence coming up
1165 goto TryConsumeMultipleTwoByteSequences;
1168 // If we reached this point, the next sequence is something other than a valid
1169 // two-byte sequence, so go back to the beginning of the loop.
1170 goto AfterReadDWord;
1174 if (outputBytesRemaining < 2)
1176 goto OutputBufferTooSmall;
1179 Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
1181 // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
1182 // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
1183 // char is ASCII?
1185 if (IsSecondCharAscii(thisDWord))
1187 if (outputBytesRemaining >= 3)
1189 if (BitConverter.IsLittleEndian)
1191 thisDWord >>= 16;
1193 pOutputBuffer[2] = (byte)thisDWord;
1195 pInputBuffer += 2;
1196 pOutputBuffer += 3;
1197 outputBytesRemaining -= 3;
1199 continue; // go back to original bounds check and check for ASCII
1201 else
1203 pInputBuffer++;
1204 pOutputBuffer += 2;
1205 goto OutputBufferTooSmall;
1208 else
1210 pInputBuffer++;
1211 pOutputBuffer += 2;
1212 outputBytesRemaining -= 2;
1214 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1216 goto ProcessNextCharAndFinish; // Running out of data - go down slow path
1218 else
1220 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1221 goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
1226 // Check the 3-byte case.
1228 BeforeProcessThreeByteSequence:
1230 if (!IsFirstCharSurrogate(thisDWord))
1232 // Optimization: A three-byte character could indicate CJK text, which makes it likely
1233 // that the character following this one is also CJK. We'll perform the check now
1234 // rather than jumping to the beginning of the main loop.
1236 if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
1238 if (!IsSecondCharSurrogate(thisDWord))
1240 if (outputBytesRemaining < 6)
1242 goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
1245 WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
1247 pInputBuffer += 2;
1248 pOutputBuffer += 6;
1249 outputBytesRemaining -= 6;
1251 // Try to remain in the 3-byte processing loop if at all possible.
1253 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1255 goto ProcessNextCharAndFinish; // Running out of data - go down slow path
1257 else
1259 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1261 if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
1263 goto BeforeProcessThreeByteSequence;
1265 else
1267 // Fall back to standard processing loop since we don't know how to optimize this.
1268 goto AfterReadDWord;
1274 ConsumeSingleThreeByteRun:
1276 if (outputBytesRemaining < 3)
1278 goto OutputBufferTooSmall;
1281 WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
1283 pInputBuffer++;
1284 pOutputBuffer += 3;
1285 outputBytesRemaining -= 3;
1287 // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
1288 // into the text. If this happens, strip it off now before seeing if the next character
1289 // consists of three code units.
1291 if (IsSecondCharAscii(thisDWord))
1293 if (outputBytesRemaining == 0)
1295 goto OutputBufferTooSmall;
1298 if (BitConverter.IsLittleEndian)
1300 *pOutputBuffer = (byte)(thisDWord >> 16);
1302 else
1304 *pOutputBuffer = (byte)(thisDWord);
1307 pInputBuffer++;
1308 pOutputBuffer++;
1309 outputBytesRemaining--;
1311 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1313 goto ProcessNextCharAndFinish; // Running out of data - go down slow path
1315 else
1317 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1319 if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
1321 goto BeforeProcessThreeByteSequence;
1323 else
1325 // Fall back to standard processing loop since we don't know how to optimize this.
1326 goto AfterReadDWord;
1331 if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
1333 goto ProcessNextCharAndFinish; // Running out of data - go down slow path
1335 else
1337 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
1338 goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
1342 // Four byte sequence processing
1344 if (IsWellFormedUtf16SurrogatePair(thisDWord))
1346 if (outputBytesRemaining < 4)
1348 goto OutputBufferTooSmall;
1351 Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
1353 pInputBuffer += 2;
1354 pOutputBuffer += 4;
1355 outputBytesRemaining -= 4;
1357 continue; // go back to beginning of loop for processing
1360 goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
1363 ProcessNextCharAndFinish:
1364 inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
1366 ProcessInputOfLessThanDWordSize:
1367 Debug.Assert(inputLength < CharsPerDWord);
1369 if (inputLength == 0)
1371 goto InputBufferFullyConsumed;
1374 uint thisChar = *pInputBuffer;
1375 goto ProcessFinalChar;
1377 ProcessOneCharFromCurrentDWordAndFinish:
1378 if (BitConverter.IsLittleEndian)
1380 thisChar = thisDWord & 0xFFFFu; // preserve only the first char
1382 else
1384 thisChar = thisDWord >> 16; // preserve only the first char
1387 ProcessFinalChar:
1389 if (thisChar <= 0x7Fu)
1391 if (outputBytesRemaining == 0)
1393 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
1396 // 1-byte (ASCII) case
1397 *pOutputBuffer = (byte)thisChar;
1399 pInputBuffer++;
1400 pOutputBuffer++;
1402 else if (thisChar < 0x0800u)
1404 if (outputBytesRemaining < 2)
1406 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
1409 // 2-byte case
1410 pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
1411 pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
1413 pInputBuffer++;
1414 pOutputBuffer += 2;
1416 else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
1418 if (outputBytesRemaining < 3)
1420 goto OutputBufferTooSmall; // we have no hope of writing anything to the output
1423 // 3-byte case
1424 pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
1425 pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
1426 pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
1428 pInputBuffer++;
1429 pOutputBuffer += 3;
1431 else if (thisChar <= 0xDBFFu)
1433 // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
1434 goto InputBufferTooSmall;
1436 else
1438 // UTF-16 low surrogate code point with no leading data, report error
1439 goto Error;
1443 // There are two ways we can end up here. Either we were running low on input data,
1444 // or we were running low on space in the destination buffer. If we're running low on
1445 // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
1446 // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
1447 // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
1448 // then we didn't modify inputLength since entering the main loop, which means it should
1449 // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
1450 // which of the two scenarios we're in.
1452 if (inputLength > 1)
1454 goto OutputBufferTooSmall;
1457 InputBufferFullyConsumed:
1458 OperationStatus retVal = OperationStatus.Done;
1459 goto ReturnCommon;
1461 InputBufferTooSmall:
1462 retVal = OperationStatus.NeedMoreData;
1463 goto ReturnCommon;
1465 OutputBufferTooSmall:
1466 retVal = OperationStatus.DestinationTooSmall;
1467 goto ReturnCommon;
1469 Error:
1470 retVal = OperationStatus.InvalidData;
1471 goto ReturnCommon;
1473 ReturnCommon:
1474 pInputBufferRemaining = pInputBuffer;
1475 pOutputBufferRemaining = pOutputBuffer;
1476 return retVal;