Fix IDE0025 (use expression body for properties)
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / UnicodeEncoding.cs
blobeaa54acf5dc56d66b1e23ab3ee41ea3618000e01
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 //
6 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
7 //
9 // This define can be used to turn off the fast loops. Useful for finding whether
10 // the problem is fastloop-specific.
11 #define FASTLOOP
13 using System;
14 using System.Globalization;
15 using System.Diagnostics;
16 using System.Runtime.InteropServices;
18 using Internal.Runtime.CompilerServices;
20 namespace System.Text
22 public class UnicodeEncoding : Encoding
24 // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization
25 // The initialization code will not be run until a static member of the class is referenced
// Shared instances: both are built with byteOrderMark: true and differ only in the bigEndian flag.
26 internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
27 internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
// Assigned from the throwOnInvalidBytes constructor argument; when true, SetDefaultFallbacks
// installs the exception fallbacks instead of the replacement fallbacks.
29 private readonly bool isThrowException = false;
// Byte-order flag taken from the constructor. When set, the encoder writes the high byte of each
// char first, and the base class is constructed with 1201 rather than 1200.
31 private readonly bool bigEndian = false;
// Stored from the constructor's byteOrderMark argument.
// NOTE(review): no reader of this field is visible in this part of the file - confirm where it is consumed.
32 private readonly bool byteOrderMark = false;
34 // Unicode version 2.0 character size in bytes
35 public const int CharSize = 2;
// Parameterless constructor: forwards to the two-flag constructor with
// bigEndian = false and byteOrderMark = true.
public UnicodeEncoding()
    : this(bigEndian: false, byteOrderMark: true)
{
}
// Records the byte order and byte-order-mark flags for this instance.
// The base class receives 1201 when bigEndian is set and 1200 otherwise
// (presumably the UTF-16 code page identifiers - confirm against Encoding).
public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
    : base(bigEndian ? 1201 : 1200)
{
    this.byteOrderMark = byteOrderMark;
    this.bigEndian = bigEndian;
}
// Same as the two-flag constructor, plus a switch that selects exception
// fallbacks (throwOnInvalidBytes = true) over replacement fallbacks.
public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
    : this(bigEndian, byteOrderMark)
{
    isThrowException = throwOnInvalidBytes;

    // The fallbacks were already set up during base construction, but at that
    // point the throw flag had not been stored yet, so redo them when throwing.
    if (throwOnInvalidBytes)
    {
        SetDefaultFallbacks();
    }
}
// Installs the encoder and decoder fallbacks that match the throw flag:
// replacement fallbacks using the "\xFFFD" string when not throwing,
// exception fallbacks otherwise.
internal sealed override void SetDefaultFallbacks()
{
    if (!isThrowException)
    {
        // Lenient mode: invalid data is replaced instead of raising.
        encoderFallback = new EncoderReplacementFallback("\xFFFD");
        decoderFallback = new DecoderReplacementFallback("\xFFFD");
        return;
    }

    // Strict mode: invalid data raises through the exception fallbacks.
    encoderFallback = EncoderFallback.ExceptionFallback;
    decoderFallback = DecoderFallback.ExceptionFallback;
}
76 // The following methods are copied from EncodingNLS.cs.
77 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
78 // These should be kept in sync for the following classes:
79 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
82 // Returns the number of bytes required to encode a range of characters in
83 // a character array.
85 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
86 // So if you fix this, fix the others. Currently those include:
87 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
88 // parent method is safe
// Array overload: validates (chars, index, count) and forwards the selected
// range to the pointer-based overload with no encoder state.
// Throws ArgumentNullException for a null array and ArgumentOutOfRangeException
// for a negative index/count or a range that does not fit in the array.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    if (chars is null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // index is reported before count when both are negative.
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count > chars.Length - index)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Nothing to count; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    fixed (char* pFirst = chars)
    {
        char* pStart = pFirst + index;
        return GetByteCount(pStart, count, null);
    }
}
111 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
112 // So if you fix this, fix the others. Currently those include:
113 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
114 // parent method is safe
// String overload: pins the string and forwards all of its characters to the
// pointer-based overload with no encoder state.
// Throws ArgumentNullException when s is null.
public override unsafe int GetByteCount(string s)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    int length = s.Length;

    fixed (char* pText = s)
    {
        return GetByteCount(pText, length, null);
    }
}
126 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
127 // So if you fix this, fix the others. Currently those include:
128 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: rejects a null pointer or negative count, then forwards
// to the internal overload with no encoder state.
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int result = GetByteCount(chars, count, null);
    return result;
}
144 // Parent method is safe.
145 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
146 // So if you fix this, fix the others. Currently those include:
147 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// String overload: validates the source range and the destination offset, then
// forwards to the pointer-based overload. Everything from byteIndex to the end
// of the array is offered as output space.
public override unsafe int GetBytes(string s, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex)
{
    // s is reported before bytes when both are null.
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
    }

    if (bytes is null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    // charIndex is reported before charCount when both are negative.
    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount > s.Length - charIndex)
    {
        throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
    }

    if (byteIndex > bytes.Length || byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    int bytesAvailable = bytes.Length - byteIndex;

    // NOTE(review): the span reference is presumably used so that an empty
    // destination array still yields a usable pointer - confirm.
    Span<byte> destination = bytes;

    fixed (char* pSource = s)
    fixed (byte* pDestination = &MemoryMarshal.GetReference(destination))
    {
        return GetBytes(pSource + charIndex, charCount, pDestination + byteIndex, bytesAvailable, null);
    }
}
170 // Encodes a range of characters in a character array into a range of bytes
171 // in a byte array. An exception occurs if the byte array is not large
172 // enough to hold the complete encoding of the characters. The
173 // GetByteCount method can be used to determine the exact number of
174 // bytes that will be produced for a given range of characters.
175 // Alternatively, the GetMaxByteCount method can be used to
176 // determine the maximum number of bytes that will be produced for a given
177 // number of characters, regardless of the actual character values.
179 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
180 // So if you fix this, fix the others. Currently those include:
181 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
182 // parent method is safe
// Array overload: validates the source range and the destination offset, then
// forwards to the pointer-based overload. Returns 0 immediately when there are
// no characters to encode.
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex)
{
    // chars is reported before bytes when both are null.
    if (chars is null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (bytes is null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    // charIndex is reported before charCount when both are negative.
    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount > chars.Length - charIndex)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (byteIndex > bytes.Length || byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    // Nothing to encode; also sidesteps pinning an empty source array.
    if (charCount == 0)
    {
        return 0;
    }

    // The pointer overload takes the space remaining after byteIndex,
    // not the total length of the destination array.
    int bytesAvailable = bytes.Length - byteIndex;
    Span<byte> destination = bytes;

    fixed (char* pSource = chars)
    fixed (byte* pDestination = &MemoryMarshal.GetReference(destination))
    {
        return GetBytes(pSource + charIndex, charCount, pDestination + byteIndex, bytesAvailable, null);
    }
}
212 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
213 // So if you fix this, fix the others. Currently those include:
214 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: rejects null pointers and negative counts, then forwards
// to the internal overload with no encoder state.
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    // bytes is reported before chars when both are null.
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // charCount is reported before byteCount when both are negative.
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int written = GetBytes(chars, charCount, bytes, byteCount, null);
    return written;
}
229 // Returns the number of characters produced by decoding a range of bytes
230 // in a byte array.
232 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
233 // So if you fix this, fix the others. Currently those include:
234 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
235 // parent method is safe
// Array overload: validates (bytes, index, count) and forwards the selected
// range to the pointer-based overload with no decoder state.
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    if (bytes is null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    // index is reported before count when both are negative.
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count > bytes.Length - index)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Nothing to count; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    fixed (byte* pFirst = bytes)
    {
        byte* pStart = pFirst + index;
        return GetCharCount(pStart, count, null);
    }
}
258 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
259 // So if you fix this, fix the others. Currently those include:
260 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: rejects a null pointer or negative count, then forwards
// to the internal overload with no decoder state.
[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int result = GetCharCount(bytes, count, null);
    return result;
}
275 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
276 // So if you fix this, fix the others. Currently those include:
277 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
278 // parent method is safe
// Array overload: validates the source range and the destination offset, then
// forwards to the pointer-based overload. Returns 0 immediately when there are
// no bytes to decode.
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                    char[] chars, int charIndex)
{
    // bytes is reported before chars when both are null.
    if (bytes is null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars is null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // byteIndex is reported before byteCount when both are negative.
    if (byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount > bytes.Length - byteIndex)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (charIndex > chars.Length || charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
    }

    // Nothing to decode; also sidesteps pinning an empty source array.
    if (byteCount == 0)
    {
        return 0;
    }

    // The pointer overload takes the space remaining after charIndex,
    // not the total length of the destination array.
    int charsAvailable = chars.Length - charIndex;
    Span<char> destination = chars;

    fixed (byte* pSource = bytes)
    fixed (char* pDestination = &MemoryMarshal.GetReference(destination))
    {
        return GetChars(pSource + byteIndex, byteCount, pDestination + charIndex, charsAvailable, null);
    }
}
308 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
309 // So if you fix this, fix the others. Currently those include:
310 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: rejects null pointers and negative counts, then forwards
// to the internal overload with no decoder state.
[CLSCompliant(false)]
public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    // bytes is reported before chars when both are null.
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // charCount is reported before byteCount when both are negative.
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int produced = GetChars(bytes, byteCount, chars, charCount, null);
    return produced;
}
325 // Returns a string containing the decoded representation of a range of
326 // bytes in a byte array.
328 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
329 // So if you fix this, fix the others. Currently those include:
330 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
331 // parent method is safe
// Validates (bytes, index, count) and builds a string from that byte range via
// string.CreateStringFromEncoding, passing this encoding. An empty range
// yields string.Empty without pinning the array.
public override unsafe string GetString(byte[] bytes, int index, int count)
{
    if (bytes is null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    // index is reported before count when both are negative.
    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count > bytes.Length - index)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (count == 0)
    {
        return string.Empty;
    }

    fixed (byte* pFirst = bytes)
    {
        byte* pStart = pFirst + index;
        return string.CreateStringFromEncoding(pStart, count, this);
    }
}
354 // End of standard methods copied from EncodingNLS.cs
// Counts the bytes needed to encode `count` chars starting at `chars`, optionally
// continuing from the state held by `encoder` (its _charLeftOver and fallback buffer).
//
// The count starts at 2 bytes per input char and is then adjusted:
//   - an unpaired surrogate has its 2 bytes removed and is handed to the fallback buffer;
//   - every char read back from the fallback buffer adds 2 bytes;
//   - a high surrogate still pending at the end of the input has its 2 bytes removed, and is
//     handed to the fallback when there is no encoder or the encoder must flush.
//
// Throws ArgumentOutOfRangeException when count << 1 overflows to a negative value, and
// ArgumentException when the encoder's fallback buffer is not empty on entry or when the
// end-of-input fallback leaves a pending high surrogate a second time.
356 internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS? encoder)
358 Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
359 Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");
361 // Start by assuming each char gets 2 bytes
362 int byteCount = count << 1;
364 // Check for overflow in byteCount
365 // (If they were all invalid chars, this would actually be wrong,
366 // but that's a ridiculously large # so we're not concerned about that case)
367 if (byteCount < 0)
368 throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);
// charStart/charEnd bracket the input; charLeftOver holds a high surrogate that is
// still waiting for its low surrogate (0 means none pending).
370 char* charStart = chars;
371 char* charEnd = chars + count;
372 char charLeftOver = (char)0;
// Set once the end-of-input fallback has been attempted, so a second attempt throws.
374 bool wasHereBefore = false;
376 // For fallback we may need a fallback buffer
377 EncoderFallbackBuffer? fallbackBuffer = null;
378 char* charsForFallback;
// Pick up any state carried over in the encoder.
380 if (encoder != null)
382 charLeftOver = encoder._charLeftOver;
384 // Assume extra bytes to encode charLeftOver if it existed
385 if (charLeftOver > 0)
386 byteCount += 2;
388 // We mustn't have left over fallback data when counting
389 if (encoder.InternalHasFallbackBuffer)
391 fallbackBuffer = encoder.FallbackBuffer;
392 if (fallbackBuffer.Remaining > 0)
393 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType()));
395 // Set our internal fallback interesting things.
396 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
400 char ch;
// Re-entered via goto after the end-of-input fallback so its output is counted too.
401 TryAgain:
// Each iteration takes the next fallback char if one is available (ch != 0),
// otherwise the next input char; the loop ends when both sources are exhausted.
403 while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
405 // First unwind any fallback
406 if (ch == 0)
408 // No fallback, maybe we can do it fast
409 #if FASTLOOP
410 // If endianness is backwards then each pair of bytes would be backwards.
// The fast path also requires the char pointer to be aligned (8 bytes under BIT64,
// 4 bytes otherwise) and no high surrogate pending.
411 if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
412 #if BIT64
413 (unchecked((long)chars) & 7) == 0 &&
414 #else
415 (unchecked((int)chars) & 3) == 0 &&
416 #endif
417 charLeftOver == 0)
419 // Need -1 to check 2 at a time. If we have an even #, longChars will go
420 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
421 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
// longEnd sits 3 chars before charEnd, so the loop below only reads a ulong
// while at least 4 chars remain.
422 ulong* longEnd = (ulong*)(charEnd - 3);
424 // Need new char* so we can check 4 at a time
425 ulong* longChars = (ulong*)chars;
427 while (longChars < longEnd)
429 // See if we potentially have surrogates (0x8000 bit set)
430 // (We're either big endian on a big endian machine or little endian on
431 // a little endian machine so that'll work)
432 if ((0x8000800080008000 & *longChars) != 0)
434 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
435 // 5 bits looks like 11011, then it's a high or low surrogate.
436 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
437 // Note that we expect BMP characters to be more common than surrogates
438 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
439 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
441 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
442 // but no clue if they're high or low.
443 // If each of the 4 characters are non-zero, then none are surrogates.
444 if ((uTemp & 0xFFFF000000000000) == 0 ||
445 (uTemp & 0x0000FFFF00000000) == 0 ||
446 (uTemp & 0x00000000FFFF0000) == 0 ||
447 (uTemp & 0x000000000000FFFF) == 0)
449 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
450 // or if there's 1 or 4 surrogates
452 // If they happen to be high/low/high/low, we may as well continue. Check the next
453 // bit to see if it's set (low) or not (high) in the right pattern
// The expected pattern constant depends on the machine's byte order.
454 if ((0xfc00fc00fc00fc00 & *longChars) !=
455 (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
457 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
458 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
460 // Drop out to the slow loop to resolve the surrogates
461 break;
463 // else they are all surrogates in High/Low/High/Low order, so we can use them.
465 // else none are surrogates, so we can use them.
467 // else all < 0x8000 so we can use them
469 // We already counted these four chars, go to next long.
470 longChars++;
// Resume char-at-a-time processing from wherever the fast loop stopped.
473 chars = (char*)longChars;
475 if (chars >= charEnd)
476 break;
478 #endif // FASTLOOP
480 // No fallback, just get next char
481 ch = *chars;
482 chars++;
484 else
486 // We weren't preallocating fallback space.
// (ch came from the fallback buffer, so its 2 bytes are added here.)
487 byteCount += 2;
490 // Check for high or low surrogates
491 if (ch >= 0xd800 && ch <= 0xdfff)
493 // Was it a high surrogate?
494 if (ch <= 0xdbff)
496 // It's a high surrogate, if we already had a high surrogate do its fallback
497 if (charLeftOver > 0)
499 // Unwind the current character, this should be safe because we
500 // don't have leftover data in the fallback, so chars must have
501 // advanced already.
502 Debug.Assert(chars > charStart,
503 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
504 chars--;
506 // If previous high surrogate deallocate 2 bytes
507 byteCount -= 2;
509 // Fallback the previous surrogate
510 // Need to initialize fallback buffer?
511 if (fallbackBuffer == null)
513 if (encoder == null)
514 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
515 else
516 fallbackBuffer = encoder.FallbackBuffer;
518 // Set our internal fallback interesting things.
519 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
522 charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
523 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
524 chars = charsForFallback;
526 // Now no high surrogate left over
527 charLeftOver = (char)0;
528 continue;
531 // Remember this high surrogate
532 charLeftOver = ch;
533 continue;
537 // It's a low surrogate
538 if (charLeftOver == 0)
540 // Expected a previous high surrogate.
541 // Don't count this one (we'll count its fallback if necessary)
542 byteCount -= 2;
544 // fallback this one
545 // Need to initialize fallback buffer?
546 if (fallbackBuffer == null)
548 if (encoder == null)
549 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
550 else
551 fallbackBuffer = encoder.FallbackBuffer;
553 // Set our internal fallback interesting things.
554 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
556 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
557 fallbackBuffer.InternalFallback(ch, ref charsForFallback);
558 chars = charsForFallback;
559 continue;
562 // Valid surrogate pair, add our charLeftOver
// (both halves are already included in byteCount, so only the pending marker is cleared)
563 charLeftOver = (char)0;
564 continue;
566 else if (charLeftOver > 0)
568 // Expected a low surrogate, but this char is normal
570 // Rewind the current character, fallback previous character.
571 // this should be safe because we don't have leftover data in the
572 // fallback, so chars must have advanced already.
573 Debug.Assert(chars > charStart,
574 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
575 chars--;
577 // fallback previous chars
578 // Need to initialize fallback buffer?
579 if (fallbackBuffer == null)
581 if (encoder == null)
582 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
583 else
584 fallbackBuffer = encoder.FallbackBuffer;
586 // Set our internal fallback interesting things.
587 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
589 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
590 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
591 chars = charsForFallback;
593 // Ignore charLeftOver or throw
594 byteCount -= 2;
595 charLeftOver = (char)0;
597 continue;
600 // Ok we had something to add (already counted)
603 // Don't allocate space for left over char
604 if (charLeftOver > 0)
606 byteCount -= 2;
608 // If we have to flush, stick it in fallback and try again
609 if (encoder == null || encoder.MustFlush)
// A second pass that still ends with a pending high surrogate is reported as a recursive fallback.
611 if (wasHereBefore)
613 // Throw it, using our complete character
614 throw new ArgumentException(
615 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
617 else
619 // Need to initialize fallback buffer?
620 if (fallbackBuffer == null)
622 if (encoder == null)
623 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
624 else
625 fallbackBuffer = encoder.FallbackBuffer;
627 // Set our internal fallback interesting things.
628 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
630 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
631 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
632 chars = charsForFallback;
633 charLeftOver = (char)0;
634 wasHereBefore = true;
635 goto TryAgain;
640 // Shouldn't have anything in fallback buffer for GetByteCount
641 // (don't have to check _throwOnOverflow for count)
642 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
643 "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");
645 // Don't remember fallbackBuffer.encoder for counting
646 return byteCount;
649 internal sealed override unsafe int GetBytes(
650 char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS? encoder)
652 Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
653 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
654 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
655 Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");
657 char charLeftOver = (char)0;
658 char ch;
659 bool wasHereBefore = false;
662 byte* byteEnd = bytes + byteCount;
663 char* charEnd = chars + charCount;
664 byte* byteStart = bytes;
665 char* charStart = chars;
667 // For fallback we may need a fallback buffer
668 EncoderFallbackBuffer? fallbackBuffer = null;
669 char* charsForFallback;
671 // Get our encoder, but don't clear it yet.
672 if (encoder != null)
674 charLeftOver = encoder._charLeftOver;
676 // We mustn't have left over fallback data when counting
677 if (encoder.InternalHasFallbackBuffer)
679 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
680 fallbackBuffer = encoder.FallbackBuffer;
681 if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
682 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType()));
684 // Set our internal fallback interesting things.
685 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
689 TryAgain:
690 while (((ch = (fallbackBuffer == null) ?
691 (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) ||
692 chars < charEnd)
694 // First unwind any fallback
695 if (ch == 0)
697 // No fallback, maybe we can do it fast
698 #if FASTLOOP
699 // If endianess is backwards then each pair of bytes would be backwards.
700 if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
701 #if BIT64
702 (unchecked((long)chars) & 7) == 0 &&
703 #else
704 (unchecked((int)chars) & 3) == 0 &&
705 #endif
706 charLeftOver == 0)
708 // Need -1 to check 2 at a time. If we have an even #, longChars will go
709 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
710 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
711 // We can only go iCount units (limited by shorter of char or byte buffers.
712 ulong* longEnd = (ulong*)(chars - 3 +
713 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
714 (byteEnd - bytes) >> 1 : charEnd - chars));
716 // Need new char* so we can check 4 at a time
717 ulong* longChars = (ulong*)chars;
718 ulong* longBytes = (ulong*)bytes;
720 while (longChars < longEnd)
722 // See if we potentially have surrogates (0x8000 bit set)
723 // (We're either big endian on a big endian machine or little endian on
724 // a little endian machine so that'll work)
725 if ((0x8000800080008000 & *longChars) != 0)
727 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
728 // 5 bits looks like 11011, then its a high or low surrogate.
729 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
730 // Note that we expect BMP characters to be more common than surrogates
731 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
732 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
734 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
735 // but no clue if they're high or low.
736 // If each of the 4 characters are non-zero, then none are surrogates.
737 if ((uTemp & 0xFFFF000000000000) == 0 ||
738 (uTemp & 0x0000FFFF00000000) == 0 ||
739 (uTemp & 0x00000000FFFF0000) == 0 ||
740 (uTemp & 0x000000000000FFFF) == 0)
742 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
743 // or if there's 1 or 4 surrogates
745 // If they happen to be high/low/high/low, we may as well continue. Check the next
746 // bit to see if its set (low) or not (high) in the right pattern
747 if ((0xfc00fc00fc00fc00 & *longChars) !=
748 (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
750 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
751 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
753 // Drop out to the slow loop to resolve the surrogates
754 break;
756 // else they are all surrogates in High/Low/High/Low order, so we can use them.
758 // else none are surrogates, so we can use them.
760 // else all < 0x8000 so we can use them
762 // We can use these 4 chars.
763 Unsafe.WriteUnaligned<ulong>(longBytes, *longChars);
764 longChars++;
765 longBytes++;
768 chars = (char*)longChars;
769 bytes = (byte*)longBytes;
771 if (chars >= charEnd)
772 break;
774 #endif // FASTLOOP
776 // No fallback, just get next char
777 ch = *chars;
778 chars++;
781 // Check for high or low surrogates
782 if (ch >= 0xd800 && ch <= 0xdfff)
784 // Was it a high surrogate?
785 if (ch <= 0xdbff)
787 // Its a high surrogate, see if we already had a high surrogate
788 if (charLeftOver > 0)
790 // Unwind the current character, this should be safe because we
791 // don't have leftover data in the fallback, so chars must have
792 // advanced already.
793 Debug.Assert(chars > charStart,
794 "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
795 chars--;
797 // Fallback the previous surrogate
798 // Might need to create our fallback buffer
799 if (fallbackBuffer == null)
801 if (encoder == null)
802 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
803 else
804 fallbackBuffer = encoder.FallbackBuffer;
806 // Set our internal fallback interesting things.
807 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
810 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
811 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
812 chars = charsForFallback;
814 charLeftOver = (char)0;
815 continue;
818 // Remember this high surrogate
819 charLeftOver = ch;
820 continue;
823 // Its a low surrogate
824 if (charLeftOver == 0)
826 // We'll fall back this one
827 // Might need to create our fallback buffer
828 if (fallbackBuffer == null)
830 if (encoder == null)
831 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
832 else
833 fallbackBuffer = encoder.FallbackBuffer;
835 // Set our internal fallback interesting things.
836 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
839 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
840 fallbackBuffer.InternalFallback(ch, ref charsForFallback);
841 chars = charsForFallback;
842 continue;
845 // Valid surrogate pair, add our charLeftOver
846 if (bytes + 3 >= byteEnd)
848 // Not enough room to add this surrogate pair
849 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
851 // These must have both been from the fallbacks.
852 // Both of these MUST have been from a fallback because if the 1st wasn't
853 // from a fallback, then a high surrogate followed by an illegal char
854 // would've caused the high surrogate to fall back. If a high surrogate
855 // fell back, then it was consumed and both chars came from the fallback.
856 fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate
857 fallbackBuffer.MovePrevious();
859 else
861 // If we don't have enough room, then either we should've advanced a while
862 // or we should have bytes==byteStart and throw below
863 Debug.Assert(chars > charStart + 1 || bytes == byteStart,
864 "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
865 chars -= 2; // Didn't use either surrogate
867 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
868 charLeftOver = (char)0; // we'll retry it later
869 break; // Didn't throw, but stop 'til next time.
872 if (bigEndian)
874 *(bytes++) = (byte)(charLeftOver >> 8);
875 *(bytes++) = (byte)charLeftOver;
877 else
879 *(bytes++) = (byte)charLeftOver;
880 *(bytes++) = (byte)(charLeftOver >> 8);
883 charLeftOver = (char)0;
885 else if (charLeftOver > 0)
887 // Expected a low surrogate, but this char is normal
889 // Rewind the current character, fallback previous character.
890 // this should be safe because we don't have leftover data in the
891 // fallback, so chars must have advanced already.
892 Debug.Assert(chars > charStart,
893 "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
894 chars--;
896 // fallback previous chars
897 // Might need to create our fallback buffer
898 if (fallbackBuffer == null)
900 if (encoder == null)
901 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
902 else
903 fallbackBuffer = encoder.FallbackBuffer;
905 // Set our internal fallback interesting things.
906 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
909 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
910 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
911 chars = charsForFallback;
913 // Ignore charLeftOver or throw
914 charLeftOver = (char)0;
915 continue;
918 // Ok, we have a char to add
919 if (bytes + 1 >= byteEnd)
921 // Couldn't add this char
922 if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
923 fallbackBuffer.MovePrevious(); // Not using this fallback char
924 else
926 // Lonely charLeftOver (from previous call) would've been caught up above,
927 // so this must be a case where we've already read an input char.
928 Debug.Assert(chars > charStart,
929 "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
930 chars--; // Not using this char
932 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
933 break; // didn't throw, just stop
936 if (bigEndian)
938 *(bytes++) = (byte)(ch >> 8);
939 *(bytes++) = (byte)ch;
941 else
943 *(bytes++) = (byte)ch;
944 *(bytes++) = (byte)(ch >> 8);
948 // Don't allocate space for left over char
949 if (charLeftOver > 0)
951 // If we aren't flushing we need to fall this back
952 if (encoder == null || encoder.MustFlush)
954 if (wasHereBefore)
956 // Throw it, using our complete character
957 throw new ArgumentException(
958 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
960 else
962 // If we have to flush, stick it in fallback and try again
963 // Might need to create our fallback buffer
964 if (fallbackBuffer == null)
966 if (encoder == null)
967 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
968 else
969 fallbackBuffer = encoder.FallbackBuffer;
971 // Set our internal fallback interesting things.
972 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
975 // If we're not flushing, that'll remember the left over character.
976 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
977 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
978 chars = charsForFallback;
980 charLeftOver = (char)0;
981 wasHereBefore = true;
982 goto TryAgain;
987 // Not flushing, remember it in the encoder
988 if (encoder != null)
990 encoder._charLeftOver = charLeftOver;
991 encoder._charsUsed = (int)(chars - charStart);
994 // Remember charLeftOver if we must, or clear it if we're flushing
995 // (charLeftOver should be 0 if we're flushing)
996 Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0,
997 "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");
999 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
1000 encoder == null || !encoder._throwOnOverflow,
1001 "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");
1003 return (int)(bytes - byteStart);
// GetCharCount: computes how many chars decoding `count` bytes starting at `bytes` would
// produce, without writing any output chars.
//   bytes       - start of the UTF-16 input (asserted non-null).
//   count       - number of input bytes (asserted non-negative).
//   baseDecoder - optional decoder. When non-null, its lastByte / lastChar are read as state
//                 left over from an earlier call; this method never assigns them back.
// Returns the char count: it starts at count / 2, is bumped for left over decoder state, and is
// then corrected by whatever the decoder fallback reports for lonely surrogates and, when there
// is no decoder or the decoder must flush, for a dangling high surrogate or odd byte.
1006 internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS? baseDecoder)
1008 Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
1009 Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");
1011 UnicodeEncoding.Decoder? decoder = (UnicodeEncoding.Decoder?)baseDecoder;
1013 byte* byteEnd = bytes + count;
1014 byte* byteStart = bytes;
1016 // Need last vars
// lastByte == -1 means "no pending byte"; lastChar == 0 means "no pending high surrogate".
1017 int lastByte = -1;
1018 char lastChar = (char)0;
1020 // Start by assuming same # of chars as bytes
1021 int charCount = count >> 1;
1023 // For fallback we may need a fallback buffer
1024 DecoderFallbackBuffer? fallbackBuffer = null;
1026 if (decoder != null)
1028 lastByte = decoder.lastByte;
1029 lastChar = decoder.lastChar;
1031 // Assume extra char if last char was around
1032 if (lastChar > 0)
1033 charCount++;
1035 // Assume extra char if extra last byte makes up odd # of input bytes
1036 if (lastByte >= 0 && (count & 1) == 1)
1038 charCount++;
1041 // Shouldn't have anything in fallback buffer for GetCharCount
1042 // (don't have to check _throwOnOverflow for count)
1043 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1044 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
1047 while (bytes < byteEnd)
1049 // If we're aligned then maybe we can do it fast
1050 // That'll hurt if we're unaligned because we'll always test but never be aligned
// The fast path is only taken when the encoding's byte order matches the machine's
// (bigEndian ^ BitConverter.IsLittleEndian), the input pointer is 8-byte (BIT64) or 4-byte
// aligned, and there is no pending byte or pending high surrogate.
1051 #if FASTLOOP
1052 if ((bigEndian ^ BitConverter.IsLittleEndian) &&
1053 #if BIT64
1054 (unchecked((long)bytes) & 7) == 0 &&
1055 #else
1056 (unchecked((int)bytes) & 3) == 0 &&
1057 #endif // BIT64
1058 lastByte == -1 && lastChar == 0)
1060 // Need -1 to check 2 at a time. If we have an even #, longBytes will go
1061 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes
1062 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
// (longEnd is byteEnd - 7, so the 8-byte read at longBytes in the loop below stays inside the input.)
1063 ulong* longEnd = (ulong*)(byteEnd - 7);
1065 // Need a ulong* so we can check 4 chars at a time
1066 ulong* longBytes = (ulong*)bytes;
1068 while (longBytes < longEnd)
1070 // See if we potentially have surrogates (0x8000 bit set)
1071 // (We're either big endian on a big endian machine or little endian on
1072 // a little endian machine so that'll work)
1073 if ((0x8000800080008000 & *longBytes) != 0)
1075 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1076 // 5 bits looks like 11011, then its a high or low surrogate.
1077 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1078 // Note that we expect BMP characters to be more common than surrogates
1079 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1080 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1082 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1083 // but no clue if they're high or low.
1084 // If each of the 4 characters are non-zero, then none are surrogates.
1085 if ((uTemp & 0xFFFF000000000000) == 0 ||
1086 (uTemp & 0x0000FFFF00000000) == 0 ||
1087 (uTemp & 0x00000000FFFF0000) == 0 ||
1088 (uTemp & 0x000000000000FFFF) == 0)
1090 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1091 // or if there's 1 or 4 surrogates
1093 // If they happen to be high/low/high/low, we may as well continue. Check the next
1094 // bit to see if its set (low) or not (high) in the right pattern
// The expected constant is chosen by machine byte order: on little endian the first char of
// the group sits in the low 16 bits of the ulong, so the pattern reads dc00/d800 from the top.
1095 if ((0xfc00fc00fc00fc00 & *longBytes) !=
1096 (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
1098 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1099 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1101 // Drop out to the slow loop to resolve the surrogates
1102 break;
1104 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1106 // else none are surrogates, so we can use them.
1108 // else all < 0x8000 so we can use them
1110 // We can use these 4 chars.
// Nothing to add to charCount here: these chars are already part of the initial count >> 1 estimate.
1111 longBytes++;
1114 bytes = (byte*)longBytes;
1116 if (bytes >= byteEnd)
1117 break;
1119 #endif // FASTLOOP
1121 // Get 1st byte
1122 if (lastByte < 0)
1124 lastByte = *bytes++;
// If that was the final input byte, leave the loop with it still held in lastByte.
1125 if (bytes >= byteEnd) break;
1128 // Get full char
// Combine the pending byte with the next input byte according to the encoding's byte order.
1129 char ch;
1130 if (bigEndian)
1132 ch = (char)(lastByte << 8 | *(bytes++));
1134 else
1136 ch = (char)(*(bytes++) << 8 | lastByte);
1138 lastByte = -1;
1140 // See if the char's valid
1141 if (ch >= 0xd800 && ch <= 0xdfff)
1143 // Was it a high surrogate?
1144 if (ch <= 0xdbff)
1146 // Its a high surrogate, if we had one then do fallback for previous one
1147 if (lastChar > 0)
1149 // Ignore previous bad high surrogate
1150 charCount--;
1152 // Get fallback for previous high surrogate
1153 // Note we have to reconstruct bytes because some may have been in decoder
1154 byte[]? byteBuffer = null;
1155 if (bigEndian)
1157 byteBuffer = new byte[]
1158 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1160 else
1162 byteBuffer = new byte[]
1163 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1166 if (fallbackBuffer == null)
1168 if (decoder == null)
1169 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1170 else
1171 fallbackBuffer = decoder.FallbackBuffer;
1173 // Set our internal fallback interesting things.
1174 fallbackBuffer.InternalInitialize(byteStart, null);
1177 // Get fallback.
1178 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1181 // Ignore the last one which fell back already,
1182 // and remember the new high surrogate
1183 lastChar = ch;
1184 continue;
1187 // Its a low surrogate
1188 if (lastChar == 0)
1190 // Expected a previous high surrogate
1191 charCount--;
1193 // Get fallback for this low surrogate
1194 // Note we have to reconstruct bytes because some may have been in decoder
1195 byte[]? byteBuffer = null;
1196 if (bigEndian)
1198 byteBuffer = new byte[]
1199 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1201 else
1203 byteBuffer = new byte[]
1204 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1207 if (fallbackBuffer == null)
1209 if (decoder == null)
1210 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1211 else
1212 fallbackBuffer = decoder.FallbackBuffer;
1214 // Set our internal fallback interesting things.
1215 fallbackBuffer.InternalInitialize(byteStart, null);
1218 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1220 // Ignore this one (we already did its fallback)
1221 continue;
1224 // Valid surrogate pair, already counted.
1225 lastChar = (char)0;
1227 else if (lastChar > 0)
1229 // Had a high surrogate, expected a low surrogate
1230 // Un-count the last high surrogate
1231 charCount--;
1233 // fall back the high surrogate.
1234 byte[]? byteBuffer = null;
1235 if (bigEndian)
1237 byteBuffer = new byte[]
1238 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1240 else
1242 byteBuffer = new byte[]
1243 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1246 if (fallbackBuffer == null)
1248 if (decoder == null)
1249 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1250 else
1251 fallbackBuffer = decoder.FallbackBuffer;
1253 // Set our internal fallback interesting things.
1254 fallbackBuffer.InternalInitialize(byteStart, null);
1257 // Already subtracted high surrogate
1258 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1260 // Not left over now, clear previous high surrogate and continue to add current char
1261 lastChar = (char)0;
1264 // Valid char, already counted
1267 // Extra space if we can't use decoder
// With no decoder, or a decoder that must flush, pending state cannot be carried to a later
// call, so a left over high surrogate and/or odd byte is handed to the fallback here.
1268 if (decoder == null || decoder.MustFlush)
1270 if (lastChar > 0)
1272 // No hanging high surrogates allowed, do fallback and remove count for it
1273 charCount--;
1274 byte[]? byteBuffer = null;
1275 if (bigEndian)
1277 byteBuffer = new byte[]
1278 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1280 else
1282 byteBuffer = new byte[]
1283 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1286 if (fallbackBuffer == null)
1288 if (decoder == null)
1289 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1290 else
1291 fallbackBuffer = decoder.FallbackBuffer;
1293 // Set our internal fallback interesting things.
1294 fallbackBuffer.InternalInitialize(byteStart, null);
1297 charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
1299 lastChar = (char)0;
1302 if (lastByte >= 0)
1304 if (fallbackBuffer == null)
1306 if (decoder == null)
1307 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1308 else
1309 fallbackBuffer = decoder.FallbackBuffer;
1311 // Set our internal fallback interesting things.
1312 fallbackBuffer.InternalInitialize(byteStart, null);
1315 // No hanging odd bytes allowed if must flush
1316 charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
1317 lastByte = -1;
1321 // If we had a high surrogate left over, we can't count it
// (Only reachable with lastChar > 0 when not flushing; the flush branch above clears lastChar.)
1322 if (lastChar > 0)
1323 charCount--;
1325 // Shouldn't have anything in fallback buffer for GetCharCount
1326 // (don't have to check _throwOnOverflow for count)
1327 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1328 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");
1330 return charCount;
// GetChars: decodes up to byteCount bytes of UTF-16 starting at `bytes` into the buffer at
// `chars`, which has room for charCount chars.
//   bytes / byteCount - input buffer (asserted non-null / non-negative).
//   chars / charCount - output buffer (asserted non-null / non-negative).
//   baseDecoder       - optional decoder. When non-null, its lastByte / lastChar seed the pending
//                       state, and at End it receives _bytesUsed, lastChar and lastByte.
// Returns the number of chars written (chars - charStart).
// When the output runs out of room, the input pointer is rewound over the bytes that were not
// consumed and ThrowCharsOverflow is called; per the inline comments it only throws when no
// chars were output, otherwise decoding simply stops.
1333 internal sealed override unsafe int GetChars(
1334 byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS? baseDecoder)
1336 Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
1337 Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
1338 Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
1339 Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");
1341 UnicodeEncoding.Decoder? decoder = (UnicodeEncoding.Decoder?)baseDecoder;
1343 // Need last vars
// lastByte == -1 means "no pending byte"; lastChar == 0 means "no pending high surrogate".
1344 int lastByte = -1;
1345 char lastChar = (char)0;
1347 // Get our decoder (but don't clear it yet)
1348 if (decoder != null)
1350 lastByte = decoder.lastByte;
1351 lastChar = decoder.lastChar;
1353 // Shouldn't have anything in fallback buffer for GetChars
1354 // (don't have to check _throwOnOverflow for chars)
1355 Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
1356 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
1359 // For fallback we may need a fallback buffer
1360 DecoderFallbackBuffer? fallbackBuffer = null;
1361 char* charsForFallback;
1363 byte* byteEnd = bytes + byteCount;
1364 char* charEnd = chars + charCount;
1365 byte* byteStart = bytes;
1366 char* charStart = chars;
1368 while (bytes < byteEnd)
1370 // If we're aligned then maybe we can do it fast
1371 // That'll hurt if we're unaligned because we'll always test but never be aligned
// The fast path is only taken when the encoding's byte order matches the machine's and there
// is no pending byte or pending high surrogate.
// NOTE(review): the alignment test below is on `chars` (the destination), yet the store uses
// Unsafe.WriteUnaligned while the load is a plain *longBytes - confirm that an unaligned
// source read is acceptable on all supported targets.
1372 #if FASTLOOP
1373 if ((bigEndian ^ BitConverter.IsLittleEndian) &&
1374 #if BIT64
1375 (unchecked((long)chars) & 7) == 0 &&
1376 #else
1377 (unchecked((int)chars) & 3) == 0 &&
1378 #endif
1379 lastByte == -1 && lastChar == 0)
1381 // Need -1 to check 2 at a time. If we have an even #, longChars will go
1382 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
1383 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1384 // We can only go as far as the shorter of the char or byte buffers allows.
// (The - 7 keeps a full 8-byte read at longBytes inside that limit.)
1385 ulong* longEnd = (ulong*)(bytes - 7 +
1386 (((byteEnd - bytes) >> 1 < charEnd - chars) ?
1387 (byteEnd - bytes) : (charEnd - chars) << 1));
1389 // Need new char* so we can check 4 at a time
1390 ulong* longBytes = (ulong*)bytes;
1391 ulong* longChars = (ulong*)chars;
1393 while (longBytes < longEnd)
1395 // See if we potentially have surrogates (0x8000 bit set)
1396 // (We're either big endian on a big endian machine or little endian on
1397 // a little endian machine so that'll work)
1398 if ((0x8000800080008000 & *longBytes) != 0)
1400 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1401 // 5 bits looks like 11011, then its a high or low surrogate.
1402 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1403 // Note that we expect BMP characters to be more common than surrogates
1404 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1405 ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
1407 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1408 // but no clue if they're high or low.
1409 // If each of the 4 characters are non-zero, then none are surrogates.
1410 if ((uTemp & 0xFFFF000000000000) == 0 ||
1411 (uTemp & 0x0000FFFF00000000) == 0 ||
1412 (uTemp & 0x00000000FFFF0000) == 0 ||
1413 (uTemp & 0x000000000000FFFF) == 0)
1415 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1416 // or if there's 1 or 4 surrogates
1418 // If they happen to be high/low/high/low, we may as well continue. Check the next
1419 // bit to see if its set (low) or not (high) in the right pattern
1420 if ((0xfc00fc00fc00fc00 & *longBytes) !=
1421 (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
1423 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1424 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1426 // Drop out to the slow loop to resolve the surrogates
1427 break;
1429 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1431 // else none are surrogates, so we can use them.
1433 // else all < 0x8000 so we can use them
1435 // We can use these 4 chars.
// Byte order already matches the machine, so the 8 input bytes are copied verbatim as 4 chars.
1436 Unsafe.WriteUnaligned<ulong>(longChars, *longBytes);
1437 longBytes++;
1438 longChars++;
1441 chars = (char*)longChars;
1442 bytes = (byte*)longBytes;
1444 if (bytes >= byteEnd)
1445 break;
1447 #endif // FASTLOOP
1449 // Get 1st byte
1450 if (lastByte < 0)
1452 lastByte = *bytes++;
// Go round again: either the second byte is available, or the loop ends with lastByte pending.
1453 continue;
1456 // Get full char
// Combine the pending byte with the next input byte according to the encoding's byte order.
1457 char ch;
1458 if (bigEndian)
1460 ch = (char)(lastByte << 8 | *(bytes++));
1462 else
1464 ch = (char)(*(bytes++) << 8 | lastByte);
1466 lastByte = -1;
1468 // See if the char's valid
1469 if (ch >= 0xd800 && ch <= 0xdfff)
1471 // Was it a high surrogate?
1472 if (ch <= 0xdbff)
1474 // Its a high surrogate, if we had one then do fallback for previous one
1475 if (lastChar > 0)
1477 // Get fallback for previous high surrogate
1478 // Note we have to reconstruct bytes because some may have been in decoder
1479 byte[]? byteBuffer = null;
1480 if (bigEndian)
1482 byteBuffer = new byte[]
1483 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1485 else
1487 byteBuffer = new byte[]
1488 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1491 if (fallbackBuffer == null)
1493 if (decoder == null)
1494 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1495 else
1496 fallbackBuffer = decoder.FallbackBuffer;
1498 // Set our internal fallback interesting things.
1499 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1502 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1503 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1504 chars = charsForFallback;
1506 if (!fallbackResult)
1508 // couldn't fall back lonely surrogate
1509 // We either advanced bytes or chars should == charStart and throw below
1510 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1511 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
1512 bytes -= 2; // didn't use these 2 bytes
1513 fallbackBuffer.InternalReset();
1514 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1515 break; // couldn't fallback but didn't throw
1519 // Ignore the previous high surrogate which fell back already,
1520 // yet remember the current high surrogate for next time.
1521 lastChar = ch;
1522 continue;
1525 // Its a low surrogate
1526 if (lastChar == 0)
1528 // Expected a previous high surrogate
1529 // Get fallback for this low surrogate
1530 // Note we have to reconstruct bytes because some may have been in decoder
1531 byte[]? byteBuffer = null;
1532 if (bigEndian)
1534 byteBuffer = new byte[]
1535 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
1537 else
1539 byteBuffer = new byte[]
1540 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
1543 if (fallbackBuffer == null)
1545 if (decoder == null)
1546 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1547 else
1548 fallbackBuffer = decoder.FallbackBuffer;
1550 // Set our internal fallback interesting things.
1551 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1554 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1555 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1556 chars = charsForFallback;
1558 if (!fallbackResult)
1560 // couldn't fall back lonely surrogate
1561 // We either advanced bytes or chars should == charStart and throw below
1562 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1563 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
1564 bytes -= 2; // didn't use these 2 bytes
1565 fallbackBuffer.InternalReset();
1566 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1567 break; // couldn't fallback but didn't throw
1570 // Didn't throw, ignore this one (we already did its fallback)
1571 continue;
1574 // Valid surrogate pair, add our lastChar (will need 2 chars)
1575 if (chars >= charEnd - 1)
1577 // couldn't find room for this surrogate pair
1578 // We either advanced bytes or chars should == charStart and throw below
1579 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1580 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
1581 bytes -= 2; // didn't use these 2 bytes
1582 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1583 // Leave lastChar for next call to Convert()
1584 break; // couldn't fallback but didn't throw
// Write the pending high surrogate now; the low surrogate (ch) is written by the shared
// "add it" store at the bottom of the loop.
1587 *chars++ = lastChar;
1588 lastChar = (char)0;
1590 else if (lastChar > 0)
1592 // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
1593 byte[]? byteBuffer = null;
1594 if (bigEndian)
1596 byteBuffer = new byte[]
1597 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1599 else
1601 byteBuffer = new byte[]
1602 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1605 if (fallbackBuffer == null)
1607 if (decoder == null)
1608 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1609 else
1610 fallbackBuffer = decoder.FallbackBuffer;
1612 // Set our internal fallback interesting things.
1613 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1616 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1617 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1618 chars = charsForFallback;
1620 if (!fallbackResult)
1622 // couldn't fall back high surrogate, or char that would be next
1623 // We either advanced bytes or chars should == charStart and throw below
1624 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1625 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
1626 bytes -= 2; // didn't use these 2 bytes
1627 fallbackBuffer.InternalReset();
1628 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1629 break; // couldn't fallback but didn't throw
1632 // Not left over now, clear previous high surrogate and continue to add current char
1633 lastChar = (char)0;
1636 // Valid char, room for it?
1637 if (chars >= charEnd)
1639 // 2 bytes couldn't fall back
1640 // We either advanced bytes or chars should == charStart and throw below
1641 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1642 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
1643 bytes -= 2; // didn't use these bytes
1644 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1645 break; // couldn't fallback but didn't throw
1648 // add it
1649 *chars++ = ch;
1652 // No decoder, or the decoder must flush: fall back any left over high surrogate / odd byte
1653 if (decoder == null || decoder.MustFlush)
1655 if (lastChar > 0)
1657 // No hanging high surrogates allowed, do fallback for it
1658 byte[]? byteBuffer = null;
1659 if (bigEndian)
1661 byteBuffer = new byte[]
1662 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
1664 else
1666 byteBuffer = new byte[]
1667 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
1670 if (fallbackBuffer == null)
1672 if (decoder == null)
1673 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1674 else
1675 fallbackBuffer = decoder.FallbackBuffer;
1677 // Set our internal fallback interesting things.
1678 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1681 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1682 bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
1683 chars = charsForFallback;
1685 if (!fallbackResult)
1687 // 2 bytes couldn't fall back
1688 // We either advanced bytes or chars should == charStart and throw below
1689 Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
1690 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
// bytes is rewound only for the duration of the ThrowCharsOverflow call and restored right
// after it, so the value stored in _bytesUsed at End still covers the pending bytes.
1691 bytes -= 2; // didn't use these bytes
1692 if (lastByte >= 0)
1693 bytes--; // had an extra last byte hanging around
1694 fallbackBuffer.InternalReset();
1695 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1696 // We'll remember these in our decoder though
1697 bytes += 2;
1698 if (lastByte >= 0)
1699 bytes++;
1700 goto End;
1703 // done with this one
1704 lastChar = (char)0;
1707 if (lastByte >= 0)
1709 if (fallbackBuffer == null)
1711 if (decoder == null)
1712 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
1713 else
1714 fallbackBuffer = decoder.FallbackBuffer;
1716 // Set our internal fallback interesting things.
1717 fallbackBuffer.InternalInitialize(byteStart, charEnd);
1720 // No hanging odd bytes allowed if must flush
1721 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
1722 bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
1723 chars = charsForFallback;
1725 if (!fallbackResult)
1727 // odd byte couldn't fall back
1728 bytes--; // didn't use this byte
1729 fallbackBuffer.InternalReset();
1730 ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
1731 // didn't throw, but we'll remember it in the decoder
1732 bytes++;
1733 goto End;
1736 // Didn't fail, clear buffer
1737 lastByte = -1;
1741 End:
1743 // Remember our decoder if we must
// Persist how many input bytes were consumed plus any pending high surrogate / odd byte.
1744 if (decoder != null)
1746 Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)),
1747 "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
1748 // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
1751 decoder._bytesUsed = (int)(bytes - byteStart);
1752 decoder.lastChar = lastChar;
1753 decoder.lastByte = lastByte;
1756 // Shouldn't have anything in fallback buffer for GetChars
1757 // (don't have to check _throwOnOverflow for count or chars)
1758 Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
1759 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
1761 return (int)(chars - charStart);
// Creates a new encoder for this encoding: an EncoderNLS bound to this instance.
public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);
// Creates a new decoder for this encoding: the nested UnicodeEncoding.Decoder bound to this instance.
public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);
// Returns the byte order mark for this encoding, or an empty array when no
// byte order mark was requested at construction time.
public override byte[] GetPreamble()
{
    if (!byteOrderMark)
    {
        return Array.Empty<byte>();
    }

    // A fresh array is allocated on every call so that a caller can never
    // modify a cached, shared byte[].
    return bigEndian
        ? new byte[2] { 0xfe, 0xff }
        : new byte[2] { 0xff, 0xfe };
}
// Span-based counterpart of GetPreamble().
// For a derived type GetPreamble() may have been overridden, so it is called and wrapped.
// Otherwise the result is empty (no byte order mark) or one of the two constant marks.
public override ReadOnlySpan<byte> Preamble =>
    GetType() == typeof(UnicodeEncoding)
        ? (byteOrderMark
            ? (bigEndian
                ? (ReadOnlySpan<byte>)new byte[2] { 0xfe, 0xff } // uses C# compiler's optimization for static byte[] data
                : (ReadOnlySpan<byte>)new byte[2] { 0xff, 0xfe })
            : default)
        : new ReadOnlySpan<byte>(GetPreamble()); // in case a derived UnicodeEncoding overrode GetPreamble
// Returns an upper bound on the number of bytes needed to encode charCount chars.
// Throws ArgumentOutOfRangeException when charCount is negative or when the bound
// does not fit in an int.
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // One extra char in case a left over high surrogate has to be emitted as well.
    long worstCaseChars = (long)charCount + 1;

    // Every char may expand to the fallback's maximum replacement length.
    long fallbackFactor = EncoderFallback.MaxCharCount > 1 ? EncoderFallback.MaxCharCount : 1;
    worstCaseChars *= fallbackFactor;

    // 2 bytes per char
    long worstCaseBytes = worstCaseChars << 1;

    if (worstCaseBytes > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
    }

    return (int)worstCaseBytes;
}
// Returns an upper bound on the number of chars produced by decoding byteCount bytes.
// Throws ArgumentOutOfRangeException when byteCount is negative or when the bound
// does not fit in an int.
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // Computed in long because byteCount may be int.MaxValue.
    long wholeChars = byteCount >> 1;   // 1 char per 2 bytes
    long oddByte = byteCount & 1;       // round up for an odd trailing byte
    long worstCase = wholeChars + oddByte + 1; // + 1 for a high surrogate left over in the decoder

    // Lonely surrogates may each expand to the fallback's maximum replacement length.
    long fallbackFactor = DecoderFallback.MaxCharCount > 1 ? DecoderFallback.MaxCharCount : 1;
    worstCase *= fallbackFactor;

    if (worstCase > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
    }

    return (int)worstCase;
}
// Two UnicodeEncoding instances are equal when code page, byte order mark setting,
// endianness and both fallbacks match. isThrowException is not compared directly;
// per the original note it is the same as the encoder/decoder being exception fallbacks.
public override bool Equals(object? value)
{
    if (!(value is UnicodeEncoding that))
    {
        return false;
    }

    // Big endian Unicode uses a different code page (1201) than little endian (1200),
    // so the code page still has to be compared.
    if (CodePage != that.CodePage)
    {
        return false;
    }

    if (byteOrderMark != that.byteOrderMark)
    {
        return false;
    }

    if (bigEndian != that.bigEndian)
    {
        return false;
    }

    if (!EncoderFallback.Equals(that.EncoderFallback))
    {
        return false;
    }

    return DecoderFallback.Equals(that.DecoderFallback);
}
// Hash built from the code page, both fallbacks, and the byte order mark / endianness flags.
public override int GetHashCode()
{
    int hash = CodePage;
    hash += this.EncoderFallback.GetHashCode();
    hash += this.DecoderFallback.GetHashCode();

    if (byteOrderMark)
    {
        hash += 4;
    }

    if (bigEndian)
    {
        hash += 8;
    }

    return hash;
}
1866 private sealed class Decoder : System.Text.DecoderNLS
1868 internal int lastByte = -1;
1869 internal char lastChar = '\0';
// Binds the decoder to its encoding; all work is delegated to the base constructor.
public Decoder(UnicodeEncoding encoding)
    : base(encoding)
{
    // Nothing else to do here: base calls reset.
}
// Drops any pending byte / pending char and resets the fallback buffer if one exists.
public override void Reset()
{
    lastChar = '\0';
    lastByte = -1;
    _fallbackBuffer?.Reset();
}
// True while a pending byte or a pending char is still held by this decoder.
internal override bool HasState => !(lastByte == -1 && lastChar == '\0');