1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
9 // This define can be used to turn off the fast loops. Useful for finding whether
10 // the problem is fastloop-specific.
14 using System
.Globalization
;
15 using System
.Diagnostics
;
16 using System
.Runtime
.InteropServices
;
18 using Internal
.Runtime
.CompilerServices
;
22 public class UnicodeEncoding
: Encoding
// Shared instances used by Encoding.BigEndianUnicode / Encoding.Unicode for lazy initialization.
// The initializers do not run until a static member of this class is first referenced.
internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);

// Set from the throwOnInvalidBytes constructor argument; selects exception fallbacks in SetDefaultFallbacks.
private readonly bool isThrowException = false;

// Byte order chosen at construction (true selects code page 1201, false selects 1200).
private readonly bool bigEndian = false;

// Value of the byteOrderMark constructor argument.
private readonly bool byteOrderMark = false;

// Unicode version 2.0 character size in bytes
public const int CharSize = 2;
// Creates the default configuration: little-endian byte order with a byte order mark,
// i.e. the same arguments used for s_littleEndianDefault.
public UnicodeEncoding()
    : this(false, true)
{
}
// Creates an encoding with the requested byte order and byte-order-mark setting.
// The base Encoding receives code page 1201 for big-endian and 1200 for little-endian.
public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
    : base(bigEndian ? 1201 : 1200)
{
    this.byteOrderMark = byteOrderMark;
    this.bigEndian = bigEndian;
}
// Same as the two-argument constructor, and additionally records whether invalid
// data should raise exceptions instead of being replaced.
public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
    : this(bigEndian, byteOrderMark)
{
    this.isThrowException = throwOnInvalidBytes;

    // The Encoding constructor already set the fallbacks, but it did so before
    // isThrowException was assigned, so redo it when exceptions were requested.
    if (this.isThrowException)
    {
        SetDefaultFallbacks();
    }
}
// Installs the encoder/decoder fallbacks that match isThrowException:
// exception fallbacks when throwing was requested, otherwise replacement
// fallbacks that substitute U+FFFD (the replacement string is "\xFFFD", not empty).
internal sealed override void SetDefaultFallbacks()
{
    if (this.isThrowException)
    {
        this.encoderFallback = EncoderFallback.ExceptionFallback;
        this.decoderFallback = DecoderFallback.ExceptionFallback;
    }
    else
    {
        // Without this else branch the replacement fallbacks would always win,
        // defeating the throwOnInvalidBytes constructor argument.
        this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
        this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
    }
}
76 // The following methods are copied from EncodingNLS.cs.
77 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
78 // These should be kept in sync for the following classes:
79 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
82 // Returns the number of bytes required to encode a range of characters in a character array.
85 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
86 // So if you fix this, fix the others. Currently those include:
87 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
88 // parent method is safe
// Returns the number of bytes needed to encode count chars of chars starting at index.
// Throws ArgumentNullException when chars is null and ArgumentOutOfRangeException when
// index/count are negative or do not fit inside the array.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    // Validate input parameters
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (index < 0 || count < 0)
    {
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (chars.Length - index < count)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // If no input, return 0, avoid fixed empty array problem
    if (count == 0)
    {
        return 0;
    }

    // Just call the pointer version (no encoder state)
    fixed (char* pChars = chars)
    {
        return GetByteCount(pChars + index, count, null);
    }
}
111 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
112 // So if you fix this, fix the others. Currently those include:
113 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
114 // parent method is safe
// Returns the number of bytes needed to encode the whole string s.
// Throws ArgumentNullException when s is null.
public override unsafe int GetByteCount(string s)
{
    // Validate input
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // Pin the string and count through the pointer version (no encoder state)
    fixed (char* pChars = s)
    {
        return GetByteCount(pChars, s.Length, null);
    }
}
126 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
127 // So if you fix this, fix the others. Currently those include:
128 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: returns the number of bytes needed to encode count chars at chars.
// Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
// for a negative count.
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    // Validate Parameters
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // Call it with empty encoder
    return GetByteCount(chars, count, null);
}
144 // Parent method is safe.
145 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
146 // So if you fix this, fix the others. Currently those include:
147 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars of s, starting at charIndex, into bytes starting at byteIndex.
// All of the space from byteIndex to the end of bytes is offered as the output buffer.
// Returns whatever the pointer-based worker returns.
public override unsafe int GetBytes(string s, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex)
{
    // Null checks: s is reported before bytes.
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
    }

    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    // Negative checks: charIndex is reported before charCount.
    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (s.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    int byteCount = bytes.Length - byteIndex;

    fixed (char* pChars = s)
    {
        // Going through the span reference pins the array's data reference directly.
        fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
        {
            return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
        }
    }
}
170 // Encodes a range of characters in a character array into a range of bytes
171 // in a byte array. An exception occurs if the byte array is not large
172 // enough to hold the complete encoding of the characters. The
173 // GetByteCount method can be used to determine the exact number of
174 // bytes that will be produced for a given range of characters.
175 // Alternatively, the GetMaxByteCount method can be used to
176 // determine the maximum number of bytes that will be produced for a given
177 // number of characters, regardless of the actual character values.
179 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
180 // So if you fix this, fix the others. Currently those include:
181 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
182 // parent method is safe
// Encodes charCount chars of chars, starting at charIndex, into bytes starting at byteIndex.
// Throws ArgumentNullException for null arrays and ArgumentOutOfRangeException for
// indexes/counts that are negative or fall outside their arrays.
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex)
{
    // Validate parameters
    if (chars == null || bytes == null)
    {
        throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)), SR.ArgumentNull_Array);
    }

    if (charIndex < 0 || charCount < 0)
    {
        throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (chars.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    // If nothing to encode return 0, avoid fixed problem
    if (charCount == 0)
    {
        return 0;
    }

    // Just call pointer version
    int byteCount = bytes.Length - byteIndex;

    fixed (char* pChars = chars)
    {
        fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
        {
            // byteCount is the room left after byteIndex, not the size of the whole array.
            return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
        }
    }
}
212 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
213 // So if you fix this, fix the others. Currently those include:
214 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: encodes charCount chars at chars into at most byteCount bytes at bytes.
// Rejects null pointers and negative counts, then defers to the worker with no encoder.
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    // Null checks: bytes is reported before chars.
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // Negative checks: charCount is reported before byteCount.
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    return GetBytes(chars, charCount, bytes, byteCount, null);
}
229 // Returns the number of characters produced by decoding a range of bytes in a byte array.
232 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
233 // So if you fix this, fix the others. Currently those include:
234 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
235 // parent method is safe
// Returns the number of chars produced by decoding count bytes of bytes starting at index.
// Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when
// index/count are negative or do not fit inside the array.
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    // Validate Parameters
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (index < 0 || count < 0)
    {
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (bytes.Length - index < count)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // If no input just return 0, fixed doesn't like 0 length arrays
    if (count == 0)
    {
        return 0;
    }

    // Just call pointer version
    fixed (byte* pBytes = bytes)
    {
        return GetCharCount(pBytes + index, count, null);
    }
}
258 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
259 // So if you fix this, fix the others. Currently those include:
260 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: returns the number of chars produced by decoding count bytes at bytes.
// Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
// for a negative count.
[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    // Validate Parameters
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // No decoder state
    return GetCharCount(bytes, count, null);
}
275 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
276 // So if you fix this, fix the others. Currently those include:
277 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
278 // parent method is safe
// Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at charIndex.
// Throws ArgumentNullException for null arrays and ArgumentOutOfRangeException for
// indexes/counts that are negative or fall outside their arrays.
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                    char[] chars, int charIndex)
{
    // Validate Parameters
    if (bytes == null || chars == null)
    {
        throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);
    }

    if (byteIndex < 0 || byteCount < 0)
    {
        throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (bytes.Length - byteIndex < byteCount)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (charIndex < 0 || charIndex > chars.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
    }

    // If no input, return 0 & avoid fixed problem
    if (byteCount == 0)
    {
        return 0;
    }

    // Just call pointer version
    int charCount = chars.Length - charIndex;

    fixed (byte* pBytes = bytes)
    {
        fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
        {
            // charCount is the room left after charIndex, not the size of the whole array.
            return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
        }
    }
}
308 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
309 // So if you fix this, fix the others. Currently those include:
310 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Pointer overload: decodes byteCount bytes at bytes into at most charCount chars at chars.
// Rejects null pointers and negative counts, then defers to the worker with no decoder.
[CLSCompliant(false)]
public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    // Null checks: bytes is reported before chars.
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    // Negative checks: charCount is reported before byteCount.
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    return GetChars(bytes, byteCount, chars, charCount, null);
}
325 // Returns a string containing the decoded representation of a range of
326 // bytes in a byte array.
328 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
329 // So if you fix this, fix the others. Currently those include:
330 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
331 // parent method is safe
// Returns a string containing the decoded representation of count bytes of bytes,
// starting at index. Throws ArgumentNullException when bytes is null and
// ArgumentOutOfRangeException when index/count are negative or do not fit inside the array.
public override unsafe string GetString(byte[] bytes, int index, int count)
{
    // Validate Parameters
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (index < 0 || count < 0)
    {
        throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (bytes.Length - index < count)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Avoid problems with empty input buffer
    if (count == 0)
    {
        return string.Empty;
    }

    fixed (byte* pBytes = bytes)
    {
        return string.CreateStringFromEncoding(pBytes + index, count, this);
    }
}
354 // End of standard methods copied from EncodingNLS.cs
// Internal worker behind the public GetByteCount overloads in this file, which call it
// with a null encoder. It starts from an estimate of 2 bytes per char (count << 1) and
// then walks the input, routing unpaired surrogates through an EncoderFallbackBuffer.
//   chars   - first char to count.
//   count   - number of chars to examine.
//   encoder - optional encoder state; when used, its _charLeftOver and fallback buffer are read.
// NOTE(review): this listing appears to be missing short lines (braces, short statements
// such as the conditions guarding some throws, and the final return). Confirm control flow
// against the canonical file before changing any logic here.
356 internal sealed override unsafe int GetByteCount(char* chars
, int count
, EncoderNLS
? encoder
)
358 Debug
.Assert(chars
!= null, "[UnicodeEncoding.GetByteCount]chars!=null");
359 Debug
.Assert(count
>= 0, "[UnicodeEncoding.GetByteCount]count >=0");
361 // Start by assuming each char gets 2 bytes
362 int byteCount
= count
<< 1;
364 // Check for overflow in byteCount
365 // (If they were all invalid chars, this would actually be wrong,
366 // but that's a ridiculously large # so we're not concerned about that case)
// NOTE(review): the condition guarding this throw is not visible in this listing.
368 throw new ArgumentOutOfRangeException(nameof(count
), SR
.ArgumentOutOfRange_GetByteCountOverflow
);
370 char* charStart
= chars
;
371 char* charEnd
= chars
+ count
;
372 char charLeftOver
= (char)0;
374 bool wasHereBefore
= false;
376 // For fallback we may need a fallback buffer
377 EncoderFallbackBuffer
? fallbackBuffer
= null;
378 char* charsForFallback
;
// Pick up a pending char left over in the encoder, if any.
382 charLeftOver
= encoder
._charLeftOver
;
384 // Assume extra bytes to encode charLeftOver if it existed
385 if (charLeftOver
> 0)
388 // We mustn't have left over fallback data when counting
389 if (encoder
.InternalHasFallbackBuffer
)
391 fallbackBuffer
= encoder
.FallbackBuffer
;
392 if (fallbackBuffer
.Remaining
> 0)
393 throw new ArgumentException(SR
.Format(SR
.Argument_EncoderFallbackNotEmpty
, this.EncodingName
, encoder
.Fallback
?.GetType()));
395 // Set our internal fallback interesting things.
396 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
// Main loop: take the next char from the fallback buffer when one exists and yields a
// non-zero char; otherwise keep going while input remains (chars < charEnd).
403 while (((ch
= (fallbackBuffer
== null) ? (char)0 : fallbackBuffer
.InternalGetNextChar()) != 0) || chars
< charEnd
)
405 // First unwind any fallback
408 // No fallback, maybe we can do it fast
410 // If endianness is backwards then each pair of bytes would be backwards.
// NOTE(review): both a 64-bit (& 7) and a 32-bit (& 3) alignment test appear below;
// presumably they were alternatives under a preprocessor conditional that is not visible here.
411 if ( (bigEndian ^ BitConverter
.IsLittleEndian
) &&
413 (unchecked((long)chars
) & 7) == 0 &&
415 (unchecked((int)chars
) & 3) == 0 &&
419 // Need -1 to check 2 at a time. If we have an even #, longChars will go
420 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
421 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
// NOTE(review): the code below backs off 3 chars and reads 4 chars (one ulong) per step,
// so the "-1 / 2 at a time" wording above looks stale.
422 ulong* longEnd
= (ulong*)(charEnd
- 3);
424 // Need new char* so we can check 4 at a time
425 ulong* longChars
= (ulong*)chars
;
427 while (longChars
< longEnd
)
429 // See if we potentially have surrogates (0x8000 bit set)
430 // (We're either big endian on a big endian machine or little endian on
431 // a little endian machine so that'll work)
432 if ((0x8000800080008000 & *longChars
) != 0)
434 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
435 // 5 bits looks like 11011, then it's a high or low surrogate.
436 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
437 // Note that we expect BMP characters to be more common than surrogates
438 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
439 ulong uTemp
= (0xf800f800f800f800 & *longChars
) ^
0xd800d800d800d800;
441 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
442 // but no clue if they're high or low.
443 // If each of the 4 characters are non-zero, then none are surrogates.
444 if ((uTemp
& 0xFFFF000000000000) == 0 ||
445 (uTemp
& 0x0000FFFF00000000) == 0 ||
446 (uTemp
& 0x00000000FFFF0000) == 0 ||
447 (uTemp
& 0x000000000000FFFF) == 0)
449 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
450 // or if there's 1 or 4 surrogates
452 // If they happen to be high/low/high/low, we may as well continue. Check the next
453 // bit to see if it's set (low) or not (high) in the right pattern
454 if ((0xfc00fc00fc00fc00 & *longChars
) !=
455 (BitConverter
.IsLittleEndian
? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
457 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
458 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
460 // Drop out to the slow loop to resolve the surrogates
463 // else they are all surrogates in High/Low/High/Low order, so we can use them.
465 // else none are surrogates, so we can use them.
467 // else all < 0x8000 so we can use them
469 // We already counted these four chars, go to next long.
// Resume the char-at-a-time loop from wherever the fast loop stopped.
473 chars
= (char*)longChars
;
475 if (chars
>= charEnd
)
480 // No fallback, just get next char
486 // We weren't preallocating fallback space.
490 // Check for high or low surrogates
491 if (ch
>= 0xd800 && ch
<= 0xdfff)
493 // Was it a high surrogate?
496 // It's a high surrogate, if we already had a high surrogate do its fallback
497 if (charLeftOver
> 0)
499 // Unwind the current character, this should be safe because we
500 // don't have leftover data in the fallback, so chars must have advanced already.
502 Debug
.Assert(chars
> charStart
,
503 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
506 // If previous high surrogate deallocate 2 bytes
509 // Fallback the previous surrogate
510 // Need to initialize fallback buffer?
511 if (fallbackBuffer
== null)
// NOTE(review): two alternative sources for the buffer follow (this.encoderFallback vs.
// encoder.FallbackBuffer); the condition choosing between them is not visible here.
514 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
516 fallbackBuffer
= encoder
.FallbackBuffer
;
518 // Set our internal fallback interesting things.
519 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
522 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be enregistered
523 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
524 chars
= charsForFallback
;
526 // Now no high surrogate left over
527 charLeftOver
= (char)0;
531 // Remember this high surrogate
537 // It's a low surrogate
538 if (charLeftOver
== 0)
540 // Expected a previous high surrogate.
541 // Don't count this one (we'll count its fallback if necessary)
545 // Need to initialize fallback buffer?
546 if (fallbackBuffer
== null)
549 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
551 fallbackBuffer
= encoder
.FallbackBuffer
;
553 // Set our internal fallback interesting things.
554 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
556 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
557 fallbackBuffer
.InternalFallback(ch
, ref charsForFallback
);
558 chars
= charsForFallback
;
562 // Valid surrogate pair, add our charLeftOver
563 charLeftOver
= (char)0;
566 else if (charLeftOver
> 0)
568 // Expected a low surrogate, but this char is normal
570 // Rewind the current character, fallback previous character.
571 // this should be safe because we don't have leftover data in the
572 // fallback, so chars must have advanced already.
573 Debug
.Assert(chars
> charStart
,
574 "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
577 // fallback previous chars
578 // Need to initialize fallback buffer?
579 if (fallbackBuffer
== null)
582 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
584 fallbackBuffer
= encoder
.FallbackBuffer
;
586 // Set our internal fallback interesting things.
587 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
589 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
590 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
591 chars
= charsForFallback
;
593 // Ignore charLeftOver or throw
595 charLeftOver
= (char)0;
600 // Ok we had something to add (already counted)
603 // Don't allocate space for left over char
604 if (charLeftOver
> 0)
608 // If we have to flush, stick it in fallback and try again
609 if (encoder
== null || encoder
.MustFlush
)
// NOTE(review): wasHereBefore is set to true further down in this branch; presumably it
// guards the throw below against a second pass — the guarding condition is not visible.
613 // Throw it, using our complete character
614 throw new ArgumentException(
615 SR
.Format(SR
.Argument_RecursiveFallback
, charLeftOver
), nameof(chars
));
619 // Need to initialize fallback buffer?
620 if (fallbackBuffer
== null)
623 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
625 fallbackBuffer
= encoder
.FallbackBuffer
;
627 // Set our internal fallback interesting things.
628 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
630 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
631 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
632 chars
= charsForFallback
;
633 charLeftOver
= (char)0;
634 wasHereBefore
= true;
640 // Shouldn't have anything in fallback buffer for GetByteCount
641 // (don't have to check _throwOnOverflow for count)
642 Debug
.Assert(fallbackBuffer
== null || fallbackBuffer
.Remaining
== 0,
643 "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");
645 // Don't remember fallbackBuffer.encoder for counting
// Internal worker behind the public GetBytes overloads in this file, which call it with
// a null encoder. Encodes chars from [chars, chars + charCount) into
// [bytes, bytes + byteCount) and returns the number of bytes written (bytes - byteStart).
//   encoder - optional encoder state; when used, its _charLeftOver is read on entry and
//             _charLeftOver / _charsUsed are written before returning.
// Unpaired surrogates go through an EncoderFallbackBuffer; running out of output room
// calls ThrowBytesOverflow and, if that does not throw, stops early.
// NOTE(review): this listing appears to be missing short lines (braces, short statements,
// some conditions and preprocessor lines). Confirm control flow against the canonical
// file before changing any logic here.
649 internal sealed override unsafe int GetBytes(
650 char* chars
, int charCount
, byte* bytes
, int byteCount
, EncoderNLS
? encoder
)
652 Debug
.Assert(chars
!= null, "[UnicodeEncoding.GetBytes]chars!=null");
653 Debug
.Assert(byteCount
>= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
654 Debug
.Assert(charCount
>= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
655 Debug
.Assert(bytes
!= null, "[UnicodeEncoding.GetBytes]bytes!=null");
657 char charLeftOver
= (char)0;
659 bool wasHereBefore
= false;
// Remember both ends of each buffer: the starts are used for the return value and
// _charsUsed, the ends for bounds checks.
662 byte* byteEnd
= bytes
+ byteCount
;
663 char* charEnd
= chars
+ charCount
;
664 byte* byteStart
= bytes
;
665 char* charStart
= chars
;
667 // For fallback we may need a fallback buffer
668 EncoderFallbackBuffer
? fallbackBuffer
= null;
669 char* charsForFallback
;
671 // Get our encoder, but don't clear it yet.
674 charLeftOver
= encoder
._charLeftOver
;
676 // We mustn't have left over fallback data when counting
677 if (encoder
.InternalHasFallbackBuffer
)
679 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
680 fallbackBuffer
= encoder
.FallbackBuffer
;
681 if (fallbackBuffer
.Remaining
> 0 && encoder
._throwOnOverflow
)
682 throw new ArgumentException(SR
.Format(SR
.Argument_EncoderFallbackNotEmpty
, this.EncodingName
, encoder
.Fallback
?.GetType()));
684 // Set our internal fallback interesting things.
685 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, false);
// Main loop: take the next char from the fallback buffer when one exists and yields a
// non-zero char. NOTE(review): the rest of the loop condition is not visible here.
690 while (((ch
= (fallbackBuffer
== null) ?
691 (char)0 : fallbackBuffer
.InternalGetNextChar()) != 0) ||
694 // First unwind any fallback
697 // No fallback, maybe we can do it fast
699 // If endianness is backwards then each pair of bytes would be backwards.
// NOTE(review): both a 64-bit (& 7) and a 32-bit (& 3) alignment test appear below;
// presumably they were alternatives under a preprocessor conditional that is not visible here.
700 if ( (bigEndian ^ BitConverter
.IsLittleEndian
) &&
702 (unchecked((long)chars
) & 7) == 0 &&
704 (unchecked((int)chars
) & 3) == 0 &&
708 // Need -1 to check 2 at a time. If we have an even #, longChars will go
709 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
710 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
711 // We can only go iCount units (limited by shorter of char or byte buffers.
// The limit below is min(remaining output bytes / 2, remaining input chars), backed off by 3 chars.
712 ulong* longEnd
= (ulong*)(chars
- 3 +
713 (((byteEnd
- bytes
) >> 1 < charEnd
- chars
) ?
714 (byteEnd
- bytes
) >> 1 : charEnd
- chars
));
716 // Need new char* so we can check 4 at a time
717 ulong* longChars
= (ulong*)chars
;
718 ulong* longBytes
= (ulong*)bytes
;
720 while (longChars
< longEnd
)
722 // See if we potentially have surrogates (0x8000 bit set)
723 // (We're either big endian on a big endian machine or little endian on
724 // a little endian machine so that'll work)
725 if ((0x8000800080008000 & *longChars
) != 0)
727 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
728 // 5 bits looks like 11011, then it's a high or low surrogate.
729 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
730 // Note that we expect BMP characters to be more common than surrogates
731 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
732 ulong uTemp
= (0xf800f800f800f800 & *longChars
) ^
0xd800d800d800d800;
734 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
735 // but no clue if they're high or low.
736 // If each of the 4 characters are non-zero, then none are surrogates.
737 if ((uTemp
& 0xFFFF000000000000) == 0 ||
738 (uTemp
& 0x0000FFFF00000000) == 0 ||
739 (uTemp
& 0x00000000FFFF0000) == 0 ||
740 (uTemp
& 0x000000000000FFFF) == 0)
742 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
743 // or if there's 1 or 4 surrogates
745 // If they happen to be high/low/high/low, we may as well continue. Check the next
746 // bit to see if it's set (low) or not (high) in the right pattern
747 if ((0xfc00fc00fc00fc00 & *longChars
) !=
748 (BitConverter
.IsLittleEndian
? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
750 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
751 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
753 // Drop out to the slow loop to resolve the surrogates
756 // else they are all surrogates in High/Low/High/Low order, so we can use them.
758 // else none are surrogates, so we can use them.
760 // else all < 0x8000 so we can use them
762 // We can use these 4 chars.
// The source pointer passed the alignment test above; the destination may be unaligned,
// hence the unaligned write.
763 Unsafe
.WriteUnaligned
<ulong>(longBytes
, *longChars
);
// Resume the char-at-a-time loop from wherever the fast loop stopped.
768 chars
= (char*)longChars
;
769 bytes
= (byte*)longBytes
;
771 if (chars
>= charEnd
)
776 // No fallback, just get next char
781 // Check for high or low surrogates
782 if (ch
>= 0xd800 && ch
<= 0xdfff)
784 // Was it a high surrogate?
787 // It's a high surrogate, see if we already had a high surrogate
788 if (charLeftOver
> 0)
790 // Unwind the current character, this should be safe because we
791 // don't have leftover data in the fallback, so chars must have advanced already.
793 Debug
.Assert(chars
> charStart
,
794 "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
797 // Fallback the previous surrogate
798 // Might need to create our fallback buffer
799 if (fallbackBuffer
== null)
// NOTE(review): two alternative sources for the buffer follow (this.encoderFallback vs.
// encoder.FallbackBuffer); the condition choosing between them is not visible here.
802 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
804 fallbackBuffer
= encoder
.FallbackBuffer
;
806 // Set our internal fallback interesting things.
807 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, true);
810 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
811 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
812 chars
= charsForFallback
;
814 charLeftOver
= (char)0;
818 // Remember this high surrogate
823 // It's a low surrogate
824 if (charLeftOver
== 0)
826 // We'll fall back this one
827 // Might need to create our fallback buffer
828 if (fallbackBuffer
== null)
831 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
833 fallbackBuffer
= encoder
.FallbackBuffer
;
835 // Set our internal fallback interesting things.
836 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, true);
839 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
840 fallbackBuffer
.InternalFallback(ch
, ref charsForFallback
);
841 chars
= charsForFallback
;
845 // Valid surrogate pair, add our charLeftOver
// A pair needs 4 bytes; bytes + 3 >= byteEnd means fewer than 4 remain.
846 if (bytes
+ 3 >= byteEnd
)
848 // Not enough room to add this surrogate pair
849 if (fallbackBuffer
!= null && fallbackBuffer
.bFallingBack
)
851 // These must have both been from the fallbacks.
852 // Both of these MUST have been from a fallback because if the 1st wasn't
853 // from a fallback, then a high surrogate followed by an illegal char
854 // would've caused the high surrogate to fall back. If a high surrogate
855 // fell back, then it was consumed and both chars came from the fallback.
856 fallbackBuffer
.MovePrevious(); // Didn't use either fallback surrogate
857 fallbackBuffer
.MovePrevious();
861 // If we don't have enough room, then either we should've advanced a while
862 // or we should have bytes==byteStart and throw below
863 Debug
.Assert(chars
> charStart
+ 1 || bytes
== byteStart
,
864 "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
865 chars
-= 2; // Didn't use either surrogate
867 ThrowBytesOverflow(encoder
, bytes
== byteStart
); // Throw maybe (if no bytes written)
868 charLeftOver
= (char)0; // we'll retry it later
869 break; // Didn't throw, but stop 'til next time.
// Two write orders follow for the high surrogate: high byte first, then low byte first.
// NOTE(review): the selecting condition is not visible here — presumably bigEndian; confirm.
874 *(bytes
++) = (byte)(charLeftOver
>> 8);
875 *(bytes
++) = (byte)charLeftOver
;
879 *(bytes
++) = (byte)charLeftOver
;
880 *(bytes
++) = (byte)(charLeftOver
>> 8);
883 charLeftOver
= (char)0;
885 else if (charLeftOver
> 0)
887 // Expected a low surrogate, but this char is normal
889 // Rewind the current character, fallback previous character.
890 // this should be safe because we don't have leftover data in the
891 // fallback, so chars must have advanced already.
892 Debug
.Assert(chars
> charStart
,
893 "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
896 // fallback previous chars
897 // Might need to create our fallback buffer
898 if (fallbackBuffer
== null)
901 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
903 fallbackBuffer
= encoder
.FallbackBuffer
;
905 // Set our internal fallback interesting things.
906 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, true);
909 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
910 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
911 chars
= charsForFallback
;
913 // Ignore charLeftOver or throw
914 charLeftOver
= (char)0;
918 // Ok, we have a char to add
// One char needs 2 bytes; bytes + 1 >= byteEnd means fewer than 2 remain.
919 if (bytes
+ 1 >= byteEnd
)
921 // Couldn't add this char
922 if (fallbackBuffer
!= null && fallbackBuffer
.bFallingBack
)
923 fallbackBuffer
.MovePrevious(); // Not using this fallback char
926 // Lonely charLeftOver (from previous call) would've been caught up above,
927 // so this must be a case where we've already read an input char.
928 Debug
.Assert(chars
> charStart
,
929 "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
930 chars
--; // Not using this char
932 ThrowBytesOverflow(encoder
, bytes
== byteStart
); // Throw maybe (if no bytes written)
933 break; // didn't throw, just stop
// Two write orders follow for ch: high byte first, then low byte first.
// NOTE(review): the selecting condition is not visible here — presumably bigEndian; confirm.
938 *(bytes
++) = (byte)(ch
>> 8);
939 *(bytes
++) = (byte)ch
;
943 *(bytes
++) = (byte)ch
;
944 *(bytes
++) = (byte)(ch
>> 8);
948 // Don't allocate space for left over char
949 if (charLeftOver
> 0)
951 // If we aren't flushing we need to fall this back
// NOTE(review): the condition below is true when there is no encoder or the encoder must
// flush, so the comment above reads inverted relative to the code.
952 if (encoder
== null || encoder
.MustFlush
)
956 // Throw it, using our complete character
957 throw new ArgumentException(
958 SR
.Format(SR
.Argument_RecursiveFallback
, charLeftOver
), nameof(chars
));
962 // If we have to flush, stick it in fallback and try again
963 // Might need to create our fallback buffer
964 if (fallbackBuffer
== null)
967 fallbackBuffer
= this.encoderFallback
.CreateFallbackBuffer();
969 fallbackBuffer
= encoder
.FallbackBuffer
;
971 // Set our internal fallback interesting things.
972 fallbackBuffer
.InternalInitialize(charStart
, charEnd
, encoder
, true);
975 // If we're not flushing, that'll remember the left over character.
976 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
977 fallbackBuffer
.InternalFallback(charLeftOver
, ref charsForFallback
);
978 chars
= charsForFallback
;
980 charLeftOver
= (char)0;
981 wasHereBefore
= true;
987 // Not flushing, remember it in the encoder
// Persist the pending char and the number of input chars consumed on the encoder.
990 encoder
._charLeftOver
= charLeftOver
;
991 encoder
._charsUsed
= (int)(chars
- charStart
);
994 // Remember charLeftOver if we must, or clear it if we're flushing
995 // (charLeftOver should be 0 if we're flushing)
996 Debug
.Assert((encoder
!= null && !encoder
.MustFlush
) || charLeftOver
== (char)0,
997 "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");
999 Debug
.Assert(fallbackBuffer
== null || fallbackBuffer
.Remaining
== 0 ||
1000 encoder
== null || !encoder
._throwOnOverflow
,
1001 "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");
// Number of bytes actually written to the output buffer.
1003 return (int)(bytes
- byteStart
);
// Counting pass for decoding: works out how many chars the `count` bytes at `bytes` decode to.
// The running total starts at count >> 1 and is adjusted by the value InternalFallback returns
// each time an unpaired surrogate or a dangling odd byte is handed to the fallback buffer.
//   bytes:       start of the input (asserted non-null).
//   count:       number of input bytes (asserted >= 0).
//   baseDecoder: optional decoder; when non-null its lastByte/lastChar are read as state left
//                over from an earlier call.
// NOTE(review): this listing omits some lines (braces, several conditions and assignments, the
// final return), so the branch structure described by the added comments is inferred from the
// surviving comments - confirm against the full file.
1006 internal sealed override unsafe int GetCharCount(byte* bytes
, int count
, DecoderNLS
? baseDecoder
)
1008 Debug
.Assert(bytes
!= null, "[UnicodeEncoding.GetCharCount]bytes!=null");
1009 Debug
.Assert(count
>= 0, "[UnicodeEncoding.GetCharCount]count >=0");
1011 UnicodeEncoding
.Decoder
? decoder
= (UnicodeEncoding
.Decoder
?)baseDecoder
;
1013 byte* byteEnd
= bytes
+ count
;
1014 byte* byteStart
= bytes
;
1018 char lastChar
= (char)0;
1020 // Start by assuming same # of chars as bytes
1021 int charCount
= count
>> 1;
1023 // For fallback we may need a fallback buffer
1024 DecoderFallbackBuffer
? fallbackBuffer
= null;
// Pick up the pending byte / pending char that a previous call stored in the decoder.
1026 if (decoder
!= null)
1028 lastByte
= decoder
.lastByte
;
1029 lastChar
= decoder
.lastChar
;
1031 // Assume extra char if last char was around
1035 // Assume extra char if extra last byte makes up odd # of input bytes
1036 if (lastByte
>= 0 && (count
& 1) == 1)
1041 // Shouldn't have anything in fallback buffer for GetCharCount
1042 // (don't have to check _throwOnOverflow for count)
1043 Debug
.Assert(!decoder
.InternalHasFallbackBuffer
|| decoder
.FallbackBuffer
.Remaining
== 0,
1044 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
1047 while (bytes
< byteEnd
)
1049 // If we're aligned then maybe we can do it fast
1050 // That'll hurt if we're unaligned because we'll always test but never be aligned
// The fast path requires the encoding's byte order to match the machine's
// (bigEndian ^ IsLittleEndian) and no pending byte or pending high surrogate.
// NOTE(review): two alignment tests (& 7 and & 3) appear back to back here; presumably they
// sit in alternative conditional-compilation branches that this listing does not show.
1052 if ((bigEndian ^ BitConverter
.IsLittleEndian
) &&
1054 (unchecked((long)bytes
) & 7) == 0 &&
1056 (unchecked((int)bytes
) & 3) == 0 &&
1058 lastByte
== -1 && lastChar
== 0)
1060 // Need -1 to check 2 at a time. If we have an even #, longBytes will go
1061 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes
1062 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1063 ulong* longEnd
= (ulong*)(byteEnd
- 7);
1065 // Need new char* so we can check 4 at a time
1066 ulong* longBytes
= (ulong*)bytes
;
1068 while (longBytes
< longEnd
)
1070 // See if we potentially have surrogates (0x8000 bit set)
1071 // (We're either big endian on a big endian machine or little endian on
1072 // a little endian machine so that'll work)
1073 if ((0x8000800080008000 & *longBytes
) != 0)
1075 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1076 // 5 bits looks like 11011, then its a high or low surrogate.
1077 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1078 // Note that we expect BMP characters to be more common than surrogates
1079 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1080 ulong uTemp
= (0xf800f800f800f800 & *longBytes
) ^
0xd800d800d800d800;
1082 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1083 // but no clue if they're high or low.
1084 // If each of the 4 characters are non-zero, then none are surrogates.
1085 if ((uTemp
& 0xFFFF000000000000) == 0 ||
1086 (uTemp
& 0x0000FFFF00000000) == 0 ||
1087 (uTemp
& 0x00000000FFFF0000) == 0 ||
1088 (uTemp
& 0x000000000000FFFF) == 0)
1090 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1091 // or if there's 1 or 4 surrogates
1093 // If they happen to be high/low/high/low, we may as well continue. Check the next
1094 // bit to see if its set (low) or not (high) in the right pattern
1095 if ((0xfc00fc00fc00fc00 & *longBytes
) !=
1096 (BitConverter
.IsLittleEndian
? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
1098 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1099 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1101 // Drop out to the slow loop to resolve the surrogates
1104 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1106 // else none are surrogates, so we can use them.
1108 // else all < 0x8000 so we can use them
1110 // We can use these 4 chars.
// Leaving the 8-byte scan: continue byte-wise from wherever the scan stopped.
1114 bytes
= (byte*)longBytes
;
1116 if (bytes
>= byteEnd
)
1124 lastByte
= *bytes
++;
1125 if (bytes
>= byteEnd
) break;
// Combine the byte pair into one char. The first form uses lastByte as the high byte, the
// second as the low byte (presumably the big-endian and little-endian branches respectively).
1132 ch
= (char)(lastByte
<< 8 | *(bytes
++));
1136 ch
= (char)(*(bytes
++) << 8 | lastByte
);
1140 // See if the char's valid
1141 if (ch
>= 0xd800 && ch
<= 0xdfff)
1143 // Was it a high surrogate?
1146 // Its a high surrogate, if we had one then do fallback for previous one
1149 // Ignore previous bad high surrogate
1152 // Get fallback for previous high surrogate
1153 // Note we have to reconstruct bytes because some may have been in decoder
1154 byte[]? byteBuffer
= null;
1157 byteBuffer
= new byte[]
1158 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1162 byteBuffer
= new byte[]
1163 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1166 if (fallbackBuffer
== null)
1168 if (decoder
== null)
1169 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1171 fallbackBuffer
= decoder
.FallbackBuffer
;
1173 // Set our internal fallback interesting things.
1174 fallbackBuffer
.InternalInitialize(byteStart
, null);
1178 charCount
+= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
);
1181 // Ignore the last one which fell back already,
1182 // and remember the new high surrogate
1187 // Its a low surrogate
1190 // Expected a previous high surrogate
1193 // Get fallback for this low surrogate
1194 // Note we have to reconstruct bytes because some may have been in decoder
1195 byte[]? byteBuffer
= null;
1198 byteBuffer
= new byte[]
1199 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }
;
1203 byteBuffer
= new byte[]
1204 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) }
;
1207 if (fallbackBuffer
== null)
1209 if (decoder
== null)
1210 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1212 fallbackBuffer
= decoder
.FallbackBuffer
;
1214 // Set our internal fallback interesting things.
1215 fallbackBuffer
.InternalInitialize(byteStart
, null);
1218 charCount
+= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
);
1220 // Ignore this one (we already did its fallback)
1224 // Valid surrogate pair, already counted.
1227 else if (lastChar
> 0)
1229 // Had a high surrogate, expected a low surrogate
1230 // Un-count the last high surrogate
1233 // fall back the high surrogate.
1234 byte[]? byteBuffer
= null;
1237 byteBuffer
= new byte[]
1238 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1242 byteBuffer
= new byte[]
1243 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1246 if (fallbackBuffer
== null)
1248 if (decoder
== null)
1249 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1251 fallbackBuffer
= decoder
.FallbackBuffer
;
1253 // Set our internal fallback interesting things.
1254 fallbackBuffer
.InternalInitialize(byteStart
, null);
1257 // Already subtracted high surrogate
1258 charCount
+= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
);
1260 // Not left over now, clear previous high surrogate and continue to add current char
1264 // Valid char, already counted
1267 // Extra space if we can't use decoder
// End of input: with no decoder to carry state forward, or a decoder that must flush,
// any leftover high surrogate or odd byte is passed to the fallback below.
1268 if (decoder
== null || decoder
.MustFlush
)
1272 // No hanging high surrogates allowed, do fallback and remove count for it
1274 byte[]? byteBuffer
= null;
1277 byteBuffer
= new byte[]
1278 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1282 byteBuffer
= new byte[]
1283 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1286 if (fallbackBuffer
== null)
1288 if (decoder
== null)
1289 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1291 fallbackBuffer
= decoder
.FallbackBuffer
;
1293 // Set our internal fallback interesting things.
1294 fallbackBuffer
.InternalInitialize(byteStart
, null);
1297 charCount
+= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
);
1304 if (fallbackBuffer
== null)
1306 if (decoder
== null)
1307 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1309 fallbackBuffer
= decoder
.FallbackBuffer
;
1311 // Set our internal fallback interesting things.
1312 fallbackBuffer
.InternalInitialize(byteStart
, null);
1315 // No hanging odd bytes allowed if must flush
1316 charCount
+= fallbackBuffer
.InternalFallback(new byte[] { unchecked((byte)lastByte) }
, bytes
);
1321 // If we had a high surrogate left over, we can't count it
1325 // Shouldn't have anything in fallback buffer for GetCharCount
1326 // (don't have to check _throwOnOverflow for count)
1327 Debug
.Assert(fallbackBuffer
== null || fallbackBuffer
.Remaining
== 0,
1328 "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");
// Decodes byteCount bytes at `bytes` into at most charCount chars at `chars` and returns the
// number of chars written (chars - charStart).
//   bytes / byteCount: input buffer and its length (asserted non-null / >= 0).
//   chars / charCount: output buffer and its capacity (asserted non-null / >= 0).
//   baseDecoder:       optional decoder; when non-null its lastByte/lastChar seed the leftover
//                      state, and at the end it receives _bytesUsed, lastChar and lastByte.
// Whenever output space runs out, the input pointer is rewound over the unused bytes and
// ThrowCharsOverflow is called with (chars == charStart), i.e. whether nothing was written yet.
// NOTE(review): this listing omits some lines (braces, several conditions and assignments), so
// the branch structure described by the added comments is inferred from the surviving
// comments - confirm against the full file.
1333 internal sealed override unsafe int GetChars(
1334 byte* bytes
, int byteCount
, char* chars
, int charCount
, DecoderNLS
? baseDecoder
)
1336 Debug
.Assert(chars
!= null, "[UnicodeEncoding.GetChars]chars!=null");
1337 Debug
.Assert(byteCount
>= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
1338 Debug
.Assert(charCount
>= 0, "[UnicodeEncoding.GetChars]charCount >=0");
1339 Debug
.Assert(bytes
!= null, "[UnicodeEncoding.GetChars]bytes!=null");
1341 UnicodeEncoding
.Decoder
? decoder
= (UnicodeEncoding
.Decoder
?)baseDecoder
;
1345 char lastChar
= (char)0;
1347 // Get our decoder (but don't clear it yet)
1348 if (decoder
!= null)
1350 lastByte
= decoder
.lastByte
;
1351 lastChar
= decoder
.lastChar
;
1353 // Shouldn't have anything in fallback buffer for GetChars
1354 // (don't have to check _throwOnOverflow for chars)
1355 Debug
.Assert(!decoder
.InternalHasFallbackBuffer
|| decoder
.FallbackBuffer
.Remaining
== 0,
1356 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
1359 // For fallback we may need a fallback buffer
1360 DecoderFallbackBuffer
? fallbackBuffer
= null;
1361 char* charsForFallback
;
1363 byte* byteEnd
= bytes
+ byteCount
;
1364 char* charEnd
= chars
+ charCount
;
1365 byte* byteStart
= bytes
;
1366 char* charStart
= chars
;
1368 while (bytes
< byteEnd
)
1370 // If we're aligned then maybe we can do it fast
1371 // That'll hurt if we're unaligned because we'll always test but never be aligned
// The fast path requires the encoding's byte order to match the machine's
// (bigEndian ^ IsLittleEndian) and no pending byte or pending high surrogate.
// Here the alignment tests are applied to the output pointer `chars`, not to `bytes`.
// NOTE(review): two alignment tests (& 7 and & 3) appear back to back; presumably they sit
// in alternative conditional-compilation branches that this listing does not show.
1373 if ((bigEndian ^ BitConverter
.IsLittleEndian
) &&
1375 (unchecked((long)chars
) & 7) == 0 &&
1377 (unchecked((int)chars
) & 3) == 0 &&
1379 lastByte
== -1 && lastChar
== 0)
1381 // Need -1 to check 2 at a time. If we have an even #, longChars will go
1382 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
1383 // will go from longEnd - 1 long to longEnd. (Might not get to use this)
1384 // We can only go iCount units (limited by shorter of char or byte buffers.
// The span is the remaining byte count when the bytes yield fewer chars than the output
// has room for, otherwise the remaining char capacity converted to bytes (<< 1).
1385 ulong* longEnd
= (ulong*)(bytes
- 7 +
1386 (((byteEnd
- bytes
) >> 1 < charEnd
- chars
) ?
1387 (byteEnd
- bytes
) : (charEnd
- chars
) << 1));
1389 // Need new char* so we can check 4 at a time
1390 ulong* longBytes
= (ulong*)bytes
;
1391 ulong* longChars
= (ulong*)chars
;
1393 while (longBytes
< longEnd
)
1395 // See if we potentially have surrogates (0x8000 bit set)
1396 // (We're either big endian on a big endian machine or little endian on
1397 // a little endian machine so that'll work)
1398 if ((0x8000800080008000 & *longBytes
) != 0)
1400 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
1401 // 5 bits looks like 11011, then its a high or low surrogate.
1402 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
1403 // Note that we expect BMP characters to be more common than surrogates
1404 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
1405 ulong uTemp
= (0xf800f800f800f800 & *longBytes
) ^
0xd800d800d800d800;
1407 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
1408 // but no clue if they're high or low.
1409 // If each of the 4 characters are non-zero, then none are surrogates.
1410 if ((uTemp
& 0xFFFF000000000000) == 0 ||
1411 (uTemp
& 0x0000FFFF00000000) == 0 ||
1412 (uTemp
& 0x00000000FFFF0000) == 0 ||
1413 (uTemp
& 0x000000000000FFFF) == 0)
1415 // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
1416 // or if there's 1 or 4 surrogates
1418 // If they happen to be high/low/high/low, we may as well continue. Check the next
1419 // bit to see if its set (low) or not (high) in the right pattern
1420 if ((0xfc00fc00fc00fc00 & *longBytes
) !=
1421 (BitConverter
.IsLittleEndian
? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
1423 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
1424 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
1426 // Drop out to the slow loop to resolve the surrogates
1429 // else they are all surrogates in High/Low/High/Low order, so we can use them.
1431 // else none are surrogates, so we can use them.
1433 // else all < 0x8000 so we can use them
1435 // We can use these 4 chars.
// One 8-byte store copies the 4 chars; WriteUnaligned is used for the destination write.
1436 Unsafe
.WriteUnaligned
<ulong>(longChars
, *longBytes
);
// Leaving the 8-byte loop: carry both cursors back to the byte-wise / char-wise pointers.
1441 chars
= (char*)longChars
;
1442 bytes
= (byte*)longBytes
;
1444 if (bytes
>= byteEnd
)
1452 lastByte
= *bytes
++;
// Combine the byte pair into one char. The first form uses lastByte as the high byte, the
// second as the low byte (presumably the big-endian and little-endian branches respectively).
1460 ch
= (char)(lastByte
<< 8 | *(bytes
++));
1464 ch
= (char)(*(bytes
++) << 8 | lastByte
);
1468 // See if the char's valid
1469 if (ch
>= 0xd800 && ch
<= 0xdfff)
1471 // Was it a high surrogate?
1474 // Its a high surrogate, if we had one then do fallback for previous one
1477 // Get fallback for previous high surrogate
1478 // Note we have to reconstruct bytes because some may have been in decoder
1479 byte[]? byteBuffer
= null;
1482 byteBuffer
= new byte[]
1483 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1487 byteBuffer
= new byte[]
1488 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1491 if (fallbackBuffer
== null)
1493 if (decoder
== null)
1494 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1496 fallbackBuffer
= decoder
.FallbackBuffer
;
1498 // Set our internal fallback interesting things.
1499 fallbackBuffer
.InternalInitialize(byteStart
, charEnd
);
1502 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
1503 bool fallbackResult
= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
, ref charsForFallback
);
1504 chars
= charsForFallback
;
1506 if (!fallbackResult
)
1508 // couldn't fall back lonely surrogate
1509 // We either advanced bytes or chars should == charStart and throw below
1510 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1511 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
1512 bytes
-= 2; // didn't use these 2 bytes
1513 fallbackBuffer
.InternalReset();
1514 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1515 break; // couldn't fallback but didn't throw
1519 // Ignore the previous high surrogate which fell back already,
1520 // yet remember the current high surrogate for next time.
1525 // Its a low surrogate
1528 // Expected a previous high surrogate
1529 // Get fallback for this low surrogate
1530 // Note we have to reconstruct bytes because some may have been in decoder
1531 byte[]? byteBuffer
= null;
1534 byteBuffer
= new byte[]
1535 { unchecked((byte)(ch >> 8)), unchecked((byte)ch) }
;
1539 byteBuffer
= new byte[]
1540 { unchecked((byte)ch), unchecked((byte)(ch >> 8)) }
;
1543 if (fallbackBuffer
== null)
1545 if (decoder
== null)
1546 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1548 fallbackBuffer
= decoder
.FallbackBuffer
;
1550 // Set our internal fallback interesting things.
1551 fallbackBuffer
.InternalInitialize(byteStart
, charEnd
);
1554 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
1555 bool fallbackResult
= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
, ref charsForFallback
);
1556 chars
= charsForFallback
;
1558 if (!fallbackResult
)
1560 // couldn't fall back lonely surrogate
1561 // We either advanced bytes or chars should == charStart and throw below
1562 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1563 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
1564 bytes
-= 2; // didn't use these 2 bytes
1565 fallbackBuffer
.InternalReset();
1566 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1567 break; // couldn't fallback but didn't throw
1570 // Didn't throw, ignore this one (we already did its fallback)
1574 // Valid surrogate pair, add our lastChar (will need 2 chars)
1575 if (chars
>= charEnd
- 1)
1577 // couldn't find room for this surrogate pair
1578 // We either advanced bytes or chars should == charStart and throw below
1579 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1580 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
1581 bytes
-= 2; // didn't use these 2 bytes
1582 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1583 // Leave lastChar for next call to Convert()
1584 break; // couldn't fallback but didn't throw
1587 *chars
++ = lastChar
;
1590 else if (lastChar
> 0)
1592 // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
1593 byte[]? byteBuffer
= null;
1596 byteBuffer
= new byte[]
1597 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1601 byteBuffer
= new byte[]
1602 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1605 if (fallbackBuffer
== null)
1607 if (decoder
== null)
1608 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1610 fallbackBuffer
= decoder
.FallbackBuffer
;
1612 // Set our internal fallback interesting things.
1613 fallbackBuffer
.InternalInitialize(byteStart
, charEnd
);
1616 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
1617 bool fallbackResult
= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
, ref charsForFallback
);
1618 chars
= charsForFallback
;
1620 if (!fallbackResult
)
1622 // couldn't fall back high surrogate, or char that would be next
1623 // We either advanced bytes or chars should == charStart and throw below
1624 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1625 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
1626 bytes
-= 2; // didn't use these 2 bytes
1627 fallbackBuffer
.InternalReset();
1628 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1629 break; // couldn't fallback but didn't throw
1632 // Not left over now, clear previous high surrogate and continue to add current char
1636 // Valid char, room for it?
1637 if (chars
>= charEnd
)
1639 // 2 bytes couldn't fall back
1640 // We either advanced bytes or chars should == charStart and throw below
1641 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1642 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
1643 bytes
-= 2; // didn't use these bytes
1644 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1645 break; // couldn't fallback but didn't throw
1652 // Remember our decoder if we must
// End of input: with no decoder to carry state forward, or a decoder that must flush,
// any leftover high surrogate or odd byte is passed to the fallback below.
1653 if (decoder
== null || decoder
.MustFlush
)
1657 // No hanging high surrogates allowed, do fallback and remove count for it
1658 byte[]? byteBuffer
= null;
1661 byteBuffer
= new byte[]
1662 { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) }
;
1666 byteBuffer
= new byte[]
1667 { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) }
;
1670 if (fallbackBuffer
== null)
1672 if (decoder
== null)
1673 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1675 fallbackBuffer
= decoder
.FallbackBuffer
;
1677 // Set our internal fallback interesting things.
1678 fallbackBuffer
.InternalInitialize(byteStart
, charEnd
);
1681 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
1682 bool fallbackResult
= fallbackBuffer
.InternalFallback(byteBuffer
, bytes
, ref charsForFallback
);
1683 chars
= charsForFallback
;
1685 if (!fallbackResult
)
1687 // 2 bytes couldn't fall back
1688 // We either advanced bytes or chars should == charStart and throw below
1689 Debug
.Assert(bytes
>= byteStart
+ 2 || chars
== charStart
,
1690 "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
// Rewind over the surrogate's 2 bytes, plus 1 more for a pending odd byte.
// NOTE(review): the extra decrement is presumably guarded by a test on lastByte that this
// listing does not show - confirm.
1691 bytes
-= 2; // didn't use these bytes
1693 bytes
--; // had an extra last byte hanging around
1694 fallbackBuffer
.InternalReset();
1695 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1696 // We'll remember these in our decoder though
1703 // done with this one
1709 if (fallbackBuffer
== null)
1711 if (decoder
== null)
1712 fallbackBuffer
= this.decoderFallback
.CreateFallbackBuffer();
1714 fallbackBuffer
= decoder
.FallbackBuffer
;
1716 // Set our internal fallback interesting things.
1717 fallbackBuffer
.InternalInitialize(byteStart
, charEnd
);
1720 // No hanging odd bytes allowed if must flush
1721 charsForFallback
= chars
; // Avoid passing chars by reference to allow it to be en-registered
1722 bool fallbackResult
= fallbackBuffer
.InternalFallback(new byte[] { unchecked((byte)lastByte) }
, bytes
, ref charsForFallback
);
1723 chars
= charsForFallback
;
1725 if (!fallbackResult
)
1727 // odd byte couldn't fall back
1728 bytes
--; // didn't use this byte
1729 fallbackBuffer
.InternalReset();
1730 ThrowCharsOverflow(decoder
, chars
== charStart
);// Might throw, if no chars output
1731 // didn't throw, but we'll remember it in the decoder
1736 // Didn't fail, clear buffer
1743 // Remember our decoder if we must
// Publish how many input bytes were consumed and whatever state is still pending.
1744 if (decoder
!= null)
1746 Debug
.Assert((decoder
.MustFlush
== false) || ((lastChar
== (char)0) && (lastByte
== -1)),
1747 "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
1748 // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
1751 decoder
._bytesUsed
= (int)(bytes
- byteStart
);
1752 decoder
.lastChar
= lastChar
;
1753 decoder
.lastByte
= lastByte
;
1756 // Shouldn't have anything in fallback buffer for GetChars
1757 // (don't have to check _throwOnOverflow for count or chars)
1758 Debug
.Assert(fallbackBuffer
== null || fallbackBuffer
.Remaining
== 0,
1759 "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
1761 return (int)(chars
- charStart
);
// Hands out a stateful encoder for this encoding: an EncoderNLS bound to this instance.
public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);
// Hands out a stateful decoder for this encoding: the nested UnicodeEncoding.Decoder bound to
// this instance.
public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);
// Returns the preamble (byte order mark) bytes for this encoding: one of two newly allocated
// 2-byte arrays, FE FF or FF FE, or Array.Empty<byte>().
// NOTE(review): the conditions selecting between the three returns are not visible in this
// listing; presumably they test byteOrderMark and bigEndian the way the Preamble property
// does - confirm against the full file.
1777 public override byte[] GetPreamble()
1781 // Note - we must allocate new byte[]'s here to prevent someone
1782 // from modifying a cached byte[].
1784 return new byte[2] { 0xfe, 0xff }
;
1786 return new byte[2] { 0xff, 0xfe }
;
1788 return Array
.Empty
<byte>();
// Span view of the preamble (byte order mark) for this encoding.
//  - A type other than UnicodeEncoding itself may have overridden GetPreamble, so its result
//    is wrapped instead of using the constants below.
//  - Without a byte order mark the span is empty (default).
//  - Otherwise the span holds FE FF for big endian and FF FE for little endian.
public override ReadOnlySpan<byte> Preamble
{
    get
    {
        if (GetType() != typeof(UnicodeEncoding))
        {
            // in case a derived UnicodeEncoding overrode GetPreamble
            return new ReadOnlySpan<byte>(GetPreamble());
        }

        if (!byteOrderMark)
        {
            return default;
        }

        // uses C# compiler's optimization for static byte[] data
        if (bigEndian)
        {
            return (ReadOnlySpan<byte>)new byte[2] { 0xfe, 0xff };
        }

        return (ReadOnlySpan<byte>)new byte[2] { 0xff, 0xfe };
    }
}
// Upper bound on the number of bytes needed to encode charCount chars.
// The bound starts from charCount + 1 (room for a left-over high surrogate) and is multiplied
// by EncoderFallback.MaxCharCount when that is greater than 1.
// Throws ArgumentOutOfRangeException (NeedNonNegNum message) for an invalid charCount, and
// (GetByteCountOverflow message) when the bound exceeds 0x7fffffff.
// NOTE(review): the guard in front of the first throw and any chars-to-bytes scaling are not
// visible in this listing - confirm against the full file.
1797 public override int GetMaxByteCount(int charCount
)
1800 throw new ArgumentOutOfRangeException(nameof(charCount
),
1801 SR
.ArgumentOutOfRange_NeedNonNegNum
);
1803 // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
// Widened to long so the arithmetic below cannot wrap before the range check.
1804 long byteCount
= (long)charCount
+ 1;
1806 if (EncoderFallback
.MaxCharCount
> 1)
1807 byteCount
*= EncoderFallback
.MaxCharCount
;
1812 if (byteCount
> 0x7fffffff)
1813 throw new ArgumentOutOfRangeException(nameof(charCount
), SR
.ArgumentOutOfRange_GetByteCountOverflow
);
1815 return (int)byteCount
;
// Upper bound on the number of chars produced by decoding byteCount bytes.
// The bound is (byteCount >> 1) + (byteCount & 1) + 1, multiplied by
// DecoderFallback.MaxCharCount when that is greater than 1.
// Throws ArgumentOutOfRangeException (NeedNonNegNum message) for an invalid byteCount, and
// (GetCharCountOverflow message) when the bound exceeds 0x7fffffff.
// NOTE(review): the guard in front of the first throw is not visible in this listing -
// confirm against the full file.
1819 public override int GetMaxCharCount(int byteCount
)
1822 throw new ArgumentOutOfRangeException(nameof(byteCount
),
1823 SR
.ArgumentOutOfRange_NeedNonNegNum
);
1825 // long because byteCount could be biggest int.
1826 // 1 char per 2 bytes. Round up in case 1 left over in decoder.
1827 // Round up using &1 in case byteCount is max size
1828 // Might also need an extra 1 if there's a left over high surrogate in the decoder.
1829 long charCount
= (long)(byteCount
>> 1) + (byteCount
& 1) + 1;
1831 // Don't forget fallback (in case they have a bunch of lonely surrogates or something bizarre like that)
1832 if (DecoderFallback
.MaxCharCount
> 1)
1833 charCount
*= DecoderFallback
.MaxCharCount
;
1835 if (charCount
> 0x7fffffff)
1836 throw new ArgumentOutOfRangeException(nameof(byteCount
), SR
.ArgumentOutOfRange_GetCharCountOverflow
);
1838 return (int)charCount
;
// Value equality for UnicodeEncoding: when `value` is a UnicodeEncoding, the result is whether
// CodePage, byteOrderMark, bigEndian, EncoderFallback and DecoderFallback all match.
// NOTE(review): the result for any other argument is not visible in this listing - confirm
// against the full file.
1842 public override bool Equals(object? value)
1844 if (value is UnicodeEncoding that
)
1847 // Big Endian Unicode has different code page (1201) than small Endian one (1200),
1848 // so we still have to check _codePage here.
1850 return (CodePage
== that
.CodePage
) &&
1851 byteOrderMark
== that
.byteOrderMark
&&
1852 // isThrowException == that.isThrowException && // Same as Encoder/Decoder being exception fallbacks
1853 bigEndian
== that
.bigEndian
&&
1854 (EncoderFallback
.Equals(that
.EncoderFallback
)) &&
1855 (DecoderFallback
.Equals(that
.DecoderFallback
));
// Hash code combining the code page, the hash codes of both fallbacks, and the two flags:
// byteOrderMark contributes 4 and bigEndian contributes 8.
public override int GetHashCode()
{
    // Accumulate in the same left-to-right order as a single chained sum.
    int hash = CodePage;
    hash += this.EncoderFallback.GetHashCode();
    hash += this.DecoderFallback.GetHashCode();

    if (byteOrderMark)
    {
        hash += 4;
    }

    if (bigEndian)
    {
        hash += 8;
    }

    return hash;
}
// Decoder for UnicodeEncoding that carries partial input between calls.
// NOTE(review): this listing omits some lines (braces, the constructor body, part of Reset) -
// confirm details against the full file.
1866 private sealed class Decoder
: System
.Text
.DecoderNLS
// Pending odd byte from a previous call; -1 means none (see HasState).
1868 internal int lastByte
= -1;
// Pending char from a previous call; '\0' means none (see HasState).
1869 internal char lastChar
= '\0';
// Binds the decoder to its encoding by passing it to the DecoderNLS base constructor.
1871 public Decoder(UnicodeEncoding encoding
) : base(encoding
)
// Resets the fallback buffer when one exists.
// NOTE(review): clearing lastByte/lastChar presumably also happens here, but those statements
// are not visible in this listing - confirm.
1876 public override void Reset()
1880 if (_fallbackBuffer
!= null)
1881 _fallbackBuffer
.Reset();
1884 // Anything left in our decoder?
// True while a byte or a char is still pending.
1885 internal override bool HasState
=> (this.lastByte
!= -1 || this.lastChar
!= '\0');