Refactoring the ARM Hardware Intrinsics based on the latest design decisions. (#26895)
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / UTF32Encoding.cs
blobe300db5de32381817fcdfa89c65112bf7b222d2b
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 //
6 // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
7 //
9 using System.Diagnostics;
10 using System.Runtime.InteropServices;
12 namespace System.Text
14 // Encodes text into and out of UTF-32. UTF-32 is a way of writing
15 // Unicode characters with a single storage unit (32 bits) per character.
17 // The UTF-32 byte order mark is simply the Unicode byte order mark
18 // (U+FEFF) written in UTF-32 (0x0000FEFF or 0xFFFE0000). The byte order
19 // mark is used mostly to distinguish UTF-32 text from other encodings; it doesn't
20 // switch the byte orderings.
22 public sealed class UTF32Encoding : Encoding
25 // words  bits  UTF-32 representation
26 // -----  ----  -----------------------------------
27 // 1      16    00000000 00000000 xxxxxxxx xxxxxxxx
28 // 2      21    00000000 000xxxxx hhhhhhll llllllll
29 // -----  ----  -----------------------------------
31 // Surrogate:
32 // Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
// Shared instances used by Encoding.UTF32 / Encoding.BigEndianUTF32 for lazy initialization.
// These static initializers do not run until a static member of this class is referenced.
internal static readonly UTF32Encoding s_default = new UTF32Encoding(bigEndian: false, byteOrderMark: true);
internal static readonly UTF32Encoding s_bigEndianDefault = new UTF32Encoding(bigEndian: true, byteOrderMark: true);

// Per-instance configuration; each flag stays false unless a constructor sets it.
private readonly bool _emitUTF32ByteOrderMark;  // set from the byteOrderMark constructor argument
private readonly bool _isThrowException;        // set from throwOnInvalidCharacters; selects exception fallbacks
private readonly bool _bigEndian;               // true: most significant byte first; false: least significant byte first
// Creates a little-endian UTF-32 encoding with the byte order mark flag set.
public UTF32Encoding()
    : this(bigEndian: false, byteOrderMark: true)
{
}
// Creates a UTF-32 encoding with the requested byte order and byte order mark setting.
// The base Encoding constructor receives 12001 for big-endian and 12000 for little-endian
// (the UTF-32BE / UTF-32LE code page identifiers).
public UTF32Encoding(bool bigEndian, bool byteOrderMark)
    : base(bigEndian ? 12001 : 12000)
{
    _emitUTF32ByteOrderMark = byteOrderMark;
    _bigEndian = bigEndian;
}
// Creates a UTF-32 encoding with the requested byte order and byte order mark setting,
// optionally switching to exception fallbacks when throwOnInvalidCharacters is true.
public UTF32Encoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidCharacters)
    : this(bigEndian, byteOrderMark)
{
    _isThrowException = throwOnInvalidCharacters;

    if (!throwOnInvalidCharacters)
    {
        return;
    }

    // The base constructor already installed fallbacks, but it did so before
    // _isThrowException was set, so they have to be installed again.
    SetDefaultFallbacks();
}
// Installs the encoder/decoder fallbacks for this instance:
// exception fallbacks when _isThrowException is set, otherwise replacement
// fallbacks that substitute U+FFFD (the Unicode replacement character).
internal override void SetDefaultFallbacks()
{
    if (_isThrowException)
    {
        this.encoderFallback = EncoderFallback.ExceptionFallback;
        this.decoderFallback = DecoderFallback.ExceptionFallback;
        return;
    }

    this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
    this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
}
80 // The following methods are copied from EncodingNLS.cs.
81 // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
82 // These should be kept in sync for the following classes:
83 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
85 // Returns the number of bytes required to encode a range of characters in
86 // a character array.
88 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
89 // So if you fix this, fix the others. Currently those include:
90 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
91 // parent method is safe
// Returns the number of bytes needed to encode count chars of the array starting at index.
// Throws ArgumentNullException when chars is null and ArgumentOutOfRangeException when
// index or count is negative or the range does not fit inside the array.
public override unsafe int GetByteCount(char[] chars, int index, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int available = chars.Length - index;
    if (available < count)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Nothing to count; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    // Delegate to the pointer-based overload, with no encoder state.
    fixed (char* pChars = chars)
    {
        return GetByteCount(pChars + index, count, null);
    }
}
114 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
115 // So if you fix this, fix the others. Currently those include:
116 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
117 // parent method is safe
// Returns the number of bytes needed to encode the whole string.
// Throws ArgumentNullException when s is null.
public override unsafe int GetByteCount(string s)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    int length = s.Length;

    // Delegate to the pointer-based overload, with no encoder state.
    fixed (char* pChars = s)
    {
        return GetByteCount(pChars, length, null);
    }
}
129 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
130 // So if you fix this, fix the others. Currently those include:
131 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Returns the number of bytes needed to encode count chars starting at the given pointer.
// Throws ArgumentNullException when chars is null and ArgumentOutOfRangeException when count is negative.
[CLSCompliant(false)]
public override unsafe int GetByteCount(char* chars, int count)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // No encoder state is involved for this public entry point.
    int required = GetByteCount(chars, count, null);
    return required;
}
147 // Parent method is safe.
148 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
149 // So if you fix this, fix the others. Currently those include:
150 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars of s starting at charIndex into bytes starting at byteIndex
// and returns the value produced by the pointer-based overload.
// Throws ArgumentNullException for a null s or bytes, and ArgumentOutOfRangeException
// for negative indexes/counts or ranges that fall outside s or bytes.
public override unsafe int GetBytes(string s, int charIndex, int charCount,
    byte[] bytes, int byteIndex)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
    }

    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (s.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    // Space available in the destination after byteIndex.
    int byteCount = bytes.Length - byteIndex;
    Span<byte> destination = bytes;

    fixed (char* pChars = s)
    {
        fixed (byte* pBytes = &MemoryMarshal.GetReference(destination))
        {
            return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
        }
    }
}
173 // Encodes a range of characters in a character array into a range of bytes
174 // in a byte array. An exception occurs if the byte array is not large
175 // enough to hold the complete encoding of the characters. The
176 // GetByteCount method can be used to determine the exact number of
177 // bytes that will be produced for a given range of characters.
178 // Alternatively, the GetMaxByteCount method can be used to
179 // determine the maximum number of bytes that will be produced for a given
180 // number of characters, regardless of the actual character values.
182 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
183 // So if you fix this, fix the others. Currently those include:
184 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
185 // parent method is safe
// Encodes charCount chars of the array starting at charIndex into bytes starting at byteIndex
// and returns the value produced by the pointer-based overload (0 when charCount is 0).
// Throws ArgumentNullException for a null chars or bytes, and ArgumentOutOfRangeException
// for negative indexes/counts or ranges that fall outside either array.
public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
    byte[] bytes, int byteIndex)
{
    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (charIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (chars.Length - charIndex < charCount)
    {
        throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (byteIndex < 0 || byteIndex > bytes.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
    }

    // Nothing to encode; also sidesteps pinning an empty array.
    if (charCount == 0)
    {
        return 0;
    }

    // byteCount is the room left after byteIndex, not the size of the whole array.
    int byteCount = bytes.Length - byteIndex;
    Span<byte> destination = bytes;

    fixed (char* pChars = chars)
    {
        fixed (byte* pBytes = &MemoryMarshal.GetReference(destination))
        {
            return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
        }
    }
}
215 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
216 // So if you fix this, fix the others. Currently those include:
217 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Encodes charCount chars at chars into at most byteCount bytes at bytes.
// Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
// for a negative count.
[CLSCompliant(false)]
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // No encoder state is involved for this public entry point.
    int written = GetBytes(chars, charCount, bytes, byteCount, null);
    return written;
}
232 // Returns the number of characters produced by decoding a range of bytes
233 // in a byte array.
235 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
236 // So if you fix this, fix the others. Currently those include:
237 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
238 // parent method is safe
// Returns the number of chars produced by decoding count bytes of the array starting at index.
// Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when
// index or count is negative or the range does not fit inside the array.
public override unsafe int GetCharCount(byte[] bytes, int index, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int available = bytes.Length - index;
    if (available < count)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Nothing to decode; also sidesteps pinning an empty array.
    if (count == 0)
    {
        return 0;
    }

    // Delegate to the pointer-based overload, with no decoder state.
    fixed (byte* pBytes = bytes)
    {
        return GetCharCount(pBytes + index, count, null);
    }
}
261 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
262 // So if you fix this, fix the others. Currently those include:
263 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Returns the number of chars produced by decoding count bytes starting at the given pointer.
// Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when count is negative.
[CLSCompliant(false)]
public override unsafe int GetCharCount(byte* bytes, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // No decoder state is involved for this public entry point.
    int decoded = GetCharCount(bytes, count, null);
    return decoded;
}
278 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
279 // So if you fix this, fix the others. Currently those include:
280 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
281 // parent method is safe
// Decodes byteCount bytes of the array starting at byteIndex into chars starting at charIndex
// and returns the value produced by the pointer-based overload (0 when byteCount is 0).
// Throws ArgumentNullException for a null bytes or chars, and ArgumentOutOfRangeException
// for negative indexes/counts or ranges that fall outside either array.
public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
    char[] chars, int charIndex)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (byteIndex < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (bytes.Length - byteIndex < byteCount)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    if (charIndex < 0 || charIndex > chars.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
    }

    // Nothing to decode; also sidesteps pinning an empty array.
    if (byteCount == 0)
    {
        return 0;
    }

    // charCount is the room left after charIndex, not the size of the whole array.
    int charCount = chars.Length - charIndex;
    Span<char> destination = chars;

    fixed (byte* pBytes = bytes)
    {
        fixed (char* pChars = &MemoryMarshal.GetReference(destination))
        {
            return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
        }
    }
}
311 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
312 // So if you fix this, fix the others. Currently those include:
313 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
// Decodes byteCount bytes at bytes into at most charCount chars at chars.
// Throws ArgumentNullException for a null pointer and ArgumentOutOfRangeException
// for a negative count.
[CLSCompliant(false)]
public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (chars == null)
    {
        throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
    }

    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // No decoder state is involved for this public entry point.
    int produced = GetChars(bytes, byteCount, chars, charCount, null);
    return produced;
}
328 // Returns a string containing the decoded representation of a range of
329 // bytes in a byte array.
331 // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
332 // So if you fix this, fix the others. Currently those include:
333 // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
334 // parent method is safe
// Decodes count bytes of the array starting at index into a string
// (string.Empty when count is 0).
// Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when
// index or count is negative or the range does not fit inside the array.
public override unsafe string GetString(byte[] bytes, int index, int count)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
    }

    if (index < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    int available = bytes.Length - index;
    if (available < count)
    {
        throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
    }

    // Empty input needs no decoding and no pinning.
    if (count == 0)
    {
        return string.Empty;
    }

    fixed (byte* pBytes = bytes)
    {
        return string.CreateStringFromEncoding(pBytes + index, count, this);
    }
}
357 // End of standard methods copied from EncodingNLS.cs
// Core byte-counting routine: walks count chars starting at chars (plus any chars
// supplied by the fallback buffer) and adds 4 bytes for every BMP character and for
// every well-formed surrogate pair. Unpaired surrogates are handed to the encoder
// fallback, whose replacement chars are then counted by the same loop.
//
// encoder may be null; when present, its left-over high surrogate seeds the scan and
// its MustFlush flag decides whether a trailing lone high surrogate is fallen back.
// Throws ArgumentException if the encoder's fallback buffer is not empty on entry and
// ArgumentOutOfRangeException if the running total overflows int.
internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS? encoder)
{
    Debug.Assert(chars != null, "[UTF32Encoding.GetByteCount]chars!=null");
    Debug.Assert(count >= 0, "[UTF32Encoding.GetByteCount]count >=0");

    char* charStart = chars;
    char* charEnd = chars + count;
    int totalBytes = 0;

    // High surrogate seen but not yet paired ('\0' when none is pending).
    char pendingHigh = '\0';

    EncoderFallbackBuffer? fallbackBuffer = null;
    char* fallbackPosition;

    if (encoder != null)
    {
        pendingHigh = encoder._charLeftOver;
        fallbackBuffer = encoder.FallbackBuffer;

        // Counting must not start with fallback data still queued.
        if (fallbackBuffer.Remaining > 0)
        {
            throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType().ToString() ?? string.Empty));
        }
    }
    else
    {
        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
    }

    // Give the fallback buffer the input bounds and encoder it needs.
    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);

    char current;
TryAgain:

    while (((current = fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
    {
        // The fallback buffer had nothing for us, so take the next input char.
        if (current == 0)
        {
            current = *chars;
            chars++;
        }

        // A high surrogate is pending: this char must be its low surrogate.
        if (pendingHigh != '\0')
        {
            if (char.IsLowSurrogate(current))
            {
                // Complete pair: one UTF-32 code unit, i.e. 4 bytes.
                pendingHigh = '\0';
                totalBytes += 4;
                continue;
            }

            // Low surrogate is missing: step back so the current char is re-read,
            // then fall back the lone high surrogate.
            // The high surrogate may have come from the encoder, but nothing else did.
            Debug.Assert(chars > charStart,
                "[UTF32Encoding.GetByteCount]Expected chars to have advanced if no low surrogate");
            chars--;

            fallbackPosition = chars;
            fallbackBuffer.InternalFallback(pendingHigh, ref fallbackPosition);
            chars = fallbackPosition;

            pendingHigh = '\0';
            continue;
        }

        // Start of a new pair: remember it and look at the next char.
        if (char.IsHighSurrogate(current))
        {
            pendingHigh = current;
            continue;
        }

        // Low surrogate with no preceding high surrogate: fall it back.
        if (char.IsLowSurrogate(current))
        {
            fallbackPosition = chars;
            fallbackBuffer.InternalFallback(current, ref fallbackPosition);
            chars = fallbackPosition;

            // The loop picks up whatever the fallback produced.
            continue;
        }

        // Ordinary character: 4 bytes in UTF-32.
        totalBytes += 4;
    }

    // Input exhausted with a high surrogate still pending: fall it back when
    // there is no encoder to carry it over, or the encoder is flushing.
    if ((encoder == null || encoder.MustFlush) && pendingHigh > 0)
    {
        fallbackPosition = chars;
        fallbackBuffer.InternalFallback(pendingHigh, ref fallbackPosition);
        chars = fallbackPosition;

        pendingHigh = '\0';
        goto TryAgain;
    }

    // A negative total means the int accumulator wrapped around.
    if (totalBytes < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);
    }

    // Counting should always drain the fallback buffer.
    Debug.Assert(fallbackBuffer.Remaining == 0,
        "[UTF32Encoding.GetByteCount]Expected empty fallback buffer at end");

    return totalBytes;
}
// Core encoding routine: converts charCount chars starting at chars (plus any chars
// supplied by the fallback buffer) into 4-byte UTF-32 code units written at bytes,
// in big- or little-endian order according to _bigEndian. Returns the number of
// bytes written.
//
// Surrogate pairs are combined through GetSurrogate into one code unit; unpaired
// surrogates are handed to the encoder fallback. When fewer than 4 bytes of room
// remain, the input position (or the fallback buffer) is rewound, ThrowBytesOverflow
// is called, and the loop stops if that call did not throw.
//
// encoder may be null; when present, its left-over high surrogate seeds the scan and,
// on exit, _charLeftOver and _charsUsed are updated.
// Throws ArgumentException if the encoder has _throwOnOverflow set and its fallback
// buffer is not empty on entry.
internal override unsafe int GetBytes(char* chars, int charCount,
    byte* bytes, int byteCount, EncoderNLS? encoder)
{
    Debug.Assert(chars != null, "[UTF32Encoding.GetBytes]chars!=null");
    Debug.Assert(bytes != null, "[UTF32Encoding.GetBytes]bytes!=null");
    Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetBytes]byteCount >=0");
    Debug.Assert(charCount >= 0, "[UTF32Encoding.GetBytes]charCount >=0");

    char* charStart = chars;
    char* charEnd = chars + charCount;
    byte* byteStart = bytes;
    byte* byteEnd = bytes + byteCount;

    // High surrogate seen but not yet paired ('\0' when none is pending).
    char highSurrogate = '\0';

    // For fallback we may need a fallback buffer
    EncoderFallbackBuffer? fallbackBuffer = null;
    char* charsForFallback;

    if (encoder != null)
    {
        highSurrogate = encoder._charLeftOver;
        fallbackBuffer = encoder.FallbackBuffer;

        // We mustn't have left over fallback data when not converting.
        // The fallback type is formatted the same way as in GetByteCount.
        if (encoder._throwOnOverflow && fallbackBuffer.Remaining > 0)
        {
            throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType().ToString() ?? string.Empty));
        }
    }
    else
    {
        fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
    }

    // Give the fallback buffer the input bounds and encoder it needs.
    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);

    char ch;
TryAgain:

    while (((ch = fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
    {
        // The fallback buffer had nothing for us, so take the next input char.
        if (ch == 0)
        {
            ch = *chars;
            chars++;
        }

        // Do we need a low surrogate?
        if (highSurrogate != '\0')
        {
            // The previous char was a high surrogate, so a low surrogate is expected here.
            if (char.IsLowSurrogate(ch))
            {
                // Combine the pair into a single code point.
                uint iTemp = GetSurrogate(highSurrogate, ch);
                highSurrogate = '\0';

                // One surrogate pair becomes one 4-byte UTF-32 code unit.
                if (bytes + 3 >= byteEnd)
                {
                    // Don't have 4 bytes
                    if (fallbackBuffer.bFallingBack)
                    {
                        fallbackBuffer.MovePrevious(); // Aren't using these 2 fallback chars
                        fallbackBuffer.MovePrevious();
                    }
                    else
                    {
                        // If we don't have enough room, then either we should've advanced a while
                        // or we should have bytes==byteStart and throw below
                        Debug.Assert(chars > charStart + 1 || bytes == byteStart,
                            "[UTF32Encoding.GetBytes]Expected chars to have advanced when no room to add surrogate pair");
                        chars -= 2; // Aren't using those 2 chars
                    }

                    ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
                    highSurrogate = (char)0; // Nothing left over (we backed up to start of pair if supplementary)
                    break;
                }

                if (_bigEndian)
                {
                    *(bytes++) = (byte)(0x00);
                    *(bytes++) = (byte)(iTemp >> 16); // Implies & 0xFF, which isn't needed cause high are all 0
                    *(bytes++) = (byte)(iTemp >> 8); // Implies & 0xFF
                    *(bytes++) = (byte)(iTemp); // Implies & 0xFF
                }
                else
                {
                    *(bytes++) = (byte)(iTemp); // Implies & 0xFF
                    *(bytes++) = (byte)(iTemp >> 8); // Implies & 0xFF
                    *(bytes++) = (byte)(iTemp >> 16); // Implies & 0xFF, which isn't needed cause high are all 0
                    *(bytes++) = (byte)(0x00);
                }

                continue;
            }

            // We are missing our low surrogate, decrement chars and fallback the high surrogate
            // The high surrogate may have come from the encoder, but nothing else did.
            Debug.Assert(chars > charStart,
                "[UTF32Encoding.GetBytes]Expected chars to have advanced if no low surrogate");
            chars--;

            // Do the fallback
            charsForFallback = chars;
            fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
            chars = charsForFallback;

            // The old high surrogate has been fallen back.
            highSurrogate = '\0';
            continue;
        }

        // Do we have another high surrogate? If so, remember it for the next iteration.
        if (char.IsHighSurrogate(ch))
        {
            highSurrogate = ch;
            continue;
        }

        // Check for illegal characters (low surrogate with no preceding high surrogate)
        if (char.IsLowSurrogate(ch))
        {
            charsForFallback = chars;
            fallbackBuffer.InternalFallback(ch, ref charsForFallback);
            chars = charsForFallback;

            // Try again with fallback buffer
            continue;
        }

        // Ordinary character: needs 4 bytes of room.
        if (bytes + 3 >= byteEnd)
        {
            // Don't have 4 bytes
            if (fallbackBuffer.bFallingBack)
            {
                fallbackBuffer.MovePrevious(); // Aren't using this fallback char
            }
            else
            {
                // Must've advanced already
                Debug.Assert(chars > charStart,
                    "[UTF32Encoding.GetBytes]Expected chars to have advanced if normal character");
                chars--; // Aren't using this char
            }

            ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
            break; // Didn't throw, stop
        }

        if (_bigEndian)
        {
            *(bytes++) = (byte)(0x00);
            *(bytes++) = (byte)(0x00);
            *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
            *(bytes++) = (byte)(ch); // Implies & 0xFF
        }
        else
        {
            *(bytes++) = (byte)(ch); // Implies & 0xFF
            *(bytes++) = (byte)((uint)ch >> 8); // Implies & 0xFF
            *(bytes++) = (byte)(0x00);
            *(bytes++) = (byte)(0x00);
        }
    }

    // Input exhausted with a high surrogate still pending: fall it back when
    // there is no encoder to carry it over, or the encoder is flushing.
    if ((encoder == null || encoder.MustFlush) && highSurrogate > 0)
    {
        charsForFallback = chars;
        fallbackBuffer.InternalFallback(highSurrogate, ref charsForFallback);
        chars = charsForFallback;

        highSurrogate = (char)0;
        goto TryAgain;
    }

    // Fix our encoder if we have one
    Debug.Assert(highSurrogate == 0 || (encoder != null && !encoder.MustFlush),
        "[UTF32Encoding.GetBytes]Expected encoder to be flushed.");

    if (encoder != null)
    {
        // Remember our left over surrogate (or 0 if flushing)
        encoder._charLeftOver = highSurrogate;

        // Record how many input chars were consumed.
        encoder._charsUsed = (int)(chars - charStart);
    }

    // Number of bytes written.
    return (int)(bytes - byteStart);
}
// Core char-counting routine: reads count bytes starting at bytes, assembling them
// four at a time into a UTF-32 code unit in big- or little-endian order according to
// _bigEndian. Each valid code unit counts as 1 char, or 2 chars (a surrogate pair)
// when it is 0x10000 or above. Code units above 0x10FFFF or inside 0xD800-0xDFFF are
// handed to the decoder fallback, whose returned count is added instead.
//
// baseDecoder may be null; when present it is cast to UTF32Decoder and supplies the
// partially read code unit (readByteCount / iChar) left by a previous call. Trailing
// bytes that do not complete a code unit are fallen back only when there is no decoder
// or the decoder's MustFlush is set.
// Throws ArgumentOutOfRangeException if the running total overflows int.
internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS? baseDecoder)
{
    Debug.Assert(bytes != null, "[UTF32Encoding.GetCharCount]bytes!=null");
    Debug.Assert(count >= 0, "[UTF32Encoding.GetCharCount]count >=0");

    UTF32Decoder? decoder = (UTF32Decoder?)baseDecoder;

    // None so far!
    int charCount = 0;
    byte* end = bytes + count;
    byte* byteStart = bytes;

    // Partially assembled code unit: bytes read so far and their accumulated value.
    int readCount = 0;
    uint iChar = 0;

    // For fallback we may need a fallback buffer
    DecoderFallbackBuffer? fallbackBuffer = null;

    // See if there's anything in our decoder
    if (decoder != null)
    {
        readCount = decoder.readByteCount;
        iChar = (uint)decoder.iChar;
        fallbackBuffer = decoder.FallbackBuffer;

        // Shouldn't have anything in fallback buffer for GetCharCount
        // (don't have to check _throwOnOverflow for chars or count)
        Debug.Assert(fallbackBuffer.Remaining == 0,
            "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at start");
    }
    else
    {
        fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
    }

    // Give the fallback buffer the start of the input; no char output when counting.
    fallbackBuffer.InternalInitialize(byteStart, null);

    // Loop through our input, 4 bytes at a time. A negative charCount means the
    // accumulator overflowed, which ends the loop and is reported below.
    while (bytes < end && charCount >= 0)
    {
        // Fold the next byte into the code unit.
        if (_bigEndian)
        {
            // Scoot left and add it to the bottom
            iChar <<= 8;
            iChar += *(bytes++);
        }
        else
        {
            // Scoot right and add it to the top
            iChar >>= 8;
            iChar += (uint)(*(bytes++)) << 24;
        }

        readCount++;

        // See if we have all the bytes yet
        if (readCount < 4)
        {
            continue;
        }

        // Have the bytes
        readCount = 0;

        // Reject values beyond the Unicode range and surrogate code points.
        if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
        {
            // Need to fall back these 4 bytes, rebuilt in their original input order.
            byte[] fallbackBytes;
            if (_bigEndian)
            {
                fallbackBytes = new byte[] {
                    unchecked((byte)(iChar >> 24)), unchecked((byte)(iChar >> 16)),
                    unchecked((byte)(iChar >> 8)), unchecked((byte)(iChar)) };
            }
            else
            {
                fallbackBytes = new byte[] {
                    unchecked((byte)(iChar)), unchecked((byte)(iChar >> 8)),
                    unchecked((byte)(iChar >> 16)), unchecked((byte)(iChar >> 24)) };
            }

            charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);

            // Ignore the illegal character
            iChar = 0;
            continue;
        }

        // Ok, we have something we can add to our output
        if (iChar >= 0x10000)
        {
            // Supplementary code points take two chars (a surrogate pair); count the extra one.
            charCount++;
        }

        // Add the rest of the surrogate or our normal character
        charCount++;

        // iChar is back to 0
        iChar = 0;
    }

    // See if we have something left over that has to be decoded
    if (readCount > 0 && (decoder == null || decoder.MustFlush))
    {
        // Incomplete trailing code unit: rebuild its bytes in input order and fall them back.
        byte[] fallbackBytes = new byte[readCount];
        if (_bigEndian)
        {
            while (readCount > 0)
            {
                fallbackBytes[--readCount] = unchecked((byte)iChar);
                iChar >>= 8;
            }
        }
        else
        {
            while (readCount > 0)
            {
                fallbackBytes[--readCount] = unchecked((byte)(iChar >> 24));
                iChar <<= 8;
            }
        }

        charCount += fallbackBuffer.InternalFallback(fallbackBytes, bytes);
    }

    // Check for overflows. This is a char count, so report the char-count overflow message.
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetCharCountOverflow);
    }

    // Shouldn't have anything in fallback buffer for GetCharCount
    // (don't have to check _throwOnOverflow for chars or count)
    Debug.Assert(fallbackBuffer.Remaining == 0,
        "[UTF32Encoding.GetCharCount]Expected empty fallback buffer at end");

    // Return our count
    return charCount;
}
834 internal override unsafe int GetChars(byte* bytes, int byteCount,
835 char* chars, int charCount, DecoderNLS? baseDecoder)
837 Debug.Assert(chars != null, "[UTF32Encoding.GetChars]chars!=null");
838 Debug.Assert(bytes != null, "[UTF32Encoding.GetChars]bytes!=null");
839 Debug.Assert(byteCount >= 0, "[UTF32Encoding.GetChars]byteCount >=0");
840 Debug.Assert(charCount >= 0, "[UTF32Encoding.GetChars]charCount >=0");
842 UTF32Decoder? decoder = (UTF32Decoder?)baseDecoder;
844 // None so far!
845 char* charStart = chars;
846 char* charEnd = chars + charCount;
848 byte* byteStart = bytes;
849 byte* byteEnd = bytes + byteCount;
851 // See if there's anything in our decoder (but don't clear it yet)
852 int readCount = 0;
853 uint iChar = 0;
855 // For fallback we may need a fallback buffer
856 DecoderFallbackBuffer? fallbackBuffer = null;
857 char* charsForFallback;
859 // See if there's anything in our decoder
860 if (decoder != null)
862 readCount = decoder.readByteCount;
863 iChar = (uint)decoder.iChar;
864 Debug.Assert(baseDecoder != null);
865 fallbackBuffer = baseDecoder.FallbackBuffer;
867 // Shouldn't have anything in fallback buffer for GetChars
868 // (don't have to check _throwOnOverflow for chars)
869 Debug.Assert(fallbackBuffer.Remaining == 0,
870 "[UTF32Encoding.GetChars]Expected empty fallback buffer at start");
872 else
874 fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
877 // Set our internal fallback interesting things.
878 fallbackBuffer.InternalInitialize(bytes, chars + charCount);
880 // Loop through our input, 4 characters at a time!
881 while (bytes < byteEnd)
883 // Get our next character
884 if (_bigEndian)
886 // Scoot left and add it to the bottom
887 iChar <<= 8;
888 iChar += *(bytes++);
890 else
892 // Scoot right and add it to the top
893 iChar >>= 8;
894 iChar += (uint)(*(bytes++)) << 24;
897 readCount++;
899 // See if we have all the bytes yet
900 if (readCount < 4)
901 continue;
903 // Have the bytes
904 readCount = 0;
906 // See if its valid to encode
907 if (iChar > 0x10FFFF || (iChar >= 0xD800 && iChar <= 0xDFFF))
909 // Need to fall back these 4 bytes
910 byte[] fallbackBytes;
911 if (_bigEndian)
913 fallbackBytes = new byte[] {
914 unchecked((byte)(iChar >> 24)), unchecked((byte)(iChar >> 16)),
915 unchecked((byte)(iChar >> 8)), unchecked((byte)(iChar)) };
917 else
919 fallbackBytes = new byte[] {
920 unchecked((byte)(iChar)), unchecked((byte)(iChar >> 8)),
921 unchecked((byte)(iChar >> 16)), unchecked((byte)(iChar >> 24)) };
924 // Chars won't be updated unless this works.
925 charsForFallback = chars;
926 bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
927 chars = charsForFallback;
929 if (!fallbackResult)
931 // Couldn't fallback, throw or wait til next time
932 // We either read enough bytes for bytes-=4 to work, or we're
933 // going to throw in ThrowCharsOverflow because chars == charStart
934 Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
935 "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (bad surrogate)");
936 bytes -= 4; // get back to where we were
937 iChar = 0; // Remembering nothing
938 fallbackBuffer.InternalReset();
939 ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
940 break; // Stop here, didn't throw
943 // Ignore the illegal character
944 iChar = 0;
945 continue;
948 // Ok, we have something we can add to our output
949 if (iChar >= 0x10000)
951 // Surrogates take 2
952 if (chars >= charEnd - 1)
954 // Throwing or stopping
955 // We either read enough bytes for bytes-=4 to work, or we're
956 // going to throw in ThrowCharsOverflow because chars == charStart
957 Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
958 "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (surrogate)");
959 bytes -= 4; // get back to where we were
960 iChar = 0; // Remembering nothing
961 ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
962 break; // Stop here, didn't throw
965 *(chars++) = GetHighSurrogate(iChar);
966 iChar = GetLowSurrogate(iChar);
968 // Bounds check for normal character
969 else if (chars >= charEnd)
971 // Throwing or stopping
972 // We either read enough bytes for bytes-=4 to work, or we're
973 // going to throw in ThrowCharsOverflow because chars == charStart
974 Debug.Assert(bytes >= byteStart + 4 || chars == charStart,
975 "[UTF32Encoding.GetChars]Expected to have consumed bytes or throw (normal char)");
976 bytes -= 4; // get back to where we were
977 iChar = 0; // Remembering nothing
978 ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
979 break; // Stop here, didn't throw
982 // Add the rest of the surrogate or our normal character
983 *(chars++) = (char)iChar;
985 // iChar is back to 0
986 iChar = 0;
989 // See if we have something left over that has to be decoded
990 if (readCount > 0 && (decoder == null || decoder.MustFlush))
992 // Oops, there's something left over with no place to go.
993 byte[] fallbackBytes = new byte[readCount];
994 int tempCount = readCount;
995 if (_bigEndian)
997 while (tempCount > 0)
999 fallbackBytes[--tempCount] = unchecked((byte)iChar);
1000 iChar >>= 8;
1003 else
1005 while (tempCount > 0)
1007 fallbackBytes[--tempCount] = unchecked((byte)(iChar >> 24));
1008 iChar <<= 8;
1012 charsForFallback = chars;
1013 bool fallbackResult = fallbackBuffer.InternalFallback(fallbackBytes, bytes, ref charsForFallback);
1014 chars = charsForFallback;
1016 if (!fallbackResult)
1018 // Couldn't fallback.
1019 fallbackBuffer.InternalReset();
1020 ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
1021 // Stop here, didn't throw, backed up, so still nothing in buffer
1023 else
1025 // Don't clear our decoder unless we could fall it back.
1026 // If we caught the if above, then we're a convert() and will catch this next time.
1027 readCount = 0;
1028 iChar = 0;
1032 // Remember any left over stuff, clearing buffer as well for MustFlush
1033 if (decoder != null)
1035 decoder.iChar = (int)iChar;
1036 decoder.readByteCount = readCount;
1037 decoder._bytesUsed = (int)(bytes - byteStart);
1040 // Shouldn't have anything in fallback buffer for GetChars
1041 // (don't have to check _throwOnOverflow for chars)
1042 Debug.Assert(fallbackBuffer.Remaining == 0,
1043 "[UTF32Encoding.GetChars]Expected empty fallback buffer at end");
1045 // Return our count
1046 return (int)(chars - charStart);
// Combines a UTF-16 surrogate pair into the scalar value it encodes:
//   (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
// No validation is performed here; the arithmetic is plain unsigned math.
private uint GetSurrogate(char cHigh, char cLow)
{
    uint highPart = (uint)cHigh - 0xD800u;
    uint lowPart = (uint)cLow - 0xDC00u;

    // Shifting left by 10 is the same as multiplying by 0x400.
    return (highPart << 10) + lowPart + 0x10000u;
}
// Returns the leading (high) UTF-16 surrogate for the given scalar value.
// The formula is only meaningful for values of 0x10000 and above.
private char GetHighSurrogate(uint iChar)
{
    uint offset = iChar - 0x10000u;

    // The top ten bits of the offset select the high surrogate.
    return (char)((offset >> 10) + 0xD800u);
}
// Returns the trailing (low) UTF-16 surrogate for the given scalar value.
// The formula is only meaningful for values of 0x10000 and above.
private char GetLowSurrogate(uint iChar)
{
    uint offset = iChar - 0x10000u;

    // The bottom ten bits of the offset select the low surrogate.
    return (char)((offset & 0x3FFu) + 0xDC00u);
}
// Creates a new UTF32Decoder bound to this encoding instance.
public override Decoder GetDecoder() => new UTF32Decoder(this);
// Creates a new EncoderNLS bound to this encoding instance.
public override Encoder GetEncoder() => new EncoderNLS(this);
// Computes an upper bound on the number of bytes needed to encode
// charCount characters.
//
// Throws ArgumentOutOfRangeException when charCount is negative or when the
// computed bound does not fit in an Int32.
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // One extra character accounts for a high surrogate left over from a
    // previous call. The arithmetic is done in 64 bits so the range check
    // below is meaningful.
    long maxBytes = (long)charCount + 1;

    // Each character may be replaced by up to MaxCharCount fallback characters.
    if (EncoderFallback.MaxCharCount > 1)
    {
        maxBytes *= EncoderFallback.MaxCharCount;
    }

    // UTF-32 uses 4 bytes per character.
    maxBytes *= 4;

    if (maxBytes > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
    }

    return (int)maxBytes;
}
// Computes an upper bound on the number of chars produced by decoding
// byteCount bytes.
//
// Throws ArgumentOutOfRangeException when byteCount is negative or when the
// computed bound does not fit in an Int32.
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException(nameof(byteCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);

    // A supplementary character becomes 2 surrogate characters, so 4 input bytes becomes 2 chars,
    // plus we may have 1 surrogate char left over if the decoder has 3 bytes in it already for a non-bmp char.
    // Have to add another one because 1/2 == 0, but 3 bytes left over could be 2 char surrogate pair.
    //
    // The count is accumulated in 64 bits: with a 32-bit accumulator the
    // multiplication below could silently wrap, and the overflow check at the
    // end could never fire.
    long charCount = ((long)byteCount / 2) + 2;

    // Also consider fallback because our input bytes could be out of range of unicode.
    // Since fallback would fallback 4 bytes at a time, we'll only fall back 1/2 of MaxCharCount.
    if (DecoderFallback.MaxCharCount > 2)
    {
        // Multiply by the fallback size
        charCount *= DecoderFallback.MaxCharCount;

        // We were already figuring 2 chars per 4 bytes, but fallback will be different #
        charCount /= 2;
    }

    if (charCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);

    return (int)charCount;
}
// Returns the UTF-32 byte order mark for this instance, or an empty array
// when this instance was configured not to emit one.
public override byte[] GetPreamble()
{
    if (!_emitUTF32ByteOrderMark)
    {
        return Array.Empty<byte>();
    }

    // A fresh array is allocated on every call so callers cannot modify
    // a shared copy.
    return _bigEndian
        ? new byte[4] { 0x00, 0x00, 0xFE, 0xFF }   // U+FEFF, most significant byte first
        : new byte[4] { 0xFF, 0xFE, 0x00, 0x00 };  // U+FEFF, least significant byte first
}
// Span-based counterpart of GetPreamble().
public override ReadOnlySpan<byte> Preamble
{
    get
    {
        // In case a derived UTF32Encoding overrode GetPreamble, defer to it.
        if (GetType() != typeof(UTF32Encoding))
        {
            return new ReadOnlySpan<byte>(GetPreamble());
        }

        if (!_emitUTF32ByteOrderMark)
        {
            return default;
        }

        // Constant byte arrays converted straight to ReadOnlySpan<byte> let the
        // C# compiler use its optimization for static byte[] data.
        if (_bigEndian)
        {
            return (ReadOnlySpan<byte>)new byte[4] { 0x00, 0x00, 0xFE, 0xFF };
        }

        return (ReadOnlySpan<byte>)new byte[4] { 0xFF, 0xFE, 0x00, 0x00 };
    }
}
// Two UTF32Encoding instances are equal when they agree on BOM emission,
// byte order, and both the encoder and decoder fallbacks.
public override bool Equals(object? value)
{
    if (!(value is UTF32Encoding other))
    {
        return false;
    }

    // Checks run in the same order as a short-circuiting && chain would.
    if (_emitUTF32ByteOrderMark != other._emitUTF32ByteOrderMark)
    {
        return false;
    }

    if (_bigEndian != other._bigEndian)
    {
        return false;
    }

    if (!EncoderFallback.Equals(other.EncoderFallback))
    {
        return false;
    }

    return DecoderFallback.Equals(other.DecoderFallback);
}
// Hash built from the same settings Equals compares, plus the code page.
// Not great distribution, but this is relatively unlikely to be used as the
// key in a hashtable.
public override int GetHashCode()
{
    int hash = this.EncoderFallback.GetHashCode();
    hash += this.DecoderFallback.GetHashCode();
    hash += CodePage;

    if (_emitUTF32ByteOrderMark)
    {
        hash += 4;
    }

    if (_bigEndian)
    {
        hash += 8;
    }

    return hash;
}
// Decoder that carries the bytes of a partially read UTF-32 code unit
// between calls.
private sealed class UTF32Decoder : DecoderNLS
{
    // Value accumulated so far from the bytes of an incomplete code unit.
    internal int iChar;

    // Number of bytes already folded into iChar.
    internal int readByteCount;

    public UTF32Decoder(UTF32Encoding encoding) : base(encoding)
    {
        // base calls reset
    }

    // Discards any partially read code unit and resets the fallback buffer
    // if one exists.
    public override void Reset()
    {
        iChar = 0;
        readByteCount = 0;
        _fallbackBuffer?.Reset();
    }

    // Anything left in our decoder?
    // readByteCount is the flag; iChar == 0 on its own doesn't mean much.
    internal override bool HasState => readByteCount != 0;
}