2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
30 using System
.Runtime
.InteropServices
;
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [MonoLimitation ("EncoderFallback is not handled")]
36 public class UTF8Encoding
: Encoding
38 // Magic number used by Windows for UTF-8.
39 internal const int UTF8_CODE_PAGE
= 65001;
42 private bool emitIdentifier
;
// Constructors.

// Default: no UTF-8 identifier is emitted and invalid bytes do not throw.
public UTF8Encoding () : this (false, false) {}

// As above, but lets the caller choose whether the identifier is emitted.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
	: this (encoderShouldEmitUTF8Identifier, false) {}

// Full constructor.
//   encoderShouldEmitUTF8Identifier - stored in emitIdentifier.
//   throwOnInvalidBytes - selects the decoder fallback: the exception
//       fallback when true, the standard safe fallback otherwise.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
	: base (UTF8_CODE_PAGE)
{
	emitIdentifier = encoderShouldEmitUTF8Identifier;

	// Exactly one of the two fallbacks must be installed; without the
	// "else" the second call would always override the first.
	if (throwOnInvalidBytes)
		SetFallbackInternal (null, DecoderFallback.ExceptionFallback);
	else
		SetFallbackInternal (null, DecoderFallback.StandardSafeFallback);

	// Descriptive properties of this encoding.
	web_name = body_name = header_name = "utf-8";
	encoding_name = "Unicode (UTF-8)";
	is_browser_save = true;
	is_browser_display = true;
	is_mail_news_display = true;
	is_mail_news_save = true;
	windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
67 #region GetByteCount()
// Array-based entry point for byte counting. It range-checks index and count
// against the array, then pins the array and forwards to the pointer-based
// overload, passing leftOver (by reference) and flush through unchanged.
// Throws ArgumentNullException ("chars") and ArgumentOutOfRangeException
// ("index" / "count").
// NOTE(review): this listing shows the throw statements but not every guard
// or branch body around them (for example the test before the first throw,
// and the body of the end-of-array branch) -- confirm against the full file.
69 // Internal version of "GetByteCount" which can handle a rolling
70 // state between multiple calls to this method.
71 private static int InternalGetByteCount (char[] chars
, int index
, int count
, ref char leftOver
, bool flush
)
73 // Validate the parameters.
75 throw new ArgumentNullException ("chars");
77 if (index
< 0 || index
> chars
.Length
) {
78 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
80 if (count
< 0 || count
> (chars
.Length
- index
)) {
81 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Special case: the start index sits exactly at the end of the array, so
// only a pending leftOver char (when flushing) can matter.
84 if (index
== chars
.Length
) {
85 if (flush
&& leftOver
!= '\0') {
86 // Flush the left-over surrogate pair start.
// Pin the array so the pointer overload can walk it directly.
94 fixed (char* cptr
= chars
) {
95 return InternalGetByteCount (cptr
+ index
, count
, ref leftOver
, flush
);
// Pointer-based counting core. Walks the chars in [chars, chars + count) and
// classifies each one by range: below 0x80, below 0x800, outside the
// surrogate range D800..DFFF, surrogate start (up to DBFF), or surrogate tail.
// leftOver is the rolling state: a surrogate start still waiting for its tail.
// NOTE(review): the per-branch length updates and the return statement are
// not visible in this listing -- confirm them against the full file.
100 private unsafe static int InternalGetByteCount (char* chars
, int count
, ref char leftOver
, bool flush
)
103 char* end
= chars
+ count
;
104 while (chars
< end
) {
106 for (; chars
< end
; chars
++) {
107 if (*chars
< '\x80') {
109 } else if (*chars
< '\x800') {
111 } else if (*chars
< '\uD800' || *chars
> '\uDFFF') {
113 } else if (*chars
<= '\uDBFF') {
114 // This is a surrogate start char, exit the inner loop only
115 // if we don't find the complete surrogate pair.
// Look ahead one char (if any remain) for a tail in DC00..DFFF.
116 if (chars
+ 1 < end
&& chars
[1] >= '\uDC00' && chars
[1] <= '\uDFFF') {
125 // We have a surrogate tail without
126 // leading surrogate. In NET_2_0 it
127 // uses fallback. In NET_1_1 we output
// Reached with a pending surrogate start: test whether the current char
// completes the pair.
134 if (*chars
>= '\uDC00' && *chars
<= '\uDFFF') {
135 // We have a correct surrogate pair.
139 // We have a surrogate start followed by a
140 // regular character. Technically, this is
141 // invalid, but we have to do something.
142 // We write out the surrogate start and then
143 // re-visit the current character again.
150 // Flush the left-over surrogate pair start.
151 if (leftOver
!= '\0') {
// Get the number of bytes needed to encode a character buffer.
// This is a one-shot call: it hands the shared counting routine a fresh,
// empty left-over char and asks it to flush, so no state survives the call.
// Argument validation (null array, index/count range) is performed by
// InternalGetByteCount.
public override int GetByteCount (char[] chars, int index, int count)
{
	// Throw-away rolling state required by the ref parameter.
	char dummy = '\0';
	return InternalGetByteCount (chars, index, count, ref dummy, true);
}
// Pointer-based variant of GetByteCount: counts the bytes needed to encode
// "count" chars starting at "chars", as a one-shot flushing call.
// Throws ArgumentNullException when chars is null.
[CLSCompliant (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");

	// Throw-away rolling state required by the ref parameter.
	char dummy = '\0';
	return InternalGetByteCount (chars, count, ref dummy, true);
}
// Array-based entry point for encoding. It validates both arrays and the
// three indices/counts, then pins the arrays and forwards to the pointer-based
// overload together with leftOver (by reference) and flush.
// Throws ArgumentNullException ("chars" / "bytes") and
// ArgumentOutOfRangeException ("charIndex" / "charCount" / "byteIndex").
// NOTE(review): this listing is missing lines -- the end of the parameter
// list (the body uses "flush"), the guards before the first two throws and the
// body of the end-of-array branch are not visible; confirm against the file.
183 // Internal version of "GetBytes" which can handle a rolling
184 // state between multiple calls to this method.
185 private static int InternalGetBytes (char[] chars
, int charIndex
,
186 int charCount
, byte[] bytes
,
187 int byteIndex
, ref char leftOver
,
190 // Validate the parameters.
192 throw new ArgumentNullException ("chars");
195 throw new ArgumentNullException ("bytes");
197 if (charIndex
< 0 || charIndex
> chars
.Length
) {
198 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
200 if (charCount
< 0 || charCount
> (chars
.Length
- charIndex
)) {
201 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
203 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
204 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
// Special case: the start index sits exactly at the end of the char array.
207 if (charIndex
== chars
.Length
) {
208 if (flush
&& leftOver
!= '\0') {
209 // FIXME: use EncoderFallback.
211 // By default it is empty, so I do nothing for now.
218 fixed (char* cptr
= chars
) {
// No room at all in the output: call the core with a null buffer of size 0
// instead of pinning the byte array.
219 if (bytes
.Length
== byteIndex
)
220 return InternalGetBytes (
221 cptr
+ charIndex
, charCount
,
222 null, 0, ref leftOver
, flush
);
// Normal case: the core may write into whatever remains after byteIndex.
223 fixed (byte *bptr
= bytes
) {
224 return InternalGetBytes (
225 cptr
+ charIndex
, charCount
,
226 bptr
+ byteIndex
, bytes
.Length
- byteIndex
,
227 ref leftOver
, flush
);
// Pointer-based encoding core. Reads chars from [chars, chars + count) and
// writes their UTF-8 form at "bytes", never past bytes + bcount. Returns the
// number of bytes written; the last visible statement throws
// ArgumentException ("Insufficient Space", "bytes").
// leftOver is the rolling state: a surrogate start still waiting for its tail.
// NOTE(review): the statements guarded by the capacity checks, the pointer
// advances and the single-byte branch are not visible in this listing --
// confirm them against the full file.
233 private unsafe static int InternalGetBytes (char* chars
, int count
, byte* bytes
, int bcount
, ref char leftOver
, bool flush
)
235 char* end
= chars
+ count
;
// One past the last writable output byte.
236 byte* end_bytes
= bytes
+ bcount
;
237 while (chars
< end
) {
239 for (; chars
< end
; chars
++) {
242 if (bytes
>= end_bytes
)
// Two-byte form: lead byte 0xC0 | high bits, then one 0x80 | 6-bit trailer.
245 } else if (ch
< '\x800') {
246 if (bytes
+ 1 >= end_bytes
)
248 bytes
[0] = (byte) (0xC0 | (ch
>> 6));
249 bytes
[1] = (byte) (0x80 | (ch
& 0x3F));
// Three-byte form for every char outside the surrogate range D800..DFFF.
251 } else if (ch
< '\uD800' || ch
> '\uDFFF') {
252 if (bytes
+ 2 >= end_bytes
)
254 bytes
[0] = (byte) (0xE0 | (ch
>> 12));
255 bytes
[1] = (byte) (0x80 | ((ch
>> 6) & 0x3F));
256 bytes
[2] = (byte) (0x80 | (ch
& 0x3F));
258 } else if (ch
<= '\uDBFF') {
259 // This is a surrogate char, exit the inner loop.
264 // We have a surrogate tail without
265 // leading surrogate. In NET_2_0 it
266 // uses fallback. In NET_1_1 we output
// The lone tail is written with the same three-byte layout as above.
268 if (bytes
+ 2 >= end_bytes
)
270 bytes
[0] = (byte) (0xE0 | (ch
>> 12));
271 bytes
[1] = (byte) (0x80 | ((ch
>> 6) & 0x3F));
272 bytes
[2] = (byte) (0x80 | (ch
& 0x3F));
278 if (*chars
>= '\uDC00' && *chars
<= '\uDFFF') {
279 // We have a correct surrogate pair.
// Combine the pending surrogate start (leftOver) with the current tail into
// a single code point at or above 0x10000, then emit the four-byte form.
280 int ch
= 0x10000 + (int) *chars
- 0xDC00 + (((int) leftOver
- 0xD800) << 10);
281 if (bytes
+ 3 >= end_bytes
)
283 bytes
[0] = (byte) (0xF0 | (ch
>> 18));
284 bytes
[1] = (byte) (0x80 | ((ch
>> 12) & 0x3F));
285 bytes
[2] = (byte) (0x80 | ((ch
>> 6) & 0x3F));
286 bytes
[3] = (byte) (0x80 | (ch
& 0x3F));
290 // We have a surrogate start followed by a
291 // regular character. Technically, this is
292 // invalid, but we have to do something.
293 // We write out the surrogate start and then
294 // re-visit the current character again.
296 if (bytes
+ 2 >= end_bytes
)
298 bytes
[0] = (byte) (0xE0 | (ch
>> 12));
299 bytes
[1] = (byte) (0x80 | ((ch
>> 6) & 0x3F));
300 bytes
[2] = (byte) (0x80 | (ch
& 0x3F));
307 // Flush the left-over surrogate pair start.
308 if (leftOver
!= '\0') {
// Written only when three more bytes fit (note the inverted form of the
// capacity test compared with the checks above).
310 if (bytes
+ 2 < end_bytes
) {
311 bytes
[0] = (byte) (0xE0 | (ch
>> 12));
312 bytes
[1] = (byte) (0x80 | ((ch
>> 6) & 0x3F));
313 bytes
[2] = (byte) (0x80 | (ch
& 0x3F));
// Bytes written = current write position minus the original start of the
// buffer, which is recovered as end_bytes - bcount.
321 return (int)(bytes
- (end_bytes
- bcount
));
// NOTE(review): presumably the shared exit for the capacity checks above;
// the jump that reaches it is not visible in this listing.
323 throw new ArgumentException ("Insufficient Space", "bytes");
// Encode a range of a character array into a byte array.
// One-shot call: a fresh, empty surrogate state is supplied and the shared
// routine is asked to flush. Validation is done by InternalGetBytes.
public override int GetBytes (char[] chars, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	char pendingSurrogate = '\0';
	return InternalGetBytes (chars, charIndex, charCount,
				 bytes, byteIndex,
				 ref pendingSurrogate, true);
}
// Convenience wrappers for "GetBytes".
//
// Encodes charCount chars of "s", starting at charIndex, into "bytes"
// starting at byteIndex, and returns what the pointer-based InternalGetBytes
// reports. One-shot call: fresh surrogate state, flush requested.
// Throws ArgumentNullException ("s" / "bytes") and
// ArgumentOutOfRangeException ("charIndex" / "charCount" / "byteIndex").
public override int GetBytes (String s, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	// Validate the parameters.
	if (s == null) {
		throw new ArgumentNullException ("s");
	}
	if (bytes == null) {
		throw new ArgumentNullException ("bytes");
	}
	if (charIndex < 0 || charIndex > s.Length) {
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
	}
	if (charCount < 0 || charCount > (s.Length - charIndex)) {
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
	}
	if (byteIndex < 0 || byteIndex > bytes.Length) {
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
	}

	// Starting at the very end of the string: the range checks above
	// force charCount to 0 here, so there is nothing to encode.
	if (charIndex == s.Length)
		return 0;

	unsafe {
		fixed (char* cptr = s) {
			// Throw-away rolling state required by the ref parameter.
			char dummy = '\0';

			// No room at all in the output: pass a null buffer of
			// size 0 rather than pinning the byte array.
			if (bytes.Length == byteIndex)
				return InternalGetBytes (
					cptr + charIndex, charCount,
					null, 0, ref dummy, true);

			fixed (byte *bptr = bytes) {
				return InternalGetBytes (
					cptr + charIndex, charCount,
					bptr + byteIndex, bytes.Length - byteIndex,
					ref dummy, true);
			}
		}
	}
}
// Pointer-based GetBytes: encodes charCount chars at "chars" into the buffer
// at "bytes", which has room for byteCount bytes, as a one-shot flushing call.
// Throws ArgumentNullException ("chars" / "bytes") and
// IndexOutOfRangeException ("charCount" / "byteCount") for negative counts.
// NOTE(review): the .NET documentation specifies ArgumentOutOfRangeException
// for negative counts; the exception type is kept as-is here so existing
// callers that catch IndexOutOfRangeException keep working.
[CLSCompliant (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (charCount < 0)
		throw new IndexOutOfRangeException ("charCount");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (byteCount < 0)
		// Name the parameter that is actually out of range.
		throw new IndexOutOfRangeException ("byteCount");

	// Throw-away rolling state required by the ref parameter.
	char dummy = '\0';

	// With no output room, hand the core a null buffer of size 0.
	if (byteCount == 0)
		return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
	else
		return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
}
// Array-based entry point for char counting. It range-checks index and count
// against the array, then pins the array and forwards every other argument
// unchanged to the pointer-based overload.
// Throws ArgumentNullException ("bytes") and ArgumentOutOfRangeException
// ("index" / "count").
// NOTE(review): the guard before the first throw, and whatever sits between
// the range checks and the "fixed" statement, are not visible in this listing.
400 // Internal version of "GetCharCount" which can handle a rolling
401 // state between multiple calls to this method.
402 private unsafe static int InternalGetCharCount (
403 byte[] bytes
, int index
, int count
, uint leftOverBits
,
404 uint leftOverCount
, object provider
,
405 ref DecoderFallbackBuffer fallbackBuffer
, ref byte [] bufferArg
, bool flush
)
407 // Validate the parameters.
409 throw new ArgumentNullException ("bytes");
411 if (index
< 0 || index
> bytes
.Length
) {
412 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
414 if (count
< 0 || count
> (bytes
.Length
- index
)) {
415 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
// Pin the array so the pointer overload can walk it from bptr + index.
420 fixed (byte *bptr
= bytes
)
421 return InternalGetCharCount (bptr
+ index
, count
,
422 leftOverBits
, leftOverCount
, provider
, ref fallbackBuffer
, ref bufferArg
, flush
);
// Pointer-based char-counting core. Classifies each byte as a start byte
// (one- to six-byte forms) or a continuation byte (10xxxxxx), accumulates the
// payload bits of multi-byte sequences in leftBits, and adds the result of
// Fallback (...) to "length" for every invalid sequence it detects.
// leftOverBits / leftOverCount carry a partially decoded sequence in from a
// previous call; both are received by value here.
// NOTE(review): the declarations of index, length and ch, the length updates
// for valid characters, the updates of leftSoFar / leftSize at start bytes,
// the selectors of the overlong checks and the return statement are not
// visible in this listing -- confirm them against the full file.
425 private unsafe static int InternalGetCharCount (
426 byte* bytes
, int count
, uint leftOverBits
,
427 uint leftOverCount
, object provider
,
428 ref DecoderFallbackBuffer fallbackBuffer
, ref byte [] bufferArg
, bool flush
)
// Entered only when no partial sequence is pending: scans leading bytes and
// tests each one against 0x80.
434 if (leftOverCount
== 0) {
435 int end
= index
+ count
;
436 for (; index
< end
; index
++, count
--) {
437 if (bytes
[index
] < 0x80)
444 // Determine the number of characters that we have.
// leftOverCount packs two 4-bit fields: the low nibble is leftSoFar and the
// next nibble is leftSize. A sequence is treated as complete once leftSoFar
// reaches leftSize (see the continuation-byte handling below).
446 uint leftBits
= leftOverBits
;
447 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
448 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
450 ch
= (uint)(bytes
[index
++]);
453 // Process a UTF-8 start character.
454 if (ch
< (uint)0x0080) {
455 // Single-byte UTF-8 character.
457 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
458 // Double-byte UTF-8 character.
459 leftBits
= (ch
& (uint)0x1F);
462 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
463 // Three-byte UTF-8 character.
464 leftBits
= (ch
& (uint)0x0F);
467 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
468 // Four-byte UTF-8 character.
469 leftBits
= (ch
& (uint)0x07);
472 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
473 // Five-byte UTF-8 character.
474 leftBits
= (ch
& (uint)0x03);
477 } else if ((ch
& (uint)0xFE) == (uint)0xFC) {
478 // Six-byte UTF-8 character.
// The 0x03 mask matches the five-byte case; bit 1 is always clear for a
// byte that passes the test above, so only the lowest bit can survive.
479 leftBits
= (ch
& (uint)0x03);
483 // Invalid UTF-8 start character.
// index was already advanced past this byte, hence index - 1.
484 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- 1, 1);
487 // Process an extra byte in a multi-byte sequence.
488 if ((ch
& (uint)0xC0) == (uint)0x80) {
// Append the six payload bits of the continuation byte.
489 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
490 if (++leftSoFar
>= leftSize
) {
491 // We have a complete character now.
492 if (leftBits
< (uint)0x10000) {
493 // is it an overlong ?
494 bool overlong
= false;
// Each assignment below compares the decoded value with an upper bound;
// NOTE(review): the condition selecting which bound applies is not visible.
497 overlong
= (leftBits
<= 0x7F);
500 overlong
= (leftBits
<= 0x07FF);
503 overlong
= (leftBits
<= 0xFFFF);
506 overlong
= (leftBits
<= 0x1FFFFF);
509 overlong
= (leftBits
<= 0x03FFFFFF);
513 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- leftSoFar
, leftSoFar
);
// Values D800..DFFF (top five bits 11011) are surrogate code units.
515 else if ((leftBits
& 0xF800) == 0xD800) {
516 // UTF-8 doesn't use surrogate characters
517 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- leftSoFar
, leftSoFar
);
521 } else if (leftBits
< (uint)0x110000) {
524 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- leftSoFar
, leftSoFar
);
529 // Invalid UTF-8 sequence: clear and restart.
530 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- leftSoFar
, leftSoFar
);
537 if (flush
&& leftSize
!= 0) {
538 // We had left-over bytes that didn't make up
539 // a complete UTF-8 character sequence.
540 length
+= Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, index
- leftSoFar
, leftSoFar
);
543 // Return the final length to the caller.
// Counting variant of the decoder fallback helper. It obtains a fallback
// buffer on first use, then feeds "size" bytes starting at bytes[index] to it
// one byte at a time, adding buffer.Remaining to "ret" after each byte.
// "provider" is either a DecoderFallback (a buffer is created from it) or is
// cast to Decoder (its FallbackBuffer is used).
// buffer and bufferArg are ref parameters, so whatever is created here is
// handed back to the caller.
// NOTE(review): the test that chooses between the two provider branches, the
// declaration of "ret", the end of the loop and the return statement are not
// visible in this listing -- confirm them against the full file.
547 // for GetCharCount()
548 static unsafe int Fallback (object provider
, ref DecoderFallbackBuffer buffer
, ref byte [] bufferArg
, byte* bytes
, long index
, uint size
)
550 if (buffer
== null) {
551 DecoderFallback fb
= provider
as DecoderFallback
;
553 buffer
= fb
.CreateFallbackBuffer ();
555 buffer
= ((Decoder
) provider
).FallbackBuffer
;
// One-byte scratch array used to hand each offending byte to the buffer.
557 if (bufferArg
== null)
558 bufferArg
= new byte [1];
560 for (int i
= 0; i
< size
; i
++) {
561 bufferArg
[0] = bytes
[(int) index
+ i
];
562 buffer
.Fallback (bufferArg
, 0);
563 ret
+= buffer
.Remaining
;
// Writing variant of the decoder fallback helper. It obtains a fallback
// buffer on first use, then feeds "size" bytes starting at bytes[byteIndex]
// to it one byte at a time and copies every char the buffer produces to
// chars[charIndex], advancing charIndex (a ref parameter) as it goes.
// "provider" is either a DecoderFallback (a buffer is created from it) or is
// cast to Decoder (its FallbackBuffer is used).
// NOTE(review): no capacity check on "chars" is visible in this helper, and
// the test that chooses between the two provider branches and the end of the
// loop are not visible in this listing -- confirm against the full file.
570 static unsafe void Fallback (object provider
, ref DecoderFallbackBuffer buffer
, ref byte [] bufferArg
, byte* bytes
, long byteIndex
, uint size
,
571 char* chars
, ref int charIndex
)
573 if (buffer
== null) {
574 DecoderFallback fb
= provider
as DecoderFallback
;
576 buffer
= fb
.CreateFallbackBuffer ();
578 buffer
= ((Decoder
) provider
).FallbackBuffer
;
// One-byte scratch array used to hand each offending byte to the buffer.
580 if (bufferArg
== null)
581 bufferArg
= new byte [1];
582 for (int i
= 0; i
< size
; i
++) {
583 bufferArg
[0] = bytes
[byteIndex
+ i
];
584 buffer
.Fallback (bufferArg
, 0);
// Drain everything the fallback produced for this byte.
585 while (buffer
.Remaining
> 0)
586 chars
[charIndex
++] = buffer
.GetNextChar ();
// Count the characters produced by decoding a range of a byte array.
// One-shot call: no left-over bits or count are supplied, this encoding's
// DecoderFallback is the fallback provider, and a flush is requested.
public override int GetCharCount (byte[] bytes, int index, int count)
{
	DecoderFallbackBuffer fallbackState = null;
	byte [] scratch = null;
	return InternalGetCharCount (bytes, index, count,
				     0, 0, DecoderFallback,
				     ref fallbackState, ref scratch, true);
}
// Pointer-based variant: count the characters produced by decoding "count"
// bytes starting at "bytes". One-shot call with no carried-over state, this
// encoding's DecoderFallback as provider, and a flush requested.
[CLSCompliant (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
	DecoderFallbackBuffer fallbackState = null;
	byte [] scratch = null;
	return InternalGetCharCount (bytes, count,
				     0, 0, DecoderFallback,
				     ref fallbackState, ref scratch, true);
}
// Array-based entry point for decoding. It validates both arrays and the
// three indices/counts, then pins the arrays and forwards to the pointer-based
// overload, giving it the room left in "chars" after charIndex.
// Throws ArgumentNullException ("bytes" / "chars") and
// ArgumentOutOfRangeException ("byteIndex" / "byteCount" / "charIndex").
// NOTE(review): this listing is missing lines -- the body passes "provider",
// which does not appear in the visible parameter list, and the guards before
// the first two throws and the statement run when charIndex == chars.Length
// are not visible; confirm against the full file.
608 // Get the characters that result from decoding a byte buffer.
609 private unsafe static int InternalGetChars (
610 byte[] bytes
, int byteIndex
, int byteCount
, char[] chars
,
611 int charIndex
, ref uint leftOverBits
, ref uint leftOverCount
,
613 ref DecoderFallbackBuffer fallbackBuffer
, ref byte [] bufferArg
, bool flush
)
615 // Validate the parameters.
617 throw new ArgumentNullException ("bytes");
620 throw new ArgumentNullException ("chars");
622 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
623 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
625 if (byteCount
< 0 || byteCount
> (bytes
.Length
- byteIndex
)) {
626 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
628 if (charIndex
< 0 || charIndex
> chars
.Length
) {
629 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// Special case: no room at all in the output array.
632 if (charIndex
== chars
.Length
)
635 fixed (char* cptr
= chars
) {
// No input bytes: call the core with a null input buffer of size 0 instead
// of pinning the byte array.
636 if (byteCount
== 0 || byteIndex
== bytes
.Length
)
637 return InternalGetChars (null, 0, cptr
+ charIndex
, chars
.Length
- charIndex
, ref leftOverBits
, ref leftOverCount
, provider
, ref fallbackBuffer
, ref bufferArg
, flush
);
639 fixed (byte* bptr
= bytes
)
640 return InternalGetChars (bptr
+ byteIndex
, byteCount
, cptr
+ charIndex
, chars
.Length
- charIndex
, ref leftOverBits
, ref leftOverCount
, provider
, ref fallbackBuffer
, ref bufferArg
, flush
);
// Pointer-based decoding core. Reads byteCount bytes from "bytes" and writes
// the decoded chars to "chars", which has room for charCount chars. Returns
// the number of chars written (posn - charIndex, with charIndex starting at
// 0). Throws ArgumentException (Arg_InsufficientSpace, "chars") when a decoded
// character does not fit. Invalid input is routed through Fallback (...),
// which writes its replacement chars and advances posn.
// leftOverBits / leftOverCount are ref parameters: a partially decoded
// sequence is read from them on entry and stored back before returning.
// NOTE(review): this listing is missing lines -- the body passes "provider",
// which does not appear in the visible parameter list, and the declaration of
// ch, the updates of leftSoFar / leftSize at start bytes and the selectors of
// the overlong checks are not visible; confirm against the full file.
644 private unsafe static int InternalGetChars (
645 byte* bytes
, int byteCount
, char* chars
, int charCount
,
646 ref uint leftOverBits
, ref uint leftOverCount
,
648 ref DecoderFallbackBuffer fallbackBuffer
, ref byte [] bufferArg
, bool flush
)
650 int charIndex
= 0, byteIndex
= 0;
// length is the output capacity; posn is the next output position.
651 int length
= charCount
;
652 int posn
= charIndex
;
// Entered only when no partial sequence is pending: bytes below 0x80 are
// copied straight to the output as chars.
// NOTE(review): no comparison of posn against length is visible in this
// loop -- confirm how output overrun is prevented here.
654 if (leftOverCount
== 0) {
655 int end
= byteIndex
+ byteCount
;
656 for (; byteIndex
< end
; posn
++, byteIndex
++, byteCount
--) {
657 if (bytes
[byteIndex
] < 0x80)
658 chars
[posn
] = (char) bytes
[byteIndex
];
664 // Convert the bytes into the output buffer.
// leftOverCount packs two 4-bit fields: the low nibble is leftSoFar and the
// next nibble is leftSize (the same packing is written back at the end).
666 uint leftBits
= leftOverBits
;
667 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
668 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
670 int byteEnd
= byteIndex
+ byteCount
;
671 for(; byteIndex
< byteEnd
; byteIndex
++) {
672 // Fetch the next character from the byte buffer.
673 ch
= (uint)(bytes
[byteIndex
]);
675 // Process a UTF-8 start character.
676 if (ch
< (uint)0x0080) {
677 // Single-byte UTF-8 character.
678 if (posn
>= length
) {
679 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
681 chars
[posn
++] = (char)ch
;
682 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
683 // Double-byte UTF-8 character.
684 leftBits
= (ch
& (uint)0x1F);
687 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
688 // Three-byte UTF-8 character.
689 leftBits
= (ch
& (uint)0x0F);
692 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
693 // Four-byte UTF-8 character.
694 leftBits
= (ch
& (uint)0x07);
697 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
698 // Five-byte UTF-8 character.
699 leftBits
= (ch
& (uint)0x03);
702 } else if ((ch
& (uint)0xFE) == (uint)0xFC) {
703 // Six-byte UTF-8 character.
// The 0x03 mask matches the five-byte case; bit 1 is always clear for a
// byte that passes the test above, so only the lowest bit can survive.
704 leftBits
= (ch
& (uint)0x03);
708 // Invalid UTF-8 start character.
709 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
, 1, chars
, ref posn
);
712 // Process an extra byte in a multi-byte sequence.
713 if ((ch
& (uint)0xC0) == (uint)0x80) {
// Append the six payload bits of the continuation byte.
714 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
715 if (++leftSoFar
>= leftSize
) {
716 // We have a complete character now.
717 if (leftBits
< (uint)0x10000) {
718 // is it an overlong ?
719 bool overlong
= false;
// Each assignment below compares the decoded value with an upper bound;
// NOTE(review): the condition selecting which bound applies is not visible.
722 overlong
= (leftBits
<= 0x7F);
725 overlong
= (leftBits
<= 0x07FF);
728 overlong
= (leftBits
<= 0xFFFF);
731 overlong
= (leftBits
<= 0x1FFFFF);
734 overlong
= (leftBits
<= 0x03FFFFFF);
738 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
- leftSoFar
, leftSoFar
, chars
, ref posn
);
// Values D800..DFFF (top five bits 11011) are surrogate code units.
740 else if ((leftBits
& 0xF800) == 0xD800) {
741 // UTF-8 doesn't use surrogate characters
742 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
- leftSoFar
, leftSoFar
, chars
, ref posn
);
745 if (posn
>= length
) {
746 throw new ArgumentException
747 (_("Arg_InsufficientSpace"), "chars");
749 chars
[posn
++] = (char)leftBits
;
// Code points from 0x10000 up to 0x10FFFF need two output chars.
751 } else if (leftBits
< (uint)0x110000) {
752 if ((posn
+ 2) > length
) {
753 throw new ArgumentException
754 (_("Arg_InsufficientSpace"), "chars");
// Split the value (minus 0x10000) into a pair: the first char comes from
// the bits above the low ten, the second from the low ten bits + 0xDC00.
756 leftBits
-= (uint)0x10000;
757 chars
[posn
++] = (char)((leftBits
>> 10) +
760 (char)((leftBits
& (uint)0x3FF) + (uint)0xDC00);
762 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
- leftSoFar
, leftSoFar
, chars
, ref posn
);
767 // Invalid UTF-8 sequence: clear and restart.
768 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
- leftSoFar
, leftSoFar
, chars
, ref posn
);
774 if (flush
&& leftSize
!= 0) {
775 // We had left-over bytes that didn't make up
776 // a complete UTF-8 character sequence.
777 Fallback (provider
, ref fallbackBuffer
, ref bufferArg
, bytes
, byteIndex
- leftSoFar
, leftSoFar
, chars
, ref posn
);
// Store the rolling state back for the next call.
779 leftOverBits
= leftBits
;
780 leftOverCount
= (leftSoFar
| (leftSize
<< 4));
782 // Return the final length to the caller.
783 return posn
- charIndex
;
// Decode a range of a byte array into a char array.
// One-shot call: the rolling state starts empty, this encoding's
// DecoderFallback is the fallback provider, and a flush is requested.
// Validation is done by InternalGetChars.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
			      char[] chars, int charIndex)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	DecoderFallbackBuffer fallbackState = null;
	byte [] scratch = null;
	return InternalGetChars (bytes, byteIndex, byteCount,
				 chars, charIndex,
				 ref pendingBits, ref pendingCount,
				 DecoderFallback, ref fallbackState, ref scratch, true);
}
// Pointer-based variant: decode byteCount bytes at "bytes" into the buffer at
// "chars", which has room for charCount chars. One-shot call with empty
// rolling state, this encoding's DecoderFallback as provider, and a flush.
[CLSCompliant (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	DecoderFallbackBuffer fallbackState = null;
	byte [] scratch = null;
	return InternalGetChars (bytes, byteCount,
				 chars, charCount,
				 ref pendingBits, ref pendingCount,
				 DecoderFallback, ref fallbackState, ref scratch, true);
}
// Get the maximum number of bytes needed to encode a
// specified number of characters.
//
// Returns charCount * 4.
// Throws ArgumentOutOfRangeException when charCount is negative, or when the
// result would not fit in an int (previously the multiplication wrapped
// around silently and returned a meaningless value).
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0) {
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
	}

	// Multiply in 64 bits so that overflow can be detected.
	long maxBytes = (long) charCount * 4;
	if (maxBytes > int.MaxValue) {
		throw new ArgumentOutOfRangeException ("charCount");
	}
	return (int) maxBytes;
}
820 // Get the maximum number of characters needed to decode a
821 // specified number of bytes.
// Raises ArgumentOutOfRangeException naming "byteCount", with the
// ArgRange_NonNegative message.
// NOTE(review): only the throw is visible in this listing; the guard that
// triggers it and the value returned on success are not shown -- confirm
// against the full file.
822 public override int GetMaxCharCount (int byteCount
)
825 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
// Create a stateful UTF-8 decoder that uses this encoding's DecoderFallback.
public override Decoder GetDecoder ()
{
	Decoder decoder = new UTF8Decoder (DecoderFallback);
	return decoder;
}
// Create a stateful UTF-8 encoder, handing it this encoding's
// emitIdentifier setting.
public override Encoder GetEncoder ()
{
	Encoder encoder = new UTF8Encoder (emitIdentifier);
	return encoder;
}
842 // Get the UTF8 preamble.
// The return visible here yields the three bytes EF BB BF.
// NOTE(review): whether this return is conditional, and what is returned
// otherwise, is not visible in this listing -- confirm against the full file.
843 public override byte[] GetPreamble ()
846 return new byte [] { 0xEF, 0xBB, 0xBF }
;
// Determine if this object is equal to another.
//
// Two instances are equal when "value" is a UTF8Encoding with the same code
// page, the same emitIdentifier setting, and equal decoder and encoder
// fallbacks. Anything that is not a UTF8Encoding (including null) is unequal.
public override bool Equals (Object value)
{
	UTF8Encoding enc = (value as UTF8Encoding);

	// The "as" cast yields null for null or foreign types; those can
	// never be equal, and must not be dereferenced below.
	if (enc == null)
		return false;

	return (codePage == enc.codePage &&
		emitIdentifier == enc.emitIdentifier &&
		DecoderFallback.Equals (enc.DecoderFallback) &&
		EncoderFallback.Equals (enc.EncoderFallback));
}
// Hash code for this object; defers entirely to the base class.
public override int GetHashCode ()
{
	int hash = base.GetHashCode ();
	return hash;
}
// Byte count for a whole string. This override adds nothing of its own and
// simply defers to the base implementation.
public override int GetByteCount (string chars)
{
	int byteCount = base.GetByteCount (chars);
	return byteCount;
}
// Decode a range of a byte array into a string. This override adds nothing
// of its own and simply defers to the base implementation.
public override string GetString (byte [] bytes, int index, int count)
{
	string decoded = base.GetString (bytes, index, count);
	return decoded;
}
884 // UTF-8 decoder implementation.
// Stateful decoder: leftOverBits / leftOverCount hold a partially decoded
// sequence between calls. Both overrides pass "this" as the fallback
// provider and never request a flush.
// NOTE(review): the constructor body is not visible in this listing, so how
// the "fallback" argument is stored cannot be confirmed from here.
886 private class UTF8Decoder
: Decoder
888 private uint leftOverBits
;
889 private uint leftOverCount
;
892 public UTF8Decoder (DecoderFallback fallback
)
899 // Override inherited methods.
// Counting receives the rolling state by value, so this call leaves the
// fields untouched.
900 public override int GetCharCount (byte[] bytes
, int index
, int count
)
902 DecoderFallbackBuffer buf
= null;
903 byte [] bufferArg
= null;
904 return InternalGetCharCount (bytes
, index
, count
,
905 leftOverBits
, leftOverCount
, this, ref buf
, ref bufferArg
, false);
// Conversion passes the rolling state by reference, so the fields are
// updated for the next call.
907 public override int GetChars (byte[] bytes
, int byteIndex
,
908 int byteCount
, char[] chars
, int charIndex
)
910 DecoderFallbackBuffer buf
= null;
911 byte [] bufferArg
= null;
912 return InternalGetChars (bytes
, byteIndex
, byteCount
,
913 chars
, charIndex
, ref leftOverBits
, ref leftOverCount
, this, ref buf
, ref bufferArg
, false);
916 } // class UTF8Decoder
// UTF-8 encoder implementation.
//
// Stateful encoder: a surrogate start that has not yet met its tail is kept
// between calls. Counting and converting each have their own left-over char,
// so calling GetByteCount does not disturb the state used by GetBytes.
private class UTF8Encoder : Encoder
{
	// Pending surrogate start for the GetByteCount overloads.
	private char leftOverForCount;
	// Pending surrogate start for the GetBytes overloads.
	private char leftOverForConv;

	// Constructor.
	// The emitIdentifier argument is accepted but currently unused.
	public UTF8Encoder (bool emitIdentifier)
	{
		leftOverForCount = '\0';
		leftOverForConv = '\0';
	}

	// Override inherited methods.

	// Count the bytes needed for a range of a char array, carrying the
	// counting state across calls.
	public override int GetByteCount (char[] chars, int index,
					  int count, bool flush)
	{
		return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
	}

	// Encode a range of a char array, carrying the conversion state
	// across calls.
	public override int GetBytes (char[] chars, int charIndex,
				      int charCount, byte[] bytes, int byteIndex, bool flush)
	{
		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
	}

	// Pointer-based variant of GetByteCount.
	public unsafe override int GetByteCount (char* chars, int count, bool flush)
	{
		return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
	}

	// Pointer-based variant of GetBytes.
	public unsafe override int GetBytes (char* chars, int charCount,
					     byte* bytes, int byteCount, bool flush)
	{
		return InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
	}
} // class UTF8Encoder
964 }; // class UTF8Encoding
966 }; // namespace System.Text