2009-12-02 Jb Evain <jbevain@novell.com>
[mcs.git] / class / corlib / System.Text / UTF8Encoding.cs
blob1b722349dc96b4630dff60c61a22c13277c0179c
1 /*
2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
26 namespace System.Text
29 using System;
30 using System.Runtime.InteropServices;
32 [Serializable]
33 [MonoLimitation ("Serialization format not compatible with .NET")]
34 [MonoLimitation ("EncoderFallback is not handled")]
35 [ComVisible (true)]
36 public class UTF8Encoding : Encoding
38 // Magic number used by Windows for UTF-8.
39 internal const int UTF8_CODE_PAGE = 65001;
41 // Internal state.
42 private bool emitIdentifier;
// Constructors.

// Default: no UTF-8 identifier is emitted and invalid bytes do not throw.
public UTF8Encoding ()
	: this (false, false)
{
}

// Chooses whether the UTF-8 identifier is emitted; invalid bytes do not throw.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
	: this (encoderShouldEmitUTF8Identifier, false)
{
}

// Full constructor.
//   encoderShouldEmitUTF8Identifier - stored in "emitIdentifier".
//   throwOnInvalidBytes - selects the exception decoder fallback instead
//                         of the standard safe one.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
	: base (UTF8_CODE_PAGE)
{
	emitIdentifier = encoderShouldEmitUTF8Identifier;

	// Only the decoder fallback is chosen here; the encoder fallback is passed as null.
	if (!throwOnInvalidBytes)
		SetFallbackInternal (null, DecoderFallback.StandardSafeFallback);
	else
		SetFallbackInternal (null, DecoderFallback.ExceptionFallback);

	// Descriptive names.
	web_name = body_name = header_name = "utf-8";
	encoding_name = "Unicode (UTF-8)";

	// Usage flags.
	is_browser_save = true;
	is_browser_display = true;
	is_mail_news_display = true;
	is_mail_news_save = true;

	windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
67 #region GetByteCount()
// Array front end for the byte counter. Validates the range, then hands
// the pinned characters to the pointer overload. "leftOver" carries an
// unpaired surrogate start from one call to the next.
private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
{
	// Argument checks.
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (index < 0 || index > chars.Length)
		throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
	if (count < 0 || count > (chars.Length - index))
		throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

	if (index != chars.Length) {
		unsafe {
			fixed (char* cptr = chars) {
				return InternalGetByteCount (cptr + index, count, ref leftOver, flush);
			}
		}
	}

	// Nothing to scan: only a pending surrogate start can contribute,
	// and only when the caller asked for a flush.
	if (!flush || leftOver == '\0')
		return 0;

	leftOver = '\0';
	return 3;
}
// Counts the UTF-8 bytes needed for "count" characters starting at "chars".
//   leftOver - a surrogate start left pending by a previous call ('\0' if
//              none); updated to the pending value on return.
//   flush    - when true, a surrogate start still pending at the end is
//              counted as three bytes and cleared.
private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
{
	int total = 0;
	char* limit = chars + count;

	while (chars < limit) {
		char current = *chars;

		if (leftOver != '\0') {
			if (current >= '\uDC00' && current <= '\uDFFF') {
				// The pending start and this tail form a pair.
				total += 4;
				chars++;
			} else {
				// Unpaired start: count it as three bytes and look at
				// the current character again on the next pass.
				total += 3;
			}
			leftOver = '\0';
			continue;
		}

		if (current < '\x80') {
			total += 1;
		} else if (current < '\x800') {
			total += 2;
		} else if (current < '\uD800' || current > '\uDFFF') {
			total += 3;
		} else if (current > '\uDBFF') {
			// Surrogate tail with no start in front of it.
			total += 3;
		} else if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
			// Complete pair inside the buffer; step over the tail as well.
			total += 4;
			chars++;
		} else {
			// Surrogate start whose tail is not (yet) available.
			leftOver = current;
		}
		chars++;
	}

	if (flush && leftOver != '\0') {
		total += 3;
		leftOver = '\0';
	}

	return total;
}
// Returns the number of bytes needed to encode chars[index .. index+count).
// Runs the counter with no carried-over state and with flushing enabled.
public override int GetByteCount (char[] chars, int index, int count)
{
	char pending = '\0';
	int total = InternalGetByteCount (chars, index, count, ref pending, true);
	return total;
}
// Pointer overload: returns the number of bytes needed to encode "count"
// characters starting at "chars". Throws ArgumentNullException when
// "chars" is null; a zero count yields zero.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetByteCount (char* chars, int count)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");

	if (count != 0) {
		char pending = '\0';
		return InternalGetByteCount (chars, count, ref pending, true);
	}
	return 0;
}
179 #endregion
181 #region GetBytes()
// Array front end for the encoder. Validates both ranges, pins the
// arrays and delegates to the pointer overload. "leftOver" carries an
// unpaired surrogate start from one call to the next.
private static int InternalGetBytes (char[] chars, int charIndex,
				     int charCount, byte[] bytes,
				     int byteIndex, ref char leftOver,
				     bool flush)
{
	// Argument checks.
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (charIndex < 0 || charIndex > chars.Length)
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
	if (charCount < 0 || charCount > (chars.Length - charIndex))
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
	if (byteIndex < 0 || byteIndex > bytes.Length)
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

	if (charIndex == chars.Length) {
		// FIXME: use EncoderFallback.
		// NOTE(review): a pending surrogate start is discarded here
		// without writing anything, while the matching branch of the
		// byte counter reports three bytes for the same situation.
		if (flush && leftOver != '\0')
			leftOver = '\0';
		return 0;
	}

	int room = bytes.Length - byteIndex;

	unsafe {
		fixed (char* cptr = chars) {
			// No output space at all: pass a null destination of size zero.
			if (room == 0)
				return InternalGetBytes (cptr + charIndex, charCount, null, 0, ref leftOver, flush);

			fixed (byte* bptr = bytes) {
				return InternalGetBytes (cptr + charIndex, charCount, bptr + byteIndex, room, ref leftOver, flush);
			}
		}
	}
}
// Encodes "count" characters starting at "chars" into at most "bcount"
// bytes starting at "bytes" and returns the number of bytes written.
//   leftOver - a surrogate start left pending by a previous call ('\0' if
//              none); updated to the pending value on return.
//   flush    - when true, a surrogate start still pending at the end is
//              written as a three-byte sequence and cleared.
// Throws ArgumentException when the output does not have enough room.
private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, ref char leftOver, bool flush)
{
	byte* first = bytes;
	byte* limit = bytes + bcount;
	char* stop = chars + count;

	while (chars < stop) {
		int code;

		if (leftOver != '\0') {
			if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
				// Pending start plus this tail: combine into one code point.
				code = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
				if (limit - bytes < 4)
					goto fail_no_space;
				bytes [0] = (byte) (0xF0 | (code >> 18));
				bytes [1] = (byte) (0x80 | ((code >> 12) & 0x3F));
				bytes [2] = (byte) (0x80 | ((code >> 6) & 0x3F));
				bytes [3] = (byte) (0x80 | (code & 0x3F));
				bytes += 4;
				chars++;
			} else {
				// Unpaired start: write it out on its own and look at
				// the current character again on the next pass.
				code = leftOver;
				if (limit - bytes < 3)
					goto fail_no_space;
				bytes [0] = (byte) (0xE0 | (code >> 12));
				bytes [1] = (byte) (0x80 | ((code >> 6) & 0x3F));
				bytes [2] = (byte) (0x80 | (code & 0x3F));
				bytes += 3;
			}
			leftOver = '\0';
			continue;
		}

		code = *chars;
		if (code < '\x80') {
			if (limit - bytes < 1)
				goto fail_no_space;
			*bytes++ = (byte) code;
		} else if (code < '\x800') {
			if (limit - bytes < 2)
				goto fail_no_space;
			bytes [0] = (byte) (0xC0 | (code >> 6));
			bytes [1] = (byte) (0x80 | (code & 0x3F));
			bytes += 2;
		} else if (code >= '\uD800' && code <= '\uDBFF') {
			// Surrogate start: hold it until the next character is seen.
			leftOver = *chars;
		} else {
			// Any other character, including a surrogate tail with no
			// start in front of it, becomes a three-byte sequence.
			if (limit - bytes < 3)
				goto fail_no_space;
			bytes [0] = (byte) (0xE0 | (code >> 12));
			bytes [1] = (byte) (0x80 | ((code >> 6) & 0x3F));
			bytes [2] = (byte) (0x80 | (code & 0x3F));
			bytes += 3;
		}
		chars++;
	}

	if (flush && leftOver != '\0') {
		// Write out the surrogate start that never found its tail.
		int pendingCode = leftOver;
		if (limit - bytes < 3)
			goto fail_no_space;
		bytes [0] = (byte) (0xE0 | (pendingCode >> 12));
		bytes [1] = (byte) (0x80 | ((pendingCode >> 6) & 0x3F));
		bytes [2] = (byte) (0x80 | (pendingCode & 0x3F));
		bytes += 3;
		leftOver = '\0';
	}

	return (int) (bytes - first);

fail_no_space:
	throw new ArgumentException ("Insufficient Space", "bytes");
}
// Encodes chars[charIndex .. charIndex+charCount) into "bytes" starting at
// "byteIndex" and returns the number of bytes written. Runs the encoder
// with no carried-over state and with flushing enabled.
public override int GetBytes (char[] chars, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	char pending = '\0';
	int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pending, true);
	return written;
}
// String overload of "GetBytes": encodes s[charIndex .. charIndex+charCount)
// into "bytes" starting at "byteIndex" and returns the number of bytes written.
public override int GetBytes (String s, int charIndex, int charCount,
			      byte[] bytes, int byteIndex)
{
	// Argument checks.
	if (s == null)
		throw new ArgumentNullException ("s");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (charIndex < 0 || charIndex > s.Length)
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
	if (charCount < 0 || charCount > (s.Length - charIndex))
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
	if (byteIndex < 0 || byteIndex > bytes.Length)
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

	// Starting at the very end of the string: nothing to encode.
	if (charIndex == s.Length)
		return 0;

	int room = bytes.Length - byteIndex;

	unsafe {
		fixed (char* cptr = s) {
			char pending = '\0';

			// No output space at all: pass a null destination of size zero.
			if (room == 0)
				return InternalGetBytes (cptr + charIndex, charCount, null, 0, ref pending, true);

			fixed (byte* bptr = bytes) {
				return InternalGetBytes (cptr + charIndex, charCount, bptr + byteIndex, room, ref pending, true);
			}
		}
	}
}
// Pointer overload of "GetBytes": encodes "charCount" characters starting
// at "chars" into at most "byteCount" bytes starting at "bytes" and
// returns the number of bytes written.
// Throws ArgumentNullException for a null pointer and
// IndexOutOfRangeException for a negative count.
// NOTE(review): the .NET documentation specifies ArgumentOutOfRangeException
// for negative counts; the existing exception type is kept here so callers
// that catch it are not affected - confirm before changing.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
{
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (charCount < 0)
		throw new IndexOutOfRangeException ("charCount");
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (byteCount < 0)
		// Report the argument that is actually wrong (was "charCount").
		throw new IndexOutOfRangeException ("byteCount");

	if (charCount == 0)
		return 0;

	char dummy = '\0';
	if (byteCount == 0)
		// No output space: pass a null destination of size zero.
		return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
	else
		return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
}
398 #endregion
// Array front end for the character counter. Validates the range, pins
// the array and delegates to the pointer overload. The left-over values
// describe a partially decoded sequence from an earlier call.
private unsafe static int InternalGetCharCount (
	byte[] bytes, int index, int count, uint leftOverBits,
	uint leftOverCount, object provider,
	ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
	// Argument checks.
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (index < 0 || index > bytes.Length)
		throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
	if (count < 0 || count > (bytes.Length - index))
		throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

	if (count != 0) {
		fixed (byte* bptr = bytes) {
			return InternalGetCharCount (bptr + index, count,
				leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
		}
	}

	// An empty range contributes no characters.
	return 0;
}
// Counts the characters produced by decoding "count" bytes at "bytes".
//   leftOverBits  - code point bits gathered so far by a previous call.
//   leftOverCount - packed state: low nibble = bytes seen so far,
//                   next nibble = expected sequence length (0 = idle).
//   provider      - a DecoderFallback or a Decoder, used to obtain the
//                   fallback buffer on first use.
//   flush         - when true, an incomplete trailing sequence is passed
//                   to the fallback.
private unsafe static int InternalGetCharCount (
	byte* bytes, int count, uint leftOverBits,
	uint leftOverCount, object provider,
	ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
	int pos = 0;
	int total = 0;

	// Fast path: with no carried-over state, every leading ASCII byte
	// is exactly one character.
	if (leftOverCount == 0) {
		while (count > 0 && bytes [pos] < 0x80) {
			total++;
			pos++;
			count--;
		}
	}

	// Unpack the carried-over state.
	uint bits = leftOverBits;
	uint seen = leftOverCount & 0x0F;
	uint expected = (leftOverCount >> 4) & 0x0F;

	while (count > 0) {
		uint current = (uint) bytes [pos++];
		--count;

		if (expected == 0) {
			// Lead byte: the high bits give the sequence length.
			if (current < 0x80) {
				++total;
			} else if ((current & 0xE0) == 0xC0) {
				bits = current & 0x1F;
				seen = 1;
				expected = 2;
			} else if ((current & 0xF0) == 0xE0) {
				bits = current & 0x0F;
				seen = 1;
				expected = 3;
			} else if ((current & 0xF8) == 0xF0) {
				bits = current & 0x07;
				seen = 1;
				expected = 4;
			} else if ((current & 0xFC) == 0xF8) {
				bits = current & 0x03;
				seen = 1;
				expected = 5;
			} else if ((current & 0xFE) == 0xFC) {
				bits = current & 0x03;
				seen = 1;
				expected = 6;
			} else {
				// Not a valid lead byte.
				total += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - 1, 1);
			}
			continue;
		}

		if ((current & 0xC0) != 0x80) {
			// The sequence was cut short: report it, then look at
			// this byte again as a possible lead byte.
			total += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
			expected = 0;
			--pos;
			++count;
			continue;
		}

		// Continuation byte: add its six payload bits.
		bits = (bits << 6) | (current & 0x3F);
		if (++seen < expected)
			continue;

		// The sequence is complete; classify the decoded value.
		if (bits >= 0x110000) {
			total += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
		} else if (bits >= 0x10000) {
			// Needs a surrogate pair.
			total += 2;
		} else {
			// A value is overlong when a shorter sequence could hold it.
			bool overlong =
				(expected == 2 && bits <= 0x7F) ||
				(expected == 3 && bits <= 0x07FF) ||
				(expected == 4 && bits <= 0xFFFF) ||
				(expected == 5 && bits <= 0x1FFFFF) ||
				(expected == 6 && bits <= 0x03FFFFFF);

			// Surrogate code points are rejected as well.
			if (overlong || (bits & 0xF800) == 0xD800)
				total += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);
			else
				++total;
		}
		expected = 0;
	}

	// Bytes left over that never completed a sequence.
	if (flush && expected != 0)
		total += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, pos - seen, seen);

	return total;
}
// Fallback helper for GetCharCount(): feeds "size" bytes, one at a time
// and starting at bytes[index], to the decoder fallback buffer and returns
// how many characters the fallback would produce in total.
// "buffer" and "bufferArg" are created on first use and handed back to the
// caller through their ref parameters.
static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
{
	if (buffer == null) {
		// The provider is either a DecoderFallback or a Decoder.
		DecoderFallback source = provider as DecoderFallback;
		buffer = source != null
			? source.CreateFallbackBuffer ()
			: ((Decoder) provider).FallbackBuffer;
	}

	// One-byte scratch array passed to the fallback buffer.
	if (bufferArg == null)
		bufferArg = new byte [1];

	int produced = 0;
	int start = (int) index;
	for (int offset = 0; offset < size; offset++) {
		bufferArg [0] = bytes [start + offset];
		buffer.Fallback (bufferArg, 0);
		produced += buffer.Remaining;
		buffer.Reset ();
	}
	return produced;
}
// Fallback helper for GetChars(): feeds "size" bytes, one at a time and
// starting at bytes[byteIndex], to the decoder fallback buffer and copies
// every character it produces to chars[charIndex], advancing "charIndex".
// "buffer" and "bufferArg" are created on first use and handed back to the
// caller through their ref parameters.
// NOTE(review): the writes to "chars" are not checked against any capacity.
static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
	char* chars, ref int charIndex)
{
	if (buffer == null) {
		// The provider is either a DecoderFallback or a Decoder.
		DecoderFallback source = provider as DecoderFallback;
		buffer = source != null
			? source.CreateFallbackBuffer ()
			: ((Decoder) provider).FallbackBuffer;
	}

	// One-byte scratch array passed to the fallback buffer.
	if (bufferArg == null)
		bufferArg = new byte [1];

	for (int offset = 0; offset < size; offset++) {
		bufferArg [0] = bytes [byteIndex + offset];
		buffer.Fallback (bufferArg, 0);

		// Drain whatever the fallback produced for this byte.
		while (buffer.Remaining > 0) {
			chars [charIndex] = buffer.GetNextChar ();
			charIndex++;
		}
		buffer.Reset ();
	}
}
// Returns the number of characters produced by decoding
// bytes[index .. index+count). Starts with no carried-over state, uses
// this encoding's decoder fallback and flushes at the end.
public override int GetCharCount (byte[] bytes, int index, int count)
{
	byte [] scratch = null;
	DecoderFallbackBuffer fallbackBuffer = null;
	return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
// Pointer overload: returns the number of characters produced by decoding
// "count" bytes starting at "bytes". Starts with no carried-over state,
// uses this encoding's decoder fallback and flushes at the end.
// NOTE(review): unlike the other pointer overloads, no argument checks
// are performed here.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetCharCount (byte* bytes, int count)
{
	byte [] scratch = null;
	DecoderFallbackBuffer fallbackBuffer = null;
	return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
}
// Array front end for the decoder. Validates both ranges, pins the arrays
// and delegates to the pointer overload. The left-over values describe a
// partially decoded sequence and are updated for the next call.
private unsafe static int InternalGetChars (
	byte[] bytes, int byteIndex, int byteCount, char[] chars,
	int charIndex, ref uint leftOverBits, ref uint leftOverCount,
	object provider,
	ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
	// Argument checks.
	if (bytes == null)
		throw new ArgumentNullException ("bytes");
	if (chars == null)
		throw new ArgumentNullException ("chars");
	if (byteIndex < 0 || byteIndex > bytes.Length)
		throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
	if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
		throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
	if (charIndex < 0 || charIndex > chars.Length)
		throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

	// No room in the output at all: nothing is decoded.
	if (charIndex == chars.Length)
		return 0;

	int room = chars.Length - charIndex;
	bool noInput = byteCount == 0 || byteIndex == bytes.Length;

	fixed (char* cptr = chars) {
		// Without input the pointer overload is still called (with a
		// null source) so that pending state can be flushed.
		if (noInput)
			return InternalGetChars (null, 0, cptr + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);

		fixed (byte* bptr = bytes) {
			return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
		}
	}
}
// Decodes "byteCount" bytes at "bytes" into at most "charCount" characters
// at "chars" and returns the number of characters written.
//   leftOverBits  - code point bits gathered so far; updated on return.
//   leftOverCount - packed state: low nibble = bytes seen so far,
//                   next nibble = expected sequence length (0 = idle);
//                   updated on return.
//   provider      - a DecoderFallback or a Decoder, used to obtain the
//                   fallback buffer on first use.
//   flush         - when true, an incomplete trailing sequence is passed
//                   to the fallback.
// Throws ArgumentException when "chars" is too small.
// NOTE(review): characters written by the Fallback helper are not checked
// against "charCount"; that would need a change to the helper's signature.
private unsafe static int InternalGetChars (
	byte* bytes, int byteCount, char* chars, int charCount,
	ref uint leftOverBits, ref uint leftOverCount,
	object provider,
	ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
{
	int charIndex = 0, byteIndex = 0;
	int length = charCount;
	int posn = charIndex;

	// Fast path: with no carried-over state, copy the leading run of
	// ASCII bytes directly.
	if (leftOverCount == 0) {
		int end = byteIndex + byteCount;
		for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
			if (bytes [byteIndex] >= 0x80)
				break;
			// Fix: the output capacity was not checked here, so a long
			// ASCII run could be written past the end of "chars".
			if (posn >= length) {
				throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
			}
			chars [posn] = (char) bytes [byteIndex];
		}
	}

	// Convert the bytes into the output buffer.
	uint ch;
	uint leftBits = leftOverBits;
	uint leftSoFar = (leftOverCount & (uint)0x0F);
	uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

	int byteEnd = byteIndex + byteCount;
	for (; byteIndex < byteEnd; byteIndex++) {
		// Fetch the next character from the byte buffer.
		ch = (uint)(bytes[byteIndex]);
		if (leftSize == 0) {
			// Process a UTF-8 start character.
			if (ch < (uint)0x0080) {
				// Single-byte UTF-8 character.
				if (posn >= length) {
					throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
				}
				chars[posn++] = (char)ch;
			} else if ((ch & (uint)0xE0) == (uint)0xC0) {
				// Double-byte UTF-8 character.
				leftBits = (ch & (uint)0x1F);
				leftSoFar = 1;
				leftSize = 2;
			} else if ((ch & (uint)0xF0) == (uint)0xE0) {
				// Three-byte UTF-8 character.
				leftBits = (ch & (uint)0x0F);
				leftSoFar = 1;
				leftSize = 3;
			} else if ((ch & (uint)0xF8) == (uint)0xF0) {
				// Four-byte UTF-8 character.
				leftBits = (ch & (uint)0x07);
				leftSoFar = 1;
				leftSize = 4;
			} else if ((ch & (uint)0xFC) == (uint)0xF8) {
				// Five-byte UTF-8 character.
				leftBits = (ch & (uint)0x03);
				leftSoFar = 1;
				leftSize = 5;
			} else if ((ch & (uint)0xFE) == (uint)0xFC) {
				// Six-byte UTF-8 character.
				leftBits = (ch & (uint)0x03);
				leftSoFar = 1;
				leftSize = 6;
			} else {
				// Invalid UTF-8 start character.
				Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
			}
		} else {
			// Process an extra byte in a multi-byte sequence.
			if ((ch & (uint)0xC0) == (uint)0x80) {
				leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
				if (++leftSoFar >= leftSize) {
					// We have a complete character now.
					//
					// Fix: "byteIndex" still points at the last byte of
					// the sequence and "leftSoFar" already counts it, so
					// the sequence starts at byteIndex - leftSoFar + 1.
					// The old offset (without the + 1) began one byte
					// too early and could read in front of the buffer.
					if (leftBits < (uint)0x10000) {
						// is it an overlong ?
						bool overlong = false;
						switch (leftSize) {
						case 2:
							overlong = (leftBits <= 0x7F);
							break;
						case 3:
							overlong = (leftBits <= 0x07FF);
							break;
						case 4:
							overlong = (leftBits <= 0xFFFF);
							break;
						case 5:
							overlong = (leftBits <= 0x1FFFFF);
							break;
						case 6:
							overlong = (leftBits <= 0x03FFFFFF);
							break;
						}
						if (overlong) {
							Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
						}
						else if ((leftBits & 0xF800) == 0xD800) {
							// UTF-8 doesn't use surrogate characters
							Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
						}
						else {
							if (posn >= length) {
								throw new ArgumentException
									(_("Arg_InsufficientSpace"), "chars");
							}
							chars[posn++] = (char)leftBits;
						}
					} else if (leftBits < (uint)0x110000) {
						// Outside the BMP: emit a surrogate pair.
						if ((posn + 2) > length) {
							throw new ArgumentException
								(_("Arg_InsufficientSpace"), "chars");
						}
						leftBits -= (uint)0x10000;
						chars[posn++] = (char)((leftBits >> 10) +
									(uint)0xD800);
						chars[posn++] =
							(char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
					} else {
						Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar + 1, leftSoFar, chars, ref posn);
					}
					leftSize = 0;
				}
			} else {
				// Invalid UTF-8 sequence: clear and restart.
				// Here "byteIndex" points at the offending byte, which is
				// not part of the sequence, so no + 1 is needed. The byte
				// is looked at again on the next pass.
				Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
				leftSize = 0;
				--byteIndex;
			}
		}
	}
	if (flush && leftSize != 0) {
		// We had left-over bytes that didn't make up
		// a complete UTF-8 character sequence.
		Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
	}

	// Save the decoding state for the next call.
	leftOverBits = leftBits;
	leftOverCount = (leftSoFar | (leftSize << 4));

	// Return the final length to the caller.
	return posn - charIndex;
}
// Decodes bytes[byteIndex .. byteIndex+byteCount) into "chars" starting at
// "charIndex" and returns the number of characters written. Starts with no
// carried-over state, uses this encoding's decoder fallback and flushes
// at the end.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
			      char[] chars, int charIndex)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	byte [] scratch = null;
	DecoderFallbackBuffer fallbackBuffer = null;

	return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
		ref pendingBits, ref pendingCount, DecoderFallback,
		ref fallbackBuffer, ref scratch, true);
}
// Pointer overload: decodes "byteCount" bytes at "bytes" into at most
// "charCount" characters at "chars" and returns the number of characters
// written. Starts with no carried-over state, uses this encoding's decoder
// fallback and flushes at the end.
[CLSCompliant (false)]
[ComVisible (false)]
public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	byte [] scratch = null;
	DecoderFallbackBuffer fallbackBuffer = null;

	return InternalGetChars (bytes, byteCount, chars, charCount,
		ref pendingBits, ref pendingCount, DecoderFallback,
		ref fallbackBuffer, ref scratch, true);
}
// Get the maximum number of bytes needed to encode a
// specified number of characters (four bytes per character).
// Throws ArgumentOutOfRangeException when "charCount" is negative or
// when the result does not fit in an Int32.
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0) {
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
	}

	// Multiply in 64 bits: "charCount * 4" wraps around silently for
	// large inputs and would hand the caller a bogus (even negative) size.
	long maxBytes = (long) charCount * 4;
	if (maxBytes > Int32.MaxValue) {
		throw new ArgumentOutOfRangeException ("charCount", "The resulting byte count exceeds Int32.MaxValue.");
	}
	return (int) maxBytes;
}
// Returns the maximum number of characters that decoding "byteCount"
// bytes can produce, which is reported as "byteCount" itself.
// Throws ArgumentOutOfRangeException when "byteCount" is negative.
public override int GetMaxCharCount (int byteCount)
{
	if (byteCount >= 0)
		return byteCount;

	throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
}
// Creates a new UTF-8 decoder that uses this encoding's decoder fallback.
public override Decoder GetDecoder ()
{
	Decoder decoder = new UTF8Decoder (DecoderFallback);
	return decoder;
}
// Creates a new UTF-8 encoder, passing along this encoding's
// "emitIdentifier" setting.
public override Encoder GetEncoder ()
{
	Encoder encoder = new UTF8Encoder (emitIdentifier);
	return encoder;
}
// Returns the UTF-8 preamble: the bytes EF BB BF when this encoding was
// created to emit the identifier, otherwise an empty array. A new array
// is returned on every call.
public override byte[] GetPreamble ()
{
	if (!emitIdentifier)
		return new byte [0];

	return new byte [] { 0xEF, 0xBB, 0xBF };
}
// Two UTF8Encoding objects are equal when they share the code page, the
// identifier setting and both fallbacks. Anything that is not a
// UTF8Encoding (including null) is never equal.
public override bool Equals (Object value)
{
	UTF8Encoding other = value as UTF8Encoding;
	if (other == null)
		return false;

	return codePage == other.codePage
		&& emitIdentifier == other.emitIdentifier
		&& DecoderFallback.Equals (other.DecoderFallback)
		&& EncoderFallback.Equals (other.EncoderFallback);
}
// The hash code is whatever the base class computes.
public override int GetHashCode ()
{
	int hash = base.GetHashCode ();
	return hash;
}
// String overload of "GetByteCount"; simply forwards to the base class.
public override int GetByteCount (string chars)
{
	// hmm, does this override make any sense?
	int total = base.GetByteCount (chars);
	return total;
}
// Decodes bytes[index .. index+count) into a string; simply forwards to
// the base class.
[ComVisible (false)]
public override string GetString (byte [] bytes, int index, int count)
{
	// hmm, does this override make any sense?
	string decoded = base.GetString (bytes, index, count);
	return decoded;
}
// Stateful UTF-8 decoder. Keeps the bits and byte counts of a sequence
// that was split across calls so the next call can continue it.
[Serializable]
private class UTF8Decoder : Decoder
{
	// Code point bits gathered so far for an unfinished sequence.
	private uint leftOverBits = 0;
	// Packed counters for the unfinished sequence, in the format used
	// by InternalGetChars.
	private uint leftOverCount = 0;

	// Creates a decoder with no pending state that uses "fallback".
	public UTF8Decoder (DecoderFallback fallback)
	{
		Fallback = fallback;
	}

	// Counts the characters the given bytes would decode to, taking the
	// pending state into account without changing or flushing it.
	public override int GetCharCount (byte[] bytes, int index, int count)
	{
		byte [] scratch = null;
		DecoderFallbackBuffer fallbackBuffer = null;
		return InternalGetCharCount (bytes, index, count,
			leftOverBits, leftOverCount, this, ref fallbackBuffer, ref scratch, false);
	}

	// Decodes the given bytes, continuing from and updating the pending
	// state; nothing is flushed.
	public override int GetChars (byte[] bytes, int byteIndex,
				      int byteCount, char[] chars, int charIndex)
	{
		byte [] scratch = null;
		DecoderFallbackBuffer fallbackBuffer = null;
		return InternalGetChars (bytes, byteIndex, byteCount,
			chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref fallbackBuffer, ref scratch, false);
	}

} // class UTF8Decoder
// Stateful UTF-8 encoder. Remembers a surrogate start that ended a buffer
// so the next call can pair it with its tail. Counting and converting keep
// separate pending characters.
[Serializable]
private class UTF8Encoder : Encoder
{
	// Pending surrogate start for the GetByteCount calls.
	private char leftOverForCount = '\0';
	// Pending surrogate start for the GetBytes calls.
	private char leftOverForConv = '\0';

	// Creates an encoder with no pending state. The "emitIdentifier"
	// argument is currently not used.
	public UTF8Encoder (bool emitIdentifier)
	{
	}

	// Counts the bytes needed for chars[index .. index+count).
	public override int GetByteCount (char[] chars, int index,
					  int count, bool flush)
	{
		return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
	}

	// Encodes chars[charIndex .. charIndex+charCount) into "bytes"
	// starting at "byteIndex"; returns the number of bytes written.
	public override int GetBytes (char[] chars, int charIndex,
				      int charCount, byte[] bytes, int byteIndex, bool flush)
	{
		return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
	}

	// Pointer overload of GetByteCount.
	public unsafe override int GetByteCount (char* chars, int count, bool flush)
	{
		return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
	}

	// Pointer overload of GetBytes.
	public unsafe override int GetBytes (char* chars, int charCount,
					     byte* bytes, int byteCount, bool flush)
	{
		return InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
	}
} // class UTF8Encoder
964 }; // class UTF8Encoding
966 }; // namespace System.Text