2 * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
4 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
5 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
32 [MonoTODO ("Fix serialization compatibility with MS.NET")]
33 public class UTF8Encoding
: Encoding
// Code page number used by Windows to identify UTF-8; handed to the
// base class by the constructors of this type.
internal const int UTF8_CODE_PAGE = 65001;

// Value of the "encoderShouldEmitUTF8Identifier" constructor argument.
private bool emitIdentifier;

// Value of the "throwOnInvalidBytes" constructor argument; forwarded to
// the decoding helpers so they can raise on malformed input.
private bool throwOnInvalid;
// Default: no UTF-8 identifier, no exception on invalid bytes.
public UTF8Encoding ()
	: this (false, false)
{
}

// Choose whether the identifier is emitted; invalid bytes do not throw.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
	: this (encoderShouldEmitUTF8Identifier, false)
{
}

// Full constructor: records both flags and fills in the descriptive
// members exposed through the base class.
public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
	: base (UTF8_CODE_PAGE)
{
	emitIdentifier = encoderShouldEmitUTF8Identifier;
	throwOnInvalid = throwOnInvalidBytes;

	// All three names carry the same value.
	header_name = "utf-8";
	body_name = "utf-8";
	web_name = "utf-8";
	encoding_name = "Unicode (UTF-8)";

	is_browser_save = true;
	is_browser_display = true;
	is_mail_news_display = true;

	// NOTE(review): this uses UnicodeEncoding's code page constant rather
	// than UTF8_CODE_PAGE - presumably deliberate; confirm.
	windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
}
61 // Internal version of "GetByteCount" which can handle a rolling
62 // state between multiple calls to this method.
63 private static int InternalGetByteCount (char[] chars
, int index
, int count
, uint leftOver
, bool flush
)
65 // Validate the parameters.
67 throw new ArgumentNullException ("chars");
69 if (index
< 0 || index
> chars
.Length
) {
70 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
72 if (count
< 0 || count
> (chars
.Length
- index
)) {
73 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
76 // Determine the lengths of all characters.
85 } else if (ch
< '\u0800') {
87 } else if (ch
>= '\uD800' && ch
<= '\uDBFF') {
88 // This is the start of a surrogate pair.
93 } else if (ch
>= '\uDC00' && ch
<= '\uDFFF') {
94 // We have a surrogate pair.
98 // We have a surrogate start followed by a
99 // regular character. Technically, this is
100 // invalid, but we have to do something.
101 // We write out the surrogate start and then
102 // re-visit the current character again.
110 if (flush
&& pair
!= 0) {
111 // Flush the left-over surrogate pair start.
115 // Return the final length to the caller.
// Get the number of bytes needed to encode a character buffer.
//
// Stateless entry point: calls the shared counting helper with no
// left-over value (0) and with flushing enabled.
public override int GetByteCount (char[] chars, int index, int count)
{
	return InternalGetByteCount (chars, index, count, 0, true);
}
125 // Convenience wrappers for "GetByteCount".
126 public override int GetByteCount (String s
)
128 // Validate the parameters.
130 throw new ArgumentNullException ("s");
133 // Determine the lengths of all characters.
136 int count
= s
.Length
;
143 } else if (ch
< '\u0800') {
145 } else if (ch
>= '\uD800' && ch
<= '\uDBFF' && count
> 1) {
146 // This may be the start of a surrogate pair.
147 pair
= (uint)(s
[index
]);
148 if (pair
>= (uint)0xDC00 && pair
<= (uint)0xDFFF) {
161 // Return the final length to the caller.
165 // Internal version of "GetBytes" which can handle a rolling
166 // state between multiple calls to this method.
167 private static int InternalGetBytes (char[] chars
, int charIndex
,
168 int charCount
, byte[] bytes
,
169 int byteIndex
, ref uint leftOver
,
172 // Validate the parameters.
174 throw new ArgumentNullException ("chars");
177 throw new ArgumentNullException ("bytes");
179 if (charIndex
< 0 || charIndex
> chars
.Length
) {
180 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
182 if (charCount
< 0 || charCount
> (chars
.Length
- charIndex
)) {
183 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
185 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
186 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
189 // Convert the characters into bytes.
191 int length
= bytes
.Length
;
193 uint left
= leftOver
;
194 int posn
= byteIndex
;
195 while (charCount
> 0) {
196 // Fetch the next UTF-16 character pair value.
197 ch
= chars
[charIndex
++];
200 if (ch
>= '\uD800' && ch
<= '\uDBFF') {
201 // This is the start of a surrogate pair.
205 // This is a regular character.
208 } else if (ch
>= '\uDC00' && ch
<= '\uDFFF') {
209 // We have a surrogate pair.
210 pair
= ((left
- (uint)0xD800) << 10) +
211 (((uint)ch
) - (uint)0xDC00) +
215 // We have a surrogate start followed by a
216 // regular character. Technically, this is
217 // invalid, but we have to do something.
218 // We write out the surrogate start and then
219 // re-visit the current character again.
226 // Encode the character pair value.
227 if (pair
< (uint)0x0080) {
228 if (posn
>= length
) {
229 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
231 bytes
[posn
++] = (byte)pair
;
232 } else if (pair
< (uint)0x0800) {
233 if ((posn
+ 2) > length
) {
234 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
236 bytes
[posn
++] = (byte)(0xC0 | (pair
>> 6));
237 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
238 } else if (pair
< (uint)0x10000) {
239 if ((posn
+ 3) > length
) {
240 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
242 bytes
[posn
++] = (byte)(0xE0 | (pair
>> 12));
243 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
244 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
246 if ((posn
+ 4) > length
) {
247 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
249 bytes
[posn
++] = (byte)(0xF0 | (pair
>> 18));
250 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 12) & 0x3F));
251 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
252 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
255 if (flush
&& left
!= 0) {
256 // Flush the left-over surrogate pair start.
257 if ((posn
+ 3) > length
) {
258 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
260 bytes
[posn
++] = (byte)(0xE0 | (left
>> 12));
261 bytes
[posn
++] = (byte)(0x80 | ((left
>> 6) & 0x3F));
262 bytes
[posn
++] = (byte)(0x80 | (left
& 0x3F));
267 // Return the final count to the caller.
268 return posn
- byteIndex
;
// Get the bytes that result from encoding a character buffer.
//
// Stateless entry point: starts with no left-over value and asks the
// shared helper to flush. Returns whatever InternalGetBytes returns.
public override int GetBytes (char[] chars, int charIndex, int charCount,
	byte[] bytes, int byteIndex)
{
	// InternalGetBytes takes its left-over state by reference, so a
	// throw-away local initialised to zero is supplied for this call.
	uint leftOver = 0;
	return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
}
279 // Convenience wrappers for "GetBytes".
280 public override int GetBytes (String s
, int charIndex
, int charCount
,
281 byte[] bytes
, int byteIndex
)
283 // Validate the parameters.
285 throw new ArgumentNullException ("s");
288 throw new ArgumentNullException ("bytes");
290 if (charIndex
< 0 || charIndex
> s
.Length
) {
291 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
293 if (charCount
< 0 || charCount
> (s
.Length
- charIndex
)) {
294 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
296 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
297 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
300 // Convert the characters into bytes.
302 int length
= bytes
.Length
;
304 int posn
= byteIndex
;
305 while (charCount
> 0) {
306 // Fetch the next UTF-16 character pair value.
308 if (ch
>= '\uD800' && ch
<= '\uDBFF' && charCount
> 1) {
309 // This may be the start of a surrogate pair.
310 pair
= (uint)(s
[charIndex
]);
311 if (pair
>= (uint)0xDC00 && pair
<= (uint)0xDFFF) {
312 pair
= (pair
- (uint)0xDC00) +
313 ((((uint)ch
) - (uint)0xD800) << 10) +
325 // Encode the character pair value.
326 if (pair
< (uint)0x0080) {
327 if (posn
>= length
) {
328 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
330 bytes
[posn
++] = (byte)pair
;
331 } else if (pair
< (uint)0x0800) {
332 if ((posn
+ 2) > length
) {
333 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
335 bytes
[posn
++] = (byte)(0xC0 | (pair
>> 6));
336 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
337 } else if (pair
< (uint)0x10000) {
338 if ((posn
+ 3) > length
) {
339 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
341 bytes
[posn
++] = (byte)(0xE0 | (pair
>> 12));
342 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
343 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
345 if ((posn
+ 4) > length
) {
346 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
348 bytes
[posn
++] = (byte)(0xF0 | (pair
>> 18));
349 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 12) & 0x3F));
350 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
351 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
355 // Return the final count to the caller.
356 return posn
- byteIndex
;
359 // Internal version of "GetCharCount" which can handle a rolling
360 // state between multiple calls to this method.
361 private static int InternalGetCharCount (byte[] bytes
, int index
, int count
,
364 bool throwOnInvalid
, bool flush
)
366 // Validate the parameters.
368 throw new ArgumentNullException ("bytes");
370 if (index
< 0 || index
> bytes
.Length
) {
371 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
373 if (count
< 0 || count
> (bytes
.Length
- index
)) {
374 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
377 // Determine the number of characters that we have.
380 uint leftBits
= leftOverBits
;
381 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
382 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
384 ch
= (uint)(bytes
[index
++]);
387 // Process a UTF-8 start character.
388 if (ch
< (uint)0x0080) {
389 // Single-byte UTF-8 character.
391 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
392 // Double-byte UTF-8 character.
393 leftBits
= (ch
& (uint)0x1F);
396 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
397 // Three-byte UTF-8 character.
398 leftBits
= (ch
& (uint)0x0F);
401 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
402 // Four-byte UTF-8 character.
403 leftBits
= (ch
& (uint)0x07);
406 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
407 // Five-byte UTF-8 character.
408 leftBits
= (ch
& (uint)0x03);
411 } else if ((ch
& (uint)0xFC) == (uint)0xFC) {
412 // Six-byte UTF-8 character.
413 leftBits
= (ch
& (uint)0x03);
417 // Invalid UTF-8 start character.
418 if (throwOnInvalid
) {
419 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
423 // Process an extra byte in a multi-byte sequence.
424 if ((ch
& (uint)0xC0) == (uint)0x80) {
425 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
426 if (++leftSoFar
>= leftSize
) {
427 // We have a complete character now.
428 if (leftBits
< (uint)0x10000) {
429 if (leftBits
!= (uint)0xFEFF) {
430 // is it an overlong ?
431 bool overlong
= false;
434 overlong
= (leftBits
<= 0x7F);
437 overlong
= (leftBits
<= 0x07FF);
440 overlong
= (leftBits
<= 0xFFFF);
443 overlong
= (leftBits
<= 0x1FFFFF);
446 overlong
= (leftBits
<= 0x03FFFFFF);
451 throw new ArgumentException (_("Overlong"), leftBits
.ToString ());
456 } else if (leftBits
< (uint)0x110000) {
458 } else if (throwOnInvalid
) {
459 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
464 // Invalid UTF-8 sequence: clear and restart.
465 if (throwOnInvalid
) {
466 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
474 if (flush
&& leftSize
!= 0 && throwOnInvalid
) {
475 // We had left-over bytes that didn't make up
476 // a complete UTF-8 character sequence.
477 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
480 // Return the final length to the caller.
// Get the number of characters needed to decode a byte buffer.
//
// Stateless entry point: both left-over arguments are zero, the
// instance's throwOnInvalid flag is forwarded, and flushing is enabled.
public override int GetCharCount (byte[] bytes, int index, int count)
{
	return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
}
// Internal version of "GetChars" which can handle a rolling
// state between multiple calls to this method.
491 private static int InternalGetChars (byte[] bytes
, int byteIndex
,
492 int byteCount
, char[] chars
,
493 int charIndex
, ref uint leftOverBits
,
494 ref uint leftOverCount
,
495 bool throwOnInvalid
, bool flush
)
497 // Validate the parameters.
499 throw new ArgumentNullException ("bytes");
502 throw new ArgumentNullException ("chars");
504 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
505 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
507 if (byteCount
< 0 || byteCount
> (bytes
.Length
- byteIndex
)) {
508 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
510 if (charIndex
< 0 || charIndex
> chars
.Length
) {
511 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
514 if (charIndex
== chars
.Length
)
517 // Convert the bytes into the output buffer.
519 int length
= chars
.Length
;
520 int posn
= charIndex
;
521 uint leftBits
= leftOverBits
;
522 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
523 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
524 while (byteCount
> 0) {
525 // Fetch the next character from the byte buffer.
526 ch
= (uint)(bytes
[byteIndex
++]);
529 // Process a UTF-8 start character.
530 if (ch
< (uint)0x0080) {
531 // Single-byte UTF-8 character.
532 if (posn
>= length
) {
533 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
535 chars
[posn
++] = (char)ch
;
536 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
537 // Double-byte UTF-8 character.
538 leftBits
= (ch
& (uint)0x1F);
541 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
542 // Three-byte UTF-8 character.
543 leftBits
= (ch
& (uint)0x0F);
546 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
547 // Four-byte UTF-8 character.
548 leftBits
= (ch
& (uint)0x07);
551 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
552 // Five-byte UTF-8 character.
553 leftBits
= (ch
& (uint)0x03);
556 } else if ((ch
& (uint)0xFC) == (uint)0xFC) {
557 // Six-byte UTF-8 character.
558 leftBits
= (ch
& (uint)0x03);
562 // Invalid UTF-8 start character.
563 if (throwOnInvalid
) {
564 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
568 // Process an extra byte in a multi-byte sequence.
569 if ((ch
& (uint)0xC0) == (uint)0x80) {
570 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
571 if (++leftSoFar
>= leftSize
) {
572 // We have a complete character now.
573 if (leftBits
< (uint)0x10000) {
574 if (leftBits
!= (uint)0xFEFF) {
575 // is it an overlong ?
576 bool overlong
= false;
579 overlong
= (leftBits
<= 0x7F);
582 overlong
= (leftBits
<= 0x07FF);
585 overlong
= (leftBits
<= 0xFFFF);
588 overlong
= (leftBits
<= 0x1FFFFF);
591 overlong
= (leftBits
<= 0x03FFFFFF);
596 throw new ArgumentException (_("Overlong"), leftBits
.ToString ());
599 if (posn
>= length
) {
600 throw new ArgumentException
601 (_("Arg_InsufficientSpace"), "chars");
603 chars
[posn
++] = (char)leftBits
;
606 } else if (leftBits
< (uint)0x110000) {
607 if ((posn
+ 2) > length
) {
608 throw new ArgumentException
609 (_("Arg_InsufficientSpace"), "chars");
611 leftBits
-= (uint)0x10000;
612 chars
[posn
++] = (char)((leftBits
>> 10) +
615 (char)((leftBits
& (uint)0x3FF) + (uint)0xDC00);
616 } else if (throwOnInvalid
) {
617 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
622 // Invalid UTF-8 sequence: clear and restart.
623 if (throwOnInvalid
) {
624 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
632 if (flush
&& leftSize
!= 0 && throwOnInvalid
) {
633 // We had left-over bytes that didn't make up
634 // a complete UTF-8 character sequence.
635 throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
637 leftOverBits
= leftBits
;
638 leftOverCount
= (leftSoFar
| (leftSize
<< 4));
640 // Return the final length to the caller.
641 return posn
- charIndex
;
// Get the characters that result from decoding a byte buffer.
//
// Stateless entry point: the decoding state handed to the shared helper
// starts at zero and is discarded afterwards; flushing is enabled.
public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
	char[] chars, int charIndex)
{
	uint pendingBits = 0;
	uint pendingCount = 0;
	return InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
		ref pendingBits, ref pendingCount, throwOnInvalid, true);
}
// Get the maximum number of bytes needed to encode a
// specified number of characters.
//
// Returns charCount * 4.
// Throws ArgumentOutOfRangeException when "charCount" is negative, or
// when the product would not fit in an int.
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0) {
		throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
	}
	// Without this guard the multiplication below would wrap around
	// silently and hand the caller a meaningless (possibly negative) size.
	if (charCount > int.MaxValue / 4) {
		throw new ArgumentOutOfRangeException ("charCount");
	}
	return charCount * 4;
}
664 // Get the maximum number of characters needed to decode a
665 // specified number of bytes.
666 public override int GetMaxCharCount (int byteCount
)
669 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
// Get a UTF8-specific decoder that is attached to this instance.
// The new decoder receives this instance's throwOnInvalid setting.
public override Decoder GetDecoder ()
{
	return new UTF8Decoder (throwOnInvalid);
}
// Get a UTF8-specific encoder that is attached to this instance.
// The new encoder receives this instance's emitIdentifier setting.
public override Encoder GetEncoder ()
{
	return new UTF8Encoder (emitIdentifier);
}
686 // Get the UTF8 preamble.
687 public override byte[] GetPreamble ()
689 if (emitIdentifier
) {
690 byte[] pre
= new byte [3];
// Determine if this object is equal to another.
//
// Returns true only when "value" is a UTF8Encoding whose code page,
// emitIdentifier flag and throwOnInvalid flag all match this instance.
public override bool Equals (Object value)
{
	UTF8Encoding enc = (value as UTF8Encoding);
	if (enc == null) {
		// "as" yields null for a null reference or for any other type;
		// neither can be equal, and enc must not be dereferenced.
		return false;
	}
	return (codePage == enc.codePage &&
		emitIdentifier == enc.emitIdentifier &&
		throwOnInvalid == enc.throwOnInvalid);
}
// Get the hash code for this object.
// Delegates entirely to the base class implementation.
public override int GetHashCode ()
{
	return base.GetHashCode ();
}
// Encode a whole string into a newly allocated byte array.
//
// The array is sized with GetByteCount (s) and then filled by the
// five-argument GetBytes overload starting at offset zero.
// Throws ArgumentNullException when "s" is null.
public override byte [] GetBytes (String s)
{
	if (s == null) {
		throw new ArgumentNullException ("s");
	}

	int length = GetByteCount (s);
	byte [] bytes = new byte [length];
	GetBytes (s, 0, s.Length, bytes, 0);
	return bytes;
}
// UTF-8 decoder implementation.
//
// Holds the state of an unfinished sequence between calls and forwards
// to the enclosing class's static helpers with flushing disabled.
private class UTF8Decoder : Decoder
{
	private bool throwOnInvalid;
	private uint leftOverBits;
	private uint leftOverCount;

	// Constructor. The two left-over fields keep their default of zero.
	public UTF8Decoder (bool throwOnInvalid)
	{
		this.throwOnInvalid = throwOnInvalid;
	}

	// Override inherited methods.

	// The state is passed by value here, so counting does not modify it.
	public override int GetCharCount (byte[] bytes, int index, int count)
	{
		return InternalGetCharCount (bytes, index, count,
			leftOverBits, leftOverCount, throwOnInvalid, false);
	}

	// The state is passed by reference so the helper can update it for
	// the next call.
	public override int GetChars (byte[] bytes, int byteIndex,
		int byteCount, char[] chars, int charIndex)
	{
		return InternalGetChars (bytes, byteIndex, byteCount,
			chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
	}

} // class UTF8Decoder
// UTF-8 encoder implementation.
//
// Holds a left-over value between calls and forwards to the enclosing
// class's static helpers, passing the caller's "flush" flag through.
private class UTF8Encoder : Encoder
{
	private bool emitIdentifier;
	private uint leftOver;

	// Constructor. "leftOver" keeps its default value of zero.
	public UTF8Encoder (bool emitIdentifier)
	{
		this.emitIdentifier = emitIdentifier;
	}

	// Override inherited methods.

	// The left-over value is passed by value, so counting does not
	// modify the encoder's state.
	public override int GetByteCount (char[] chars, int index,
		int count, bool flush)
	{
		return InternalGetByteCount (chars, index, count, leftOver, flush);
	}

	// NOTE: despite its name, "byteCount" is forwarded as the byteIndex
	// argument of InternalGetBytes, i.e. it is the offset in "bytes" at
	// which writing starts. The name is kept so the signature is unchanged.
	public override int GetBytes (char[] chars, int charIndex,
		int charCount, byte[] bytes, int byteCount, bool flush)
	{
		int result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
		// The flag is cleared after every call to GetBytes.
		emitIdentifier = false;
		return result;
	}

} // class UTF8Encoder
792 }; // class UTF8Encoding
794 }; // namespace System.Text