2 * Mono.Unix/UnixEncoding.cs
5 * Jonathan Pryor (jonpryor@vt.edu)
7 * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
8 * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
9 * Copyright (C) 2005 Jonathan Pryor
11 * Permission is hereby granted, free of charge, to any person obtaining
12 * a copy of this software and associated documentation files (the "Software"),
13 * to deal in the Software without restriction, including without limitation
14 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 * and/or sell copies of the Software, and to permit persons to whom the
16 * Software is furnished to do so, subject to the following conditions:
18 * The above copyright notice and this permission notice shall be included
19 * in all copies or substantial portions of the Software.
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
25 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
26 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 * OTHER DEALINGS IN THE SOFTWARE.
// Encoding subclass. From the code visible in this file: bytes that cannot
// be decoded are emitted as EscapeByte followed by the raw byte value (see
// CopyRaw), and when encoding, the char that follows an EscapeByte is
// written out as a single byte.
37 public class UnixEncoding
: Encoding
// Shared instance, created by the static field initializer.
39 public static readonly Encoding Instance
= new UnixEncoding ();
// Marker char (U+0000) that precedes an escaped raw byte in decoded text.
41 public static readonly char EscapeByte
= '\u0000';
// Parameterless constructor.
// NOTE(review): the constructor body is not visible in this excerpt.
44 public UnixEncoding ()
48 // Internal version of "GetByteCount" which can handle a rolling
49 // state between multiple calls to this method.
//
// Parameters:
//   chars    - source characters; must not be null.
//   index    - first char to examine; must satisfy 0 <= index <= chars.Length.
//   count    - number of chars to examine; must fit in chars after index.
//   leftOver - state carried over from a previous call (UnixEncoder passes
//              its leftOver field; the one-shot wrapper passes 0).
//   flush    - when true, a pending surrogate start is also accounted for
//              at the end (see the "flush && pair != 0" test below).
// Throws ArgumentNullException / ArgumentOutOfRangeException for invalid
// arguments.
// NOTE(review): the statements that add to the running length in each branch
// are not visible in this excerpt, so the exact per-character byte counts
// are not documented here.
50 private static int InternalGetByteCount (char[] chars
, int index
, int count
, uint leftOver
, bool flush
)
52 // Validate the parameters.
54 throw new ArgumentNullException ("chars");
56 if (index
< 0 || index
> chars
.Length
) {
57 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
59 if (count
< 0 || count
> (chars
.Length
- index
)) {
60 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
63 // Determine the lengths of all characters.
// The branches below classify each char: EscapeByte with at least one more
// char remaining, below U+0080, below U+0800, high surrogate
// (U+D800..U+DBFF), low surrogate (U+DC00..U+DFFF).
70 if (ch
== EscapeByte
&& count
> 1) {
74 } else if (ch
< '\u0080') {
76 } else if (ch
< '\u0800') {
78 } else if (ch
>= '\uD800' && ch
<= '\uDBFF') {
79 // This is the start of a surrogate pair.
84 } else if (ch
>= '\uDC00' && ch
<= '\uDFFF') {
85 // We have a surrogate pair.
89 // We have a surrogate start followed by a
90 // regular character. Technically, this is
91 // invalid, but we have to do something.
92 // We write out the surrogate start and then
93 // re-visit the current character again.
101 if (flush
&& pair
!= 0) {
102 // Flush the left-over surrogate pair start.
106 // Return the final length to the caller.
110 // Get the number of bytes needed to encode a character buffer.
// One-shot form: delegates to InternalGetByteCount with no carried-over
// state (leftOver = 0) and flush = true. Argument validation is done by
// InternalGetByteCount.
111 public override int GetByteCount (char[] chars
, int index
, int count
)
113 return InternalGetByteCount (chars
, index
, count
, 0, true);
116 // Convenience wrappers for "GetByteCount".
// String overload: examines all s.Length chars of s. Throws
// ArgumentNullException("s") (the guarding condition is not visible in this
// excerpt, presumably a null check - confirm).
// Unlike the char[] version, a high surrogate is only treated as a possible
// pair start when at least one more char remains (count > 1); the next char
// is then peeked via s[index] and tested for the low-surrogate range.
// NOTE(review): the statements that add to the running length are not
// visible in this excerpt.
117 public override int GetByteCount (String s
)
119 // Validate the parameters.
121 throw new ArgumentNullException ("s");
124 // Determine the lengths of all characters.
127 int count
= s
.Length
;
// EscapeByte is only special when another char follows it (count > 1).
132 if (ch
== EscapeByte
&& count
> 1) {
136 } else if (ch
< '\u0080') {
138 } else if (ch
< '\u0800') {
140 } else if (ch
>= '\uD800' && ch
<= '\uDBFF' && count
> 1) {
141 // This may be the start of a surrogate pair.
142 pair
= (uint)(s
[index
]);
// Only a following char in U+DC00..U+DFFF completes the pair.
143 if (pair
>= (uint)0xDC00 && pair
<= (uint)0xDFFF) {
156 // Return the final length to the caller.
160 // Internal version of "GetBytes" which can handle a rolling
161 // state between multiple calls to this method.
//
// Encodes charCount chars from chars[charIndex..] into bytes[byteIndex..]
// and returns the number of bytes written (posn - byteIndex).
//
// Parameters:
//   chars, charIndex, charCount - source range; validated below.
//   bytes, byteIndex            - destination buffer and start offset.
//   leftOver                    - by-ref state; read into "left" on entry.
//   flush                       - referenced below; its declaration line is
//                                 not visible in this excerpt.
// Throws ArgumentNullException / ArgumentOutOfRangeException for invalid
// arguments and ArgumentException("bytes") when the output does not fit.
// NOTE(review): whether "left" is stored back into leftOver before returning
// is not visible in this excerpt.
162 private static int InternalGetBytes (char[] chars
, int charIndex
,
163 int charCount
, byte[] bytes
,
164 int byteIndex
, ref uint leftOver
,
167 // Validate the parameters.
169 throw new ArgumentNullException ("chars");
172 throw new ArgumentNullException ("bytes");
174 if (charIndex
< 0 || charIndex
> chars
.Length
) {
175 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
177 if (charCount
< 0 || charCount
> (chars
.Length
- charIndex
)) {
178 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
180 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
181 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
184 // Convert the characters into bytes.
// "length" is the end of the whole destination array, not byteIndex plus
// some count: output may use everything from byteIndex to bytes.Length.
186 int length
= bytes
.Length
;
188 uint left
= leftOver
;
189 int posn
= byteIndex
;
190 while (charCount
> 0) {
191 // Fetch the next UTF-16 character pair value.
192 ch
= chars
[charIndex
++];
195 if (ch
>= '\uD800' && ch
<= '\uDBFF') {
196 // This is the start of a surrogate pair.
199 } else if (ch
== EscapeByte
) {
// Escaped raw byte: make sure one output byte fits, then consume the char
// after the escape marker (if any remains) and write it as a single byte.
200 if (posn
>= length
) {
201 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
203 if (--charCount
>= 0) {
204 bytes
[posn
++] = (byte) chars
[charIndex
++];
208 // This is a regular character.
211 } else if (ch
>= '\uDC00' && ch
<= '\uDFFF') {
212 // We have a surrogate pair.
// Combine the pending high surrogate ("left") with this low surrogate.
// NOTE(review): the final term of this sum is not visible in this excerpt.
213 pair
= ((left
- (uint)0xD800) << 10) +
214 (((uint)ch
) - (uint)0xDC00) +
218 // We have a surrogate start followed by a
219 // regular character. Technically, this is
220 // invalid, but we have to do something.
221 // We write out the surrogate start and then
222 // re-visit the current character again.
229 // Encode the character pair value.
// Below 0x80: one byte, the value itself.
230 if (pair
< (uint)0x0080) {
231 if (posn
>= length
) {
232 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
234 bytes
[posn
++] = (byte)pair
;
// Below 0x800: two bytes (0xC0 lead + one 0x80 continuation).
235 } else if (pair
< (uint)0x0800) {
236 if ((posn
+ 2) > length
) {
237 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
239 bytes
[posn
++] = (byte)(0xC0 | (pair
>> 6));
240 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
// Below 0x10000: three bytes (0xE0 lead + two continuations).
241 } else if (pair
< (uint)0x10000) {
242 if ((posn
+ 3) > length
) {
243 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
245 bytes
[posn
++] = (byte)(0xE0 | (pair
>> 12));
246 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
247 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
// Four-byte form (0xF0 lead + three continuations).
249 if ((posn
+ 4) > length
) {
250 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
252 bytes
[posn
++] = (byte)(0xF0 | (pair
>> 18));
253 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 12) & 0x3F));
254 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
255 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
// On flush, a pending unpaired surrogate start is written out as a
// three-byte sequence of its own.
258 if (flush
&& left
!= 0) {
259 // Flush the left-over surrogate pair start.
260 if ((posn
+ 3) > length
) {
261 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
263 bytes
[posn
++] = (byte)(0xE0 | (left
>> 12));
264 bytes
[posn
++] = (byte)(0x80 | ((left
>> 6) & 0x3F));
265 bytes
[posn
++] = (byte)(0x80 | (left
& 0x3F));
270 // Return the final count to the caller.
271 return posn
- byteIndex
;
274 // Get the bytes that result from encoding a character buffer.
// One-shot form: delegates to InternalGetBytes with flush = true and
// returns its result (the number of bytes written).
// NOTE(review): the declaration of the "leftOver" variable passed by ref is
// not visible in this excerpt - presumably a local initialized to 0; confirm.
275 public override int GetBytes (char[] chars
, int charIndex
, int charCount
,
276 byte[] bytes
, int byteIndex
)
279 return InternalGetBytes (chars
, charIndex
, charCount
, bytes
, byteIndex
, ref leftOver
, true);
282 // Convenience wrappers for "GetBytes".
// String overload: encodes charCount chars of s starting at charIndex into
// bytes[byteIndex..] and returns the number of bytes written
// (posn - byteIndex). No state is carried between calls: a high surrogate
// is paired only by peeking at the next char of s.
// Throws ArgumentNullException / ArgumentOutOfRangeException for invalid
// arguments and ArgumentException("bytes") when the output does not fit.
283 public override int GetBytes (String s
, int charIndex
, int charCount
,
284 byte[] bytes
, int byteIndex
)
286 // Validate the parameters.
288 throw new ArgumentNullException ("s");
291 throw new ArgumentNullException ("bytes");
293 if (charIndex
< 0 || charIndex
> s
.Length
) {
294 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
296 if (charCount
< 0 || charCount
> (s
.Length
- charIndex
)) {
297 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
299 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
300 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
303 // Convert the characters into bytes.
// "length" is the end of the whole destination array.
305 int length
= bytes
.Length
;
307 int posn
= byteIndex
;
308 while (charCount
> 0) {
309 // Fetch the next UTF-16 character pair value.
// NOTE(review): the statement that reads "ch" from s is not visible in this
// excerpt.
311 if (ch
>= '\uD800' && ch
<= '\uDBFF' && charCount
> 1) {
312 // This may be the start of a surrogate pair.
313 pair
= (uint)(s
[charIndex
]);
314 if (pair
>= (uint)0xDC00 && pair
<= (uint)0xDFFF) {
// Combine low and high surrogate.
// NOTE(review): the final term of this sum is not visible in this excerpt.
315 pair
= (pair
- (uint)0xDC00) +
316 ((((uint)ch
) - (uint)0xD800) << 10) +
// EscapeByte is only special when another char follows it.
323 } else if (ch
== EscapeByte
&& charCount
> 1) {
324 if (posn
>= length
) {
325 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
// Write the char after the escape marker as a single raw byte.
328 if (charCount
>= 0) {
329 bytes
[posn
++] = (byte) s
[charIndex
++];
337 // Encode the character pair value.
// Below 0x80: one byte, the value itself.
338 if (pair
< (uint)0x0080) {
339 if (posn
>= length
) {
340 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
342 bytes
[posn
++] = (byte)pair
;
// Below 0x800: two bytes.
343 } else if (pair
< (uint)0x0800) {
344 if ((posn
+ 2) > length
) {
345 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
347 bytes
[posn
++] = (byte)(0xC0 | (pair
>> 6));
348 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
// Below 0x10000: three bytes.
349 } else if (pair
< (uint)0x10000) {
350 if ((posn
+ 3) > length
) {
351 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
353 bytes
[posn
++] = (byte)(0xE0 | (pair
>> 12));
354 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
355 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
// Four-byte form.
357 if ((posn
+ 4) > length
) {
358 throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
360 bytes
[posn
++] = (byte)(0xF0 | (pair
>> 18));
361 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 12) & 0x3F));
362 bytes
[posn
++] = (byte)(0x80 | ((pair
>> 6) & 0x3F));
363 bytes
[posn
++] = (byte)(0x80 | (pair
& 0x3F));
367 // Return the final count to the caller.
368 return posn
- byteIndex
;
371 // Internal version of "GetCharCount" which can handle a rolling
372 // state between multiple calls to this method.
//
// Counting counterpart of InternalGetChars: walks count bytes starting at
// bytes[index] and accumulates the number of chars decoding would produce.
//
// Parameters:
//   bytes, index, count - source range; validated below.
//   throwOnInvalid      - selects the invalid-input branches; every throw in
//                         those branches is commented out, and the visible
//                         code adds next_raw * 2 to the length instead.
//   flush               - when true, a trailing incomplete sequence is
//                         accounted for at the end.
// NOTE(review): the body reads leftOverBits and leftOverCount, and callers
// pass two values between count and throwOnInvalid, but those two parameter
// declarations are not visible in this excerpt.
// leftOverCount packs two 4-bit fields: bits 0-3 = continuation bytes seen
// so far (leftSoFar), bits 4-7 = total sequence size (leftSize).
373 private static int InternalGetCharCount (byte[] bytes
, int index
, int count
,
376 bool throwOnInvalid
, bool flush
)
378 // Validate the parameters.
380 throw new ArgumentNullException ("bytes");
382 if (index
< 0 || index
> bytes
.Length
) {
383 throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
385 if (count
< 0 || count
> (bytes
.Length
- index
)) {
386 throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
389 // Determine the number of characters that we have.
393 uint leftBits
= leftOverBits
;
394 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
395 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
397 ch
= (uint)(bytes
[index
++]);
401 // Process a UTF-8 start character.
// The lead byte's high bits select the sequence length; the remaining low
// bits seed leftBits with the top bits of the code point.
402 if (ch
< (uint)0x0080) {
403 // Single-byte UTF-8 character.
406 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
407 // Double-byte UTF-8 character.
408 leftBits
= (ch
& (uint)0x1F);
411 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
412 // Three-byte UTF-8 character.
413 leftBits
= (ch
& (uint)0x0F);
416 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
417 // Four-byte UTF-8 character.
418 leftBits
= (ch
& (uint)0x07);
421 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
422 // Five-byte UTF-8 character.
423 leftBits
= (ch
& (uint)0x03);
426 } else if ((ch
& (uint)0xFE) == (uint)0xFC) {
427 // Six-byte UTF-8 character.
// The 0x03 mask yields the same value as 0x01 here: the test above
// guarantees bit 1 of ch is clear.
428 leftBits
= (ch
& (uint)0x03);
432 // Invalid UTF-8 start character.
433 if (throwOnInvalid
) {
434 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Each buffered raw byte counts as two chars; this matches CopyRaw, which
// writes EscapeByte plus the byte value for every raw byte.
436 length
+= next_raw
*2;
440 // Process an extra byte in a multi-byte sequence.
441 if ((ch
& (uint)0xC0) == (uint)0x80) {
// Append the six payload bits of this continuation byte.
442 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
443 if (++leftSoFar
>= leftSize
) {
444 // We have a complete character now.
445 if (leftBits
< (uint)0x10000) {
446 // is it an overlong ?
447 bool overlong
= false;
// Each test below compares leftBits against an upper bound.
// NOTE(review): the selector that picks one of these tests is not visible
// in this excerpt.
450 overlong
= (leftBits
<= 0x7F);
453 overlong
= (leftBits
<= 0x07FF);
456 overlong
= (leftBits
<= 0xFFFF);
459 overlong
= (leftBits
<= 0x1FFFFF);
462 overlong
= (leftBits
<= 0x03FFFFFF);
466 // if (throwOnInvalid)
467 // throw new ArgumentException (_("Overlong"), leftBits.ToString ());
468 length
+= next_raw
*2;
472 } else if (leftBits
< (uint)0x110000) {
474 } else if (throwOnInvalid
) {
476 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
477 length
+= next_raw
*2;
483 // Invalid UTF-8 sequence: clear and restart.
484 if (throwOnInvalid
) {
485 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
487 // don't escape the current byte, process it normally
488 if (ch
< (uint)0x0080) {
493 length
+= next_raw
*2;
499 if (flush
&& leftSize
!= 0 && throwOnInvalid
) {
500 // We had left-over bytes that didn't make up
501 // a complete UTF-8 character sequence.
502 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
503 length
+= next_raw
* 2;
506 // Return the final length to the caller.
510 // Get the number of characters needed to decode a byte buffer.
// One-shot form: delegates to InternalGetCharCount with no carried-over
// state (both leftover arguments 0), throwOnInvalid = true and flush = true.
511 public override int GetCharCount (byte[] bytes
, int index
, int count
)
513 return InternalGetCharCount (bytes
, index
, count
, 0, 0, true, true);
516 // Get the characters that result from decoding a byte buffer.
//
// Internal worker shared by GetChars and UnixDecoder.GetChars. Decodes
// byteCount bytes from bytes[byteIndex..] as UTF-8 into chars[charIndex..]
// and returns the number of chars written (posn - charIndex).
//
// Parameters:
//   bytes, byteIndex, byteCount - source range; validated below.
//   chars, charIndex            - destination buffer and start offset.
//   leftOverBits                - by-ref: code-point bits accumulated so far
//                                 for an unfinished sequence.
//   leftOverCount               - by-ref: bits 0-3 = continuation bytes seen
//                                 so far, bits 4-7 = total sequence size.
//   throwOnInvalid              - selects the invalid-input branches; every
//                                 throw in those branches is commented out,
//                                 and the visible code escapes the raw bytes
//                                 instead (EscapeByte + byte value).
//   flush                       - when true, a trailing incomplete sequence
//                                 is escaped via CopyRaw at the end.
// Throws ArgumentNullException / ArgumentOutOfRangeException for invalid
// arguments and ArgumentException("chars") when the output does not fit.
517 private static int InternalGetChars (byte[] bytes
, int byteIndex
,
518 int byteCount
, char[] chars
,
519 int charIndex
, ref uint leftOverBits
,
520 ref uint leftOverCount
,
521 bool throwOnInvalid
, bool flush
)
523 // Validate the parameters.
525 throw new ArgumentNullException ("bytes");
528 throw new ArgumentNullException ("chars");
530 if (byteIndex
< 0 || byteIndex
> bytes
.Length
) {
531 throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
533 if (byteCount
< 0 || byteCount
> (bytes
.Length
- byteIndex
)) {
534 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
536 if (charIndex
< 0 || charIndex
> chars
.Length
) {
537 throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
// NOTE(review): the statement guarded by this test is not visible in this
// excerpt - presumably an early return when no output room exists; confirm.
540 if (charIndex
== chars
.Length
)
543 // Convert the bytes into the output buffer.
// "raw" buffers the bytes of the sequence in progress (up to 6) so they can
// be escaped verbatim if the sequence turns out to be invalid.
544 byte[] raw
= new byte[6];
547 int length
= chars
.Length
;
548 int posn
= charIndex
;
549 uint leftBits
= leftOverBits
;
550 uint leftSoFar
= (leftOverCount
& (uint)0x0F);
551 uint leftSize
= ((leftOverCount
>> 4) & (uint)0x0F);
552 while (byteCount
> 0) {
553 // Fetch the next character from the byte buffer.
554 ch
= (uint)(bytes
[byteIndex
++]);
555 raw
[next_raw
++] = (byte) ch
;
558 // Process a UTF-8 start character.
559 if (ch
< (uint)0x0080) {
560 // Single-byte UTF-8 character.
561 if (posn
>= length
) {
562 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
565 chars
[posn
++] = (char)ch
;
566 } else if ((ch
& (uint)0xE0) == (uint)0xC0) {
567 // Double-byte UTF-8 character.
568 leftBits
= (ch
& (uint)0x1F);
571 } else if ((ch
& (uint)0xF0) == (uint)0xE0) {
572 // Three-byte UTF-8 character.
573 leftBits
= (ch
& (uint)0x0F);
576 } else if ((ch
& (uint)0xF8) == (uint)0xF0) {
577 // Four-byte UTF-8 character.
578 leftBits
= (ch
& (uint)0x07);
581 } else if ((ch
& (uint)0xFC) == (uint)0xF8) {
582 // Five-byte UTF-8 character.
583 leftBits
= (ch
& (uint)0x03);
586 } else if ((ch
& (uint)0xFE) == (uint)0xFC) {
587 // Six-byte UTF-8 character.
// The 0x03 mask yields the same value as 0x01 here: the test above
// guarantees bit 1 of ch is clear.
588 leftBits
= (ch
& (uint)0x03);
592 // Invalid UTF-8 start character.
593 if (throwOnInvalid
) {
594 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
// Escape the offending byte: marker char first, then the byte value.
// NOTE(review): no bounds check for these two writes is visible in this
// excerpt - confirm one exists in the elided lines.
597 chars
[posn
++] = EscapeByte
;
598 chars
[posn
++] = (char) ch
;
601 // Process an extra byte in a multi-byte sequence.
602 if ((ch
& (uint)0xC0) == (uint)0x80) {
// Append the six payload bits of this continuation byte.
603 leftBits
= ((leftBits
<< 6) | (ch
& (uint)0x3F));
604 if (++leftSoFar
>= leftSize
) {
605 // We have a complete character now.
606 if (leftBits
< (uint)0x10000) {
607 // is it an overlong ?
608 bool overlong
= false;
// Each test below compares leftBits against an upper bound.
// NOTE(review): the selector that picks one of these tests is not visible
// in this excerpt.
611 overlong
= (leftBits
<= 0x7F);
614 overlong
= (leftBits
<= 0x07FF);
617 overlong
= (leftBits
<= 0xFFFF);
620 overlong
= (leftBits
<= 0x1FFFFF);
623 overlong
= (leftBits
<= 0x03FFFFFF);
627 // if (throwOnInvalid)
628 // throw new ArgumentException (_("Overlong"), leftBits.ToString ());
629 CopyRaw (raw
, ref next_raw
, chars
, ref posn
, length
);
632 if (posn
>= length
) {
633 throw new ArgumentException
634 (_("Arg_InsufficientSpace"), "chars");
636 chars
[posn
++] = (char)leftBits
;
// Values from 0x10000 up to (but excluding) 0x110000 need two chars
// (a surrogate pair).
638 } else if (leftBits
< (uint)0x110000) {
639 if ((posn
+ 2) > length
) {
640 throw new ArgumentException
641 (_("Arg_InsufficientSpace"), "chars");
643 leftBits
-= (uint)0x10000;
// High surrogate from the top bits, then low surrogate from the low 10 bits.
// NOTE(review): the rest of the high-surrogate expression is not visible in
// this excerpt.
644 chars
[posn
++] = (char)((leftBits
>> 10) +
647 (char)((leftBits
& (uint)0x3FF) + (uint)0xDC00);
648 } else if (throwOnInvalid
) {
650 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
651 CopyRaw (raw
, ref next_raw
, chars
, ref posn
, length
);
657 // Invalid UTF-8 sequence: clear and restart.
658 if (throwOnInvalid
) {
659 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
661 // don't escape the current byte, process it normally
662 if (ch
< (uint)0x0080) {
667 CopyRaw (raw
, ref next_raw
, chars
, ref posn
, length
);
673 if (flush
&& leftSize
!= 0 && throwOnInvalid
) {
674 // We had left-over bytes that didn't make up
675 // a complete UTF-8 character sequence.
676 // throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
677 CopyRaw (raw
, ref next_raw
, chars
, ref posn
, length
);
// Persist the decoder state for the next call.
679 leftOverBits
= leftBits
;
680 leftOverCount
= (leftSoFar
| (leftSize
<< 4));
682 // Return the final length to the caller.
683 return posn
- charIndex
;
// Writes the first next_raw bytes of "raw" into chars as escaped pairs:
// for each byte, EscapeByte followed by the byte value cast to char.
// posn is advanced by two per byte. "length" is the exclusive upper bound
// for posn; ArgumentException("chars") is thrown up front if the
// 2 * next_raw chars would not fit.
// NOTE(review): next_raw is passed by ref, but no assignment to it is
// visible in this excerpt - presumably it is reset after the loop; confirm.
686 private static void CopyRaw (byte[] raw
, ref int next_raw
, char[] chars
, ref int posn
, int length
)
688 if (posn
+(next_raw
*2) > length
)
689 throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
691 for (int i
= 0; i
< next_raw
; ++i
) {
692 chars
[posn
++] = EscapeByte
;
693 chars
[posn
++] = (char) raw
[i
];
699 // Get the characters that result from decoding a byte buffer.
// One-shot form: starts from empty decoder state (both leftover values 0),
// delegates to InternalGetChars with throwOnInvalid = true and flush = true,
// and returns its result (the number of chars written). The updated state
// written back through the ref arguments is discarded.
700 public override int GetChars (byte[] bytes
, int byteIndex
, int byteCount
,
701 char[] chars
, int charIndex
)
703 uint leftOverBits
= 0;
704 uint leftOverCount
= 0;
705 return InternalGetChars (bytes
, byteIndex
, byteCount
, chars
,
706 charIndex
, ref leftOverBits
, ref leftOverCount
, true, true);
709 // Get the maximum number of bytes needed to encode a
710 // specified number of characters.
// Returns charCount * 4 (the encoder writes at most four bytes per
// encoded value).
// Throws ArgumentOutOfRangeException("charCount"); the guarding condition is
// not visible in this excerpt (the message key suggests a negative-value
// check - confirm).
// NOTE(review): charCount * 4 is not visibly guarded against int overflow.
711 public override int GetMaxByteCount (int charCount
)
714 throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
716 return charCount
* 4;
719 // Get the maximum number of characters needed to decode a
720 // specified number of bytes.
// Throws ArgumentOutOfRangeException("byteCount"); the guarding condition is
// not visible in this excerpt (the message key suggests a negative-value
// check - confirm).
// NOTE(review): the return expression is not visible in this excerpt. Since
// CopyRaw emits two chars per undecodable byte, the bound should be at least
// 2 * byteCount - verify.
721 public override int GetMaxCharCount (int byteCount
)
724 throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
729 // Get a Unix-specific decoder.
// Returns a new UnixDecoder on every call. No reference to this encoding is
// passed to it; each decoder keeps its own leftover state between calls.
730 public override Decoder
GetDecoder ()
732 return new UnixDecoder ();
735 // Get a Unix-specific encoder.
// Returns a new UnixEncoder on every call. No reference to this encoding is
// passed to it; each encoder keeps its own leftover state between calls.
736 public override Encoder
GetEncoder ()
738 return new UnixEncoder ();
741 // Get the Unix preamble.
742 public override byte[] GetPreamble ()
747 // Determine if this object is equal to another.
// "value as UnixEncoding" yields null when value is null or is not a
// UnixEncoding.
// NOTE(review): the comparison and return statements that use "enc" are not
// visible in this excerpt.
748 public override bool Equals (Object
value)
750 UnixEncoding enc
= (value as UnixEncoding
);
759 // Get the hash code for this object.
// Defers entirely to the base class implementation.
760 public override int GetHashCode ()
762 return base.GetHashCode ();
// Encodes an entire string into a newly allocated byte array: the array is
// sized with GetByteCount (s) and then filled by the String overload of
// GetBytes starting at offset 0.
// Throws ArgumentNullException("s"); the guarding condition is not visible
// in this excerpt (presumably a null check - confirm).
// NOTE(review): the return statement is not visible in this excerpt.
765 public override byte [] GetBytes (String s
)
768 throw new ArgumentNullException ("s");
770 int length
= GetByteCount (s
);
771 byte [] bytes
= new byte [length
];
772 GetBytes (s
, 0, s
.Length
, bytes
, 0);
776 // Unix decoder implementation.
// Decoder that keeps the state of an unfinished multi-byte sequence in the
// two fields below, so a sequence may be split across successive calls.
778 private class UnixDecoder
: Decoder
// Code-point bits accumulated so far for the sequence in progress.
780 private uint leftOverBits
;
// Packed progress counters, unpacked by the Internal* workers:
// bits 0-3 = continuation bytes seen, bits 4-7 = total sequence size.
781 private uint leftOverCount
;
// NOTE(review): the constructor body is not visible in this excerpt.
784 public UnixDecoder ()
790 // Override inherited methods.
// Counts chars for the given bytes, starting from the current state. The
// state is passed by value, so counting does not modify this decoder.
// flush is false.
791 public override int GetCharCount (byte[] bytes
, int index
, int count
)
793 return InternalGetCharCount (bytes
, index
, count
,
794 leftOverBits
, leftOverCount
, true, false);
// Decodes the given bytes, starting from the current state. The state is
// passed by ref, so InternalGetChars updates this decoder for the next
// call. flush is false.
796 public override int GetChars (byte[] bytes
, int byteIndex
,
797 int byteCount
, char[] chars
, int charIndex
)
799 return InternalGetChars (bytes
, byteIndex
, byteCount
,
800 chars
, charIndex
, ref leftOverBits
, ref leftOverCount
, true, false);
805 // Unix encoder implementation.
// Encoder that keeps a "leftOver" value between calls (handed to the
// Internal* workers as their rolling state).
807 private class UnixEncoder
: Encoder
// Rolling state carried from one call to the next.
809 private uint leftOver
;
// NOTE(review): the constructor body is not visible in this excerpt.
812 public UnixEncoder ()
817 // Override inherited methods.
// Counts bytes for the given chars, starting from the current state. The
// state is passed by value, so counting does not modify this encoder.
818 public override int GetByteCount (char[] chars
, int index
,
819 int count
, bool flush
)
821 return InternalGetByteCount (chars
, index
, count
, leftOver
, flush
);
// Encodes the given chars, starting from the current state (passed by ref).
// Despite its name, "byteCount" is forwarded as InternalGetBytes' byteIndex
// argument, i.e. it is used as the start offset into "bytes", not as a
// length.
// NOTE(review): what happens to "result" after this call is not visible in
// this excerpt.
823 public override int GetBytes (char[] chars
, int charIndex
,
824 int charCount
, byte[] bytes
, int byteCount
, bool flush
)
827 result
= InternalGetBytes (chars
, charIndex
, charCount
, bytes
, byteCount
, ref leftOver
, flush
);
832 private static string _ (string arg
)