2 * CP51932.cs - Japanese EUC-JP code page.
4 * It is based on CP932.cs from Portable.NET
7 * Atsushi Enomoto <atsushi@ximian.com>
9 * Below are original (CP932.cs) copyright lines
13 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
15 * Permission is hereby granted, free of charge, to any person obtaining
16 * a copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
22 * The above copyright notice and this permission notice shall be included
23 * in all copies or substantial portions of the Software.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
29 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
30 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
31 * OTHER DEALINGS IN THE SOFTWARE.
36 Well, there appears to be no source for jis.table. Thus, it seems to have been
37 generated from text files on the Unicode home page such as
38 ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
39 However, that mapping is non-normative, and in Japan it is known to contain many problems.
41 FIXME: Some characters such as 0xFF0B (wide "plus") are missing in
46 0x00-0x1F, 0x7F : control characters
48 0xA1A1-0xFEFE : Kanji (precisely, both bytes contain only A1-FE)
49 0x8EA1-0x8EDF : half-width Katakana
50 0x8FA1A1-0x8FFEFE : Complemental Kanji
// Encoding class for the Japanese EUC-JP code page, derived from MonoEncoding.
62 public class CP51932
: MonoEncoding
64 // Magic number used by Windows for the EUC-JP code page.
65 private const int EUC_JP_CODE_PAGE
= 51932;
// Constructor: passes the EUC-JP code page number and the value 932 to the
// base class.
// NOTE(review): 932 is presumably the Windows (Shift_JIS) code page that
// MonoEncoding associates with this encoding -- confirm against MonoEncoding.
68 public CP51932 () : base (EUC_JP_CODE_PAGE
, 932)
// Get the number of bytes needed to encode a range of a character array.
// The count is produced by a freshly created CP51932Encoder bound to this
// encoding, with the encoder's final argument passed as true.
public override int GetByteCount (char [] chars, int index, int length)
{
	CP51932Encoder encoder = new CP51932Encoder (this);
	return encoder.GetByteCount (chars, index, length, true);
}
// Pointer-based byte count: creates a CP51932Encoder for this encoding and
// asks it to count the bytes for `count` characters starting at `chars`,
// passing true as the encoder's final argument.
public unsafe override int GetByteCountImpl (char* chars, int count)
{
	CP51932Encoder encoder = new CP51932Encoder (this);
	return encoder.GetByteCountImpl (chars, count, true);
}
// Pointer-based encode: creates a CP51932Encoder for this encoding and lets
// it convert `charCount` characters from `chars` into the `bytes` buffer of
// capacity `byteCount`, passing true as the encoder's final argument.
// Returns whatever the encoder reports.
public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
{
	CP51932Encoder encoder = new CP51932Encoder (this);
	return encoder.GetBytesImpl (chars, charCount, bytes, byteCount, true);
}
// Get the number of characters needed to decode a range of a byte array.
// The work is delegated to a newly created CP51932Decoder.
// NOTE(review): two return statements are visible and the second one is cut
// off; presumably they are alternatives selected by conditional compilation
// whose directives are not visible here -- confirm against the full file.
88 public override int GetCharCount (byte [] bytes
, int index
, int count
)
// First variant: passes a trailing `true` to the decoder.
91 return new CP51932Decoder ().GetCharCount (
92 bytes
, index
, count
, true);
// Second variant: argument list not visible in this excerpt.
94 return new CP51932Decoder ().GetCharCount (
// Decode a range of a byte array into `chars` starting at `charIndex`.
// The work is delegated to a newly created CP51932Decoder.
// NOTE(review): two return statements are visible; presumably they are
// alternatives selected by conditional compilation whose directives are not
// visible here -- confirm against the full file.
99 public override int GetChars (
100 byte [] bytes
, int byteIndex
, int byteCount
,
101 char [] chars
, int charIndex
)
// First variant: passes a trailing `true` to the decoder.
104 return new CP51932Decoder ().GetChars (bytes
,
105 byteIndex
, byteCount
, chars
, charIndex
, true);
// Second variant: same call without the trailing flag.
107 return new CP51932Decoder ().GetChars (bytes
,
108 byteIndex
, byteCount
, chars
, charIndex
);
112 // Get the maximum number of bytes needed to encode a
113 // specified number of characters.
// The visible result is charCount * 3.
// NOTE(review): the factor 3 presumably matches the longest EUC-JP sequence
// (the 0x8F-prefixed three-byte form listed in the file header) -- confirm.
114 public override int GetMaxByteCount(int charCount
)
// An ArgumentOutOfRangeException carrying the "ArgRange_NonNegative" message
// is raised here; the guarding condition is not visible in this excerpt
// (NOTE(review): presumably charCount < 0 -- confirm).
118 throw new ArgumentOutOfRangeException
120 Strings
.GetString("ArgRange_NonNegative"));
122 return charCount
* 3;
125 // Get the maximum number of characters needed to decode a
126 // specified number of bytes.
// NOTE(review): neither the guarding condition for the exception below nor
// the return statement is visible in this excerpt -- confirm both against
// the full file.
127 public override int GetMaxCharCount(int byteCount
)
// Raises ArgumentOutOfRangeException with the "ArgRange_NonNegative" message.
131 throw new ArgumentOutOfRangeException
133 Strings
.GetString ("ArgRange_NonNegative"));
// Create a new CP51932Encoder bound to this encoding instance.
public override Encoder GetEncoder ()
{
	Encoder encoder = new CP51932Encoder (this);
	return encoder;
}
// Create a new CP51932Decoder.
public override Decoder GetDecoder ()
{
	Decoder decoder = new CP51932Decoder ();
	return decoder;
}
// Name identifying this encoding in mail message bodies.
public override String BodyName
{
	get
	{
		return "euc-jp";
	}
}
// Human-readable display name of this encoding.
public override String EncodingName
{
	get
	{
		return "Japanese (EUC)";
	}
}
// Name identifying this encoding in mail agent headers.
public override String HeaderName
{
	get
	{
		return "euc-jp";
	}
}
165 // Determine if this encoding can be displayed in a Web browser.
// NOTE(review): the getter body is not visible in this excerpt, so the
// reported value is not documented here -- confirm against the full file.
166 public override bool IsBrowserDisplay
{
170 // Determine if this encoding can be saved from a Web browser.
// NOTE(review): the getter body is not visible in this excerpt, so the
// reported value is not documented here -- confirm against the full file.
171 public override bool IsBrowserSave
{
175 // Determine if this encoding can be displayed in a mail/news agent.
// NOTE(review): the getter body is not visible in this excerpt, so the
// reported value is not documented here -- confirm against the full file.
176 public override bool IsMailNewsDisplay
{
180 // Determine if this encoding can be saved from a mail/news agent.
// NOTE(review): the getter body is not visible in this excerpt, so the
// reported value is not documented here -- confirm against the full file.
181 public override bool IsMailNewsSave
{
// IANA-preferred Web name of this encoding.
public override String WebName
{
	get
	{
		return "euc-jp";
	}
}
190 #endif // !ECMA_COMPAT
// Encoder that converts UTF-16 characters to EUC-JP bytes; derives from
// MonoEncoder.
192 public class CP51932Encoder
: MonoEncoder
// Constructor taking the owning encoding.
// NOTE(review): the base-constructor call and body are not visible in this
// excerpt; presumably `encoding` is forwarded to MonoEncoder -- confirm.
194 public CP51932Encoder (MonoEncoding encoding
)
199 // Get the number of bytes needed to encode a character buffer.
// Classifies each character by code-point range, mirroring the ranges used
// by GetBytesImpl. Several statements of this method (declarations, the
// loop header, most per-branch length updates and the return) are not
// visible in this excerpt, so only the visible logic is described.
200 public unsafe override int GetByteCountImpl (
201 char* chars
, int count
, bool refresh
)
203 // Determine the length of the final output.
// Lookup tables hold two bytes per character; the pair is combined below
// as low byte | (high byte << 8).
207 byte [] cjkToJis
= JISConvert
.Convert
.cjkToJis
;
208 byte [] extraToJis
= JISConvert
.Convert
.extraToJis
;
211 ch
= chars
[index
++];
215 // Character maps to itself.
217 } else if (ch
< 0x0100) {
218 // Check for special Latin 1 characters that
219 // can be mapped to double-byte code points.
220 if(ch
== 0x00A2 || ch
== 0x00A3 || ch
== 0x00A7 ||
221 ch
== 0x00A8 || ch
== 0x00AC || ch
== 0x00B0 ||
222 ch
== 0x00B1 || ch
== 0x00B4 || ch
== 0x00B6 ||
223 ch
== 0x00D7 || ch
== 0x00F7)
227 } else if (ch
>= 0x0391 && ch
<= 0x0451) {
228 // Greek subset characters.
230 } else if (ch
>= 0x2010 && ch
<= 0x9FA5) {
231 // This range contains the bulk of the CJK set.
// Table index is the offset from U+2010, times two bytes per entry.
232 value = (ch
- 0x2010) * 2;
233 value = ((int) (cjkToJis
[value])) | (((int)(cjkToJis
[value + 1])) << 8);
// NOTE(review): this method tests `ch < 0xFF60` while GetBytesImpl tests
// `ch <= 0xFF60` for the same table, so U+FF60 is classified differently
// by the two methods -- confirm which bound is intended.
236 } else if(ch
>= 0xFF01 && ch
< 0xFF60) {
237 // This range contains extra characters.
// Table index is the offset from U+FF01, times two bytes per entry.
238 value = (ch
- 0xFF01) * 2;
239 value = ((int)(extraToJis
[value])) |
240 (((int)(extraToJis
[value + 1])) << 8);
243 } else if(ch
>= 0xFF60 && ch
<= 0xFFA0) {
244 ++length
; // half-width kana
248 // Return the length to the caller.
252 // Get the bytes that result from encoding a character buffer.
// Walks the input characters, maps each to an intermediate `value` through
// range-specific lookup tables, then writes one or two bytes to `bytes`.
// Returns the number of bytes written (posn - byteIndex). Throws
// ArgumentException ("Arg_InsufficientSpace", parameter "bytes") when the
// output buffer has no room. Several statements (local declarations, some
// branch headers, the fallback call's receiver) are not visible in this
// excerpt; comments below describe only what is visible.
253 public unsafe override int GetBytesImpl (
254 char* chars
, int charCount
, byte* bytes
, int byteCount
, bool refresh
)
259 // Convert the characters into their byte form.
// posn is the write cursor into `bytes`; byteLength is the capacity limit
// it is checked against.
260 int posn
= byteIndex
;
261 int byteLength
= byteCount
;
// Lookup tables hold two bytes per character; each pair is combined below
// as low byte | (high byte << 8).
264 byte[] cjkToJis
= JISConvert
.Convert
.cjkToJis
;
265 byte[] greekToJis
= JISConvert
.Convert
.greekToJis
;
266 byte[] extraToJis
= JISConvert
.Convert
.extraToJis
;
268 for (; charCount
> 0; charIndex
++, --charCount
) {
269 ch
= chars
[charIndex
];
// At least one output byte is always needed for the current character.
270 if (posn
>= byteLength
) {
271 throw new ArgumentException (Strings
.GetString ("Arg_InsufficientSpace"), "bytes");
275 // Character maps to itself.
276 bytes
[posn
++] = (byte)ch
;
278 } else if (ch
>= 0x0391 && ch
<= 0x0451) {
279 // Greek subset characters.
280 value = (ch
- 0x0391) * 2;
281 value = ((int)(greekToJis
[value])) |
282 (((int)(greekToJis
[value + 1])) << 8);
283 } else if (ch
>= 0x2010 && ch
<= 0x9FA5) {
284 // This range contains the bulk of the CJK set.
285 value = (ch
- 0x2010) * 2;
286 value = ((int) (cjkToJis
[value])) |
287 (((int)(cjkToJis
[value + 1])) << 8);
// NOTE(review): the upper bound here is inclusive (<= 0xFF60), so U+FF60
// is always handled by this branch and never reaches the half-width branch
// below; GetByteCountImpl uses `< 0xFF60` for the same table -- confirm
// which bound is intended.
288 } else if (ch
>= 0xFF01 && ch
<= 0xFF60) {
289 // This range contains extra characters,
290 // including half-width katakana.
291 value = (ch
- 0xFF01) * 2;
292 value = ((int) (extraToJis
[value])) |
293 (((int) (extraToJis
[value + 1])) << 8);
294 } else if (ch
>= 0xFF60 && ch
<= 0xFFA0) {
// Half-width range: value lands at 0x8EA0 and above, which the final
// branch emits as the byte 0x8E followed by (value - 0x8E00).
295 value = ch
- 0xFF60 + 0x8EA0;
297 // Invalid character.
// NOTE(review): the call these arguments belong to is not visible in this
// excerpt; presumably a fallback handler -- confirm against the full file.
304 chars
, ref charIndex
, ref charCount
,
305 bytes
, ref posn
, ref byteCount
);
307 bytes
[posn
++] = (byte) '?';
309 } else if (value < 0x0100) {
// Single-byte result.
310 bytes
[posn
++] = (byte) value;
// Everything below writes two bytes, so two free slots are required.
311 } else if ((posn
+ 1) >= byteLength
) {
312 throw new ArgumentException (Strings
.GetString ("Arg_InsufficientSpace"), "bytes");
313 } else if (value < 0x8000) {
314 // general 2byte glyph/kanji
// value is split into row (value / 0x5E) and column (value % 0x5E), each
// offset by 0xA1 to form the two output bytes.
316 bytes
[posn
++] = (byte) (value / 0x5E + 0xA1);
317 bytes
[posn
++] = (byte) (value % 0x5E + 0xA1);
318 //Console.WriteLine ("{0:X04}", ch);
// Remaining case: 0x8E lead byte followed by the low part of value.
324 bytes
[posn
++] = 0x8E;
325 bytes
[posn
++] = (byte) (value - 0x8E00);
329 // Return the final length to the caller.
330 return posn
- byteIndex
;
// Decoder that converts EUC-JP bytes to UTF-16 characters; derives from
// DbcsEncoding.DbcsDecoder.
334 internal class CP51932Decoder
: DbcsEncoding
.DbcsDecoder
// NOTE(review): the base-constructor call and body are not visible in this
// excerpt -- confirm against the full file.
336 public CP51932Decoder ()
// Pending lead-byte state kept separately for the two operations:
// last_count seeds `last` in GetCharCount, last_bytes seeds `last` in
// GetChars.
341 int last_count
, last_bytes
;
// Count the characters produced by decoding a byte range. Forwards to the
// four-argument overload with its final argument set to false.
public override int GetCharCount (byte [] bytes, int index, int count)
{
	int total = GetCharCount (bytes, index, count, false);
	return total;
}
// Count the characters produced by decoding `count` bytes starting at
// `index`, taking a pending lead byte from last_count into account.
// `refresh` participates in the trailing-byte check near the end.
// NOTE(review): the method's modifiers, its loop header, the length updates
// inside each branch and the return statement are not visible in this
// excerpt; comments below describe only the visible classification logic.
354 int GetCharCount (byte [] bytes
, int index
, int count
, bool refresh
)
356 CheckRange (bytes
, index
, count
);
358 // Determine the total length of the converted string.
// Conversion tables hold two bytes per entry, combined below as
// low byte | (high byte << 8).
360 byte[] table0208
= JISConvert
.Convert
.jisx0208ToUnicode
;
361 byte[] table0212
= JISConvert
.Convert
.jisx0212ToUnicode
;
// `last` carries the pending lead byte, seeded from last_count.
364 int last
= last_count
;
367 byteval
= bytes
[index
++];
370 if (byteval
== 0x8F) {
372 // Invalid second byte of a 3-byte character.
376 // First byte in a triple-byte sequence
379 } else if (byteval
<= 0x7F) {
380 // Ordinary ASCII/Latin1/Control character.
382 } else if (byteval
== 0x8E) {
383 // First byte of half-width Katakana
385 } else if (byteval
>= 0xA1 && byteval
<= 0xFE) {
386 // First byte in a double-byte sequence.
389 // Invalid first byte.
393 else if (last
== 0x8E) {
// Valid trail bytes after 0x8E are 0xA1..0xDF.
394 if (byteval
>= 0xA1 && byteval
<= 0xDF) {
// NOTE(review): the right-hand operand of this expression is not visible
// in this excerpt.
395 value = ((byteval
- 0x40) |
399 // Invalid second byte.
404 else if (last
== 0x8F) {
406 // FIXME: currently not supported yet
411 // Second byte in a double-byte sequence.
// Table index: (lead - 0xA1) * 0x5E, plus (trail - 0xA1) when the trail
// byte is in 0xA1..0xFE.
412 value = (last
- 0xA1) * 0x5E;
414 if (byteval
>= 0xA1 && byteval
<= 0xFE)
416 value += (byteval
- 0xA1);
420 // Invalid second byte.
427 value = ((int) (table0208
[value]))
428 | (((int) (table0208
[value + 1])) << 8);
430 value = ((int) (table0212
[value]))
431 | (((int) (table0212
[value + 1])) << 8);
439 // seems like .NET 2.0 adds \u30FB for insufficient
440 // byte sequence (for Japanese \u30FB makes sense).
441 if (refresh
&& last
!= 0)
446 // Return the final length to the caller.
// Decode a byte range into `chars`. Forwards to the six-argument overload
// with its final argument set to false.
// NOTE(review): the last parameter line of the signature (the one declaring
// charIndex, which the call below uses) is not visible in this excerpt.
450 public override int GetChars (byte[] bytes
, int byteIndex
,
451 int byteCount
, char[] chars
,
454 return GetChars (bytes
, byteIndex
, byteCount
, chars
, charIndex
, false);
// Decode `byteCount` bytes starting at `byteIndex` into `chars` starting at
// `charIndex`, taking a pending lead byte from last_bytes into account.
// Returns the number of characters written (posn - charIndex). Before every
// visible write the cursor is checked against chars.Length and
// `throw Insufficient ()` is executed when there is no room. Invalid input
// is replaced with '\u30FB'.
// NOTE(review): the method's modifiers, some branch headers, the updates of
// `last`/byteCount and the 0x8F (three-byte) handling are not visible in
// this excerpt; comments below describe only the visible logic.
462 int GetChars (byte[] bytes
, int byteIndex
,
463 int byteCount
, char[] chars
,
464 int charIndex
, bool refresh
)
466 CheckRange (bytes
, byteIndex
, byteCount
, chars
, charIndex
);
468 // Decode the bytes in the buffer.
// posn is the write cursor into `chars`; charLength is its upper limit.
469 int posn
= charIndex
;
470 int charLength
= chars
.Length
;
// `last` carries the pending lead byte, seeded from last_bytes.
472 int last
= last_bytes
;
// Conversion tables hold two bytes per entry, combined below as
// low byte | (high byte << 8).
473 byte[] table0208
= JISConvert
.Convert
.jisx0208ToUnicode
;
474 byte[] table0212
= JISConvert
.Convert
.jisx0212ToUnicode
;
476 while (byteCount
> 0) {
477 byteval
= bytes
[byteIndex
++];
480 if (byteval
== 0x8F) {
482 // Invalid second byte of a 3-byte character.
484 if (posn
>= charLength
)
485 throw Insufficient ();
486 chars
[posn
++] = '\u30FB';
488 // First byte in a triple-byte sequence
491 } else if (byteval
<= 0x7F) {
492 // Ordinary ASCII/Latin1/Control character.
493 if (posn
>= charLength
)
494 throw Insufficient ();
495 chars
[posn
++] = (char) byteval
;
496 } else if (byteval
== 0x8E) {
497 // First byte of half-width Katakana
499 } else if (byteval
>= 0xA1 && byteval
<= 0xFE) {
500 // First byte in a double-byte sequence.
503 // Invalid first byte.
504 if (posn
>= charLength
)
505 throw Insufficient ();
506 chars
[posn
++] = '\u30FB';
509 else if (last
== 0x8E) {
// Valid trail bytes after 0x8E are 0xA1..0xDF.
510 if (byteval
>= 0xA1 && byteval
<= 0xDF) {
// NOTE(review): the right-hand operand of this expression is not visible
// in this excerpt.
511 value = ((byteval
- 0x40) |
513 if (posn
>= charLength
)
514 throw Insufficient ();
515 chars
[posn
++] = (char) value;
517 // Invalid second byte.
518 if (posn
>= charLength
)
519 throw Insufficient ();
520 chars
[posn
++] = '\u30FB';
524 else if (last
== 0x8F) {
526 // FIXME: currently not supported yet
531 // Second byte in a double-byte sequence.
// Table index: (lead - 0xA1) * 0x5E, plus (trail - 0xA1) when the trail
// byte is in 0xA1..0xFE.
532 value = (last
- 0xA1) * 0x5E;
534 if (byteval
>= 0xA1 && byteval
<= 0xFE)
536 value += (byteval
- 0xA1);
540 // Invalid second byte.
542 if (posn
>= charLength
)
543 throw Insufficient ();
544 chars
[posn
++] = '\u30FB';
549 value = ((int) (table0208
[value]))
550 | (((int) (table0208
[value + 1])) << 8);
552 value = ((int) (table0212
[value]))
553 | (((int) (table0212
[value + 1])) << 8);
554 if (posn
>= charLength
)
555 throw Insufficient ();
// NOTE(review): the condition choosing between the converted character and
// the '\u30FB' replacement is not visible in this excerpt.
557 chars
[posn
++] = (char)value;
559 chars
[posn
++] = '\u30FB';
563 if (refresh
&& last
!= 0) {
564 // seems like .NET 2.0 adds \u30FB for insufficient
565 // byte sequence (for Japanese \u30FB makes sense).
566 if (posn
>= charLength
)
567 throw Insufficient ();
568 chars
[posn
++] = '\u30FB';
573 // Return the final length to the caller.
574 return posn
- charIndex
;
// Build the exception reported when the output character buffer is too
// small to hold the decoded text.
//
// Returns: an ArgumentException for parameter "chars" whose message is the
// localized "Arg_InsufficientSpace" string.
//
// The exception is returned rather than thrown here so that the
// `throw Insufficient ();` statements at the call sites are what actually
// raise it, which matches this method's declared Exception return type.
Exception Insufficient ()
{
	return new ArgumentException (
		Strings.GetString ("Arg_InsufficientSpace"), "chars");
}
583 }; // class CP51932Decoder
// Exposes the CP51932 (EUC-JP) encoding under the class name ENCeuc_jp.
// Adds no members beyond a parameterless constructor.
public class ENCeuc_jp : CP51932
{
	// Parameterless constructor; the base CP51932 constructor runs implicitly.
	public ENCeuc_jp ()
	{
	}
}
592 }; // namespace I18N.CJK