2010-03-02 Jb Evain <jbevain@novell.com>
[mcs.git] / class / I18N / CJK / CP51932.cs
blobbffb15ba77b8fabf9d62be16beb0e7759a468402
1 /*
2 * CP51932.cs - Japanese EUC-JP code page.
4 * It is based on CP932.cs from Portable.NET
6 * Author:
7 * Atsushi Enomoto <atsushi@ximian.com>
9 * Below are original (CP932.cs) copyright lines
11 * (C)2004 Novell Inc.
13 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
15 * Permission is hereby granted, free of charge, to any person obtaining
16 * a copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
22 * The above copyright notice and this permission notice shall be included
23 * in all copies or substantial portions of the Software.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
29 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
30 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
31 * OTHER DEALINGS IN THE SOFTWARE.
36 There appears to be no source for jis.table. It seems to have been
37 generated from text files on the Unicode home page, such as
38 ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
39 However, that mapping is non-normative, and in Japan it is known to contain many problems.
41 FIXME: Some characters such as 0xFF0B (wide "plus") are missing in
42 that table.
46 0x00-0x1F, 0x7F : control characters
47 0x20-0x7E : ASCII
48 0xA1A1-0xFEFE : Kanji (precisely, both bytes contain only A1-FE)
49 0x8EA1-0x8EDF : half-width Katakana
50 0x8FA1A1-0x8FFEFE : Complemental Kanji
54 namespace I18N.CJK
57 using System;
58 using System.Text;
59 using I18N.Common;
// Japanese EUC-JP encoding, registered under code page 51932.
//
// The actual conversion work lives in CP51932Encoder and CP51932Decoder.
// The stateless entry points below create a fresh encoder/decoder for each
// call and pass true for the refresh argument wherever the overload takes one.
[Serializable]
public class CP51932 : MonoEncoding
{
    // Magic number used by Windows for the EUC-JP code page.
    private const int EUC_JP_CODE_PAGE = 51932;

    // Constructor.  The second argument (932) is handed to the MonoEncoding
    // base constructor; presumably it names the related Windows code page --
    // confirm against MonoEncoding.
    public CP51932 () : base (EUC_JP_CODE_PAGE, 932)
    {
    }

    // Get the number of bytes needed to encode a range of a character array.
    public override int GetByteCount (char [] chars, int index, int length)
    {
        return new CP51932Encoder (this).GetByteCount (chars, index, length, true);
    }

    // Get the number of bytes needed to encode a raw character buffer.
    public unsafe override int GetByteCountImpl (char* chars, int count)
    {
        return new CP51932Encoder (this).GetByteCountImpl (chars, count, true);
    }

    // Encode a raw character buffer into a raw byte buffer and return the
    // number of bytes written.
    public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
    {
        return new CP51932Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
    }

    // Get the number of characters needed to decode a range of a byte array.
    public override int GetCharCount (byte [] bytes, int index, int count)
    {
#if NET_2_0
        return new CP51932Decoder ().GetCharCount (
            bytes, index, count, true);
#else
        return new CP51932Decoder ().GetCharCount (
            bytes, index, count);
#endif
    }

    // Decode a range of a byte array into a character array and return the
    // number of characters written.
    public override int GetChars (
        byte [] bytes, int byteIndex, int byteCount,
        char [] chars, int charIndex)
    {
#if NET_2_0
        return new CP51932Decoder ().GetChars (bytes,
            byteIndex, byteCount, chars, charIndex, true);
#else
        return new CP51932Decoder ().GetChars (bytes,
            byteIndex, byteCount, chars, charIndex);
#endif
    }

    // Get the maximum number of bytes needed to encode a
    // specified number of characters.
    public override int GetMaxByteCount (int charCount)
    {
        if (charCount < 0) {
            throw new ArgumentOutOfRangeException
                ("charCount",
                 Strings.GetString ("ArgRange_NonNegative"));
        }
        return charCount * 3;
    }

    // Get the maximum number of characters needed to decode a
    // specified number of bytes.
    public override int GetMaxCharCount (int byteCount)
    {
        if (byteCount < 0) {
            throw new ArgumentOutOfRangeException
                ("byteCount",
                 Strings.GetString ("ArgRange_NonNegative"));
        }
        return byteCount;
    }

    // Get a stateful encoder for this encoding.
    public override Encoder GetEncoder ()
    {
        return new CP51932Encoder (this);
    }

    // Get a stateful decoder for this encoding.
    public override Decoder GetDecoder ()
    {
        return new CP51932Decoder ();
    }

#if !ECMA_COMPAT

    // Get the mail body name for this encoding.
    public override String BodyName {
        get { return "euc-jp"; }
    }

    // Get the human-readable name for this encoding.
    public override String EncodingName {
        get { return "Japanese (EUC)"; }
    }

    // Get the mail agent header name for this encoding.
    public override String HeaderName {
        get { return "euc-jp"; }
    }

    // Determine if this encoding can be displayed in a Web browser.
    public override bool IsBrowserDisplay {
        get { return true; }
    }

    // Determine if this encoding can be saved from a Web browser.
    public override bool IsBrowserSave {
        get { return true; }
    }

    // Determine if this encoding can be displayed in a mail/news agent.
    public override bool IsMailNewsDisplay {
        get { return true; }
    }

    // Determine if this encoding can be saved from a mail/news agent.
    public override bool IsMailNewsSave {
        get { return true; }
    }

    // Get the IANA-preferred Web name for this encoding.
    public override String WebName {
        get { return "euc-jp"; }
    }

#endif // !ECMA_COMPAT

    // The closing brace is deliberately outside the conditional region so
    // the class is terminated whether or not ECMA_COMPAT is defined.
} // CP51932
// Encoder that converts UTF-16 text into EUC-JP bytes.
//
// Both public entry points classify characters through ToJisValue so that
// the byte count reported by GetByteCountImpl always follows the same
// mapping GetBytesImpl uses when it writes the bytes.
public class CP51932Encoder : MonoEncoder
{
    public CP51932Encoder (MonoEncoding encoding)
        : base (encoding)
    {
    }

    // Map a non-ASCII UTF-16 code unit to the intermediate value the
    // encoder works with:
    //   0                : no mapping (the caller applies its fallback)
    //   0x0001 .. 0x00FF : emitted as that single byte
    //   0x0100 .. 0x7FFF : double-byte code; after subtracting 0x0100 the
    //                      quotient and remainder by 0x5E (each + 0xA1)
    //                      are the two output bytes
    //   0x8EA1 .. 0x8EDF : half-width katakana, emitted as 0x8E + low byte
    // ASCII (below 0x0080) must be handled by the caller, because U+0000
    // would otherwise be indistinguishable from "no mapping".
    private static int ToJisValue (int ch, byte[] cjkToJis,
                                   byte[] greekToJis, byte[] extraToJis)
    {
        byte[] table;
        int offset;

        if (ch >= 0x0391 && ch <= 0x0451) {
            // Greek subset characters.
            table = greekToJis;
            offset = ch - 0x0391;
        } else if (ch >= 0x2010 && ch <= 0x9FA5) {
            // This range contains the bulk of the CJK set.
            table = cjkToJis;
            offset = ch - 0x2010;
        } else if (ch >= 0xFF01 && ch <= 0xFF60) {
            // This range contains extra (full-width) characters.
            table = extraToJis;
            offset = ch - 0xFF01;
        } else if (ch >= 0xFF61 && ch <= 0xFF9F) {
            // Half-width katakana: U+FF61..U+FF9F -> 0x8EA1..0x8EDF.
            // U+FF60 and U+FFA0 are excluded on purpose; they would
            // produce trail bytes 0xA0 / 0xE0, outside 0xA1..0xDF.
            return ch - 0xFF60 + 0x8EA0;
        } else {
            // Invalid character.
            return 0;
        }

        // Each table entry is two bytes, low byte first.
        offset *= 2;
        return ((int) table [offset]) | (((int) table [offset + 1]) << 8);
    }

    // Get the number of bytes needed to encode a character buffer.
    // An unmappable character is counted as one byte.
    public unsafe override int GetByteCountImpl (
        char* chars, int count, bool refresh)
    {
        byte[] cjkToJis = JISConvert.Convert.cjkToJis;
        byte[] greekToJis = JISConvert.Convert.greekToJis;
        byte[] extraToJis = JISConvert.Convert.extraToJis;

        int length = 0;
        for (int index = 0; index < count; index++) {
            int ch = chars [index];
            if (ch < 0x0080) {
                // Character maps to itself.
                length++;
            } else if (ToJisValue (ch, cjkToJis, greekToJis, extraToJis) < 0x0100) {
                // Single byte, or no mapping at all.
                length++;
            } else {
                // Double-byte code or 0x8E-prefixed half-width katakana.
                length += 2;
            }
        }

        // Return the length to the caller.
        return length;
    }

    // Get the bytes that result from encoding a character buffer.
    // Returns the number of bytes written to "bytes"; throws
    // ArgumentException when the output buffer is too small.
    public unsafe override int GetBytesImpl (
        char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
    {
        int charIndex = 0;
        int posn = 0;

        // Capacity checks use this copy; byteCount itself is handed to
        // HandleFallback by reference.
        int byteLength = byteCount;

        byte[] cjkToJis = JISConvert.Convert.cjkToJis;
        byte[] greekToJis = JISConvert.Convert.greekToJis;
        byte[] extraToJis = JISConvert.Convert.extraToJis;

        // Convert the characters into their byte form.
        for (; charCount > 0; charIndex++, --charCount) {
            int ch = chars [charIndex];
            if (posn >= byteLength) {
                throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
            }

            if (ch < 0x0080) {
                // Character maps to itself.
                bytes [posn++] = (byte) ch;
                continue;
            }

            int value = ToJisValue (ch, cjkToJis, greekToJis, extraToJis);

            if (value == 0) {
#if NET_2_0
                HandleFallback (
                    chars, ref charIndex, ref charCount,
                    bytes, ref posn, ref byteCount);
#else
                bytes [posn++] = (byte) '?';
#endif
            } else if (value < 0x0100) {
                bytes [posn++] = (byte) value;
            } else if ((posn + 1) >= byteLength) {
                throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
            } else if (value < 0x8000) {
                // general 2byte glyph/kanji
                value -= 0x0100;
                bytes [posn++] = (byte) (value / 0x5E + 0xA1);
                bytes [posn++] = (byte) (value % 0x5E + 0xA1);
            } else {
                // half-width kana
                bytes [posn++] = 0x8E;
                bytes [posn++] = (byte) (value - 0x8E00);
            }
        }

        // Return the final length to the caller.
        return posn;
    }
} // CP51932Encoder
// Stateful decoder that converts EUC-JP bytes into UTF-16 text.
//
// Byte handling (identical in GetCharCount and GetChars):
//   0x00 .. 0x7F          : decoded as the same code point
//   0x8E + 0xA1 .. 0xDF   : half-width katakana U+FF61 .. U+FF9F
//   0xA1 .. 0xFE (x2)     : double-byte character looked up in the
//                           JIS X 0208 table
//   anything else         : replaced by U+30FB
// The 0x8F prefix of three-byte sequences is not supported: it is replaced
// on its own and decoding resumes with the byte that follows it.
internal class CP51932Decoder : DbcsEncoding.DbcsDecoder
{
    // Character emitted for any byte sequence that cannot be decoded.
    const char Replacement = '\u30FB';

    public CP51932Decoder ()
        : base (null)
    {
    }

    // Lead byte left pending at the end of the previous GetCharCount
    // (last_count) or GetChars (last_bytes) call; 0 when nothing is pending.
    int last_count, last_bytes;

    // Get the number of characters needed to decode a byte buffer.
    public override int GetCharCount (byte [] bytes, int index, int count)
    {
        return GetCharCount (bytes, index, count, false);
    }

    // Get the number of characters needed to decode a byte buffer.
    // When "refresh" is true a dangling lead byte is counted as one
    // replacement character and the pending state is cleared.
#if NET_2_0
    public override
#else
    internal
#endif
    int GetCharCount (byte [] bytes, int index, int count, bool refresh)
    {
        CheckRange (bytes, index, count);

        // Determine the total length of the converted string.
        int length = 0;
        int last = last_count;

        while (count > 0) {
            int byteval = bytes [index++];
            --count;

            if (last != 0) {
                // Any byte that follows a lead byte completes the
                // sequence and yields exactly one character in GetChars
                // (decoded or replaced), so no table lookup is needed.
                last = 0;
                length++;
            } else if (byteval == 0x8E ||
                       (byteval >= 0xA1 && byteval <= 0xFE)) {
                // Lead byte of half-width katakana or of a double-byte
                // sequence: wait for the trail byte.
                last = byteval;
            } else {
                // ASCII/control byte, or an invalid first byte.
                length++;
            }
        }

        if (refresh) {
            // seems like .NET 2.0 adds \u30FB for insufficient
            // byte sequence (for Japanese \u30FB makes sense).
            if (last != 0)
                length++;
            // Flushing ends the stream: never carry a lead byte over.
            last_count = 0;
        } else {
            last_count = last;
        }

        // Return the final length to the caller.
        return length;
    }

    // Decode a byte buffer into a character buffer.
    public override int GetChars (byte[] bytes, int byteIndex,
                                  int byteCount, char[] chars,
                                  int charIndex)
    {
        return GetChars (bytes, byteIndex, byteCount, chars, charIndex, false);
    }

    // Decode a byte buffer into a character buffer and return the number
    // of characters written.  When "refresh" is true a dangling lead byte
    // is emitted as U+30FB and the pending state is cleared.  Throws
    // ArgumentException when "chars" is too small.
#if NET_2_0
    public override
#else
    internal
#endif
    int GetChars (byte[] bytes, int byteIndex,
                  int byteCount, char[] chars,
                  int charIndex, bool refresh)
    {
        CheckRange (bytes, byteIndex, byteCount, chars, charIndex);

        // Decode the bytes in the buffer.
        int posn = charIndex;
        int charLength = chars.Length;
        int last = last_bytes;
        byte[] table0208 = JISConvert.Convert.jisx0208ToUnicode;
        byte[] table0212 = JISConvert.Convert.jisx0212ToUnicode;

        while (byteCount > 0) {
            int byteval = bytes [byteIndex++];
            --byteCount;

            if (last == 0) {
                if (byteval == 0x8E ||
                    (byteval >= 0xA1 && byteval <= 0xFE)) {
                    // Lead byte of half-width katakana or of a
                    // double-byte sequence.
                    last = byteval;
                    continue;
                }

                if (posn >= charLength)
                    throw Insufficient ();
                // 0x00..0x7F decode to themselves; every other byte
                // (including the unsupported 0x8F prefix) is invalid here.
                chars [posn++] = (byteval <= 0x7F) ? (char) byteval : Replacement;
                continue;
            }

            // A lead byte is pending; this byte completes the sequence
            // whether or not it is a valid trail byte.
            int lead = last;
            last = 0;
            char decoded;

            if (lead == 0x8E) {
                if (byteval >= 0xA1 && byteval <= 0xDF) {
                    // 0xA1..0xDF -> U+FF61..U+FF9F
                    decoded = (char) (0xFF00 | (byteval - 0x40));
                } else {
                    // Invalid second byte.
                    decoded = Replacement;
                }
            } else if (byteval >= 0xA1 && byteval <= 0xFE) {
                // Second byte in a double-byte sequence.  Entries are two
                // bytes each, low byte first, indexed by lead * 94 + trail.
                int value = ((lead - 0xA1) * 0x5E + (byteval - 0xA1)) * 2;
                value = ((int) (table0208 [value]))
                    | (((int) (table0208 [value + 1])) << 8);
                if (value == 0) {
                    // NOTE(review): value is 0 at this point, so this
                    // always reads the first entry of the JIS X 0212 table
                    // rather than the entry for this byte pair.  Kept
                    // as-is; confirm the intended fallback before changing.
                    value = ((int) (table0212 [value]))
                        | (((int) (table0212 [value + 1])) << 8);
                }
                decoded = (value != 0) ? (char) value : Replacement;
            } else {
                // Invalid second byte.
                decoded = Replacement;
            }

            if (posn >= charLength)
                throw Insufficient ();
            chars [posn++] = decoded;
        }

        if (refresh) {
            if (last != 0) {
                // seems like .NET 2.0 adds \u30FB for insufficient
                // byte sequence (for Japanese \u30FB makes sense).
                if (posn >= charLength)
                    throw Insufficient ();
                chars [posn++] = Replacement;
            }
            // Flushing ends the stream: never carry a lead byte over.
            last_bytes = 0;
        } else {
            last_bytes = last;
        }

        // Return the final length to the caller.
        return posn - charIndex;
    }

    // Build the exception thrown when the output buffer is too small.
    Exception Insufficient ()
    {
        return new ArgumentException
            (Strings.GetString
                ("Arg_InsufficientSpace"), "chars");
    }
} // class CP51932Decoder
// Alias class for CP51932; it adds no behaviour of its own.  The name
// follows an "ENC" + encoding-name pattern, presumably so the encoding can
// be located by its web name ("euc-jp") -- confirm against the I18N loader.
[Serializable]
public class ENCeuc_jp : CP51932
{
    public ENCeuc_jp ()
        : base ()
    {
    }
} // class ENCeucjp
592 }; // namespace I18N.CJK