class/I18N/CJK/CP51932.cs

   1 /*
   2  * CP51932.cs - Japanese EUC-JP code page.
   3  *
   4  * It is based on CP932.cs from Portable.NET
   5  *
   6  * Author:
   7  *      Atsushi Enomoto <atsushi@ximian.com>
   8  *
   9  * Below are original (CP932.cs) copyright lines
  10  *
  11  * (C)2004 Novell Inc.
  12  *
  13  * Copyright (c) 2002  Southern Storm Software, Pty Ltd
  14  *
  15  * Permission is hereby granted, free of charge, to any person obtaining
  16  * a copy of this software and associated documentation files (the "Software"),
  17  * to deal in the Software without restriction, including without limitation
  18  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  19  * and/or sell copies of the Software, and to permit persons to whom the
  20  * Software is furnished to do so, subject to the following conditions:
  21  *
  22  * The above copyright notice and this permission notice shall be included
  23  * in all copies or substantial portions of the Software.
  24  *
  25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  26  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  27  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  28  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  29  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  30  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  31  * OTHER DEALINGS IN THE SOFTWARE.
  32  */
  33
  34 /*
  35
        There appears to be no source for jis.table. It seems to have been
        generated from the text files on the Unicode home page, such as
        ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
        However, that mapping is non-normative, and in Japan it is known to
        contain many problems.
  40
  41         FIXME:  Some characters such as 0xFF0B (wide "plus") are missing in
  42                 that table.
  43 */
  44
  45 /*
  46         0x00-0x1F, 0x7F   : control characters
  47         0x20-0x7E         : ASCII
  48         0xA1A1-0xFEFE     : Kanji (precisely, both bytes contain only A1-FE)
  49         0x8EA1-0x8EDF     : half-width Katakana
  50         0x8FA1A1-0x8FFEFE : Complemental Kanji
  51
  52 */
  53
  54 namespace I18N.CJK
  55 {
  56
  57 using System;
  58 using System.Text;
  59 using I18N.Common;
  60
  61 [Serializable]
  62 public class CP51932 : MonoEncoding
  63 {
  64         // Magic number used by Windows for the EUC-JP code page.
  65         private const int EUC_JP_CODE_PAGE = 51932;
  66
  67         // Constructor.
  68         public CP51932 () : base (EUC_JP_CODE_PAGE, 932)
  69         {
  70         }
  71
  72
  73         public override int GetByteCount (char [] chars, int index, int length)
  74         {
  75                 return new CP51932Encoder (this).GetByteCount (chars, index, length, true);
  76         }
  77
  78         public unsafe override int GetByteCountImpl (char* chars, int count)
  79         {
  80                 return new CP51932Encoder (this).GetByteCountImpl (chars, count, true);
  81         }
  82
  83         public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
  84         {
  85                 return new CP51932Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
  86         }
  87
  88         public override int GetCharCount (byte [] bytes, int index, int count)
  89         {
  90 #if NET_2_0
  91                 return new CP51932Decoder ().GetCharCount (
  92                         bytes, index, count, true);
  93 #else
  94                 return new CP51932Decoder ().GetCharCount (
  95                         bytes, index, count);
  96 #endif
  97         }
  98
  99         public override int GetChars (
 100                 byte [] bytes, int byteIndex, int byteCount,
 101                 char [] chars, int charIndex)
 102         {
 103 #if NET_2_0
 104                 return new CP51932Decoder ().GetChars (bytes,
 105                         byteIndex, byteCount, chars, charIndex, true);
 106 #else
 107                 return new CP51932Decoder ().GetChars (bytes,
 108                         byteIndex, byteCount, chars, charIndex);
 109 #endif
 110         }
 111
 112         // Get the maximum number of bytes needed to encode a
 113         // specified number of characters.
 114         public override int GetMaxByteCount(int charCount)
 115         {
 116                 if(charCount < 0)
 117                 {
 118                         throw new ArgumentOutOfRangeException
 119                                 ("charCount",
 120                                  Strings.GetString("ArgRange_NonNegative"));
 121                 }
 122                 return charCount * 3;
 123         }
 124
 125         // Get the maximum number of characters needed to decode a
 126         // specified number of bytes.
 127         public override int GetMaxCharCount(int byteCount)
 128         {
 129                 if(byteCount < 0)
 130                 {
 131                         throw new ArgumentOutOfRangeException
 132                                 ("byteCount",
 133                                  Strings.GetString ("ArgRange_NonNegative"));
 134                 }
 135                 return byteCount;
 136         }
 137
 138         public override Encoder GetEncoder ()
 139         {
 140                 return new CP51932Encoder (this);
 141         }
 142
 143         public override Decoder GetDecoder ()
 144         {
 145                 return new CP51932Decoder ();
 146         }
 147
 148 #if !ECMA_COMPAT
 149
 150         // Get the mail body name for this encoding.
 151         public override String BodyName {
 152                 get { return "euc-jp"; }
 153         }
 154
 155         // Get the human-readable name for this encoding.
 156         public override String EncodingName {
 157                 get { return "Japanese (EUC)"; }
 158         }
 159
 160         // Get the mail agent header name for this encoding.
 161         public override String HeaderName {
 162                 get { return "euc-jp"; }
 163         }
 164
 165         // Determine if this encoding can be displayed in a Web browser.
 166         public override bool IsBrowserDisplay {
 167                 get { return true; }
 168         }
 169
 170         // Determine if this encoding can be saved from a Web browser.
 171         public override bool IsBrowserSave {
 172                 get { return true; }
 173         }
 174
 175         // Determine if this encoding can be displayed in a mail/news agent.
 176         public override bool IsMailNewsDisplay {
 177                 get { return true; }
 178         }
 179
 180         // Determine if this encoding can be saved from a mail/news agent.
 181         public override bool IsMailNewsSave {
 182                 get { return true; }
 183         }
 184
 185         // Get the IANA-preferred Web name for this encoding.
 186         public override String WebName {
 187                 get { return "euc-jp"; }
 188         }
 189 } // CP51932
 190 #endif // !ECMA_COMPAT
 191
 192 public class CP51932Encoder : MonoEncoder
 193 {
 194         public CP51932Encoder (MonoEncoding encoding)
 195                 : base (encoding)
 196         {
 197         }
 198
 199         // Get the number of bytes needed to encode a character buffer.
 200         public unsafe override int GetByteCountImpl (
 201                 char* chars, int count, bool refresh)
 202         {
 203                 // Determine the length of the final output.
 204                 int index = 0;
 205                 int length = 0;
 206                 int ch, value;
 207                 byte [] cjkToJis = JISConvert.Convert.cjkToJis;
 208                 byte [] extraToJis = JISConvert.Convert.extraToJis;
 209
 210                 while (count > 0) {
 211                         ch = chars [index++];
 212                         --count;
 213                         ++length;
 214                         if (ch < 0x0080) {
 215                                 // Character maps to itself.
 216                                 continue;
 217                         } else if (ch < 0x0100) {
 218                                 // Check for special Latin 1 characters that
 219                                 // can be mapped to double-byte code points.
 220                                 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
 221                                    ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
 222                                    ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
 223                                    ch == 0x00D7 || ch == 0x00F7)
 224                                 {
 225                                         ++length;
 226                                 }
 227                         } else if (ch >= 0x0391 && ch <= 0x0451) {
 228                                 // Greek subset characters.
 229                                 ++length;
 230                         } else if (ch >= 0x2010 && ch <= 0x9FA5) {
 231                                 // This range contains the bulk of the CJK set.
 232                                 value = (ch - 0x2010) * 2;
 233                                 value = ((int) (cjkToJis[value])) | (((int)(cjkToJis[value + 1])) << 8);
 234                                 if(value >= 0x0100)
 235                                         ++length;
 236                         } else if(ch >= 0xFF01 && ch < 0xFF60) {
 237                                 // This range contains extra characters.
 238                                 value = (ch - 0xFF01) * 2;
 239                                 value = ((int)(extraToJis[value])) |
 240                                                 (((int)(extraToJis[value + 1])) << 8);
 241                                 if(value >= 0x0100)
 242                                         ++length;
 243                         } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
 244                                 ++length; // half-width kana
 245                         }
 246                 }
 247
 248                 // Return the length to the caller.
 249                 return length;
 250         }
 251
 252         // Get the bytes that result from encoding a character buffer.
 253         public unsafe override int GetBytesImpl (
 254                 char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
 255         {
 256                 int charIndex = 0;
 257                 int byteIndex = 0;
 258
 259                 // Convert the characters into their byte form.
 260                 int posn = byteIndex;
 261                 int byteLength = byteCount;
 262                 int ch, value;
 263
 264                 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
 265                 byte[] greekToJis = JISConvert.Convert.greekToJis;
 266                 byte[] extraToJis = JISConvert.Convert.extraToJis;
 267
 268                 for (; charCount > 0; charIndex++, --charCount) {
 269                         ch = chars [charIndex];
 270                         if (posn >= byteLength) {
 271                                 throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
 272                         }
 273
 274                         if (ch < 0x0080) {
 275                                 // Character maps to itself.
 276                                 bytes[posn++] = (byte)ch;
 277                                 continue;
 278                         } else if (ch >= 0x0391 && ch <= 0x0451) {
 279                                 // Greek subset characters.
 280                                 value = (ch - 0x0391) * 2;
 281                                 value = ((int)(greekToJis[value])) |
 282                                                 (((int)(greekToJis[value + 1])) << 8);
 283                         } else if (ch >= 0x2010 && ch <= 0x9FA5) {
 284                                 // This range contains the bulk of the CJK set.
 285                                 value = (ch - 0x2010) * 2;
 286                                 value = ((int) (cjkToJis[value])) |
 287                                                 (((int)(cjkToJis[value + 1])) << 8);
 288                         } else if (ch >= 0xFF01 && ch <= 0xFF60) {
 289                                 // This range contains extra characters,
 290                                 // including half-width katakana.
 291                                 value = (ch - 0xFF01) * 2;
 292                                 value = ((int) (extraToJis [value])) |
 293                                                 (((int) (extraToJis [value + 1])) << 8);
 294                         } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
 295                                 value = ch - 0xFF60 + 0x8EA0;
 296                         } else {
 297                                 // Invalid character.
 298                                 value = 0;
 299                         }
 300
 301                         if (value == 0) {
 302 #if NET_2_0
 303                                 HandleFallback (
 304                                         chars, ref charIndex, ref charCount,
 305                                         bytes, ref posn, ref byteCount);
 306 #else
 307                                 bytes [posn++] = (byte) '?';
 308 #endif
 309                         } else if (value < 0x0100) {
 310                                 bytes [posn++] = (byte) value;
 311                         } else if ((posn + 1) >= byteLength) {
 312                                 throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
 313                         } else if (value < 0x8000) {
 314                                 // general 2byte glyph/kanji
 315                                 value -= 0x0100;
 316                                 bytes [posn++] = (byte) (value / 0x5E + 0xA1);
 317                                 bytes [posn++] = (byte) (value % 0x5E + 0xA1);
 318 //Console.WriteLine ("{0:X04}", ch);
 319                                 continue;
 320                         }
 321                         else
 322                         {
 323                                 // half-width kana
 324                                 bytes [posn++] = 0x8E;
 325                                 bytes [posn++] = (byte) (value - 0x8E00);
 326                         }
 327                 }
 328
 329                 // Return the final length to the caller.
 330                 return posn - byteIndex;
 331         }
 332 } // CP51932Encoder
 333
 334 internal class CP51932Decoder : DbcsEncoding.DbcsDecoder
 335 {
 336         public CP51932Decoder ()
 337                 : base (null)
 338         {
 339         }
 340
 341         int last_count, last_bytes;
 342
 343         // Get the number of characters needed to decode a byte buffer.
 344         public override int GetCharCount (byte [] bytes, int index, int count)
 345         {
 346                 return GetCharCount (bytes, index, count, false);
 347         }
 348
 349 #if NET_2_0
 350         public override
 351 #else
 352         internal
 353 #endif
 354         int GetCharCount (byte [] bytes, int index, int count, bool refresh)
 355         {
 356                 CheckRange (bytes, index, count);
 357
 358                 // Determine the total length of the converted string.
 359                 int value = 0;
 360                 byte[] table0208 = JISConvert.Convert.jisx0208ToUnicode;
 361                 byte[] table0212 = JISConvert.Convert.jisx0212ToUnicode;
 362                 int length = 0;
 363                 int byteval = 0;
 364                 int last = last_count;
 365
 366                 while (count > 0) {
 367                         byteval = bytes [index++];
 368                         --count;
 369                         if (last == 0) {
 370                                 if (byteval == 0x8F) {
 371                                         if (byteval != 0) {
 372                                                 // Invalid second byte of a 3-byte character.
 373                                                 last = 0;
 374                                                 length++;
 375                                         }
 376                                         // First byte in a triple-byte sequence
 377                                         else
 378                                                 last = byteval;
 379                                 } else if (byteval <= 0x7F) {
 380                                         // Ordinary ASCII/Latin1/Control character.
 381                                         length++;
 382                                 } else if (byteval == 0x8E) {
 383                                         // First byte of half-width Katakana
 384                                         last = byteval;
 385                                 } else if (byteval >= 0xA1 && byteval <= 0xFE) {
 386                                         // First byte in a double-byte sequence.
 387                                         last = byteval;
 388                                 } else {
 389                                         // Invalid first byte.
 390                                         length++;
 391                                 }
 392                         }
 393                         else if (last == 0x8E) {
 394                                 if (byteval >= 0xA1 && byteval <= 0xDF) {
 395                                         value = ((byteval - 0x40) |
 396                                                 (last + 0x71) << 8);
 397                                         length++;
 398                                 } else {
 399                                         // Invalid second byte.
 400                                         length++;
 401                                 }
 402                                 last =0;
 403                         }
 404                         else if (last == 0x8F) {
 405                                 // 3-byte character
 406                                 // FIXME: currently not supported yet
 407                                 last = byteval;
 408                         }
 409                         else
 410                         {
 411                                 // Second byte in a double-byte sequence.
 412                                 value = (last - 0xA1) * 0x5E;
 413                                 last = 0;
 414                                 if (byteval >= 0xA1 && byteval <= 0xFE)
 415                                 {
 416                                         value += (byteval - 0xA1);
 417                                 }
 418                                 else
 419                                 {
 420                                         // Invalid second byte.
 421                                         last = 0;
 422                                         length++;
 423                                         continue;
 424                                 }
 425
 426                                 value *= 2;
 427                                 value = ((int) (table0208 [value]))
 428                                         | (((int) (table0208 [value + 1])) << 8);
 429                                 if (value == 0)
 430                                         value = ((int) (table0212 [value]))
 431                                                 | (((int) (table0212 [value + 1])) << 8);
 432                                 if (value != 0)
 433                                         length++;
 434                                 else
 435                                         length++;
 436                         }
 437                 }
 438
 439                 // seems like .NET 2.0 adds \u30FB for insufficient
 440                 // byte seuqence (for Japanese \u30FB makes sense).
 441                 if (refresh && last != 0)
 442                         length++;
 443                 else
 444                         last_count = last;
 445
 446                 // Return the final length to the caller.
 447                 return length;
 448         }
 449
 450         public override int GetChars (byte[] bytes, int byteIndex,
 451                                                  int byteCount, char[] chars,
 452                                                  int charIndex)
 453         {
 454                 return GetChars (bytes, byteIndex, byteCount, chars, charIndex, false);
 455         }
 456
 457 #if NET_2_0
 458         public override
 459 #else
 460         internal
 461 #endif
 462         int GetChars (byte[] bytes, int byteIndex,
 463                                                  int byteCount, char[] chars,
 464                                                  int charIndex, bool refresh)
 465         {
 466                 CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
 467
 468                 // Decode the bytes in the buffer.
 469                 int posn = charIndex;
 470                 int charLength = chars.Length;
 471                 int byteval, value;
 472                 int last = last_bytes;
 473                 byte[] table0208 = JISConvert.Convert.jisx0208ToUnicode;
 474                 byte[] table0212 = JISConvert.Convert.jisx0212ToUnicode;
 475
 476                 while (byteCount > 0) {
 477                         byteval = bytes [byteIndex++];
 478                         --byteCount;
 479                         if (last == 0) {
 480                                 if (byteval == 0x8F) {
 481                                         if (byteval != 0) {
 482                                                 // Invalid second byte of a 3-byte character.
 483                                                 last = 0;
 484                                                 if (posn >= charLength)
 485                                                         throw Insufficient ();
 486                                                 chars [posn++] = '\u30FB';
 487                                         }
 488                                         // First byte in a triple-byte sequence
 489                                         else
 490                                                 last = byteval;
 491                                 } else if (byteval <= 0x7F) {
 492                                         // Ordinary ASCII/Latin1/Control character.
 493                                         if (posn >= charLength)
 494                                                 throw Insufficient ();
 495                                         chars [posn++] = (char) byteval;
 496                                 } else if (byteval == 0x8E) {
 497                                         // First byte of half-width Katakana
 498                                         last = byteval;
 499                                 } else if (byteval >= 0xA1 && byteval <= 0xFE) {
 500                                         // First byte in a double-byte sequence.
 501                                         last = byteval;
 502                                 } else {
 503                                         // Invalid first byte.
 504                                         if (posn >= charLength)
 505                                                 throw Insufficient ();
 506                                         chars [posn++] = '\u30FB';
 507                                 }
 508                         }
 509                         else if (last == 0x8E) {
 510                                 if (byteval >= 0xA1 && byteval <= 0xDF) {
 511                                         value = ((byteval - 0x40) |
 512                                                 (last + 0x71) << 8);
 513                                         if (posn >= charLength)
 514                                                 throw Insufficient ();
 515                                         chars [posn++] = (char) value;
 516                                 } else {
 517                                         // Invalid second byte.
 518                                         if (posn >= charLength)
 519                                                 throw Insufficient ();
 520                                         chars [posn++] = '\u30FB';
 521                                 }
 522                                 last =0;
 523                         }
 524                         else if (last == 0x8F) {
 525                                 // 3-byte character
 526                                 // FIXME: currently not supported yet
 527                                 last = byteval;
 528                         }
 529                         else
 530                         {
 531                                 // Second byte in a double-byte sequence.
 532                                 value = (last - 0xA1) * 0x5E;
 533                                 last = 0;
 534                                 if (byteval >= 0xA1 && byteval <= 0xFE)
 535                                 {
 536                                         value += (byteval - 0xA1);
 537                                 }
 538                                 else
 539                                 {
 540                                         // Invalid second byte.
 541                                         last = 0;
 542                                         if (posn >= charLength)
 543                                                 throw Insufficient ();
 544                                         chars [posn++] = '\u30FB';
 545                                         continue;
 546                                 }
 547
 548                                 value *= 2;
 549                                 value = ((int) (table0208 [value]))
 550                                         | (((int) (table0208 [value + 1])) << 8);
 551                                 if (value == 0)
 552                                         value = ((int) (table0212 [value]))
 553                                                 | (((int) (table0212 [value + 1])) << 8);
 554                                 if (posn >= charLength)
 555                                         throw Insufficient ();
 556                                 if (value != 0)
 557                                         chars [posn++] = (char)value;
 558                                 else
 559                                         chars [posn++] = '\u30FB';
 560                         }
 561                 }
 562
 563                 if (refresh && last != 0) {
 564                         // seems like .NET 2.0 adds \u30FB for insufficient
 565                         // byte seuqence (for Japanese \u30FB makes sense).
 566                         if (posn >= charLength)
 567                                 throw Insufficient ();
 568                         chars [posn++] = '\u30FB';
 569                 }
 570                 else
 571                         last_bytes = last;
 572
 573                 // Return the final length to the caller.
 574                 return posn - charIndex;
 575         }
 576
 577         Exception Insufficient ()
 578         {
 579                 throw new ArgumentException
 580                         (Strings.GetString
 581                                 ("Arg_InsufficientSpace"), "chars");
 582         }
 583 }; // class CP51932Decoder
 584
 585 [Serializable]
 586 public class ENCeuc_jp : CP51932
 587 {
 588         public ENCeuc_jp () : base() {}
 589
 590 }; // class ENCeucjp
 591
 592 }; // namespace I18N.CJK