2009-11-02 Jb Evain <jbevain@novell.com>
[mcs.git] / class / I18N / CJK / GB18030Encoding.cs
blob2cb1ff121521b24c5c1e4163248f4aa60f1c740f
1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 // Atsushi Enomoto <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10 using I18N.Common;
12 namespace I18N.CJK
// Alias class for GB18030Encoding that adds no behavior of its own.
// NOTE(review): presumably located by name ("gb18030") by the I18N
// encoding loader - confirm against the loader code.
[Serializable]
internal class ENCgb18030 : GB18030Encoding
{
	// The parameterless base constructor is invoked implicitly.
	public ENCgb18030 ()
	{
	}
}
// Alias class for GB18030Encoding named after 54936, the code page number
// that GB18030Encoding passes to its base constructor. Adds no behavior.
[Serializable]
public class CP54936 : GB18030Encoding
{
}
// GB18030 text encoding. Every conversion entry point creates a fresh
// GB18030Encoder or GB18030Decoder and delegates to it, so no conversion
// state is kept on the encoding object itself.
[Serializable]
public class GB18030Encoding : MonoEncoding
{
	// Single name reported by HeaderName, BodyName and WebName.
	const string Gb18030Name = "GB18030";

	// Constructor.
	// NOTE(review): 54936 is the GB18030 code page; 936 is presumably the
	// matching Windows code page - confirm against MonoEncoding.
	public GB18030Encoding ()
		: base (54936, 936)
	{
	}

	// Human readable name of the encoding.
	public override string EncodingName {
		get { return "Chinese Simplified (GB18030)"; }
	}

	public override string HeaderName {
		get { return Gb18030Name; }
	}

	public override string BodyName {
		get { return Gb18030Name; }
	}

	public override string WebName {
		get { return Gb18030Name; }
	}

	// All four mail/news and browser capability flags are reported as true.
	public override bool IsMailNewsDisplay {
		get { return true; }
	}

	public override bool IsMailNewsSave {
		get { return true; }
	}

	public override bool IsBrowserDisplay {
		get { return true; }
	}

	public override bool IsBrowserSave {
		get { return true; }
	}

	// Upper bound on the encoded size: non-GB2312 characters in
	// U+0080 - U+FFFF take four bytes each.
	public override int GetMaxByteCount (int len)
	{
		return 4 * len;
	}

	// Upper bound on the decoded size: at most one char per input byte.
	public override int GetMaxCharCount (int len)
	{
		return len;
	}

	// Counts the bytes needed for a char range, using a throw-away
	// encoder with the refresh/flush flag set.
	public override int GetByteCount (char [] chars, int index, int length)
	{
		GB18030Encoder encoder = new GB18030Encoder (this);
		return encoder.GetByteCount (chars, index, length, true);
	}

	public unsafe override int GetByteCountImpl (char* chars, int count)
	{
		GB18030Encoder encoder = new GB18030Encoder (this);
		return encoder.GetByteCountImpl (chars, count, true);
	}

	public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
	{
		GB18030Encoder encoder = new GB18030Encoder (this);
		return encoder.GetBytesImpl (chars, charCount, bytes, byteCount, true);
	}

	// Counts the chars produced by a byte range, using a throw-away decoder.
	public override int GetCharCount (byte [] bytes, int start, int len)
	{
		GB18030Decoder decoder = new GB18030Decoder ();
		return decoder.GetCharCount (bytes, start, len);
	}

	// Decodes a byte range into chars, using a throw-away decoder.
	public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
	{
		GB18030Decoder decoder = new GB18030Decoder ();
		return decoder.GetChars (bytes, byteIdx, srclen, chars, charIdx);
	}

	public override Encoder GetEncoder ()
	{
		return new GB18030Encoder (this);
	}

	public override Decoder GetDecoder ()
	{
		return new GB18030Decoder ();
	}
}
// Decoder from GB18030 bytes to UTF-16 chars.
//
// The decoder keeps no state between calls: a multi-byte sequence that is
// cut off at the end of the input is dropped, it is not carried over to the
// next call. GetCharCount and GetChars walk the input with the same rules,
// so the count always equals the number of chars GetChars writes.
class GB18030Decoder : DbcsEncoding.DbcsDecoder
{
	// Two-byte mapping table; n2u holds two bytes (low, high) per entry.
	static readonly DbcsConvert gb2312 = DbcsConvert.Gb2312;
	// for now incomplete block is not supported - should we?

	public GB18030Decoder ()
		: base (null)
	{
	}

	// Returns the number of chars GetChars produces for
	// bytes [start .. start + len).
	//
	// Input validation is delegated to CheckRange.
	public override int GetCharCount (byte [] bytes, int start, int len)
	{
		CheckRange (bytes, start, len);

		int end = start + len;
		int ret = 0;
		while (start < end) {
			byte first = bytes [start];
			if (first <= 0x80 || first == 0xFF) {
				// One char for one byte: ASCII, the reserved
				// byte 0x80 (decoded as the Euro sign) and the
				// invalid byte 0xFF (decoded as '?').
				ret++;
				start++;
				continue;
			}
			if (start + 1 >= end) {
				// Incomplete tail: a lead byte without its
				// second byte. GetChars writes nothing for it,
				// so it must not be counted either.
				break;
			}

			byte second = bytes [start + 1];
			if (second == 0x7F || second == 0xFF) {
				// invalid data - decoded as a single '?'
				ret++;
				start += 2;
			}
			else if (0x30 <= second && second <= 0x39) {
				// UCS mapping (four-byte sequence)
				if (start + 3 >= end) {
					// Incomplete tail: fewer than four
					// bytes left. GetChars writes nothing
					// for it, so it is not counted.
					break;
				}
				long value = GB18030Source.FromGBX (bytes, start);
				if (value < 0) {
					// invalid data. The negated return
					// value is the number of bytes to skip.
					ret++;
					start -= (int) value;
				} else if (value >= 0x10000) {
					// UTF16 surrogate pair
					ret += 2;
					start += 4;
				} else {
					// UTF16 BMP
					ret++;
					start += 4;
				}
			} else {
				// GB2312 mapping (two-byte sequence)
				start += 2;
				ret++;
			}
		}
		return ret;
	}

	// Decodes bytes [byteIndex .. byteIndex + byteCount) into chars,
	// starting at charIndex, and returns the number of chars written.
	//
	// Invalid input is replaced with '?'. A sequence that is cut off at
	// the end of the input is silently dropped. Input validation is
	// delegated to CheckRange.
	public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
	{
		CheckRange (bytes, byteIndex, byteCount, chars, charIndex);

		int byteEnd = byteIndex + byteCount;
		int charStart = charIndex;

		while (byteIndex < byteEnd) {
			if (bytes [byteIndex] < 0x80) {
				// ASCII maps to itself.
				chars [charIndex++] = (char) bytes [byteIndex++];
				continue;
			}
			else if (bytes [byteIndex] == 0x80) {
				// Euro sign - actually it is obsolete,
				// now it's just reserved but not used
				chars [charIndex++] = '\u20AC';
				byteIndex++;
				continue;
			}
			else if (bytes [byteIndex] == 0xFF) {
				// invalid data - fill '?'
				chars [charIndex++] = '?';
				byteIndex++;
				continue;
			}
			else if (byteIndex + 1 >= byteEnd) {
				break; // incomplete tail.
			}

			byte second = bytes [byteIndex + 1];
			if (second == 0x7F || second == 0xFF) {
				// invalid data
				chars [charIndex++] = '?';
				byteIndex += 2;
			}
			else if (0x30 <= second && second <= 0x39) {
				// UCS mapping (four-byte sequence)
				if (byteIndex + 3 >= byteEnd) {
					// incomplete tail.
					break;
				}
				long value = GB18030Source.FromGBX (bytes, byteIndex);
				if (value < 0) {
					// invalid data. The negated return
					// value is the number of bytes to skip.
					chars [charIndex++] = '?';
					byteIndex -= (int) value;
				} else if (value >= 0x10000) {
					// UTF16 surrogate: split the code point
					// into a high and a low surrogate.
					value -= 0x10000;
					chars [charIndex++] = (char) (value / 0x400 + 0xD800);
					chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
					byteIndex += 4;
				} else {
					// UTF16 BMP
					chars [charIndex++] = (char) value;
					byteIndex += 4;
				}
			} else {
				// Two-byte sequence: look it up in the table,
				// 191 entries per lead byte starting at 0x81,
				// second byte starting at 0x40.
				byte first = bytes [byteIndex];
				int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
				char c1 = ord < 0 || ord >= gb2312.n2u.Length ?
					'\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
				// A zero entry means "no mapping".
				if (c1 == 0)
					chars [charIndex++] = '?';
				else
					chars [charIndex++] = c1;
				byteIndex += 2;
			}
		}

		return charIndex - charStart;
	}
}
// Encoder from UTF-16 chars to GB18030 bytes.
//
// A surrogate that is the last char of the input is remembered between
// calls (incomplete_bytes for GetBytesImpl, incomplete_byte_count for
// GetByteCountImpl). When a call is made with refresh set, a still pending
// surrogate is encoded/counted as a single '?' byte and the state is reset.
class GB18030Encoder : MonoEncoder
{
	// Two-byte mapping table; u2n holds two bytes per UTF-16 code unit
	// (second byte at index ch * 2, lead byte at ch * 2 + 1).
	static readonly DbcsConvert gb2312 = DbcsConvert.Gb2312;

	public GB18030Encoder (MonoEncoding owner)
		: base (owner)
	{
	}

	// Pending surrogate seen by GetByteCountImpl (char.MinValue if none).
	char incomplete_byte_count;
	// Pending surrogate seen by GetBytesImpl (char.MinValue if none).
	char incomplete_bytes;

	// Returns the number of bytes needed to encode chars [0 .. count).
	//
	// The single-byte test matches the one in GetBytesImpl, so U+0080 is
	// counted as one byte, which is what GetBytesImpl writes for it.
	// NOTE(review): a surrogate followed by any other char is counted as
	// four bytes without checking that the pair is valid, whereas
	// GetBytesImpl handles an invalid pair separately - confirm whether
	// the count should follow that path too.
	public unsafe override int GetByteCountImpl (char* chars, int count, bool refresh)
	{
		int start = 0;
		int end = count;
		int ret = 0;
		while (start < end) {
			char ch = chars [start];
			if (ch < 0x80) {
				// ASCII
				ret++;
				start++;
				continue;
			} else if (Char.IsSurrogate (ch)) {
				// Surrogate
				if (start + 1 == end) {
					// Last char of the input: remember it.
					incomplete_byte_count = ch;
					start++;
				} else {
					ret += 4;
					start += 2;
				}
				continue;
			}

			if (ch <= 0x80 || ch == 0xFF) {
				// Character maps to itself (one byte), exactly
				// as in GetBytesImpl.
				ret++;
				start++;
				continue;
			}

			byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
			byte b2 = gb2312.u2n [((int) ch) * 2];
			if (b1 != 0 && b2 != 0) {
				// GB2312
				ret += 2;
				start++;
				continue;
			}

			// non-GB2312
			long value = GB18030Source.FromUCS (ch);
			if (value < 0)
				ret++; // invalid(?)
			else
				ret += 4;
			start++;
		}

		if (refresh) {
			// A pending surrogate is flushed as one '?' byte.
			if (incomplete_byte_count != char.MinValue)
				ret++;
			incomplete_byte_count = char.MinValue;
		}
		return ret;
	}

	// Encodes chars [0 .. charCount) into bytes and returns the number of
	// bytes written.
	// NOTE(review): byteCount is only passed on to HandleFallback; the
	// writes in this method are not checked against it - callers are
	// presumably expected to size the buffer with GetByteCountImpl.
	public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
	{
		int charIndex = 0;
		int byteIndex = 0;

		int charEnd = charIndex + charCount;
		int byteStart = byteIndex;
		// Start with the surrogate left over from the previous call,
		// if there is one.
		char ch = incomplete_bytes;

		while (charIndex < charEnd) {
			if (incomplete_bytes == char.MinValue)
				ch = chars [charIndex++];
			else
				incomplete_bytes = char.MinValue;

			if (ch < 0x80) {
				// ASCII
				bytes [byteIndex++] = (byte) ch;
				continue;
			} else if (Char.IsSurrogate (ch)) {
				// Surrogate
				if (charIndex == charEnd) {
					incomplete_bytes = ch;
					break; // incomplete
				}
				char ch2 = chars [charIndex++];
				if (!Char.IsSurrogate (ch2)) {
					// invalid surrogate
#if NET_2_0
					HandleFallback (
						chars, ref charIndex, ref charCount,
						bytes, ref byteIndex, ref byteCount);
#else
					bytes [byteIndex++] = (byte) '?';
#endif
					continue;
				}
				// Offset of the code point above U+10000.
				int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
				GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
				byteIndex += 4;
				continue;
			}

			if (ch <= 0x80 || ch == 0xFF) {
				// Character maps to itself
				bytes [byteIndex++] = (byte) ch;
				continue;
			}

			byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
			byte b2 = gb2312.u2n [((int) ch) * 2];
			if (b1 != 0 && b2 != 0) {
				// GB2312 (two bytes)
				bytes [byteIndex++] = b1;
				bytes [byteIndex++] = b2;
				continue;
			}

			long value = GB18030Source.FromUCS (ch);
			if (value < 0)
				bytes [byteIndex++] = 0x3F; // invalid(?)
			else {
				// non-GB2312 (four bytes)
				GB18030Source.Unlinear (bytes + byteIndex, value);
				byteIndex += 4;
			}
		}

		if (refresh) {
			// A pending surrogate is flushed as one '?' byte.
			if (incomplete_bytes != char.MinValue)
				bytes [byteIndex++] = 0x3F; // incomplete
			incomplete_bytes = char.MinValue;
		}

		return byteIndex - byteStart;
	}
}