Merge from mainline (gomp-merge-2005-02-26).
[official-gcc.git] / libjava / gnu / java / nio / charset / UTF_8.java
blob87df64423601f7b72d56057cf5d9207d6947da29
1 /* UTF_8.java --
2 Copyright (C) 2002, 2004, 2005 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu.java.nio.charset;
40 import java.nio.ByteBuffer;
41 import java.nio.CharBuffer;
42 import java.nio.charset.Charset;
43 import java.nio.charset.CharsetDecoder;
44 import java.nio.charset.CharsetEncoder;
45 import java.nio.charset.CoderResult;
47 /**
48 * UTF-8 charset.
50 * <p> UTF-8 references:
51 * <ul>
52 * <li> <a href="http://ietf.org/rfc/rfc2279.txt">RFC 2279</a>
53 * <li> The <a href="http://www.unicode.org/unicode/standard/standard.html">
54 * Unicode standard</a> and
55 * <a href="http://www.unicode.org/versions/corrigendum1.html">
56 * Corrigendum</a>
57 * </ul>
59 * @author Jesse Rosenstock
61 final class UTF_8 extends Charset
63 UTF_8 ()
65 super ("UTF-8", new String[] {
66 /* These names are provided by
67 * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
69 "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305",
70 "windows-65001", "cp1208",
71 // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
72 "UTF8"
73 });
76 public boolean contains (Charset cs)
78 return cs instanceof US_ASCII || cs instanceof ISO_8859_1
79 || cs instanceof UTF_8 || cs instanceof UTF_16BE
80 || cs instanceof UTF_16LE || cs instanceof UTF_16;
83 public CharsetDecoder newDecoder ()
85 return new Decoder (this);
88 public CharsetEncoder newEncoder ()
90 return new Encoder (this);
93 private static final class Decoder extends CharsetDecoder
95 // Package-private to avoid a trampoline constructor.
96 Decoder (Charset cs)
98 super (cs, 1.0f, 1.0f);
101 protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out)
103 // TODO: Optimize this in the case in.hasArray() / out.hasArray()
104 int inPos = in.position();
107 while (in.hasRemaining ())
109 char c;
110 byte b1 = in.get ();
111 int highNibble = (b1 >> 4) & 0xF;
113 switch (highNibble)
115 case 0: case 1: case 2: case 3:
116 case 4: case 5: case 6: case 7:
117 if (out.remaining () < 1)
118 return CoderResult.OVERFLOW;
119 out.put ((char) b1);
120 inPos++;
121 break;
123 case 0xC: case 0xD:
124 byte b2;
125 if (in.remaining () < 1)
126 return CoderResult.UNDERFLOW;
127 if (out.remaining () < 1)
128 return CoderResult.OVERFLOW;
129 if (!isContinuation (b2 = in.get ()))
130 return CoderResult.malformedForLength (1);
131 c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F));
132 // check that we had the shortest encoding
133 if (c <= 0x7F)
134 return CoderResult.malformedForLength (2);
135 out.put (c);
136 inPos += 2;
137 break;
139 case 0xE:
140 byte b3;
141 if (in.remaining () < 2)
142 return CoderResult.UNDERFLOW;
143 if (out.remaining () < 1)
144 return CoderResult.OVERFLOW;
145 if (!isContinuation (b2 = in.get ()))
146 return CoderResult.malformedForLength (1);
147 if (!isContinuation (b3 = in.get ()))
148 return CoderResult.malformedForLength (1);
149 c = (char) (((b1 & 0x0F) << 12)
150 | ((b2 & 0x3F) << 6)
151 | (b3 & 0x3F));
152 // check that we had the shortest encoding
153 if (c <= 0x7FF)
154 return CoderResult.malformedForLength (3);
155 out.put (c);
156 inPos += 3;
157 break;
159 default:
160 return CoderResult.malformedForLength (1);
164 return CoderResult.UNDERFLOW;
166 finally
168 // In case we did a get(), then encountered an error, reset the
169 // position to before the error. If there was no error, this
170 // will benignly reset the position to the value it already has.
171 in.position (inPos);
175 private static boolean isContinuation (byte b)
177 return (b & 0xC0) == 0x80;
181 private static final class Encoder extends CharsetEncoder
183 // Package-private to avoid a trampoline constructor.
184 Encoder (Charset cs)
186 // According to
187 // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html
188 // On average, English takes slightly over one unit per code point.
189 // Most Latin-script languages take about 1.1 bytes. Greek, Russian,
190 // Arabic and Hebrew take about 1.7 bytes, and most others (including
191 // Japanese, Chinese, Korean and Hindi) take about 3 bytes.
192 // We assume we will be dealing with latin scripts, and use 1.1
193 // for averageBytesPerChar.
194 super (cs, 1.1f, 4.0f);
197 protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out)
199 int inPos = in.position();
202 // TODO: Optimize this in the case in.hasArray() / out.hasArray()
203 while (in.hasRemaining ())
205 int remaining = out.remaining ();
206 char c = in.get ();
208 // UCS-4 range (hex.) UTF-8 octet sequence (binary)
209 // 0000 0000-0000 007F 0xxxxxxx
210 // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
211 // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213 // Scalar Value UTF-16 byte 1 byte 2 byte 3 byte 4
214 // 0000 0000 0xxx xxxx 0000 0000 0xxx xxxx 0xxx xxxx
215 // 0000 0yyy yyxx xxxx 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
216 // zzzz yyyy yyxx xxxx zzzz yyyy yyxx xxxx 1110 zzzz 10yy yyyy 10xx xxxx
217 // u uuuu zzzz yyyy yyxx xxxx 1101 10ww wwzz zzyy 1111 0uuu 10uu zzzz 10yy yyyy 10xx xxxx
218 // + 1101 11yy yyxx xxxx
219 // Note: uuuuu = wwww + 1
221 if (c <= 0x7F)
223 if (remaining < 1)
224 return CoderResult.OVERFLOW;
225 out.put ((byte) c);
226 inPos++;
228 else if (c <= 0x7FF)
230 if (remaining < 2)
231 return CoderResult.OVERFLOW;
232 out.put ((byte) (0xC0 | (c >> 6)));
233 out.put ((byte) (0x80 | (c & 0x3F)));
234 inPos++;
236 else if (0xD800 <= c && c <= 0xDFFF)
238 if (remaining < 4)
239 return CoderResult.OVERFLOW;
241 // we got a low surrogate without a preciding high one
242 if (c > 0xDBFF)
243 return CoderResult.malformedForLength (1);
245 // high surrogates
246 if (!in.hasRemaining ())
247 return CoderResult.UNDERFLOW;
249 char d = in.get ();
251 // make sure d is a low surrogate
252 if (d < 0xDC00 || d > 0xDFFF)
253 return CoderResult.malformedForLength (1);
255 // make the 32 bit value
256 // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000;
257 int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000;
258 // assert value == value2;
259 out.put ((byte) (0xF0 | (value >> 18)));
260 out.put ((byte) (0x80 | ((value >> 12) & 0x3F)));
261 out.put ((byte) (0x80 | ((value >> 6) & 0x3F)));
262 out.put ((byte) (0x80 | ((value ) & 0x3F)));
264 inPos += 2;
266 else
268 if (remaining < 3)
269 return CoderResult.OVERFLOW;
271 out.put ((byte) (0xE0 | (c >> 12)));
272 out.put ((byte) (0x80 | ((c >> 6) & 0x3F)));
273 out.put ((byte) (0x80 | (c & 0x3F)));
274 inPos++;
278 return CoderResult.UNDERFLOW;
280 finally
282 // In case we did a get(), then encountered an error, reset the
283 // position to before the error. If there was no error, this
284 // will benignly reset the position to the value it already has.
285 in.position (inPos);