/* UTF_8.java --
   Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version. */

package gnu.java.nio.charset;

// Buffer and coder types used by the UTF-8 decoder and encoder below.
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

50 * <p> UTF-8 references:
52 * <li> <a href="http://ietf.org/rfc/rfc2279.txt">RFC 2279</a>
53 * <li> The <a href="http://www.unicode.org/unicode/standard/standard.html">
54 * Unicode standard</a> and
55 * <a href="http://www.unicode.org/versions/corrigendum1.html">
59 * @author Jesse Rosenstock
61 final class UTF_8
extends Charset
65 super ("UTF-8", new String
[] {
66 /* These names are provided by
67 * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
69 "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305",
70 "windows-65001", "cp1208",
71 // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
76 public boolean contains (Charset cs
)
78 return cs
instanceof US_ASCII
|| cs
instanceof ISO_8859_1
79 || cs
instanceof UTF_8
|| cs
instanceof UTF_16BE
80 || cs
instanceof UTF_16LE
|| cs
instanceof UTF_16
;
83 public CharsetDecoder
newDecoder ()
85 return new Decoder (this);
88 public CharsetEncoder
newEncoder ()
90 return new Encoder (this);
93 private static final class Decoder
extends CharsetDecoder
95 // Package-private to avoid a trampoline constructor.
98 super (cs
, 1.0f
, 1.0f
);
101 protected CoderResult
decodeLoop (ByteBuffer in
, CharBuffer out
)
103 // TODO: Optimize this in the case in.hasArray() / out.hasArray()
104 int inPos
= in
.position();
107 while (in
.hasRemaining ())
111 int highNibble
= (b1
>> 4) & 0xF;
115 case 0: case 1: case 2: case 3:
116 case 4: case 5: case 6: case 7:
117 if (out
.remaining () < 1)
118 return CoderResult
.OVERFLOW
;
125 if (in
.remaining () < 1)
126 return CoderResult
.UNDERFLOW
;
127 if (out
.remaining () < 1)
128 return CoderResult
.OVERFLOW
;
129 if (!isContinuation (b2
= in
.get ()))
130 return CoderResult
.malformedForLength (1);
131 c
= (char) (((b1
& 0x1F) << 6) | (b2
& 0x3F));
132 // check that we had the shortest encoding
134 return CoderResult
.malformedForLength (2);
141 if (in
.remaining () < 2)
142 return CoderResult
.UNDERFLOW
;
143 if (out
.remaining () < 1)
144 return CoderResult
.OVERFLOW
;
145 if (!isContinuation (b2
= in
.get ()))
146 return CoderResult
.malformedForLength (1);
147 if (!isContinuation (b3
= in
.get ()))
148 return CoderResult
.malformedForLength (1);
149 c
= (char) (((b1
& 0x0F) << 12)
152 // check that we had the shortest encoding
154 return CoderResult
.malformedForLength (3);
160 return CoderResult
.malformedForLength (1);
164 return CoderResult
.UNDERFLOW
;
168 // In case we did a get(), then encountered an error, reset the
169 // position to before the error. If there was no error, this
170 // will benignly reset the position to the value it already has.
175 private static boolean isContinuation (byte b
)
177 return (b
& 0xC0) == 0x80;
181 private static final class Encoder
extends CharsetEncoder
183 // Package-private to avoid a trampoline constructor.
187 // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html
188 // On average, English takes slightly over one unit per code point.
189 // Most Latin-script languages take about 1.1 bytes. Greek, Russian,
190 // Arabic and Hebrew take about 1.7 bytes, and most others (including
191 // Japanese, Chinese, Korean and Hindi) take about 3 bytes.
192 // We assume we will be dealing with latin scripts, and use 1.1
193 // for averageBytesPerChar.
194 super (cs
, 1.1f
, 4.0f
);
197 protected CoderResult
encodeLoop (CharBuffer in
, ByteBuffer out
)
199 int inPos
= in
.position();
202 // TODO: Optimize this in the case in.hasArray() / out.hasArray()
203 while (in
.hasRemaining ())
205 int remaining
= out
.remaining ();
208 // UCS-4 range (hex.) UTF-8 octet sequence (binary)
209 // 0000 0000-0000 007F 0xxxxxxx
210 // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
211 // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
213 // Scalar Value UTF-16 byte 1 byte 2 byte 3 byte 4
214 // 0000 0000 0xxx xxxx 0000 0000 0xxx xxxx 0xxx xxxx
215 // 0000 0yyy yyxx xxxx 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
216 // zzzz yyyy yyxx xxxx zzzz yyyy yyxx xxxx 1110 zzzz 10yy yyyy 10xx xxxx
217 // u uuuu zzzz yyyy yyxx xxxx 1101 10ww wwzz zzyy 1111 0uuu 10uu zzzz 10yy yyyy 10xx xxxx
218 // + 1101 11yy yyxx xxxx
219 // Note: uuuuu = wwww + 1
224 return CoderResult
.OVERFLOW
;
231 return CoderResult
.OVERFLOW
;
232 out
.put ((byte) (0xC0 | (c
>> 6)));
233 out
.put ((byte) (0x80 | (c
& 0x3F)));
236 else if (0xD800 <= c
&& c
<= 0xDFFF)
239 return CoderResult
.OVERFLOW
;
241 // we got a low surrogate without a preciding high one
243 return CoderResult
.malformedForLength (1);
246 if (!in
.hasRemaining ())
247 return CoderResult
.UNDERFLOW
;
251 // make sure d is a low surrogate
252 if (d
< 0xDC00 || d
> 0xDFFF)
253 return CoderResult
.malformedForLength (1);
255 // make the 32 bit value
256 // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000;
257 int value
= (((c
& 0x3FF) << 10) | (d
& 0x3FF)) + 0x10000;
258 // assert value == value2;
259 out
.put ((byte) (0xF0 | (value
>> 18)));
260 out
.put ((byte) (0x80 | ((value
>> 12) & 0x3F)));
261 out
.put ((byte) (0x80 | ((value
>> 6) & 0x3F)));
262 out
.put ((byte) (0x80 | ((value
) & 0x3F)));
269 return CoderResult
.OVERFLOW
;
271 out
.put ((byte) (0xE0 | (c
>> 12)));
272 out
.put ((byte) (0x80 | ((c
>> 6) & 0x3F)));
273 out
.put ((byte) (0x80 | (c
& 0x3F)));
278 return CoderResult
.UNDERFLOW
;
282 // In case we did a get(), then encountered an error, reset the
283 // position to before the error. If there was no error, this
284 // will benignly reset the position to the value it already has.