libjava/gnu/java/nio/charset/UTF_8.java

   1 /* UTF_8.java --
   2    Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package gnu.java.nio.charset;
  39
  40 import java.nio.ByteBuffer;
  41 import java.nio.CharBuffer;
  42 import java.nio.charset.Charset;
  43 import java.nio.charset.CharsetDecoder;
  44 import java.nio.charset.CharsetEncoder;
  45 import java.nio.charset.CoderResult;
  46
  47 /**
  48  * UTF-8 charset.
  49  *
  50  * <p> UTF-8 references:
  51  * <ul>
  52  *   <li> <a href="http://ietf.org/rfc/rfc2279.txt">RFC 2279</a>
  53  *   <li> The <a href="http://www.unicode.org/unicode/standard/standard.html">
  54  *     Unicode standard</a> and
  55  *     <a href="http://www.unicode.org/versions/corrigendum1.html">
  56  *      Corrigendum</a>
  57  * </ul>
  58  *
  59  * @author Jesse Rosenstock
  60  */
  61 final class UTF_8 extends Charset
  62 {
  63   UTF_8 ()
  64   {
  65     super ("UTF-8", new String[] {
  66         /* These names are provided by
  67          * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL
  68          */
  69         "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305",
  70         "windows-65001", "cp1208",
  71         // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
  72         "UTF8"
  73     });
  74   }
  75
  76   public boolean contains (Charset cs)
  77   {
  78     return cs instanceof US_ASCII || cs instanceof ISO_8859_1
  79       || cs instanceof UTF_8 || cs instanceof UTF_16BE
  80       || cs instanceof UTF_16LE || cs instanceof UTF_16;
  81   }
  82
  83   public CharsetDecoder newDecoder ()
  84   {
  85     return new Decoder (this);
  86   }
  87
  88   public CharsetEncoder newEncoder ()
  89   {
  90     return new Encoder (this);
  91   }
  92
  93   private static final class Decoder extends CharsetDecoder
  94   {
  95     // Package-private to avoid a trampoline constructor.
  96     Decoder (Charset cs)
  97     {
  98       super (cs, 1.0f, 1.0f);
  99     }
 100
 101     protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out)
 102     {
 103       // TODO: Optimize this in the case in.hasArray() / out.hasArray()
 104       int inPos = in.position();
 105       try
 106         {
 107           while (in.hasRemaining ())
 108             {
 109               char c;
 110               byte b1 = in.get ();
 111               int highNibble = (b1 >> 4) & 0xF;
 112
 113               switch (highNibble)
 114                 {
 115                   case 0: case 1: case 2: case 3:
 116                   case 4: case 5: case 6: case 7:
 117                     if (out.remaining () < 1)
 118                       return CoderResult.OVERFLOW;
 119                     out.put ((char) b1);
 120                     inPos++;
 121                     break;
 122
 123                   case 0xC: case 0xD:
 124                     byte b2;
 125                     if (in.remaining () < 1)
 126                       return CoderResult.UNDERFLOW;
 127                     if (out.remaining () < 1)
 128                       return CoderResult.OVERFLOW;
 129                     if (!isContinuation (b2 = in.get ()))
 130                       return CoderResult.malformedForLength (1);
 131                     c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F));
 132                     // check that we had the shortest encoding
 133                     if (c <= 0x7F)
 134                       return CoderResult.malformedForLength (2);
 135                     out.put (c);
 136                     inPos += 2;
 137                     break;
 138
 139                   case 0xE:
 140                     byte b3;
 141                     if (in.remaining () < 2)
 142                       return CoderResult.UNDERFLOW;
 143                     if (out.remaining () < 1)
 144                       return CoderResult.OVERFLOW;
 145                     if (!isContinuation (b2 = in.get ()))
 146                       return CoderResult.malformedForLength (1);
 147                     if (!isContinuation (b3 = in.get ()))
 148                       return CoderResult.malformedForLength (1);
 149                     c = (char) (((b1 & 0x0F) << 12)
 150                                 | ((b2 & 0x3F) << 6)
 151                                 | (b3 & 0x3F));
 152                     // check that we had the shortest encoding
 153                     if (c <= 0x7FF)
 154                       return CoderResult.malformedForLength (3);
 155                     out.put (c);
 156                     inPos += 3;
 157                     break;
 158
 159                   default:
 160                     return CoderResult.malformedForLength (1);
 161                 }
 162             }
 163
 164           return CoderResult.UNDERFLOW;
 165         }
 166       finally
 167         {
 168           // In case we did a get(), then encountered an error, reset the
 169           // position to before the error.  If there was no error, this
 170           // will benignly reset the position to the value it already has.
 171           in.position (inPos);
 172         }
 173     }
 174
 175     private static boolean isContinuation (byte b)
 176     {
 177       return (b & 0xC0) == 0x80;
 178     }
 179   }
 180
 181   private static final class Encoder extends CharsetEncoder
 182   {
 183     // Package-private to avoid a trampoline constructor.
 184     Encoder (Charset cs)
 185     {
 186       // According to
 187       // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html
 188       //   On average, English takes slightly over one unit per code point.
 189       //   Most Latin-script languages take about 1.1 bytes. Greek, Russian,
 190       //   Arabic and Hebrew take about 1.7 bytes, and most others (including
 191       //   Japanese, Chinese, Korean and Hindi) take about 3 bytes.
 192       // We assume we will be dealing with latin scripts, and use 1.1
 193       // for averageBytesPerChar.
 194       super (cs, 1.1f, 4.0f);
 195     }
 196
 197     protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out)
 198     {
 199       int inPos = in.position();
 200       try
 201         {
 202           // TODO: Optimize this in the case in.hasArray() / out.hasArray()
 203           while (in.hasRemaining ())
 204           {
 205             int remaining = out.remaining ();
 206             char c = in.get ();
 207
 208             // UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 209             // 0000 0000-0000 007F   0xxxxxxx
 210             // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 211             // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 212
 213             //        Scalar Value          UTF-16                byte 1     byte 2     byte 3     byte 4
 214             //        0000 0000 0xxx xxxx   0000 0000 0xxx xxxx   0xxx xxxx
 215             //        0000 0yyy yyxx xxxx   0000 0yyy yyxx xxxx   110y yyyy  10xx xxxx
 216             //        zzzz yyyy yyxx xxxx   zzzz yyyy yyxx xxxx   1110 zzzz  10yy yyyy  10xx xxxx
 217             // u uuuu zzzz yyyy yyxx xxxx   1101 10ww wwzz zzyy   1111 0uuu  10uu zzzz  10yy yyyy  10xx xxxx
 218             //                            + 1101 11yy yyxx xxxx
 219             // Note: uuuuu = wwww + 1
 220
 221             if (c <= 0x7F)
 222               {
 223                 if (remaining < 1)
 224                   return CoderResult.OVERFLOW;
 225                 out.put ((byte) c);
 226                 inPos++;
 227               }
 228             else if (c <= 0x7FF)
 229               {
 230                 if (remaining < 2)
 231                   return CoderResult.OVERFLOW;
 232                 out.put ((byte) (0xC0 | (c >> 6)));
 233                 out.put ((byte) (0x80 | (c & 0x3F)));
 234                 inPos++;
 235               }
 236             else if (0xD800 <= c && c <= 0xDFFF)
 237               {
 238                 if (remaining < 4)
 239                   return CoderResult.OVERFLOW;
 240
 241                 // we got a low surrogate without a preciding high one
 242                 if (c > 0xDBFF)
 243                   return CoderResult.malformedForLength (1);
 244
 245                 // high surrogates
 246                 if (!in.hasRemaining ())
 247                   return CoderResult.UNDERFLOW;
 248
 249                 char d = in.get ();
 250
 251                 // make sure d is a low surrogate
 252                 if (d < 0xDC00 || d > 0xDFFF)
 253                   return CoderResult.malformedForLength (1);
 254
 255                 // make the 32 bit value
 256                 // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000;
 257                 int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000;
 258                 // assert value == value2;
 259                 out.put ((byte) (0xF0 | (value >> 18)));
 260                 out.put ((byte) (0x80 | ((value >> 12) & 0x3F)));
 261                 out.put ((byte) (0x80 | ((value >>  6) & 0x3F)));
 262                 out.put ((byte) (0x80 | ((value      ) & 0x3F)));
 263
 264                 inPos += 2;
 265               }
 266             else
 267               {
 268                 if (remaining < 3)
 269                   return CoderResult.OVERFLOW;
 270
 271                 out.put ((byte) (0xE0 | (c >> 12)));
 272                 out.put ((byte) (0x80 | ((c >> 6) & 0x3F)));
 273                 out.put ((byte) (0x80 | (c & 0x3F)));
 274                 inPos++;
 275               }
 276           }
 277
 278           return CoderResult.UNDERFLOW;
 279         }
 280       finally
 281         {
 282           // In case we did a get(), then encountered an error, reset the
 283           // position to before the error.  If there was no error, this
 284           // will benignly reset the position to the value it already has.
 285           in.position (inPos);
 286         }
 287     }
 288   }
 289 }