libjava/gnu/java/text/WordBreakIterator.java

   1 /* WordBreakIterator.java - Default word BreakIterator.
   2    Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package gnu.java.text;
  40
  41 import java.text.CharacterIterator;
  42
  43 /**
  44  * @author Tom Tromey <tromey@cygnus.com>
  45  * @date March 22, 1999
  46  * Written using The Unicode Standard, Version 2.0.
  47  */
  48
  49 public class WordBreakIterator extends BaseBreakIterator
  50 {
  51   public Object clone ()
  52   {
  53     return new WordBreakIterator (this);
  54   }
  55
  56   public WordBreakIterator ()
  57   {
  58   }
  59
  60   private WordBreakIterator (WordBreakIterator other)
  61   {
  62     iter = (CharacterIterator) other.iter.clone();
  63   }
  64
  65   // Some methods to tell us different properties of characters.
  66   private final boolean isHira (char c)
  67   {
  68     return c >= 0x3040 && c <= 0x309f;
  69   }
  70   private final boolean isKata (char c)
  71   {
  72     return c >= 0x30a0 && c <= 0x30ff;
  73   }
  74   private final boolean isHan (char c)
  75   {
  76     return c >= 0x4e00 && c <= 0x9fff;
  77   }
  78
  79   public int next ()
  80   {
  81     int end = iter.getEndIndex();
  82     if (iter.getIndex() == end)
  83       return DONE;
  84
  85     while (iter.getIndex() < end)
  86       {
  87         char c = iter.current();
  88         if (c == CharacterIterator.DONE)
  89           break;
  90         int type = Character.getType(c);
  91
  92         char n = iter.next();
  93         if (n == CharacterIterator.DONE)
  94           break;
  95
  96         // Break after paragraph separators.
  97         if (type == Character.PARAGRAPH_SEPARATOR
  98             || type == Character.LINE_SEPARATOR)
  99           break;
 100
 101         // Break between letters and non-letters.
 102         // FIXME: we treat apostrophe as part of a word.  This
 103         // is an English-ism.
 104         boolean is_letter = Character.isLetter(c);
 105         if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
 106             && Character.isLetter(n))
 107           break;
 108
 109         // Always break after certain symbols, such as punctuation.
 110         // This heuristic is derived from hints in the JCL book and is
 111         // not part of Unicode.  It seems to be right, however.
 112         // FIXME: we treat apostrophe as part of a word.  This
 113         // is an English-ism.
 114         if (c != '\''
 115             && (type == Character.DASH_PUNCTUATION
 116                 || type == Character.START_PUNCTUATION
 117                 || type == Character.END_PUNCTUATION
 118                 || type == Character.CONNECTOR_PUNCTUATION
 119                 || type == Character.OTHER_PUNCTUATION
 120                 || type == Character.MATH_SYMBOL
 121                 || type == Character.CURRENCY_SYMBOL
 122                 || type == Character.MODIFIER_SYMBOL
 123                 || type == Character.OTHER_SYMBOL
 124                 || type == Character.FORMAT
 125                 || type == Character.CONTROL))
 126           break;
 127
 128         boolean is_hira = isHira (c);
 129         boolean is_kata = isKata (c);
 130         boolean is_han = isHan (c);
 131
 132         // Special case Japanese.
 133         if (! is_hira && ! is_kata && ! is_han
 134             && type != Character.NON_SPACING_MARK
 135             && (isHira (n) || isKata (n) || isHan (n)))
 136           break;
 137
 138         if (is_hira || is_kata || is_han || is_letter)
 139           {
 140             // Now we need to do some lookahead.  We might need to do
 141             // quite a bit of lookahead, so we save our position and
 142             // restore it later.
 143             int save = iter.getIndex();
 144             // Skip string of non spacing marks.
 145             while (n != CharacterIterator.DONE
 146                    && Character.getType(n) == Character.NON_SPACING_MARK)
 147               n = iter.next();
 148             if (n == CharacterIterator.DONE)
 149               break;
 150             if ((is_hira && ! isHira (n))
 151                 || (is_kata && ! isHira (n) && ! isKata (n))
 152                 || (is_han && ! isHira (n) && ! isHan (n))
 153                 // FIXME: we treat apostrophe as part of a word.  This
 154                 // is an English-ism.
 155                 || (is_letter && ! Character.isLetter(n) && n != '\''))
 156               break;
 157             iter.setIndex(save);
 158           }
 159       }
 160
 161     return iter.getIndex();
 162   }
 163
 164   public int previous ()
 165   {
 166     int start = iter.getBeginIndex();
 167     if (iter.getIndex() == start)
 168       return DONE;
 169
 170     while (iter.getIndex() >= start)
 171       {
 172         char c = iter.previous();
 173         if (c == CharacterIterator.DONE)
 174           break;
 175
 176         boolean is_hira = isHira (c);
 177         boolean is_kata = isKata (c);
 178         boolean is_han = isHan (c);
 179         boolean is_letter = Character.isLetter(c);
 180
 181         char n = iter.previous();
 182         if (n == CharacterIterator.DONE)
 183           break;
 184         iter.next();
 185         int type = Character.getType(n);
 186         // Break after paragraph separators.
 187         if (type == Character.PARAGRAPH_SEPARATOR
 188             || type == Character.LINE_SEPARATOR)
 189           break;
 190
 191         // Break between letters and non-letters.
 192         // FIXME: we treat apostrophe as part of a word.  This
 193         // is an English-ism.
 194         if (n != '\'' && ! Character.isLetter(n)
 195             && type != Character.NON_SPACING_MARK
 196             && is_letter)
 197           break;
 198
 199         // Always break after certain symbols, such as punctuation.
 200         // This heuristic is derived from hints in the JCL book and is
 201         // not part of Unicode.  It seems to be right, however.
 202         // FIXME: we treat apostrophe as part of a word.  This
 203         // is an English-ism.
 204         if (n != '\''
 205             && (type == Character.DASH_PUNCTUATION
 206                 || type == Character.START_PUNCTUATION
 207                 || type == Character.END_PUNCTUATION
 208                 || type == Character.CONNECTOR_PUNCTUATION
 209                 || type == Character.OTHER_PUNCTUATION
 210                 || type == Character.MATH_SYMBOL
 211                 || type == Character.CURRENCY_SYMBOL
 212                 || type == Character.MODIFIER_SYMBOL
 213                 || type == Character.OTHER_SYMBOL
 214                 || type == Character.FORMAT
 215                 || type == Character.CONTROL))
 216           break;
 217
 218         // Special case Japanese.
 219         if ((is_hira || is_kata || is_han)
 220             && ! isHira (n) && ! isKata (n) && ! isHan (n)
 221             && type != Character.NON_SPACING_MARK)
 222           break;
 223
 224         // We might have to skip over non spacing marks to see what's
 225         // on the other side.
 226         if (! is_hira || (! is_letter && c != '\''))
 227           {
 228             int save = iter.getIndex();
 229             while (n != CharacterIterator.DONE
 230                    && Character.getType(n) == Character.NON_SPACING_MARK)
 231               n = iter.previous();
 232             iter.setIndex(save);
 233             // This is a strange case: a bunch of non-spacing marks at
 234             // the beginning.  We treat the current location as a word
 235             // break.
 236             if (n == CharacterIterator.DONE)
 237               break;
 238             if ((isHira (n) && ! is_hira)
 239                 || (isKata (n) && ! is_hira && ! is_kata)
 240                 || (isHan (n) && ! is_hira && ! is_han)
 241                 // FIXME: we treat apostrophe as part of a word.  This
 242                 // is an English-ism.
 243                 || (! is_letter && c != '\'' && Character.isLetter(n)))
 244               break;
 245           }
 246       }
 247
 248     return iter.getIndex();
 249   }
 250 }