libjava/gnu/java/text/WordBreakIterator.java

   1 /* WordBreakIterator.java - Default word BreakIterator.
   2    Copyright (C) 1999, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package gnu.java.text;
  40
  41 import java.text.BreakIterator;
  42 import java.text.CharacterIterator;
  43
  44 /**
  45  * @author Tom Tromey <tromey@cygnus.com>
  46  * @date March 22, 1999
  47  * Written using The Unicode Standard, Version 2.0.
  48  */
  49
  50 public class WordBreakIterator extends BaseBreakIterator
  51 {
  52   public Object clone ()
  53   {
  54     return new WordBreakIterator (this);
  55   }
  56
  57   public WordBreakIterator ()
  58   {
  59     iter = null;
  60   }
  61
  62   private WordBreakIterator (WordBreakIterator other)
  63   {
  64     iter = (CharacterIterator) other.iter.clone();
  65   }
  66
  67   // Some methods to tell us different properties of characters.
  68   private final boolean isHira (char c)
  69   {
  70     return c >= 0x3040 && c <= 0x309f;
  71   }
  72   private final boolean isKata (char c)
  73   {
  74     return c >= 0x30a0 && c <= 0x30ff;
  75   }
  76   private final boolean isHan (char c)
  77   {
  78     return c >= 0x4e00 && c <= 0x9fff;
  79   }
  80
  81   public int next ()
  82   {
  83     int end = iter.getEndIndex();
  84     if (iter.getIndex() == end)
  85       return DONE;
  86
  87     while (iter.getIndex() < end)
  88       {
  89         char c = iter.current();
  90         if (c == CharacterIterator.DONE)
  91           break;
  92         int type = Character.getType(c);
  93
  94         char n = iter.next();
  95         if (n == CharacterIterator.DONE)
  96           break;
  97
  98         // Break after paragraph separators.
  99         if (type == Character.PARAGRAPH_SEPARATOR
 100             || type == Character.LINE_SEPARATOR)
 101           break;
 102
 103         // Break between letters and non-letters.
 104         // FIXME: we treat apostrophe as part of a word.  This
 105         // is an English-ism.
 106         boolean is_letter = Character.isLetter(c);
 107         if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
 108             && Character.isLetter(n))
 109           break;
 110
 111         // Always break after certain symbols, such as punctuation.
 112         // This heuristic is derived from hints in the JCL book and is
 113         // not part of Unicode.  It seems to be right, however.
 114         // FIXME: we treat apostrophe as part of a word.  This
 115         // is an English-ism.
 116         if (c != '\''
 117             && (type == Character.DASH_PUNCTUATION
 118                 || type == Character.START_PUNCTUATION
 119                 || type == Character.END_PUNCTUATION
 120                 || type == Character.CONNECTOR_PUNCTUATION
 121                 || type == Character.OTHER_PUNCTUATION
 122                 || type == Character.MATH_SYMBOL
 123                 || type == Character.CURRENCY_SYMBOL
 124                 || type == Character.MODIFIER_SYMBOL
 125                 || type == Character.OTHER_SYMBOL
 126                 || type == Character.FORMAT
 127                 || type == Character.CONTROL))
 128           break;
 129
 130         boolean is_hira = isHira (c);
 131         boolean is_kata = isKata (c);
 132         boolean is_han = isHan (c);
 133
 134         // Special case Japanese.
 135         if (! is_hira && ! is_kata && ! is_han
 136             && type != Character.NON_SPACING_MARK
 137             && (isHira (n) || isKata (n) || isHan (n)))
 138           break;
 139
 140         if (is_hira || is_kata || is_han || is_letter)
 141           {
 142             // Now we need to do some lookahead.  We might need to do
 143             // quite a bit of lookahead, so we save our position and
 144             // restore it later.
 145             int save = iter.getIndex();
 146             // Skip string of non spacing marks.
 147             while (n != CharacterIterator.DONE
 148                    && Character.getType(n) == Character.NON_SPACING_MARK)
 149               n = iter.next();
 150             if (n == CharacterIterator.DONE)
 151               break;
 152             if ((is_hira && ! isHira (n))
 153                 || (is_kata && ! isHira (n) && ! isKata (n))
 154                 || (is_han && ! isHira (n) && ! isHan (n))
 155                 // FIXME: we treat apostrophe as part of a word.  This
 156                 // is an English-ism.
 157                 || (is_letter && ! Character.isLetter(n) && n != '\''))
 158               break;
 159             iter.setIndex(save);
 160           }
 161       }
 162
 163     return iter.getIndex();
 164   }
 165
 166   public int previous ()
 167   {
 168     int start = iter.getBeginIndex();
 169     if (iter.getIndex() == start)
 170       return DONE;
 171
 172     while (iter.getIndex() >= start)
 173       {
 174         char c = iter.previous();
 175         if (c == CharacterIterator.DONE)
 176           break;
 177
 178         boolean is_hira = isHira (c);
 179         boolean is_kata = isKata (c);
 180         boolean is_han = isHan (c);
 181         boolean is_letter = Character.isLetter(c);
 182
 183         char n = iter.previous();
 184         if (n == CharacterIterator.DONE)
 185           break;
 186         iter.next();
 187         int type = Character.getType(n);
 188         // Break after paragraph separators.
 189         if (type == Character.PARAGRAPH_SEPARATOR
 190             || type == Character.LINE_SEPARATOR)
 191           break;
 192
 193         // Break between letters and non-letters.
 194         // FIXME: we treat apostrophe as part of a word.  This
 195         // is an English-ism.
 196         if (n != '\'' && ! Character.isLetter(n)
 197             && type != Character.NON_SPACING_MARK
 198             && is_letter)
 199           break;
 200
 201         // Always break after certain symbols, such as punctuation.
 202         // This heuristic is derived from hints in the JCL book and is
 203         // not part of Unicode.  It seems to be right, however.
 204         // FIXME: we treat apostrophe as part of a word.  This
 205         // is an English-ism.
 206         if (n != '\''
 207             && (type == Character.DASH_PUNCTUATION
 208                 || type == Character.START_PUNCTUATION
 209                 || type == Character.END_PUNCTUATION
 210                 || type == Character.CONNECTOR_PUNCTUATION
 211                 || type == Character.OTHER_PUNCTUATION
 212                 || type == Character.MATH_SYMBOL
 213                 || type == Character.CURRENCY_SYMBOL
 214                 || type == Character.MODIFIER_SYMBOL
 215                 || type == Character.OTHER_SYMBOL
 216                 || type == Character.FORMAT
 217                 || type == Character.CONTROL))
 218           break;
 219
 220         // Special case Japanese.
 221         if ((is_hira || is_kata || is_han)
 222             && ! isHira (n) && ! isKata (n) && ! isHan (n)
 223             && type != Character.NON_SPACING_MARK)
 224           break;
 225
 226         // We might have to skip over non spacing marks to see what's
 227         // on the other side.
 228         if (! is_hira || (! is_letter && c != '\''))
 229           {
 230             int save = iter.getIndex();
 231             while (n != CharacterIterator.DONE
 232                    && Character.getType(n) == Character.NON_SPACING_MARK)
 233               n = iter.previous();
 234             iter.setIndex(save);
 235             // This is a strange case: a bunch of non-spacing marks at
 236             // the beginning.  We treat the current location as a word
 237             // break.
 238             if (n == CharacterIterator.DONE)
 239               break;
 240             if ((isHira (n) && ! is_hira)
 241                 || (isKata (n) && ! is_hira && ! is_kata)
 242                 || (isHan (n) && ! is_hira && ! is_han)
 243                 // FIXME: we treat apostrophe as part of a word.  This
 244                 // is an English-ism.
 245                 || (! is_letter && c != '\'' && Character.isLetter(n)))
 246               break;
 247           }
 248       }
 249
 250     return iter.getIndex();
 251   }
 252 }