FSF GCC merge 02/23/03
[official-gcc.git] / libjava / gnu / java / text / WordBreakIterator.java
blob3b6aae8c8540f57a81b70797e6279b973d0c6993
1 /* WordBreakIterator.java - Default word BreakIterator.
2 Copyright (C) 1999, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu.java.text;
41 import java.text.BreakIterator;
42 import java.text.CharacterIterator;
44 /**
45 * @author Tom Tromey <tromey@cygnus.com>
46 * @date March 22, 1999
47 * Written using The Unicode Standard, Version 2.0.
50 public class WordBreakIterator extends BaseBreakIterator
52 public Object clone ()
54 return new WordBreakIterator (this);
57 public WordBreakIterator ()
59 iter = null;
62 private WordBreakIterator (WordBreakIterator other)
64 iter = (CharacterIterator) other.iter.clone();
67 // Some methods to tell us different properties of characters.
68 private final boolean isHira (char c)
70 return c >= 0x3040 && c <= 0x309f;
72 private final boolean isKata (char c)
74 return c >= 0x30a0 && c <= 0x30ff;
76 private final boolean isHan (char c)
78 return c >= 0x4e00 && c <= 0x9fff;
81 public int next ()
83 int end = iter.getEndIndex();
84 if (iter.getIndex() == end)
85 return DONE;
87 while (iter.getIndex() < end)
89 char c = iter.current();
90 if (c == CharacterIterator.DONE)
91 break;
92 int type = Character.getType(c);
94 char n = iter.next();
95 if (n == CharacterIterator.DONE)
96 break;
98 // Break after paragraph separators.
99 if (type == Character.PARAGRAPH_SEPARATOR
100 || type == Character.LINE_SEPARATOR)
101 break;
103 // Break between letters and non-letters.
104 // FIXME: we treat apostrophe as part of a word. This
105 // is an English-ism.
106 boolean is_letter = Character.isLetter(c);
107 if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
108 && Character.isLetter(n))
109 break;
111 // Always break after certain symbols, such as punctuation.
112 // This heuristic is derived from hints in the JCL book and is
113 // not part of Unicode. It seems to be right, however.
114 // FIXME: we treat apostrophe as part of a word. This
115 // is an English-ism.
116 if (c != '\''
117 && (type == Character.DASH_PUNCTUATION
118 || type == Character.START_PUNCTUATION
119 || type == Character.END_PUNCTUATION
120 || type == Character.CONNECTOR_PUNCTUATION
121 || type == Character.OTHER_PUNCTUATION
122 || type == Character.MATH_SYMBOL
123 || type == Character.CURRENCY_SYMBOL
124 || type == Character.MODIFIER_SYMBOL
125 || type == Character.OTHER_SYMBOL
126 || type == Character.FORMAT
127 || type == Character.CONTROL))
128 break;
130 boolean is_hira = isHira (c);
131 boolean is_kata = isKata (c);
132 boolean is_han = isHan (c);
134 // Special case Japanese.
135 if (! is_hira && ! is_kata && ! is_han
136 && type != Character.NON_SPACING_MARK
137 && (isHira (n) || isKata (n) || isHan (n)))
138 break;
140 if (is_hira || is_kata || is_han || is_letter)
142 // Now we need to do some lookahead. We might need to do
143 // quite a bit of lookahead, so we save our position and
144 // restore it later.
145 int save = iter.getIndex();
146 // Skip string of non spacing marks.
147 while (n != CharacterIterator.DONE
148 && Character.getType(n) == Character.NON_SPACING_MARK)
149 n = iter.next();
150 if (n == CharacterIterator.DONE)
151 break;
152 if ((is_hira && ! isHira (n))
153 || (is_kata && ! isHira (n) && ! isKata (n))
154 || (is_han && ! isHira (n) && ! isHan (n))
155 // FIXME: we treat apostrophe as part of a word. This
156 // is an English-ism.
157 || (is_letter && ! Character.isLetter(n) && n != '\''))
158 break;
159 iter.setIndex(save);
163 return iter.getIndex();
166 public int previous ()
168 int start = iter.getBeginIndex();
169 if (iter.getIndex() == start)
170 return DONE;
172 while (iter.getIndex() >= start)
174 char c = iter.previous();
175 if (c == CharacterIterator.DONE)
176 break;
178 boolean is_hira = isHira (c);
179 boolean is_kata = isKata (c);
180 boolean is_han = isHan (c);
181 boolean is_letter = Character.isLetter(c);
183 char n = iter.previous();
184 if (n == CharacterIterator.DONE)
185 break;
186 iter.next();
187 int type = Character.getType(n);
188 // Break after paragraph separators.
189 if (type == Character.PARAGRAPH_SEPARATOR
190 || type == Character.LINE_SEPARATOR)
191 break;
193 // Break between letters and non-letters.
194 // FIXME: we treat apostrophe as part of a word. This
195 // is an English-ism.
196 if (n != '\'' && ! Character.isLetter(n)
197 && type != Character.NON_SPACING_MARK
198 && is_letter)
199 break;
201 // Always break after certain symbols, such as punctuation.
202 // This heuristic is derived from hints in the JCL book and is
203 // not part of Unicode. It seems to be right, however.
204 // FIXME: we treat apostrophe as part of a word. This
205 // is an English-ism.
206 if (n != '\''
207 && (type == Character.DASH_PUNCTUATION
208 || type == Character.START_PUNCTUATION
209 || type == Character.END_PUNCTUATION
210 || type == Character.CONNECTOR_PUNCTUATION
211 || type == Character.OTHER_PUNCTUATION
212 || type == Character.MATH_SYMBOL
213 || type == Character.CURRENCY_SYMBOL
214 || type == Character.MODIFIER_SYMBOL
215 || type == Character.OTHER_SYMBOL
216 || type == Character.FORMAT
217 || type == Character.CONTROL))
218 break;
220 // Special case Japanese.
221 if ((is_hira || is_kata || is_han)
222 && ! isHira (n) && ! isKata (n) && ! isHan (n)
223 && type != Character.NON_SPACING_MARK)
224 break;
226 // We might have to skip over non spacing marks to see what's
227 // on the other side.
228 if (! is_hira || (! is_letter && c != '\''))
230 int save = iter.getIndex();
231 while (n != CharacterIterator.DONE
232 && Character.getType(n) == Character.NON_SPACING_MARK)
233 n = iter.previous();
234 iter.setIndex(save);
235 // This is a strange case: a bunch of non-spacing marks at
236 // the beginning. We treat the current location as a word
237 // break.
238 if (n == CharacterIterator.DONE)
239 break;
240 if ((isHira (n) && ! is_hira)
241 || (isKata (n) && ! is_hira && ! is_kata)
242 || (isHan (n) && ! is_hira && ! is_han)
243 // FIXME: we treat apostrophe as part of a word. This
244 // is an English-ism.
245 || (! is_letter && c != '\'' && Character.isLetter(n)))
246 break;
250 return iter.getIndex();