Merge from mainline (gomp-merge-2005-02-26).
[official-gcc.git] / libjava / gnu / java / text / WordBreakIterator.java
blob85c44da456950f2485ce18cb8970591cdc874cc6
1 /* WordBreakIterator.java - Default word BreakIterator.
2 Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu.java.text;
41 import java.text.CharacterIterator;
43 /**
44 * @author Tom Tromey <tromey@cygnus.com>
45 * @date March 22, 1999
46 * Written using The Unicode Standard, Version 2.0.
49 public class WordBreakIterator extends BaseBreakIterator
51 public Object clone ()
53 return new WordBreakIterator (this);
56 public WordBreakIterator ()
60 private WordBreakIterator (WordBreakIterator other)
62 iter = (CharacterIterator) other.iter.clone();
65 // Some methods to tell us different properties of characters.
66 private final boolean isHira (char c)
68 return c >= 0x3040 && c <= 0x309f;
70 private final boolean isKata (char c)
72 return c >= 0x30a0 && c <= 0x30ff;
74 private final boolean isHan (char c)
76 return c >= 0x4e00 && c <= 0x9fff;
79 public int next ()
81 int end = iter.getEndIndex();
82 if (iter.getIndex() == end)
83 return DONE;
85 while (iter.getIndex() < end)
87 char c = iter.current();
88 if (c == CharacterIterator.DONE)
89 break;
90 int type = Character.getType(c);
92 char n = iter.next();
93 if (n == CharacterIterator.DONE)
94 break;
96 // Break after paragraph separators.
97 if (type == Character.PARAGRAPH_SEPARATOR
98 || type == Character.LINE_SEPARATOR)
99 break;
101 // Break between letters and non-letters.
102 // FIXME: we treat apostrophe as part of a word. This
103 // is an English-ism.
104 boolean is_letter = Character.isLetter(c);
105 if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
106 && Character.isLetter(n))
107 break;
109 // Always break after certain symbols, such as punctuation.
110 // This heuristic is derived from hints in the JCL book and is
111 // not part of Unicode. It seems to be right, however.
112 // FIXME: we treat apostrophe as part of a word. This
113 // is an English-ism.
114 if (c != '\''
115 && (type == Character.DASH_PUNCTUATION
116 || type == Character.START_PUNCTUATION
117 || type == Character.END_PUNCTUATION
118 || type == Character.CONNECTOR_PUNCTUATION
119 || type == Character.OTHER_PUNCTUATION
120 || type == Character.MATH_SYMBOL
121 || type == Character.CURRENCY_SYMBOL
122 || type == Character.MODIFIER_SYMBOL
123 || type == Character.OTHER_SYMBOL
124 || type == Character.FORMAT
125 || type == Character.CONTROL))
126 break;
128 boolean is_hira = isHira (c);
129 boolean is_kata = isKata (c);
130 boolean is_han = isHan (c);
132 // Special case Japanese.
133 if (! is_hira && ! is_kata && ! is_han
134 && type != Character.NON_SPACING_MARK
135 && (isHira (n) || isKata (n) || isHan (n)))
136 break;
138 if (is_hira || is_kata || is_han || is_letter)
140 // Now we need to do some lookahead. We might need to do
141 // quite a bit of lookahead, so we save our position and
142 // restore it later.
143 int save = iter.getIndex();
144 // Skip string of non spacing marks.
145 while (n != CharacterIterator.DONE
146 && Character.getType(n) == Character.NON_SPACING_MARK)
147 n = iter.next();
148 if (n == CharacterIterator.DONE)
149 break;
150 if ((is_hira && ! isHira (n))
151 || (is_kata && ! isHira (n) && ! isKata (n))
152 || (is_han && ! isHira (n) && ! isHan (n))
153 // FIXME: we treat apostrophe as part of a word. This
154 // is an English-ism.
155 || (is_letter && ! Character.isLetter(n) && n != '\''))
156 break;
157 iter.setIndex(save);
161 return iter.getIndex();
164 public int previous ()
166 int start = iter.getBeginIndex();
167 if (iter.getIndex() == start)
168 return DONE;
170 while (iter.getIndex() >= start)
172 char c = iter.previous();
173 if (c == CharacterIterator.DONE)
174 break;
176 boolean is_hira = isHira (c);
177 boolean is_kata = isKata (c);
178 boolean is_han = isHan (c);
179 boolean is_letter = Character.isLetter(c);
181 char n = iter.previous();
182 if (n == CharacterIterator.DONE)
183 break;
184 iter.next();
185 int type = Character.getType(n);
186 // Break after paragraph separators.
187 if (type == Character.PARAGRAPH_SEPARATOR
188 || type == Character.LINE_SEPARATOR)
189 break;
191 // Break between letters and non-letters.
192 // FIXME: we treat apostrophe as part of a word. This
193 // is an English-ism.
194 if (n != '\'' && ! Character.isLetter(n)
195 && type != Character.NON_SPACING_MARK
196 && is_letter)
197 break;
199 // Always break after certain symbols, such as punctuation.
200 // This heuristic is derived from hints in the JCL book and is
201 // not part of Unicode. It seems to be right, however.
202 // FIXME: we treat apostrophe as part of a word. This
203 // is an English-ism.
204 if (n != '\''
205 && (type == Character.DASH_PUNCTUATION
206 || type == Character.START_PUNCTUATION
207 || type == Character.END_PUNCTUATION
208 || type == Character.CONNECTOR_PUNCTUATION
209 || type == Character.OTHER_PUNCTUATION
210 || type == Character.MATH_SYMBOL
211 || type == Character.CURRENCY_SYMBOL
212 || type == Character.MODIFIER_SYMBOL
213 || type == Character.OTHER_SYMBOL
214 || type == Character.FORMAT
215 || type == Character.CONTROL))
216 break;
218 // Special case Japanese.
219 if ((is_hira || is_kata || is_han)
220 && ! isHira (n) && ! isKata (n) && ! isHan (n)
221 && type != Character.NON_SPACING_MARK)
222 break;
224 // We might have to skip over non spacing marks to see what's
225 // on the other side.
226 if (! is_hira || (! is_letter && c != '\''))
228 int save = iter.getIndex();
229 while (n != CharacterIterator.DONE
230 && Character.getType(n) == Character.NON_SPACING_MARK)
231 n = iter.previous();
232 iter.setIndex(save);
233 // This is a strange case: a bunch of non-spacing marks at
234 // the beginning. We treat the current location as a word
235 // break.
236 if (n == CharacterIterator.DONE)
237 break;
238 if ((isHira (n) && ! is_hira)
239 || (isKata (n) && ! is_hira && ! is_kata)
240 || (isHan (n) && ! is_hira && ! is_han)
241 // FIXME: we treat apostrophe as part of a word. This
242 // is an English-ism.
243 || (! is_letter && c != '\'' && Character.isLetter(n)))
244 break;
248 return iter.getIndex();