1 /* WordBreakIterator.java - Default word BreakIterator.
2 Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu
.java
.text
;
41 import java
.text
.CharacterIterator
;
44 * @author Tom Tromey <tromey@cygnus.com>
45 * @date March 22, 1999
46 * Written using The Unicode Standard, Version 2.0.
49 public class WordBreakIterator
extends BaseBreakIterator
51 public Object
clone ()
53 return new WordBreakIterator (this);
56 public WordBreakIterator ()
60 private WordBreakIterator (WordBreakIterator other
)
62 iter
= (CharacterIterator
) other
.iter
.clone();
65 // Some methods to tell us different properties of characters.
66 private final boolean isHira (char c
)
68 return c
>= 0x3040 && c
<= 0x309f;
70 private final boolean isKata (char c
)
72 return c
>= 0x30a0 && c
<= 0x30ff;
74 private final boolean isHan (char c
)
76 return c
>= 0x4e00 && c
<= 0x9fff;
81 int end
= iter
.getEndIndex();
82 if (iter
.getIndex() == end
)
85 while (iter
.getIndex() < end
)
87 char c
= iter
.current();
88 if (c
== CharacterIterator
.DONE
)
90 int type
= Character
.getType(c
);
93 if (n
== CharacterIterator
.DONE
)
96 // Break after paragraph separators.
97 if (type
== Character
.PARAGRAPH_SEPARATOR
98 || type
== Character
.LINE_SEPARATOR
)
101 // Break between letters and non-letters.
102 // FIXME: we treat apostrophe as part of a word. This
103 // is an English-ism.
104 boolean is_letter
= Character
.isLetter(c
);
105 if (c
!= '\'' && ! is_letter
&& type
!= Character
.NON_SPACING_MARK
106 && Character
.isLetter(n
))
109 // Always break after certain symbols, such as punctuation.
110 // This heuristic is derived from hints in the JCL book and is
111 // not part of Unicode. It seems to be right, however.
112 // FIXME: we treat apostrophe as part of a word. This
113 // is an English-ism.
115 && (type
== Character
.DASH_PUNCTUATION
116 || type
== Character
.START_PUNCTUATION
117 || type
== Character
.END_PUNCTUATION
118 || type
== Character
.CONNECTOR_PUNCTUATION
119 || type
== Character
.OTHER_PUNCTUATION
120 || type
== Character
.MATH_SYMBOL
121 || type
== Character
.CURRENCY_SYMBOL
122 || type
== Character
.MODIFIER_SYMBOL
123 || type
== Character
.OTHER_SYMBOL
124 || type
== Character
.FORMAT
125 || type
== Character
.CONTROL
))
128 boolean is_hira
= isHira (c
);
129 boolean is_kata
= isKata (c
);
130 boolean is_han
= isHan (c
);
132 // Special case Japanese.
133 if (! is_hira
&& ! is_kata
&& ! is_han
134 && type
!= Character
.NON_SPACING_MARK
135 && (isHira (n
) || isKata (n
) || isHan (n
)))
138 if (is_hira
|| is_kata
|| is_han
|| is_letter
)
140 // Now we need to do some lookahead. We might need to do
141 // quite a bit of lookahead, so we save our position and
143 int save
= iter
.getIndex();
144 // Skip string of non spacing marks.
145 while (n
!= CharacterIterator
.DONE
146 && Character
.getType(n
) == Character
.NON_SPACING_MARK
)
148 if (n
== CharacterIterator
.DONE
)
150 if ((is_hira
&& ! isHira (n
))
151 || (is_kata
&& ! isHira (n
) && ! isKata (n
))
152 || (is_han
&& ! isHira (n
) && ! isHan (n
))
153 // FIXME: we treat apostrophe as part of a word. This
154 // is an English-ism.
155 || (is_letter
&& ! Character
.isLetter(n
) && n
!= '\''))
161 return iter
.getIndex();
164 public int previous ()
166 int start
= iter
.getBeginIndex();
167 if (iter
.getIndex() == start
)
170 while (iter
.getIndex() >= start
)
172 char c
= iter
.previous();
173 if (c
== CharacterIterator
.DONE
)
176 boolean is_hira
= isHira (c
);
177 boolean is_kata
= isKata (c
);
178 boolean is_han
= isHan (c
);
179 boolean is_letter
= Character
.isLetter(c
);
181 char n
= iter
.previous();
182 if (n
== CharacterIterator
.DONE
)
185 int type
= Character
.getType(n
);
186 // Break after paragraph separators.
187 if (type
== Character
.PARAGRAPH_SEPARATOR
188 || type
== Character
.LINE_SEPARATOR
)
191 // Break between letters and non-letters.
192 // FIXME: we treat apostrophe as part of a word. This
193 // is an English-ism.
194 if (n
!= '\'' && ! Character
.isLetter(n
)
195 && type
!= Character
.NON_SPACING_MARK
199 // Always break after certain symbols, such as punctuation.
200 // This heuristic is derived from hints in the JCL book and is
201 // not part of Unicode. It seems to be right, however.
202 // FIXME: we treat apostrophe as part of a word. This
203 // is an English-ism.
205 && (type
== Character
.DASH_PUNCTUATION
206 || type
== Character
.START_PUNCTUATION
207 || type
== Character
.END_PUNCTUATION
208 || type
== Character
.CONNECTOR_PUNCTUATION
209 || type
== Character
.OTHER_PUNCTUATION
210 || type
== Character
.MATH_SYMBOL
211 || type
== Character
.CURRENCY_SYMBOL
212 || type
== Character
.MODIFIER_SYMBOL
213 || type
== Character
.OTHER_SYMBOL
214 || type
== Character
.FORMAT
215 || type
== Character
.CONTROL
))
218 // Special case Japanese.
219 if ((is_hira
|| is_kata
|| is_han
)
220 && ! isHira (n
) && ! isKata (n
) && ! isHan (n
)
221 && type
!= Character
.NON_SPACING_MARK
)
224 // We might have to skip over non spacing marks to see what's
225 // on the other side.
226 if (! is_hira
|| (! is_letter
&& c
!= '\''))
228 int save
= iter
.getIndex();
229 while (n
!= CharacterIterator
.DONE
230 && Character
.getType(n
) == Character
.NON_SPACING_MARK
)
233 // This is a strange case: a bunch of non-spacing marks at
234 // the beginning. We treat the current location as a word
236 if (n
== CharacterIterator
.DONE
)
238 if ((isHira (n
) && ! is_hira
)
239 || (isKata (n
) && ! is_hira
&& ! is_kata
)
240 || (isHan (n
) && ! is_hira
&& ! is_han
)
241 // FIXME: we treat apostrophe as part of a word. This
242 // is an English-ism.
243 || (! is_letter
&& c
!= '\'' && Character
.isLetter(n
)))
248 return iter
.getIndex();