1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/break_iterator.h"
7 #include "base/strings/string_piece.h"
8 #include "base/strings/stringprintf.h"
9 #include "base/strings/utf_string_conversions.h"
10 #include "testing/gtest/include/gtest/gtest.h"
15 TEST(BreakIteratorTest
, BreakWordEmpty
) {
17 BreakIterator
iter(empty
, BreakIterator::BREAK_WORD
);
18 ASSERT_TRUE(iter
.Init());
19 EXPECT_FALSE(iter
.Advance());
20 EXPECT_FALSE(iter
.IsWord());
21 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
22 EXPECT_FALSE(iter
.IsWord());
25 TEST(BreakIteratorTest
, BreakWord
) {
26 string16
space(UTF8ToUTF16(" "));
27 string16
str(UTF8ToUTF16(" foo bar! \npouet boom"));
28 BreakIterator
iter(str
, BreakIterator::BREAK_WORD
);
29 ASSERT_TRUE(iter
.Init());
30 EXPECT_TRUE(iter
.Advance());
31 EXPECT_FALSE(iter
.IsWord());
32 EXPECT_EQ(space
, iter
.GetString());
33 EXPECT_TRUE(iter
.Advance());
34 EXPECT_TRUE(iter
.IsWord());
35 EXPECT_EQ(UTF8ToUTF16("foo"), iter
.GetString());
36 EXPECT_TRUE(iter
.Advance());
37 EXPECT_FALSE(iter
.IsWord());
38 EXPECT_EQ(space
, iter
.GetString());
39 EXPECT_TRUE(iter
.Advance());
40 EXPECT_TRUE(iter
.IsWord());
41 EXPECT_EQ(UTF8ToUTF16("bar"), iter
.GetString());
42 EXPECT_TRUE(iter
.Advance());
43 EXPECT_FALSE(iter
.IsWord());
44 EXPECT_EQ(UTF8ToUTF16("!"), iter
.GetString());
45 EXPECT_TRUE(iter
.Advance());
46 EXPECT_FALSE(iter
.IsWord());
47 EXPECT_EQ(space
, iter
.GetString());
48 EXPECT_TRUE(iter
.Advance());
49 EXPECT_FALSE(iter
.IsWord());
50 EXPECT_EQ(UTF8ToUTF16("\n"), iter
.GetString());
51 EXPECT_TRUE(iter
.Advance());
52 EXPECT_TRUE(iter
.IsWord());
53 EXPECT_EQ(UTF8ToUTF16("pouet"), iter
.GetString());
54 EXPECT_TRUE(iter
.Advance());
55 EXPECT_FALSE(iter
.IsWord());
56 EXPECT_EQ(space
, iter
.GetString());
57 EXPECT_TRUE(iter
.Advance());
58 EXPECT_TRUE(iter
.IsWord());
59 EXPECT_EQ(UTF8ToUTF16("boom"), iter
.GetString());
60 EXPECT_FALSE(iter
.Advance());
61 EXPECT_FALSE(iter
.IsWord());
62 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
63 EXPECT_FALSE(iter
.IsWord());
66 TEST(BreakIteratorTest
, BreakWide16
) {
67 // Two greek words separated by space.
68 const string16
str(WideToUTF16(
69 L
"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
70 L
"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
71 const string16
word1(str
.substr(0, 10));
72 const string16
word2(str
.substr(11, 5));
73 BreakIterator
iter(str
, BreakIterator::BREAK_WORD
);
74 ASSERT_TRUE(iter
.Init());
75 EXPECT_TRUE(iter
.Advance());
76 EXPECT_TRUE(iter
.IsWord());
77 EXPECT_EQ(word1
, iter
.GetString());
78 EXPECT_TRUE(iter
.Advance());
79 EXPECT_FALSE(iter
.IsWord());
80 EXPECT_EQ(UTF8ToUTF16(" "), iter
.GetString());
81 EXPECT_TRUE(iter
.Advance());
82 EXPECT_TRUE(iter
.IsWord());
83 EXPECT_EQ(word2
, iter
.GetString());
84 EXPECT_FALSE(iter
.Advance());
85 EXPECT_FALSE(iter
.IsWord());
86 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
87 EXPECT_FALSE(iter
.IsWord());
90 TEST(BreakIteratorTest
, BreakWide32
) {
91 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
92 const char very_wide_char
[] = "\xF0\x9D\x92\x9C";
94 UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char
)));
95 const string16
very_wide_word(str
.substr(0, 2));
97 BreakIterator
iter(str
, BreakIterator::BREAK_WORD
);
98 ASSERT_TRUE(iter
.Init());
99 EXPECT_TRUE(iter
.Advance());
100 EXPECT_TRUE(iter
.IsWord());
101 EXPECT_EQ(very_wide_word
, iter
.GetString());
102 EXPECT_TRUE(iter
.Advance());
103 EXPECT_FALSE(iter
.IsWord());
104 EXPECT_EQ(UTF8ToUTF16(" "), iter
.GetString());
105 EXPECT_TRUE(iter
.Advance());
106 EXPECT_TRUE(iter
.IsWord());
107 EXPECT_EQ(UTF8ToUTF16("a"), iter
.GetString());
108 EXPECT_FALSE(iter
.Advance());
109 EXPECT_FALSE(iter
.IsWord());
110 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
111 EXPECT_FALSE(iter
.IsWord());
114 TEST(BreakIteratorTest
, BreakSpaceEmpty
) {
116 BreakIterator
iter(empty
, BreakIterator::BREAK_SPACE
);
117 ASSERT_TRUE(iter
.Init());
118 EXPECT_FALSE(iter
.Advance());
119 EXPECT_FALSE(iter
.IsWord());
120 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
121 EXPECT_FALSE(iter
.IsWord());
124 TEST(BreakIteratorTest
, BreakSpace
) {
125 string16
str(UTF8ToUTF16(" foo bar! \npouet boom"));
126 BreakIterator
iter(str
, BreakIterator::BREAK_SPACE
);
127 ASSERT_TRUE(iter
.Init());
128 EXPECT_TRUE(iter
.Advance());
129 EXPECT_FALSE(iter
.IsWord());
130 EXPECT_EQ(UTF8ToUTF16(" "), iter
.GetString());
131 EXPECT_TRUE(iter
.Advance());
132 EXPECT_FALSE(iter
.IsWord());
133 EXPECT_EQ(UTF8ToUTF16("foo "), iter
.GetString());
134 EXPECT_TRUE(iter
.Advance());
135 EXPECT_FALSE(iter
.IsWord());
136 EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter
.GetString());
137 EXPECT_TRUE(iter
.Advance());
138 EXPECT_FALSE(iter
.IsWord());
139 EXPECT_EQ(UTF8ToUTF16("pouet "), iter
.GetString());
140 EXPECT_TRUE(iter
.Advance());
141 EXPECT_FALSE(iter
.IsWord());
142 EXPECT_EQ(UTF8ToUTF16("boom"), iter
.GetString());
143 EXPECT_FALSE(iter
.Advance());
144 EXPECT_FALSE(iter
.IsWord());
145 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
146 EXPECT_FALSE(iter
.IsWord());
149 TEST(BreakIteratorTest
, BreakSpaceSP
) {
150 string16
str(UTF8ToUTF16(" foo bar! \npouet boom "));
151 BreakIterator
iter(str
, BreakIterator::BREAK_SPACE
);
152 ASSERT_TRUE(iter
.Init());
153 EXPECT_TRUE(iter
.Advance());
154 EXPECT_FALSE(iter
.IsWord());
155 EXPECT_EQ(UTF8ToUTF16(" "), iter
.GetString());
156 EXPECT_TRUE(iter
.Advance());
157 EXPECT_FALSE(iter
.IsWord());
158 EXPECT_EQ(UTF8ToUTF16("foo "), iter
.GetString());
159 EXPECT_TRUE(iter
.Advance());
160 EXPECT_FALSE(iter
.IsWord());
161 EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter
.GetString());
162 EXPECT_TRUE(iter
.Advance());
163 EXPECT_FALSE(iter
.IsWord());
164 EXPECT_EQ(UTF8ToUTF16("pouet "), iter
.GetString());
165 EXPECT_TRUE(iter
.Advance());
166 EXPECT_FALSE(iter
.IsWord());
167 EXPECT_EQ(UTF8ToUTF16("boom "), iter
.GetString());
168 EXPECT_FALSE(iter
.Advance());
169 EXPECT_FALSE(iter
.IsWord());
170 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
171 EXPECT_FALSE(iter
.IsWord());
174 TEST(BreakIteratorTest
, BreakSpacekWide16
) {
176 const string16
str(WideToUTF16(
177 L
"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
178 L
"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
179 const string16
word1(str
.substr(0, 11));
180 const string16
word2(str
.substr(11, 5));
181 BreakIterator
iter(str
, BreakIterator::BREAK_SPACE
);
182 ASSERT_TRUE(iter
.Init());
183 EXPECT_TRUE(iter
.Advance());
184 EXPECT_FALSE(iter
.IsWord());
185 EXPECT_EQ(word1
, iter
.GetString());
186 EXPECT_TRUE(iter
.Advance());
187 EXPECT_FALSE(iter
.IsWord());
188 EXPECT_EQ(word2
, iter
.GetString());
189 EXPECT_FALSE(iter
.Advance());
190 EXPECT_FALSE(iter
.IsWord());
191 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
192 EXPECT_FALSE(iter
.IsWord());
195 TEST(BreakIteratorTest
, BreakSpaceWide32
) {
196 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
197 const char very_wide_char
[] = "\xF0\x9D\x92\x9C";
199 UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char
)));
200 const string16
very_wide_word(str
.substr(0, 3));
202 BreakIterator
iter(str
, BreakIterator::BREAK_SPACE
);
203 ASSERT_TRUE(iter
.Init());
204 EXPECT_TRUE(iter
.Advance());
205 EXPECT_FALSE(iter
.IsWord());
206 EXPECT_EQ(very_wide_word
, iter
.GetString());
207 EXPECT_TRUE(iter
.Advance());
208 EXPECT_FALSE(iter
.IsWord());
209 EXPECT_EQ(UTF8ToUTF16("a"), iter
.GetString());
210 EXPECT_FALSE(iter
.Advance());
211 EXPECT_FALSE(iter
.IsWord());
212 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
213 EXPECT_FALSE(iter
.IsWord());
216 TEST(BreakIteratorTest
, BreakLineEmpty
) {
218 BreakIterator
iter(empty
, BreakIterator::BREAK_NEWLINE
);
219 ASSERT_TRUE(iter
.Init());
220 EXPECT_FALSE(iter
.Advance());
221 EXPECT_FALSE(iter
.IsWord());
222 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
223 EXPECT_FALSE(iter
.IsWord());
226 TEST(BreakIteratorTest
, BreakLine
) {
227 string16
nl(UTF8ToUTF16("\n"));
228 string16
str(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
229 BreakIterator
iter(str
, BreakIterator::BREAK_NEWLINE
);
230 ASSERT_TRUE(iter
.Init());
231 EXPECT_TRUE(iter
.Advance());
232 EXPECT_FALSE(iter
.IsWord());
233 EXPECT_EQ(nl
, iter
.GetString());
234 EXPECT_TRUE(iter
.Advance());
235 EXPECT_FALSE(iter
.IsWord());
236 EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter
.GetString());
237 EXPECT_TRUE(iter
.Advance());
238 EXPECT_FALSE(iter
.IsWord());
239 EXPECT_EQ(nl
, iter
.GetString());
240 EXPECT_TRUE(iter
.Advance());
241 EXPECT_FALSE(iter
.IsWord());
242 EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter
.GetString());
243 EXPECT_FALSE(iter
.Advance());
244 EXPECT_FALSE(iter
.IsWord());
245 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
246 EXPECT_FALSE(iter
.IsWord());
249 TEST(BreakIteratorTest
, BreakLineNL
) {
250 string16
nl(UTF8ToUTF16("\n"));
251 string16
str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
252 BreakIterator
iter(str
, BreakIterator::BREAK_NEWLINE
);
253 ASSERT_TRUE(iter
.Init());
254 EXPECT_TRUE(iter
.Advance());
255 EXPECT_FALSE(iter
.IsWord());
256 EXPECT_EQ(nl
, iter
.GetString());
257 EXPECT_TRUE(iter
.Advance());
258 EXPECT_FALSE(iter
.IsWord());
259 EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter
.GetString());
260 EXPECT_TRUE(iter
.Advance());
261 EXPECT_FALSE(iter
.IsWord());
262 EXPECT_EQ(nl
, iter
.GetString());
263 EXPECT_TRUE(iter
.Advance());
264 EXPECT_FALSE(iter
.IsWord());
265 EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter
.GetString());
266 EXPECT_FALSE(iter
.Advance());
267 EXPECT_FALSE(iter
.IsWord());
268 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
269 EXPECT_FALSE(iter
.IsWord());
272 TEST(BreakIteratorTest
, BreakLineWide16
) {
273 // Two Greek words separated by newline.
274 const string16
str(WideToUTF16(
275 L
"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
276 L
"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
277 const string16
line1(str
.substr(0, 11));
278 const string16
line2(str
.substr(11, 5));
279 BreakIterator
iter(str
, BreakIterator::BREAK_NEWLINE
);
280 ASSERT_TRUE(iter
.Init());
281 EXPECT_TRUE(iter
.Advance());
282 EXPECT_FALSE(iter
.IsWord());
283 EXPECT_EQ(line1
, iter
.GetString());
284 EXPECT_TRUE(iter
.Advance());
285 EXPECT_FALSE(iter
.IsWord());
286 EXPECT_EQ(line2
, iter
.GetString());
287 EXPECT_FALSE(iter
.Advance());
288 EXPECT_FALSE(iter
.IsWord());
289 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
290 EXPECT_FALSE(iter
.IsWord());
293 TEST(BreakIteratorTest
, BreakLineWide32
) {
294 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
295 const char very_wide_char
[] = "\xF0\x9D\x92\x9C";
297 UTF8ToUTF16(base::StringPrintf("%s\na", very_wide_char
)));
298 const string16
very_wide_line(str
.substr(0, 3));
299 BreakIterator
iter(str
, BreakIterator::BREAK_NEWLINE
);
300 ASSERT_TRUE(iter
.Init());
301 EXPECT_TRUE(iter
.Advance());
302 EXPECT_FALSE(iter
.IsWord());
303 EXPECT_EQ(very_wide_line
, iter
.GetString());
304 EXPECT_TRUE(iter
.Advance());
305 EXPECT_FALSE(iter
.IsWord());
306 EXPECT_EQ(UTF8ToUTF16("a"), iter
.GetString());
307 EXPECT_FALSE(iter
.Advance());
308 EXPECT_FALSE(iter
.IsWord());
309 EXPECT_FALSE(iter
.Advance()); // Test unexpected advance after end.
310 EXPECT_FALSE(iter
.IsWord());
313 TEST(BreakIteratorTest
, BreakCharacter
) {
314 static const wchar_t* kCharacters
[] = {
315 // An English word consisting of four ASCII characters.
316 L
"w", L
"o", L
"r", L
"d", L
" ",
317 // A Hindi word (which means "Hindi") consisting of three Devanagari
319 L
"\x0939\x093F", L
"\x0928\x094D", L
"\x0926\x0940", L
" ",
320 // A Thai word (which means "feel") consisting of three Thai characters.
321 L
"\x0E23\x0E39\x0E49", L
"\x0E2A\x0E36", L
"\x0E01", L
" ",
323 std::vector
<string16
> characters
;
325 for (size_t i
= 0; i
< arraysize(kCharacters
); ++i
) {
326 characters
.push_back(WideToUTF16(kCharacters
[i
]));
327 text
.append(characters
.back());
329 BreakIterator
iter(text
, BreakIterator::BREAK_CHARACTER
);
330 ASSERT_TRUE(iter
.Init());
331 for (size_t i
= 0; i
< arraysize(kCharacters
); ++i
) {
332 EXPECT_TRUE(iter
.Advance());
333 EXPECT_EQ(characters
[i
], iter
.GetString());
337 // Test for https://code.google.com/p/chromium/issues/detail?id=411213
338 // We should be able to get valid substrings with GetString() function
339 // after setting new content by calling SetText().
340 TEST(BreakIteratorTest
, GetStringAfterSetText
) {
341 const string16
initial_string(ASCIIToUTF16("str"));
342 BreakIterator
iter(initial_string
, BreakIterator::BREAK_WORD
);
343 ASSERT_TRUE(iter
.Init());
345 const string16
long_string(ASCIIToUTF16("another,string"));
346 EXPECT_TRUE(iter
.SetText(long_string
.c_str(), long_string
.size()));
347 EXPECT_TRUE(iter
.Advance());
348 EXPECT_TRUE(iter
.Advance()); // Advance to ',' in |long_string|
350 // Check that the current position is out of bounds of the |initial_string|.
351 EXPECT_LT(initial_string
.size(), iter
.pos());
353 // Check that we can get a valid substring of |long_string|.
354 EXPECT_EQ(ASCIIToUTF16(","), iter
.GetString());
357 TEST(BreakIteratorTest
, GetStringPiece
) {
358 const string16
initial_string(ASCIIToUTF16("some string"));
359 BreakIterator
iter(initial_string
, BreakIterator::BREAK_WORD
);
360 ASSERT_TRUE(iter
.Init());
362 EXPECT_TRUE(iter
.Advance());
363 EXPECT_EQ(iter
.GetString(), iter
.GetStringPiece().as_string());
364 EXPECT_EQ(StringPiece16(ASCIIToUTF16("some")), iter
.GetStringPiece());
366 EXPECT_TRUE(iter
.Advance());
367 EXPECT_TRUE(iter
.Advance());
368 EXPECT_EQ(iter
.GetString(), iter
.GetStringPiece().as_string());
369 EXPECT_EQ(StringPiece16(ASCIIToUTF16("string")), iter
.GetStringPiece());