Document xapian-compact --blocksize takes an argument
[xapian.git] / xapian-core / tests / api_spelling.cc
blobea2e90dd4f7affa0f00e26cd57a8289babc10bf6
/** @file api_spelling.cc
 * @brief Test the spelling correction suggestion API.
 */
/* Copyright (C) 2007,2008,2009,2010,2011 Olly Betts
 * Copyright (C) 2007 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #include <config.h>
24 #include "api_spelling.h"
26 #include <xapian.h>
28 #include "apitest.h"
29 #include "testsuite.h"
30 #include "testutils.h"
32 #include <string>
34 using namespace std;
36 // Test add_spelling() and remove_spelling(), which remote dbs support.
37 DEFINE_TESTCASE(spell0, spelling || remote) {
38 Xapian::WritableDatabase db = get_writable_database();
40 db.add_spelling("hello");
41 db.add_spelling("cell", 2);
42 db.commit();
43 db.add_spelling("zig");
44 db.add_spelling("ch");
45 db.add_spelling("hello", 2);
46 db.remove_spelling("hello", 2);
47 db.remove_spelling("cell", 6);
48 db.commit();
49 db.remove_spelling("hello");
50 db.remove_spelling("nonsuch");
51 db.remove_spelling("zzzzzzzzz", 1000000);
52 db.remove_spelling("aarvark");
53 db.remove_spelling("hello");
54 db.commit();
55 db.remove_spelling("hello");
57 return true;
60 // Test basic spelling correction features.
61 DEFINE_TESTCASE(spell1, spelling) {
62 Xapian::WritableDatabase db = get_writable_database();
64 // Check that the more frequent term is chosen.
65 db.add_spelling("hello");
66 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
67 db.add_spelling("cell", 2);
68 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
69 db.commit();
70 Xapian::Database dbr(get_writable_database_as_database());
71 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
72 TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "cell");
74 // Check suggestions for single edit errors to "zig".
75 db.add_spelling("zig");
76 // Transpositions:
77 TEST_EQUAL(db.get_spelling_suggestion("izg"), "zig");
78 TEST_EQUAL(db.get_spelling_suggestion("zgi"), "zig");
79 // Substitutions:
80 TEST_EQUAL(db.get_spelling_suggestion("sig"), "zig");
81 TEST_EQUAL(db.get_spelling_suggestion("zog"), "zig");
82 TEST_EQUAL(db.get_spelling_suggestion("zif"), "zig");
83 // Deletions:
84 TEST_EQUAL(db.get_spelling_suggestion("ig"), "zig");
85 TEST_EQUAL(db.get_spelling_suggestion("zg"), "zig");
86 TEST_EQUAL(db.get_spelling_suggestion("zi"), "zig");
87 // Insertions:
88 TEST_EQUAL(db.get_spelling_suggestion("azig"), "zig");
89 TEST_EQUAL(db.get_spelling_suggestion("zaig"), "zig");
90 TEST_EQUAL(db.get_spelling_suggestion("ziag"), "zig");
91 TEST_EQUAL(db.get_spelling_suggestion("ziga"), "zig");
93 // Check suggestions for single edit errors to "ch".
94 db.add_spelling("ch");
95 // Transpositions:
96 TEST_EQUAL(db.get_spelling_suggestion("hc"), "ch");
97 // Substitutions - we don't handle these for two character words:
98 TEST_EQUAL(db.get_spelling_suggestion("qh"), "");
99 TEST_EQUAL(db.get_spelling_suggestion("cq"), "");
100 // Deletions would leave a single character, and we don't handle those.
101 TEST_EQUAL(db.get_spelling_suggestion("c"), "");
102 TEST_EQUAL(db.get_spelling_suggestion("h"), "");
103 // Insertions:
104 TEST_EQUAL(db.get_spelling_suggestion("qch"), "ch");
105 TEST_EQUAL(db.get_spelling_suggestion("cqh"), "ch");
106 TEST_EQUAL(db.get_spelling_suggestion("chq"), "ch");
108 // Check assorted cases:
109 TEST_EQUAL(db.get_spelling_suggestion("shello"), "hello");
110 TEST_EQUAL(db.get_spelling_suggestion("hellot"), "hello");
111 TEST_EQUAL(db.get_spelling_suggestion("acell"), "cell");
112 TEST_EQUAL(db.get_spelling_suggestion("cella"), "cell");
113 TEST_EQUAL(db.get_spelling_suggestion("acella"), "cell");
114 TEST_EQUAL(db.get_spelling_suggestion("helo"), "hello");
115 TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
116 TEST_EQUAL(db.get_spelling_suggestion("helol"), "hello");
117 TEST_EQUAL(db.get_spelling_suggestion("clel"), "cell");
118 TEST_EQUAL(db.get_spelling_suggestion("ecll"), "cell");
119 TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
121 // Check that edit distance 3 isn't found by default:
122 TEST_EQUAL(db.get_spelling_suggestion("shelolx"), "");
123 TEST_EQUAL(db.get_spelling_suggestion("celling"), "");
124 TEST_EQUAL(db.get_spelling_suggestion("dellin"), "");
126 // Check that edit distance 3 is found if specified:
127 TEST_EQUAL(db.get_spelling_suggestion("shelolx", 3), "hello");
128 TEST_EQUAL(db.get_spelling_suggestion("celling", 3), "cell");
129 TEST_EQUAL(db.get_spelling_suggestion("dellin", 3), "cell");
131 // Make "hello" more frequent than "cell" (3 vs 2).
132 db.add_spelling("hello", 2);
133 TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
134 db.commit();
135 TEST_EQUAL(db.get_spelling_suggestion("cello"), "hello");
136 db.remove_spelling("hello", 2);
137 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
138 // Test "over-removing".
139 db.remove_spelling("cell", 6);
140 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
141 db.commit();
142 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
143 db.remove_spelling("hello");
144 TEST_EQUAL(db.get_spelling_suggestion("cell"), "");
146 // Test removing words not in the table.
147 db.remove_spelling("nonsuch");
148 db.remove_spelling("zzzzzzzzz", 1000000);
149 db.remove_spelling("aarvark");
151 // Try removing word which was present but no longer is.
152 db.remove_spelling("hello");
153 db.commit();
154 db.remove_spelling("hello");
156 return true;
159 // Test spelling correction for Unicode.
160 DEFINE_TESTCASE(spell2, spelling) {
161 Xapian::WritableDatabase db = get_writable_database();
163 // Check that a UTF-8 sequence counts as a single character.
164 db.add_spelling("h\xc3\xb6hle");
165 db.add_spelling("ascii");
166 TEST_EQUAL(db.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
167 TEST_EQUAL(db.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
168 TEST_EQUAL(db.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
169 TEST_EQUAL(db.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
170 TEST_EQUAL(db.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
171 TEST_EQUAL(db.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
172 db.commit();
173 Xapian::Database dbr(get_writable_database_as_database());
174 TEST_EQUAL(dbr.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
175 TEST_EQUAL(dbr.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
176 TEST_EQUAL(dbr.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
177 TEST_EQUAL(dbr.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
178 TEST_EQUAL(dbr.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
179 TEST_EQUAL(dbr.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
181 return true;
184 // Test spelling correction with multi databases
185 DEFINE_TESTCASE(spell3, spelling) {
186 Xapian::WritableDatabase db1 = get_writable_database();
187 // We can't just call get_writable_database() since it would delete db1
188 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
189 // changes to db1 are committed.
190 Xapian::WritableDatabase db2 = get_named_writable_database("spell3", "");
192 db1.add_spelling("hello");
193 db1.add_spelling("cell", 2);
194 db2.add_spelling("hello", 2);
195 db2.add_spelling("helo");
197 Xapian::Database db;
198 db.add_database(db1);
199 db.add_database(db2);
201 TEST_EQUAL(db.get_spelling_suggestion("hello"), "");
202 TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
203 TEST_EQUAL(db1.get_spelling_suggestion("hell"), "cell");
204 TEST_EQUAL(db2.get_spelling_suggestion("hell"), "hello");
207 // Test spelling iterator
208 Xapian::TermIterator i(db1.spellings_begin());
209 TEST_EQUAL(*i, "cell");
210 TEST_EQUAL(i.get_termfreq(), 2);
211 ++i;
212 TEST_EQUAL(*i, "hello");
213 TEST_EQUAL(i.get_termfreq(), 1);
214 ++i;
215 TEST(i == db1.spellings_end());
217 i = db2.spellings_begin();
218 TEST_EQUAL(*i, "hello");
219 TEST_EQUAL(i.get_termfreq(), 2);
220 ++i;
221 TEST_EQUAL(*i, "helo");
222 TEST_EQUAL(i.get_termfreq(), 1);
223 ++i;
224 TEST(i == db2.spellings_end());
226 i = db.spellings_begin();
227 TEST_EQUAL(*i, "cell");
228 TEST_EQUAL(i.get_termfreq(), 2);
229 ++i;
230 TEST_EQUAL(*i, "hello");
231 TEST_EQUAL(i.get_termfreq(), 3);
232 ++i;
233 TEST_EQUAL(*i, "helo");
234 TEST_EQUAL(i.get_termfreq(), 1);
235 ++i;
236 TEST(i == db.spellings_end());
238 return true;
241 // Regression test - check that appending works correctly.
242 DEFINE_TESTCASE(spell4, spelling) {
243 Xapian::WritableDatabase db = get_writable_database();
245 db.add_spelling("check");
246 db.add_spelling("pecks", 2);
247 db.commit();
248 db.add_spelling("becky");
249 db.commit();
251 TEST_EQUAL(db.get_spelling_suggestion("jeck", 2), "pecks");
253 return true;
256 // Regression test - used to segfault with some input values.
257 DEFINE_TESTCASE(spell5, spelling) {
258 const char * target = "\xe4\xb8\x80\xe4\xba\x9b";
260 Xapian::WritableDatabase db = get_writable_database();
261 db.add_spelling(target);
262 db.commit();
264 string s = db.get_spelling_suggestion("\xe4\xb8\x8d", 3);
265 TEST_EQUAL(s, target);
267 return true;
270 // Test basic spelling correction features.
271 DEFINE_TESTCASE(spell6, spelling) {
272 Xapian::WritableDatabase db = get_writable_database();
274 // Check that the more frequent term is chosen.
275 db.add_spelling("hello", 2);
276 db.add_spelling("sell", 3);
277 TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
278 db.commit();
279 Xapian::Database dbr(get_writable_database_as_database());
280 TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
281 TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "sell");
283 return true;
286 // Test suggestions when there's an exact match.
287 DEFINE_TESTCASE(spell7, spelling) {
288 Xapian::WritableDatabase db = get_writable_database();
290 // Check that the more frequent term is chosen.
291 db.add_spelling("word", 57);
292 db.add_spelling("wrod", 3);
293 db.add_spelling("sword", 56);
294 db.add_spelling("words", 57);
295 db.add_spelling("ward", 58);
296 db.commit();
297 TEST_EQUAL(db.get_spelling_suggestion("ward"), "");
298 TEST_EQUAL(db.get_spelling_suggestion("words"), "word");
299 TEST_EQUAL(db.get_spelling_suggestion("sword"), "word");
300 TEST_EQUAL(db.get_spelling_suggestion("wrod"), "word");
302 return true;
305 /// Regression test - repeated trigrams cancelled in 1.2.5 and earlier.
306 DEFINE_TESTCASE(spell8, spelling) {
307 Xapian::WritableDatabase db = get_writable_database();
309 // kin and kin used to cancel out in "skinking".
310 db.add_spelling("skinking", 2);
311 db.add_spelling("stinking", 1);
312 db.commit();
313 TEST_EQUAL(db.get_spelling_suggestion("scimkin", 3), "skinking");
315 return true;