Drop special handling for Compaq C++
[xapian.git] / xapian-core / bin / xapian-delve.cc
blob77f968de0210008ea1c117c5b11b7b7f3f088518
/* xapian-delve.cc: Allow inspection of the contents of a Xapian database
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002 Ananova Ltd
 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016,2017,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
23 #include <config.h>
25 #include <xapian.h>
27 #include <algorithm>
28 #include <iomanip>
29 #include <iostream>
30 #include <vector>
32 #include "gnu_getopt.h"
34 #include <cerrno>
35 #include <cstring>
36 #include <cstdlib>
37 #include "unicode/description_append.h"
39 #include "unicode/description_append.cc"
41 using namespace Xapian;
42 using namespace std;
// Written in front of every entry when a list is printed.  Starts as a
// space; main() switches it to a newline for the -1 option.
static char separator{' '};

// Verbosity level: main() increments it once for each -v option.
static int verbose{0};

// True when -V was given without a value slot: show every value of each
// document referred to.
static bool showvalues{false};

// True when -d was given: show the document data of each document referred to.
static bool showdocdata{false};

// True when -z was given: show_db_stats() also reports how many documents
// have length zero.
static bool count_zero_length_docs{false};

// How to decode document values.
static enum {
    VALUE_ESCAPE,              // escape via description_append() (the default)
    VALUE_SORTABLE_SERIALISE,  // decode via Xapian::sortable_unserialise()
    VALUE_PACKED_INT,          // decode the bytes as one unsigned integer
    VALUE_RAW                  // output the bytes exactly as stored
} value_decode{VALUE_ESCAPE};

// Program name and one-line description used in the usage/version output.
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"
62 static void show_usage() {
63 cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
64 "Options:\n"
65 " -a show all terms in the database\n"
66 " -A <prefix> show all terms in the database with given prefix\n"
67 " -r <recno> for term list(s)\n"
68 " -t <term> for posting list(s)\n"
69 " -t <term> -r <recno> for position list(s)\n"
70 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
71 " -1 output one list entry per line\n"
72 " -V[<type>]<valueno> output value valueno for each document referred to\n"
73 " (or each document in the database if no -r options).\n"
74 " <type> can be:\n"
75 " E: escape in a C-like way (default)\n"
76 " I: decode as a packed integer\n"
77 " R: show the raw value (which may contain binary data,\n"
78 " newlines, invalid UTF-8, etc)\n"
79 " S: decode using Xapian::sortable_unserialise()\n"
80 " -V[<type>] output all values for each document referred to.\n"
81 " <type> is as above.\n"
82 " -d output document data for each document referred to\n"
83 " -z for db, count documents with length 0\n"
84 " -v extra info (wdf and len for postlist;\n"
85 " wdf and termfreq for termlist; number of terms for db;\n"
86 " termfreq when showing all terms)\n"
87 " -vv even more info (also show collection freq and wdf\n"
88 " upper bound for terms)\n"
89 " --help display this help and exit\n"
90 " --version output version information and exit" << endl;
93 static void
94 show_db_stats(Database &db)
96 // Display a few database stats.
97 cout << "UUID = " << db.get_uuid() << endl;
98 cout << "number of documents = " << db.get_doccount() << endl;
99 cout << "average document length = " << db.get_avlength() << endl;
100 cout << "document length lower bound = " << db.get_doclength_lower_bound()
101 << endl;
102 cout << "document length upper bound = " << db.get_doclength_upper_bound()
103 << endl;
104 cout << "highest document id ever used = " << db.get_lastdocid() << endl;
105 cout << boolalpha;
106 cout << "has positional information = " << db.has_positions() << endl;
107 cout << "revision = ";
108 try {
109 cout << db.get_revision() << endl;
110 } catch (const Xapian::InvalidOperationError& e) {
111 cout << e.get_description() << endl;
112 } catch (const Xapian::UnimplementedError& e) {
113 cout << e.get_description() << endl;
115 cout << "currently open for writing = ";
116 try {
117 cout << db.locked() << endl;
118 } catch (const Xapian::Error& e) {
119 cout << e.get_description() << endl;
122 if (count_zero_length_docs) {
123 Xapian::doccount empty_docs = 0;
124 if (db.get_total_length() == 0) {
125 // All documents are empty.
126 empty_docs = db.get_doccount();
127 } else {
128 Xapian::PostingIterator d = db.postlist_begin(string());
129 while (d != db.postlist_end(string())) {
130 if (d.get_doclength() == 0)
131 ++empty_docs;
132 ++d;
135 cout << "number of zero-length documents = " << empty_docs << endl;
138 if (verbose) {
139 // To find the number of terms, we have to count them!
140 // This will take a few seconds or minutes, so only do it if -v
141 // was specified.
142 termcount terms = 0;
143 TermIterator t = db.allterms_begin();
144 while (t != db.allterms_end()) {
145 ++terms;
146 ++t;
148 cout << "number of distinct terms = " << terms << endl;
152 static void
153 decode_and_show_value(const string& value)
155 switch (value_decode) {
156 case VALUE_ESCAPE: {
157 string esc;
158 description_append(esc, value);
159 cout << esc;
160 break;
162 case VALUE_SORTABLE_SERIALISE:
163 cout << Xapian::sortable_unserialise(value);
164 break;
165 case VALUE_PACKED_INT: {
166 unsigned long long i = 0;
167 for (unsigned char ch : value) {
168 i = (i << 8) | ch;
170 cout << i;
171 break;
173 default: // VALUE_RAW
174 cout << value;
175 break;
179 static void
180 show_values(Database &db, docid docid, char sep)
182 Document doc = db.get_document(docid);
183 ValueIterator v = doc.values_begin();
184 while (v != doc.values_end()) {
185 cout << sep << v.get_valueno() << ':';
186 decode_and_show_value(*v);
187 ++v;
191 static void
192 show_values(Database &db,
193 vector<docid>::const_iterator i,
194 vector<docid>::const_iterator end)
196 while (i != end) {
197 cout << "Values for record #" << *i << ':';
198 show_values(db, *i, separator);
199 cout << endl;
200 ++i;
204 static void
205 show_value(Database &db,
206 vector<docid>::const_iterator i,
207 vector<docid>::const_iterator end,
208 Xapian::valueno slot)
210 while (i != end) {
211 Xapian::docid did = *i;
212 cout << "Value " << slot << " for record #" << did << ": ";
213 decode_and_show_value(db.get_document(did).get_value(slot));
214 cout << endl;
215 ++i;
219 static void
220 show_docdata(Database &db, docid docid, char sep)
222 cout << sep << "[" << db.get_document(docid).get_data() << ']';
225 static void
226 show_docdata(Database &db,
227 vector<docid>::const_iterator i,
228 vector<docid>::const_iterator end)
230 while (i != end) {
231 cout << "Data for record #" << *i << ':' << endl;
232 cout << db.get_document(*i).get_data() << endl;
233 ++i;
237 static void
238 show_termlist(const Database &db, Xapian::docid did,
239 const char * all_pfx = NULL)
241 TermIterator t, tend;
242 if (all_pfx) {
243 t = db.allterms_begin(all_pfx);
244 tend = db.allterms_end(all_pfx);
245 cout << "All terms in database";
246 if (all_pfx[0])
247 cout << " with prefix \"" << all_pfx << "\"";
248 } else {
249 t = db.termlist_begin(did);
250 tend = db.termlist_end(did);
251 cout << "Term List for record #" << did;
253 if (verbose) {
254 cout << " (";
255 if (did != 0)
256 cout << "wdf, ";
257 cout << "termfreq";
258 if (verbose > 1)
259 cout << ", collection freq, wdf upper bound";
260 cout << ')';
262 cout << ':';
264 while (t != tend) {
265 const string & term = *t;
266 cout << separator << term;
267 if (verbose) {
268 if (did != 0)
269 cout << ' ' << t.get_wdf();
270 cout << ' ' << t.get_termfreq();
271 if (verbose > 1) {
272 cout << ' ' << db.get_collection_freq(term)
273 << ' ' << db.get_wdf_upper_bound(term);
276 ++t;
278 cout << endl;
281 static void
282 show_termlists(Database &db,
283 vector<docid>::const_iterator i,
284 vector<docid>::const_iterator end)
286 // Display termlists
287 while (i != end) {
288 show_termlist(db, *i);
289 ++i;
294 main(int argc, char **argv) try {
295 if (argc > 1 && argv[1][0] == '-') {
296 if (strcmp(argv[1], "--help") == 0) {
297 cout << PROG_NAME " - " PROG_DESC "\n\n";
298 show_usage();
299 exit(0);
301 if (strcmp(argv[1], "--version") == 0) {
302 cout << PROG_NAME " - " PACKAGE_STRING << endl;
303 exit(0);
307 const char * all_terms = NULL;
308 vector<docid> recnos;
309 vector<string> terms;
310 vector<string> dbs;
311 Stem stemmer;
313 valueno slot = 0; // Avoid "may be used uninitialised" warnings.
314 bool slot_set = false;
316 int c;
317 while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
318 switch (c) {
319 case 'a':
320 all_terms = "";
321 break;
322 case 'A':
323 all_terms = optarg;
324 break;
325 case 'r': {
326 char * end;
327 errno = 0;
328 unsigned long n = strtoul(optarg, &end, 10);
329 if (optarg == end || *end) {
330 cout << "Non-numeric document id: " << optarg << endl;
331 exit(1);
333 Xapian::docid did(n);
334 if (errno == ERANGE || n == 0 || did != n) {
335 cout << "Document id out of range: " << optarg << endl;
336 exit(1);
338 recnos.push_back(did);
339 break;
341 case 't':
342 terms.push_back(optarg);
343 break;
344 case 's':
345 stemmer = Stem(optarg);
346 break;
347 case '1':
348 separator = '\n';
349 break;
350 case 'V':
351 if (optarg) {
352 switch (*optarg) {
353 case 'R':
354 value_decode = VALUE_RAW;
355 ++optarg;
356 break;
357 case 'I':
358 value_decode = VALUE_PACKED_INT;
359 ++optarg;
360 break;
361 case 'S':
362 value_decode = VALUE_SORTABLE_SERIALISE;
363 ++optarg;
364 break;
365 case 'E':
366 value_decode = VALUE_ESCAPE;
367 ++optarg;
368 break;
370 char * end;
371 errno = 0;
372 unsigned long n = strtoul(optarg, &end, 10);
373 if (optarg == end || *end) {
374 cout << "Non-numeric value slot: " << optarg << endl;
375 exit(1);
377 slot = Xapian::valueno(n);
378 if (errno == ERANGE || slot != n) {
379 cout << "Value slot out of range: " << optarg << endl;
380 exit(1);
382 slot_set = true;
383 } else {
384 showvalues = true;
386 break;
387 case 'd':
388 showdocdata = true;
389 break;
390 case 'v':
391 ++verbose;
392 break;
393 case 'z':
394 count_zero_length_docs = true;
395 break;
396 default:
397 show_usage();
398 exit(1);
402 while (argv[optind]) dbs.push_back(argv[optind++]);
404 if (dbs.empty()) {
405 show_usage();
406 exit(1);
409 std::sort(recnos.begin(), recnos.end());
411 Database db;
413 vector<string>::const_iterator i;
414 for (i = dbs.begin(); i != dbs.end(); ++i) {
415 try {
416 db.add_database(Database(*i));
417 } catch (const Error &e) {
418 cerr << "Error opening database '" << *i << "': ";
419 cerr << e.get_description() << endl;
420 return 1;
425 if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
426 // Show some statistics about the database.
427 show_db_stats(db);
428 return 0;
431 if (all_terms) {
432 show_termlist(db, 0, all_terms);
435 if (!recnos.empty()) {
436 if (showvalues) {
437 show_values(db, recnos.begin(), recnos.end());
438 } else if (slot_set) {
439 show_value(db, recnos.begin(), recnos.end(), slot);
442 if (showdocdata) {
443 show_docdata(db, recnos.begin(), recnos.end());
445 } else {
446 if (slot_set) {
447 cout << "Value " << slot << " for each document:";
448 ValueIterator it = db.valuestream_begin(slot);
449 while (it != db.valuestream_end(slot)) {
450 cout << separator << it.get_docid() << ':';
451 decode_and_show_value(*it);
452 ++it;
454 cout << endl;
458 if (terms.empty()) {
459 show_termlists(db, recnos.begin(), recnos.end());
460 return 0;
463 vector<string>::const_iterator i;
464 for (i = terms.begin(); i != terms.end(); ++i) {
465 string term = stemmer(*i);
466 PostingIterator p = db.postlist_begin(term);
467 PostingIterator pend = db.postlist_end(term);
468 if (p == pend) {
469 cout << "term '" << term << "' not in database\n";
470 continue;
472 if (recnos.empty()) {
473 // Display posting list
474 cout << "Posting List for term '" << term << "' (termfreq "
475 << db.get_termfreq(term) << ", collfreq "
476 << db.get_collection_freq(term) << ", wdf_max "
477 << db.get_wdf_upper_bound(term) << "):";
478 while (p != pend) {
479 cout << separator << *p;
480 if (verbose) {
481 cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
483 if (showvalues) show_values(db, *p, ' ');
484 if (showdocdata) show_docdata(db, *p, ' ');
485 ++p;
487 cout << endl;
488 } else {
489 // Display position lists
490 vector<docid>::const_iterator j;
491 for (j = recnos.begin(); j != recnos.end(); ++j) {
492 p.skip_to(*j);
493 if (p == pend || *p != *j) {
494 cout << "term '" << term <<
495 "' doesn't index document #" << *j << endl;
496 } else {
497 cout << "Position List for term '" << term
498 << "', record #" << *j << ':';
499 try {
500 PositionIterator pos = p.positionlist_begin();
501 while (pos != p.positionlist_end()) {
502 cout << separator << *pos;
503 ++pos;
505 cout << endl;
506 } catch (const Error &e) {
507 cerr << "Error: " << e.get_description() << endl;
513 } catch (const Error &e) {
514 cerr << "\nError: " << e.get_description() << endl;
515 return 1;