1 /* xapian-delve.cc: Allow inspection of the contents of a Xapian database
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016,2017,2018 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
32 #include "gnu_getopt.h"
37 #include "unicode/description_append.h"
39 #include "unicode/description_append.cc"
41 using namespace Xapian
;
// File-scope option state shared by the display helpers below.
//
// Character written in front of each list entry (terms, postings, positions,
// values).  Defaults to a space.
// NOTE(review): the usage text describes -1 as "one list entry per line",
// which presumably changes this value; the assignment is not visible here.
44 static char separator
= ' ';
// Verbosity level; the usage text documents -v and -vv.
// NOTE(review): the places that read this are not visible in this view.
46 static int verbose
= 0;
// When true, the posting-list display calls show_values() for every posting.
47 static bool showvalues
= false;
// When true, the posting-list display calls show_docdata() for every posting.
48 static bool showdocdata
= false;
// When true, show_db_stats() also reports the number of zero-length
// documents.  main() sets this to true during option parsing.
49 static bool count_zero_length_docs
= false;
51 // How to decode document values.
// value_decode is read by decode_and_show_value() and assigned by main()
// (VALUE_RAW, VALUE_PACKED_INT, VALUE_SORTABLE_SERIALISE or VALUE_ESCAPE).
// It starts out as VALUE_ESCAPE.
54 VALUE_SORTABLE_SERIALISE
,
57 } value_decode
= VALUE_ESCAPE
;
// Program name and one-line description, spliced into the usage, --help and
// --version output by string-literal concatenation.
59 #define PROG_NAME "delve"
60 #define PROG_DESC "Inspect the contents of a Xapian database"
// Write the command-line synopsis and the per-option help text to standard
// output.
//
// The whole message is a single chain of adjacent string literals (with
// PROG_NAME spliced in by literal concatenation) streamed to cout in one
// expression, followed by endl.  Takes no arguments and returns nothing.
62 static void show_usage() {
63 cout
<< "Usage: " PROG_NAME
" [OPTIONS] DATABASE...\n\n"
65 " -a show all terms in the database\n"
66 " -A <prefix> show all terms in the database with given prefix\n"
67 " -r <recno> for term list(s)\n"
68 " -t <term> for posting list(s)\n"
69 " -t <term> -r <recno> for position list(s)\n"
70 " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
71 " -1 output one list entry per line\n"
72 " -V[<type>]<valueno> output value valueno for each document referred to\n"
73 " (or each document in the database if no -r options).\n"
75 " E: escape in a C-like way (default)\n"
76 " I: decode as a packed integer\n"
77 " R: show the raw value (which may contain binary data,\n"
78 " newlines, invalid UTF-8, etc)\n"
79 " S: decode using Xapian::sortable_unserialise()\n"
80 " -V[<type>] output all values for each document referred to.\n"
81 " <type> is as above.\n"
82 " -d output document data for each document referred to\n"
83 " -z for db, count documents with length 0\n"
84 " -v extra info (wdf and len for postlist;\n"
85 " wdf and termfreq for termlist; number of terms for db;\n"
86 " termfreq when showing all terms)\n"
87 " -vv even more info (also show collection freq and wdf\n"
88 " upper bound for terms)\n"
89 " --help display this help and exit\n"
90 " --version output version information and exit" << endl
;
// Print a summary of database-wide statistics for `db` to standard output,
// one "label = value" line per statistic.
//
// Always reported: UUID, document count, average document length, document
// length lower/upper bounds, highest docid ever used, whether positional
// information is present, the revision, and whether the database is
// currently open for writing.
//
// Optionally reported: the number of zero-length documents (only when the
// global count_zero_length_docs is set) and the number of distinct terms.
94 show_db_stats(Database
&db
)
96 // Display a few database stats.
97 cout
<< "UUID = " << db
.get_uuid() << endl
;
98 cout
<< "number of documents = " << db
.get_doccount() << endl
;
99 cout
<< "average document length = " << db
.get_avlength() << endl
;
100 cout
<< "document length lower bound = " << db
.get_doclength_lower_bound()
102 cout
<< "document length upper bound = " << db
.get_doclength_upper_bound()
104 cout
<< "highest document id ever used = " << db
.get_lastdocid() << endl
;
106 cout
<< "has positional information = " << db
.has_positions() << endl
;
// The label is written before the value is fetched, so that if
// get_revision() throws, the handlers below print the error description in
// the place where the value would have gone.
107 cout
<< "revision = ";
109 cout
<< db
.get_revision() << endl
;
110 } catch (const Xapian::InvalidOperationError
& e
) {
111 cout
<< e
.get_description() << endl
;
112 } catch (const Xapian::UnimplementedError
& e
) {
113 cout
<< e
.get_description() << endl
;
// Same label-first pattern for the lock status; here any Xapian::Error is
// reported inline instead of propagating.
115 cout
<< "currently open for writing = ";
117 cout
<< db
.locked() << endl
;
118 } catch (const Xapian::Error
& e
) {
119 cout
<< e
.get_description() << endl
;
// Optional: count the documents whose length is zero.
122 if (count_zero_length_docs
) {
123 Xapian::doccount empty_docs
= 0;
// Shortcut: a total length of zero means every document is empty, so the
// answer is simply the document count and no scan is needed.
124 if (db
.get_total_length() == 0) {
125 // All documents are empty.
126 empty_docs
= db
.get_doccount();
// Otherwise walk the posting list of the empty term and test each entry's
// document length.
// NOTE(review): the statements that bump empty_docs and advance `d` are not
// visible in this view.
128 Xapian::PostingIterator d
= db
.postlist_begin(string());
129 while (d
!= db
.postlist_end(string())) {
130 if (d
.get_doclength() == 0)
135 cout
<< "number of zero-length documents = " << empty_docs
<< endl
;
139 // To find the number of terms, we have to count them!
140 // This will take a few seconds or minutes, so only do it if -v
// Iterates over every term in the database via allterms_begin() /
// allterms_end().
// NOTE(review): the declaration of `terms` and the loop body that counts are
// not visible in this view.
143 TermIterator t
= db
.allterms_begin();
144 while (t
!= db
.allterms_end()) {
148 cout
<< "number of distinct terms = " << terms
<< endl
;
// Write the document value `value` to standard output, rendered according to
// the global value_decode setting.
//
// `value` is the stored value as a string.  Nothing is returned; the only
// visible effect is output on cout.
153 decode_and_show_value(const string
& value
)
155 switch (value_decode
) {
// Escaped form: description_append() appends a rendering of `value` to
// `esc`.
// NOTE(review): the case label, the declaration of `esc` and the statement
// that prints it are not visible in this view.
158 description_append(esc
, value
);
// Sortable-serialised form: print the result of
// Xapian::sortable_unserialise() applied to the value.
162 case VALUE_SORTABLE_SERIALISE
:
163 cout
<< Xapian::sortable_unserialise(value
);
// Packed integer: visit the bytes of the value as unsigned chars, starting
// from an accumulator of 0.
// NOTE(review): how each byte is folded into `i`, and where `i` is printed,
// is not visible in this view.
165 case VALUE_PACKED_INT
: {
166 unsigned long long i
= 0;
167 for (unsigned char ch
: value
) {
173 default: // VALUE_RAW
// Print every value stored in one document.
//
// db    - database to read the document from.
// docid - id of the document whose values are shown.
// sep   - character written before each value entry.
//
// Each entry is written to cout as <sep><slot number>:<decoded value>, where
// the decoding is done by decode_and_show_value().
180 show_values(Database
&db
, docid docid
, char sep
)
182 Document doc
= db
.get_document(docid
);
183 ValueIterator v
= doc
.values_begin();
// NOTE(review): the statement that advances `v` is not visible in this view.
184 while (v
!= doc
.values_end()) {
185 cout
<< sep
<< v
.get_valueno() << ':';
186 decode_and_show_value(*v
);
// Print all values for documents named by a range of document ids.
//
// db     - database to read the documents from.
// i, end - iterators into a vector<docid>.
//
// For a document id *i this writes the heading "Values for record #<id>:" to
// cout and then delegates to the single-document show_values() overload,
// using the global `separator` between entries.
// NOTE(review): the loop that steps `i` towards `end` is not visible in this
// view; presumably every id in [i, end) is handled this way.
192 show_values(Database
&db
,
193 vector
<docid
>::const_iterator i
,
194 vector
<docid
>::const_iterator end
)
197 cout
<< "Values for record #" << *i
<< ':';
198 show_values(db
, *i
, separator
);
// Print the value held in one slot for documents named by a range of
// document ids.
//
// db     - database to read the documents from.
// i, end - iterators into a vector<docid>.
// slot   - value slot number to show.
//
// For a document id *i this writes "Value <slot> for record #<id>: " to cout
// followed by the slot's value, decoded by decode_and_show_value().
// NOTE(review): the loop that steps `i` towards `end` is not visible in this
// view; presumably every id in [i, end) is handled this way.
205 show_value(Database
&db
,
206 vector
<docid
>::const_iterator i
,
207 vector
<docid
>::const_iterator end
,
208 Xapian::valueno slot
)
211 Xapian::docid did
= *i
;
212 cout
<< "Value " << slot
<< " for record #" << did
<< ": ";
213 decode_and_show_value(db
.get_document(did
).get_value(slot
));
// Print the document data of one document, wrapped in square brackets.
//
// db    - database to read the document from.
// docid - id of the document whose data is shown.
// sep   - character written before the opening bracket.
//
// Output on cout has the form <sep>[<data>]; no trailing newline is written
// by the visible statement.
220 show_docdata(Database
&db
, docid docid
, char sep
)
222 cout
<< sep
<< "[" << db
.get_document(docid
).get_data() << ']';
// Print the document data for documents named by a range of document ids.
//
// db     - database to read the documents from.
// i, end - iterators into a vector<docid>.
//
// For a document id *i this writes the heading line "Data for record #<id>:"
// to cout and then the document's data on a line of its own.  Unlike the
// single-document overload, the data is not bracketed.
// NOTE(review): the loop that steps `i` towards `end` is not visible in this
// view; presumably every id in [i, end) is handled this way.
226 show_docdata(Database
&db
,
227 vector
<docid
>::const_iterator i
,
228 vector
<docid
>::const_iterator end
)
231 cout
<< "Data for record #" << *i
<< ':' << endl
;
232 cout
<< db
.get_document(*i
).get_data() << endl
;
// Print a list of terms: either the terms of the whole database, or the
// terms indexing a single document.
//
// db      - database to read from.
// did     - document id used for the per-document term list.
// all_pfx - prefix string passed to allterms_begin()/allterms_end() for the
//           whole-database listing; defaults to NULL.
//
// main() calls this as show_termlist(db, 0, all_terms) for the
// whole-database listing, and show_termlists() calls it with just a docid.
// NOTE(review): the conditions selecting between the two listings, and the
// verbosity checks guarding the extra columns, are not visible in this view.
238 show_termlist(const Database
&db
, Xapian::docid did
,
239 const char * all_pfx
= NULL
)
241 TermIterator t
, tend
;
// Whole-database listing: iterate over all terms matching all_pfx.
243 t
= db
.allterms_begin(all_pfx
);
244 tend
= db
.allterms_end(all_pfx
);
245 cout
<< "All terms in database";
247 cout
<< " with prefix \"" << all_pfx
<< "\"";
// Per-document listing: iterate over the terms of document `did`.
249 t
= db
.termlist_begin(did
);
250 tend
= db
.termlist_end(did
);
251 cout
<< "Term List for record #" << did
;
// Extra heading text announcing the collection freq / wdf upper bound
// columns printed per term below.
259 cout
<< ", collection freq, wdf upper bound";
// Per-term output: separator and term, then optional statistics.
265 const string
& term
= *t
;
266 cout
<< separator
<< term
;
269 cout
<< ' ' << t
.get_wdf();
270 cout
<< ' ' << t
.get_termfreq();
// These two figures are looked up in the database by term rather than read
// from the iterator.
272 cout
<< ' ' << db
.get_collection_freq(term
)
273 << ' ' << db
.get_wdf_upper_bound(term
);
// Print the term list of documents named by a range of document ids.
//
// db     - database to read from.
// i, end - iterators into a vector<docid>.
//
// Delegates to show_termlist(db, *i), leaving its prefix argument at the
// default.
// NOTE(review): the loop that steps `i` towards `end` is not visible in this
// view; presumably every id in [i, end) is handled this way.
282 show_termlists(Database
&db
,
283 vector
<docid
>::const_iterator i
,
284 vector
<docid
>::const_iterator end
)
288 show_termlist(db
, *i
);
294 main(int argc
, char **argv
) try {
295 if (argc
> 1 && argv
[1][0] == '-') {
296 if (strcmp(argv
[1], "--help") == 0) {
297 cout
<< PROG_NAME
" - " PROG_DESC
"\n\n";
301 if (strcmp(argv
[1], "--version") == 0) {
302 cout
<< PROG_NAME
" - " PACKAGE_STRING
<< endl
;
307 const char * all_terms
= NULL
;
308 vector
<docid
> recnos
;
309 vector
<string
> terms
;
313 valueno slot
= 0; // Avoid "may be used uninitialised" warnings.
314 bool slot_set
= false;
317 while ((c
= gnu_getopt(argc
, argv
, "aA:r:t:s:1vV::dz")) != -1) {
328 unsigned long n
= strtoul(optarg
, &end
, 10);
329 if (optarg
== end
|| *end
) {
330 cout
<< "Non-numeric document id: " << optarg
<< endl
;
333 Xapian::docid
did(n
);
334 if (errno
== ERANGE
|| n
== 0 || did
!= n
) {
335 cout
<< "Document id out of range: " << optarg
<< endl
;
338 recnos
.push_back(did
);
342 terms
.push_back(optarg
);
345 stemmer
= Stem(optarg
);
354 value_decode
= VALUE_RAW
;
358 value_decode
= VALUE_PACKED_INT
;
362 value_decode
= VALUE_SORTABLE_SERIALISE
;
366 value_decode
= VALUE_ESCAPE
;
372 unsigned long n
= strtoul(optarg
, &end
, 10);
373 if (optarg
== end
|| *end
) {
374 cout
<< "Non-numeric value slot: " << optarg
<< endl
;
377 slot
= Xapian::valueno(n
);
378 if (errno
== ERANGE
|| slot
!= n
) {
379 cout
<< "Value slot out of range: " << optarg
<< endl
;
394 count_zero_length_docs
= true;
402 while (argv
[optind
]) dbs
.push_back(argv
[optind
++]);
409 std::sort(recnos
.begin(), recnos
.end());
413 vector
<string
>::const_iterator i
;
414 for (i
= dbs
.begin(); i
!= dbs
.end(); ++i
) {
416 db
.add_database(Database(*i
));
417 } catch (const Error
&e
) {
418 cerr
<< "Error opening database '" << *i
<< "': ";
419 cerr
<< e
.get_description() << endl
;
425 if (!all_terms
&& terms
.empty() && recnos
.empty() && !slot_set
) {
426 // Show some statistics about the database.
432 show_termlist(db
, 0, all_terms
);
435 if (!recnos
.empty()) {
437 show_values(db
, recnos
.begin(), recnos
.end());
438 } else if (slot_set
) {
439 show_value(db
, recnos
.begin(), recnos
.end(), slot
);
443 show_docdata(db
, recnos
.begin(), recnos
.end());
447 cout
<< "Value " << slot
<< " for each document:";
448 ValueIterator it
= db
.valuestream_begin(slot
);
449 while (it
!= db
.valuestream_end(slot
)) {
450 cout
<< separator
<< it
.get_docid() << ':';
451 decode_and_show_value(*it
);
459 show_termlists(db
, recnos
.begin(), recnos
.end());
463 vector
<string
>::const_iterator i
;
464 for (i
= terms
.begin(); i
!= terms
.end(); ++i
) {
465 string term
= stemmer(*i
);
466 PostingIterator p
= db
.postlist_begin(term
);
467 PostingIterator pend
= db
.postlist_end(term
);
469 cout
<< "term '" << term
<< "' not in database\n";
472 if (recnos
.empty()) {
473 // Display posting list
474 cout
<< "Posting List for term '" << term
<< "' (termfreq "
475 << db
.get_termfreq(term
) << ", collfreq "
476 << db
.get_collection_freq(term
) << ", wdf_max "
477 << db
.get_wdf_upper_bound(term
) << "):";
479 cout
<< separator
<< *p
;
481 cout
<< ' ' << p
.get_wdf() << ' ' << p
.get_doclength();
483 if (showvalues
) show_values(db
, *p
, ' ');
484 if (showdocdata
) show_docdata(db
, *p
, ' ');
489 // Display position lists
490 vector
<docid
>::const_iterator j
;
491 for (j
= recnos
.begin(); j
!= recnos
.end(); ++j
) {
493 if (p
== pend
|| *p
!= *j
) {
494 cout
<< "term '" << term
<<
495 "' doesn't index document #" << *j
<< endl
;
497 cout
<< "Position List for term '" << term
498 << "', record #" << *j
<< ':';
500 PositionIterator pos
= p
.positionlist_begin();
501 while (pos
!= p
.positionlist_end()) {
502 cout
<< separator
<< *pos
;
506 } catch (const Error
&e
) {
507 cerr
<< "Error: " << e
.get_description() << endl
;
513 } catch (const Error
&e
) {
514 cerr
<< "\nError: " << e
.get_description() << endl
;