treewide: reduce $PATH checks for `git' executable
[public-inbox.git] / lib / PublicInbox / Search.pm
blobe5c5d6abbf722cfbba54d5c2ae31dc2dc36de3cb
1 # Copyright (C) all contributors <meta@public-inbox.org>
2 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
3 # based on notmuch, but with no concept of folders, files or flags
5 # Read-only search interface for use by the web and NNTP interfaces
6 package PublicInbox::Search;
7 use strict;
8 use v5.10.1;
9 use parent qw(Exporter);
10 our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms);
11 use List::Util qw(max);
12 use POSIX qw(strftime);
13 use Carp ();
14 our $XHC = 0; # defined but false
# Xapian value slots used for range searches and sorting.  The slot
# numbers are stored in existing indices, so changing any numeric
# value breaks compatibility with old indices (don't change them).
use constant {
	TS => 0, # Received: in Unix time (IMAP INTERNALDATE, JMAP receivedAt)
	YYYYMMDD => 1, # redundant with DT below
	DT => 2, # Date: YYYYMMDDHHMMSS (IMAP SENT*, JMAP sentAt)

	# added for public-inbox 1.6.0+
	BYTES => 3, # IMAP RFC822.SIZE
	UID => 4, # IMAP UID == NNTP article number == Xapian docid
	THREADID => 5, # RFC 8474, RFC 8621

	# TODO
	# REPLYCNT => ?, # IMAP ANSWERED
};

# SCHEMA_VERSION history
# 0 - initial
# 1 - subject_path is lower-cased
# 2 - subject_path is id_compress in the index, only
# 3 - message-ID is compressed if it includes '%' (hack!)
# 4 - change "Re: " normalization, avoid circular Reference ghosts
# 5 - subject_path drops trailing '.'
# 6 - preserve References: order in document data
# 7 - remove references and inreplyto terms
# 8 - remove redundant/unneeded document data
# 9 - disable Message-ID compression (SHA-1)
# 10 - optimize doc for NNTP overviews
# 11 - merge threads when vivifying ghosts
# 12 - change YYYYMMDD value column to numeric
# 13 - fix threading for empty References/In-Reply-To
#      (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
# 14 - fix ghost root vivification
# 15 - see public-inbox-v2-format(5)
#      further bumps likely unnecessary, we'll suggest in-place
#      "--reindex" use for further fixes and tweaks:
#
#      public-inbox v1.5.0 adds (still SCHEMA_VERSION=15):
#      * "lid:" and "l:" for List-Id searches
#
#      v1.6.0 adds BYTES, UID and THREADID values
use constant SCHEMA_VERSION => 15;
59 use PublicInbox::Smsg;
60 eval { require PublicInbox::Over };
our $QP_FLAGS; # QueryParser flags, assigned by load_xapian()

# Short class name => fully-qualified Xapian class name.  The values are
# placeholders (0) until load_xapian() fills them in.
our %X = (
	BoolWeight => 0,
	Database => 0,
	Enquire => 0,
	QueryParser => 0,
	Stem => 0,
	Query => 0,
);
our $Xap; # 'Xapian' or 'Search::Xapian'
our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor')

# ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16,
# let's hope the ABI is stable
our $ENQ_DESCENDING = 0;
our $ENQ_ASCENDING = 1;
# [ value slot, query prefix ] pairs; load_xapian() builds one range
# processor per entry (in this order) and stores them in @MAIL_NRP
our @MAIL_VMAP = (
	[ YYYYMMDD, 'd:' ],
	[ TS, 'rt:' ],

	# these are undocumented for WWW, but lei and IMAP use them
	[ DT, 'dt:' ],
	[ BYTES, 'z:' ],
	[ UID, 'uid:' ],
);
our @MAIL_NRP;
# Getopt::Long spec, only short options for portability in C++ implementation
our @XH_SPEC = (
	'a',    # ascending sort
	'c',    # code search
	'd=s@', # shard dirs
	'g=s',  # git dir (with -c)
	'k=i',  # sort column (like sort(1))
	'm=i',  # maximum number of results
	'o=i',  # offset
	'r',    # 1=relevance then column
	't',    # collapse threads
	'A=s@', # prefixes
	'K=i',  # timeout kill after i seconds
	'O=s',  # eidx_key
	'T=i',  # threadid
	'Q=s@', # query prefixes "$user_prefix[:=]$XPREFIX"
);
# Loads the first usable Xapian binding and initializes the package
# globals which depend on it: $Xap, $NVRP, %X, $QP_FLAGS, @MAIL_NRP and
# the sortable_serialise/sortable_unserialise aliases.
# Returns 1 once a binding is loaded (or was already loaded), undef if
# no candidate module could be required.
sub load_xapian () {
	return 1 if defined $Xap;
	# n.b. PI_XAPIAN is intended for development use only
	# NOTE(review): its value is interpolated into a string eval below
	my @bindings = (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian');
	for my $mod (@bindings) {
		eval "require $mod";
		next if $@;

		$mod->import(qw(:standard));
		$Xap = $mod;

		# `version_string' was added in Xapian 1.1
		my $xver = eval('v'.eval($mod.'::version_string()')) //
			eval('v'.eval($mod.'::xapian_version_string()'));

		# NumberRangeProcessor was added in Xapian 1.3.6,
		# NumberValueRangeProcessor was removed for 1.5.0+,
		# continue with the older /Value/ variant for now...
		my $new_nrp = $mod eq 'Xapian' && $xver ge v1.5;
		$NVRP = $mod.'::'.($new_nrp ? 'NumberRangeProcessor' :
					'NumberValueRangeProcessor');
		for my $short (keys %X) {
			$X{$short} = $Xap.'::'.$short;
		}

		*sortable_serialise = $mod.'::sortable_serialise';
		*sortable_unserialise = $mod.'::sortable_unserialise';
		# n.b. FLAG_PURE_NOT is expensive not suitable for a public
		# website as it could become a denial-of-service vector
		# FLAG_PHRASE also seems to cause performance problems chert
		# (and probably earlier Xapian DBs).  glass seems fine...
		# TODO: make this an option, maybe?
		# or make indexlevel=medium as default
		$QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() |
				FLAG_WILDCARD();
		@MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP;
		return 1;
	}
	undef;
}
# This is English-only, everything else is non-standard and may be confused as
# a prefix common in patch emails
our $LANG = 'english';

# boolean (exact-match) prefixes for patch metadata; a value may list
# several space-separated Xapian term prefixes
our %PATCH_BOOL_COMMON = (
	dfpre => 'XDFPRE',
	dfpost => 'XDFPOST',
	dfblob => 'XDFPRE XDFPOST',
	patchid => 'XDFID',
);

# note: the non-X term prefix allocations are shared with
# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
	mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
	lid => 'G', # newsGroup (or similar entity), just inside <>
	%PATCH_BOOL_COMMON
);

# for mairix compatibility
our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';

# probabilistic (free-text) prefixes for subject, sender, body and diffs
our %PATCH_PROB_COMMON = (
	s => 'S',
	f => 'A',
	b => "$NON_QUOTED_BODY XQUOT",
	bs => "$NON_QUOTED_BODY XQUOT S",
	n => 'XFN',

	q => 'XQUOT',
	nq => $NON_QUOTED_BODY,
	dfn => 'XDFN',
	dfa => 'XDFA',
	dfb => 'XDFB',
	dfhh => 'XDFHH',
	dfctx => 'XDFCTX',
);

my %prob_prefix = (
	m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
	l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
	t => 'XTO',
	tc => 'XTO XCC',
	c => 'XCC',
	tcf => 'XTO XCC A',
	a => 'XTO XCC A',
	%PATCH_PROB_COMMON,
	# default:
	'' => "XM S A XQUOT XFN $NON_QUOTED_BODY",
);
# Flat list of (prefix => description) pairs describing the search
# prefixes; multi-line descriptions have no trailing newline.
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
# not documenting lid: for now, either, it is probably redundant with l:,
# especially since we don't offer boolean searches for To/Cc/From
# headers, either
our @HELP = (
	's:' => 'match within Subject e.g. s:"a quick brown fox"',
	'd:' => join("\n",
		q{match date-time range, git "approxidate" formats supported},
		q{Open-ended ranges such as `d:last.week..' and},
		q{`d:..2.days.ago' are supported}),
	'b:' => 'match within message body, including text attachments',
	'nq:' => 'match non-quoted text within message body',
	'q:' => 'match quoted text within message body',
	'n:' => 'match filename of attachment(s)',
	't:' => 'match within the To header',
	'c:' => 'match within the Cc header',
	'f:' => 'match within the From header',
	'a:' => 'match within the To, Cc, and From headers',
	'tc:' => 'match within the To and Cc headers',
	'l:' => 'match contents of the List-Id header',
	'bs:' => 'match within the Subject and body',
	'dfn:' => 'match filename from diff',
	'dfa:' => 'match diff removed (-) lines',
	'dfb:' => 'match diff added (+) lines',
	'dfhh:' => 'match diff hunk header context (usually a function name)',
	'dfctx:' => 'match diff context lines',
	'dfpre:' => 'match pre-image git blob ID',
	'dfpost:' => 'match post-image git blob ID',
	'dfblob:' => 'match either pre or post-image git blob ID',
	'patchid:' => "match `git patch-id --stable' output",
	'rt:' => q{match received time, like `d:' if sender's clock was correct},
);
# Returns the Xapian directory for $self: the {xpfx} prefix itself when
# $rdonly is true or no {shard} is set, otherwise "{xpfx}/{shard}".
sub xdir ($;$) {
	my ($self, $rdonly) = @_;
	my $shard = $self->{shard};
	return $self->{xpfx} if $rdonly || !defined($shard);
	"$self->{xpfx}/$shard"; # v2, extindex, cindex only
}
# Returns shard directories as a list of strings without verifying that
# each one exists.  A v1 {xpfx} (ending in /xapianNN) is its own single
# shard; otherwise every "$xpfx/N" from 0 up to the highest numeric
# entry found in {xpfx} is returned.  Returns an empty list when {xpfx}
# cannot be opened or holds no numeric entries.
sub shard_dirs ($) {
	my ($self) = @_;
	my $xpfx = $self->{xpfx};
	return ($xpfx) if $xpfx =~ m!/xapian[0-9]+\z!; # v1 inbox

	# v2 inbox, eidx, cidx:
	opendir(my $dh, $xpfx) or return (); # not initialized yet
	# We need numeric sorting so shard[0] is first for reading
	# Xapian metadata, if needed
	my @nums = grep(/\A[0-9]+\z/, readdir($dh));
	my $last = max(@nums) // return ();
	map { "$xpfx/$_" } (0..$last);
}
# Returns all shards as separate Xapian::Database objects w/o combining.
# Also initializes {qp_flags} from $QP_FLAGS if unset, and ORs in
# FLAG_PHRASE unless some shard directory contains an "iamchert" file.
# NOTE(review): $QP_FLAGS as assigned in load_xapian already appears to
# include FLAG_PHRASE, so the final |= may be a no-op -- confirm
sub xdb_shards_flat ($) {
	my ($self) = @_;
	load_xapian();
	$self->{qp_flags} //= $QP_FLAGS;
	my ($slow_phrase, @xdb);
	for my $dir (shard_dirs($self)) {
		$slow_phrase ||= -f "$dir/iamchert";
		push @xdb, $X{Database}->new($dir); # raises if missing
	}
	$self->{qp_flags} |= FLAG_PHRASE() unless $slow_phrase;
	@xdb;
}
# Maps the docid of an mset item from a combined (multi-shard) database
# back to the per-shard docid.
# v2 Xapian docids don't conflict, so they're identical to
# NNTP article numbers and IMAP UIDs.
# https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
sub mdocid {
	my ($nshard, $mitem) = @_;
	int(($mitem->get_docid - 1) / $nshard) + 1;
}
# Converts combined-database docids (all args after $self) into article
# numbers using the same arithmetic as mdocid; reads {nshard}.
sub docids_to_artnums {
	my $self = shift;
	my $nshard = $self->{nshard};
	# XXX does array vs arrayref make a difference in modern Perls?
	map { int(($_ - 1) / $nshard) + 1 } @_;
}
# Returns an arrayref of article numbers, one per item of $mset, in
# mset order.
sub mset_to_artnums {
	my ($self, $mset) = @_;
	my $nshard = $self->{nshard};
	my @artnums;
	push @artnums, mdocid($nshard, $_) for $mset->items;
	\@artnums;
}
# Returns the cached combined database handle in {xdb}, creating it on
# first use by adding every other shard to the first one.  Sets {nshard}
# to the number of shards.  Returns nothing if there are no shards.
sub xdb ($) {
	my ($self) = @_;
	return $self->{xdb} if defined $self->{xdb};
	my ($combined, @rest) = $self->xdb_shards_flat or return;
	$self->{nshard} = 1 + scalar(@rest);
	$combined->add_database($_) for @rest;
	$self->{xdb} = $combined;
}
# Constructor: $ibx must be a reference (an inbox object).  {xpfx} is
# derived from its {inboxdir}, its version and SCHEMA_VERSION; {altid}
# is copied over when the inbox defines it.
sub new {
	my ($class, $ibx) = @_;
	die "BUG: expected PublicInbox::Inbox object: $ibx" unless ref $ibx;
	my $subdir = ($ibx->version > 1) ? 'xap' : 'public-inbox/xapian';
	my %self = (xpfx => "$ibx->{inboxdir}/$subdir".SCHEMA_VERSION);
	$self{altid} = $ibx->{altid} if defined($ibx->{altid});
	bless \%self, $class;
}
# Calls ->reopen on the cached {xdb} handle, if there is one.
# Returns $self to make chaining easier.
sub reopen {
	my ($self) = @_;
	my $xdb = $self->{xdb};
	$xdb->reopen if $xdb;
	$self;
}
# Convert git "approxidate" ranges to something usable with our
# Xapian indices.  At the moment, Xapian only offers a C++-only API
# and neither the SWIG nor XS bindings allow us to use custom code
# to parse dates (and libgit2 doesn't expose git__date_parse, either,
# so we're running git-rev-parse(1)).
# This replaces things we need to send to $git->date_parse with
# "\0".$strftime_format.['+'|$idx]."\0" placeholders
#
# Args: $to_parse - arrayref, strings for $git->date_parse are appended
#       $pfx      - matched prefix without the colon ('d', 'dt' or 'rt')
#       $range    - text following "$pfx:"
# Returns the rewritten "$pfx:$start..$end" term; anything which is not
# already a literal timestamp becomes a placeholder for
# date_parse_finalize.  Trailing ")" and whitespace are preserved.
sub date_parse_prepare {
	my ($to_parse, $pfx, $range) = @_;
	# are we inside a parenthesized statement?
	my $end = $range =~ s/([\)\s]*)\z// ? $1 : '';
	my @r = split(/\.\./, $range, 2);

	# expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like
	# n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD
	# We upgrade "d:" to "dt:" unconditionally
	if ($pfx eq 'd') {
		$pfx = 'dt';
		# upgrade YYYYMMDD to YYYYMMDDHHMMSS
		$_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]]
					[0-9]{2}[^[:alnum:]]
					[0-9]{2}\z!x, @r));
		$_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r));
	}
	if ($pfx eq 'dt') {
		if (!defined($r[1])) { # git needs gaps and not /\d{14}/
			if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2})
					([0-9]{2})([0-9]{2})([0-9]{2})\z/x) {
				push @$to_parse, "$1-$2-$3 $4:$5:$6";
			} else {
				push @$to_parse, $r[0];
			}
			$r[0] = "\0%Y%m%d%H%M%S$#$to_parse\0";
			$r[1] = "\0%Y%m%d%H%M%S+\0";
		} else {
			for my $x (@r) {
				next if $x eq '' || $x =~ /\A[0-9]{14}\z/;
				push @$to_parse, $x;
				$x = "\0%Y%m%d%H%M%S$#$to_parse\0";
			}
		}
	} else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P
		for my $x (@r) {
			next if $x eq '' || $x =~ /\A[0-9]{5,}\z/;
			push @$to_parse, $x;
			$x = "\0%s$#$to_parse\0";
		}
		if (!defined($r[1])) { # add 1 day
			# A literal epoch is never pushed to @$to_parse, so
			# a "+" placeholder would be resolved against some
			# unrelated (or missing) date_parse result; do the
			# arithmetic here instead.
			$r[1] = ($r[0] // '') =~ /\A[0-9]{5,}\z/ ?
				$r[0] + 86400 : "\0%s+\0";
		}
	}
	"$pfx:".join('..', @r).$end;
}
# Resolves the "\0$fmt$ref\0" placeholders left in the query string
# ($_[2], modified in-place) by date_parse_prepare.  $ref is either an
# index into the results of $git->date_parse(@$to_parse), or "+" which
# means one day after the most recently referenced result.  "%s" emits
# the epoch as-is, any other format goes through strftime in UTC.
sub date_parse_finalize {
	my ($git, $to_parse) = @_;
	# git-rev-parse can handle any number of args up to system
	# limits (around (4096*32) bytes on Linux).
	my @epoch = $git->date_parse(@$to_parse);
	# n.b. git respects TZ, times stored in SQLite/Xapian are always UTC,
	# and gmtime doesn't seem to do the right thing when TZ!=UTC
	my $idx; # index of the most recently referenced result
	my $expand = sub {
		my ($fmt, $ref) = @_;
		my $t;
		if ($ref eq '+') {
			$t = $epoch[$idx] + 86400;
		} else {
			$idx = $ref + 0;
			$t = $epoch[$idx];
		}
		$fmt eq '%s' ? $t : strftime($fmt, gmtime($t));
	};
	$_[2] =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/$expand->($1, $2)/sge;
}
# Joins the words of @$argv into one query string.  A word ending in a
# d:/rt:/dt: term has that term rewritten via date_parse_prepare;
# otherwise a word containing whitespace is double-quoted (keeping any
# leading "prefix:" outside the quotes).  Elements of @$argv may be
# modified in-place by the substitutions.
# n.b. argv never has NUL, though we'll need to filter it out
# if this $argv isn't from a command execution
sub query_argv_to_string {
	my (undef, $git, $argv) = @_;
	my ($to_parse, @words);
	for (@$argv) { # $_ aliases the element of @$argv
		if (s!\b(d|rt|dt):(\S+)\z!date_parse_prepare(
						$to_parse //= [], $1, $2)!sge) {
			push @words, $_;
		} elsif (/\s/) {
			push @words,
				s/(.*?)\b(\w+:)// ? qq{$1$2"$_"} : qq{"$_"};
		} else {
			push @words, $_;
		}
	}
	my $tmp = join(' ', @words);
	date_parse_finalize($git, $to_parse, $tmp) if $to_parse;
	$tmp
}
# this is for the WWW "q=" query parameter and "lei q --stdin"
# it can't do d:"5 days ago", but it will do d:5.days.ago
#
# Rewrites d:/rt:/dt: terms of the query string ($_[2], modified
# in-place) outside of quoted phrases, then resolves the resulting
# placeholders via date_parse_finalize.
sub query_approxidate {
	my (undef, $git) = @_; # $_[2] = $query_string (modified in-place)
	my $DQ = qq<"\x{201c}\x{201d}>; # Xapian can use curly quotes
	$_[2] =~ tr/\x00/ /; # Xapian doesn't do NUL, we use it as a placeholder
	my $to_parse;
	# $terms is unquoted text, $phrase the (optional) quoted phrase
	# following it; only $terms is rewritten
	my $rewrite = sub {
		my ($terms, $phrase) = @_;
		$terms =~ s!\b(d|rt|dt):(\S+)!
			date_parse_prepare($to_parse //= [], $1, $2)!sge;
		$terms.($phrase // '');
	};
	$_[2] =~ s{([^$DQ]*)([$DQ][^$DQ]*[$DQ])?}{$rewrite->($1, $2)}sge;
	date_parse_finalize($git, $to_parse, $_[2]) if $to_parse;
}
# read-only, for mail only (codesearch has different rules)
# Parses $qry_str with the cached query parser, narrows the query by
# the optional {eidx_key}, {uid_range} ([min, max]) and {threadid}
# entries of $opt (in that order), and runs it via do_enquire using TS
# as the sort column.
sub mset {
	my ($self, $qry_str, $opt) = @_;
	my $qp = $self->{qp} //= $self->qparse_new;
	my $qry = $qp->parse_query($qry_str, $self->{qp_flags});
	my @filter; # applied innermost-first
	my $eidx_key = $opt->{eidx_key};
	push @filter, 'O'.$eidx_key if defined($eidx_key);
	if (defined(my $uid_range = $opt->{uid_range})) {
		push @filter, $X{Query}->new(OP_VALUE_RANGE(), UID,
					sortable_serialise($uid_range->[0]),
					sortable_serialise($uid_range->[1]));
	}
	if (defined(my $tid = $opt->{threadid})) {
		$tid = sortable_serialise($tid);
		push @filter, $X{Query}->new(OP_VALUE_RANGE(), THREADID,
						$tid, $tid);
	}
	$qry = $X{Query}->new(OP_FILTER(), $qry, $_) for @filter;
	do_enquire($self, $qry, $opt, TS);
}
# Passes its arguments to PublicInbox::XapClient::start_helper and
# returns the result; PublicInbox::XhcMset is loaded only when that
# result is true.
sub xhc_start_maybe (@) {
	require PublicInbox::XapClient;
	my $xhc = PublicInbox::XapClient::start_helper(@_);
	if ($xhc) {
		require PublicInbox::XhcMset;
	}
	$xhc;
}
# Translates the search options in $opt into the short command-line
# switches described by @XH_SPEC.  Returns a flat list.
#   {offset}    => -o (omitted when false)
#   {limit}     => -m (defaults to 50)
#   {relevance} => -2: docid order, highest first (-k -1)
#                  -1: docid order, lowest first (-k -1 -a)
#                   0: by {sort_col} (default TS), -a if {asc}
#                  other: as for 0, with -r (relevance first)
#   {threads}   => -t
#   {threadid}  => -T
#   {eidx_key}  => -O
# Every usable "serial:$pfx:..." entry of $self->{altid} adds
# "-Q $pfx=X\U$pfx"; that list is cached in $self->{-alt_pfx}.
sub xh_opt ($$) {
	my ($self, $opt) = @_;
	my $lim = $opt->{limit} || 50;
	my @ret;
	push @ret, '-o', $opt->{offset} if $opt->{offset};
	push @ret, '-m', $lim;
	my $rel = $opt->{relevance} // 0;
	if ($rel == -2) { # ORDER BY docid/UID (highest first)
		push @ret, '-k', '-1';
	} elsif ($rel == -1) { # ORDER BY docid/UID (lowest first)
		push @ret, '-k', '-1';
		push @ret, '-a';
	} elsif ($rel == 0) {
		push @ret, '-k', $opt->{sort_col} // TS;
		push @ret, '-a' if $opt->{asc};
	} else { # rel > 0
		push @ret, '-r';
		push @ret, '-k', $opt->{sort_col} // TS;
		push @ret, '-a' if $opt->{asc};
	}
	push @ret, '-t' if $opt->{threads};
	push @ret, '-T', $opt->{threadid} if defined $opt->{threadid};
	push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key};
	my $apfx = $self->{-alt_pfx} //= do {
		my @tmp;
		for (@{$self->{altid} // []}) {
			# Same pattern as qparse_new so both agree on which
			# specs are usable; an empty or non-\w prefix would
			# otherwise produce a bogus "-Q" argument
			/\Aserial:(\w+):/ or next;
			push @tmp, '-Q', "$1=X\U$1";
		}
		# TODO: arbitrary header indexing goes here
		\@tmp;
	};
	(@ret, @$apfx);
}
# Runs a mail search and hands the result to $cb.
# returns a true value if actually handled asynchronously,
# and a falsy value if handled synchronously
#
# Without $XHC, $cb->(@args, $mset) is called before returning undef.
# With $XHC, an "mset" request is sent and the return value is that of
# PublicInbox::XhcMset->maybe_new; if issuing the request raises, $cb is
# called as $cb->(@args, undef, $@).
sub async_mset {
	my ($self, $qry_str, $opt, $cb, @args) = @_;
	unless ($XHC) { # synchronous
		my $mset = $self->mset($qry_str, $opt);
		$cb->(@args, $mset);
		return undef;
	}
	# unconditionally retrieving pct + rank for now
	xdb($self); # populate {nshard}
	my @margs = ($self->xh_args, xh_opt($self, $opt));
	my $ret = eval {
		my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str);
		PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args);
	};
	$cb->(@args, undef, $@) if $@;
	$ret;
}
# Runs $qry against the combined database and returns the mset.
# $opt->{relevance} selects the ordering (see below), $col is the value
# slot used when sorting by value; {offset} defaults to 0 and {limit}
# to 50.  The get_mset call goes through retry_reopen.
sub do_enquire { # shared with CodeSearch
	my ($self, $qry, $opt, $col) = @_;
	my $enq = $X{Enquire}->new(xdb($self));
	$enq->set_query($qry);
	my $rel = $opt->{relevance} // 0;
	if ($rel == -2 || $rel == -1) { # ORDER BY docid/UID
		$enq->set_weighting_scheme($X{BoolWeight}->new);
		# -2: highest first, -1: lowest first
		$enq->set_docid_order($rel == -2 ? $ENQ_DESCENDING :
						$ENQ_ASCENDING);
	} elsif ($rel == 0) {
		$enq->set_sort_by_value_then_relevance($col, !$opt->{asc});
	} else { # rel > 0
		$enq->set_sort_by_relevance_then_value($col, !$opt->{asc});
	}

	# `lei q -t / --threads' or JMAP collapseThreads; but don't collapse
	# on `-tt' ({threads} > 1) which sets the Flagged|Important keyword
	my $collapse = ($opt->{threads} // 0) == 1 && has_threadid($self);
	$enq->set_collapse_key(THREADID) if $collapse;
	retry_reopen($self, \&enquire_once, $enq,
			$opt->{offset} || 0, $opt->{limit} || 50);
}
# Calls $cb->($self, @arg) in the caller's context (list, otherwise
# scalar) and returns its result.  When the call dies with an exception
# object whose class name contains DatabaseModifiedError, the database
# is reopened and the call retried, up to 10 attempts in total.  Any
# other exception is re-raised via Carp::croak.
sub retry_reopen {
	my ($self, $cb, @arg) = @_;
	my $list_ctx = wantarray;
	for my $attempt (1..10) {
		my @ret;
		if ($list_ctx) {
			@ret = eval { $cb->($self, @arg) };
		} else {
			$ret[0] = eval { $cb->($self, @arg) };
		}
		unless ($@) {
			return $list_ctx ? @ret : $ret[0];
		}
		# Exception: The revision being read has been discarded -
		# you should call Xapian::Database::reopen()
		if (ref($@) =~ /\bDatabaseModifiedError\b/) {
			reopen($self);
		} else {
			# let caller decide how to spew, because ExtMsg queries
			# get wonky and trigger:
			# "something terrible happened at .../Xapian/Enquire.pm"
			Carp::croak($@);
		}
	}
	Carp::croak("Too many Xapian database modifications in progress\n");
}
# returns true if all docs have the THREADID value, which is taken to
# be the case when the 'has_threadid' metadata key is exactly '1'
sub has_threadid ($) {
	my ($self) = @_;
	my $flag = xdb($self)->get_metadata('has_threadid');
	($flag // '') eq '1';
}
# retry_reopen callback: fetches $limit matches starting at $offset
# from $enq; the first ($self) argument is ignored.
sub enquire_once {
	shift; # $self, unused
	my $enq = shift;
	my ($offset, $limit) = @_;
	$enq->get_mset($offset, $limit);
}
# Loads the messages for the items of $mset from $ibx->over and sorts
# them into mset order.  In list context returns
# ($mset->get_matches_estimated, \@msgs), otherwise just \@msgs.
sub mset_to_smsg {
	my ($self, $ibx, $mset) = @_;
	my $nshard = $self->{nshard};
	my (%order, $rank); # article number => 1-based position in $mset
	$order{mdocid($nshard, $_)} = ++$rank for $mset->items;
	my $unsorted = $ibx->over->get_all(keys %order);
	my @msgs = sort {
		$order{$a->{num}} <=> $order{$b->{num}}
	} @$unsorted;
	wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs;
}
# read-write
# Returns a new stemmer object for $LANG; arguments are ignored.
sub stemmer {
	my $stem_class = $X{Stem};
	$stem_class->new($LANG);
}
# Creates a QueryParser bound to the database of $self with the
# settings visible below: AND as the default operator, a stemmer for
# $LANG using STEM_SOME, and an expansion limit of 100.
sub qp_init_common {
	my ($self) = @_;
	my $qp = $X{QueryParser}->new;
	$qp->set_default_op(OP_AND());
	$qp->set_database(xdb($self));
	$qp->set_stemmer(stemmer($self));
	$qp->set_stemming_strategy(STEM_SOME());
	my $set_max = $qp->can('set_max_wildcard_expansion') //
		$qp->can('set_max_expansion'); # Xapian 1.5.0+
	$qp->$set_max(100);
	$qp;
}
# read-only
# Builds the query parser for mail searches: the common setup from
# qp_init_common plus the range processors in @MAIL_NRP, the boolean
# and probabilistic prefix tables, and one boolean prefix per usable
# "serial:$pfx:..." entry of {altid}.  When {altid} is set, the help
# text for those prefixes is stored in $self->{-user_pfx}.
sub qparse_new {
	my ($self) = @_;
	my $qp = qp_init_common($self);
	my $add_rp = $qp->can('add_valuerangeprocessor') //
		$qp->can('add_rangeprocessor'); # Xapian 1.5.0+

	$qp->$add_rp($_) for @MAIL_NRP;
	for my $name (keys %bool_pfx_external) {
		$qp->add_boolean_prefix($name, $_)
			for split(/ /, $bool_pfx_external{$name});
	}

	# we do not actually create AltId objects,
	# just parse the spec to avoid the extra DB handles for now.
	if (my $altid = $self->{altid}) {
		my $user_pfx = $self->{-user_pfx} = [];
		for my $spec (@$altid) {
			# $spec = 'serial:gmane:/path/to/gmane.msgmap.sqlite3'
			# note: Xapian supports multibyte UTF-8, /^[0-9]+$/,
			# and '_' with prefixes matching \w+
			my ($pfx) = ($spec =~ /\Aserial:(\w+):/) or next;
			push @$user_pfx, "$pfx:",
			  "alternate serial number e.g. $pfx:12345 (boolean)";
			# gmane => XGMANE
			$qp->add_boolean_prefix($pfx, 'X'.uc($pfx));
		}
	}

	for my $name (keys %prob_prefix) {
		$qp->add_prefix($name, $_)
			for split(/ /, $prob_prefix{$name});
	}
	$qp;
}
# Generates the C++ snippet for xap_helper.h and returns it as a string:
# a mail_nrp[] table with its initializer built from @MAIL_VMAP, and
# qp_init_mail_search() which registers the range processors and every
# boolean and probabilistic prefix (names emitted in sorted order).
sub generate_cxx () {
	my $ret = <<EOM;
# line ${\__LINE__} "${\__FILE__}"
static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}];
static void mail_nrp_init(void)
{
EOM
	for my $i (0..$#MAIL_VMAP) {
		my ($col, $pfx) = @{$MAIL_VMAP[$i]};
		$ret .= qq{\tmail_nrp[$i] = new NRP($col, "$pfx");\n}
	}
	$ret .= <<EOM;
}

# line ${\__LINE__} "${\__FILE__}"
static void qp_init_mail_search(Xapian::QueryParser *qp)
{
	for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++)
		qp->ADD_RP(mail_nrp[i]);
EOM
	for my $name (sort keys %bool_pfx_external) {
		for my $xpfx (split(/ /, $bool_pfx_external{$name})) {
			$ret .= qq{\tqp->add_boolean_prefix("$name", "$xpfx");\n}
		}
	}
	# altid support is handled in xh_opt and srch_init_extra in XH
	for my $name (sort keys %prob_prefix) {
		for my $xpfx (split(/ /, $prob_prefix{$name})) {
			$ret .= qq{\tqp->add_prefix("$name", "$xpfx");\n}
		}
	}
	$ret .= "}\n";
}
# Returns a new arrayref of (prefix => description) pairs: a copy of
# @HELP followed by the altid prefixes in {-user_pfx}, if any.
sub help {
	my ($self) = @_;
	$self->{qp} //= $self->qparse_new; # parse altids
	my $user_pfx = $self->{-user_pfx};
	[ @HELP, ($user_pfx ? @$user_pfx : ()) ];
}
# Returns the numeric form of value slot $col of $doc, or undef when
# the slot holds a false value (Xapian gives '' for a missing value).
# always returns a scalar value, even in list context
sub int_val ($$) {
	my ($doc, $col) = @_;
	my $val = $doc->get_value($col);
	return undef unless $val; # undef is '' in Xapian
	sortable_unserialise($val) + 0; # PV => IV conversion
}
# Returns the match percentage of an mset item.
# Capped at "99%" since "100%" takes an extra column in the
# thread skeleton view.  <xapian/mset.h> says the value isn't
# very meaningful, anyways.
sub get_pct ($) {
	my ($mitem) = @_;
	my $pct = $mitem->get_percent;
	return 99 if $pct > 99;
	$pct;
}
# Collects the terms of $xdb_or_doc which start with $pfx, with $pfx
# stripped.  @docid is passed through to termlist_begin/termlist_end.
# Returns the list in list context, otherwise a hashref whose keys are
# the stripped terms (values undef).
sub xap_terms ($$;@) {
	my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
	my $end = $xdb_or_doc->termlist_end(@docid);
	my $cur = $xdb_or_doc->termlist_begin(@docid);
	$cur->skip_to($pfx);
	my $pfxlen = length($pfx);
	my @ret;
	while ($cur != $end) {
		my $tn = $cur->get_termname;
		last if index($tn, $pfx); # term does not start with $pfx
		push(@ret, substr($tn, $pfxlen));
		$cur++;
	}
	wantarray ? @ret : +{ map { $_ => undef } @ret };
}
# get combined docid from over.num:
# (not generic Xapian, only works with our sharding scheme for mail)
# The shard index is taken as $num % {nshard}.
sub num2docid ($$) {
	my ($self, $num) = @_;
	my $nshard = $self->{nshard};
	my $shard_idx = $num % $nshard;
	($num - 1) * $nshard + $shard_idx + 1;
}
# Collects every term between allterms_begin($pfx) and
# allterms_end($pfx) of the combined database, with the first
# length($pfx) characters removed.  Returns the list in list context,
# otherwise a hashref whose keys are those strings (values undef).
sub all_terms {
	my ($self, $pfx) = @_;
	my $cur = xdb($self)->allterms_begin($pfx);
	my $end = $self->{xdb}->allterms_end($pfx);
	my $pfxlen = length($pfx);
	my @ret;
	while ($cur != $end) {
		push @ret, substr($cur->get_termname, $pfxlen);
		$cur++;
	}
	wantarray ? @ret : +{ map { $_ => undef } @ret };
}
# prep getopt args to feed to xap_helper.h socket:
# returns ('-d', $dir) for every shard directory of $self
sub xh_args {
	my ($self) = @_;
	my @args;
	push @args, '-d', $_ for shard_dirs($self);
	@args;
}
# Returns the docids found on the posting list of term $q in the
# combined database, in iteration order.
sub docids_by_postlist ($$) {
	my ($self, $q) = @_;
	my $cur = $self->xdb->postlist_begin($q);
	my $end = $self->{xdb}->postlist_end($q);
	my @ids;
	while ($cur != $end) {
		push(@ids, $cur->get_docid);
		$cur++;
	}
	@ids;
}
# Returns the document $docid from {xdb}, or undef when the lookup
# raises an exception object whose class name contains DocNotFoundError.
# Any other exception is re-raised.
sub get_doc ($$) {
	my ($self, $docid) = @_;
	my $doc = eval { $self->{xdb}->get_document($docid) };
	return $doc if defined $doc;
	die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/;
	undef;
}