Bug 17255 - Upgrade Elastic Search code to work with version 2.4+ - rebased wip
[koha.git] / Koha / SearchEngine / Elasticsearch / QueryBuilder.pm
blobd0aa916c230f18e3acaa2803ee30741b8506bb83
1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 =head1 NAME
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
25 =head1 DESCRIPTION
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
30 =head1 SYNOPSIS
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
38 =head1 METHODS
40 =cut
42 use base qw(Koha::SearchEngine::Elasticsearch);
43 use Carp;
44 use JSON;
45 use List::MoreUtils qw/ each_array /;
46 use Modern::Perl;
47 use URI::Escape;
49 use C4::Context;
50 use Data::Dumper; # TODO remove
52 =head2 build_query
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
59 for details.)
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
65 =over 4
67 =item sort
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
73 =back
75 =cut
# Builds an Elasticsearch request body from a lucene-style query string.
#
#   $query   - the query_string search; undef is treated as '*'.
#   %options - sort: arrayref of { field => ..., direction => ... } hashrefs,
#              direction being 'asc' (the default) or 'desc';
#              expanded_facet: name of one of the aggregations below whose
#              'size' is set from the FacetMaxCount preference.
#
# Returns a hashref with 'query' and 'aggregations', plus 'sort' when sort
# entries were supplied. Dies if a sort direction is not 'asc' or 'desc'.
sub build_query {
    my ( $self, $query, %options ) = @_;

    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    $query = '*' unless defined $query;

    my $res;
    $res->{query} = {
        query_string => {
            query            => $query,
            fuzziness        => $fuzzy_enabled ? 'auto' : '0',
            default_operator => 'AND',
            default_field    => '_all',
            lenient          => JSON::true,
        }
    };

    if ( $options{sort} ) {
        foreach my $sort ( @{ $options{sort} } ) {
            my ( $f, $d ) = @$sort{qw/ field direction /};
            die "Invalid sort direction, $d"
              if $d && ( $d ne 'asc' && $d ne 'desc' );
            $d = 'asc' unless $d;

            # An entry without a field would yield a bare ".phrase" sort key.
            next unless defined $f && length $f;

            # TODO account for fields that don't have a 'phrase' type

            $f = $self->_sort_field($f);
            push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
        }
    }

    # See _convert_facets in Search.pm for how these get turned into
    # things that Koha can use.
    $res->{aggregations} = {
        author   => { terms => { field => "author__facet" } },
        subject  => { terms => { field => "subject__facet" } },
        itype    => { terms => { field => "itype__facet" } },
        location => { terms => { field => "homebranch__facet" } },
        'su-geo' => { terms => { field => "su-geo__facet" } },
        se       => { terms => { field => "se__facet" } },
    };

    # Only resize a facet that is defined above: assigning through an unknown
    # name would autovivify an aggregation that has a size but no field.
    my $ef = $options{expanded_facet};
    if ( $ef && exists $res->{aggregations}{$ef} ) {
        $res->{aggregations}{$ef}{terms}{size} =
          C4::Context->preference('FacetMaxCount');
    }
    return $res;
}
128 =head2 build_browse_query
130 my $browse_query = $builder->build_browse_query($field, $query);
132 This performs a "starts with" style query on a particular field. The field
133 to be searched must have been indexed with an appropriate mapping as a
134 "phrase" subfield, which pretty much everything has.
136 =cut
138 # XXX this isn't really a browse query like we want in the end
# Builds a "starts with" (match_phrase_prefix) query on "$field.phrase",
# sorted ascending on the sort variant of the same field.
#
#   $field - 'title' or 'author'; anything else (including undef) falls back
#            to 'title'.
#   $query - the prefix to look for; when undef, { query => '*' } is returned.
#
# Returns a hashref with 'query' and 'sort' keys.
sub build_browse_query {
    my ( $self, $field, $query ) = @_;

    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    return { query => '*' } if !defined $query;

    # TODO this should come from Koha::SearchEngine::Elasticsearch
    my %field_whitelist = (
        title  => 1,
        author => 1,
    );
    $field = 'title'
      if !defined $field || !exists $field_whitelist{$field};
    my $sort = $self->_sort_field($field);

    # Previously the result was returned only implicitly, as the value of
    # the trailing assignment; return it explicitly.
    return {
        query => {
            match_phrase_prefix => {
                "$field.phrase" => {
                    query     => $query,
                    operator  => 'or',
                    fuzziness => $fuzzy_enabled ? 'auto' : '0',
                }
            }
        },
        sort => [ { "$sort.phrase" => { order => "asc" } } ],
    };
}
167 =head2 build_query_compat
169 my (
170 $error, $query, $simple_query, $query_cgi,
171 $query_desc, $limit, $limit_cgi, $limit_desc,
172 $stopwords_removed, $query_type
174 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
175 \@limits, \@sort_by, $scan, $lang );
177 This handles a search using the same api as L<C4::Search::buildQuery> does.
179 A very simple query will go in with C<$operands> set to ['query'], and
180 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
181 C<$query> set to something that can perform the search, C<$simple_query>
182 set to just the search term, C<$query_cgi> set to something that can
183 reproduce this search, and C<$query_desc> set to something else.
185 =cut
# Builds a query from the zebra-style argument lists used by
# C4::Search::buildQuery.
#
#   $operators, $operands, $indexes - parallel arrayrefs; operator N joins
#       operand N+1 to what precedes it.
#   $orig_limits - arrayref of limit strings (may be undef).
#   $sort_by     - arrayref of zebra-style sort names.
#   $scan, $lang - accepted for API compatibility; not used here.
#   $params      - optional hashref; only 'expanded_facet' is read.
#
# Returns the list ( undef, $query, $simple_query, $query_cgi, $query_desc,
# $limit, $limit_cgi, $limit_desc, undef, undef ).
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang, $params )
      = @_;

    $operands //= [];
    my @sort_params = $self->_convert_sort_fields( @{ $sort_by // [] } );

    # Convert the indexes one at a time. _convert_index_fields drops entries
    # it cannot convert, so converting the whole list at once would shift the
    # remaining indexes onto the wrong operands.
    my @index_params = map {
        my ($converted) = $self->_convert_index_fields($_);
        $converted;
    } @{ $indexes // [] };
    my $limits = $self->_fix_limit_special_cases($orig_limits);

    # Merge the indexes in with the search terms and the operands so that
    # each search thing is a handy unit. The first operand can't have an
    # operator, so pad the front; this is done on a copy so the caller's
    # array is left untouched.
    my @padded_operators = ( undef, @{ $operators // [] } );
    my @search_params;
    my $ea = each_array( @$operands, @padded_operators, @index_params );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        next if ( !defined($oand) || $oand eq '' );

        # If earlier operands were skipped, this is the first real term and
        # must not carry a leading boolean operator.
        my $operator =
          ( @search_params && defined $otor ) ? uc $otor : undef;
        push @search_params, {
            operand  => scalar $self->_clean_search_term($oand),
            operator => $operator,
            $index ? %$index : (),
        };
    }

    # We build a string query from limits and the queries. An alternative
    # would be to pass them separately into build_query and let it build
    # them into a structured ES query itself. Maybe later, though that'd be
    # more robust.
    my $terms_str = join ' ', $self->_create_query_string(@search_params);
    my $limit =
      $self->_join_queries( $self->_convert_index_strings(@$limits) );
    my $query_str = join ' AND ',
      grep { defined($_) && length($_) } $terms_str, $limit;

    my %options;
    $options{sort}           = \@sort_params;
    $options{expanded_facet} = $params->{expanded_facet};
    my $query = $self->build_query( $query_str, %options );

    # We roughly emulate the CGI parameters of the zebra query builder
    my $query_cgi;
    $query_cgi = 'idx=kw&q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
    my $simple_query;
    $simple_query = $operands->[0] if @$operands == 1;
    my $query_desc = $simple_query;
    my $limit_cgi  = ( $orig_limits and @$orig_limits )
      ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
      : '';
    my $limit_desc;
    $limit_desc = "$limit" if $limit;
    return (
        undef,  $query,     $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc,   undef,      undef
    );
}
245 =head2 build_authorities_query
247 my $query = $builder->build_authorities_query(\%search);
249 This takes a nice description of an authority search and turns it into a black-box
250 query that can then be passed to the appropriate searcher.
252 The search description is a hashref that looks something like:
255 searches => [
257 where => 'Heading', # search the main entry
258 operator => 'exact', # require an exact match
259 value => 'frogs', # the search string
262 where => '', # search all entries
263 operator => '', # default keyword, right truncation
264 value => 'pond',
267 sort => {
268 field => 'Heading',
269 order => 'desc',
271 authtypecode => 'TOPIC_TERM',
274 =cut
# Turns an authority search description into an Elasticsearch query.
#
#   $search - hashref with:
#       searches => arrayref of { where, operator, value } hashrefs;
#       sort     => optional hashref of field => order.
#     An 'authtypecode' key may be present but is not read by this method.
#
# 'is' and '=' searches become term filters; everything else becomes a query
# clause. Returns a hashref with 'query' and, if requested, 'sort'.
# The $search hashref is not modified.
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;
    my @filter_parts;
    foreach my $s ( @{ $search->{searches} } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
        $wh = '_all' if !defined $wh || $wh eq '';
        $op //= '';
        if ( $op eq 'is' || $op eq '=' ) {

            # look for something that matches completely
            # note, '=' is about numerical vals. May need special handling.
            # We lowercase our search because the ES index lowercases its
            # values, and term searches don't get the search analyzer
            # applied to them.
            push @filter_parts, { term => { "$wh.phrase" => lc $val } };
        }
        elsif ( $op eq 'exact' ) {

            # left and right truncation, otherwise an exact phrase
            push @query_parts, { match_phrase => { $wh => $val } };
        }
        elsif ( $op eq 'start' ) {

            # startswith search
            push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
        }
        else {
            # regular wordlist stuff
            push @query_parts, { match => { $wh => $val } };
        }
    }

    # Merge the query and filter parts appropriately
    # 'should' behaves like 'or', if we want 'and', use 'must'
    my $query_part  = { bool => { should => \@query_parts } };
    my $filter_part = { bool => { should => \@filter_parts } };

    # We need to add '.phrase' to all the sort headings otherwise it'll sort
    # based on the tokenised form. This is built in a fresh hash rather than
    # written back into $search, so that passing the same description twice
    # doesn't produce '.phrase.phrase'.
    my %sort;
    if ( exists $search->{sort} ) {
        my %sort_spec;
        foreach my $k ( keys %{ $search->{sort} } ) {
            my $f = $self->_sort_field($k);
            $sort_spec{"$f.phrase"} = $search->{sort}{$k};
        }
        %sort = ( sort => [ \%sort_spec ] );
    }

    # NOTE(review): the 'filtered' query is deprecated from Elasticsearch 2.0
    # in favour of a bool query with a 'filter' clause - confirm before
    # targeting newer servers.
    my $query =
      @filter_parts
      ? { query =>
          { filtered => { filter => $filter_part, query => $query_part } } }
      : { query => $query_part };

    return { %$query, %sort };
}
345 =head2 build_authorities_query_compat
347 my ($query) =
348 $builder->build_authorities_query_compat( \@marclist, \@and_or,
349 \@excluding, \@operator, \@value, $authtypecode, $orderby );
351 This builds a query for searching for authorities, in the style of
352 L<C4::AuthoritiesMarc::SearchAuthorities>.
354 Arguments:
356 =over 4
358 =item marclist
360 An arrayref containing where the particular term should be searched for.
361 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
362 thesaurus. If left blank, any field is used.
364 =item and_or
366 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
368 =item excluding
370 Also ignored.
372 =item operator
374 What form of search to do. Options are: is (phrase, no truncation, whole field
375 must match), = (number exact match), exact (phrase, but with left and right
376 truncation). If left blank, then word list, right truncated, anywhere is used.
378 =item value
380 The actual user-provided string value to search for.
382 =item authtypecode
384 The authority type code to search within. If blank, then all will be searched.
386 =item orderby
388 The order to sort the results by. Options are Relevance, HeadingAsc,
389 HeadingDsc, AuthidAsc, AuthidDsc.
391 =back
393 marclist, operator, and value must be the same length, and the values at
394 index /i/ all relate to each other.
396 This returns a query, which is a black box object that can be passed to the
397 appropriate search object.
399 =cut
# Converts the old-style positional authority search arguments into the hash
# form understood by build_authorities_query, and returns the built query.
#
#   $marclist  - arrayref of where to search; a blank entry means 'any'.
#   $and_or, $excluding - ignored.
#   $operator  - arrayref of search forms, parallel to $value.
#   $value     - arrayref of the strings to search for.
#   $authtypecode - passed through in the search description.
#   $orderby   - 'HeadingAsc', 'HeadingDsc', 'AuthidAsc', 'AuthidDsc'; any
#                other value (or undef) means no explicit sort.
#
# Confesses if a marclist entry is not a known search location.
sub build_authorities_query_compat {
    my ( $self, $marclist, $and_or, $excluding, $operator, $value,
        $authtypecode, $orderby )
      = @_;

    # This turns the old-style many-options argument form into a more
    # extensible hash form that is understood by L<build_authorities_query>.
    my %koha_to_index_name = (
        mainmainentry   => 'Heading-Main',
        mainentry       => 'Heading',
        match           => 'Match',
        'match-heading' => 'Match-heading',
        'see-from'      => 'Match-heading-see-from',
        thesaurus       => 'Subject-heading-thesaurus',
        any             => '',
    );

    # A blank location means "any field", as documented.
    my @locations =
      map { ( defined($_) && length($_) ) ? $_ : 'any' } @{ $marclist // [] };

    # Make sure everything exists
    foreach my $m (@locations) {
        confess "Invalid marclist field provided: $m"
          unless exists $koha_to_index_name{$m};
    }

    my @values = @{ $value // [] };
    my @searches;
    for my $i ( 0 .. $#values ) {
        push @searches,
          {
            where    => $koha_to_index_name{ $locations[$i] // 'any' },
            operator => $operator->[$i],
            value    => $values[$i],
          };
    }

    my $order = $orderby // '';
    my $sort_field =
        ( $order =~ /^Heading/ ) ? 'Heading'
      : ( $order =~ /^Auth/ )    ? 'Local-Number'
      :                            undef;

    my %search = (
        searches     => \@searches,
        authtypecode => $authtypecode,
    );
    if ($sort_field) {
        my $sort_order = ( $order =~ /Asc$/ ) ? 'asc' : 'desc';
        $search{sort} = { $sort_field => $sort_order };
    }
    return $self->build_authorities_query( \%search );
}
451 =head2 _convert_sort_fields
453 my @sort_params = _convert_sort_fields(@sort_by)
455 Converts the zebra-style sort index information into elasticsearch-style.
457 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
458 something that can be sent to L<build_query>.
460 =cut
# Converts zebra-style sort names ("title_az", "call_number_dsc", ...) into
# { field => ..., direction => ... } hashrefs.
#
# Names whose field is unknown, or which map to no field (relevance), are
# dropped. 'direction' is 'asc' or 'desc', or undef when the name carries no
# recognised direction suffix.
sub _convert_sort_fields {
    my ( $self, @sort_by ) = @_;

    # Turn the sorting into something we care about.
    my %sort_field_convert = (
        acqdate     => 'acqdate',
        author      => 'author',
        call_number => 'callnum',
        popularity  => 'issues',
        relevance   => undef,        # default
        title       => 'title',
        pubdate     => 'pubdate',
    );
    my %sort_order_convert =
      ( dsc => 'desc', asc => 'asc', az => 'asc', za => 'desc' );

    my @converted;
    for my $spec (@sort_by) {
        next unless defined $spec;

        # The direction is whatever follows the LAST underscore; field names
        # such as "call_number" contain underscores themselves, so a plain
        # split on '_' would cut them in half.
        my ( $name, $order ) = ( $spec, undef );
        if ( !exists $sort_field_convert{$spec}
            && $spec =~ /\A(.+)_([^_]*)\z/ )
        {
            ( $name, $order ) = ( $1, $2 );
        }

        # Drop anything we don't know about.
        my $field = $sort_field_convert{$name} or next;
        push @converted,
          {
            field     => $field,
            direction => defined $order ? $sort_order_convert{$order} : undef,
          };
    }
    return @converted;
}
488 =head2 _convert_index_fields
490 my @index_params = $self->_convert_index_fields(@indexes);
492 Converts zebra-style search index notation into elasticsearch-style.
494 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
495 and it returns something that can be sent to L<build_query>.
497 B<TODO>: this will pull from the elasticsearch mappings table to figure out
498 types.
500 =cut
# Zebra index name => Elasticsearch field name. Also read by
# _convert_index_strings_freeform.
our %index_field_convert = (
    'kw'      => '_all',
    'ti'      => 'title',
    'au'      => 'author',
    'su'      => 'subject',
    'nb'      => 'isbn',
    'se'      => 'title-series',
    'callnum' => 'callnum',
    'itype'   => 'itype',
    'ln'      => 'ln',
    'branch'  => 'homebranch',
    'fic'     => 'lf',
    'mus'     => 'rtype',
    'aud'     => 'ta',
    'hi'      => 'Host-Item-Number',
);

# Converts zebra-style index specs ("ti", "au,phr", "mc-itype", ...) into
# { field => ..., type => ... } hashrefs.
#
# Specs that are undef, empty, or name an index not in %index_field_convert
# are dropped, so the result may be shorter than the input. 'type' is
# 'phrase', 'right-truncate', or undef.
sub _convert_index_fields {
    my ( $self, @indexes ) = @_;

    my %index_type_convert =
      ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );

    my @converted;
    for my $index (@indexes) {
        next unless defined $index;

        my ( $name, $type ) = split /,/, $index;
        next unless defined $name;    # an empty spec splits to nothing

        # If a field starts with mc- we keep the prefix on the converted
        # name: it's used (and removed) later when joining things, to
        # indicate we make it an 'OR' join.
        my $is_mc = $name =~ s/^mc-//;

        # Convert according to our table, drop anything that doesn't convert.
        my $field = $index_field_convert{$name} or next;
        push @converted,
          {
            field => ( $is_mc ? "mc-$field" : $field ),
            type  => $index_type_convert{ $type // '__default' },
          };
    }
    return @converted;
}
545 =head2 _convert_index_strings
547 my @searches = $self->_convert_index_strings(@searches);
549 Similar to L<_convert_index_fields>, this takes strings of the form
550 B<field:search term> and rewrites the field from zebra-style to
551 elasticsearch-style. Anything it doesn't understand is returned verbatim.
553 =cut
# Rewrites "index:term" strings from zebra index names to Elasticsearch field
# names. Empty strings are skipped; strings without an index prefix, or with
# an index that does not convert, are passed through verbatim.
sub _convert_index_strings {
    my ( $self, @searches ) = @_;

    my @converted;
    for my $search (@searches) {
        next if $search eq '';

        # Start from the verbatim string and replace it only on success.
        my $rewritten = $search;
        if ( $search =~ /^\s*([\w,-]*?):(.*)/ ) {
            my ( $zebra_index, $term ) = ( $1, $2 );
            my ($es_index) = $self->_convert_index_fields($zebra_index);
            if ( defined $es_index ) {
                $rewritten = $es_index->{field} . ":"
                  . $self->_modify_string_by_type( %$es_index,
                    operand => $term );
            }
        }
        push @converted, $rewritten;
    }
    return @converted;
}
576 =head2 _convert_index_strings_freeform
578 my $search = $self->_convert_index_strings_freeform($search);
580 This is similar to L<_convert_index_strings>, however it'll search out the
581 things to change within the string. So it can handle strings such as
582 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
584 If there is something of the form "su,complete-subfield" or something, the
585 second part is stripped off as we can't yet handle that. Making it work
586 will have to wait for a real query parser.
588 =cut
# Rewrites every zebra index prefix found inside a free-form search string
# (e.g. "(su:foo) AND (su:bar)") to its Elasticsearch field name, using
# %index_field_convert. A ",qualifier" following the index name is stripped.
# Returns the rewritten string; undef is returned unchanged.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    return $search unless defined $search && %index_field_convert;

    # All names are replaced in a single pass. This avoids running 'each' on
    # a shared package hash, keeps the result independent of hash ordering,
    # and never re-scans text that was just substituted.
    my $names = join '|',
      map  { quotemeta($_) }
      sort { length($b) <=> length($a) || $a cmp $b }
      keys %index_field_convert;

    $search =~ s/\b($names)(?:,[\w-]*)?:/$index_field_convert{$1}:/g;
    return $search;
}
598 =head2 _modify_string_by_type
600 my $str = $self->_modify_string_by_type(%index_field);
602 If you have a search term (operand) and a type (phrase, right-truncated), this
603 will convert the string to have the function in lucene search terms, e.g.
604 wrapping quotes around it.
606 =cut
# Decorates a search term according to its index type.
#
#   %idx - reads 'operand' (the term) and 'type' ('phrase', 'right-truncate',
#          or anything else for no change).
#
# Returns the term with '*' appended for right-truncate, or wrapped in double
# quotes for phrase. An undef or empty term is returned untouched.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type = $idx{type} || '';
    my $str  = $idx{operand};

    # Empty or undef, we can't use it. Test for length rather than truth so
    # that a search for "0" is still decorated.
    return $str unless defined $str && length $str;

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase';
    return $str;
}
620 =head2 _join_queries
622 my $query_str = $self->_join_queries(@query_parts);
624 This takes a list of query parts, that might be search terms on their own, or
625 booleaned together, or specifying fields, or whatever, wraps them in
626 parentheses, and ANDs them all together. Suitable for feeding to the ES
627 query string query.
629 Note: doesn't AND them together if they specify an index that starts with "mc"
630 as that was a special case in the original code for dealing with multiple
631 choice options (you can't search for something that has an itype of A and
632 an itype of B otherwise.)
634 =cut
# Joins query parts into one query string. Undefined and empty parts are
# ignored. Parts starting with "mc-" have that prefix removed and are ORed
# together as one group; all other parts are ANDed. A single surviving part
# is returned as-is, without parentheses. Returns nothing when no parts
# survive.
sub _join_queries {
    my ( $self, @parts ) = @_;

    my ( @plain, @multi_choice );
    for my $part (@parts) {
        next if !defined $part || $part eq '';
        if ( $part =~ /^mc-/ ) {
            push @multi_choice, $part =~ s/^mc-//r;
        }
        else {
            push @plain, $part;
        }
    }

    my $total = @plain + @multi_choice;
    return if $total == 0;
    return @plain ? $plain[0] : $multi_choice[0] if $total == 1;

    # Each group is only added when it has members, so no empty clause ever
    # reaches the final join.
    my @clauses;
    push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
    push @clauses, '(' . join( ' OR ', map { "($_)" } @multi_choice ) . ')'
      if @multi_choice;
    return join ' AND ', @clauses;
}
656 =head2 _make_phrases
658 my @phrased_queries = $self->_make_phrases(@query_parts);
660 This takes the supplied queries and forces them to be phrases by wrapping
661 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
662 the quotes outside of them if they're there.
664 =cut
# Forces each query part to be a phrase by wrapping it in double quotes.
# A leading "field:" prefix, when present, is kept outside the quotes.
# Leading whitespace is dropped; parts with no non-whitespace content are
# returned unchanged.
sub _make_phrases {
    my ( $self, @parts ) = @_;

    my @phrases;
    for my $part (@parts) {

        # The field prefix is optional: a part without one is quoted whole,
        # as the documentation describes.
        push @phrases,
          $part =~ /\S/
          ? $part =~ s/^\s*((?:\w*?:)?)(.*)$/$1"$2"/r
          : $part;
    }
    return @phrases;
}
671 =head2 _create_query_string
673 my @query_strings = $self->_create_query_string(@queries);
675 Given a list of hashrefs, it will turn them into a lucene-style query string.
676 The hash should contain field, type (both for the indexes), operator, and
677 operand.
679 =cut
# Turns a list of search hashrefs (field, type, operator, operand) into
# lucene-style clauses of the form "OPERATOR (field:term)". The operator and
# the field prefix are left out when not set.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @clauses;
    for my $search (@queries) {
        my $prefix = '';
        $prefix = $search->{operator} . ' ' if $search->{operator};

        my $field = '';
        $field = $search->{field} . ':' if $search->{field};

        my $term = $self->_modify_string_by_type(%$search);
        push @clauses, "$prefix($field$term)";
    }
    return @clauses;
}
693 =head2 _clean_search_term
695 my $term = $self->_clean_search_term($term);
697 This cleans a search term by removing any funny characters that may upset
698 ES and give us an error. It also calls L<_convert_index_strings_freeform>
699 to ensure those parts are correct.
701 =cut
# Normalises a user-supplied search term: '=' becomes ':', zebra index
# prefixes are converted via _convert_index_strings_freeform, and curly
# braces become double quotes.
sub _clean_search_term {
    my ( $self, $term ) = @_;

    # Some hardcoded searches (like with authorities) produce things like
    # 'an=123', when it ought to be 'an:123' for our purposes.
    $term =~ tr/=/:/;

    my $cleaned = $self->_convert_index_strings_freeform($term);
    $cleaned =~ tr/{}/""/;
    return $cleaned;
}
714 =head2 _fix_limit_special_cases
716 my $limits = $self->_fix_limit_special_cases($limits);
718 This converts any special cases that the limit specifications have into things
719 that are more readily processable by the rest of the code.
721 The argument should be an arrayref, and it'll return an arrayref.
723 =cut
# Rewrites the special-case limit strings into query-string form and returns
# a new arrayref:
#   "yr,st-numeric,ge=A and yr,st-numeric,le=B" => "copydate:[A TO B]"
#   "yr,st-numeric=D"                           => "copydate:D"
#   "available"                                 => "onloan:0"
# A year limit that starts correctly but cannot be parsed is dropped; every
# other limit is passed through unchanged.
sub _fix_limit_special_cases {
    my ( $self, $limits ) = @_;

    my @converted;
    for my $limit (@$limits) {

        # This is set up by opac-search.pl
        if ( $limit =~ /^yr,st-numeric,ge=/ ) {
            push @converted, "copydate:[$1 TO $2]"
              if $limit =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/;
        }
        elsif ( $limit =~ /^yr,st-numeric=/ ) {
            push @converted, "copydate:$1"
              if $limit =~ /^yr,st-numeric=(.*)$/;
        }
        elsif ( $limit =~ /^available$/ ) {
            push @converted, 'onloan:0';
        }
        else {
            push @converted, $limit;
        }
    }
    return \@converted;
}
753 =head2 _sort_field
755 my $field = $self->_sort_field($field);
757 Given a field name, this works out what the actual name of the version to sort
758 on should be. Often it's the same, sometimes it involves sticking "__sort" on
759 the end. Maybe it'll be something else in the future, who knows?
761 =cut
# Returns the name of the field to sort on for $field: the name with
# "__sort" appended when $self->sort_fields() has a true entry for it,
# otherwise the name unchanged.
sub _sort_field {
    my ( $self, $field ) = @_;

    return $self->sort_fields()->{$field} ? $field . '__sort' : $field;
}