Bug 18068: ES - Fix location and (home|holding)branch facets
[koha.git] / Koha / SearchEngine / Elasticsearch / QueryBuilder.pm
blob222bb12c35dce6e7ecc8ef89022844c73c6228dc
1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 =head1 NAME
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
25 =head1 DESCRIPTION
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
30 =head1 SYNOPSIS
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
38 =head1 METHODS
40 =cut
42 use base qw(Koha::SearchEngine::Elasticsearch);
43 use Carp;
44 use JSON;
45 use List::MoreUtils qw/ each_array /;
46 use Modern::Perl;
47 use URI::Escape;
49 use C4::Context;
50 use Data::Dumper; # TODO remove
52 =head2 build_query
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
59 for details.)
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
65 =over 4
67 =item sort
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
73 =back
75 =cut
# Build an Elasticsearch request body from a lucene-style query string.
#
# Arguments:
#   $query   - the query string; undef is treated as '*'.
#   %options - sort:           arrayref of { field => ..., direction => ... }
#                              hashrefs (direction defaults to 'asc');
#              expanded_facet: name of a facet whose term count is set to the
#                              FacetMaxCount system preference.
#
# Returns a hashref holding 'query' and 'facets', plus 'sort' and
# 'aggregations' when applicable. Dies on a sort direction other than
# 'asc' or 'desc'.
sub build_query {
    my ( $self, $query, %options ) = @_;

    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    $query = '*' unless defined $query;

    my $res;
    $res->{query} = {
        query_string => {
            query            => $query,
            fuzziness        => $fuzzy_enabled ? 'auto' : '0',
            default_operator => 'AND',
            default_field    => '_all',
            lenient          => JSON::true,
        }
    };

    if ( $options{sort} ) {
        foreach my $sort ( @{ $options{sort} } ) {
            my ( $f, $d ) = @$sort{qw/ field direction /};
            die "Invalid sort direction, $d"
              if $d && ( $d ne 'asc' && $d ne 'desc' );
            $d = 'asc' unless $d;

            # TODO account for fields that don't have a 'phrase' type

            $f = $self->_sort_field($f);
            push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
        }
    }

    # See _convert_facets in Search.pm for how these get turned into
    # things that Koha can use.
    $res->{facets} = {
        author   => { terms => { field => "author__facet" } },
        subject  => { terms => { field => "subject__facet" } },
        itype    => { terms => { field => "itype__facet" } },
        location => { terms => { field => "location__facet" } },
        'su-geo' => { terms => { field => "su-geo__facet" } },
        se       => { terms => { field => "se__facet" } },
    };

    # An unset preference must not be compared with 'eq' as undef.
    # NOTE(review): the library facets are stored under 'aggregations' while
    # every other facet lives under 'facets' -- confirm the consumer reads
    # both keys.
    my $display_library_facets =
      C4::Context->preference('DisplayLibraryFacets') // '';
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'home' )
    {
        $res->{aggregations}{homebranch} =
          { terms => { field => "homebranch__facet" } };
    }
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'holding' )
    {
        $res->{aggregations}{holdingbranch} =
          { terms => { field => "holdingbranch__facet" } };
    }
    if ( my $ef = $options{expanded_facet} ) {
        $res->{facets}{$ef}{terms}{size} =
          C4::Context->preference('FacetMaxCount');
    }
    return $res;
}
138 =head2 build_browse_query
140 my $browse_query = $builder->build_browse_query($field, $query);
142 This performs a "starts with" style query on a particular field. The field
143 to be searched must have been indexed with an appropriate mapping as a
144 "phrase" subfield, which pretty much everything has.
146 =cut
148 # XXX this isn't really a browse query like we want in the end
# Build a "starts with" query against the '.phrase' subfield of a field.
#
# Arguments:
#   $field - field to browse; anything other than 'title' or 'author'
#            (including undef) falls back to 'title'.
#   $query - the prefix to look for; undef yields { query => '*' }.
#
# Returns a hashref with a match_phrase_prefix query and an ascending sort.
sub build_browse_query {
    my ( $self, $field, $query ) = @_;

    # Nothing to browse for: skip the preference lookup entirely.
    return { query => '*' } if !defined $query;

    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    # TODO this should come from Koha::SearchEngine::Elasticsearch
    my %field_whitelist = (
        title  => 1,
        author => 1,
    );
    $field = 'title'
      if !defined $field || !exists $field_whitelist{$field};
    my $sort = $self->_sort_field($field);
    my $res  = {
        query => {
            match_phrase_prefix => {
                "$field.phrase" => {
                    query     => $query,
                    operator  => 'or',
                    fuzziness => $fuzzy_enabled ? 'auto' : '0',
                }
            }
        },
        sort => [ { "$sort.phrase" => { order => "asc" } } ],
    };

    # Return explicitly rather than relying on the value of the assignment.
    return $res;
}
177 =head2 build_query_compat
179 my (
180 $error, $query, $simple_query, $query_cgi,
181 $query_desc, $limit, $limit_cgi, $limit_desc,
182 $stopwords_removed, $query_type
184 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
185 \@limits, \@sort_by, $scan, $lang );
187 This handles a search using the same api as L<C4::Search::buildQuery> does.
189 A very simple query will go in with C<$operands> set to ['query'], and
190 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
191 C<$query> set to something that can perform the search, C<$simple_query>
192 set to just the search term, C<$query_cgi> set to something that can
193 reproduce this search, and C<$query_desc> set to something else.
195 =cut
# Build a search using the same calling convention as C4::Search::buildQuery.
#
# Arguments are arrayrefs of operators, operands, indexes, limits and sort
# specifications, followed by $scan and $lang (both unused here) and an
# optional $params hashref whose 'expanded_facet' entry is passed on to
# build_query.
#
# Returns a ten-element list: undef (error), the query structure, the simple
# query, the query CGI string, the query description, the limit string, the
# limit CGI string, the limit description, undef and undef.
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang, $params )
      = @_;

    my @sort_params  = $self->_convert_sort_fields(@$sort_by);
    my @index_params = $self->_convert_index_fields(@$indexes);
    my $limits       = $self->_fix_limit_special_cases($orig_limits);

    # Merge the indexes in with the search terms and the operands so that
    # each search thing is a handy unit. The first operand can't have an
    # operator, so pad a private copy: the caller's array is left untouched
    # and repeated calls don't keep prepending entries to it.
    my @padded_operators = ( undef, @{ $operators || [] } );
    my @search_params;
    my $ea = each_array( @$operands, @padded_operators, @index_params );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        next if ( !defined($oand) || $oand eq '' );
        push @search_params, {
            operand  => $self->_clean_search_term($oand),    # the search terms
            operator => defined($otor) ? uc $otor : undef,   # AND and so on
            $index ? %$index : (),
        };
    }

    # The limit string is needed both inside the query and as a return
    # value, so work it out once.
    my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits) );

    # We build a string query from limits and the queries. An alternative
    # would be to pass them separately into build_query and let it build
    # them into a structured ES query itself. Maybe later, though that'd be
    # more robust.
    my $query_str = join( ' AND ',
        join( ' ', $self->_create_query_string(@search_params) ) || (),
        $limit || () );

    # If there's no query on the left, let's remove the junk left behind
    $query_str =~ s/^ AND //;
    my %options;
    $options{sort}           = \@sort_params;
    $options{expanded_facet} = $params->{expanded_facet};
    my $query = $self->build_query( $query_str, %options );

    # We roughly emulate the CGI parameters of the zebra query builder
    my $query_cgi;
    $query_cgi = 'idx=kw&q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
    my $simple_query;
    $simple_query = $operands->[0] if @$operands == 1;
    my $query_desc = $simple_query;
    my $limit_cgi  = ( $orig_limits and @$orig_limits )
      ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
      : '';
    my $limit_desc;
    $limit_desc = "$limit" if $limit;
    return (
        undef,  $query,     $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc,   undef,      undef
    );
}
255 =head2 build_authorities_query
257 my $query = $builder->build_authorities_query(\%search);
259 This takes a nice description of an authority search and turns it into a black-box
260 query that can then be passed to the appropriate searcher.
262 The search description is a hashref that looks something like:
265 searches => [
267 where => 'Heading', # search the main entry
268 operator => 'exact', # require an exact match
269 value => 'frogs', # the search string
272 where => '', # search all entries
273 operator => '', # default keyword, right truncation
274 value => 'pond',
277 sort => {
278 Heading => 'desc', # field name => sort order ('asc' or 'desc')
281 authtypecode => 'TOPIC_TERM',
284 =cut
# Turn an authority search description into an Elasticsearch query.
#
# Argument: a hashref with
#   searches - arrayref of { where, operator, value } hashrefs; an empty or
#              undef 'where' searches '_all';
#   sort     - optional hashref mapping a field name to a sort order.
#
# Returns a hashref with a 'query' key and, when a sort was given, a 'sort'
# key. The supplied hashref is not modified.
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;
    my @filter_parts;
    foreach my $s ( @{ $search->{searches} } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
        $wh = '_all' if !defined $wh || $wh eq '';
        $op //= '';    # a missing operator means "regular wordlist"
        if ( $op eq 'is' || $op eq '=' ) {

            # look for something that matches completely
            # note, '=' is about numerical vals. May need special handling.
            # We lowercase our search because the ES index lowercases its
            # values, and term searches don't get the search analyzer
            # applied to them.
            push @filter_parts, { term => { "$wh.phrase" => lc $val } };
        }
        elsif ( $op eq 'exact' ) {

            # left and right truncation, otherwise an exact phrase
            push @query_parts, { match_phrase => { $wh => $val } };
        }
        elsif ( $op eq 'start' ) {

            # startswith search
            push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
        }
        else {
            # regular wordlist stuff
            push @query_parts, { match => { $wh => $val } };
        }
    }

    # Merge the query and filter parts appropriately
    # 'should' behaves like 'or', if we want 'and', use 'must'
    my $query_part  = { bool => { should => \@query_parts } };
    my $filter_part = { bool => { should => \@filter_parts } };

    # We need to add '.phrase' to all the sort headings otherwise it'll sort
    # based on the tokenised form. This is built in a local hash: writing it
    # back into $search would make a second call with the same hashref add
    # '.phrase' again.
    my %sort;
    if ( exists $search->{sort} ) {
        my %phrase_sort;
        foreach my $k ( keys %{ $search->{sort} } ) {
            my $f = $self->_sort_field($k);
            $phrase_sort{"$f.phrase"} = $search->{sort}{$k};
        }
        %sort = ( sort => [ \%phrase_sort ] );
    }

    my $query;
    if (@filter_parts) {
        $query =
          { query =>
              { filtered => { filter => $filter_part, query => $query_part } }
          };
    }
    else {
        $query = { query => $query_part };
    }
    $query = { %$query, %sort };
    return $query;
}
355 =head2 build_authorities_query_compat
357 my ($query) =
358 $builder->build_authorities_query_compat( \@marclist, \@and_or,
359 \@excluding, \@operator, \@value, $authtypecode, $orderby );
361 This builds a query for searching for authorities, in the style of
362 L<C4::AuthoritiesMarc::SearchAuthorities>.
364 Arguments:
366 =over 4
368 =item marclist
370 An arrayref containing where the particular term should be searched for.
371 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
372 thesaurus. If left blank, any field is used.
374 =item and_or
376 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
378 =item excluding
380 Also ignored.
382 =item operator
384 What form of search to do. Options are: is (phrase, no truncation, whole field
385 must match), = (number exact match), exact (phrase, but with left and right
386 truncation). If left blank, then word list, right truncated, anywhere is used.
388 =item value
390 The actual user-provided string value to search for.
392 =item authtypecode
394 The authority type code to search within. If blank, then all will be searched.
396 =item orderby
398 The order to sort the results by. Options are Relevance, HeadingAsc,
399 HeadingDsc, AuthidAsc, AuthidDsc.
401 =back
403 marclist, operator, and value must be the same length, and the values at
404 index /i/ all relate to each other.
406 This returns a query, which is a black box object that can be passed to the
407 appropriate search object.
409 =cut
# Build an authority query from the old many-argument calling convention.
#
# $marclist, $operator and $value are parallel arrayrefs; $and_or and
# $excluding are accepted but not used. $orderby selects the sort: values
# starting with "Heading" sort on 'Heading', values starting with "Auth" sort
# on 'Local-Number', anything else (including undef) adds no sort. A value
# ending in "Asc" sorts ascending, otherwise descending.
#
# Confesses when $marclist contains an unknown field name. Returns whatever
# build_authorities_query returns.
sub build_authorities_query_compat {
    my ( $self, $marclist, $and_or, $excluding, $operator, $value,
        $authtypecode, $orderby )
      = @_;

    # This turns the old-style many-options argument form into a more
    # extensible hash form that is understood by L<build_authorities_query>.
    my %koha_to_index_name = (
        mainmainentry   => 'Heading-Main',
        mainentry       => 'Heading',
        match           => 'Match',
        'match-heading' => 'Match-heading',
        'see-from'      => 'Match-heading-see-from',
        thesaurus       => 'Subject-heading-thesaurus',
        any             => '',
    );

    # Make sure everything exists
    foreach my $m (@$marclist) {
        confess "Invalid marclist field provided: $m"
          unless exists $koha_to_index_name{$m};
    }

    my @searches;
    my $search_count = @$value;
    for my $i ( 0 .. $search_count - 1 ) {
        push @searches,
          {
            where    => $koha_to_index_name{ $marclist->[$i] },
            operator => $operator->[$i],
            value    => $value->[$i],
          };
    }

    # A missing order-by means "no explicit sort"; normalise it so that the
    # pattern matches below never see undef.
    $orderby //= '';
    my %sort;
    my $sort_field =
        ( $orderby =~ /^Heading/ ) ? 'Heading'
      : ( $orderby =~ /^Auth/ )    ? 'Local-Number'
      :                              undef;
    if ($sort_field) {
        my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
        %sort = ( $sort_field => $sort_order, );
    }
    my %search = (
        searches     => \@searches,
        authtypecode => $authtypecode,
    );
    $search{sort} = \%sort if %sort;
    my $query = $self->build_authorities_query( \%search );
    return $query;
}
461 =head2 _convert_sort_fields
463 my @sort_params = _convert_sort_fields(@sort_by)
465 Converts the zebra-style sort index information into elasticsearch-style.
467 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
468 something that can be sent to L<build_query>.
470 =cut
# Convert zebra-style sort specifications ("field_direction", for example
# "title_az") into a list of { field => ..., direction => ... } hashrefs.
#
# Specifications whose field is unknown, or which map to no field (such as
# "relevance"), are dropped. An unknown or missing direction gives a
# direction of undef.
sub _convert_sort_fields {
    my ( $self, @sort_by ) = @_;

    # Turn the sorting into something we care about.
    my %sort_field_convert = (
        acqdate     => 'acqdate',
        author      => 'author',
        call_number => 'callnum',
        popularity  => 'issues',
        relevance   => undef,       # default
        title       => 'title',
        pubdate     => 'pubdate',
    );
    my %sort_order_convert =
      ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );

    my @converted;
    for my $spec (@sort_by) {
        next unless defined $spec;

        # Split on the LAST underscore only: the field name may itself
        # contain one ("call_number_asc" is call_number / asc).
        my ( $f, $d ) =
          $spec =~ /\A(.+)_([^_]*)\z/ ? ( $1, $2 ) : ( $spec, undef );

        # Drop anything we don't know about.
        my $field = $sort_field_convert{$f};
        next unless $field;

        push @converted,
          {
            field     => $field,
            direction => defined $d ? $sort_order_convert{$d} : undef,
          };
    }
    return @converted;
}
498 =head2 _convert_index_fields
500 my @index_params = $self->_convert_index_fields(@indexes);
502 Converts zebra-style search index notation into elasticsearch-style.
504 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
505 and it returns something that can be sent to L<build_query>.
507 B<TODO>: this will pull from the elasticsearch mappings table to figure out
508 types.
510 =cut
# Lookup table from zebra-style index abbreviations to the field names used
# in the Elasticsearch query string. It is a package variable, read by
# _convert_index_fields and _convert_index_strings_freeform.
our %index_field_convert = (
    kw      => '_all',
    ti      => 'title',
    au      => 'author',
    su      => 'subject',
    nb      => 'isbn',
    se      => 'title-series',
    callnum => 'callnum',
    itype   => 'itype',
    ln      => 'ln',
    branch  => 'homebranch',
    fic     => 'lf',
    mus     => 'rtype',
    aud     => 'ta',
    hi      => 'Host-Item-Number',
);
# Convert zebra-style index names ("ti", "su,phr", "mc-itype", ...) into a
# list of { field => ..., type => ... } hashrefs.
#
# The part before the first comma is looked up in %index_field_convert; the
# part after it selects the type ('phr' => 'phrase', 'rtrn' =>
# 'right-truncate', anything else => undef). Indexes that do not convert,
# including empty or undef ones, are dropped from the result.
sub _convert_index_fields {
    my ( $self, @indexes ) = @_;

    my %index_type_convert =
      ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );

    my @converted;
    for my $index (@indexes) {
        my ( $f, $t ) = split /,/, $index // '';

        # An empty index splits into nothing at all; there is no field to
        # look up, so skip it rather than matching against undef.
        next unless defined $f;

        # If a field starts with mc- we save it as it's used (and removed)
        # later when joining things, to indicate we make it an 'OR' join.
        my $mc = $f =~ s/^mc-// ? 'mc-' : '';

        # Drop anything that doesn't convert.
        my $field = $index_field_convert{$f};
        next unless $field;

        push @converted,
          {
            field => $mc . $field,
            type  => $index_type_convert{ $t // '__default' },
          };
    }
    return @converted;
}
555 =head2 _convert_index_strings
557 my @searches = $self->_convert_index_strings(@searches);
559 Similar to L<_convert_index_fields>, this takes strings of the form
560 B<field:search term> and rewrites the field from zebra-style to
561 elasticsearch-style. Anything it doesn't understand is returned verbatim.
563 =cut
# Rewrite strings of the form "field:search term" so that the field uses the
# Elasticsearch name and the term is adjusted for the index type. Empty
# strings are skipped; strings without a recognisable "field:" prefix, or
# whose field does not convert, are passed through verbatim.
sub _convert_index_strings {
    my ( $self, @searches ) = @_;

    return map {
        my $original  = $_;
        my $rewritten = $original;
        if ( my ( $index, $rest ) = $original =~ /^\s*([\w,-]*?):(.*)/ ) {

            # Only the first converted entry is of interest.
            my ($es) = $self->_convert_index_fields($index);
            $rewritten = $es->{field} . ":"
              . $self->_modify_string_by_type( %$es, operand => $rest )
              if defined $es;
        }
        $rewritten;
    } grep { $_ ne '' } @searches;
}
586 =head2 _convert_index_strings_freeform
588 my $search = $self->_convert_index_strings_freeform($search);
590 This is similar to L<_convert_index_strings>, however it'll search out the
591 things to change within the string. So it can handle strings such as
592 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
594 If there is something of the form "su,complete-subfield" or something, the
595 second part is stripped off as we can't yet handle that. Making it work
596 will have to wait for a real query parser.
598 =cut
# Replace every zebra-style index prefix found anywhere in a free-form
# search string ("(su:foo) AND (su:bar)") with its Elasticsearch field name.
# A ",qualifier" between the index and the colon is stripped. Returns the
# rewritten string.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    # Iterate over keys() rather than each(): each() shares one iterator per
    # hash, and this hash is a package variable. Sorting keeps the
    # replacement order the same on every call.
    for my $zeb ( sort keys %index_field_convert ) {
        my $es = $index_field_convert{$zeb};

        # \Q..\E so that a key is always matched literally.
        $search =~ s/\b\Q$zeb\E(?:,[\w-]*)?:/$es:/g;
    }
    return $search;
}
608 =head2 _modify_string_by_type
610 my $str = $self->_modify_string_by_type(%index_field);
612 If you have a search term (operand) and a type (phrase, right-truncated), this
613 will convert the string to have the function in lucene search terms, e.g.
614 wrapping quotes around it.
616 =cut
# Adjust a search term according to its index type.
#
# Takes a hash with 'operand' (the term) and 'type'. A type of
# 'right-truncate' appends '*', a type of 'phrase' wraps the term in double
# quotes, and any other type leaves it alone. An undef or empty operand is
# returned unchanged.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type = $idx{type} || '';
    my $str  = $idx{operand};

    # Empty or undef, we can't use it. Test definedness and length rather
    # than truth so that the operand "0" is still processed.
    return $str unless defined $str && length $str;

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase';
    return $str;
}
630 =head2 _join_queries
632 my $query_str = $self->_join_queries(@query_parts);
634 This takes a list of query parts, that might be search terms on their own, or
635 booleaned together, or specifying fields, or whatever, wraps them in
636 parentheses, and ANDs them all together. Suitable for feeding to the ES
637 query string query.
639 Note: doesn't AND them together if they specify an index that starts with "mc"
640 as that was a special case in the original code for dealing with multiple
641 choice options (you can't search for something that has an itype of A and
642 an itype of B otherwise.)
644 =cut
# Combine query parts into a single query string.
#
# Undefined and empty parts are ignored. Parts starting with "mc-" have that
# prefix removed and are ORed together inside one parenthesised group; all
# other parts are parenthesised and ANDed. A single remaining part is
# returned as it is, without parentheses. Returns nothing when there are no
# usable parts.
sub _join_queries {
    my ( $self, @parts ) = @_;

    # Sort the usable parts into ordinary ones and multiple-choice ones.
    my ( @plain, @multi );
    for my $part (@parts) {
        next unless defined $part && $part ne '';
        if ( $part =~ /^mc-/ ) {
            push @multi, $part =~ s/^mc-//r;
        }
        else {
            push @plain, $part;
        }
    }

    my $total = @plain + @multi;
    return unless $total;
    return ( @plain ? $plain[0] : $multi[0] ) if $total == 1;

    # Each group only contributes a clause when it has members.
    my @clauses;
    push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
    push @clauses, '(' . join( ' OR ', map { "($_)" } @multi ) . ')'
      if @multi;
    return join( ' AND ', @clauses );
}
666 =head2 _make_phrases
668 my @phrased_queries = $self->_make_phrases(@query_parts);
670 This takes the supplied queries and forces them to be phrases by wrapping
671 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
672 the quotes outside of them if they're there.
674 =cut
# Wrap the term of each "field:term" query part in double quotes, leaving the
# field prefix outside the quotes. Parts that do not start with a "word:"
# prefix are returned unchanged.
sub _make_phrases {
    my ( $self, @parts ) = @_;

    my @phrased;
    for my $part (@parts) {
        push @phrased, $part =~ s/^\s*(\w*?:)(.*)$/$1"$2"/r;
    }
    return @phrased;
}
681 =head2 _create_query_string
683 my @query_strings = $self->_create_query_string(@queries);
685 Given a list of hashrefs, it will turn them into a lucene-style query string.
686 The hash should contain field, type (both for the indexes), operator, and
687 operand.
689 =cut
# Turn a list of query hashrefs into lucene-style query fragments.
#
# Each hashref may contain operator, field, type and operand. The result for
# each is "OPERATOR (field:operand)", where the operator and the field are
# left out when they are not set and the operand has been passed through
# _modify_string_by_type.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @fragments;
    for my $q (@queries) {
        my $prefix  = $q->{operator} ? $q->{operator} . ' ' : '';
        my $scope   = $q->{field}    ? $q->{field} . ':'    : '';
        my $operand = $self->_modify_string_by_type(%$q);
        push @fragments, "$prefix($scope$operand)";
    }
    return @fragments;
}
703 =head2 _clean_search_term
705 my $term = $self->_clean_search_term($term);
707 This cleans a search term by removing any funny characters that may upset
708 ES and give us an error. It also calls L<_convert_index_strings_freeform>
709 to ensure those parts are correct.
711 =cut
# Tidy a search term before it is put into a query string: every '=' becomes
# ':', index prefixes are converted by _convert_index_strings_freeform, and
# curly braces are replaced with double quotes.
sub _clean_search_term {
    my ( $self, $term ) = @_;

    # Some hardcoded searches (like with authorities) produce things like
    # 'an=123', when it ought to be 'an:123' for our purposes.
    ( my $with_colons = $term ) =~ s/=/:/g;

    my $converted = $self->_convert_index_strings_freeform($with_colons);
    $converted =~ s![{}]!"!g;
    return $converted;
}
724 =head2 _fix_limit_special_cases
726 my $limits = $self->_fix_limit_special_cases($limits);
728 This converts any special cases that the limit specifications have into things
729 that are more readily processable by the rest of the code.
731 The argument should be an arrayref, and it'll return an arrayref.
733 =cut
# Rewrite the special-case limit strings into ordinary "field:value" limits.
#
#   "yr,st-numeric,ge=A and yr,st-numeric,le=B"  =>  "copydate:[A TO B]"
#   "yr,st-numeric=A"                            =>  "copydate:A"
#   "available"                                  =>  "onloan:0"
#
# A limit that starts like one of the year forms but does not match it
# completely is dropped; every other limit is kept as it is. Takes an
# arrayref and returns a new arrayref.
sub _fix_limit_special_cases {
    my ( $self, $limits ) = @_;

    my @fixed;
    foreach my $limit (@$limits) {

        # The year range form is set up by opac-search.pl
        if ( $limit =~ /^yr,st-numeric,ge=/ ) {
            push @fixed, "copydate:[$1 TO $2]"
              if $limit =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/;
            next;
        }
        if ( $limit =~ /^yr,st-numeric=/ ) {
            push @fixed, "copydate:$1"
              if $limit =~ /^yr,st-numeric=(.*)$/;
            next;
        }
        push @fixed, ( $limit =~ /^available$/ ? 'onloan:0' : $limit );
    }
    return \@fixed;
}
763 =head2 _sort_field
765 my $field = $self->_sort_field($field);
767 Given a field name, this works out what the actual name of the version to sort
768 on should be. Often it's the same, sometimes it involves sticking "__sort" on
769 the end. Maybe it'll be something else in the future, who knows?
771 =cut
# Work out the name of the field to sort on: when the field has a true entry
# in the hashref returned by sort_fields(), "__sort" is appended to its name;
# otherwise the name is returned as given.
sub _sort_field {
    my ($self, $f) = @_;
    return $self->sort_fields()->{$f} ? $f . '__sort' : $f;
}