Bug 23719: Allow searching specific fields for matching authorities in ES
[koha.git] / Koha / SearchEngine / Elasticsearch / QueryBuilder.pm
blobdaf7e65688af9410a72a5a96d2a8f737def694cf
1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 =head1 NAME
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
25 =head1 DESCRIPTION
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
30 =head1 SYNOPSIS
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
38 =head1 METHODS
40 =cut
42 use base qw(Koha::SearchEngine::Elasticsearch);
43 use Carp;
44 use JSON;
45 use List::MoreUtils qw/ each_array /;
46 use Modern::Perl;
47 use URI::Escape;
49 use C4::Context;
50 use Koha::Exceptions;
51 use Koha::Caches;
53 =head2 build_query
55 my $simple_query = $builder->build_query("hello", %options)
57 This will build a query that can be issued to elasticsearch from the provided
58 string input. This expects a lucene style search form (see
59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
60 for details.)
62 It'll make an attempt to respect the various query options.
64 Additional options can be provided with the C<%options> hash.
66 =over 4
68 =item sort
70 This should be an arrayref of hashrefs, each containing a C<field> and an
71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
74 =back
76 =cut
# Build the Elasticsearch request body for a lucene-style query string.
#
#   $query   - the query string; an undefined value searches everything ('*')
#   %options - sort (arrayref of { field, direction } hashrefs), is_opac,
#              weighted_fields and whole_record
#
# Returns a hashref holding 'query', 'aggregations' and, when at least one sort
# entry was supplied, 'sort'. Dies on a sort direction other than 'asc'/'desc'.
sub build_query {
    my ( $self, $query, %options ) = @_;

    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    $query = '*' unless defined $query;

    my $res;
    my $fields = $self->_search_fields({
        is_opac         => $options{is_opac},
        weighted_fields => $options{weighted_fields},
    });
    if ( $options{whole_record} ) {
        push @$fields, 'marc_data_array.*';
    }
    $res->{query} = {
        query_string => {
            query            => $query,
            fuzziness        => $fuzzy_enabled ? 'auto' : '0',
            default_operator => 'AND',
            fields           => $fields,
            lenient          => JSON::true,
            analyze_wildcard => JSON::true,
        }
    };

    if ( $options{sort} ) {
        foreach my $sort ( @{ $options{sort} } ) {
            my ( $f, $d ) = @$sort{qw/ field direction /};
            die "Invalid sort direction, $d"
              if $d && ( $d ne 'asc' && $d ne 'desc' );
            $d = 'asc' unless $d;

            $f = $self->_sort_field($f);
            push @{ $res->{sort} }, { $f => { order => $d } };
        }
    }

    # See _convert_facets in Search.pm for how these get turned into
    # things that Koha can use.
    my $size = C4::Context->preference('FacetMaxCount');
    $res->{aggregations} = {
        author         => { terms => { field => "author__facet",       size => $size } },
        subject        => { terms => { field => "subject__facet",      size => $size } },
        itype          => { terms => { field => "itype__facet",        size => $size } },
        location       => { terms => { field => "location__facet",     size => $size } },
        'su-geo'       => { terms => { field => "su-geo__facet",       size => $size } },
        'title-series' => { terms => { field => "title-series__facet", size => $size } },
        ccode          => { terms => { field => "ccode__facet",        size => $size } },
        ln             => { terms => { field => "ln__facet",           size => $size } },
    };

    # An unset preference is treated as "no library facets" instead of
    # producing uninitialized-value warnings in the comparisons below.
    my $display_library_facets =
      C4::Context->preference('DisplayLibraryFacets') // '';
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'home' )
    {
        $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
    }
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'holding' )
    {
        $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
    }
    return $res;
}
144 =head2 build_query_compat
146 my (
147 $error, $query, $simple_query, $query_cgi,
148 $query_desc, $limit, $limit_cgi, $limit_desc,
149 $stopwords_removed, $query_type
151 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
152 \@limits, \@sort_by, $scan, $lang, $params );
154 This handles a search using the same api as L<C4::Search::buildQuery> does.
156 A very simple query will go in with C<$operands> set to ['query'], and
157 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
158 C<$query> set to something that can perform the search, C<$simple_query>
159 set to just the search term, C<$query_cgi> set to something that can
160 reproduce this search, and C<$query_desc> set to something else.
162 =cut
# Zebra-compatible front end to build_query / _build_scan_query.
#
# Arguments are arrayrefs of operators, operands, indexes, limits and sort
# keys, followed by the scan flag, the language (unused here) and a params
# hashref (suppress, is_opac, weighted_fields, whole_record are read).
#
# Returns the ten-element list documented in the POD above; the error,
# stopwords_removed and query_type slots are always undef.
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang, $params )
      = @_;

    my $query;
    my $query_str              = '';
    my $search_param_query_str = '';
    my $limits                 = [];
    if ($scan) {
        ( $query, $query_str ) = $self->_build_scan_query( $operands, $indexes );
        $search_param_query_str = $query_str;
    }
    else {
        my @sort_params  = $self->_convert_sort_fields(@$sort_by);
        my @index_params = $self->_convert_index_fields(@$indexes);

        # Assign to the outer $limits (do not redeclare it): the converted
        # limits are needed again below to build $limit and $limit_desc.
        $limits = $self->_fix_limit_special_cases($orig_limits);
        if ( $params->{suppress} ) { push @$limits, "suppress:0"; }

        # Merge the indexes in with the search terms and the operands so that
        # each search thing is a handy unit. The first operand can't have an
        # operator, so pad a private copy rather than mutating the caller's
        # array.
        my @padded_operators = ( undef, @{ $operators // [] } );
        my @search_params;
        my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
        my $ea = each_array( @$operands, @padded_operators, @index_params );
        while ( my ( $oand, $otor, $index ) = $ea->() ) {
            next if ( !defined($oand) || $oand eq '' );
            $oand = $self->_clean_search_term($oand);
            $oand = $self->_truncate_terms($oand) if ($truncate);
            push @search_params, {
                operand  => $oand,                               # the search terms
                operator => defined($otor) ? uc $otor : undef,   # AND and so on
                $index ? %$index : (),
            };
        }

        # We build a string query from limits and the queries. An alternative
        # would be to pass them separately into build_query and let it build
        # them into a structured ES query itself. Maybe later, though that'd be
        # more robust.
        $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
        $query_str = join( ' AND ',
            $search_param_query_str || (),
            $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );

        # If there's no query on the left, let's remove the junk left behind
        $query_str =~ s/^ AND //;
        my %options;
        $options{sort}            = \@sort_params;
        $options{is_opac}         = $params->{is_opac};
        $options{weighted_fields} = $params->{weighted_fields};
        $options{whole_record}    = $params->{whole_record};
        $query = $self->build_query( $query_str, %options );
    }

    # We roughly emulate the CGI parameters of the zebra query builder.
    # $operators is used as supplied: operator N belongs to operand N.
    my $query_cgi = '';
    my $ea = each_array( @$operands, @$operators, @$indexes );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        $query_cgi .= '&' if $query_cgi;
        $query_cgi .= 'idx=' . uri_escape_utf8( $index // '' ) . '&q=' . uri_escape_utf8($oand);
        $query_cgi .= '&op=' . uri_escape_utf8($otor) if $otor;
    }
    $query_cgi .= '&scan=1' if ($scan);

    my $simple_query;
    $simple_query = $operands->[0] if @$operands == 1;
    my $query_desc;
    if ($simple_query) {
        $query_desc = $simple_query;
    }
    else {
        $query_desc = $search_param_query_str;
    }
    my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits) );
    my $limit_cgi = ( $orig_limits and @$orig_limits )
      ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
      : '';
    my $limit_desc;
    $limit_desc = "$limit" if $limit;

    return (
        undef,  $query,     $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc,   undef,      undef
    );
}
249 =head2 build_authorities_query
251 my $query = $builder->build_authorities_query(\%search);
253 This takes a nice description of an authority search and turns it into a black-box
254 query that can then be passed to the appropriate searcher.
256 The search description is a hashref that looks something like:
259 searches => [
261 where => 'Heading', # search the main entry
262 operator => 'exact', # require an exact match
263 value => 'frogs', # the search string
266 where => '', # search all entries
267 operator => '', # default keyword, right truncation
268 value => 'pond',
271 sort => {
272 field => 'Heading',
273 order => 'desc',
275 authtypecode => 'TOPIC_TERM',
278 =cut
# Turn an authority search description (see the POD above) into an
# Elasticsearch query hashref.
#
#   $search->{searches}     - arrayref of { where, operator, value } hashrefs
#   $search->{authtypecode} - optional; adds a filter on "authtype.raw"
#   $search->{sort}         - optional; passed through as the single sort entry
#
# Every search becomes one clause of a bool/must query.
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;

    foreach my $s ( @{ $search->{searches} } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};

        # A missing operator means the default keyword search; normalise it so
        # the comparisons below do not warn about an undefined value.
        $op //= '';

        if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
            if ($wh) {
                # Match the whole field, case insensitive, UTF normalized.
                push @query_parts, { term => { "$wh.ci_raw" => $val } };
            }
            else {
                # Match the whole field for all searchable fields, case insensitive,
                # UTF normalized.
                # Given that field data is "The quick brown fox"
                # "The quick brown fox" and "the quick brown fox" will match
                # but not "quick brown fox".
                push @query_parts, {
                    multi_match => {
                        query  => $val,
                        fields => $self->_search_fields({ subfield => 'ci_raw' }),
                    }
                };
            }
        }
        elsif ( $op eq 'start' ) {
            # Match the prefix within a field for all searchable fields.
            # Given that field data is "The quick brown fox"
            # "The quick bro" will match, but not "quick bro"

            # There does not seem to be a multi-field prefix query,
            # so we need to create one
            if ($wh) {
                # Match prefix of the field.
                push @query_parts, { prefix => { "$wh.ci_raw" => $val } };
            }
            else {
                my @prefix_queries;
                foreach my $field ( @{ $self->_search_fields() } ) {
                    push @prefix_queries, {
                        prefix => { "$field.ci_raw" => $val }
                    };
                }
                push @query_parts, {
                    'bool' => {
                        'should'               => \@prefix_queries,
                        'minimum_should_match' => 1
                    }
                };
            }
        }
        else {
            # Query all searchable fields.
            # Given that field data is "The quick brown fox"
            # a search containing any of the words will match, regardless
            # of order.

            my @tokens = $self->_split_query($val);
            foreach my $token (@tokens) {
                $token = $self->_truncate_terms(
                    $self->_clean_search_term($token)
                );
            }
            my $query = $self->_join_queries(@tokens);

            if ($wh) {
                push @query_parts, { query_string => {
                    default_field    => $wh,
                    analyze_wildcard => JSON::true,
                    query            => $query
                } };
            }
            else {
                push @query_parts, {
                    query_string => {
                        analyze_wildcard => JSON::true,
                        query            => $query,
                        fields           => $self->_search_fields(),
                    }
                };
            }
        }
    }

    # Merge the query parts appropriately
    # 'should' behaves like 'or'
    # 'must' behaves like 'and'
    # Zebra behaviour seems to match must so using that here
    my $elastic_query = {};
    $elastic_query->{bool}->{must} = \@query_parts;

    # Filter by authtypecode if set
    if ( $search->{authtypecode} ) {
        $elastic_query->{bool}->{filter} = {
            term => {
                "authtype.raw" => $search->{authtypecode}
            }
        };
    }

    my $query = {
        query => $elastic_query
    };

    # Add the sort stuff
    $query->{sort} = [ $search->{sort} ] if exists $search->{sort};

    return $query;
}
392 =head2 build_authorities_query_compat
394 my ($query) =
395 $builder->build_authorities_query_compat( \@marclist, \@and_or,
396 \@excluding, \@operator, \@value, $authtypecode, $orderby );
398 This builds a query for searching for authorities, in the style of
399 L<C4::AuthoritiesMarc::SearchAuthorities>.
401 Arguments:
403 =over 4
405 =item marclist
407 An arrayref containing where the particular term should be searched for.
408 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
409 thesaurus. If left blank, any field is used.
411 =item and_or
413 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
415 =item excluding
417 Also ignored.
419 =item operator
421 What form of search to do. Options are: is (phrase, no truncation, whole field
422 must match), = (number exact match), exact (phrase, no truncation, whole field
423 must match). If left blank, then word list, right truncated, anywhere is used.
425 =item value
427 The actual user-provided string value to search for.
429 =item authtypecode
431 The authority type code to search within. If blank, then all will be searched.
433 =item orderby
435 The order to sort the results by. Options are Relevance, HeadingAsc,
436 HeadingDsc, AuthidAsc, AuthidDsc.
438 =back
440 marclist, operator, and value must be the same length, and the values at
441 index /i/ all relate to each other.
443 This returns a query, which is a black box object that can be passed to the
444 appropriate search object.
446 =cut
# Maps the "marclist" names used by the legacy authority search API to the
# Elasticsearch field searched; an empty string means "all fields".
our $koha_to_index_name = {
    mainmainentry   => 'heading-main',
    mainentry       => 'heading',
    match           => 'match',
    'match-heading' => 'match-heading',
    'see-from'      => 'match-heading-see-from',
    thesaurus       => 'subject-heading-thesaurus',
    any             => '',
    all             => ''
};

# Convert the old-style positional authority search arguments into the hash
# form understood by build_authorities_query, and return its result.
#
# $and_or and $excluding are accepted for API compatibility and ignored.
# Entries of $value that are undefined or empty are skipped; "0" is kept.
sub build_authorities_query_compat {
    my ( $self, $marclist, $and_or, $excluding, $operator, $value,
        $authtypecode, $orderby )
      = @_;

    # This turns the old-style many-options argument form into a more
    # extensible hash form that is understood by L<build_authorities_query>.
    my @searches;

    # Convert to lower case; undefined entries are treated as empty strings
    $marclist = [ map { lc( $_ // '' ) } @{ $marclist // [] } ];
    $orderby  = lc( $orderby // '' );

    # Translate known names, pass unknown ones through unchanged
    my @indexes = map {
        exists $koha_to_index_name->{$_} ? $koha_to_index_name->{$_} : $_
    } @$marclist;

    my @values = @{ $value // [] };
    for my $i ( 0 .. $#values ) {

        # Clean empty form values, ES doesn't like undefined searches.
        # Test for emptiness rather than truth so that "0" is searchable.
        next unless defined $values[$i] && $values[$i] ne '';
        push @searches,
          {
            where    => $indexes[$i],
            operator => $operator->[$i],
            value    => $values[$i],
          };
    }

    my %sort;
    my $sort_field =
        ( $orderby =~ /^heading/ ) ? 'heading__sort'
      : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
      :                              undef;
    if ($sort_field) {
        my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
        %sort = ( $sort_field => $sort_order, );
    }
    my %search = (
        searches     => \@searches,
        authtypecode => $authtypecode,
    );
    $search{sort} = \%sort if %sort;
    my $query = $self->build_authorities_query( \%search );
    return $query;
}
505 =head2 _build_scan_query
507 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
509 This will build an aggregation scan query that can be issued to elasticsearch from
510 the provided string input.
512 =cut
# Short index codes accepted by the scan search and the field each maps to.
our %scan_field_convert = (
    'ti' => 'title',
    'au' => 'author',
    'su' => 'subject',
    'se' => 'title-series',
    'pb' => 'publisher',
);

# Build a terms-aggregation "scan" over one facet field.
#
# Only the first operand and the first index are used. The index defaults to
# 'subject' when none (or an empty one) is given; anything after a comma in
# the index is ignored.
#
# Returns ( $request_hashref, $term ).
sub _build_scan_query {
    my ( $self, $operands, $indexes ) = @_;

    my $term = @$operands ? ( $operands->[0] // '' ) : '';

    # Fall back to 'subject' for a missing or empty index, otherwise the
    # aggregation would be built with an undefined name and a bare
    # '__facet' field.
    my ($f) = split( /,/, ( @$indexes ? ( $indexes->[0] // '' ) : '' ) );
    $f = 'subject' unless defined $f && $f ne '';
    my $index = $scan_field_convert{$f} || $f;

    my $res;
    $res->{query} = {
        query_string => {
            query => '*'
        }
    };
    $res->{aggregations} = {
        $index => {
            terms => {
                field   => $index . '__facet',
                order   => { '_term' => 'asc' },
                include => $self->_create_regex_filter( $self->_clean_search_term($term) ) . '.*'
            }
        }
    };
    return ( $res, $term );
}
549 =head2 _create_regex_filter
551 my $filter = $builder->_create_regex_filter('term')
553 This will create a regex filter that can be used with an aggregation query.
555 =cut
# Turn a term into a regex fragment: the term is escaped with quotemeta, and
# every character whose lower- and upper-case forms differ becomes a
# two-alternative character class, e.g. 'a' => '[aA]'.
sub _create_regex_filter {
    my ( $self, $term ) = @_;

    my @pieces = map {
        my ( $lower, $upper ) = ( lc($_), uc($_) );
        $lower ne $upper ? '[' . $lower . $upper . ']' : $_;
    } split( //, quotemeta($term) );

    return join( '', @pieces );
}
569 =head2 _convert_sort_fields
571 my @sort_params = _convert_sort_fields(@sort_by)
573 Converts the zebra-style sort index information into elasticsearch-style.
575 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
576 something that can be sent to L<build_query>.
578 =cut
# Convert zebra-style sort keys ("<field>_<order>", e.g. 'pubdate_dsc') into
# { field => ..., direction => ... } hashrefs.
#
# Keys that do not have the "<field>_<order>" shape (such as a bare
# 'relevance'), or whose field is unknown or maps to the default relevance
# ordering, are dropped. An unknown order yields an undefined direction.
sub _convert_sort_fields {
    my ( $self, @sort_by ) = @_;

    # Turn the sorting into something we care about.
    my %sort_field_convert = (
        acqdate     => 'date-of-acquisition',
        author      => 'author',
        call_number => 'local-classification',
        popularity  => 'issues',
        relevance   => undef,                    # default
        title       => 'title',
        pubdate     => 'date-of-publication',
    );
    my %sort_order_convert =
      ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );

    # Convert the fields and orders, drop anything we don't know about.
    my @converted;
    for my $sort (@sort_by) {
        next unless defined $sort;

        # Skip keys without an underscore instead of looking up undef.
        my ( $f, $d ) = $sort =~ /(.+)_(.+)/ or next;

        my $field = $sort_field_convert{$f} or next;
        push @converted,
          {
            field     => $field,
            direction => $sort_order_convert{$d}
          };
    }
    return @converted;
}
606 =head2 _convert_index_fields
608 my @index_params = $self->_convert_index_fields(@indexes);
610 Converts zebra-style search index notation into elasticsearch-style.
612 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
613 and it returns something that can be sent to L<build_query>.
615 B<TODO>: this will pull from the elasticsearch mappings table to figure out
616 types.
618 =cut
# Zebra-style index aliases and the Elasticsearch field each one maps to.
# An empty string means "no specific field" (keyword search).
our %index_field_convert = (
    'kw' => '',
    'ab' => 'abstract',
    'au' => 'author',
    'lcn' => 'local-classification',
    'callnum' => 'local-classification',
    'record-type' => 'rtype',
    'mc-rtype' => 'rtype',
    'mus' => 'rtype',
    'lc-card' => 'lc-card-number',
    'sn' => 'local-number',
    'yr' => 'date-of-publication',
    'pubdate' => 'date-of-publication',
    'acqdate' => 'date-of-acquisition',
    'date/time-last-modified' => 'date-time-last-modified',
    'dtlm' => 'date-time-last-modified',
    'diss' => 'dissertation-information',
    'nb' => 'isbn',
    'ns' => 'issn',
    'music-number' => 'identifier-publisher-for-music',
    'number-music-publisher' => 'identifier-publisher-for-music',
    'music' => 'identifier-publisher-for-music',
    'ident' => 'identifier-standard',
    'cpn' => 'corporate-name',
    'cfn' => 'conference-name',
    'pn' => 'personal-name',
    'pb' => 'publisher',
    'pv' => 'provider',
    'nt' => 'note',
    'notes' => 'note',
    'rcn' => 'record-control-number',
    'su' => 'subject',
    'su-to' => 'subject',
    #'su-geo' => 'subject',
    'su-ut' => 'subject',
    'ti' => 'title',
    'se' => 'title-series',
    'ut' => 'title-uniform',
    'an' => 'koha-auth-number',
    'authority-number' => 'koha-auth-number',
    'at' => 'authtype',
    'he' => 'heading',
    'rank' => 'relevance',
    'phr' => 'st-phrase',
    'wrdl' => 'st-word-list',
    'rt' => 'right-truncation',
    'rtrn' => 'right-truncation',
    'ltrn' => 'left-truncation',
    'rltrn' => 'left-and-right',
    'mc-itemtype' => 'itemtype',
    'mc-ccode' => 'ccode',
    'branch' => 'homebranch',
    'mc-loc' => 'location',
    'stocknumber' => 'number-local-acquisition',
    'inv' => 'number-local-acquisition',
    'bc' => 'barcode',
    'mc-itype' => 'itype',
    'aub' => 'author-personal-bibliography',
    'auo' => 'author-in-order',
    'ff8-22' => 'ta',
    'aud' => 'ta',
    'audience' => 'ta',
    'frequency-code' => 'ff8-18',
    'illustration-code' => 'ff8-18-21',
    'regularity-code' => 'ff8-19',
    'type-of-serial' => 'ff8-21',
    'format' => 'ff8-23',
    'conference-code' => 'ff8-29',
    'festschrift-indicator' => 'ff8-30',
    'index-indicator' => 'ff8-31',
    'fiction' => 'lf',
    'fic' => 'lf',
    'literature-code' => 'lf',
    'biography' => 'bio',
    'ff8-34' => 'bio',
    'biography-code' => 'bio',
    'l-format' => 'ff7-01-02',
    'lex' => 'lexile-number',
    'hi' => 'host-item-number',
    'itu' => 'index-term-uncontrolled',
    'itg' => 'index-term-genre',
);

# Regex fragments describing a field name and an optional chain of
# ".subfield" suffixes; interpolated into patterns elsewhere in this file.
my $field_name_pattern = '[\w\-]+';
my $multi_field_pattern = "(?:\\.$field_name_pattern)*";

# Convert zebra-style index specifications ("field" or "field,type") into
# { field => ..., type => ... } hashrefs.
#
# Returns one element per input, in order: a hashref, or undef when the index
# is undefined, empty, or converts to no specific field. The undef
# placeholders keep the result positionally aligned with the caller's list.
sub _convert_index_fields {
    my ( $self, @indexes ) = @_;

    my %index_type_convert =
      ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );

    # Convert according to our table, drop anything that doesn't convert.
    # If a field starts with mc- we save it as it's used (and removed) later
    # when joining things, to indicate we make it an 'OR' join.
    my @converted;
    for my $index (@indexes) {

        # Lower case all field names
        my ( $f, $t ) = map { lc($_) } split( /,/, $index // '' );

        # Nothing usable before the comma: keep the slot, skip the lookups
        # (which would otherwise warn about an undefined value).
        if ( !defined $f || $f eq '' ) {
            push @converted, undef;
            next;
        }

        my $mc = ( $f =~ s/^mc-// ) ? 'mc-' : '';
        my $field = exists $index_field_convert{$f} ? $index_field_convert{$f} : $f;
        my $type  = $index_type_convert{ $t // '__default' };

        push @converted, $field ? { field => $mc . $field, type => $type } : undef;
    }
    return @converted;
}
732 =head2 _convert_index_strings
734 my @searches = $self->_convert_index_strings(@searches);
736 Similar to L<_convert_index_fields>, this takes strings of the form
737 B<field:search term> and rewrites the field from zebra-style to
738 elasticsearch-style. Anything it doesn't understand is returned verbatim.
740 =cut
# Rewrite "field:term" strings so the field uses its Elasticsearch name and
# the term is adjusted for the index type. Empty strings are dropped; strings
# without a recognisable field prefix, or whose field does not convert, are
# returned unchanged.
sub _convert_index_strings {
    my ( $self, @searches ) = @_;

    my @converted;
    for my $search (@searches) {
        next if $search eq '';

        my ( $field, $term ) = $search =~ /^\s*([\w,-]*?):(.*)/;

        # Only attempt a conversion when the string really had a prefix
        my ($conv) =
          ( defined($field) && defined($term) )
          ? $self->_convert_index_fields($field)
          : ();

        if ( !defined($conv) ) {
            push @converted, $search;
            next;
        }

        my $prefix = $conv->{field} ? $conv->{field} . ':' : '';
        push @converted,
          $prefix . $self->_modify_string_by_type( %$conv, operand => $term );
    }
    return @converted;
}
763 =head2 _convert_index_strings_freeform
765 my $search = $self->_convert_index_strings_freeform($search);
767 This is similar to L<_convert_index_strings>, however it'll search out the
768 things to change within the string. So it can handle strings such as
769 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
771 If there is something of the form "su,complete-subfield" or something, the
772 second part is stripped off as we can't yet handle that. Making it work
773 will have to wait for a real query parser.
775 =cut
# Rewrite every "field:" prefix found anywhere in a free-form query string:
# the field name is lower-cased, a ",qualifier" suffix is stripped, and known
# aliases are replaced using %index_field_convert.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    # @TODO: Currently this will also alter fields contained within quotes:
    # `searching for "stuff cn:123"` for example will become
    # `searching for "stuff local-number:123"`
    #
    # Fixing this is tricky, one possibility:
    # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
    # Still not perfect, and will not handle escaped quotes within quotes and
    # assumes balanced quotes.
    #
    # Another, not so elegant, solution could be to replace all quoted content
    # with placeholders, and put them back when processing is done.

    # Pass 1: lower case field names, dropping any ",qualifier"
    $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;

    # Pass 2: resolve possible field aliases
    $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;

    return $search;
}
797 =head2 _modify_string_by_type
799 my $str = $self->_modify_string_by_type(%index_field);
801 If you have a search term (operand) and a type (phrase, right-truncated), this
802 will convert the string to have the function in lucene search terms, e.g.
803 wrapping quotes around it.
805 =cut
# Adjust a search operand according to its index type:
#   right-truncate  - append '*'
#   phrase          - wrap in double quotes unless already quoted
#   st-year         - turn "from-until" into "[from TO until]", with '*' for
#                     a missing bound
# A false operand (undef, '', '0') is returned untouched.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $operand = $idx{operand};
    return $operand unless $operand;    # Empty or undef, we can't use it.

    my $kind = $idx{type} || '';

    if ( $kind eq 'right-truncate' ) {
        $operand .= '*';
    }
    elsif ( $kind eq 'phrase' ) {
        $operand = '"' . $operand . '"' if $operand !~ /^".*"$/;
    }
    elsif ( $kind eq 'st-year' ) {
        if ( $operand =~ /^(.*)-(.*)$/ ) {
            my ( $from, $until ) = ( $1 || '*', $2 || '*' );
            $operand = "[$from TO $until]";
        }
    }
    return $operand;
}
826 =head2 _join_queries
828 my $query_str = $self->_join_queries(@query_parts);
830 This takes a list of query parts, that might be search terms on their own, or
831 booleaned together, or specifying fields, or whatever, wraps them in
832 parentheses, and ANDs them all together. Suitable for feeding to the ES
833 query string query.
835 Note: doesn't AND them together if they specify an index that starts with "mc"
836 as that was a special case in the original code for dealing with multiple
837 choice options (you can't search for something that has an itype of A
838 and an itype of B otherwise.)
840 =cut
# Combine query parts into one query string. Undefined and empty parts are
# ignored. Parts starting with "mc-" have the prefix removed and are ORed
# together as one group; all other parts are ANDed. A single surviving part
# is returned as-is (without parentheses); no parts yields an empty list.
sub _join_queries {
    my ( $self, @parts ) = @_;

    my ( @plain, @multi_choice );
    for my $part (@parts) {
        next if !defined($part) || $part eq '';
        if ( $part =~ /^mc-/ ) {
            push @multi_choice, $part =~ s/^mc-//r;
        }
        else {
            push @plain, $part;
        }
    }

    my @usable = ( @plain, @multi_choice );
    return () unless @usable;
    return $usable[0] if @usable == 1;

    # Each clause is only added when it has content, so the final join never
    # sees an empty element.
    my @clauses;
    push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
    push @clauses, '(' . ( join ' OR ', map { "($_)" } @multi_choice ) . ')'
      if @multi_choice;

    return join( ' AND ', @clauses );
}
862 =head2 _make_phrases
864 my @phrased_queries = $self->_make_phrases(@query_parts);
866 This takes the supplied queries and forces them to be phrases by wrapping
867 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
868 the quotes outside of them if they're there.
870 =cut
# Quote the term of each "field:term" part, giving field:"term". Parts that
# do not start with a "word:" prefix are returned unchanged.
sub _make_phrases {
    my ( $self, @parts ) = @_;

    my @phrased;
    for my $part (@parts) {
        push @phrased, $part =~ s/^\s*(\w*?:)(.*)$/$1"$2"/r;
    }
    return @phrased;
}
877 =head2 _create_query_string
879 my @query_strings = $self->_create_query_string(@queries);
881 Given a list of hashrefs, it will turn them into a lucene-style query string.
882 The hash should contain field, type (both for the indexes), operator, and
883 operand.
885 =cut
# Render each query hashref (field, type, operator, operand) as a
# lucene-style fragment of the form "OPERATOR (field:operand)". A
# multi-word operand on a named field is wrapped in an extra pair of
# parentheses, except for st-year ranges.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @fragments;
    for my $q (@queries) {
        my $operator = $q->{operator} ? $q->{operator} . ' ' : '';
        my $field    = $q->{field}    ? $q->{field} . ':'    : '';

        my $operand = $self->_modify_string_by_type(%$q);
        my $is_year = defined $q->{type} && $q->{type} eq 'st-year';
        if ( $field && scalar( split( /\s+/, $operand ) ) > 1 && !$is_year ) {
            $operand = "($operand)";
        }
        push @fragments, "$operator($field$operand)";
    }
    return @fragments;
}
900 =head2 _clean_search_term
902 my $term = $self->_clean_search_term($term);
904 This cleans a search term by removing any funny characters that may upset
905 ES and give us an error. It also calls L<_convert_index_strings_freeform>
906 to ensure those parts are correct.
908 =cut
# Normalise a user-supplied search term before it is placed in a query
# string: '=' becomes ':', field aliases are resolved, braces become double
# quotes, unbalanced quotes are blanked out, stray colons are removed and the
# regex-escape preference is applied.
sub _clean_search_term {
    my ( $self, $term ) = @_;

    # Lookahead that only succeeds when an even number of double quotes
    # remains, i.e. when the match position is outside a quoted section
    my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';

    # Some hardcoded searches (like with authorities) produce things like
    # 'an=123', when it ought to be 'an:123' for our purposes.
    $term =~ s/=/:/g;

    $term = $self->_convert_index_strings_freeform($term);
    $term =~ s/[{}]/"/g;

    # Remove unbalanced quotes: with an odd number of double quotes, replace
    # every one of them with a space
    my $quote_count = ( $term =~ tr/"// );
    $term =~ tr/"/ / if $quote_count % 2 == 1;

    # Remove unquoted colons that have whitespace on either side of them
    $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;

    return $self->_query_regex_escape_process($term);
}
938 =head2 _query_regex_escape_process
940 my $query = $self->_query_regex_escape_process($query);
942 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
944 =cut
# Apply the "QueryRegexEscapeOptions" system preference to a query string.
#
#   'escape'            - escape unescaped slashes (/) outside quotes
#   'unescape_escaped'  - unescape escaped slashes (\/) and escape unescaped
#                         ones, again leaving quoted content alone
#   anything else (including 'dont_escape' or an unset preference)
#                       - return the query unchanged
sub _query_regex_escape_process {
    my ( $self, $query ) = @_;

    # Default to '' so an unset preference does not trigger warnings below.
    my $regex_escape_options =
      C4::Context->preference("QueryRegexEscapeOptions") // '';

    if ( $regex_escape_options eq 'escape' ) {
        # Will escape unescaped slashes (/) while preserving
        # unescaped slashes within quotes
        # @TODO: assumes quotes are always balanced and will
        # not handle escaped quotes properly, should perhaps be
        # replaced with a more general parser solution
        # so that this function is only ever provided with unquoted
        # query parts
        $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
    }
    elsif ( $regex_escape_options eq 'unescape_escaped' ) {
        # Will unescape escaped slashes (\/) and escape
        # unescaped slashes (/) while preserving slashes within quotes
        # The same limitations as above apply for handling of quotes
        $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
    }
    return $query;
}
970 =head2 _fix_limit_special_cases
972 my $limits = $self->_fix_limit_special_cases($limits);
974 This converts any special cases that the limit specifications have into things
975 that are more readily processable by the rest of the code.
977 The argument should be an arrayref, and it'll return an arrayref.
979 =cut
# Translate the special limit forms into plain "field:value" limits:
#   "yr,st-numeric,ge=A and yr,st-numeric,le=B" => "copydate:[A TO B]"
#   "yr,st-numeric=X"                            => "copydate:" . st-year form of X
#   "available"                                  => "onloan:false"
# A limit starting with "yr,st-numeric,ge=" that lacks the matching "le" part
# is dropped. Everything else passes through unchanged.
#
# Takes an arrayref and returns a new arrayref.
sub _fix_limit_special_cases {
    my ( $self, $limits ) = @_;

    my @fixed;
    foreach my $limit (@$limits) {

        # This is set up by opac-search.pl
        if ( $limit =~ /^yr,st-numeric,ge=/ ) {
            my ( $start, $end ) =
              ( $limit =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
            push @fixed, "copydate:[$start TO $end]"
              if defined($start) && defined($end);
        }
        elsif ( $limit =~ /^yr,st-numeric=/ ) {
            my ($date) = ( $limit =~ /^yr,st-numeric=(.*)$/ );
            if ( defined($date) ) {
                $date = $self->_modify_string_by_type( type => 'st-year', operand => $date );
                push @fixed, "copydate:$date";
            }
        }
        elsif ( $limit =~ /^available$/ ) {
            push @fixed, 'onloan:false';
        }
        else {
            push @fixed, $limit;
        }
    }
    return \@fixed;
}
1010 =head2 _sort_field
1012 my $field = $self->_sort_field($field);
1014 Given a field name, this works out what the actual name of the field to sort
1015 on should be. A '__sort' suffix is added for fields with a sort version, and
1016 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
1017 to avoid sorting on a tokenized value.
1019 =cut
# Work out the field name to sort on. '__sort' is appended unless
# sort_fields() holds a defined, false entry for the field; in that case
# '.raw' is appended when the mappings give the field the type 'text', and
# the name is otherwise returned unchanged.
sub _sort_field {
    my ( $self, $f ) = @_;

    my $mappings = $self->get_elasticsearch_mappings();
    my $type     = $mappings->{data}{properties}{$f}{type};
    my $is_text  = defined $type && $type eq 'text';

    my $sortable = $self->sort_fields()->{$f};
    return $f . '__sort' if !defined $sortable || $sortable;

    # We need to add '.raw' to text fields without a sort field,
    # otherwise it'll sort based on the tokenised form.
    return $is_text ? $f . '.raw' : $f;
}
1036 =head2 _truncate_terms
1038 my $query = $self->_truncate_terms($query);
1040 Given a string query this function appends a '*' wildcard to all terms except
1041 boolean operators (and, or, not), double quoted strings and terms that end in a non-word character.
1043 =cut
sub _truncate_terms {
    my ( $self, $query ) = @_;

    # Boolean operators are never truncated (compared case-insensitively).
    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Skip empty or whitespace-only tokens.
        next if $token =~ /^\s*$/;

        # Only tokens that end in a word character and are not operators
        # get the '*' wildcard; everything else is kept as it is.
        if ( $token =~ /\W$/ || $is_operator{ lc $token } ) {
            push @terms, $token;
        }
        else {
            push @terms, "$token*";
        }
    }

    return join ' ', @terms;
}

=head2 _split_query

    my @tokens = $self->_split_query($query_str);

Given a string query this function splits it into tokens taking into account
any field prefixes and quoted strings.

=cut
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # The separator pattern has a capturing group, so split() returns the
    # separators (quoted phrases with an optional field prefix, and runs of
    # whitespace) along with the text between them. For example
    # '"donald duck" title:"the mouse" and peter' gets split into
    # ('', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter')
    # Pieces that are empty or contain only whitespace are then discarded.
    return grep { /\S/ } split $tokenize_split_re, $query;
}

=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either OPAC or Staff client, with
possible weights and subfield appended to each field name depending on the
options provided.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the searchable
fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set
field weights will be applied to the returned fields. C<subfield> can be used to
provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".

=back

=cut
sub _search_fields {
    my ($self, $params) = @_;

    # These defaults only apply when no options hashref is passed at all.
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };

    my $cache = Koha::Caches->get_instance();

    # NOTE(review): the cache key only distinguishes OPAC from staff client,
    # while the query below also filters on $self->index and the marcflavour
    # preference. Builders for different indexes therefore appear to share
    # one cache entry. Confirm, and if the key is extended make sure whatever
    # invalidates these cache entries is updated to match.
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client');
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });

    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );

        # Each entry is [name] or [name, weight]; a false weight is left out
        # so that joining with '^' later yields just the bare field name.
        my @search_fields;
        while (my $search_field = $result->next) {
            push @search_fields, [
                $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }

    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                # The weight is carried over only when the entry has one;
                # copying an absent weight as undef would make the join
                # below produce "field.subfield^" for unweighted fields.
                my ($field, @weight) = @{$_};
                ["${field}.${subfield}", @weight];
            } @{$search_fields}
        ];
    }

    if ($params->{weighted_fields}) {
        # "name^weight" for weighted fields, plain "name" otherwise.
        return [map { join('^', @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}