Koha/ElasticSearch.pm

   1 package Koha::ElasticSearch;
   2
   3 # Copyright 2015 Catalyst IT
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 3 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use base qw(Class::Accessor);
  21
  22 use C4::Context;
  23
  24 use Koha::Database;
  25
  26 use Carp;
  27 use JSON;
  28 use Modern::Perl;
  29 use Readonly;
  30
  31 use Data::Dumper;    # TODO remove
  32
  # Accessors generated through Class::Accessor: 'index' can only be read,
  # 'sort_fields' can be read and written.
  # NOTE(review): this module also defines its own sort_fields sub, which calls
  # _sort_fields_accessor -- presumably the alias Class::Accessor creates for
  # the generated accessor; confirm against the Class::Accessor documentation.
  __PACKAGE__->mk_ro_accessors('index');
  __PACKAGE__->mk_accessors('sort_fields');

  # Read-only package variables naming the standard indexes
  Readonly our $BIBLIOS_INDEX     => 'biblios';
  Readonly our $AUTHORITIES_INDEX => 'authorities';
  39
  40 =head1 NAME
  41
  42 Koha::ElasticSearch - Base module for things using elasticsearch
  43
  44 =head1 ACCESSORS
  45
  46 =over 4
  47
  48 =item index
  49
  50 The name of the index to use, generally 'biblios' or 'authorities'.
  51
  52 =back
  53
  54 =head1 FUNCTIONS
  55
  56 =cut
  57
  sub new {
      my ( $class, @args ) = @_;

      # Object creation itself is left to the parent class constructor.
      my $self = $class->SUPER::new(@args);

      # An object without an index name cannot be used for anything, so
      # refuse to build one.
      $self->index or croak('No index name provided');

      return $self;
  }
  65
  66 =head2 get_elasticsearch_params
  67
  68     my $params = $self->get_elasticsearch_params();
  69
  70 This provides a hashref that contains the parameters for connecting to the
  71 Elasticsearch servers, in the form:
  72
  73     {
  74         'nodes' => ['127.0.0.1:9200', 'anotherserver:9200'],
  75         'index_name' => 'koha_instance_index',
  76     }
  77
  78 This is configured by the following in the C<config> block in koha-conf.xml:
  79
  80     <elasticsearch>
  81         <server>127.0.0.1:9200</server>
  82         <server>anotherserver:9200</server>
  83         <index_name>koha_instance</index_name>
  84     </elasticsearch>
  85
  86 =cut
  87
  # Builds the connection parameters for this object's index from the
  # 'elasticsearch' block of the Koha configuration.
  #
  # Returns a fresh hashref holding every key of that block except 'server',
  # plus 'nodes' (an arrayref listing the configured servers), and with
  # '_' . $self->index appended to 'index_name'.
  #
  # Dies if the block is absent, if no server is configured, or if no
  # index_name is configured.
  sub get_elasticsearch_params {
      my ($self) = @_;

      my $conf = C4::Context->config('elasticsearch');
      die "No 'elasticsearch' block is defined in koha-conf.xml.\n" if ( !$conf );

      # Work on a shallow copy so that we're not modifying the original
      my $es = { %{$conf} };

      # Helpfully, the multiple server lines end up in an array for us anyway
      # if there are multiple ones, but not if there's only one.
      my $server = delete $es->{server};
      my @nodes;
      if ( ref($server) eq 'ARRAY' ) {

          # Copy the elements rather than re-using the array reference: the
          # shallow copy above still shares nested structures with the
          # configuration hash, so handing out the same array would let a
          # caller change the configured server list by editing 'nodes'.
          @nodes = @{$server};
      }
      elsif ($server) {
          @nodes = ($server);
      }

      # An empty list is no more usable than a missing entry
      die "No elasticsearch servers were specified in koha-conf.xml.\n"
        if ( !@nodes );

      # store it called 'nodes' (which is used by newer Search::Elasticsearch)
      $es->{nodes} = \@nodes;

      die "No elasticserver index_name was specified in koha-conf.xml.\n"
        if ( !$es->{index_name} );

      # Append the name of this particular index to our namespace
      $es->{index_name} .= '_' . $self->index;
      return $es;
  }
 117
 118 =head2 get_elasticsearch_settings
 119
 120     my $settings = $self->get_elasticsearch_settings();
 121
 122 This provides the settings provided to elasticsearch when an index is created.
 123 These can do things like define tokenisation methods.
 124
 125 A hashref containing the settings is returned.
 126
 127 =cut
 128
 sub get_elasticsearch_settings {
     my ($self) = @_;

     # Ultimately this should come from a file or something, and not be
     # hardcoded.

     # The two analysers differ only in their tokenizer; both lowercase
     # whatever the tokenizer produces.
     my %tokenizer_for = (
         analyser_phrase   => 'keyword',
         analyser_standard => 'standard',
     );

     my %analyzer;
     for my $analyser_name ( keys %tokenizer_for ) {
         $analyzer{$analyser_name} = {
             tokenizer => $tokenizer_for{$analyser_name},
             filter    => ['lowercase'],
         };
     }

     return { index => { analysis => { analyzer => \%analyzer } } };
 }
 152
 153 =head2 get_elasticsearch_mappings
 154
 155     my $mappings = $self->get_elasticsearch_mappings();
 156
 157 This provides the mappings that get passed to elasticsearch when an index is
 158 created.
 159
 160 =cut
 161
 sub get_elasticsearch_mappings {
     my ($self) = @_;

     # TODO cache in the object?

     # Field definitions keyed by field name. These two are always present;
     # the rest are added per configured mapping below.
     my %properties = (
         record => {
             store          => "yes",
             include_in_all => JSON::false,
             type           => "string",
         },
         '_all.phrase' => {
             search_analyzer => "analyser_phrase",
             index_analyzer  => "analyser_phrase",
             type            => "string",
         },
     );

     # Names of the fields that were given a '__sort' companion field
     my %sortable;

     my $flavour = lc C4::Context->preference('marcflavour');

     # Called once per mapping row; rows for another MARC flavour are skipped.
     my $add_mapping = sub {
         my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
         return if $marc_type ne $flavour;

         # TODO if this gets any sort of complexity to it, it should
         # be broken out into its own function.

         # TODO be aware of date formats, but this requires pre-parsing
         # as ES will simply reject anything with an invalid date.
         my $es_type = 'string';
         if ( $type eq 'boolean' ) {
             $es_type = 'boolean';
         }

         # The field itself, with a phrase-analysed and an unanalysed variant
         my %field = (
             search_analyzer => "analyser_standard",
             index_analyzer  => "analyser_standard",
             type            => $es_type,
             fields          => {
                 phrase => {
                     search_analyzer => "analyser_phrase",
                     index_analyzer  => "analyser_phrase",
                     type            => "string",
                     copy_to         => "_all.phrase",
                 },
                 raw => {
                     "type"  => "string",
                     "index" => "not_analyzed",
                 }
             },
         );
         $field{null_value} = 0 if $type eq 'boolean';
         $properties{$name} = \%field;

         if ($facet) {
             $properties{ $name . '__facet' } = {
                 type  => "string",
                 index => "not_analyzed",
             };
         }

         if ($suggestible) {
             $properties{ $name . '__suggestion' } = {
                 type            => 'completion',
                 index_analyzer  => 'simple',
                 search_analyzer => 'simple',
             };
         }

         # Sort may be true, false, or undef. Only undef means that no sort
         # field is wanted here.
         return unless defined $sort;

         $properties{ $name . '__sort' } = {
             search_analyzer => "analyser_phrase",
             index_analyzer  => "analyser_phrase",
             type            => "string",
             include_in_all  => JSON::false,
             fields          => {
                 phrase => {
                     search_analyzer => "analyser_phrase",
                     index_analyzer  => "analyser_phrase",
                     type            => "string",
                 },
             },
         };
         $sortable{$name} = 1;
         return;
     };

     $self->_foreach_mapping($add_mapping);

     # Remember which fields are sortable so sort_fields() can answer later
     $self->sort_fields( \%sortable );

     return { data => { properties => \%properties } };
 }
 252
 # Replacement for the plain Class::Accessor accessor: reading sort_fields
 # before it has been set triggers its generation.
 # With an argument the value is stored and nothing is returned.
 sub sort_fields {
     my ( $self, @new_value ) = @_;

     if (@new_value) {
         $self->_sort_fields_accessor(@new_value);
         return;
     }

     my $stored = $self->_sort_fields_accessor();
     unless ($stored) {

         # Building the mappings stores the sort fields as a side effect
         $self->get_elasticsearch_mappings();
         return $self->_sort_fields_accessor();
     }
     return $stored;
 }
 269
 # Returns an arrayref of rule strings, built from the configured mappings,
 # that describe how record data is converted into index fields.
 sub get_fixer_rules {
     my ($self) = @_;

     my $flavour = lc C4::Context->preference('marcflavour');
     my @rules;

     # Called once per mapping row; rows for another MARC flavour are skipped.
     my $collect_rules = sub {
         my ( $name, $type, $facet, $suggestible, $sort, $marc_type, $marc_field ) = @_;
         return if $marc_type ne $flavour;

         # There's a bug when using 'split' with something that
         # selects a range
         # The split makes everything into nested arrays, but that's not
         # really a big deal, ES doesn't mind.
         my $options = '';
         if ( $marc_field !~ m|_/| && $type ne 'sum' ) {
             $options = '-split => 1';
         }

         # Every marc_map rule reads the same MARC field with the same
         # options; only the target field differs.
         my $marc_map_to = sub {
             my ($target) = @_;
             return "marc_map('$marc_field','$target', $options)";
         };

         push @rules, $marc_map_to->($name);
         push @rules, $marc_map_to->("${name}__facet") if $facet;
         push @rules, $marc_map_to->("${name}__suggestion.input.\$append")
           if $suggestible;

         # boolean gets special handling, basically if it doesn't exist,
         # it's added and set to false. Otherwise we can't query it.
         push @rules, "unless exists('$name') add_field('$name', 0) end"
           if $type eq 'boolean';

         push @rules, "sum('$name')" if $type eq 'sum';

         # Sort is a bit special as it can be true, false, undef. For
         # fixer rules, we care about "true", or "undef" if there is
         # special handling of this field from other one. "undef" means
         # to do the default thing, which is make it sortable.
         if ( $self->sort_fields()->{$name} && ( $sort || !defined $sort ) ) {
             push @rules, $marc_map_to->("${name}__sort");
         }
     };

     $self->_foreach_mapping($collect_rules);
     return \@rules;
 }
 318
 319 =head2 _foreach_mapping
 320
 321     $self->_foreach_mapping(
 322         sub {
 323             my ( $name, $type, $facet, $suggestible, $sort, $marc_type,
 324                 $marc_field )
 325               = @_;
 326             return unless $marc_type eq 'marc21';
 327             print "Data comes from: " . $marc_field . "\n";
 328         }
 329     );
 330
 331 This allows you to apply a function to each entry in the elasticsearch mappings
 332 table, in order to build the mappings for whatever is needed.
 333
 334 In the provided function, the arguments are:
 335
 336 =over 4
 337
 338 =item C<$name>
 339
 340 The field name for elasticsearch (corresponds to the 'mapping' column in the
 341 database).
 342
 343 =item C<$type>
 344
 345 The type for this value, e.g. 'string'.
 346
 347 =item C<$facet>
 348
 349 True if this value should be facetised. This only really makes sense if the
 350 field is understood by the facet processing code anyway.
 351
 352 =item C<$sort>
 353
 354 True if this is a field that a) needs special sort handling, and b) if it
 355 should be sorted on. False if a) but not b). Undef if not a). This allows,
 356 for example, author to be sorted on but not everything marked with "author"
 357 to be included in that sort.
 358
 359 =item C<$marc_type>
 360
 361 A string that indicates the MARC type that this mapping is for, e.g. 'marc21',
 362 'unimarc', 'normarc'.
 363
 364 =item C<$marc_field>
 365
 366 A string that describes the MARC field that contains the data to extract.
 367 These are of a form suited to Catmandu's MARC fixers.
 368
 369 =back
 370
 371 =cut
 372
 sub _foreach_mapping {
     my ( $self, $sub ) = @_;

     # Columns taken from the joined tables, as [ qualified column, alias ].
     # The aliases are also the order in which the values reach the callback.
     my @joined_columns = (
         [ 'search_marc_to_fields.facet',       'facet' ],
         [ 'search_marc_to_fields.suggestible', 'suggestible' ],
         [ 'search_marc_to_fields.sort',        'sort' ],
         [ 'search_marc_map.marc_type',         'marc_type' ],
         [ 'search_marc_map.marc_field',        'marc_field' ],
     );
     my @aliases = map { $_->[1] } @joined_columns;

     # TODO use a caching framework here
     my $resultset     = Koha::Database->schema->resultset('SearchField');
     my $search_fields = $resultset->search(
         { 'search_marc_map.index_name' => $self->index },
         {
             join      => { search_marc_to_fields => 'search_marc_map' },
             '+select' => [ map { $_->[0] } @joined_columns ],
             '+as'     => [@aliases],
         }
     );

     # Hand each row to the callback: name, type, then the joined columns
     while ( my $row = $search_fields->next ) {
         $sub->(
             $row->name,
             $row->type,
             map { $row->get_column($_) } @aliases
         );
     }
 }
 411
 412 =head2 process_error
 413
 414     die $self->process_error($@);
 415
 416 This parses an Elasticsearch error message and produces a human-readable
 417 result from it. This result is probably missing all the useful information
 418 that you might want in diagnosing an issue, so the warning is also logged.
 419
 420 Note that currently the resulting message is not internationalised. This
 421 will happen eventually by some method or other.
 422
 423 =cut
 424
 sub process_error {
     my ( $self, $msg ) = @_;

     # The returned text drops the details, so keep the raw message around
     warn $msg;    # simple logging

     # This is super-primitive: the only thing recognised is a parse failure
     if ( $msg =~ /ParseException/ ) {
         return "Unable to understand your search query, please rephrase and try again.\n";
     }

     return "Unable to perform your search. Please try again.\n";
 }
 435
 436 1;
 437
 438 __END__
 439
 440 =head1 AUTHOR
 441
 442 =over 4
 443
 444 =item Chris Cormack C<< <chrisc@catalyst.net.nz> >>
 445
 446 =item Robin Sheat C<< <robin@catalyst.net.nz> >>
 447
 448 =back
 449
 450 =cut