3 # This inserts records from a Koha database into Elasticsearch
5 # Copyright 2014 Catalyst IT
7 # This file is part of Koha.
9 # Koha is free software; you can redistribute it and/or modify it
10 # under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 3 of the License, or
12 # (at your option) any later version.
14 # Koha is distributed in the hope that it will be useful, but
15 # WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with Koha; if not, see <http://www.gnu.org/licenses>.
24 rebuild_elasticsearch.pl - inserts records from a Koha database into Elasticsearch
28 B<rebuild_elasticsearch.pl>
29 [B<-c|--commit>=C<count>]
43 Inserts records from a Koha database into Elasticsearch.
49 =item B<-c|--commit>=C<count>
51 Specify how many records will be batched up before they're added to Elasticsearch.
52 Higher should be faster, but will cause more RAM usage. Default is 5000.
56 Delete the index and recreate it before indexing.
60 Reload mappings from files (specified in koha-conf.xml) before indexing.
63 =item B<-a|--authorities>
65 Index the authorities only. Combining this with B<-b> is the same as
66 specifying neither and so both get indexed.
70 Index the biblios only. Combining this with B<-a> is the same as
71 specifying neither and so both get indexed.
73 =item B<-bn|--bnumber>
75 Only index the supplied biblionumber, mostly for testing purposes. May be given more than once.
80 Only index the supplied authority id, mostly for testing purposes. May be given more than once.
83 =item B<-p|--processes>
85 Number of processes to use for indexing. This can be used to do more indexing
86 work in parallel on multicore systems. By default, a single process is used.
90 By default, this program only emits warnings and errors. This makes it talk
91 more. Add more to make it even more wordy, in particular when debugging.
103 =head1 IMPLEMENTATION
111 use Koha
::MetadataRecord
::Authority
;
112 use Koha
::BiblioUtils
;
113 use Koha
::SearchEngine
::Elasticsearch
;
114 use Koha
::SearchEngine
::Elasticsearch
::Indexer
;
# Option state for the script. $commit and $verbose are bound below but are
# declared on lines not visible in this excerpt.
122 my ($delete, $reset, $help, $man, $processes);
123 my ($index_biblios, $index_authorities);
# Explicit record ids to index, filled by the bn and ai options below.
124 my (@biblionumbers,@authids);
126 $|=1; # flushes output
# Option spec to variable bindings. The call that receives these pairs is not
# visible here (presumably Getopt::Long GetOptions; TODO confirm).
# The =i options take an integer value, the + on verbose makes it a counter,
# and the two options bound to arrays collect every value supplied.
129 'c|commit=i' => \
$commit,
130 'd|delete' => \
$delete,
131 'r|reset' => \
$reset,
132 'a|authorities' => \
$index_authorities,
133 'b|biblios' => \
$index_biblios,
134 'bn|bnumber=i' => \
@biblionumbers,
135 'ai|authid=i' => \
@authids,
136 'p|processes=i' => \
$processes,
137 'v|verbose+' => \
$verbose,
142 # Default is to do both
143 unless ($index_authorities || $index_biblios) {
144 $index_authorities = $index_biblios = 1;
# Slicing across processes and an explicit id list are mutually exclusive.
# NOTE(review): this check runs before the help and man checks below, so an
# invalid combination dies without printing any usage text.
147 if ($processes && ( @biblionumbers || @authids) ) {
148 die "Argument p|processes cannot be combined with bn|bnumber or ai|authid";
# pod2usage(1) prints the usage text and exits with status 1; the second form
# prints the full manual (verbose level 2) and exits with status 0.
151 pod2usage
(1) if $help;
152 pod2usage
( -exitstatus
=> 0, -verbose
=> 2 ) if $man;
# Reset the Elasticsearch mappings. The condition guarding this call is not
# visible in this excerpt (presumably the reset option; TODO confirm).
157 Koha
::SearchEngine
::Elasticsearch
->reset_elasticsearch_mappings;
# Check the state of each selected index before any records are indexed.
# $delete is passed as the recreate argument of _verify_index_state.
161 _verify_index_state
($Koha::SearchEngine
::Elasticsearch
::BIBLIOS_INDEX
, $delete) if ($index_biblios);
162 _verify_index_state
($Koha::SearchEngine
::Elasticsearch
::AUTHORITIES_INDEX
, $delete) if ($index_authorities);
# One slice per worker process; $processes defaults to 1 when not supplied.
165 my $slice_count = ( $processes //= 1 );
# Extra arguments for the record iterators; only populated when slicing.
166 my %iterator_options;
168 if ($slice_count > 1) {
169 # Fire up child processes for processing slices from 2 on. This main process will handle slice 1.
171 for (my $proc = 1; $proc < $slice_count; $proc++) {
# $pid is assigned on a line not visible here (presumably from fork; TODO confirm).
173 die "Failed to fork a child process\n" unless defined $pid;
175 # Child process, give it a slice to process
176 $slice_index = $proc;
180 # Fudge the commit count a bit to spread out the Elasticsearch commits
# NOTE(review): this multiplier can leave $commit fractional (for example a
# commit count of 5 in slice 1 becomes 5.5). _do_reindex counts down with
# !( --$commit_count ), which is only true on an exact zero, so a fractional
# value would skip every intermediate commit and buffer all records until the
# final commit. Consider truncating the result with int().
181 $commit *= 1 + 0.10 * $slice_index;
# Slice numbers are reported 1-based; $slice_index itself is 0-based.
182 _log
(1, "Processing slice @{[$slice_index + 1]} of $slice_count\n");
# Tell the iterators which slice of the record set this process handles.
183 $iterator_options{slice
} = { index => $slice_index, count
=> $slice_count };
# Index bibliographic records, either an explicit id list or every biblio.
187 if ($index_biblios) {
188 _log
(1, "Indexing biblios\n");
189 if (@biblionumbers) {
# Explicit ids: consume one biblionumber per call until the list is empty.
# The enclosing callback definition is not visible in this excerpt
# (presumably the $next closure passed to _do_reindex; TODO confirm).
# _do_reindex invokes its callback in scalar context, so of the two-element
# list returned below only the last element (the record) reaches it, and the
# empty list ends its loop.
191 my $r = shift @biblionumbers;
192 return () unless defined $r;
193 return ($r, Koha
::BiblioUtils
->get_from_biblionumber($r, item_data
=> 1 ));
# No explicit ids: iterate over all biblios. %iterator_options carries the
# slice description when more than one process is used.
196 my $records = Koha
::BiblioUtils
->get_all_biblios_iterator(%iterator_options);
201 _do_reindex
($next, $Koha::SearchEngine
::Elasticsearch
::BIBLIOS_INDEX
);
# Index authority records, either an explicit id list or every authority.
203 if ($index_authorities) {
204 _log
(1, "Indexing authorities\n");
# Explicit ids: consume one authid per call until the list is empty.
# The enclosing condition and callback definition are not visible in this
# excerpt (presumably mirroring the biblio branch; TODO confirm).
207 my $r = shift @authids;
208 return () unless defined $r;
# NOTE(review): a lexical named $a masks the package variable used by sort
# comparators; a more descriptive name would avoid that trap.
209 my $a = Koha
::MetadataRecord
::Authority
->get_from_authid($r);
# No explicit ids: iterate over all authorities. %iterator_options carries the
# slice description when more than one process is used.
213 my $records = Koha
::MetadataRecord
::Authority
->get_all_authorities_iterator(%iterator_options);
218 _do_reindex
($next, $Koha::SearchEngine
::Elasticsearch
::AUTHORITIES_INDEX
);
# Only the process that kept slice index 0 reaps the workers.
221 if ($slice_index == 0) {
222 # Main process, wait for children
# One iteration per forked child, that is $processes - 1 of them.
# The loop body is not visible in this excerpt.
223 for (my $proc = 1; $proc < $processes; $proc++) {
228 =head1 INTERNAL METHODS
230 =head2 _verify_index_state
232 _verify_index_state($Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX, 1);
234 Checks the index state and recreates it if requested.
238 sub _verify_index_state
{
# Arguments: the index name and a flag requesting that the index be recreated.
# Builds an Indexer for that index and then takes one of four actions:
# drop and recreate, create when missing, update mappings when the status is
# ok, or warn when the status says a recreate is required.
239 my ( $index_name, $recreate ) = @_;
241 _log
(1, "Checking state of $index_name index\n");
242 my $indexer = Koha
::SearchEngine
::Elasticsearch
::Indexer
->new( { index => $index_name } );
# Drop and recreate branch. The condition that opens it is not visible in
# this excerpt (presumably a test of $recreate; TODO confirm).
245 _log
(1, "Dropping and recreating $index_name index\n");
246 $indexer->drop_index() if $indexer->index_exists();
247 $indexer->create_index();
249 elsif (!$indexer->index_exists) {
250 # Create index if it does not exist
251 $indexer->create_index();
252 } elsif ($indexer->is_index_status_ok) {
253 # Update mapping unless index is in some kind of problematic state
254 $indexer->update_mappings();
255 } elsif ($indexer->is_index_status_recreate_required) {
# Only a warning is issued here; the index is left untouched.
256 warn qq/Index "$index_name" has status "recreate required", suggesting it should be recreated/;
262 _do_reindex($callback, $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX);
264 Does the actual reindexing. $callback is a function that always returns the next record.
265 For each index we iterate through the records, committing each time the specified count of records has been buffered.
# Arguments: a callback that yields the next record (a false value ends the
# loop) and the name of the index to write to.
# Records are buffered and sent to the indexer in batches of $commit, followed
# by one last batch for whatever remains in the buffers.
270 my ( $next, $index_name ) = @_;
272 my $indexer = Koha
::SearchEngine
::Elasticsearch
::Indexer
->new( { index => $index_name } );
# Countdown to the next batch commit. $count, used for progress messages, is
# declared and incremented on lines not visible in this excerpt.
275 my $commit_count = $commit;
276 my ( @id_buffer, @commit_buffer );
277 while ( my $record = $next->() ) {
# Use the id method, falling back to authid when it returns undef.
278 my $id = $record->id // $record->authid;
# This inner $record shadows the loop variable for the rest of the body.
279 my $record = $record->record;
# At verbosity 1 progress is reported once per 1000 records.
281 if ( $verbose == 1 ) {
282 _log
( 1, "$count records processed\n" ) if ( $count % 1000 == 0);
287 push @id_buffer, $id;
288 push @commit_buffer, $record;
# NOTE(review): this fires only when the countdown reaches exactly zero, so a
# non-integer $commit would never trigger a batch commit.
289 if ( !( --$commit_count ) ) {
290 _log
( 1, "Committing $commit records...\n" );
291 my $response = $indexer->update_index( \
@id_buffer, \
@commit_buffer );
292 _handle_response
($response);
# Restart the countdown. Clearing of the two buffers is not visible in this
# excerpt (presumably done on the lines that follow; TODO confirm).
293 $commit_count = $commit;
296 _log
( 1, "Commit complete\n" );
300 # There are probably uncommitted records
# NOTE(review): as far as is visible here this final update_index call is
# unconditional, so it may receive empty buffers; confirm update_index
# tolerates that.
301 _log
( 1, "Committing final records...\n" );
302 my $response = $indexer->update_index( \
@id_buffer, \
@commit_buffer );
303 _handle_response
($response);
304 _log
( 1, "Total $count records indexed\n" );
311 Checks some basic stuff to ensure that it's sane before we start.
# Pre-flight check: abort when the Koha configuration has no elasticsearch
# entry. The sub header for these lines is not visible in this excerpt.
316 # Do we have an elasticsearch block defined?
317 my $conf = C4
::Context
->config('elasticsearch');
# The trailing newline in the message suppresses the file and line suffix
# that die would otherwise append.
318 die "No 'elasticsearch' block is defined in koha-conf.xml.\n" if ( !$conf );
321 =head2 _handle_response
323 Parse the return from update_index and display errors depending on verbosity of the script
327 sub _handle_response
{
# Reports errors found in the value returned by update_index.
# $response is unpacked on a line not visible in this excerpt.
# NOTE(review): this is a string comparison against the word true. If
# update_index returns a decoded JSON boolean, it may stringify as 1 or 0 and
# never match; confirm what update_index actually returns.
329 if( $response->{errors
} eq 'true' ){
330 _log
( 1, "There were errors during indexing\n" );
# Per-item details. A verbosity guard may sit on a line not visible here.
332 foreach my $item (@
{$response->{items
}}){
# Items without an index error are skipped.
333 next unless defined $item->{index}->{error
};
# Output format: record id, reason (type) : caused_by type (caused_by reason).
# This uses print directly, so the pid prefix added by _log is absent.
# NOTE(review): caused_by is read without a defined check; an error entry
# lacking it would print empty fields (and warn if warnings are enabled).
334 print "Record #" . $item->{index}->{_id
} . " " .
335 $item->{index}->{error
}->{reason
} . " (" . $item->{index}->{error
}->{type
} . ") : " .
336 $item->{index}->{error
}->{caused_by
}->{type
} . " (" . $item->{index}->{error
}->{caused_by
}->{reason
} . ")\n";
344 _log($level, "Message\n");
346 Output progress information.
348 Will output the message if verbosity level is set to $level or more. Will not
349 include a trailing newline automatically.
# Arguments: the minimum verbosity level for the message, and the message.
# The sub header for these lines is not visible in this excerpt.
354 my ($level, $msg) = @_;
# The message is prefixed with the current process id in square brackets and
# printed as given; no newline is appended.
356 print "[$$] $msg" if ($verbose >= $level);