Bug 23777: (follow-up) Use To.json in results.tt
[koha.git] / misc / search_tools / rebuild_elasticsearch.pl
blob9ac93d4860657a7a3972b27140c1b3e4e044fd37
1 #!/usr/bin/perl
3 # This inserts records from a Koha database into elastic search
5 # Copyright 2014 Catalyst IT
7 # This file is part of Koha.
9 # Koha is free software; you can redistribute it and/or modify it
10 # under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 3 of the License, or
12 # (at your option) any later version.
14 # Koha is distributed in the hope that it will be useful, but
15 # WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 =head1 NAME
24 rebuild_elasticsearch.pl - inserts records from a Koha database into Elasticsearch
26 =head1 SYNOPSIS
28 B<rebuild_elasticsearch.pl>
29 [B<-c|--commit>=C<count>]
30 [B<-v|--verbose>]
31 [B<-h|--help>]
32 [B<--man>]
34 =head1 DESCRIPTION
36 Inserts records from a Koha database into Elasticsearch.
38 =head1 OPTIONS
40 =over
42 =item B<-c|--commit>=C<count>
44 Specify how many records will be batched up before they're added to Elasticsearch.
45 Higher should be faster, but will cause more RAM usage. Default is 5000.
47 =item B<-d|--delete>
49 Delete the index and recreate it before indexing.
51 =item B<-a|--authorities>
53 Index the authorities only. Combining this with B<-b> is the same as
54 specifying neither and so both get indexed.
56 =item B<-b|--biblios>
58 Index the biblios only. Combining this with B<-a> is the same as
59 specifying neither and so both get indexed.
61 =item B<-bn|--bnumber>
63 Only index the supplied biblionumber, mostly for testing purposes. May be
64 repeated.
66 =item B<-ai|--authid>
68 Only index the supplied authority id, mostly for testing purposes. May be
69 repeated.
71 =item B<-p|--processes>
73 Number of processes to use for indexing. This can be used to do more indexing
74 work in parallel on multicore systems. By default, a single process is used.
76 =item B<-v|--verbose>
78 By default, this program only emits warnings and errors. This makes it talk
79 more. Add more to make it even more wordy, in particular when debugging.
81 =item B<-h|--help>
83 Help!
85 =item B<--man>
87 Full documentation.
89 =back
91 =head1 IMPLEMENTATION
93 =cut
95 use autodie;
96 use Getopt::Long;
97 use Koha::Script;
98 use C4::Context;
99 use Koha::MetadataRecord::Authority;
100 use Koha::BiblioUtils;
101 use Koha::SearchEngine::Elasticsearch::Indexer;
102 use MARC::Field;
103 use MARC::Record;
104 use Modern::Perl;
105 use Pod::Usage;
# ---------------------------------------------------------------------------
# Main script flow:
#   1. parse the command line,
#   2. check configuration and make sure the target indexes exist,
#   3. optionally fork worker processes, each handling one slice,
#   4. reindex biblios and/or authorities,
#   5. in the parent process, wait for the forked workers.
# ---------------------------------------------------------------------------

my $verbose = 0;
my $commit  = 5000;
my ( $delete, $help, $man, $processes );
my ( $index_biblios, $index_authorities );
my ( @biblionumbers, @authids );

$| = 1;    # flushes output

# Stop with a usage message on unknown or malformed options instead of
# silently carrying on with a (possibly destructive) reindex.
GetOptions(
    'c|commit=i'    => \$commit,
    'd|delete'      => \$delete,
    'a|authorities' => \$index_authorities,
    'b|biblios'     => \$index_biblios,
    'bn|bnumber=i'  => \@biblionumbers,
    'ai|authid=i'   => \@authids,
    'p|processes=i' => \$processes,
    'v|verbose+'    => \$verbose,
    'h|help'        => \$help,
    'man'           => \$man,
) or pod2usage(2);

# Answer requests for help before validating the remaining arguments.
pod2usage(1) if $help;
pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;

# Default is to do both
unless ( $index_authorities || $index_biblios ) {
    $index_authorities = $index_biblios = 1;
}

if ( $processes && ( @biblionumbers || @authids ) ) {
    die "Argument p|processes cannot be combined with bn|bnumber or ai|authid";
}

_sanity_check();

# Index creation/recreation happens once, before any worker is forked.
_verify_index_state( $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX,     $delete ) if ($index_biblios);
_verify_index_state( $Koha::SearchEngine::Elasticsearch::AUTHORITIES_INDEX, $delete ) if ($index_authorities);

my $slice_index = 0;
my $slice_count = ( $processes //= 1 );
my %iterator_options;

if ( $slice_count > 1 ) {

    # Fire up child processes for processing slices from 2 on. This main process will handle slice 1.
    $slice_index = 0;
    for ( my $proc = 1 ; $proc < $slice_count ; $proc++ ) {
        my $pid = fork();
        die "Failed to fork a child process\n" unless defined $pid;
        if ( $pid == 0 ) {

            # Child process, give it a slice to process
            $slice_index = $proc;
            last;
        }
    }

    # Fudge the commit count a bit to spread out the Elasticsearch commits
    $commit *= 1 + 0.10 * $slice_index;

    # The multiplication above can produce a fractional value; keep the
    # commit count an integer so batch-size bookkeeping stays exact.
    $commit = int($commit);

    _log( 1, "Processing slice @{[$slice_index + 1]} of $slice_count\n" );
    $iterator_options{slice} = { index => $slice_index, count => $slice_count };
}

# $next is the record source handed to _do_reindex. _do_reindex calls it in
# scalar context, so for the closures that return a two-element list only the
# last element (the record object) is actually used.
my $next;
if ($index_biblios) {
    _log( 1, "Indexing biblios\n" );
    if (@biblionumbers) {
        $next = sub {
            my $r = shift @biblionumbers;
            return () unless defined $r;
            return ( $r, Koha::BiblioUtils->get_from_biblionumber( $r, item_data => 1 ) );
        };
    }
    else {
        my $records = Koha::BiblioUtils->get_all_biblios_iterator(%iterator_options);
        $next = sub {
            $records->next();
        };
    }
    _do_reindex( $next, $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX );
}
if ($index_authorities) {
    _log( 1, "Indexing authorities\n" );
    if (@authids) {
        $next = sub {
            my $r = shift @authids;
            return () unless defined $r;
            my $a = Koha::MetadataRecord::Authority->get_from_authid($r);
            return ( $r, $a );
        };
    }
    else {
        my $records = Koha::MetadataRecord::Authority->get_all_authorities_iterator(%iterator_options);
        $next = sub {
            $records->next();
        };
    }
    _do_reindex( $next, $Koha::SearchEngine::Elasticsearch::AUTHORITIES_INDEX );
}

if ( $slice_index == 0 ) {

    # Main process, wait for children
    for ( my $proc = 1 ; $proc < $processes ; $proc++ ) {
        wait();
    }
}
=head2 _verify_index_state

    _verify_index_state($Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX, 1);

Inspects the named index and brings it into a usable state:

=over

=item * when the second argument is true, the index is dropped (if it exists)
and created again;

=item * otherwise a missing index is created;

=item * otherwise, if the indexer reports the index status as OK, its mappings
are updated;

=item * otherwise, if the indexer reports that a recreate is required, a
warning is emitted and nothing else is done.

=back

=cut

sub _verify_index_state {
    my ( $index_name, $recreate ) = @_;

    _log(1, "Checking state of $index_name index\n");
    my $es_indexer = Koha::SearchEngine::Elasticsearch::Indexer->new( { index => $index_name } );

    # Explicit request to start from scratch.
    if ($recreate) {
        _log(1, "Dropping and recreating $index_name index\n");
        if ( $es_indexer->index_exists() ) {
            $es_indexer->drop_index();
        }
        return $es_indexer->create_index();
    }

    # No index yet: simply create it.
    return $es_indexer->create_index() unless $es_indexer->index_exists;

    # Healthy index: only the mappings need refreshing.
    return $es_indexer->update_mappings() if $es_indexer->is_index_status_ok;

    # Index is in a problematic state; leave it alone but tell the operator.
    warn qq/Index "$index_name" has status "recreate required", suggesting it should be recreated/
        if $es_indexer->is_index_status_recreate_required;
}
=head2 _do_reindex

    _do_reindex($callback, $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX);

Does the actual reindexing. $callback is a function that returns the next
record each time it is called, and a false value once there is nothing left.

Each returned object is asked for its identifier (C<id>, falling back to
C<authid>) and its C<record>. Identifiers and records are buffered and handed
to the indexer's C<update_index> in batches of C<$commit> records; whatever
remains in the buffers is sent in a final call once the callback is exhausted.

Progress is reported through C<_log>: at verbosity 1 a message is printed every
1000 records, at other verbosity levels each identifier is logged at level 2.

=cut

sub _do_reindex {
    my ( $next, $index_name ) = @_;

    my $indexer = Koha::SearchEngine::Elasticsearch::Indexer->new( { index => $index_name } );

    # The configured commit count may be fractional (it is scaled per slice)
    # or not positive (user input). A countdown that has to hit exactly zero
    # would then never trigger and every record would pile up in memory, so
    # flush on a well-defined integer buffer size of at least one instead.
    my $batch_size = int($commit);
    $batch_size = 1 if $batch_size < 1;

    my $count = 0;
    my ( @id_buffer, @commit_buffer );
    while ( my $item = $next->() ) {
        my $id     = $item->id // $item->authid;
        my $record = $item->record;
        $count++;
        if ( $verbose == 1 ) {
            _log( 1, "$count records processed\n" ) if ( $count % 1000 == 0 );
        }
        else {
            _log( 2, "$id\n" );
        }

        push @id_buffer,     $id;
        push @commit_buffer, $record;
        if ( @commit_buffer >= $batch_size ) {
            _log( 1, "Committing $batch_size records...\n" );
            $indexer->update_index( \@id_buffer, \@commit_buffer );
            @id_buffer     = ();
            @commit_buffer = ();
            _log( 1, "Commit complete\n" );
        }
    }

    # There are probably uncommitted records
    _log( 1, "Committing final records...\n" );
    $indexer->update_index( \@id_buffer, \@commit_buffer );
    _log( 1, "Total $count records indexed\n" );
    return;
}
=head2 _sanity_check

    _sanity_check();

Pre-flight check run before any indexing work starts. Dies with an explanatory
message when the Koha configuration has no C<elasticsearch> entry.

=cut

sub _sanity_check {
    # Without an elasticsearch block in the configuration there is nothing to talk to.
    C4::Context->config('elasticsearch')
        or die "No 'elasticsearch' block is defined in koha-conf.xml.\n";
}
=head2 _log

    _log($level, "Message\n");

Progress reporting helper.

The message is printed, prefixed with the current process id in square
brackets, only when the script's verbosity is at least $level. No newline is
appended; the caller supplies it as part of the message.

=cut

sub _log {
    my ( $level, $msg ) = @_;

    # Stay quiet unless the user asked for at least this much detail.
    return unless $verbose >= $level;

    print "[$$] $msg";
}