Updated ScriptAlias to point to opac/opac-search.pl instead of
[koha.git] / misc / migration_tools / rebuild_nozebra.pl
blob7432c5e6b4cfcc21a4e1f5b89af61dc6a04a3498
1 #!/usr/bin/perl
3 use C4::Context;
4 use Getopt::Long;
5 use C4::Biblio;
6 use C4::AuthoritiesMarc;
8 use strict;
9 #
10 # script that fills the nozebra table
14 $|=1; # flushes output
# Optional SQL "LIMIT ..." clause interpolated into the record-selection
# queries further down; left undefined for a full rebuild.
my $limit;# = "LIMIT 100";
my $directory;
#my $skip_export;
#my $keep_export;
#my $reset;
#my $biblios;
my $authorities;
my $sysprefs;
my $commit;
my $want_help;

# Command-line switches:
#   -d [dir]      directory name (value is optional)
#   -s            rebuild the 'NoZebraIndexes' system preference
#   -h, --help    print the usage text
#   --commit [n]  number of inserted rows per database commit; parsed as an
#                 integer because it is used as a row count
my $result = GetOptions(
    'd:s'      => \$directory,
#   'reset'    => \$reset,
#   's'        => \$skip_export, # Not used and conflicts with 's' option some lines below for sysprefs!!!
#   'k'        => \$keep_export,
#   'b'        => \$biblios,
#   'a'        => \$authorities,
    's'        => \$sysprefs,    # rebuild 'NoZebraIndexes' syspref
    'h|help'   => \$want_help,
    'commit:i' => \$commit,
);

# A parse failure is an error: show the usage text and report it through a
# non-zero exit status so that calling scripts can detect the problem.
if (not $result) {
    print_usage();
    exit 1;
}

# An explicit request for help is a normal, successful run.
if ($want_help) {
    print_usage();
    exit 0;
}
# Print the command-line help text for this script on STDOUT.
# Takes no arguments.  The text lists only the switches that the option
# parser of this script actually accepts (-d, -s, --commit, -h/--help).
sub print_usage {
    print <<_USAGE_;
$0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").

Use this batch job to reindex all biblio and authority
records in your Koha database.  This job is useful
only if you are NOT using Zebra ('NoZebra'); if you are
using the 'Zebra' mode, this job should NOT be used.

Parameters:
    -d                  Temporary directory for indexing.
                        Defaults to "export" if not specified.

    -s                  Rebuild "NoZebraIndexes" System Preference

    --commit N          Number of rows to insert per database commit.

    --help or -h        show this message.
_USAGE_
} # END of print_usage sub
# Number of inserted rows between two database commits.  --commit overrides
# the default; values below 1 are ignored because the value is later used as
# the right-hand side of a modulus, which must not truncate to zero.
my $commitnum = 1000;
$commitnum = int($commit) if ($commit and $commit >= 1);

$directory = "export" unless $directory;
my $dbh=C4::Context->dbh;

# Flag the installation as NoZebra and start from an empty index table.
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");

$dbh->do("truncate nozebra");

my %index = GetNoZebraIndexes();

# (Re)create the 'NoZebraIndexes' system preference when it yields no index
# definitions or when -s was given.  Each definition maps an index name to a
# comma-separated list of MARC tag+subfield codes ("200a") or tag wildcards
# ("600*"); that is the form the indexing loops of this script match against.
if (!%index || $sysprefs ) {
    my $marcflavour = C4::Context->preference('marcflavour');
    my @index_lines;

    if ($marcflavour eq 'UNIMARC') {
        @index_lines = (
            q{'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a'},
            # 701a replaces a duplicated 700a so the list follows the
            # a,b,c,d pattern used for every other 7xx tag.
            q{'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,701a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d'},
            q{'isbn' => '010a'},
            q{'issn' => '011a'},
            q{'biblionumber' =>'0909'},
            q{'itemtype' => '200b'},
            q{'language' => '101a'},
            q{'publisher' => '210c'},
            q{'date' => '210d'},
            q{'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a'},
            q{'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129'},
            q{'subject' => '600*,601*,606*,610*'},
            q{'dewey' => '676a'},
            q{'host-item' => '995a,995c'},
        );
    }
    elsif ($marcflavour eq 'MARC21') {
        @index_lines = (
            q{'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a'},
            q{'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a'},
            q{'isbn' => '020a'},
            q{'issn' => '022a'},
            q{'lccn' => '010a'},
            # closing quote after the index name was missing
            q{'biblionumber' => '999c'},
            q{'itemtype' => '942c'},
            q{'publisher' => '260b'},
            q{'date' => '260c'},
            q{'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a'},
            q{'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*'},
            # a bare tag ('082') can match neither the "tag*" nor the
            # "tag+subfield" test of the indexing loop; name the subfield
            q{'dewey' => '082a'},
            q{'bc' => '952p'},
            q{'callnum' => '952o'},
            q{'an' => '6009,6109,6119'},
            # field list was not quoted at all
            q{'series' => '440*,490*'},
            # closing quote and separating comma were missing
            q{'host-item' => '9529'},
            q{'shelf' => '952c'},
            q{'collection' => '9528'},
        );
    }

    if (@index_lines) {
        # One definition per line, each terminated by a comma, as before.
        my $value = join(",\n", @index_lines) . ',';
        # Bind the value instead of embedding it in hand-quoted SQL.
        $dbh->do(
            "UPDATE systempreferences SET value=? WHERE variable='NoZebraIndexes'",
            undef, $value
        );
        %index = GetNoZebraIndexes();
    }
}
$|=1;

# Inserts are committed explicitly, in batches, further down.
$dbh->{AutoCommit} = 0;

print "***********************************\n";
print "***** building BIBLIO indexes *****\n";
print "***********************************\n";

# Build, in memory, the index entries for every bibliographic record:
#   $result{<index name>}->{<word>} = "biblionumber,title-weight;..."
# where "weight" is the number of times the word occurs in the subfields that
# feed the index for that record.  Subfields that feed no configured index go
# to the catch-all '__RAW__' index.
my $sth;
$sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
$sth->execute();
my $i=0;
my %result;
while (my ($biblionumber) = $sth->fetchrow) {
    $i++;
    print "\r$i";
    my $record;
    eval{
        $record = GetMarcBiblio($biblionumber);
    };
    if($@){
        print " There was some pb getting biblionumber : ".$biblionumber."\n";
        next;
    }
    next unless $record;
    # get title of the record (to store the 10 first letters with the index)
    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title', '');
    my $title = lc($record->subfield($titletag,$titlesubfield));

    # remove blanks, commas (that could cause problems when decoding the string for CQL retrieval) and regexp specific values
    $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g;
    # limit to 10 char, should be enough, and limit the DB size
    $title = substr($title,0,10);

    # Occurrences of each word in this record, per index.  Counting here and
    # writing one entry per (index, word) afterwards keeps the weight correct
    # beyond 9 (the former in-string search only recognised a single digit and
    # then started a duplicate entry at weight 1) and avoids re-scanning the
    # ever-growing result strings for every word.
    my %weight;

    #parse each field
    foreach my $field ($record->fields()) {
        # tags below 010 are skipped
        next if $field->tag <10;
        #parse each subfield
        foreach my $subfield ($field->subfields()) {
            my $tag = $field->tag();
            my $subfieldcode = $subfield->[0];
            my $indexed=0;
            # check each index to see if the subfield is stored somewhere
            # otherwise, store it in __RAW__ index
            foreach my $key (keys %index) {
                if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
                    $indexed=1;
                    my $line= lc $subfield->[1];
                    # remove meaningless value in the field...
                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
                    # ... and split in words
                    foreach (split / /,$line) {
                        next unless $_; # skip empty values (multiple spaces)
                        $weight{$key}->{$_}++;
                    }
                }
            }
            # the subfield is not indexed, store it in __RAW__ index anyway
            unless ($indexed) {
                my $line= lc $subfield->[1];
                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
                foreach (split / /,$line) {
                    next unless $_;
                    $weight{__RAW__}->{$_}++;
                }
            }
        }
    }

    # Append exactly one "biblionumber,title-weight;" entry per index and word.
    foreach my $key (keys %weight) {
        foreach my $word (keys %{$weight{$key}}) {
            my $count = $weight{$key}->{$word};
            $result{$key}->{$word} .= "$biblionumber,$title-$count;";
        }
    }
}
print "\nInserting records...\n";
$i=0;

# Rows per commit.  Use the script-wide $commitnum (which carries the
# --commit value) instead of shadowing it with a hard-coded 100, and fall
# back to 1000 if it would truncate to zero, since it is a modulus operand.
my $biblio_batch = (defined $commitnum && int($commitnum) >= 1) ? int($commitnum) : 1000;
$dbh->{AutoCommit} = 0;

# One nozebra row per (index name, word); the third bound value is the
# accumulated "biblionumber,title-weight;" list for that word.
$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)");
foreach my $key (keys %result) {
    foreach my $index (keys %{$result{$key}}) {
        if (length($result{$key}->{$index}) > 1000000) {
            print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
        }
        print "\r$i";
        $i++;
        $sth->execute($key,$index,$result{$key}->{$index});
        $dbh->commit() if (0 == $i % $biblio_batch);
    }
}
# commit whatever is left of the last, partial batch
$dbh->commit;

print "\nbiblios done\n";
print "\n***********************************\n";
print "***** building AUTHORITIES indexes *****\n";
print "***********************************\n";

# Build, in memory, the index entries for every authority record:
#   $result{<index name>}->{<word>} = "authid,title-weight;..."
# The indexes are derived per record from its authority type
# ('mainmainentry', 'mainentry', 'auth_type'); subfields that feed none of
# them go to the catch-all '__RAW__' index.
$sth=$dbh->prepare("select authid from auth_header order by authid $limit");
$sth->execute();
$i=0;
%result = ();
while (my ($authid) = $sth->fetchrow) {
    $i++;
    print "\r$i";
    my $record;
    eval{
        $record = GetAuthority($authid);
    };
    if($@){
        print " There was some pb getting authnumber : ".$authid."\n";
        next;
    }
    # nothing to index (and nothing to call methods on) without a record
    next unless $record;

    my %auth_index;
    # for authorities, the "title" is the $a mainentry
    my $authref = C4::AuthoritiesMarc::GetAuthType(C4::AuthoritiesMarc::GetAuthTypeCode($authid));

    warn "ERROR : authtype undefined for ".$record->as_formatted unless $authref;
    my $title = $record->subfield($authref->{auth_tag_to_report},'a');
    $auth_index{'mainmainentry'}= $authref->{'auth_tag_to_report'}.'a';
    $auth_index{'mainentry'}    = $authref->{'auth_tag_to_report'}.'*';
    $auth_index{'auth_type'}    = '152b';

    # remove blanks, commas (that could cause problems when decoding the string for CQL retrieval) and regexp specific values
    $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g;
    # The title is stored as plain text, like in the biblio section; it is no
    # longer passed through quotemeta, which put backslashes into the stored
    # value and could be cut in the middle of an escape by the substr below.
    # limit to 10 char, should be enough, and limit the DB size
    $title = substr($title,0,10);

    # Occurrences of each word in this record, per index.  One entry per
    # (index, word) is written afterwards, so weights above 9 stay correct
    # and no regex search over the accumulated strings is needed.
    my %weight;

    #parse each field
    foreach my $field ($record->fields()) {
        # tags below 010 are skipped
        next if $field->tag <10;
        #parse each subfield
        foreach my $subfield ($field->subfields()) {
            my $tag = $field->tag();
            my $subfieldcode = $subfield->[0];
            my $indexed=0;
            # check each index to see if the subfield is stored somewhere
            # otherwise, store it in __RAW__ index
            foreach my $key (keys %auth_index) {
                if ($auth_index{$key} =~ /\Q$tag\E\*/ or $auth_index{$key} =~ /\Q$tag$subfieldcode\E/) {
                    $indexed=1;
                    my $line= lc $subfield->[1];
                    # remove meaningless value in the field...
                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
                    # ... and split in words
                    foreach (split / /,$line) {
                        next unless $_; # skip empty values (multiple spaces)
                        $weight{$key}->{$_}++;
                    }
                }
            }
            # the subfield is not indexed, store it in __RAW__ index anyway
            unless ($indexed) {
                my $line= lc $subfield->[1];
                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
                foreach (split / /,$line) {
                    next unless $_;
                    $weight{__RAW__}->{$_}++;
                }
            }
        }
    }

    # Append exactly one "authid,title-weight;" entry per index and word.
    foreach my $key (keys %weight) {
        foreach my $word (keys %{$weight{$key}}) {
            my $count = $weight{$key}->{$word};
            $result{$key}->{$word} .= "$authid,$title-$count;";
        }
    }
}
print "\nInserting...\n";
$i=0;

# Rows per commit.  Use the script-wide $commitnum (which carries the
# --commit value) instead of shadowing it with a hard-coded 100, and fall
# back to 1000 if it would truncate to zero, since it is a modulus operand.
my $auth_batch = (defined $commitnum && int($commitnum) >= 1) ? int($commitnum) : 1000;
$dbh->{AutoCommit} = 0;
# One nozebra row per (index name, word); the third bound value is the
# accumulated "authid,title-weight;" list for that word.
$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)");
foreach my $key (keys %result) {
    foreach my $index (keys %{$result{$key}}) {
        if (length($result{$key}->{$index}) > 1000000) {
            print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
        }
        print "\r$i";
        $i++;
        $sth->execute($key,$index,$result{$key}->{$index});
        $dbh->commit() if (0 == $i % $auth_batch);
    }
}
# commit whatever is left of the last, partial batch
$dbh->commit;
print "\nauthorities done\n";