Bug 2505 - Add commented use warnings where missing in the misc/ directory
[koha.git] / misc / migration_tools / rebuild_nozebra.pl
blob b6b82c5c8cf959fd29274b87f5c77d05914203e0
1 #!/usr/bin/perl
3 use C4::Context;
4 use Getopt::Long;
5 use C4::Biblio;
6 use C4::AuthoritiesMarc;
8 use strict;
9 #use warnings; FIXME - Bug 2505
# Script that fills the nozebra table.
#
# Command-line handling: declares the option variables, parses @ARGV with
# Getopt::Long and prints the usage text when help is requested or the
# options cannot be parsed.

$| = 1;    # unbuffered STDOUT, so the "\r<counter>" progress output shows up

# Optional SQL suffix interpolated into the SELECT statements that list the
# records to index (e.g. "LIMIT 100" while debugging). Left undefined by default.
my $limit;    # = "LIMIT 100";

my $directory;      # -d       : optional string value
my $authorities;    # declared, but no option currently sets it
my $sysprefs;       # -s       : rebuild the 'NoZebraIndexes' syspref
my $commit;         # --commit : optional numeric value
my $want_help;      # -h / --help

# Note: older revisions had -reset, -k, -b, -a and another -s (skip export)
# option. They are gone; do not re-add a second 's', it would conflict with
# the sysprefs switch below.
my $result = GetOptions(
    'd:s'      => \$directory,
    's'        => \$sysprefs,
    'h|help'   => \$want_help,
    'commit:f' => \$commit,
);

if ( not $result or $want_help ) {
    print_usage();

    # An explicit help request is a success; unparsable options are an error
    # and must be visible to calling shells / cron jobs.
    exit( $result ? 0 : 1 );
}
# Print the command-line help for this script on STDOUT.
#
# Takes no arguments. Only prints; deciding whether (and with which status)
# to exit is left to the caller.
sub print_usage {
    # Double-quoted heredoc: $0 is interpolated with the script name.
    print <<"_USAGE_";
$0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").

Use this batch job to reindex all biblio and authority
records in your Koha database.  This job is useful
only if you are NOT using Zebra ('NoZebra'); if you are
using the 'Zebra' mode, this job should NOT be used.

Parameters:
    -d                      Working directory (defaults to "export").
                            Currently not used by this script.

    -s                      Rebuild "NoZebraIndexes" System Preference

    --help or -h            show this message.
_USAGE_
    return;
} # END of print_usage sub
# Number of inserted rows between two commits; --commit overrides the default.
# The option is declared as 'commit:f', so fractional values are possible:
# truncate them and ignore anything below 1, because the value is used as a
# modulus and must be a positive integer.
my $commitnum = 1000;
$commitnum = int($commit) if ( $commit and $commit >= 1 );

$directory = "export" unless $directory;
my $dbh = C4::Context->dbh;

# Switch the NoZebra preference on and start from an empty index table.
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");
$dbh->do("truncate nozebra");

my %index = GetNoZebraIndexes();

# (Re)create the 'NoZebraIndexes' preference when no index definition could be
# read, or when -s was given. Only UNIMARC and MARC21 have built-in defaults;
# any other marcflavour leaves the preference untouched.
if ( !%index || $sysprefs ) {

    # One "'index' => 'tag+subfield list'," entry per line. A trailing '*'
    # after a tag means "every subfield of that tag".
    my %default_lines_for = (
        UNIMARC => [
            q{'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',},
            # The original list repeated 700a where 701a was evidently meant
            # (701b, 701c and 701d follow it).
            q{'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,701a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',},
            q{'isbn' => '010a',},
            q{'issn' => '011a',},
            q{'biblionumber' =>'0909',},
            q{'itemtype' => '200b',},
            q{'language' => '101a',},
            q{'publisher' => '210c',},
            q{'date' => '210d',},
            q{'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',},
            q{'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129',},
            q{'subject' => '600*,601*,606*,610*',},
            q{'dewey' => '676a',},
            q{'host-item' => '995a,995c',},
        ],
        MARC21 => [
            q{'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a',},
            q{'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a',},
            q{'isbn' => '020a',},
            q{'issn' => '022a',},
            q{'lccn' => '010a',},
            # Previously malformed: the closing quote after the key was missing.
            q{'biblionumber' => '999c',},
            q{'itemtype' => '942c',},
            q{'publisher' => '260b',},
            q{'date' => '260c',},
            q{'note' => '500a,501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a',},
            q{'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*',},
            # NOTE(review): '082' carries neither a subfield code nor '*', so the
            # "<tag>*" / "<tag><subfield>" matching used when indexing biblios
            # can never select it -- confirm whether '082a' was intended.
            q{'dewey' => '082',},
            q{'bc' => '952p',},
            q{'callnum' => '952o',},
            q{'an' => '6009,6109,6119',},
            # Previously malformed: the value was not quoted at all ...
            q{'series' => '440*,490*',},
            # ... and this entry lacked its closing quote and comma.
            q{'host-item' => '9529',},
            q{'shelf' => '952c',},
            q{'collection' => '9528',},
        ],
    );

    my $marcflavour = C4::Context->preference('marcflavour');
    $marcflavour = '' unless defined $marcflavour;

    if ( my $lines = $default_lines_for{$marcflavour} ) {

        # Bind the value instead of embedding it in the statement: no quoting
        # games, and no dependency on the server accepting "..." as a string.
        $dbh->do(
            "UPDATE systempreferences SET value=? WHERE variable='NoZebraIndexes'",
            undef, join( "\n", @$lines )
        );
        %index = GetNoZebraIndexes();
    }
}
# ---------------------------------------------------------------------------
# Index building: biblios first, then authorities.
#
# For each server the records are scanned into
#     $result{<index name>}{<word>} = "<id>,<title>-<weight>;<id>,<title>-<weight>;..."
# and every (index name, word) pair is then written as one row of `nozebra`.
# ---------------------------------------------------------------------------

# Reduce a heading to the short key stored next to each record id: blanks and
# punctuation that would get in the way when the string is decoded again are
# removed, then the result is cut to 10 characters to limit the table size.
sub _nz_short_title {
    my ($title) = @_;
    $title = '' unless defined $title;
    $title =~ s/[ .,;\[\]()*\-':=]//g;
    return substr( $title, 0, 10 );
}

# Add every word of $value to $bucket (a hashref word => entry string) for the
# record $id / $title. $noise is the pattern whose matches are replaced by a
# space before the value is split into words.
# The first occurrence of a word for a record is stored with weight 1; each
# further occurrence replaces that record's entry with weight + 1.
sub _nz_add_terms {
    my ( $bucket, $id, $title, $value, $noise ) = @_;

    my $line = lc( defined $value ? $value : '' );
    $line =~ s/$noise/ /g;

    # Match this record's entry only: anchored at the start of an entry (start
    # of string or right after ';') so that id 23 cannot match inside
    # "123,...", and with (\d+) so that weights of 10 and more are still found.
    my $entry_re = qr/(?:\A|(?<=;))\Q$id,$title\E-(\d+);/;

    foreach my $word ( split / /, $line ) {
        next unless $word;    # empty strings come from consecutive separators
        my $weight = 1;
        if ( defined $bucket->{$word} and $bucket->{$word} =~ s/$entry_re// ) {
            $weight = $1 + 1;
        }
        $bucket->{$word} .= "$id,$title-$weight;";
    }
    return;
}

# Scan every subfield of $record (tags below 010 are skipped) and add its
# words to $result. $index maps an index name to a comma-separated list of
# "<tag><subfield>" or "<tag>*" specifications; a subfield is added to every
# index whose list mentions it, and to __RAW__ when no index does.
sub _nz_index_record {
    my ( $result, $index, $record, $id, $title ) = @_;

    # ':' is only treated as a separator for values that belong to an index;
    # __RAW__ values keep it, as they always have.
    my $indexed_noise = qr/[-.?,;!'()\[\]{}"<>&+*\/=:]/;
    my $raw_noise     = qr/[-.?,;!'()\[\]{}"<>&+*\/=]/;

    foreach my $field ( $record->fields() ) {
        next if $field->tag < 10;
        my $tag = $field->tag();
        foreach my $subfield ( $field->subfields() ) {
            my ( $code, $value ) = @$subfield;    # [ subfield code, data ]
            my $indexed = 0;
            foreach my $key ( keys %$index ) {
                my $spec = $index->{$key};
                next unless ( $spec =~ /\Q$tag\E\*/ || $spec =~ /\Q$tag$code\E/ );
                $indexed = 1;
                _nz_add_terms( $result->{$key} ||= {}, $id, $title, $value, $indexed_noise );
            }

            # the subfield is not indexed, store it in __RAW__ index anyway
            unless ($indexed) {
                _nz_add_terms( $result->{__RAW__} ||= {}, $id, $title, $value, $raw_noise );
            }
        }
    }
    return;
}

# Write $result (index name => { word => entry string }) into `nozebra` for
# $server, committing every $commitnum rows and once more at the end.
# Returns the number of rows inserted.
sub _nz_store_index {
    my ( $dbh, $server, $result, $commitnum ) = @_;

    # The interval is used as a modulus: force a positive integer.
    my $every = int( $commitnum || 0 );
    $every = 1 if $every < 1;

    my $insert = $dbh->prepare(
        "INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES (?,?,?,?)");

    my $stored = 0;
    foreach my $indexname ( keys %$result ) {
        my $words = $result->{$indexname};
        foreach my $word ( keys %$words ) {
            my $size = length $words->{$word};
            if ( $size > 1000000 ) {
                print "very long index ($size)for $indexname / $word. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
            }
            print "\r$stored";
            $stored++;
            $insert->execute( $server, $indexname, $word, $words->{$word} );
            $dbh->commit() if ( 0 == $stored % $every );
        }
    }
    $dbh->commit;
    return $stored;
}

$| = 1;

# From here on rows are committed explicitly, in batches.
$dbh->{AutoCommit} = 0;

# $limit is normally undefined; interpolate an empty string in that case.
my $limit_clause = defined $limit ? $limit : '';

print "***********************************\n";
print "***** building BIBLIO indexes *****\n";
print "***********************************\n";

my $sth;
$sth = $dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit_clause");
$sth->execute();
my $i = 0;
my %result;

# The title field is looked up with constant arguments: do it once, not per record.
my ( $titletag, $titlesubfield ) = GetMarcFromKohaField( 'biblio.title', '' );

while ( my ($biblionumber) = $sth->fetchrow_array ) {
    $i++;
    print "\r$i";
    my $record;
    eval {
        $record = GetMarcBiblio($biblionumber);
    };
    if ($@) {
        print " There was some pb getting biblionumber : " . $biblionumber . "\n";
        next;
    }
    next unless $record;

    # get title of the record (to store the 10 first letters with the index)
    my $title = $record->subfield( $titletag, $titlesubfield );
    $title = _nz_short_title( lc( defined $title ? $title : '' ) );

    _nz_index_record( \%result, \%index, $record, $biblionumber, $title );
}

print "\nInserting records...\n";

# $commitnum comes from the option handling above (--commit, default 1000);
# it is no longer overridden with a hard-coded 100 here.
_nz_store_index( $dbh, 'biblioserver', \%result, $commitnum );

print "\nbiblios done\n";

print "\n***********************************\n";
print "***** building AUTHORITIES indexes *****\n";
print "***********************************\n";

$sth = $dbh->prepare("select authid from auth_header order by authid $limit_clause");
$sth->execute();
$i      = 0;
%result = ();
while ( my ($authid) = $sth->fetchrow_array ) {
    $i++;
    print "\r$i";
    my $record;
    eval {
        $record = GetAuthority($authid);
    };
    if ($@) {
        print " There was some pb getting authnumber : " . $authid . "\n";
        next;
    }

    # Nothing to index (and nothing to call methods on) without a record.
    next unless $record;

    # for authorities, the "title" is the $a mainentry
    my $authref = C4::AuthoritiesMarc::GetAuthType( C4::AuthoritiesMarc::GetAuthTypeCode($authid) );
    warn "ERROR : authtype undefined for " . $record->as_formatted unless $authref;

    # With an unknown authority type the tag is empty: the record is still
    # indexed, but nothing can match the main-entry indexes.
    my $report_tag = $authref ? $authref->{auth_tag_to_report} : undef;
    $report_tag = '' unless defined $report_tag;

    # Authorities use their own, per-record index definition.
    my %auth_index = (
        mainmainentry => $report_tag . 'a',
        mainentry     => $report_tag . '*',
        auth_type     => '152b',
    );

    # The stored title is kept verbatim (it used to be run through quotemeta,
    # which put backslashes into the data and kept the weight update from ever
    # matching); escaping now happens only inside the pattern that looks the
    # entry up.
    my $title = length($report_tag) ? $record->subfield( $report_tag, 'a' ) : undef;
    $title = _nz_short_title($title);

    _nz_index_record( \%result, \%auth_index, $record, $authid, $title );
}

print "\nInserting...\n";
_nz_store_index( $dbh, 'authorityserver', \%result, $commitnum );
print "\nauthorities done\n";