misc/migration_tools/rebuild_nozebra.pl

   1 #!/usr/bin/perl
   2
   3 use C4::Context;
   4 use Getopt::Long;
   5 use C4::Biblio;
   6 use C4::AuthoritiesMarc;
   7
   8 use strict;
   9 #use warnings; FIXME - Bug 2505
  10 #
  11 # script that fills the nozebra table
  12 #
  13 #
  14
  15 $|=1; # flushes output
  16
  17 # limit for database dumping
  18 my $limit;# = "LIMIT 100";
  19 my $directory;
  20 #my $skip_export;
  21 #my $keep_export;
  22 #my $reset;
  23 #my $biblios;
  24 my $authorities;
  25 my $sysprefs;
  26 my $commit;
  27 my $want_help;
  28
  29 my $result = GetOptions(
  30     'd:s'      => \$directory,
  31 #    'reset'      => \$reset,
  32 #    's'        => \$skip_export,    # Not used and conflicts with 's' option some lines below for sysprefs!!!
  33 #    'k'        => \$keep_export,
  34 #    'b'        => \$biblios,
  35 #    'a'        => \$authorities,
  36     's'        => \$sysprefs,  # rebuild 'NoZebraIndexes' syspref
  37     'h|help'        => \$want_help,
  38    'commit:f'    => \$commit,
  39     );
  40
  41 if (not $result or $want_help) {
  42     print_usage();
  43     exit 0;
  44 }
  45
  46
  47 sub print_usage {
  48     print <<_USAGE_;
  49 $0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").
  50
  51 Use this batch job to reindex all biblio and authority
  52 records in your Koha database.  This job is useful
  53 only if you are NOT using Zebra ('NoZebra'); if you are
  54 using the 'Zebra'mode, this job should NOT be used.
  55
  56 Parameters:
  57     -d                      Temporary directory for indexing.
  58                             If not specified, one is automatically
  59                             created.  The export directory
  60                             is automatically deleted unless
  61                             you supply the -k switch.
  62
  63     -s                      Rebuild "NoZebraIndexes" System Preference
  64
  65     --help or -h            show this message.
  66 _USAGE_
  67 }   # END of print_usage sub
  68
  69
  70 my $commitnum = 1000;
  71 $commitnum = $commit if ($commit) ;
  72
  73 $directory = "export" unless $directory;
  74 my $dbh=C4::Context->dbh;
  75 $dbh->do("update systempreferences set value=1 where variable='NoZebra'");
  76
  77 $dbh->do("truncate nozebra");
  78
  79 my %index = GetNoZebraIndexes();
  80
  81 if  (!%index || $sysprefs ) {
  82     if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
  83         $dbh->do("UPDATE systempreferences SET value=\"'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
  84         'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
  85         'isbn' => '010a',
  86         'issn' => '011a',
  87         'biblionumber' =>'0909',
  88         'itemtype' => '200b',
  89         'language' => '101a',
  90         'publisher' => '210c',
  91         'date' => '210d',
  92         'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
  93         'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129',
  94         'subject' => '600*,601*,606*,610*',
  95         'dewey' => '676a',
  96         'host-item' => '995a,995c',\" where variable='NoZebraIndexes'");
  97         %index = GetNoZebraIndexes();
  98     } elsif (C4::Context->preference('marcflavour') eq 'MARC21') {
  99         $dbh->do("UPDATE systempreferences SET value=\"
 100 'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a',
 101 'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a',
 102 'isbn' => '020a',
 103 'issn' => '022a',
 104 'lccn' => '010a',
 105 'biblionumber => '999c',
 106 'itemtype' => '942c',
 107 'publisher' => '260b',
 108 'date' => '260c',
 109 'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a',
 110 'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*',
 111 'dewey' => '082',
 112 'bc' => '952p',
 113 'callnum' => '952o',
 114 'an' => '6009,6109,6119',
 115 'series' => 440*,490*,
 116 'host-item' => '9529
 117 'shelf' => '952c',
 118 'collection' => '9528',
 119 \"WHERE variable='NoZebraIndexes'");
 120
 121         %index = GetNoZebraIndexes();
 122     }
 123 }
 124 $|=1;
 125
 126 $dbh->{AutoCommit} = 0;
 127
 128 print "***********************************\n";
 129 print "***** building BIBLIO indexes *****\n";
 130 print "***********************************\n";
 131
 132 my $sth;
 133 $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
 134 $sth->execute();
 135 my $i=0;
 136 my %result;
 137 while (my ($biblionumber) = $sth->fetchrow) {
 138         $i++;
 139         print "\r$i";
 140         my  $record;
 141     eval{
 142             $record = GetMarcBiblio($biblionumber);
 143     };
 144     if($@){
 145             print "  There was some pb getting biblionumber : ".$biblionumber."\n";
 146             next;
 147     }
 148     next unless $record;
 149     # get title of the record (to store the 10 first letters with the index)
 150     my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title', '');
 151     my $title = lc($record->subfield($titletag,$titlesubfield));
 152
 153     # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
 154     $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g;
 155     # limit to 10 char, should be enough, and limit the DB size
 156     $title = substr($title,0,10);
 157     #parse each field
 158     foreach my $field ($record->fields()) {
 159         #parse each subfield
 160         next if $field->tag <10;
 161         foreach my $subfield ($field->subfields()) {
 162             my $tag = $field->tag();
 163             my $subfieldcode = $subfield->[0];
 164             my $indexed=0;
 165             # check each index to see if the subfield is stored somewhere
 166             # otherwise, store it in __RAW__ index
 167             foreach my $key (keys %index) {
 168                 if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
 169                     $indexed=1;
 170                     my $line= lc $subfield->[1];
 171                     # remove meaningless value in the field...
 172                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
 173                     # ... and split in words
 174                     foreach (split / /,$line) {
 175                         next unless $_; # skip  empty values (multiple spaces)
 176                         # remove any accented char
 177                         # if the entry is already here, improve weight
 178                         if ($result{$key}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
 179                             my $weight=$1+1;
 180                             $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
 181                             $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;";
 182                         # otherwise, create it, with weight=1
 183                         } else {
 184                             $result{$key}->{"$_"}.="$biblionumber,$title-1;";
 185                         }
 186                     }
 187                 }
 188             }
 189             # the subfield is not indexed, store it in __RAW__ index anyway
 190             unless ($indexed) {
 191                 my $line= lc $subfield->[1];
 192                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
 193                 foreach (split / /,$line) {
 194                         next unless $_;
 195 #                     warn $record->as_formatted."$_ =>".$title;
 196                         if ($result{__RAW__}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
 197                             my $weight=$1+1;
 198 #                             $weight++;
 199                             $result{__RAW__}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
 200                             $result{__RAW__}->{"$_"} .= "$biblionumber,$title-$weight;";
 201                         } else {
 202                             $result{__RAW__}->{"$_"}.="$biblionumber,$title-1;";
 203                         }
 204                 }
 205             }
 206         }
 207     }
 208 }
 209
 210
 211 print "\nInserting records...\n";
 212 $i=0;
 213
 214 my $commitnum = 100;
 215 $dbh->{AutoCommit} = 0;
 216
 217 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)");
 218 foreach my $key (keys %result) {
 219     foreach my $index (keys %{$result{$key}}) {
 220         if (length($result{$key}->{$index}) > 1000000) {
 221             print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_paquet_size parameter)\n";
 222         }
 223         print "\r$i";
 224         $i++;
 225         $sth->execute($key,$index,$result{$key}->{$index});
 226         $dbh->commit() if (0 == $i % $commitnum);
 227     }
 228    $dbh->commit() if (0 == $i % $commitnum);
 229 }
 230 $dbh->commit;
 231
 232
 233 print "\nbiblios done\n";
 234
 235 print "\n***********************************\n";
 236 print "***** building AUTHORITIES indexes *****\n";
 237 print "***********************************\n";
 238
 239 $sth=$dbh->prepare("select authid from auth_header order by authid $limit");
 240 $sth->execute();
 241 $i=0;
 242 %result = ();
 243 while (my ($authid) = $sth->fetchrow) {
 244     $i++;
 245     print "\r$i";
 246     my $record;
 247     eval{
 248         $record = GetAuthority($authid);
 249     };
 250     if($@){
 251         print "  There was some pb getting authnumber : ".$authid."\n";
 252         next;
 253     }
 254
 255     my %index;
 256     # for authorities, the "title" is the $a mainentry
 257     my $authref = C4::AuthoritiesMarc::GetAuthType(C4::AuthoritiesMarc::GetAuthTypeCode($authid));
 258
 259     warn "ERROR : authtype undefined for ".$record->as_formatted unless $authref;
 260     my $title = $record->subfield($authref->{auth_tag_to_report},'a');
 261     $index{'mainmainentry'}= $authref->{'auth_tag_to_report'}.'a';
 262     $index{'mainentry'}    = $authref->{'auth_tag_to_report'}.'*';
 263     $index{'auth_type'}    = '152b';
 264
 265     # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
 266     $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g;
 267     $title = quotemeta $title;
 268     # limit to 10 char, should be enough, and limit the DB size
 269     $title = substr($title,0,10);
 270     #parse each field
 271     foreach my $field ($record->fields()) {
 272         #parse each subfield
 273         next if $field->tag <10;
 274         foreach my $subfield ($field->subfields()) {
 275             my $tag = $field->tag();
 276             my $subfieldcode = $subfield->[0];
 277             my $indexed=0;
 278             # check each index to see if the subfield is stored somewhere
 279             # otherwise, store it in __RAW__ index
 280             foreach my $key (keys %index) {
 281                 if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) {
 282                     $indexed=1;
 283                     my $line= lc $subfield->[1];
 284                     # remove meaningless value in the field...
 285                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
 286                     # ... and split in words
 287                     foreach (split / /,$line) {
 288                         next unless $_; # skip  empty values (multiple spaces)
 289                         # if the entry is already here, improve weight
 290                         if ($result{$key}->{"$_"} =~ /$authid,$title\-(\d);/) {
 291                             my $weight=$1+1;
 292                             $result{$key}->{"$_"} =~ s/$authid,$title\-(\d);//;
 293                             $result{$key}->{"$_"} .= "$authid,$title-$weight;";
 294                         # otherwise, create it, with weight=1
 295                         } else {
 296                             $result{$key}->{"$_"}.="$authid,$title-1;";
 297                         }
 298                     }
 299                 }
 300             }
 301             # the subfield is not indexed, store it in __RAW__ index anyway
 302             unless ($indexed) {
 303                 my $line= lc $subfield->[1];
 304                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
 305                 foreach (split / /,$line) {
 306                         next unless $_;
 307 #                     warn $record->as_formatted."$_ =>".$title;
 308                         if ($result{__RAW__}->{"$_"} =~ /$authid,$title\-(\d);/) {
 309                             my $weight=$1+1;
 310 #                             $weight++;
 311                             $result{__RAW__}->{"$_"} =~ s/$authid,$title\-(\d);//;
 312                             $result{__RAW__}->{"$_"} .= "$authid,$title-$weight;";
 313                         } else {
 314                             $result{__RAW__}->{"$_"}.="$authid,$title-1;";
 315                         }
 316                 }
 317             }
 318         }
 319     }
 320 }
 321
 322
 323
 324 print "\nInserting...\n";
 325 $i=0;
 326
 327 my $commitnum = 100;
 328 $dbh->{AutoCommit} = 0;
 329 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)");
 330 foreach my $key (keys %result) {
 331     foreach my $index (keys %{$result{$key}}) {
 332         if (length($result{$key}->{$index}) > 1000000) {
 333             print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_paquet_size parameter)\n";
 334         }
 335         print "\r$i";
 336         $i++;
 337         $sth->execute($key,$index,$result{$key}->{$index});
 338         $dbh->commit() if (0 == $i % $commitnum);
 339     }
 340    $dbh->commit() if (0 == $i % $commitnum);
 341 }
 342 $dbh->commit;
 343 print "\nauthorities done\n";