misc/migration_tools/rebuild_nozebra.pl

   1 #!/usr/bin/perl
   2
   3 use C4::Context;
   4 use Getopt::Long;
   5 use C4::Biblio;
   6 use C4::AuthoritiesMarc;
   7
   8 use strict;
   9 #
  10 # script that fills the nozebra table
  11 #
  12 #
  13
  14 $|=1; # flushes output
  15
  16 # limit for database dumping
  17 my $limit;# = "LIMIT 100";
  18 my $directory;
  19 #my $skip_export;
  20 #my $keep_export;
  21 #my $reset;
  22 #my $biblios;
  23 my $authorities;
  24 my $sysprefs;
  25 my $commit;
  26 my $want_help;
  27
  28 my $result = GetOptions(
  29     'd:s'      => \$directory,
  30 #    'reset'      => \$reset,
  31 #    's'        => \$skip_export,    # Not used and conflicts with 's' option some lines below for sysprefs!!!
  32 #    'k'        => \$keep_export,
  33 #    'b'        => \$biblios,
  34 #    'a'        => \$authorities,
  35     's'        => \$sysprefs,  # rebuild 'NoZebraIndexes' syspref
  36     'h|help'        => \$want_help,
  37    'commit:f'    => \$commit,
  38     );
  39
  40 if (not $result or $want_help) {
  41     print_usage();
  42     exit 0;
  43 }
  44
  45
  46 sub print_usage {
  47     print <<_USAGE_;
  48 $0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").
  49
  50 Use this batch job to reindex all biblio and authority
  51 records in your Koha database.  This job is useful
  52 only if you are NOT using Zebra ('NoZebra'); if you are
  53 using the 'Zebra'mode, this job should NOT be used.
  54
  55 Parameters:
  56     -d                      Temporary directory for indexing.
  57                             If not specified, one is automatically
  58                             created.  The export directory
  59                             is automatically deleted unless
  60                             you supply the -k switch.
  61
  62     -s                      Rebuild "NoZebraIndexes" System Preference
  63
  64     --help or -h            show this message.
  65 _USAGE_
  66 }   # END of print_usage sub
  67
  68
  69 my $commitnum = 1000;
  70 $commitnum = $commit if ($commit) ;
  71
  72 $directory = "export" unless $directory;
  73 my $dbh=C4::Context->dbh;
  74 $dbh->do("update systempreferences set value=1 where variable='NoZebra'");
  75
  76 $dbh->do("truncate nozebra");
  77
  78 my %index = GetNoZebraIndexes();
  79
  80 if  (!%index || $sysprefs ) {
  81     if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
  82         $dbh->do("UPDATE systempreferences SET value=\"'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
  83         'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
  84         'isbn' => '010a',
  85         'issn' => '011a',
  86         'biblionumber' =>'0909',
  87         'itemtype' => '200b',
  88         'language' => '101a',
  89         'publisher' => '210c',
  90         'date' => '210d',
  91         'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
  92         'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129',
  93         'subject' => '600*,601*,606*,610*',
  94         'dewey' => '676a',
  95         'host-item' => '995a,995c',\" where variable='NoZebraIndexes'");
  96         %index = GetNoZebraIndexes();
  97     } elsif (C4::Context->preference('marcflavour') eq 'MARC21') {
  98         $dbh->do("UPDATE systempreferences SET value=\"
  99 'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a',
 100 'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a',
 101 'isbn' => '020a',
 102 'issn' => '022a',
 103 'lccn' => '010a',
 104 'biblionumber => '999c',
 105 'itemtype' => '942c',
 106 'publisher' => '260b',
 107 'date' => '260c',
 108 'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a',
 109 'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*',
 110 'dewey' => '082',
 111 'bc' => '952p',
 112 'callnum' => '952o',
 113 'an' => '6009,6109,6119',
 114 'series' => 440*,490*,
 115 'host-item' => '9529
 116 'shelf' => '952c',
 117 'collection' => '9528',
 118 \"WHERE variable='NoZebraIndexes'");
 119
 120         %index = GetNoZebraIndexes();
 121     }
 122 }
 123 $|=1;
 124
 125 $dbh->{AutoCommit} = 0;
 126
 127 print "***********************************\n";
 128 print "***** building BIBLIO indexes *****\n";
 129 print "***********************************\n";
 130
 131 my $sth;
 132 $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
 133 $sth->execute();
 134 my $i=0;
 135 my %result;
 136 while (my ($biblionumber) = $sth->fetchrow) {
 137         $i++;
 138         print "\r$i";
 139         my  $record;
 140     eval{
 141             $record = GetMarcBiblio($biblionumber);
 142     };
 143     if($@){
 144             print "  There was some pb getting biblionumber : ".$biblionumber."\n";
 145             next;
 146     }
 147     next unless $record;
 148     # get title of the record (to store the 10 first letters with the index)
 149     my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title', '');
 150     my $title = lc($record->subfield($titletag,$titlesubfield));
 151
 152     # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
 153     $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g;
 154     # limit to 10 char, should be enough, and limit the DB size
 155     $title = substr($title,0,10);
 156     #parse each field
 157     foreach my $field ($record->fields()) {
 158         #parse each subfield
 159         next if $field->tag <10;
 160         foreach my $subfield ($field->subfields()) {
 161             my $tag = $field->tag();
 162             my $subfieldcode = $subfield->[0];
 163             my $indexed=0;
 164             # check each index to see if the subfield is stored somewhere
 165             # otherwise, store it in __RAW__ index
 166             foreach my $key (keys %index) {
 167                 if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
 168                     $indexed=1;
 169                     my $line= lc $subfield->[1];
 170                     # remove meaningless value in the field...
 171                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
 172                     # ... and split in words
 173                     foreach (split / /,$line) {
 174                         next unless $_; # skip  empty values (multiple spaces)
 175                         # remove any accented char
 176                         # if the entry is already here, improve weight
 177                         if ($result{$key}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
 178                             my $weight=$1+1;
 179                             $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
 180                             $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;";
 181                         # otherwise, create it, with weight=1
 182                         } else {
 183                             $result{$key}->{"$_"}.="$biblionumber,$title-1;";
 184                         }
 185                     }
 186                 }
 187             }
 188             # the subfield is not indexed, store it in __RAW__ index anyway
 189             unless ($indexed) {
 190                 my $line= lc $subfield->[1];
 191                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
 192                 foreach (split / /,$line) {
 193                         next unless $_;
 194 #                     warn $record->as_formatted."$_ =>".$title;
 195                         if ($result{__RAW__}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
 196                             my $weight=$1+1;
 197 #                             $weight++;
 198                             $result{__RAW__}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
 199                             $result{__RAW__}->{"$_"} .= "$biblionumber,$title-$weight;";
 200                         } else {
 201                             $result{__RAW__}->{"$_"}.="$biblionumber,$title-1;";
 202                         }
 203                 }
 204             }
 205         }
 206     }
 207 }
 208
 209
 210 print "\nInserting records...\n";
 211 $i=0;
 212
 213 my $commitnum = 100;
 214 $dbh->{AutoCommit} = 0;
 215
 216 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)");
 217 foreach my $key (keys %result) {
 218     foreach my $index (keys %{$result{$key}}) {
 219         if (length($result{$key}->{$index}) > 1000000) {
 220             print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_paquet_size parameter)\n";
 221         }
 222         print "\r$i";
 223         $i++;
 224         $sth->execute($key,$index,$result{$key}->{$index});
 225         $dbh->commit() if (0 == $i % $commitnum);
 226     }
 227    $dbh->commit() if (0 == $i % $commitnum);
 228 }
 229 $dbh->commit;
 230
 231
 232 print "\nbiblios done\n";
 233
 234 print "\n***********************************\n";
 235 print "***** building AUTHORITIES indexes *****\n";
 236 print "***********************************\n";
 237
 238 $sth=$dbh->prepare("select authid from auth_header order by authid $limit");
 239 $sth->execute();
 240 $i=0;
 241 %result = ();
 242 while (my ($authid) = $sth->fetchrow) {
 243     $i++;
 244     print "\r$i";
 245     my $record;
 246     eval{
 247         $record = GetAuthority($authid);
 248     };
 249     if($@){
 250         print "  There was some pb getting authnumber : ".$authid."\n";
 251         next;
 252     }
 253
 254     my %index;
 255     # for authorities, the "title" is the $a mainentry
 256     my $authref = C4::AuthoritiesMarc::GetAuthType(C4::AuthoritiesMarc::GetAuthTypeCode($authid));
 257
 258     warn "ERROR : authtype undefined for ".$record->as_formatted unless $authref;
 259     my $title = $record->subfield($authref->{auth_tag_to_report},'a');
 260     $index{'mainmainentry'}= $authref->{'auth_tag_to_report'}.'a';
 261     $index{'mainentry'}    = $authref->{'auth_tag_to_report'}.'*';
 262     $index{'auth_type'}    = '152b';
 263
 264     # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
 265     $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g;
 266     $title = quotemeta $title;
 267     # limit to 10 char, should be enough, and limit the DB size
 268     $title = substr($title,0,10);
 269     #parse each field
 270     foreach my $field ($record->fields()) {
 271         #parse each subfield
 272         next if $field->tag <10;
 273         foreach my $subfield ($field->subfields()) {
 274             my $tag = $field->tag();
 275             my $subfieldcode = $subfield->[0];
 276             my $indexed=0;
 277             # check each index to see if the subfield is stored somewhere
 278             # otherwise, store it in __RAW__ index
 279             foreach my $key (keys %index) {
 280                 if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) {
 281                     $indexed=1;
 282                     my $line= lc $subfield->[1];
 283                     # remove meaningless value in the field...
 284                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
 285                     # ... and split in words
 286                     foreach (split / /,$line) {
 287                         next unless $_; # skip  empty values (multiple spaces)
 288                         # if the entry is already here, improve weight
 289                         if ($result{$key}->{"$_"} =~ /$authid,$title\-(\d);/) {
 290                             my $weight=$1+1;
 291                             $result{$key}->{"$_"} =~ s/$authid,$title\-(\d);//;
 292                             $result{$key}->{"$_"} .= "$authid,$title-$weight;";
 293                         # otherwise, create it, with weight=1
 294                         } else {
 295                             $result{$key}->{"$_"}.="$authid,$title-1;";
 296                         }
 297                     }
 298                 }
 299             }
 300             # the subfield is not indexed, store it in __RAW__ index anyway
 301             unless ($indexed) {
 302                 my $line= lc $subfield->[1];
 303                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
 304                 foreach (split / /,$line) {
 305                         next unless $_;
 306 #                     warn $record->as_formatted."$_ =>".$title;
 307                         if ($result{__RAW__}->{"$_"} =~ /$authid,$title\-(\d);/) {
 308                             my $weight=$1+1;
 309 #                             $weight++;
 310                             $result{__RAW__}->{"$_"} =~ s/$authid,$title\-(\d);//;
 311                             $result{__RAW__}->{"$_"} .= "$authid,$title-$weight;";
 312                         } else {
 313                             $result{__RAW__}->{"$_"}.="$authid,$title-1;";
 314                         }
 315                 }
 316             }
 317         }
 318     }
 319 }
 320
 321
 322
 323 print "\nInserting...\n";
 324 $i=0;
 325
 326 my $commitnum = 100;
 327 $dbh->{AutoCommit} = 0;
 328 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)");
 329 foreach my $key (keys %result) {
 330     foreach my $index (keys %{$result{$key}}) {
 331         if (length($result{$key}->{$index}) > 1000000) {
 332             print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_paquet_size parameter)\n";
 333         }
 334         print "\r$i";
 335         $i++;
 336         $sth->execute($key,$index,$result{$key}->{$index});
 337         $dbh->commit() if (0 == $i % $commitnum);
 338     }
 339    $dbh->commit() if (0 == $i % $commitnum);
 340 }
 341 $dbh->commit;
 342 print "\nauthorities done\n";