misc/migration_tools/rebuild_zebra_sliced.sh

   1 #!/bin/sh
   2
   3 usage() {
   4     local scriptname=$(basename $0)
   5     cat <<EOF
   6 $scriptname
   7
   8 Index Koha records by chunks. It is useful when a record causes errors and
   9 stops the indexing process. With this script, if indexing of one chunk fails,
  10 that chunk is split into two or more chunks, and indexing continues on these chunks.
  11 rebuild_zebra.pl is called only once to export records. Splitting and indexing
  12 is handled by this script (using zebraidx for indexing).
  13
  14 Usage:
  15 $scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index]
  16 $scriptname -h
  17
  18     -o | --offset         Offset parameter of rebuild_zebra.pl.
  19                           Default: $OFFSET
  20     -l | --length         Length parameter of rebuild_zebra.pl. If omitted, the
  21                           length is automatically calculated to index all
  22                           records
  23     -s | --chunks-size    Initial chunk size (number of records indexed at once)
  24                           Default: $CHUNKSSIZE
  25     -d | --export-dir     Where rebuild_zebra.pl will export data
  26                           Default: $EXPORTDIR
  27     -L | --log-dir        Log directory
  28                           Default: $LOGDIR
  29     -r | --remove-logs    Clean log directory before start
  30                           Default: $RMLOGS
  31     -t | --type           Record type ('biblios' or 'authorities')
  32                           Default: $TYPE
  33     -f | --force          Don't ask for confirmation before start
  34     -h | --help           Display this help message
  35     --reset-index         Reset Zebra index for 'type'
  36 EOF
  37 }
  38
  39 splitfile() {
  40     local file=$1
  41     local prefix=$2
  42     local size=$3
  43     local script='
  44         my $indexmode = '"$INDEXMODE"';
  45         my $prefix = '"\"$prefix\""';
  46         my $size = '"$size"';
  47         my ($i,$count) = (0,0);
  48         open(my $fh, "<", '"\"$file\""');
  49         open(my $out, ">", sprintf("$prefix%02d", $i));
  50         my $closed = 0;
  51         while (<$fh>) {
  52             my $line = $_;
  53             if ($closed) {
  54                 open($out, ">", sprintf("$prefix%02d", $i));
  55                 $closed = 0;
  56                 if ($indexmode eq "dom" && $line !~ /<collection>/) {
  57                     print $out "<collection>";
  58                 }
  59             }
  60             print $out $line;
  61             $count++ if ($line =~ m|^</record>|);
  62             if ($count == $size) {
  63                 if ($indexmode eq "dom" && $line !~ m|</collection>|) {
  64                     print $out "</collection>";
  65                 }
  66                 $count = 0;
  67                 $i++;
  68                 close($out);
  69                 $closed = 1;
  70             }
  71         }
  72     '
  73     $PERL -e "$script"
  74 }
  75
  76 indexfile() {
  77     local file=$1
  78     local chunkssize=$2
  79
  80     if [ $chunkssize -lt 1 ]; then
  81         echo "Fail on file $file"
  82     else
  83
  84         local prefix="${file}_${chunkssize}_"
  85         echo "Splitting file in chunks of $chunkssize records"
  86         splitfile $file $prefix $chunkssize
  87
  88         dir=$(dirname $prefix)
  89         local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
  90         for chunkfile in $files; do
  91             echo "Indexing $chunkfile"
  92             size=$(grep '^</record>' $chunkfile | wc -l)
  93             logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
  94             ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile"
  95             $ZEBRAIDX_CMD >$logfile 2>&1
  96             grep "Records: $size" $logfile >/dev/null 2>&1
  97             if [ $? -ne 0 ]; then
  98                 echo "Indexing failed. See log file $logfile"
  99                 echo "Split file and continue..."
 100                 indexfile $chunkfile $(($chunkssize/2))
 101             else
 102                 ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit"
 103                 $ZEBRAIDX_CMD >> $logfile 2>&1
 104             fi
 105         done
 106     fi
 107 }
 108
 109 OFFSET=0
 110 LENGTH=
 111 CHUNKSSIZE=10000
 112 EXPORTDIR=/tmp/rebuild/export
 113 LOGDIR=/tmp/rebuild/logs
 114 RMLOGS=no
 115 NOCONFIRM=no
 116 TYPE=biblios
 117 HELP=no
 118 RESETINDEX=no
 119
 120 # Get parameters
 121 while [ $1 ]; do
 122     case $1 in
 123         -o | --offset )
 124             shift
 125             OFFSET=$1
 126             ;;
 127         -l | --length )
 128             shift
 129             LENGTH=$1
 130             ;;
 131         -s | --chunks-size )
 132             shift
 133             CHUNKSSIZE=$1
 134             ;;
 135         -d | --export-dir )
 136             shift
 137             EXPORTDIR=$1
 138             ;;
 139         -L | --log-dir )
 140             shift
 141             LOGDIR=$1
 142             ;;
 143         -r | --remove-logs )
 144             RMLOGS=yes
 145             ;;
 146         -t | --type )
 147             shift
 148             TYPE=$1
 149             ;;
 150         -f | --force )
 151             NOCONFIRM=yes
 152             ;;
 153         -h | --help )
 154             HELP=yes
 155             ;;
 156         --reset-index )
 157             RESETINDEX=yes
 158             ;;
 159         * )
 160             usage
 161             exit 1
 162     esac
 163     shift
 164 done
 165
 166 if [ $HELP = "yes" ]; then
 167     usage
 168     exit 0
 169 fi
 170
 171 if [ -z $KOHA_CONF ]; then
 172     echo "KOHA_CONF is not set"
 173     exit 1
 174 fi
 175
 176 if [ -z $PERL5LIB ]; then
 177     echo "PERL5LIB is not set"
 178     exit 1
 179 fi
 180
 181
 182 TYPESWITCH=
 183 SQLTABLE=
 184 case $TYPE in
 185     biblios )
 186         TYPESWITCH=-b
 187         SQLTABLE="biblio"
 188         ;;
 189     authorities )
 190         TYPESWITCH=-a
 191         SQLTABLE="auth_header"
 192         ;;
 193     * )
 194         echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
 195         TYPESWITCH=-b
 196         TYPE=biblios
 197         SQLTABLE="biblio"
 198 esac
 199
 200 PERL=`which perl`
 201 if [ -z $PERL ]; then
 202     echo "perl not found"
 203     exit 1
 204 fi
 205
 206 if [ -z $LENGTH ]; then
 207     LENGTH=$($PERL -e '
 208         use C4::Context;
 209         my ($count) = C4::Context->dbh->selectrow_array(qq{
 210             SELECT COUNT(*) FROM '"$SQLTABLE"'
 211         });
 212         print $count;
 213     ')
 214 fi
 215
 216 ZEBRAIDX=`which zebraidx`
 217 if [ -z $ZEBRAIDX ]; then
 218     echo "zebraidx not found"
 219     exit 1
 220 fi
 221
 222 REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl"
 223 if [ ! -f $REBUILDZEBRA ]; then
 224     echo "$REBUILDZEBRA: file not found"
 225     exit 1
 226 fi
 227
 228 echo ""
 229 echo "Configuration"
 230 echo "========================================================================="
 231 echo "KOHA_CONF: $KOHA_CONF"
 232 echo "PERL5LIB: $PERL5LIB"
 233 echo "-------------------------------------------------------------------------"
 234 echo "Start at offset: $OFFSET"
 235 echo "Total number of records to index: $LENGTH"
 236 echo "Initial chunk size: $CHUNKSSIZE"
 237 echo "Export directory: $EXPORTDIR"
 238 echo "Log directory: $LOGDIR"
 239 echo "Remove logs before start? $RMLOGS"
 240 echo "Type of record: $TYPE"
 241 echo "Reset index before start? $RESETINDEX"
 242 echo "-------------------------------------------------------------------------"
 243 echo "zebraidx path: $ZEBRAIDX"
 244 echo "rebuild_zebra path: $REBUILDZEBRA"
 245 echo "perl path: $PERL"
 246 echo "========================================================================="
 247
 248 if [ $NOCONFIRM != "yes" ]; then
 249     confirm=y
 250     echo -n "Confirm ? [Y/n] "
 251     read response
 252     if [ $response ] && [ $response != "yes" ] && [ $response != "y" ]; then
 253         confirm=n
 254     fi
 255
 256     if [ $confirm = "n" ]; then
 257         exit 0
 258     fi
 259 fi
 260
 261 mkdir -p $EXPORTDIR
 262 if [ $? -ne 0 ]; then
 263     echo "Failed to create directory $EXPORTDIR. Aborting."
 264     exit 1
 265 fi
 266
 267 mkdir -p $LOGDIR
 268 if [ $? -ne 0 ]; then
 269     echo "Failed to create directory $LOGDIR. Aborting."
 270     exit 1
 271 fi
 272
 273 if [ $RMLOGS = "yes" ]; then
 274     rm -f $LOGDIR/*.log
 275 fi
 276
 277 REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
 278 echo "\n$REBUILDZEBRA_CMD"
 279 $REBUILDZEBRA_CMD
 280
 281 EXPORTFILE=
 282 case $TYPE in
 283     biblios )
 284         EXPORTFILE="$EXPORTDIR/biblio/exported_records"
 285         indexmode_config_name="zebra_bib_index_mode"
 286         ;;
 287     authorities )
 288         EXPORTFILE="$EXPORTDIR/authority/exported_records"
 289         indexmode_config_name="zebra_auth_index_mode"
 290         ;;
 291     * )
 292         echo "Error: TYPE '$TYPE' is not supported"
 293         exit 1
 294 esac
 295
 296 INDEXMODE=$(perl -e '
 297     use C4::Context;
 298     print C4::Context->config('"$indexmode_config_name"');
 299 ')
 300
 301 CONFIGFILE=$(perl -e '
 302     use C4::Context;
 303     my $zebra_server = ('"$TYPE"' eq "biblios") ? "biblioserver" : "authorityserver";
 304     print C4::Context->zebraconfig($zebra_server)->{config};
 305 ')
 306
 307 if [ $RESETINDEX = "yes" ]; then
 308     RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
 309     echo "\n$RESETINDEX_CMD"
 310     $RESETINDEX_CMD
 311     echo ""
 312 fi
 313
 314 indexfile $EXPORTFILE $CHUNKSSIZE