misc/cronjobs/cloud-kw.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Copyright 2008 Tamil s.a.r.l.
   5 #
   6 # This file is part of Koha.
   7 #
   8 # Koha is free software; you can redistribute it and/or modify it under the
   9 # terms of the GNU General Public License as published by the Free Software
  10 # Foundation; either version 2 of the License, or (at your option) any later
  11 # version.
  12 #
  13 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  14 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  15 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with Koha; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20
  21 use strict;
  22 use warnings;
  23 use diagnostics;
  24 use Carp;
  25 use YAML::Syck;
  26 use Pod::Usage;
  27 use Getopt::Long;
  28 use C4::Context;
  29
  30
  31 my $verbose     = 0;
  32 my $help        = 0;
  33 my $conf        = '';
  34 GetOptions(
  35     'verbose'   => \$verbose,
  36     'help'      => \$help,
  37     'conf=s'    => \$conf,
  38 );
  39
  40 sub usage {
  41     pod2usage( -verbose => 2 );
  42     exit;
  43 }
  44
  45 usage() if $help || !$conf;
  46
  47
  48 my @clouds;
  49 print "Reading configuration file: $conf\n" if $verbose;
  50 eval {
  51     @clouds = LoadFile( $conf );
  52 };
  53 croak "Unable to read configuration file: $conf\n" if $@;
  54
  55 for my $cloud ( @clouds ) {
  56     print "Create a cloud\n",
  57           "  Koha conf file:  ", $cloud->{KohaConf} ? $cloud->{KohaConf} : "default", "\n",
  58           "  Zebra Index:     ", $cloud->{ZebraIndex}, "\n",
  59           "  Koha Keyword:    ", $cloud->{KohaIndex}, "\n",
  60           "  Count:           ", $cloud->{Count}, "\n",
  61           "  Withcss:         ", $cloud->{Withcss}, "\n",
  62           "  Output:          ", $cloud->{Output}, "\n",
  63       if $verbose;
  64
  65     # Set Koha context if KohaConf is present
  66     my $set_new_context = 0;
  67     if ( $cloud->{KohaConf} ) {
  68         if ( -e $cloud->{KohaConf} ) {
  69             my $context = C4::Context->new( $cloud->{KohaConf} );
  70             $context->set_context();
  71             $set_new_context = 1;
  72         }
  73         else {
  74             carp "Koha conf file doesn't exist: ", $cloud->{KohaConf}, " ; use KOHA_CONF\n";
  75         }
  76     }
  77
  78     my $index = new ZebraIndex( $cloud->{ZebraIndex} );
  79     $index->scan( $cloud->{Count} );
  80
  81     open my $fh, ">", $cloud->{Output}
  82         or croak "Unable to create file ", $cloud->{Output};
  83
  84     my $withcss = $cloud->{Withcss} =~ /^yes/i;
  85     print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
  86     close $fh;
  87     $set_new_context && restore_context C4::Context;
  88 }
  89
  90
  91
  92 package ZebraIndex;
  93
  94 use strict;
  95 use warnings;
  96 use diagnostics;
  97 use Carp;
  98
  99 sub new {
 100     my $self = {};
 101     my $class = shift;
 102     $self->{ zebra_index  } = shift;
 103     $self->{ top_terms    } = undef;
 104     $self->{ levels_cloud } = 24;
 105     bless $self, $class;
 106
 107     # Test Zebra index
 108     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 109     eval {
 110         my $ss = $zbiblio->scan_pqf(
 111             '@attr 1=' . $self->{ zebra_index } . ' @attr 4=1 @attr 6=3 "a"'
 112         );
 113     };
 114     croak "Invalid Zebra index: ", $self->{ zebra_index } if $@;
 115
 116     return $self;
 117 }
 118
 119
 120 #
 121 # scan
 122 #   Scan zebra index and populate an array of top terms
 123 #
 124 # PARAMETERS:
 125 #   $max_terms    Max number of top terms
 126 #
 127 # RETURN:
 128 #   A 4-dimensionnal array in $self->{top_terms}
 129 #   [0] term
 130 #   [1] term number of occurences
 131 #   [2] term proportional relative weight in terms set E[0-1]
 132 #   [3] term logarithmic relative weight E [0-levels_cloud]
 133 #
 134 #   This array is sorted alphabetically by terms ([0])
 135 #   It can be easily sorted by occurences:
 136 #     @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}};
 137 #
 138 sub scan {
 139     my $self       = shift;
 140     my $index_name = $self->{ zebra_index };
 141     my $max_terms  = shift;
 142
 143     my $MAX_OCCURENCE = 1000000000;
 144
 145     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 146     my $number_of_terms = 0;
 147     my @terms;      # 2 dimensions array
 148     my $min_occurence_index = -1;
 149     my $min_occurence;
 150     my $from = '0';
 151
 152     while (1) {
 153         my $ss;
 154         eval {
 155             print "$from\n" if $verbose;
 156             $from =~ s/\"/\\\"/g;
 157             my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
 158                         . $from . 'a"';
 159             $ss = $zbiblio->scan_pqf( $query );
 160         };
 161         if ($@) {
 162             chop $from;
 163             next;
 164         }
 165         $ss->option( rpnCharset => 'UTF-8' );
 166         last if $ss->size() == 0;
 167         my $term = '';
 168         my $occ = 0;
 169         for my $index ( 0..$ss->size()-1 ) {
 170             ($term, $occ) = $ss->display_term($index);
 171             #print "$term:$occ\n";
 172             if ( $number_of_terms < $max_terms ) {
 173                 push( @terms, [ $term, $occ ] );
 174                 ++$number_of_terms;
 175                 if ( $number_of_terms == $max_terms ) {
 176                     $min_occurence = $MAX_OCCURENCE;
 177                     for (0..$number_of_terms-1) {
 178                         my @term = @{ $terms[$_] };
 179                         if ( $term[1] <= $min_occurence ) {
 180                             $min_occurence       = $term[1];
 181                             $min_occurence_index = $_;
 182                         }
 183                     }
 184                 }
 185             }
 186             else {
 187                 if ( $occ > $min_occurence) {
 188                     @{ $terms[$min_occurence_index] }[0] = $term;
 189                     @{ $terms[$min_occurence_index] }[1] = $occ;
 190                     $min_occurence = $MAX_OCCURENCE;
 191                     for (0..$max_terms-1) {
 192                         my @term = @{ $terms[$_] };
 193                         if ( $term[1] <= $min_occurence ) {
 194                             $min_occurence       = $term[1];
 195                             $min_occurence_index = $_;
 196                         }
 197                     }
 198                 }
 199             }
 200         }
 201         $from = $term;
 202     }
 203
 204     # Sort array of array by terms weight
 205     @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
 206
 207     # A relatif weight to other set terms is added to each term
 208     my $min     = $terms[0][1];
 209     my $log_min = log( $min );
 210     my $max     = $terms[$#terms][1];
 211     my $log_max = log( $max );
 212     my $delta   = $max - $min;
 213     $delta = 1 if $delta == 0; # Very unlikely
 214     my $factor;
 215     if ($log_max - $log_min == 0) {
 216         $log_min = $log_min - $self->{levels_cloud};
 217         $factor = 1;
 218     }
 219     else {
 220         $factor = $self->{levels_cloud} / ($log_max - $log_min);
 221     }
 222
 223     foreach (0..$#terms) {
 224         my $count = @{ $terms[$_] }[1];
 225         my $weight = ( $count - $min ) / $delta;
 226         my $log_weight = int( (log($count) - $log_min) * $factor);
 227         push( @{ $terms[$_] }, $weight );
 228         push( @{ $terms[$_] }, $log_weight );
 229     }
 230     $self->{ top_terms } = \@terms;
 231
 232     # Sort array of array by terms alphabetical order
 233     @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
 234 }
 235
 236
 237 #
 238 # Returns a HTML version of index top terms formated
 239 # as a 'tag cloud'.
 240 #
 241 sub html_cloud {
 242     my $self = shift;
 243     my $koha_index = shift;
 244     my $withcss = shift;
 245     my @terms = @{ $self->{top_terms} };
 246     my $html = '';
 247     if ( $withcss ) {
 248         $html = <<EOS;
 249 <style>
 250 .subjectcloud {
 251     text-align:  center;
 252     line-height: 16px;
 253     margin: 20px;
 254     background: #f0f0f0;
 255     padding: 3%;
 256 }
 257 .subjectcloud a {
 258     font-weight: lighter;
 259     text-decoration: none;
 260 }
 261 span.tagcloud0  { font-size: 12px;}
 262 span.tagcloud1  { font-size: 13px;}
 263 span.tagcloud2  { font-size: 14px;}
 264 span.tagcloud3  { font-size: 15px;}
 265 span.tagcloud4  { font-size: 16px;}
 266 span.tagcloud5  { font-size: 17px;}
 267 span.tagcloud6  { font-size: 18px;}
 268 span.tagcloud7  { font-size: 19px;}
 269 span.tagcloud8  { font-size: 20px;}
 270 span.tagcloud9  { font-size: 21px;}
 271 span.tagcloud10 { font-size: 22px;}
 272 span.tagcloud11 { font-size: 23px;}
 273 span.tagcloud12 { font-size: 24px;}
 274 span.tagcloud13 { font-size: 25px;}
 275 span.tagcloud14 { font-size: 26px;}
 276 span.tagcloud15 { font-size: 27px;}
 277 span.tagcloud16 { font-size: 28px;}
 278 span.tagcloud17 { font-size: 29px;}
 279 span.tagcloud18 { font-size: 30px;}
 280 span.tagcloud19 { font-size: 31px;}
 281 span.tagcloud20 { font-size: 32px;}
 282 span.tagcloud21 { font-size: 33px;}
 283 span.tagcloud22 { font-size: 34px;}
 284 span.tagcloud23 { font-size: 35px;}
 285 span.tagcloud24 { font-size: 36px;}
 286 </style>
 287 <div class="subjectcloud">
 288 EOS
 289     }
 290     for (0..$#terms) {
 291         my @term = @{ $terms[$_] };
 292         my $uri = $term[0];
 293         $uri =~ s/\(//g;
 294         #print "  0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
 295         $html = $html
 296             . '<span class="tagcloud'
 297             . $term[3]
 298             . '">'
 299             . '<a href="/cgi-bin/koha/opac-search.pl?q='
 300             . $koha_index
 301             . '%3A'
 302             . $uri
 303             . '">'
 304             . $term[0]
 305             . "</a></span>\n";
 306     }
 307     $html .= "</div>\n";
 308     return $html;
 309 }
 310
 311
 312 =head1 NAME
 313
 314 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
 315
 316 =head1 USAGE
 317
 318 =over
 319
 320 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
 321
 322 Creates multiple HTML files containing kewords cloud with top terms sorted
 323 by their logarithmic weight.
 324 F<cloud.conf> is a YAML configuration file driving cloud generation
 325 process.
 326
 327 =back
 328
 329 =head1 PARAMETERS
 330
 331 =over
 332
 333 =item B<--conf=configuration file>
 334
 335 Specify configuration file name
 336
 337 =item B<--verbose|-v>
 338
 339 Enable script verbose mode.
 340
 341 =item B<--help|-h>
 342
 343 Print this help page.
 344
 345 =back
 346
 347 =head1 CONFIGURATION
 348
 349 Configuration file looks like that:
 350
 351  ---
 352   # Koha configuration file for a specific installation
 353   # If not present, defaults to KOHA_CONF
 354   KohaConf: /home/koha/mylibray/etc/koha-conf.xml
 355   # Zebra index to scan
 356   ZebraIndex: Author
 357   # Koha index used to link found kewords with an opac search URL
 358   KohaIndex: au
 359   # Number of top keyword to use for the cloud
 360   Count: 50
 361   # Include CSS style directives with the cloud
 362   # This could be used as a model and then CSS directives are
 363   # put in the appropriate CSS file directly.
 364   Withcss: Yes
 365   # HTML file where to output the cloud
 366   Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
 367  ---
 368   KohaConf: /home/koha/yourlibray/etc/koha-conf.xml
 369   ZebraIndex: Subject
 370   KohaIndex: su
 371   Count: 200
 372   Withcss: no
 373   Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
 374
 375 =head1 IMPROVEMENTS
 376
 377 Generated top terms have more informations than those outputted from
 378 the time beeing. Some parameters could be easily added to improve
 379 this script:
 380
 381 =over
 382
 383 =item B<WithCount>
 384
 385 In order to output terms with the number of occurences they
 386 have been found in Koha Catalogue by Zebra.
 387
 388 =item B<CloudLevels>
 389
 390 Number of levels in the cloud. Now 24 levels are hardcoded.
 391
 392 =item B<Weithing>
 393
 394 Weighting method used to distribute terms in the cloud. We could have two
 395 values: Logarithmic and Linear. Now it's Logarithmic by default.
 396
 397 =item B<Order>
 398
 399 Now terms are outputted in the lexical order. They could be sorted
 400 by their weight.
 401
 402 =back
 403
 404 =cut
 405
 406