Bug 16601: Update of italian MARC21 files
[koha.git] / misc / cronjobs / cloud-kw.pl
blob96864156efed1e1ac49b36fb9dd3ce3432a28715
1 #!/usr/bin/perl
4 # Copyright 2008 Tamil s.a.r.l.
6 # This file is part of Koha.
8 # Koha is free software; you can redistribute it and/or modify it
9 # under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 3 of the License, or
11 # (at your option) any later version.
13 # Koha is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with Koha; if not, see <http://www.gnu.org/licenses>.
21 use strict;
22 use warnings;
23 use diagnostics;
24 use Carp;
25 use YAML::Syck;
26 use Pod::Usage;
27 use Getopt::Long;
28 use C4::Context;
29 use C4::Log;
31 my $verbose = 0;
32 my $help = 0;
33 my $conf = '';
34 GetOptions(
35 'verbose' => \$verbose,
36 'help' => \$help,
37 'conf=s' => \$conf,
40 sub usage {
41 pod2usage( -verbose => 2 );
42 exit;
45 usage() if $help || !$conf;
47 cronlogaction();
49 my @clouds;
50 print "Reading configuration file: $conf\n" if $verbose;
51 eval {
52 @clouds = LoadFile( $conf );
54 croak "Unable to read configuration file: $conf\n" if $@;
56 for my $cloud ( @clouds ) {
57 print "Create a cloud\n",
58 " Koha conf file: ", $cloud->{KohaConf} ? $cloud->{KohaConf} : "default", "\n",
59 " Zebra Index: ", $cloud->{ZebraIndex}, "\n",
60 " Koha Keyword: ", $cloud->{KohaIndex}, "\n",
61 " Count: ", $cloud->{Count}, "\n",
62 " Withcss: ", $cloud->{Withcss}, "\n",
63 " Output: ", $cloud->{Output}, "\n",
64 if $verbose;
66 # Set Koha context if KohaConf is present
67 my $set_new_context = 0;
68 if ( $cloud->{KohaConf} ) {
69 if ( -e $cloud->{KohaConf} ) {
70 my $context = C4::Context->new( $cloud->{KohaConf} );
71 $context->set_context();
72 $set_new_context = 1;
74 else {
75 carp "Koha conf file doesn't exist: ", $cloud->{KohaConf}, " ; use KOHA_CONF\n";
79 my $index = new ZebraIndex( $cloud->{ZebraIndex} );
80 $index->scan( $cloud->{Count} );
82 open my $fh, ">", $cloud->{Output}
83 or croak "Unable to create file ", $cloud->{Output};
85 my $withcss = $cloud->{Withcss} =~ /^yes/i;
86 print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
87 close $fh;
88 $set_new_context && restore_context C4::Context;
93 package ZebraIndex;
95 use strict;
96 use warnings;
97 use diagnostics;
98 use Carp;
100 sub new {
101 my $self = {};
102 my $class = shift;
103 $self->{ zebra_index } = shift;
104 $self->{ top_terms } = undef;
105 $self->{ levels_cloud } = 24;
106 bless $self, $class;
108 # Test Zebra index
109 my $zbiblio = C4::Context->Zconn( "biblioserver" );
110 eval {
111 my $ss = $zbiblio->scan_pqf(
112 '@attr 1=' . $self->{ zebra_index } . ' @attr 4=1 @attr 6=3 "a"'
115 croak "Invalid Zebra index: ", $self->{ zebra_index } if $@;
117 return $self;
122 # scan
123 # Scan zebra index and populate an array of top terms
125 # PARAMETERS:
126 # $max_terms Max number of top terms
128 # RETURN:
129 # A 4-dimensionnal array in $self->{top_terms}
130 # [0] term
131 # [1] term number of occurrences
132 # [2] term proportional relative weight in terms set E[0-1]
133 # [3] term logarithmic relative weight E [0-levels_cloud]
135 # This array is sorted alphabetically by terms ([0])
136 # It can be easily sorted by occurrences:
137 # @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}};
139 sub scan {
140 my $self = shift;
141 my $index_name = $self->{ zebra_index };
142 my $max_terms = shift;
144 my $MAX_OCCURENCE = 1000000000;
146 my $zbiblio = C4::Context->Zconn( "biblioserver" );
147 my $number_of_terms = 0;
148 my @terms; # 2 dimensions array
149 my $min_occurence_index = -1;
150 my $min_occurence;
151 my $from = '0';
153 while (1) {
154 my $ss;
155 eval {
156 print "$from\n" if $verbose;
157 $from =~ s/\"/\\\"/g;
158 my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
159 . $from . 'a"';
160 $ss = $zbiblio->scan_pqf( $query );
162 if ($@) {
163 chop $from;
164 next;
166 $ss->option( rpnCharset => 'UTF-8' );
167 last if $ss->size() == 0;
168 my $term = '';
169 my $occ = 0;
170 for my $index ( 0..$ss->size()-1 ) {
171 ($term, $occ) = $ss->display_term($index);
172 #print "$term:$occ\n";
173 if ( $number_of_terms < $max_terms ) {
174 push( @terms, [ $term, $occ ] );
175 ++$number_of_terms;
176 if ( $number_of_terms == $max_terms ) {
177 $min_occurence = $MAX_OCCURENCE;
178 for (0..$number_of_terms-1) {
179 my @term = @{ $terms[$_] };
180 if ( $term[1] <= $min_occurence ) {
181 $min_occurence = $term[1];
182 $min_occurence_index = $_;
187 else {
188 if ( $occ > $min_occurence) {
189 @{ $terms[$min_occurence_index] }[0] = $term;
190 @{ $terms[$min_occurence_index] }[1] = $occ;
191 $min_occurence = $MAX_OCCURENCE;
192 for (0..$max_terms-1) {
193 my @term = @{ $terms[$_] };
194 if ( $term[1] <= $min_occurence ) {
195 $min_occurence = $term[1];
196 $min_occurence_index = $_;
202 $from = $term;
205 # Sort array of array by terms weight
206 @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
208 # A relatif weight to other set terms is added to each term
209 my $min = $terms[0][1];
210 my $log_min = log( $min );
211 my $max = $terms[$#terms][1];
212 my $log_max = log( $max );
213 my $delta = $max - $min;
214 $delta = 1 if $delta == 0; # Very unlikely
215 my $factor;
216 if ($log_max - $log_min == 0) {
217 $log_min = $log_min - $self->{levels_cloud};
218 $factor = 1;
220 else {
221 $factor = $self->{levels_cloud} / ($log_max - $log_min);
224 foreach (0..$#terms) {
225 my $count = @{ $terms[$_] }[1];
226 my $weight = ( $count - $min ) / $delta;
227 my $log_weight = int( (log($count) - $log_min) * $factor);
228 push( @{ $terms[$_] }, $weight );
229 push( @{ $terms[$_] }, $log_weight );
231 $self->{ top_terms } = \@terms;
233 # Sort array of array by terms alphabetical order
234 @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
239 # Returns a HTML version of index top terms formatted
240 # as a 'tag cloud'.
242 sub html_cloud {
243 my $self = shift;
244 my $koha_index = shift;
245 my $withcss = shift;
246 my @terms = @{ $self->{top_terms} };
247 my $html = '';
248 if ( $withcss ) {
249 $html = <<EOS;
250 <style>
251 .subjectcloud {
252 text-align: center;
253 line-height: 16px;
254 margin: 20px;
255 background: #f0f0f0;
256 padding: 3%;
258 .subjectcloud a {
259 font-weight: lighter;
260 text-decoration: none;
262 span.tagcloud0 { font-size: 12px;}
263 span.tagcloud1 { font-size: 13px;}
264 span.tagcloud2 { font-size: 14px;}
265 span.tagcloud3 { font-size: 15px;}
266 span.tagcloud4 { font-size: 16px;}
267 span.tagcloud5 { font-size: 17px;}
268 span.tagcloud6 { font-size: 18px;}
269 span.tagcloud7 { font-size: 19px;}
270 span.tagcloud8 { font-size: 20px;}
271 span.tagcloud9 { font-size: 21px;}
272 span.tagcloud10 { font-size: 22px;}
273 span.tagcloud11 { font-size: 23px;}
274 span.tagcloud12 { font-size: 24px;}
275 span.tagcloud13 { font-size: 25px;}
276 span.tagcloud14 { font-size: 26px;}
277 span.tagcloud15 { font-size: 27px;}
278 span.tagcloud16 { font-size: 28px;}
279 span.tagcloud17 { font-size: 29px;}
280 span.tagcloud18 { font-size: 30px;}
281 span.tagcloud19 { font-size: 31px;}
282 span.tagcloud20 { font-size: 32px;}
283 span.tagcloud21 { font-size: 33px;}
284 span.tagcloud22 { font-size: 34px;}
285 span.tagcloud23 { font-size: 35px;}
286 span.tagcloud24 { font-size: 36px;}
287 </style>
288 <div class="subjectcloud">
291 for (0..$#terms) {
292 my @term = @{ $terms[$_] };
293 my $uri = $term[0];
294 $uri =~ s/\(//g;
295 #print " 0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
296 $html = $html
297 . '<span class="tagcloud'
298 . $term[3]
299 . '">'
300 . '<a href="/cgi-bin/koha/opac-search.pl?q='
301 . $koha_index
302 . '%3A'
303 . $uri
304 . '">'
305 . $term[0]
306 . "</a></span>\n";
308 $html .= "</div>\n";
309 return $html;
313 =head1 NAME
315 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
317 =head1 USAGE
319 =over
321 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
323 Creates multiple HTML files containing kewords cloud with top terms sorted
324 by their logarithmic weight.
325 F<cloud.conf> is a YAML configuration file driving cloud generation
326 process.
328 =back
330 =head1 PARAMETERS
332 =over
334 =item B<--conf=configuration file>
336 Specify configuration file name
338 =item B<--verbose|-v>
340 Enable script verbose mode.
342 =item B<--help|-h>
344 Print this help page.
346 =back
348 =head1 CONFIGURATION
350 Configuration file looks like that:
352 ---
353 # Koha configuration file for a specific installation
354 # If not present, defaults to KOHA_CONF
355 KohaConf: /home/koha/mylibray/etc/koha-conf.xml
356 # Zebra index to scan
357 ZebraIndex: Author
358 # Koha index used to link found kewords with an opac search URL
359 KohaIndex: au
360 # Number of top keyword to use for the cloud
361 Count: 50
362 # Include CSS style directives with the cloud
363 # This could be used as a model and then CSS directives are
364 # put in the appropriate CSS file directly.
365 Withcss: Yes
366 # HTML file where to output the cloud
367 Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
368 ---
369 KohaConf: /home/koha/yourlibray/etc/koha-conf.xml
370 ZebraIndex: Subject
371 KohaIndex: su
372 Count: 200
373 Withcss: no
374 Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
376 =head1 IMPROVEMENTS
378 Generated top terms have more informations than those outputted from
379 the time being. Some parameters could be easily added to improve
380 this script:
382 =over
384 =item B<WithCount>
386 In order to output terms with the number of occurrences they
387 have been found in Koha Catalogue by Zebra.
389 =item B<CloudLevels>
391 Number of levels in the cloud. Now 24 levels are hardcoded.
393 =item B<Weithing>
395 Weighting method used to distribute terms in the cloud. We could have two
396 values: Logarithmic and Linear. Now it's Logarithmic by default.
398 =item B<Order>
400 Now terms are outputted in the lexical order. They could be sorted
401 by their weight.
403 =back
405 =cut