4 # Copyright 2008 Tamil s.a.r.l.
6 # This file is part of Koha.
8 # Koha is free software; you can redistribute it and/or modify it
9 # under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 3 of the License, or
11 # (at your option) any later version.
13 # Koha is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with Koha; if not, see <http://www.gnu.org/licenses>.
29 use Koha
::Script
-cron
;
37 'verbose' => \
$verbose,
43 pod2usage
( -verbose
=> 2 );
47 usage
() if $help || !$conf;
52 print "Reading configuration file: $conf\n" if $verbose;
54 @clouds = LoadFile
( $conf );
56 croak
"Unable to read configuration file: $conf\n" if $@
;
58 for my $cloud ( @clouds ) {
59 print "Create a cloud\n",
60 " Koha conf file: ", $cloud->{KohaConf
} ?
$cloud->{KohaConf
} : "default", "\n",
61 " Zebra Index: ", $cloud->{ZebraIndex
}, "\n",
62 " Koha Keyword: ", $cloud->{KohaIndex
}, "\n",
63 " Count: ", $cloud->{Count
}, "\n",
64 " Withcss: ", $cloud->{Withcss
}, "\n",
65 " Output: ", $cloud->{Output
}, "\n",
68 # Set Koha context if KohaConf is present
69 my $set_new_context = 0;
70 if ( $cloud->{KohaConf
} ) {
71 if ( -e
$cloud->{KohaConf
} ) {
72 my $context = C4
::Context
->new( $cloud->{KohaConf
} );
73 $context->set_context();
77 carp
"Koha conf file doesn't exist: ", $cloud->{KohaConf
}, " ; use KOHA_CONF\n";
81 my $index = new ZebraIndex
( $cloud->{ZebraIndex
} );
82 $index->scan( $cloud->{Count
} );
84 open my $fh, ">", $cloud->{Output
}
85 or croak
"Unable to create file ", $cloud->{Output
};
87 my $withcss = $cloud->{Withcss
} =~ /^yes/i;
88 print $fh $index->html_cloud( $cloud->{KohaIndex
}, $withcss );
90 $set_new_context && restore_context C4
::Context
;
105 $self->{ zebra_index
} = shift;
106 $self->{ top_terms
} = undef;
107 $self->{ levels_cloud
} = 24;
111 my $zbiblio = C4
::Context
->Zconn( "biblioserver" );
113 my $ss = $zbiblio->scan_pqf(
114 '@attr 1=' . $self->{ zebra_index
} . ' @attr 4=1 @attr 6=3 "a"'
117 croak
"Invalid Zebra index: ", $self->{ zebra_index
} if $@
;
125 # Scan zebra index and populate an array of top terms
128 # $max_terms Max number of top terms
131 # An array of 4-element arrays in $self->{top_terms}
133 # [1] term number of occurrences
134 # [2] term proportional relative weight within the term set, in [0, 1]
135 # [3] term logarithmic relative weight, in [0, levels_cloud]
137 # This array is sorted alphabetically by terms ([0])
138 # It can be easily sorted by occurrences:
139 # @t = sort { $a->[1] <=> $b->[1] } @{$self->{top_terms}};
143 my $index_name = $self->{ zebra_index
};
144 my $max_terms = shift;
146 my $MAX_OCCURENCE = 1000000000;
148 my $zbiblio = C4
::Context
->Zconn( "biblioserver" );
149 my $number_of_terms = 0;
150 my @terms; # 2 dimensions array
151 my $min_occurence_index = -1;
158 print "$from\n" if $verbose;
159 $from =~ s/\"/\\\"/g;
160 my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
162 $ss = $zbiblio->scan_pqf( $query );
168 $ss->option( rpnCharset
=> 'UTF-8' );
169 last if $ss->size() == 0;
172 for my $index ( 0..$ss->size()-1 ) {
173 ($term, $occ) = $ss->display_term($index);
174 #print "$term:$occ\n";
175 if ( $number_of_terms < $max_terms ) {
176 push( @terms, [ $term, $occ ] );
178 if ( $number_of_terms == $max_terms ) {
179 $min_occurence = $MAX_OCCURENCE;
180 for (0..$number_of_terms-1) {
181 my @term = @
{ $terms[$_] };
182 if ( $term[1] <= $min_occurence ) {
183 $min_occurence = $term[1];
184 $min_occurence_index = $_;
190 if ( $occ > $min_occurence) {
191 @
{ $terms[$min_occurence_index] }[0] = $term;
192 @
{ $terms[$min_occurence_index] }[1] = $occ;
193 $min_occurence = $MAX_OCCURENCE;
194 for (0..$max_terms-1) {
195 my @term = @
{ $terms[$_] };
196 if ( $term[1] <= $min_occurence ) {
197 $min_occurence = $term[1];
198 $min_occurence_index = $_;
207 # Sort array of array by terms weight
208 @terms = sort { @
{$a}[1] <=> @
{$b}[1] } @terms;
210 # A weight relative to the other terms in the set is added to each term
211 my $min = $terms[0][1];
212 my $log_min = log( $min );
213 my $max = $terms[$#terms][1];
214 my $log_max = log( $max );
215 my $delta = $max - $min;
216 $delta = 1 if $delta == 0; # Very unlikely
218 if ($log_max - $log_min == 0) {
219 $log_min = $log_min - $self->{levels_cloud
};
223 $factor = $self->{levels_cloud
} / ($log_max - $log_min);
226 foreach (0..$#terms) {
227 my $count = @
{ $terms[$_] }[1];
228 my $weight = ( $count - $min ) / $delta;
229 my $log_weight = int( (log($count) - $log_min) * $factor);
230 push( @
{ $terms[$_] }, $weight );
231 push( @
{ $terms[$_] }, $log_weight );
233 $self->{ top_terms
} = \
@terms;
235 # Sort array of array by terms alphabetical order
236 @terms = sort { @
{$a}[0] cmp @
{$b}[0] } @terms;
241 # Returns a HTML version of index top terms formatted
246 my $koha_index = shift;
248 my @terms = @
{ $self->{top_terms
} };
261 font-weight: lighter;
262 text-decoration: none;
264 span.tagcloud0 { font-size: 12px;}
265 span.tagcloud1 { font-size: 13px;}
266 span.tagcloud2 { font-size: 14px;}
267 span.tagcloud3 { font-size: 15px;}
268 span.tagcloud4 { font-size: 16px;}
269 span.tagcloud5 { font-size: 17px;}
270 span.tagcloud6 { font-size: 18px;}
271 span.tagcloud7 { font-size: 19px;}
272 span.tagcloud8 { font-size: 20px;}
273 span.tagcloud9 { font-size: 21px;}
274 span.tagcloud10 { font-size: 22px;}
275 span.tagcloud11 { font-size: 23px;}
276 span.tagcloud12 { font-size: 24px;}
277 span.tagcloud13 { font-size: 25px;}
278 span.tagcloud14 { font-size: 26px;}
279 span.tagcloud15 { font-size: 27px;}
280 span.tagcloud16 { font-size: 28px;}
281 span.tagcloud17 { font-size: 29px;}
282 span.tagcloud18 { font-size: 30px;}
283 span.tagcloud19 { font-size: 31px;}
284 span.tagcloud20 { font-size: 32px;}
285 span.tagcloud21 { font-size: 33px;}
286 span.tagcloud22 { font-size: 34px;}
287 span.tagcloud23 { font-size: 35px;}
288 span.tagcloud24 { font-size: 36px;}
290 <div class="subjectcloud">
294 my @term = @
{ $terms[$_] };
297 #print " 0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
299 . '<span class="tagcloud'
302 . '<a href="/cgi-bin/koha/opac-search.pl?q='
317 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
323 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
325 Creates multiple HTML files containing keyword clouds with top terms sorted
326 by their logarithmic weight.
327 F<cloud.conf> is a YAML configuration file driving cloud generation
336 =item B<--conf=configuration file>
338 Specify configuration file name
340 =item B<--verbose|-v>
342 Enable script verbose mode.
346 Print this help page.
352 The configuration file looks like this:
355 # Koha configuration file for a specific installation
356 # If not present, defaults to KOHA_CONF
357 KohaConf: /home/koha/mylibrary/etc/koha-conf.xml
358 # Zebra index to scan
360 # Koha index used to link found keywords with an OPAC search URL
362 # Number of top keywords to use for the cloud
364 # Include CSS style directives with the cloud
365 # This could be used as a model and then CSS directives are
366 # put in the appropriate CSS file directly.
368 # HTML file where to output the cloud
369 Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
371 KohaConf: /home/koha/yourlibrary/etc/koha-conf.xml
376 Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
380 Generated top terms carry more information than what is output for
381 the time being. Some parameters could be easily added to improve
388 Output terms together with the number of occurrences Zebra
389 found for them in the Koha catalogue.
393 Number of levels in the cloud. Currently 24 levels are hardcoded.
397 Weighting method used to distribute terms in the cloud. We could have two
398 values: Logarithmic and Linear. Now it's Logarithmic by default.
402 Currently terms are output in lexical order. They could be sorted