Bug 26922: Regression tests
[koha.git] / misc / link_bibs_to_authorities.pl
blobe7df4d57f676f2f7b095066f378bc6777ec8ef6e
1 #!/usr/bin/perl
3 use strict;
4 use warnings;
6 BEGIN {
8 # find Koha's Perl modules
9 # test carefully before changing this
10 use FindBin;
11 eval { require "$FindBin::Bin/kohalib.pl" };
14 use Koha::Script;
15 use C4::Context;
16 use C4::Biblio;
17 use Getopt::Long;
18 use Pod::Usage;
19 use Data::Dumper;
20 use Time::HiRes qw/time/;
21 use POSIX qw/strftime ceil/;
22 use Module::Load::Conditional qw(can_load);
24 sub usage {
25 pod2usage( -verbose => 2 );
26 exit;
29 $| = 1;
31 # command-line parameters
32 my $verbose = 0;
33 my $link_report = 0;
34 my $test_only = 0;
35 my $want_help = 0;
36 my $auth_limit;
37 my $bib_limit;
38 my $commit = 100;
39 my $tagtolink;
40 my $allowrelink = C4::Context->preference("CatalogModuleRelink") || '';
42 my $result = GetOptions(
43 'v|verbose' => \$verbose,
44 't|test' => \$test_only,
45 'l|link-report' => \$link_report,
46 'a|auth-limit=s' => \$auth_limit,
47 'b|bib-limit=s' => \$bib_limit,
48 'c|commit=i' => \$commit,
49 'g|tagtolink=i' => \$tagtolink,
50 'h|help' => \$want_help
53 binmode( STDOUT, ":encoding(UTF-8)" );
55 if ( not $result or $want_help ) {
56 usage();
59 my $linker_module =
60 "C4::Linker::" . ( C4::Context->preference("LinkerModule") || 'Default' );
61 unless ( can_load( modules => { $linker_module => undef } ) ) {
62 $linker_module = 'C4::Linker::Default';
63 unless ( can_load( modules => { $linker_module => undef } ) ) {
64 die "Unable to load linker module. Aborting.";
68 my $linker = $linker_module->new(
70 'auth_limit' => $auth_limit,
71 'options' => C4::Context->preference("LinkerOptions")
75 my $num_bibs_processed = 0;
76 my $num_bibs_modified = 0;
77 my $num_bad_bibs = 0;
78 my %unlinked_headings;
79 my %linked_headings;
80 my %fuzzy_headings;
81 my $dbh = C4::Context->dbh;
82 $dbh->{AutoCommit} = 0;
83 process_bibs( $linker, $bib_limit, $auth_limit, $commit, { tagtolink => $tagtolink, allowrelink => $allowrelink });
84 $dbh->commit();
86 exit 0;
88 sub process_bibs {
89 my ( $linker, $bib_limit, $auth_limit, $commit, $args ) = @_;
90 my $tagtolink = $args->{tagtolink};
91 my $allowrelink = $args->{allowrelink};
92 my $bib_where = '';
93 my $starttime = time();
94 if ($bib_limit) {
95 $bib_where = "WHERE $bib_limit";
97 my $sql =
98 "SELECT biblionumber FROM biblio $bib_where ORDER BY biblionumber ASC";
99 my $sth = $dbh->prepare($sql);
100 $sth->execute();
101 my $linker_args = { tagtolink => $tagtolink, allowrelink => $allowrelink };
102 while ( my ($biblionumber) = $sth->fetchrow_array() ) {
103 $num_bibs_processed++;
104 process_bib( $linker, $biblionumber, $linker_args );
106 if ( not $test_only and ( $num_bibs_processed % $commit ) == 0 ) {
107 print_progress_and_commit($num_bibs_processed);
111 if ( not $test_only ) {
112 $dbh->commit;
115 my $headings_linked = 0;
116 my $headings_unlinked = 0;
117 my $headings_fuzzy = 0;
118 for ( values %linked_headings ) { $headings_linked += $_; }
119 for ( values %unlinked_headings ) { $headings_unlinked += $_; }
120 for ( values %fuzzy_headings ) { $headings_fuzzy += $_; }
122 my $endtime = time();
123 my $totaltime = ceil (($endtime - $starttime) * 1000);
124 $starttime = strftime('%D %T', localtime($starttime));
125 $endtime = strftime('%D %T', localtime($endtime));
127 my $summary = <<_SUMMARY_;
129 Bib authority heading linking report
130 =======================================================
131 Linker module: $linker_module
132 Run started at: $starttime
133 Run ended at: $endtime
134 Total run time: $totaltime ms
135 Number of bibs checked: $num_bibs_processed
136 Number of bibs modified: $num_bibs_modified
137 Number of bibs with errors: $num_bad_bibs
138 Number of headings linked: $headings_linked
139 Number of headings unlinked: $headings_unlinked
140 Number of headings fuzzily linked: $headings_fuzzy
141 _SUMMARY_
142 $summary .= "\n**** Ran in test mode only ****\n" if $test_only;
143 print $summary;
145 if ($link_report) {
146 my @keys;
147 print <<_LINKED_HEADER_;
149 Linked headings (from most frequent to least):
150 -------------------------------------------------------
152 _LINKED_HEADER_
154 @keys = sort {
155 $linked_headings{$b} <=> $linked_headings{$a} or "\L$a" cmp "\L$b"
156 } keys %linked_headings;
157 foreach my $key (@keys) {
158 print "$key:\t" . $linked_headings{$key} . " occurrences\n";
161 print <<_UNLINKED_HEADER_;
163 Unlinked headings (from most frequent to least):
164 -------------------------------------------------------
166 _UNLINKED_HEADER_
168 @keys = sort {
169 $unlinked_headings{$b} <=> $unlinked_headings{$a}
170 or "\L$a" cmp "\L$b"
171 } keys %unlinked_headings;
172 foreach my $key (@keys) {
173 print "$key:\t" . $unlinked_headings{$key} . " occurrences\n";
176 print <<_FUZZY_HEADER_;
178 Fuzzily-matched headings (from most frequent to least):
179 -------------------------------------------------------
181 _FUZZY_HEADER_
183 @keys = sort {
184 $fuzzy_headings{$b} <=> $fuzzy_headings{$a} or "\L$a" cmp "\L$b"
185 } keys %fuzzy_headings;
186 foreach my $key (@keys) {
187 print "$key:\t" . $fuzzy_headings{$key} . " occurrences\n";
189 print $summary;
193 sub process_bib {
194 my $linker = shift;
195 my $biblionumber = shift;
196 my $args = shift;
197 my $tagtolink = $args->{tagtolink};
198 my $allowrelink = $args->{allowrelink};
199 my $bib = GetMarcBiblio({ biblionumber => $biblionumber });
200 unless ( defined $bib ) {
201 print
202 "\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n";
203 $num_bad_bibs++;
204 return;
207 my $frameworkcode = GetFrameworkCode($biblionumber);
209 my ( $headings_changed, $results ) =
210 LinkBibHeadingsToAuthorities( $linker, $bib, $frameworkcode, $allowrelink, $tagtolink );
211 foreach my $key ( keys %{ $results->{'unlinked'} } ) {
212 $unlinked_headings{$key} += $results->{'unlinked'}->{$key};
214 foreach my $key ( keys %{ $results->{'linked'} } ) {
215 $linked_headings{$key} += $results->{'linked'}->{$key};
217 foreach my $key ( keys %{ $results->{'fuzzy'} } ) {
218 $fuzzy_headings{$key} += $results->{'fuzzy'}->{$key};
221 if ($headings_changed) {
222 if ($verbose) {
223 my $title = substr( $bib->title, 0, 20 );
224 printf(
225 "Bib %12d (%-20s): %3d headings changed\n",
226 $biblionumber,
227 $title,
228 $headings_changed
231 if ( not $test_only ) {
232 ModBiblio( $bib, $biblionumber, $frameworkcode, 1 );
233 #Last param is to note ModBiblio was called from linking script and bib should not be linked again
234 $num_bibs_modified++;
239 sub print_progress_and_commit {
240 my $recs = shift;
241 $dbh->commit();
242 print "... processed $recs records\n";
245 =head1 NAME
247 link_bibs_to_authorities.pl
249 =head1 SYNOPSIS
251 link_bibs_to_authorities.pl
252 link_bibs_to_authorities.pl -v
253 link_bibs_to_authorities.pl -l
254 link_bibs_to_authorities.pl --commit=1000
255 link_bibs_to_authorities.pl --auth-limit=STRING
256 link_bibs_to_authorities.pl --bib-limit=STRING
257 link_bibs_to_authorities.pl -g=700
259 =head1 DESCRIPTION
261 This batch job checks each bib record in the Koha database and attempts to link
262 each of its headings to the matching authority record.
264 =over 8
266 =item B<--help>
268 Prints this help
270 =item B<-v|--verbose>
272 Provide verbose log information (print the number of headings changed for each
273 bib record).
275 =item B<-l|--link-report>
277 Provide a report of all the headings that were processed: which were matched,
278 which were not, etc.
280 =item B<--auth-limit=S>
282 Only process those headings which match an authority record that matches the
283 user-specified WHERE clause.
285 =item B<--bib-limit=S>
287 Only process those bib records that match the user-specified WHERE clause.
289 =item B<--commit=N>
291 Commit the results to the database after every N records are processed.
293 =item B<-g=N>
295 Only process those headings found in MARC field N.
297 =item B<--test>
299 Only test the authority linking and report the results; do not change the bib
300 records.
302 =back
304 =cut