Bug 15838 - Subscription duplicating: Reset fields from SubscriptionDuplicateDroppedI...
[koha.git] / misc / link_bibs_to_authorities.pl
blob37596c5024540b7588527a62201e46bf38565379
1 #!/usr/bin/perl
3 use strict;
4 use warnings;
6 BEGIN {
8 # find Koha's Perl modules
9 # test carefully before changing this
10 use FindBin;
11 eval { require "$FindBin::Bin/kohalib.pl" };
14 use C4::Context;
15 use C4::Biblio;
16 use Getopt::Long;
17 use Pod::Usage;
18 use Data::Dumper;
19 use Time::HiRes qw/time/;
20 use POSIX qw/strftime ceil/;
21 use Module::Load::Conditional qw(can_load);
23 sub usage {
24 pod2usage( -verbose => 2 );
25 exit;
28 $| = 1;
30 # command-line parameters
31 my $verbose = 0;
32 my $link_report = 0;
33 my $test_only = 0;
34 my $want_help = 0;
35 my $auth_limit;
36 my $bib_limit;
37 my $commit = 100;
39 my $result = GetOptions(
40 'v|verbose' => \$verbose,
41 't|test' => \$test_only,
42 'l|link-report' => \$link_report,
43 'a|auth-limit=s' => \$auth_limit,
44 'b|bib-limit=s' => \$bib_limit,
45 'c|commit=i' => \$commit,
46 'h|help' => \$want_help
49 binmode( STDOUT, ":utf8" );
51 if ( not $result or $want_help ) {
52 usage();
55 my $linker_module =
56 "C4::Linker::" . ( C4::Context->preference("LinkerModule") || 'Default' );
57 unless ( can_load( modules => { $linker_module => undef } ) ) {
58 $linker_module = 'C4::Linker::Default';
59 unless ( can_load( modules => { $linker_module => undef } ) ) {
60 die "Unable to load linker module. Aborting.";
64 my $linker = $linker_module->new(
66 'auth_limit' => $auth_limit,
67 'options' => C4::Context->preference("LinkerOptions")
71 my $num_bibs_processed = 0;
72 my $num_bibs_modified = 0;
73 my $num_bad_bibs = 0;
74 my %unlinked_headings;
75 my %linked_headings;
76 my %fuzzy_headings;
77 my $dbh = C4::Context->dbh;
78 $dbh->{AutoCommit} = 0;
79 process_bibs( $linker, $bib_limit, $auth_limit, $commit );
80 $dbh->commit();
82 exit 0;
84 sub process_bibs {
85 my ( $linker, $bib_limit, $auth_limit, $commit ) = @_;
86 my $bib_where = '';
87 my $starttime = time();
88 if ($bib_limit) {
89 $bib_where = "WHERE $bib_limit";
91 my $sql =
92 "SELECT biblionumber FROM biblio $bib_where ORDER BY biblionumber ASC";
93 my $sth = $dbh->prepare($sql);
94 $sth->execute();
95 while ( my ($biblionumber) = $sth->fetchrow_array() ) {
96 $num_bibs_processed++;
97 process_bib( $linker, $biblionumber );
99 if ( not $test_only and ( $num_bibs_processed % $commit ) == 0 ) {
100 print_progress_and_commit($num_bibs_processed);
104 if ( not $test_only ) {
105 $dbh->commit;
108 my $headings_linked = 0;
109 my $headings_unlinked = 0;
110 my $headings_fuzzy = 0;
111 for ( values %linked_headings ) { $headings_linked += $_; }
112 for ( values %unlinked_headings ) { $headings_unlinked += $_; }
113 for ( values %fuzzy_headings ) { $headings_fuzzy += $_; }
115 my $endtime = time();
116 my $totaltime = ceil (($endtime - $starttime) * 1000);
117 $starttime = strftime('%D %T', localtime($starttime));
118 $endtime = strftime('%D %T', localtime($endtime));
120 my $summary = <<_SUMMARY_;
122 Bib authority heading linking report
123 =======================================================
124 Linker module: $linker_module
125 Run started at: $starttime
126 Run ended at: $endtime
127 Total run time: $totaltime ms
128 Number of bibs checked: $num_bibs_processed
129 Number of bibs modified: $num_bibs_modified
130 Number of bibs with errors: $num_bad_bibs
131 Number of headings linked: $headings_linked
132 Number of headings unlinked: $headings_unlinked
133 Number of headings fuzzily linked: $headings_fuzzy
134 _SUMMARY_
135 $summary .= "\n**** Ran in test mode only ****\n" if $test_only;
136 print $summary;
138 if ($link_report) {
139 my @keys;
140 print <<_LINKED_HEADER_;
142 Linked headings (from most frequent to least):
143 -------------------------------------------------------
145 _LINKED_HEADER_
147 @keys = sort {
148 $linked_headings{$b} <=> $linked_headings{$a} or "\L$a" cmp "\L$b"
149 } keys %linked_headings;
150 foreach my $key (@keys) {
151 print "$key:\t" . $linked_headings{$key} . " occurrences\n";
154 print <<_UNLINKED_HEADER_;
156 Unlinked headings (from most frequent to least):
157 -------------------------------------------------------
159 _UNLINKED_HEADER_
161 @keys = sort {
162 $unlinked_headings{$b} <=> $unlinked_headings{$a}
163 or "\L$a" cmp "\L$b"
164 } keys %unlinked_headings;
165 foreach my $key (@keys) {
166 print "$key:\t" . $unlinked_headings{$key} . " occurrences\n";
169 print <<_FUZZY_HEADER_;
171 Fuzzily-matched headings (from most frequent to least):
172 -------------------------------------------------------
174 _FUZZY_HEADER_
176 @keys = sort {
177 $fuzzy_headings{$b} <=> $fuzzy_headings{$a} or "\L$a" cmp "\L$b"
178 } keys %fuzzy_headings;
179 foreach my $key (@keys) {
180 print "$key:\t" . $fuzzy_headings{$key} . " occurrences\n";
182 print $summary;
186 sub process_bib {
187 my $linker = shift;
188 my $biblionumber = shift;
190 my $bib = GetMarcBiblio($biblionumber);
191 unless ( defined $bib ) {
192 print
193 "\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n";
194 $num_bad_bibs++;
195 return;
198 my ( $headings_changed, $results ) =
199 LinkBibHeadingsToAuthorities( $linker, $bib,
200 GetFrameworkCode($biblionumber) );
201 foreach my $key ( keys %{ $results->{'unlinked'} } ) {
202 $unlinked_headings{$key} += $results->{'unlinked'}->{$key};
204 foreach my $key ( keys %{ $results->{'linked'} } ) {
205 $linked_headings{$key} += $results->{'linked'}->{$key};
207 foreach my $key ( keys %{ $results->{'fuzzy'} } ) {
208 $fuzzy_headings{$key} += $results->{'fuzzy'}->{$key};
211 if ($headings_changed) {
212 if ($verbose) {
213 my $title = substr( $bib->title, 0, 20 );
214 print
215 "Bib $biblionumber ($title): $headings_changed headings changed\n";
217 if ( not $test_only ) {
218 ModBiblio( $bib, $biblionumber, GetFrameworkCode($biblionumber) );
219 $num_bibs_modified++;
224 sub print_progress_and_commit {
225 my $recs = shift;
226 $dbh->commit();
227 print "... processed $recs records\n";
230 =head1 NAME
232 link_bibs_to_authorities.pl
234 =head1 SYNOPSIS
236 link_bibs_to_authorities.pl
237 link_bibs_to_authorities.pl -v
238 link_bibs_to_authorities.pl -l
239 link_bibs_to_authorities.pl --commit=1000
240 link_bibs_to_authorities.pl --auth-limit=STRING
241 link_bibs_to_authorities.pl --bib-limit=STRING
243 =head1 DESCRIPTION
245 This batch job checks each bib record in the Koha database and attempts to link
246 each of its headings to the matching authority record.
248 =over 8
250 =item B<--help>
252 Prints this help
254 =item B<-v|--verbose>
256 Provide verbose log information (print the number of headings changed for each
257 bib record).
259 =item B<-l|--link-report>
261 Provide a report of all the headings that were processed: which were matched,
262 which were not, etc.
264 =item B<--auth-limit=S>
266 Only process those headings which match an authority record that matches the
267 user-specified WHERE clause.
269 =item B<--bib-limit=S>
271 Only process those bib records that match the user-specified WHERE clause.
273 =item B<--commit=N>
275 Commit the results to the database after every N records are processed.
277 =item B<--test>
279 Only test the authority linking and report the results; do not change the bib
280 records.
282 =back
284 =cut