Bug 16601: Update of italian MARC21 files
[koha.git] / misc / maintenance / MARC21_utf8_flag_fix.pl
blob17b885d9f8279335668e22483d20cde378ff10ae
1 #!/usr/bin/perl
3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 use strict;
21 use warnings;
23 use MARC::Record;
24 use MARC::File::XML;
25 use Getopt::Long qw(:config auto_help auto_version);
26 use Pod::Usage;
28 use C4::Biblio;
29 use C4::Charset;
30 use C4::Context;
31 use C4::Debug;
BEGIN {
    # Find Koha's Perl modules by loading kohalib.pl from the directory
    # above this script.  Test carefully before changing this.
    # NOTE(review): this block follows the "use C4::..." statements in the
    # file, and "use" is processed at compile time in source order, so it
    # does not appear to affect where those modules are found -- confirm.
    use FindBin;
    my $kohalib = "$FindBin::Bin/../kohalib.pl";

    # Any failure to load kohalib.pl is ignored here.
    eval { require $kohalib };
}
our $debug;    # package global; presumably populated by C4::Debug -- TODO confirm

## OPTIONS
# Defaults for every command line switch; GetOptions overwrites them below.
my $help    = 0;
my $man     = 0;
my $verbose = 0;

my $limit;          # undef, not zero.
my $offset  = 0;
my $dump    = 0;
my $summary = 1;
my $fix     = 0;

GetOptions(
    'help|?'    => \$help,
    'man'       => \$man,
    # ":+" still accepts "--verbose=N", and additionally lets a bare "-v"
    # raise the level by one, which is how the POD documents the switch.
    # The previous "=i" spec rejected "-v" without a number.
    'verbose:+' => \$verbose,
    'limit=i'   => \$limit,
    'offset=i'  => \$offset,
    'dump!'     => \$dump,
    'summary!'  => \$summary,
    'fix!'      => \$fix,
) or pod2usage(2);

# --man, or --help combined with --verbose, prints the full documentation;
# plain --help prints the brief usage text.
pod2usage( -verbose => 2 ) if ($man);
pod2usage( -verbose => 2 ) if ($help and $verbose);
pod2usage(1) if $help;

# Debug mode bumps both the summary and verbose settings.
if ($debug) {
    $summary++;
    $verbose++;
}
# Refuse to run unless the installation is configured for MARC21.
my $marcflavour = C4::Context->preference('marcflavour');
$marcflavour or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
die "marcflavour must be MARC21, not $marcflavour" unless $marcflavour eq 'MARC21';

# Total number of rows in biblioitems.
my $all = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems");
$all->execute;
my ($total) = $all->fetchrow_array;

# $count_query counts the rows whose 10th character of the marc column equals
# the bound value; $query (run later in the script) selects the rows where it
# differs.
my $count_query = "SELECT COUNT(*) FROM biblioitems WHERE substr(marc, 10, 1) = ?";
my $query = "SELECT * FROM biblioitems WHERE substr(marc, 10, 1) <> ?";

my $sth = C4::Context->dbh->prepare($count_query);
$sth->execute('a');
my ($count) = $sth->fetchrow_array;

# Everything that is not a match is reported as BAD.
my $badcount = $total - $count;

if ($summary) {
    print  "# biblioitems with leader/09 = 'a'\n";
    printf "# %9s match\n", $count;
    printf "# %9s BAD \n", $badcount;
    printf "# %9s total\n\n", $total;
    printf "# Examining %s BAD record(s), offset %d:\n", ($limit || 'all'), $offset;
}
# Select every biblioitem whose stored leader/09 is not 'a'.
my $bad_recs = C4::Context->dbh->prepare($query);
$bad_recs->execute('a');

# Index (1-based, counted over the fetched rows) of the last row to process.
# When no limit was given there is no upper bound.  The row count is NOT taken
# from $bad_recs->rows(): DBI does not guarantee that value for SELECT
# statements before all rows are fetched, and a 0 or -1 there would make the
# loop stop before processing anything.
my $last_wanted = $limit ? $limit + $offset : undef;
my $i = 0;

MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";

while ( my $row = $bad_recs->fetchrow_hashref() ) {
    $i++;
    last if defined $last_wanted and $i > $last_wanted;    # past the requested window
    next if $i <= $offset;                                 # still inside the skipped prefix

    my $biblionumber = $row->{biblionumber};
    # A NULL marcxml column is treated as empty so it is reported through the
    # parse-error path below instead of raising "uninitialized" warnings.
    my $marcxml = defined $row->{marcxml} ? $row->{marcxml} : '';

    # Pare a copy of the MARCXML down to just the <leader> element for display.
    my $xml = $marcxml;
    $xml =~ s/.*(\<leader\>)/$1/s;
    $xml =~ s/(\<\/leader\>).*/$1/s;
    printf "# %4d of %4d: biblionumber %s : %s\n", $i, $badcount, $biblionumber, $xml;

    my $stripped = StripNonXmlChars($marcxml);
    ($stripped eq $marcxml) or printf STDERR "%d NON-XML Characters removed!!\n", (length($marcxml) - length($stripped));

    my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
    if ($@ or not $record) {
        print STDERR "ERROR in MARC::Record::new_from_xml(\$marcxml, 'utf8', $marcflavour): $@\n\tSkipping $biblionumber\n";
        next;
    }

    # Only touch the database when --fix was requested.
    if ($fix) {
        SetMarcUnicodeFlag($record, $marcflavour);
        if (ModBiblioMarc($record, $biblionumber)) {
            printf "# %4d of %4d: biblionumber %s : <leader>%s</leader>\n", $i, $badcount, $biblionumber, $record->leader();
        } else {
            print STDERR "ERROR in ModBiblioMarc(\$record, $biblionumber)\n";
        }
    }

    $dump and print $marcxml, "\n";
}

# The loop may stop early because of --limit; release the handle explicitly.
$bad_recs->finish;
129 __END__
131 =head1 NAME
133 MARC21_utf8_flag_fix.pl - Repair missing leader position 9 value ("a" for MARC21 - UTF8).
135 =head1 SYNOPSIS
137 MARC21_utf8_flag_fix.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l N ] [ -o N ] [ -f ]
139 Help Options:
140 -h --help -? Brief help message
141 -m --man Full documentation, same as --help --verbose
142 --version Prints version info
144 Feedback Options:
145 -d --dump Dump MARCXML of biblioitems processed, default OFF
146 -s --summary Print initial summary of good and bad biblioitems counted, default ON
147 -v --verbose Increase verbosity of output, default OFF
149 Run Options:
150 -f --fix Save repaired leaders to biblioitems.marcxml, default OFF
151 -l --limit Number of biblioitems to display or fix
152 -o --offset Number of biblioitems to skip (not displayed or fixed)
154 =head1 OPTIONS
156 =over 8
158 =item B<--fix>
160 This is the most important option. Without it, the script just tells you about the problem records.
161 With --fix, the script fixes the same records.
163 =item B<--limit=N>
165 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
166 The default is to target all records with bad leaders.
168 =item B<--offset=N>
170 Like an OFFSET statement in SQL, this tells the script to skip N of the targeted records.
171 The default is 0, i.e. skip none of them.
173 =back
175 The binary ON/OFF options can be negated like:
176 B<--nosummary> Do not display summary.
177 B<--nodump> Do not dump MARCXML.
178 B<--nofix> Do not change any records. This is the default mode.
180 =head1 DESCRIPTION
182 Koha expects to have all MARCXML records internalized in UTF-8 encoding. This
183 presents a problem when records have been inserted with the leader/09 showing
184 blank for MARC8 encoding. This script is used to determine the extent of the
185 problem and to fix the affected leaders.
187 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
189 Run MARC21_utf8_flag_fix.pl the first time with no options, and assuming you agree that the leaders
190 presented need fixing, run it again with B<--fix>.
192 =head1 USAGE EXAMPLES
194 B<MARC21_utf8_flag_fix.pl>
196 In the most basic form, displays summary of biblioitems examined
197 and the leader from any found without /09 = a.
199 B<MARC21_utf8_flag_fix.pl --fix>
201 Fixes the same biblioitems, displaying summary and each leader before/after change.
203 B<MARC21_utf8_flag_fix.pl --limit=3 --offset=15 --nosummary --dump>
205 Dumps MARCXML from the 16th, 17th and 18th bad records found.
207 B<MARC21_utf8_flag_fix.pl -l 3 -o 15 --nosummary -d>
209 Same thing as previous example in terse form.
211 =head1 TO DO
213 Allow biblionumbers to be piped into STDIN as the selection mechanism.
215 =head1 SEE ALSO
217 C4::Biblio
219 =cut