Bug 6149 - Follow-up [syspref] - Stopwords for Result Highlighting
[koha.git] / misc / maintenance / MARC21_utf8_flag_fix.pl
blobe7e915651e8f18c979caf62fede5eaedc69b2165
1 #!/usr/bin/perl
3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 use strict;
21 use warnings;
23 use MARC::Record;
24 use MARC::File::XML;
25 use Getopt::Long qw(:config auto_help auto_version);
26 use Pod::Usage;
28 use C4::Biblio;
29 use C4::Charset;
30 use C4::Context;
31 use C4::Debug;
33 use vars qw($VERSION);
35 BEGIN {
36 # find Koha's Perl modules
37 # test carefully before changing this
38 use FindBin;
39 eval { require "$FindBin::Bin/../kohalib.pl" };
40 $VERSION = 0.02;
43 our $debug;
45 ## OPTIONS
46 my $help = 0;
47 my $man = 0;
48 my $verbose = 0;
50 my $limit; # undef, not zero.
51 my $offset = 0;
52 my $dump = 0;
53 my $summary = 1;
54 my $fix = 0;
56 GetOptions(
57 'help|?' => \$help,
58 'man' => \$man,
59 'verbose=i' => \$verbose,
60 'limit=i' => \$limit,
61 'offset=i' => \$offset,
62 'dump!' => \$dump,
63 'summary!' => \$summary,
64 'fix!' => \$fix,
65 ) or pod2usage(2);
66 pod2usage( -verbose => 2 ) if ($man);
67 pod2usage( -verbose => 2 ) if ($help and $verbose);
68 pod2usage(1) if $help;
70 if ($debug) {
71 $summary++;
72 $verbose++;
75 my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
76 ($marcflavour eq 'MARC21') or die "marcflavour must be MARC21, not $marcflavour";
78 my $all = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems");
79 $all->execute;
80 my $total = $all->fetchrow;
82 my $count_query = "SELECT COUNT(*) FROM biblioitems WHERE substr(marc, 10, 1) = ?";
83 my $query = "SELECT * FROM biblioitems WHERE substr(marc, 10, 1) <> ?";
85 my $sth = C4::Context->dbh->prepare($count_query);
86 $sth->execute('a');
87 my $count = $sth->fetchrow;
88 my $badcount = $total-$count;
90 if ($summary) {
91 print "# biblioitems with leader/09 = 'a'\n";
92 printf "# %9s match\n", $count;
93 printf "# %9s BAD \n", $badcount;
94 printf "# %9s total\n\n", $total;
95 printf "# Examining %s BAD record(s), offset %d:\n", ($limit || 'all'), $offset;
98 my $bad_recs = C4::Context->dbh->prepare($query);
99 $bad_recs->execute('a');
100 $limit or $limit = $bad_recs->rows(); # limit becomes max if unspecified
101 $limit += $offset if $offset; # increase limit for offset
102 my $i = 0;
104 MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
106 while ( my $row = $bad_recs->fetchrow_hashref() ) {
107 (++$i > $limit) and last;
108 ( $i > $offset) or next;
109 my $xml = $row->{marcxml};
110 $xml =~ s/.*(\<leader\>)/$1/s;
111 $xml =~ s/(\<\/leader\>).*/$1/s;
112 # $xml now pared down to just the <leader> element
113 printf "# %4d of %4d: biblionumber %s : %s\n", $i, $badcount, $row->{biblionumber}, $xml;
114 my $stripped = StripNonXmlChars($row->{marcxml});
115 ($stripped eq $row->{marcxml}) or printf STDERR "%d NON-XML Characters removed!!\n", (length($row->{marcxml}) - length($stripped));
116 my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
117 if ($@ or not $record) {
118 print STDERR "ERROR in MARC::Record::new_from_xml(\$marcxml, 'utf8', $marcflavour): $@\n\tSkipping $row->{biblionumber}\n";
119 next;
121 if ($fix) {
122 SetMarcUnicodeFlag($record, $marcflavour);
123 if (ModBiblioMarc($record, $row->{biblionumber})) {
124 printf "# %4d of %4d: biblionumber %s : <leader>%s</leader>\n", $i, $badcount, $row->{biblionumber}, $record->leader();
125 } else {
126 print STDERR "ERROR in ModBiblioMarc(\$record, $row->{biblionumber})\n";
129 $dump and print $row->{marcxml}, "\n";
132 __END__
134 =head1 NAME
136 MARC21_utf8_flag_fix.pl - Repair missing leader position 9 value ("a" for MARC21 - UTF8).
138 =head1 SYNOPSIS
140 MARC21_utf8_flag_fix.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l N ] [ -o N ] [ -f ]
142 Help Options:
143 -h --help -? Brief help message
144 -m --man Full documentation, same as --help --verbose
145 --version Prints version info
147 Feedback Options:
148 -d --dump Dump MARCXML of biblioitems processed, default OFF
149 -s --summary Print initial summary of good and bad biblioitems counted, default ON
150 -v --verbose Increase verbosity of output, default OFF
152 Run Options:
153 -f --fix Save repaired leaders to biblioitems.marcxml,
154 -l --limit Number of biblioitems to display or fix
155 -o --offset Number of biblioitems to skip (not displayed or fixed)
157 =head1 OPTIONS
159 =over 8
161 =item B<--fix>
163 This is the most important option. Without it, the script just tells you about the problem records.
164 With --fix, the script fixes the same records.
166 =item B<--limit=N>
168 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
169 The default is to target all records with bad leaders.
171 =item B<--offset=N>
173 Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
174 The default is 0, i.e. skip none of them.
176 =back
178 The binary ON/OFF options can be negated like:
179 B<--nosummary> Do not display summary.
180 B<--nodump> Do not dump MARCXML.
181 B<--nofix> Do not change any records. This is the default mode.
183 =head1 DESCRIPTION
185 Koha expects to have all MARXML records internalized in UTF-8 encoding. This
186 presents a problem when records have been inserted with the leader/09 showing
187 blank for MARC8 encoding. This script is used to determine the extent of the
188 problem and to fix the affected leaders.
190 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
192 Run MARC21_utf8_flag_fix.pl the first time with no options, and assuming you agree that the leaders
193 presented need fixing, run it again with B<--fix>.
195 =head1 USAGE EXAMPLES
197 B<MARC21_utf8_flag_fix.pl>
199 In the most basic form, displays summary of biblioitems examined
200 and the leader from any found without /09 = a.
202 B<MARC21_utf8_flag_fix.pl --fix>
204 Fixes the same biblioitems, displaying summary and each leader before/after change.
206 B<MARC21_utf8_flag_fix.pl --limit=3 --offset=15 --nosummary --dump>
208 Dumps MARCXML from the 16th, 17th and 18th bad records found.
210 B<MARC21_utf8_flag_fix.pl -l 3 -o 15 -s 0 -d>
212 Same thing as previous example in terse form.
214 =head1 TO DO
216 Allow biblionumbers to be piped into STDIN as the selection mechanism.
218 =head1 SEE ALSO
220 C4::Biblio
222 =cut