Bug 13903: (QA followup) change routes to /holds
[koha.git] / misc / cronjobs / MARC21_parse_test.pl
blobf745f11b9911033f8bc6690cd140ee41b6115510
1 #!/usr/bin/perl
3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 use strict;
21 use warnings;
23 use MARC::Record;
24 use MARC::File::XML;
25 use MARC::File::USMARC;
27 use open OUT => ':encoding(UTF-8)';
29 use Getopt::Long qw(:config auto_help auto_version);
30 use Pod::Usage;
32 use C4::Biblio;
33 use C4::Charset;
34 use C4::Context;
35 use C4::Debug;
37 BEGIN {
38 # find Koha's Perl modules
39 # test carefully before changing this
40 use FindBin;
41 eval { require "$FindBin::Bin/../kohalib.pl" };
44 our $debug;
46 ## OPTIONS
47 my $help = 0;
48 my $man = 0;
49 my $verbose = 0;
51 my $limit; # undef, not zero.
52 my $offset = 0;
53 my $dump = 0;
54 my $all = 0;
55 my $summary = 1;
56 my $lint = 0;
57 my $fix = 0;
58 my $filename = "/tmp/MARC21_parse_test.$$.marc";
60 GetOptions(
61 'help|?' => \$help,
62 'man' => \$man,
63 'verbose' => \$verbose,
64 'limit=i' => \$limit,
65 'offset=i' => \$offset,
66 'filename' => \$filename,
67 'All!' => \$all,
68 'Lint!' => \$lint,
69 'dump!' => \$dump,
70 'summary!' => \$summary,
71 'fix!' => \$fix,
72 ) or pod2usage(2);
73 pod2usage( -verbose => 2 ) if ($man);
74 pod2usage( -verbose => 2 ) if ($help and $verbose);
75 pod2usage(1) if $help;
77 if ($debug) {
78 $summary++;
79 $verbose++;
80 $lint++;
83 my $lint_object;
84 if ($lint) {
85 require MARC::Lint;
86 $lint_object = new MARC::Lint;
88 my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
89 (uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
91 # my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
92 # $countq->execute; $countq->fetchrow();
93 my $max = 999999; # arbitrary ceiling
95 $limit or $limit = $max; # limit becomes max if unspecified
97 if ($summary) {
98 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
99 printf "# limit %d, offset %d:\n", $limit, $offset;
100 printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
101 $verbose and print "# Using temp file: $filename\n"
104 MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
106 my $query = "SELECT * FROM biblioitems ";
107 my $recs;
108 if ($all) {
109 if ($limit or $offset) {
110 my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
111 $query .= $limit_clause;
113 $verbose and print "# Query: $query\n";
114 $recs = C4::Context->dbh->prepare($query);
115 $recs->execute();
116 } else {
117 $query .= "WHERE biblionumber=?";
118 $verbose and print "# Query: $query\n";
119 $recs = C4::Context->dbh->prepare($query);
120 # no execute, we execute per biblionumber
121 print "# Reading biblionumbers from STDIN\n";
124 sub next_row {
125 $all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
126 while (my $biblionumber = <>) {
127 chomp($biblionumber);
128 unless (defined $biblionumber) {
129 print "Skipping blank line $.\n";
130 next;
132 unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
133 print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
134 next;
136 ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
137 $recs->execute($biblionumber);
138 return $recs->fetchrow_hashref();
140 return undef; # just in case
143 my $ilimit = $limit;
144 $ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
145 my $i = 0;
146 my $found = 0;
147 my $fixed = 0;
148 my $fine = 0;
149 my $failed = 0;
150 my $warns = 0;
151 my $printline = 0;
152 while ( my $row = next_row() ) {
153 ++$i;
154 unless ($all) {
155 ($i > $ilimit) and last; # controls for user-input data/files
156 ($i > $offset) or next;
158 my $xml = $row->{marcxml};
159 my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
160 # $xml now pared down to just the <leader> element
161 $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
162 my $stripped = StripNonXmlChars($xml);
163 ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
164 my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
165 if (not $record) {
166 $found++;
167 my $msg = $@ || '';
168 $verbose or $msg =~ s# at /usr/.*$##gs; # shorten common error message
169 print "$bibnum_prefix ERROR: $msg\n";
170 } else {
171 $fine++;
173 if ($lint) {
174 open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
175 print FILE $xml;
176 close FILE;
177 my $file = MARC::File::XML->in( $filename );
178 while ( my $marc = $file->next() ) { # should be only 1
179 # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
180 $lint_object->check_record( $marc );
181 if ($lint_object->warnings) {
182 $warns++;
183 print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
187 if ($fix and not $record) {
188 my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
189 unless ($record_from_blob) {
190 print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
191 $failed++;
192 } else {
193 my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
194 if ($mod) {
195 $fixed++; print "$bibnum_prefix FIXED\n";
196 } else {
197 $failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
201 $dump and print $row->{marcxml}, "\n";
204 (-f $filename) and unlink ($filename); # remove tempfile
206 if ($summary) {
207 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
208 printf "# limit %d, offset %d:\n", $limit, $offset;
209 print "\nRESULTS (number of records)...\n";
210 printf " %6d -- OK \n", $fine;
211 printf " %6d -- w/ bad marcxml \n", $found;
212 printf " %6d -- w/ MARC::Lint warnings\n", $warns;
213 printf " %6d -- fixed from marc \n", $fixed;
214 printf " %6d -- failed to fix \n", $failed;
218 __END__
220 =head1 NAME
222 MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
224 =head1 SYNOPSIS
226 MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
228 Help Options:
229 -h --help -? Brief help message
230 -m --man Full documentation, same as --help --verbose
231 --version Prints version info
233 Feedback Options:
234 -d --dump Dump MARCXML of biblioitems processed, default OFF
235 -s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
236 -L --Lint Show any warnings from MARC::Lint, default OFF
237 -v --verbose Increase verbosity of output, default OFF
239 Run Options:
240 -f --fix Replace biblioitems.marcxml from data in marc field, default OFF
241 -A --All Use the whole biblioitems table as target set, default OFF
242 -l --limit Number of biblioitems to display or fix
243 -o --offset Number of biblioitems to skip (not displayed or fixed)
245 =head1 OPTIONS
247 =over 8
249 =item B<--All>
251 Target the entire biblioitems table.
252 Beware, on a large table B<--All> can be very costly to performance.
254 =item B<--fix>
256 Without this option, no changes to any records are made. With <--fix>, the script attempts to reconstruct
257 biblioitems.marcxml from biblioitems.marc.
259 =item B<--limit=N>
261 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
262 This applies whether the target records are determined by user input, filenames or <--All>.
264 =item B<--offset=N>
266 Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
267 The default is 0, i.e. skip none of them.
269 =back
271 The binary ON/OFF options can be negated like:
272 B<--nosummary> Do not display summary.
273 B<--nodump> Do not dump MARCXML.
274 B<--noLint> Do not show MARC::Lint warnings.
275 B<--nofix> Do not change any records. This is the default mode.
277 =head1 ARGUMENTS
279 Any number of filepath arguments can be referenced. They will be read in order and used to select the target
280 set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
281 options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
283 =head1 DESCRIPTION
285 This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
286 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
288 Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
289 Or run with B<--All> to go through the entire table.
290 Run the script again with B<--fix> to attempt repair of the same target set.
292 After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
294 =head1 USAGE EXAMPLES
296 B<MARC21_parse_test.pl>
298 In the most basic form, allows you to input biblionumbers and checks them individually.
300 B<MARC21_parse_test.pl --fix>
302 Same thing but fixes them if they fail to parse.
304 B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
306 Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
308 B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
310 Dumps MARCXML from the 16th, 17th and 18th records found in the database.
312 B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
314 Same thing as previous example in terse form.
316 =head1 TODO
318 Add more documentation for OPTIONS.
320 Update zebra status so rebuild of index is not necessary.
322 =head1 SEE ALSO
324 MARC::Lint
325 C4::Biblio
327 =cut