Bug 15230: Remove unused file circ/stats.pl and stats.tt from system
[koha.git] / misc / cronjobs / MARC21_parse_test.pl
blob83b4cf0d33387e7ebaa1fe50080467c31b23c273
1 #!/usr/bin/perl
3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 use strict;
21 use warnings;
23 use MARC::Record;
24 use MARC::File::XML;
25 use MARC::File::USMARC;
27 use open OUT => ':encoding(UTF-8)';
29 use Getopt::Long qw(:config auto_help auto_version);
30 use Pod::Usage;
32 use C4::Biblio;
33 use C4::Charset;
34 use C4::Context;
35 use C4::Debug;
37 use vars qw($VERSION);
39 BEGIN {
40 # find Koha's Perl modules
41 # test carefully before changing this
42 use FindBin;
43 eval { require "$FindBin::Bin/../kohalib.pl" };
44 $VERSION = 0.03;
47 our $debug;
49 ## OPTIONS
50 my $help = 0;
51 my $man = 0;
52 my $verbose = 0;
54 my $limit; # undef, not zero.
55 my $offset = 0;
56 my $dump = 0;
57 my $all = 0;
58 my $summary = 1;
59 my $lint = 0;
60 my $fix = 0;
61 my $filename = "/tmp/MARC21_parse_test.$$.marc";
63 GetOptions(
64 'help|?' => \$help,
65 'man' => \$man,
66 'verbose' => \$verbose,
67 'limit=i' => \$limit,
68 'offset=i' => \$offset,
69 'filename' => \$filename,
70 'All!' => \$all,
71 'Lint!' => \$lint,
72 'dump!' => \$dump,
73 'summary!' => \$summary,
74 'fix!' => \$fix,
75 ) or pod2usage(2);
76 pod2usage( -verbose => 2 ) if ($man);
77 pod2usage( -verbose => 2 ) if ($help and $verbose);
78 pod2usage(1) if $help;
80 if ($debug) {
81 $summary++;
82 $verbose++;
83 $lint++;
86 my $lint_object;
87 if ($lint) {
88 require MARC::Lint;
89 $lint_object = new MARC::Lint;
91 my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
92 (uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
94 # my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
95 # $countq->execute; $countq->fetchrow();
96 my $max = 999999; # arbitrary ceiling
98 $limit or $limit = $max; # limit becomes max if unspecified
100 if ($summary) {
101 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
102 printf "# limit %d, offset %d:\n", $limit, $offset;
103 printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
104 $verbose and print "# Using temp file: $filename\n"
107 MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
109 my $query = "SELECT * FROM biblioitems ";
110 my $recs;
111 if ($all) {
112 if ($limit or $offset) {
113 my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
114 $query .= $limit_clause;
116 $verbose and print "# Query: $query\n";
117 $recs = C4::Context->dbh->prepare($query);
118 $recs->execute();
119 } else {
120 $query .= "WHERE biblionumber=?";
121 $verbose and print "# Query: $query\n";
122 $recs = C4::Context->dbh->prepare($query);
123 # no execute, we execute per biblionumber
124 print "# Reading biblionumbers from STDIN\n";
127 sub next_row {
128 $all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
129 while (my $biblionumber = <>) {
130 chomp($biblionumber);
131 unless (defined $biblionumber) {
132 print "Skipping blank line $.\n";
133 next;
135 unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
136 print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
137 next;
139 ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
140 $recs->execute($biblionumber);
141 return $recs->fetchrow_hashref();
143 return undef; # just in case
146 my $ilimit = $limit;
147 $ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
148 my $i = 0;
149 my $found = 0;
150 my $fixed = 0;
151 my $fine = 0;
152 my $failed = 0;
153 my $warns = 0;
154 my $printline = 0;
155 while ( my $row = next_row() ) {
156 ++$i;
157 unless ($all) {
158 ($i > $ilimit) and last; # controls for user-input data/files
159 ($i > $offset) or next;
161 my $xml = $row->{marcxml};
162 my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
163 # $xml now pared down to just the <leader> element
164 $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
165 my $stripped = StripNonXmlChars($xml);
166 ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
167 my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
168 if (not $record) {
169 $found++;
170 my $msg = $@ || '';
171 $verbose or $msg =~ s# at /usr/.*$##gs; # shorten common error message
172 print "$bibnum_prefix ERROR: $msg\n";
173 } else {
174 $fine++;
176 if ($lint) {
177 open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
178 print FILE $xml;
179 close FILE;
180 my $file = MARC::File::XML->in( $filename );
181 while ( my $marc = $file->next() ) { # should be only 1
182 # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
183 $lint_object->check_record( $marc );
184 if ($lint_object->warnings) {
185 $warns++;
186 print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
190 if ($fix and not $record) {
191 my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
192 unless ($record_from_blob) {
193 print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
194 $failed++;
195 } else {
196 my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
197 if ($mod) {
198 $fixed++; print "$bibnum_prefix FIXED\n";
199 } else {
200 $failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
204 $dump and print $row->{marcxml}, "\n";
207 (-f $filename) and unlink ($filename); # remove tempfile
209 if ($summary) {
210 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
211 printf "# limit %d, offset %d:\n", $limit, $offset;
212 print "\nRESULTS (number of records)...\n";
213 printf " %6d -- OK \n", $fine;
214 printf " %6d -- w/ bad marcxml \n", $found;
215 printf " %6d -- w/ MARC::Lint warnings\n", $warns;
216 printf " %6d -- fixed from marc \n", $fixed;
217 printf " %6d -- failed to fix \n", $failed;
221 __END__
223 =head1 NAME
225 MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
227 =head1 SYNOPSIS
229 MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
231 Help Options:
232 -h --help -? Brief help message
233 -m --man Full documentation, same as --help --verbose
234 --version Prints version info
236 Feedback Options:
237 -d --dump Dump MARCXML of biblioitems processed, default OFF
238 -s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
239 -L --Lint Show any warnings from MARC::Lint, default OFF
240 -v --verbose Increase verbosity of output, default OFF
242 Run Options:
243 -f --fix Replace biblioitems.marcxml from data in marc field, default OFF
244 -A --All Use the whole biblioitems table as target set, default OFF
245 -l --limit Number of biblioitems to display or fix
246 -o --offset Number of biblioitems to skip (not displayed or fixed)
248 =head1 OPTIONS
250 =over 8
252 =item B<--All>
254 Target the entire biblioitems table.
255 Beware, on a large table B<--All> can be very costly to performance.
257 =item B<--fix>
259 Without this option, no changes to any records are made. With <--fix>, the script attempts to reconstruct
260 biblioitems.marcxml from biblioitems.marc.
262 =item B<--limit=N>
264 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
265 This applies whether the target records are determined by user input, filenames or <--All>.
267 =item B<--offset=N>
269 Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
270 The default is 0, i.e. skip none of them.
272 =back
274 The binary ON/OFF options can be negated like:
275 B<--nosummary> Do not display summary.
276 B<--nodump> Do not dump MARCXML.
277 B<--noLint> Do not show MARC::Lint warnings.
278 B<--nofix> Do not change any records. This is the default mode.
280 =head1 ARGUMENTS
282 Any number of filepath arguments can be referenced. They will be read in order and used to select the target
283 set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
284 options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
286 =head1 DESCRIPTION
288 This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
289 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
291 Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
292 Or run with B<--All> to go through the entire table.
293 Run the script again with B<--fix> to attempt repair of the same target set.
295 After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
297 =head1 USAGE EXAMPLES
299 B<MARC21_parse_test.pl>
301 In the most basic form, allows you to input biblionumbers and checks them individually.
303 B<MARC21_parse_test.pl --fix>
305 Same thing but fixes them if they fail to parse.
307 B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
309 Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
311 B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
313 Dumps MARCXML from the 16th, 17th and 18th records found in the database.
315 B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
317 Same thing as previous example in terse form.
319 =head1 TODO
321 Add more documentation for OPTIONS.
323 Update zebra status so rebuild of index is not necessary.
325 =head1 SEE ALSO
327 MARC::Lint
328 C4::Biblio
330 =cut