3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 use MARC
::File
::USMARC
;
27 use open OUT
=> ':utf8';
29 use Getopt
::Long
qw(:config auto_help auto_version);
37 use vars
qw($VERSION);
40 # find Koha's Perl modules
41 # test carefully before changing this
43 eval { require "$FindBin::Bin/../kohalib.pl" };
54 my $limit; # undef, not zero.
61 my $filename = "/tmp/MARC21_parse_test.$$.marc";
66 'verbose' => \$verbose,
68 'offset=i' => \$offset,
69 'filename' => \$filename,
73 'summary!' => \$summary,
76 pod2usage( -verbose => 2 ) if ($man);
77 pod2usage( -verbose => 2 ) if ($help and $verbose);
78 pod2usage(1) if $help;
89 $lint_object = new MARC::Lint;
91 my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
92 (uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
94 # my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
95 # $countq->execute; $countq->fetchrow();
96 my $max = 999999; # arbitrary ceiling
98 $limit or $limit = $max; # limit becomes max if unspecified
101 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
102 printf "# limit %d, offset %d:\n", $limit, $offset;
103 printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
104 $verbose and print "# Using temp file: $filename\n"
107 MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
109 my $query = "SELECT * FROM biblioitems ";
112 if ($limit or $offset) {
113 my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
114 $query .= $limit_clause;
116 $verbose and print "# Query: $query\n";
117 $recs = C4::Context->dbh->prepare($query);
120 $query .= "WHERE biblionumber=?";
121 $verbose and print "# Query: $query\n";
122 $recs = C4::Context->dbh->prepare($query);
123 # no execute, we execute per biblionumber
124 print "# Reading biblionumbers from STDIN\n";
128 $all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
129 while (my $biblionumber = <>) {
130 chomp($biblionumber);
131 unless (defined $biblionumber) {
132 print "Skipping blank line $.\n";
135 unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
136 print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
139 ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
140 $recs->execute($biblionumber);
141 return $recs->fetchrow_hashref();
143 return undef; # just in case
147 $ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
155 while ( my $row = next_row() ) {
158 ($i > $ilimit) and last; # controls for user-input data/files
159 ($i > $offset) or next;
161 my $xml = $row->{marcxml};
162 my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
163 # $xml now pared down to just the <leader> element
164 $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
165 my $stripped = StripNonXmlChars($xml);
166 ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
167 my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
171 $verbose or $msg =~ s# at /usr/.*$##gs; # shorten common error message
172 print "$bibnum_prefix ERROR: $msg\n";
177 open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
180 my $file = MARC::File::XML->in( $filename );
181 while ( my $marc = $file->next() ) { # should be only 1
182 # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
183 $lint_object->check_record( $marc );
184 if ($lint_object->warnings) {
186 print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
190 if ($fix and not $record) {
191 my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
192 unless ($record_from_blob) {
193 print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
196 my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
198 $fixed++; print "$bibnum_prefix FIXED\n";
200 $failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
204 $dump and print $row->{marcxml}, "\n";
207 (-f $filename) and unlink ($filename); # remove tempfile
210 printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
211 printf "# limit %d, offset %d:\n", $limit, $offset;
212 print "\nRESULTS (number of records)...\n";
213 printf " %6d -- OK \n", $fine;
214 printf " %6d -- w/ bad marcxml \n", $found;
215 printf " %6d -- w/ MARC::Lint warnings\n", $warns;
216 printf " %6d -- fixed from marc \n", $fixed;
217 printf " %6d -- failed to fix \n", $failed;
225 MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
229 MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
232 -h --help -? Brief help message
233 -m --man Full documentation, same as --help --verbose
234 --version Prints version info
237 -d --dump Dump MARCXML of biblioitems processed, default OFF
238 -s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
239 -L --Lint Show any warnings from MARC::Lint, default OFF
240 -v --verbose Increase verbosity of output, default OFF
243 -f --fix Replace biblioitems.marcxml from data in marc field, default OFF
244 -A --All Use the whole biblioitems table as target set, default OFF
245 -l --limit Number of biblioitems to display or fix
246 -o --offset Number of biblioitems to skip (not displayed or fixed)
254 Target the entire biblioitems table.
255 Beware, on a large table B<--All> can be very costly to performance.
259 Without this option, no changes to any records are made. With <--fix>, the script attempts to reconstruct
260 biblioitems.marcxml from biblioitems.marc.
264 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
265 This applies whether the target records are determined by user input, filenames or <--All>.
269 Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
270 The default is 0, i.e. skip none of them.
274 The binary ON/OFF options can be negated like:
275 B<--nosummary> Do not display summary.
276 B<--nodump> Do not dump MARCXML.
277 B<--noLint> Do not show MARC::Lint warnings.
278 B<--nofix> Do not change any records. This is the default mode.
282 Any number of filepath arguments can be referenced. They will be read in order and used to select the target
283 set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
284 options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
288 This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
289 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
291 Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
292 Or run with B<--All> to go through the entire table.
293 Run the script again with B<--fix> to attempt repair of the same target set.
295 After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
297 =head1 USAGE EXAMPLES
299 B<MARC21_parse_test.pl>
301 In the most basic form, allows you to input biblionumbers and checks them individually.
303 B<MARC21_parse_test.pl --fix>
305 Same thing but fixes them if they fail to parse.
307 B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
309 Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
311 B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
313 Dumps MARCXML from the 16th, 17th and 18th records found in the database.
315 B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
317 Same thing as previous example in terse form.
321 Add more documentation for OPTIONS.
323 Update zebra status so rebuild of index is not necessary.