3 # Copyright 2009 Liblime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
25 use MARC
::File
::USMARC
;
27 use open OUT
=> ':encoding(UTF-8)';
29 use Getopt
::Long
qw(:config auto_help auto_version);
38 # find Koha's Perl modules
39 # test carefully before changing this
41 eval { require "$FindBin::Bin/../kohalib.pl" };
51 my $limit; # undef, not zero.
58 my $filename = "/tmp/MARC21_parse_test.$$.marc";
# -v is a counter ('+'): repeating it raises the level. The ($verbose > 1)
# diagnostics further down can only fire when the option is allowed to stack.
'verbose+' => \$verbose,
'offset=i' => \$offset,
# '=s' makes --filename take the temp-file path as its value; a bare
# 'filename' spec is a boolean flag and would overwrite $filename with 1.
'filename=s' => \$filename,
'summary!' => \$summary,
# Full manual for --man, or for --help combined with --verbose;
# plain --help gets the brief usage message.
pod2usage( -verbose => 2 ) if $man or ( $help and $verbose );
pod2usage(1) if $help;
# Direct class-method call rather than indirect-object syntax
# ("new MARC::Lint"), which Perl can misparse when a sub named "new"
# or a same-named bareword is in scope.
$lint_object = MARC::Lint->new;
# Only MARC21 installations are supported: abort when the syspref is unset
# or names any other flavour.
my $marcflavour = C4::Context->preference('marcflavour');
$marcflavour
    or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
die "Only marcflavour MARC21, not '$marcflavour'"
    unless uc($marcflavour) eq 'MARC21';
91 # my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
92 # $countq->execute; $countq->fetchrow();
93 my $max = 999999; # arbitrary ceiling
95 $limit or $limit = $max; # limit becomes max if unspecified
# Opening banner: report the target set, the limit/offset window and whether
# MARC::Lint warnings are enabled; the temp-file path is shown only with -v.
printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
printf "# limit %d, offset %d:\n", $limit, $offset;
printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
# Explicitly terminated so the statement cannot run into whatever follows it.
$verbose and print "# Using temp file: $filename\n";
104 MARC
::File
::XML
->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
106 my $query = "SELECT * FROM biblioitems ";
109 if ($limit or $offset) {
110 my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
111 $query .= $limit_clause;
113 $verbose and print "# Query: $query\n";
114 $recs = C4
::Context
->dbh->prepare($query);
117 $query .= "WHERE biblionumber=?";
118 $verbose and print "# Query: $query\n";
119 $recs = C4
::Context
->dbh->prepare($query);
120 # no execute, we execute per biblionumber
121 print "# Reading biblionumbers from STDIN\n";
125 $all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
126 while (my $biblionumber = <>) {
127 chomp($biblionumber);
128 unless (defined $biblionumber) {
129 print "Skipping blank line $.\n";
132 unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
133 print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
136 ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
137 $recs->execute($biblionumber);
138 return $recs->fetchrow_hashref();
140 return undef; # just in case
144 $ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
152 while ( my $row = next_row
() ) {
155 ($i > $ilimit) and last; # controls for user-input data/files
156 ($i > $offset) or next;
158 my $xml = $row->{marcxml
};
159 my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber
};
160 # $xml now pared down to just the <leader> element
161 $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber
};
162 my $stripped = StripNonXmlChars
($xml);
163 ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
164 my $record = eval { MARC
::Record
::new_from_xml
( $stripped, 'utf8', $marcflavour ) };
168 $verbose or $msg =~ s
# at /usr/.*$##gs; # shorten common error message
169 print "$bibnum_prefix ERROR: $msg\n";
# Three-argument open: the mode is fixed to '>' so the file name (which can be
# influenced from the command line) is never interpreted as a mode or pipe.
# $! is appended so the failure reason is reported.
# The bareword FILE handle is kept in case statements outside this one write
# to or close it by that name.
open(FILE, '>', $filename) or die "Cannot write to temp file: $filename: $!";
177 my $file = MARC
::File
::XML
->in( $filename );
178 while ( my $marc = $file->next() ) { # should be only 1
179 # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
180 $lint_object->check_record( $marc );
181 if ($lint_object->warnings) {
183 print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
187 if ($fix and not $record) {
188 my $record_from_blob = MARC
::Record
->new_from_usmarc($row->{marc
});
189 unless ($record_from_blob) {
190 print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
193 my $mod = ModBiblioMarc
($record_from_blob, $row->{biblionumber
}, '');
195 $fixed++; print "$bibnum_prefix FIXED\n";
197 $failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
201 $dump and print $row->{marcxml
}, "\n";
# Remove the temp file, but only when it exists as a plain file.
if ( -f $filename ) {
    unlink $filename;
}
207 printf "# Examining marcxml from %s\n", ($all ?
'ALL biblioitems' : 'SELECT biblionumbers');
208 printf "# limit %d, offset %d:\n", $limit, $offset;
209 print "\nRESULTS (number of records)...\n";
210 printf " %6d -- OK \n", $fine;
211 printf " %6d -- w/ bad marcxml \n", $found;
212 printf " %6d -- w/ MARC::Lint warnings\n", $warns;
213 printf " %6d -- fixed from marc \n", $fixed;
214 printf " %6d -- failed to fix \n", $failed;
222 MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
226 MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -L ] [ -f ] [ -A | filename ...]
229 -h --help -? Brief help message
230 -m --man Full documentation, same as --help --verbose
231 --version Prints version info
234 -d --dump Dump MARCXML of biblioitems processed, default OFF
235 -s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
236 -L --Lint Show any warnings from MARC::Lint, default OFF
237 -v --verbose Increase verbosity of output, default OFF
240 -f --fix Replace biblioitems.marcxml from data in marc field, default OFF
241 -A --All Use the whole biblioitems table as target set, default OFF
242 -l --limit Number of biblioitems to display or fix
243 -o --offset Number of biblioitems to skip (not displayed or fixed)
251 Target the entire biblioitems table.
252 Beware, on a large table B<--All> can be very costly to performance.
256 Without this option, no changes to any records are made. With B<--fix>, the script attempts to reconstruct
257 biblioitems.marcxml from biblioitems.marc.
261 Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
262 This applies whether the target records are determined by user input, filenames or B<--All>.
266 Like an OFFSET statement in SQL, this tells the script to skip N of the targeted records.
267 The default is 0, i.e. skip none of them.
271 The binary ON/OFF options can be negated like:
272 B<--nosummary> Do not display summary.
273 B<--nodump> Do not dump MARCXML.
274 B<--noLint> Do not show MARC::Lint warnings.
275 B<--nofix> Do not change any records. This is the default mode.
279 Any number of filepath arguments can be referenced. They will be read in order and used to select the target
280 set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
281 options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
285 This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
286 As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
288 Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
289 Or run with B<--All> to go through the entire table.
290 Run the script again with B<--fix> to attempt repair of the same target set.
292 After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
294 =head1 USAGE EXAMPLES
296 B<MARC21_parse_test.pl>
298 In the most basic form, allows you to input biblionumbers and checks them individually.
300 B<MARC21_parse_test.pl --fix>
302 Same thing but fixes them if they fail to parse.
304 B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
306 Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
308 B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
310 Dumps MARCXML from the 16th, 17th and 18th records found in the database.
312 B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
314 Same thing as previous example in terse form.
318 Add more documentation for OPTIONS.
320 Update zebra status so rebuild of index is not necessary.