Bug 24807: (follow-up) Add support for spaces as unknown characters
[koha.git] / misc / maintenance / sanitize_records.pl
blobf9084e8e3b365408898e58c4d09db60761fc4cd5
1 #!/usr/bin/perl
3 # This file is part of Koha.
5 # Copyright 2014 BibLibre
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 use Modern::Perl;
22 use Koha::Script;
23 use C4::Charset qw( SanitizeRecord );
24 use C4::Context;
25 use DBI;
26 use C4::Biblio;
27 use Getopt::Long;
28 use Pod::Usage;
# Command-line switches; every one of them is filled in by GetOptions below.
my (
    $help,     $verbose,  $confirm,     $biblionumbers,
    $reindex,  $filename, $auto_search, $fix_ampersand
);

my $result = GetOptions(
    'h|help'          => \$help,
    'v|verbose'       => \$verbose,
    'c|confirm'       => \$confirm,
    'biblionumbers:s' => \$biblionumbers,
    'reindex'         => \$reindex,
    'f|filename:s'    => \$filename,
    'auto-search'     => \$auto_search,
    'fix-ampersand'   => \$fix_ampersand,
) || pod2usage(1);

# Fixing ampersands is the only sanitization this script performs at the
# moment, so it is always switched on, whether or not --fix-ampersand was given.
$fix_ampersand = 1;

pod2usage(0) if $help;

# Exactly one source of record numbers must be supplied: --filename,
# --biblionumbers or --auto-search.
my $source_count = grep { $_ } ( $filename, $biblionumbers, $auto_search );

if ( $source_count == 0 ) {
    pod2usage(
        -exitval => 1,
        -message =>
          qq{\n\tAt least one record number source should be provided.\n}
    );
}
elsif ( $source_count > 1 ) {
    pod2usage(
        -exitval => 1,
        -message => qq{\n\tOnly one record number source should be provided.\n}
    );
}
# Build the list of biblionumbers to process from whichever source was
# selected: the automatic database search, a file, or the --biblionumbers list.
my @biblionumbers;

if ($auto_search) {
    @biblionumbers = biblios_to_sanitize();
}
elsif ($filename) {
    if ( -e $filename ) {
        open( my $fh, '<', $filename ) || die("Can't open $filename ($!)");
        while ( my $line = <$fh> ) {
            chomp $line;

            # Numbers may be separated by any run of whitespace (spaces,
            # tabs, a stray carriage return) and/or commas. Splitting on a
            # literal space only would leave tab-separated numbers glued
            # together and rejected later as invalid biblionumbers.
            push @biblionumbers, split( /[\s,]+/, $line );
        }
        close $fh;
    }
    else {
        pod2usage(
            -exitval => 1,
            -message =>
qq{\n\tThis filename does not exist. Please verify the path is correct.\n}
        );
    }
}
else {
    @biblionumbers = split m|,|, $biblionumbers if $biblionumbers;
}

# Trim leading and trailing whitespace from every entry.
s/(^\s*|\s*$)//g for @biblionumbers;

# Drop entries that are empty once trimmed (blank lines, doubled separators).
@biblionumbers = grep { !/^$/ } @biblionumbers;

say @biblionumbers . " records to process" if $verbose;
# Sanitize each requested record. @changes collects the biblionumbers of the
# records that needed cleaning; they are only written back with --confirm.
my @changes;
for my $biblionumber (@biblionumbers) {
    print "processing record $biblionumber..." if $verbose;

    unless ( $biblionumber =~ m|^\d+$| ) {
        say " skipping. ERROR: Invalid biblionumber." if $verbose;
        next;
    }

    my $record = C4::Biblio::GetMarcBiblio( { biblionumber => $biblionumber } );
    unless ($record) {
        say " skipping. ERROR: Invalid record." if $verbose;
        next;
    }

    my ( $cleaned_record, $has_been_modified ) =
      C4::Charset::SanitizeRecord( $record, $biblionumber );

    if ($has_been_modified) {
        if ($confirm) {

            # GetFrameworkCode looks the framework up by biblionumber, not
            # from the MARC record object. Passing the record made the lookup
            # come back empty, so the saved record lost its framework.
            my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
            C4::Biblio::ModBiblio( $cleaned_record, $biblionumber,
                $frameworkcode );
        }
        push @changes, $biblionumber;
        say " Done!" if $verbose;
    }
    else {
        say " Nothing to do." if $verbose;
    }
}

if ($verbose) {
    say "Total: "
      . @changes
      . " records "
      . ( $confirm ? "cleaned!" : "to clean." );
}
# Reindex the records that were actually rewritten. This only happens when
# --reindex and --confirm were both given and at least one record changed.
if ( $reindex and $confirm and @changes ) {
    say "Now, reindexing using -b -v" if $verbose;
    my $kohapath = C4::Context->config('intranetdir');

    # List-form system() runs the script directly without a shell, so neither
    # the install path nor the WHERE clause goes through shell quoting.
    my @cmd = (
        "$kohapath/misc/migration_tools/rebuild_zebra.pl",
        '-b', '-v',
        '-where', 'biblionumber IN ( ' . join( ',', @changes ) . ' )',
    );

    if ( system(@cmd) != 0 ) {
        my $reason =
          $? == -1
          ? "could not be executed ($!)"
          : 'exited with status ' . ( $? >> 8 );
        warn "Reindexing failed: $cmd[0] $reason\n";
    }
}
# Find the records that need sanitizing.
#
# Selects the biblionumber of every biblio_metadata row in marcxml format
# whose schema matches the 'marcflavour' system preference and whose metadata
# contains the doubly-encoded ampersand sequence used in the LIKE clause.
#
# Takes no arguments. Returns a (possibly empty) list of biblionumbers.
sub biblios_to_sanitize {
    my $dbh   = C4::Context->dbh;
    my $query = q{
        SELECT biblionumber
        FROM biblio_metadata
        WHERE format = 'marcxml'
            AND `schema` = ?
            AND metadata LIKE "%&amp;amp;%"
    };

    # selectcol_arrayref takes no Slice attribute (that one belongs to
    # selectall_arrayref), so no attributes are passed here.
    my $found = $dbh->selectcol_arrayref( $query, undef,
        C4::Context->preference('marcflavour') );

    # Fall back to an empty list if the handle returned undef on error.
    return @{ $found // [] };
}
163 =head1 NAME
165 sanitize_records - This script sanitizes a record.
167 =head1 SYNOPSIS
169 sanitize_records.pl [-h|--help] [-v|--verbose] [-c|--confirm] [--biblionumbers=BIBLIONUMBER_LIST] [-f|--filename=FILENAME] [--auto-search] [--reindex] [--fix-ampersand]
171 You can either give some biblionumbers or a file with biblionumbers or ask for an auto-search.
173 =head1 OPTIONS
175 =over
177 =item B<-h|--help>
179 Print a brief help message
181 =item B<-v|--verbose>
183 Verbose mode.
185 =item B<-c|--confirm>
187 This flag must be provided in order for the script to actually
188 sanitize records. If it is not supplied, the script will
189 only report on the record list to process.
191 =item B<--biblionumbers=BIBLIONUMBER_LIST>
193 Give a biblionumber list using this parameter. They must be separated by
194 commas.
196 =item B<-f|--filename=FILENAME>
198 Give a biblionumber list using a filename. One biblionumber per line, or several on a line separated by whitespace or commas.
200 =item B<--auto-search>
202 Automatically search records containing "&amp;" in biblio_metadata.metadata or in the specified fields.
204 =item B<--fix-ampersand>
206 Replace '&amp;' by '&' in the records.
207 Replace '&amp;amp;amp;etc.' with '&amp;' in the records.
209 =item B<--reindex>
211 Reindex the modified records.
213 =back
215 =head1 AUTHOR
217 Alex Arnaud <alex.arnaud@biblibre.com>
218 Christophe Croullebois <christophe.croullebois@biblibre.com>
219 Jonathan Druart <jonathan.druart@biblibre.com>
221 =head1 COPYRIGHT
223 Copyright 2014 BibLibre
225 =head1 LICENSE
227 This file is part of Koha.
229 # Koha is free software; you can redistribute it and/or modify it
230 # under the terms of the GNU General Public License as published by
231 # the Free Software Foundation; either version 3 of the License, or
232 # (at your option) any later version.
234 # Koha is distributed in the hope that it will be useful, but
235 # WITHOUT ANY WARRANTY; without even the implied warranty of
236 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
237 # GNU General Public License for more details.
239 # You should have received a copy of the GNU General Public License
240 # along with Koha; if not, see <http://www.gnu.org/licenses>.
242 =cut