Bug 24759: DBRev 19.12.00.051
[koha.git] / misc / cronjobs / check-url.pl
blob68a7b8e788a862fc09afb51da3a4a3a07d6fa8b2
1 #!/usr/bin/perl
4 # Copyright 2009 Tamil s.a.r.l.
6 # This file is part of Koha.
8 # Koha is free software; you can redistribute it and/or modify it
9 # under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 3 of the License, or
11 # (at your option) any later version.
13 # Koha is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with Koha; if not, see <http://www.gnu.org/licenses>.
23 package C4::URL::Checker;
25 =head1 NAME
27 C4::URL::Checker - base object for checking URL stored in Koha DB
29 =head1 SYNOPSIS
31 use C4::URL::Checker;
33 my $checker = C4::URL::Checker->new( );
34 $checker->{ host_default } = 'http://mylib.kohalibrary.com';
35 my $checked_urls = $checker->check_biblio( 123 );
36 foreach my $url ( @$checked_urls ) {
37 print "url: ", $url->{ url  }, "\n",
38 "is_success: ", $url->{ is_success }, "\n",
39 "status: ", $url->{ status }, "\n";
42 =head1 FUNCTIONS
44 =head2 new
46 Create a URL Checker. The returned object can be used to set
47 default host variable :
49 my $checker = C4::URL::Checker->new( );
50 $checker->{ host_default } = 'http://mylib.kohalibrary.com';
52 =head2 check_biblio
54 Check all URL from a biblio record. Returns a pointer to an array
55 containing all URLs with checking for each of them.
57 my $checked_urls = $checker->check_biblio( 123 );
59 With 2 URLs, the returned array will look like that:
63 'url' => 'http://mylib.tamil.fr/img/62265_0055B.JPG',
64 'is_success' => 1,
65 'status' => 'ok'
68 'url' => 'http://mylib.tamil.fr//img/62265_0055C.JPG',
69 'is_success' => 0,
70 'status' => '404 - Page not found'
75 =cut
77 use strict;
78 use warnings;
79 use LWP::UserAgent;
80 use HTTP::Request;
81 use Koha::Script -cron;
82 use C4::Biblio;
86 sub new {
88 my $self = {};
89 my ($class, $timeout, $agent) = @_;
91 my $uagent = new LWP::UserAgent;
92 $uagent->agent( $agent ) if $agent;
93 $uagent->timeout( $timeout) if $timeout;
94 $self->{ user_agent } = $uagent;
95 $self->{ bad_url } = { };
97 bless $self, $class;
98 return $self;
102 sub check_biblio {
103 my $self = shift;
104 my $biblionumber = shift;
105 my $uagent = $self->{ user_agent };
106 my $host = $self->{ host_default };
107 my $bad_url = $self->{ bad_url };
109 my $record = GetMarcBiblio({ biblionumber => $biblionumber });
110 return unless $record->field('856');
112 my @urls = ();
113 foreach my $field ( $record->field('856') ) {
114 my $url = $field->subfield('u');
115 next unless $url;
116 $url = "$host/$url" unless $url =~ /^http/;
117 my $check = { url => $url };
118 if ( $bad_url->{ $url } ) {
119 $check->{ is_success } = 1;
120 $check->{ status } = '500 Site already checked';
122 else {
123 my $req = HTTP::Request->new( GET => $url );
124 my $res = $uagent->request( $req, sub { die }, 1 );
125 if ( $res->is_success ) {
126 $check->{ is_success } = 1;
127 $check->{ status } = 'ok';
129 else {
130 $check->{ is_success } = 0;
131 $check->{ status } = $res->status_line;
132 $bad_url->{ $url } = 1;
135 push @urls, $check;
137 return \@urls;
142 package Main;
144 use strict;
145 use warnings;
146 use diagnostics;
147 use Carp;
149 use Pod::Usage;
150 use Getopt::Long;
151 use Koha::Script -cron;
152 use C4::Context;
156 my $verbose = 0;
157 my $help = 0;
158 my $host = '';
159 my $host_pro = '';
160 my $html = 0;
161 my $uriedit = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber=";
162 my $agent = '';
163 my $timeout = 15;
164 GetOptions(
165 'verbose' => \$verbose,
166 'html' => \$html,
167 'help' => \$help,
168 'host=s' => \$host,
169 'host-pro=s' => \$host_pro,
170 'agent=s' => \$agent,
171 'timeout=i', => \$timeout,
175 sub usage {
176 pod2usage( -verbose => 2 );
177 exit;
181 sub bibediturl {
182 my $biblionumber = shift;
183 my $html = "<a href=\"$host_pro$uriedit$biblionumber\">$biblionumber</a>";
184 return $html;
189 # Check all URLs from all current Koha biblio records
191 sub check_all_url {
192 my $checker = C4::URL::Checker->new($timeout,$agent);
193 $checker->{ host_default } = $host;
195 my $context = new C4::Context( );
196 my $dbh = $context->dbh;
197 my $sth = $dbh->prepare(
198 "SELECT biblionumber FROM biblioitems WHERE url <> ''" );
199 $sth->execute;
200 if ( $html ) {
201 print <<EOS;
202 <html>
203 <body>
204 <table>
207 while ( my ($biblionumber) = $sth->fetchrow ) {
208 my $result = $checker->check_biblio( $biblionumber );
209 next unless $result; # No URL
210 foreach my $url ( @$result ) {
211 if ( ! $url->{ is_success } || $verbose ) {
212 print $html
213 ? "<tr>\n<td>" . bibediturl( $biblionumber ) .
214 "</td>\n<td>" . $url->{url} . "</td>\n<td>" .
215 $url->{status} . "</td>\n</tr>\n\n"
216 : "$biblionumber\t" . $url->{ url } . "\t" .
217 $url->{ status } . "\n";
221 print "</table>\n</body>\n</html>\n" if $html;
225 # BEGIN
227 usage() if $help;
229 if ( $html && !$host_pro ) {
230 if ( $host ) {
231 $host_pro = $host;
233 else {
234 print "Error: host-pro parameter or host must be provided in html mode\n";
235 exit;
239 check_all_url();
243 =head1 NAME
245 check-url.pl - Check URLs from 856$u field.
247 =head1 USAGE
249 =over
251 =item check-url.pl [--verbose|--help] [--agent=agent-string] [--host=http://default.tld]
253 Scan all URLs found in 856$u of bib records
254 and display if resources are available or not.
255 This script is deprecated. You should rather use check-url-quick.pl.
257 =back
259 =head1 PARAMETERS
261 =over
263 =item B<--host=http://default.tld>
265 Server host used when URL doesn't have one, ie doesn't begin with 'http:'.
266 For example, if --host=http://www.mylib.com, then when 856$u contains
267 'img/image.jpg', the url checked is: http://www.mylib.com/image.jpg'.
269 =item B<--verbose|-v>
271 Outputs both successful and failed URLs.
273 =item B<--html>
275 Formats output in HTML. The result can be redirected to a file
276 accessible by http. This way, it's possible to link directly to biblio
277 record in edit mode. With this parameter B<--host-pro> is required.
279 =item B<--host-pro=http://koha-pro.tld>
281 Server host used to link to biblio record editing page.
283 =item B<--agent=agent-string>
285 Change default libwww user-agent string to custom. Some sites do
286 not like libwww user-agent and return false 40x failure codes,
287 so this allows Koha to report itself as Koha, or a browser.
289 =item B<--timeout=15>
291 Timeout for fetching URLs. By default 15 seconds.
293 =item B<--help|-h>
295 Print this help page.
297 =back
299 =cut