Bug 2959 - Add a timeout parameter to the URL checker
[koha.git] / misc / cronjobs / check-url.pl
blobc2f4b5c3b8804ef4749b1ba9628632afe885bc83
1 #!/usr/bin/perl
4 # Copyright 2009 Tamil s.a.r.l.
6 # This software is placed under the gnu General Public License, v2
7 # (http://www.gnu.org/licenses/gpl.html)
12 package C4::URL::Checker;
14 =head1 NAME
16 C4::URL::Checker - base object for checking URL stored in Koha DB
18 =head1 SYNOPSIS
20 use C4::URL::Checker;
22 my $checker = C4::URL::Checker->new( );
23 $checker->{ host_default } = 'http://mylib.kohalibrary.com';
24 my $checked_urls = $checker->check_biblio( 123 );
25 foreach my $url ( @$checked_urls ) {
26 print "url: ", $url->{ url  }, "\n",
27 "is_success: ", $url->{ is_success }, "\n",
28 "status: ", $url->{ status }, "\n";
31 =head1 FUNCTIONS
33 =head2 new
35 Create a URL Checker. The returned object can be used to set
36 default host variable :
38 my $checker = C4::URL::Checker->new( );
39 $checker->{ host_default } = 'http://mylib.kohalibrary.com';
41 =head2 check_biblio
43 Check all URL from a biblio record. Returns a pointer to an array
44 containing all URLs with checking for each of them.
46 my $checked_urls = $checker->check_biblio( 123 );
48 With 2 URLs, the returned array will look like that:
52 'url' => 'http://mylib.tamil.fr/img/62265_0055B.JPG',
53 'is_success' => 1,
54 'status' => 'ok'
57 'url' => 'http://mylib.tamil.fr//img/62265_0055C.JPG',
58 'is_success' => 0,
59 'status' => '404 - Page not found'
64 =cut
66 use strict;
67 use warnings;
68 use LWP::UserAgent;
69 use HTTP::Request;
70 use C4::Biblio;
74 sub new {
76 my $self = {};
77 my ($class, $timeout) = @_;
79 my $uagent = new LWP::UserAgent;
80 $uagent->timeout( $timeout) if $timeout;
81 $self->{ user_agent } = $uagent;
82 $self->{ bad_url } = { };
84 bless $self, $class;
85 return $self;
89 sub check_biblio {
90 my $self = shift;
91 my $biblionumber = shift;
92 my $uagent = $self->{ user_agent };
93 my $host = $self->{ host_default };
94 my $bad_url = $self->{ bad_url };
96 my $record = GetMarcBiblio( $biblionumber );
97 return unless $record->field('856');
99 my @urls = ();
100 foreach my $field ( $record->field('856') ) {
101 my $url = $field->subfield('u');
102 next unless $url;
103 $url = "$host/$url" unless $url =~ /^http/;
104 my $check = { url => $url };
105 if ( $bad_url->{ $url } ) {
106 $check->{ is_success } = 1;
107 $check->{ status } = '500 Site already checked';
109 else {
110 my $req = HTTP::Request->new( GET => $url );
111 my $res = $uagent->request( $req, sub { die }, 1 );
112 if ( $res->is_success ) {
113 $check->{ is_success } = 1;
114 $check->{ status } = 'ok';
116 else {
117 $check->{ is_success } = 0;
118 $check->{ status } = $res->status_line;
119 $bad_url->{ $url } = 1;
122 push @urls, $check;
124 return \@urls;
129 package Main;
131 use strict;
132 use warnings;
133 use diagnostics;
134 use Carp;
136 use Pod::Usage;
137 use Getopt::Long;
138 use C4::Context;
142 my $verbose = 0;
143 my $help = 0;
144 my $host = '';
145 my $host_pro = '';
146 my $html = 0;
147 my $uriedit = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber=";
148 my $timeout = 15;
149 GetOptions(
150 'verbose' => \$verbose,
151 'html' => \$html,
152 'help' => \$help,
153 'host=s' => \$host,
154 'host-pro=s' => \$host_pro,
155 'timeout=i', => \$timeout,
159 sub usage {
160 pod2usage( -verbose => 2 );
161 exit;
165 sub bibediturl {
166 my $biblionumber = shift;
167 my $html = "<a href=\"$host_pro$uriedit$biblionumber\">$biblionumber</a>";
168 return $html;
173 # Check all URLs from all current Koha biblio records
175 sub check_all_url {
176 my $checker = C4::URL::Checker->new($timeout);
177 $checker->{ host_default } = $host;
179 my $context = new C4::Context( );
180 my $dbh = $context->dbh;
181 my $sth = $dbh->prepare(
182 "SELECT biblionumber FROM biblioitems WHERE url <> ''" );
183 $sth->execute;
184 if ( $html ) {
185 print <<EOS;
186 <html>
187 <body>
188 <table>
191 while ( my ($biblionumber) = $sth->fetchrow ) {
192 my $result = $checker->check_biblio( $biblionumber );
193 next unless $result; # No URL
194 foreach my $url ( @$result ) {
195 if ( ! $url->{ is_success } || $verbose ) {
196 print $html
197 ? "<tr>\n<td>" . bibediturl( $biblionumber ) .
198 "</td>\n<td>" . $url->{url} . "</td>\n<td>" .
199 $url->{status} . "</td>\n</tr>\n\n"
200 : "$biblionumber\t" . $url->{ url } . "\t" .
201 $url->{ status } . "\n";
205 print "</table>\n</body>\n</html>\n" if $html;
209 # BEGIN
211 usage() if $help;
213 if ( $html && !$host_pro ) {
214 if ( $host ) {
215 $host_pro = $host;
217 else {
218 print "Error: host-pro parameter or host must be provided in html mode\n";
219 exit;
223 check_all_url();
227 =head1 NAME
229 check-url.pl - Check URLs from 856$u field.
231 =head1 USAGE
233 =over
235 =item check-url.pl [--verbose|--help] [--host=http://default.tld]
237 Scan all URLs found in 856$u of bib records
238 and display if resources are available or not.
240 =back
242 =head1 PARAMETERS
244 =over
246 =item B<--host=http://default.tld>
248 Server host used when URL doesn't have one, ie doesn't begin with 'http:'.
249 For example, if --host=http://www.mylib.com, then when 856$u contains
250 'img/image.jpg', the url checked is: http://www.mylib.com/image.jpg'.
252 =item B<--verbose|-v>
254 Outputs both successful and failed URLs.
256 =item B<--html>
258 Formats output in HTML. The result can be redirected to a file
259 accessible by http. This way, it's possible to link directly to biblio
260 record in edit mode. With this parameter B<--host-pro> is required.
262 =item B<--host-pro=http://koha-pro.tld>
264 Server host used to link to biblio record editing page.
266 =item B<--timeout=15>
268 Timeout for fetching URLs. By default 15 seconds.
270 =item B<--help|-h>
272 Print this help page.
274 =back
276 =cut