2 # Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
3 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
5 my $usage = "$0 [-j JOBS] [-s SLOW_THRESHOLD] URL_OF_INBOX\n";
8 use File
::Temp
qw(tempfile);
10 use Getopt
::Long
qw(:config gnu_getopt no_ignore_case auto_abbrev);
13 use POSIX
qw(:sys_wait_h);
14 use Time
::HiRes
qw(gettimeofday tv_interval);
18 # we want to use vfork+exec with spawn, WWW::Mechanize can use too much
19 # memory and fork(2) fails
20 use PublicInbox
::Spawn
qw(spawn which);
21 $ENV{PERL_INLINE_DIRECTORY
} or warn "PERL_INLINE_DIRECTORY unset, may OOM\n";
27 '-j|jobs=i' => \
$nproc,
28 '-s|slow-threshold=f' => \
$slow,
30 GetOptions
(%opts) or die "bad command-line args\n$usage";
31 my $root_url = shift or die $usage;
33 chomp(my $xmlstarlet = which
('xmlstarlet'));
34 my $atom_check = eval {
35 my $cmd = [ qw(xmlstarlet val -e -) ];
37 my ($in, $out, $err) = @_;
39 open my $in_fh, '+>', undef;
40 open my $out_fh, '+>', undef;
41 open my $err_fh, '+>', undef;
44 sysseek($in_fh, 0, 0);
50 my $pid = spawn
($cmd, undef, $rdr);
51 while (waitpid($pid, 0) != $pid) {
53 warn "waitpid(xmlstarlet, $pid) $!";
56 sysseek($out_fh, 0, 0);
57 sysread($out_fh, $$out, -s
$out_fh);
58 sysseek($err_fh, 0, 0);
59 sysread($err_fh, $$err, -s
$err_fh);
65 $SIG{INT
} = sub { exit 130 };
66 $SIG{TERM
} = sub { exit 0 };
69 my $pid = waitpid(-1, WNOHANG
);
70 return if !defined $pid || $pid <= 0;
71 my $p = delete $workers{$pid} || '(unknown)';
72 warn("$pid [$p] exited with $?\n") if $?
;
76 my @todo = IO
::Socket
->socketpair(AF_UNIX
, SOCK_SEQPACKET
, 0);
77 die "socketpair failed: $!" unless $todo[1];
78 my @done = IO
::Socket
->socketpair(AF_UNIX
, SOCK_SEQPACKET
, 0);
79 die "socketpair failed: $!" unless $done[1];
82 foreach my $p (1..$nproc) {
84 die "fork failed: $!\n" unless defined $pid;
90 worker_loop
($todo[0], $done[1]);
94 my ($fh, $tmp) = tempfile
('www-check-XXXX',
95 SUFFIX
=> '.gdbm', UNLINK
=> 1, TMPDIR
=> 1);
96 my $gdbm = tie
my %seen, 'GDBM_File', $tmp, &GDBM_WRCREAT
, 0600;
97 defined $gdbm or die "gdbm open failed: $!\n";
102 $todo[1]->blocking(0);
103 $done[0]->blocking(0);
104 $seen{$root_url} = 1;
107 my @queue = ($root_url);
108 my $timeout = $slow * 4;
109 while (keys %workers) { # reacts to SIGCHLD
112 vec($rvec, fileno($done[0]), 1) = 1;
114 vec($wvec, fileno($todo[1]), 1) = 1;
115 } elsif ($ndone == $nsent) {
116 kill 'TERM', keys %workers;
119 if (!select($rvec, $wvec, undef, $timeout)) {
120 while (my ($k, $v) = each %seen) {
122 print "WAIT ($ndone/$nsent) <$k>\n";
125 while ($u = shift @queue) {
126 my $s = $todo[1]->send($u, 0);
134 $r = $done[0]->recv($u, 65535, 0);
135 } while (!defined $r && $!{EINTR
});
137 if ($u =~ s/\ADONE\t//) {
149 my ($todo_rd, $done_wr) = @_;
150 $SIG{CHLD
} = 'DEFAULT';
151 my $m = WWW
::Mechanize
->new(autocheck
=> 0);
152 my $cc = LWP
::ConnCache
->new;
153 $m->stack_depth(0); # no history
156 $todo_rd->recv(my $u, 65535, 0);
159 my $t = [ gettimeofday
];
161 $t = tv_interval
($t);
162 printf "SLOW %0.06f % 5d %s\n", $t, $$, $u if $t > $slow;
164 if ($r->is_success) {
166 (split('#', $_->URI->abs->as_string))[0] => 1;
168 $_->tag && $_->url !~ /:/
170 @links = keys %links;
171 } elsif ($r->code != 300) {
172 warn "W: ".$r->code . " $u\n"
177 foreach my $l (@links, "DONE\t$u") {
178 next if $l eq '' || $l =~ /\.mbox(?:\.gz)\z/;
180 $s = $done_wr->send($l, 0);
181 } while (!defined $s && $!{EINTR
});
182 die "$$ send $!\n" unless defined $s;
184 die "$$ send truncated $s < $n\n" if $s != $n;
187 # make sure the HTML source doesn't screw up terminals
188 # when people curl the source (not remotely an expert
189 # on languages or encodings, here).
190 my $ct = $r->header('Content-Type') || '';
191 warn "no Content-Type: $u\n" if $ct eq '';
193 if ($atom_check && $ct =~ m!\bapplication/atom\+xml\b!) {
194 my $raw = $r->decoded_content;
195 my ($out, $err) = ('', '');
196 my $fail = $atom_check->(\
$raw, \
$out, \
$err);
197 warn "Atom ($fail) - $u - <1:$out> <2:$err>\n" if $fail;
200 next if $ct !~ m!\btext/html\b!;
201 my $dc = $r->decoded_content;
202 if ($dc =~ /([\x00-\x08\x0d-\x1f\x7f-\x{99999999}]+)/s) {