Initial checkin after first run
[andk-cpan-tools.git] / bin / basesmoker-antihang.pl
blob01833f1a07cd1d32bdde959af9ed2a8f21ee5181
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =head1 NAME
11 =head1 SYNOPSIS
15 =head1 OPTIONS
17 =over 8
19 =cut
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
23 =item B<--help|h!>
25 This help
27 =item B<--sleep=i>
29 After how many seconds we look again into the megalog file. Defaults to 15.
31 A longer value reduces the noise on the console, a shorter ensures
32 that we identify each problem earlier. For a while 120 was a good
33 default but then I discovered that PDL-2.006 was hung repeatedly
34 during testing and then I decided to reduce to 15. Now 150 in order to
35 have a whole day in one sceen window.
37 =item B<--longsleep=i>
39 After how many seconds we look again wheather a new megalog file is
40 there. Defaults to 600.
42 =back
44 =head1 DESCRIPTION
48 =head1 BUGS
50 - distrofilenames hardcoded
51 - no globs
53 =cut
56 use FindBin;
57 use lib "$FindBin::Bin/../lib";
58 BEGIN {
59 push @INC, qw( );
61 our $HAVE_SHUFFLE;
62 BEGIN {
63 $HAVE_SHUFFLE = eval { require Algorithm::Numerical::Shuffle; 1; };
65 use File::ReadBackwards;
66 use File::Which ();
67 use Getopt::Long;
68 use Hash::Util qw(lock_keys);
69 use List::Util qw(reduce);
70 use Pod::Usage qw(pod2usage);
71 use Proc::ProcessTable;
72 use Sys::Hostname ();
74 File::Which::which('lsof') or die "required external command 'lsof' not installed, refusing to run";
76 our %Opt;
77 lock_keys %Opt, map { /([^=|]+)/ } @opt;
78 GetOptions(\%Opt,
79 @opt,
80 ) or pod2usage(1);
81 if ($Opt{help}) {
82 pod2usage(0);
84 $Opt{sleep} = 150 unless defined $Opt{sleep};
85 $Opt{longsleep} = 600 unless defined $Opt{longsleep};
87 sub counting_sleep ($) {
88 my $sleep = shift;
89 my $start_time = time;
90 my $start_time_str = scalar localtime $start_time;
91 my $eta = $start_time + $sleep;
92 local($|)=1;
93 while () {
94 my $left = $eta - time;
95 last if $left < 0;
96 printf "\r%s: sleeping %d: %d ", $start_time_str, $sleep, $left;
97 sleep 1;
99 print "\n";
102 sub last_megalog () {
103 while () {
104 my $dh;
105 until (opendir $dh, ".") {
106 warn "Couldn't opendir .: $!; Sleeping\n";
107 counting_sleep $Opt{sleep};
109 my @have_mtime = grep { -e $_ } readdir $dh;
110 local($^T) = time;
111 my @mfiles = grep { -M $_ < 60/86400 }
112 grep { /megalog-\d+-\d+-\d+T\d+:\d+:\d+.log$/ } @have_mtime;
113 if ($HAVE_SHUFFLE) {
114 Algorithm::Numerical::Shuffle::shuffle(\@mfiles);
115 } else {
116 @mfiles = sort { $b cmp $a } @mfiles;
118 my $self_host;
119 for my $mfile (@mfiles) {
120 open my $fh, "<", $mfile or next;
121 local $/ = "\n";
122 while (<$fh>) {
123 next unless m{^perl\|-\>\s+\S+installed-perls/(?:perl|host/([^/]+))/};
124 my $megalog_host = $1 || "k83";
125 $self_host //= Sys::Hostname::hostname;
126 if ($self_host eq $megalog_host) {
127 return $mfile;
131 warn "Couldn't find an interesting megalog file; Sleeping\n";
132 counting_sleep $Opt{longsleep};
137 my $Lfile;
138 my $Lsize;
139 sub megalog_is_growing ($) {
140 my($file) = @_;
141 my $size0;
142 if ($Lfile && $Lfile eq $file) {
143 $size0 = $Lsize;
144 } else {
145 $size0 = -s $file;
146 my $sleep = 3; # arbitrary
147 counting_sleep $sleep;
149 my $size1 = -s $file;
150 $Lfile = $file;
151 $Lsize = $size1;
152 my $growing;
153 if ($size1 > $size0) {
154 $growing = 1; # "growing";
155 } else {
156 $growing = 0; # "NOT growing";
158 warn sprintf "watching %s%s growing\n", $file, $growing ? "" : " NOT";
159 return $growing;
163 sub kill_stalled_leaves ($) {
164 my($proc) = @_;
165 my $ppt = Proc::ProcessTable->new( 'enable_ttys' => 0 );
166 my $ref = $ppt->table;
167 my %parent_map = ($proc => 1);
168 my %family;
169 while () {
170 my $keys = keys %parent_map;
171 for my $p (@$ref) {
172 next if $p->state eq "defunct"; # zombie
173 if (exists $parent_map{$p->pid} && ! exists $family{$p->pid}) {
174 $family{$p->pid} = $p;
176 next unless $parent_map{$p->ppid};
177 $parent_map{$p->pid} = $p->ppid;
179 next if keys %parent_map > $keys;
180 last;
182 my $FORMAT = "%-6s %-6s %-8s %-24s %s\n";
183 printf($FORMAT, "PID", "PPID", "STAT", "START", "COMMAND");
184 my @tokill;
185 my %map2 = %parent_map;
186 while (%map2) {
187 my %values = map {($_ => 1)} values %map2; # parents
188 my @leaves = grep { ! $values{$_} } keys %map2;
189 unless (@tokill) {
190 @tokill = @leaves;
192 my $tp = pop @leaves;
193 delete $map2{$tp};
194 my $p = delete $family{$tp}
195 or die "Panic: unexpected non-value in family tp='$tp'";
196 printf($FORMAT,
197 $p->pid,
198 $p->ppid,
199 $p->state,
200 scalar(localtime($p->start)),
201 $p->cmndline);
203 warn "ATTENTION: About to kill @tokill\n";
204 my $signaled = kill 15, @tokill;
205 warn "successfully signaled $signaled processes with signal 15\n";
206 sleep 2;
207 TOKILL: for my $pid (@tokill) {
208 if (kill 0, $pid) {
209 warn "process $pid still alive? killing with SIGKILL\n";
210 if (kill 9, $pid) {
211 warn "sent signal successfully\n";
212 } else {
213 warn "could not signal $pid\: $!\n";
215 } else {
216 warn "verified that $pid is dead\n";
221 while () {
222 my $file = last_megalog;
223 if (megalog_is_growing($file)) {
224 my $bw = File::ReadBackwards->new( $file ) or
225 die "can't read '$file' $!" ;
226 my $log_line;
227 my $proc;
228 my $max = 0;
229 my $min = 9999999999;
230 while( defined( $log_line = $bw->readline ) ) {
231 last unless $log_line =~ /====/;
232 next unless $log_line =~ /={10}monitoring proc (\d+) perl (\S+) secs ([0-9\.]+)={7}/; # external invariable
233 $proc = $1;
234 my $perl = $2;
235 my $time = $3;
236 $max = $time if $time > $max;
237 $min = $time if $time < $min;
239 if ($proc) {
240 my $stalled = $max - $min;
241 if ($stalled > 3) { # arbitrary
242 my $cwd = `lsof -p $proc | awk '\$4=="cwd"{print \$9}'`;
243 chomp $cwd;
244 warn "process $proc stalled for $stalled seconds in $cwd\n";
245 my $timeout = 3600; # arbitrary
246 # these times are observed values how long the slowest
247 # operation of each distro can take without output
248 # when it is not hanging. undef stands for not
249 # measured
250 my $timeouts =
253 "Alien-FLTK2-0.09296" => undef,
254 "Alien-SDL-1.444" => undef,
255 "AnyEvent-Fork-Pool-1.2" => 30, # seen 2
256 "AnyEvent-JSONRPC-Lite-0.15" => undef,
257 "AnyEvent-Gearman-0.10" => 60, # seen 6, seen hanging with 5.22.2
258 "Badger-0.09" => 60, # seen 9, seen hanging as 5.12.5
259 "CGI-Application-4.50" => 30,
260 "CSS-Inliner-3935" => 10, #3934 whole job took 5 seconds
261 "CSS-Sass-3.3.6" => 60, # seen 7, seen hanging with 5.12.5
262 "Class-Std-0.011" => 10,
263 "Coro-6.41" => 10,
264 "DBIx-ObjectMapper-0.3013" => undef,
265 "DJabberd-0.85" => undef,
266 "Devel-TrackSIG-0.03" => 10,
267 "Devel-Trepan-0.73" => 300,
268 "Enbugger-2.016" => 300, # seen 38-96
269 "FFI-Platypus-0.40" => 300, # seen 160
270 "Feersum-1.405" => 30, # fails always anyway
271 "File-Flock-2014.01" => 90, # seen 14
272 "File-Lock-Multi-1.02" => 90, # seen 19
273 "Games-AlphaBeta-v0.4.7" => undef,
274 "Gearman-1.12" => 180, # seen 17
275 "HTML-EmbeddedPerl-0.91" => 30, # seen 0, https://rt.cpan.org/Ticket/Display.html?id=88731
276 "HTTP-Server-Simple-0.52" => 180, # seen 20
277 "IO-Socket-Socks-Wrapper-0.14" => 120, # seen 40
278 "IPC-Exe-2.002001" => 5,
279 "IPC-Pipeline-0.8" => 30,
280 "LWP-Protocol-Coro-http-v1.0.7" => 10,
281 "Module-Refresh-0.17" => 60, # seen 2, seen hanging in v5.12.5
282 "Mojolicious-Plugin-AssetPack-1.13" => 120, # seen 25
283 "Mojolicious-Plugin-AssetPack-1.24" => 120,
284 "Net-AMQP-RabbitMQ-0.007001" => undef,
285 "Net-AMQP-RabbitMQ-1.400000" => 110, # seen 53 total, but no stalls>127
286 "Net-LDAP-Server-Test-0.14" => undef,
287 "Net-Netcat-0.05" => undef,
288 "Object-Lazy-0.13" => 30,
289 "PDL-2.006" => undef,
290 "POE-Component-Client-HTTP-0.949"=> 120, # seen 33
291 "POE-Component-Client-Keepalive-0.272"=>300, # seen 40
292 "POEx-Role-PSGIServer-1.150280" => 30, # DEPREACATED
293 "POE-Quickie-0.18" => undef,
294 "Project-Euler-0.20" => 10,
295 "Prophet-0.751" => 120, # seen 40
296 "Redis-Jet-0.08" => 60, # seen 2
297 "RT-Client-REST-0.43" => 20,
298 "Server-Starter-0.12" => undef,
299 "Server-Starter-0.19" => undef,
300 "Server-Starter-0.32" => undef,
301 "Starlet-0.31" => undef,
302 "Starman-0.4014" => 360, # seen 152
303 "Sub-Call-Tail-0.05" => 1,
304 "Tapper-Testplan-4.1.2" => undef,
305 "Test-Moose-More-0.027" => 1, # haengt anscheinend immer, braucht keine Chance
306 "Test-HTTP-Server-Simple-0.11" => 60, # seen 0
307 "Tk-804.030" => 10,
308 "Twiggy-0.1025" => 120, # seen 6
309 "XML-Grammar-Fortune-0.0501" => undef,
310 "ZeroMQ-PubSub-0.10" => undef,
311 "math-image-109" => undef,
313 while (my($distro,$ttimeout) = each %$timeouts) {
314 if ($cwd =~ m!/\Q$distro\E-!) {
315 $timeout = $ttimeout || 120;
316 last;
319 keys %$timeouts; # do we need to reset the iterator here?
320 if ($stalled >= $timeout) { # arbitrary
321 warn sprintf "no output for %d seconds (timeout=%d)\n", $stalled, $timeout;
322 kill_stalled_leaves ($proc);
324 } else {
325 warn "working OK\n";
327 } else {
328 warn "is busy\n";
331 counting_sleep $Opt{sleep}; # arbitrary
334 # Local Variables:
335 # mode: cperl
336 # cperl-indent-level: 4
337 # End: