add a hanging distro
[andk-cpan-tools.git] / bin / basesmoker-antihang.pl
blob0595e0c4c8bc555fc491ac704c44814b830c44ed
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =head1 NAME
11 =head1 SYNOPSIS
15 =head1 OPTIONS
17 =over 8
19 =cut
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
23 =item B<--help|h!>
25 This help
27 =item B<--sleep=i>
29 After how many seconds we look again into the megalog file. Defaults to 15.
31 A longer value reduces the noise on the console, a shorter ensures
32 that we identify each problem earlier. For a while 120 was a good
33 default but then I discovered that PDL-2.006 was hung repeatedly
34 during testing and then I decided to reduce to 15. Now 150 in order to
35 have a whole day in one sceen window.
37 =item B<--longsleep=i>
39 After how many seconds we look again wheather a new megalog file is
40 there. Defaults to 600.
42 =back
44 =head1 DESCRIPTION
48 =head1 BUGS
50 - distrofilenames hardcoded
51 - no globs
53 =cut
56 use FindBin;
57 use lib "$FindBin::Bin/../lib";
58 BEGIN {
59 push @INC, qw( );
61 our $HAVE_SHUFFLE;
62 BEGIN {
63 $HAVE_SHUFFLE = eval { require Algorithm::Numerical::Shuffle; 1; };
65 use File::ReadBackwards;
66 use File::Which ();
67 use Getopt::Long;
68 use Hash::Util qw(lock_keys);
69 use List::Util qw(reduce);
70 use Pod::Usage qw(pod2usage);
71 use Proc::ProcessTable;
72 use Sys::Hostname ();
74 File::Which::which('lsof') or die "required external command 'lsof' not installed, refusing to run";
76 our %Opt;
77 lock_keys %Opt, map { /([^=|]+)/ } @opt;
78 GetOptions(\%Opt,
79 @opt,
80 ) or pod2usage(1);
81 if ($Opt{help}) {
82 pod2usage(0);
84 $Opt{sleep} = 150 unless defined $Opt{sleep};
85 $Opt{longsleep} = 600 unless defined $Opt{longsleep};
87 sub counting_sleep ($) {
88 my $sleep = shift;
89 my $start_time = time;
90 my $start_time_str = scalar localtime $start_time;
91 my $eta = $start_time + $sleep;
92 local($|)=1;
93 while () {
94 my $left = $eta - time;
95 last if $left < 0;
96 printf "\r%s: sleeping %d: %d ", $start_time_str, $sleep, $left;
97 sleep 1;
99 print "\n";
102 sub last_megalog () {
103 while () {
104 my $dh;
105 until (opendir $dh, ".") {
106 warn "Couldn't opendir .: $!; Sleeping\n";
107 counting_sleep $Opt{sleep};
109 my @have_mtime = grep { -e $_ } readdir $dh;
110 local($^T) = time;
111 my @mfiles = grep { -M $_ < 60/86400 }
112 grep { /megalog-\d+-\d+-\d+T\d+:\d+:\d+.log$/ } @have_mtime;
113 if ($HAVE_SHUFFLE) {
114 Algorithm::Numerical::Shuffle::shuffle(\@mfiles);
115 } else {
116 @mfiles = sort { $b cmp $a } @mfiles;
118 my $self_host;
119 for my $mfile (@mfiles) {
120 open my $fh, "<", $mfile or next;
121 local $/ = "\n";
122 while (<$fh>) {
123 next unless m{^perl\|-\>\s+\S+installed-perls/(?:perl|host/([^/]+))/};
124 my $megalog_host = $1 || "k83";
125 $self_host //= Sys::Hostname::hostname;
126 if ($self_host eq $megalog_host) {
127 return $mfile;
131 warn "Couldn't find an interesting megalog file; Sleeping\n";
132 counting_sleep $Opt{longsleep};
137 my $Lfile;
138 my $Lsize;
139 sub megalog_is_growing ($) {
140 my($file) = @_;
141 my $size0;
142 if ($Lfile && $Lfile eq $file) {
143 $size0 = $Lsize;
144 } else {
145 $size0 = -s $file;
146 my $sleep = 3; # arbitrary
147 counting_sleep $sleep;
149 my $size1 = -s $file;
150 $Lfile = $file;
151 $Lsize = $size1;
152 my $growing;
153 if ($size1 > $size0) {
154 $growing = 1; # "growing";
155 } else {
156 $growing = 0; # "NOT growing";
158 warn sprintf "watching %s%s growing\n", $file, $growing ? "" : " NOT";
159 return $growing;
163 sub kill_stalled_leaves ($) {
164 my($proc) = @_;
165 my $ppt = Proc::ProcessTable->new( 'enable_ttys' => 0 );
166 my $ref = $ppt->table;
167 my %parent_map = ($proc => 1);
168 my %family;
169 while () {
170 my $keys = keys %parent_map;
171 for my $p (@$ref) {
172 next if $p->state eq "defunct"; # zombie
173 if (exists $parent_map{$p->pid} && ! exists $family{$p->pid}) {
174 $family{$p->pid} = $p;
176 next unless $parent_map{$p->ppid};
177 $parent_map{$p->pid} = $p->ppid;
179 next if keys %parent_map > $keys;
180 last;
182 my $FORMAT = "%-6s %-6s %-8s %-24s %s\n";
183 printf($FORMAT, "PID", "PPID", "STAT", "START", "COMMAND");
184 my @tokill;
185 my %map2 = %parent_map;
186 while (%map2) {
187 my %values = map {($_ => 1)} values %map2; # parents
188 my @leaves = grep { ! $values{$_} } keys %map2;
189 unless (@tokill) {
190 @tokill = @leaves;
192 my $tp = pop @leaves;
193 delete $map2{$tp};
194 my $p = delete $family{$tp}
195 or die "Panic: unexpected non-value in family tp='$tp'";
196 printf($FORMAT,
197 $p->pid,
198 $p->ppid,
199 $p->state,
200 scalar(localtime($p->start)),
201 $p->cmndline);
203 warn "ATTENTION: About to kill @tokill\n";
204 my $signaled = kill 15, @tokill;
205 warn "successfully signaled $signaled processes with signal 15\n";
206 sleep 2;
207 TOKILL: for my $pid (@tokill) {
208 if (kill 0, $pid) {
209 warn "process $pid still alive? killing with SIGKILL\n";
210 if (kill 9, $pid) {
211 warn "sent signal successfully\n";
212 } else {
213 warn "could not signal $pid\: $!\n";
215 } else {
216 warn "verified that $pid is dead\n";
221 while () {
222 my $file = last_megalog;
223 if (megalog_is_growing($file)) {
224 my $bw = File::ReadBackwards->new( $file ) or
225 die "can't read '$file' $!" ;
226 my $log_line;
227 my $proc;
228 my $max = 0;
229 my $min = 9999999999;
230 while( defined( $log_line = $bw->readline ) ) {
231 # hanging Starlet:
232 next if $log_line =~
233 /^(\QCan't exec "": No such file or directory at\E
234 |\Qnew worker \E\d+\Q seems to have failed to start, exit status\E
235 |\Qstarting new worker\E
236 )/x;
237 last unless $log_line =~ /====/;
238 next unless $log_line =~ /={10}monitoring proc (\d+) perl (\S+) secs ([0-9\.]+)={7}/; # external invariable
239 $proc = $1;
240 my $perl = $2;
241 my $time = $3;
242 $max = $time if $time > $max;
243 $min = $time if $time < $min;
245 if ($proc) {
246 my $stalled = $max - $min;
247 if ($stalled > 3) { # arbitrary
248 my $cwd = `lsof -p $proc | awk '\$4=="cwd"{print \$9}'`;
249 chomp $cwd;
250 warn "process $proc stalled for $stalled seconds in $cwd\n";
251 my $timeout = 3600; # arbitrary
252 # these times are observed values how long the slowest
253 # operation of each distro can take without output
254 # when it is not hanging. undef stands for not
255 # measured
256 my $timeouts =
258 "Algorithm-LinearManifoldDataClusterer-1.01" => 60, # seen 2
259 "Alien-FLTK2-0.09296" => undef,
260 "Alien-SDL-1.444" => undef,
261 "Analizo-1.22.0" => 600, # seen 190
262 "AnyEvent-AggressiveIdle-0.04" => 120, # seen 3
263 "AnyEvent-Fork-Pool-1.2" => 30, # seen 2
264 "AnyEvent-JSONRPC-Lite-0.15" => undef,
265 "AnyEvent-Gearman-0.10" => 60, # seen 6, seen hanging with 5.22.2
266 "AnyEvent-RetryTimer-0.1" => 60, # seen 2
267 "AnyEvent-Tickit-0.01" => 60, # seen nothing
268 "Apache-SessionX-2.01.tar.gz" => undef,
269 "ARCv2-1.05" => 60, # seen 4
270 "Badger-0.09" => 60, # seen 9, seen hanging as 5.12.5
271 "CGI-Application-4.50" => 30,
272 "CGI-Debug-1.0" => 30, # abandoned since 2000?
273 "CSS-Inliner-3935" => 10, #3934 whole job took 5 seconds
274 "CSS-Sass-3.3.6" => 60, # seen 7, seen hanging with 5.12.5
275 "Cache-Memcached-Semaphore-0.3" => 60, # seems abandoned
276 "Chart-GRACE-0.95" => 60, # declared abandoned in 01.DISABLED
277 "Class-Std-0.011" => 10,
278 "Concurrent-Object-1.07" => 30, # seen 1
279 "Coro-6.41" => 10,
280 "Data-STUID-0.01" => 60, # nothing seen
281 "DBIx-ObjectMapper-0.3013" => undef,
282 "DBIx-XMLServer-0.02" => 60,
283 "DJabberd-0.85" => undef,
284 "Daemon-Device-1.07" => 60, # seen 7
285 "Devel-TrackSIG-0.03" => 10,
286 "Devel-Trepan-0.73" => 300,
287 "ETL-Yertl-0.043" => 300, # seen 42
288 "Emacs-EPL-0.7" => 30, # seen 1
289 "Enbugger-2.016" => 300, # seen 38-96
290 "FAQ-OMatic-2.702" => 60, # nothing seen
291 "FAQ-OMatic-2.719" => 60, # seen 1
292 "FFI-Platypus-0.40" => 300, # seen 160
293 "Feersum-1.405" => 30, # fails always anyway
294 "File-BSDGlob-0.94" => 30, # seen 1 https://rt.cpan.org/Ticket/Display.html?id=95586
295 "File-Flock-2014.01" => 90, # seen 14
296 "File-Lock-Multi-1.02" => 90, # seen 19
297 "FreeRADIUS-Database-0.06" => 90, # seen 8
298 "Games-AlphaBeta-v0.4.7" => undef,
299 "Games-Rezrov-0.20" => 120, # seen nothing
300 "Games-Rezrov-0.15" => 120, # seen nothing
301 "Gearman-1.12" => 180, # seen 17
302 "HTML-EmbeddedPerl-0.91" => 30, # seen 0, https://rt.cpan.org/Ticket/Display.html?id=88731
303 "HTTP-Proxy-0.304" => 30, # have reported the hang
304 "HTTP-Server-Simple-0.52" => 180, # seen 20
305 "IO-Socket-Socks-Wrapper-0.14" => 120, # seen 40
306 "IPC-Exe-2.002001" => 5,
307 "IPC-Pipeline-0.8" => 30,
308 "JIRA-REST-Class-0.12" => 60, # seen 3
309 "Jvm-0.9.2" => 60, # seen nothing
310 "LWP-Protocol-Coro-http-v1.0.7" => 10,
311 "Lingua-POSAlign-0.01" => 10, # seen 0
312 "MMM-Text-Search-0.07" => 300, # seen nothing
313 "Mail-MtPolicyd-2.03" => 60, # seen nothing
314 "Mail-SendEasy-1.2" => 60, # seen nothing
315 "Module-Refresh-0.17" => 60, # seen 2, seen hanging in v5.12.5
316 "Mojolicious-Plugin-AssetPack-1.13" => 120, # seen 25
317 "Mojolicious-Plugin-AssetPack-1.24" => 120,
318 "Monoceros-0.27" => 60, # seen 12
319 "MySQL-TableInfo-1.01" => 60, # nothing seen
320 "Net-AMQP-RabbitMQ-0.007001" => undef,
321 "Net-AMQP-RabbitMQ-1.400000" => 110, # seen 53 total, but no stalls>127
322 "Net-AMQP-RabbitMQ-2.30000" => 110,
323 "Net-Dropbear-0.10" => 300, # seen 13
324 "Net-Goofey-1.4" => 60, # seen nothing
325 "Net-LDAP-Server-Test-0.14" => undef,
326 "Net-Netcat-0.05" => undef,
327 "NetServer-Generic-1.03" => 3600, # haengt, aber am Schluss gibts ok
328 "nsapi_perl-0.24" => 60, # seem nothing
329 "Object-Lazy-0.13" => 30,
330 "OpenFrame-Segment-Apache-1.20" => 60, # seen 1
331 "PDF-Create-1.43" => 60, # seen 3
332 "PDL-2.006" => undef,
333 "POE-Component-Client-HTTP-0.949"=> 120, # seen 33
334 "POE-Component-Client-Keepalive-0.272"=>300, # seen 40
335 "POE-Loop-IO_Async-0.004" => 120, # seen 36
336 "POEx-Role-PSGIServer-1.150280" => 30, # DEPREACATED
337 "POE-Quickie-0.18" => undef,
338 "Pod-Trial-LinkImg-0.005" => 30, # seen 1
339 "PerlQt-3.006" => 120, # seen 13
340 "PerlQt-3.008" => 120, # seen 13
341 "Project-Euler-0.20" => 10,
342 "Prophet-0.751" => 120, # seen 40
343 "RAS-PortMaster-1.16" => 120, # seen nothing
344 "Redis-Jet-0.08" => 60, # seen 2
345 "RT-Client-REST-0.43" => 20,
346 "Search-Sitemap-2.13" => 120, # seen 6
347 "Server-Starter-0.12" => undef,
348 "Server-Starter-0.19" => undef,
349 "Server-Starter-0.32" => undef,
350 "Server-Starter-0.34" => 300, # seen 94
351 "Starlet-0.31" => 300, # seen 16
352 "Starman-0.4014" => 360, # seen 152
353 "Sub-Call-Tail-0.05" => 1,
354 "Sudo-0.33" => 1800, # seen 941
355 "Sybase-Xfer-0.63" => 123, # nothing seen
356 "Tangram-2.04" => 60, # nothing seen
357 "Tapper-Testplan-4.1.2" => undef,
358 "Test-Moose-More-0.027" => 1, # haengt anscheinend immer, braucht keine Chance
359 "Test-HTTP-Server-Simple-0.11" => 60, # seen 0
360 "Test-WWW-Simple-0.39" => 120, # seen 38
361 "Tk-804.030" => 10,
362 "TripleStore-0.03" => 0, # seen 60
363 "Twiggy-0.1025" => 120, # seen 6
364 "UNIVERSAL-isa-1.20171012" => 60, # seen 1
365 "Vim-Debug-0.904" => 120, # seen 14
366 "What-1.00" => 60, # seen 0
367 "Win32-MSAgent-0.07" => 60, # nothing seen
368 "Winamp-Control-0.2.1" => 60, # vermute userinputversuch
369 "XML-Grammar-Fortune-0.0501" => undef,
370 "X11-Protocol-0.56" => 60, # seen ok 1..3
371 "ZeroMQ-PubSub-0.10" => undef,
372 "math-image-109" => undef,
374 while (my($distro,$ttimeout) = each %$timeouts) {
375 if ($cwd =~ m!/\Q$distro\E-!) {
376 $timeout = $ttimeout || 120;
377 last;
380 keys %$timeouts; # do we need to reset the iterator here?
381 if ($stalled >= $timeout) { # arbitrary
382 warn sprintf "no output for %d seconds (timeout=%d)\n", $stalled, $timeout;
383 kill_stalled_leaves ($proc);
385 } else {
386 warn "working OK\n";
388 } else {
389 warn "is busy\n";
392 counting_sleep $Opt{sleep}; # arbitrary
395 # Local Variables:
396 # mode: cperl
397 # cperl-indent-level: 4
398 # End: