21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
29 After how many seconds we look again into the megalog file. Defaults to 15.
31 A longer value reduces the noise on the console, a shorter ensures
32 that we identify each problem earlier. For a while 120 was a good
33 default but then I discovered that PDL-2.006 was hung repeatedly
34 during testing and then I decided to reduce to 15. Now 150 in order to
35 have a whole day in one sceen window.
37 =item B<--longsleep=i>
39 After how many seconds we look again wheather a new megalog file is
40 there. Defaults to 600.
50 - distrofilenames hardcoded
57 use lib
"$FindBin::Bin/../lib";
63 $HAVE_SHUFFLE = eval { require Algorithm
::Numerical
::Shuffle
; 1; };
65 use File
::ReadBackwards
;
68 use Hash
::Util
qw(lock_keys);
69 use List
::Util
qw(reduce);
70 use Pod
::Usage
qw(pod2usage);
71 use Proc
::ProcessTable
;
74 File
::Which
::which
('lsof') or die "required external command 'lsof' not installed, refusing to run";
77 lock_keys
%Opt, map { /([^=|]+)/ } @opt;
84 $Opt{sleep} = 150 unless defined $Opt{sleep};
85 $Opt{longsleep
} = 600 unless defined $Opt{longsleep
};
87 sub counting_sleep
($) {
89 my $start_time = time;
90 my $start_time_str = scalar localtime $start_time;
91 my $eta = $start_time + $sleep;
94 my $left = $eta - time;
96 printf "\r%s: sleeping %d: %d ", $start_time_str, $sleep, $left;
102 sub last_megalog
() {
105 until (opendir $dh, ".") {
106 warn "Couldn't opendir .: $!; Sleeping\n";
107 counting_sleep
$Opt{sleep};
109 my @have_mtime = grep { -e
$_ } readdir $dh;
111 my @mfiles = grep { -M
$_ < 60/86400 }
112 grep { /megalog-\d+-\d+-\d+T\d+:\d+:\d+.log$/ } @have_mtime;
114 Algorithm
::Numerical
::Shuffle
::shuffle
(\
@mfiles);
116 @mfiles = sort { $b cmp $a } @mfiles;
119 for my $mfile (@mfiles) {
120 open my $fh, "<", $mfile or next;
123 next unless m{^perl\|-\>\s+\S+installed-perls/(?:perl|host/([^/]+))/};
124 my $megalog_host = $1 || "k83";
125 $self_host //= Sys
::Hostname
::hostname
;
126 if ($self_host eq $megalog_host) {
131 warn "Couldn't find an interesting megalog file; Sleeping\n";
132 counting_sleep
$Opt{longsleep
};
139 sub megalog_is_growing
($) {
142 if ($Lfile && $Lfile eq $file) {
146 my $sleep = 3; # arbitrary
147 counting_sleep
$sleep;
149 my $size1 = -s
$file;
153 if ($size1 > $size0) {
154 $growing = 1; # "growing";
156 $growing = 0; # "NOT growing";
158 warn sprintf "watching %s%s growing\n", $file, $growing ?
"" : " NOT";
163 sub kill_stalled_leaves
($) {
165 my $ppt = Proc
::ProcessTable
->new( 'enable_ttys' => 0 );
166 my $ref = $ppt->table;
167 my %parent_map = ($proc => 1);
170 my $keys = keys %parent_map;
172 next if $p->state eq "defunct"; # zombie
173 if (exists $parent_map{$p->pid} && ! exists $family{$p->pid}) {
174 $family{$p->pid} = $p;
176 next unless $parent_map{$p->ppid};
177 $parent_map{$p->pid} = $p->ppid;
179 next if keys %parent_map > $keys;
182 my $FORMAT = "%-6s %-6s %-8s %-24s %s\n";
183 printf($FORMAT, "PID", "PPID", "STAT", "START", "COMMAND");
185 my %map2 = %parent_map;
187 my %values = map {($_ => 1)} values %map2; # parents
188 my @leaves = grep { ! $values{$_} } keys %map2;
192 my $tp = pop @leaves;
194 my $p = delete $family{$tp}
195 or die "Panic: unexpected non-value in family tp='$tp'";
200 scalar(localtime($p->start)),
203 warn "ATTENTION: About to kill @tokill\n";
204 my $signaled = kill 15, @tokill;
205 warn "successfully signaled $signaled processes with signal 15\n";
207 TOKILL
: for my $pid (@tokill) {
209 warn "process $pid still alive? killing with SIGKILL\n";
211 warn "sent signal successfully\n";
213 warn "could not signal $pid\: $!\n";
216 warn "verified that $pid is dead\n";
222 my $file = last_megalog
;
223 if (megalog_is_growing
($file)) {
224 my $bw = File
::ReadBackwards
->new( $file ) or
225 die "can't read '$file' $!" ;
229 my $min = 9999999999;
230 while( defined( $log_line = $bw->readline ) ) {
233 /^(\QCan
't exec "": No such file or directory at\E
234 |\Qnew worker \E\d+\Q seems to have failed to start, exit status\E
235 |\Qstarting new worker\E
237 last unless $log_line =~ /====/;
238 next unless $log_line =~ /={10}monitoring proc (\d+) perl (\S+) secs ([0-9\.]+)={7}/; # external invariable
242 $max = $time if $time > $max;
243 $min = $time if $time < $min;
246 my $stalled = $max - $min;
247 if ($stalled > 3) { # arbitrary
248 my $cwd = `lsof -p $proc | awk '\
$4=="cwd"{print \
$9}'`;
250 warn "process $proc stalled for $stalled seconds in $cwd\n";
251 my $timeout = 3600; # arbitrary
252 # these times are observed values how long the slowest
253 # operation of each distro can take without output
254 # when it is not hanging. undef stands for not
258 "Algorithm-LinearManifoldDataClusterer-1.01" => 60, # seen 2
259 "Alien-FLTK2-0.09296" => undef,
260 "Alien-SDL-1.444" => undef,
261 "Analizo-1.22.0" => 600, # seen 190
262 "AnyEvent-AggressiveIdle-0.04" => 120, # seen 3
263 "AnyEvent-Fork-Pool-1.2" => 30, # seen 2
264 "AnyEvent-JSONRPC-Lite-0.15" => undef,
265 "AnyEvent-Gearman-0.10" => 60, # seen 6, seen hanging with 5.22.2
266 "AnyEvent-RetryTimer-0.1" => 60, # seen 2
267 "AnyEvent-Tickit-0.01" => 60, # seen nothing
268 "Apache-SessionX-2.01.tar.gz" => undef,
269 "ARCv2-1.05" => 60, # seen 4
270 "Badger-0.09" => 60, # seen 9, seen hanging as 5.12.5
271 "CGI-Application-4.50" => 30,
272 "CGI-Debug-1.0" => 30, # abandoned since 2000?
273 "CSS-Inliner-3935" => 10, #3934 whole job took 5 seconds
274 "CSS-Sass-3.3.6" => 60, # seen 7, seen hanging with 5.12.5
275 "Cache-Memcached-Semaphore-0.3" => 60, # seems abandoned
276 "Chart-GRACE-0.95" => 60, # declared abandoned in 01.DISABLED
277 "Class-Std-0.011" => 10,
278 "Concurrent-Object-1.07" => 30, # seen 1
280 "Data-STUID-0.01" => 60, # nothing seen
281 "DBIx-ObjectMapper-0.3013" => undef,
282 "DBIx-XMLServer-0.02" => 60,
283 "DJabberd-0.85" => undef,
284 "Daemon-Device-1.07" => 60, # seen 7
285 "Devel-TrackSIG-0.03" => 10,
286 "Devel-Trepan-0.73" => 300,
287 "ETL-Yertl-0.043" => 300, # seen 42
288 "Emacs-EPL-0.7" => 30, # seen 1
289 "Enbugger-2.016" => 300, # seen 38-96
290 "FAQ-OMatic-2.702" => 60, # nothing seen
291 "FAQ-OMatic-2.719" => 60, # seen 1
292 "FFI-Platypus-0.40" => 300, # seen 160
293 "Feersum-1.405" => 30, # fails always anyway
294 "File-BSDGlob-0.94" => 30, # seen 1 https://rt.cpan.org/Ticket/Display.html?id=95586
295 "File-Flock-2014.01" => 90, # seen 14
296 "File-Lock-Multi-1.02" => 90, # seen 19
297 "FreeRADIUS-Database-0.06" => 90, # seen 8
298 "Games-AlphaBeta-v0.4.7" => undef,
299 "Games-Rezrov-0.20" => 120, # seen nothing
300 "Games-Rezrov-0.15" => 120, # seen nothing
301 "Gearman-1.12" => 180, # seen 17
302 "HTML-EmbeddedPerl-0.91" => 30, # seen 0, https://rt.cpan.org/Ticket/Display.html?id=88731
303 "HTTP-Proxy-0.304" => 30, # have reported the hang
304 "HTTP-Server-Simple-0.52" => 180, # seen 20
305 "IO-Socket-Socks-Wrapper-0.14" => 120, # seen 40
306 "IPC-Exe-2.002001" => 5,
307 "IPC-Pipeline-0.8" => 30,
308 "JIRA-REST-Class-0.12" => 60, # seen 3
309 "Jvm-0.9.2" => 60, # seen nothing
310 "LWP-Protocol-Coro-http-v1.0.7" => 10,
311 "Lingua-POSAlign-0.01" => 10, # seen 0
312 "MMM-Text-Search-0.07" => 300, # seen nothing
313 "Mail-MtPolicyd-2.03" => 60, # seen nothing
314 "Mail-SendEasy-1.2" => 60, # seen nothing
315 "Module-Refresh-0.17" => 60, # seen 2, seen hanging in v5.12.5
316 "Mojolicious-Plugin-AssetPack-1.13" => 120, # seen 25
317 "Mojolicious-Plugin-AssetPack-1.24" => 120,
318 "Monoceros-0.27" => 60, # seen 12
319 "MySQL-TableInfo-1.01" => 60, # nothing seen
320 "Net-AMQP-RabbitMQ-0.007001" => undef,
321 "Net-AMQP-RabbitMQ-1.400000" => 110, # seen 53 total, but no stalls>127
322 "Net-AMQP-RabbitMQ-2.30000" => 110,
323 "Net-Dropbear-0.10" => 300, # seen 13
324 "Net-Goofey-1.4" => 60, # seen nothing
325 "Net-LDAP-Server-Test-0.14" => undef,
326 "Net-Netcat-0.05" => undef,
327 "NetServer-Generic-1.03" => 3600, # haengt, aber am Schluss gibts ok
328 "nsapi_perl-0.24" => 60, # seem nothing
329 "Object-Lazy-0.13" => 30,
330 "OpenFrame-Segment-Apache-1.20" => 60, # seen 1
331 "PDF-Create-1.43" => 60, # seen 3
332 "PDL-2.006" => undef,
333 "POE-Component-Client-HTTP-0.949"=> 120, # seen 33
334 "POE-Component-Client-Keepalive-0.272"=>300, # seen 40
335 "POE-Loop-IO_Async-0.004" => 120, # seen 36
336 "POEx-Role-PSGIServer-1.150280" => 30, # DEPREACATED
337 "POE-Quickie-0.18" => undef,
338 "Pod-Trial-LinkImg-0.005" => 30, # seen 1
339 "PerlQt-3.006" => 120, # seen 13
340 "PerlQt-3.008" => 120, # seen 13
341 "Project-Euler-0.20" => 10,
342 "Prophet-0.751" => 120, # seen 40
343 "RAS-PortMaster-1.16" => 120, # seen nothing
344 "Redis-Jet-0.08" => 60, # seen 2
345 "RT-Client-REST-0.43" => 20,
346 "Search-Sitemap-2.13" => 120, # seen 6
347 "Server-Starter-0.12" => undef,
348 "Server-Starter-0.19" => undef,
349 "Server-Starter-0.32" => undef,
350 "Server-Starter-0.34" => 300, # seen 94
351 "Starlet-0.31" => 300, # seen 16
352 "Starman-0.4014" => 360, # seen 152
353 "Sub-Call-Tail-0.05" => 1,
354 "Sudo-0.33" => 1800, # seen 941
355 "Sybase-Xfer-0.63" => 123, # nothing seen
356 "Tangram-2.04" => 60, # nothing seen
357 "Tapper-Testplan-4.1.2" => undef,
358 "Test-Moose-More-0.027" => 1, # haengt anscheinend immer, braucht keine Chance
359 "Test-HTTP-Server-Simple-0.11" => 60, # seen 0
360 "Test-WWW-Simple-0.39" => 120, # seen 38
362 "TripleStore-0.03" => 0, # seen 60
363 "Twiggy-0.1025" => 120, # seen 6
364 "UNIVERSAL-isa-1.20171012" => 60, # seen 1
365 "Vim-Debug-0.904" => 120, # seen 14
366 "What-1.00" => 60, # seen 0
367 "Win32-MSAgent-0.07" => 60, # nothing seen
368 "Winamp-Control-0.2.1" => 60, # vermute userinputversuch
369 "XML-Grammar-Fortune-0.0501" => undef,
370 "X11-Protocol-0.56" => 60, # seen ok 1..3
371 "ZeroMQ-PubSub-0.10" => undef,
372 "math-image-109" => undef,
374 while (my($distro,$ttimeout) = each %$timeouts) {
375 if ($cwd =~ m!/\Q$distro\E-!) {
376 $timeout = $ttimeout || 120;
380 keys %$timeouts; # do we need to reset the iterator here?
381 if ($stalled >= $timeout) { # arbitrary
382 warn sprintf "no output for %d seconds (timeout=%d)\n", $stalled, $timeout;
383 kill_stalled_leaves ($proc);
392 counting_sleep $Opt{sleep}; # arbitrary
397 # cperl-indent-level: 4