21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
29 After how many seconds we look again into the megalog file. Defaults to 15.
31 A longer value reduces the noise on the console, a shorter ensures
32 that we identify each problem earlier. For a while 120 was a good
33 default but then I discovered that PDL-2.006 was hung repeatedly
34 during testing and then I decided to reduce to 15. Now 150 in order to
35 have a whole day in one sceen window.
37 =item B<--longsleep=i>
39 After how many seconds we look again wheather a new megalog file is
40 there. Defaults to 600.
50 - distrofilenames hardcoded
57 use lib
"$FindBin::Bin/../lib";
63 $HAVE_SHUFFLE = eval { require Algorithm
::Numerical
::Shuffle
; 1; };
65 use File
::ReadBackwards
;
68 use Hash
::Util
qw(lock_keys);
69 use List
::Util
qw(reduce);
70 use Pod
::Usage
qw(pod2usage);
71 use Proc
::ProcessTable
;
74 File
::Which
::which
('lsof') or die "required external command 'lsof' not installed, refusing to run";
77 lock_keys
%Opt, map { /([^=|]+)/ } @opt;
84 $Opt{sleep} = 150 unless defined $Opt{sleep};
85 $Opt{longsleep
} = 600 unless defined $Opt{longsleep
};
87 sub counting_sleep
($) {
89 my $start_time = time;
90 my $start_time_str = scalar localtime $start_time;
91 my $eta = $start_time + $sleep;
94 my $left = $eta - time;
96 printf "\r%s: sleeping %d: %d ", $start_time_str, $sleep, $left;
102 sub last_megalog
() {
105 until (opendir $dh, ".") {
106 warn "Couldn't opendir .: $!; Sleeping\n";
107 counting_sleep
$Opt{sleep};
109 my @have_mtime = grep { -e
$_ } readdir $dh;
111 my @mfiles = grep { -M
$_ < 60/86400 }
112 grep { /megalog-\d+-\d+-\d+T\d+:\d+:\d+.log$/ } @have_mtime;
114 Algorithm
::Numerical
::Shuffle
::shuffle
(\
@mfiles);
116 @mfiles = sort { $b cmp $a } @mfiles;
119 for my $mfile (@mfiles) {
120 open my $fh, "<", $mfile or next;
123 next unless m{^perl\|-\>\s+\S+installed-perls/(?:perl|host/([^/]+))/};
124 my $megalog_host = $1 || "k83";
125 $self_host //= Sys
::Hostname
::hostname
;
126 if ($self_host eq $megalog_host) {
131 warn "Couldn't find an interesting megalog file; Sleeping\n";
132 counting_sleep
$Opt{longsleep
};
139 sub megalog_is_growing
($) {
142 if ($Lfile && $Lfile eq $file) {
146 my $sleep = 3; # arbitrary
147 counting_sleep
$sleep;
149 my $size1 = -s
$file;
153 if ($size1 > $size0) {
154 $growing = 1; # "growing";
156 $growing = 0; # "NOT growing";
158 warn sprintf "watching %s%s growing\n", $file, $growing ?
"" : " NOT";
163 sub kill_stalled_leaves
($) {
165 my $ppt = Proc
::ProcessTable
->new( 'enable_ttys' => 0 );
166 my $ref = $ppt->table;
167 my %parent_map = ($proc => 1);
170 my $keys = keys %parent_map;
172 next if $p->state eq "defunct"; # zombie
173 if (exists $parent_map{$p->pid} && ! exists $family{$p->pid}) {
174 $family{$p->pid} = $p;
176 next unless $parent_map{$p->ppid};
177 $parent_map{$p->pid} = $p->ppid;
179 next if keys %parent_map > $keys;
182 my $FORMAT = "%-6s %-6s %-8s %-24s %s\n";
183 printf($FORMAT, "PID", "PPID", "STAT", "START", "COMMAND");
185 my %map2 = %parent_map;
187 my %values = map {($_ => 1)} values %map2; # parents
188 my @leaves = grep { ! $values{$_} } keys %map2;
192 my $tp = pop @leaves;
194 my $p = delete $family{$tp}
195 or die "Panic: unexpected non-value in family tp='$tp'";
200 scalar(localtime($p->start)),
203 warn "ATTENTION: About to kill @tokill\n";
204 my $signaled = kill 15, @tokill;
205 warn "successfully signaled $signaled processes with signal 15\n";
207 TOKILL
: for my $pid (@tokill) {
209 warn "process $pid still alive? killing with SIGKILL\n";
211 warn "sent signal successfully\n";
213 warn "could not signal $pid\: $!\n";
216 warn "verified that $pid is dead\n";
222 my $file = last_megalog
;
223 if (megalog_is_growing
($file)) {
224 my $bw = File
::ReadBackwards
->new( $file ) or
225 die "can't read '$file' $!" ;
229 my $min = 9999999999;
230 while( defined( $log_line = $bw->readline ) ) {
231 last unless $log_line =~ /====/;
232 next unless $log_line =~ /={10}monitoring proc (\d+) perl (\S+) secs ([0-9\.]+)={7}/; # external invariable
236 $max = $time if $time > $max;
237 $min = $time if $time < $min;
240 my $stalled = $max - $min;
241 if ($stalled > 3) { # arbitrary
242 my $cwd = `lsof -p $proc | awk '\$4=="cwd"{print \$9}'`;
244 warn "process $proc stalled for $stalled seconds in $cwd\n";
245 my $timeout = 3600; # arbitrary
246 # these times are observed values how long the slowest
247 # operation of each distro can take without output
248 # when it is not hanging. undef stands for not
253 "Alien-FLTK2-0.09296" => undef,
254 "Alien-SDL-1.444" => undef,
255 "AnyEvent-Fork-Pool-1.2" => 30, # seen 2
256 "AnyEvent-JSONRPC-Lite-0.15" => undef,
257 "AnyEvent-Gearman-0.10" => 60, # seen 6, seen hanging with 5.22.2
258 "Badger-0.09" => 60, # seen 9, seen hanging as 5.12.5
259 "CGI-Application-4.50" => 30,
260 "CSS-Inliner-3935" => 10, #3934 whole job took 5 seconds
261 "CSS-Sass-3.3.6" => 60, # seen 7, seen hanging with 5.12.5
262 "Class-Std-0.011" => 10,
264 "DBIx-ObjectMapper-0.3013" => undef,
265 "DJabberd-0.85" => undef,
266 "Devel-TrackSIG-0.03" => 10,
267 "Devel-Trepan-0.73" => 300,
268 "Enbugger-2.016" => 300, # seen 38-96
269 "FFI-Platypus-0.40" => 300, # seen 160
270 "Feersum-1.405" => 30, # fails always anyway
271 "File-Flock-2014.01" => 90, # seen 14
272 "File-Lock-Multi-1.02" => 90, # seen 19
273 "Games-AlphaBeta-v0.4.7" => undef,
274 "Gearman-1.12" => 180, # seen 17
275 "HTML-EmbeddedPerl-0.91" => 30, # seen 0, https://rt.cpan.org/Ticket/Display.html?id=88731
276 "HTTP-Server-Simple-0.52" => 180, # seen 20
277 "IO-Socket-Socks-Wrapper-0.14" => 120, # seen 40
278 "IPC-Exe-2.002001" => 5,
279 "IPC-Pipeline-0.8" => 30,
280 "LWP-Protocol-Coro-http-v1.0.7" => 10,
281 "Module-Refresh-0.17" => 60, # seen 2, seen hanging in v5.12.5
282 "Mojolicious-Plugin-AssetPack-1.13" => 120, # seen 25
283 "Mojolicious-Plugin-AssetPack-1.24" => 120,
284 "Net-AMQP-RabbitMQ-0.007001" => undef,
285 "Net-AMQP-RabbitMQ-1.400000" => 110, # seen 53 total, but no stalls>127
286 "Net-LDAP-Server-Test-0.14" => undef,
287 "Net-Netcat-0.05" => undef,
288 "Object-Lazy-0.13" => 30,
289 "PDL-2.006" => undef,
290 "POE-Component-Client-HTTP-0.949"=> 120, # seen 33
291 "POE-Component-Client-Keepalive-0.272"=>300, # seen 40
292 "POEx-Role-PSGIServer-1.150280" => 30, # DEPREACATED
293 "POE-Quickie-0.18" => undef,
294 "Project-Euler-0.20" => 10,
295 "Prophet-0.751" => 120, # seen 40
296 "Redis-Jet-0.08" => 60, # seen 2
297 "RT-Client-REST-0.43" => 20,
298 "Server-Starter-0.12" => undef,
299 "Server-Starter-0.19" => undef,
300 "Server-Starter-0.32" => undef,
301 "Starlet-0.31" => undef,
302 "Starman-0.4014" => 360, # seen 152
303 "Sub-Call-Tail-0.05" => 1,
304 "Tapper-Testplan-4.1.2" => undef,
305 "Test-Moose-More-0.027" => 1, # haengt anscheinend immer, braucht keine Chance
306 "Test-HTTP-Server-Simple-0.11" => 60, # seen 0
308 "Twiggy-0.1025" => 120, # seen 6
309 "UNIVERSAL-isa-1.20171012" => 60, # seen 1
310 "XML-Grammar-Fortune-0.0501" => undef,
311 "ZeroMQ-PubSub-0.10" => undef,
312 "math-image-109" => undef,
314 while (my($distro,$ttimeout) = each %$timeouts) {
315 if ($cwd =~ m!/\Q$distro\E-!) {
316 $timeout = $ttimeout || 120;
320 keys %$timeouts; # do we need to reset the iterator here?
321 if ($stalled >= $timeout) { # arbitrary
322 warn sprintf "no output for %d seconds (timeout=%d)\n", $stalled, $timeout;
323 kill_stalled_leaves
($proc);
332 counting_sleep
$Opt{sleep}; # arbitrary
337 # cperl-indent-level: 4