640 number_to_scaled_string is duplicated in several commands
[unleashed.git] / usr / src / cmd / intrd / intrd.pl
blobcf68d5a9c955e37701cab1d848bd4acadf74ee3e
1 #!/usr/perl5/bin/perl
3 # CDDL HEADER START
5 # The contents of this file are subject to the terms of the
6 # Common Development and Distribution License (the "License").
7 # You may not use this file except in compliance with the License.
9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 # or http://www.opensolaris.org/os/licensing.
11 # See the License for the specific language governing permissions
12 # and limitations under the License.
14 # When distributing Covered Code, include this CDDL HEADER in each
15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 # If applicable, add the following below this CDDL HEADER, with the
17 # fields enclosed by brackets "[]" replaced with your own identifying
18 # information: Portions Copyright [yyyy] [name of copyright owner]
20 # CDDL HEADER END
24 # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
27 require 5.8.4;
28 use strict;
29 use warnings;
30 use POSIX;
31 use File::Basename("basename");
# Name under which we log (the basename of the running script).
my $cmdname = basename($0);

# Test-only switches, set from the command line.
my $using_scengen = 0;			# 1 if using scenario simulator
my $debug = 0;

# Sampling cadence, in seconds.
my $normal_sleeptime = 10;		# time to sleep between samples
my $idle_sleeptime = 45;		# time to sleep when idle
my $onecpu_sleeptime = 15 * 60;		# used if only 1 CPU on system
my $sleeptime = $normal_sleeptime;	# either normal_ or idle_ or onecpu_

# Thresholds.
my $idle_intrload = 0.1;		# idle if interrupt load < 10%
my $timerange_toohi = 0.01;		# getstat() rejects a sample whose
					# snaptime spread / $sleeptime
					# exceeds this
my $statslen = 60;	# time period (in secs) to keep in @deltas
49 # Parse arguments. intrd does not accept any public arguments; the two
50 # arguments below are meant for testing purposes. -D generates a significant
51 # amount of syslog output. -S <filename> loads the filename as a perl
52 # script. That file is expected to implement a kstat "simulator" which
53 # can be used to feed information to intrd and verify intrd's responses.
# Walk the command line.  Only the two private, test-only switches
# described above are understood; any other argument is ignored.
#
# The loop tests definedness rather than truth so that an argument of
# "0" or "" does not silently end parsing, and it uses a lexical
# instead of clobbering the global $_.
while (defined(my $arg = shift(@ARGV))) {
	if ($arg eq "-S" && @ARGV) {
		$using_scengen = 1;

		# Load the simulator.  Its filename is still $ARGV[0] while
		# the file runs, exactly as before.
		my $loaded = do $ARGV[0];

		# A simulator that does not compile would leave us with
		# neither kstats nor a simulator; fail loudly instead.
		die("$cmdname: cannot load simulator $ARGV[0]: $@")
		    if (!defined($loaded) && $@);
		shift(@ARGV);
	} elsif ($arg eq "-D") {
		$debug = 1;
	}
}
# When not driven by the simulator, pull in the kstat, interrupt and
# syslog modules at run time and open the log.  With -D everything up
# to LOG_DEBUG is logged, otherwise everything up to LOG_INFO.
if (!$using_scengen) {
	require Sun::Solaris::Kstat;
	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove is_apic));
	require Sys::Syslog;
	Sys::Syslog->import();

	openlog($cmdname, 'pid', 'daemon');

	my $maxpri = ($debug > 0) ? Sys::Syslog::LOG_DEBUG() :
	    Sys::Syslog::LOG_INFO();
	setlogmask(Sys::Syslog::LOG_UPTO($maxpri));
}
my $asserted = 0;		# number of assertions that have failed
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY($condition, $format, @args)
#
# Soft assertion.  A $condition numerically equal to 0 counts as a
# failure: the message (a syslog format plus its arguments) is logged
# with a "VERIFY: " prefix at $assert_level and $asserted is bumped.
# Returns true when the assertion FAILED, so callers can write
# "if (VERIFY(...)) { bail out }".
#
sub VERIFY($@)
{
	my ($cond, $fmt, @fmtargs) = @_;

	my $failed = ($cond == 0);
	return ($failed) unless $failed;

	syslog($assert_level, "VERIFY: $fmt", @fmtargs);
	$asserted++;
	return ($failed);
}
# Forward declarations, so the prototypes are known before first use.

# Statistics gathering and bookkeeping.
sub getstat($$);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

# Balance evaluation and reconfiguration entry points.
sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

# Private helpers of the functions above.
sub goodness_cpu($$);
sub move_intr($$$$);
sub ivecs_to_string(@);
sub find_goal($$);
sub do_find_goal($$$$);
sub do_reconfig_cpu($$$);
sub do_reconfig_cpu2cpu($$$$);
111 # What follow are the basic data structures routines of intrd.
113 # getstat() is responsible for reading the kstats and generating a "stat" hash.
115 # generate_delta() is responsible for taking two "stat" hashes and creating
116 # a new "delta" hash that represents what has changed over time.
118 # compress_deltas() is responsible for taking a list of deltas and generating
119 # a single delta hash that encompasses all the time periods described by the
120 # deltas.
124 # getstat() is handed a reference to a kstat and generates a hash, returned
125 # by reference, containing all the fields from the kstats which we need.
126 # If it returns the scalar 0, it failed to gather the kstats, and the caller
127 # should react accordingly.
129 # getstat() is also responsible for maintaining a reasonable $sleeptime.
131 # {"snaptime"} kstat's snaptime
132 # {<cpuid>} one hash reference per online cpu
133 # ->{"tot"} == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
134 # ->{"crtime"} == cpu:<cpuid>:sys:crtime
135 # ->{"ivecs"}
136 # ->{<cookie#>} iterates over pci_intrs::<nexus>:cookie
137 # ->{"time"} == pci_intrs:<ivec#>:<nexus>:time (in nsec)
138 # ->{"pil"} == pci_intrs:<ivec#>:<nexus>:pil
139 # ->{"crtime"} == pci_intrs:<ivec#>:<nexus>:crtime
140 # ->{"ino"} == pci_intrs:<ivec#>:<nexus>:ino
141 # ->{"num_ino"} == num inos of single device instance sharing this entry
142 # Will be > 1 on pcplusmp X86 systems for devices
143 # with multiple MSI interrupts.
144 # ->{"buspath"} == pci_intrs:<ivec#>:<nexus>:buspath
145 # ->{"name"} == pci_intrs:<ivec#>:<nexus>:name
146 # ->{"ihs"} == pci_intrs:<ivec#>:<nexus>:ihs
#
# getstat($ks, $pcplusmp_sys)
#
#   $ks            reference to the kstat tree (cpu, cpu_info, pci_intrs)
#   $pcplusmp_sys  true if MSI interrupts of one device must be grouped
#
# Returns a reference to a new "stat" hash, or the scalar 0 if there is
# at most one on-line CPU (in which case $sleeptime is raised to
# $onecpu_sleeptime) or if the kstats were gathered over too wide a
# time span to be coherent.
#
sub getstat($$)
{
	my ($ks, $pcplusmp_sys) = @_;

	my $cpucnt = 0;
	my %stat = ();

	# Hash of hash which matches (MSI device, ino) combos to kstats.
	my %msidevs = ();

	# kstats are not generated atomically.  Each kstat hierarchy will
	# have been generated within the kernel at a different time.  On a
	# thrashing system, we may not run quickly enough in order to get
	# coherent kstat timing information across all the kstats.  To
	# determine if this is occurring, $minsnap/$maxsnap are used to
	# find the breadth between the first and last snaptime of all the
	# kstats we access.  $maxsnap - $minsnap roughly represents the
	# total time taken up in getstat().  If this time approaches the
	# time between snapshots, our results may not be useful.

	my $minsnap = -1;	# snaptime is always a positive number
	my $maxsnap = $minsnap;

	# Iterate over the cpus in cpu:<cpuid>::.  Check
	# cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
	# processor is "on-line".  If not, it isn't accepting interrupts
	# and doesn't concern us.
	#
	# Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

	while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
		next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
		my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
		next if ($state !~ /^on-line\0/);
		my $cpu_sys = $cpst->{sys};

		$stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
		    $cpu_sys->{cpu_nsec_user} +
		    $cpu_sys->{cpu_nsec_kernel});
		$stat{$cpu}{crtime} = $cpu_sys->{crtime};
		$stat{$cpu}{ivecs} = {};

		if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
			$minsnap = $cpu_sys->{snaptime};
		}
		if ($cpu_sys->{snaptime} > $maxsnap) {
			$maxsnap = $cpu_sys->{snaptime};
		}
		$cpucnt++;
	}

	if ($cpucnt <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);	# nothing to do with 1 CPU
	}

	# Iterate over the ivecs.  If the cpu is not on-line, ignore the
	# ivecs mapped to it, if any.
	#
	# Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
	# ino, name, and buspath.  Check $minsnap/$maxsnap.

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $intrcfg = (values(%$inst))[0];
		my $cpu = $intrcfg->{cpu};

		next unless exists $stat{$cpu};
		next if ($intrcfg->{type} =~ /^disabled\0/);

		# Perl looks beyond NULL chars in pattern matching.
		# Truncate name field at the first NULL
		$intrcfg->{name} =~ s/\0.*$//;

		if ($intrcfg->{snaptime} < $minsnap) {
			$minsnap = $intrcfg->{snaptime};
		} elsif ($intrcfg->{snaptime} > $maxsnap) {
			$maxsnap = $intrcfg->{snaptime};
		}

		my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
		if (exists $stat{$cpu}{ivecs}{$cookie}) {
			my $cookiestats = $stat{$cpu}{ivecs}{$cookie};

			$cookiestats->{time} += $intrcfg->{time};
			$cookiestats->{name} .= "/$intrcfg->{name}";

			# If this new interrupt sharing $cookie represents a
			# change from an earlier getstat, make sure that
			# generate_delta will see the change by setting
			# crtime to the most recent crtime of its components.

			if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
				$cookiestats->{crtime} = $intrcfg->{crtime};
			}
			$cookiestats->{ihs}++;
			next;
		}

		# First handler seen for this (buspath, ino) on this cpu.
		$stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time};
		$stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime};
		$stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil};
		$stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino};
		$stat{$cpu}{ivecs}{$cookie}{num_ino} = 1;
		$stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath};
		$stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name};
		$stat{$cpu}{ivecs}{$cookie}{ihs} = 1;

		if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
			if (!(exists($msidevs{$intrcfg->{name}}))) {
				$msidevs{$intrcfg->{name}} = {};
			}
			$msidevs{$intrcfg->{name}}{$intrcfg->{ino}} =
			    \$stat{$cpu}{ivecs}{$cookie};
		}
	}

	# All MSI interrupts of a device instance share a single MSI address.
	# On X86 systems with an APIC, this MSI address is interpreted as CPU
	# routing info by the APIC.  For this reason, on these platforms, all
	# interrupts for MSI devices must be moved to the same CPU at the same
	# time.
	#
	# Since all interrupts will be on the same CPU on these platforms, all
	# interrupts can be consolidated into one ivec entry.  For such devices,
	# num_ino will be > 1 to denote that a group move is needed.

	# Loop thru all MSI devices on X86 pcplusmp systems.
	# Nop on other systems.
	foreach my $msidevkey (sort keys %msidevs) {

		# Loop thru inos of the device, sorted by lowest value first
		# For each cookie found for a device, incr num_ino for the
		# lowest cookie and remove other cookies.
		#
		# The inos are compared as numbers: a plain (string) sort
		# would place ino 10 ahead of ino 9 and pick the wrong base
		# vector for the group.
		#
		# Assumes PIL is the same for first and current cookies

		my $first_ino = -1;
		my $first_cookiep;
		my $curr_cookiep;
		foreach my $inokey (sort { $a <=> $b }
		    keys %{$msidevs{$msidevkey}}) {
			$curr_cookiep = $msidevs{$msidevkey}{$inokey};
			if ($first_ino == -1) {
				$first_ino = $inokey;
				$first_cookiep = $curr_cookiep;
			} else {
				$$first_cookiep->{num_ino}++;
				$$first_cookiep->{time} +=
				    $$curr_cookiep->{time};
				if ($$curr_cookiep->{crtime} >
				    $$first_cookiep->{crtime}) {
					$$first_cookiep->{crtime} =
					    $$curr_cookiep->{crtime};
				}
				# Invalidate this cookie, less complicated and
				# more efficient than deleting it.
				$$curr_cookiep->{num_ino} = 0;
			}
		}
	}

	# We define the timerange as the amount of time spent gathering the
	# various kstats, divided by our sleeptime.  If we take a lot of time
	# to access the kstats, and then we create a delta comparing these
	# kstats with a prior set of kstats, that delta will cover
	# substantially different amount of time depending upon which
	# interrupt or CPU is being examined.
	#
	# By checking the timerange here, we guarantee that any deltas
	# created from these kstats will contain self-consistent data,
	# in that all CPUs and interrupts cover a similar span of time.
	#
	# $timerange_toohi is the upper bound.  Any timerange above
	# this is thrown out as garbage.  If the stat is safely within this
	# bound, we treat the stat as representing an instant in time, rather
	# than the time range it actually spans.  We arbitrarily choose minsnap
	# as the snaptime of the stat.

	$stat{snaptime} = $minsnap;
	my $timerange = ($maxsnap - $minsnap) / $sleeptime;
	return (0) if ($timerange > $timerange_toohi);	# i.e. failure
	return (\%stat);
}
335 # dumpdelta takes a reference to our "delta" structure:
336 # {"missing"} "1" if the delta's component stats had inconsistencies
337 # {"minsnap"} time of the first kstat snaptime used in this delta
338 # {"maxsnap"} time of the last kstat snaptime used in this delta
339 # {"goodness"} cost function applied to this delta
340 # {"avgintrload"} avg of interrupt load across cpus, as a percentage
341 # {"avgintrnsec"} avg number of nsec spent in interrupts, per cpu
342 # {<cpuid>} iterates over on-line cpus
343 # ->{"intrs"} cpu's movable intr time (sum of "time" for each ivec)
344 # ->{"tot"} CPU load from all sources in nsec
345 # ->{"bigintr"} largest value of {ivecs}{<ivec#>}{time} from below
346 # ->{"intrload"} intrs / tot
347 # ->{"ivecs"}
348 # ->{<ivec#>} iterates over ivecs for this cpu
349 # ->{"time"} time used by this interrupt (in nsec)
350 # ->{"pil"} pil level of this interrupt
351 # ->{"ino"} interrupt number (or base vector if MSI group)
352 # ->{"buspath"} filename of the directory of the device's bus
353 # ->{"name"} device name
354 # ->{"ihs"} number of different handlers sharing this ino
355 # ->{"num_ino"} number of interrupt vectors in MSI group
357 # It prints out the delta structure in a nice, human readable display.
#
# dumpdelta($delta) writes a human readable rendering of a delta to
# syslog at debug level: first the global figures, then one section
# per cpu, then one line per interrupt vector on that cpu.
#
sub dumpdelta($)
{
	my ($delta) = @_;

	# Global information.
	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%% avgintrnsec: %d",
	    $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', " goodness: %5.2f%%",
		    $delta->{goodness} * 100);
	}

	# Per-cpu information; only cpu entries hold references.
	while (my ($cpuid, $cpst) = each %$delta) {
		next if !ref($cpst);

		my $tot = $cpst->{tot};
		syslog('debug', " cpu %3d intr %7.3f%% (bigintr %7.3f%%)",
		    $cpuid, $cpst->{intrload}*100,
		    $cpst->{bigintr}*100/$tot);
		syslog('debug', " intrs %d, bigintr %d",
		    $cpst->{intrs}, $cpst->{bigintr});

		# Per-vector information; shared vectors show the number
		# of handlers in parentheses after the name.
		while (my ($ivec, $ivst) = each %{$cpst->{ivecs}}) {
			my $label = $ivst->{name};
			if ($ivst->{ihs} > 1) {
				$label = "$ivst->{name}($ivst->{ihs})";
			}
			syslog('debug', " %15s:\"%s\": %7.3f%% %d",
			    $label, $ivec,
			    $ivst->{time}*100 / $tot, $ivst->{time});
		}
	}
}
395 # generate_delta($stat, $newstat) takes two stat references, returned from
396 # getstat(), and creates a %delta. %delta (not surprisingly) contains the
397 # same basic info as stat and newstat, but with the timestamps as deltas
398 # instead of absolute times. We return a reference to the delta.
#
# generate_delta($stat, $newstat)
#
# Builds and returns a reference to a "delta" hash describing what
# changed between two getstat() results, $stat being the older one.
# If the two stats cannot be compared (time went backwards, a cpu or
# interrupt appeared, vanished or was re-created), the returned delta
# has {missing} set to 1 and is otherwise incomplete.
#
# The hashes are walked with keys() rather than each() so that the
# early returns below never leave a half-advanced iterator behind on
# the caller's stat hashes.
#
sub generate_delta($$)
{
	my ($stat, $newstat) = @_;

	my %delta = ();

	# Running sums for the averages; start at 0 so an empty set of
	# cpus is handled without touching undefined values.
	my $intrload = 0;
	my $intrnsec = 0;
	my $cpus = 0;

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat)) ? 1 : 0;
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat

	foreach my $cpu (keys(%$newstat)) {
		my $newcpst = $newstat->{$cpu};
		next if !ref($newcpst);	# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined.  Mark missing.

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu $cpu changed")) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
		    keys(%{$newcpst->{ivecs}}),
		    "generate_delta: cpu $cpu has more/less".
		    " interrupts")) {
			$delta{missing} = 1;
			return (\%delta);
		}

		foreach my $inum (keys(%{$newcpst->{ivecs}})) {
			my $newivec = $newcpst->{ivecs}{$inum};

			# Unused cookie, corresponding to an MSI vector which
			# is part of a group.  The whole group is accounted for
			# by a different cookie.
			next if ($newivec->{num_ino} == 0);

			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.
			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
			    $cpst->{ivecs}{$inum}{crtime} ==
			    $newivec->{crtime},
			    "generate_delta: cpu $cpu inum $inum".
			    " has changed")) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
			    "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat.  We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
			$dltivec{num_ino} = $newivec->{num_ino};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww!  Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		    $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	} else {
		$delta{avgintrload} = 0;
		$delta{avgintrnsec} = 0;
	}
	return (\%delta);
}
540 # compress_deltas() takes a list of deltas, and returns a single new delta
541 # which represents the combined information from all the deltas. The deltas
542 # provided are assumed to be sequential in time. The resulting compressed
543 # delta looks just like any other delta. This new delta is also more accurate
544 # since its statistics are averaged over a longer period than any of the
545 # original deltas.
#
# compress_deltas($deltas)
#
# $deltas is a reference to a non-empty, time-ordered list of deltas,
# none of which may have {missing} set.  Returns a reference to one
# delta summing them all, or the scalar 0 if the list is empty or
# contains a bad delta.  As a side effect $sleeptime is set to
# $idle_sleeptime when no cpu reaches $idle_intrload, and to
# $normal_sleeptime otherwise.
#
sub compress_deltas ($)
{
	my ($deltas) = @_;

	my %newdelta = ();
	my ($intrs, $tot) = (0, 0);	# sums over every cpu and delta
	my $cpus = 0;
	my $high_intrload = 0;

	if (VERIFY($#$deltas != -1,
	    "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		while (my ($cpuid, $cpu) = each %$delta) {
			next if !ref($cpu);	# ignore non-cpu fields

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				my %ivecs = ();
				$newdelta{$cpuid}{ivecs} = \%ivecs;
			}
			while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
				my $newivecs = $newdelta{$cpuid}{ivecs};
				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
				$newivecs->{$inum}{num_ino} = $ivec->{num_ino};
			}
		}
	}
	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu);	# ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Clamp tot BEFORE it is used as a divisor; applying the
		# clamp afterwards would not protect the division.
		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
		if ($high_intrload < $cpu->{intrload}) {
			$high_intrload = $cpu->{intrload};
		}
	}
	if ($cpus == 0) {
		$newdelta{avgintrnsec} = 0;
		$newdelta{avgintrload} = 0;
	} else {
		$newdelta{avgintrnsec} = $intrs / $cpus;
		# Same protection for the system-wide ratio.
		$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
	}
	$sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
	    $normal_sleeptime;
	return (\%newdelta);
}
625 # What follow are the core functions responsible for examining the deltas
626 # generated above and deciding what to do about them.
628 # goodness() and its helper goodness_cpu() return a heuristic which describe
629 # how good (or bad) the current interrupt balance is. The value returned will
630 # be between 0 and 1, with 0 representing maximum goodness, and 1 representing
631 # maximum badness.
633 # imbalanced() compares a current and historical value of goodness, and
634 # determines if there has been enough change to warrant evaluating a
635 # reconfiguration of the interrupts
637 # do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
638 # find_goal(), do_find_goal(), and move_intr(), are responsible for examining
639 # a delta and determining the best possible assignment of interrupts to CPUs.
641 # It is important that do_reconfig() be in alignment with goodness(). If
642 # do_reconfig were to generate a new interrupt distribution that worsened
643 # goodness, we could get into a pathological loop with intrd fighting itself,
644 # constantly deciding that things are imbalanced, and then changing things
645 # only to make them worse.
649 # any goodness over $goodness_unsafe_load is considered really bad
650 # goodness must drop by at least $goodness_mindelta for a reconfig
652 my $goodness_unsafe_load = .9;
653 my $goodness_mindelta = .1;
655 # goodness(%delta) examines a delta and return its "goodness". goodness will
656 # be between 0 (best) and 1 (major bad). goodness is determined by evaluating
657 # the goodness of each individual cpu, and returning the worst case. This
658 # helps on systems with many CPUs, where otherwise a single pathological CPU
659 # might otherwise be ignored because the average was OK.
661 # To calculate the goodness of an individual CPU, we start by looking at its
662 # load due to interrupts. If the load is above a certain high threshold and
663 # there is more than one interrupt assigned to this CPU, we set goodness
664 # to worst-case. If the load is below the average interrupt load of all CPUs,
665 # then we return best-case, since what's to complain about?
667 # Otherwise we look at how much the load is above the average, and return
668 # that as the goodness, with one caveat: we never return more than the CPU's
669 # interrupt load ignoring its largest single interrupt source. This is
670 # because a CPU with one high-load interrupt, and no other interrupts, is
671 # perfectly balanced. Nothing can be done to improve the situation, and thus
672 # it is perfectly balanced even if the interrupt's load is 100%.
#
# goodness($delta) returns the worst per-cpu goodness found in the
# delta: 0 is best, 1 is worst.  A delta flagged {missing}, or one
# for which goodness_cpu() yields a value outside [0, 1], is rated 1.
#
sub goodness($)
{
	my ($delta) = @_;

	return (1) if $delta->{missing} > 0;

	my $worst = 0;

	# Only cpu entries hold references; everything else is metadata.
	foreach my $cpst (grep { ref($_) } values(%$delta)) {
		my $score = goodness_cpu($cpst, $delta->{avgintrload});

		if (VERIFY($score >= 0 && $score <= 1,
		    "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# Worst case already; no need to look any further.
		return (1) if ($score == 1);

		$worst = $score if ($score > $worst);
	}
	return ($worst);
}
#
# goodness_cpu($cpu, $avgintrload) rates a single cpu entry of a delta,
# from 0 (fine) to 1 (worst), given the average interrupt load.
#
sub goodness_cpu($$)		# private function
{
	my ($cpu, $avgintrload) = @_;

	my $load = $cpu->{intrs} / $cpu->{tot};

	# low loads are perfectly good
	return (0) if ($load < $avgintrload);

	# Load from every interrupt except the single biggest one: the
	# most that could be gained on this cpu by moving interrupts away.
	my $load_no_bigintr =
	    ($cpu->{intrs} - $cpu->{bigintr}) / $cpu->{tot};

	# A cpu saturated with interrupt handling that has more than one
	# interrupt source is a major imbalance, since the other sources
	# could be starved if of a lower pil.  Report the worst value,
	# which effectively contaminates the entire delta.
	my $nivecs = keys(%{$cpu->{ivecs}});
	return (1) if ($nivecs > 1 && $load > $goodness_unsafe_load);

	# Otherwise: the excess over the average, capped at what
	# offloading could actually gain.
	my $excess = $load - $avgintrload;
	return (($excess > $load_no_bigintr) ? $load_no_bigintr : $excess);
}
738 # imbalanced() is used by the main routine to determine if the goodness
739 # has shifted far enough from our last baseline to warrant a reassignment
740 # of interrupts. A very high goodness indicates that a CPU is way out of
741 # whack. If the goodness has varied too much since the baseline, then
742 # perhaps a reconfiguration is worth considering.
#
# imbalanced($goodness, $baseline) returns 1 when $goodness is
# pathological (above .50) or has moved more than $goodness_mindelta
# away from $baseline in either direction, and 0 otherwise.
#
sub imbalanced ($$)
{
	my ($goodness, $baseline) = @_;

	return (($goodness > .50 ||
	    abs($goodness - $baseline) > $goodness_mindelta) ? 1 : 0);
}
755 # do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
756 # decision-making functions responsible for generating a new interrupt
757 # distribution. They are designed with the definition of goodness() in
758 # mind, i.e. they use the same definition of "good distribution" as does
759 # goodness().
761 # do_reconfig() is responsible for deciding whether a redistribution is
762 # actually warranted. If the goodness is already pretty good, it doesn't
763 # waste the CPU time to generate a new distribution. If it
764 # calculates a new distribution and finds that it is not sufficiently
765 # improved from the prior distribution, it will not do the redistribution,
766 # mainly to avoid the disruption to system performance caused by
767 # rejuggling interrupts.
769 # Its main loop works by going through a list of cpus sorted from
770 # highest to lowest interrupt load. It removes the highest-load cpus
771 # one at a time and hands them off to do_reconfig_cpu(). This function
772 # then re-sorts the remaining CPUs from lowest to highest interrupt load,
773 # and one at a time attempts to rejuggle interrupts between the original
774 # high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
775 # considered finished as soon as its interrupt load is within
776 # $goodness_mindelta of the average interrupt load. Such a CPU will have
777 # a goodness of below the $goodness_mindelta threshold.
780 # move_intr(\%delta, $inum, $oldcpu, $newcpu)
781 # used by reconfiguration code to move an interrupt between cpus within
782 # a delta. This manipulates data structures, and does not actually move
783 # the interrupt on the running system.
#
# move_intr($delta, $inum, $oldcpuid, $newcpuid) re-homes interrupt
# $inum from one cpu entry of $delta to another, keeping intrs,
# intrload and bigintr of both entries up to date.  Only the delta
# is edited here.
#
sub move_intr($$$$)		# private function
{
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	my $moved = $delta->{$oldcpuid}{ivecs}{$inum};
	my $src = $delta->{$oldcpuid};
	my $cost = $moved->{time};

	# Detach from the source cpu.
	$src->{intrs} -= $cost;
	$src->{intrload} = $src->{intrs} / $src->{tot};
	delete($src->{ivecs}{$inum});

	VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($cost <= $src->{bigintr},
	    "move_intr: intr's time > bigintr?");

	# If the departing interrupt was the biggest one, find the
	# biggest among those left behind.
	if ($cost >= $src->{bigintr}) {
		my $largest = 0;

		foreach my $left (values(%{$src->{ivecs}})) {
			if ($left->{time} > $largest) {
				$largest = $left->{time};
			}
		}
		$src->{bigintr} = $largest;
	}

	# Attach to the target cpu.
	my $dst = $delta->{$newcpuid};

	$moved->{nowcpu} = $newcpuid;
	$dst->{intrs} += $cost;
	$dst->{intrload} = $dst->{intrs} / $dst->{tot};
	$dst->{ivecs}{$inum} = $moved;

	if ($cost > $dst->{bigintr}) {
		$dst->{bigintr} = $cost;
	}
}
#
# move_intr_check($delta, $oldcpuid, $newcpuid) asserts, via VERIFY,
# that neither cpu entry now carries more interrupt time than its
# total time.  Returns the result of the last (target cpu) check.
#
sub move_intr_check($$$)	# private function
{
	my ($delta, $oldcpuid, $newcpuid) = @_;

	my @checks = (
	    [$oldcpuid, "Moved interrupts left 100+%% load on src cpu"],
	    [$newcpuid, "Moved interrupts left 100+%% load on tgt cpu"],
	);

	my $failed;
	foreach my $check (@checks) {
		my ($cpuid, $msg) = @$check;
		$failed = VERIFY($delta->{$cpuid}{tot} >=
		    $delta->{$cpuid}{intrs}, $msg);
	}
	return ($failed);
}
#
# ivecs_to_string(@ivecs) returns the {inum} of every given ivec,
# each preceded by a single space, as one string ("" for no ivecs).
#
sub ivecs_to_string(@)		# private function
{
	return (join("", map { " $_->{inum}" } @_));
}
#
# do_reconfig($delta) computes a better interrupt distribution for
# $delta and, if the gain is worth it, applies it with intrmove().
#
# Returns  0 if nothing was changed because the balance is, or would
#            remain, close enough to optimal,
#         -1 if the delta is unusable or at least one interrupt could
#            not be moved,
#          1 if every planned move succeeded.
#
sub do_reconfig($)
{
	my ($delta) = @_;

	my $oldgoodness = $delta->{goodness};

	# Goodness cannot get better than 0.  Stop here if even reaching
	# 0 would be too small an improvement to merit the action.

	if ($oldgoodness < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
	    "have a delta with missing")) {
		return (-1);
	}

	# Collect every cpuid, and tag each ivec with where it started,
	# where it currently is, and its own key.

	my @cpusortlist = ();

	while (my ($cpuid, $cpu) = each %$delta) {
		next if !ref($cpu);	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
			@{$ivec}{qw(origcpu nowcpu inum)} =
			    ($cpuid, $cpuid, $inum);
		}
	}

	# Repeatedly take the cpu with the highest interrupt load off the
	# list and try to redistribute its interrupts.  A cpu whose
	# goodness is below the threshold is simply skipped.  Once the
	# busiest cpu is no longer above the average load plus that same
	# threshold, nothing left is worth reconfiguring.

	while (@cpusortlist) {
		# Re-sort every time round, since do_reconfig_cpu can
		# move interrupts around.

		@cpusortlist =
		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
		    @cpusortlist);

		my $busiest = shift(@cpusortlist);
		my $load = $delta->{$busiest}{intrload};

		if ($load <= $goodness_unsafe_load &&
		    $load <= $delta->{avgintrload} + $goodness_mindelta) {
			syslog('debug', "finished reconfig: cpu $busiest load ".
			    "$load avgload ".
			    "$delta->{avgintrload}");
			last;
		}
		next if (goodness_cpu($delta->{$busiest},
		    $delta->{avgintrload}) < $goodness_mindelta);

		do_reconfig_cpu($delta, \@cpusortlist, $busiest);
	}

	# How good a job did we do?  If the improvement was minimal, and
	# our goodness wasn't pathological (and thus needing any help it
	# can get), then don't bother moving the interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $oldgoodness,
	    "reconfig: result has worse goodness?");

	if (($oldgoodness != 1 || $newgoodness == 1) &&
	    $oldgoodness - $newgoodness < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		    "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $oldgoodness*100,
	    $newgoodness*100);

	# Time to move those interrupts!

	my $ret = 1;
	my $warned = 0;
	while (my ($cpuid, $cpu) = each %$delta) {
		next if $cpuid =~ /\D/;		# cpuids are all digits
		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
			# Still where it started: nothing to do.
			next if ($ivec->{origcpu} == $cpuid);

			next if intrmove($ivec->{buspath}, $ivec->{origcpu},
			    $ivec->{ino}, $cpuid, $ivec->{num_ino});

			# The move failed; warn once, detail every time.
			if ($warned == 0) {
				syslog('warning', "Unable to move interrupts");
			}
			$warned++;
			syslog('debug', "Unable to move buspath ".
			    "$ivec->{buspath} ino $ivec->{ino} to ".
			    "cpu $cpuid");
			$ret = -1;
		}
	}

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}
#
# do_reconfig_cpu($delta, $cpusortlist, $oldcpuid)
#
# Tries to lighten $oldcpuid by exchanging interrupts with the cpus
# named in @$cpusortlist, visiting them in reverse order of that
# list.  Stops as soon as $oldcpuid's goodness drops below
# $goodness_mindelta, or when the next candidate carries a higher
# interrupt load than $oldcpuid itself.  @$cpusortlist is not modified.
#
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	syslog('debug', "reconfiguring $oldcpuid");

	my $src = $delta->{$oldcpuid};
	my $avgintrload = $delta->{avgintrload};

	# reverse() hands us a private copy of the list to walk.
	foreach my $tgtcpuid (reverse(@$cpusortlist)) {
		last if (goodness_cpu($src, $avgintrload) <
		    $goodness_mindelta);

		my $tgt = $delta->{$tgtcpuid};
		my $srcload = $src->{intrload};

		# Never push interrupts toward a busier cpu.
		last if ($tgt->{intrload} > $srcload);

		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
	}
}
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	# Consider juggling interrupts between srccpuid (the CPU with the
	# higher interrupt load) and tgtcpuid (the CPU with the lower one).
	#
	#   $delta    - hash ref of per-CPU statistics, keyed by cpu id
	#   $srccpuid - id of the CPU we want to relieve
	#   $tgtcpuid - id of the CPU that may take interrupts over
	#   $srcload  - srccpuid's interrupt load on entry; only used for
	#               the sanity check at the very end
	#
	# Returns early, without touching anything, when neither CPU owns
	# any ivecs.

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	# Pool the ivecs of both CPUs, then order them from highest to
	# lowest {time}.

	my @pool = (values(%{$delta->{$srccpuid}{ivecs}}),
	    values(%{$delta->{$tgtcpuid}{ivecs}}));
	return unless @pool;

	my @ivecs = sort({$b->{time} <=> $a->{time}} @pool);

	# The "goal" load for srccpuid starts out as the average across all
	# CPUs.  tgtcpuid is less loaded than srccpuid but may itself still
	# be above avgintrnsec, so never aim lower than the mean of the two
	# CPUs' {intrs}: that would push srccpuid below tgtcpuid.

	my $pairavg =
	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
	my $goal = $delta->{avgintrnsec};
	$goal = $pairavg if $goal < $pairavg;

	# If the largest interrupt originated on srccpuid, leave it there
	# and take it out of the search.  This can help minimize the
	# disruption caused by moving interrupts.

	if ($ivecs[0]->{origcpu} == $srccpuid) {
		syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
		$goal -= $ivecs[0]->{time};
		shift(@ivecs);
	}

	# find_goal() determines the selection of the remaining interrupts
	# that comes closest to the goal without falling below it.

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() reports back through $ivec->{goal}: 1 means the ivec
	# belongs on srccpuid, 0 means it belongs on tgtcpuid.  Call
	# move_intr() for every ivec that is not yet where it belongs, so
	# that $delta reflects the new assignment.

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");

		my $nowcpu = $ivec->{nowcpu};
		VERIFY($nowcpu == $srccpuid || $nowcpu == $tgtcpuid,
		    "cpu2cpu found an ".
		    "interrupt not currently on src or tgt cpu");

		my $wantcpu = $ivec->{goal} ? $srccpuid : $tgtcpuid;
		move_intr($delta, $ivec->{inum}, $nowcpu, $wantcpu)
		    if $nowcpu != $wantcpu;
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid);	# asserts

	# The source CPU's load must not have risen, and it must still sit
	# above the overall average.

	my $src = $delta->{$srccpuid};
	my $newload = $src->{intrs} / $src->{tot};
	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
	    "cpu2cpu: new load didn't end up in expected range");
}
#
# find_goal() and its helper do_find_goal() pick the combination of
# interrupts whose total load comes as close as possible to a goal load
# without falling below that goal.  Before returning, find_goal() stores a
# {goal} value in the hash of every interrupt it was given: 1 if the
# interrupt belongs to the best-fit set, 0 if it does not.
#
# find_goal() takes a reference to a list of ivecs (hash references), sorted
# by descending {time}, and the goal load.  The goal is relative to {time}.
# The best fit is found with a depth-first search, carried out by the
# recursive subroutine do_find_goal().
#
# do_find_goal() is passed an index, initially 0, and on a given invocation
# only looks at the interrupts in the ivecs array from that index onward.
# It weighs two possibilities:
#	1) What is the best goal-fit if I include ivecs[index]?
#	2) What is the best goal-fit if I exclude ivecs[index]?
# For case 1 it subtracts the load of ivecs[index] from the goal and calls
# itself recursively with that new goal and index + 1.
# For case 2 it calls itself recursively with the same goal and index + 1.
#
# It then compares the two results, decides which one best meets the goal,
# and returns it.  The return value is the best-fit's interrupt load,
# followed by a list of all the interrupts which make up that best-fit.
#
# As an optimization, a second array loads[] is built to mirror ivecs[]:
# loads[i] is the total load of ivecs[i..$#ivecs].  do_find_goal() uses it
# to avoid recursing all the way to the end of the ivecs array when even
# taking every remaining interrupt would not exceed the goal load.  In that
# case it simply puts all remaining interrupts on the goal list and returns.
#
sub find_goal($$)	# private function
{
	my ($ivecs, $goal) = @_;

	# A goal of zero or less is best met by the empty set, so @goals
	# is only filled in when there is something to search for.
	my @goals;

	unless ($goal <= 0) {
		syslog('debug', "finding goal from intrs %s",
		    ivecs_to_string(@$ivecs));

		# Build @loads: start from the grand total and peel one
		# ivec off per step, so that $loads[i] covers ivecs i..end.

		my $tot = 0;
		foreach my $ivec (@$ivecs) {
			$tot += $ivec->{time};
		}
		my @loads;
		foreach my $ivec (@$ivecs) {
			push(@loads, $tot);
			$tot -= $ivec->{time};
		}

		my $load;
		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
		VERIFY($load >= $goal, "find_goal didn't meet goals");
	}
	syslog('debug', "goals found: %s", ivecs_to_string(@goals));

	# Walk the ivecs in step with @goals: an ivec found at the front of
	# @goals is consumed from it and flagged 1, every other ivec is
	# flagged 0.

	foreach my $ivec (@$ivecs) {
		if (@goals && $ivec == $goals[0]) {
			shift(@goals);
			syslog('debug', "inum $ivec->{inum} on source cpu");
			$ivec->{goal} = 1;
		} else {
			syslog('debug', "inum $ivec->{inum} on target cpu");
			$ivec->{goal} = 0;
		}
	}
}
sub do_find_goal($$$$)	# private function
{
	my ($ivecs, $loads, $goal, $idx) = @_;

	# Recursive worker for find_goal().
	#
	#   $ivecs - array ref of ivec hashes
	#   $loads - array ref; $loads->[i] is the summed {time} of
	#            ivecs i..end
	#   $goal  - load still to be reached
	#   $idx   - first ivec this invocation may consider
	#
	# Returns a list: the load of the chosen subset, followed by the
	# ivecs that make it up.

	# Past the end of the list: no load, no interrupts.
	return (0) if $idx > $#{$ivecs};

	syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

	my $load = $ivecs->[$idx]{time};

	# If taking every remaining ivec does not get us past the goal,
	# nothing better can be found: return $idx and everything after it.

	my $remaining = $loads->[$idx];
	if ($remaining <= $goal) {
		syslog('debug',
		    "$idx: including all remaining intrs %s with load %d",
		    ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]),
		    $remaining);
		return ($remaining, @$ivecs[$idx .. $#{$ivecs}]);
	}

	# The "with" option: the best match that includes $ivecs->[$idx].
	# If this ivec alone already reaches the goal, stop right there,
	# since adding more would only take us further from the goal.
	# Otherwise search the rest for what is still missing.

	my ($with, @goals_with) = ($load);
	unless ($goal <= $load) {
		($with, @goals_with) =
		    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
		$with += $load;
	}
	syslog('debug', "$idx: with-load $with intrs %s",
	    ivecs_to_string($ivecs->[$idx], @goals_with));

	# The "without" option: the best match that excludes $ivecs->[$idx].
	# NOTE(review): this call uses the & form while the one above does
	# not; it is kept exactly as found.

	my ($without, @goals_without) =
	    &do_find_goal($ivecs, $loads, $goal, $idx + 1);
	syslog('debug', "$idx: without-load $without intrs %s",
	    ivecs_to_string(@goals_without));

	# Pick the better option.  If both reach the goal, the smaller one
	# wins.  If only one reaches it, that one wins.  If neither does,
	# the larger one wins.

	my $with_meets = ($with >= $goal);
	my $without_meets = ($without >= $goal);
	my $use_without;
	if ($with_meets && $without_meets) {
		$use_without = ($without < $with);
	} elsif ($with_meets || $without_meets) {
		$use_without = $without_meets;
	} else {
		$use_without = ($without > $with);
	}

	# Hand back the winning load, followed by the ivecs it consists of.

	if ($use_without) {
		syslog('debug', "$idx: going without");
		return ($without, @goals_without);
	}
	syslog('debug', "$idx: going with");
	return ($with, $ivecs->[$idx], @goals_with);
}
syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State that is carried from one pass of the main loop to the next.
my @deltas = ();
my $deltas_tottime = 0;		# sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;
my $do_reconfig;

# Scratch variables for the main loop.
my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);

# INT, HUP and TERM only raise a flag, so that we don't die in the middle
# of retargeting; the flag is checked at safe points.
my $gotsig = 0;
$SIG{INT} = sub { $gotsig = 1; };
foreach my $signame ('HUP', 'TERM') {
	$SIG{$signame} = $SIG{INT};
}

# Real kstats, unless the scenario simulator is in use, in which case the
# simulator supplies myks_update().
my $ks = ($using_scengen == 0)
    ? Sun::Solaris::Kstat->new()
    : myks_update();

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator.  But
# there's nothing an administrator can do.  So print out a message for SMF
# logs and silently pause until a signal arrives.

unless (exists($ks->{pci_intrs})) {
	print STDERR "$cmdname: no interrupts were found; ".
	    "your PCI bus may not yet be supported\n";
	pause() until $gotsig != 0;
	exit 0;
}

# See if this is a system with a pcplusmp APIC; such systems get special
# handling.  Take one entry from the pci_intrs kstats and use its buspath
# to query the system.  It is assumed that either all or none of the busses
# on a system are hosted by the pcplusmp APIC or APIX.

my @elem = values(%{$ks->{pci_intrs}});
my $elem0 = $elem[0];
my ($elemval) = values(%$elem0);

my $pcplusmp_sys = is_apic($elemval->{buspath});

my $stat = getstat($ks, $pcplusmp_sys);

1282 for (;;) {
1283 sub clear_deltas {
1284 @deltas = ();
1285 $deltas_tottime = 0;
1286 $stat = 0; # prevent next gen_delta() from setting {missing}
1289 # 1. Sleep, update the kstats, and save the new stats in $newstat.
1291 exit 0 if $gotsig; # if we got ^C / SIGTERM, exit
1292 if ($using_scengen == 0) {
1293 sleep($sleeptime);
1294 exit 0 if $gotsig; # if we got ^C / SIGTERM, exit
1295 $ks->update();
1296 } else {
1297 $ks = myks_update();
1299 $newstat = getstat($ks, $pcplusmp_sys);
1301 # $stat or $newstat could be zero if they're uninitialized, or if
1302 # getstat() failed. If $stat is zero, move $newstat to $stat, sleep
1303 # and try again. If $newstat is zero, then we also sleep and try
1304 # again, hoping the problem will clear up.
1306 next if (!ref $newstat);
1307 if (!ref $stat) {
1308 $stat = $newstat;
1309 next;
1312 # 2. Compare $newstat with the prior set of values, result in %$delta.
1314 $delta = generate_delta($stat, $newstat);
1315 dumpdelta($delta) if $debug; # Dump most recent stats to stdout.
1316 $stat = $newstat; # The new stats now become the old stats.
1319 # 3. If $delta->{missing}, then there has been a reconfiguration of
1320 # either cpus or interrupts (probably both). We need to toss out our
1321 # old set of statistics and start from scratch.
1323 # Also, if the delta covers a very long range of time, then we've
1324 # been experiencing a system overload that has resulted in intrd
1325 # not being allowed to run effectively for a while now. As above,
1326 # toss our old statistics and start from scratch.
1328 $deltatime = $delta->{maxsnap} - $delta->{minsnap};
1329 if ($delta->{missing} > 0 || $deltatime > $statslen) {
1330 clear_deltas();
1331 syslog('debug', "evaluating interrupt assignments");
1332 next;
1336 # 4. Incorporate new delta into the list of deltas, and associated
1337 # statistics. If we've just now received $statslen deltas, then it's
1338 # time to evaluate a reconfiguration.
1340 $below_statslen = ($deltas_tottime < $statslen);
1341 $deltas_tottime += $deltatime;
1342 $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
1343 push(@deltas, $delta);
1345 # 5. Remove old deltas if total time is more than $statslen. We use
1346 # @deltas as a moving average of the last $statslen seconds. Shift
1347 # off the oldest deltas, but only if that doesn't cause us to fall
1348 # below $statslen seconds.
1350 while (@deltas > 1) {
1351 $olddelta = $deltas[0];
1352 $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
1353 $newtime = $deltas_tottime - $olddeltatime;
1354 last if ($newtime < $statslen);
1356 shift(@deltas);
1357 $deltas_tottime = $newtime;
1360 # 6. The brains of the operation are here. First, check if we're
1361 # imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
1362 # either because of imbalance or above in step 4, we evaluate a
1363 # new configuration.
1365 # First, take @deltas and generate a single "compressed" delta
1366 # which summarizes them all. Pass that to do_reconfig and see
1367 # what it does with it:
1369 # $ret == -1 : failure
1370 # $ret == 0 : current config is optimal (or close enough)
1371 # $ret == 1 : reconfiguration has occurred
1373 # If $ret is -1 or 1, dump all our deltas and start from scratch.
1374 # Step 4 above will set do_reconfig soon thereafter.
1376 # If $ret is 0, then nothing has happened because we're already
1377 # good enough. Set baseline_goodness to current goodness.
1379 $compdelta = compress_deltas(\@deltas);
1380 if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
1381 clear_deltas();
1382 next;
1384 $compdelta->{goodness} = goodness($compdelta);
1385 dumpdelta($compdelta) if $debug;
1387 $goodness = $compdelta->{goodness};
1388 syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);
1390 if ($deltas_tottime >= $statslen &&
1391 imbalanced($goodness, $baseline_goodness)) {
1392 $do_reconfig = 1;
1395 if ($do_reconfig) {
1396 $ret = do_reconfig($compdelta);
1398 if ($ret != 0) {
1399 clear_deltas();
1400 syslog('debug', "do_reconfig FAILED!") if $ret == -1;
1401 } else {
1402 syslog('debug', "setting new baseline of $goodness");
1403 $baseline_goodness = $goodness;
1406 syslog('debug', "---------------------------------------");