usr/src/cmd/intrd/intrd.pl

   1 #!/usr/perl5/bin/perl
   2 #
   3 # CDDL HEADER START
   4 #
   5 # The contents of this file are subject to the terms of the
   6 # Common Development and Distribution License (the "License").
   7 # You may not use this file except in compliance with the License.
   8 #
   9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10 # or http://www.opensolaris.org/os/licensing.
  11 # See the License for the specific language governing permissions
  12 # and limitations under the License.
  13 #
  14 # When distributing Covered Code, include this CDDL HEADER in each
  15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16 # If applicable, add the following below this CDDL HEADER, with the
  17 # fields enclosed by brackets "[]" replaced with your own identifying
  18 # information: Portions Copyright [yyyy] [name of copyright owner]
  19 #
  20 # CDDL HEADER END
  21 #
  22
  23 #
  24 # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  25 #
  26
  27 require 5.8.4;
  28 use strict;
  29 use warnings;
  30 use POSIX;
  31 use File::Basename("basename");
  32
  33 my $cmdname = basename($0);
  34
  35 my $using_scengen = 0;  # 1 if using scenario simulator
  36 my $debug = 0;
  37
  38 my $normal_sleeptime = 10;              # time to sleep between samples
  39 my $idle_sleeptime = 45;                # time to sleep when idle
  40 my $onecpu_sleeptime = (60 * 15);       # used if only 1 CPU on system
  41 my $sleeptime = $normal_sleeptime;      # either normal_ or idle_ or onecpu_
  42
  43 my $idle_intrload = .1;                 # idle if interrupt load < 10%
  44
  45 my $timerange_toohi    = .01;
  46 my $statslen = 60;      # time period (in secs) to keep in @deltas
  47
  48
  49 # Parse arguments. intrd does not accept any public arguments; the two
  50 # arguments below are meant for testing purposes. -D generates a significant
  51 # amount of syslog output. -S <filename> loads the filename as a perl
  52 # script. That file is expected to implement a kstat "simulator" which
  53 # can be used to feed information to intrd and verify intrd's responses.
  54
  55 while ($_ = shift @ARGV) {
  56         if ($_ eq "-S" && $#ARGV != -1) {
  57                 $using_scengen = 1;
  58                 do $ARGV[0];    # load simulator
  59                 shift @ARGV;
  60         } elsif ($_ eq "-D") {
  61                 $debug = 1;
  62         }
  63 }
  64
  65 if ($using_scengen == 0) {
  66         require Sun::Solaris::Kstat;
  67         require Sun::Solaris::Intrs;
  68         import Sun::Solaris::Intrs(qw(intrmove is_apic));
  69         require Sys::Syslog;
  70         import Sys::Syslog;
  71         openlog($cmdname, 'pid', 'daemon');
  72         setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG :
  73             &Sys::Syslog::LOG_INFO));
  74 }
  75
  76 my $asserted = 0;
  77 my $assert_level = 'debug';     # syslog level for assertion failures
  78 sub VERIFY($@)
  79 {
  80         my $bad = (shift() == 0);       # $_[0] == 0 means assert failed
  81         if ($bad) {
  82                 my $msg = shift();
  83                 syslog($assert_level, "VERIFY: $msg", @_);
  84                 $asserted++;
  85         }
  86         return ($bad);
  87 }
  88
  89
  90
  91
  92 sub getstat($$);
  93 sub generate_delta($$);
  94 sub compress_deltas($);
  95 sub dumpdelta($);
  96
  97 sub goodness($);
  98 sub imbalanced($$);
  99 sub do_reconfig($);
 100
 101 sub goodness_cpu($$);           # private function
 102 sub move_intr($$$$);            # private function
 103 sub ivecs_to_string(@);         # private function
 104 sub do_find_goal($$$$);         # private function
 105 sub find_goal($$);              # private function
 106 sub do_reconfig_cpu2cpu($$$$);  # private function
 107 sub do_reconfig_cpu($$$);       # private function
 108
 109
 110 #
  111 # What follow are the basic data-structure routines of intrd.
 112 #
 113 # getstat() is responsible for reading the kstats and generating a "stat" hash.
 114 #
 115 # generate_delta() is responsible for taking two "stat" hashes and creating
 116 # a new "delta" hash that represents what has changed over time.
 117 #
 118 # compress_deltas() is responsible for taking a list of deltas and generating
 119 # a single delta hash that encompasses all the time periods described by the
 120 # deltas.
 121
 122
 123 #
 124 # getstat() is handed a reference to a kstat and generates a hash, returned
 125 # by reference, containing all the fields from the kstats which we need.
 126 # If it returns the scalar 0, it failed to gather the kstats, and the caller
 127 # should react accordingly.
 128 #
 129 # getstat() is also responsible for maintaining a reasonable $sleeptime.
 130 #
 131 # {"snaptime"}          kstat's snaptime
 132 # {<cpuid>}             one hash reference per online cpu
 133 #  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
 134 #  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
 135 #  ->{"ivecs"}
 136 #     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
 137 #        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
 138 #        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
 139 #        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
 140 #        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
 141 #        ->{"num_ino"}  == num inos of single device instance sharing this entry
 142 #                               Will be > 1 on pcplusmp X86 systems for devices
 143 #                               with multiple MSI interrupts.
 144 #        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
 145 #        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
 146 #        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
 147 #
 148
 149 sub getstat($$)
 150 {
 151         my ($ks, $pcplusmp_sys) = @_;
 152
 153         my $cpucnt = 0;
 154         my %stat = ();
 155         my ($minsnap, $maxsnap);
 156
 157         # Hash of hash which matches (MSI device, ino) combos to kstats.
 158         my %msidevs = ();
 159
 160         # kstats are not generated atomically. Each kstat hierarchy will
 161         # have been generated within the kernel at a different time. On a
 162         # thrashing system, we may not run quickly enough in order to get
 163         # coherent kstat timing information across all the kstats. To
 164         # determine if this is occurring, $minsnap/$maxsnap are used to
 165         # find the breadth between the first and last snaptime of all the
 166         # kstats we access. $maxsnap - $minsnap roughly represents the
 167         # total time taken up in getstat(). If this time approaches the
 168         # time between snapshots, our results may not be useful.
 169
 170         $minsnap = -1;          # snaptime is always a positive number
 171         $maxsnap = $minsnap;
 172
 173         # Iterate over the cpus in cpu:<cpuid>::. Check
 174         # cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
 175         # processor is "on-line". If not, it isn't accepting interrupts
 176         # and doesn't concern us.
 177         #
 178         # Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.
 179
 180         while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
 181                 next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
 182                 #"state" fld of kstat w/
 183                 #                 modname    inst name-"cpuinfo0"
 184                 my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
 185                 next if ($state !~ /^on-line\0/);
 186                 my $cpu_sys = $cpst->{sys};
 187
 188                 $stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
 189                                     $cpu_sys->{cpu_nsec_user} +
 190                                     $cpu_sys->{cpu_nsec_kernel});
 191                 $stat{$cpu}{crtime} = $cpu_sys->{crtime};
 192                 $stat{$cpu}{ivecs} = {};
 193
 194                 if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
 195                         $minsnap = $cpu_sys->{snaptime};
 196                 }
 197                 if ($cpu_sys->{snaptime} > $maxsnap) {
 198                         $maxsnap = $cpu_sys->{snaptime};
 199                 }
 200                 $cpucnt++;
 201         }
 202
 203         if ($cpucnt <= 1) {
 204                 $sleeptime = $onecpu_sleeptime;
 205                 return (0);     # nothing to do with 1 CPU
 206         }
 207
 208         # Iterate over the ivecs. If the cpu is not on-line, ignore the
 209         # ivecs mapped to it, if any.
 210         #
 211         # Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
 212         # ino, name, and buspath. Check $minsnap/$maxsnap.
 213
 214         foreach my $inst (values(%{$ks->{pci_intrs}})) {
 215                 my $intrcfg = (values(%$inst))[0];
 216                 my $cpu = $intrcfg->{cpu};
 217
 218                 next unless exists $stat{$cpu};
 219                 next if ($intrcfg->{type} =~ /^disabled\0/);
 220
 221                 # Perl looks beyond NULL chars in pattern matching.
 222                 # Truncate name field at the first NULL
 223                 $intrcfg->{name} =~ s/\0.*$//;
 224
 225                 if ($intrcfg->{snaptime} < $minsnap) {
 226                         $minsnap = $intrcfg->{snaptime};
 227                 } elsif ($intrcfg->{snaptime} > $maxsnap) {
 228                         $maxsnap = $intrcfg->{snaptime};
 229                 }
 230
 231                 my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
 232                 if (exists $stat{$cpu}{ivecs}{$cookie}) {
 233                         my $cookiestats = $stat{$cpu}{ivecs}{$cookie};
 234
 235                         $cookiestats->{time} += $intrcfg->{time};
 236                         $cookiestats->{name} .= "/$intrcfg->{name}";
 237
 238                         # If this new interrupt sharing $cookie represents a
 239                         # change from an earlier getstat, make sure that
 240                         # generate_delta will see the change by setting
 241                         # crtime to the most recent crtime of its components.
 242
 243                         if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
 244                                 $cookiestats->{crtime} = $intrcfg->{crtime};
 245                         }
 246                         $cookiestats->{ihs}++;
 247                         next;
 248                 }
 249                 $stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time};
 250                 $stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime};
 251                 $stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil};
 252                 $stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino};
 253                 $stat{$cpu}{ivecs}{$cookie}{num_ino} = 1;
 254                 $stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath};
 255                 $stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name};
 256                 $stat{$cpu}{ivecs}{$cookie}{ihs} = 1;
 257
 258                 if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
 259                         if (!(exists($msidevs{$intrcfg->{name}}))) {
 260                                 $msidevs{$intrcfg->{name}} = {};
 261                         }
 262                         $msidevs{$intrcfg->{name}}{$intrcfg->{ino}} =
 263                             \$stat{$cpu}{ivecs}{$cookie};
 264                 }
 265         }
 266
 267         # All MSI interrupts of a device instance share a single MSI address.
 268         # On X86 systems with an APIC, this MSI address is interpreted as CPU
 269         # routing info by the APIC.  For this reason, on these platforms, all
 270         # interrupts for MSI devices must be moved to the same CPU at the same
 271         # time.
 272         #
 273         # Since all interrupts will be on the same CPU on these platforms, all
 274         # interrupts can be consolidated into one ivec entry.  For such devices,
 275         # num_ino will be > 1 to denote that a group move is needed.
 276
 277         # Loop thru all MSI devices on X86 pcplusmp systems.
 278         # Nop on other systems.
 279         foreach my $msidevkey (sort keys %msidevs) {
 280
 281                 # Loop thru inos of the device, sorted by lowest value first
 282                 # For each cookie found for a device, incr num_ino for the
 283                 # lowest cookie and remove other cookies.
 284
 285                 # Assumes PIL is the same for first and current cookies
 286
 287                 my $first_ino = -1;
 288                 my $first_cookiep;
 289                 my $curr_cookiep;
 290                 foreach my $inokey (sort keys %{$msidevs{$msidevkey}}) {
 291                         $curr_cookiep = $msidevs{$msidevkey}{$inokey};
 292                         if ($first_ino == -1) {
 293                                 $first_ino = $inokey;
 294                                 $first_cookiep = $curr_cookiep;
 295                         } else {
 296                                 $$first_cookiep->{num_ino}++;
 297                                 $$first_cookiep->{time} +=
 298                                     $$curr_cookiep->{time};
 299                                 if ($$curr_cookiep->{crtime} >
 300                                     $$first_cookiep->{crtime}) {
 301                                         $$first_cookiep->{crtime} =
 302                                             $$curr_cookiep->{crtime};
 303                                 }
 304                                 # Invalidate this cookie, less complicated and
 305                                 # more efficient than deleting it.
 306                                 $$curr_cookiep->{num_ino} = 0;
 307                         }
 308                 }
 309         }
 310
 311         # We define the timerange as the amount of time spent gathering the
 312         # various kstats, divided by our sleeptime. If we take a lot of time
 313         # to access the kstats, and then we create a delta comparing these
 314         # kstats with a prior set of kstats, that delta will cover
 315         # substaintially different amount of time depending upon which
 316         # interrupt or CPU is being examined.
 317         #
 318         # By checking the timerange here, we guarantee that any deltas
 319         # created from these kstats will contain self-consistent data,
 320         # in that all CPUs and interrupts cover a similar span of time.
 321         #
 322         # $timerange_toohi is the upper bound. Any timerange above
 323         # this is thrown out as garbage. If the stat is safely within this
 324         # bound, we treat the stat as representing an instant in time, rather
 325         # than the time range it actually spans. We arbitrarily choose minsnap
 326         # as the snaptime of the stat.
 327
 328         $stat{snaptime} = $minsnap;
 329         my $timerange = ($maxsnap - $minsnap) / $sleeptime;
 330         return (0) if ($timerange > $timerange_toohi);  # i.e. failure
 331         return (\%stat);
 332 }
 333
 334 #
 335 # dumpdelta takes a reference to our "delta" structure:
 336 # {"missing"}           "1" if the delta's component stats had inconsistencies
 337 # {"minsnap"}           time of the first kstat snaptime used in this delta
 338 # {"maxsnap"}           time of the last kstat snaptime used in this delta
 339 # {"goodness"}          cost function applied to this delta
 340 # {"avgintrload"}       avg of interrupt load across cpus, as a percentage
 341 # {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
 342 # {<cpuid>}             iterates over on-line cpus
 343 #  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
 344 #  ->{"tot"}            CPU load from all sources in nsec
 345 #  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
 346 #  ->{"intrload"}       intrs / tot
 347 #  ->{"ivecs"}
 348 #     ->{<ivec#>}       iterates over ivecs for this cpu
 349 #        ->{"time"}     time used by this interrupt (in nsec)
 350 #        ->{"pil"}      pil level of this interrupt
 351 #        ->{"ino"}      interrupt number (or base vector if MSI group)
 352 #        ->{"buspath"}  filename of the directory of the device's bus
 353 #        ->{"name"}     device name
 354 #        ->{"ihs"}      number of different handlers sharing this ino
 355 #        ->{"num_ino"}  number of interrupt vectors in MSI group
 356 #
 357 # It prints out the delta structure in a nice, human readable display.
 358 #
 359
 360 sub dumpdelta($)
 361 {
 362         my ($delta) = @_;
 363
 364         # print global info
 365
 366         syslog('debug', "dumpdelta:");
 367         syslog('debug', " RECONFIGURATION IN DELTA") if $delta->{missing} > 0;
 368         syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
 369                $delta->{avgintrload} * 100, $delta->{avgintrnsec});
 370         syslog('debug', "    goodness: %5.2f%%", $delta->{goodness} * 100)
 371             if exists($delta->{goodness});
 372
 373         # iterate over cpus
 374
 375         while (my ($cpu, $cpst) = each %$delta) {
 376                 next if !ref($cpst);            # skip non-cpuid entries
 377                 my $tot = $cpst->{tot};
 378                 syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
 379                        $cpu, $cpst->{intrload}*100, $cpst->{bigintr}*100/$tot);
 380                 syslog('debug', "        intrs %d, bigintr %d",
 381                        $cpst->{intrs}, $cpst->{bigintr});
 382
 383                 # iterate over ivecs on this cpu
 384
 385                 while (my ($ivec, $ivst) = each %{$cpst->{ivecs}}) {
 386                         syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
 387                             ($ivst->{ihs} > 1 ? "$ivst->{name}($ivst->{ihs})" :
 388                             $ivst->{name}), $ivec,
 389                             $ivst->{time}*100 / $tot, $ivst->{time});
 390                 }
 391         }
 392 }
 393
 394 #
 395 # generate_delta($stat, $newstat) takes two stat references, returned from
 396 # getstat(), and creates a %delta. %delta (not surprisingly) contains the
 397 # same basic info as stat and newstat, but with the timestamps as deltas
 398 # instead of absolute times. We return a reference to the delta.
 399 #
 400
 401 sub generate_delta($$)
 402 {
 403         my ($stat, $newstat) = @_;
 404
 405         my %delta = ();
 406         my $intrload;
 407         my $intrnsec;
 408         my $cpus;
 409
 410         # Take the worstcase timerange
 411         $delta{minsnap} = $stat->{snaptime};
 412         $delta{maxsnap} = $newstat->{snaptime};
 413         if (VERIFY($delta{maxsnap} > $delta{minsnap},
 414             "generate_delta: stats aren't ascending")) {
 415                 $delta{missing} = 1;
 416                 return (\%delta);
 417         }
 418
 419         # if there are a different number of cpus in the stats, set missing
 420
 421         $delta{missing} = (keys(%$stat) != keys(%$newstat));
 422         if (VERIFY($delta{missing} == 0,
 423             "generate_delta: number of CPUs changed")) {
 424                 return (\%delta);
 425         }
 426
 427         # scan through every cpu in %newstat and compare against %stat
 428
 429         while (my ($cpu, $newcpst) = each %$newstat) {
 430                 next if !ref($newcpst);         # skip non-cpuid fields
 431
 432                 # If %stat is missing a cpu from %newstat, then it was just
 433                 # onlined. Mark missing.
 434
 435                 if (VERIFY(exists $stat->{$cpu} &&
 436                     $stat->{$cpu}{crtime} == $newcpst->{crtime},
 437                     "generate_delta: cpu $cpu changed")) {
 438                         $delta{missing} = 1;
 439                         return (\%delta);
 440                 }
 441                 my $cpst = $stat->{$cpu};
 442                 $delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
 443                 if (VERIFY($delta{$cpu}{tot} >= 0,
 444                     "generate_delta: deltas are not ascending?")) {
 445                         $delta{missing} = 1;
 446                         delete($delta{$cpu});
 447                         return (\%delta);
 448                 }
 449                 # Avoid remote chance of division by zero
 450                 $delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
 451                 $delta{$cpu}{intrs} = 0;
 452                 $delta{$cpu}{bigintr} = 0;
 453
 454                 my %ivecs = ();
 455                 $delta{$cpu}{ivecs} = \%ivecs;
 456
 457                 # if the number of ivecs differs, set missing
 458
 459                 if (VERIFY(keys(%{$cpst->{ivecs}}) ==
 460                            keys(%{$newcpst->{ivecs}}),
 461                            "generate_delta: cpu $cpu has more/less".
 462                            " interrupts")) {
 463                         $delta{missing} = 1;
 464                         return (\%delta);
 465                 }
 466
 467                 while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {
 468
 469                         # Unused cookie, corresponding to an MSI vector which
 470                         # is part of a group.  The whole group is accounted for
 471                         # by a different cookie.
 472                         next if ($newivec->{num_ino} == 0);
 473
 474                         # If this ivec doesn't exist in $stat, or if $stat
 475                         # shows a different crtime, set missing.
 476                         if (VERIFY(exists $cpst->{ivecs}{$inum} &&
 477                                    $cpst->{ivecs}{$inum}{crtime} ==
 478                                    $newivec->{crtime},
 479                                    "generate_delta: cpu $cpu inum $inum".
 480                                    " has changed")) {
 481                                 $delta{missing} = 1;
 482                                 return (\%delta);
 483                         }
 484                         my $ivec = $cpst->{ivecs}{$inum};
 485
 486                         # Create $delta{$cpu}{ivecs}{$inum}.
 487
 488                         my %dltivec = ();
 489                         $delta{$cpu}{ivecs}{$inum} = \%dltivec;
 490
 491                         # calculate time used by this interrupt
 492
 493                         my $time = $newivec->{time} - $ivec->{time};
 494                         if (VERIFY($time >= 0,
 495                                    "generate_delta: ivec went backwards?")) {
 496                                 $delta{missing} = 1;
 497                                 delete($delta{$cpu}{ivecs}{$inum});
 498                                 return (\%delta);
 499                         }
 500                         $delta{$cpu}{intrs} += $time;
 501                         $dltivec{time} = $time;
 502                         if ($time > $delta{$cpu}{bigintr}) {
 503                                 $delta{$cpu}{bigintr} = $time;
 504                         }
 505
 506                         # Transfer over basic info about the kstat. We
 507                         # don't have to worry about discrepancies between
 508                         # ivec and newivec because we verified that both
 509                         # have the same crtime.
 510
 511                         $dltivec{pil} = $newivec->{pil};
 512                         $dltivec{ino} = $newivec->{ino};
 513                         $dltivec{buspath} = $newivec->{buspath};
 514                         $dltivec{name} = $newivec->{name};
 515                         $dltivec{ihs} = $newivec->{ihs};
 516                         $dltivec{num_ino} = $newivec->{num_ino};
 517                 }
 518                 if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
 519                         # Ewww! Hopefully just a rounding error.
 520                         # Make something up.
 521                         $delta{$cpu}{tot} = $delta{$cpu}{intrs};
 522                 }
 523                 $delta{$cpu}{intrload} =
 524                        $delta{$cpu}{intrs} / $delta{$cpu}{tot};
 525                 $intrload += $delta{$cpu}{intrload};
 526                 $intrnsec += $delta{$cpu}{intrs};
 527                 $cpus++;
 528         }
 529         if ($cpus > 0) {
 530                 $delta{avgintrload} = $intrload / $cpus;
 531                 $delta{avgintrnsec} = $intrnsec / $cpus;
 532         } else {
 533                 $delta{avgintrload} = 0;
 534                 $delta{avgintrnsec} = 0;
 535         }
 536         return (\%delta);
 537 }
 538
 539
  540 # compress_deltas() takes a list of deltas, and returns a single new delta
 541 # which represents the combined information from all the deltas. The deltas
 542 # provided are assumed to be sequential in time. The resulting compressed
 543 # delta looks just like any other delta. This new delta is also more accurate
 544 # since its statistics are averaged over a longer period than any of the
 545 # original deltas.
 546
 547 sub compress_deltas ($)
 548 {
 549         my ($deltas) = @_;
 550
 551         my %newdelta = ();
 552         my ($intrs, $tot);
 553         my $cpus = 0;
 554         my ($high_intrload) = 0;
 555
 556         if (VERIFY($#$deltas != -1,
 557                    "compress_deltas: list of delta is empty?")) {
 558                 return (0);
 559         }
 560         $newdelta{minsnap} = $deltas->[0]{minsnap};
 561         $newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
 562         $newdelta{missing} = 0;
 563
 564         foreach my $delta (@$deltas) {
 565                 if (VERIFY($delta->{missing} == 0,
 566                     "compressing bad deltas?")) {
 567                         return (0);
 568                 }
 569                 while (my ($cpuid, $cpu) = each %$delta) {
 570                         next if !ref($cpu);
 571
 572                         $intrs += $cpu->{intrs};
 573                         $tot += $cpu->{tot};
 574                         $newdelta{$cpuid}{intrs} += $cpu->{intrs};
 575                         $newdelta{$cpuid}{tot} += $cpu->{tot};
 576                         if (!exists $newdelta{$cpuid}{ivecs}) {
 577                                 my %ivecs = ();
 578                                 $newdelta{$cpuid}{ivecs} = \%ivecs;
 579                         }
 580                         while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
 581                                 my $newivecs = $newdelta{$cpuid}{ivecs};
 582                                 $newivecs->{$inum}{time} += $ivec->{time};
 583                                 $newivecs->{$inum}{pil} = $ivec->{pil};
 584                                 $newivecs->{$inum}{ino} = $ivec->{ino};
 585                                 $newivecs->{$inum}{buspath} = $ivec->{buspath};
 586                                 $newivecs->{$inum}{name} = $ivec->{name};
 587                                 $newivecs->{$inum}{ihs} = $ivec->{ihs};
 588                                 $newivecs->{$inum}{num_ino} = $ivec->{num_ino};
 589                         }
 590                 }
 591         }
 592         foreach my $cpu (values(%newdelta)) {
 593                 next if !ref($cpu); # ignore non-cpu fields
 594                 $cpus++;
 595
 596                 my $bigintr = 0;
 597                 foreach my $ivec (values(%{$cpu->{ivecs}})) {
 598                         if ($ivec->{time} > $bigintr) {
 599                                 $bigintr = $ivec->{time};
 600                         }
 601                 }
 602                 $cpu->{bigintr} = $bigintr;
 603                 $cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
 604                 if ($high_intrload < $cpu->{intrload}) {
 605                         $high_intrload = $cpu->{intrload};
 606                 }
 607                 $cpu->{tot} = 1 if $cpu->{tot} <= 0;
 608         }
 609         if ($cpus == 0) {
 610                 $newdelta{avgintrnsec} = 0;
 611                 $newdelta{avgintrload} = 0;
 612         } else {
 613                 $newdelta{avgintrnsec} = $intrs / $cpus;
 614                 $newdelta{avgintrload} = $intrs / $tot;
 615         }
 616         $sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
 617             $normal_sleeptime;
 618         return (\%newdelta);
 619 }
 620
 621
 622
 623
 624
 625 # What follow are the core functions responsible for examining the deltas
 626 # generated above and deciding what to do about them.
 627 #
  628 # goodness() and its helper goodness_cpu() return a heuristic which describes
 629 # how good (or bad) the current interrupt balance is. The value returned will
 630 # be between 0 and 1, with 0 representing maximum goodness, and 1 representing
 631 # maximum badness.
 632 #
 633 # imbalanced() compares a current and historical value of goodness, and
 634 # determines if there has been enough change to warrant evaluating a
 635 # reconfiguration of the interrupts
 636 #
 637 # do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
 638 # find_goal(), do_find_goal(), and move_intr(), are responsible for examining
 639 # a delta and determining the best possible assignment of interrupts to CPUs.
 640 #
 641 # It is important that do_reconfig() be in alignment with goodness(). If
 642 # do_reconfig were to generate a new interrupt distribution that worsened
 643 # goodness, we could get into a pathological loop with intrd fighting itself,
 644 # constantly deciding that things are imbalanced, and then changing things
 645 # only to make them worse.
 646
 647
 648
 649 # any goodness over $goodness_unsafe_load is considered really bad
 650 # goodness must drop by at least $goodness_mindelta for a reconfig
 651
 652 my $goodness_unsafe_load = .9;
 653 my $goodness_mindelta = .1;
 654
 655 # goodness(%delta) examines a delta and return its "goodness". goodness will
 656 # be between 0 (best) and 1 (major bad). goodness is determined by evaluating
 657 # the goodness of each individual cpu, and returning the worst case. This
 658 # helps on systems with many CPUs, where otherwise a single pathological CPU
 659 # might otherwise be ignored because the average was OK.
 660 #
 661 # To calculate the goodness of an individual CPU, we start by looking at its
 662 # load due to interrupts. If the load is above a certain high threshold and
 663 # there is more than one interrupt assigned to this CPU, we set goodness
 664 # to worst-case. If the load is below the average interrupt load of all CPUs,
 665 # then we return best-case, since what's to complain about?
 666 #
 667 # Otherwise we look at how much the load is above the average, and return
 668 # that as the goodness, with one caveat: we never return more than the CPU's
 669 # interrupt load ignoring its largest single interrupt source. This is
 670 # because a CPU with one high-load interrupt, and no other interrupts, is
 671 # perfectly balanced. Nothing can be done to improve the situation, and thus
 672 # it is perfectly balanced even if the interrupt's load is 100%.
 673
 674 sub goodness($)
 675 {
 676         my ($delta) = @_;
 677
 678         return (1) if $delta->{missing} > 0;
 679
 680         my $high_goodness = 0;
 681         my $goodness;
 682
 683         foreach my $cpu (values(%$delta)) {
 684                 next if !ref($cpu);             # skip non-cpuid fields
 685
 686                 $goodness = goodness_cpu($cpu, $delta->{avgintrload});
 687                 if (VERIFY($goodness >= 0 && $goodness <= 1,
 688                            "goodness: cpu goodness out of range?")) {
 689                         dumpdelta($delta);
 690                         return (1);
 691                 }
 692                 if ($goodness == 1) {
 693                         return (1);     # worst case, no need to continue
 694                 }
 695                 if ($goodness > $high_goodness) {
 696                         $high_goodness = $goodness;
 697                 }
 698         }
 699         return ($high_goodness);
 700 }
 701
 702 sub goodness_cpu($$)            # private function
 703 {
 704         my ($cpu, $avgintrload) = @_;
 705
 706         my $goodness;
 707         my $load = $cpu->{intrs} / $cpu->{tot};
 708
 709         return (0) if ($load < $avgintrload);   # low loads are perfectly good
 710
 711         # Calculate $load_no_bigintr, which represents the load
 712         # due to interrupts, excluding the one biggest interrupt.
 713         # This is the most gain we can get on this CPU from
 714         # offloading interrupts.
 715
 716         my $load_no_bigintr = ($cpu->{intrs} - $cpu->{bigintr}) / $cpu->{tot};
 717
 718         # A major imbalance is indicated if a CPU is saturated
 719         # with interrupt handling, and it has more than one
 720         # source of interrupts. Those other interrupts could be
 721         # starved if of a lower pil. Return a goodness of 1,
 722         # which is the worst possible return value,
 723         # which will effectively contaminate this entire delta.
 724
 725         my $cnt = keys(%{$cpu->{ivecs}});
 726
 727         if ($load > $goodness_unsafe_load && $cnt > 1) {
 728                 return (1);
 729         }
 730         $goodness = $load - $avgintrload;
 731         if ($goodness > $load_no_bigintr) {
 732                 $goodness = $load_no_bigintr;
 733         }
 734         return ($goodness);
 735 }
 736
 737
 738 # imbalanced() is used by the main routine to determine if the goodness
 739 # has shifted far enough from our last baseline to warrant a reassignment
 740 # of interrupts. A very high goodness indicates that a CPU is way out of
 741 # whack. If the goodness has varied too much since the baseline, then
 742 # perhaps a reconfiguration is worth considering.
 743
 744 sub imbalanced ($$)
 745 {
 746         my ($goodness, $baseline) = @_;
 747
 748         # Return 1 if we are pathological, or creeping away from the baseline
 749
 750         return (1) if $goodness > .50;
 751         return (1) if abs($goodness - $baseline) > $goodness_mindelta;
 752         return (0);
 753 }
 754
 755 # do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
 756 # decision-making functions responsible for generating a new interrupt
 757 # distribution. They are designed with the definition of goodness() in
 758 # mind, i.e. they use the same definition of "good distribution" as does
 759 # goodness().
 760 #
 761 # do_reconfig() is responsible for deciding whether a redistribution is
 762 # actually warranted. If the goodness is already pretty good, it doesn't
 763 # waste the CPU time to generate a new distribution. If it
 764 # calculates a new distribution and finds that it is not sufficiently
 765 # improved from the prior distribution, it will not do the redistribution,
 766 # mainly to avoid the disruption to system performance caused by
 767 # rejuggling interrupts.
 768 #
 769 # Its main loop works by going through a list of cpus sorted from
 770 # highest to lowest interrupt load. It removes the highest-load cpus
 771 # one at a time and hands them off to do_reconfig_cpu(). This function
 772 # then re-sorts the remaining CPUs from lowest to highest interrupt load,
 773 # and one at a time attempts to rejuggle interrupts between the original
 774 # high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
 775 # considered finished as soon as its interrupt load is within
 776 # $goodness_mindelta of the average interrupt load. Such a CPU will have
 777 # a goodness of below the $goodness_mindelta threshold.
 778
 779 #
 780 # move_intr(\%delta, $inum, $oldcpu, $newcpu)
 781 # used by reconfiguration code to move an interrupt between cpus within
 782 # a delta. This manipulates data structures, and does not actually move
 783 # the interrupt on the running system.
 784 #
 785 sub move_intr($$$$)             # private function
 786 {
 787         my ($delta, $inum, $oldcpuid, $newcpuid) = @_;
 788
 789         my $ivec = $delta->{$oldcpuid}{ivecs}{$inum};
 790
 791         # Remove ivec from old cpu
 792
 793         my $oldcpu = $delta->{$oldcpuid};
 794         $oldcpu->{intrs} -= $ivec->{time};
 795         $oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot};
 796         delete($oldcpu->{ivecs}{$inum});
 797
 798         VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?");
 799         VERIFY($ivec->{time} <= $oldcpu->{bigintr},
 800                "move_intr: intr's time > bigintr?");
 801
 802         if ($ivec->{time} >= $oldcpu->{bigintr}) {
 803                 my $bigtime = 0;
 804
 805                 foreach my $ivec (values(%{$oldcpu->{ivecs}})) {
 806                         $bigtime = $ivec->{time} if $ivec->{time} > $bigtime;
 807                 }
 808                 $oldcpu->{bigintr} = $bigtime;
 809         }
 810
 811         # Add ivec onto new cpu
 812
 813         my $newcpu = $delta->{$newcpuid};
 814
 815         $ivec->{nowcpu} = $newcpuid;
 816         $newcpu->{intrs} += $ivec->{time};
 817         $newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot};
 818         $newcpu->{ivecs}{$inum} = $ivec;
 819
 820         $newcpu->{bigintr} = $ivec->{time}
 821                 if $ivec->{time} > $newcpu->{bigintr};
 822 }
 823
 824 sub move_intr_check($$$)        # private function
 825 {
 826         my ($delta, $oldcpuid, $newcpuid) = @_;
 827
 828         VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs},
 829                "Moved interrupts left 100+%% load on src cpu");
 830         VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs},
 831                "Moved interrupts left 100+%% load on tgt cpu");
 832 }
 833
 834 sub ivecs_to_string(@)          # private function
 835 {
 836         my $str = "";
 837         foreach my $ivec (@_) {
 838                 $str = "$str $ivec->{inum}";
 839         }
 840         return ($str);
 841 }
 842
 843
 844 sub do_reconfig($)
 845 {
 846         my ($delta) = @_;
 847
 848         my $goodness = $delta->{goodness};
 849
 850         # We can't improve goodness to better than 0. We should stop here
 851         # if, even if we achieve a goodness of 0, the improvement is still
 852         # too small to merit the action.
 853
 854         if ($goodness - 0 < $goodness_mindelta) {
 855                 syslog('debug', "goodness good enough, don't reconfig");
 856                 return (0);
 857         }
 858
 859         syslog('notice', "Optimizing interrupt assignments");
 860
 861         if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
 862             "have a delta with missing")) {
 863                 return (-1);
 864         }
 865
 866         # Make a list of all cpuids, and also add some extra information
 867         # to the ivec structures.
 868
 869         my @cpusortlist = ();
 870
 871         while (my ($cpuid, $cpu) = each %$delta) {
 872                 next if !ref($cpu);     # skip non-cpu entries
 873
 874                 push(@cpusortlist, $cpuid);
 875                 while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
 876                         $ivec->{origcpu} = $cpuid;
 877                         $ivec->{nowcpu} = $cpuid;
 878                         $ivec->{inum} = $inum;
 879                 }
 880         }
 881
 882         # Sort the list of CPUs from highest to lowest interrupt load.
 883         # Remove the top CPU from that list and attempt to redistribute
 884         # its interrupts. If the CPU has a goodness below a threshold,
 885         # just ignore the CPU and move to the next one. If the CPU's
 886         # load falls below the average load plus that same threshold,
 887         # then there are no CPUs left worth reconfiguring, and we're done.
 888
 889         while (@cpusortlist) {
 890                 # Re-sort cpusortlist each time, since do_reconfig_cpu can
 891                 # move interrupts around.
 892
 893                 @cpusortlist =
 894                     sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
 895                     @cpusortlist);
 896
 897                 my $cpu = shift(@cpusortlist);
 898                 if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) &&
 899                     ($delta->{$cpu}{intrload} <=
 900                     $delta->{avgintrload} + $goodness_mindelta)) {
 901                         syslog('debug', "finished reconfig: cpu $cpu load ".
 902                             "$delta->{$cpu}{intrload} avgload ".
 903                             "$delta->{avgintrload}");
 904                         last;
 905                 }
 906                 if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) <
 907                     $goodness_mindelta) {
 908                         next;
 909                 }
 910                 do_reconfig_cpu($delta, \@cpusortlist, $cpu);
 911         }
 912
 913         # How good a job did we do? If the improvement was minimal, and
 914         # our goodness wasn't pathological (and thus needing any help it
 915         # can get), then don't bother moving the interrupts.
 916
 917         my $newgoodness = goodness($delta);
 918         VERIFY($newgoodness <= $goodness,
 919                "reconfig: result has worse goodness?");
 920
 921         if (($goodness != 1 || $newgoodness == 1) &&
 922             $goodness - $newgoodness < $goodness_mindelta) {
 923                 syslog('debug', "goodness already near optimum, ".
 924                        "don't reconfig");
 925                 return (0);
 926         }
 927         syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
 928                $newgoodness*100);
 929
 930         # Time to move those interrupts!
 931
 932         my $ret = 1;
 933         my $warned = 0;
 934         while (my ($cpuid, $cpu) = each %$delta) {
 935                 next if $cpuid =~ /\D/;
 936                 while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
 937                         next if ($ivec->{origcpu} == $cpuid);
 938
 939                         if (!intrmove($ivec->{buspath}, $ivec->{origcpu},
 940                             $ivec->{ino}, $cpuid, $ivec->{num_ino})) {
 941                                 syslog('warning', "Unable to move interrupts")
 942                                     if $warned++ == 0;
 943                                 syslog('debug', "Unable to move buspath ".
 944                                     "$ivec->{buspath} ino $ivec->{ino} to ".
 945                                     "cpu $cpuid");
 946                                 $ret = -1;
 947                         }
 948                 }
 949         }
 950
 951         syslog('notice', "Interrupt assignments optimized");
 952         return ($ret);
 953 }
 954
 955 sub do_reconfig_cpu($$$)        # private function
 956 {
 957         my ($delta, $cpusortlist, $oldcpuid) = @_;
 958
 959         # We have been asked to rejuggle interrupts between $oldcpuid and
 960         # other CPUs found on $cpusortlist so as to improve the load on
 961         # $oldcpuid. We reverse $cpusortlist to get our own copy of the
 962         # list, sorted from lowest to highest interrupt load. One at a
 963         # time, shift a CPU off of this list of CPUs, and attempt to
 964         # rejuggle interrupts between the two CPUs. Don't do this if the
 965         # other CPU has a higher load than oldcpuid. We're done rejuggling
 966         # once $oldcpuid's goodness falls below a threshold.
 967
 968         syslog('debug', "reconfiguring $oldcpuid");
 969
 970         my $cpu = $delta->{$oldcpuid};
 971         my $avgintrload = $delta->{avgintrload};
 972
 973         my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list
 974         while ($#cputargetlist != -1) {
 975                 last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta;
 976
 977                 my $tgtcpuid = shift(@cputargetlist);
 978                 my $tgt = $delta->{$tgtcpuid};
 979                 my $load = $cpu->{intrload};
 980                 my $tgtload = $tgt->{intrload};
 981                 last if $tgtload > $load;
 982                 do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load);
 983         }
 984 }
 985
 986 sub do_reconfig_cpu2cpu($$$$)   # private function
 987 {
 988         my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;
 989
 990         # We've been asked to consider interrupt juggling between srccpuid
 991         # (with a high interrupt load) and tgtcpuid (with a lower interrupt
 992         # load). First, make a single list with all of the ivecs from both
 993         # CPUs, and sort the list from highest to lowest load.
 994
 995         syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");
 996
 997         # Gather together all the ivecs and sort by load
 998
 999         my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}),
1000             values(%{$delta->{$tgtcpuid}{ivecs}}));
1001         return if $#ivecs == -1;
1002
1003         @ivecs = sort({$b->{time} <=> $a->{time}} @ivecs);
1004
1005         # Our "goal" load for srccpuid is the average load across all CPUs.
1006         # find_goal() will find determine the optimum selection of the
1007         # available interrupts which comes closest to this goal without
1008         # falling below the goal.
1009
1010         my $goal = $delta->{avgintrnsec};
1011
1012         # We know that the interrupt load on tgtcpuid is less than that on
1013         # srccpuid, but its load could still be above avgintrnsec. Don't
1014         # choose a goal which would bring srccpuid below the load on tgtcpuid.
1015
1016         my $avgnsec =
1017             ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
1018         if ($goal < $avgnsec) {
1019                 $goal = $avgnsec;
1020         }
1021
1022         # If the largest of the interrupts is on srccpuid, leave it there.
1023         # This can help minimize the disruption caused by moving interrupts.
1024
1025         if ($ivecs[0]->{origcpu} == $srccpuid) {
1026                 syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
1027                 $goal -= $ivecs[0]->{time};
1028                 shift(@ivecs);
1029         }
1030
1031         syslog('debug', "GOAL: inums should total $goal");
1032         find_goal(\@ivecs, $goal);
1033
1034         # find_goal() returned its results to us by setting $ivec->{goal} if
1035         # the ivec should be on srccpuid, or clearing it for tgtcpuid.
1036         # Call move_intr() to update our $delta with the new results.
1037
1038         foreach my $ivec (@ivecs) {
1039                 syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
1040                 VERIFY($ivec->{nowcpu} == $srccpuid ||
1041                     $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
1042                     "interrupt not currently on src or tgt cpu");
1043
1044                 if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) {
1045                         move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
1046                             $srccpuid);
1047                 } elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) {
1048                         move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
1049                             $tgtcpuid);
1050                 }
1051         }
1052         move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts
1053
1054         my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
1055         VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
1056             "cpu2cpu: new load didn't end up in expected range");
1057 }
1058
1059
1060 # find_goal() and its helper do_find_goal() are used to find the best
1061 # combination of interrupts in order to generate a load that is as close
1062 # as possible to a goal load without falling below that goal. Before returning
1063 # to its caller, find_goal() sets a new value in the hash of each interrupt,
1064 # {goal}, which if set signifies that this interrupt is one of the interrupts
1065 # identified as part of the set of interrupts which best meet the goal.
1066 #
1067 # The arguments to find_goal are a list of ivecs (hash references), sorted
1068 # by descending {time}, and the goal load. The goal is relative to {time}.
1069 # The best fit is determined by performing a depth-first search. do_find_goal
1070 # is the recursive subroutine which carries out the search.
1071 #
1072 # It is passed an index as an argument, originally 0. On a given invocation,
1073 # it is only to consider interrupts in the ivecs array starting at that index.
1074 # It then considers two possibilities:
1075 #   1) What is the best goal-fit if I include ivecs[index]?
1076 #   2) What is the best goal-fit if I exclude ivecs[index]?
1077 # To determine case 1, it subtracts the load of ivecs[index] from the goal,
1078 # and calls itself recursively with that new goal and index++.
1079 # To determine case 2, it calls itself recursively with the same goal and
1080 # index++.
1081 #
1082 # It then compares the two results, decides which one best meets the goals,
1083 # and returns the result. The return value is the best-fit's interrupt load,
1084 # followed by a list of all the interrupts which make up that best-fit.
1085 #
1086 # As an optimization, a second array loads[] is created which mirrors ivecs[].
1087 # loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
1088 # by do_find_goal to avoid recursing all the way to the end of the ivecs
1089 # array if including all remaining interrupts will still leave the best-fit
1090 # at below goal load. If so, it then includes all remaining interrupts on
1091 # the goal list and returns.
1092 #
sub find_goal($$)               # private function
{
        my ($ivecs, $goal) = @_;

        # The ivecs selected as the best fit. For a goal of zero or less
        # the empty set is already the best fit, and no search is made.
        my @goals = ();

        if ($goal > 0) {
                syslog('debug', "finding goal from intrs %s",
                    ivecs_to_string(@$ivecs));

                # Build @loads, where $loads[i] is the summed {time} of
                # ivecs[i] and everything after it: start from the grand
                # total and peel off one ivec at a time.

                my $tot = 0;
                $tot += $_->{time} foreach @$ivecs;

                my @loads = map {
                        my $from_here = $tot;
                        $tot -= $_->{time};
                        $from_here;
                } @$ivecs;

                my $load;
                ($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
                VERIFY($load >= $goal, "find_goal didn't meet goals");
        }
        syslog('debug', "goals found: %s", ivecs_to_string(@goals));

        # Walk the ivecs in order and set {goal} on those that appear on
        # @goals, clearing it on all others. @goals is consumed from the
        # front as its entries are matched.

        foreach my $ivec (@$ivecs) {
                my $selected = (@goals && $ivec == $goals[0]);

                if ($selected) {
                        syslog('debug', "inum $ivec->{inum} on source cpu");
                        $ivec->{goal} = 1;
                        shift(@goals);
                } else {
                        syslog('debug', "inum $ivec->{inum} on target cpu");
                        $ivec->{goal} = 0;
                }
        }
}
1136
1137
#
# do_find_goal(\@ivecs, \@loads, $goal, $idx)
# Recursive helper for find_goal(). Considers only ivecs[$idx] and later.
# Returns a list: the load of the best fit found, followed by the ivecs
# which make up that fit.
#
sub do_find_goal($$$$)          # private function
{
        my ($ivecs, $loads, $goal, $idx) = @_;

        my $lastidx = $#{$ivecs};

        # Past the end of the list: nothing left to contribute.
        return (0) if $idx > $lastidx;

        syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

        my $load = $ivecs->[$idx]{time};
        my $remaining = $loads->[$idx];

        # If taking every remaining ivec still does not exceed the goal,
        # there is nothing better to be found: return $idx and all
        # subsequent ivecs without searching any further.

        if ($remaining <= $goal) {
                my @rest = @$ivecs[$idx .. $lastidx];

                syslog('debug',
                    "$idx: including all remaining intrs %s with load %d",
                    ivecs_to_string(@rest),
                    $remaining);
                return ($remaining, @rest);
        }

        # The "with" option: the best fit that includes $ivecs->[$idx].
        # When this ivec alone already reaches the goal, adding further
        # interrupts could only take us further from the goal, so the
        # search stops with just this one.

        my $with = $load;
        my @goals_with = ();

        if ($goal > $load) {
                ($with, @goals_with) =
                    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
                $with += $load;
        }
        syslog('debug', "$idx: with-load $with intrs %s",
               ivecs_to_string($ivecs->[$idx], @goals_with));

        # The "without" option: the best fit that excludes $ivecs->[$idx].

        my ($without, @goals_without) =
            do_find_goal($ivecs, $loads, $goal, $idx + 1);
        syslog('debug', "$idx: without-load $without intrs %s",
               ivecs_to_string(@goals_without));

        # Pick between the two options:
        #   - only one reaches the goal: take that one
        #   - both reach the goal:       take the smaller
        #   - neither reaches the goal:  take the greater
        # On a tie, "with" is preferred.

        my $with_ok = ($with >= $goal);
        my $without_ok = ($without >= $goal);
        my $take_without;

        if ($with_ok && $without_ok) {
                $take_without = ($without < $with);
        } elsif (!$with_ok && !$without_ok) {
                $take_without = ($without > $with);
        } else {
                $take_without = $without_ok;
        }

        # Hand back the load of the winning option, followed by all the
        # ivecs which compose it.

        if ($take_without) {
                syslog('debug', "$idx: going without");
                return ($without, @goals_without);
        }

        syslog('debug', "$idx: going with");
        return ($with, $ivecs->[$idx], @goals_with);
}
1217
1218
1219
1220
syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State carried from one pass of the main loop to the next.

my @deltas = ();
my $deltas_tottime = 0;         # sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;

my $do_reconfig;

# Scratch variables for the main loop.

my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);


# Signals merely set a flag, which the main loop polls; this way we never
# die in the middle of retargeting.

my $gotsig = 0;
$SIG{INT} = sub { $gotsig = 1; };
$SIG{$_} = $SIG{INT} foreach ('HUP', 'TERM');

# Statistics come from the real kstats unless the scenario simulator is
# in use, in which case the simulator supplies them via myks_update().

my $ks = ($using_scengen == 0)
    ? Sun::Solaris::Kstat->new()
    : myks_update();

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator. But
# there's nothing an administrator can do. So print out a message for SMF
# logs and silently pause until a signal tells us to go away.

unless (exists($ks->{pci_intrs})) {
        print STDERR "$cmdname: no interrupts were found; ".
            "your PCI bus may not yet be supported\n";
        while ($gotsig == 0) {
                pause();
        }
        exit 0;
}

# See if this is a system with a pcplusmp APIC.
# Such systems will get special handling.
# Assume that if one bus has a pcplusmp APIC that they all do.

# Pick one entry out of the pci_intrs kstats.
my @elem = values(%{$ks->{pci_intrs}});
my $elem0 = $elem[0];
my $elemval = (values(%$elem0))[0];

# Use its buspath to query the system. It is assumed that either all or
# none of the busses on a system are hosted by the pcplusmp APIC or APIX.
my $pcplusmp_sys = is_apic($elemval->{buspath});

# Initial sample, against which the first delta will be computed.
my $stat = getstat($ks, $pcplusmp_sys);
1281
# clear_deltas() throws away all accumulated statistics so that the main
# loop starts over from scratch.

sub clear_deltas {
        @deltas = ();
        $deltas_tottime = 0;
        $stat = 0;   # prevent next gen_delta() from setting {missing}
}

while (1) {
        # 1. Sleep, update the kstats, and save the new stats in $newstat.

        exit 0 if $gotsig;              # if we got ^C / SIGTERM, exit
        if ($using_scengen == 0) {
                sleep($sleeptime);
                exit 0 if $gotsig;      # if we got ^C / SIGTERM, exit
                $ks->update();
        } else {
                $ks = myks_update();
        }
        $newstat = getstat($ks, $pcplusmp_sys);

        # $stat or $newstat could be zero if they're uninitialized, or if
        # getstat() failed. With no usable $newstat we just go around
        # again, hoping the problem will clear up. With no usable $stat,
        # $newstat takes its place and we go around again to collect a
        # second sample.

        ref($newstat) or next;
        unless (ref($stat)) {
                $stat = $newstat;
                next;
        }

        # 2. Compare $newstat with the prior set of values, result in %$delta.

        $delta = generate_delta($stat, $newstat);
        dumpdelta($delta) if $debug;    # Dump most recent stats to stdout.
        $stat = $newstat;       # The new stats now become the old stats.


        # 3. If $delta->{missing}, then there has been a reconfiguration of
        # either cpus or interrupts (probably both). We need to toss out our
        # old set of statistics and start from scratch.
        #
        # Also, if the delta covers a very long range of time, then we've
        # been experiencing a system overload that has resulted in intrd
        # not being allowed to run effectively for a while now. As above,
        # toss our old statistics and start from scratch.

        $deltatime = $delta->{maxsnap} - $delta->{minsnap};
        if ($deltatime > $statslen || $delta->{missing} > 0) {
                clear_deltas();
                syslog('debug', "evaluating interrupt assignments");
                next;
        }


        # 4. Incorporate the new delta into the list of deltas, and the
        # associated statistics. If this delta is the one that takes the
        # accumulated time from below $statslen to $statslen or more, then
        # it's time to evaluate a reconfiguration.

        $below_statslen = ($deltas_tottime < $statslen);
        $deltas_tottime += $deltatime;
        $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
        push(@deltas, $delta);

        # 5. Remove old deltas if total time is more than $statslen. We use
        # @deltas as a moving average of the last $statslen seconds. Shift
        # off the oldest deltas, but only if that doesn't cause us to fall
        # below $statslen seconds.

        while (scalar(@deltas) > 1) {
                $olddelta = $deltas[0];
                $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
                $newtime = $deltas_tottime - $olddeltatime;
                if ($newtime < $statslen) {
                        last;
                }

                shift(@deltas);
                $deltas_tottime = $newtime;
        }

        # 6. The brains of the operation are here. First, check if we're
        # imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
        # either because of imbalance or above in step 4, we evaluate a
        # new configuration.
        #
        # First, take @deltas and generate a single "compressed" delta
        # which summarizes them all. Pass that to do_reconfig and see
        # what it does with it:
        #
        # $ret == -1 : failure
        # $ret ==  0 : current config is optimal (or close enough)
        # $ret ==  1 : reconfiguration has occurred
        #
        # If $ret is -1 or 1, dump all our deltas and start from scratch.
        # Step 4 above will set do_reconfig soon thereafter.
        #
        # If $ret is 0, then nothing has happened because we're already
        # good enough. Set baseline_goodness to current goodness.

        $compdelta = compress_deltas(\@deltas);
        if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
                clear_deltas();
                next;
        }
        $goodness = goodness($compdelta);
        $compdelta->{goodness} = $goodness;
        dumpdelta($compdelta) if $debug;

        syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

        # An imbalance only counts once a full $statslen seconds of
        # statistics have been gathered.

        if ($deltas_tottime >= $statslen &&
            imbalanced($goodness, $baseline_goodness)) {
                $do_reconfig = 1;
        }

        if ($do_reconfig) {
                $ret = do_reconfig($compdelta);

                if ($ret == 0) {
                        syslog('debug', "setting new baseline of $goodness");
                        $baseline_goodness = $goodness;
                } else {
                        clear_deltas();
                        syslog('debug', "do_reconfig FAILED!") if $ret == -1;
                }
        }
        syslog('debug', "---------------------------------------");
}
1406         syslog('debug', "---------------------------------------");
1407 }