recipes/linux/linux-efika-2.6.20.20/sched-cfs-v9-v2.6.20.11.patch

   1 This is the Completely Fair Scheduler (CFS) v9 patch for
   2 linux 2.6.20.10 (rediffed cleanly against .11).
   3
   4 http://people.redhat.com/mingo/cfs-scheduler/
   5
   6 Index: linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
   7 ===================================================================
   8 --- linux-cfs-2.6.20.8.q.orig/Documentation/kernel-parameters.txt
   9 +++ linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
  10 @@ -914,49 +914,6 @@ and is between 256 and 4096 characters.
  11
  12         mga=            [HW,DRM]
  13
  14 -       migration_cost=
  15 -                       [KNL,SMP] debug: override scheduler migration costs
  16 -                       Format: <level-1-usecs>,<level-2-usecs>,...
  17 -                       This debugging option can be used to override the
  18 -                       default scheduler migration cost matrix. The numbers
  19 -                       are indexed by 'CPU domain distance'.
  20 -                       E.g. migration_cost=1000,2000,3000 on an SMT NUMA
  21 -                       box will set up an intra-core migration cost of
  22 -                       1 msec, an inter-core migration cost of 2 msecs,
  23 -                       and an inter-node migration cost of 3 msecs.
  24 -
  25 -                       WARNING: using the wrong values here can break
  26 -                       scheduler performance, so it's only for scheduler
  27 -                       development purposes, not production environments.
  28 -
  29 -       migration_debug=
  30 -                       [KNL,SMP] migration cost auto-detect verbosity
  31 -                       Format=<0|1|2>
  32 -                       If a system's migration matrix reported at bootup
  33 -                       seems erroneous then this option can be used to
  34 -                       increase verbosity of the detection process.
  35 -                       We default to 0 (no extra messages), 1 will print
  36 -                       some more information, and 2 will be really
  37 -                       verbose (probably only useful if you also have a
  38 -                       serial console attached to the system).
  39 -
  40 -       migration_factor=
  41 -                       [KNL,SMP] multiply/divide migration costs by a factor
  42 -                       Format=<percent>
  43 -                       This debug option can be used to proportionally
  44 -                       increase or decrease the auto-detected migration
  45 -                       costs for all entries of the migration matrix.
  46 -                       E.g. migration_factor=150 will increase migration
  47 -                       costs by 50%. (and thus the scheduler will be less
  48 -                       eager migrating cache-hot tasks)
  49 -                       migration_factor=80 will decrease migration costs
  50 -                       by 20%. (thus the scheduler will be more eager to
  51 -                       migrate tasks)
  52 -
  53 -                       WARNING: using the wrong values here can break
  54 -                       scheduler performance, so it's only for scheduler
  55 -                       development purposes, not production environments.
  56 -
  57         mousedev.tap_time=
  58                         [MOUSE] Maximum time between finger touching and
  59                         leaving touchpad surface for touch to be considered
  60 Index: linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
  61 ===================================================================
  62 --- /dev/null
  63 +++ linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
  64 @@ -0,0 +1,107 @@
  65 +[announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]
  66 +
  67 +i'm pleased to announce the first release of the "Modular Scheduler Core
  68 +and Completely Fair Scheduler [CFS]" patchset:
  69 +
  70 +   http://redhat.com/~mingo/cfs-scheduler/
  71 +
  72 +This project is a complete rewrite of the Linux task scheduler. My goal
  73 +is to address various feature requests and to fix deficiencies in the
  74 +vanilla scheduler that were suggested/found in the past few years, both
  75 +for desktop scheduling and for server scheduling workloads.
  76 +
  77 +[ QuickStart: apply the patch, recompile, reboot. The new scheduler
  78 +  will be active by default and all tasks will default to the
  79 +  SCHED_NORMAL interactive scheduling class. ]
  80 +
  81 +Highlights are:
  82 +
  83 + - the introduction of Scheduling Classes: an extensible hierarchy of
  84 +   scheduler modules. These modules encapsulate scheduling policy
  85 +   details and are handled by the scheduler core without the core
   86 +   code assuming too much about them.
  87 +
  88 + - sched_fair.c implements the 'CFS desktop scheduler': it is a
  89 +   replacement for the vanilla scheduler's SCHED_OTHER interactivity
  90 +   code.
  91 +
  92 +   i'd like to give credit to Con Kolivas for the general approach here:
  93 +   he has proven via RSDL/SD that 'fair scheduling' is possible and that
  94 +   it results in better desktop scheduling. Kudos Con!
  95 +
  96 +   The CFS patch uses a completely different approach and implementation
  97 +   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
  98 +   that of RSDL/SD, which is a high standard to meet :-) Testing
  99 +   feedback is welcome to decide this one way or another. [ and, in any
 100 +   case, all of SD's logic could be added via a kernel/sched_sd.c module
 101 +   as well, if Con is interested in such an approach. ]
 102 +
 103 +   CFS's design is quite radical: it does not use runqueues, it uses a
 104 +   time-ordered rbtree to build a 'timeline' of future task execution,
 105 +   and thus has no 'array switch' artifacts (by which both the vanilla
 106 +   scheduler and RSDL/SD are affected).
 107 +
 108 +   CFS uses nanosecond granularity accounting and does not rely on any
 109 +   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
 110 +   'timeslices' and has no heuristics whatsoever. There is only one
 111 +   central tunable:
 112 +
 113 +         /proc/sys/kernel/sched_granularity_ns
 114 +
 115 +   which can be used to tune the scheduler from 'desktop' (low
 116 +   latencies) to 'server' (good batching) workloads. It defaults to a
 117 +   setting suitable for desktop workloads. SCHED_BATCH is handled by the
 118 +   CFS scheduler module too.
 119 +
 120 +   due to its design, the CFS scheduler is not prone to any of the
 121 +   'attacks' that exist today against the heuristics of the stock
 122 +   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
 123 +   work fine and do not impact interactivity and produce the expected
 124 +   behavior.
 125 +
 126 +   the CFS scheduler has a much stronger handling of nice levels and
 127 +   SCHED_BATCH: both types of workloads should be isolated much more
  128 +   aggressively than under the vanilla scheduler.
 129 +
  130 +   ( another detail: due to nanosec accounting and timeline sorting,
 131 +     sched_yield() support is very simple under CFS, and in fact under
 132 +     CFS sched_yield() behaves much better than under any other
 133 +     scheduler i have tested so far. )
 134 +
 135 + - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
 136 +   way than the vanilla scheduler does. It uses 100 runqueues (for all
 137 +   100 RT priority levels, instead of 140 in the vanilla scheduler)
 138 +   and it needs no expired array.
 139 +
 140 + - reworked/sanitized SMP load-balancing: the runqueue-walking
 141 +   assumptions are gone from the load-balancing code now, and
 142 +   iterators of the scheduling modules are used. The balancing code got
 143 +   quite a bit simpler as a result.
 144 +
 145 +the core scheduler got smaller by more than 700 lines:
 146 +
 147 + kernel/sched.c | 1454 ++++++++++++++++------------------------------------------------
 148 + 1 file changed, 372 insertions(+), 1082 deletions(-)
 149 +
 150 +and even adding all the scheduling modules, the total size impact is
 151 +relatively small:
 152 +
 153 + 18 files changed, 1454 insertions(+), 1133 deletions(-)
 154 +
 155 +most of the increase is due to extensive comments. The kernel size
 156 +impact is in fact a small negative:
 157 +
 158 +   text    data     bss     dec     hex filename
 159 +  23366    4001      24   27391    6aff kernel/sched.o.vanilla
 160 +  24159    2705      56   26920    6928 kernel/sched.o.CFS
 161 +
 162 +(this is mainly due to the benefit of getting rid of the expired array
 163 +and its data structure overhead.)
 164 +
 165 +thanks go to Thomas Gleixner and Arjan van de Ven for review of this
 166 +patchset.
 167 +
 168 +as usual, any sort of feedback, bugreports, fixes and suggestions are
 169 +more than welcome,
 170 +
 171 +       Ingo
 172 Index: linux-cfs-2.6.20.8.q/Makefile
 173 ===================================================================
 174 --- linux-cfs-2.6.20.8.q.orig/Makefile
 175 +++ linux-cfs-2.6.20.8.q/Makefile
 176 @@ -1,7 +1,7 @@
 177  VERSION = 2
 178  PATCHLEVEL = 6
 179  SUBLEVEL = 20
 180 -EXTRAVERSION = .11
 181 +EXTRAVERSION = .11-cfs-v9
 182  NAME = Homicidal Dwarf Hamster
 183
 184  # *DOCUMENTATION*
 185 Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
 186 ===================================================================
 187 --- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/smpboot.c
 188 +++ linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
 189 @@ -1132,18 +1132,6 @@ exit:
 190  }
 191  #endif
 192
 193 -static void smp_tune_scheduling(void)
 194 -{
 195 -       unsigned long cachesize;       /* kB   */
 196 -
 197 -       if (cpu_khz) {
 198 -               cachesize = boot_cpu_data.x86_cache_size;
 199 -
 200 -               if (cachesize > 0)
 201 -                       max_cache_size = cachesize * 1024;
 202 -       }
 203 -}
 204 -
 205  /*
 206   * Cycle through the processors sending APIC IPIs to boot each.
 207   */
 208 @@ -1172,7 +1160,6 @@ static void __init smp_boot_cpus(unsigne
 209         x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
 210
 211         current_thread_info()->cpu = 0;
 212 -       smp_tune_scheduling();
 213
 214         set_cpu_sibling_map(0);
 215
 216 Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
 217 ===================================================================
 218 --- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/syscall_table.S
 219 +++ linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
 220 @@ -319,3 +319,4 @@ ENTRY(sys_call_table)
 221         .long sys_move_pages
 222         .long sys_getcpu
 223         .long sys_epoll_pwait
 224 +       .long sys_sched_yield_to        /* 320 */
 225 Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
 226 ===================================================================
 227 --- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/tsc.c
 228 +++ linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
 229 @@ -61,6 +61,8 @@ static inline int check_tsc_unstable(voi
 230
 231  void mark_tsc_unstable(void)
 232  {
 233 +       sched_clock_unstable_event();
 234 +
 235         tsc_unstable = 1;
 236  }
 237  EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 238 @@ -107,13 +109,7 @@ unsigned long long sched_clock(void)
 239  {
 240         unsigned long long this_offset;
 241
 242 -       /*
 243 -        * in the NUMA case we dont use the TSC as they are not
 244 -        * synchronized across all CPUs.
 245 -        */
 246 -#ifndef CONFIG_NUMA
 247 -       if (!cpu_khz || check_tsc_unstable())
 248 -#endif
 249 +       if (!cpu_khz || !cpu_has_tsc)
 250                 /* no locking but a rare wrong value is not a big deal */
 251                 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 252
 253 Index: linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
 254 ===================================================================
 255 --- linux-cfs-2.6.20.8.q.orig/arch/ia64/kernel/setup.c
 256 +++ linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
 257 @@ -773,7 +773,6 @@ static void __cpuinit
 258  get_max_cacheline_size (void)
 259  {
 260         unsigned long line_size, max = 1;
 261 -       unsigned int cache_size = 0;
 262         u64 l, levels, unique_caches;
 263          pal_cache_config_info_t cci;
 264          s64 status;
 265 @@ -803,8 +802,6 @@ get_max_cacheline_size (void)
 266                 line_size = 1 << cci.pcci_line_size;
 267                 if (line_size > max)
 268                         max = line_size;
 269 -               if (cache_size < cci.pcci_cache_size)
 270 -                       cache_size = cci.pcci_cache_size;
 271                 if (!cci.pcci_unified) {
 272                         status = ia64_pal_cache_config_info(l,
 273                                                     /* cache_type (instruction)= */ 1,
 274 @@ -821,9 +818,6 @@ get_max_cacheline_size (void)
 275                         ia64_i_cache_stride_shift = cci.pcci_stride;
 276         }
 277    out:
 278 -#ifdef CONFIG_SMP
 279 -       max_cache_size = max(max_cache_size, cache_size);
 280 -#endif
 281         if (max > ia64_max_cacheline_size)
 282                 ia64_max_cacheline_size = max;
 283  }
 284 Index: linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
 285 ===================================================================
 286 --- linux-cfs-2.6.20.8.q.orig/arch/mips/kernel/smp.c
 287 +++ linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
 288 @@ -245,7 +245,6 @@ void __init smp_prepare_cpus(unsigned in
 289  {
 290         init_new_context(current, &init_mm);
 291         current_thread_info()->cpu = 0;
 292 -       smp_tune_scheduling();
 293         plat_prepare_cpus(max_cpus);
 294  #ifndef CONFIG_HOTPLUG_CPU
 295         cpu_present_map = cpu_possible_map;
 296 Index: linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
 297 ===================================================================
 298 --- linux-cfs-2.6.20.8.q.orig/arch/sparc/kernel/smp.c
 299 +++ linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
 300 @@ -69,16 +69,6 @@ void __cpuinit smp_store_cpu_info(int id
 301         cpu_data(id).prom_node = cpu_node;
 302         cpu_data(id).mid = cpu_get_hwmid(cpu_node);
 303
 304 -       /* this is required to tune the scheduler correctly */
 305 -       /* is it possible to have CPUs with different cache sizes? */
 306 -       if (id == boot_cpu_id) {
 307 -               int cache_line,cache_nlines;
 308 -               cache_line = 0x20;
 309 -               cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
 310 -               cache_nlines = 0x8000;
 311 -               cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
 312 -               max_cache_size = cache_line * cache_nlines;
 313 -       }
 314         if (cpu_data(id).mid < 0)
 315                 panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
 316  }
 317 Index: linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
 318 ===================================================================
 319 --- linux-cfs-2.6.20.8.q.orig/arch/sparc64/kernel/smp.c
 320 +++ linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
 321 @@ -1293,41 +1293,6 @@ int setup_profiling_timer(unsigned int m
 322         return 0;
 323  }
 324
 325 -static void __init smp_tune_scheduling(void)
 326 -{
 327 -       struct device_node *dp;
 328 -       int instance;
 329 -       unsigned int def, smallest = ~0U;
 330 -
 331 -       def = ((tlb_type == hypervisor) ?
 332 -              (3 * 1024 * 1024) :
 333 -              (4 * 1024 * 1024));
 334 -
 335 -       instance = 0;
 336 -       while (!cpu_find_by_instance(instance, &dp, NULL)) {
 337 -               unsigned int val;
 338 -
 339 -               val = of_getintprop_default(dp, "ecache-size", def);
 340 -               if (val < smallest)
 341 -                       smallest = val;
 342 -
 343 -               instance++;
 344 -       }
 345 -
 346 -       /* Any value less than 256K is nonsense.  */
 347 -       if (smallest < (256U * 1024U))
 348 -               smallest = 256 * 1024;
 349 -
 350 -       max_cache_size = smallest;
 351 -
 352 -       if (smallest < 1U * 1024U * 1024U)
 353 -               printk(KERN_INFO "Using max_cache_size of %uKB\n",
 354 -                      smallest / 1024U);
 355 -       else
 356 -               printk(KERN_INFO "Using max_cache_size of %uMB\n",
 357 -                      smallest / 1024U / 1024U);
 358 -}
 359 -
 360  /* Constrain the number of cpus to max_cpus.  */
 361  void __init smp_prepare_cpus(unsigned int max_cpus)
 362  {
 363 @@ -1363,7 +1328,6 @@ void __init smp_prepare_cpus(unsigned in
 364         }
 365
 366         smp_store_cpu_info(boot_cpu_id);
 367 -       smp_tune_scheduling();
 368  }
 369
 370  /* Set this up early so that things like the scheduler can init
 371 Index: linux-cfs-2.6.20.8.q/fs/proc/array.c
 372 ===================================================================
 373 --- linux-cfs-2.6.20.8.q.orig/fs/proc/array.c
 374 +++ linux-cfs-2.6.20.8.q/fs/proc/array.c
 375 @@ -165,7 +165,6 @@ static inline char * task_state(struct t
 376         rcu_read_lock();
 377         buffer += sprintf(buffer,
 378                 "State:\t%s\n"
 379 -               "SleepAVG:\t%lu%%\n"
 380                 "Tgid:\t%d\n"
 381                 "Pid:\t%d\n"
 382                 "PPid:\t%d\n"
 383 @@ -173,9 +172,8 @@ static inline char * task_state(struct t
 384                 "Uid:\t%d\t%d\t%d\t%d\n"
 385                 "Gid:\t%d\t%d\t%d\t%d\n",
 386                 get_task_state(p),
 387 -               (p->sleep_avg/1024)*100/(1020000000/1024),
 388 -               p->tgid, p->pid,
 389 -               pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 390 +               p->tgid, p->pid,
 391 +               pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 392                 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
 393                 p->uid, p->euid, p->suid, p->fsuid,
 394                 p->gid, p->egid, p->sgid, p->fsgid);
 395 @@ -312,6 +310,11 @@ int proc_pid_status(struct task_struct *
 396         return buffer - orig;
 397  }
 398
 399 +int proc_pid_sched(struct task_struct *task, char *buffer)
 400 +{
 401 +       return sched_print_task_state(task, buffer) - buffer;
 402 +}
 403 +
 404  static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 405  {
 406         unsigned long vsize, eip, esp, wchan = ~0UL;
 407 Index: linux-cfs-2.6.20.8.q/fs/proc/base.c
 408 ===================================================================
 409 --- linux-cfs-2.6.20.8.q.orig/fs/proc/base.c
 410 +++ linux-cfs-2.6.20.8.q/fs/proc/base.c
 411 @@ -1839,6 +1839,7 @@ static struct pid_entry tgid_base_stuff[
 412         INF("environ",    S_IRUSR, pid_environ),
 413         INF("auxv",       S_IRUSR, pid_auxv),
 414         INF("status",     S_IRUGO, pid_status),
 415 +       INF("sched",      S_IRUGO, pid_sched),
 416         INF("cmdline",    S_IRUGO, pid_cmdline),
 417         INF("stat",       S_IRUGO, tgid_stat),
 418         INF("statm",      S_IRUGO, pid_statm),
 419 @@ -2121,6 +2122,7 @@ static struct pid_entry tid_base_stuff[]
 420         INF("environ",   S_IRUSR, pid_environ),
 421         INF("auxv",      S_IRUSR, pid_auxv),
 422         INF("status",    S_IRUGO, pid_status),
 423 +       INF("sched",     S_IRUGO, pid_sched),
 424         INF("cmdline",   S_IRUGO, pid_cmdline),
 425         INF("stat",      S_IRUGO, tid_stat),
 426         INF("statm",     S_IRUGO, pid_statm),
 427 Index: linux-cfs-2.6.20.8.q/fs/proc/internal.h
 428 ===================================================================
 429 --- linux-cfs-2.6.20.8.q.orig/fs/proc/internal.h
 430 +++ linux-cfs-2.6.20.8.q/fs/proc/internal.h
 431 @@ -36,6 +36,7 @@ extern int proc_exe_link(struct inode *,
 432  extern int proc_tid_stat(struct task_struct *,  char *);
 433  extern int proc_tgid_stat(struct task_struct *, char *);
 434  extern int proc_pid_status(struct task_struct *, char *);
 435 +extern int proc_pid_sched(struct task_struct *, char *);
 436  extern int proc_pid_statm(struct task_struct *, char *);
 437
 438  extern struct file_operations proc_maps_operations;
 439 Index: linux-cfs-2.6.20.8.q/include/asm-generic/bitops/sched.h
 440 ===================================================================
 441 --- linux-cfs-2.6.20.8.q.orig/include/asm-generic/bitops/sched.h
 442 +++ linux-cfs-2.6.20.8.q/include/asm-generic/bitops/sched.h
 443 @@ -6,28 +6,23 @@
 444
 445  /*
 446   * Every architecture must define this function. It's the fastest
 447 - * way of searching a 140-bit bitmap where the first 100 bits are
 448 - * unlikely to be set. It's guaranteed that at least one of the 140
 449 - * bits is cleared.
 450 + * way of searching a 100-bit bitmap.  It's guaranteed that at least
  451 + * one of the 100 bits is set.
 452   */
 453  static inline int sched_find_first_bit(const unsigned long *b)
 454  {
 455  #if BITS_PER_LONG == 64
 456 -       if (unlikely(b[0]))
 457 +       if (b[0])
 458                 return __ffs(b[0]);
 459 -       if (likely(b[1]))
 460 -               return __ffs(b[1]) + 64;
 461 -       return __ffs(b[2]) + 128;
 462 +       return __ffs(b[1]) + 64;
 463  #elif BITS_PER_LONG == 32
 464 -       if (unlikely(b[0]))
 465 +       if (b[0])
 466                 return __ffs(b[0]);
 467 -       if (unlikely(b[1]))
 468 +       if (b[1])
 469                 return __ffs(b[1]) + 32;
 470 -       if (unlikely(b[2]))
 471 +       if (b[2])
 472                 return __ffs(b[2]) + 64;
 473 -       if (b[3])
 474 -               return __ffs(b[3]) + 96;
 475 -       return __ffs(b[4]) + 128;
 476 +       return __ffs(b[3]) + 96;
 477  #else
 478  #error BITS_PER_LONG not defined
 479  #endif
 480 Index: linux-cfs-2.6.20.8.q/include/asm-i386/topology.h
 481 ===================================================================
 482 --- linux-cfs-2.6.20.8.q.orig/include/asm-i386/topology.h
 483 +++ linux-cfs-2.6.20.8.q/include/asm-i386/topology.h
 484 @@ -85,7 +85,6 @@ static inline int node_to_first_cpu(int
 485         .idle_idx               = 1,                    \
 486         .newidle_idx            = 2,                    \
 487         .wake_idx               = 1,                    \
 488 -       .per_cpu_gain           = 100,                  \
 489         .flags                  = SD_LOAD_BALANCE       \
 490                                 | SD_BALANCE_EXEC       \
 491                                 | SD_BALANCE_FORK       \
 492 Index: linux-cfs-2.6.20.8.q/include/asm-i386/unistd.h
 493 ===================================================================
 494 --- linux-cfs-2.6.20.8.q.orig/include/asm-i386/unistd.h
 495 +++ linux-cfs-2.6.20.8.q/include/asm-i386/unistd.h
 496 @@ -325,10 +325,11 @@
 497  #define __NR_move_pages                317
 498  #define __NR_getcpu            318
 499  #define __NR_epoll_pwait       319
 500 +#define __NR_sched_yield_to    320
 501
 502  #ifdef __KERNEL__
 503
 504 -#define NR_syscalls 320
 505 +#define NR_syscalls 321
 506
 507  #define __ARCH_WANT_IPC_PARSE_VERSION
 508  #define __ARCH_WANT_OLD_READDIR
 509 Index: linux-cfs-2.6.20.8.q/include/asm-ia64/topology.h
 510 ===================================================================
 511 --- linux-cfs-2.6.20.8.q.orig/include/asm-ia64/topology.h
 512 +++ linux-cfs-2.6.20.8.q/include/asm-ia64/topology.h
 513 @@ -65,7 +65,6 @@ void build_cpu_to_node_map(void);
 514         .max_interval           = 4,                    \
 515         .busy_factor            = 64,                   \
 516         .imbalance_pct          = 125,                  \
 517 -       .per_cpu_gain           = 100,                  \
 518         .cache_nice_tries       = 2,                    \
 519         .busy_idx               = 2,                    \
 520         .idle_idx               = 1,                    \
 521 @@ -97,7 +96,6 @@ void build_cpu_to_node_map(void);
 522         .newidle_idx            = 0, /* unused */       \
 523         .wake_idx               = 1,                    \
 524         .forkexec_idx           = 1,                    \
 525 -       .per_cpu_gain           = 100,                  \
 526         .flags                  = SD_LOAD_BALANCE       \
 527                                 | SD_BALANCE_EXEC       \
 528                                 | SD_BALANCE_FORK       \
 529 Index: linux-cfs-2.6.20.8.q/include/asm-mips/mach-ip27/topology.h
 530 ===================================================================
 531 --- linux-cfs-2.6.20.8.q.orig/include/asm-mips/mach-ip27/topology.h
 532 +++ linux-cfs-2.6.20.8.q/include/asm-mips/mach-ip27/topology.h
 533 @@ -28,7 +28,6 @@ extern unsigned char __node_distances[MA
 534         .busy_factor            = 32,                   \
 535         .imbalance_pct          = 125,                  \
 536         .cache_nice_tries       = 1,                    \
 537 -       .per_cpu_gain           = 100,                  \
 538         .flags                  = SD_LOAD_BALANCE       \
 539                                 | SD_BALANCE_EXEC       \
 540                                 | SD_WAKE_BALANCE,      \
 541 Index: linux-cfs-2.6.20.8.q/include/asm-powerpc/topology.h
 542 ===================================================================
 543 --- linux-cfs-2.6.20.8.q.orig/include/asm-powerpc/topology.h
 544 +++ linux-cfs-2.6.20.8.q/include/asm-powerpc/topology.h
 545 @@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct
 546         .busy_factor            = 32,                   \
 547         .imbalance_pct          = 125,                  \
 548         .cache_nice_tries       = 1,                    \
 549 -       .per_cpu_gain           = 100,                  \
 550         .busy_idx               = 3,                    \
 551         .idle_idx               = 1,                    \
 552         .newidle_idx            = 2,                    \
 553 Index: linux-cfs-2.6.20.8.q/include/asm-x86_64/topology.h
 554 ===================================================================
 555 --- linux-cfs-2.6.20.8.q.orig/include/asm-x86_64/topology.h
 556 +++ linux-cfs-2.6.20.8.q/include/asm-x86_64/topology.h
 557 @@ -43,7 +43,6 @@ extern int __node_distance(int, int);
 558         .newidle_idx            = 0,                    \
 559         .wake_idx               = 1,                    \
 560         .forkexec_idx           = 1,                    \
 561 -       .per_cpu_gain           = 100,                  \
 562         .flags                  = SD_LOAD_BALANCE       \
 563                                 | SD_BALANCE_FORK       \
 564                                 | SD_BALANCE_EXEC       \
 565 Index: linux-cfs-2.6.20.8.q/include/asm-x86_64/unistd.h
 566 ===================================================================
 567 --- linux-cfs-2.6.20.8.q.orig/include/asm-x86_64/unistd.h
 568 +++ linux-cfs-2.6.20.8.q/include/asm-x86_64/unistd.h
 569 @@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync
 570  __SYSCALL(__NR_vmsplice, sys_vmsplice)
 571  #define __NR_move_pages                279
 572  __SYSCALL(__NR_move_pages, sys_move_pages)
 573 +#define __NR_sched_yield_to    280
 574 +__SYSCALL(__NR_sched_yield_to, sys_sched_yield_to)
 575
 576 -#define __NR_syscall_max __NR_move_pages
 577 +#define __NR_syscall_max __NR_sched_yield_to
 578
 579  #ifndef __NO_STUBS
 580  #define __ARCH_WANT_OLD_READDIR
 581 Index: linux-cfs-2.6.20.8.q/include/linux/hardirq.h
 582 ===================================================================
 583 --- linux-cfs-2.6.20.8.q.orig/include/linux/hardirq.h
 584 +++ linux-cfs-2.6.20.8.q/include/linux/hardirq.h
 585 @@ -79,6 +79,19 @@
 586  #endif
 587
 588  #ifdef CONFIG_PREEMPT
 589 +# define PREEMPT_CHECK_OFFSET 1
 590 +#else
 591 +# define PREEMPT_CHECK_OFFSET 0
 592 +#endif
 593 +
 594 +/*
 595 + * Check whether we were atomic before we did preempt_disable():
 596 + * (used by the scheduler)
 597 + */
 598 +#define in_atomic_preempt_off() \
 599 +               ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
 600 +
 601 +#ifdef CONFIG_PREEMPT
 602  # define preemptible() (preempt_count() == 0 && !irqs_disabled())
 603  # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
 604  #else
 605 Index: linux-cfs-2.6.20.8.q/include/linux/ktime.h
 606 ===================================================================
 607 --- linux-cfs-2.6.20.8.q.orig/include/linux/ktime.h
 608 +++ linux-cfs-2.6.20.8.q/include/linux/ktime.h
 609 @@ -274,4 +274,6 @@ extern void ktime_get_ts(struct timespec
 610  /* Get the real (wall-) time in timespec format: */
 611  #define ktime_get_real_ts(ts)  getnstimeofday(ts)
 612
 613 +extern ktime_t ktime_get(void);
 614 +
 615  #endif
 616 Index: linux-cfs-2.6.20.8.q/include/linux/sched.h
 617 ===================================================================
 618 --- linux-cfs-2.6.20.8.q.orig/include/linux/sched.h
 619 +++ linux-cfs-2.6.20.8.q/include/linux/sched.h
 620 @@ -2,7 +2,6 @@
 621  #define _LINUX_SCHED_H
 622
 623  #include <linux/auxvec.h>      /* For AT_VECTOR_SIZE */
 624 -
 625  /*
 626   * cloning flags:
 627   */
 628 @@ -37,6 +36,8 @@
 629
 630  #ifdef __KERNEL__
 631
 632 +#include <linux/rbtree.h>      /* For run_node */
 633 +
 634  struct sched_param {
 635         int sched_priority;
 636  };
 637 @@ -196,13 +197,13 @@ extern void init_idle(struct task_struct
 638  extern cpumask_t nohz_cpu_mask;
 639
 640  /*
 641 - * Only dump TASK_* tasks. (-1 for all tasks)
 642 + * Only dump TASK_* tasks. (0 for all tasks)
 643   */
 644  extern void show_state_filter(unsigned long state_filter);
 645
 646  static inline void show_state(void)
 647  {
 648 -       show_state_filter(-1);
 649 +       show_state_filter(0);
 650  }
 651
 652  extern void show_regs(struct pt_regs *);
 653 @@ -464,7 +465,7 @@ struct signal_struct {
 654          * from jiffies_to_ns(utime + stime) if sched_clock uses something
 655          * other than jiffies.)
 656          */
 657 -       unsigned long long sched_time;
 658 +       unsigned long long sum_sched_runtime;
 659
 660         /*
 661          * We don't bother to synchronize most readers of this at all,
 662 @@ -524,6 +525,7 @@ struct signal_struct {
 663  #define MAX_RT_PRIO            MAX_USER_RT_PRIO
 664
 665  #define MAX_PRIO               (MAX_RT_PRIO + 40)
 666 +#define DEFAULT_PRIO           (MAX_RT_PRIO + 20)
 667
 668  #define rt_prio(prio)          unlikely((prio) < MAX_RT_PRIO)
 669  #define rt_task(p)             rt_prio((p)->prio)
 670 @@ -635,7 +637,14 @@ enum idle_type
 671  /*
 672   * sched-domains (multiprocessor balancing) declarations:
 673   */
 674 -#define SCHED_LOAD_SCALE       128UL   /* increase resolution of load */
 675 +
 676 +/*
 677 + * Increase resolution of nice-level calculations:
 678 + */
 679 +#define SCHED_LOAD_SHIFT       10
 680 +#define SCHED_LOAD_SCALE       (1UL << SCHED_LOAD_SHIFT)
 681 +
 682 +#define SCHED_LOAD_SCALE_FUZZ  (SCHED_LOAD_SCALE >> 5)
 683
 684  #ifdef CONFIG_SMP
 685  #define SD_LOAD_BALANCE                1       /* Do load balancing on this domain. */
 686 @@ -684,7 +693,6 @@ struct sched_domain {
 687         unsigned int imbalance_pct;     /* No balance until over watermark */
 688         unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 689         unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
 690 -       unsigned int per_cpu_gain;      /* CPU % gained by adding domain cpus */
 691         unsigned int busy_idx;
 692         unsigned int idle_idx;
 693         unsigned int newidle_idx;
 694 @@ -733,12 +741,6 @@ struct sched_domain {
 695  extern int partition_sched_domains(cpumask_t *partition1,
 696                                     cpumask_t *partition2);
 697
 698 -/*
 699 - * Maximum cache size the migration-costs auto-tuning code will
 700 - * search from:
 701 - */
 702 -extern unsigned int max_cache_size;
 703 -
 704  #endif /* CONFIG_SMP */
 705
 706
 707 @@ -789,14 +791,28 @@ struct mempolicy;
 708  struct pipe_inode_info;
 709  struct uts_namespace;
 710
 711 -enum sleep_type {
 712 -       SLEEP_NORMAL,
 713 -       SLEEP_NONINTERACTIVE,
 714 -       SLEEP_INTERACTIVE,
 715 -       SLEEP_INTERRUPTED,
 716 -};
 717 +struct rq;
 718
 719 -struct prio_array;
 720 +struct sched_class {
 721 +       struct sched_class *next;
 722 +
 723 +       void (*enqueue_task) (struct rq *rq, struct task_struct *p,
 724 +                             int wakeup, u64 now);
 725 +       void (*dequeue_task) (struct rq *rq, struct task_struct *p,
 726 +                             int sleep, u64 now);
 727 +       void (*yield_task) (struct rq *rq, struct task_struct *p,
 728 +                           struct task_struct *p_to);
 729 +
 730 +       void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
 731 +
 732 +       struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
 733 +       void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
 734 +
 735 +       struct task_struct * (*load_balance_start) (struct rq *rq);
 736 +       struct task_struct * (*load_balance_next) (struct rq *rq);
 737 +       void (*task_tick) (struct rq *rq, struct task_struct *p);
 738 +       void (*task_new) (struct rq *rq, struct task_struct *p);
 739 +};
 740
 741  struct task_struct {
 742         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
 743 @@ -813,26 +829,45 @@ struct task_struct {
 744  #endif
 745  #endif
 746         int load_weight;        /* for niceness load balancing purposes */
 747 +       int load_shift;
 748 +
 749         int prio, static_prio, normal_prio;
 750 +       int on_rq;
 751         struct list_head run_list;
 752 -       struct prio_array *array;
 753 +       struct rb_node run_node;
 754
 755         unsigned short ioprio;
 756  #ifdef CONFIG_BLK_DEV_IO_TRACE
 757         unsigned int btrace_seq;
 758  #endif
 759 -       unsigned long sleep_avg;
 760 -       unsigned long long timestamp, last_ran;
 761 -       unsigned long long sched_time; /* sched_clock time spent running */
 762 -       enum sleep_type sleep_type;
 763 +       /* CFS scheduling class statistics fields: */
 764 +       u64 wait_start_fair;
 765 +       u64 wait_start;
 766 +       u64 exec_start;
 767 +       u64 sleep_start;
 768 +       u64 block_start;
 769 +       u64 sleep_max;
 770 +       u64 block_max;
 771 +       u64 exec_max;
 772 +       u64 wait_max;
 773 +       u64 last_ran;
 774 +
 775 +       s64 wait_runtime;
 776 +       u64 sum_exec_runtime;
 777 +       s64 fair_key;
 778 +       s64 sum_wait_runtime;
 779
 780         unsigned long policy;
 781         cpumask_t cpus_allowed;
 782 -       unsigned int time_slice, first_time_slice;
 783 +       unsigned int time_slice;
 784 +       struct sched_class *sched_class;
 785 +
 786 +       s64 min_wait_runtime;
 787
 788  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 789         struct sched_info sched_info;
 790  #endif
 791 +       u64 nr_switches;
 792
 793         struct list_head tasks;
 794         /*
 795 @@ -1195,8 +1230,9 @@ static inline int set_cpus_allowed(struc
 796  #endif
 797
 798  extern unsigned long long sched_clock(void);
 799 +extern void sched_clock_unstable_event(void);
 800  extern unsigned long long
 801 -current_sched_time(const struct task_struct *current_task);
 802 +current_sched_runtime(const struct task_struct *current_task);
 803
 804  /* sched_exec is called by processes performing an exec */
 805  #ifdef CONFIG_SMP
 806 @@ -1212,6 +1248,13 @@ static inline void idle_task_exit(void)
 807  #endif
 808
 809  extern void sched_idle_next(void);
 810 +extern char * sched_print_task_state(struct task_struct *p, char *buffer);
 811 +
 812 +extern unsigned int sysctl_sched_granularity;
 813 +extern unsigned int sysctl_sched_wakeup_granularity;
 814 +extern unsigned int sysctl_sched_sleep_history_max;
 815 +extern unsigned int sysctl_sched_child_runs_first;
 816 +extern unsigned int sysctl_sched_load_smoothing;
 817
 818  #ifdef CONFIG_RT_MUTEXES
 819  extern int rt_mutex_getprio(struct task_struct *p);
 820 @@ -1290,8 +1333,7 @@ extern void FASTCALL(wake_up_new_task(st
 821  #else
 822   static inline void kick_process(struct task_struct *tsk) { }
 823  #endif
 824 -extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
 825 -extern void FASTCALL(sched_exit(struct task_struct * p));
 826 +extern void sched_fork(struct task_struct * p, int clone_flags);
 827
 828  extern int in_group_p(gid_t);
 829  extern int in_egroup_p(gid_t);
 830 Index: linux-cfs-2.6.20.8.q/include/linux/topology.h
 831 ===================================================================
 832 --- linux-cfs-2.6.20.8.q.orig/include/linux/topology.h
 833 +++ linux-cfs-2.6.20.8.q/include/linux/topology.h
 834 @@ -96,7 +96,6 @@
 835         .busy_factor            = 64,                   \
 836         .imbalance_pct          = 110,                  \
 837         .cache_nice_tries       = 0,                    \
 838 -       .per_cpu_gain           = 25,                   \
 839         .busy_idx               = 0,                    \
 840         .idle_idx               = 0,                    \
 841         .newidle_idx            = 1,                    \
 842 @@ -128,7 +127,6 @@
 843         .busy_factor            = 64,                   \
 844         .imbalance_pct          = 125,                  \
 845         .cache_nice_tries       = 1,                    \
 846 -       .per_cpu_gain           = 100,                  \
 847         .busy_idx               = 2,                    \
 848         .idle_idx               = 1,                    \
 849         .newidle_idx            = 2,                    \
 850 @@ -159,7 +157,6 @@
 851         .busy_factor            = 64,                   \
 852         .imbalance_pct          = 125,                  \
 853         .cache_nice_tries       = 1,                    \
 854 -       .per_cpu_gain           = 100,                  \
 855         .busy_idx               = 2,                    \
 856         .idle_idx               = 1,                    \
 857         .newidle_idx            = 2,                    \
 858 @@ -193,7 +190,6 @@
 859         .newidle_idx            = 0, /* unused */       \
 860         .wake_idx               = 0, /* unused */       \
 861         .forkexec_idx           = 0, /* unused */       \
 862 -       .per_cpu_gain           = 100,                  \
 863         .flags                  = SD_LOAD_BALANCE       \
 864                                 | SD_SERIALIZE, \
 865         .last_balance           = jiffies,              \
 866 Index: linux-cfs-2.6.20.8.q/init/main.c
 867 ===================================================================
 868 --- linux-cfs-2.6.20.8.q.orig/init/main.c
 869 +++ linux-cfs-2.6.20.8.q/init/main.c
 870 @@ -422,7 +422,7 @@ static void noinline rest_init(void)
 871
 872         /*
 873          * The boot idle thread must execute schedule()
 874 -        * at least one to get things moving:
 875 +        * at least once to get things moving:
 876          */
 877         preempt_enable_no_resched();
 878         schedule();
 879 Index: linux-cfs-2.6.20.8.q/kernel/exit.c
 880 ===================================================================
 881 --- linux-cfs-2.6.20.8.q.orig/kernel/exit.c
 882 +++ linux-cfs-2.6.20.8.q/kernel/exit.c
 883 @@ -112,7 +112,7 @@ static void __exit_signal(struct task_st
 884                 sig->maj_flt += tsk->maj_flt;
 885                 sig->nvcsw += tsk->nvcsw;
 886                 sig->nivcsw += tsk->nivcsw;
 887 -               sig->sched_time += tsk->sched_time;
 888 +               sig->sum_sched_runtime += tsk->sum_exec_runtime;
 889                 sig = NULL; /* Marker for below. */
 890         }
 891
 892 @@ -170,7 +170,6 @@ repeat:
 893                 zap_leader = (leader->exit_signal == -1);
 894         }
 895
 896 -       sched_exit(p);
 897         write_unlock_irq(&tasklist_lock);
 898         proc_flush_task(p);
 899         release_thread(p);
 900 Index: linux-cfs-2.6.20.8.q/kernel/fork.c
 901 ===================================================================
 902 --- linux-cfs-2.6.20.8.q.orig/kernel/fork.c
 903 +++ linux-cfs-2.6.20.8.q/kernel/fork.c
 904 @@ -874,7 +874,7 @@ static inline int copy_signal(unsigned l
 905         sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 906         sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 907         sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 908 -       sig->sched_time = 0;
 909 +       sig->sum_sched_runtime = 0;
 910         INIT_LIST_HEAD(&sig->cpu_timers[0]);
 911         INIT_LIST_HEAD(&sig->cpu_timers[1]);
 912         INIT_LIST_HEAD(&sig->cpu_timers[2]);
 913 @@ -1037,7 +1037,7 @@ static struct task_struct *copy_process(
 914
 915         p->utime = cputime_zero;
 916         p->stime = cputime_zero;
 917 -       p->sched_time = 0;
 918 +
 919         p->rchar = 0;           /* I/O counter: bytes read */
 920         p->wchar = 0;           /* I/O counter: bytes written */
 921         p->syscr = 0;           /* I/O counter: read syscalls */
 922 Index: linux-cfs-2.6.20.8.q/kernel/hrtimer.c
 923 ===================================================================
 924 --- linux-cfs-2.6.20.8.q.orig/kernel/hrtimer.c
 925 +++ linux-cfs-2.6.20.8.q/kernel/hrtimer.c
 926 @@ -45,7 +45,7 @@
 927   *
 928   * returns the time in ktime_t format
 929   */
 930 -static ktime_t ktime_get(void)
 931 +ktime_t ktime_get(void)
 932  {
 933         struct timespec now;
 934
 935 Index: linux-cfs-2.6.20.8.q/kernel/posix-cpu-timers.c
 936 ===================================================================
 937 --- linux-cfs-2.6.20.8.q.orig/kernel/posix-cpu-timers.c
 938 +++ linux-cfs-2.6.20.8.q/kernel/posix-cpu-timers.c
 939 @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
 940  }
 941  static inline unsigned long long sched_ns(struct task_struct *p)
 942  {
 943 -       return (p == current) ? current_sched_time(p) : p->sched_time;
 944 +       return (p == current) ? current_sched_runtime(p) : p->sum_exec_runtime;
 945  }
 946
 947  int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 948 @@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked
 949                 } while (t != p);
 950                 break;
 951         case CPUCLOCK_SCHED:
 952 -               cpu->sched = p->signal->sched_time;
 953 +               cpu->sched = p->signal->sum_sched_runtime;
 954                 /* Add in each other live thread.  */
 955                 while ((t = next_thread(t)) != p) {
 956 -                       cpu->sched += t->sched_time;
 957 +                       cpu->sched += t->sum_exec_runtime;
 958                 }
 959                 cpu->sched += sched_ns(p);
 960                 break;
 961 @@ -417,7 +417,7 @@ int posix_cpu_timer_del(struct k_itimer
 962   */
 963  static void cleanup_timers(struct list_head *head,
 964                            cputime_t utime, cputime_t stime,
 965 -                          unsigned long long sched_time)
 966 +                          unsigned long long sum_exec_runtime)
 967  {
 968         struct cpu_timer_list *timer, *next;
 969         cputime_t ptime = cputime_add(utime, stime);
 970 @@ -446,10 +446,10 @@ static void cleanup_timers(struct list_h
 971         ++head;
 972         list_for_each_entry_safe(timer, next, head, entry) {
 973                 list_del_init(&timer->entry);
 974 -               if (timer->expires.sched < sched_time) {
 975 +               if (timer->expires.sched < sum_exec_runtime) {
 976                         timer->expires.sched = 0;
 977                 } else {
 978 -                       timer->expires.sched -= sched_time;
 979 +                       timer->expires.sched -= sum_exec_runtime;
 980                 }
 981         }
 982  }
 983 @@ -462,7 +462,7 @@ static void cleanup_timers(struct list_h
 984  void posix_cpu_timers_exit(struct task_struct *tsk)
 985  {
 986         cleanup_timers(tsk->cpu_timers,
 987 -                      tsk->utime, tsk->stime, tsk->sched_time);
 988 +                      tsk->utime, tsk->stime, tsk->sum_exec_runtime);
 989
 990  }
 991  void posix_cpu_timers_exit_group(struct task_struct *tsk)
 992 @@ -470,7 +470,7 @@ void posix_cpu_timers_exit_group(struct
 993         cleanup_timers(tsk->signal->cpu_timers,
 994                        cputime_add(tsk->utime, tsk->signal->utime),
 995                        cputime_add(tsk->stime, tsk->signal->stime),
 996 -                      tsk->sched_time + tsk->signal->sched_time);
 997 +                      tsk->sum_exec_runtime + tsk->signal->sum_sched_runtime);
 998  }
 999
1000
1001 @@ -531,7 +531,7 @@ static void process_timer_rebalance(stru
1002                 nsleft = max_t(unsigned long long, nsleft, 1);
1003                 do {
1004                         if (likely(!(t->flags & PF_EXITING))) {
1005 -                               ns = t->sched_time + nsleft;
1006 +                               ns = t->sum_exec_runtime + nsleft;
1007                                 if (t->it_sched_expires == 0 ||
1008                                     t->it_sched_expires > ns) {
1009                                         t->it_sched_expires = ns;
1010 @@ -999,7 +999,7 @@ static void check_thread_timers(struct t
1011                 struct cpu_timer_list *t = list_entry(timers->next,
1012                                                       struct cpu_timer_list,
1013                                                       entry);
1014 -               if (!--maxfire || tsk->sched_time < t->expires.sched) {
1015 +               if (!--maxfire || tsk->sum_exec_runtime < t->expires.sched) {
1016                         tsk->it_sched_expires = t->expires.sched;
1017                         break;
1018                 }
1019 @@ -1019,7 +1019,7 @@ static void check_process_timers(struct
1020         int maxfire;
1021         struct signal_struct *const sig = tsk->signal;
1022         cputime_t utime, stime, ptime, virt_expires, prof_expires;
1023 -       unsigned long long sched_time, sched_expires;
1024 +       unsigned long long sum_sched_runtime, sched_expires;
1025         struct task_struct *t;
1026         struct list_head *timers = sig->cpu_timers;
1027
1028 @@ -1039,12 +1039,12 @@ static void check_process_timers(struct
1029          */
1030         utime = sig->utime;
1031         stime = sig->stime;
1032 -       sched_time = sig->sched_time;
1033 +       sum_sched_runtime = sig->sum_sched_runtime;
1034         t = tsk;
1035         do {
1036                 utime = cputime_add(utime, t->utime);
1037                 stime = cputime_add(stime, t->stime);
1038 -               sched_time += t->sched_time;
1039 +               sum_sched_runtime += t->sum_exec_runtime;
1040                 t = next_thread(t);
1041         } while (t != tsk);
1042         ptime = cputime_add(utime, stime);
1043 @@ -1085,7 +1085,7 @@ static void check_process_timers(struct
1044                 struct cpu_timer_list *t = list_entry(timers->next,
1045                                                       struct cpu_timer_list,
1046                                                       entry);
1047 -               if (!--maxfire || sched_time < t->expires.sched) {
1048 +               if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1049                         sched_expires = t->expires.sched;
1050                         break;
1051                 }
1052 @@ -1177,7 +1177,7 @@ static void check_process_timers(struct
1053                 virt_left = cputime_sub(virt_expires, utime);
1054                 virt_left = cputime_div_non_zero(virt_left, nthreads);
1055                 if (sched_expires) {
1056 -                       sched_left = sched_expires - sched_time;
1057 +                       sched_left = sched_expires - sum_sched_runtime;
1058                         do_div(sched_left, nthreads);
1059                         sched_left = max_t(unsigned long long, sched_left, 1);
1060                 } else {
1061 @@ -1203,7 +1203,7 @@ static void check_process_timers(struct
1062                                 t->it_virt_expires = ticks;
1063                         }
1064
1065 -                       sched = t->sched_time + sched_left;
1066 +                       sched = t->sum_exec_runtime + sched_left;
1067                         if (sched_expires && (t->it_sched_expires == 0 ||
1068                                               t->it_sched_expires > sched)) {
1069                                 t->it_sched_expires = sched;
1070 @@ -1295,7 +1295,7 @@ void run_posix_cpu_timers(struct task_st
1071
1072         if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1073             (tsk->it_sched_expires == 0 ||
1074 -            tsk->sched_time < tsk->it_sched_expires))
1075 +            tsk->sum_exec_runtime < tsk->it_sched_expires))
1076                 return;
1077
1078  #undef UNEXPIRED
1079 Index: linux-cfs-2.6.20.8.q/kernel/sched.c
1080 ===================================================================
1081 --- linux-cfs-2.6.20.8.q.orig/kernel/sched.c
1082 +++ linux-cfs-2.6.20.8.q/kernel/sched.c
1083 @@ -89,110 +89,13 @@
1084   */
1085  #define MIN_TIMESLICE          max(5 * HZ / 1000, 1)
1086  #define DEF_TIMESLICE          (100 * HZ / 1000)
1087 -#define ON_RUNQUEUE_WEIGHT      30
1088 -#define CHILD_PENALTY           95
1089 -#define PARENT_PENALTY         100
1090 -#define EXIT_WEIGHT              3
1091 -#define PRIO_BONUS_RATIO        25
1092 -#define MAX_BONUS              (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
1093 -#define INTERACTIVE_DELTA        2
1094 -#define MAX_SLEEP_AVG          (DEF_TIMESLICE * MAX_BONUS)
1095 -#define STARVATION_LIMIT       (MAX_SLEEP_AVG)
1096 -#define NS_MAX_SLEEP_AVG       (JIFFIES_TO_NS(MAX_SLEEP_AVG))
1097 -
1098 -/*
1099 - * If a task is 'interactive' then we reinsert it in the active
1100 - * array after it has expired its current timeslice. (it will not
1101 - * continue to run immediately, it will still roundrobin with
1102 - * other interactive tasks.)
1103 - *
1104 - * This part scales the interactivity limit depending on niceness.
1105 - *
1106 - * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
1107 - * Here are a few examples of different nice levels:
1108 - *
1109 - *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
1110 - *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
1111 - *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
1112 - *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
1113 - *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
1114 - *
1115 - * (the X axis represents the possible -5 ... 0 ... +5 dynamic
1116 - *  priority range a task can explore, a value of '1' means the
1117 - *  task is rated interactive.)
1118 - *
1119 - * Ie. nice +19 tasks can never get 'interactive' enough to be
1120 - * reinserted into the active array. And only heavily CPU-hog nice -20
1121 - * tasks will be expired. Default nice 0 tasks are somewhere between,
1122 - * it takes some effort for them to get interactive, but it's not
1123 - * too hard.
1124 - */
1125 -
1126 -#define CURRENT_BONUS(p) \
1127 -       (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
1128 -               MAX_SLEEP_AVG)
1129 -
1130 -#define GRANULARITY    (10 * HZ / 1000 ? : 1)
1131 -
1132 -#ifdef CONFIG_SMP
1133 -#define TIMESLICE_GRANULARITY(p)       (GRANULARITY * \
1134 -               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
1135 -                       num_online_cpus())
1136 -#else
1137 -#define TIMESLICE_GRANULARITY(p)       (GRANULARITY * \
1138 -               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
1139 -#endif
1140 -
1141 -#define SCALE(v1,v1_max,v2_max) \
1142 -       (v1) * (v2_max) / (v1_max)
1143 -
1144 -#define DELTA(p) \
1145 -       (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
1146 -               INTERACTIVE_DELTA)
1147 -
1148 -#define TASK_INTERACTIVE(p) \
1149 -       ((p)->prio <= (p)->static_prio - DELTA(p))
1150 -
1151 -#define INTERACTIVE_SLEEP(p) \
1152 -       (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
1153 -               (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
1154 -
1155 -#define TASK_PREEMPTS_CURR(p, rq) \
1156 -       ((p)->prio < (rq)->curr->prio)
1157 -
1158 -#define SCALE_PRIO(x, prio) \
1159 -       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
1160 -
1161 -static unsigned int static_prio_timeslice(int static_prio)
1162 -{
1163 -       if (static_prio < NICE_TO_PRIO(0))
1164 -               return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
1165 -       else
1166 -               return SCALE_PRIO(DEF_TIMESLICE, static_prio);
1167 -}
1168 -
1169 -/*
1170 - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
1171 - * to time slice values: [800ms ... 100ms ... 5ms]
1172 - *
1173 - * The higher a thread's priority, the bigger timeslices
1174 - * it gets during one round of execution. But even the lowest
1175 - * priority thread gets MIN_TIMESLICE worth of execution time.
1176 - */
1177 -
1178 -static inline unsigned int task_timeslice(struct task_struct *p)
1179 -{
1180 -       return static_prio_timeslice(p->static_prio);
1181 -}
1182
1183  /*
1184 - * These are the runqueue data structures:
1185 + * This is the priority-queue data structure of the RT scheduling class:
1186   */
1187 -
1188  struct prio_array {
1189 -       unsigned int nr_active;
1190 -       DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
1191 -       struct list_head queue[MAX_PRIO];
1192 +       DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
1193 +       struct list_head queue[MAX_RT_PRIO];
1194  };
1195
1196  /*
1197 @@ -209,12 +112,13 @@ struct rq {
1198          * nr_running and cpu_load should be in the same cacheline because
1199          * remote CPUs use both these fields when doing load calculation.
1200          */
1201 -       unsigned long nr_running;
1202 +       long nr_running;
1203         unsigned long raw_weighted_load;
1204 -#ifdef CONFIG_SMP
1205 -       unsigned long cpu_load[3];
1206 -#endif
1207 -       unsigned long long nr_switches;
1208 +       #define CPU_LOAD_IDX_MAX 5
1209 +       unsigned long cpu_load[CPU_LOAD_IDX_MAX];
1210 +
1211 +       u64 nr_switches;
1212 +       unsigned long nr_load_updates;
1213
1214         /*
1215          * This is part of a global counter where only the total sum
1216 @@ -224,14 +128,29 @@ struct rq {
1217          */
1218         unsigned long nr_uninterruptible;
1219
1220 -       unsigned long expired_timestamp;
1221 -       /* Cached timestamp set by update_cpu_clock() */
1222 -       unsigned long long most_recent_timestamp;
1223         struct task_struct *curr, *idle;
1224         unsigned long next_balance;
1225         struct mm_struct *prev_mm;
1226 -       struct prio_array *active, *expired, arrays[2];
1227 -       int best_expired_prio;
1228 +
1229 +       u64 clock, prev_clock_raw;
1230 +       s64 clock_max_delta;
1231 +       u64 fair_clock, prev_fair_clock;
1232 +       u64 exec_clock, prev_exec_clock;
1233 +       u64 wait_runtime;
1234 +
1235 +       unsigned int clock_warps;
1236 +       unsigned int clock_unstable_events;
1237 +
1238 +       struct sched_class *load_balance_class;
1239 +
1240 +       struct prio_array active;
1241 +       int rt_load_balance_idx;
1242 +       struct list_head *rt_load_balance_head, *rt_load_balance_curr;
1243 +
1244 +       struct rb_root tasks_timeline;
1245 +       struct rb_node *rb_leftmost;
1246 +       struct rb_node *rb_load_balance_curr;
1247 +
1248         atomic_t nr_iowait;
1249
1250  #ifdef CONFIG_SMP
1251 @@ -268,7 +187,107 @@ struct rq {
1252         struct lock_class_key rq_lock_key;
1253  };
1254
1255 -static DEFINE_PER_CPU(struct rq, runqueues);
1256 +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
1257 +
1258 +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
1259 +{
1260 +       rq->curr->sched_class->check_preempt_curr(rq, p);
1261 +}
1262 +
1263 +#define SCALE_PRIO(x, prio) \
1264 +       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
1265 +
1266 +/*
1267 + * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 18 ]
1268 + * to time slice values [800ms ... 100ms ...]; nice 19 always returns 1.
1269 + */
1270 +static unsigned int static_prio_timeslice(int static_prio)
1271 +{
1272 +       if (static_prio == NICE_TO_PRIO(19))
1273 +               return 1;
1274 +
1275 +       if (static_prio < NICE_TO_PRIO(0))
1276 +               return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
1277 +       else
1278 +               return SCALE_PRIO(DEF_TIMESLICE, static_prio);
1279 +}
1280 +
1281 +/*
1282 + * Print per-task and this-CPU runqueue scheduling fields into buffer:
1283 + */
1284 +char * sched_print_task_state(struct task_struct *p, char *buffer)
1285 +{
1286 +       struct rq *this_rq = &per_cpu(runqueues, raw_smp_processor_id());
1287 +       unsigned long long t0, t1;
1288 +
1289 +#define P(F) \
1290 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", #F, (long long)p->F)
1291 +
1292 +       P(wait_start);
1293 +       P(wait_start_fair);
1294 +       P(exec_start);
1295 +       P(sleep_start);
1296 +       P(block_start);
1297 +       P(sleep_max);
1298 +       P(block_max);
1299 +       P(exec_max);
1300 +       P(wait_max);
1301 +       P(min_wait_runtime);
1302 +       P(last_ran);
1303 +       P(wait_runtime);
1304 +       P(sum_exec_runtime);
1305 +#undef P
1306 +
1307 +       t0 = sched_clock();
1308 +       t1 = sched_clock();
1309 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "clock-delta",
1310 +                               (long long)(t1 - t0));
1311 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-wait_runtime",
1312 +                               (long long)this_rq->wait_runtime);
1313 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-exec_clock",
1314 +                               (long long)this_rq->exec_clock);
1315 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-fair_clock",
1316 +                               (long long)this_rq->fair_clock);
1317 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-clock",
1318 +                               (long long)this_rq->clock);
1319 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-prev_clock_raw",
1320 +                               (long long)this_rq->prev_clock_raw);
1321 +       buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-clock_max_delta",
1322 +                               (long long)this_rq->clock_max_delta);
1323 +       buffer += sprintf(buffer, "%-25s:%20u\n", "rq-clock_warps",
1324 +                               this_rq->clock_warps);
1325 +       buffer += sprintf(buffer, "%-25s:%20u\n", "rq-clock_unstable_events",
1326 +                               this_rq->clock_unstable_events);
1327 +       return buffer;  /* end of the text written (at the trailing NUL) */
1328 +}
1329 +
1330 +/*
1331 + * Per-runqueue clock, as fine-grained as sched_clock(); never goes backwards:
1332 + */
1333 +static inline unsigned long long __rq_clock(struct rq *rq)
1334 +{
1335 +       u64 now = sched_clock();
1336 +       u64 clock = rq->clock;
1337 +       u64 prev_raw = rq->prev_clock_raw;
1338 +       s64 delta = now - prev_raw;
1339 +
1340 +       /*
1341 +        * If sched_clock() went backwards, advance by 1 and count the warp:
1342 +        */
1343 +       if (unlikely(delta < 0)) {
1344 +               clock++;
1345 +               rq->clock_warps++;
1346 +       } else {
1347 +               if (unlikely(delta > rq->clock_max_delta))
1348 +                       rq->clock_max_delta = delta;
1349 +               clock += delta;
1350 +       }
1351 +
1352 +       rq->prev_clock_raw = now;
1353 +       rq->clock = clock;
1354 +
1355 +       return clock;
1356 +}
1357
1358  static inline int cpu_of(struct rq *rq)
1359  {
1360 @@ -279,6 +298,16 @@ static inline int cpu_of(struct rq *rq)
1361  #endif
1362  }
1363
1364 +static inline unsigned long long rq_clock(struct rq *rq)
1365 +{
1366 +       int this_cpu = smp_processor_id();
1367 +
1368 +       if (this_cpu == cpu_of(rq))
1369 +               return __rq_clock(rq);
1370 +
1371 +       return rq->clock;
1372 +}
1373 +
1374  /*
1375   * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1376   * See detach_destroy_domains: synchronize_sched for details.
1377 @@ -423,134 +452,6 @@ static inline void task_rq_unlock(struct
1378         spin_unlock_irqrestore(&rq->lock, *flags);
1379  }
1380
1381 -#ifdef CONFIG_SCHEDSTATS
1382 -/*
1383 - * bump this up when changing the output format or the meaning of an existing
1384 - * format, so that tools can adapt (or abort)
1385 - */
1386 -#define SCHEDSTAT_VERSION 14
1387 -
1388 -static int show_schedstat(struct seq_file *seq, void *v)
1389 -{
1390 -       int cpu;
1391 -
1392 -       seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
1393 -       seq_printf(seq, "timestamp %lu\n", jiffies);
1394 -       for_each_online_cpu(cpu) {
1395 -               struct rq *rq = cpu_rq(cpu);
1396 -#ifdef CONFIG_SMP
1397 -               struct sched_domain *sd;
1398 -               int dcnt = 0;
1399 -#endif
1400 -
1401 -               /* runqueue-specific stats */
1402 -               seq_printf(seq,
1403 -                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
1404 -                   cpu, rq->yld_both_empty,
1405 -                   rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
1406 -                   rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
1407 -                   rq->ttwu_cnt, rq->ttwu_local,
1408 -                   rq->rq_sched_info.cpu_time,
1409 -                   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
1410 -
1411 -               seq_printf(seq, "\n");
1412 -
1413 -#ifdef CONFIG_SMP
1414 -               /* domain-specific stats */
1415 -               preempt_disable();
1416 -               for_each_domain(cpu, sd) {
1417 -                       enum idle_type itype;
1418 -                       char mask_str[NR_CPUS];
1419 -
1420 -                       cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
1421 -                       seq_printf(seq, "domain%d %s", dcnt++, mask_str);
1422 -                       for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
1423 -                                       itype++) {
1424 -                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
1425 -                                               "%lu",
1426 -                                   sd->lb_cnt[itype],
1427 -                                   sd->lb_balanced[itype],
1428 -                                   sd->lb_failed[itype],
1429 -                                   sd->lb_imbalance[itype],
1430 -                                   sd->lb_gained[itype],
1431 -                                   sd->lb_hot_gained[itype],
1432 -                                   sd->lb_nobusyq[itype],
1433 -                                   sd->lb_nobusyg[itype]);
1434 -                       }
1435 -                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
1436 -                           " %lu %lu %lu\n",
1437 -                           sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
1438 -                           sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
1439 -                           sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
1440 -                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
1441 -                           sd->ttwu_move_balance);
1442 -               }
1443 -               preempt_enable();
1444 -#endif
1445 -       }
1446 -       return 0;
1447 -}
1448 -
1449 -static int schedstat_open(struct inode *inode, struct file *file)
1450 -{
1451 -       unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
1452 -       char *buf = kmalloc(size, GFP_KERNEL);
1453 -       struct seq_file *m;
1454 -       int res;
1455 -
1456 -       if (!buf)
1457 -               return -ENOMEM;
1458 -       res = single_open(file, show_schedstat, NULL);
1459 -       if (!res) {
1460 -               m = file->private_data;
1461 -               m->buf = buf;
1462 -               m->size = size;
1463 -       } else
1464 -               kfree(buf);
1465 -       return res;
1466 -}
1467 -
1468 -const struct file_operations proc_schedstat_operations = {
1469 -       .open    = schedstat_open,
1470 -       .read    = seq_read,
1471 -       .llseek  = seq_lseek,
1472 -       .release = single_release,
1473 -};
1474 -
1475 -/*
1476 - * Expects runqueue lock to be held for atomicity of update
1477 - */
1478 -static inline void
1479 -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
1480 -{
1481 -       if (rq) {
1482 -               rq->rq_sched_info.run_delay += delta_jiffies;
1483 -               rq->rq_sched_info.pcnt++;
1484 -       }
1485 -}
1486 -
1487 -/*
1488 - * Expects runqueue lock to be held for atomicity of update
1489 - */
1490 -static inline void
1491 -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
1492 -{
1493 -       if (rq)
1494 -               rq->rq_sched_info.cpu_time += delta_jiffies;
1495 -}
1496 -# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
1497 -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
1498 -#else /* !CONFIG_SCHEDSTATS */
1499 -static inline void
1500 -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
1501 -{}
1502 -static inline void
1503 -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
1504 -{}
1505 -# define schedstat_inc(rq, field)      do { } while (0)
1506 -# define schedstat_add(rq, field, amt) do { } while (0)
1507 -#endif
1508 -
1509  /*
1510   * this_rq_lock - lock this runqueue and disable interrupts.
1511   */
1512 @@ -566,178 +467,60 @@ static inline struct rq *this_rq_lock(vo
1513         return rq;
1514  }
1515
1516 -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1517 -/*
1518 - * Called when a process is dequeued from the active array and given
1519 - * the cpu.  We should note that with the exception of interactive
1520 - * tasks, the expired queue will become the active queue after the active
1521 - * queue is empty, without explicitly dequeuing and requeuing tasks in the
1522 - * expired queue.  (Interactive tasks may be requeued directly to the
1523 - * active queue, thus delaying tasks in the expired queue from running;
1524 - * see scheduler_tick()).
1525 - *
1526 - * This function is only called from sched_info_arrive(), rather than
1527 - * dequeue_task(). Even though a task may be queued and dequeued multiple
1528 - * times as it is shuffled about, we're really interested in knowing how
1529 - * long it was from the *first* time it was queued to the time that it
1530 - * finally hit a cpu.
1531 - */
1532 -static inline void sched_info_dequeued(struct task_struct *t)
1533 -{
1534 -       t->sched_info.last_queued = 0;
1535 -}
1536 -
1537  /*
1538 - * Called when a task finally hits the cpu.  We can now calculate how
1539 - * long it was waiting to run.  We also note when it began so that we
1540 - * can keep stats on how long its timeslice is.
1541 + * CPU frequency is/was unstable - start anew by resetting prev_clock_raw:
1542   */
1543 -static void sched_info_arrive(struct task_struct *t)
1544 +void sched_clock_unstable_event(void)
1545  {
1546 -       unsigned long now = jiffies, delta_jiffies = 0;
1547 -
1548 -       if (t->sched_info.last_queued)
1549 -               delta_jiffies = now - t->sched_info.last_queued;
1550 -       sched_info_dequeued(t);
1551 -       t->sched_info.run_delay += delta_jiffies;
1552 -       t->sched_info.last_arrival = now;
1553 -       t->sched_info.pcnt++;
1554 +       unsigned long flags;
1555 +       struct rq *rq;
1556
1557 -       rq_sched_info_arrive(task_rq(t), delta_jiffies);
1558 +       rq = task_rq_lock(current, &flags);
1559 +       rq->prev_clock_raw = sched_clock();
1560 +       rq->clock_unstable_events++;
1561 +       task_rq_unlock(rq, &flags);
1562  }
1563
1564  /*
1565 - * Called when a process is queued into either the active or expired
1566 - * array.  The time is noted and later used to determine how long we
1567 - * had to wait for us to reach the cpu.  Since the expired queue will
1568 - * become the active queue after active queue is empty, without dequeuing
1569 - * and requeuing any tasks, we are interested in queuing to either. It
1570 - * is unusual but not impossible for tasks to be dequeued and immediately
1571 - * requeued in the same or another array: this can happen in sched_yield(),
1572 - * set_user_nice(), and even load_balance() as it moves tasks from runqueue
1573 - * to runqueue.
1574 + * resched_task - mark a task 'to be rescheduled now'.
1575   *
1576 - * This function is only called from enqueue_task(), but also only updates
1577 - * the timestamp if it is already not set.  It's assumed that
1578 - * sched_info_dequeued() will clear that stamp when appropriate.
1579 - */
1580 -static inline void sched_info_queued(struct task_struct *t)
1581 -{
1582 -       if (unlikely(sched_info_on()))
1583 -               if (!t->sched_info.last_queued)
1584 -                       t->sched_info.last_queued = jiffies;
1585 -}
1586 -
1587 -/*
1588 - * Called when a process ceases being the active-running process, either
1589 - * voluntarily or involuntarily.  Now we can calculate how long we ran.
1590 + * On UP this means the setting of the need_resched flag, on SMP it
1591 + * might also involve a cross-CPU call to trigger the scheduler on
1592 + * the target CPU.
1593   */
1594 -static inline void sched_info_depart(struct task_struct *t)
1595 -{
1596 -       unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
1597 +#ifdef CONFIG_SMP
1598
1599 -       t->sched_info.cpu_time += delta_jiffies;
1600 -       rq_sched_info_depart(task_rq(t), delta_jiffies);
1601 -}
1602 +#ifndef tsk_is_polling
1603 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1604 +#endif
1605
1606 -/*
1607 - * Called when tasks are switched involuntarily due, typically, to expiring
1608 - * their time slice.  (This may also be called when switching to or from
1609 - * the idle task.)  We are only called when prev != next.
1610 - */
1611 -static inline void
1612 -__sched_info_switch(struct task_struct *prev, struct task_struct *next)
1613 +static void resched_task(struct task_struct *p)
1614  {
1615 -       struct rq *rq = task_rq(prev);
1616 -
1617 -       /*
1618 -        * prev now departs the cpu.  It's not interesting to record
1619 -        * stats about how efficient we were at scheduling the idle
1620 -        * process, however.
1621 -        */
1622 -       if (prev != rq->idle)
1623 -               sched_info_depart(prev);
1624 +       int cpu;
1625
1626 -       if (next != rq->idle)
1627 -               sched_info_arrive(next);
1628 -}
1629 -static inline void
1630 -sched_info_switch(struct task_struct *prev, struct task_struct *next)
1631 -{
1632 -       if (unlikely(sched_info_on()))
1633 -               __sched_info_switch(prev, next);
1634 -}
1635 -#else
1636 -#define sched_info_queued(t)           do { } while (0)
1637 -#define sched_info_switch(t, next)     do { } while (0)
1638 -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
1639 +       assert_spin_locked(&task_rq(p)->lock);
1640
1641 -/*
1642 - * Adding/removing a task to/from a priority array:
1643 - */
1644 -static void dequeue_task(struct task_struct *p, struct prio_array *array)
1645 -{
1646 -       array->nr_active--;
1647 -       list_del(&p->run_list);
1648 -       if (list_empty(array->queue + p->prio))
1649 -               __clear_bit(p->prio, array->bitmap);
1650 -}
1651 +       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1652 +               return;
1653
1654 -static void enqueue_task(struct task_struct *p, struct prio_array *array)
1655 -{
1656 -       sched_info_queued(p);
1657 -       list_add_tail(&p->run_list, array->queue + p->prio);
1658 -       __set_bit(p->prio, array->bitmap);
1659 -       array->nr_active++;
1660 -       p->array = array;
1661 -}
1662 +       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1663
1664 -/*
1665 - * Put task to the end of the run list without the overhead of dequeue
1666 - * followed by enqueue.
1667 - */
1668 -static void requeue_task(struct task_struct *p, struct prio_array *array)
1669 -{
1670 -       list_move_tail(&p->run_list, array->queue + p->prio);
1671 -}
1672 +       cpu = task_cpu(p);
1673 +       if (cpu == smp_processor_id())
1674 +               return;
1675
1676 -static inline void
1677 -enqueue_task_head(struct task_struct *p, struct prio_array *array)
1678 -{
1679 -       list_add(&p->run_list, array->queue + p->prio);
1680 -       __set_bit(p->prio, array->bitmap);
1681 -       array->nr_active++;
1682 -       p->array = array;
1683 +       /* NEED_RESCHED must be visible before we test polling */
1684 +       smp_mb();
1685 +       if (!tsk_is_polling(p))
1686 +               smp_send_reschedule(cpu);
1687  }
1688 -
1689 -/*
1690 - * __normal_prio - return the priority that is based on the static
1691 - * priority but is modified by bonuses/penalties.
1692 - *
1693 - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
1694 - * into the -5 ... 0 ... +5 bonus/penalty range.
1695 - *
1696 - * We use 25% of the full 0...39 priority range so that:
1697 - *
1698 - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
1699 - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
1700 - *
1701 - * Both properties are important to certain workloads.
1702 - */
1703 -
1704 -static inline int __normal_prio(struct task_struct *p)
1705 +#else
1706 +static inline void resched_task(struct task_struct *p)
1707  {
1708 -       int bonus, prio;
1709 -
1710 -       bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
1711 -
1712 -       prio = p->static_prio - bonus;
1713 -       if (prio < MAX_RT_PRIO)
1714 -               prio = MAX_RT_PRIO;
1715 -       if (prio > MAX_PRIO-1)
1716 -               prio = MAX_PRIO-1;
1717 -       return prio;
1718 +       assert_spin_locked(&task_rq(p)->lock);
1719 +       set_tsk_need_resched(p);
1720  }
1721 +#endif
1722
1723  /*
1724   * To aid in avoiding the subversion of "niceness" due to uneven distribution
1725 @@ -761,22 +544,33 @@ static inline int __normal_prio(struct t
1726  #define RTPRIO_TO_LOAD_WEIGHT(rp) \
1727         (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
1728
1729 +/*
1730 + * Nice levels are logarithmic. These are the load shifts assigned
1731 + * to nice levels, where a step of every 2 nice levels means a
1732 + * multiplier of 2:
1733 + */
1734 +const int prio_to_load_shift[40] = {
1735 +/* -20 */ 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
1736 +/* -10 */ 15, 14, 14, 13, 13, 12, 12, 11, 11, 10,
1737 +/*   0 */ 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,
1738 +/*  10 */  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
1739 +};
1740 +
1741 +static int get_load_shift(struct task_struct *p)
1742 +{
1743 +       int prio = p->static_prio;
1744 +
1745 +       if (rt_prio(prio) || p->policy == SCHED_BATCH)
1746 +               return 0;
1747 +
1748 +       return prio_to_load_shift[prio - MAX_RT_PRIO];
1749 +}
1750 +
1751  static void set_load_weight(struct task_struct *p)
1752  {
1753 -       if (has_rt_policy(p)) {
1754 -#ifdef CONFIG_SMP
1755 -               if (p == task_rq(p)->migration_thread)
1756 -                       /*
1757 -                        * The migration thread does the actual balancing.
1758 -                        * Giving its load any weight will skew balancing
1759 -                        * adversely.
1760 -                        */
1761 -                       p->load_weight = 0;
1762 -               else
1763 -#endif
1764 -                       p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
1765 -       } else
1766 -               p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
1767 +       p->load_shift = get_load_shift(p);
1768 +       p->load_weight = 1 << p->load_shift;
1769 +       p->wait_runtime = 0;
1770  }
1771
1772  static inline void
1773 @@ -803,6 +597,40 @@ static inline void dec_nr_running(struct
1774         dec_raw_weighted_load(rq, p);
1775  }
1776
1777 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1778 +
1779 +#include "sched_stats.h"
1780 +#include "sched_rt.c"
1781 +#include "sched_fair.c"
1782 +#include "sched_debug.c"
1783 +
1784 +#define sched_class_highest (&rt_sched_class)
1785 +
1786 +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1787 +{
1788 +       u64 now = rq_clock(rq);
1789 +
1790 +       sched_info_queued(p);
1791 +       p->sched_class->enqueue_task(rq, p, wakeup, now);
1792 +       p->on_rq = 1;
1793 +}
1794 +
1795 +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1796 +{
1797 +       u64 now = rq_clock(rq);
1798 +
1799 +       p->sched_class->dequeue_task(rq, p, sleep, now);
1800 +       p->on_rq = 0;
1801 +}
1802 +
1803 +/*
1804 + * __normal_prio - return the priority that is based on the static prio
1805 + */
1806 +static inline int __normal_prio(struct task_struct *p)
1807 +{
1808 +       return p->static_prio;
1809 +}
1810 +
1811  /*
1812   * Calculate the expected normal priority: i.e. priority
1813   * without taking RT-inheritance into account. Might be
1814 @@ -842,210 +670,31 @@ static int effective_prio(struct task_st
1815  }
1816
1817  /*
1818 - * __activate_task - move a task to the runqueue.
1819 + * activate_task - move a task to the runqueue.
1820   */
1821 -static void __activate_task(struct task_struct *p, struct rq *rq)
1822 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1823  {
1824 -       struct prio_array *target = rq->active;
1825 -
1826 -       if (batch_task(p))
1827 -               target = rq->expired;
1828 -       enqueue_task(p, target);
1829 +       enqueue_task(rq, p, wakeup);
1830         inc_nr_running(p, rq);
1831  }
1832
1833  /*
1834 - * __activate_idle_task - move idle task to the _front_ of runqueue.
1835 + * activate_idle_task - move idle task to the _front_ of runqueue.
1836   */
1837 -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
1838 +static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1839  {
1840 -       enqueue_task_head(p, rq->active);
1841 +       enqueue_task(rq, p, 0);
1842         inc_nr_running(p, rq);
1843  }
1844
1845  /*
1846 - * Recalculate p->normal_prio and p->prio after having slept,
1847 - * updating the sleep-average too:
1848 - */
1849 -static int recalc_task_prio(struct task_struct *p, unsigned long long now)
1850 -{
1851 -       /* Caller must always ensure 'now >= p->timestamp' */
1852 -       unsigned long sleep_time = now - p->timestamp;
1853 -
1854 -       if (batch_task(p))
1855 -               sleep_time = 0;
1856 -
1857 -       if (likely(sleep_time > 0)) {
1858 -               /*
1859 -                * This ceiling is set to the lowest priority that would allow
1860 -                * a task to be reinserted into the active array on timeslice
1861 -                * completion.
1862 -                */
1863 -               unsigned long ceiling = INTERACTIVE_SLEEP(p);
1864 -
1865 -               if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
1866 -                       /*
1867 -                        * Prevents user tasks from achieving best priority
1868 -                        * with one single large enough sleep.
1869 -                        */
1870 -                       p->sleep_avg = ceiling;
1871 -                       /*
1872 -                        * Using INTERACTIVE_SLEEP() as a ceiling places a
1873 -                        * nice(0) task 1ms sleep away from promotion, and
1874 -                        * gives it 700ms to round-robin with no chance of
1875 -                        * being demoted.  This is more than generous, so
1876 -                        * mark this sleep as non-interactive to prevent the
1877 -                        * on-runqueue bonus logic from intervening should
1878 -                        * this task not receive cpu immediately.
1879 -                        */
1880 -                       p->sleep_type = SLEEP_NONINTERACTIVE;
1881 -               } else {
1882 -                       /*
1883 -                        * Tasks waking from uninterruptible sleep are
1884 -                        * limited in their sleep_avg rise as they
1885 -                        * are likely to be waiting on I/O
1886 -                        */
1887 -                       if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
1888 -                               if (p->sleep_avg >= ceiling)
1889 -                                       sleep_time = 0;
1890 -                               else if (p->sleep_avg + sleep_time >=
1891 -                                        ceiling) {
1892 -                                               p->sleep_avg = ceiling;
1893 -                                               sleep_time = 0;
1894 -                               }
1895 -                       }
1896 -
1897 -                       /*
1898 -                        * This code gives a bonus to interactive tasks.
1899 -                        *
1900 -                        * The boost works by updating the 'average sleep time'
1901 -                        * value here, based on ->timestamp. The more time a
1902 -                        * task spends sleeping, the higher the average gets -
1903 -                        * and the higher the priority boost gets as well.
1904 -                        */
1905 -                       p->sleep_avg += sleep_time;
1906 -
1907 -               }
1908 -               if (p->sleep_avg > NS_MAX_SLEEP_AVG)
1909 -                       p->sleep_avg = NS_MAX_SLEEP_AVG;
1910 -       }
1911 -
1912 -       return effective_prio(p);
1913 -}
1914 -
1915 -/*
1916 - * activate_task - move a task to the runqueue and do priority recalculation
1917 - *
1918 - * Update all the scheduling statistics stuff. (sleep average
1919 - * calculation, priority modifiers, etc.)
1920 - */
1921 -static void activate_task(struct task_struct *p, struct rq *rq, int local)
1922 -{
1923 -       unsigned long long now;
1924 -
1925 -       if (rt_task(p))
1926 -               goto out;
1927 -
1928 -       now = sched_clock();
1929 -#ifdef CONFIG_SMP
1930 -       if (!local) {
1931 -               /* Compensate for drifting sched_clock */
1932 -               struct rq *this_rq = this_rq();
1933 -               now = (now - this_rq->most_recent_timestamp)
1934 -                       + rq->most_recent_timestamp;
1935 -       }
1936 -#endif
1937 -
1938 -       /*
1939 -        * Sleep time is in units of nanosecs, so shift by 20 to get a
1940 -        * milliseconds-range estimation of the amount of time that the task
1941 -        * spent sleeping:
1942 -        */
1943 -       if (unlikely(prof_on == SLEEP_PROFILING)) {
1944 -               if (p->state == TASK_UNINTERRUPTIBLE)
1945 -                       profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1946 -                                    (now - p->timestamp) >> 20);
1947 -       }
1948 -
1949 -       p->prio = recalc_task_prio(p, now);
1950 -
1951 -       /*
1952 -        * This checks to make sure it's not an uninterruptible task
1953 -        * that is now waking up.
1954 -        */
1955 -       if (p->sleep_type == SLEEP_NORMAL) {
1956 -               /*
1957 -                * Tasks which were woken up by interrupts (ie. hw events)
1958 -                * are most likely of interactive nature. So we give them
1959 -                * the credit of extending their sleep time to the period
1960 -                * of time they spend on the runqueue, waiting for execution
1961 -                * on a CPU, first time around:
1962 -                */
1963 -               if (in_interrupt())
1964 -                       p->sleep_type = SLEEP_INTERRUPTED;
1965 -               else {
1966 -                       /*
1967 -                        * Normal first-time wakeups get a credit too for
1968 -                        * on-runqueue time, but it will be weighted down:
1969 -                        */
1970 -                       p->sleep_type = SLEEP_INTERACTIVE;
1971 -               }
1972 -       }
1973 -       p->timestamp = now;
1974 -out:
1975 -       __activate_task(p, rq);
1976 -}
1977 -
1978 -/*
1979   * deactivate_task - remove a task from the runqueue.
1980   */
1981 -static void deactivate_task(struct task_struct *p, struct rq *rq)
1982 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1983  {
1984 +       dequeue_task(rq, p, sleep);
1985         dec_nr_running(p, rq);
1986 -       dequeue_task(p, p->array);
1987 -       p->array = NULL;
1988 -}
1989 -
1990 -/*
1991 - * resched_task - mark a task 'to be rescheduled now'.
1992 - *
1993 - * On UP this means the setting of the need_resched flag, on SMP it
1994 - * might also involve a cross-CPU call to trigger the scheduler on
1995 - * the target CPU.
1996 - */
1997 -#ifdef CONFIG_SMP
1998 -
1999 -#ifndef tsk_is_polling
2000 -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
2001 -#endif
2002 -
2003 -static void resched_task(struct task_struct *p)
2004 -{
2005 -       int cpu;
2006 -
2007 -       assert_spin_locked(&task_rq(p)->lock);
2008 -
2009 -       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
2010 -               return;
2011 -
2012 -       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
2013 -
2014 -       cpu = task_cpu(p);
2015 -       if (cpu == smp_processor_id())
2016 -               return;
2017 -
2018 -       /* NEED_RESCHED must be visible before we test polling */
2019 -       smp_mb();
2020 -       if (!tsk_is_polling(p))
2021 -               smp_send_reschedule(cpu);
2022 -}
2023 -#else
2024 -static inline void resched_task(struct task_struct *p)
2025 -{
2026 -       assert_spin_locked(&task_rq(p)->lock);
2027 -       set_tsk_need_resched(p);
2028  }
2029 -#endif
2030
2031  /**
2032   * task_curr - is this task currently executing on a CPU?
2033 @@ -1085,7 +734,7 @@ migrate_task(struct task_struct *p, int
2034          * If the task is not on a runqueue (and not running), then
2035          * it is sufficient to simply update the task's cpu field.
2036          */
2037 -       if (!p->array && !task_running(rq, p)) {
2038 +       if (!p->on_rq && !task_running(rq, p)) {
2039                 set_task_cpu(p, dest_cpu);
2040                 return 0;
2041         }
2042 @@ -1116,7 +765,7 @@ void wait_task_inactive(struct task_stru
2043  repeat:
2044         rq = task_rq_lock(p, &flags);
2045         /* Must be off runqueue entirely, not preempted. */
2046 -       if (unlikely(p->array || task_running(rq, p))) {
2047 +       if (unlikely(p->on_rq || task_running(rq, p))) {
2048                 /* If it's preempted, we yield.  It could be a while. */
2049                 preempted = !task_running(rq, p);
2050                 task_rq_unlock(rq, &flags);
2051 @@ -1292,9 +941,9 @@ static int sched_balance_self(int cpu, i
2052         struct sched_domain *tmp, *sd = NULL;
2053
2054         for_each_domain(cpu, tmp) {
2055 -               /*
2056 -                * If power savings logic is enabled for a domain, stop there.
2057 -                */
2058 +               /*
2059 +                * If power savings logic is enabled for a domain, stop there.
2060 +                */
2061                 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2062                         break;
2063                 if (tmp->flags & flag)
2064 @@ -1412,7 +1061,7 @@ static int try_to_wake_up(struct task_st
2065         if (!(old_state & state))
2066                 goto out;
2067
2068 -       if (p->array)
2069 +       if (p->on_rq)
2070                 goto out_running;
2071
2072         cpu = task_cpu(p);
2073 @@ -1505,7 +1154,7 @@ out_set_cpu:
2074                 old_state = p->state;
2075                 if (!(old_state & state))
2076                         goto out;
2077 -               if (p->array)
2078 +               if (p->on_rq)
2079                         goto out_running;
2080
2081                 this_cpu = smp_processor_id();
2082 @@ -1514,25 +1163,10 @@ out_set_cpu:
2083
2084  out_activate:
2085  #endif /* CONFIG_SMP */
2086 -       if (old_state == TASK_UNINTERRUPTIBLE) {
2087 +       if (old_state == TASK_UNINTERRUPTIBLE)
2088                 rq->nr_uninterruptible--;
2089 -               /*
2090 -                * Tasks on involuntary sleep don't earn
2091 -                * sleep_avg beyond just interactive state.
2092 -                */
2093 -               p->sleep_type = SLEEP_NONINTERACTIVE;
2094 -       } else
2095
2096 -       /*
2097 -        * Tasks that have marked their sleep as noninteractive get
2098 -        * woken up with their sleep average not weighted in an
2099 -        * interactive way.
2100 -        */
2101 -               if (old_state & TASK_NONINTERACTIVE)
2102 -                       p->sleep_type = SLEEP_NONINTERACTIVE;
2103 -
2104 -
2105 -       activate_task(p, rq, cpu == this_cpu);
2106 +       activate_task(rq, p, 1);
2107         /*
2108          * Sync wakeups (i.e. those types of wakeups where the waker
2109          * has indicated that it will leave the CPU in short order)
2110 @@ -1541,10 +1175,8 @@ out_activate:
2111          * the waker guarantees that the freshly woken up task is going
2112          * to be considered on this CPU.)
2113          */
2114 -       if (!sync || cpu != this_cpu) {
2115 -               if (TASK_PREEMPTS_CURR(p, rq))
2116 -                       resched_task(rq->curr);
2117 -       }
2118 +       if (!sync || cpu != this_cpu)
2119 +               check_preempt_curr(rq, p);
2120         success = 1;
2121
2122  out_running:
2123 @@ -1567,19 +1199,35 @@ int fastcall wake_up_state(struct task_s
2124         return try_to_wake_up(p, state, 0);
2125  }
2126
2127 -static void task_running_tick(struct rq *rq, struct task_struct *p);
2128 +/*
2129 + * The task was running during this tick - call the class tick
2130 + * (to update the time slice counter and other statistics, etc.):
2131 + */
2132 +static void task_running_tick(struct rq *rq, struct task_struct *p)
2133 +{
2134 +       spin_lock(&rq->lock);
2135 +       p->sched_class->task_tick(rq, p);
2136 +       spin_unlock(&rq->lock);
2137 +}
2138 +
2139  /*
2140   * Perform scheduler related setup for a newly forked process p.
2141   * p is forked by current.
2142 + *
2143 + * __sched_fork() is basic setup used by init_idle() too:
2144   */
2145 -void fastcall sched_fork(struct task_struct *p, int clone_flags)
2146 +static void __sched_fork(struct task_struct *p)
2147  {
2148 -       int cpu = get_cpu();
2149 +       p->wait_start_fair = p->wait_start = p->exec_start = p->last_ran = 0;
2150 +       p->sum_exec_runtime = p->wait_runtime = 0;
2151 +       p->sum_wait_runtime = 0;
2152 +       p->sleep_start = p->block_start = 0;
2153 +       p->sleep_max = p->block_max = p->exec_max = p->wait_max = 0;
2154
2155 -#ifdef CONFIG_SMP
2156 -       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2157 -#endif
2158 -       set_task_cpu(p, cpu);
2159 +       INIT_LIST_HEAD(&p->run_list);
2160 +       p->on_rq = 0;
2161 +       p->nr_switches = 0;
2162 +       p->min_wait_runtime = 0;
2163
2164         /*
2165          * We mark the process as running here, but have not actually
2166 @@ -1588,16 +1236,29 @@ void fastcall sched_fork(struct task_str
2167          * event cannot wake it up and insert it on the runqueue either.
2168          */
2169         p->state = TASK_RUNNING;
2170 +}
2171 +
2172 +/*
2173 + * fork()/clone()-time setup:
2174 + */
2175 +void sched_fork(struct task_struct *p, int clone_flags)
2176 +{
2177 +       int cpu = get_cpu();
2178 +
2179 +       __sched_fork(p);
2180 +
2181 +#ifdef CONFIG_SMP
2182 +       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2183 +#endif
2184 +       set_task_cpu(p, cpu);
2185
2186         /*
2187          * Make sure we do not leak PI boosting priority to the child:
2188          */
2189         p->prio = current->normal_prio;
2190
2191 -       INIT_LIST_HEAD(&p->run_list);
2192 -       p->array = NULL;
2193  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2194 -       if (unlikely(sched_info_on()))
2195 +       if (likely(sched_info_on()))
2196                 memset(&p->sched_info, 0, sizeof(p->sched_info));
2197  #endif
2198  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2199 @@ -1607,34 +1268,16 @@ void fastcall sched_fork(struct task_str
2200         /* Want to start with kernel preemption disabled. */
2201         task_thread_info(p)->preempt_count = 1;
2202  #endif
2203 -       /*
2204 -        * Share the timeslice between parent and child, thus the
2205 -        * total amount of pending timeslices in the system doesn't change,
2206 -        * resulting in more scheduling fairness.
2207 -        */
2208 -       local_irq_disable();
2209 -       p->time_slice = (current->time_slice + 1) >> 1;
2210 -       /*
2211 -        * The remainder of the first timeslice might be recovered by
2212 -        * the parent if the child exits early enough.
2213 -        */
2214 -       p->first_time_slice = 1;
2215 -       current->time_slice >>= 1;
2216 -       p->timestamp = sched_clock();
2217 -       if (unlikely(!current->time_slice)) {
2218 -               /*
2219 -                * This case is rare, it happens when the parent has only
2220 -                * a single jiffy left from its timeslice. Taking the
2221 -                * runqueue lock is not a problem.
2222 -                */
2223 -               current->time_slice = 1;
2224 -               task_running_tick(cpu_rq(cpu), current);
2225 -       }
2226 -       local_irq_enable();
2227         put_cpu();
2228  }
2229
2230  /*
2231 + * After fork, child runs first (default). If set to 0 then
2232 + * the parent will (try to) run first.
2233 + */
2234 +unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
2235 +
2236 +/*
2237   * wake_up_new_task - wake up a newly created task for the first time.
2238   *
2239   * This function will do some initial scheduler statistics housekeeping
2240 @@ -1643,107 +1286,27 @@ void fastcall sched_fork(struct task_str
2241   */
2242  void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2243  {
2244 -       struct rq *rq, *this_rq;
2245         unsigned long flags;
2246 -       int this_cpu, cpu;
2247 +       struct rq *rq;
2248 +       int this_cpu;
2249
2250         rq = task_rq_lock(p, &flags);
2251         BUG_ON(p->state != TASK_RUNNING);
2252 -       this_cpu = smp_processor_id();
2253 -       cpu = task_cpu(p);
2254 -
2255 -       /*
2256 -        * We decrease the sleep average of forking parents
2257 -        * and children as well, to keep max-interactive tasks
2258 -        * from forking tasks that are max-interactive. The parent
2259 -        * (current) is done further down, under its lock.
2260 -        */
2261 -       p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
2262 -               CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
2263 +       this_cpu = smp_processor_id(); /* parent's CPU */
2264
2265         p->prio = effective_prio(p);
2266
2267 -       if (likely(cpu == this_cpu)) {
2268 -               if (!(clone_flags & CLONE_VM)) {
2269 -                       /*
2270 -                        * The VM isn't cloned, so we're in a good position to
2271 -                        * do child-runs-first in anticipation of an exec. This
2272 -                        * usually avoids a lot of COW overhead.
2273 -                        */
2274 -                       if (unlikely(!current->array))
2275 -                               __activate_task(p, rq);
2276 -                       else {
2277 -                               p->prio = current->prio;
2278 -                               p->normal_prio = current->normal_prio;
2279 -                               list_add_tail(&p->run_list, &current->run_list);
2280 -                               p->array = current->array;
2281 -                               p->array->nr_active++;
2282 -                               inc_nr_running(p, rq);
2283 -                       }
2284 -                       set_need_resched();
2285 -               } else
2286 -                       /* Run child last */
2287 -                       __activate_task(p, rq);
2288 -               /*
2289 -                * We skip the following code due to cpu == this_cpu
2290 -                *
2291 -                *   task_rq_unlock(rq, &flags);
2292 -                *   this_rq = task_rq_lock(current, &flags);
2293 -                */
2294 -               this_rq = rq;
2295 +       if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
2296 +                       task_cpu(p) != this_cpu || !current->on_rq) {
2297 +               activate_task(rq, p, 0);
2298         } else {
2299 -               this_rq = cpu_rq(this_cpu);
2300 -
2301 -               /*
2302 -                * Not the local CPU - must adjust timestamp. This should
2303 -                * get optimised away in the !CONFIG_SMP case.
2304 -                */
2305 -               p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
2306 -                                       + rq->most_recent_timestamp;
2307 -               __activate_task(p, rq);
2308 -               if (TASK_PREEMPTS_CURR(p, rq))
2309 -                       resched_task(rq->curr);
2310 -
2311                 /*
2312 -                * Parent and child are on different CPUs, now get the
2313 -                * parent runqueue to update the parent's ->sleep_avg:
2314 +                * Let the scheduling class do new task startup
2315 +                * management (if any):
2316                  */
2317 -               task_rq_unlock(rq, &flags);
2318 -               this_rq = task_rq_lock(current, &flags);
2319 +               p->sched_class->task_new(rq, p);
2320         }
2321 -       current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
2322 -               PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
2323 -       task_rq_unlock(this_rq, &flags);
2324 -}
2325 -
2326 -/*
2327 - * Potentially available exiting-child timeslices are
2328 - * retrieved here - this way the parent does not get
2329 - * penalized for creating too many threads.
2330 - *
2331 - * (this cannot be used to 'generate' timeslices
2332 - * artificially, because any timeslice recovered here
2333 - * was given away by the parent in the first place.)
2334 - */
2335 -void fastcall sched_exit(struct task_struct *p)
2336 -{
2337 -       unsigned long flags;
2338 -       struct rq *rq;
2339 -
2340 -       /*
2341 -        * If the child was a (relative-) CPU hog then decrease
2342 -        * the sleep_avg of the parent as well.
2343 -        */
2344 -       rq = task_rq_lock(p->parent, &flags);
2345 -       if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
2346 -               p->parent->time_slice += p->time_slice;
2347 -               if (unlikely(p->parent->time_slice > task_timeslice(p)))
2348 -                       p->parent->time_slice = task_timeslice(p);
2349 -       }
2350 -       if (p->sleep_avg < p->parent->sleep_avg)
2351 -               p->parent->sleep_avg = p->parent->sleep_avg /
2352 -               (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
2353 -               (EXIT_WEIGHT + 1);
2354 +       check_preempt_curr(rq, p);
2355         task_rq_unlock(rq, &flags);
2356  }
2357
2358 @@ -1941,17 +1504,56 @@ unsigned long nr_active(void)
2359         return running + uninterruptible;
2360  }
2361
2362 -#ifdef CONFIG_SMP
2363 -
2364 -/*
2365 - * Is this task likely cache-hot:
2366 - */
2367 -static inline int
2368 -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2369 +static void update_load_fair(struct rq *this_rq)
2370  {
2371 -       return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
2372 +       unsigned long this_load, fair_delta, exec_delta, idle_delta;
2373 +       unsigned int i, scale;
2374 +       s64 fair_delta64, exec_delta64;
2375 +       unsigned long tmp;
2376 +       u64 tmp64;
2377 +
2378 +       this_rq->nr_load_updates++;
2379 +
2380 +       fair_delta64 = this_rq->fair_clock - this_rq->prev_fair_clock + 1;
2381 +       this_rq->prev_fair_clock = this_rq->fair_clock;
2382 +       WARN_ON_ONCE(fair_delta64 <= 0);
2383 +
2384 +       exec_delta64 = this_rq->exec_clock - this_rq->prev_exec_clock + 1;
2385 +       this_rq->prev_exec_clock = this_rq->exec_clock;
2386 +       WARN_ON_ONCE(exec_delta64 <= 0);
2387 +
2388 +       if (fair_delta64 > (s64)LONG_MAX)
2389 +               fair_delta64 = (s64)LONG_MAX;
2390 +       fair_delta = (unsigned long)fair_delta64;
2391 +
2392 +       if (exec_delta64 > (s64)LONG_MAX)
2393 +               exec_delta64 = (s64)LONG_MAX;
2394 +       exec_delta = (unsigned long)exec_delta64;
2395 +       if (exec_delta > TICK_NSEC)
2396 +               exec_delta = TICK_NSEC;
2397 +
2398 +       idle_delta = TICK_NSEC - exec_delta;
2399 +
2400 +       tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta;
2401 +       tmp64 = (u64)tmp * (u64)exec_delta;
2402 +       do_div(tmp64, TICK_NSEC);
2403 +       this_load = (unsigned long)tmp64;
2404 +
2405 +       /* Update our load: */
2406 +       for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2407 +               unsigned long old_load, new_load;
2408 +
2409 +               /* scale is effectively 1 << i now, and >> i divides by scale */
2410 +
2411 +               old_load = this_rq->cpu_load[i];
2412 +               new_load = this_load;
2413 +
2414 +               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2415 +       }
2416  }
2417
2418 +#ifdef CONFIG_SMP
2419 +
2420  /*
2421   * double_rq_lock - safely lock two runqueues
2422   *
2423 @@ -2068,23 +1670,17 @@ void sched_exec(void)
2424   * pull_task - move a task from a remote runqueue to the local runqueue.
2425   * Both runqueues must be locked.
2426   */
2427 -static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2428 -                     struct task_struct *p, struct rq *this_rq,
2429 -                     struct prio_array *this_array, int this_cpu)
2430 +static void pull_task(struct rq *src_rq, struct task_struct *p,
2431 +                     struct rq *this_rq, int this_cpu)
2432  {
2433 -       dequeue_task(p, src_array);
2434 -       dec_nr_running(p, src_rq);
2435 +       deactivate_task(src_rq, p, 0);
2436         set_task_cpu(p, this_cpu);
2437 -       inc_nr_running(p, this_rq);
2438 -       enqueue_task(p, this_array);
2439 -       p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2440 -                               + this_rq->most_recent_timestamp;
2441 +       activate_task(this_rq, p, 0);
2442         /*
2443          * Note that idle threads have a prio of MAX_PRIO, for this test
2444          * to be always true for them.
2445          */
2446 -       if (TASK_PREEMPTS_CURR(p, this_rq))
2447 -               resched_task(this_rq->curr);
2448 +       check_preempt_curr(this_rq, p);
2449  }
2450
2451  /*
2452 @@ -2109,25 +1705,59 @@ int can_migrate_task(struct task_struct
2453                 return 0;
2454
2455         /*
2456 -        * Aggressive migration if:
2457 -        * 1) task is cache cold, or
2458 -        * 2) too many balance attempts have failed.
2459 +        * Aggressive migration if too many balance attempts have failed:
2460          */
2461 -
2462 -       if (sd->nr_balance_failed > sd->cache_nice_tries) {
2463 -#ifdef CONFIG_SCHEDSTATS
2464 -               if (task_hot(p, rq->most_recent_timestamp, sd))
2465 -                       schedstat_inc(sd, lb_hot_gained[idle]);
2466 -#endif
2467 +       if (sd->nr_balance_failed > sd->cache_nice_tries)
2468                 return 1;
2469 -       }
2470
2471 -       if (task_hot(p, rq->most_recent_timestamp, sd))
2472 -               return 0;
2473         return 1;
2474  }
2475
2476 -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2477 +/*
2478 + * Load-balancing iterator: iterate through the hierarchy of scheduling
2479 + * classes, starting with the highest-prio one:
2480 + */
2481 +
2482 +struct task_struct * load_balance_start(struct rq *rq)
2483 +{
2484 +       struct sched_class *class = sched_class_highest;
2485 +       struct task_struct *p;
2486 +
2487 +       do {
2488 +               p = class->load_balance_start(rq);
2489 +               if (p) {
2490 +                       rq->load_balance_class = class;
2491 +                       return p;
2492 +               }
2493 +               class = class->next;
2494 +       } while (class);
2495 +
2496 +       return NULL;
2497 +}
2498 +
2499 +struct task_struct * load_balance_next(struct rq *rq)
2500 +{
2501 +       struct sched_class *class = rq->load_balance_class;
2502 +       struct task_struct *p;
2503 +
2504 +       p = class->load_balance_next(rq);
2505 +       if (p)
2506 +               return p;
2507 +       /*
2508 +        * Pick up the next class (if any) and attempt to start
2509 +        * the iterator there:
2510 +        */
2511 +       while ((class = class->next)) {
2512 +               p = class->load_balance_start(rq);
2513 +               if (p) {
2514 +                       rq->load_balance_class = class;
2515 +                       return p;
2516 +               }
2517 +       }
2518 +       return NULL;
2519 +}
2520 +
2521 +#define rq_best_prio(rq) ((rq)->curr->prio)
2522
2523  /*
2524   * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2525 @@ -2141,11 +1771,9 @@ static int move_tasks(struct rq *this_rq
2526                       struct sched_domain *sd, enum idle_type idle,
2527                       int *all_pinned)
2528  {
2529 -       int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2530 +       int pulled = 0, pinned = 0, this_best_prio, best_prio,
2531             best_prio_seen, skip_for_load;
2532 -       struct prio_array *array, *dst_array;
2533 -       struct list_head *head, *curr;
2534 -       struct task_struct *tmp;
2535 +       struct task_struct *p;
2536         long rem_load_move;
2537
2538         if (max_nr_move == 0 || max_load_move == 0)
2539 @@ -2165,76 +1793,41 @@ static int move_tasks(struct rq *this_rq
2540         best_prio_seen = best_prio == busiest->curr->prio;
2541
2542         /*
2543 -        * We first consider expired tasks. Those will likely not be
2544 -        * executed in the near future, and they are most likely to
2545 -        * be cache-cold, thus switching CPUs has the least effect
2546 -        * on them.
2547 -        */
2548 -       if (busiest->expired->nr_active) {
2549 -               array = busiest->expired;
2550 -               dst_array = this_rq->expired;
2551 -       } else {
2552 -               array = busiest->active;
2553 -               dst_array = this_rq->active;
2554 -       }
2555 -
2556 -new_array:
2557 -       /* Start searching at priority 0: */
2558 -       idx = 0;
2559 -skip_bitmap:
2560 -       if (!idx)
2561 -               idx = sched_find_first_bit(array->bitmap);
2562 -       else
2563 -               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2564 -       if (idx >= MAX_PRIO) {
2565 -               if (array == busiest->expired && busiest->active->nr_active) {
2566 -                       array = busiest->active;
2567 -                       dst_array = this_rq->active;
2568 -                       goto new_array;
2569 -               }
2570 +        * Start the load-balancing iterator:
2571 +        */
2572 +       p = load_balance_start(busiest);
2573 +next:
2574 +       if (!p)
2575                 goto out;
2576 -       }
2577 -
2578 -       head = array->queue + idx;
2579 -       curr = head->prev;
2580 -skip_queue:
2581 -       tmp = list_entry(curr, struct task_struct, run_list);
2582 -
2583 -       curr = curr->prev;
2584 -
2585         /*
2586          * To help distribute high priority tasks accross CPUs we don't
2587          * skip a task if it will be the highest priority task (i.e. smallest
2588          * prio value) on its new queue regardless of its load weight
2589          */
2590 -       skip_for_load = tmp->load_weight > rem_load_move;
2591 -       if (skip_for_load && idx < this_best_prio)
2592 -               skip_for_load = !best_prio_seen && idx == best_prio;
2593 +       skip_for_load = p->load_weight > rem_load_move;
2594 +       if (skip_for_load && p->prio < this_best_prio)
2595 +               skip_for_load = !best_prio_seen && p->prio == best_prio;
2596         if (skip_for_load ||
2597 -           !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2598 +           !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2599
2600 -               best_prio_seen |= idx == best_prio;
2601 -               if (curr != head)
2602 -                       goto skip_queue;
2603 -               idx++;
2604 -               goto skip_bitmap;
2605 +               best_prio_seen |= p->prio == best_prio;
2606 +               p = load_balance_next(busiest);
2607 +               goto next;
2608         }
2609
2610 -       pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2611 +       pull_task(busiest, p, this_rq, this_cpu);
2612         pulled++;
2613 -       rem_load_move -= tmp->load_weight;
2614 +       rem_load_move -= p->load_weight;
2615
2616         /*
2617          * We only want to steal up to the prescribed number of tasks
2618          * and the prescribed amount of weighted load.
2619          */
2620         if (pulled < max_nr_move && rem_load_move > 0) {
2621 -               if (idx < this_best_prio)
2622 -                       this_best_prio = idx;
2623 -               if (curr != head)
2624 -                       goto skip_queue;
2625 -               idx++;
2626 -               goto skip_bitmap;
2627 +               if (p->prio < this_best_prio)
2628 +                       this_best_prio = p->prio;
2629 +               p = load_balance_next(busiest);
2630 +               goto next;
2631         }
2632  out:
2633         /*
2634 @@ -2360,8 +1953,8 @@ find_busiest_group(struct sched_domain *
2635                  * Busy processors will not participate in power savings
2636                  * balance.
2637                  */
2638 -               if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2639 -                       goto group_next;
2640 +               if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2641 +                       goto group_next;
2642
2643                 /*
2644                  * If the local group is idle or completely loaded
2645 @@ -2371,42 +1964,42 @@ find_busiest_group(struct sched_domain *
2646                                     !this_nr_running))
2647                         power_savings_balance = 0;
2648
2649 -               /*
2650 +               /*
2651                  * If a group is already running at full capacity or idle,
2652                  * don't include that group in power savings calculations
2653 -                */
2654 -               if (!power_savings_balance || sum_nr_running >= group_capacity
2655 +                */
2656 +               if (!power_savings_balance || sum_nr_running >= group_capacity
2657                     || !sum_nr_running)
2658 -                       goto group_next;
2659 +                       goto group_next;
2660
2661 -               /*
2662 +               /*
2663                  * Calculate the group which has the least non-idle load.
2664 -                * This is the group from where we need to pick up the load
2665 -                * for saving power
2666 -                */
2667 -               if ((sum_nr_running < min_nr_running) ||
2668 -                   (sum_nr_running == min_nr_running &&
2669 +                * This is the group from where we need to pick up the load
2670 +                * for saving power
2671 +                */
2672 +               if ((sum_nr_running < min_nr_running) ||
2673 +                   (sum_nr_running == min_nr_running &&
2674                      first_cpu(group->cpumask) <
2675                      first_cpu(group_min->cpumask))) {
2676 -                       group_min = group;
2677 -                       min_nr_running = sum_nr_running;
2678 +                       group_min = group;
2679 +                       min_nr_running = sum_nr_running;
2680                         min_load_per_task = sum_weighted_load /
2681                                                 sum_nr_running;
2682 -               }
2683 +               }
2684
2685 -               /*
2686 +               /*
2687                  * Calculate the group which is almost near its
2688 -                * capacity but still has some space to pick up some load
2689 -                * from other group and save more power
2690 -                */
2691 -               if (sum_nr_running <= group_capacity - 1) {
2692 -                       if (sum_nr_running > leader_nr_running ||
2693 -                           (sum_nr_running == leader_nr_running &&
2694 -                            first_cpu(group->cpumask) >
2695 -                             first_cpu(group_leader->cpumask))) {
2696 -                               group_leader = group;
2697 -                               leader_nr_running = sum_nr_running;
2698 -                       }
2699 +                * capacity but still has some space to pick up some load
2700 +                * from other group and save more power
2701 +                */
2702 +               if (sum_nr_running <= group_capacity - 1) {
2703 +                       if (sum_nr_running > leader_nr_running ||
2704 +                           (sum_nr_running == leader_nr_running &&
2705 +                            first_cpu(group->cpumask) >
2706 +                             first_cpu(group_leader->cpumask))) {
2707 +                               group_leader = group;
2708 +                               leader_nr_running = sum_nr_running;
2709 +                       }
2710                 }
2711  group_next:
2712  #endif
2713 @@ -2461,7 +2054,7 @@ group_next:
2714          * a think about bumping its value to force at least one task to be
2715          * moved
2716          */
2717 -       if (*imbalance < busiest_load_per_task) {
2718 +       if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
2719                 unsigned long tmp, pwr_now, pwr_move;
2720                 unsigned int imbn;
2721
2722 @@ -2475,7 +2068,8 @@ small_imbalance:
2723                 } else
2724                         this_load_per_task = SCHED_LOAD_SCALE;
2725
2726 -               if (max_load - this_load >= busiest_load_per_task * imbn) {
2727 +               if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2728 +                                       busiest_load_per_task * imbn) {
2729                         *imbalance = busiest_load_per_task;
2730                         return busiest;
2731                 }
2732 @@ -2884,30 +2478,6 @@ static void active_load_balance(struct r
2733         spin_unlock(&target_rq->lock);
2734  }
2735
2736 -static void update_load(struct rq *this_rq)
2737 -{
2738 -       unsigned long this_load;
2739 -       int i, scale;
2740 -
2741 -       this_load = this_rq->raw_weighted_load;
2742 -
2743 -       /* Update our load: */
2744 -       for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
2745 -               unsigned long old_load, new_load;
2746 -
2747 -               old_load = this_rq->cpu_load[i];
2748 -               new_load = this_load;
2749 -               /*
2750 -                * Round up the averaging division if load is increasing. This
2751 -                * prevents us from getting stuck on 9 if the load is 10, for
2752 -                * example.
2753 -                */
2754 -               if (new_load > old_load)
2755 -                       new_load += scale-1;
2756 -               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2757 -       }
2758 -}
2759 -
2760  /*
2761   * run_rebalance_domains is triggered when needed from the scheduler tick.
2762   *
2763 @@ -2987,76 +2557,27 @@ static inline void idle_balance(int cpu,
2764  }
2765  #endif
2766
2767 -static inline void wake_priority_sleeper(struct rq *rq)
2768 -{
2769 -#ifdef CONFIG_SCHED_SMT
2770 -       if (!rq->nr_running)
2771 -               return;
2772 -
2773 -       spin_lock(&rq->lock);
2774 -       /*
2775 -        * If an SMT sibling task has been put to sleep for priority
2776 -        * reasons reschedule the idle task to see if it can now run.
2777 -        */
2778 -       if (rq->nr_running)
2779 -               resched_task(rq->idle);
2780 -       spin_unlock(&rq->lock);
2781 -#endif
2782 -}
2783 -
2784  DEFINE_PER_CPU(struct kernel_stat, kstat);
2785
2786  EXPORT_PER_CPU_SYMBOL(kstat);
2787
2788  /*
2789 - * This is called on clock ticks and on context switches.
2790 - * Bank in p->sched_time the ns elapsed since the last tick or switch.
2791 - */
2792 -static inline void
2793 -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2794 -{
2795 -       p->sched_time += now - p->last_ran;
2796 -       p->last_ran = rq->most_recent_timestamp = now;
2797 -}
2798 -
2799 -/*
2800 - * Return current->sched_time plus any more ns on the sched_clock
2801 + * Return current->sum_exec_runtime plus any more ns on the sched_clock
2802   * that have not yet been banked.
2803   */
2804 -unsigned long long current_sched_time(const struct task_struct *p)
2805 +unsigned long long current_sched_runtime(const struct task_struct *p)
2806  {
2807         unsigned long long ns;
2808         unsigned long flags;
2809
2810         local_irq_save(flags);
2811 -       ns = p->sched_time + sched_clock() - p->last_ran;
2812 +       ns = p->sum_exec_runtime + sched_clock() - p->last_ran;
2813         local_irq_restore(flags);
2814
2815         return ns;
2816  }
2817
2818  /*
2819 - * We place interactive tasks back into the active array, if possible.
2820 - *
2821 - * To guarantee that this does not starve expired tasks we ignore the
2822 - * interactivity of a task if the first expired task had to wait more
2823 - * than a 'reasonable' amount of time. This deadline timeout is
2824 - * load-dependent, as the frequency of array switched decreases with
2825 - * increasing number of running tasks. We also ignore the interactivity
2826 - * if a better static_prio task has expired:
2827 - */
2828 -static inline int expired_starving(struct rq *rq)
2829 -{
2830 -       if (rq->curr->static_prio > rq->best_expired_prio)
2831 -               return 1;
2832 -       if (!STARVATION_LIMIT || !rq->expired_timestamp)
2833 -               return 0;
2834 -       if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
2835 -               return 1;
2836 -       return 0;
2837 -}
2838 -
2839 -/*
2840   * Account user cpu time to a process.
2841   * @p: the process that the cpu time gets accounted to
2842   * @hardirq_offset: the offset to subtract from hardirq_count()
2843 @@ -3129,81 +2650,6 @@ void account_steal_time(struct task_stru
2844                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2845  }
2846
2847 -static void task_running_tick(struct rq *rq, struct task_struct *p)
2848 -{
2849 -       if (p->array != rq->active) {
2850 -               /* Task has expired but was not scheduled yet */
2851 -               set_tsk_need_resched(p);
2852 -               return;
2853 -       }
2854 -       spin_lock(&rq->lock);
2855 -       /*
2856 -        * The task was running during this tick - update the
2857 -        * time slice counter. Note: we do not update a thread's
2858 -        * priority until it either goes to sleep or uses up its
2859 -        * timeslice. This makes it possible for interactive tasks
2860 -        * to use up their timeslices at their highest priority levels.
2861 -        */
2862 -       if (rt_task(p)) {
2863 -               /*
2864 -                * RR tasks need a special form of timeslice management.
2865 -                * FIFO tasks have no timeslices.
2866 -                */
2867 -               if ((p->policy == SCHED_RR) && !--p->time_slice) {
2868 -                       p->time_slice = task_timeslice(p);
2869 -                       p->first_time_slice = 0;
2870 -                       set_tsk_need_resched(p);
2871 -
2872 -                       /* put it at the end of the queue: */
2873 -                       requeue_task(p, rq->active);
2874 -               }
2875 -               goto out_unlock;
2876 -       }
2877 -       if (!--p->time_slice) {
2878 -               dequeue_task(p, rq->active);
2879 -               set_tsk_need_resched(p);
2880 -               p->prio = effective_prio(p);
2881 -               p->time_slice = task_timeslice(p);
2882 -               p->first_time_slice = 0;
2883 -
2884 -               if (!rq->expired_timestamp)
2885 -                       rq->expired_timestamp = jiffies;
2886 -               if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
2887 -                       enqueue_task(p, rq->expired);
2888 -                       if (p->static_prio < rq->best_expired_prio)
2889 -                               rq->best_expired_prio = p->static_prio;
2890 -               } else
2891 -                       enqueue_task(p, rq->active);
2892 -       } else {
2893 -               /*
2894 -                * Prevent a too long timeslice allowing a task to monopolize
2895 -                * the CPU. We do this by splitting up the timeslice into
2896 -                * smaller pieces.
2897 -                *
2898 -                * Note: this does not mean the task's timeslices expire or
2899 -                * get lost in any way, they just might be preempted by
2900 -                * another task of equal priority. (one with higher
2901 -                * priority would have preempted this task already.) We
2902 -                * requeue this task to the end of the list on this priority
2903 -                * level, which is in essence a round-robin of tasks with
2904 -                * equal priority.
2905 -                *
2906 -                * This only applies to tasks in the interactive
2907 -                * delta range with at least TIMESLICE_GRANULARITY to requeue.
2908 -                */
2909 -               if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2910 -                       p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2911 -                       (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2912 -                       (p->array == rq->active)) {
2913 -
2914 -                       requeue_task(p, rq->active);
2915 -                       set_tsk_need_resched(p);
2916 -               }
2917 -       }
2918 -out_unlock:
2919 -       spin_unlock(&rq->lock);
2920 -}
2921 -
2922  /*
2923   * This function gets called by the timer code, with HZ frequency.
2924   * We call it with interrupts disabled.
2925 @@ -3213,155 +2659,19 @@ out_unlock:
2926   */
2927  void scheduler_tick(void)
2928  {
2929 -       unsigned long long now = sched_clock();
2930         struct task_struct *p = current;
2931         int cpu = smp_processor_id();
2932         struct rq *rq = cpu_rq(cpu);
2933
2934 -       update_cpu_clock(p, rq, now);
2935 -
2936 -       if (p == rq->idle)
2937 -               /* Task on the idle queue */
2938 -               wake_priority_sleeper(rq);
2939 -       else
2940 +       if (p != rq->idle)
2941                 task_running_tick(rq, p);
2942 +       update_load_fair(rq);
2943  #ifdef CONFIG_SMP
2944 -       update_load(rq);
2945         if (time_after_eq(jiffies, rq->next_balance))
2946                 raise_softirq(SCHED_SOFTIRQ);
2947  #endif
2948  }
2949
2950 -#ifdef CONFIG_SCHED_SMT
2951 -static inline void wakeup_busy_runqueue(struct rq *rq)
2952 -{
2953 -       /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2954 -       if (rq->curr == rq->idle && rq->nr_running)
2955 -               resched_task(rq->idle);
2956 -}
2957 -
2958 -/*
2959 - * Called with interrupt disabled and this_rq's runqueue locked.
2960 - */
2961 -static void wake_sleeping_dependent(int this_cpu)
2962 -{
2963 -       struct sched_domain *tmp, *sd = NULL;
2964 -       int i;
2965 -
2966 -       for_each_domain(this_cpu, tmp) {
2967 -               if (tmp->flags & SD_SHARE_CPUPOWER) {
2968 -                       sd = tmp;
2969 -                       break;
2970 -               }
2971 -       }
2972 -
2973 -       if (!sd)
2974 -               return;
2975 -
2976 -       for_each_cpu_mask(i, sd->span) {
2977 -               struct rq *smt_rq = cpu_rq(i);
2978 -
2979 -               if (i == this_cpu)
2980 -                       continue;
2981 -               if (unlikely(!spin_trylock(&smt_rq->lock)))
2982 -                       continue;
2983 -
2984 -               wakeup_busy_runqueue(smt_rq);
2985 -               spin_unlock(&smt_rq->lock);
2986 -       }
2987 -}
2988 -
2989 -/*
2990 - * number of 'lost' timeslices this task wont be able to fully
2991 - * utilize, if another task runs on a sibling. This models the
2992 - * slowdown effect of other tasks running on siblings:
2993 - */
2994 -static inline unsigned long
2995 -smt_slice(struct task_struct *p, struct sched_domain *sd)
2996 -{
2997 -       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2998 -}
2999 -
3000 -/*
3001 - * To minimise lock contention and not have to drop this_rq's runlock we only
3002 - * trylock the sibling runqueues and bypass those runqueues if we fail to
3003 - * acquire their lock. As we only trylock the normal locking order does not
3004 - * need to be obeyed.
3005 - */
3006 -static int
3007 -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
3008 -{
3009 -       struct sched_domain *tmp, *sd = NULL;
3010 -       int ret = 0, i;
3011 -
3012 -       /* kernel/rt threads do not participate in dependent sleeping */
3013 -       if (!p->mm || rt_task(p))
3014 -               return 0;
3015 -
3016 -       for_each_domain(this_cpu, tmp) {
3017 -               if (tmp->flags & SD_SHARE_CPUPOWER) {
3018 -                       sd = tmp;
3019 -                       break;
3020 -               }
3021 -       }
3022 -
3023 -       if (!sd)
3024 -               return 0;
3025 -
3026 -       for_each_cpu_mask(i, sd->span) {
3027 -               struct task_struct *smt_curr;
3028 -               struct rq *smt_rq;
3029 -
3030 -               if (i == this_cpu)
3031 -                       continue;
3032 -
3033 -               smt_rq = cpu_rq(i);
3034 -               if (unlikely(!spin_trylock(&smt_rq->lock)))
3035 -                       continue;
3036 -
3037 -               smt_curr = smt_rq->curr;
3038 -
3039 -               if (!smt_curr->mm)
3040 -                       goto unlock;
3041 -
3042 -               /*
3043 -                * If a user task with lower static priority than the
3044 -                * running task on the SMT sibling is trying to schedule,
3045 -                * delay it till there is proportionately less timeslice
3046 -                * left of the sibling task to prevent a lower priority
3047 -                * task from using an unfair proportion of the
3048 -                * physical cpu's resources. -ck
3049 -                */
3050 -               if (rt_task(smt_curr)) {
3051 -                       /*
3052 -                        * With real time tasks we run non-rt tasks only
3053 -                        * per_cpu_gain% of the time.
3054 -                        */
3055 -                       if ((jiffies % DEF_TIMESLICE) >
3056 -                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
3057 -                                       ret = 1;
3058 -               } else {
3059 -                       if (smt_curr->static_prio < p->static_prio &&
3060 -                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
3061 -                               smt_slice(smt_curr, sd) > task_timeslice(p))
3062 -                                       ret = 1;
3063 -               }
3064 -unlock:
3065 -               spin_unlock(&smt_rq->lock);
3066 -       }
3067 -       return ret;
3068 -}
3069 -#else
3070 -static inline void wake_sleeping_dependent(int this_cpu)
3071 -{
3072 -}
3073 -static inline int
3074 -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
3075 -{
3076 -       return 0;
3077 -}
3078 -#endif
3079 -
3080  #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3081
3082  void fastcall add_preempt_count(int val)
3083 @@ -3400,49 +2710,27 @@ EXPORT_SYMBOL(sub_preempt_count);
3084
3085  #endif
3086
3087 -static inline int interactive_sleep(enum sleep_type sleep_type)
3088 -{
3089 -       return (sleep_type == SLEEP_INTERACTIVE ||
3090 -               sleep_type == SLEEP_INTERRUPTED);
3091 -}
3092 -
3093  /*
3094 - * schedule() is the main scheduler function.
3095 + * Various schedule()-time debugging checks and statistics:
3096   */
3097 -asmlinkage void __sched schedule(void)
3098 +static inline void schedule_debug(struct rq *rq, struct task_struct *prev)
3099  {
3100 -       struct task_struct *prev, *next;
3101 -       struct prio_array *array;
3102 -       struct list_head *queue;
3103 -       unsigned long long now;
3104 -       unsigned long run_time;
3105 -       int cpu, idx, new_prio;
3106 -       long *switch_count;
3107 -       struct rq *rq;
3108 -
3109         /*
3110          * Test if we are atomic.  Since do_exit() needs to call into
3111          * schedule() atomically, we ignore that path for now.
3112          * Otherwise, whine if we are scheduling when we should not be.
3113          */
3114 -       if (unlikely(in_atomic() && !current->exit_state)) {
3115 +       if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) {
3116                 printk(KERN_ERR "BUG: scheduling while atomic: "
3117                         "%s/0x%08x/%d\n",
3118 -                       current->comm, preempt_count(), current->pid);
3119 -               debug_show_held_locks(current);
3120 +                       prev->comm, preempt_count(), prev->pid);
3121 +               debug_show_held_locks(prev);
3122                 if (irqs_disabled())
3123 -                       print_irqtrace_events(current);
3124 +                       print_irqtrace_events(prev);
3125                 dump_stack();
3126         }
3127         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3128
3129 -need_resched:
3130 -       preempt_disable();
3131 -       prev = current;
3132 -       release_kernel_lock(prev);
3133 -need_resched_nonpreemptible:
3134 -       rq = this_rq();
3135 -
3136         /*
3137          * The idle thread is not allowed to schedule!
3138          * Remove this check after it has been exercised a bit.
3139 @@ -3453,19 +2741,45 @@ need_resched_nonpreemptible:
3140         }
3141
3142         schedstat_inc(rq, sched_cnt);
3143 -       now = sched_clock();
3144 -       if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
3145 -               run_time = now - prev->timestamp;
3146 -               if (unlikely((long long)(now - prev->timestamp) < 0))
3147 -                       run_time = 0;
3148 -       } else
3149 -               run_time = NS_MAX_SLEEP_AVG;
3150 +}
3151
3152 -       /*
3153 -        * Tasks charged proportionately less run_time at high sleep_avg to
3154 -        * delay them losing their interactive status
3155 -        */
3156 -       run_time /= (CURRENT_BONUS(prev) ? : 1);
3157 +static inline struct task_struct *
3158 +pick_next_task(struct rq *rq, struct task_struct *prev)
3159 +{
3160 +       struct sched_class *class = sched_class_highest;
3161 +       u64 now = __rq_clock(rq);
3162 +       struct task_struct *p;
3163 +
3164 +       prev->sched_class->put_prev_task(rq, prev, now);
3165 +
3166 +       do {
3167 +               p = class->pick_next_task(rq, now);
3168 +               if (p)
3169 +                       return p;
3170 +               class = class->next;
3171 +       } while (class);
3172 +
3173 +       return NULL;
3174 +}
3175 +
3176 +/*
3177 + * schedule() is the main scheduler function.
3178 + */
3179 +asmlinkage void __sched schedule(void)
3180 +{
3181 +       struct task_struct *prev, *next;
3182 +       long *switch_count;
3183 +       struct rq *rq;
3184 +       int cpu;
3185 +
3186 +need_resched:
3187 +       preempt_disable();
3188 +       prev = current;
3189 +       release_kernel_lock(prev);
3190 +need_resched_nonpreemptible:
3191 +       rq = this_rq();
3192 +
3193 +       schedule_debug(rq, prev);
3194
3195         spin_lock_irq(&rq->lock);
3196
3197 @@ -3478,7 +2792,7 @@ need_resched_nonpreemptible:
3198                 else {
3199                         if (prev->state == TASK_UNINTERRUPTIBLE)
3200                                 rq->nr_uninterruptible++;
3201 -                       deactivate_task(prev, rq);
3202 +                       deactivate_task(rq, prev, 1);
3203                 }
3204         }
3205
3206 @@ -3486,68 +2800,25 @@ need_resched_nonpreemptible:
3207         if (unlikely(!rq->nr_running)) {
3208                 idle_balance(cpu, rq);
3209                 if (!rq->nr_running) {
3210 +                       prev->sched_class->put_prev_task(rq, prev,
3211 +                                                        __rq_clock(rq));
3212                         next = rq->idle;
3213 -                       rq->expired_timestamp = 0;
3214 -                       wake_sleeping_dependent(cpu);
3215 +                       schedstat_inc(rq, sched_goidle);
3216                         goto switch_tasks;
3217                 }
3218         }
3219
3220 -       array = rq->active;
3221 -       if (unlikely(!array->nr_active)) {
3222 -               /*
3223 -                * Switch the active and expired arrays.
3224 -                */
3225 -               schedstat_inc(rq, sched_switch);
3226 -               rq->active = rq->expired;
3227 -               rq->expired = array;
3228 -               array = rq->active;
3229 -               rq->expired_timestamp = 0;
3230 -               rq->best_expired_prio = MAX_PRIO;
3231 -       }
3232 -
3233 -       idx = sched_find_first_bit(array->bitmap);
3234 -       queue = array->queue + idx;
3235 -       next = list_entry(queue->next, struct task_struct, run_list);
3236 -
3237 -       if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3238 -               unsigned long long delta = now - next->timestamp;
3239 -               if (unlikely((long long)(now - next->timestamp) < 0))
3240 -                       delta = 0;
3241 -
3242 -               if (next->sleep_type == SLEEP_INTERACTIVE)
3243 -                       delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3244 -
3245 -               array = next->array;
3246 -               new_prio = recalc_task_prio(next, next->timestamp + delta);
3247 -
3248 -               if (unlikely(next->prio != new_prio)) {
3249 -                       dequeue_task(next, array);
3250 -                       next->prio = new_prio;
3251 -                       enqueue_task(next, array);
3252 -               }
3253 -       }
3254 -       next->sleep_type = SLEEP_NORMAL;
3255 -       if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next))
3256 -               next = rq->idle;
3257 +       next = pick_next_task(rq, prev);
3258 +       next->nr_switches++;
3259 +
3260  switch_tasks:
3261 -       if (next == rq->idle)
3262 -               schedstat_inc(rq, sched_goidle);
3263         prefetch(next);
3264         prefetch_stack(next);
3265         clear_tsk_need_resched(prev);
3266         rcu_qsctr_inc(task_cpu(prev));
3267
3268 -       update_cpu_clock(prev, rq, now);
3269 -
3270 -       prev->sleep_avg -= run_time;
3271 -       if ((long)prev->sleep_avg <= 0)
3272 -               prev->sleep_avg = 0;
3273 -       prev->timestamp = prev->last_ran = now;
3274 -
3275         sched_info_switch(prev, next);
3276         if (likely(prev != next)) {
3277 -               next->timestamp = next->last_ran = now;
3278                 rq->nr_switches++;
3279                 rq->curr = next;
3280                 ++*switch_count;
3281 @@ -3978,29 +3249,28 @@ EXPORT_SYMBOL(sleep_on_timeout);
3282   */
3283  void rt_mutex_setprio(struct task_struct *p, int prio)
3284  {
3285 -       struct prio_array *array;
3286         unsigned long flags;
3287 +       int oldprio, on_rq;
3288         struct rq *rq;
3289 -       int oldprio;
3290
3291         BUG_ON(prio < 0 || prio > MAX_PRIO);
3292
3293         rq = task_rq_lock(p, &flags);
3294
3295         oldprio = p->prio;
3296 -       array = p->array;
3297 -       if (array)
3298 -               dequeue_task(p, array);
3299 +       on_rq = p->on_rq;
3300 +       if (on_rq)
3301 +               dequeue_task(rq, p, 0);
3302 +
3303 +       if (rt_prio(prio))
3304 +               p->sched_class = &rt_sched_class;
3305 +       else
3306 +               p->sched_class = &fair_sched_class;
3307 +
3308         p->prio = prio;
3309
3310 -       if (array) {
3311 -               /*
3312 -                * If changing to an RT priority then queue it
3313 -                * in the active array!
3314 -                */
3315 -               if (rt_task(p))
3316 -                       array = rq->active;
3317 -               enqueue_task(p, array);
3318 +       if (on_rq) {
3319 +               enqueue_task(rq, p, 0);
3320                 /*
3321                  * Reschedule if we are currently running on this runqueue and
3322                  * our priority decreased, or if we are not currently running on
3323 @@ -4009,8 +3279,9 @@ void rt_mutex_setprio(struct task_struct
3324                 if (task_running(rq, p)) {
3325                         if (p->prio > oldprio)
3326                                 resched_task(rq->curr);
3327 -               } else if (TASK_PREEMPTS_CURR(p, rq))
3328 -                       resched_task(rq->curr);
3329 +               } else {
3330 +                       check_preempt_curr(rq, p);
3331 +               }
3332         }
3333         task_rq_unlock(rq, &flags);
3334  }
3335 @@ -4019,8 +3290,7 @@ void rt_mutex_setprio(struct task_struct
3336
3337  void set_user_nice(struct task_struct *p, long nice)
3338  {
3339 -       struct prio_array *array;
3340 -       int old_prio, delta;
3341 +       int old_prio, delta, on_rq;
3342         unsigned long flags;
3343         struct rq *rq;
3344
3345 @@ -4041,9 +3311,9 @@ void set_user_nice(struct task_struct *p
3346                 p->static_prio = NICE_TO_PRIO(nice);
3347                 goto out_unlock;
3348         }
3349 -       array = p->array;
3350 -       if (array) {
3351 -               dequeue_task(p, array);
3352 +       on_rq = p->on_rq;
3353 +       if (on_rq) {
3354 +               dequeue_task(rq, p, 0);
3355                 dec_raw_weighted_load(rq, p);
3356         }
3357
3358 @@ -4053,8 +3323,8 @@ void set_user_nice(struct task_struct *p
3359         p->prio = effective_prio(p);
3360         delta = p->prio - old_prio;
3361
3362 -       if (array) {
3363 -               enqueue_task(p, array);
3364 +       if (on_rq) {
3365 +               enqueue_task(rq, p, 0);
3366                 inc_raw_weighted_load(rq, p);
3367                 /*
3368                  * If the task increased its priority or is running and
3369 @@ -4175,20 +3445,27 @@ static inline struct task_struct *find_p
3370  }
3371
3372  /* Actually do priority change: must hold rq lock. */
3373 -static void __setscheduler(struct task_struct *p, int policy, int prio)
3374 +static void
3375 +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3376  {
3377 -       BUG_ON(p->array);
3378 +       BUG_ON(p->on_rq);
3379
3380         p->policy = policy;
3381 +       switch (p->policy) {
3382 +       case SCHED_NORMAL:
3383 +       case SCHED_BATCH:
3384 +               p->sched_class = &fair_sched_class;
3385 +               break;
3386 +       case SCHED_FIFO:
3387 +       case SCHED_RR:
3388 +               p->sched_class = &rt_sched_class;
3389 +               break;
3390 +       }
3391 +
3392         p->rt_priority = prio;
3393         p->normal_prio = normal_prio(p);
3394         /* we are holding p->pi_lock already */
3395         p->prio = rt_mutex_getprio(p);
3396 -       /*
3397 -        * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3398 -        */
3399 -       if (policy == SCHED_BATCH)
3400 -               p->sleep_avg = 0;
3401         set_load_weight(p);
3402  }
3403
3404 @@ -4204,8 +3481,7 @@ static void __setscheduler(struct task_s
3405  int sched_setscheduler(struct task_struct *p, int policy,
3406                        struct sched_param *param)
3407  {
3408 -       int retval, oldprio, oldpolicy = -1;
3409 -       struct prio_array *array;
3410 +       int retval, oldprio, oldpolicy = -1, on_rq;
3411         unsigned long flags;
3412         struct rq *rq;
3413
3414 @@ -4279,13 +3555,13 @@ recheck:
3415                 spin_unlock_irqrestore(&p->pi_lock, flags);
3416                 goto recheck;
3417         }
3418 -       array = p->array;
3419 -       if (array)
3420 -               deactivate_task(p, rq);
3421 +       on_rq = p->on_rq;
3422 +       if (on_rq)
3423 +               deactivate_task(rq, p, 0);
3424         oldprio = p->prio;
3425 -       __setscheduler(p, policy, param->sched_priority);
3426 -       if (array) {
3427 -               __activate_task(p, rq);
3428 +       __setscheduler(rq, p, policy, param->sched_priority);
3429 +       if (on_rq) {
3430 +               activate_task(rq, p, 0);
3431                 /*
3432                  * Reschedule if we are currently running on this runqueue and
3433                  * our priority decreased, or if we are not currently running on
3434 @@ -4294,8 +3570,9 @@ recheck:
3435                 if (task_running(rq, p)) {
3436                         if (p->prio > oldprio)
3437                                 resched_task(rq->curr);
3438 -               } else if (TASK_PREEMPTS_CURR(p, rq))
3439 -                       resched_task(rq->curr);
3440 +               } else {
3441 +                       check_preempt_curr(rq, p);
3442 +               }
3443         }
3444         __task_rq_unlock(rq);
3445         spin_unlock_irqrestore(&p->pi_lock, flags);
3446 @@ -4558,50 +3835,66 @@ asmlinkage long sys_sched_getaffinity(pi
3447         if (ret < 0)
3448                 return ret;
3449
3450 -       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3451 -               return -EFAULT;
3452 +       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3453 +               return -EFAULT;
3454 +
3455 +       return sizeof(cpumask_t);
3456 +}
3457 +
3458 +/**
3459 + * sys_sched_yield - yield the current processor to other threads.
3460 + *
3461 + * This function yields the current CPU to other tasks. If there are no
3462 + * other threads running on this CPU then this function will return.
3463 + */
3464 +asmlinkage long sys_sched_yield(void)
3465 +{
3466 +       struct rq *rq = this_rq_lock();
3467 +
3468 +       schedstat_inc(rq, yld_cnt);
3469 +       if (rq->nr_running == 1)
3470 +               schedstat_inc(rq, yld_act_empty);
3471 +       else
3472 +               current->sched_class->yield_task(rq, current, NULL);
3473 +
3474 +       /*
3475 +        * Since we are going to call schedule() anyway, there's
3476 +        * no need to preempt or enable interrupts:
3477 +        */
3478 +       __release(rq->lock);
3479 +       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3480 +       _raw_spin_unlock(&rq->lock);
3481 +       preempt_enable_no_resched();
3482 +
3483 +       schedule();
3484
3485 -       return sizeof(cpumask_t);
3486 +       return 0;
3487  }
3488
3489  /**
3490 - * sys_sched_yield - yield the current processor to other threads.
3491 + * sys_sched_yield_to - yield the current processor to another thread
3492   *
3493 - * this function yields the current CPU by moving the calling thread
3494 + * This function yields the current CPU by moving the calling thread
3495   * to the expired array. If there are no other threads running on this
3496   * CPU then this function will return.
3497   */
3498 -asmlinkage long sys_sched_yield(void)
3499 +asmlinkage long sys_sched_yield_to(pid_t pid)
3500  {
3501 -       struct rq *rq = this_rq_lock();
3502 -       struct prio_array *array = current->array, *target = rq->expired;
3503 +       struct task_struct *p_to;
3504 +       struct rq *rq;
3505
3506 -       schedstat_inc(rq, yld_cnt);
3507 -       /*
3508 -        * We implement yielding by moving the task into the expired
3509 -        * queue.
3510 -        *
3511 -        * (special rule: RT tasks will just roundrobin in the active
3512 -        *  array.)
3513 -        */
3514 -       if (rt_task(current))
3515 -               target = rq->active;
3516 +       rcu_read_lock();
3517 +       p_to = find_task_by_pid(pid);
3518 +       if (!p_to)
3519 +               goto out_unlock;
3520
3521 -       if (array->nr_active == 1) {
3522 +       rq = this_rq_lock();
3523 +
3524 +       schedstat_inc(rq, yld_cnt);
3525 +       if (rq->nr_running == 1)
3526                 schedstat_inc(rq, yld_act_empty);
3527 -               if (!rq->expired->nr_active)
3528 -                       schedstat_inc(rq, yld_both_empty);
3529 -       } else if (!rq->expired->nr_active)
3530 -               schedstat_inc(rq, yld_exp_empty);
3531 -
3532 -       if (array != target) {
3533 -               dequeue_task(current, array);
3534 -               enqueue_task(current, target);
3535 -       } else
3536 -               /*
3537 -                * requeue_task is cheaper so perform that if possible.
3538 -                */
3539 -               requeue_task(current, array);
3540 +       else
3541 +               current->sched_class->yield_task(rq, current, p_to);
3542
3543         /*
3544          * Since we are going to call schedule() anyway, there's
3545 @@ -4610,13 +3903,19 @@ asmlinkage long sys_sched_yield(void)
3546         __release(rq->lock);
3547         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3548         _raw_spin_unlock(&rq->lock);
3549 +       rcu_read_unlock();
3550         preempt_enable_no_resched();
3551
3552         schedule();
3553
3554         return 0;
3555 +
3556 +out_unlock:
3557 +       rcu_read_unlock();
3558 +       return -ESRCH;
3559  }
3560
3561 +
3562  static void __cond_resched(void)
3563  {
3564  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
3565 @@ -4812,7 +4111,7 @@ long sys_sched_rr_get_interval(pid_t pid
3566                 goto out_unlock;
3567
3568         jiffies_to_timespec(p->policy == SCHED_FIFO ?
3569 -                               0 : task_timeslice(p), &t);
3570 +                               0 : static_prio_timeslice(p->static_prio), &t);
3571         read_unlock(&tasklist_lock);
3572         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3573  out_nounlock:
3574 @@ -4915,7 +4214,7 @@ void show_state_filter(unsigned long sta
3575                  * console might take alot of time:
3576                  */
3577                 touch_nmi_watchdog();
3578 -               if (p->state & state_filter)
3579 +               if (!state_filter || (p->state & state_filter))
3580                         show_task(p);
3581         } while_each_thread(g, p);
3582
3583 @@ -4925,6 +4224,7 @@ void show_state_filter(unsigned long sta
3584          */
3585         if (state_filter == -1)
3586                 debug_show_all_locks();
3587 +       sysrq_sched_debug_show();
3588  }
3589
3590  /**
3591 @@ -4940,11 +4240,10 @@ void __cpuinit init_idle(struct task_str
3592         struct rq *rq = cpu_rq(cpu);
3593         unsigned long flags;
3594
3595 -       idle->timestamp = sched_clock();
3596 -       idle->sleep_avg = 0;
3597 -       idle->array = NULL;
3598 +       __sched_fork(idle);
3599 +       idle->exec_start = sched_clock();
3600 +
3601         idle->prio = idle->normal_prio = MAX_PRIO;
3602 -       idle->state = TASK_RUNNING;
3603         idle->cpus_allowed = cpumask_of_cpu(cpu);
3604         set_task_cpu(idle, cpu);
3605
3606 @@ -5062,19 +4361,10 @@ static int __migrate_task(struct task_st
3607                 goto out;
3608
3609         set_task_cpu(p, dest_cpu);
3610 -       if (p->array) {
3611 -               /*
3612 -                * Sync timestamp with rq_dest's before activating.
3613 -                * The same thing could be achieved by doing this step
3614 -                * afterwards, and pretending it was a local activate.
3615 -                * This way is cleaner and logically correct.
3616 -                */
3617 -               p->timestamp = p->timestamp - rq_src->most_recent_timestamp
3618 -                               + rq_dest->most_recent_timestamp;
3619 -               deactivate_task(p, rq_src);
3620 -               __activate_task(p, rq_dest);
3621 -               if (TASK_PREEMPTS_CURR(p, rq_dest))
3622 -                       resched_task(rq_dest->curr);
3623 +       if (p->on_rq) {
3624 +               deactivate_task(rq_src, p, 0);
3625 +               activate_task(rq_dest, p, 0);
3626 +               check_preempt_curr(rq_dest, p);
3627         }
3628         ret = 1;
3629  out:
3630 @@ -5246,10 +4536,10 @@ void sched_idle_next(void)
3631          */
3632         spin_lock_irqsave(&rq->lock, flags);
3633
3634 -       __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3635 +       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
3636
3637         /* Add idle task to the _front_ of its priority queue: */
3638 -       __activate_idle_task(p, rq);
3639 +       activate_idle_task(p, rq);
3640
3641         spin_unlock_irqrestore(&rq->lock, flags);
3642  }
3643 @@ -5299,16 +4589,15 @@ static void migrate_dead(unsigned int de
3644  static void migrate_dead_tasks(unsigned int dead_cpu)
3645  {
3646         struct rq *rq = cpu_rq(dead_cpu);
3647 -       unsigned int arr, i;
3648 +       struct task_struct *next;
3649
3650 -       for (arr = 0; arr < 2; arr++) {
3651 -               for (i = 0; i < MAX_PRIO; i++) {
3652 -                       struct list_head *list = &rq->arrays[arr].queue[i];
3653 -
3654 -                       while (!list_empty(list))
3655 -                               migrate_dead(dead_cpu, list_entry(list->next,
3656 -                                            struct task_struct, run_list));
3657 -               }
3658 +       for (;;) {
3659 +               if (!rq->nr_running)
3660 +                       break;
3661 +               next = pick_next_task(rq, rq->curr);
3662 +               if (!next)
3663 +                       break;
3664 +               migrate_dead(dead_cpu, next);
3665         }
3666  }
3667  #endif /* CONFIG_HOTPLUG_CPU */
3668 @@ -5334,7 +4623,7 @@ migration_call(struct notifier_block *nf
3669                 kthread_bind(p, cpu);
3670                 /* Must be high prio: stop_machine expects to yield to it. */
3671                 rq = task_rq_lock(p, &flags);
3672 -               __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3673 +               __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
3674                 task_rq_unlock(rq, &flags);
3675                 cpu_rq(cpu)->migration_thread = p;
3676                 break;
3677 @@ -5362,9 +4651,9 @@ migration_call(struct notifier_block *nf
3678                 rq->migration_thread = NULL;
3679                 /* Idle task back to normal (off runqueue, low prio) */
3680                 rq = task_rq_lock(rq->idle, &flags);
3681 -               deactivate_task(rq->idle, rq);
3682 +               deactivate_task(rq, rq->idle, 0);
3683                 rq->idle->static_prio = MAX_PRIO;
3684 -               __setscheduler(rq->idle, SCHED_NORMAL, 0);
3685 +               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
3686                 migrate_dead_tasks(cpu);
3687                 task_rq_unlock(rq, &flags);
3688                 migrate_nr_uninterruptible(rq);
3689 @@ -5665,483 +4954,6 @@ init_sched_build_groups(cpumask_t span,
3690
3691  #define SD_NODES_PER_DOMAIN 16
3692
3693 -/*
3694 - * Self-tuning task migration cost measurement between source and target CPUs.
3695 - *
3696 - * This is done by measuring the cost of manipulating buffers of varying
3697 - * sizes. For a given buffer-size here are the steps that are taken:
3698 - *
3699 - * 1) the source CPU reads+dirties a shared buffer
3700 - * 2) the target CPU reads+dirties the same shared buffer
3701 - *
3702 - * We measure how long they take, in the following 4 scenarios:
3703 - *
3704 - *  - source: CPU1, target: CPU2 | cost1
3705 - *  - source: CPU2, target: CPU1 | cost2
3706 - *  - source: CPU1, target: CPU1 | cost3
3707 - *  - source: CPU2, target: CPU2 | cost4
3708 - *
3709 - * We then calculate the cost3+cost4-cost1-cost2 difference - this is
3710 - * the cost of migration.
3711 - *
3712 - * We then start off from a small buffer-size and iterate up to larger
3713 - * buffer sizes, in 5% steps - measuring each buffer-size separately, and
3714 - * doing a maximum search for the cost. (The maximum cost for a migration
3715 - * normally occurs when the working set size is around the effective cache
3716 - * size.)
3717 - */
3718 -#define SEARCH_SCOPE           2
3719 -#define MIN_CACHE_SIZE         (64*1024U)
3720 -#define DEFAULT_CACHE_SIZE     (5*1024*1024U)
3721 -#define ITERATIONS             1
3722 -#define SIZE_THRESH            130
3723 -#define COST_THRESH            130
3724 -
3725 -/*
3726 - * The migration cost is a function of 'domain distance'. Domain
3727 - * distance is the number of steps a CPU has to iterate down its
3728 - * domain tree to share a domain with the other CPU. The farther
3729 - * two CPUs are from each other, the larger the distance gets.
3730 - *
3731 - * Note that we use the distance only to cache measurement results,
3732 - * the distance value is not used numerically otherwise. When two
3733 - * CPUs have the same distance it is assumed that the migration
3734 - * cost is the same. (this is a simplification but quite practical)
3735 - */
3736 -#define MAX_DOMAIN_DISTANCE 32
3737 -
3738 -static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
3739 -               { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
3740 -/*
3741 - * Architectures may override the migration cost and thus avoid
3742 - * boot-time calibration. Unit is nanoseconds. Mostly useful for
3743 - * virtualized hardware:
3744 - */
3745 -#ifdef CONFIG_DEFAULT_MIGRATION_COST
3746 -                       CONFIG_DEFAULT_MIGRATION_COST
3747 -#else
3748 -                       -1LL
3749 -#endif
3750 -};
3751 -
3752 -/*
3753 - * Allow override of migration cost - in units of microseconds.
3754 - * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
3755 - * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
3756 - */
3757 -static int __init migration_cost_setup(char *str)
3758 -{
3759 -       int ints[MAX_DOMAIN_DISTANCE+1], i;
3760 -
3761 -       str = get_options(str, ARRAY_SIZE(ints), ints);
3762 -
3763 -       printk("#ints: %d\n", ints[0]);
3764 -       for (i = 1; i <= ints[0]; i++) {
3765 -               migration_cost[i-1] = (unsigned long long)ints[i]*1000;
3766 -               printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
3767 -       }
3768 -       return 1;
3769 -}
3770 -
3771 -__setup ("migration_cost=", migration_cost_setup);
3772 -
3773 -/*
3774 - * Global multiplier (divisor) for migration-cutoff values,
3775 - * in percentiles. E.g. use a value of 150 to get 1.5 times
3776 - * longer cache-hot cutoff times.
3777 - *
3778 - * (We scale it from 100 to 128 to long long handling easier.)
3779 - */
3780 -
3781 -#define MIGRATION_FACTOR_SCALE 128
3782 -
3783 -static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
3784 -
3785 -static int __init setup_migration_factor(char *str)
3786 -{
3787 -       get_option(&str, &migration_factor);
3788 -       migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
3789 -       return 1;
3790 -}
3791 -
3792 -__setup("migration_factor=", setup_migration_factor);
3793 -
3794 -/*
3795 - * Estimated distance of two CPUs, measured via the number of domains
3796 - * we have to pass for the two CPUs to be in the same span:
3797 - */
3798 -static unsigned long domain_distance(int cpu1, int cpu2)
3799 -{
3800 -       unsigned long distance = 0;
3801 -       struct sched_domain *sd;
3802 -
3803 -       for_each_domain(cpu1, sd) {
3804 -               WARN_ON(!cpu_isset(cpu1, sd->span));
3805 -               if (cpu_isset(cpu2, sd->span))
3806 -                       return distance;
3807 -               distance++;
3808 -       }
3809 -       if (distance >= MAX_DOMAIN_DISTANCE) {
3810 -               WARN_ON(1);
3811 -               distance = MAX_DOMAIN_DISTANCE-1;
3812 -       }
3813 -
3814 -       return distance;
3815 -}
3816 -
3817 -static unsigned int migration_debug;
3818 -
3819 -static int __init setup_migration_debug(char *str)
3820 -{
3821 -       get_option(&str, &migration_debug);
3822 -       return 1;
3823 -}
3824 -
3825 -__setup("migration_debug=", setup_migration_debug);
3826 -
3827 -/*
3828 - * Maximum cache-size that the scheduler should try to measure.
3829 - * Architectures with larger caches should tune this up during
3830 - * bootup. Gets used in the domain-setup code (i.e. during SMP
3831 - * bootup).
3832 - */
3833 -unsigned int max_cache_size;
3834 -
3835 -static int __init setup_max_cache_size(char *str)
3836 -{
3837 -       get_option(&str, &max_cache_size);
3838 -       return 1;
3839 -}
3840 -
3841 -__setup("max_cache_size=", setup_max_cache_size);
3842 -
3843 -/*
3844 - * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
3845 - * is the operation that is timed, so we try to generate unpredictable
3846 - * cachemisses that still end up filling the L2 cache:
3847 - */
3848 -static void touch_cache(void *__cache, unsigned long __size)
3849 -{
3850 -       unsigned long size = __size / sizeof(long);
3851 -       unsigned long chunk1 = size / 3;
3852 -       unsigned long chunk2 = 2 * size / 3;
3853 -       unsigned long *cache = __cache;
3854 -       int i;
3855 -
3856 -       for (i = 0; i < size/6; i += 8) {
3857 -               switch (i % 6) {
3858 -                       case 0: cache[i]++;
3859 -                       case 1: cache[size-1-i]++;
3860 -                       case 2: cache[chunk1-i]++;
3861 -                       case 3: cache[chunk1+i]++;
3862 -                       case 4: cache[chunk2-i]++;
3863 -                       case 5: cache[chunk2+i]++;
3864 -               }
3865 -       }
3866 -}
3867 -
3868 -/*
3869 - * Measure the cache-cost of one task migration. Returns in units of nsec.
3870 - */
3871 -static unsigned long long
3872 -measure_one(void *cache, unsigned long size, int source, int target)
3873 -{
3874 -       cpumask_t mask, saved_mask;
3875 -       unsigned long long t0, t1, t2, t3, cost;
3876 -
3877 -       saved_mask = current->cpus_allowed;
3878 -
3879 -       /*
3880 -        * Flush source caches to RAM and invalidate them:
3881 -        */
3882 -       sched_cacheflush();
3883 -
3884 -       /*
3885 -        * Migrate to the source CPU:
3886 -        */
3887 -       mask = cpumask_of_cpu(source);
3888 -       set_cpus_allowed(current, mask);
3889 -       WARN_ON(smp_processor_id() != source);
3890 -
3891 -       /*
3892 -        * Dirty the working set:
3893 -        */
3894 -       t0 = sched_clock();
3895 -       touch_cache(cache, size);
3896 -       t1 = sched_clock();
3897 -
3898 -       /*
3899 -        * Migrate to the target CPU, dirty the L2 cache and access
3900 -        * the shared buffer. (which represents the working set
3901 -        * of a migrated task.)
3902 -        */
3903 -       mask = cpumask_of_cpu(target);
3904 -       set_cpus_allowed(current, mask);
3905 -       WARN_ON(smp_processor_id() != target);
3906 -
3907 -       t2 = sched_clock();
3908 -       touch_cache(cache, size);
3909 -       t3 = sched_clock();
3910 -
3911 -       cost = t1-t0 + t3-t2;
3912 -
3913 -       if (migration_debug >= 2)
3914 -               printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
3915 -                       source, target, t1-t0, t1-t0, t3-t2, cost);
3916 -       /*
3917 -        * Flush target caches to RAM and invalidate them:
3918 -        */
3919 -       sched_cacheflush();
3920 -
3921 -       set_cpus_allowed(current, saved_mask);
3922 -
3923 -       return cost;
3924 -}
3925 -
3926 -/*
3927 - * Measure a series of task migrations and return the average
3928 - * result. Since this code runs early during bootup the system
3929 - * is 'undisturbed' and the average latency makes sense.
3930 - *
3931 - * The algorithm in essence auto-detects the relevant cache-size,
3932 - * so it will properly detect different cachesizes for different
3933 - * cache-hierarchies, depending on how the CPUs are connected.
3934 - *
3935 - * Architectures can prime the upper limit of the search range via
3936 - * max_cache_size, otherwise the search range defaults to 20MB...64K.
3937 - */
3938 -static unsigned long long
3939 -measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
3940 -{
3941 -       unsigned long long cost1, cost2;
3942 -       int i;
3943 -
3944 -       /*
3945 -        * Measure the migration cost of 'size' bytes, over an
3946 -        * average of 10 runs:
3947 -        *
3948 -        * (We perturb the cache size by a small (0..4k)
3949 -        *  value to compensate size/alignment related artifacts.
3950 -        *  We also subtract the cost of the operation done on
3951 -        *  the same CPU.)
3952 -        */
3953 -       cost1 = 0;
3954 -
3955 -       /*
3956 -        * dry run, to make sure we start off cache-cold on cpu1,
3957 -        * and to get any vmalloc pagefaults in advance:
3958 -        */
3959 -       measure_one(cache, size, cpu1, cpu2);
3960 -       for (i = 0; i < ITERATIONS; i++)
3961 -               cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
3962 -
3963 -       measure_one(cache, size, cpu2, cpu1);
3964 -       for (i = 0; i < ITERATIONS; i++)
3965 -               cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
3966 -
3967 -       /*
3968 -        * (We measure the non-migrating [cached] cost on both
3969 -        *  cpu1 and cpu2, to handle CPUs with different speeds)
3970 -        */
3971 -       cost2 = 0;
3972 -
3973 -       measure_one(cache, size, cpu1, cpu1);
3974 -       for (i = 0; i < ITERATIONS; i++)
3975 -               cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
3976 -
3977 -       measure_one(cache, size, cpu2, cpu2);
3978 -       for (i = 0; i < ITERATIONS; i++)
3979 -               cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
3980 -
3981 -       /*
3982 -        * Get the per-iteration migration cost:
3983 -        */
3984 -       do_div(cost1, 2 * ITERATIONS);
3985 -       do_div(cost2, 2 * ITERATIONS);
3986 -
3987 -       return cost1 - cost2;
3988 -}
3989 -
3990 -static unsigned long long measure_migration_cost(int cpu1, int cpu2)
3991 -{
3992 -       unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
3993 -       unsigned int max_size, size, size_found = 0;
3994 -       long long cost = 0, prev_cost;
3995 -       void *cache;
3996 -
3997 -       /*
3998 -        * Search from max_cache_size*5 down to 64K - the real relevant
3999 -        * cachesize has to lie somewhere inbetween.
4000 -        */
4001 -       if (max_cache_size) {
4002 -               max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
4003 -               size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
4004 -       } else {
4005 -               /*
4006 -                * Since we have no estimation about the relevant
4007 -                * search range
4008 -                */
4009 -               max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
4010 -               size = MIN_CACHE_SIZE;
4011 -       }
4012 -
4013 -       if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
4014 -               printk("cpu %d and %d not both online!\n", cpu1, cpu2);
4015 -               return 0;
4016 -       }
4017 -
4018 -       /*
4019 -        * Allocate the working set:
4020 -        */
4021 -       cache = vmalloc(max_size);
4022 -       if (!cache) {
4023 -               printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
4024 -               return 1000000; /* return 1 msec on very small boxen */
4025 -       }
4026 -
4027 -       while (size <= max_size) {
4028 -               prev_cost = cost;
4029 -               cost = measure_cost(cpu1, cpu2, cache, size);
4030 -
4031 -               /*
4032 -                * Update the max:
4033 -                */
4034 -               if (cost > 0) {
4035 -                       if (max_cost < cost) {
4036 -                               max_cost = cost;
4037 -                               size_found = size;
4038 -                       }
4039 -               }
4040 -               /*
4041 -                * Calculate average fluctuation, we use this to prevent
4042 -                * noise from triggering an early break out of the loop:
4043 -                */
4044 -               fluct = abs(cost - prev_cost);
4045 -               avg_fluct = (avg_fluct + fluct)/2;
4046 -
4047 -               if (migration_debug)
4048 -                       printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
4049 -                               "(%8Ld %8Ld)\n",
4050 -                               cpu1, cpu2, size,
4051 -                               (long)cost / 1000000,
4052 -                               ((long)cost / 100000) % 10,
4053 -                               (long)max_cost / 1000000,
4054 -                               ((long)max_cost / 100000) % 10,
4055 -                               domain_distance(cpu1, cpu2),
4056 -                               cost, avg_fluct);
4057 -
4058 -               /*
4059 -                * If we iterated at least 20% past the previous maximum,
4060 -                * and the cost has dropped by more than 20% already,
4061 -                * (taking fluctuations into account) then we assume to
4062 -                * have found the maximum and break out of the loop early:
4063 -                */
4064 -               if (size_found && (size*100 > size_found*SIZE_THRESH))
4065 -                       if (cost+avg_fluct <= 0 ||
4066 -                               max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
4067 -
4068 -                               if (migration_debug)
4069 -                                       printk("-> found max.\n");
4070 -                               break;
4071 -                       }
4072 -               /*
4073 -                * Increase the cachesize in 10% steps:
4074 -                */
4075 -               size = size * 10 / 9;
4076 -       }
4077 -
4078 -       if (migration_debug)
4079 -               printk("[%d][%d] working set size found: %d, cost: %Ld\n",
4080 -                       cpu1, cpu2, size_found, max_cost);
4081 -
4082 -       vfree(cache);
4083 -
4084 -       /*
4085 -        * A task is considered 'cache cold' if at least 2 times
4086 -        * the worst-case cost of migration has passed.
4087 -        *
4088 -        * (this limit is only listened to if the load-balancing
4089 -        * situation is 'nice' - if there is a large imbalance we
4090 -        * ignore it for the sake of CPU utilization and
4091 -        * processing fairness.)
4092 -        */
4093 -       return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
4094 -}
4095 -
4096 -static void calibrate_migration_costs(const cpumask_t *cpu_map)
4097 -{
4098 -       int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
4099 -       unsigned long j0, j1, distance, max_distance = 0;
4100 -       struct sched_domain *sd;
4101 -
4102 -       j0 = jiffies;
4103 -
4104 -       /*
4105 -        * First pass - calculate the cacheflush times:
4106 -        */
4107 -       for_each_cpu_mask(cpu1, *cpu_map) {
4108 -               for_each_cpu_mask(cpu2, *cpu_map) {
4109 -                       if (cpu1 == cpu2)
4110 -                               continue;
4111 -                       distance = domain_distance(cpu1, cpu2);
4112 -                       max_distance = max(max_distance, distance);
4113 -                       /*
4114 -                        * No result cached yet?
4115 -                        */
4116 -                       if (migration_cost[distance] == -1LL)
4117 -                               migration_cost[distance] =
4118 -                                       measure_migration_cost(cpu1, cpu2);
4119 -               }
4120 -       }
4121 -       /*
4122 -        * Second pass - update the sched domain hierarchy with
4123 -        * the new cache-hot-time estimations:
4124 -        */
4125 -       for_each_cpu_mask(cpu, *cpu_map) {
4126 -               distance = 0;
4127 -               for_each_domain(cpu, sd) {
4128 -                       sd->cache_hot_time = migration_cost[distance];
4129 -                       distance++;
4130 -               }
4131 -       }
4132 -       /*
4133 -        * Print the matrix:
4134 -        */
4135 -       if (migration_debug)
4136 -               printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
4137 -                       max_cache_size,
4138 -#ifdef CONFIG_X86
4139 -                       cpu_khz/1000
4140 -#else
4141 -                       -1
4142 -#endif
4143 -               );
4144 -       if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
4145 -               printk("migration_cost=");
4146 -               for (distance = 0; distance <= max_distance; distance++) {
4147 -                       if (distance)
4148 -                               printk(",");
4149 -                       printk("%ld", (long)migration_cost[distance] / 1000);
4150 -               }
4151 -               printk("\n");
4152 -       }
4153 -       j1 = jiffies;
4154 -       if (migration_debug)
4155 -               printk("migration: %ld seconds\n", (j1-j0) / HZ);
4156 -
4157 -       /*
4158 -        * Move back to the original CPU. NUMA-Q gets confused
4159 -        * if we migrate to another quad during bootup.
4160 -        */
4161 -       if (raw_smp_processor_id() != orig_cpu) {
4162 -               cpumask_t mask = cpumask_of_cpu(orig_cpu),
4163 -                       saved_mask = current->cpus_allowed;
4164 -
4165 -               set_cpus_allowed(current, mask);
4166 -               set_cpus_allowed(current, saved_mask);
4167 -       }
4168 -}
4169 -
4170  #ifdef CONFIG_NUMA
4171
4172  /**
4173 @@ -6671,10 +5483,6 @@ static int build_sched_domains(const cpu
4174  #endif
4175                 cpu_attach_domain(sd, i);
4176         }
4177 -       /*
4178 -        * Tune cache-hot values:
4179 -        */
4180 -       calibrate_migration_costs(cpu_map);
4181
4182         return 0;
4183
4184 @@ -6875,6 +5683,16 @@ void __init sched_init_smp(void)
4185         /* Move init over to a non-isolated CPU */
4186         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
4187                 BUG();
4188 +       /*
4189 +        * Increase the granularity value when there are more CPUs,
4190 +        * because with more CPUs the 'effective latency' as visible
4191 +        * to users decreases. But the relationship is not linear,
4192 +        * so pick a second-best guess by going with the log2 of the
4193 +        * number of CPUs.
4194 +        *
4195 +        * This idea comes from the SD scheduler of Con Kolivas:
4196 +        */
4197 +       sysctl_sched_granularity *= 1 + ilog2(num_online_cpus());
4198  }
4199  #else
4200  void __init sched_init_smp(void)
4201 @@ -6894,7 +5712,14 @@ int in_sched_functions(unsigned long add
4202
4203  void __init sched_init(void)
4204  {
4205 -       int i, j, k;
4206 +       int i, j;
4207 +
4208 +       current->sched_class = &fair_sched_class;
4209 +       /*
4210 +        * Link up the scheduling class hierarchy:
4211 +        */
4212 +       rt_sched_class.next = &fair_sched_class;
4213 +       fair_sched_class.next = NULL;
4214
4215         for_each_possible_cpu(i) {
4216                 struct prio_array *array;
4217 @@ -6904,14 +5729,13 @@ void __init sched_init(void)
4218                 spin_lock_init(&rq->lock);
4219                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
4220                 rq->nr_running = 0;
4221 -               rq->active = rq->arrays;
4222 -               rq->expired = rq->arrays + 1;
4223 -               rq->best_expired_prio = MAX_PRIO;
4224 +               rq->tasks_timeline = RB_ROOT;
4225 +               rq->clock = rq->fair_clock = 1;
4226
4227 +               for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
4228 +                       rq->cpu_load[j] = 0;
4229  #ifdef CONFIG_SMP
4230                 rq->sd = NULL;
4231 -               for (j = 1; j < 3; j++)
4232 -                       rq->cpu_load[j] = 0;
4233                 rq->active_balance = 0;
4234                 rq->push_cpu = 0;
4235                 rq->cpu = i;
4236 @@ -6920,15 +5744,13 @@ void __init sched_init(void)
4237  #endif
4238                 atomic_set(&rq->nr_iowait, 0);
4239
4240 -               for (j = 0; j < 2; j++) {
4241 -                       array = rq->arrays + j;
4242 -                       for (k = 0; k < MAX_PRIO; k++) {
4243 -                               INIT_LIST_HEAD(array->queue + k);
4244 -                               __clear_bit(k, array->bitmap);
4245 -                       }
4246 -                       // delimiter for bitsearch
4247 -                       __set_bit(MAX_PRIO, array->bitmap);
4248 +               array = &rq->active;
4249 +               for (j = 0; j < MAX_RT_PRIO; j++) {
4250 +                       INIT_LIST_HEAD(array->queue + j);
4251 +                       __clear_bit(j, array->bitmap);
4252                 }
4253 +               /* delimiter for bitsearch: */
4254 +               __set_bit(MAX_RT_PRIO, array->bitmap);
4255         }
4256
4257         set_load_weight(&init_task);
4258 @@ -6984,28 +5806,54 @@ EXPORT_SYMBOL(__might_sleep);
4259  #ifdef CONFIG_MAGIC_SYSRQ
4260  void normalize_rt_tasks(void)
4261  {
4262 -       struct prio_array *array;
4263         struct task_struct *p;
4264         unsigned long flags;
4265         struct rq *rq;
4266 +       int on_rq;
4267
4268         read_lock_irq(&tasklist_lock);
4269         for_each_process(p) {
4270 -               if (!rt_task(p))
4271 +               p->fair_key = 0;
4272 +               p->wait_runtime = 0;
4273 +               p->wait_start_fair = 0;
4274 +               p->wait_start = 0;
4275 +               p->exec_start = 0;
4276 +               p->sleep_start = 0;
4277 +               p->block_start = 0;
4278 +               task_rq(p)->fair_clock = 0;
4279 +               task_rq(p)->clock = 0;
4280 +
4281 +               if (!rt_task(p)) {
4282 +                       /*
4283 +                        * Renice negative nice level userspace
4284 +                        * tasks back to 0:
4285 +                        */
4286 +                       if (TASK_NICE(p) < 0 && p->mm)
4287 +                               set_user_nice(p, 0);
4288                         continue;
4289 +               }
4290
4291                 spin_lock_irqsave(&p->pi_lock, flags);
4292                 rq = __task_rq_lock(p);
4293 +#ifdef CONFIG_SMP
4294 +               /*
4295 +                * Do not touch the migration thread:
4296 +                */
4297 +               if (p == rq->migration_thread)
4298 +                       goto out_unlock;
4299 +#endif
4300
4301 -               array = p->array;
4302 -               if (array)
4303 -                       deactivate_task(p, task_rq(p));
4304 -               __setscheduler(p, SCHED_NORMAL, 0);
4305 -               if (array) {
4306 -                       __activate_task(p, task_rq(p));
4307 +               on_rq = p->on_rq;
4308 +               if (on_rq)
4309 +                       deactivate_task(task_rq(p), p, 0);
4310 +               __setscheduler(rq, p, SCHED_NORMAL, 0);
4311 +               if (on_rq) {
4312 +                       activate_task(task_rq(p), p, 0);
4313                         resched_task(rq->curr);
4314                 }
4315 -
4316 +#ifdef CONFIG_SMP
4317 + out_unlock:
4318 +#endif
4319                 __task_rq_unlock(rq);
4320                 spin_unlock_irqrestore(&p->pi_lock, flags);
4321         }
4322 Index: linux-cfs-2.6.20.8.q/kernel/sched_debug.c
4323 ===================================================================
4324 --- /dev/null
4325 +++ linux-cfs-2.6.20.8.q/kernel/sched_debug.c
4326 @@ -0,0 +1,161 @@
4327 +/*
4328 + * kernel/sched_debug.c
4329 + *
4330 + * Print the CFS rbtree
4331 + *
4332 + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
4333 + *
4334 + * This program is free software; you can redistribute it and/or modify
4335 + * it under the terms of the GNU General Public License version 2 as
4336 + * published by the Free Software Foundation.
4337 + */
4338 +
4339 +#include <linux/proc_fs.h>
4340 +#include <linux/module.h>
4341 +#include <linux/spinlock.h>
4342 +#include <linux/sched.h>
4343 +#include <linux/seq_file.h>
4344 +#include <linux/kallsyms.h>
4345 +#include <linux/ktime.h>
4346 +
4347 +#include <asm/uaccess.h>
4348 +
4349 +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
4350 +
4351 +/*
4352 + * This allows printing both to /proc/sched_debug and
4353 + * to the console
4354 + */
4355 +#define SEQ_printf(m, x...)                    \
4356 + do {                                          \
4357 +       if (m)                                  \
4358 +               seq_printf(m, x);               \
4359 +       else                                    \
4360 +               printk(x);                      \
4361 + } while (0)
4362 +
4363 +static void
4364 +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
4365 +{
4366 +       if (rq->curr == p)
4367 +               SEQ_printf(m, "R");
4368 +       else
4369 +               SEQ_printf(m, " ");
4370 +
4371 +       SEQ_printf(m, "%14s %5d %15Ld %13Ld %13Ld %9Ld %5d "
4372 +                     "%15Ld %15Ld %15Ld\n",
4373 +               p->comm, p->pid,
4374 +               (long long)p->fair_key, (long long)p->fair_key - rq->fair_clock,
4375 +               (long long)p->wait_runtime,
4376 +               (long long)p->nr_switches,
4377 +               p->prio,
4378 +               (long long)p->wait_start_fair - rq->fair_clock,
4379 +               (long long)p->sum_exec_runtime,
4380 +               (long long)p->sum_wait_runtime);
4381 +}
4382 +
4383 +static void print_rq(struct seq_file *m, struct rq *rq, u64 now)
4384 +{
4385 +       struct task_struct *p;
4386 +       struct rb_node *curr;
4387 +
4388 +       SEQ_printf(m,
4389 +       "\nrunnable tasks:\n"
4390 +       "           task   PID        tree-key         delta       waiting"
4391 +       "  switches  prio     wstart-fair"
4392 +       "        sum-exec        sum-wait\n"
4393 +       "-----------------------------------------------------------------"
4394 +       "--------------------------------"
4395 +       "--------------------------------\n");
4396 +
4397 +       curr = first_fair(rq);
4398 +       while (curr) {
4399 +               p = rb_entry(curr, struct task_struct, run_node);
4400 +               print_task(m, rq, p, now);
4401 +
4402 +               curr = rb_next(curr);
4403 +       }
4404 +}
4405 +
4406 +static void print_cpu(struct seq_file *m, int cpu, u64 now)
4407 +{
4408 +       struct rq *rq = &per_cpu(runqueues, cpu);
4409 +
4410 +       SEQ_printf(m, "\ncpu: %d\n", cpu);
4411 +#define P(x) \
4412 +       SEQ_printf(m, "  .%-22s: %Lu\n", #x, (unsigned long long)(rq->x))
4413 +
4414 +       P(nr_running);
4415 +       P(raw_weighted_load);
4416 +       P(nr_switches);
4417 +       P(nr_load_updates);
4418 +       P(nr_uninterruptible);
4419 +       P(next_balance);
4420 +       P(curr->pid);
4421 +       P(clock);
4422 +       P(prev_clock_raw);
4423 +       P(clock_warps);
4424 +       P(clock_unstable_events);
4425 +       P(clock_max_delta);
4426 +       rq->clock_max_delta = 0;
4427 +       P(fair_clock);
4428 +       P(prev_fair_clock);
4429 +       P(exec_clock);
4430 +       P(prev_exec_clock);
4431 +       P(wait_runtime);
4432 +       P(cpu_load[0]);
4433 +       P(cpu_load[1]);
4434 +       P(cpu_load[2]);
4435 +       P(cpu_load[3]);
4436 +       P(cpu_load[4]);
4437 +#undef P
4438 +
4439 +       print_rq(m, rq, now);
4440 +}
4441 +
4442 +static int sched_debug_show(struct seq_file *m, void *v)
4443 +{
4444 +       u64 now = ktime_to_ns(ktime_get());
4445 +       int cpu;
4446 +
4447 +       SEQ_printf(m, "Sched Debug Version: v0.02\n");
4448 +       SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
4449 +
4450 +       for_each_online_cpu(cpu)
4451 +               print_cpu(m, cpu, now);
4452 +
4453 +       SEQ_printf(m, "\n");
4454 +
4455 +       return 0;
4456 +}
4457 +
4458 +void sysrq_sched_debug_show(void)
4459 +{
4460 +       sched_debug_show(NULL, NULL);
4461 +}
4462 +
4463 +static int sched_debug_open(struct inode *inode, struct file *filp)
4464 +{
4465 +       return single_open(filp, sched_debug_show, NULL);
4466 +}
4467 +
4468 +static struct file_operations sched_debug_fops = {
4469 +       .open           = sched_debug_open,
4470 +       .read           = seq_read,
4471 +       .llseek         = seq_lseek,
4472 +       .release        = seq_release,
4473 +};
4474 +
4475 +static int __init init_sched_debug_procfs(void)
4476 +{
4477 +       struct proc_dir_entry *pe;
4478 +
4479 +       pe = create_proc_entry("sched_debug", 0644, NULL);
4480 +       if (!pe)
4481 +               return -ENOMEM;
4482 +
4483 +       pe->proc_fops = &sched_debug_fops;
4484 +
4485 +       return 0;
4486 +}
4487 +__initcall(init_sched_debug_procfs);
4488 Index: linux-cfs-2.6.20.8.q/kernel/sched_fair.c
4489 ===================================================================
4490 --- /dev/null
4491 +++ linux-cfs-2.6.20.8.q/kernel/sched_fair.c
4492 @@ -0,0 +1,618 @@
4493 +/*
4494 + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4495 + */
4496 +
4497 +/*
4498 + * Preemption granularity:
4499 + * (default: 2 msec, units: nanoseconds)
4500 + *
4501 + * NOTE: this granularity value is not the same as the concept of
4502 + * 'timeslice length' - timeslices in CFS will typically be somewhat
4503 + * larger than this value. (to see the precise effective timeslice
4504 + * length of your workload, run vmstat and monitor the context-switches
4505 + * field)
4506 + *
4507 + * On SMP systems the value of this is multiplied by (1 + log2 of the
4508 + * number of CPUs). (i.e. factor 2x on 2-way systems, 3x on 4-way
4509 + * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
4510 + */
4511 +unsigned int sysctl_sched_granularity __read_mostly = 2000000;
4512 +
4513 +unsigned int sysctl_sched_sleep_history_max __read_mostly = 2000000000;
4514 +
4515 +unsigned int sysctl_sched_load_smoothing = 2;
4516 +
4517 +/*
4518 + * Wake-up granularity.
4519 + * (default: 0, units: nanoseconds)
4520 + *
4521 + * This option delays the preemption effects of decoupled workloads
4522 + * and reduces their over-scheduling. Synchronous workloads will still
4523 + * have immediate wakeup/sleep latencies.
4524 + */
4525 +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 0;
4526 +
4527 +
4528 +extern struct sched_class fair_sched_class;
4529 +
4530 +/**************************************************************/
4531 +/* Scheduling class tree data structure manipulation methods:
4532 + */
4533 +
4534 +/*
4535 + * Enqueue a task into the rb-tree:
4536 + */
4537 +static inline void __enqueue_task_fair(struct rq *rq, struct task_struct *p)
4538 +{
4539 +       struct rb_node **link = &rq->tasks_timeline.rb_node;
4540 +       struct rb_node *parent = NULL;
4541 +       struct task_struct *entry;
4542 +       s64 key = p->fair_key;
4543 +       int leftmost = 1;
4544 +
4545 +       /*
4546 +        * Find the right place in the rbtree:
4547 +        */
4548 +       while (*link) {
4549 +               parent = *link;
4550 +               entry = rb_entry(parent, struct task_struct, run_node);
4551 +               /*
4552 +                * We don't care about collisions. Nodes with
4553 +                * the same key stay together.
4554 +                */
4555 +               if (key < entry->fair_key) {
4556 +                       link = &parent->rb_left;
4557 +               } else {
4558 +                       link = &parent->rb_right;
4559 +                       leftmost = 0;
4560 +               }
4561 +       }
4562 +
4563 +       /*
4564 +        * Maintain a cache of leftmost tree entries (it is frequently
4565 +        * used):
4566 +        */
4567 +       if (leftmost)
4568 +               rq->rb_leftmost = &p->run_node;
4569 +
4570 +       rb_link_node(&p->run_node, parent, link);
4571 +       rb_insert_color(&p->run_node, &rq->tasks_timeline);
4572 +}
4573 +
4574 +static inline void __dequeue_task_fair(struct rq *rq, struct task_struct *p)
4575 +{
4576 +       if (rq->rb_leftmost == &p->run_node)
4577 +               rq->rb_leftmost = NULL;
4578 +       rb_erase(&p->run_node, &rq->tasks_timeline);
4579 +}
4580 +
4581 +static inline struct rb_node * first_fair(struct rq *rq)
4582 +{
4583 +       if (rq->rb_leftmost)
4584 +               return rq->rb_leftmost;
4585 +       /* Cache the value returned by rb_first() */
4586 +       rq->rb_leftmost = rb_first(&rq->tasks_timeline);
4587 +       return rq->rb_leftmost;
4588 +}
4589 +
4590 +static struct task_struct * __pick_next_task_fair(struct rq *rq)
4591 +{
4592 +       return rb_entry(first_fair(rq), struct task_struct, run_node);
4593 +}
4594 +
4595 +/**************************************************************/
4596 +/* Scheduling class statistics methods:
4597 + */
4598 +
4599 +static inline u64
4600 +rescale_load(struct task_struct *p, u64 value)
4601 +{
4602 +       int load_shift = p->load_shift;
4603 +
4604 +       if (load_shift == SCHED_LOAD_SHIFT)
4605 +               return value;
4606 +
4607 +       return (value << load_shift) >> SCHED_LOAD_SHIFT;
4608 +}
4609 +
4610 +static u64
4611 +niced_granularity(struct rq *rq, struct task_struct *curr,
4612 +                 unsigned long granularity)
4613 +{
4614 +       return rescale_load(curr, granularity);
4615 +}
4616 +
4617 +/*
4618 + * Update the current task's runtime statistics. Skip current tasks that
4619 + * are not in our scheduling class.
4620 + */
4621 +static inline void update_curr(struct rq *rq, u64 now)
4622 +{
4623 +       u64 delta_exec, delta_fair, delta_mine;
4624 +       struct task_struct *curr = rq->curr;
4625 +       unsigned long load;
4626 +
4627 +       if (curr->sched_class != &fair_sched_class || curr == rq->idle
4628 +                       || !curr->on_rq)
4629 +               return;
4630 +       /*
4631 +        * Get the amount of time the current task was running
4632 +        * since the last time we changed raw_weighted_load:
4633 +        */
4634 +       delta_exec = now - curr->exec_start;
4635 +       if (unlikely(delta_exec > curr->exec_max))
4636 +               curr->exec_max = delta_exec;
4637 +
4638 +       if (sysctl_sched_load_smoothing) {
4639 +               delta_fair = delta_exec << SCHED_LOAD_SHIFT;
4640 +               do_div(delta_fair, rq->raw_weighted_load);
4641 +
4642 +               load = rq->cpu_load[CPU_LOAD_IDX_MAX-1] + 1;
4643 +               if (sysctl_sched_load_smoothing & 2)
4644 +                       load = max(load, rq->raw_weighted_load);
4645 +
4646 +               delta_mine = delta_exec << curr->load_shift;
4647 +               do_div(delta_mine, load);
4648 +       } else {
4649 +               delta_fair = delta_exec << SCHED_LOAD_SHIFT;
4650 +               do_div(delta_fair, rq->raw_weighted_load);
4651 +
4652 +               delta_mine = delta_exec << curr->load_shift;
4653 +               do_div(delta_mine, rq->raw_weighted_load);
4654 +       }
4655 +
4656 +       curr->sum_exec_runtime += delta_exec;
4657 +       curr->exec_start = now;
4658 +
4659 +       rq->fair_clock += delta_fair;
4660 +       rq->exec_clock += delta_exec;
4661 +
4662 +       /*
4663 +        * We executed delta_exec amount of time on the CPU,
4664 +        * but we were only entitled to delta_mine amount of
4665 +        * time during that period (if nr_running == 1 then
4666 +        * the two values are equal):
4667 +        */
4668 +
4669 +       /*
4670 +        * Task already marked for preemption, do not burden
4671 +        * it with the cost of not having left the CPU yet.
4672 +        */
4673 +       if (unlikely(test_tsk_thread_flag(curr, TIF_NEED_RESCHED)))
4674 +               goto out_nowait;
4675 +
4676 +       curr->wait_runtime -= delta_exec - delta_mine;
4677 +       if (unlikely(curr->wait_runtime < curr->min_wait_runtime))
4678 +               curr->min_wait_runtime = curr->wait_runtime;
4679 +
4680 +       rq->wait_runtime -= delta_exec - delta_mine;
4681 +out_nowait:
4682 +       ;
4683 +}
4684 +
4685 +static inline void
4686 +update_stats_wait_start(struct rq *rq, struct task_struct *p, u64 now)
4687 +{
4688 +       p->wait_start_fair = rq->fair_clock;
4689 +       p->wait_start = now;
4690 +}
4691 +
4692 +/*
4693 + * Task is being enqueued - update stats:
4694 + */
4695 +static inline void
4696 +update_stats_enqueue(struct rq *rq, struct task_struct *p, u64 now)
4697 +{
4698 +       s64 key;
4699 +
4700 +       /*
4701 +        * Update the fair clock.
4702 +        */
4703 +       update_curr(rq, now);
4704 +
4705 +       /*
4706 +        * Are we enqueueing a waiting task? (for current tasks
4707 +        * a dequeue/enqueue event is a NOP)
4708 +        */
4709 +       if (p != rq->curr)
4710 +               update_stats_wait_start(rq, p, now);
4711 +       /*
4712 +        * Update the key:
4713 +        */
4714 +       key = rq->fair_clock;
4715 +
4716 +       /*
4717 +        * Optimize the common nice 0 case:
4718 +        */
4719 +       if (likely(p->load_shift == SCHED_LOAD_SHIFT)) {
4720 +               key -= p->wait_runtime;
4721 +       } else {
4722 +               unsigned int delta_bits;
4723 +
4724 +               if (p->load_shift < SCHED_LOAD_SHIFT) {
4725 +                       /* plus-reniced tasks get helped: */
4726 +                       delta_bits = SCHED_LOAD_SHIFT - p->load_shift;
4727 +                       key -= p->wait_runtime << delta_bits;
4728 +               } else {
4729 +                       /* negative-reniced tasks get hurt: */
4730 +                       delta_bits = p->load_shift - SCHED_LOAD_SHIFT;
4731 +                       key -= p->wait_runtime >> delta_bits;
4732 +               }
4733 +       }
4734 +
4735 +       p->fair_key = key;
4736 +}
4737 +
4738 +/*
4739 + * Note: must be called with a freshly updated rq->fair_clock.
4740 + */
4741 +static inline void
4742 +update_stats_wait_end(struct rq *rq, struct task_struct *p, u64 now)
4743 +{
4744 +       u64 delta, fair_delta, delta_wait;
4745 +
4746 +       delta_wait = now - p->wait_start;
4747 +       if (unlikely(delta_wait > p->wait_max))
4748 +               p->wait_max = delta_wait;
4749 +
4750 +       delta = rq->fair_clock - p->wait_start_fair;
4751 +       fair_delta = rescale_load(p, delta);
4752 +
4753 +       p->sum_wait_runtime += fair_delta;
4754 +       rq->wait_runtime += fair_delta;
4755 +       p->wait_runtime += fair_delta;
4756 +
4757 +       p->wait_start_fair = 0;
4758 +       p->wait_start = 0;
4759 +}
4760 +
4761 +static inline void
4762 +update_stats_dequeue(struct rq *rq, struct task_struct *p, u64 now)
4763 +{
4764 +       update_curr(rq, now);
4765 +       /*
4766 +        * Mark the end of the wait period if dequeueing a
4767 +        * waiting task:
4768 +        */
4769 +       if (p != rq->curr)
4770 +               update_stats_wait_end(rq, p, now);
4771 +}
4772 +
4773 +/*
4774 + * We are picking a new current task - update its stats:
4775 + */
4776 +static inline void
4777 +update_stats_curr_start(struct rq *rq, struct task_struct *p, u64 now)
4778 +{
4779 +       /*
4780 +        * We are starting a new run period:
4781 +        */
4782 +       p->exec_start = now;
4783 +}
4784 +
4785 +/*
4786 + * We are descheduling a task - update its stats:
4787 + */
4788 +static inline void
4789 +update_stats_curr_end(struct rq *rq, struct task_struct *p, u64 now)
4790 +{
4791 +       update_curr(rq, now);
4792 +
4793 +       p->exec_start = 0;
4794 +}
4795 +
4796 +/**************************************************************/
4797 +/* Scheduling class queueing methods:
4798 + */
4799 +
4800 +/*
4801 + * The enqueue_task method is called before nr_running is
4802 + * increased. Here we update the fair scheduling stats and
4803 + * then put the task into the rbtree:
4804 + */
4805 +static void
4806 +enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
4807 +{
4808 +       unsigned long max_delta = sysctl_sched_sleep_history_max, factor;
4809 +       u64 delta = 0;
4810 +
4811 +       if (wakeup) {
4812 +               if (p->sleep_start) {
4813 +                       delta = now - p->sleep_start;
4814 +                       if ((s64)delta < 0)
4815 +                               delta = 0;
4816 +
4817 +                       if (unlikely(delta > p->sleep_max))
4818 +                               p->sleep_max = delta;
4819 +
4820 +                       p->sleep_start = 0;
4821 +               }
4822 +               if (p->block_start) {
4823 +                       delta = now - p->block_start;
4824 +                       if ((s64)delta < 0)
4825 +                               delta = 0;
4826 +
4827 +                       if (unlikely(delta > p->block_max))
4828 +                               p->block_max = delta;
4829 +
4830 +                       p->block_start = 0;
4831 +               }
4832 +
4833 +               /*
4834 +                * Waking up after a sleep/block period: decay the
4835 +                * wait_runtime value (skipped for sysctl values -1/-2):
4836 +                */
4837 +               if (max_delta != -1 && max_delta != -2) {
4838 +                       if (delta < max_delta) {
4839 +                               factor = 1024 * (max_delta -
4840 +                                       (unsigned long)delta) / max_delta;
4841 +                               p->wait_runtime *= (int)factor;
4842 +                               p->wait_runtime /= 1024;
4843 +                       } else {
4844 +                               p->wait_runtime = 0;
4845 +                       }
4846 +               }
4847 +       }
4848 +       update_stats_enqueue(rq, p, now);
4849 +       if (wakeup && max_delta == -2)
4850 +               p->wait_runtime = 0;
4851 +       __enqueue_task_fair(rq, p);
4852 +}
4853 +
4854 +/*
4855 + * The dequeue_task method is called before nr_running is
4856 + * decreased. We remove the task from the rbtree and
4857 + * update the fair scheduling stats:
4858 + */
4859 +static void
4860 +dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
4861 +{
4862 +       update_stats_dequeue(rq, p, now);
4863 +       if (sleep) {
4864 +               if (p->state & TASK_INTERRUPTIBLE)
4865 +                       p->sleep_start = now;
4866 +               if (p->state & TASK_UNINTERRUPTIBLE)
4867 +                       p->block_start = now;
4868 +       }
4869 +       __dequeue_task_fair(rq, p);
4870 +}
4871 +
4872 +/*
4873 + * sched_yield() support is very simple via the rbtree: we just
4874 + * dequeue the task and move it after the next task, which
4875 + * causes tasks to roundrobin.
4876 + */
4877 +static void
4878 +yield_task_fair(struct rq *rq, struct task_struct *p, struct task_struct *p_to)
4879 +{
4880 +       struct rb_node *curr, *next, *first;
4881 +       struct task_struct *p_next;
4882 +       s64 yield_key;
4883 +       u64 now;
4884 +
4885 +       /*
4886 +        * yield-to support: if we are on the same runqueue then
4887 +        * give half of our wait_runtime (if it's positive) to the other task:
4888 +        */
4889 +       if (p_to && p->wait_runtime > 0) {
4890 +               p_to->wait_runtime += p->wait_runtime >> 1;
4891 +               p->wait_runtime >>= 1;
4892 +       }
4893 +       curr = &p->run_node;
4894 +       first = first_fair(rq);
4895 +       /*
4896 +        * Move this task to the second place in the tree:
4897 +        */
4898 +       if (unlikely(curr != first)) {
4899 +               next = first;
4900 +       } else {
4901 +               next = rb_next(curr);
4902 +               /*
4903 +                * We were the last one already - nothing to do, return
4904 +                * and reschedule:
4905 +                */
4906 +               if (unlikely(!next))
4907 +                       return;
4908 +       }
4909 +
4910 +       p_next = rb_entry(next, struct task_struct, run_node);
4911 +       /*
4912 +        * Minimally necessary key value to be the second in the tree:
4913 +        */
4914 +       yield_key = p_next->fair_key + 1;
4915 +
4916 +       now = __rq_clock(rq);
4917 +       dequeue_task_fair(rq, p, 0, now);
4918 +       p->on_rq = 0;
4919 +
4920 +       /*
4921 +        * Only update the key if we need to move more backwards
4922 +        * than the minimally necessary position to be the second:
4923 +        */
4924 +       if (p->fair_key < yield_key)
4925 +               p->fair_key = yield_key;
4926 +
4927 +       __enqueue_task_fair(rq, p);
4928 +       p->on_rq = 1;
4929 +}
4930 +
4931 +/*
4932 + * Preempt curr if p leads it by more than the (niced) granularity:
4933 + */
4934 +static inline void
4935 +__check_preempt_curr_fair(struct rq *rq, struct task_struct *p,
4936 +                         struct task_struct *curr, unsigned long granularity)
4937 +{
4938 +       s64 __delta = curr->fair_key - p->fair_key;
4939 +
4940 +       /*
4941 +        * Take scheduling granularity into account - do not
4942 +        * preempt the current task unless the best task has
4943 +        * a larger than sched_granularity fairness advantage:
4944 +        */
4945 +       if (__delta > niced_granularity(rq, curr, granularity))
4946 +               resched_task(curr);
4947 +}
4948 +
4949 +/*
4950 + * Preempt the current task with a newly woken task if needed:
4951 + */
4952 +static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
4953 +{
4954 +       struct task_struct *curr = rq->curr;
4955 +
4956 +       if ((curr == rq->idle) || rt_prio(p->prio)) {
4957 +               resched_task(curr);
4958 +       } else {
4959 +               __check_preempt_curr_fair(rq, p, curr,
4960 +                                         sysctl_sched_granularity);
4961 +       }
4962 +}
4963 +
4964 +static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now)
4965 +{
4966 +       struct task_struct *p = __pick_next_task_fair(rq);
4967 +
4968 +       /*
4969 +        * Any task has to be enqueued before it gets to execute on
4970 +        * a CPU. So account for the time it spent waiting on the
4971 +        * runqueue. (note, here we rely on pick_next_task() having
4972 +        * done a put_prev_task_fair() shortly before this, which
4973 +        * updated rq->fair_clock - used by update_stats_wait_end())
4974 +        */
4975 +       update_stats_wait_end(rq, p, now);
4976 +       update_stats_curr_start(rq, p, now);
4977 +
4978 +       return p;
4979 +}
4980 +
4981 +/*
4982 + * Account for a descheduled task:
4983 + */
4984 +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
4985 +{
4986 +       if (prev == rq->idle)
4987 +               return;
4988 +
4989 +       update_stats_curr_end(rq, prev, now);
4990 +       /*
4991 +        * If the task is still waiting for the CPU (it just got
4992 +        * preempted), start the wait period:
4993 +        */
4994 +       if (prev->on_rq)
4995 +               update_stats_wait_start(rq, prev, now);
4996 +}
4997 +
4998 +/**************************************************************/
4999 +/* Fair scheduling class load-balancing methods:
5000 + */
5001 +
5002 +/*
5003 + * Load-balancing iterator. Note: while the runqueue stays locked
5004 + * during the whole iteration, the current task might be
5005 + * dequeued so the iterator has to be dequeue-safe. Here we
5006 + * achieve that by always pre-iterating before returning
5007 + * the current task:
5008 + */
5009 +static struct task_struct * load_balance_start_fair(struct rq *rq)
5010 +{
5011 +       struct rb_node *first = first_fair(rq);
5012 +       struct task_struct *p;
5013 +
5014 +       if (!first)
5015 +               return NULL;
5016 +
5017 +       p = rb_entry(first, struct task_struct, run_node);
5018 +
5019 +       rq->rb_load_balance_curr = rb_next(first);
5020 +
5021 +       return p;
5022 +}
5023 +
5024 +static struct task_struct * load_balance_next_fair(struct rq *rq)
5025 +{
5026 +       struct rb_node *curr = rq->rb_load_balance_curr;
5027 +       struct task_struct *p;
5028 +
5029 +       if (!curr)
5030 +               return NULL;
5031 +
5032 +       p = rb_entry(curr, struct task_struct, run_node);
5033 +       rq->rb_load_balance_curr = rb_next(curr);
5034 +
5035 +       return p;
5036 +}
5037 +
5038 +/*
5039 + * scheduler tick hitting a task of our scheduling class:
5040 + */
5041 +static void task_tick_fair(struct rq *rq, struct task_struct *curr)
5042 +{
5043 +       struct task_struct *next;
5044 +       u64 now = __rq_clock(rq);
5045 +
5046 +       /*
5047 +        * Dequeue and enqueue the task to update its
5048 +        * position within the tree:
5049 +        */
5050 +       dequeue_task_fair(rq, curr, 0, now);
5051 +       curr->on_rq = 0;
5052 +       enqueue_task_fair(rq, curr, 0, now);
5053 +       curr->on_rq = 1;
5054 +
5055 +       /*
5056 +        * Reschedule if another task tops the current one.
5057 +        */
5058 +       next = __pick_next_task_fair(rq);
5059 +       if (next == curr)
5060 +               return;
5061 +
5062 +       if ((curr == rq->idle) || (rt_prio(next->prio) &&
5063 +                                       (next->prio < curr->prio)))
5064 +               resched_task(curr);
5065 +       else
5066 +               __check_preempt_curr_fair(rq, next, curr,
5067 +                                         sysctl_sched_granularity);
5068 +}
5069 +
5070 +/*
5071 + * Share the fairness runtime between parent and child, thus the
5072 + * total amount of pressure for CPU stays equal - new tasks
5073 + * get a chance to run but frequent forkers are not allowed to
5074 + * monopolize the CPU. Note: the parent runqueue is locked,
5075 + * the child is not running yet.
5076 + */
5077 +static void task_new_fair(struct rq *rq, struct task_struct *p)
5078 +{
5079 +       sched_info_queued(p);
5080 +       update_stats_enqueue(rq, p, rq_clock(rq));
5081 +       /*
5082 +        * Child runs first: we let it run before the parent
5083 +        * until it reschedules once. We set up the key so that
5084 +        * it will preempt the parent:
5085 +        */
5086 +       p->fair_key = current->fair_key - niced_granularity(rq, rq->curr,
5087 +                                               sysctl_sched_granularity) - 1;
5088 +       __enqueue_task_fair(rq, p);
5089 +       p->on_rq = 1;
5090 +       inc_nr_running(p, rq);
5091 +}
5092 +
5093 +/*
5094 + * All the scheduling class methods:
5095 + */
5096 +struct sched_class fair_sched_class __read_mostly = {
5097 +       .enqueue_task           = enqueue_task_fair,
5098 +       .dequeue_task           = dequeue_task_fair,
5099 +       .yield_task             = yield_task_fair,
5100 +
5101 +       .check_preempt_curr     = check_preempt_curr_fair,
5102 +
5103 +       .pick_next_task         = pick_next_task_fair,
5104 +       .put_prev_task          = put_prev_task_fair,
5105 +
5106 +       .load_balance_start     = load_balance_start_fair,
5107 +       .load_balance_next      = load_balance_next_fair,
5108 +       .task_tick              = task_tick_fair,
5109 +       .task_new               = task_new_fair,
5110 +};
5111 Index: linux-cfs-2.6.20.8.q/kernel/sched_rt.c
5112 ===================================================================
5113 --- /dev/null
5114 +++ linux-cfs-2.6.20.8.q/kernel/sched_rt.c
5115 @@ -0,0 +1,184 @@
5116 +/*
5117 + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
5118 + * policies)
5119 + */
5120 +
5121 +static void
5122 +enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
5123 +{
5124 +       struct prio_array *array = &rq->active;
5125 +
5126 +       list_add_tail(&p->run_list, array->queue + p->prio);
5127 +       __set_bit(p->prio, array->bitmap);
5128 +}
5129 +
5130 +/*
5131 + * Removing a task from a priority array:
5132 + */
5133 +static void
5134 +dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
5135 +{
5136 +       struct prio_array *array = &rq->active;
5137 +
5138 +       list_del(&p->run_list);
5139 +       if (list_empty(array->queue + p->prio))
5140 +               __clear_bit(p->prio, array->bitmap);
5141 +}
5142 +
5143 +/*
5144 + * Put task to the end of the run list without the overhead of dequeue
5145 + * followed by enqueue.
5146 + */
5147 +static void requeue_task_rt(struct rq *rq, struct task_struct *p)
5148 +{
5149 +       struct prio_array *array = &rq->active;
5150 +
5151 +       list_move_tail(&p->run_list, array->queue + p->prio);
5152 +}
5153 +
5154 +static void
5155 +yield_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *p_to)
5156 +{
5157 +       requeue_task_rt(rq, p);
5158 +}
5159 +
5160 +/*
5161 + * Preempt the current task with a newly woken task if needed:
5162 + */
5163 +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
5164 +{
5165 +       if (p->prio < rq->curr->prio)
5166 +               resched_task(rq->curr);
5167 +}
5168 +
5169 +static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now)
5170 +{
5171 +       struct prio_array *array = &rq->active;
5172 +       struct list_head *queue;
5173 +       int idx;
5174 +
5175 +       idx = sched_find_first_bit(array->bitmap);
5176 +       if (idx >= MAX_RT_PRIO)
5177 +               return NULL;
5178 +
5179 +       queue = array->queue + idx;
5180 +       return list_entry(queue->next, struct task_struct, run_list);
5181 +}
5182 +
5183 +/*
5184 + * No accounting done when RT tasks are descheduled:
5185 + */
5186 +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
5187 +{
5188 +}
5189 +
5190 +/*
5191 + * Load-balancing iterator. Note: while the runqueue stays locked
5192 + * during the whole iteration, the current task might be
5193 + * dequeued so the iterator has to be dequeue-safe. Here we
5194 + * achieve that by always pre-iterating before returning
5195 + * the current task:
5196 + */
5197 +static struct task_struct * load_balance_start_rt(struct rq *rq)
5198 +{
5199 +       struct prio_array *array = &rq->active;
5200 +       struct list_head *head, *curr;
5201 +       struct task_struct *p;
5202 +       int idx;
5203 +
5204 +       idx = sched_find_first_bit(array->bitmap);
5205 +       if (idx >= MAX_RT_PRIO)
5206 +               return NULL;
5207 +
5208 +       head = array->queue + idx;
5209 +       curr = head->prev;
5210 +
5211 +       p = list_entry(curr, struct task_struct, run_list);
5212 +
5213 +       curr = curr->prev;
5214 +
5215 +       rq->rt_load_balance_idx = idx;
5216 +       rq->rt_load_balance_head = head;
5217 +       rq->rt_load_balance_curr = curr;
5218 +
5219 +       return p;
5220 +}
5221 +
5222 +static struct task_struct * load_balance_next_rt(struct rq *rq)
5223 +{
5224 +       struct prio_array *array = &rq->active;
5225 +       struct list_head *head, *curr;
5226 +       struct task_struct *p;
5227 +       int idx;
5228 +
5229 +       idx = rq->rt_load_balance_idx;
5230 +       head = rq->rt_load_balance_head;
5231 +       curr = rq->rt_load_balance_curr;
5232 +
5233 +       /*
5234 +        * If we arrived back to the head again then
5235 +        * iterate to the next queue (if any):
5236 +        */
5237 +       if (unlikely(head == curr)) {
5238 +               int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
5239 +
5240 +               if (next_idx >= MAX_RT_PRIO)
5241 +                       return NULL;
5242 +
5243 +               idx = next_idx;
5244 +               head = array->queue + idx;
5245 +               curr = head->prev;
5246 +
5247 +               rq->rt_load_balance_idx = idx;
5248 +               rq->rt_load_balance_head = head;
5249 +       }
5250 +
5251 +       p = list_entry(curr, struct task_struct, run_list);
5252 +
5253 +       curr = curr->prev;
5254 +
5255 +       rq->rt_load_balance_curr = curr;
5256 +
5257 +       return p;
5258 +}
5259 +
5260 +static void task_tick_rt(struct rq *rq, struct task_struct *p)
5261 +{
5262 +       /*
5263 +        * RR tasks need a special form of timeslice management.
5264 +        * FIFO tasks have no timeslices.
5265 +        */
5266 +       if ((p->policy == SCHED_RR) && !--p->time_slice) {
5267 +               p->time_slice = static_prio_timeslice(p->static_prio);
5268 +               set_tsk_need_resched(p);
5269 +
5270 +               /* put it at the end of the queue: */
5271 +               requeue_task_rt(rq, p);
5272 +       }
5273 +}
5274 +
5275 +/*
5276 + * No parent/child timeslice management necessary for RT tasks,
5277 + * just activate them:
5278 + */
5279 +static void task_new_rt(struct rq *rq, struct task_struct *p)
5280 +{
5281 +       activate_task(rq, p, 1);
5282 +}
5283 +
5284 +static struct sched_class rt_sched_class __read_mostly = {
5285 +       .enqueue_task           = enqueue_task_rt,
5286 +       .dequeue_task           = dequeue_task_rt,
5287 +       .yield_task             = yield_task_rt,
5288 +
5289 +       .check_preempt_curr     = check_preempt_curr_rt,
5290 +
5291 +       .pick_next_task         = pick_next_task_rt,
5292 +       .put_prev_task          = put_prev_task_rt,
5293 +
5294 +       .load_balance_start     = load_balance_start_rt,
5295 +       .load_balance_next      = load_balance_next_rt,
5296 +
5297 +       .task_tick              = task_tick_rt,
5298 +       .task_new               = task_new_rt,
5299 +};
5300 Index: linux-cfs-2.6.20.8.q/kernel/sched_stats.h
5301 ===================================================================
5302 --- /dev/null
5303 +++ linux-cfs-2.6.20.8.q/kernel/sched_stats.h
5304 @@ -0,0 +1,235 @@
5305 +
5306 +#ifdef CONFIG_SCHEDSTATS
5307 +/*
5308 + * bump this up when changing the output format or the meaning of an existing
5309 + * format, so that tools can adapt (or abort)
5310 + */
5311 +#define SCHEDSTAT_VERSION 14
5312 +
5313 +static int show_schedstat(struct seq_file *seq, void *v)
5314 +{
5315 +       int cpu;
5316 +
5317 +       seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
5318 +       seq_printf(seq, "timestamp %lu\n", jiffies);
5319 +       for_each_online_cpu(cpu) {
5320 +               struct rq *rq = cpu_rq(cpu);
5321 +#ifdef CONFIG_SMP
5322 +               struct sched_domain *sd;
5323 +               int dcnt = 0;
5324 +#endif
5325 +
5326 +               /* runqueue-specific stats */
5327 +               seq_printf(seq,
5328 +                   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
5329 +                   cpu, rq->yld_both_empty,
5330 +                   rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
5331 +                   rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
5332 +                   rq->ttwu_cnt, rq->ttwu_local,
5333 +                   rq->rq_sched_info.cpu_time,
5334 +                   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
5335 +
5336 +               seq_printf(seq, "\n");
5337 +
5338 +#ifdef CONFIG_SMP
5339 +               /* domain-specific stats */
5340 +               preempt_disable();
5341 +               for_each_domain(cpu, sd) {
5342 +                       enum idle_type itype;
5343 +                       char mask_str[NR_CPUS];
5344 +
5345 +                       cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
5346 +                       seq_printf(seq, "domain%d %s", dcnt++, mask_str);
5347 +                       for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
5348 +                                       itype++) {
5349 +                               seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
5350 +                                               "%lu",
5351 +                                   sd->lb_cnt[itype],
5352 +                                   sd->lb_balanced[itype],
5353 +                                   sd->lb_failed[itype],
5354 +                                   sd->lb_imbalance[itype],
5355 +                                   sd->lb_gained[itype],
5356 +                                   sd->lb_hot_gained[itype],
5357 +                                   sd->lb_nobusyq[itype],
5358 +                                   sd->lb_nobusyg[itype]);
5359 +                       }
5360 +                       seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
5361 +                           " %lu %lu %lu\n",
5362 +                           sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
5363 +                           sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
5364 +                           sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
5365 +                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
5366 +                           sd->ttwu_move_balance);
5367 +               }
5368 +               preempt_enable();
5369 +#endif
5370 +       }
5371 +       return 0;
5372 +}
5373 +
5374 +static int schedstat_open(struct inode *inode, struct file *file)
5375 +{
5376 +       unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
5377 +       char *buf = kmalloc(size, GFP_KERNEL);
5378 +       struct seq_file *m;
5379 +       int res;
5380 +
5381 +       if (!buf)
5382 +               return -ENOMEM;
5383 +       res = single_open(file, show_schedstat, NULL);
5384 +       if (!res) {
5385 +               m = file->private_data;
5386 +               m->buf = buf;
5387 +               m->size = size;
5388 +       } else
5389 +               kfree(buf);
5390 +       return res;
5391 +}
5392 +
5393 +const struct file_operations proc_schedstat_operations = {
5394 +       .open    = schedstat_open,
5395 +       .read    = seq_read,
5396 +       .llseek  = seq_lseek,
5397 +       .release = single_release,
5398 +};
5399 +
5400 +/*
5401 + * Expects runqueue lock to be held for atomicity of update
5402 + */
5403 +static inline void
5404 +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
5405 +{
5406 +       if (rq) {
5407 +               rq->rq_sched_info.run_delay += delta_jiffies;
5408 +               rq->rq_sched_info.pcnt++;
5409 +       }
5410 +}
5411 +
5412 +/*
5413 + * Expects runqueue lock to be held for atomicity of update
5414 + */
5415 +static inline void
5416 +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
5417 +{
5418 +       if (rq)
5419 +               rq->rq_sched_info.cpu_time += delta_jiffies;
5420 +}
5421 +# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
5422 +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
5423 +#else /* !CONFIG_SCHEDSTATS */
5424 +static inline void
5425 +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
5426 +{}
5427 +static inline void
5428 +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
5429 +{}
5430 +# define schedstat_inc(rq, field)      do { } while (0)
5431 +# define schedstat_add(rq, field, amt) do { } while (0)
5432 +#endif
5433 +
5434 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
5435 +/*
5436 + * Called when a process is dequeued from the active array and given
5437 + * the cpu.  We should note that with the exception of interactive
5438 + * tasks, the expired queue will become the active queue after the active
5439 + * queue is empty, without explicitly dequeuing and requeuing tasks in the
5440 + * expired queue.  (Interactive tasks may be requeued directly to the
5441 + * active queue, thus delaying tasks in the expired queue from running;
5442 + * see scheduler_tick()).
5443 + *
5444 + * This function is only called from sched_info_arrive(), rather than
5445 + * dequeue_task(). Even though a task may be queued and dequeued multiple
5446 + * times as it is shuffled about, we're really interested in knowing how
5447 + * long it was from the *first* time it was queued to the time that it
5448 + * finally hit a cpu.
5449 + */
5450 +static inline void sched_info_dequeued(struct task_struct *t)
5451 +{
5452 +       t->sched_info.last_queued = 0;
5453 +}
5454 +
5455 +/*
5456 + * Called when a task finally hits the cpu.  We can now calculate how
5457 + * long it was waiting to run.  We also note when it began so that we
5458 + * can keep stats on how long its timeslice is.
5459 + */
5460 +static void sched_info_arrive(struct task_struct *t)
5461 +{
5462 +       unsigned long now = jiffies, delta_jiffies = 0;
5463 +
5464 +       if (t->sched_info.last_queued)
5465 +               delta_jiffies = now - t->sched_info.last_queued;
5466 +       sched_info_dequeued(t);
5467 +       t->sched_info.run_delay += delta_jiffies;
5468 +       t->sched_info.last_arrival = now;
5469 +       t->sched_info.pcnt++;
5470 +
5471 +       rq_sched_info_arrive(task_rq(t), delta_jiffies);
5472 +}
5473 +
5474 +/*
5475 + * Called when a process is queued into either the active or expired
5476 + * array.  The time is noted and later used to determine how long we
5477 + * had to wait for us to reach the cpu.  Since the expired queue will
5478 + * become the active queue after active queue is empty, without dequeuing
5479 + * and requeuing any tasks, we are interested in queuing to either. It
5480 + * is unusual but not impossible for tasks to be dequeued and immediately
5481 + * requeued in the same or another array: this can happen in sched_yield(),
5482 + * set_user_nice(), and even load_balance() as it moves tasks from runqueue
5483 + * to runqueue.
5484 + *
5485 + * This function is only called from enqueue_task(), but also only updates
5486 + * the timestamp if it is not already set.  It's assumed that
5487 + * sched_info_dequeued() will clear that stamp when appropriate.
5488 + */
5489 +static inline void sched_info_queued(struct task_struct *t)
5490 +{
5491 +       if (unlikely(sched_info_on()))
5492 +               if (!t->sched_info.last_queued)
5493 +                       t->sched_info.last_queued = jiffies;
5494 +}
5495 +
5496 +/*
5497 + * Called when a process ceases being the active-running process, either
5498 + * voluntarily or involuntarily.  Now we can calculate how long we ran.
5499 + */
5500 +static inline void sched_info_depart(struct task_struct *t)
5501 +{
5502 +       unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
5503 +
5504 +       t->sched_info.cpu_time += delta_jiffies;
5505 +       rq_sched_info_depart(task_rq(t), delta_jiffies);
5506 +}
5507 +
5508 +/*
5509 + * Called when tasks are switched involuntarily due, typically, to expiring
5510 + * their time slice.  (This may also be called when switching to or from
5511 + * the idle task.)  We are only called when prev != next.
5512 + */
5513 +static inline void
5514 +__sched_info_switch(struct task_struct *prev, struct task_struct *next)
5515 +{
5516 +       struct rq *rq = task_rq(prev);
5517 +
5518 +       /*
5519 +        * prev now departs the cpu.  It's not interesting to record
5520 +        * stats about how efficient we were at scheduling the idle
5521 +        * process, however.
5522 +        */
5523 +       if (prev != rq->idle)
5524 +               sched_info_depart(prev);
5525 +
5526 +       if (next != rq->idle)
5527 +               sched_info_arrive(next);
5528 +}
5529 +static inline void
5530 +sched_info_switch(struct task_struct *prev, struct task_struct *next)
5531 +{
5532 +       if (unlikely(sched_info_on()))
5533 +               __sched_info_switch(prev, next);
5534 +}
5535 +#else
5536 +#define sched_info_queued(t)           do { } while (0)
5537 +#define sched_info_switch(t, next)     do { } while (0)
5538 +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
5539 +
5540 Index: linux-cfs-2.6.20.8.q/kernel/sysctl.c
5541 ===================================================================
5542 --- linux-cfs-2.6.20.8.q.orig/kernel/sysctl.c
5543 +++ linux-cfs-2.6.20.8.q/kernel/sysctl.c
5544 @@ -320,6 +320,46 @@ static ctl_table kern_table[] = {
5545                 .strategy       = &sysctl_uts_string,
5546         },
5547         {
5548 +               .ctl_name       = CTL_UNNUMBERED,
5549 +               .procname       = "sched_granularity_ns",
5550 +               .data           = &sysctl_sched_granularity,
5551 +               .maxlen         = sizeof(unsigned int),
5552 +               .mode           = 0644,
5553 +               .proc_handler   = &proc_dointvec,
5554 +       },
5555 +       {
5556 +               .ctl_name       = CTL_UNNUMBERED,
5557 +               .procname       = "sched_wakeup_granularity_ns",
5558 +               .data           = &sysctl_sched_wakeup_granularity,
5559 +               .maxlen         = sizeof(unsigned int),
5560 +               .mode           = 0644,
5561 +               .proc_handler   = &proc_dointvec,
5562 +       },
5563 +       {
5564 +               .ctl_name       = CTL_UNNUMBERED,
5565 +               .procname       = "sched_sleep_history_max_ns",
5566 +               .data           = &sysctl_sched_sleep_history_max,
5567 +               .maxlen         = sizeof(unsigned int),
5568 +               .mode           = 0644,
5569 +               .proc_handler   = &proc_dointvec,
5570 +       },
5571 +       {
5572 +               .ctl_name       = CTL_UNNUMBERED,
5573 +               .procname       = "sched_child_runs_first",
5574 +               .data           = &sysctl_sched_child_runs_first,
5575 +               .maxlen         = sizeof(unsigned int),
5576 +               .mode           = 0644,
5577 +               .proc_handler   = &proc_dointvec,
5578 +       },
5579 +       {
5580 +               .ctl_name       = CTL_UNNUMBERED,
5581 +               .procname       = "sched_load_smoothing",
5582 +               .data           = &sysctl_sched_load_smoothing,
5583 +               .maxlen         = sizeof(unsigned int),
5584 +               .mode           = 0644,
5585 +               .proc_handler   = &proc_dointvec,
5586 +       },
5587 +       {
5588                 .ctl_name       = KERN_PANIC,
5589                 .procname       = "panic",
5590                 .data           = &panic_timeout,