kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
  13  */
  14
  15 /*
  16  * 'sched.c' is the main kernel file. It contains scheduling primitives
  17  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  18  * call functions (such as getpid()), which just extract a field from
  19  * the current task.
  20  */
  21
  22 #include <linux/config.h>
  23 #include <linux/mm.h>
  24 #include <linux/init.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/kernel_stat.h>
  28
  29 #include <asm/uaccess.h>
  30 #include <asm/mmu_context.h>
  31
  32 extern void timer_bh(void);      /* not defined in the part of the file reviewed; presumably a bottom-half handler -- confirm */
  33 extern void tqueue_bh(void);     /* likewise declared only; confirm where it is defined and used */
  34 extern void immediate_bh(void);  /* likewise declared only; confirm where it is defined and used */
  35
  36 /*
  37  * Scheduler-wide variables.
  38  */
  39
  40 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
  41
  42 extern void mem_use(void);       /* declared only; no caller in the part of the file reviewed */
  43
  44 /*
  45  * Scheduling quanta.
  46  *
  47  * NOTE! The unix "nice" value influences how long a time-slice a
  48  * process gets. The nice value ranges from -20 to +19, where -20
  49  * is a "high-priority" task and +19 is a "low-priority" task;
  50  * NICE_TO_TICKS() turns it into TICK_SCALE(20 - nice) + 1 ticks.
  51  *
  52  * We want the time-slice to be around 50ms or so, so the shift
  53  * applied by TICK_SCALE() depends on the value of HZ.
  54  */
  55 #if HZ < 200
  56 #define TICK_SCALE(x)   ((x) >> 2)      /* HZ < 200: quarter the value */
  57 #elif HZ < 400
  58 #define TICK_SCALE(x)   ((x) >> 1)      /* 200 <= HZ < 400: halve the value */
  59 #elif HZ < 800
  60 #define TICK_SCALE(x)   (x)             /* 400 <= HZ < 800: use the value as is */
  61 #elif HZ < 1600
  62 #define TICK_SCALE(x)   ((x) << 1)      /* 800 <= HZ < 1600: double the value */
  63 #else
  64 #define TICK_SCALE(x)   ((x) << 2)      /* HZ >= 1600: quadruple the value */
  65 #endif
  66
  67 #define NICE_TO_TICKS(nice)     (TICK_SCALE(20-(nice))+1)       /* the +1 keeps the result at least 1 for nice <= 19 */
  68
  69
  68
  69
  70 /*
  71  *      Init task must be ok at boot for the ix86 as we will check its signals
  72  *      via the SMP irq return path.  (idle_task() indexes this array on SMP.)
  73  */
  74
  75 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };      /* [0] = &init_task, remaining slots start out NULL */
  76
  77 /*
  78  * The tasklist_lock protects the linked list of processes.
  79  *
  80  * The runqueue_lock locks the parts that actually access
  81  * and change the run-queues, and have to be interrupt-safe.
  82  *
  83  * If both locks are to be held at once, take the tasklist_lock first:
  84  * the runqueue_lock nests inside it (setscheduler() does exactly this).
  85  */
  86 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
  87 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
  88
  89 static LIST_HEAD(runqueue_head);        /* tasks are linked here through ->run_list by add_to_runqueue() */
  90
  91 /*
  92  * We align per-CPU scheduling data on cacheline boundaries,
  93  * to prevent cacheline ping-pong.
  94  */
  95 static union {
  96         struct schedule_data {
  97                 struct task_struct * curr;      /* task schedule() last selected to run on this CPU */
  98                 cycles_t last_schedule;         /* get_cycles() stamp written by schedule() (SMP builds only) */
  99         } schedule_data;
 100         char __pad [SMP_CACHE_BYTES];           /* makes every array element at least SMP_CACHE_BYTES big */
 101 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};     /* entry 0 starts with curr = &init_task */
 102
 103 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr    /* lvalue: task recorded as current on 'cpu' */
 104 #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule      /* lvalue: cycle stamp of 'cpu' */
 105
 106 struct kernel_stat kstat;       /* schedule() increments kstat.context_swtch on every task switch */
 107
 108 #ifdef CONFIG_SMP
 109
 110 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 111 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
 112                                 ((p)->cpus_allowed & (1 << cpu)))
 113
 114 #else
 115
 116 #define idle_task(cpu) (&init_task)
 117 #define can_schedule(p,cpu) (1)
 118
 119 #endif
 120
 121 void scheduling_functions_start_here(void) { }
 122
 123 /*
 124  * This is the function that decides how desirable a process is..
 125  * You can weigh different processes against each other depending
 126  * on what CPU they've run on lately etc to try to handle cache
 127  * and TLB miss penalties.
 128  *
 129  * Return values:
 130  *       -1000: never select this
 131  *           0: out of time, recalculate counters (but it might still be
 132  *              selected)
 133  *         +ve: "goodness" value (the larger, the better)
 134  *       +1000: realtime process, select this.
 135  */
 136
 137 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 138 {
 139         int weight;
 140
 141         /*
 142          * select the current process after every other
 143          * runnable process, but before the idle thread.
 144          * Also, dont trigger a counter recalculation.
 145          */
 146         weight = -1;
 147         if (p->policy & SCHED_YIELD)
 148                 goto out;
 149
 150         /*
 151          * Non-RT process - normal case first.
 152          */
 153         if (p->policy == SCHED_OTHER) {
 154                 /*
 155                  * Give the process a first-approximation goodness value
 156                  * according to the number of clock-ticks it has left.
 157                  *
 158                  * Don't do any other calculations if the time slice is
 159                  * over..
 160                  */
 161                 weight = p->counter;
 162                 if (!weight)
 163                         goto out;
 164
 165 #ifdef CONFIG_SMP
 166                 /* Give a largish advantage to the same processor...   */
 167                 /* (this is equivalent to penalizing other processors) */
 168                 if (p->processor == this_cpu)
 169                         weight += PROC_CHANGE_PENALTY;
 170 #endif
 171
 172                 /* .. and a slight advantage to the current MM */
 173                 if (p->mm == this_mm || !p->mm)
 174                         weight += 1;
 175                 weight += 20 - p->nice;
 176                 goto out;
 177         }
 178
 179         /*
 180          * Realtime process, select the first one on the
 181          * runqueue (taking priorities within processes
 182          * into account).
 183          */
 184         weight = 1000 + p->rt_priority;
 185 out:
 186         return weight;
 187 }
 188
 189 /*
 190  * the 'goodness value' of replacing a process on a given CPU.
 191  * positive value means 'replace', zero or negative means 'dont'.
 192  */
 193 static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 194 {
 195         return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 196 }
 197
 198 /*
 199  * This is ugly, but reschedule_idle() is very timing-critical.
 200  * We `are called with the runqueue spinlock held and we must
 201  * not claim the tasklist_lock.
 202  */
 203 static FASTCALL(void reschedule_idle(struct task_struct * p));
 204
 205 static void reschedule_idle(struct task_struct * p)
 206 {
 207 #ifdef CONFIG_SMP
 208         int this_cpu = smp_processor_id();
 209         struct task_struct *tsk, *target_tsk;
 210         int cpu, best_cpu, i, max_prio;
 211         cycles_t oldest_idle;
 212
 213         /*
 214          * shortcut if the woken up task's last CPU is
 215          * idle now.
 216          */
 217         best_cpu = p->processor;
 218         if (can_schedule(p, best_cpu)) {
 219                 tsk = idle_task(best_cpu);
 220                 if (cpu_curr(best_cpu) == tsk) {
 221                         int need_resched;
 222 send_now_idle:
 223                         /*
 224                          * If need_resched == -1 then we can skip sending
 225                          * the IPI altogether, tsk->need_resched is
 226                          * actively watched by the idle thread.
 227                          */
 228                         need_resched = tsk->need_resched;
 229                         tsk->need_resched = 1;
 230                         if ((best_cpu != this_cpu) && !need_resched)
 231                                 smp_send_reschedule(best_cpu);
 232                         return;
 233                 }
 234         }
 235
 236         /*
 237          * We know that the preferred CPU has a cache-affine current
 238          * process, lets try to find a new idle CPU for the woken-up
 239          * process. Select the least recently active idle CPU. (that
 240          * one will have the least active cache context.) Also find
 241          * the executing process which has the least priority.
 242          */
 243         oldest_idle = (cycles_t) -1;
 244         target_tsk = NULL;
 245         max_prio = 1;
 246
 247         for (i = 0; i < smp_num_cpus; i++) {
 248                 cpu = cpu_logical_map(i);
 249                 if (!can_schedule(p, cpu))
 250                         continue;
 251                 tsk = cpu_curr(cpu);
 252                 /*
 253                  * We use the first available idle CPU. This creates
 254                  * a priority list between idle CPUs, but this is not
 255                  * a problem.
 256                  */
 257                 if (tsk == idle_task(cpu)) {
 258                         if (last_schedule(cpu) < oldest_idle) {
 259                                 oldest_idle = last_schedule(cpu);
 260                                 target_tsk = tsk;
 261                         }
 262                 } else {
 263                         if (oldest_idle == -1ULL) {
 264                                 int prio = preemption_goodness(tsk, p, cpu);
 265
 266                                 if (prio > max_prio) {
 267                                         max_prio = prio;
 268                                         target_tsk = tsk;
 269                                 }
 270                         }
 271                 }
 272         }
 273         tsk = target_tsk;
 274         if (tsk) {
 275                 if (oldest_idle != -1ULL)
 276                         goto send_now_idle;
 277                 tsk->need_resched = 1;
 278                 if (tsk->processor != this_cpu)
 279                         smp_send_reschedule(tsk->processor);
 280         }
 281         return;
 282
 283
 284 #else /* UP */
 285         int this_cpu = smp_processor_id();
 286         struct task_struct *tsk;
 287
 288         tsk = cpu_curr(this_cpu);
 289         if (preemption_goodness(tsk, p, this_cpu) > 1)
 290                 tsk->need_resched = 1;
 291 #endif
 292 }
 293
 294 /*
 295  * Careful!
 296  *
 297  * This has to add the process to the _beginning_ of the
 298  * run-queue, not the end. See the comment about "This is
 299  * subtle" in the scheduler proper..
 300  */
 301 static inline void add_to_runqueue(struct task_struct * p)
 302 {
 303         list_add(&p->run_list, &runqueue_head);
 304         nr_running++;
 305 }
 306
 307 static inline void move_last_runqueue(struct task_struct * p)
 308 {
 309         list_del(&p->run_list);
 310         list_add_tail(&p->run_list, &runqueue_head);
 311 }
 312
 313 static inline void move_first_runqueue(struct task_struct * p)
 314 {
 315         list_del(&p->run_list);
 316         list_add(&p->run_list, &runqueue_head);
 317 }
 318
 319 /*
 320  * Wake up a process. Put it on the run-queue if it's not
 321  * already there.  The "current" process is always on the
 322  * run-queue (except when the actual re-schedule is in
 323  * progress), and as such you're allowed to do the simpler
 324  * "current->state = TASK_RUNNING" to mark yourself runnable
 325  * without the overhead of this.
 326  */
 327 inline void wake_up_process(struct task_struct * p)
 328 {
 329         unsigned long flags;
 330
 331         /*
 332          * We want the common case fall through straight, thus the goto.
 333          */
 334         spin_lock_irqsave(&runqueue_lock, flags);
 335         p->state = TASK_RUNNING;
 336         if (task_on_runqueue(p))
 337                 goto out;
 338         add_to_runqueue(p);
 339         reschedule_idle(p);
 340 out:
 341         spin_unlock_irqrestore(&runqueue_lock, flags);
 342 }
 343
 344 static inline void wake_up_process_synchronous(struct task_struct * p)
 345 {
 346         unsigned long flags;
 347
 348         /*
 349          * We want the common case fall through straight, thus the goto.
 350          */
 351         spin_lock_irqsave(&runqueue_lock, flags);
 352         p->state = TASK_RUNNING;
 353         if (task_on_runqueue(p))
 354                 goto out;
 355         add_to_runqueue(p);
 356 out:
 357         spin_unlock_irqrestore(&runqueue_lock, flags);
 358 }
 359
 360 static void process_timeout(unsigned long __data)
 361 {
 362         struct task_struct * p = (struct task_struct *) __data;
 363
 364         wake_up_process(p);
 365 }
 366
 367 signed long schedule_timeout(signed long timeout)
 368 {
 369         struct timer_list timer;
 370         unsigned long expire;
 371
 372         switch (timeout)
 373         {
 374         case MAX_SCHEDULE_TIMEOUT:
 375                 /*
 376                  * These two special cases are useful to be comfortable
 377                  * in the caller. Nothing more. We could take
 378                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 379                  * but I' d like to return a valid offset (>=0) to allow
 380                  * the caller to do everything it want with the retval.
 381                  */
 382                 schedule();
 383                 goto out;
 384         default:
 385                 /*
 386                  * Another bit of PARANOID. Note that the retval will be
 387                  * 0 since no piece of kernel is supposed to do a check
 388                  * for a negative retval of schedule_timeout() (since it
 389                  * should never happens anyway). You just have the printk()
 390                  * that will tell you if something is gone wrong and where.
 391                  */
 392                 if (timeout < 0)
 393                 {
 394                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 395                                "value %lx from %p\n", timeout,
 396                                __builtin_return_address(0));
 397                         current->state = TASK_RUNNING;
 398                         goto out;
 399                 }
 400         }
 401
 402         expire = timeout + jiffies;
 403
 404         init_timer(&timer);
 405         timer.expires = expire;
 406         timer.data = (unsigned long) current;
 407         timer.function = process_timeout;
 408
 409         add_timer(&timer);
 410         schedule();
 411         del_timer_sync(&timer);
 412
 413         timeout = expire - jiffies;
 414
 415  out:
 416         return timeout < 0 ? 0 : timeout;
 417 }
 418
 419 /*
 420  * schedule_tail() is getting called from the fork return path. This
 421  * cleans up all remaining scheduler things, without impacting the
 422  * common case.
 423  */
 424 static inline void __schedule_tail(struct task_struct *prev)
 425 {
 426 #ifdef CONFIG_SMP
 427         int policy;
 428
 429         /*
 430          * prev->policy can be written from here only before `prev'
 431          * can be scheduled (before setting prev->has_cpu to zero).
 432          * Of course it must also be read before allowing prev
 433          * to be rescheduled, but since the write depends on the read
 434          * to complete, wmb() is enough. (the spin_lock() acquired
 435          * before setting has_cpu is not enough because the spin_lock()
 436          * common code semantics allows code outside the critical section
 437          * to enter inside the critical section)
 438          */
 439         policy = prev->policy;
 440         prev->policy = policy & ~SCHED_YIELD;
 441         wmb();
 442
 443         /*
 444          * fast path falls through. We have to clear has_cpu before
 445          * checking prev->state to avoid a wakeup race - thus we
 446          * also have to protect against the task exiting early.
 447          */
 448         task_lock(prev);
 449         prev->has_cpu = 0;
 450         mb();
 451         if (prev->state == TASK_RUNNING)
 452                 goto needs_resched;
 453
 454 out_unlock:
 455         task_unlock(prev);
 456         return;
 457
 458         /*
 459          * Slow path - we 'push' the previous process and
 460          * reschedule_idle() will attempt to find a new
 461          * processor for it. (but it might preempt the
 462          * current process as well.) We must take the runqueue
 463          * lock and re-check prev->state to be correct. It might
 464          * still happen that this process has a preemption
 465          * 'in progress' already - but this is not a problem and
 466          * might happen in other circumstances as well.
 467          */
 468 needs_resched:
 469         {
 470                 unsigned long flags;
 471
 472                 /*
 473                  * Avoid taking the runqueue lock in cases where
 474                  * no preemption-check is necessery:
 475                  */
 476                 if ((prev == idle_task(smp_processor_id())) ||
 477                                                 (policy & SCHED_YIELD))
 478                         goto out_unlock;
 479
 480                 spin_lock_irqsave(&runqueue_lock, flags);
 481                 if (prev->state == TASK_RUNNING)
 482                         reschedule_idle(prev);
 483                 spin_unlock_irqrestore(&runqueue_lock, flags);
 484                 goto out_unlock;
 485         }
 486 #else
 487         prev->policy &= ~SCHED_YIELD;
 488 #endif /* CONFIG_SMP */
 489 }
 490
 491 void schedule_tail(struct task_struct *prev)
 492 {
 493         __schedule_tail(prev);
 494 }
 495
 496 /*
 497  *  'schedule()' is the scheduler function. It's a very simple and nice
 498  * scheduler: it's not perfect, but certainly works for most things.
 499  *
 500  * The goto is "interesting".
 501  *
 502  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 503  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 504  * information in task[0] is never used.
 505  */
 506 asmlinkage void schedule(void)
 507 {
 508         struct schedule_data * sched_data;
 509         struct task_struct *prev, *next, *p;
 510         struct list_head *tmp;
 511         int this_cpu, c;
 512
 513         if (!current->active_mm) BUG();
 514         if (tq_scheduler)
 515                 goto handle_tq_scheduler;
 516 tq_scheduler_back:
 517
 518         prev = current;
 519         this_cpu = prev->processor;
 520
 521         if (in_interrupt())
 522                 goto scheduling_in_interrupt;
 523
 524         release_kernel_lock(prev, this_cpu);
 525
 526         /* Do "administrative" work here while we don't hold any locks */
 527         if (softirq_active(this_cpu) & softirq_mask(this_cpu))
 528                 goto handle_softirq;
 529 handle_softirq_back:
 530
 531         /*
 532          * 'sched_data' is protected by the fact that we can run
 533          * only one process per CPU.
 534          */
 535         sched_data = & aligned_data[this_cpu].schedule_data;
 536
 537         spin_lock_irq(&runqueue_lock);
 538
 539         /* move an exhausted RR process to be last.. */
 540         if (prev->policy == SCHED_RR)
 541                 goto move_rr_last;
 542 move_rr_back:
 543
 544         switch (prev->state) {
 545                 case TASK_INTERRUPTIBLE:
 546                         if (signal_pending(prev)) {
 547                                 prev->state = TASK_RUNNING;
 548                                 break;
 549                         }
 550                 default:
 551                         del_from_runqueue(prev);
 552                 case TASK_RUNNING:
 553         }
 554         prev->need_resched = 0;
 555
 556         /*
 557          * this is the scheduler proper:
 558          */
 559
 560 repeat_schedule:
 561         /*
 562          * Default process to select..
 563          */
 564         next = idle_task(this_cpu);
 565         c = -1000;
 566         if (prev->state == TASK_RUNNING)
 567                 goto still_running;
 568
 569 still_running_back:
 570         list_for_each(tmp, &runqueue_head) {
 571                 p = list_entry(tmp, struct task_struct, run_list);
 572                 if (can_schedule(p, this_cpu)) {
 573                         int weight = goodness(p, this_cpu, prev->active_mm);
 574                         if (weight > c)
 575                                 c = weight, next = p;
 576                 }
 577         }
 578
 579         /* Do we need to re-calculate counters? */
 580         if (!c)
 581                 goto recalculate;
 582         /*
 583          * from this point on nothing can prevent us from
 584          * switching to the next task, save this fact in
 585          * sched_data.
 586          */
 587         sched_data->curr = next;
 588 #ifdef CONFIG_SMP
 589         next->has_cpu = 1;
 590         next->processor = this_cpu;
 591 #endif
 592         spin_unlock_irq(&runqueue_lock);
 593
 594         if (prev == next)
 595                 goto same_process;
 596
 597 #ifdef CONFIG_SMP
 598         /*
 599          * maintain the per-process 'last schedule' value.
 600          * (this has to be recalculated even if we reschedule to
 601          * the same process) Currently this is only used on SMP,
 602          * and it's approximate, so we do not have to maintain
 603          * it while holding the runqueue spinlock.
 604          */
 605         sched_data->last_schedule = get_cycles();
 606
 607         /*
 608          * We drop the scheduler lock early (it's a global spinlock),
 609          * thus we have to lock the previous process from getting
 610          * rescheduled during switch_to().
 611          */
 612
 613 #endif /* CONFIG_SMP */
 614
 615         kstat.context_swtch++;
 616         /*
 617          * there are 3 processes which are affected by a context switch:
 618          *
 619          * prev == .... ==> (last => next)
 620          *
 621          * It's the 'much more previous' 'prev' that is on next's stack,
 622          * but prev is set to (the just run) 'last' process by switch_to().
 623          * This might sound slightly confusing but makes tons of sense.
 624          */
 625         prepare_to_switch();
 626         {
 627                 struct mm_struct *mm = next->mm;
 628                 struct mm_struct *oldmm = prev->active_mm;
 629                 if (!mm) {
 630                         if (next->active_mm) BUG();
 631                         next->active_mm = oldmm;
 632                         atomic_inc(&oldmm->mm_count);
 633                         enter_lazy_tlb(oldmm, next, this_cpu);
 634                 } else {
 635                         if (next->active_mm != mm) BUG();
 636                         switch_mm(oldmm, mm, next, this_cpu);
 637                 }
 638
 639                 if (!prev->mm) {
 640                         prev->active_mm = NULL;
 641                         mmdrop(oldmm);
 642                 }
 643         }
 644
 645         /*
 646          * This just switches the register state and the
 647          * stack.
 648          */
 649         switch_to(prev, next, prev);
 650         __schedule_tail(prev);
 651
 652 same_process:
 653         reacquire_kernel_lock(current);
 654         if (current->need_resched)
 655                 goto tq_scheduler_back;
 656
 657         return;
 658
 659 recalculate:
 660         {
 661                 struct task_struct *p;
 662                 spin_unlock_irq(&runqueue_lock);
 663                 read_lock(&tasklist_lock);
 664                 for_each_task(p)
 665                         p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
 666                 read_unlock(&tasklist_lock);
 667                 spin_lock_irq(&runqueue_lock);
 668         }
 669         goto repeat_schedule;
 670
 671 still_running:
 672         c = goodness(prev, this_cpu, prev->active_mm);
 673         next = prev;
 674         goto still_running_back;
 675
 676 handle_softirq:
 677         do_softirq();
 678         goto handle_softirq_back;
 679
 680 handle_tq_scheduler:
 681         /*
 682          * do not run the task queue with disabled interrupts,
 683          * cli() wouldn't work on SMP
 684          */
 685         sti();
 686         run_task_queue(&tq_scheduler);
 687         goto tq_scheduler_back;
 688
 689 move_rr_last:
 690         if (!prev->counter) {
 691                 prev->counter = NICE_TO_TICKS(prev->nice);
 692                 move_last_runqueue(prev);
 693         }
 694         goto move_rr_back;
 695
 696 scheduling_in_interrupt:
 697         printk("Scheduling in interrupt\n");
 698         BUG();
 699         return;
 700 }
 701
 702 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
 703                                      unsigned int wq_mode, const int sync)
 704 {
 705         struct list_head *tmp, *head;
 706         struct task_struct *p, *best_exclusive;
 707         unsigned long flags;
 708         int best_cpu, irq;
 709
 710         if (!q)
 711                 goto out;
 712
 713         best_cpu = smp_processor_id();
 714         irq = in_interrupt();
 715         best_exclusive = NULL;
 716         wq_write_lock_irqsave(&q->lock, flags);
 717
 718 #if WAITQUEUE_DEBUG
 719         CHECK_MAGIC_WQHEAD(q);
 720 #endif
 721
 722         head = &q->task_list;
 723 #if WAITQUEUE_DEBUG
 724         if (!head->next || !head->prev)
 725                 WQ_BUG();
 726 #endif
 727         tmp = head->next;
 728         while (tmp != head) {
 729                 unsigned int state;
 730                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 731
 732                 tmp = tmp->next;
 733
 734 #if WAITQUEUE_DEBUG
 735                 CHECK_MAGIC(curr->__magic);
 736 #endif
 737                 p = curr->task;
 738                 state = p->state;
 739                 if (state & mode) {
 740 #if WAITQUEUE_DEBUG
 741                         curr->__waker = (long)__builtin_return_address(0);
 742 #endif
 743                         /*
 744                          * If waking up from an interrupt context then
 745                          * prefer processes which are affine to this
 746                          * CPU.
 747                          */
 748                         if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
 749                                 if (!best_exclusive)
 750                                         best_exclusive = p;
 751                                 if (p->processor == best_cpu) {
 752                                         best_exclusive = p;
 753                                         break;
 754                                 }
 755                         } else {
 756                                 if (sync)
 757                                         wake_up_process_synchronous(p);
 758                                 else
 759                                         wake_up_process(p);
 760                                 if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
 761                                         break;
 762                         }
 763                 }
 764         }
 765         if (best_exclusive) {
 766                 if (sync)
 767                         wake_up_process_synchronous(best_exclusive);
 768                 else
 769                         wake_up_process(best_exclusive);
 770         }
 771         wq_write_unlock_irqrestore(&q->lock, flags);
 772 out:
 773         return;
 774 }
 775
 776 void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
 777 {
 778         __wake_up_common(q, mode, wq_mode, 0);
 779 }
 780
 781 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
 782 {
 783         __wake_up_common(q, mode, wq_mode, 1);
 784 }
 785
 786 #define SLEEP_ON_VAR                            \
 787         unsigned long flags;                    \
 788         wait_queue_t wait;                      \
 789         init_waitqueue_entry(&wait, current);
 790
 791 #define SLEEP_ON_HEAD                                   \
 792         wq_write_lock_irqsave(&q->lock,flags);          \
 793         __add_wait_queue(q, &wait);                     \
 794         wq_write_unlock(&q->lock);
 795
 796 #define SLEEP_ON_TAIL                                           \
 797         wq_write_lock_irq(&q->lock);                            \
 798         __remove_wait_queue(q, &wait);                          \
 799         wq_write_unlock_irqrestore(&q->lock,flags);
 800
 801 void interruptible_sleep_on(wait_queue_head_t *q)
 802 {
 803         SLEEP_ON_VAR
 804
 805         current->state = TASK_INTERRUPTIBLE;
 806
 807         SLEEP_ON_HEAD
 808         schedule();
 809         SLEEP_ON_TAIL
 810 }
 811
 812 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 813 {
 814         SLEEP_ON_VAR
 815
 816         current->state = TASK_INTERRUPTIBLE;
 817
 818         SLEEP_ON_HEAD
 819         timeout = schedule_timeout(timeout);
 820         SLEEP_ON_TAIL
 821
 822         return timeout;
 823 }
 824
 825 void sleep_on(wait_queue_head_t *q)
 826 {
 827         SLEEP_ON_VAR
 828
 829         current->state = TASK_UNINTERRUPTIBLE;
 830
 831         SLEEP_ON_HEAD
 832         schedule();
 833         SLEEP_ON_TAIL
 834 }
 835
 836 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 837 {
 838         SLEEP_ON_VAR
 839
 840         current->state = TASK_UNINTERRUPTIBLE;
 841
 842         SLEEP_ON_HEAD
 843         timeout = schedule_timeout(timeout);
 844         SLEEP_ON_TAIL
 845
 846         return timeout;
 847 }
 848
 849 void scheduling_functions_end_here(void) { }
 850
 851 #ifndef __alpha__
 852
 853 /*
 854  * This has been replaced by sys_setpriority.  Maybe it should be
 855  * moved into the arch dependent tree for those ports that require
 856  * it for backward compatibility?
 857  */
 858
 859 asmlinkage long sys_nice(int increment)
 860 {
 861         long newprio;
 862
 863         /*
 864          *      Setpriority might change our priority at the same moment.
 865          *      We don't have to worry. Conceptually one call occurs first
 866          *      and we have a single winner.
 867          */
 868         if (increment < 0) {
 869                 if (!capable(CAP_SYS_NICE))
 870                         return -EPERM;
 871                 if (increment < -40)
 872                         increment = -40;
 873         }
 874         if (increment > 40)
 875                 increment = 40;
 876
 877         newprio = current->nice + increment;
 878         if (newprio < -20)
 879                 newprio = -20;
 880         if (newprio > 19)
 881                 newprio = 19;
 882         current->nice = newprio;
 883         return 0;
 884 }
 885
 886 #endif
 887
 888 static inline struct task_struct *find_process_by_pid(pid_t pid)
 889 {
 890         struct task_struct *tsk = current;
 891
 892         if (pid)
 893                 tsk = find_task_by_pid(pid);
 894         return tsk;
 895 }
 896
 897 static int setscheduler(pid_t pid, int policy,
 898                         struct sched_param *param)
 899 {
 900         struct sched_param lp;
 901         struct task_struct *p;
 902         int retval;
 903
 904         retval = -EINVAL;
 905         if (!param || pid < 0)
 906                 goto out_nounlock;
 907
 908         retval = -EFAULT;
 909         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 910                 goto out_nounlock;
 911
 912         /*
 913          * We play safe to avoid deadlocks.
 914          */
 915         read_lock_irq(&tasklist_lock);
 916         spin_lock(&runqueue_lock);
 917
 918         p = find_process_by_pid(pid);
 919
 920         retval = -ESRCH;
 921         if (!p)
 922                 goto out_unlock;
 923
 924         if (policy < 0)
 925                 policy = p->policy;
 926         else {
 927                 retval = -EINVAL;
 928                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
 929                                 policy != SCHED_OTHER)
 930                         goto out_unlock;
 931         }
 932
 933         /*
 934          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 935          * priority for SCHED_OTHER is 0.
 936          */
 937         retval = -EINVAL;
 938         if (lp.sched_priority < 0 || lp.sched_priority > 99)
 939                 goto out_unlock;
 940         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 941                 goto out_unlock;
 942
 943         retval = -EPERM;
 944         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 945             !capable(CAP_SYS_NICE))
 946                 goto out_unlock;
 947         if ((current->euid != p->euid) && (current->euid != p->uid) &&
 948             !capable(CAP_SYS_NICE))
 949                 goto out_unlock;
 950
 951         retval = 0;
 952         p->policy = policy;
 953         p->rt_priority = lp.sched_priority;
 954         if (task_on_runqueue(p))
 955                 move_first_runqueue(p);
 956
 957         current->need_resched = 1;
 958
 959 out_unlock:
 960         spin_unlock(&runqueue_lock);
 961         read_unlock_irq(&tasklist_lock);
 962
 963 out_nounlock:
 964         return retval;
 965 }
 966
 967 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 968                                       struct sched_param *param)
 969 {
 970         return setscheduler(pid, policy, param);
 971 }
 972
 973 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
 974 {
 975         return setscheduler(pid, -1, param);
 976 }
 977
 978 asmlinkage long sys_sched_getscheduler(pid_t pid)
 979 {
 980         struct task_struct *p;
 981         int retval;
 982
 983         retval = -EINVAL;
 984         if (pid < 0)
 985                 goto out_nounlock;
 986
 987         retval = -ESRCH;
 988         read_lock(&tasklist_lock);
 989         p = find_process_by_pid(pid);
 990         if (p)
 991                 retval = p->policy & ~SCHED_YIELD;
 992         read_unlock(&tasklist_lock);
 993
 994 out_nounlock:
 995         return retval;
 996 }
 997
 998 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 999 {
1000         struct task_struct *p;
1001         struct sched_param lp;
1002         int retval;
1003
1004         retval = -EINVAL;
1005         if (!param || pid < 0)
1006                 goto out_nounlock;
1007
1008         read_lock(&tasklist_lock);
1009         p = find_process_by_pid(pid);
1010         retval = -ESRCH;
1011         if (!p)
1012                 goto out_unlock;
1013         lp.sched_priority = p->rt_priority;
1014         read_unlock(&tasklist_lock);
1015
1016         /*
1017          * This one might sleep, we cannot do it with a spinlock held ...
1018          */
1019         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1020
1021 out_nounlock:
1022         return retval;
1023
1024 out_unlock:
1025         read_unlock(&tasklist_lock);
1026         return retval;
1027 }
1028
1029 asmlinkage long sys_sched_yield(void)
1030 {
1031         /*
1032          * Trick. sched_yield() first counts the number of truly
1033          * 'pending' runnable processes, then returns if it's
1034          * only the current processes. (This test does not have
1035          * to be atomic.) In threaded applications this optimization
1036          * gets triggered quite often.
1037          */
1038
1039         int nr_pending = nr_running;
1040
1041 #if CONFIG_SMP
1042         int i;
1043
1044         // Substract non-idle processes running on other CPUs.
1045         for (i = 0; i < smp_num_cpus; i++)
1046                 if (aligned_data[i].schedule_data.curr != idle_task(i))
1047                         nr_pending--;
1048 #else
1049         // on UP this process is on the runqueue as well
1050         nr_pending--;
1051 #endif
1052         if (nr_pending) {
1053                 /*
1054                  * This process can only be rescheduled by us,
1055                  * so this is safe without any locking.
1056                  */
1057                 if (current->policy == SCHED_OTHER)
1058                         current->policy |= SCHED_YIELD;
1059                 current->need_resched = 1;
1060         }
1061         return 0;
1062 }
1063
1064 asmlinkage long sys_sched_get_priority_max(int policy)
1065 {
1066         int ret = -EINVAL;
1067
1068         switch (policy) {
1069         case SCHED_FIFO:
1070         case SCHED_RR:
1071                 ret = 99;
1072                 break;
1073         case SCHED_OTHER:
1074                 ret = 0;
1075                 break;
1076         }
1077         return ret;
1078 }
1079
1080 asmlinkage long sys_sched_get_priority_min(int policy)
1081 {
1082         int ret = -EINVAL;
1083
1084         switch (policy) {
1085         case SCHED_FIFO:
1086         case SCHED_RR:
1087                 ret = 1;
1088                 break;
1089         case SCHED_OTHER:
1090                 ret = 0;
1091         }
1092         return ret;
1093 }
1094
1095 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1096 {
1097         struct timespec t;
1098         struct task_struct *p;
1099         int retval = -EINVAL;
1100
1101         if (pid < 0)
1102                 goto out_nounlock;
1103
1104         retval = -ESRCH;
1105         read_lock(&tasklist_lock);
1106         p = find_process_by_pid(pid);
1107         if (p)
1108                 jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
1109                                     &t);
1110         read_unlock(&tasklist_lock);
1111         if (p)
1112                 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1113 out_nounlock:
1114         return retval;
1115 }
1116
1117 static void show_task(struct task_struct * p)
1118 {
1119         unsigned long free = 0;
1120         int state;
1121         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1122
1123         printk("%-8s  ", p->comm);
1124         state = p->state ? ffz(~p->state) + 1 : 0;
1125         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1126                 printk(stat_nam[state]);
1127         else
1128                 printk(" ");
1129 #if (BITS_PER_LONG == 32)
1130         if (p == current)
1131                 printk(" current  ");
1132         else
1133                 printk(" %08lX ", thread_saved_pc(&p->thread));
1134 #else
1135         if (p == current)
1136                 printk("   current task   ");
1137         else
1138                 printk(" %016lx ", thread_saved_pc(&p->thread));
1139 #endif
1140         {
1141                 unsigned long * n = (unsigned long *) (p+1);
1142                 while (!*n)
1143                         n++;
1144                 free = (unsigned long) n - (unsigned long)(p+1);
1145         }
1146         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1147         if (p->p_cptr)
1148                 printk("%5d ", p->p_cptr->pid);
1149         else
1150                 printk("      ");
1151         if (!p->mm)
1152                 printk(" (L-TLB) ");
1153         else
1154                 printk(" (NOTLB) ");
1155         if (p->p_ysptr)
1156                 printk("%7d", p->p_ysptr->pid);
1157         else
1158                 printk("       ");
1159         if (p->p_osptr)
1160                 printk(" %5d\n", p->p_osptr->pid);
1161         else
1162                 printk("\n");
1163
1164         {
1165                 struct sigqueue *q;
1166                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1167
1168                 render_sigset_t(&p->pending.signal, s);
1169                 render_sigset_t(&p->blocked, b);
1170                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1171                 for (q = p->pending.head; q ; q = q->next)
1172                         printk(" %d", q->info.si_signo);
1173                 printk(" X\n");
1174         }
1175 }
1176
1177 char * render_sigset_t(sigset_t *set, char *buffer)
1178 {
1179         int i = _NSIG, x;
1180         do {
1181                 i -= 4, x = 0;
1182                 if (sigismember(set, i+1)) x |= 1;
1183                 if (sigismember(set, i+2)) x |= 2;
1184                 if (sigismember(set, i+3)) x |= 4;
1185                 if (sigismember(set, i+4)) x |= 8;
1186                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1187         } while (i >= 4);
1188         *buffer = 0;
1189         return buffer;
1190 }
1191
1192 void show_state(void)
1193 {
1194         struct task_struct *p;
1195
1196 #if (BITS_PER_LONG == 32)
1197         printk("\n"
1198                "                         free                        sibling\n");
1199         printk("  task             PC    stack   pid father child younger older\n");
1200 #else
1201         printk("\n"
1202                "                                 free                        sibling\n");
1203         printk("  task                 PC        stack   pid father child younger older\n");
1204 #endif
1205         read_lock(&tasklist_lock);
1206         for_each_task(p)
1207                 show_task(p);
1208         read_unlock(&tasklist_lock);
1209 }
1210
1211 /*
1212  *      Put all the gunge required to become a kernel thread without
1213  *      attached user resources in one place where it belongs.
1214  */
1215
1216 void daemonize(void)
1217 {
1218         struct fs_struct *fs;
1219
1220
1221         /*
1222          * If we were started as result of loading a module, close all of the
1223          * user space pages.  We don't need them, and if we didn't close them
1224          * they would be locked into memory.
1225          */
1226         exit_mm(current);
1227
1228         current->session = 1;
1229         current->pgrp = 1;
1230
1231         /* Become as one with the init task */
1232
1233         exit_fs(current);       /* current->fs->count--; */
1234         fs = init_task.fs;
1235         current->fs = fs;
1236         atomic_inc(&fs->count);
1237         exit_files(current);
1238         current->files = init_task.files;
1239         atomic_inc(&current->files->count);
1240 }
1241
1242 void __init init_idle(void)
1243 {
1244         struct schedule_data * sched_data;
1245         sched_data = &aligned_data[smp_processor_id()].schedule_data;
1246
1247         if (current != &init_task && task_on_runqueue(current)) {
1248                 printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1249                         smp_processor_id(), current->pid);
1250                 del_from_runqueue(current);
1251         }
1252         sched_data->curr = current;
1253         sched_data->last_schedule = get_cycles();
1254 }
1255
1256 extern void init_timervecs (void);
1257
1258 void __init sched_init(void)
1259 {
1260         /*
1261          * We have to do a little magic to get the first
1262          * process right in SMP mode.
1263          */
1264         int cpu = smp_processor_id();
1265         int nr;
1266
1267         init_task.processor = cpu;
1268
1269         for(nr = 0; nr < PIDHASH_SZ; nr++)
1270                 pidhash[nr] = NULL;
1271
1272         init_timervecs();
1273
1274         init_bh(TIMER_BH, timer_bh);
1275         init_bh(TQUEUE_BH, tqueue_bh);
1276         init_bh(IMMEDIATE_BH, immediate_bh);
1277
1278         /*
1279          * The boot idle thread does lazy MMU switching as well:
1280          */
1281         atomic_inc(&init_mm.mm_count);
1282         enter_lazy_tlb(&init_mm, current, cpu);
1283 }