kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
  13  */
  14
  15 /*
  16  * 'sched.c' is the main kernel file. It contains scheduling primitives
  17  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  18  * call functions (type getpid()), which just extract a field from
  19  * current-task
  20  */
  21
  22 #include <linux/mm.h>
  23 #include <linux/init.h>
  24 #include <linux/smp_lock.h>
  25 #include <linux/interrupt.h>
  26 #include <linux/kernel_stat.h>
  27
  28 #include <asm/uaccess.h>
  29 #include <asm/mmu_context.h>
  30
  31
  32 extern void timer_bh(void);
  33 extern void tqueue_bh(void);
  34 extern void immediate_bh(void);
  35
   36 /*
   37  * scheduler variables
   38  */
   39
      /*
       * System-wide security settings word. It starts out as
       * SECUREBITS_DEFAULT; nothing in the visible part of this file reads or
       * changes it afterwards.
       */
   40 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
   41
      /* Defined elsewhere; not called from the visible part of this file. */
   42 extern void mem_use(void);
  43
   44 /*
   45  *      Init task must be ok at boot for the ix86 as we will check its signals
   46  *      via the SMP irq return path.
   47  */
   48
      /*
       * Table of NR_CPUS task pointers. Only slot 0 is filled in statically
       * (with init_task); the remaining slots are zero-initialised.
       * On SMP builds idle_task() indexes this table with
       * cpu_number_map(cpu) to find a CPU's idle task.
       */
   49 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
  50
   51 /*
   52  * The tasklist_lock protects the linked list of processes.
   53  *
   54  * The scheduler lock is protecting against multiple entry
   55  * into the scheduling code, and doesn't need to worry
   56  * about interrupts (because interrupts cannot call the
   57  * scheduler).
   58  *
   59  * The run-queue lock locks the parts that actually access
   60  * and change the run-queues, and have to be interrupt-safe.
   61  */
      /*
       * NOTE(review): the "second"/"third" tags presumably give each lock's
       * place in the lock-acquisition order. setscheduler() in this file does
       * take runqueue_lock before tasklist_lock, which matches. The lock that
       * would be "first" is not named here - confirm before relying on it.
       */
   62 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
   63 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;      /* third */
   64
      /*
       * Head of the run-queue: the list of runnable tasks, linked through
       * task_struct.run_list. Every visible access happens with
       * runqueue_lock held.
       */
   65 static LIST_HEAD(runqueue_head);
  66
   67 /*
   68  * We align per-CPU scheduling data on cacheline boundaries,
   69  * to prevent cacheline ping-pong.
   70  */
      /*
       * One element per CPU. The union with __pad makes every element at
       * least SMP_CACHE_BYTES large. Only element 0 is initialised
       * explicitly: curr = &init_task, last_schedule = 0.
       */
   71 static union {
   72         struct schedule_data {
                      /* task that schedule() last selected to run on this CPU */
   73                 struct task_struct * curr;
                      /* get_cycles() value recorded at the last context switch (SMP) */
   74                 cycles_t last_schedule;
   75         } schedule_data;
   76         char __pad [SMP_CACHE_BYTES];
   77 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
   78
      /* cpu_curr(cpu): the 'curr' task recorded for 'cpu'; usable as an lvalue. */
   79 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
   80
      /*
       * Kernel statistics block, zero-initialised. schedule() increments
       * kstat.context_swtch on every context switch.
       */
   81 struct kernel_stat kstat = { 0 };
   82
      /*
       * idle_task(cpu):   the idle task belonging to 'cpu'.
       * can_schedule(p):  non-zero when schedule() may pick task 'p'.
       *
       * SMP: the idle task is looked up in init_tasks[], and a task whose
       *      has_cpu flag is set is not eligible. has_cpu is set by
       *      schedule() on the task it switches to and cleared again in
       *      __schedule_tail().
       * UP:  init_task is the only idle task and every task is eligible.
       */
   83 #ifdef __SMP__
   84
   85 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
   86 #define can_schedule(p) (!(p)->has_cpu)
   87
   88 #else
   89
   90 #define idle_task(cpu) (&init_task)
   91 #define can_schedule(p) (1)
   92
   93 #endif
   94
      /*
       * Empty marker function placed ahead of the scheduling primitives in
       * this file; scheduling_functions_end_here() is its counterpart.
       * NOTE(review): presumably the pair lets other code test whether an
       * address lies inside the scheduler - no user is visible here.
       */
   95 void scheduling_functions_start_here(void) { }
  96
  97 /*
  98  * This is the function that decides how desirable a process is..
  99  * You can weigh different processes against each other depending
 100  * on what CPU they've run on lately etc to try to handle cache
 101  * and TLB miss penalties.
 102  *
 103  * Return values:
 104  *       -1000: never select this
 105  *           0: out of time, recalculate counters (but it might still be
 106  *              selected)
 107  *         +ve: "goodness" value (the larger, the better)
 108  *       +1000: realtime process, select this.
 109  */
 110
 111 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 112 {
 113         int weight;
 114
 115         /*
 116          * Realtime process, select the first one on the
 117          * runqueue (taking priorities within processes
 118          * into account).
 119          */
 120         if (p->policy != SCHED_OTHER) {
 121                 weight = 1000 + p->rt_priority;
 122                 goto out;
 123         }
 124
 125         /*
 126          * Give the process a first-approximation goodness value
 127          * according to the number of clock-ticks it has left.
 128          *
 129          * Don't do any other calculations if the time slice is
 130          * over..
 131          */
 132         weight = p->counter;
 133         if (!weight)
 134                 goto out;
 135
 136 #ifdef __SMP__
 137         /* Give a largish advantage to the same processor...   */
 138         /* (this is equivalent to penalizing other processors) */
 139         if (p->processor == this_cpu)
 140                 weight += PROC_CHANGE_PENALTY;
 141 #endif
 142
 143         /* .. and a slight advantage to the current MM */
 144         if (p->mm == this_mm || !p->mm)
 145                 weight += 1;
 146         weight += p->priority;
 147
 148 out:
 149         return weight;
 150 }
 151
 152 /*
 153  * subtle. We want to discard a yielded process only if it's being
 154  * considered for a reschedule. Wakeup-time 'queries' of the scheduling
 155  * state do not count. Another optimization we do: sched_yield()-ed
 156  * processes are runnable (and thus will be considered for scheduling)
 157  * right when they are calling schedule(). So the only place we need
 158  * to care about SCHED_YIELD is when we calculate the previous process'
 159  * goodness ...
 160  */
 161 static inline int prev_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 162 {
 163         if (p->policy & SCHED_YIELD) {
 164                 p->policy &= ~SCHED_YIELD;
 165                 return 0;
 166         }
 167         return goodness(p, this_cpu, this_mm);
 168 }
 169
 170 /*
 171  * the 'goodness value' of replacing a process on a given CPU.
 172  * positive value means 'replace', zero or negative means 'dont'.
 173  */
 174 static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 175 {
 176         return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 177 }
 178
 179 /*
 180  * This is ugly, but reschedule_idle() is very timing-critical.
 181  * We enter with the runqueue spinlock held, but we might end
 182  * up unlocking it early, so the caller must not unlock the
 183  * runqueue, it's always done by reschedule_idle().
 184  */
 185 static inline void reschedule_idle(struct task_struct * p, unsigned long flags)
 186 {
 187 #ifdef __SMP__
 188         int this_cpu = smp_processor_id(), target_cpu;
 189         struct task_struct *tsk;
 190         int cpu, best_cpu, i;
 191
 192         /*
 193          * shortcut if the woken up task's last CPU is
 194          * idle now.
 195          */
 196         best_cpu = p->processor;
 197         tsk = idle_task(best_cpu);
 198         if (cpu_curr(best_cpu) == tsk)
 199                 goto send_now;
 200
 201         /*
 202          * We know that the preferred CPU has a cache-affine current
 203          * process, lets try to find a new idle CPU for the woken-up
 204          * process:
 205          */
 206         for (i = smp_num_cpus - 1; i >= 0; i--) {
 207                 cpu = cpu_logical_map(i);
 208                 if (cpu == best_cpu)
 209                         continue;
 210                 tsk = cpu_curr(cpu);
 211                 /*
 212                  * We use the last available idle CPU. This creates
 213                  * a priority list between idle CPUs, but this is not
 214                  * a problem.
 215                  */
 216                 if (tsk == idle_task(cpu))
 217                         goto send_now;
 218         }
 219
 220         /*
 221          * No CPU is idle, but maybe this process has enough priority
 222          * to preempt it's preferred CPU.
 223          */
 224         tsk = cpu_curr(best_cpu);
 225         if (preemption_goodness(tsk, p, best_cpu) > 0)
 226                 goto send_now;
 227
 228         /*
 229          * We will get here often - or in the high CPU contention
 230          * case. No CPU is idle and this process is either lowprio or
 231          * the preferred CPU is highprio. Try to preempt some other CPU
 232          * only if it's RT or if it's iteractive and the preferred
 233          * cpu won't reschedule shortly.
 234          */
 235         if (p->avg_slice < cacheflush_time || (p->policy & ~SCHED_YIELD) != SCHED_OTHER) {
 236                 for (i = smp_num_cpus - 1; i >= 0; i--) {
 237                         cpu = cpu_logical_map(i);
 238                         if (cpu == best_cpu)
 239                                 continue;
 240                         tsk = cpu_curr(cpu);
 241                         if (preemption_goodness(tsk, p, cpu) > 0)
 242                                 goto send_now;
 243                 }
 244         }
 245
 246         spin_unlock_irqrestore(&runqueue_lock, flags);
 247         return;
 248
 249 send_now:
 250         target_cpu = tsk->processor;
 251         tsk->need_resched = 1;
 252         spin_unlock_irqrestore(&runqueue_lock, flags);
 253         /*
 254          * the APIC stuff can go outside of the lock because
 255          * it uses no task information, only CPU#.
 256          */
 257         if (target_cpu != this_cpu)
 258                 smp_send_reschedule(target_cpu);
 259         return;
 260 #else /* UP */
 261         int this_cpu = smp_processor_id();
 262         struct task_struct *tsk;
 263
 264         tsk = cpu_curr(this_cpu);
 265         if (preemption_goodness(tsk, p, this_cpu) > 0)
 266                 tsk->need_resched = 1;
 267         spin_unlock_irqrestore(&runqueue_lock, flags);
 268 #endif
 269 }
 270
 271 /*
 272  * Careful!
 273  *
 274  * This has to add the process to the _beginning_ of the
 275  * run-queue, not the end. See the comment about "This is
 276  * subtle" in the scheduler proper..
 277  */
 278 static inline void add_to_runqueue(struct task_struct * p)
 279 {
 280         list_add(&p->run_list, &runqueue_head);
 281         nr_running++;
 282 }
 283
 284 static inline void move_last_runqueue(struct task_struct * p)
 285 {
 286         list_del(&p->run_list);
 287         list_add_tail(&p->run_list, &runqueue_head);
 288 }
 289
 290 static inline void move_first_runqueue(struct task_struct * p)
 291 {
 292         list_del(&p->run_list);
 293         list_add(&p->run_list, &runqueue_head);
 294 }
 295
 296 /*
 297  * Wake up a process. Put it on the run-queue if it's not
 298  * already there.  The "current" process is always on the
 299  * run-queue (except when the actual re-schedule is in
 300  * progress), and as such you're allowed to do the simpler
 301  * "current->state = TASK_RUNNING" to mark yourself runnable
 302  * without the overhead of this.
 303  */
 304 inline void wake_up_process(struct task_struct * p)
 305 {
 306         unsigned long flags;
 307
 308         /*
 309          * We want the common case fall through straight, thus the goto.
 310          */
 311         spin_lock_irqsave(&runqueue_lock, flags);
 312         p->state = TASK_RUNNING;
 313         if (task_on_runqueue(p))
 314                 goto out;
 315         add_to_runqueue(p);
 316         reschedule_idle(p, flags); // spin_unlocks runqueue
 317
 318         return;
 319 out:
 320         spin_unlock_irqrestore(&runqueue_lock, flags);
 321 }
 322
 323 static inline void wake_up_process_synchronous(struct task_struct * p)
 324 {
 325         unsigned long flags;
 326
 327         /*
 328          * We want the common case fall through straight, thus the goto.
 329          */
 330         spin_lock_irqsave(&runqueue_lock, flags);
 331         p->state = TASK_RUNNING;
 332         if (task_on_runqueue(p))
 333                 goto out;
 334         add_to_runqueue(p);
 335 out:
 336         spin_unlock_irqrestore(&runqueue_lock, flags);
 337 }
 338
 339 static void process_timeout(unsigned long __data)
 340 {
 341         struct task_struct * p = (struct task_struct *) __data;
 342
 343         wake_up_process(p);
 344 }
 345
 346 signed long schedule_timeout(signed long timeout)
 347 {
 348         struct timer_list timer;
 349         unsigned long expire;
 350
 351         switch (timeout)
 352         {
 353         case MAX_SCHEDULE_TIMEOUT:
 354                 /*
 355                  * These two special cases are useful to be comfortable
 356                  * in the caller. Nothing more. We could take
 357                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 358                  * but I' d like to return a valid offset (>=0) to allow
 359                  * the caller to do everything it want with the retval.
 360                  */
 361                 schedule();
 362                 goto out;
 363         default:
 364                 /*
 365                  * Another bit of PARANOID. Note that the retval will be
 366                  * 0 since no piece of kernel is supposed to do a check
 367                  * for a negative retval of schedule_timeout() (since it
 368                  * should never happens anyway). You just have the printk()
 369                  * that will tell you if something is gone wrong and where.
 370                  */
 371                 if (timeout < 0)
 372                 {
 373                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 374                                "value %lx from %p\n", timeout,
 375                                __builtin_return_address(0));
 376                         current->state = TASK_RUNNING;
 377                         goto out;
 378                 }
 379         }
 380
 381         expire = timeout + jiffies;
 382
 383         init_timer(&timer);
 384         timer.expires = expire;
 385         timer.data = (unsigned long) current;
 386         timer.function = process_timeout;
 387
 388         add_timer(&timer);
 389         schedule();
 390         del_timer(&timer);
 391         /* RED-PEN. Timer may be running now on another cpu.
 392          * Pray that process will not exit enough fastly.
 393          */
 394
 395         timeout = expire - jiffies;
 396
 397  out:
 398         return timeout < 0 ? 0 : timeout;
 399 }
 400
 401 /*
 402  * schedule_tail() is getting called from the fork return path. This
 403  * cleans up all remaining scheduler things, without impacting the
 404  * common case.
 405  */
 406 static inline void __schedule_tail(struct task_struct *prev)
 407 {
 408 #ifdef __SMP__
 409         if ((prev->state == TASK_RUNNING) &&
 410                         (prev != idle_task(smp_processor_id()))) {
 411                 unsigned long flags;
 412
 413                 spin_lock_irqsave(&runqueue_lock, flags);
 414                 reschedule_idle(prev, flags); // spin_unlocks runqueue
 415         }
 416         wmb();
 417         prev->has_cpu = 0;
 418 #endif /* __SMP__ */
 419 }
 420
 421 void schedule_tail(struct task_struct *prev)
 422 {
 423         __schedule_tail(prev);
 424 }
 425
 426 /*
 427  *  'schedule()' is the scheduler function. It's a very simple and nice
 428  * scheduler: it's not perfect, but certainly works for most things.
 429  *
 430  * The goto is "interesting".
 431  *
 432  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 433  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 434  * information in task[0] is never used.
 435  */
 436 asmlinkage void schedule(void)
 437 {
 438         struct schedule_data * sched_data;
 439         struct task_struct *prev, *next, *p;
 440         struct list_head *tmp;
 441         int this_cpu, c;
 442
 443         if (!current->active_mm) BUG();
 444         if (tq_scheduler)
 445                 goto handle_tq_scheduler;
 446 tq_scheduler_back:
 447
 448         prev = current;
 449         this_cpu = prev->processor;
 450
 451         if (in_interrupt())
 452                 goto scheduling_in_interrupt;
 453
 454         release_kernel_lock(prev, this_cpu);
 455
 456         /* Do "administrative" work here while we don't hold any locks */
 457         if (softirq_state[this_cpu].active & softirq_state[this_cpu].mask)
 458                 goto handle_softirq;
 459 handle_softirq_back:
 460
 461         /*
 462          * 'sched_data' is protected by the fact that we can run
 463          * only one process per CPU.
 464          */
 465         sched_data = & aligned_data[this_cpu].schedule_data;
 466
 467         spin_lock_irq(&runqueue_lock);
 468
 469         /* move an exhausted RR process to be last.. */
 470         if (prev->policy == SCHED_RR)
 471                 goto move_rr_last;
 472 move_rr_back:
 473
 474         switch (prev->state & ~TASK_EXCLUSIVE) {
 475                 case TASK_INTERRUPTIBLE:
 476                         if (signal_pending(prev)) {
 477                                 prev->state = TASK_RUNNING;
 478                                 break;
 479                         }
 480                 default:
 481                         del_from_runqueue(prev);
 482                 case TASK_RUNNING:
 483         }
 484         prev->need_resched = 0;
 485
 486         /*
 487          * this is the scheduler proper:
 488          */
 489
 490 repeat_schedule:
 491         /*
 492          * Default process to select..
 493          */
 494         next = idle_task(this_cpu);
 495         c = -1000;
 496         if (prev->state == TASK_RUNNING)
 497                 goto still_running;
 498
 499 still_running_back:
 500         list_for_each(tmp, &runqueue_head) {
 501                 p = list_entry(tmp, struct task_struct, run_list);
 502                 if (can_schedule(p)) {
 503                         int weight = goodness(p, this_cpu, prev->active_mm);
 504                         if (weight > c)
 505                                 c = weight, next = p;
 506                 }
 507         }
 508
 509         /* Do we need to re-calculate counters? */
 510         if (!c)
 511                 goto recalculate;
 512         /*
 513          * from this point on nothing can prevent us from
 514          * switching to the next task, save this fact in
 515          * sched_data.
 516          */
 517         sched_data->curr = next;
 518 #ifdef __SMP__
 519         next->has_cpu = 1;
 520         next->processor = this_cpu;
 521 #endif
 522         spin_unlock_irq(&runqueue_lock);
 523
 524         if (prev == next)
 525                 goto same_process;
 526
 527 #ifdef __SMP__
 528         /*
 529          * maintain the per-process 'average timeslice' value.
 530          * (this has to be recalculated even if we reschedule to
 531          * the same process) Currently this is only used on SMP,
 532          * and it's approximate, so we do not have to maintain
 533          * it while holding the runqueue spinlock.
 534          */
 535         {
 536                 cycles_t t, this_slice;
 537
 538                 t = get_cycles();
 539                 this_slice = t - sched_data->last_schedule;
 540                 sched_data->last_schedule = t;
 541
 542                 /*
 543                  * Exponentially fading average calculation, with
 544                  * some weight so it doesnt get fooled easily by
 545                  * smaller irregularities.
 546                  */
 547                 prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
 548         }
 549
 550         /*
 551          * We drop the scheduler lock early (it's a global spinlock),
 552          * thus we have to lock the previous process from getting
 553          * rescheduled during switch_to().
 554          */
 555
 556 #endif /* __SMP__ */
 557
 558         kstat.context_swtch++;
 559         /*
 560          * there are 3 processes which are affected by a context switch:
 561          *
 562          * prev == .... ==> (last => next)
 563          *
 564          * It's the 'much more previous' 'prev' that is on next's stack,
 565          * but prev is set to (the just run) 'last' process by switch_to().
 566          * This might sound slightly confusing but makes tons of sense.
 567          */
 568         prepare_to_switch();
 569         {
 570                 struct mm_struct *mm = next->mm;
 571                 struct mm_struct *oldmm = prev->active_mm;
 572                 if (!mm) {
 573                         if (next->active_mm) BUG();
 574                         next->active_mm = oldmm;
 575                         atomic_inc(&oldmm->mm_count);
 576                         enter_lazy_tlb(oldmm, next, this_cpu);
 577                 } else {
 578                         if (next->active_mm != mm) BUG();
 579                         switch_mm(oldmm, mm, next, this_cpu);
 580                 }
 581
 582                 if (!prev->mm) {
 583                         prev->active_mm = NULL;
 584                         mmdrop(oldmm);
 585                 }
 586         }
 587
 588         /*
 589          * This just switches the register state and the
 590          * stack.
 591          */
 592         switch_to(prev, next, prev);
 593         __schedule_tail(prev);
 594
 595 same_process:
 596         reacquire_kernel_lock(current);
 597         return;
 598
 599 recalculate:
 600         {
 601                 struct task_struct *p;
 602                 spin_unlock_irq(&runqueue_lock);
 603                 read_lock(&tasklist_lock);
 604                 for_each_task(p)
 605                         p->counter = (p->counter >> 1) + p->priority;
 606                 read_unlock(&tasklist_lock);
 607                 spin_lock_irq(&runqueue_lock);
 608         }
 609         goto repeat_schedule;
 610
 611 still_running:
 612         c = prev_goodness(prev, this_cpu, prev->active_mm);
 613         next = prev;
 614         goto still_running_back;
 615
 616 handle_softirq:
 617         do_softirq();
 618         goto handle_softirq_back;
 619
 620 handle_tq_scheduler:
 621         run_task_queue(&tq_scheduler);
 622         goto tq_scheduler_back;
 623
 624 move_rr_last:
 625         if (!prev->counter) {
 626                 prev->counter = prev->priority;
 627                 move_last_runqueue(prev);
 628         }
 629         goto move_rr_back;
 630
 631 scheduling_in_interrupt:
 632         printk("Scheduling in interrupt\n");
 633         *(int *)0 = 0;
 634         return;
 635 }
 636
 637 static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, const int sync)
 638 {
 639         struct list_head *tmp, *head;
 640         struct task_struct *p;
 641         unsigned long flags;
 642
 643         if (!q)
 644                 goto out;
 645
 646         wq_write_lock_irqsave(&q->lock, flags);
 647
 648 #if WAITQUEUE_DEBUG
 649         CHECK_MAGIC_WQHEAD(q);
 650 #endif
 651
 652         head = &q->task_list;
 653 #if WAITQUEUE_DEBUG
 654         if (!head->next || !head->prev)
 655                 WQ_BUG();
 656 #endif
 657         list_for_each(tmp, head) {
 658                 unsigned int state;
 659                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 660
 661 #if WAITQUEUE_DEBUG
 662                 CHECK_MAGIC(curr->__magic);
 663 #endif
 664                 p = curr->task;
 665                 state = p->state;
 666                 if (state & (mode & ~TASK_EXCLUSIVE)) {
 667 #if WAITQUEUE_DEBUG
 668                         curr->__waker = (long)__builtin_return_address(0);
 669 #endif
 670                         if (sync)
 671                                 wake_up_process_synchronous(p);
 672                         else
 673                                 wake_up_process(p);
 674                         if (state & mode & TASK_EXCLUSIVE)
 675                                 break;
 676                 }
 677         }
 678         wq_write_unlock_irqrestore(&q->lock, flags);
 679 out:
 680         return;
 681 }
 682
 683 void __wake_up(wait_queue_head_t *q, unsigned int mode)
 684 {
 685         __wake_up_common(q, mode, 0);
 686 }
 687
 688 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode)
 689 {
 690         __wake_up_common(q, mode, 1);
 691 }
  692
      /*
       * Building blocks for sleeping on a wait queue. All three expand
       * to bare statements and rely on names from the enclosing function:
       * a wait queue head pointer called 'q', plus the 'flags' and 'wait'
       * variables that SLEEP_ON_VAR declares.
       *
       * SLEEP_ON_VAR declares 'flags' and 'wait' and initialises 'wait'
       * for the current task. It contains declarations followed by a
       * statement, so it has to sit where declarations are allowed.
       */
  693 #define SLEEP_ON_VAR                            \
  694         unsigned long flags;                    \
  695         wait_queue_t wait;                      \
  696         init_waitqueue_entry(&wait, current);
  697
      /*
       * SLEEP_ON_HEAD saves the interrupt state into 'flags', adds 'wait'
       * to 'q' under the queue lock, and drops the lock with the plain
       * unlock variant: the saved interrupt state is NOT restored here.
       */
  698 #define SLEEP_ON_HEAD                                   \
  699         wq_write_lock_irqsave(&q->lock,flags);          \
  700         __add_wait_queue(q, &wait);                     \
  701         wq_write_unlock(&q->lock);
  702
      /*
       * SLEEP_ON_TAIL retakes the queue lock, removes 'wait' from 'q', and
       * unlocks while restoring the interrupt state that SLEEP_ON_HEAD
       * saved in 'flags'.
       */
  703 #define SLEEP_ON_TAIL                                           \
  704         wq_write_lock_irq(&q->lock);                            \
  705         __remove_wait_queue(q, &wait);                          \
  706         wq_write_unlock_irqrestore(&q->lock,flags);
 707
 708 void interruptible_sleep_on(wait_queue_head_t *q)
 709 {
 710         SLEEP_ON_VAR
 711
 712         current->state = TASK_INTERRUPTIBLE;
 713
 714         SLEEP_ON_HEAD
 715         schedule();
 716         SLEEP_ON_TAIL
 717 }
 718
 719 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 720 {
 721         SLEEP_ON_VAR
 722
 723         current->state = TASK_INTERRUPTIBLE;
 724
 725         SLEEP_ON_HEAD
 726         timeout = schedule_timeout(timeout);
 727         SLEEP_ON_TAIL
 728
 729         return timeout;
 730 }
 731
 732 void sleep_on(wait_queue_head_t *q)
 733 {
 734         SLEEP_ON_VAR
 735
 736         current->state = TASK_UNINTERRUPTIBLE;
 737
 738         SLEEP_ON_HEAD
 739         schedule();
 740         SLEEP_ON_TAIL
 741 }
 742
 743 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 744 {
 745         SLEEP_ON_VAR
 746
 747         current->state = TASK_UNINTERRUPTIBLE;
 748
 749         SLEEP_ON_HEAD
 750         timeout = schedule_timeout(timeout);
 751         SLEEP_ON_TAIL
 752
 753         return timeout;
 754 }
  755
      /*
       * Empty marker function placed after the scheduling primitives in this
       * file; counterpart of scheduling_functions_start_here().
       */
  756 void scheduling_functions_end_here(void) { }
 757
 758 #ifndef __alpha__
 759
 760 /*
 761  * This has been replaced by sys_setpriority.  Maybe it should be
 762  * moved into the arch dependent tree for those ports that require
 763  * it for backward compatibility?
 764  */
 765
 766 asmlinkage long sys_nice(int increment)
 767 {
 768         unsigned long newprio;
 769         int increase = 0;
 770
 771         /*
 772          *      Setpriority might change our priority at the same moment.
 773          *      We don't have to worry. Conceptually one call occurs first
 774          *      and we have a single winner.
 775          */
 776
 777         newprio = increment;
 778         if (increment < 0) {
 779                 if (!capable(CAP_SYS_NICE))
 780                         return -EPERM;
 781                 newprio = -increment;
 782                 increase = 1;
 783         }
 784
 785         if (newprio > 40)
 786                 newprio = 40;
 787         /*
 788          * do a "normalization" of the priority (traditionally
 789          * Unix nice values are -20 to 20; Linux doesn't really
 790          * use that kind of thing, but uses the length of the
 791          * timeslice instead (default 200 ms). The rounding is
 792          * why we want to avoid negative values.
 793          */
 794         newprio = (newprio * DEF_PRIORITY + 10) / 20;
 795         increment = newprio;
 796         if (increase)
 797                 increment = -increment;
 798         /*
 799          *      Current->priority can change between this point
 800          *      and the assignment. We are assigning not doing add/subs
 801          *      so thats ok. Conceptually a process might just instantaneously
 802          *      read the value we stomp over. I don't think that is an issue
 803          *      unless posix makes it one. If so we can loop on changes
 804          *      to current->priority.
 805          */
 806         newprio = current->priority - increment;
 807         if ((signed) newprio < 1)
 808                 newprio = 1;
 809         if (newprio > DEF_PRIORITY*2)
 810                 newprio = DEF_PRIORITY*2;
 811         current->priority = newprio;
 812         return 0;
 813 }
 814
 815 #endif
 816
 817 static inline struct task_struct *find_process_by_pid(pid_t pid)
 818 {
 819         struct task_struct *tsk = current;
 820
 821         if (pid)
 822                 tsk = find_task_by_pid(pid);
 823         return tsk;
 824 }
 825
 826 static int setscheduler(pid_t pid, int policy,
 827                         struct sched_param *param)
 828 {
 829         struct sched_param lp;
 830         struct task_struct *p;
 831         int retval;
 832
 833         retval = -EINVAL;
 834         if (!param || pid < 0)
 835                 goto out_nounlock;
 836
 837         retval = -EFAULT;
 838         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 839                 goto out_nounlock;
 840
 841         /*
 842          * We play safe to avoid deadlocks.
 843          */
 844         spin_lock_irq(&runqueue_lock);
 845         read_lock(&tasklist_lock);
 846
 847         p = find_process_by_pid(pid);
 848
 849         retval = -ESRCH;
 850         if (!p)
 851                 goto out_unlock;
 852
 853         if (policy < 0)
 854                 policy = p->policy;
 855         else {
 856                 retval = -EINVAL;
 857                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
 858                                 policy != SCHED_OTHER)
 859                         goto out_unlock;
 860         }
 861
 862         /*
 863          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 864          * priority for SCHED_OTHER is 0.
 865          */
 866         retval = -EINVAL;
 867         if (lp.sched_priority < 0 || lp.sched_priority > 99)
 868                 goto out_unlock;
 869         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 870                 goto out_unlock;
 871
 872         retval = -EPERM;
 873         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 874             !capable(CAP_SYS_NICE))
 875                 goto out_unlock;
 876         if ((current->euid != p->euid) && (current->euid != p->uid) &&
 877             !capable(CAP_SYS_NICE))
 878                 goto out_unlock;
 879
 880         retval = 0;
 881         p->policy = policy;
 882         p->rt_priority = lp.sched_priority;
 883         if (task_on_runqueue(p))
 884                 move_first_runqueue(p);
 885
 886         current->need_resched = 1;
 887
 888 out_unlock:
 889         read_unlock(&tasklist_lock);
 890         spin_unlock_irq(&runqueue_lock);
 891
 892 out_nounlock:
 893         return retval;
 894 }
 895
 896 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 897                                       struct sched_param *param)
 898 {
 899         return setscheduler(pid, policy, param);
 900 }
 901
 902 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
 903 {
 904         return setscheduler(pid, -1, param);
 905 }
 906
 907 asmlinkage long sys_sched_getscheduler(pid_t pid)
 908 {
 909         struct task_struct *p;
 910         int retval;
 911
 912         retval = -EINVAL;
 913         if (pid < 0)
 914                 goto out_nounlock;
 915
 916         read_lock(&tasklist_lock);
 917
 918         retval = -ESRCH;
 919         p = find_process_by_pid(pid);
 920         if (!p)
 921                 goto out_unlock;
 922
 923         retval = p->policy;
 924
 925 out_unlock:
 926         read_unlock(&tasklist_lock);
 927
 928 out_nounlock:
 929         return retval;
 930 }
 931
 932 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 933 {
 934         struct task_struct *p;
 935         struct sched_param lp;
 936         int retval;
 937
 938         retval = -EINVAL;
 939         if (!param || pid < 0)
 940                 goto out_nounlock;
 941
 942         read_lock(&tasklist_lock);
 943         p = find_process_by_pid(pid);
 944         retval = -ESRCH;
 945         if (!p)
 946                 goto out_unlock;
 947         lp.sched_priority = p->rt_priority;
 948         read_unlock(&tasklist_lock);
 949
 950         /*
 951          * This one might sleep, we cannot do it with a spinlock held ...
 952          */
 953         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 954
 955 out_nounlock:
 956         return retval;
 957
 958 out_unlock:
 959         read_unlock(&tasklist_lock);
 960         return retval;
 961 }
 962
 963 asmlinkage long sys_sched_yield(void)
 964 {
 965         spin_lock_irq(&runqueue_lock);
 966         if (current->policy == SCHED_OTHER)
 967                 current->policy |= SCHED_YIELD;
 968         current->need_resched = 1;
 969         move_last_runqueue(current);
 970         spin_unlock_irq(&runqueue_lock);
 971         return 0;
 972 }
 973
 974 asmlinkage long sys_sched_get_priority_max(int policy)
 975 {
 976         int ret = -EINVAL;
 977
 978         switch (policy) {
 979         case SCHED_FIFO:
 980         case SCHED_RR:
 981                 ret = 99;
 982                 break;
 983         case SCHED_OTHER:
 984                 ret = 0;
 985                 break;
 986         }
 987         return ret;
 988 }
 989
 990 asmlinkage long sys_sched_get_priority_min(int policy)
 991 {
 992         int ret = -EINVAL;
 993
 994         switch (policy) {
 995         case SCHED_FIFO:
 996         case SCHED_RR:
 997                 ret = 1;
 998                 break;
 999         case SCHED_OTHER:
1000                 ret = 0;
1001         }
1002         return ret;
1003 }
1004
1005 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1006 {
1007         struct timespec t;
1008
1009         t.tv_sec = 0;
1010         t.tv_nsec = 150000;
1011         if (copy_to_user(interval, &t, sizeof(struct timespec)))
1012                 return -EFAULT;
1013         return 0;
1014 }
1015
1016 static void show_task(struct task_struct * p)
1017 {
1018         unsigned long free = 0;
1019         int state;
1020         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1021
1022         printk("%-8s  ", p->comm);
1023         state = p->state ? ffz(~p->state) + 1 : 0;
1024         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1025                 printk(stat_nam[state]);
1026         else
1027                 printk(" ");
1028 #if (BITS_PER_LONG == 32)
1029         if (p == current)
1030                 printk(" current  ");
1031         else
1032                 printk(" %08lX ", thread_saved_pc(&p->thread));
1033 #else
1034         if (p == current)
1035                 printk("   current task   ");
1036         else
1037                 printk(" %016lx ", thread_saved_pc(&p->thread));
1038 #endif
1039         {
1040                 unsigned long * n = (unsigned long *) (p+1);
1041                 while (!*n)
1042                         n++;
1043                 free = (unsigned long) n - (unsigned long)(p+1);
1044         }
1045         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1046         if (p->p_cptr)
1047                 printk("%5d ", p->p_cptr->pid);
1048         else
1049                 printk("      ");
1050         if (!p->mm)
1051                 printk(" (L-TLB) ");
1052         else
1053                 printk(" (NOTLB) ");
1054         if (p->p_ysptr)
1055                 printk("%7d", p->p_ysptr->pid);
1056         else
1057                 printk("       ");
1058         if (p->p_osptr)
1059                 printk(" %5d\n", p->p_osptr->pid);
1060         else
1061                 printk("\n");
1062
1063         {
1064                 struct signal_queue *q;
1065                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1066
1067                 render_sigset_t(&p->signal, s);
1068                 render_sigset_t(&p->blocked, b);
1069                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1070                 for (q = p->sigqueue; q ; q = q->next)
1071                         printk(" %d", q->info.si_signo);
1072                 printk(" X\n");
1073         }
1074 }
1075
1076 char * render_sigset_t(sigset_t *set, char *buffer)
1077 {
1078         int i = _NSIG, x;
1079         do {
1080                 i -= 4, x = 0;
1081                 if (sigismember(set, i+1)) x |= 1;
1082                 if (sigismember(set, i+2)) x |= 2;
1083                 if (sigismember(set, i+3)) x |= 4;
1084                 if (sigismember(set, i+4)) x |= 8;
1085                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1086         } while (i >= 4);
1087         *buffer = 0;
1088         return buffer;
1089 }
1090
1091 void show_state(void)
1092 {
1093         struct task_struct *p;
1094
1095 #if (BITS_PER_LONG == 32)
1096         printk("\n"
1097                "                         free                        sibling\n");
1098         printk("  task             PC    stack   pid father child younger older\n");
1099 #else
1100         printk("\n"
1101                "                                 free                        sibling\n");
1102         printk("  task                 PC        stack   pid father child younger older\n");
1103 #endif
1104         read_lock(&tasklist_lock);
1105         for_each_task(p)
1106                 show_task(p);
1107         read_unlock(&tasklist_lock);
1108 }
1109
1110 /*
1111  *      Put all the gunge required to become a kernel thread without
1112  *      attached user resources in one place where it belongs.
1113  */
1114
1115 void daemonize(void)
1116 {
1117         struct fs_struct *fs;
1118
1119
1120         /*
1121          * If we were started as result of loading a module, close all of the
1122          * user space pages.  We don't need them, and if we didn't close them
1123          * they would be locked into memory.
1124          */
1125         exit_mm(current);
1126
1127         current->session = 1;
1128         current->pgrp = 1;
1129
1130         /* Become as one with the init task */
1131
1132         exit_fs(current);       /* current->fs->count--; */
1133         fs = init_task.fs;
1134         current->fs = fs;
1135         atomic_inc(&fs->count);
1136
1137 }
1138
1139 void __init init_idle(void)
1140 {
1141         struct schedule_data * sched_data;
1142         sched_data = &aligned_data[smp_processor_id()].schedule_data;
1143
1144         if (current != &init_task && task_on_runqueue(current)) {
1145                 printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1146                         smp_processor_id(), current->pid);
1147                 del_from_runqueue(current);
1148         }
1149         sched_data->curr = current;
1150         sched_data->last_schedule = get_cycles();
1151 }
1152
1153 void __init sched_init(void)
1154 {
1155         /*
1156          * We have to do a little magic to get the first
1157          * process right in SMP mode.
1158          */
1159         int cpu = smp_processor_id();
1160         int nr;
1161
1162         init_task.processor = cpu;
1163
1164         for(nr = 0; nr < PIDHASH_SZ; nr++)
1165                 pidhash[nr] = NULL;
1166
1167         init_bh(TIMER_BH, timer_bh);
1168         init_bh(TQUEUE_BH, tqueue_bh);
1169         init_bh(IMMEDIATE_BH, immediate_bh);
1170
1171         /*
1172          * The boot idle thread does lazy MMU switching as well:
1173          */
1174         atomic_inc(&init_mm.mm_count);
1175         enter_lazy_tlb(&init_mm, current, cpu);
1176 }