kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
  13  */
  14
  15 /*
   16  * 'sched.c' is the main kernel file. It contains the scheduling primitives
   17  * (sleep_on, wake_up, schedule, etc.) as well as a number of simple system
   18  * calls (of the getpid() kind), which just extract a field from the
   19  * current task.
  20  */
  21
  22 #include <linux/config.h>
  23 #include <linux/mm.h>
  24 #include <linux/init.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/kernel_stat.h>
  28
  29 #include <asm/uaccess.h>
  30 #include <asm/mmu_context.h>
  31
  32
   33 extern void timer_bh(void);      /* bottom-half handlers defined elsewhere; */
   34 extern void tqueue_bh(void);     /* none of the three is referenced in the  */
   35 extern void immediate_bh(void);  /* part of this file shown here            */
   36
   37 /*
   38  * scheduler variables
   39  */
   40
   41 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
   42
   43 extern void mem_use(void);       /* defined elsewhere; no caller visible in this part of the file */
   44
   45 /*
   46  *      Init task must be ok at boot for the ix86 as we will check its signals
   47  *      via the SMP irq return path.
   48  */
   49
   50 struct task_struct * init_tasks[NR_CPUS] = {&init_task, };  /* slot 0 is init_task, the other slots start out zeroed; read through idle_task() when CONFIG_SMP is set */
  51
  52 /*
  53  * The tasklist_lock protects the linked list of processes.
  54  *
  55  * The scheduler lock is protecting against multiple entry
  56  * into the scheduling code, and doesn't need to worry
  57  * about interrupts (because interrupts cannot call the
  58  * scheduler).
  59  *
  60  * The run-queue lock locks the parts that actually access
  61  * and change the run-queues, and have to be interrupt-safe.
  62  */
   63 __cacheline_aligned spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second -- presumably its place in the lock-acquisition order (TODO confirm); setscheduler() does take it before tasklist_lock */
   64 __cacheline_aligned rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;  /* third -- see note on runqueue_lock */
   65
   66 static LIST_HEAD(runqueue_head);  /* head of the single run-queue list; tasks are linked through ->run_list */
   67
   68 /*
   69  * We align per-CPU scheduling data on cacheline boundaries,
   70  * to prevent cacheline ping-pong.
   71  */
   72 static union {
   73         struct schedule_data {
   74                 struct task_struct * curr;  /* task last chosen by schedule() for this CPU */
   75                 cycles_t last_schedule;  /* get_cycles() stamp stored by schedule() (CONFIG_SMP only) */
   76         } schedule_data;
   77         char __pad [SMP_CACHE_BYTES];  /* pads every array element up to SMP_CACHE_BYTES */
   78 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};  /* only element 0 is initialised explicitly: curr = init_task */
   79
   80 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr  /* lvalue: the task recorded as current on 'cpu' */
   81
   82 struct kernel_stat kstat = { 0 };  /* schedule() increments kstat.context_swtch on every real task switch */
  83
   84 #ifdef CONFIG_SMP
   85
   86 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])  /* idle task of 'cpu': init_tasks[] indexed by the mapped CPU number */
   87 #define can_schedule(p) (!(p)->has_cpu)  /* a task that currently has a CPU (has_cpu set) must not be picked again */
   88
   89 #else
   90
   91 #define idle_task(cpu) (&init_task)  /* uniprocessor: init_task is the only idle task */
   92 #define can_schedule(p) (1)  /* uniprocessor: every run-queue entry is eligible */
   93
   94 #endif
   95
   96 void scheduling_functions_start_here(void) { }  /* empty marker function; pairs with scheduling_functions_end_here() further down -- presumably used to bracket the scheduler's text addresses (TODO confirm) */
  97
  98 /*
  99  * This is the function that decides how desirable a process is..
 100  * You can weigh different processes against each other depending
 101  * on what CPU they've run on lately etc to try to handle cache
 102  * and TLB miss penalties.
 103  *
 104  * Return values:
 105  *       -1000: never select this
 106  *           0: out of time, recalculate counters (but it might still be
 107  *              selected)
 108  *         +ve: "goodness" value (the larger, the better)
 109  *       +1000: realtime process, select this.
 110  */
 111
 112 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 113 {
 114         int weight;
 115
 116         /*
 117          * Realtime process, select the first one on the
 118          * runqueue (taking priorities within processes
 119          * into account).
 120          */
 121         if (p->policy != SCHED_OTHER) {
 122                 weight = 1000 + p->rt_priority;
 123                 goto out;
 124         }
 125
 126         /*
 127          * Give the process a first-approximation goodness value
 128          * according to the number of clock-ticks it has left.
 129          *
 130          * Don't do any other calculations if the time slice is
 131          * over..
 132          */
 133         weight = p->counter;
 134         if (!weight)
 135                 goto out;
 136
 137 #ifdef CONFIG_SMP
 138         /* Give a largish advantage to the same processor...   */
 139         /* (this is equivalent to penalizing other processors) */
 140         if (p->processor == this_cpu)
 141                 weight += PROC_CHANGE_PENALTY;
 142 #endif
 143
 144         /* .. and a slight advantage to the current MM */
 145         if (p->mm == this_mm || !p->mm)
 146                 weight += 1;
 147         weight += p->priority;
 148
 149 out:
 150         return weight;
 151 }
 152
 153 /*
 154  * subtle. We want to discard a yielded process only if it's being
 155  * considered for a reschedule. Wakeup-time 'queries' of the scheduling
 156  * state do not count. Another optimization we do: sched_yield()-ed
 157  * processes are runnable (and thus will be considered for scheduling)
 158  * right when they are calling schedule(). So the only place we need
 159  * to care about SCHED_YIELD is when we calculate the previous process'
 160  * goodness ...
 161  */
 162 static inline int prev_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
 163 {
 164         if (p->policy & SCHED_YIELD) {
 165                 p->policy &= ~SCHED_YIELD;
 166                 return 0;
 167         }
 168         return goodness(p, this_cpu, this_mm);
 169 }
 170
 171 /*
 172  * the 'goodness value' of replacing a process on a given CPU.
  173  * A positive value means 'replace'; zero or negative means 'don't'.
 174  */
 175 static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
 176 {
 177         return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 178 }
 179
 180 /*
 181  * This is ugly, but reschedule_idle() is very timing-critical.
 182  * We enter with the runqueue spinlock held, but we might end
 183  * up unlocking it early, so the caller must not unlock the
 184  * runqueue, it's always done by reschedule_idle().
 185  */
 186 static inline void reschedule_idle(struct task_struct * p, unsigned long flags)
 187 {
 188 #ifdef CONFIG_SMP
 189         int this_cpu = smp_processor_id(), target_cpu;
 190         struct task_struct *tsk;
 191         int cpu, best_cpu, i;
 192
 193         /*
 194          * shortcut if the woken up task's last CPU is
 195          * idle now.
 196          */
 197         best_cpu = p->processor;
 198         tsk = idle_task(best_cpu);
 199         if (cpu_curr(best_cpu) == tsk)
 200                 goto send_now;
 201
 202         /*
 203          * We know that the preferred CPU has a cache-affine current
 204          * process, lets try to find a new idle CPU for the woken-up
 205          * process:
 206          */
 207         for (i = smp_num_cpus - 1; i >= 0; i--) {
 208                 cpu = cpu_logical_map(i);
 209                 if (cpu == best_cpu)
 210                         continue;
 211                 tsk = cpu_curr(cpu);
 212                 /*
 213                  * We use the last available idle CPU. This creates
 214                  * a priority list between idle CPUs, but this is not
 215                  * a problem.
 216                  */
 217                 if (tsk == idle_task(cpu))
 218                         goto send_now;
 219         }
 220
 221         /*
 222          * No CPU is idle, but maybe this process has enough priority
 223          * to preempt it's preferred CPU.
 224          */
 225         tsk = cpu_curr(best_cpu);
 226         if (preemption_goodness(tsk, p, best_cpu) > 0)
 227                 goto send_now;
 228
 229         /*
 230          * We will get here often - or in the high CPU contention
 231          * case. No CPU is idle and this process is either lowprio or
 232          * the preferred CPU is highprio. Try to preempt some other CPU
 233          * only if it's RT or if it's iteractive and the preferred
 234          * cpu won't reschedule shortly.
 235          */
 236         if (p->avg_slice < cacheflush_time || (p->policy & ~SCHED_YIELD) != SCHED_OTHER) {
 237                 for (i = smp_num_cpus - 1; i >= 0; i--) {
 238                         cpu = cpu_logical_map(i);
 239                         if (cpu == best_cpu)
 240                                 continue;
 241                         tsk = cpu_curr(cpu);
 242                         if (preemption_goodness(tsk, p, cpu) > 0)
 243                                 goto send_now;
 244                 }
 245         }
 246
 247         spin_unlock_irqrestore(&runqueue_lock, flags);
 248         return;
 249
 250 send_now:
 251         target_cpu = tsk->processor;
 252         tsk->need_resched = 1;
 253         spin_unlock_irqrestore(&runqueue_lock, flags);
 254         /*
 255          * the APIC stuff can go outside of the lock because
 256          * it uses no task information, only CPU#.
 257          */
 258         if (target_cpu != this_cpu)
 259                 smp_send_reschedule(target_cpu);
 260         return;
 261 #else /* UP */
 262         int this_cpu = smp_processor_id();
 263         struct task_struct *tsk;
 264
 265         tsk = cpu_curr(this_cpu);
 266         if (preemption_goodness(tsk, p, this_cpu) > 0)
 267                 tsk->need_resched = 1;
 268         spin_unlock_irqrestore(&runqueue_lock, flags);
 269 #endif
 270 }
 271
 272 /*
 273  * Careful!
 274  *
 275  * This has to add the process to the _beginning_ of the
 276  * run-queue, not the end. See the comment about "This is
 277  * subtle" in the scheduler proper..
 278  */
 279 static inline void add_to_runqueue(struct task_struct * p)
 280 {
 281         list_add(&p->run_list, &runqueue_head);
 282         nr_running++;
 283 }
 284
 285 static inline void move_last_runqueue(struct task_struct * p)
 286 {
 287         list_del(&p->run_list);
 288         list_add_tail(&p->run_list, &runqueue_head);
 289 }
 290
 291 static inline void move_first_runqueue(struct task_struct * p)
 292 {
 293         list_del(&p->run_list);
 294         list_add(&p->run_list, &runqueue_head);
 295 }
 296
 297 /*
 298  * Wake up a process. Put it on the run-queue if it's not
 299  * already there.  The "current" process is always on the
 300  * run-queue (except when the actual re-schedule is in
 301  * progress), and as such you're allowed to do the simpler
 302  * "current->state = TASK_RUNNING" to mark yourself runnable
 303  * without the overhead of this.
 304  */
 305 inline void wake_up_process(struct task_struct * p)
 306 {
 307         unsigned long flags;
 308
 309         /*
 310          * We want the common case fall through straight, thus the goto.
 311          */
 312         spin_lock_irqsave(&runqueue_lock, flags);
 313         p->state = TASK_RUNNING;
 314         if (task_on_runqueue(p))
 315                 goto out;
 316         add_to_runqueue(p);
 317         reschedule_idle(p, flags); // spin_unlocks runqueue
 318
 319         return;
 320 out:
 321         spin_unlock_irqrestore(&runqueue_lock, flags);
 322 }
 323
 324 static inline void wake_up_process_synchronous(struct task_struct * p)
 325 {
 326         unsigned long flags;
 327
 328         /*
 329          * We want the common case fall through straight, thus the goto.
 330          */
 331         spin_lock_irqsave(&runqueue_lock, flags);
 332         p->state = TASK_RUNNING;
 333         if (task_on_runqueue(p))
 334                 goto out;
 335         add_to_runqueue(p);
 336 out:
 337         spin_unlock_irqrestore(&runqueue_lock, flags);
 338 }
 339
 340 static void process_timeout(unsigned long __data)
 341 {
 342         struct task_struct * p = (struct task_struct *) __data;
 343
 344         wake_up_process(p);
 345 }
 346
 347 signed long schedule_timeout(signed long timeout)
 348 {
 349         struct timer_list timer;
 350         unsigned long expire;
 351
 352         switch (timeout)
 353         {
 354         case MAX_SCHEDULE_TIMEOUT:
 355                 /*
 356                  * These two special cases are useful to be comfortable
 357                  * in the caller. Nothing more. We could take
 358                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 359                  * but I' d like to return a valid offset (>=0) to allow
 360                  * the caller to do everything it want with the retval.
 361                  */
 362                 schedule();
 363                 goto out;
 364         default:
 365                 /*
 366                  * Another bit of PARANOID. Note that the retval will be
 367                  * 0 since no piece of kernel is supposed to do a check
 368                  * for a negative retval of schedule_timeout() (since it
 369                  * should never happens anyway). You just have the printk()
 370                  * that will tell you if something is gone wrong and where.
 371                  */
 372                 if (timeout < 0)
 373                 {
 374                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 375                                "value %lx from %p\n", timeout,
 376                                __builtin_return_address(0));
 377                         current->state = TASK_RUNNING;
 378                         goto out;
 379                 }
 380         }
 381
 382         expire = timeout + jiffies;
 383
 384         init_timer(&timer);
 385         timer.expires = expire;
 386         timer.data = (unsigned long) current;
 387         timer.function = process_timeout;
 388
 389         add_timer(&timer);
 390         schedule();
 391         del_timer(&timer);
 392         /* RED-PEN. Timer may be running now on another cpu.
 393          * Pray that process will not exit enough fastly.
 394          */
 395
 396         timeout = expire - jiffies;
 397
 398  out:
 399         return timeout < 0 ? 0 : timeout;
 400 }
 401
 402 /*
 403  * schedule_tail() is getting called from the fork return path. This
 404  * cleans up all remaining scheduler things, without impacting the
 405  * common case.
 406  */
 407 static inline void __schedule_tail(struct task_struct *prev)
 408 {
 409         current->need_resched |= prev->need_resched;
 410 #ifdef CONFIG_SMP
 411         if ((prev->state == TASK_RUNNING) &&
 412                         (prev != idle_task(smp_processor_id()))) {
 413                 unsigned long flags;
 414
 415                 spin_lock_irqsave(&runqueue_lock, flags);
 416                 reschedule_idle(prev, flags); // spin_unlocks runqueue
 417         }
 418         wmb();
 419         prev->has_cpu = 0;
 420 #endif /* CONFIG_SMP */
 421 }
 422
  423 void schedule_tail(struct task_struct *prev)  /* non-inline entry point; prev is the task that was switched away from */
  424 {
  425         __schedule_tail(prev);  /* same tail work that schedule() runs right after switch_to() */
  426 }
 427
 428 /*
 429  *  'schedule()' is the scheduler function. It's a very simple and nice
 430  * scheduler: it's not perfect, but certainly works for most things.
 431  *
 432  * The goto is "interesting".
 433  *
 434  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 435  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 436  * information in task[0] is never used.
 437  */
 438 asmlinkage void schedule(void)
 439 {
 440         struct schedule_data * sched_data;
 441         struct task_struct *prev, *next, *p;
 442         struct list_head *tmp;
 443         int this_cpu, c;
 444
 445         if (!current->active_mm) BUG();
 446         if (tq_scheduler)
 447                 goto handle_tq_scheduler;
 448 tq_scheduler_back:
 449
 450         prev = current;
 451         this_cpu = prev->processor;
 452
 453         if (in_interrupt())
 454                 goto scheduling_in_interrupt;
 455
 456         release_kernel_lock(prev, this_cpu);
 457
 458         /* Do "administrative" work here while we don't hold any locks */
 459         if (softirq_state[this_cpu].active & softirq_state[this_cpu].mask)
 460                 goto handle_softirq;
 461 handle_softirq_back:
 462
 463         /*
 464          * 'sched_data' is protected by the fact that we can run
 465          * only one process per CPU.
 466          */
 467         sched_data = & aligned_data[this_cpu].schedule_data;
 468
 469         spin_lock_irq(&runqueue_lock);
 470
 471         /* move an exhausted RR process to be last.. */
 472         if (prev->policy == SCHED_RR)
 473                 goto move_rr_last;
 474 move_rr_back:
 475
 476         switch (prev->state & ~TASK_EXCLUSIVE) {
 477                 case TASK_INTERRUPTIBLE:
 478                         if (signal_pending(prev)) {
 479                                 prev->state = TASK_RUNNING;
 480                                 break;
 481                         }
 482                 default:
 483                         del_from_runqueue(prev);
 484                 case TASK_RUNNING:
 485         }
 486         prev->need_resched = 0;
 487
 488         /*
 489          * this is the scheduler proper:
 490          */
 491
 492 repeat_schedule:
 493         /*
 494          * Default process to select..
 495          */
 496         next = idle_task(this_cpu);
 497         c = -1000;
 498         if (prev->state == TASK_RUNNING)
 499                 goto still_running;
 500
 501 still_running_back:
 502         list_for_each(tmp, &runqueue_head) {
 503                 p = list_entry(tmp, struct task_struct, run_list);
 504                 if (can_schedule(p)) {
 505                         int weight = goodness(p, this_cpu, prev->active_mm);
 506                         if (weight > c)
 507                                 c = weight, next = p;
 508                 }
 509         }
 510
 511         /* Do we need to re-calculate counters? */
 512         if (!c)
 513                 goto recalculate;
 514         /*
 515          * from this point on nothing can prevent us from
 516          * switching to the next task, save this fact in
 517          * sched_data.
 518          */
 519         sched_data->curr = next;
 520 #ifdef CONFIG_SMP
 521         next->has_cpu = 1;
 522         next->processor = this_cpu;
 523 #endif
 524         spin_unlock_irq(&runqueue_lock);
 525
 526         if (prev == next)
 527                 goto same_process;
 528
 529 #ifdef CONFIG_SMP
 530         /*
 531          * maintain the per-process 'average timeslice' value.
 532          * (this has to be recalculated even if we reschedule to
 533          * the same process) Currently this is only used on SMP,
 534          * and it's approximate, so we do not have to maintain
 535          * it while holding the runqueue spinlock.
 536          */
 537         {
 538                 cycles_t t, this_slice;
 539
 540                 t = get_cycles();
 541                 this_slice = t - sched_data->last_schedule;
 542                 sched_data->last_schedule = t;
 543
 544                 /*
 545                  * Exponentially fading average calculation, with
 546                  * some weight so it doesnt get fooled easily by
 547                  * smaller irregularities.
 548                  */
 549                 prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
 550         }
 551
 552         /*
 553          * We drop the scheduler lock early (it's a global spinlock),
 554          * thus we have to lock the previous process from getting
 555          * rescheduled during switch_to().
 556          */
 557
 558 #endif /* CONFIG_SMP */
 559
 560         kstat.context_swtch++;
 561         /*
 562          * there are 3 processes which are affected by a context switch:
 563          *
 564          * prev == .... ==> (last => next)
 565          *
 566          * It's the 'much more previous' 'prev' that is on next's stack,
 567          * but prev is set to (the just run) 'last' process by switch_to().
 568          * This might sound slightly confusing but makes tons of sense.
 569          */
 570         prepare_to_switch();
 571         {
 572                 struct mm_struct *mm = next->mm;
 573                 struct mm_struct *oldmm = prev->active_mm;
 574                 if (!mm) {
 575                         if (next->active_mm) BUG();
 576                         next->active_mm = oldmm;
 577                         atomic_inc(&oldmm->mm_count);
 578                         enter_lazy_tlb(oldmm, next, this_cpu);
 579                 } else {
 580                         if (next->active_mm != mm) BUG();
 581                         switch_mm(oldmm, mm, next, this_cpu);
 582                 }
 583
 584                 if (!prev->mm) {
 585                         prev->active_mm = NULL;
 586                         mmdrop(oldmm);
 587                 }
 588         }
 589
 590         /*
 591          * This just switches the register state and the
 592          * stack.
 593          */
 594         switch_to(prev, next, prev);
 595         __schedule_tail(prev);
 596
 597 same_process:
 598         reacquire_kernel_lock(current);
 599         return;
 600
 601 recalculate:
 602         {
 603                 struct task_struct *p;
 604                 spin_unlock_irq(&runqueue_lock);
 605                 read_lock(&tasklist_lock);
 606                 for_each_task(p)
 607                         p->counter = (p->counter >> 1) + p->priority;
 608                 read_unlock(&tasklist_lock);
 609                 spin_lock_irq(&runqueue_lock);
 610         }
 611         goto repeat_schedule;
 612
 613 still_running:
 614         c = prev_goodness(prev, this_cpu, prev->active_mm);
 615         next = prev;
 616         goto still_running_back;
 617
 618 handle_softirq:
 619         do_softirq();
 620         goto handle_softirq_back;
 621
 622 handle_tq_scheduler:
 623         /*
 624          * do not run the task queue with disabled interrupts,
 625          * cli() wouldn't work on SMP
 626          */
 627         sti();
 628         run_task_queue(&tq_scheduler);
 629         goto tq_scheduler_back;
 630
 631 move_rr_last:
 632         if (!prev->counter) {
 633                 prev->counter = prev->priority;
 634                 move_last_runqueue(prev);
 635         }
 636         goto move_rr_back;
 637
 638 scheduling_in_interrupt:
 639         printk("Scheduling in interrupt\n");
 640         BUG();
 641         return;
 642 }
 643
 644 static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, const int sync)
 645 {
 646         struct list_head *tmp, *head;
 647         struct task_struct *p;
 648         unsigned long flags;
 649
 650         if (!q)
 651                 goto out;
 652
 653         wq_write_lock_irqsave(&q->lock, flags);
 654
 655 #if WAITQUEUE_DEBUG
 656         CHECK_MAGIC_WQHEAD(q);
 657 #endif
 658
 659         head = &q->task_list;
 660 #if WAITQUEUE_DEBUG
 661         if (!head->next || !head->prev)
 662                 WQ_BUG();
 663 #endif
 664         list_for_each(tmp, head) {
 665                 unsigned int state;
 666                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 667
 668 #if WAITQUEUE_DEBUG
 669                 CHECK_MAGIC(curr->__magic);
 670 #endif
 671                 p = curr->task;
 672                 state = p->state;
 673                 if (state & (mode & ~TASK_EXCLUSIVE)) {
 674 #if WAITQUEUE_DEBUG
 675                         curr->__waker = (long)__builtin_return_address(0);
 676 #endif
 677                         if (sync)
 678                                 wake_up_process_synchronous(p);
 679                         else
 680                                 wake_up_process(p);
 681                         if (state & mode & TASK_EXCLUSIVE)
 682                                 break;
 683                 }
 684         }
 685         wq_write_unlock_irqrestore(&q->lock, flags);
 686 out:
 687         return;
 688 }
 689
  690 void __wake_up(wait_queue_head_t *q, unsigned int mode)  /* wake the tasks on q whose state matches mode */
  691 {
  692         __wake_up_common(q, mode, 0);  /* sync == 0: each wakeup goes through wake_up_process() */
  693 }
 694
  695 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode)  /* as __wake_up(), but using the synchronous wakeup */
  696 {
  697         __wake_up_common(q, mode, 1);  /* sync == 1: each wakeup goes through wake_up_process_synchronous() */
  698 }
 699
  700 #define SLEEP_ON_VAR                            \
  701         unsigned long flags;                    \
  702         wait_queue_t wait;                      \
  703         init_waitqueue_entry(&wait, current);  /* SLEEP_ON_VAR: declares 'flags' and an on-stack wait entry 'wait' bound to current */
  704
  705 #define SLEEP_ON_HEAD                                   \
  706         wq_write_lock_irqsave(&q->lock,flags);          \
  707         __add_wait_queue(q, &wait);                     \
  708         wq_write_unlock(&q->lock);  /* SLEEP_ON_HEAD: queues 'wait' on 'q' (a variable of that name must be in scope); unlocks without restoring 'flags', that is left to SLEEP_ON_TAIL */
  709
  710 #define SLEEP_ON_TAIL                                           \
  711         wq_write_lock_irq(&q->lock);                            \
  712         __remove_wait_queue(q, &wait);                          \
  713         wq_write_unlock_irqrestore(&q->lock,flags);  /* SLEEP_ON_TAIL: dequeues 'wait' and restores the interrupt state saved by SLEEP_ON_HEAD */
 714
 715 void interruptible_sleep_on(wait_queue_head_t *q)
 716 {
 717         SLEEP_ON_VAR
 718
 719         current->state = TASK_INTERRUPTIBLE;
 720
 721         SLEEP_ON_HEAD
 722         schedule();
 723         SLEEP_ON_TAIL
 724 }
 725
 726 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 727 {
 728         SLEEP_ON_VAR
 729
 730         current->state = TASK_INTERRUPTIBLE;
 731
 732         SLEEP_ON_HEAD
 733         timeout = schedule_timeout(timeout);
 734         SLEEP_ON_TAIL
 735
 736         return timeout;
 737 }
 738
 739 void sleep_on(wait_queue_head_t *q)
 740 {
 741         SLEEP_ON_VAR
 742
 743         current->state = TASK_UNINTERRUPTIBLE;
 744
 745         SLEEP_ON_HEAD
 746         schedule();
 747         SLEEP_ON_TAIL
 748 }
 749
 750 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
 751 {
 752         SLEEP_ON_VAR
 753
 754         current->state = TASK_UNINTERRUPTIBLE;
 755
 756         SLEEP_ON_HEAD
 757         timeout = schedule_timeout(timeout);
 758         SLEEP_ON_TAIL
 759
 760         return timeout;
 761 }
 762
  763 void scheduling_functions_end_here(void) { }  /* empty marker function; pairs with scheduling_functions_start_here() above -- presumably used to bracket the scheduler's text addresses (TODO confirm) */
 764
 765 #ifndef __alpha__
 766
 767 /*
 768  * This has been replaced by sys_setpriority.  Maybe it should be
 769  * moved into the arch dependent tree for those ports that require
 770  * it for backward compatibility?
 771  */
 772
 773 asmlinkage long sys_nice(int increment)
 774 {
 775         unsigned long newprio;
 776         int increase = 0;
 777
 778         /*
 779          *      Setpriority might change our priority at the same moment.
 780          *      We don't have to worry. Conceptually one call occurs first
 781          *      and we have a single winner.
 782          */
 783
 784         newprio = increment;
 785         if (increment < 0) {
 786                 if (!capable(CAP_SYS_NICE))
 787                         return -EPERM;
 788                 newprio = -increment;
 789                 increase = 1;
 790         }
 791
 792         if (newprio > 40)
 793                 newprio = 40;
 794         /*
 795          * do a "normalization" of the priority (traditionally
 796          * Unix nice values are -20 to 20; Linux doesn't really
 797          * use that kind of thing, but uses the length of the
 798          * timeslice instead (default 200 ms). The rounding is
 799          * why we want to avoid negative values.
 800          */
 801         newprio = (newprio * DEF_PRIORITY + 10) / 20;
 802         increment = newprio;
 803         if (increase)
 804                 increment = -increment;
 805         /*
 806          *      Current->priority can change between this point
 807          *      and the assignment. We are assigning not doing add/subs
 808          *      so thats ok. Conceptually a process might just instantaneously
 809          *      read the value we stomp over. I don't think that is an issue
 810          *      unless posix makes it one. If so we can loop on changes
 811          *      to current->priority.
 812          */
 813         newprio = current->priority - increment;
 814         if ((signed) newprio < 1)
 815                 newprio = 1;
 816         if (newprio > DEF_PRIORITY*2)
 817                 newprio = DEF_PRIORITY*2;
 818         current->priority = newprio;
 819         return 0;
 820 }
 821
 822 #endif
 823
 824 static inline struct task_struct *find_process_by_pid(pid_t pid)
 825 {
 826         struct task_struct *tsk = current;
 827
 828         if (pid)
 829                 tsk = find_task_by_pid(pid);
 830         return tsk;
 831 }
 832
 833 static int setscheduler(pid_t pid, int policy,
 834                         struct sched_param *param)
 835 {
 836         struct sched_param lp;
 837         struct task_struct *p;
 838         int retval;
 839
 840         retval = -EINVAL;
 841         if (!param || pid < 0)
 842                 goto out_nounlock;
 843
 844         retval = -EFAULT;
 845         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 846                 goto out_nounlock;
 847
 848         /*
 849          * We play safe to avoid deadlocks.
 850          */
 851         spin_lock_irq(&runqueue_lock);
 852         read_lock(&tasklist_lock);
 853
 854         p = find_process_by_pid(pid);
 855
 856         retval = -ESRCH;
 857         if (!p)
 858                 goto out_unlock;
 859
 860         if (policy < 0)
 861                 policy = p->policy;
 862         else {
 863                 retval = -EINVAL;
 864                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
 865                                 policy != SCHED_OTHER)
 866                         goto out_unlock;
 867         }
 868
 869         /*
 870          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
 871          * priority for SCHED_OTHER is 0.
 872          */
 873         retval = -EINVAL;
 874         if (lp.sched_priority < 0 || lp.sched_priority > 99)
 875                 goto out_unlock;
 876         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
 877                 goto out_unlock;
 878
 879         retval = -EPERM;
 880         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 881             !capable(CAP_SYS_NICE))
 882                 goto out_unlock;
 883         if ((current->euid != p->euid) && (current->euid != p->uid) &&
 884             !capable(CAP_SYS_NICE))
 885                 goto out_unlock;
 886
 887         retval = 0;
 888         p->policy = policy;
 889         p->rt_priority = lp.sched_priority;
 890         if (task_on_runqueue(p))
 891                 move_first_runqueue(p);
 892
 893         current->need_resched = 1;
 894
 895 out_unlock:
 896         read_unlock(&tasklist_lock);
 897         spin_unlock_irq(&runqueue_lock);
 898
 899 out_nounlock:
 900         return retval;
 901 }
 902
  903 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
  904                                       struct sched_param *param)
  905 {
  906         return setscheduler(pid, policy, param);  /* sets both policy and parameters; all checking is done in setscheduler() */
  907 }
 908
  909 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
  910 {
  911         return setscheduler(pid, -1, param);  /* a negative policy makes setscheduler() keep the task's present policy */
  912 }
 913
 914 asmlinkage long sys_sched_getscheduler(pid_t pid)
 915 {
 916         struct task_struct *p;
 917         int retval;
 918
 919         retval = -EINVAL;
 920         if (pid < 0)
 921                 goto out_nounlock;
 922
 923         retval = -ESRCH;
 924         read_lock(&tasklist_lock);
 925         p = find_process_by_pid(pid);
 926         if (p)
 927                 retval = p->policy & ~SCHED_YIELD;
 928         read_unlock(&tasklist_lock);
 929
 930 out_nounlock:
 931         return retval;
 932 }
 933
 934 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
 935 {
 936         struct task_struct *p;
 937         struct sched_param lp;
 938         int retval;
 939
 940         retval = -EINVAL;
 941         if (!param || pid < 0)
 942                 goto out_nounlock;
 943
 944         read_lock(&tasklist_lock);
 945         p = find_process_by_pid(pid);
 946         retval = -ESRCH;
 947         if (!p)
 948                 goto out_unlock;
 949         lp.sched_priority = p->rt_priority;
 950         read_unlock(&tasklist_lock);
 951
 952         /*
 953          * This one might sleep, we cannot do it with a spinlock held ...
 954          */
 955         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 956
 957 out_nounlock:
 958         return retval;
 959
 960 out_unlock:
 961         read_unlock(&tasklist_lock);
 962         return retval;
 963 }
 964
 965 asmlinkage long sys_sched_yield(void)
 966 {
 967         spin_lock_irq(&runqueue_lock);
 968         if (current->policy == SCHED_OTHER)
 969                 current->policy |= SCHED_YIELD;
 970         current->need_resched = 1;
 971         move_last_runqueue(current);
 972         spin_unlock_irq(&runqueue_lock);
 973         return 0;
 974 }
 975
 976 asmlinkage long sys_sched_get_priority_max(int policy)
 977 {
 978         int ret = -EINVAL;
 979
 980         switch (policy) {
 981         case SCHED_FIFO:
 982         case SCHED_RR:
 983                 ret = 99;
 984                 break;
 985         case SCHED_OTHER:
 986                 ret = 0;
 987                 break;
 988         }
 989         return ret;
 990 }
 991
 992 asmlinkage long sys_sched_get_priority_min(int policy)
 993 {
 994         int ret = -EINVAL;
 995
 996         switch (policy) {
 997         case SCHED_FIFO:
 998         case SCHED_RR:
 999                 ret = 1;
1000                 break;
1001         case SCHED_OTHER:
1002                 ret = 0;
1003         }
1004         return ret;
1005 }
1006
1007 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1008 {
1009         struct timespec t;
1010
1011         t.tv_sec = 0;
1012         t.tv_nsec = 150000;
1013         if (copy_to_user(interval, &t, sizeof(struct timespec)))
1014                 return -EFAULT;
1015         return 0;
1016 }
1017
1018 static void show_task(struct task_struct * p)
1019 {
1020         unsigned long free = 0;
1021         int state;
1022         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1023
1024         printk("%-8s  ", p->comm);
1025         state = p->state ? ffz(~p->state) + 1 : 0;
1026         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1027                 printk(stat_nam[state]);
1028         else
1029                 printk(" ");
1030 #if (BITS_PER_LONG == 32)
1031         if (p == current)
1032                 printk(" current  ");
1033         else
1034                 printk(" %08lX ", thread_saved_pc(&p->thread));
1035 #else
1036         if (p == current)
1037                 printk("   current task   ");
1038         else
1039                 printk(" %016lx ", thread_saved_pc(&p->thread));
1040 #endif
1041         {
1042                 unsigned long * n = (unsigned long *) (p+1);
1043                 while (!*n)
1044                         n++;
1045                 free = (unsigned long) n - (unsigned long)(p+1);
1046         }
1047         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1048         if (p->p_cptr)
1049                 printk("%5d ", p->p_cptr->pid);
1050         else
1051                 printk("      ");
1052         if (!p->mm)
1053                 printk(" (L-TLB) ");
1054         else
1055                 printk(" (NOTLB) ");
1056         if (p->p_ysptr)
1057                 printk("%7d", p->p_ysptr->pid);
1058         else
1059                 printk("       ");
1060         if (p->p_osptr)
1061                 printk(" %5d\n", p->p_osptr->pid);
1062         else
1063                 printk("\n");
1064
1065         {
1066                 struct signal_queue *q;
1067                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1068
1069                 render_sigset_t(&p->signal, s);
1070                 render_sigset_t(&p->blocked, b);
1071                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1072                 for (q = p->sigqueue; q ; q = q->next)
1073                         printk(" %d", q->info.si_signo);
1074                 printk(" X\n");
1075         }
1076 }
1077
1078 char * render_sigset_t(sigset_t *set, char *buffer)
1079 {
1080         int i = _NSIG, x;
1081         do {
1082                 i -= 4, x = 0;
1083                 if (sigismember(set, i+1)) x |= 1;
1084                 if (sigismember(set, i+2)) x |= 2;
1085                 if (sigismember(set, i+3)) x |= 4;
1086                 if (sigismember(set, i+4)) x |= 8;
1087                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1088         } while (i >= 4);
1089         *buffer = 0;
1090         return buffer;
1091 }
1092
1093 void show_state(void)
1094 {
1095         struct task_struct *p;
1096
1097 #if (BITS_PER_LONG == 32)
1098         printk("\n"
1099                "                         free                        sibling\n");
1100         printk("  task             PC    stack   pid father child younger older\n");
1101 #else
1102         printk("\n"
1103                "                                 free                        sibling\n");
1104         printk("  task                 PC        stack   pid father child younger older\n");
1105 #endif
1106         read_lock(&tasklist_lock);
1107         for_each_task(p)
1108                 show_task(p);
1109         read_unlock(&tasklist_lock);
1110 }
1111
1112 /*
1113  *      Put all the gunge required to become a kernel thread without
1114  *      attached user resources in one place where it belongs.
1115  */
1116
1117 void daemonize(void)
1118 {
1119         struct fs_struct *fs;
1120
1121
1122         /*
1123          * If we were started as result of loading a module, close all of the
1124          * user space pages.  We don't need them, and if we didn't close them
1125          * they would be locked into memory.
1126          */
1127         exit_mm(current);
1128
1129         current->session = 1;
1130         current->pgrp = 1;
1131
1132         /* Become as one with the init task */
1133
1134         exit_fs(current);       /* current->fs->count--; */
1135         fs = init_task.fs;
1136         current->fs = fs;
1137         atomic_inc(&fs->count);
1138
1139 }
1140
1141 void __init init_idle(void)
1142 {
1143         struct schedule_data * sched_data;
1144         sched_data = &aligned_data[smp_processor_id()].schedule_data;
1145
1146         if (current != &init_task && task_on_runqueue(current)) {
1147                 printk("UGH! (%d:%d) was on the runqueue, removing.\n",
1148                         smp_processor_id(), current->pid);
1149                 del_from_runqueue(current);
1150         }
1151         sched_data->curr = current;
1152         sched_data->last_schedule = get_cycles();
1153 }
1154
1155 extern void init_timervecs (void);
1156
1157 void __init sched_init(void)
1158 {
1159         /*
1160          * We have to do a little magic to get the first
1161          * process right in SMP mode.
1162          */
1163         int cpu = smp_processor_id();
1164         int nr;
1165
1166         init_task.processor = cpu;
1167
1168         for(nr = 0; nr < PIDHASH_SZ; nr++)
1169                 pidhash[nr] = NULL;
1170
1171         init_timervecs();
1172
1173         init_bh(TIMER_BH, timer_bh);
1174         init_bh(TQUEUE_BH, tqueue_bh);
1175         init_bh(IMMEDIATE_BH, immediate_bh);
1176
1177         /*
1178          * The boot idle thread does lazy MMU switching as well:
1179          */
1180         atomic_inc(&init_mm.mm_count);
1181         enter_lazy_tlb(&init_mm, current, cpu);
1182 }