kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  1996-04-21  Modified by Ulrich Windl to make NTP work
   7  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   8  *              make semaphores SMP safe
   9  *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
  10  */
  11
  12 /*
  13  * 'sched.c' is the main kernel file. It contains scheduling primitives
  14  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  15  * call functions (type getpid()), which just extract a field from
  16  * current-task
  17  */
  18
  19 #include <linux/signal.h>
  20 #include <linux/sched.h>
  21 #include <linux/timer.h>
  22 #include <linux/kernel.h>
  23 #include <linux/kernel_stat.h>
  24 #include <linux/fdreg.h>
  25 #include <linux/errno.h>
  26 #include <linux/time.h>
  27 #include <linux/ptrace.h>
  28 #include <linux/delay.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/tqueue.h>
  31 #include <linux/resource.h>
  32 #include <linux/mm.h>
  33 #include <linux/smp.h>
  34 #include <linux/smp_lock.h>
  35 #include <linux/init.h>
  36
  37 #include <asm/system.h>
  38 #include <asm/io.h>
  39 #include <asm/uaccess.h>
  40 #include <asm/pgtable.h>
  41 #include <asm/mmu_context.h>
  42 #include <asm/spinlock.h>
  43
  44 #include <linux/timex.h>
  45
  46 /*
  47  * kernel variables
  48  */
  49
  50 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
  51
  52 long tick = (1000000 + HZ/2) / HZ;      /* timer interrupt period */
  53
  54 /* The current time */
  55 volatile struct timeval xtime __attribute__ ((aligned (16)));
  56
  57 /* Don't completely fail for HZ > 500.  */
  58 int tickadj = 500/HZ ? : 1;             /* microsecs */
  59
  60 DECLARE_TASK_QUEUE(tq_timer);
  61 DECLARE_TASK_QUEUE(tq_immediate);
  62 DECLARE_TASK_QUEUE(tq_scheduler);
  63
  64 /*
  65  * phase-lock loop variables
  66  */
  67 /* TIME_ERROR prevents overwriting the CMOS clock */
  68 int time_state = TIME_ERROR;    /* clock synchronization status */
  69 int time_status = STA_UNSYNC;   /* clock status bits */
  70 long time_offset = 0;           /* time adjustment (us) */
  71 long time_constant = 2;         /* pll time constant */
  72 long time_tolerance = MAXFREQ;  /* frequency tolerance (ppm) */
  73 long time_precision = 1;        /* clock precision (us) */
  74 long time_maxerror = MAXPHASE;  /* maximum error (us) */
  75 long time_esterror = MAXPHASE;  /* estimated error (us) */
  76 long time_phase = 0;            /* phase offset (scaled us) */
  77 long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;  /* frequency offset (scaled ppm) */
  78 long time_adj = 0;              /* tick adjust (scaled 1 / HZ) */
  79 long time_reftime = 0;          /* time at last adjustment (s) */
  80
  81 long time_adjust = 0;
  82 long time_adjust_step = 0;
  83
  84 unsigned long event = 0;
  85
  86 extern int do_setitimer(int, struct itimerval *, struct itimerval *);
  87 unsigned int * prof_buffer = NULL;
  88 unsigned long prof_len = 0;
  89 unsigned long prof_shift = 0;
  90
  91 extern void mem_use(void);
  92
  93 unsigned long volatile jiffies=0;
  94
  95 /*
  96  *      Init task must be ok at boot for the ix86 as we will check its signals
  97  *      via the SMP irq return path.
  98  */
  99
 100 struct task_struct * task[NR_TASKS] = {&init_task, };
 101
 102 struct kernel_stat kstat = { 0 };
 103
 104 void scheduling_functions_start_here(void) { }
 105
 106 static inline void reschedule_idle(struct task_struct * p)
 107 {
 108
 109         /*
 110          * For SMP, we try to see if the CPU the task used
 111          * to run on is idle..
 112          */
 113 #if 0
 114         /*
 115          * Disable this for now. Ingo has some interesting
 116          * code that looks too complex, and I have some ideas,
 117          * but in the meantime.. One problem is that "wakeup()"
 118          * can be (and is) called before we've even initialized
 119          * SMP completely, so..
 120          */
 121 #ifdef __SMP__
 122         int want_cpu = p->processor;
 123
 124         /*
 125          * Don't even try to find another CPU for us if the task
 126          * ran on this one before..
 127          */
 128         if (want_cpu != smp_processor_id()) {
 129                 struct task_struct **idle = task;
 130                 int i = smp_num_cpus;
 131
 132                 do {
 133                         struct task_struct *tsk = *idle;
 134                         idle++;
 135                         /* Something like this.. */
 136                         if (tsk->has_cpu && tsk->processor == want_cpu) {
 137                                 tsk->need_resched = 1;
 138                                 smp_send_reschedule(want_cpu);
 139                                 return;
 140                         }
 141                 } while (--i > 0);
 142         }
 143 #endif
 144 #endif
 145         if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
 146                 current->need_resched = 1;
 147 }
 148
 149 /*
 150  * Careful!
 151  *
 152  * This has to add the process to the _beginning_ of the
 153  * run-queue, not the end. See the comment about "This is
 154  * subtle" in the scheduler proper..
 155  */
 156 static inline void add_to_runqueue(struct task_struct * p)
 157 {
 158         struct task_struct *next = init_task.next_run;
 159
 160         p->prev_run = &init_task;
 161         init_task.next_run = p;
 162         p->next_run = next;
 163         next->prev_run = p;
 164 }
 165
 166 static inline void del_from_runqueue(struct task_struct * p)
 167 {
 168         struct task_struct *next = p->next_run;
 169         struct task_struct *prev = p->prev_run;
 170
 171         nr_running--;
 172         next->prev_run = prev;
 173         prev->next_run = next;
 174         p->next_run = NULL;
 175         p->prev_run = NULL;
 176 }
 177
 178 static inline void move_last_runqueue(struct task_struct * p)
 179 {
 180         struct task_struct *next = p->next_run;
 181         struct task_struct *prev = p->prev_run;
 182
 183         /* remove from list */
 184         next->prev_run = prev;
 185         prev->next_run = next;
 186         /* add back to list */
 187         p->next_run = &init_task;
 188         prev = init_task.prev_run;
 189         init_task.prev_run = p;
 190         p->prev_run = prev;
 191         prev->next_run = p;
 192 }
 193
 194 static inline void move_first_runqueue(struct task_struct * p)
 195 {
 196         struct task_struct *next = p->next_run;
 197         struct task_struct *prev = p->prev_run;
 198
 199         /* remove from list */
 200         next->prev_run = prev;
 201         prev->next_run = next;
 202         /* add back to list */
 203         p->prev_run = &init_task;
 204         next = init_task.next_run;
 205         init_task.next_run = p;
 206         p->next_run = next;
 207         next->prev_run = p;
 208 }
 209
 210 /*
 211  * The tasklist_lock protects the linked list of processes.
 212  *
 213  * The scheduler lock is protecting against multiple entry
 214  * into the scheduling code, and doesn't need to worry
 215  * about interrupts (because interrupts cannot call the
 216  * scheduler).
 217  *
 218  * The run-queue lock locks the parts that actually access
 219  * and change the run-queues, and have to be interrupt-safe.
 220  */
 221 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; /* should be acquired first */
 222 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
 223 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;      /* third */
 224
 225 /*
 226  * Wake up a process. Put it on the run-queue if it's not
 227  * already there.  The "current" process is always on the
 228  * run-queue (except when the actual re-schedule is in
 229  * progress), and as such you're allowed to do the simpler
 230  * "current->state = TASK_RUNNING" to mark yourself runnable
 231  * without the overhead of this.
 232  */
 233 inline void wake_up_process(struct task_struct * p)
 234 {
 235         unsigned long flags;
 236
 237         spin_lock_irqsave(&runqueue_lock, flags);
 238         p->state = TASK_RUNNING;
 239         if (!p->next_run) {
 240                 add_to_runqueue(p);
 241                 reschedule_idle(p);
 242                 nr_running++;
 243         }
 244         spin_unlock_irqrestore(&runqueue_lock, flags);
 245 }
 246
 247 static void process_timeout(unsigned long __data)
 248 {
 249         struct task_struct * p = (struct task_struct *) __data;
 250
 251         p->timeout = 0;
 252         wake_up_process(p);
 253 }
 254
 255 /*
 256  * This is the function that decides how desirable a process is..
 257  * You can weigh different processes against each other depending
 258  * on what CPU they've run on lately etc to try to handle cache
 259  * and TLB miss penalties.
 260  *
 261  * Return values:
 262  *       -1000: never select this
 263  *           0: out of time, recalculate counters (but it might still be
 264  *              selected)
 265  *         +ve: "goodness" value (the larger, the better)
 266  *       +1000: realtime process, select this.
 267  */
 268 static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
 269 {
 270         int policy = p->policy;
 271         int weight;
 272
 273         if (policy & SCHED_YIELD) {
 274                 p->policy = policy & ~SCHED_YIELD;
 275                 return 0;
 276         }
 277
 278         /*
 279          * Realtime process, select the first one on the
 280          * runqueue (taking priorities within processes
 281          * into account).
 282          */
 283         if (policy != SCHED_OTHER)
 284                 return 1000 + p->rt_priority;
 285
 286         /*
 287          * Give the process a first-approximation goodness value
 288          * according to the number of clock-ticks it has left.
 289          *
 290          * Don't do any other calculations if the time slice is
 291          * over..
 292          */
 293         weight = p->counter;
 294         if (weight) {
 295
 296 #ifdef __SMP__
 297                 /* Give a largish advantage to the same processor...   */
 298                 /* (this is equivalent to penalizing other processors) */
 299                 if (p->processor == this_cpu)
 300                         weight += PROC_CHANGE_PENALTY;
 301 #endif
 302
 303                 /* .. and a slight advantage to the current thread */
 304                 if (p->mm == prev->mm)
 305                         weight += 1;
 306                 weight += p->priority;
 307         }
 308
 309         return weight;
 310 }
 311
 312 /*
 313  * Event timer code
 314  */
 315 #define TVN_BITS 6
 316 #define TVR_BITS 8
 317 #define TVN_SIZE (1 << TVN_BITS)
 318 #define TVR_SIZE (1 << TVR_BITS)
 319 #define TVN_MASK (TVN_SIZE - 1)
 320 #define TVR_MASK (TVR_SIZE - 1)
 321
 322 struct timer_vec {
 323         int index;
 324         struct timer_list *vec[TVN_SIZE];
 325 };
 326
 327 struct timer_vec_root {
 328         int index;
 329         struct timer_list *vec[TVR_SIZE];
 330 };
 331
 332 static struct timer_vec tv5 = { 0 };
 333 static struct timer_vec tv4 = { 0 };
 334 static struct timer_vec tv3 = { 0 };
 335 static struct timer_vec tv2 = { 0 };
 336 static struct timer_vec_root tv1 = { 0 };
 337
 338 static struct timer_vec * const tvecs[] = {
 339         (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
 340 };
 341
 342 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
 343
 344 static unsigned long timer_jiffies = 0;
 345
 346 static inline void insert_timer(struct timer_list *timer,
 347                                 struct timer_list **vec, int idx)
 348 {
 349         if ((timer->next = vec[idx]))
 350                 vec[idx]->prev = timer;
 351         vec[idx] = timer;
 352         timer->prev = (struct timer_list *)&vec[idx];
 353 }
 354
 355 static inline void internal_add_timer(struct timer_list *timer)
 356 {
 357         /*
 358          * must be cli-ed when calling this
 359          */
 360         unsigned long expires = timer->expires;
 361         unsigned long idx = expires - timer_jiffies;
 362
 363         if (idx < TVR_SIZE) {
 364                 int i = expires & TVR_MASK;
 365                 insert_timer(timer, tv1.vec, i);
 366         } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
 367                 int i = (expires >> TVR_BITS) & TVN_MASK;
 368                 insert_timer(timer, tv2.vec, i);
 369         } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
 370                 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
 371                 insert_timer(timer, tv3.vec, i);
 372         } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
 373                 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
 374                 insert_timer(timer, tv4.vec, i);
 375         } else if (expires < timer_jiffies) {
 376                 /* can happen if you add a timer with expires == jiffies,
 377                  * or you set a timer to go off in the past
 378                  */
 379                 insert_timer(timer, tv1.vec, tv1.index);
 380         } else if (idx < 0xffffffffUL) {
 381                 int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 382                 insert_timer(timer, tv5.vec, i);
 383         } else {
 384                 /* Can only get here on architectures with 64-bit jiffies */
 385                 timer->next = timer->prev = timer;
 386         }
 387 }
 388
 389 spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
 390
 391 void add_timer(struct timer_list *timer)
 392 {
 393         unsigned long flags;
 394
 395         spin_lock_irqsave(&timerlist_lock, flags);
 396         internal_add_timer(timer);
 397         spin_unlock_irqrestore(&timerlist_lock, flags);
 398 }
 399
 400 static inline int detach_timer(struct timer_list *timer)
 401 {
 402         struct timer_list *prev = timer->prev;
 403         if (prev) {
 404                 struct timer_list *next = timer->next;
 405                 prev->next = next;
 406                 if (next)
 407                         next->prev = prev;
 408                 return 1;
 409         }
 410         return 0;
 411 }
 412
 413 void mod_timer(struct timer_list *timer, unsigned long expires)
 414 {
 415         unsigned long flags;
 416
 417         spin_lock_irqsave(&timerlist_lock, flags);
 418         timer->expires = expires;
 419         detach_timer(timer);
 420         internal_add_timer(timer);
 421         spin_unlock_irqrestore(&timerlist_lock, flags);
 422 }
 423
 424 int del_timer(struct timer_list * timer)
 425 {
 426         int ret;
 427         unsigned long flags;
 428
 429         spin_lock_irqsave(&timerlist_lock, flags);
 430         ret = detach_timer(timer);
 431         timer->next = timer->prev = 0;
 432         spin_unlock_irqrestore(&timerlist_lock, flags);
 433
 434         /* Make sure the timer isn't running in parallell.. */
 435         synchronize_bh();
 436         return ret;
 437 }
 438
 439 #ifdef __SMP__
 440
 441 #define idle_task (task[cpu_number_map[this_cpu]])
 442 #define can_schedule(p) (!(p)->has_cpu)
 443
 444 #else
 445
 446 #define idle_task (&init_task)
 447 #define can_schedule(p) (1)
 448
 449 #endif
 450
 451 /*
 452  *  'schedule()' is the scheduler function. It's a very simple and nice
 453  * scheduler: it's not perfect, but certainly works for most things.
 454  *
 455  * The goto is "interesting".
 456  *
 457  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 458  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 459  * information in task[0] is never used.
 460  */
 461 asmlinkage void schedule(void)
 462 {
 463         struct task_struct * prev, * next;
 464         unsigned long timeout;
 465         int this_cpu;
 466
 467         prev = current;
 468         this_cpu = prev->processor;
 469         if (in_interrupt())
 470                 goto scheduling_in_interrupt;
 471         release_kernel_lock(prev, this_cpu);
 472
 473         __check_locks(1);
 474
 475         /* Do "administrative" work here while we don't hold any locks */
 476         if (bh_active & bh_mask)
 477                 do_bottom_half();
 478         run_task_queue(&tq_scheduler);
 479
 480         spin_lock(&scheduler_lock);
 481         spin_lock_irq(&runqueue_lock);
 482
 483         /* move an exhausted RR process to be last.. */
 484         prev->need_resched = 0;
 485         if (!prev->counter && prev->policy == SCHED_RR) {
 486                 prev->counter = prev->priority;
 487                 move_last_runqueue(prev);
 488         }
 489         timeout = 0;
 490         switch (prev->state) {
 491                 case TASK_INTERRUPTIBLE:
 492                         if (signal_pending(prev))
 493                                 goto makerunnable;
 494                         timeout = prev->timeout;
 495                         if (timeout && (timeout <= jiffies)) {
 496                                 prev->timeout = 0;
 497                                 timeout = 0;
 498                 makerunnable:
 499                                 prev->state = TASK_RUNNING;
 500                                 break;
 501                         }
 502                 default:
 503                         del_from_runqueue(prev);
 504                 case TASK_RUNNING:
 505         }
 506         {
 507                 struct task_struct * p = init_task.next_run;
 508                 /*
 509                  * This is subtle.
 510                  * Note how we can enable interrupts here, even
 511                  * though interrupts can add processes to the run-
 512                  * queue. This is because any new processes will
 513                  * be added to the front of the queue, so "p" above
 514                  * is a safe starting point.
 515                  * run-queue deletion and re-ordering is protected by
 516                  * the scheduler lock
 517                  */
 518                 spin_unlock_irq(&runqueue_lock);
 519 #ifdef __SMP__
 520                 prev->has_cpu = 0;
 521 #endif
 522
 523 /*
 524  * Note! there may appear new tasks on the run-queue during this, as
 525  * interrupts are enabled. However, they will be put on front of the
 526  * list, so our list starting at "p" is essentially fixed.
 527  */
 528 /* this is the scheduler proper: */
 529                 {
 530                         int c = -1000;
 531                         next = idle_task;
 532                         while (p != &init_task) {
 533                                 if (can_schedule(p)) {
 534                                         int weight = goodness(p, prev, this_cpu);
 535                                         if (weight > c)
 536                                                 c = weight, next = p;
 537                                 }
 538                                 p = p->next_run;
 539                         }
 540
 541                         /* Do we need to re-calculate counters? */
 542                         if (!c) {
 543                                 struct task_struct *p;
 544                                 read_lock(&tasklist_lock);
 545                                 for_each_task(p)
 546                                         p->counter = (p->counter >> 1) + p->priority;
 547                                 read_unlock(&tasklist_lock);
 548                         }
 549                 }
 550         }
 551
 552 #ifdef __SMP__
 553         next->has_cpu = 1;
 554         next->processor = this_cpu;
 555 #endif
 556
 557         if (prev != next) {
 558                 struct timer_list timer;
 559
 560                 kstat.context_swtch++;
 561                 if (timeout) {
 562                         init_timer(&timer);
 563                         timer.expires = timeout;
 564                         timer.data = (unsigned long) prev;
 565                         timer.function = process_timeout;
 566                         add_timer(&timer);
 567                 }
 568                 get_mmu_context(next);
 569                 switch_to(prev,next);
 570
 571                 if (timeout)
 572                         del_timer(&timer);
 573         }
 574
 575         spin_unlock(&scheduler_lock);
 576
 577         /*
 578          * At this point "prev" is "current", as we just
 579          * switched into it (from an even more "previous"
 580          * prev)
 581          */
 582         reacquire_kernel_lock(prev);
 583         return;
 584
 585 scheduling_in_interrupt:
 586         printk("Scheduling in interrupt\n");
 587         *(int *)0 = 0;
 588 }
 589
 590
 591 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
 592
 593 /*
 594  * wake_up doesn't wake up stopped processes - they have to be awakened
 595  * with signals or similar.
 596  *
 597  * Note that we only need a read lock for the wait queue (and thus do not
 598  * have to protect against interrupts), as the actual removal from the
 599  * queue is handled by the process itself.
 600  */
 601 void __wake_up(struct wait_queue **q, unsigned int mode)
 602 {
 603         struct wait_queue *next;
 604
 605         read_lock(&waitqueue_lock);
 606         if (q && (next = *q)) {
 607                 struct wait_queue *head;
 608
 609                 head = WAIT_QUEUE_HEAD(q);
 610                 while (next != head) {
 611                         struct task_struct *p = next->task;
 612                         next = next->next;
 613                         if (p->state & mode)
 614                                 wake_up_process(p);
 615                 }
 616         }
 617         read_unlock(&waitqueue_lock);
 618 }
 619
 620 /*
 621  * Semaphores are implemented using a two-way counter:
 622  * The "count" variable is decremented for each process
 623  * that tries to sleep, while the "waking" variable is
 624  * incremented when the "up()" code goes to wake up waiting
 625  * processes.
 626  *
 627  * Notably, the inline "up()" and "down()" functions can
 628  * efficiently test if they need to do any extra work (up
 629  * needs to do something only if count was negative before
 630  * the increment operation.
 631  *
 632  * waking_non_zero() (from asm/semaphore.h) must execute
 633  * atomically.
 634  *
 635  * When __up() is called, the count was negative before
 636  * incrementing it, and we need to wake up somebody.
 637  *
 638  * This routine adds one to the count of processes that need to
 639  * wake up and exit.  ALL waiting processes actually wake up but
 640  * only the one that gets to the "waking" field first will gate
 641  * through and acquire the semaphore.  The others will go back
 642  * to sleep.
 643  *
 644  * Note that these functions are only called when there is
 645  * contention on the lock, and as such all this is the
 646  * "non-critical" part of the whole semaphore business. The
 647  * critical part is the inline stuff in <asm/semaphore.h>
 648  * where we want to avoid any extra jumps and calls.
 649  */
 650 void __up(struct semaphore *sem)
 651 {
 652         wake_one_more(sem);
 653         wake_up(&sem->wait);
 654 }
 655
 656 /*
 657  * Perform the "down" function.  Return zero for semaphore acquired,
 658  * return negative for signalled out of the function.
 659  *
 660  * If called from __down, the return is ignored and the wait loop is
 661  * not interruptible.  This means that a task waiting on a semaphore
 662  * using "down()" cannot be killed until someone does an "up()" on
 663  * the semaphore.
 664  *
 665  * If called from __down_interruptible, the return value gets checked
 666  * upon return.  If the return value is negative then the task continues
 667  * with the negative value in the return register (it can be tested by
 668  * the caller).
 669  *
 670  * Either form may be used in conjunction with "up()".
 671  *
 672  */
 673 static inline int __do_down(struct semaphore * sem, int task_state)
 674 {
 675         struct task_struct *tsk = current;
 676         struct wait_queue wait = { tsk, NULL };
 677         int               ret = 0;
 678
 679         tsk->state = task_state;
 680         add_wait_queue(&sem->wait, &wait);
 681
 682         /*
 683          * Ok, we're set up.  sem->count is known to be less than zero
 684          * so we must wait.
 685          *
 686          * We can let go the lock for purposes of waiting.
 687          * We re-acquire it after awaking so as to protect
 688          * all semaphore operations.
 689          *
 690          * If "up()" is called before we call waking_non_zero() then
 691          * we will catch it right away.  If it is called later then
 692          * we will have to go through a wakeup cycle to catch it.
 693          *
 694          * Multiple waiters contend for the semaphore lock to see
 695          * who gets to gate through and who has to wait some more.
 696          */
 697         for (;;) {
 698                 if (waking_non_zero(sem))       /* are we waking up?  */
 699                         break;                  /* yes, exit loop */
 700
 701                 if (task_state == TASK_INTERRUPTIBLE && signal_pending(tsk)) {
 702                         ret = -EINTR;                   /* interrupted */
 703                         atomic_inc(&sem->count);        /* give up on down operation */
 704                         break;
 705                 }
 706
 707                 schedule();
 708                 tsk->state = task_state;
 709         }
 710
 711         tsk->state = TASK_RUNNING;
 712         remove_wait_queue(&sem->wait, &wait);
 713         return ret;
 714 }
 715
 716 void __down(struct semaphore * sem)
 717 {
 718         __do_down(sem,TASK_UNINTERRUPTIBLE);
 719 }
 720
 721 int __down_interruptible(struct semaphore * sem)
 722 {
 723         return __do_down(sem,TASK_INTERRUPTIBLE);
 724 }
 725
 726
 727 static void FASTCALL(__sleep_on(struct wait_queue **p, int state));
 728 static void __sleep_on(struct wait_queue **p, int state)
 729 {
 730         unsigned long flags;
 731         struct wait_queue wait;
 732
 733         current->state = state;
 734         wait.task = current;
 735         write_lock_irqsave(&waitqueue_lock, flags);
 736         __add_wait_queue(p, &wait);
 737         write_unlock(&waitqueue_lock);
 738         schedule();
 739         write_lock_irq(&waitqueue_lock);
 740         __remove_wait_queue(p, &wait);
 741         write_unlock_irqrestore(&waitqueue_lock, flags);
 742 }
 743
 744 void interruptible_sleep_on(struct wait_queue **p)
 745 {
 746         __sleep_on(p,TASK_INTERRUPTIBLE);
 747 }
 748
 749 void sleep_on(struct wait_queue **p)
 750 {
 751         __sleep_on(p,TASK_UNINTERRUPTIBLE);
 752 }
 753
 754 void scheduling_functions_end_here(void) { }
 755
 756 static inline void cascade_timers(struct timer_vec *tv)
 757 {
 758         /* cascade all the timers from tv up one level */
 759         struct timer_list *timer;
 760         timer = tv->vec[tv->index];
 761         /*
 762          * We are removing _all_ timers from the list, so we don't  have to
 763          * detach them individually, just clear the list afterwards.
 764          */
 765         while (timer) {
 766                 struct timer_list *tmp = timer;
 767                 timer = timer->next;
 768                 internal_add_timer(tmp);
 769         }
 770         tv->vec[tv->index] = NULL;
 771         tv->index = (tv->index + 1) & TVN_MASK;
 772 }
 773
 774 static inline void run_timer_list(void)
 775 {
 776         spin_lock_irq(&timerlist_lock);
 777         while ((long)(jiffies - timer_jiffies) >= 0) {
 778                 struct timer_list *timer;
 779                 if (!tv1.index) {
 780                         int n = 1;
 781                         do {
 782                                 cascade_timers(tvecs[n]);
 783                         } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
 784                 }
 785                 while ((timer = tv1.vec[tv1.index])) {
 786                         void (*fn)(unsigned long) = timer->function;
 787                         unsigned long data = timer->data;
 788                         detach_timer(timer);
 789                         timer->next = timer->prev = NULL;
 790                         spin_unlock_irq(&timerlist_lock);
 791                         fn(data);
 792                         spin_lock_irq(&timerlist_lock);
 793                 }
 794                 ++timer_jiffies;
 795                 tv1.index = (tv1.index + 1) & TVR_MASK;
 796         }
 797         spin_unlock_irq(&timerlist_lock);
 798 }
 799
 800
 801 static inline void run_old_timers(void)
 802 {
 803         struct timer_struct *tp;
 804         unsigned long mask;
 805
 806         for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
 807                 if (mask > timer_active)
 808                         break;
 809                 if (!(mask & timer_active))
 810                         continue;
 811                 if (tp->expires > jiffies)
 812                         continue;
 813                 timer_active &= ~mask;
 814                 tp->fn();
 815                 sti();
 816         }
 817 }
 818
 819 spinlock_t tqueue_lock;
 820
 821 void tqueue_bh(void)
 822 {
 823         run_task_queue(&tq_timer);
 824 }
 825
 826 void immediate_bh(void)
 827 {
 828         run_task_queue(&tq_immediate);
 829 }
 830
 831 unsigned long timer_active = 0;
 832 struct timer_struct timer_table[32];
 833
 834 /*
 835  * Hmm.. Changed this, as the GNU make sources (load.c) seems to
 836  * imply that avenrun[] is the standard name for this kind of thing.
 837  * Nothing else seems to be standardized: the fractional size etc
 838  * all seem to differ on different machines.
 839  */
 840 unsigned long avenrun[3] = { 0,0,0 };
 841
 842 /*
 843  * Nr of active tasks - counted in fixed-point numbers
 844  */
 845 static unsigned long count_active_tasks(void)
 846 {
 847         struct task_struct *p;
 848         unsigned long nr = 0;
 849
 850         read_lock(&tasklist_lock);
 851         for_each_task(p) {
 852                 if (p->pid &&
 853                     (p->state == TASK_RUNNING ||
 854                      p->state == TASK_UNINTERRUPTIBLE ||
 855                      p->state == TASK_SWAPPING))
 856                         nr += FIXED_1;
 857         }
 858         read_unlock(&tasklist_lock);
 859         return nr;
 860 }
 861
 862 static inline void calc_load(unsigned long ticks)
 863 {
 864         unsigned long active_tasks; /* fixed-point */
 865         static int count = LOAD_FREQ;
 866
 867         count -= ticks;
 868         if (count < 0) {
 869                 count += LOAD_FREQ;
 870                 active_tasks = count_active_tasks();
 871                 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 872                 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 873                 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
 874         }
 875 }
 876
 877 /*
 878  * this routine handles the overflow of the microsecond field
 879  *
 880  * The tricky bits of code to handle the accurate clock support
 881  * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 882  * They were originally developed for SUN and DEC kernels.
 883  * All the kudos should go to Dave for this stuff.
 884  *
 885  */
 886 static void second_overflow(void)
 887 {
 888     long ltemp;
 889
 890     /* Bump the maxerror field */
 891     time_maxerror += time_tolerance >> SHIFT_USEC;
 892     if ( time_maxerror > MAXPHASE )
 893         time_maxerror = MAXPHASE;
 894
 895     /*
 896      * Leap second processing. If in leap-insert state at
 897      * the end of the day, the system clock is set back one
 898      * second; if in leap-delete state, the system clock is
 899      * set ahead one second. The microtime() routine or
 900      * external clock driver will insure that reported time
 901      * is always monotonic. The ugly divides should be
 902      * replaced.
 903      */
 904     switch (time_state) {
 905
 906     case TIME_OK:
 907         if (time_status & STA_INS)
 908             time_state = TIME_INS;
 909         else if (time_status & STA_DEL)
 910             time_state = TIME_DEL;
 911         break;
 912
 913     case TIME_INS:
 914         if (xtime.tv_sec % 86400 == 0) {
 915             xtime.tv_sec--;
 916             time_state = TIME_OOP;
 917             printk("Clock: inserting leap second 23:59:60 UTC\n");
 918         }
 919         break;
 920
 921     case TIME_DEL:
 922         if ((xtime.tv_sec + 1) % 86400 == 0) {
 923             xtime.tv_sec++;
 924             time_state = TIME_WAIT;
 925             printk("Clock: deleting leap second 23:59:59 UTC\n");
 926         }
 927         break;
 928
 929     case TIME_OOP:
 930         time_state = TIME_WAIT;
 931         break;
 932
 933     case TIME_WAIT:
 934         if (!(time_status & (STA_INS | STA_DEL)))
 935             time_state = TIME_OK;
 936     }
 937
 938     /*
 939      * Compute the phase adjustment for the next second. In
 940      * PLL mode, the offset is reduced by a fixed factor
 941      * times the time constant. In FLL mode the offset is
 942      * used directly. In either mode, the maximum phase
 943      * adjustment for each second is clamped so as to spread
 944      * the adjustment over not more than the number of
 945      * seconds between updates.
 946      */
 947     if (time_offset < 0) {
 948         ltemp = -time_offset;
 949         if (!(time_status & STA_FLL))
 950             ltemp >>= SHIFT_KG + time_constant;
 951         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 952             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 953         time_offset += ltemp;
 954         time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 955     } else {
 956         ltemp = time_offset;
 957         if (!(time_status & STA_FLL))
 958             ltemp >>= SHIFT_KG + time_constant;
 959         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 960             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 961         time_offset -= ltemp;
 962         time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 963     }
 964
 965     /*
 966      * Compute the frequency estimate and additional phase
 967      * adjustment due to frequency error for the next
 968      * second. When the PPS signal is engaged, gnaw on the
 969      * watchdog counter and update the frequency computed by
 970      * the pll and the PPS signal.
 971      */
 972     pps_valid++;
 973     if (pps_valid == PPS_VALID) {
 974         pps_jitter = MAXTIME;
 975         pps_stabil = MAXFREQ;
 976         time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
 977                          STA_PPSWANDER | STA_PPSERROR);
 978     }
 979     ltemp = time_freq + pps_freq;
 980     if (ltemp < 0)
 981         time_adj -= -ltemp >>
 982             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 983     else
 984         time_adj += ltemp >>
 985             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 986
 987 #if HZ == 100
 988     /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
 989     if (time_adj < 0)
 990         time_adj -= -time_adj >> 2;
 991     else
 992         time_adj += time_adj >> 2;
 993 #endif
 994 }
 995
 996 /* in the NTP reference this is called "hardclock()" */
 997 static void update_wall_time_one_tick(void)
 998 {
 999         /*
1000          * Advance the phase, once it gets to one microsecond, then
1001          * advance the tick more.
1002          */
1003         time_phase += time_adj;
1004         if (time_phase <= -FINEUSEC) {
1005                 long ltemp = -time_phase >> SHIFT_SCALE;
1006                 time_phase += ltemp << SHIFT_SCALE;
1007                 xtime.tv_usec += tick + time_adjust_step - ltemp;
1008         }
1009         else if (time_phase >= FINEUSEC) {
1010                 long ltemp = time_phase >> SHIFT_SCALE;
1011                 time_phase -= ltemp << SHIFT_SCALE;
1012                 xtime.tv_usec += tick + time_adjust_step + ltemp;
1013         } else
1014                 xtime.tv_usec += tick + time_adjust_step;
1015
1016         if (time_adjust) {
1017             /* We are doing an adjtime thing.
1018              *
1019              * Modify the value of the tick for next time.
1020              * Note that a positive delta means we want the clock
1021              * to run fast. This means that the tick should be bigger
1022              *
1023              * Limit the amount of the step for *next* tick to be
1024              * in the range -tickadj .. +tickadj
1025              */
1026              if (time_adjust > tickadj)
1027                 time_adjust_step = tickadj;
1028              else if (time_adjust < -tickadj)
1029                 time_adjust_step = -tickadj;
1030              else
1031                 time_adjust_step = time_adjust;
1032
1033             /* Reduce by this step the amount of time left  */
1034             time_adjust -= time_adjust_step;
1035         }
1036         else
1037             time_adjust_step = 0;
1038 }
1039
1040 /*
1041  * Using a loop looks inefficient, but "ticks" is
1042  * usually just one (we shouldn't be losing ticks,
1043  * we're doing this this way mainly for interrupt
1044  * latency reasons, not because we think we'll
1045  * have lots of lost timer ticks
1046  */
1047 static void update_wall_time(unsigned long ticks)
1048 {
1049         do {
1050                 ticks--;
1051                 update_wall_time_one_tick();
1052         } while (ticks);
1053
1054         if (xtime.tv_usec >= 1000000) {
1055             xtime.tv_usec -= 1000000;
1056             xtime.tv_sec++;
1057             second_overflow();
1058         }
1059 }
1060
1061 static inline void do_process_times(struct task_struct *p,
1062         unsigned long user, unsigned long system)
1063 {
1064         long psecs;
1065
1066         psecs = (p->times.tms_utime += user);
1067         psecs += (p->times.tms_stime += system);
1068         if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
1069                 /* Send SIGXCPU every second.. */
1070                 if (!(psecs % HZ))
1071                         send_sig(SIGXCPU, p, 1);
1072                 /* and SIGKILL when we go over max.. */
1073                 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
1074                         send_sig(SIGKILL, p, 1);
1075         }
1076 }
1077
1078 static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
1079 {
1080         unsigned long it_virt = p->it_virt_value;
1081
1082         if (it_virt) {
1083                 if (it_virt <= ticks) {
1084                         it_virt = ticks + p->it_virt_incr;
1085                         send_sig(SIGVTALRM, p, 1);
1086                 }
1087                 p->it_virt_value = it_virt - ticks;
1088         }
1089 }
1090
1091 static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
1092 {
1093         unsigned long it_prof = p->it_prof_value;
1094
1095         if (it_prof) {
1096                 if (it_prof <= ticks) {
1097                         it_prof = ticks + p->it_prof_incr;
1098                         send_sig(SIGPROF, p, 1);
1099                 }
1100                 p->it_prof_value = it_prof - ticks;
1101         }
1102 }
1103
1104 void update_one_process(struct task_struct *p,
1105         unsigned long ticks, unsigned long user, unsigned long system, int cpu)
1106 {
1107         p->per_cpu_utime[cpu] += user;
1108         p->per_cpu_stime[cpu] += system;
1109         do_process_times(p, user, system);
1110         do_it_virt(p, user);
1111         do_it_prof(p, ticks);
1112 }
1113
1114 static void update_process_times(unsigned long ticks, unsigned long system)
1115 {
1116 /*
1117  * SMP does this on a per-CPU basis elsewhere
1118  */
1119 #ifndef  __SMP__
1120         struct task_struct * p = current;
1121         unsigned long user = ticks - system;
1122         if (p->pid) {
1123                 p->counter -= ticks;
1124                 if (p->counter < 0) {
1125                         p->counter = 0;
1126                         p->need_resched = 1;
1127                 }
1128                 if (p->priority < DEF_PRIORITY)
1129                         kstat.cpu_nice += user;
1130                 else
1131                         kstat.cpu_user += user;
1132                 kstat.cpu_system += system;
1133         }
1134         update_one_process(p, ticks, user, system, 0);
1135 #endif
1136 }
1137
1138 volatile unsigned long lost_ticks = 0;
1139 static unsigned long lost_ticks_system = 0;
1140
1141 static inline void update_times(void)
1142 {
1143         unsigned long ticks;
1144         unsigned long flags;
1145
1146         save_flags(flags);
1147         cli();
1148
1149         ticks = lost_ticks;
1150         lost_ticks = 0;
1151
1152         if (ticks) {
1153                 unsigned long system;
1154                 system = xchg(&lost_ticks_system, 0);
1155
1156                 calc_load(ticks);
1157                 update_wall_time(ticks);
1158                 restore_flags(flags);
1159
1160                 update_process_times(ticks, system);
1161
1162         } else
1163                 restore_flags(flags);
1164 }
1165
1166 static void timer_bh(void)
1167 {
1168         update_times();
1169         run_old_timers();
1170         run_timer_list();
1171 }
1172
1173 void do_timer(struct pt_regs * regs)
1174 {
1175         (*(unsigned long *)&jiffies)++;
1176         lost_ticks++;
1177         mark_bh(TIMER_BH);
1178         if (!user_mode(regs))
1179                 lost_ticks_system++;
1180         if (tq_timer)
1181                 mark_bh(TQUEUE_BH);
1182 }
1183
1184 #ifndef __alpha__
1185
1186 /*
1187  * For backwards compatibility?  This can be done in libc so Alpha
1188  * and all newer ports shouldn't need it.
1189  */
1190 asmlinkage unsigned int sys_alarm(unsigned int seconds)
1191 {
1192         struct itimerval it_new, it_old;
1193         unsigned int oldalarm;
1194
1195         it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
1196         it_new.it_value.tv_sec = seconds;
1197         it_new.it_value.tv_usec = 0;
1198         do_setitimer(ITIMER_REAL, &it_new, &it_old);
1199         oldalarm = it_old.it_value.tv_sec;
1200         /* ehhh.. We can't return 0 if we have an alarm pending.. */
1201         /* And we'd better return too much than too little anyway */
1202         if (it_old.it_value.tv_usec)
1203                 oldalarm++;
1204         return oldalarm;
1205 }
1206
1207 /*
1208  * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
1209  * should be moved into arch/i386 instead?
1210  */
1211
1212 asmlinkage int sys_getpid(void)
1213 {
1214         /* This is SMP safe - current->pid doesn't change */
1215         return current->pid;
1216 }
1217
1218 /*
1219  * This is not strictly SMP safe: p_opptr could change
1220  * from under us. However, rather than getting any lock
1221  * we can use an optimistic algorithm: get the parent
1222  * pid, and go back and check that the parent is still
1223  * the same. If it has changed (which is extremely unlikely
1224  * indeed), we just try again..
1225  *
1226  * NOTE! This depends on the fact that even if we _do_
1227  * get an old value of "parent", we can happily dereference
1228  * the pointer: we just can't necessarily trust the result
1229  * until we know that the parent pointer is valid.
1230  *
1231  * The "mb()" macro is a memory barrier - a synchronizing
1232  * event. It also makes sure that gcc doesn't optimize
1233  * away the necessary memory references.. The barrier doesn't
1234  * have to have all that strong semantics: on x86 we don't
1235  * really require a synchronizing instruction, for example.
1236  * The barrier is more important for code generation than
1237  * for any real memory ordering semantics (even if there is
1238  * a small window for a race, using the old pointer is
1239  * harmless for a while).
1240  */
1241 asmlinkage int sys_getppid(void)
1242 {
1243         int pid;
1244         struct task_struct * me = current;
1245         struct task_struct * parent;
1246
1247         parent = me->p_opptr;
1248         for (;;) {
1249                 pid = parent->pid;
1250 #if __SMP__
1251 {
1252                 struct task_struct *old = parent;
1253                 mb();
1254                 parent = me->p_opptr;
1255                 if (old != parent)
1256                         continue;
1257 }
1258 #endif
1259                 break;
1260         }
1261         return pid;
1262 }
1263
1264 asmlinkage int sys_getuid(void)
1265 {
1266         /* Only we change this so SMP safe */
1267         return current->uid;
1268 }
1269
1270 asmlinkage int sys_geteuid(void)
1271 {
1272         /* Only we change this so SMP safe */
1273         return current->euid;
1274 }
1275
1276 asmlinkage int sys_getgid(void)
1277 {
1278         /* Only we change this so SMP safe */
1279         return current->gid;
1280 }
1281
1282 asmlinkage int sys_getegid(void)
1283 {
1284         /* Only we change this so SMP safe */
1285         return  current->egid;
1286 }
1287
1288 /*
1289  * This has been replaced by sys_setpriority.  Maybe it should be
1290  * moved into the arch dependent tree for those ports that require
1291  * it for backward compatibility?
1292  */
1293
1294 asmlinkage int sys_nice(int increment)
1295 {
1296         unsigned long newprio;
1297         int increase = 0;
1298
1299         /*
1300          *      Setpriority might change our priority at the same moment.
1301          *      We don't have to worry. Conceptually one call occurs first
1302          *      and we have a single winner.
1303          */
1304
1305         newprio = increment;
1306         if (increment < 0) {
1307                 if (!capable(CAP_SYS_NICE))
1308                         return -EPERM;
1309                 newprio = -increment;
1310                 increase = 1;
1311         }
1312
1313         if (newprio > 40)
1314                 newprio = 40;
1315         /*
1316          * do a "normalization" of the priority (traditionally
1317          * Unix nice values are -20 to 20; Linux doesn't really
1318          * use that kind of thing, but uses the length of the
1319          * timeslice instead (default 150 ms). The rounding is
1320          * why we want to avoid negative values.
1321          */
1322         newprio = (newprio * DEF_PRIORITY + 10) / 20;
1323         increment = newprio;
1324         if (increase)
1325                 increment = -increment;
1326         /*
1327          *      Current->priority can change between this point
1328          *      and the assignment. We are assigning not doing add/subs
1329          *      so thats ok. Conceptually a process might just instantaneously
1330          *      read the value we stomp over. I don't think that is an issue
1331          *      unless posix makes it one. If so we can loop on changes
1332          *      to current->priority.
1333          */
1334         newprio = current->priority - increment;
1335         if ((signed) newprio < 1)
1336                 newprio = 1;
1337         if (newprio > DEF_PRIORITY*2)
1338                 newprio = DEF_PRIORITY*2;
1339         current->priority = newprio;
1340         return 0;
1341 }
1342
1343 #endif
1344
1345 static inline struct task_struct *find_process_by_pid(pid_t pid)
1346 {
1347         struct task_struct *tsk = current;
1348
1349         if (pid)
1350                 tsk = find_task_by_pid(pid);
1351         return tsk;
1352 }
1353
1354 static int setscheduler(pid_t pid, int policy,
1355                         struct sched_param *param)
1356 {
1357         struct sched_param lp;
1358         struct task_struct *p;
1359         int retval;
1360
1361         retval = -EINVAL;
1362         if (!param || pid < 0)
1363                 goto out_nounlock;
1364
1365         retval = -EFAULT;
1366         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
1367                 goto out_nounlock;
1368
1369         /*
1370          * We play safe to avoid deadlocks.
1371          */
1372         spin_lock(&scheduler_lock);
1373         spin_lock_irq(&runqueue_lock);
1374         read_lock(&tasklist_lock);
1375
1376         p = find_process_by_pid(pid);
1377
1378         retval = -ESRCH;
1379         if (!p)
1380                 goto out_unlock;
1381
1382         if (policy < 0)
1383                 policy = p->policy;
1384         else {
1385                 retval = -EINVAL;
1386                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
1387                                 policy != SCHED_OTHER)
1388                         goto out_unlock;
1389         }
1390
1391         /*
1392          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
1393          * priority for SCHED_OTHER is 0.
1394          */
1395         retval = -EINVAL;
1396         if (lp.sched_priority < 0 || lp.sched_priority > 99)
1397                 goto out_unlock;
1398         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
1399                 goto out_unlock;
1400
1401         retval = -EPERM;
1402         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1403             !capable(CAP_SYS_NICE))
1404                 goto out_unlock;
1405         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1406             !capable(CAP_SYS_NICE))
1407                 goto out_unlock;
1408
1409         retval = 0;
1410         p->policy = policy;
1411         p->rt_priority = lp.sched_priority;
1412         if (p->next_run)
1413                 move_first_runqueue(p);
1414
1415         current->need_resched = 1;
1416
1417 out_unlock:
1418         read_unlock(&tasklist_lock);
1419         spin_unlock_irq(&runqueue_lock);
1420         spin_unlock(&scheduler_lock);
1421
1422 out_nounlock:
1423         return retval;
1424 }
1425
1426 asmlinkage int sys_sched_setscheduler(pid_t pid, int policy,
1427                                       struct sched_param *param)
1428 {
1429         return setscheduler(pid, policy, param);
1430 }
1431
1432 asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param)
1433 {
1434         return setscheduler(pid, -1, param);
1435 }
1436
1437 asmlinkage int sys_sched_getscheduler(pid_t pid)
1438 {
1439         struct task_struct *p;
1440         int retval;
1441
1442         retval = -EINVAL;
1443         if (pid < 0)
1444                 goto out_nounlock;
1445
1446         read_lock(&tasklist_lock);
1447
1448         retval = -ESRCH;
1449         p = find_process_by_pid(pid);
1450         if (!p)
1451                 goto out_unlock;
1452
1453         retval = p->policy;
1454
1455 out_unlock:
1456         read_unlock(&tasklist_lock);
1457
1458 out_nounlock:
1459         return retval;
1460 }
1461
1462 asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param)
1463 {
1464         struct task_struct *p;
1465         struct sched_param lp;
1466         int retval;
1467
1468         retval = -EINVAL;
1469         if (!param || pid < 0)
1470                 goto out_nounlock;
1471
1472         read_lock(&tasklist_lock);
1473         p = find_process_by_pid(pid);
1474         retval = -ESRCH;
1475         if (!p)
1476                 goto out_unlock;
1477         lp.sched_priority = p->rt_priority;
1478         read_unlock(&tasklist_lock);
1479
1480         /*
1481          * This one might sleep, we cannot do it with a spinlock held ...
1482          */
1483         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1484
1485 out_nounlock:
1486         return retval;
1487
1488 out_unlock:
1489         read_unlock(&tasklist_lock);
1490         return retval;
1491 }
1492
1493 asmlinkage int sys_sched_yield(void)
1494 {
1495         spin_lock(&scheduler_lock);
1496         spin_lock_irq(&runqueue_lock);
1497         current->policy |= SCHED_YIELD;
1498         current->need_resched = 1;
1499         move_last_runqueue(current);
1500         spin_unlock_irq(&runqueue_lock);
1501         spin_unlock(&scheduler_lock);
1502         return 0;
1503 }
1504
1505 asmlinkage int sys_sched_get_priority_max(int policy)
1506 {
1507         int ret = -EINVAL;
1508
1509         switch (policy) {
1510         case SCHED_FIFO:
1511         case SCHED_RR:
1512                 ret = 99;
1513                 break;
1514         case SCHED_OTHER:
1515                 ret = 0;
1516                 break;
1517         }
1518         return ret;
1519 }
1520
1521 asmlinkage int sys_sched_get_priority_min(int policy)
1522 {
1523         int ret = -EINVAL;
1524
1525         switch (policy) {
1526         case SCHED_FIFO:
1527         case SCHED_RR:
1528                 ret = 1;
1529                 break;
1530         case SCHED_OTHER:
1531                 ret = 0;
1532         }
1533         return ret;
1534 }
1535
1536 asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1537 {
1538         struct timespec t;
1539
1540         t.tv_sec = 0;
1541         t.tv_nsec = 150000;
1542         if (copy_to_user(interval, &t, sizeof(struct timespec)))
1543                 return -EFAULT;
1544         return 0;
1545 }
1546
1547 asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
1548 {
1549         struct timespec t;
1550         unsigned long expire;
1551
1552         if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
1553                 return -EFAULT;
1554
1555         if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
1556                 return -EINVAL;
1557
1558
1559         if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
1560             current->policy != SCHED_OTHER)
1561         {
1562                 /*
1563                  * Short delay requests up to 2 ms will be handled with
1564                  * high precision by a busy wait for all real-time processes.
1565                  *
1566                  * Its important on SMP not to do this holding locks.
1567                  */
1568                 udelay((t.tv_nsec + 999) / 1000);
1569                 return 0;
1570         }
1571
1572         expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;
1573
1574         current->timeout = expire;
1575         current->state = TASK_INTERRUPTIBLE;
1576         schedule();
1577
1578         if (expire > jiffies) {
1579                 if (rmtp) {
1580                         jiffies_to_timespec(expire - jiffies -
1581                                             (expire > jiffies + 1), &t);
1582                         if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
1583                                 return -EFAULT;
1584                 }
1585                 return -EINTR;
1586         }
1587         return 0;
1588 }
1589
1590 static void show_task(int nr,struct task_struct * p)
1591 {
1592         unsigned long free = 0;
1593         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1594
1595         printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
1596         if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *))
1597                 printk(stat_nam[p->state]);
1598         else
1599                 printk(" ");
1600 #if (BITS_PER_LONG == 32)
1601         if (p == current)
1602                 printk(" current  ");
1603         else
1604                 printk(" %08lX ", thread_saved_pc(&p->tss));
1605 #else
1606         if (p == current)
1607                 printk("   current task   ");
1608         else
1609                 printk(" %016lx ", thread_saved_pc(&p->tss));
1610 #endif
1611         {
1612                 unsigned long * n = (unsigned long *) (p+1);
1613                 while (!*n)
1614                         n++;
1615                 free = (unsigned long) n - (unsigned long)(p+1);
1616         }
1617         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1618         if (p->p_cptr)
1619                 printk("%5d ", p->p_cptr->pid);
1620         else
1621                 printk("      ");
1622         if (p->p_ysptr)
1623                 printk("%7d", p->p_ysptr->pid);
1624         else
1625                 printk("       ");
1626         if (p->p_osptr)
1627                 printk(" %5d\n", p->p_osptr->pid);
1628         else
1629                 printk("\n");
1630
1631         {
1632                 struct signal_queue *q;
1633                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1634
1635                 render_sigset_t(&p->signal, s);
1636                 render_sigset_t(&p->blocked, b);
1637                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1638                 for (q = p->sigqueue; q ; q = q->next)
1639                         printk(" %d", q->info.si_signo);
1640                 printk(" X\n");
1641         }
1642 }
1643
1644 char * render_sigset_t(sigset_t *set, char *buffer)
1645 {
1646         int i = _NSIG, x;
1647         do {
1648                 i -= 4, x = 0;
1649                 if (sigismember(set, i+1)) x |= 1;
1650                 if (sigismember(set, i+2)) x |= 2;
1651                 if (sigismember(set, i+3)) x |= 4;
1652                 if (sigismember(set, i+4)) x |= 8;
1653                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1654         } while (i >= 4);
1655         *buffer = 0;
1656         return buffer;
1657 }
1658
1659 void show_state(void)
1660 {
1661         struct task_struct *p;
1662
1663 #if (BITS_PER_LONG == 32)
1664         printk("\n"
1665                "                         free                        sibling\n");
1666         printk("  task             PC    stack   pid father child younger older\n");
1667 #else
1668         printk("\n"
1669                "                                 free                        sibling\n");
1670         printk("  task                 PC        stack   pid father child younger older\n");
1671 #endif
1672         read_lock(&tasklist_lock);
1673         for_each_task(p)
1674                 show_task((p->tarray_ptr - &task[0]),p);
1675         read_unlock(&tasklist_lock);
1676 }
1677
1678 __initfunc(void sched_init(void))
1679 {
1680         /*
1681          *      We have to do a little magic to get the first
1682          *      process right in SMP mode.
1683          */
1684         int cpu=hard_smp_processor_id();
1685         int nr = NR_TASKS;
1686
1687         init_task.processor=cpu;
1688
1689         /* Init task array free list and pidhash table. */
1690         while(--nr > 0)
1691                 add_free_taskslot(&task[nr]);
1692
1693         for(nr = 0; nr < PIDHASH_SZ; nr++)
1694                 pidhash[nr] = NULL;
1695
1696         init_bh(TIMER_BH, timer_bh);
1697         init_bh(TQUEUE_BH, tqueue_bh);
1698         init_bh(IMMEDIATE_BH, immediate_bh);
1699 }