kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  1996-04-21  Modified by Ulrich Windl to make NTP work
   7  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   8  *              make semaphores SMP safe
   9  *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
  10  */
  11
  12 /*
  13  * 'sched.c' is the main kernel file. It contains scheduling primitives
  14  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  15  * call functions (type getpid()), which just extract a field from
  16  * current-task
  17  */
  18
  19 #include <linux/signal.h>
  20 #include <linux/sched.h>
  21 #include <linux/timer.h>
  22 #include <linux/kernel.h>
  23 #include <linux/kernel_stat.h>
  24 #include <linux/fdreg.h>
  25 #include <linux/errno.h>
  26 #include <linux/time.h>
  27 #include <linux/ptrace.h>
  28 #include <linux/delay.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/tqueue.h>
  31 #include <linux/resource.h>
  32 #include <linux/mm.h>
  33 #include <linux/smp.h>
  34 #include <linux/smp_lock.h>
  35 #include <linux/init.h>
  36
  37 #include <asm/system.h>
  38 #include <asm/io.h>
  39 #include <asm/uaccess.h>
  40 #include <asm/pgtable.h>
  41 #include <asm/mmu_context.h>
  42 #include <asm/spinlock.h>
  43
  44 #include <linux/timex.h>
  45
  46 /*
  47  * kernel variables
  48  */
  49
  50 int securelevel = 0;                    /* system security level */
  51
  52 long tick = (1000000 + HZ/2) / HZ;      /* timer interrupt period */
  53 volatile struct timeval xtime __attribute__ ((aligned (8)));    /* The current time */
  54 int tickadj = 500/HZ;                   /* microsecs */
  55
  56 DECLARE_TASK_QUEUE(tq_timer);
  57 DECLARE_TASK_QUEUE(tq_immediate);
  58 DECLARE_TASK_QUEUE(tq_scheduler);
  59
  60 /*
  61  * phase-lock loop variables
  62  */
  63 /* TIME_ERROR prevents overwriting the CMOS clock */
  64 int time_state = TIME_ERROR;    /* clock synchronization status */
  65 int time_status = STA_UNSYNC;   /* clock status bits */
  66 long time_offset = 0;           /* time adjustment (us) */
  67 long time_constant = 2;         /* pll time constant */
  68 long time_tolerance = MAXFREQ;  /* frequency tolerance (ppm) */
  69 long time_precision = 1;        /* clock precision (us) */
  70 long time_maxerror = MAXPHASE;  /* maximum error (us) */
  71 long time_esterror = MAXPHASE;  /* estimated error (us) */
  72 long time_phase = 0;            /* phase offset (scaled us) */
  73 long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;  /* frequency offset (scaled ppm) */
  74 long time_adj = 0;              /* tick adjust (scaled 1 / HZ) */
  75 long time_reftime = 0;          /* time at last adjustment (s) */
  76
  77 long time_adjust = 0;
  78 long time_adjust_step = 0;
  79
  80 int need_resched = 0;
  81 unsigned long event = 0;
  82
  83 extern int _setitimer(int, struct itimerval *, struct itimerval *);
  84 unsigned int * prof_buffer = NULL;
  85 unsigned long prof_len = 0;
  86 unsigned long prof_shift = 0;
  87
  88 #define _S(nr) (1<<((nr)-1))
  89
  90 extern void mem_use(void);
  91
  92 unsigned long volatile jiffies=0;
  93
  94 /*
  95  *      Init task must be ok at boot for the ix86 as we will check its signals
  96  *      via the SMP irq return path.
  97  */
  98
  99 struct task_struct *last_task_used_math = NULL;
 100
 101 struct task_struct * task[NR_TASKS] = {&init_task, };
 102
 103 struct kernel_stat kstat = { 0 };
 104
 105 static inline void add_to_runqueue(struct task_struct * p)
 106 {
 107         if (p->counter > current->counter + 3)
 108                 need_resched = 1;
 109         nr_running++;
 110         (p->prev_run = init_task.prev_run)->next_run = p;
 111         p->next_run = &init_task;
 112         init_task.prev_run = p;
 113 }
 114
 115 static inline void del_from_runqueue(struct task_struct * p)
 116 {
 117         struct task_struct *next = p->next_run;
 118         struct task_struct *prev = p->prev_run;
 119
 120         nr_running--;
 121         next->prev_run = prev;
 122         prev->next_run = next;
 123         p->next_run = NULL;
 124         p->prev_run = NULL;
 125 }
 126
 127 static inline void move_last_runqueue(struct task_struct * p)
 128 {
 129         struct task_struct *next = p->next_run;
 130         struct task_struct *prev = p->prev_run;
 131
 132         /* remove from list */
 133         next->prev_run = prev;
 134         prev->next_run = next;
 135         /* add back to list */
 136         p->next_run = &init_task;
 137         prev = init_task.prev_run;
 138         init_task.prev_run = p;
 139         p->prev_run = prev;
 140         prev->next_run = p;
 141 }
 142
 143 #ifdef __SMP__
 144 /*
 145  * The tasklist_lock protects the linked list of processes.
 146  *
 147  * The scheduler lock is protecting against multiple entry
 148  * into the scheduling code, and doesn't need to worry
 149  * about interrupts (because interrupts cannot call the
 150  * scheduler).
 151  *
 152  * The run-queue lock locks the parts that actually access
 153  * and change the run-queues, and have to be interrupt-safe.
 154  */
 155 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;
 156 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED;
 157 static spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;
 158 #endif
 159
 160 /*
 161  * Wake up a process. Put it on the run-queue if it's not
 162  * already there.  The "current" process is always on the
 163  * run-queue (except when the actual re-schedule is in
 164  * progress), and as such you're allowed to do the simpler
 165  * "current->state = TASK_RUNNING" to mark yourself runnable
 166  * without the overhead of this.
 167  */
 168 inline void wake_up_process(struct task_struct * p)
 169 {
 170         unsigned long flags;
 171
 172         spin_lock_irqsave(&runqueue_lock, flags);
 173         p->state = TASK_RUNNING;
 174         if (!p->next_run)
 175                 add_to_runqueue(p);
 176         spin_unlock_irqrestore(&runqueue_lock, flags);
 177 }
 178
 179 static void process_timeout(unsigned long __data)
 180 {
 181         struct task_struct * p = (struct task_struct *) __data;
 182
 183         p->timeout = 0;
 184         wake_up_process(p);
 185 }
 186
 187 /*
 188  * This is the function that decides how desirable a process is..
 189  * You can weigh different processes against each other depending
 190  * on what CPU they've run on lately etc to try to handle cache
 191  * and TLB miss penalties.
 192  *
 193  * Return values:
 194  *       -1000: never select this
 195  *           0: out of time, recalculate counters (but it might still be
 196  *              selected)
 197  *         +ve: "goodness" value (the larger, the better)
 198  *       +1000: realtime process, select this.
 199  */
 200 static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
 201 {
 202         int weight;
 203
 204         /*
 205          * Realtime process, select the first one on the
 206          * runqueue (taking priorities within processes
 207          * into account).
 208          */
 209         if (p->policy != SCHED_OTHER)
 210                 return 1000 + p->rt_priority;
 211
 212         /*
 213          * Give the process a first-approximation goodness value
 214          * according to the number of clock-ticks it has left.
 215          *
 216          * Don't do any other calculations if the time slice is
 217          * over..
 218          */
 219         weight = p->counter;
 220         if (weight) {
 221
 222 #ifdef __SMP__
 223                 /* Give a largish advantage to the same processor...   */
 224                 /* (this is equivalent to penalizing other processors) */
 225                 if (p->processor == this_cpu)
 226                         weight += PROC_CHANGE_PENALTY;
 227 #endif
 228
 229                 /* .. and a slight advantage to the current process */
 230                 if (p == prev)
 231                         weight += 1;
 232         }
 233
 234         return weight;
 235 }
 236
 237 /*
 238  * Event timer code
 239  */
 240 #define TVN_BITS 6
 241 #define TVR_BITS 8
 242 #define TVN_SIZE (1 << TVN_BITS)
 243 #define TVR_SIZE (1 << TVR_BITS)
 244 #define TVN_MASK (TVN_SIZE - 1)
 245 #define TVR_MASK (TVR_SIZE - 1)
 246
 247 struct timer_vec {
 248         int index;
 249         struct timer_list *vec[TVN_SIZE];
 250 };
 251
 252 struct timer_vec_root {
 253         int index;
 254         struct timer_list *vec[TVR_SIZE];
 255 };
 256
 257 static struct timer_vec tv5 = { 0 };
 258 static struct timer_vec tv4 = { 0 };
 259 static struct timer_vec tv3 = { 0 };
 260 static struct timer_vec tv2 = { 0 };
 261 static struct timer_vec_root tv1 = { 0 };
 262
 263 static struct timer_vec * const tvecs[] = {
 264         (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
 265 };
 266
 267 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
 268
 269 static unsigned long timer_jiffies = 0;
 270
 271 static inline void insert_timer(struct timer_list *timer,
 272                                 struct timer_list **vec, int idx)
 273 {
 274         if ((timer->next = vec[idx]))
 275                 vec[idx]->prev = timer;
 276         vec[idx] = timer;
 277         timer->prev = (struct timer_list *)&vec[idx];
 278 }
 279
 280 static inline void internal_add_timer(struct timer_list *timer)
 281 {
 282         /*
 283          * must be cli-ed when calling this
 284          */
 285         unsigned long expires = timer->expires;
 286         unsigned long idx = expires - timer_jiffies;
 287
 288         if (idx < TVR_SIZE) {
 289                 int i = expires & TVR_MASK;
 290                 insert_timer(timer, tv1.vec, i);
 291         } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
 292                 int i = (expires >> TVR_BITS) & TVN_MASK;
 293                 insert_timer(timer, tv2.vec, i);
 294         } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
 295                 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
 296                 insert_timer(timer, tv3.vec, i);
 297         } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
 298                 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
 299                 insert_timer(timer, tv4.vec, i);
 300         } else if (expires < timer_jiffies) {
 301                 /* can happen if you add a timer with expires == jiffies,
 302                  * or you set a timer to go off in the past
 303                  */
 304                 insert_timer(timer, tv1.vec, tv1.index);
 305         } else if (idx < 0xffffffffUL) {
 306                 int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 307                 insert_timer(timer, tv5.vec, i);
 308         } else {
 309                 /* Can only get here on architectures with 64-bit jiffies */
 310                 timer->next = timer->prev = timer;
 311         }
 312 }
 313
 314 static spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
 315
 316 void add_timer(struct timer_list *timer)
 317 {
 318         unsigned long flags;
 319
 320         spin_lock_irqsave(&timerlist_lock, flags);
 321         internal_add_timer(timer);
 322         spin_unlock_irqrestore(&timerlist_lock, flags);
 323 }
 324
 325 static inline int detach_timer(struct timer_list *timer)
 326 {
 327         int ret = 0;
 328         struct timer_list *next, *prev;
 329         next = timer->next;
 330         prev = timer->prev;
 331         if (next) {
 332                 next->prev = prev;
 333         }
 334         if (prev) {
 335                 ret = 1;
 336                 prev->next = next;
 337         }
 338         return ret;
 339 }
 340
 341
 342 int del_timer(struct timer_list * timer)
 343 {
 344         int ret;
 345         unsigned long flags;
 346
 347         spin_lock_irqsave(&timerlist_lock, flags);
 348         ret = detach_timer(timer);
 349         timer->next = timer->prev = 0;
 350         spin_unlock_irqrestore(&timerlist_lock, flags);
 351         return ret;
 352 }
 353
 354 #ifdef __SMP__
 355
 356 #define idle_task (task[cpu_number_map[this_cpu]])
 357 #define can_schedule(p) (!(p)->has_cpu)
 358
 359 #else
 360
 361 #define idle_task (&init_task)
 362 #define can_schedule(p) (1)
 363
 364 #endif
 365
 366 /*
 367  *  'schedule()' is the scheduler function. It's a very simple and nice
 368  * scheduler: it's not perfect, but certainly works for most things.
 369  *
 370  * The goto is "interesting".
 371  *
 372  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 373  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 374  * information in task[0] is never used.
 375  */
 376 asmlinkage void schedule(void)
 377 {
 378         int lock_depth;
 379         struct task_struct * prev, * next;
 380         unsigned long timeout;
 381         int this_cpu;
 382
 383         need_resched = 0;
 384         prev = current;
 385         this_cpu = smp_processor_id();
 386         if (local_irq_count[this_cpu])
 387                 goto scheduling_in_interrupt;
 388         release_kernel_lock(prev, this_cpu, lock_depth);
 389         if (bh_active & bh_mask)
 390                 do_bottom_half();
 391
 392         spin_lock(&scheduler_lock);
 393         spin_lock_irq(&runqueue_lock);
 394
 395         /* move an exhausted RR process to be last.. */
 396         if (!prev->counter && prev->policy == SCHED_RR) {
 397                 prev->counter = prev->priority;
 398                 move_last_runqueue(prev);
 399         }
 400         timeout = 0;
 401         switch (prev->state) {
 402                 case TASK_INTERRUPTIBLE:
 403                         if (prev->signal & ~prev->blocked)
 404                                 goto makerunnable;
 405                         timeout = prev->timeout;
 406                         if (timeout && (timeout <= jiffies)) {
 407                                 prev->timeout = 0;
 408                                 timeout = 0;
 409                 makerunnable:
 410                                 prev->state = TASK_RUNNING;
 411                                 break;
 412                         }
 413                 default:
 414                         del_from_runqueue(prev);
 415                 case TASK_RUNNING:
 416         }
 417         {
 418                 struct task_struct * p = init_task.next_run;
 419                 /*
 420                  * This is subtle.
 421                  * Note how we can enable interrupts here, even
 422                  * though interrupts can add processes to the run-
 423                  * queue. This is because any new processes will
 424                  * be added to the front of the queue, so "p" above
 425                  * is a safe starting point.
 426                  * run-queue deletion and re-ordering is protected by
 427                  * the scheduler lock
 428                  */
 429                 spin_unlock_irq(&runqueue_lock);
 430 #ifdef __SMP__
 431                 prev->has_cpu = 0;
 432 #endif
 433
 434 /*
 435  * Note! there may appear new tasks on the run-queue during this, as
 436  * interrupts are enabled. However, they will be put on front of the
 437  * list, so our list starting at "p" is essentially fixed.
 438  */
 439 /* this is the scheduler proper: */
 440                 {
 441                         int c = -1000;
 442                         next = idle_task;
 443                         while (p != &init_task) {
 444                                 if (can_schedule(p)) {
 445                                         int weight = goodness(p, prev, this_cpu);
 446                                         if (weight > c)
 447                                                 c = weight, next = p;
 448                                 }
 449                                 p = p->next_run;
 450                         }
 451
 452                         /* Do we need to re-calculate counters? */
 453                         if (!c) {
 454                                 struct task_struct *p;
 455                                 read_lock(&tasklist_lock);
 456                                 for_each_task(p)
 457                                         p->counter = (p->counter >> 1) + p->priority;
 458                                 read_unlock(&tasklist_lock);
 459                         }
 460                 }
 461         }
 462
 463 #ifdef __SMP__
 464         next->has_cpu = 1;
 465         next->processor = this_cpu;
 466 #endif
 467
 468         if (prev != next) {
 469                 struct timer_list timer;
 470
 471                 kstat.context_swtch++;
 472                 if (timeout) {
 473                         init_timer(&timer);
 474                         timer.expires = timeout;
 475                         timer.data = (unsigned long) prev;
 476                         timer.function = process_timeout;
 477                         add_timer(&timer);
 478                 }
 479                 get_mmu_context(next);
 480                 switch_to(prev,next);
 481
 482                 if (timeout)
 483                         del_timer(&timer);
 484         }
 485         spin_unlock(&scheduler_lock);
 486
 487         reacquire_kernel_lock(prev, smp_processor_id(), lock_depth);
 488         return;
 489
 490 scheduling_in_interrupt:
 491         printk("Scheduling in interrupt\n");
 492         *(int *)0 = 0;
 493 }
 494
 495
 496 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
 497
 498 /*
 499  * wake_up doesn't wake up stopped processes - they have to be awakened
 500  * with signals or similar.
 501  *
 502  * Note that we only need a read lock for the wait queue (and thus do not
 503  * have to protect against interrupts), as the actual removal from the
 504  * queue is handled by the process itself.
 505  */
 506 void wake_up(struct wait_queue **q)
 507 {
 508         struct wait_queue *next;
 509
 510         read_lock(&waitqueue_lock);
 511         if (q && (next = *q)) {
 512                 struct wait_queue *head;
 513
 514                 head = WAIT_QUEUE_HEAD(q);
 515                 while (next != head) {
 516                         struct task_struct *p = next->task;
 517                         next = next->next;
 518                         if ((p->state == TASK_UNINTERRUPTIBLE) ||
 519                             (p->state == TASK_INTERRUPTIBLE))
 520                                 wake_up_process(p);
 521                 }
 522         }
 523         read_unlock(&waitqueue_lock);
 524 }
 525
 526 void wake_up_interruptible(struct wait_queue **q)
 527 {
 528         struct wait_queue *next;
 529
 530         read_lock(&waitqueue_lock);
 531         if (q && (next = *q)) {
 532                 struct wait_queue *head;
 533
 534                 head = WAIT_QUEUE_HEAD(q);
 535                 while (next != head) {
 536                         struct task_struct *p = next->task;
 537                         next = next->next;
 538                         if (p->state == TASK_INTERRUPTIBLE)
 539                                 wake_up_process(p);
 540                 }
 541         }
 542         read_unlock(&waitqueue_lock);
 543 }
 544
 545 /*
 546  * Semaphores are implemented using a two-way counter:
 547  * The "count" variable is decremented for each process
 548  * that tries to sleep, while the "waking" variable is
 549  * incremented when the "up()" code goes to wake up waiting
 550  * processes.
 551  *
 552  * Notably, the inline "up()" and "down()" functions can
 553  * efficiently test if they need to do any extra work (up
 554  * needs to do something only if count was negative before
 555  * the increment operation.
 556  *
 557  * waking_non_zero() (from asm/semaphore.h) must execute
 558  * atomically.
 559  *
 560  * When __up() is called, the count was negative before
 561  * incrementing it, and we need to wake up somebody.
 562  *
 563  * This routine adds one to the count of processes that need to
 564  * wake up and exit.  ALL waiting processes actually wake up but
 565  * only the one that gets to the "waking" field first will gate
 566  * through and acquire the semaphore.  The others will go back
 567  * to sleep.
 568  *
 569  * Note that these functions are only called when there is
 570  * contention on the lock, and as such all this is the
 571  * "non-critical" part of the whole semaphore business. The
 572  * critical part is the inline stuff in <asm/semaphore.h>
 573  * where we want to avoid any extra jumps and calls.
 574  */
 575 void __up(struct semaphore *sem)
 576 {
 577         wake_one_more(sem);
 578         wake_up(&sem->wait);
 579 }
 580
 581 /*
 582  * Perform the "down" function.  Return zero for semaphore acquired,
 583  * return negative for signalled out of the function.
 584  *
 585  * If called from __down, the return is ignored and the wait loop is
 586  * not interruptible.  This means that a task waiting on a semaphore
 587  * using "down()" cannot be killed until someone does an "up()" on
 588  * the semaphore.
 589  *
 590  * If called from __down_interruptible, the return value gets checked
 591  * upon return.  If the return value is negative then the task continues
 592  * with the negative value in the return register (it can be tested by
 593  * the caller).
 594  *
 595  * Either form may be used in conjunction with "up()".
 596  *
 597  */
 598 static inline int __do_down(struct semaphore * sem, int task_state)
 599 {
 600         struct task_struct *tsk = current;
 601         struct wait_queue wait = { tsk, NULL };
 602         int               ret = 0;
 603
 604         tsk->state = task_state;
 605         add_wait_queue(&sem->wait, &wait);
 606
 607         /*
 608          * Ok, we're set up.  sem->count is known to be less than zero
 609          * so we must wait.
 610          *
 611          * We can let go the lock for purposes of waiting.
 612          * We re-acquire it after awaking so as to protect
 613          * all semaphore operations.
 614          *
 615          * If "up()" is called before we call waking_non_zero() then
 616          * we will catch it right away.  If it is called later then
 617          * we will have to go through a wakeup cycle to catch it.
 618          *
 619          * Multiple waiters contend for the semaphore lock to see
 620          * who gets to gate through and who has to wait some more.
 621          */
 622         for (;;) {
 623                 if (waking_non_zero(sem))       /* are we waking up?  */
 624                         break;                  /* yes, exit loop */
 625
 626                 if (   task_state == TASK_INTERRUPTIBLE
 627                     && (tsk->signal & ~tsk->blocked)    /* signalled */
 628                    ) {
 629                         ret = -EINTR;                   /* interrupted */
 630                         atomic_inc(&sem->count);        /* give up on down operation */
 631                         break;
 632                 }
 633
 634                 schedule();
 635                 tsk->state = task_state;
 636         }
 637
 638         tsk->state = TASK_RUNNING;
 639         remove_wait_queue(&sem->wait, &wait);
 640         return ret;
 641 }
 642
 643 void __down(struct semaphore * sem)
 644 {
 645         __do_down(sem,TASK_UNINTERRUPTIBLE);
 646 }
 647
 648 int __down_interruptible(struct semaphore * sem)
 649 {
 650         return __do_down(sem,TASK_INTERRUPTIBLE);
 651 }
 652
 653
 654 static inline void __sleep_on(struct wait_queue **p, int state)
 655 {
 656         unsigned long flags;
 657         struct wait_queue wait = { current, NULL };
 658
 659         if (!p)
 660                 return;
 661         current->state = state;
 662         write_lock_irqsave(&waitqueue_lock, flags);
 663         __add_wait_queue(p, &wait);
 664         write_unlock(&waitqueue_lock);
 665         schedule();
 666         write_lock_irq(&waitqueue_lock);
 667         __remove_wait_queue(p, &wait);
 668         write_unlock_irqrestore(&waitqueue_lock, flags);
 669 }
 670
 671 void interruptible_sleep_on(struct wait_queue **p)
 672 {
 673         __sleep_on(p,TASK_INTERRUPTIBLE);
 674 }
 675
 676 void sleep_on(struct wait_queue **p)
 677 {
 678         __sleep_on(p,TASK_UNINTERRUPTIBLE);
 679 }
 680
 681 static inline void cascade_timers(struct timer_vec *tv)
 682 {
 683         /* cascade all the timers from tv up one level */
 684         struct timer_list *timer;
 685         timer = tv->vec[tv->index];
 686         /*
 687          * We are removing _all_ timers from the list, so we don't  have to
 688          * detach them individually, just clear the list afterwards.
 689          */
 690         while (timer) {
 691                 struct timer_list *tmp = timer;
 692                 timer = timer->next;
 693                 internal_add_timer(tmp);
 694         }
 695         tv->vec[tv->index] = NULL;
 696         tv->index = (tv->index + 1) & TVN_MASK;
 697 }
 698
 699 static inline void run_timer_list(void)
 700 {
 701         spin_lock_irq(&timerlist_lock);
 702         while ((long)(jiffies - timer_jiffies) >= 0) {
 703                 struct timer_list *timer;
 704                 if (!tv1.index) {
 705                         int n = 1;
 706                         do {
 707                                 cascade_timers(tvecs[n]);
 708                         } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
 709                 }
 710                 while ((timer = tv1.vec[tv1.index])) {
 711                         void (*fn)(unsigned long) = timer->function;
 712                         unsigned long data = timer->data;
 713                         detach_timer(timer);
 714                         timer->next = timer->prev = NULL;
 715                         spin_unlock_irq(&timerlist_lock);
 716                         fn(data);
 717                         spin_lock_irq(&timerlist_lock);
 718                 }
 719                 ++timer_jiffies;
 720                 tv1.index = (tv1.index + 1) & TVR_MASK;
 721         }
 722         spin_unlock_irq(&timerlist_lock);
 723 }
 724
 725
 726 static inline void run_old_timers(void)
 727 {
 728         struct timer_struct *tp;
 729         unsigned long mask;
 730
 731         for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
 732                 if (mask > timer_active)
 733                         break;
 734                 if (!(mask & timer_active))
 735                         continue;
 736                 if (tp->expires > jiffies)
 737                         continue;
 738                 timer_active &= ~mask;
 739                 tp->fn();
 740                 sti();
 741         }
 742 }
 743
 744 spinlock_t tqueue_lock;
 745
 746 void tqueue_bh(void)
 747 {
 748         run_task_queue(&tq_timer);
 749 }
 750
 751 void immediate_bh(void)
 752 {
 753         run_task_queue(&tq_immediate);
 754 }
 755
 756 unsigned long timer_active = 0;
 757 struct timer_struct timer_table[32];
 758
 759 /*
 760  * Hmm.. Changed this, as the GNU make sources (load.c) seems to
 761  * imply that avenrun[] is the standard name for this kind of thing.
 762  * Nothing else seems to be standardized: the fractional size etc
 763  * all seem to differ on different machines.
 764  */
 765 unsigned long avenrun[3] = { 0,0,0 };
 766
 767 /*
 768  * Nr of active tasks - counted in fixed-point numbers
 769  */
 770 static unsigned long count_active_tasks(void)
 771 {
 772         struct task_struct *p;
 773         unsigned long nr = 0;
 774
 775         read_lock(&tasklist_lock);
 776         for_each_task(p) {
 777                 if (p->pid &&
 778                     (p->state == TASK_RUNNING ||
 779                      p->state == TASK_UNINTERRUPTIBLE ||
 780                      p->state == TASK_SWAPPING))
 781                         nr += FIXED_1;
 782         }
 783         read_unlock(&tasklist_lock);
 784         return nr;
 785 }
 786
 787 static inline void calc_load(unsigned long ticks)
 788 {
 789         unsigned long active_tasks; /* fixed-point */
 790         static int count = LOAD_FREQ;
 791
 792         count -= ticks;
 793         if (count < 0) {
 794                 count += LOAD_FREQ;
 795                 active_tasks = count_active_tasks();
 796                 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 797                 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 798                 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
 799         }
 800 }
 801
 802 /*
 803  * this routine handles the overflow of the microsecond field
 804  *
 805  * The tricky bits of code to handle the accurate clock support
 806  * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 807  * They were originally developed for SUN and DEC kernels.
 808  * All the kudos should go to Dave for this stuff.
 809  *
 810  */
 811 static void second_overflow(void)
 812 {
 813     long ltemp;
 814
 815     /* Bump the maxerror field */
 816     time_maxerror += time_tolerance >> SHIFT_USEC;
 817     if ( time_maxerror > MAXPHASE )
 818         time_maxerror = MAXPHASE;
 819
 820     /*
 821      * Leap second processing. If in leap-insert state at
 822      * the end of the day, the system clock is set back one
 823      * second; if in leap-delete state, the system clock is
 824      * set ahead one second. The microtime() routine or
 825      * external clock driver will insure that reported time
 826      * is always monotonic. The ugly divides should be
 827      * replaced.
 828      */
 829     switch (time_state) {
 830
 831     case TIME_OK:
 832         if (time_status & STA_INS)
 833             time_state = TIME_INS;
 834         else if (time_status & STA_DEL)
 835             time_state = TIME_DEL;
 836         break;
 837
 838     case TIME_INS:
 839         if (xtime.tv_sec % 86400 == 0) {
 840             xtime.tv_sec--;
 841             time_state = TIME_OOP;
 842             printk("Clock: inserting leap second 23:59:60 UTC\n");
 843         }
 844         break;
 845
 846     case TIME_DEL:
 847         if ((xtime.tv_sec + 1) % 86400 == 0) {
 848             xtime.tv_sec++;
 849             time_state = TIME_WAIT;
 850             printk("Clock: deleting leap second 23:59:59 UTC\n");
 851         }
 852         break;
 853
 854     case TIME_OOP:
 855         time_state = TIME_WAIT;
 856         break;
 857
 858     case TIME_WAIT:
 859         if (!(time_status & (STA_INS | STA_DEL)))
 860             time_state = TIME_OK;
 861     }
 862
 863     /*
 864      * Compute the phase adjustment for the next second. In
 865      * PLL mode, the offset is reduced by a fixed factor
 866      * times the time constant. In FLL mode the offset is
 867      * used directly. In either mode, the maximum phase
 868      * adjustment for each second is clamped so as to spread
 869      * the adjustment over not more than the number of
 870      * seconds between updates.
 871      */
 872     if (time_offset < 0) {
 873         ltemp = -time_offset;
 874         if (!(time_status & STA_FLL))
 875             ltemp >>= SHIFT_KG + time_constant;
 876         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 877             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 878         time_offset += ltemp;
 879         time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 880     } else {
 881         ltemp = time_offset;
 882         if (!(time_status & STA_FLL))
 883             ltemp >>= SHIFT_KG + time_constant;
 884         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 885             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 886         time_offset -= ltemp;
 887         time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 888     }
 889
 890     /*
 891      * Compute the frequency estimate and additional phase
 892      * adjustment due to frequency error for the next
 893      * second. When the PPS signal is engaged, gnaw on the
 894      * watchdog counter and update the frequency computed by
 895      * the pll and the PPS signal.
 896      */
 897     pps_valid++;
 898     if (pps_valid == PPS_VALID) {
 899         pps_jitter = MAXTIME;
 900         pps_stabil = MAXFREQ;
 901         time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
 902                          STA_PPSWANDER | STA_PPSERROR);
 903     }
 904     ltemp = time_freq + pps_freq;
 905     if (ltemp < 0)
 906         time_adj -= -ltemp >>
 907             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 908     else
 909         time_adj += ltemp >>
 910             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 911
 912 #if HZ == 100
 913     /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
 914     if (time_adj < 0)
 915         time_adj -= -time_adj >> 2;
 916     else
 917         time_adj += time_adj >> 2;
 918 #endif
 919 }
 920
 921 /* in the NTP reference this is called "hardclock()" */
 922 static void update_wall_time_one_tick(void)
 923 {
 924         /*
 925          * Advance the phase, once it gets to one microsecond, then
 926          * advance the tick more.
 927          */
 928         time_phase += time_adj;
 929         if (time_phase <= -FINEUSEC) {
 930                 long ltemp = -time_phase >> SHIFT_SCALE;
 931                 time_phase += ltemp << SHIFT_SCALE;
 932                 xtime.tv_usec += tick + time_adjust_step - ltemp;
 933         }
 934         else if (time_phase >= FINEUSEC) {
 935                 long ltemp = time_phase >> SHIFT_SCALE;
 936                 time_phase -= ltemp << SHIFT_SCALE;
 937                 xtime.tv_usec += tick + time_adjust_step + ltemp;
 938         } else
 939                 xtime.tv_usec += tick + time_adjust_step;
 940
 941         if (time_adjust) {
 942             /* We are doing an adjtime thing.
 943              *
 944              * Modify the value of the tick for next time.
 945              * Note that a positive delta means we want the clock
 946              * to run fast. This means that the tick should be bigger
 947              *
 948              * Limit the amount of the step for *next* tick to be
 949              * in the range -tickadj .. +tickadj
 950              */
 951              if (time_adjust > tickadj)
 952                 time_adjust_step = tickadj;
 953              else if (time_adjust < -tickadj)
 954                 time_adjust_step = -tickadj;
 955              else
 956                 time_adjust_step = time_adjust;
 957
 958             /* Reduce by this step the amount of time left  */
 959             time_adjust -= time_adjust_step;
 960         }
 961         else
 962             time_adjust_step = 0;
 963 }
 964
 965 /*
 966  * Using a loop looks inefficient, but "ticks" is
 967  * usually just one (we shouldn't be losing ticks,
 968  * we're doing this this way mainly for interrupt
 969  * latency reasons, not because we think we'll
 970  * have lots of lost timer ticks
 971  */
 972 static void update_wall_time(unsigned long ticks)
 973 {
 974         do {
 975                 ticks--;
 976                 update_wall_time_one_tick();
 977         } while (ticks);
 978
 979         if (xtime.tv_usec >= 1000000) {
 980             xtime.tv_usec -= 1000000;
 981             xtime.tv_sec++;
 982             second_overflow();
 983         }
 984 }
 985
 986 static inline void do_process_times(struct task_struct *p,
 987         unsigned long user, unsigned long system)
 988 {
 989         long psecs;
 990
 991         psecs = (p->times.tms_utime += user);
 992         psecs += (p->times.tms_stime += system);
 993         if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
 994                 /* Send SIGXCPU every second.. */
 995                 if (!(psecs % HZ))
 996                         send_sig(SIGXCPU, p, 1);
 997                 /* and SIGKILL when we go over max.. */
 998                 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
 999                         send_sig(SIGKILL, p, 1);
1000         }
1001 }
1002
1003 static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
1004 {
1005         unsigned long it_virt = p->it_virt_value;
1006
1007         if (it_virt) {
1008                 if (it_virt <= ticks) {
1009                         it_virt = ticks + p->it_virt_incr;
1010                         send_sig(SIGVTALRM, p, 1);
1011                 }
1012                 p->it_virt_value = it_virt - ticks;
1013         }
1014 }
1015
1016 static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
1017 {
1018         unsigned long it_prof = p->it_prof_value;
1019
1020         if (it_prof) {
1021                 if (it_prof <= ticks) {
1022                         it_prof = ticks + p->it_prof_incr;
1023                         send_sig(SIGPROF, p, 1);
1024                 }
1025                 p->it_prof_value = it_prof - ticks;
1026         }
1027 }
1028
1029 void update_one_process(struct task_struct *p,
1030         unsigned long ticks, unsigned long user, unsigned long system)
1031 {
1032         do_process_times(p, user, system);
1033         do_it_virt(p, user);
1034         do_it_prof(p, ticks);
1035 }
1036
1037 static void update_process_times(unsigned long ticks, unsigned long system)
1038 {
1039 /*
1040  * SMP does this on a per-CPU basis elsewhere
1041  */
1042 #ifndef  __SMP__
1043         struct task_struct * p = current;
1044         unsigned long user = ticks - system;
1045         if (p->pid) {
1046                 p->counter -= ticks;
1047                 if (p->counter < 0) {
1048                         p->counter = 0;
1049                         need_resched = 1;
1050                 }
1051                 if (p->priority < DEF_PRIORITY)
1052                         kstat.cpu_nice += user;
1053                 else
1054                         kstat.cpu_user += user;
1055                 kstat.cpu_system += system;
1056         }
1057         update_one_process(p, ticks, user, system);
1058 #endif
1059 }
1060
1061 volatile unsigned long lost_ticks = 0;
1062 static unsigned long lost_ticks_system = 0;
1063
1064 static inline void update_times(void)
1065 {
1066         unsigned long ticks;
1067         unsigned long flags;
1068
1069         save_flags(flags);
1070         cli();
1071
1072         ticks = lost_ticks;
1073         lost_ticks = 0;
1074
1075         if (ticks) {
1076                 unsigned long system;
1077                 system = xchg(&lost_ticks_system, 0);
1078
1079                 calc_load(ticks);
1080                 update_wall_time(ticks);
1081                 restore_flags(flags);
1082
1083                 update_process_times(ticks, system);
1084
1085         } else
1086                 restore_flags(flags);
1087 }
1088
1089 static void timer_bh(void)
1090 {
1091         update_times();
1092         run_old_timers();
1093         run_timer_list();
1094 }
1095
1096 void do_timer(struct pt_regs * regs)
1097 {
1098         (*(unsigned long *)&jiffies)++;
1099         lost_ticks++;
1100         mark_bh(TIMER_BH);
1101         if (!user_mode(regs))
1102                 lost_ticks_system++;
1103         if (tq_timer)
1104                 mark_bh(TQUEUE_BH);
1105 }
1106
1107 #ifndef __alpha__
1108
1109 /*
1110  * For backwards compatibility?  This can be done in libc so Alpha
1111  * and all newer ports shouldn't need it.
1112  */
1113 asmlinkage unsigned int sys_alarm(unsigned int seconds)
1114 {
1115         struct itimerval it_new, it_old;
1116         unsigned int oldalarm;
1117
1118         it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
1119         it_new.it_value.tv_sec = seconds;
1120         it_new.it_value.tv_usec = 0;
1121         _setitimer(ITIMER_REAL, &it_new, &it_old);
1122         oldalarm = it_old.it_value.tv_sec;
1123         /* ehhh.. We can't return 0 if we have an alarm pending.. */
1124         /* And we'd better return too much than too little anyway */
1125         if (it_old.it_value.tv_usec)
1126                 oldalarm++;
1127         return oldalarm;
1128 }
1129
1130 /*
1131  * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
1132  * should be moved into arch/i386 instead?
1133  */
1134
1135 asmlinkage int sys_getpid(void)
1136 {
1137         /* This is SMP safe - current->pid doesnt change */
1138         return current->pid;
1139 }
1140
1141 /*
1142  * This is not strictly SMP safe: p_opptr could change
1143  * from under us. However, rather than getting any lock
1144  * we can use an optimistic algorithm: get the parent
1145  * pid, and go back and check that the parent is still
1146  * the same. If it has changed (which is extremely unlikely
1147  * indeed), we just try again..
1148  *
1149  * NOTE! This depends on the fact that even if we _do_
1150  * get an old value of "parent", we can happily dereference
1151  * the pointer: we just can't necessarily trust the result
1152  * until we know that the parent pointer is valid.
1153  *
1154  * The "mb()" macro is a memory barrier - a synchronizing
1155  * event. It also makes sure that gcc doesn't optimize
1156  * away the necessary memory references.. The barrier doesn't
1157  * have to have all that strong semantics: on x86 we don't
1158  * really require a synchronizing instruction, for example.
1159  * The barrier is more important for code generation than
1160  * for any real memory ordering semantics (even if there is
1161  * a small window for a race, using the old pointer is
1162  * harmless for a while).
1163  */
1164 asmlinkage int sys_getppid(void)
1165 {
1166         int pid;
1167         struct task_struct * me = current;
1168         struct task_struct * parent;
1169
1170         parent = me->p_opptr;
1171         for (;;) {
1172                 pid = parent->pid;
1173 #if __SMP__
1174 {
1175                 struct task_struct *old = parent;
1176                 mb();
1177                 parent = me->p_opptr;
1178                 if (old != parent)
1179                         continue;
1180 }
1181 #endif
1182                 break;
1183         }
1184         return pid;
1185 }
1186
1187 asmlinkage int sys_getuid(void)
1188 {
1189         /* Only we change this so SMP safe */
1190         return current->uid;
1191 }
1192
1193 asmlinkage int sys_geteuid(void)
1194 {
1195         /* Only we change this so SMP safe */
1196         return current->euid;
1197 }
1198
1199 asmlinkage int sys_getgid(void)
1200 {
1201         /* Only we change this so SMP safe */
1202         return current->gid;
1203 }
1204
1205 asmlinkage int sys_getegid(void)
1206 {
1207         /* Only we change this so SMP safe */
1208         return  current->egid;
1209 }
1210
1211 /*
1212  * This has been replaced by sys_setpriority.  Maybe it should be
1213  * moved into the arch dependent tree for those ports that require
1214  * it for backward compatibility?
1215  */
1216
1217 asmlinkage int sys_nice(int increment)
1218 {
1219         unsigned long newprio;
1220         int increase = 0;
1221
1222         /*
1223          *      Setpriority might change our priority at the same moment.
1224          *      We don't have to worry. Conceptually one call occurs first
1225          *      and we have a single winner.
1226          */
1227
1228         newprio = increment;
1229         if (increment < 0) {
1230                 if (!suser())
1231                         return -EPERM;
1232                 newprio = -increment;
1233                 increase = 1;
1234         }
1235
1236         if (newprio > 40)
1237                 newprio = 40;
1238         /*
1239          * do a "normalization" of the priority (traditionally
1240          * unix nice values are -20..20, linux doesn't really
1241          * use that kind of thing, but uses the length of the
1242          * timeslice instead (default 150 msec). The rounding is
1243          * why we want to avoid negative values.
1244          */
1245         newprio = (newprio * DEF_PRIORITY + 10) / 20;
1246         increment = newprio;
1247         if (increase)
1248                 increment = -increment;
1249         /*
1250          *      Current->priority can change between this point
1251          *      and the assignment. We are assigning not doing add/subs
1252          *      so thats ok. Conceptually a process might just instantaneously
1253          *      read the value we stomp over. I don't think that is an issue
1254          *      unless posix makes it one. If so we can loop on changes
1255          *      to current->priority.
1256          */
1257         newprio = current->priority - increment;
1258         if ((signed) newprio < 1)
1259                 newprio = 1;
1260         if (newprio > DEF_PRIORITY*2)
1261                 newprio = DEF_PRIORITY*2;
1262         current->priority = newprio;
1263         return 0;
1264 }
1265
1266 #endif
1267
1268 static inline struct task_struct *find_process_by_pid(pid_t pid)
1269 {
1270         if (pid)
1271                 return find_task_by_pid(pid);
1272         else
1273                 return current;
1274 }
1275
1276 static int setscheduler(pid_t pid, int policy,
1277                         struct sched_param *param)
1278 {
1279         struct sched_param lp;
1280         struct task_struct *p;
1281
1282         if (!param || pid < 0)
1283                 return -EINVAL;
1284
1285         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
1286                 return -EFAULT;
1287
1288         p = find_process_by_pid(pid);
1289         if (!p)
1290                 return -ESRCH;
1291
1292         if (policy < 0)
1293                 policy = p->policy;
1294         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
1295                  policy != SCHED_OTHER)
1296                 return -EINVAL;
1297
1298         /*
1299          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
1300          * priority for SCHED_OTHER is 0.
1301          */
1302         if (lp.sched_priority < 0 || lp.sched_priority > 99)
1303                 return -EINVAL;
1304         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
1305                 return -EINVAL;
1306
1307         if ((policy == SCHED_FIFO || policy == SCHED_RR) && !suser())
1308                 return -EPERM;
1309         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1310             !suser())
1311                 return -EPERM;
1312
1313         p->policy = policy;
1314         p->rt_priority = lp.sched_priority;
1315         spin_lock(&scheduler_lock);
1316         spin_lock_irq(&runqueue_lock);
1317         if (p->next_run)
1318                 move_last_runqueue(p);
1319         spin_unlock_irq(&runqueue_lock);
1320         spin_unlock(&scheduler_lock);
1321         need_resched = 1;
1322         return 0;
1323 }
1324
1325 asmlinkage int sys_sched_setscheduler(pid_t pid, int policy,
1326                                       struct sched_param *param)
1327 {
1328         return setscheduler(pid, policy, param);
1329 }
1330
1331 asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param)
1332 {
1333         return setscheduler(pid, -1, param);
1334 }
1335
1336 asmlinkage int sys_sched_getscheduler(pid_t pid)
1337 {
1338         struct task_struct *p;
1339
1340         if (pid < 0)
1341                 return -EINVAL;
1342
1343         p = find_process_by_pid(pid);
1344         if (!p)
1345                 return -ESRCH;
1346
1347         return p->policy;
1348 }
1349
1350 asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param)
1351 {
1352         struct task_struct *p;
1353         struct sched_param lp;
1354
1355         if (!param || pid < 0)
1356                 return -EINVAL;
1357
1358         p = find_process_by_pid(pid);
1359         if (!p)
1360                 return -ESRCH;
1361
1362         lp.sched_priority = p->rt_priority;
1363         return copy_to_user(param, &lp, sizeof(struct sched_param)) ? -EFAULT : 0;
1364 }
1365
1366 asmlinkage int sys_sched_yield(void)
1367 {
1368         /*
1369          * This is not really right. We'd like to reschedule
1370          * just _once_ with this process having a zero count.
1371          */
1372         current->counter = 0;
1373         spin_lock(&scheduler_lock);
1374         spin_lock_irq(&runqueue_lock);
1375         move_last_runqueue(current);
1376         spin_unlock_irq(&runqueue_lock);
1377         spin_unlock(&scheduler_lock);
1378         need_resched = 1;
1379         return 0;
1380 }
1381
1382 asmlinkage int sys_sched_get_priority_max(int policy)
1383 {
1384         int ret = -EINVAL;
1385
1386         switch (policy) {
1387         case SCHED_FIFO:
1388         case SCHED_RR:
1389                 ret = 99;
1390                 break;
1391         case SCHED_OTHER:
1392                 ret = 0;
1393                 break;
1394         }
1395         return ret;
1396 }
1397
1398 asmlinkage int sys_sched_get_priority_min(int policy)
1399 {
1400         int ret = -EINVAL;
1401
1402         switch (policy) {
1403         case SCHED_FIFO:
1404         case SCHED_RR:
1405                 ret = 1;
1406                 break;
1407         case SCHED_OTHER:
1408                 ret = 0;
1409         }
1410         return ret;
1411 }
1412
1413 asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1414 {
1415         struct timespec t;
1416
1417         t.tv_sec = 0;
1418         t.tv_nsec = 150000;
1419         if (copy_to_user(interval, &t, sizeof(struct timespec)))
1420                 return -EFAULT;
1421         return 0;
1422 }
1423
1424 /*
1425  * change timeval to jiffies, trying to avoid the
1426  * most obvious overflows..
1427  */
1428 static unsigned long timespectojiffies(struct timespec *value)
1429 {
1430         unsigned long sec = (unsigned) value->tv_sec;
1431         long nsec = value->tv_nsec;
1432
1433         if (sec > (LONG_MAX / HZ))
1434                 return LONG_MAX;
1435         nsec += 1000000000L / HZ - 1;
1436         nsec /= 1000000000L / HZ;
1437         return HZ * sec + nsec;
1438 }
1439
1440 static void jiffiestotimespec(unsigned long jiffies, struct timespec *value)
1441 {
1442         value->tv_nsec = (jiffies % HZ) * (1000000000L / HZ);
1443         value->tv_sec = jiffies / HZ;
1444 }
1445
1446 asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
1447 {
1448         struct timespec t;
1449         unsigned long expire;
1450
1451         if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
1452                 return -EFAULT;
1453
1454         if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
1455                 return -EINVAL;
1456
1457
1458         if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
1459             current->policy != SCHED_OTHER)
1460         {
1461                 /*
1462                  * Short delay requests up to 2 ms will be handled with
1463                  * high precision by a busy wait for all real-time processes.
1464                  *
1465                  * Its important on SMP not to do this holding locks.
1466                  */
1467                 udelay((t.tv_nsec + 999) / 1000);
1468                 return 0;
1469         }
1470
1471         expire = timespectojiffies(&t) + (t.tv_sec || t.tv_nsec) + jiffies;
1472
1473         current->timeout = expire;
1474         current->state = TASK_INTERRUPTIBLE;
1475         schedule();
1476
1477         if (expire > jiffies) {
1478                 if (rmtp) {
1479                         jiffiestotimespec(expire - jiffies -
1480                                           (expire > jiffies + 1), &t);
1481                         if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
1482                                 return -EFAULT;
1483                 }
1484                 return -EINTR;
1485         }
1486         return 0;
1487 }
1488
1489 static void show_task(int nr,struct task_struct * p)
1490 {
1491         unsigned long free = 0;
1492         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1493
1494         printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
1495         if (((unsigned) p->state) < sizeof(stat_nam)/sizeof(char *))
1496                 printk(stat_nam[p->state]);
1497         else
1498                 printk(" ");
1499 #if ((~0UL) == 0xffffffff)
1500         if (p == current)
1501                 printk(" current  ");
1502         else
1503                 printk(" %08lX ", thread_saved_pc(&p->tss));
1504 #else
1505         if (p == current)
1506                 printk("   current task   ");
1507         else
1508                 printk(" %016lx ", thread_saved_pc(&p->tss));
1509 #endif
1510 #if 0
1511         for (free = 1; free < PAGE_SIZE/sizeof(long) ; free++) {
1512                 if (((unsigned long *)p->kernel_stack_page)[free])
1513                         break;
1514         }
1515 #endif
1516         printk("%5lu %5d %6d ", free*sizeof(long), p->pid, p->p_pptr->pid);
1517         if (p->p_cptr)
1518                 printk("%5d ", p->p_cptr->pid);
1519         else
1520                 printk("      ");
1521         if (p->p_ysptr)
1522                 printk("%7d", p->p_ysptr->pid);
1523         else
1524                 printk("       ");
1525         if (p->p_osptr)
1526                 printk(" %5d\n", p->p_osptr->pid);
1527         else
1528                 printk("\n");
1529 }
1530
1531 void show_state(void)
1532 {
1533         struct task_struct *p;
1534
1535 #if ((~0UL) == 0xffffffff)
1536         printk("\n"
1537                "                         free                        sibling\n");
1538         printk("  task             PC    stack   pid father child younger older\n");
1539 #else
1540         printk("\n"
1541                "                                 free                        sibling\n");
1542         printk("  task                 PC        stack   pid father child younger older\n");
1543 #endif
1544         read_lock(&tasklist_lock);
1545         for_each_task(p)
1546                 show_task((p->tarray_ptr - &task[0]),p);
1547         read_unlock(&tasklist_lock);
1548 }
1549
1550 __initfunc(void sched_init(void))
1551 {
1552         /*
1553          *      We have to do a little magic to get the first
1554          *      process right in SMP mode.
1555          */
1556         int cpu=hard_smp_processor_id();
1557         int nr = NR_TASKS;
1558
1559         init_task.processor=cpu;
1560
1561         /* Init task array free list and pidhash table. */
1562         while(--nr > 0)
1563                 add_free_taskslot(&task[nr]);
1564
1565         for(nr = 0; nr < PIDHASH_SZ; nr++)
1566                 pidhash[nr] = NULL;
1567
1568         init_bh(TIMER_BH, timer_bh);
1569         init_bh(TQUEUE_BH, tqueue_bh);
1570         init_bh(IMMEDIATE_BH, immediate_bh);
1571 }