kernel/sched.c

   1 /*
   2  *  linux/kernel/sched.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  *
   6  *  1996-04-21  Modified by Ulrich Windl to make NTP work
   7  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   8  *              make semaphores SMP safe
   9  *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
  10  */
  11
  12 /*
  13  * 'sched.c' is the main kernel file. It contains scheduling primitives
  14  * (sleep_on, wakeup, schedule etc) as well as a number of simple system
  15  * call functions (type getpid()), which just extract a field from
  16  * current-task
  17  */
  18
  19 #include <linux/signal.h>
  20 #include <linux/sched.h>
  21 #include <linux/timer.h>
  22 #include <linux/kernel.h>
  23 #include <linux/kernel_stat.h>
  24 #include <linux/fdreg.h>
  25 #include <linux/errno.h>
  26 #include <linux/time.h>
  27 #include <linux/ptrace.h>
  28 #include <linux/delay.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/tqueue.h>
  31 #include <linux/resource.h>
  32 #include <linux/mm.h>
  33 #include <linux/smp.h>
  34 #include <linux/smp_lock.h>
  35 #include <linux/init.h>
  36
  37 #include <asm/system.h>
  38 #include <asm/io.h>
  39 #include <asm/uaccess.h>
  40 #include <asm/pgtable.h>
  41 #include <asm/mmu_context.h>
  42 #include <asm/spinlock.h>
  43
  44 #include <linux/timex.h>
  45
  46 /*
  47  * kernel variables
  48  */
  49
  50 unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
  51
  52 long tick = (1000000 + HZ/2) / HZ;      /* timer interrupt period */
  53
  54 /* The current time */
  55 volatile struct timeval xtime __attribute__ ((aligned (16)));
  56
  57 /* Don't completely fail for HZ > 500.  */
  58 int tickadj = 500/HZ ? : 1;             /* microsecs */
  59
  60 DECLARE_TASK_QUEUE(tq_timer);
  61 DECLARE_TASK_QUEUE(tq_immediate);
  62 DECLARE_TASK_QUEUE(tq_scheduler);
  63
  64 /*
  65  * phase-lock loop variables
  66  */
  67 /* TIME_ERROR prevents overwriting the CMOS clock */
  68 int time_state = TIME_ERROR;    /* clock synchronization status */
  69 int time_status = STA_UNSYNC;   /* clock status bits */
  70 long time_offset = 0;           /* time adjustment (us) */
  71 long time_constant = 2;         /* pll time constant */
  72 long time_tolerance = MAXFREQ;  /* frequency tolerance (ppm) */
  73 long time_precision = 1;        /* clock precision (us) */
  74 long time_maxerror = MAXPHASE;  /* maximum error (us) */
  75 long time_esterror = MAXPHASE;  /* estimated error (us) */
  76 long time_phase = 0;            /* phase offset (scaled us) */
  77 long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;  /* frequency offset (scaled ppm) */
  78 long time_adj = 0;              /* tick adjust (scaled 1 / HZ) */
  79 long time_reftime = 0;          /* time at last adjustment (s) */
  80
  81 long time_adjust = 0;
  82 long time_adjust_step = 0;
  83
  84 unsigned long event = 0;
  85
  86 extern int do_setitimer(int, struct itimerval *, struct itimerval *);
  87 unsigned int * prof_buffer = NULL;
  88 unsigned long prof_len = 0;
  89 unsigned long prof_shift = 0;
  90
  91 extern void mem_use(void);
  92
  93 unsigned long volatile jiffies=0;
  94
  95 /*
  96  *      Init task must be ok at boot for the ix86 as we will check its signals
  97  *      via the SMP irq return path.
  98  */
  99
 100 struct task_struct * task[NR_TASKS] = {&init_task, };
 101
 102 struct kernel_stat kstat = { 0 };
 103
 104 void scheduling_functions_start_here(void) { }
 105
 106 static inline void reschedule_idle(struct task_struct * p)
 107 {
 108
 109         /*
 110          * For SMP, we try to see if the CPU the task used
 111          * to run on is idle..
 112          */
 113 #if 0
 114         /*
 115          * Disable this for now. Ingo has some interesting
 116          * code that looks too complex, and I have some ideas,
 117          * but in the meantime.. One problem is that "wakeup()"
 118          * can be (and is) called before we've even initialized
 119          * SMP completely, so..
 120          */
 121 #ifdef __SMP__
 122         int want_cpu = p->processor;
 123
 124         /*
 125          * Don't even try to find another CPU for us if the task
 126          * ran on this one before..
 127          */
 128         if (want_cpu != smp_processor_id()) {
 129                 struct task_struct **idle = task;
 130                 int i = smp_num_cpus;
 131
 132                 do {
 133                         struct task_struct *tsk = *idle;
 134                         idle++;
 135                         /* Something like this.. */
 136                         if (tsk->has_cpu && tsk->processor == want_cpu) {
 137                                 tsk->need_resched = 1;
 138                                 smp_send_reschedule(want_cpu);
 139                                 return;
 140                         }
 141                 } while (--i > 0);
 142         }
 143 #endif
 144 #endif
 145         if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
 146                 current->need_resched = 1;
 147 }
 148
 149 /*
 150  * Careful!
 151  *
 152  * This has to add the process to the _beginning_ of the
 153  * run-queue, not the end. See the comment about "This is
 154  * subtle" in the scheduler proper..
 155  */
 156 static inline void add_to_runqueue(struct task_struct * p)
 157 {
 158         struct task_struct *next = init_task.next_run;
 159
 160         p->prev_run = &init_task;
 161         init_task.next_run = p;
 162         p->next_run = next;
 163         next->prev_run = p;
 164 }
 165
 166 static inline void del_from_runqueue(struct task_struct * p)
 167 {
 168         struct task_struct *next = p->next_run;
 169         struct task_struct *prev = p->prev_run;
 170
 171         nr_running--;
 172         next->prev_run = prev;
 173         prev->next_run = next;
 174         p->next_run = NULL;
 175         p->prev_run = NULL;
 176 }
 177
 178 static inline void move_last_runqueue(struct task_struct * p)
 179 {
 180         struct task_struct *next = p->next_run;
 181         struct task_struct *prev = p->prev_run;
 182
 183         /* remove from list */
 184         next->prev_run = prev;
 185         prev->next_run = next;
 186         /* add back to list */
 187         p->next_run = &init_task;
 188         prev = init_task.prev_run;
 189         init_task.prev_run = p;
 190         p->prev_run = prev;
 191         prev->next_run = p;
 192 }
 193
 194 static inline void move_first_runqueue(struct task_struct * p)
 195 {
 196         struct task_struct *next = p->next_run;
 197         struct task_struct *prev = p->prev_run;
 198
 199         /* remove from list */
 200         next->prev_run = prev;
 201         prev->next_run = next;
 202         /* add back to list */
 203         p->prev_run = &init_task;
 204         next = init_task.next_run;
 205         init_task.next_run = p;
 206         p->next_run = next;
 207         next->prev_run = p;
 208 }
 209
 210 /*
 211  * The tasklist_lock protects the linked list of processes.
 212  *
 213  * The scheduler lock is protecting against multiple entry
 214  * into the scheduling code, and doesn't need to worry
 215  * about interrupts (because interrupts cannot call the
 216  * scheduler).
 217  *
 218  * The run-queue lock locks the parts that actually access
 219  * and change the run-queues, and have to be interrupt-safe.
 220  */
 221 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; /* should be acquired first */
 222 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
 223 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;      /* third */
 224
 225 /*
 226  * Wake up a process. Put it on the run-queue if it's not
 227  * already there.  The "current" process is always on the
 228  * run-queue (except when the actual re-schedule is in
 229  * progress), and as such you're allowed to do the simpler
 230  * "current->state = TASK_RUNNING" to mark yourself runnable
 231  * without the overhead of this.
 232  */
 233 void wake_up_process(struct task_struct * p)
 234 {
 235         unsigned long flags;
 236
 237         spin_lock_irqsave(&runqueue_lock, flags);
 238         p->state = TASK_RUNNING;
 239         if (!p->next_run) {
 240                 add_to_runqueue(p);
 241                 reschedule_idle(p);
 242                 nr_running++;
 243         }
 244         spin_unlock_irqrestore(&runqueue_lock, flags);
 245 }
 246
 247 static void process_timeout(unsigned long __data)
 248 {
 249         struct task_struct * p = (struct task_struct *) __data;
 250
 251         wake_up_process(p);
 252 }
 253
 254 /*
 255  * This is the function that decides how desirable a process is..
 256  * You can weigh different processes against each other depending
 257  * on what CPU they've run on lately etc to try to handle cache
 258  * and TLB miss penalties.
 259  *
 260  * Return values:
 261  *       -1000: never select this
 262  *           0: out of time, recalculate counters (but it might still be
 263  *              selected)
 264  *         +ve: "goodness" value (the larger, the better)
 265  *       +1000: realtime process, select this.
 266  */
 267 static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu)
 268 {
 269         int policy = p->policy;
 270         int weight;
 271
 272         if (policy & SCHED_YIELD) {
 273                 p->policy = policy & ~SCHED_YIELD;
 274                 return 0;
 275         }
 276
 277         /*
 278          * Realtime process, select the first one on the
 279          * runqueue (taking priorities within processes
 280          * into account).
 281          */
 282         if (policy != SCHED_OTHER)
 283                 return 1000 + p->rt_priority;
 284
 285         /*
 286          * Give the process a first-approximation goodness value
 287          * according to the number of clock-ticks it has left.
 288          *
 289          * Don't do any other calculations if the time slice is
 290          * over..
 291          */
 292         weight = p->counter;
 293         if (weight) {
 294
 295 #ifdef __SMP__
 296                 /* Give a largish advantage to the same processor...   */
 297                 /* (this is equivalent to penalizing other processors) */
 298                 if (p->processor == this_cpu)
 299                         weight += PROC_CHANGE_PENALTY;
 300 #endif
 301
 302                 /* .. and a slight advantage to the current thread */
 303                 if (p->mm == prev->mm)
 304                         weight += 1;
 305                 weight += p->priority;
 306         }
 307
 308         return weight;
 309 }
 310
 311 /*
 312  * Event timer code
 313  */
 314 #define TVN_BITS 6
 315 #define TVR_BITS 8
 316 #define TVN_SIZE (1 << TVN_BITS)
 317 #define TVR_SIZE (1 << TVR_BITS)
 318 #define TVN_MASK (TVN_SIZE - 1)
 319 #define TVR_MASK (TVR_SIZE - 1)
 320
 321 struct timer_vec {
 322         int index;
 323         struct timer_list *vec[TVN_SIZE];
 324 };
 325
 326 struct timer_vec_root {
 327         int index;
 328         struct timer_list *vec[TVR_SIZE];
 329 };
 330
 331 static struct timer_vec tv5 = { 0 };
 332 static struct timer_vec tv4 = { 0 };
 333 static struct timer_vec tv3 = { 0 };
 334 static struct timer_vec tv2 = { 0 };
 335 static struct timer_vec_root tv1 = { 0 };
 336
 337 static struct timer_vec * const tvecs[] = {
 338         (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
 339 };
 340
 341 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
 342
 343 static unsigned long timer_jiffies = 0;
 344
 345 static inline void insert_timer(struct timer_list *timer,
 346                                 struct timer_list **vec, int idx)
 347 {
 348         if ((timer->next = vec[idx]))
 349                 vec[idx]->prev = timer;
 350         vec[idx] = timer;
 351         timer->prev = (struct timer_list *)&vec[idx];
 352 }
 353
 354 static inline void internal_add_timer(struct timer_list *timer)
 355 {
 356         /*
 357          * must be cli-ed when calling this
 358          */
 359         unsigned long expires = timer->expires;
 360         unsigned long idx = expires - timer_jiffies;
 361
 362         if (idx < TVR_SIZE) {
 363                 int i = expires & TVR_MASK;
 364                 insert_timer(timer, tv1.vec, i);
 365         } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
 366                 int i = (expires >> TVR_BITS) & TVN_MASK;
 367                 insert_timer(timer, tv2.vec, i);
 368         } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
 369                 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
 370                 insert_timer(timer, tv3.vec, i);
 371         } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
 372                 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
 373                 insert_timer(timer, tv4.vec, i);
 374         } else if ((signed long) idx < 0) {
 375                 /* can happen if you add a timer with expires == jiffies,
 376                  * or you set a timer to go off in the past
 377                  */
 378                 insert_timer(timer, tv1.vec, tv1.index);
 379         } else if (idx <= 0xffffffffUL) {
 380                 int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 381                 insert_timer(timer, tv5.vec, i);
 382         } else {
 383                 /* Can only get here on architectures with 64-bit jiffies */
 384                 timer->next = timer->prev = timer;
 385         }
 386 }
 387
 388 spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
 389
 390 void add_timer(struct timer_list *timer)
 391 {
 392         unsigned long flags;
 393
 394         spin_lock_irqsave(&timerlist_lock, flags);
 395         internal_add_timer(timer);
 396         spin_unlock_irqrestore(&timerlist_lock, flags);
 397 }
 398
 399 static inline int detach_timer(struct timer_list *timer)
 400 {
 401         struct timer_list *prev = timer->prev;
 402         if (prev) {
 403                 struct timer_list *next = timer->next;
 404                 prev->next = next;
 405                 if (next)
 406                         next->prev = prev;
 407                 return 1;
 408         }
 409         return 0;
 410 }
 411
 412 void mod_timer(struct timer_list *timer, unsigned long expires)
 413 {
 414         unsigned long flags;
 415
 416         spin_lock_irqsave(&timerlist_lock, flags);
 417         timer->expires = expires;
 418         detach_timer(timer);
 419         internal_add_timer(timer);
 420         spin_unlock_irqrestore(&timerlist_lock, flags);
 421 }
 422
 423 int del_timer(struct timer_list * timer)
 424 {
 425         int ret;
 426         unsigned long flags;
 427
 428         spin_lock_irqsave(&timerlist_lock, flags);
 429         ret = detach_timer(timer);
 430         timer->next = timer->prev = 0;
 431         spin_unlock_irqrestore(&timerlist_lock, flags);
 432         return ret;
 433 }
 434
 435 #ifdef __SMP__
 436
 437 #define idle_task (task[cpu_number_map[this_cpu]])
 438 #define can_schedule(p) (!(p)->has_cpu)
 439
 440 #else
 441
 442 #define idle_task (&init_task)
 443 #define can_schedule(p) (1)
 444
 445 #endif
 446
 447 signed long schedule_timeout(signed long timeout)
 448 {
 449         struct timer_list timer;
 450         unsigned long expire;
 451
 452         /*
 453          * PARANOID.
 454          */
 455         if (current->state == TASK_UNINTERRUPTIBLE)
 456         {
 457                 printk(KERN_WARNING "schedule_timeout: task not interrutible "
 458                        "from %p\n", __builtin_return_address(0));
 459                 /*
 460                  * We don' t want to interrupt a not interruptible task
 461                  * risking to cause corruption. Better a a deadlock ;-).
 462                  */
 463                 timeout = MAX_SCHEDULE_TIMEOUT;
 464         }
 465
 466         /*
 467          * Here we start for real.
 468          */
 469         switch (timeout)
 470         {
 471         case MAX_SCHEDULE_TIMEOUT:
 472                 /*
 473                  * These two special cases are useful to be comfortable
 474                  * in the caller. Nothing more. We could take
 475                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
 476                  * but I' d like to return a valid offset (>=0) to allow
 477                  * the caller to do everything it want with the retval.
 478                  */
 479                 schedule();
 480                 goto out;
 481         default:
 482                 /*
 483                  * Another bit of PARANOID. Note that the retval will be
 484                  * 0 since no piece of kernel is supposed to do a check
 485                  * for a negative retval of schedule_timeout() (since it
 486                  * should never happens anyway). You just have the printk()
 487                  * that will tell you if something is gone wrong and where.
 488                  */
 489                 if (timeout < 0)
 490                 {
 491                         printk(KERN_ERR "schedule_timeout: wrong timeout "
 492                                "value %lx from %p\n", timeout,
 493                                __builtin_return_address(0));
 494                         goto out;
 495                 }
 496         }
 497
 498         expire = timeout + jiffies;
 499
 500         init_timer(&timer);
 501         timer.expires = expire;
 502         timer.data = (unsigned long) current;
 503         timer.function = process_timeout;
 504
 505         add_timer(&timer);
 506         schedule();
 507         del_timer(&timer);
 508
 509         timeout = expire - jiffies;
 510
 511  out:
 512         return timeout < 0 ? 0 : timeout;
 513 }
 514
 515 /*
 516  *  'schedule()' is the scheduler function. It's a very simple and nice
 517  * scheduler: it's not perfect, but certainly works for most things.
 518  *
 519  * The goto is "interesting".
 520  *
 521  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 522  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 523  * information in task[0] is never used.
 524  */
 525 asmlinkage void schedule(void)
 526 {
 527         struct task_struct * prev, * next;
 528         int this_cpu;
 529
 530         prev = current;
 531         this_cpu = prev->processor;
 532         if (in_interrupt())
 533                 goto scheduling_in_interrupt;
 534         release_kernel_lock(prev, this_cpu);
 535
 536         /* Do "administrative" work here while we don't hold any locks */
 537         if (bh_active & bh_mask)
 538                 do_bottom_half();
 539         run_task_queue(&tq_scheduler);
 540
 541         spin_lock(&scheduler_lock);
 542         spin_lock_irq(&runqueue_lock);
 543
 544         /* move an exhausted RR process to be last.. */
 545         prev->need_resched = 0;
 546         if (!prev->counter && prev->policy == SCHED_RR) {
 547                 prev->counter = prev->priority;
 548                 move_last_runqueue(prev);
 549         }
 550
 551         switch (prev->state) {
 552                 case TASK_INTERRUPTIBLE:
 553                         if (signal_pending(prev)) {
 554                                 prev->state = TASK_RUNNING;
 555                                 break;
 556                         }
 557                 default:
 558                         del_from_runqueue(prev);
 559                 case TASK_RUNNING:
 560         }
 561         {
 562                 struct task_struct * p = init_task.next_run;
 563                 /*
 564                  * This is subtle.
 565                  * Note how we can enable interrupts here, even
 566                  * though interrupts can add processes to the run-
 567                  * queue. This is because any new processes will
 568                  * be added to the front of the queue, so "p" above
 569                  * is a safe starting point.
 570                  * run-queue deletion and re-ordering is protected by
 571                  * the scheduler lock
 572                  */
 573                 spin_unlock_irq(&runqueue_lock);
 574 #ifdef __SMP__
 575                 prev->has_cpu = 0;
 576 #endif
 577
 578 /*
 579  * Note! there may appear new tasks on the run-queue during this, as
 580  * interrupts are enabled. However, they will be put on front of the
 581  * list, so our list starting at "p" is essentially fixed.
 582  */
 583 /* this is the scheduler proper: */
 584                 {
 585                         int c = -1000;
 586                         next = idle_task;
 587                         while (p != &init_task) {
 588                                 if (can_schedule(p)) {
 589                                         int weight = goodness(p, prev, this_cpu);
 590                                         if (weight > c)
 591                                                 c = weight, next = p;
 592                                 }
 593                                 p = p->next_run;
 594                         }
 595
 596                         /* Do we need to re-calculate counters? */
 597                         if (!c) {
 598                                 struct task_struct *p;
 599                                 read_lock(&tasklist_lock);
 600                                 for_each_task(p)
 601                                         p->counter = (p->counter >> 1) + p->priority;
 602                                 read_unlock(&tasklist_lock);
 603                         }
 604                 }
 605         }
 606
 607 #ifdef __SMP__
 608         next->has_cpu = 1;
 609         next->processor = this_cpu;
 610 #endif
 611
 612         if (prev != next) {
 613                 kstat.context_swtch++;
 614                 get_mmu_context(next);
 615                 switch_to(prev,next);
 616         }
 617
 618         spin_unlock(&scheduler_lock);
 619
 620         /*
 621          * At this point "prev" is "current", as we just
 622          * switched into it (from an even more "previous"
 623          * prev)
 624          */
 625         reacquire_kernel_lock(prev);
 626         return;
 627
 628 scheduling_in_interrupt:
 629         printk("Scheduling in interrupt\n");
 630         *(int *)0 = 0;
 631 }
 632
 633
 634 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
 635
 636 /*
 637  * wake_up doesn't wake up stopped processes - they have to be awakened
 638  * with signals or similar.
 639  *
 640  * Note that we only need a read lock for the wait queue (and thus do not
 641  * have to protect against interrupts), as the actual removal from the
 642  * queue is handled by the process itself.
 643  */
 644 void __wake_up(struct wait_queue **q, unsigned int mode)
 645 {
 646         struct wait_queue *next;
 647
 648         read_lock(&waitqueue_lock);
 649         if (q && (next = *q)) {
 650                 struct wait_queue *head;
 651
 652                 head = WAIT_QUEUE_HEAD(q);
 653                 while (next != head) {
 654                         struct task_struct *p = next->task;
 655                         next = next->next;
 656                         if (p->state & mode)
 657                                 wake_up_process(p);
 658                 }
 659         }
 660         read_unlock(&waitqueue_lock);
 661 }
 662
 663 /*
 664  * Semaphores are implemented using a two-way counter:
 665  * The "count" variable is decremented for each process
 666  * that tries to sleep, while the "waking" variable is
 667  * incremented when the "up()" code goes to wake up waiting
 668  * processes.
 669  *
 670  * Notably, the inline "up()" and "down()" functions can
 671  * efficiently test if they need to do any extra work (up
 672  * needs to do something only if count was negative before
 673  * the increment operation.
 674  *
 675  * waking_non_zero() (from asm/semaphore.h) must execute
 676  * atomically.
 677  *
 678  * When __up() is called, the count was negative before
 679  * incrementing it, and we need to wake up somebody.
 680  *
 681  * This routine adds one to the count of processes that need to
 682  * wake up and exit.  ALL waiting processes actually wake up but
 683  * only the one that gets to the "waking" field first will gate
 684  * through and acquire the semaphore.  The others will go back
 685  * to sleep.
 686  *
 687  * Note that these functions are only called when there is
 688  * contention on the lock, and as such all this is the
 689  * "non-critical" part of the whole semaphore business. The
 690  * critical part is the inline stuff in <asm/semaphore.h>
 691  * where we want to avoid any extra jumps and calls.
 692  */
 693 void __up(struct semaphore *sem)
 694 {
 695         wake_one_more(sem);
 696         wake_up(&sem->wait);
 697 }
 698
 699 /*
 700  * Perform the "down" function.  Return zero for semaphore acquired,
 701  * return negative for signalled out of the function.
 702  *
 703  * If called from __down, the return is ignored and the wait loop is
 704  * not interruptible.  This means that a task waiting on a semaphore
 705  * using "down()" cannot be killed until someone does an "up()" on
 706  * the semaphore.
 707  *
 708  * If called from __down_interruptible, the return value gets checked
 709  * upon return.  If the return value is negative then the task continues
 710  * with the negative value in the return register (it can be tested by
 711  * the caller).
 712  *
 713  * Either form may be used in conjunction with "up()".
 714  *
 715  */
 716 static inline int __do_down(struct semaphore * sem, int task_state)
 717 {
 718         struct task_struct *tsk = current;
 719         struct wait_queue wait = { tsk, NULL };
 720         int               ret = 0;
 721
 722         tsk->state = task_state;
 723         add_wait_queue(&sem->wait, &wait);
 724
 725         /*
 726          * Ok, we're set up.  sem->count is known to be less than zero
 727          * so we must wait.
 728          *
 729          * We can let go the lock for purposes of waiting.
 730          * We re-acquire it after awaking so as to protect
 731          * all semaphore operations.
 732          *
 733          * If "up()" is called before we call waking_non_zero() then
 734          * we will catch it right away.  If it is called later then
 735          * we will have to go through a wakeup cycle to catch it.
 736          *
 737          * Multiple waiters contend for the semaphore lock to see
 738          * who gets to gate through and who has to wait some more.
 739          */
 740         for (;;) {
 741                 if (waking_non_zero(sem))       /* are we waking up?  */
 742                         break;                  /* yes, exit loop */
 743
 744                 if (task_state == TASK_INTERRUPTIBLE && signal_pending(tsk)) {
 745                         ret = -EINTR;                   /* interrupted */
 746                         atomic_inc(&sem->count);        /* give up on down operation */
 747                         break;
 748                 }
 749
 750                 schedule();
 751                 tsk->state = task_state;
 752         }
 753         tsk->state = TASK_RUNNING;
 754         remove_wait_queue(&sem->wait, &wait);
 755         return ret;
 756 }
 757
 758 void __down(struct semaphore * sem)
 759 {
 760         __do_down(sem,TASK_UNINTERRUPTIBLE);
 761 }
 762
 763 int __down_interruptible(struct semaphore * sem)
 764 {
 765         return __do_down(sem,TASK_INTERRUPTIBLE);
 766 }
 767
 768 #define SLEEP_ON_VAR                            \
 769         unsigned long flags;                    \
 770         struct wait_queue wait;
 771
 772 #define SLEEP_ON_HEAD                                   \
 773         wait.task = current;                            \
 774         write_lock_irqsave(&waitqueue_lock, flags);     \
 775         __add_wait_queue(p, &wait);                     \
 776         write_unlock(&waitqueue_lock);
 777
 778 #define SLEEP_ON_TAIL                                           \
 779         write_lock_irq(&waitqueue_lock);                        \
 780         __remove_wait_queue(p, &wait);                          \
 781         write_unlock_irqrestore(&waitqueue_lock, flags);
 782
 783 void interruptible_sleep_on(struct wait_queue **p)
 784 {
 785         SLEEP_ON_VAR
 786
 787         current->state = TASK_INTERRUPTIBLE;
 788
 789         SLEEP_ON_HEAD
 790         schedule();
 791         SLEEP_ON_TAIL
 792 }
 793
 794 long interruptible_sleep_on_timeout(struct wait_queue **p, long timeout)
 795 {
 796         SLEEP_ON_VAR
 797
 798         current->state = TASK_INTERRUPTIBLE;
 799
 800         SLEEP_ON_HEAD
 801         timeout = schedule_timeout(timeout);
 802         SLEEP_ON_TAIL
 803
 804         return timeout;
 805 }
 806
 807 void sleep_on(struct wait_queue **p)
 808 {
 809         SLEEP_ON_VAR
 810
 811         current->state = TASK_UNINTERRUPTIBLE;
 812
 813         SLEEP_ON_HEAD
 814         schedule();
 815         SLEEP_ON_TAIL
 816 }
 817
 818 void scheduling_functions_end_here(void) { }
 819
 820 static inline void cascade_timers(struct timer_vec *tv)
 821 {
 822         /* cascade all the timers from tv up one level */
 823         struct timer_list *timer;
 824         timer = tv->vec[tv->index];
 825         /*
 826          * We are removing _all_ timers from the list, so we don't  have to
 827          * detach them individually, just clear the list afterwards.
 828          */
 829         while (timer) {
 830                 struct timer_list *tmp = timer;
 831                 timer = timer->next;
 832                 internal_add_timer(tmp);
 833         }
 834         tv->vec[tv->index] = NULL;
 835         tv->index = (tv->index + 1) & TVN_MASK;
 836 }
 837
 838 static inline void run_timer_list(void)
 839 {
 840         spin_lock_irq(&timerlist_lock);
 841         while ((long)(jiffies - timer_jiffies) >= 0) {
 842                 struct timer_list *timer;
 843                 if (!tv1.index) {
 844                         int n = 1;
 845                         do {
 846                                 cascade_timers(tvecs[n]);
 847                         } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
 848                 }
 849                 while ((timer = tv1.vec[tv1.index])) {
 850                         void (*fn)(unsigned long) = timer->function;
 851                         unsigned long data = timer->data;
 852                         detach_timer(timer);
 853                         timer->next = timer->prev = NULL;
 854                         spin_unlock_irq(&timerlist_lock);
 855                         fn(data);
 856                         spin_lock_irq(&timerlist_lock);
 857                 }
 858                 ++timer_jiffies;
 859                 tv1.index = (tv1.index + 1) & TVR_MASK;
 860         }
 861         spin_unlock_irq(&timerlist_lock);
 862 }
 863
 864
 865 static inline void run_old_timers(void)
 866 {
 867         struct timer_struct *tp;
 868         unsigned long mask;
 869
 870         for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) {
 871                 if (mask > timer_active)
 872                         break;
 873                 if (!(mask & timer_active))
 874                         continue;
 875                 if (time_after(tp->expires, jiffies))
 876                         continue;
 877                 timer_active &= ~mask;
 878                 tp->fn();
 879                 sti();
 880         }
 881 }
 882
 883 spinlock_t tqueue_lock;
 884
 885 void tqueue_bh(void)
 886 {
 887         run_task_queue(&tq_timer);
 888 }
 889
 890 void immediate_bh(void)
 891 {
 892         run_task_queue(&tq_immediate);
 893 }
 894
 895 unsigned long timer_active = 0;
 896 struct timer_struct timer_table[32];
 897
 898 /*
 899  * Hmm.. Changed this, as the GNU make sources (load.c) seems to
 900  * imply that avenrun[] is the standard name for this kind of thing.
 901  * Nothing else seems to be standardized: the fractional size etc
 902  * all seem to differ on different machines.
 903  */
 904 unsigned long avenrun[3] = { 0,0,0 };
 905
 906 /*
 907  * Nr of active tasks - counted in fixed-point numbers
 908  */
 909 static unsigned long count_active_tasks(void)
 910 {
 911         struct task_struct *p;
 912         unsigned long nr = 0;
 913
 914         read_lock(&tasklist_lock);
 915         for_each_task(p) {
 916                 if ((p->state == TASK_RUNNING ||
 917                      p->state == TASK_UNINTERRUPTIBLE ||
 918                      p->state == TASK_SWAPPING))
 919                         nr += FIXED_1;
 920         }
 921         read_unlock(&tasklist_lock);
 922         return nr;
 923 }
 924
 925 static inline void calc_load(unsigned long ticks)
 926 {
 927         unsigned long active_tasks; /* fixed-point */
 928         static int count = LOAD_FREQ;
 929
 930         count -= ticks;
 931         if (count < 0) {
 932                 count += LOAD_FREQ;
 933                 active_tasks = count_active_tasks();
 934                 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 935                 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 936                 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
 937         }
 938 }
 939
 940 /*
 941  * this routine handles the overflow of the microsecond field
 942  *
 943  * The tricky bits of code to handle the accurate clock support
 944  * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 945  * They were originally developed for SUN and DEC kernels.
 946  * All the kudos should go to Dave for this stuff.
 947  *
 948  */
 949 static void second_overflow(void)
 950 {
 951     long ltemp;
 952
 953     /* Bump the maxerror field */
 954     time_maxerror += time_tolerance >> SHIFT_USEC;
 955     if ( time_maxerror > MAXPHASE )
 956         time_maxerror = MAXPHASE;
 957
 958     /*
 959      * Leap second processing. If in leap-insert state at
 960      * the end of the day, the system clock is set back one
 961      * second; if in leap-delete state, the system clock is
 962      * set ahead one second. The microtime() routine or
 963      * external clock driver will insure that reported time
 964      * is always monotonic. The ugly divides should be
 965      * replaced.
 966      */
 967     switch (time_state) {
 968
 969     case TIME_OK:
 970         if (time_status & STA_INS)
 971             time_state = TIME_INS;
 972         else if (time_status & STA_DEL)
 973             time_state = TIME_DEL;
 974         break;
 975
 976     case TIME_INS:
 977         if (xtime.tv_sec % 86400 == 0) {
 978             xtime.tv_sec--;
 979             time_state = TIME_OOP;
 980             printk("Clock: inserting leap second 23:59:60 UTC\n");
 981         }
 982         break;
 983
 984     case TIME_DEL:
 985         if ((xtime.tv_sec + 1) % 86400 == 0) {
 986             xtime.tv_sec++;
 987             time_state = TIME_WAIT;
 988             printk("Clock: deleting leap second 23:59:59 UTC\n");
 989         }
 990         break;
 991
 992     case TIME_OOP:
 993         time_state = TIME_WAIT;
 994         break;
 995
 996     case TIME_WAIT:
 997         if (!(time_status & (STA_INS | STA_DEL)))
 998             time_state = TIME_OK;
 999     }
1000
1001     /*
1002      * Compute the phase adjustment for the next second. In
1003      * PLL mode, the offset is reduced by a fixed factor
1004      * times the time constant. In FLL mode the offset is
1005      * used directly. In either mode, the maximum phase
1006      * adjustment for each second is clamped so as to spread
1007      * the adjustment over not more than the number of
1008      * seconds between updates.
1009      */
1010     if (time_offset < 0) {
1011         ltemp = -time_offset;
1012         if (!(time_status & STA_FLL))
1013             ltemp >>= SHIFT_KG + time_constant;
1014         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
1015             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
1016         time_offset += ltemp;
1017         time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
1018     } else {
1019         ltemp = time_offset;
1020         if (!(time_status & STA_FLL))
1021             ltemp >>= SHIFT_KG + time_constant;
1022         if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
1023             ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
1024         time_offset -= ltemp;
1025         time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
1026     }
1027
1028     /*
1029      * Compute the frequency estimate and additional phase
1030      * adjustment due to frequency error for the next
1031      * second. When the PPS signal is engaged, gnaw on the
1032      * watchdog counter and update the frequency computed by
1033      * the pll and the PPS signal.
1034      */
1035     pps_valid++;
1036     if (pps_valid == PPS_VALID) {
1037         pps_jitter = MAXTIME;
1038         pps_stabil = MAXFREQ;
1039         time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
1040                          STA_PPSWANDER | STA_PPSERROR);
1041     }
1042     ltemp = time_freq + pps_freq;
1043     if (ltemp < 0)
1044         time_adj -= -ltemp >>
1045             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
1046     else
1047         time_adj += ltemp >>
1048             (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
1049
1050 #if HZ == 100
1051     /* compensate for (HZ==100) != 128. Add 25% to get 125; => only 3% error */
1052     if (time_adj < 0)
1053         time_adj -= -time_adj >> 2;
1054     else
1055         time_adj += time_adj >> 2;
1056 #endif
1057 }
1058
1059 /* in the NTP reference this is called "hardclock()" */
1060 static void update_wall_time_one_tick(void)
1061 {
1062         /*
1063          * Advance the phase, once it gets to one microsecond, then
1064          * advance the tick more.
1065          */
1066         time_phase += time_adj;
1067         if (time_phase <= -FINEUSEC) {
1068                 long ltemp = -time_phase >> SHIFT_SCALE;
1069                 time_phase += ltemp << SHIFT_SCALE;
1070                 xtime.tv_usec += tick + time_adjust_step - ltemp;
1071         }
1072         else if (time_phase >= FINEUSEC) {
1073                 long ltemp = time_phase >> SHIFT_SCALE;
1074                 time_phase -= ltemp << SHIFT_SCALE;
1075                 xtime.tv_usec += tick + time_adjust_step + ltemp;
1076         } else
1077                 xtime.tv_usec += tick + time_adjust_step;
1078
1079         if (time_adjust) {
1080             /* We are doing an adjtime thing.
1081              *
1082              * Modify the value of the tick for next time.
1083              * Note that a positive delta means we want the clock
1084              * to run fast. This means that the tick should be bigger
1085              *
1086              * Limit the amount of the step for *next* tick to be
1087              * in the range -tickadj .. +tickadj
1088              */
1089              if (time_adjust > tickadj)
1090                 time_adjust_step = tickadj;
1091              else if (time_adjust < -tickadj)
1092                 time_adjust_step = -tickadj;
1093              else
1094                 time_adjust_step = time_adjust;
1095
1096             /* Reduce by this step the amount of time left  */
1097             time_adjust -= time_adjust_step;
1098         }
1099         else
1100             time_adjust_step = 0;
1101 }
1102
1103 /*
1104  * Using a loop looks inefficient, but "ticks" is
1105  * usually just one (we shouldn't be losing ticks,
1106  * we're doing this this way mainly for interrupt
1107  * latency reasons, not because we think we'll
1108  * have lots of lost timer ticks
1109  */
1110 static void update_wall_time(unsigned long ticks)
1111 {
1112         do {
1113                 ticks--;
1114                 update_wall_time_one_tick();
1115         } while (ticks);
1116
1117         if (xtime.tv_usec >= 1000000) {
1118             xtime.tv_usec -= 1000000;
1119             xtime.tv_sec++;
1120             second_overflow();
1121         }
1122 }
1123
1124 static inline void do_process_times(struct task_struct *p,
1125         unsigned long user, unsigned long system)
1126 {
1127         long psecs;
1128
1129         psecs = (p->times.tms_utime += user);
1130         psecs += (p->times.tms_stime += system);
1131         if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
1132                 /* Send SIGXCPU every second.. */
1133                 if (!(psecs % HZ))
1134                         send_sig(SIGXCPU, p, 1);
1135                 /* and SIGKILL when we go over max.. */
1136                 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
1137                         send_sig(SIGKILL, p, 1);
1138         }
1139 }
1140
1141 static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
1142 {
1143         unsigned long it_virt = p->it_virt_value;
1144
1145         if (it_virt) {
1146                 if (it_virt <= ticks) {
1147                         it_virt = ticks + p->it_virt_incr;
1148                         send_sig(SIGVTALRM, p, 1);
1149                 }
1150                 p->it_virt_value = it_virt - ticks;
1151         }
1152 }
1153
1154 static inline void do_it_prof(struct task_struct * p, unsigned long ticks)
1155 {
1156         unsigned long it_prof = p->it_prof_value;
1157
1158         if (it_prof) {
1159                 if (it_prof <= ticks) {
1160                         it_prof = ticks + p->it_prof_incr;
1161                         send_sig(SIGPROF, p, 1);
1162                 }
1163                 p->it_prof_value = it_prof - ticks;
1164         }
1165 }
1166
1167 void update_one_process(struct task_struct *p,
1168         unsigned long ticks, unsigned long user, unsigned long system, int cpu)
1169 {
1170         p->per_cpu_utime[cpu] += user;
1171         p->per_cpu_stime[cpu] += system;
1172         do_process_times(p, user, system);
1173         do_it_virt(p, user);
1174         do_it_prof(p, ticks);
1175 }
1176
1177 static void update_process_times(unsigned long ticks, unsigned long system)
1178 {
1179 /*
1180  * SMP does this on a per-CPU basis elsewhere
1181  */
1182 #ifndef  __SMP__
1183         struct task_struct * p = current;
1184         unsigned long user = ticks - system;
1185         if (p->pid) {
1186                 p->counter -= ticks;
1187                 if (p->counter < 0) {
1188                         p->counter = 0;
1189                         p->need_resched = 1;
1190                 }
1191                 if (p->priority < DEF_PRIORITY)
1192                         kstat.cpu_nice += user;
1193                 else
1194                         kstat.cpu_user += user;
1195                 kstat.cpu_system += system;
1196         }
1197         update_one_process(p, ticks, user, system, 0);
1198 #endif
1199 }
1200
1201 volatile unsigned long lost_ticks = 0;
1202 static unsigned long lost_ticks_system = 0;
1203
1204 static inline void update_times(void)
1205 {
1206         unsigned long ticks;
1207         unsigned long flags;
1208
1209         save_flags(flags);
1210         cli();
1211
1212         ticks = lost_ticks;
1213         lost_ticks = 0;
1214
1215         if (ticks) {
1216                 unsigned long system;
1217                 system = xchg(&lost_ticks_system, 0);
1218
1219                 calc_load(ticks);
1220                 update_wall_time(ticks);
1221                 restore_flags(flags);
1222
1223                 update_process_times(ticks, system);
1224
1225         } else
1226                 restore_flags(flags);
1227 }
1228
1229 static void timer_bh(void)
1230 {
1231         update_times();
1232         run_old_timers();
1233         run_timer_list();
1234 }
1235
1236 void do_timer(struct pt_regs * regs)
1237 {
1238         (*(unsigned long *)&jiffies)++;
1239         lost_ticks++;
1240         mark_bh(TIMER_BH);
1241         if (!user_mode(regs))
1242                 lost_ticks_system++;
1243         if (tq_timer)
1244                 mark_bh(TQUEUE_BH);
1245 }
1246
1247 #ifndef __alpha__
1248
1249 /*
1250  * For backwards compatibility?  This can be done in libc so Alpha
1251  * and all newer ports shouldn't need it.
1252  */
1253 asmlinkage unsigned int sys_alarm(unsigned int seconds)
1254 {
1255         struct itimerval it_new, it_old;
1256         unsigned int oldalarm;
1257
1258         it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
1259         it_new.it_value.tv_sec = seconds;
1260         it_new.it_value.tv_usec = 0;
1261         do_setitimer(ITIMER_REAL, &it_new, &it_old);
1262         oldalarm = it_old.it_value.tv_sec;
1263         /* ehhh.. We can't return 0 if we have an alarm pending.. */
1264         /* And we'd better return too much than too little anyway */
1265         if (it_old.it_value.tv_usec)
1266                 oldalarm++;
1267         return oldalarm;
1268 }
1269
1270 /*
1271  * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
1272  * should be moved into arch/i386 instead?
1273  */
1274
1275 asmlinkage int sys_getpid(void)
1276 {
1277         /* This is SMP safe - current->pid doesn't change */
1278         return current->pid;
1279 }
1280
1281 /*
1282  * This is not strictly SMP safe: p_opptr could change
1283  * from under us. However, rather than getting any lock
1284  * we can use an optimistic algorithm: get the parent
1285  * pid, and go back and check that the parent is still
1286  * the same. If it has changed (which is extremely unlikely
1287  * indeed), we just try again..
1288  *
1289  * NOTE! This depends on the fact that even if we _do_
1290  * get an old value of "parent", we can happily dereference
1291  * the pointer: we just can't necessarily trust the result
1292  * until we know that the parent pointer is valid.
1293  *
1294  * The "mb()" macro is a memory barrier - a synchronizing
1295  * event. It also makes sure that gcc doesn't optimize
1296  * away the necessary memory references.. The barrier doesn't
1297  * have to have all that strong semantics: on x86 we don't
1298  * really require a synchronizing instruction, for example.
1299  * The barrier is more important for code generation than
1300  * for any real memory ordering semantics (even if there is
1301  * a small window for a race, using the old pointer is
1302  * harmless for a while).
1303  */
1304 asmlinkage int sys_getppid(void)
1305 {
1306         int pid;
1307         struct task_struct * me = current;
1308         struct task_struct * parent;
1309
1310         parent = me->p_opptr;
1311         for (;;) {
1312                 pid = parent->pid;
1313 #if __SMP__
1314 {
1315                 struct task_struct *old = parent;
1316                 mb();
1317                 parent = me->p_opptr;
1318                 if (old != parent)
1319                         continue;
1320 }
1321 #endif
1322                 break;
1323         }
1324         return pid;
1325 }
1326
1327 asmlinkage int sys_getuid(void)
1328 {
1329         /* Only we change this so SMP safe */
1330         return current->uid;
1331 }
1332
1333 asmlinkage int sys_geteuid(void)
1334 {
1335         /* Only we change this so SMP safe */
1336         return current->euid;
1337 }
1338
1339 asmlinkage int sys_getgid(void)
1340 {
1341         /* Only we change this so SMP safe */
1342         return current->gid;
1343 }
1344
1345 asmlinkage int sys_getegid(void)
1346 {
1347         /* Only we change this so SMP safe */
1348         return  current->egid;
1349 }
1350
1351 /*
1352  * This has been replaced by sys_setpriority.  Maybe it should be
1353  * moved into the arch dependent tree for those ports that require
1354  * it for backward compatibility?
1355  */
1356
1357 asmlinkage int sys_nice(int increment)
1358 {
1359         unsigned long newprio;
1360         int increase = 0;
1361
1362         /*
1363          *      Setpriority might change our priority at the same moment.
1364          *      We don't have to worry. Conceptually one call occurs first
1365          *      and we have a single winner.
1366          */
1367
1368         newprio = increment;
1369         if (increment < 0) {
1370                 if (!capable(CAP_SYS_NICE))
1371                         return -EPERM;
1372                 newprio = -increment;
1373                 increase = 1;
1374         }
1375
1376         if (newprio > 40)
1377                 newprio = 40;
1378         /*
1379          * do a "normalization" of the priority (traditionally
1380          * Unix nice values are -20 to 20; Linux doesn't really
1381          * use that kind of thing, but uses the length of the
1382          * timeslice instead (default 150 ms). The rounding is
1383          * why we want to avoid negative values.
1384          */
1385         newprio = (newprio * DEF_PRIORITY + 10) / 20;
1386         increment = newprio;
1387         if (increase)
1388                 increment = -increment;
1389         /*
1390          *      Current->priority can change between this point
1391          *      and the assignment. We are assigning not doing add/subs
1392          *      so thats ok. Conceptually a process might just instantaneously
1393          *      read the value we stomp over. I don't think that is an issue
1394          *      unless posix makes it one. If so we can loop on changes
1395          *      to current->priority.
1396          */
1397         newprio = current->priority - increment;
1398         if ((signed) newprio < 1)
1399                 newprio = 1;
1400         if (newprio > DEF_PRIORITY*2)
1401                 newprio = DEF_PRIORITY*2;
1402         current->priority = newprio;
1403         return 0;
1404 }
1405
1406 #endif
1407
1408 static inline struct task_struct *find_process_by_pid(pid_t pid)
1409 {
1410         struct task_struct *tsk = current;
1411
1412         if (pid)
1413                 tsk = find_task_by_pid(pid);
1414         return tsk;
1415 }
1416
1417 static int setscheduler(pid_t pid, int policy,
1418                         struct sched_param *param)
1419 {
1420         struct sched_param lp;
1421         struct task_struct *p;
1422         int retval;
1423
1424         retval = -EINVAL;
1425         if (!param || pid < 0)
1426                 goto out_nounlock;
1427
1428         retval = -EFAULT;
1429         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
1430                 goto out_nounlock;
1431
1432         /*
1433          * We play safe to avoid deadlocks.
1434          */
1435         spin_lock(&scheduler_lock);
1436         spin_lock_irq(&runqueue_lock);
1437         read_lock(&tasklist_lock);
1438
1439         p = find_process_by_pid(pid);
1440
1441         retval = -ESRCH;
1442         if (!p)
1443                 goto out_unlock;
1444
1445         if (policy < 0)
1446                 policy = p->policy;
1447         else {
1448                 retval = -EINVAL;
1449                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
1450                                 policy != SCHED_OTHER)
1451                         goto out_unlock;
1452         }
1453
1454         /*
1455          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
1456          * priority for SCHED_OTHER is 0.
1457          */
1458         retval = -EINVAL;
1459         if (lp.sched_priority < 0 || lp.sched_priority > 99)
1460                 goto out_unlock;
1461         if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
1462                 goto out_unlock;
1463
1464         retval = -EPERM;
1465         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1466             !capable(CAP_SYS_NICE))
1467                 goto out_unlock;
1468         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1469             !capable(CAP_SYS_NICE))
1470                 goto out_unlock;
1471
1472         retval = 0;
1473         p->policy = policy;
1474         p->rt_priority = lp.sched_priority;
1475         if (p->next_run)
1476                 move_first_runqueue(p);
1477
1478         current->need_resched = 1;
1479
1480 out_unlock:
1481         read_unlock(&tasklist_lock);
1482         spin_unlock_irq(&runqueue_lock);
1483         spin_unlock(&scheduler_lock);
1484
1485 out_nounlock:
1486         return retval;
1487 }
1488
1489 asmlinkage int sys_sched_setscheduler(pid_t pid, int policy,
1490                                       struct sched_param *param)
1491 {
1492         return setscheduler(pid, policy, param);
1493 }
1494
1495 asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param)
1496 {
1497         return setscheduler(pid, -1, param);
1498 }
1499
1500 asmlinkage int sys_sched_getscheduler(pid_t pid)
1501 {
1502         struct task_struct *p;
1503         int retval;
1504
1505         retval = -EINVAL;
1506         if (pid < 0)
1507                 goto out_nounlock;
1508
1509         read_lock(&tasklist_lock);
1510
1511         retval = -ESRCH;
1512         p = find_process_by_pid(pid);
1513         if (!p)
1514                 goto out_unlock;
1515
1516         retval = p->policy;
1517
1518 out_unlock:
1519         read_unlock(&tasklist_lock);
1520
1521 out_nounlock:
1522         return retval;
1523 }
1524
1525 asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param)
1526 {
1527         struct task_struct *p;
1528         struct sched_param lp;
1529         int retval;
1530
1531         retval = -EINVAL;
1532         if (!param || pid < 0)
1533                 goto out_nounlock;
1534
1535         read_lock(&tasklist_lock);
1536         p = find_process_by_pid(pid);
1537         retval = -ESRCH;
1538         if (!p)
1539                 goto out_unlock;
1540         lp.sched_priority = p->rt_priority;
1541         read_unlock(&tasklist_lock);
1542
1543         /*
1544          * This one might sleep, we cannot do it with a spinlock held ...
1545          */
1546         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1547
1548 out_nounlock:
1549         return retval;
1550
1551 out_unlock:
1552         read_unlock(&tasklist_lock);
1553         return retval;
1554 }
1555
1556 asmlinkage int sys_sched_yield(void)
1557 {
1558         spin_lock(&scheduler_lock);
1559         spin_lock_irq(&runqueue_lock);
1560         if (current->policy == SCHED_OTHER)
1561                 current->policy |= SCHED_YIELD;
1562         current->need_resched = 1;
1563         move_last_runqueue(current);
1564         spin_unlock_irq(&runqueue_lock);
1565         spin_unlock(&scheduler_lock);
1566         return 0;
1567 }
1568
1569 asmlinkage int sys_sched_get_priority_max(int policy)
1570 {
1571         int ret = -EINVAL;
1572
1573         switch (policy) {
1574         case SCHED_FIFO:
1575         case SCHED_RR:
1576                 ret = 99;
1577                 break;
1578         case SCHED_OTHER:
1579                 ret = 0;
1580                 break;
1581         }
1582         return ret;
1583 }
1584
1585 asmlinkage int sys_sched_get_priority_min(int policy)
1586 {
1587         int ret = -EINVAL;
1588
1589         switch (policy) {
1590         case SCHED_FIFO:
1591         case SCHED_RR:
1592                 ret = 1;
1593                 break;
1594         case SCHED_OTHER:
1595                 ret = 0;
1596         }
1597         return ret;
1598 }
1599
1600 asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
1601 {
1602         struct timespec t;
1603
1604         t.tv_sec = 0;
1605         t.tv_nsec = 150000;
1606         if (copy_to_user(interval, &t, sizeof(struct timespec)))
1607                 return -EFAULT;
1608         return 0;
1609 }
1610
1611 asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
1612 {
1613         struct timespec t;
1614         unsigned long expire;
1615
1616         if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
1617                 return -EFAULT;
1618
1619         if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
1620                 return -EINVAL;
1621
1622
1623         if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
1624             current->policy != SCHED_OTHER)
1625         {
1626                 /*
1627                  * Short delay requests up to 2 ms will be handled with
1628                  * high precision by a busy wait for all real-time processes.
1629                  *
1630                  * Its important on SMP not to do this holding locks.
1631                  */
1632                 udelay((t.tv_nsec + 999) / 1000);
1633                 return 0;
1634         }
1635
1636         expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1637
1638         current->state = TASK_INTERRUPTIBLE;
1639         expire = schedule_timeout(expire);
1640
1641         if (expire) {
1642                 if (rmtp) {
1643                         jiffies_to_timespec(expire, &t);
1644                         if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
1645                                 return -EFAULT;
1646                 }
1647                 return -EINTR;
1648         }
1649         return 0;
1650 }
1651
1652 static void show_task(int nr,struct task_struct * p)
1653 {
1654         unsigned long free = 0;
1655         int state;
1656         static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
1657
1658         printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr);
1659         state = p->state ? ffz(~p->state) + 1 : 0;
1660         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
1661                 printk(stat_nam[state]);
1662         else
1663                 printk(" ");
1664 #if (BITS_PER_LONG == 32)
1665         if (p == current)
1666                 printk(" current  ");
1667         else
1668                 printk(" %08lX ", thread_saved_pc(&p->tss));
1669 #else
1670         if (p == current)
1671                 printk("   current task   ");
1672         else
1673                 printk(" %016lx ", thread_saved_pc(&p->tss));
1674 #endif
1675         {
1676                 unsigned long * n = (unsigned long *) (p+1);
1677                 while (!*n)
1678                         n++;
1679                 free = (unsigned long) n - (unsigned long)(p+1);
1680         }
1681         printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
1682         if (p->p_cptr)
1683                 printk("%5d ", p->p_cptr->pid);
1684         else
1685                 printk("      ");
1686         if (p->p_ysptr)
1687                 printk("%7d", p->p_ysptr->pid);
1688         else
1689                 printk("       ");
1690         if (p->p_osptr)
1691                 printk(" %5d\n", p->p_osptr->pid);
1692         else
1693                 printk("\n");
1694
1695         {
1696                 struct signal_queue *q;
1697                 char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1];
1698
1699                 render_sigset_t(&p->signal, s);
1700                 render_sigset_t(&p->blocked, b);
1701                 printk("   sig: %d %s %s :", signal_pending(p), s, b);
1702                 for (q = p->sigqueue; q ; q = q->next)
1703                         printk(" %d", q->info.si_signo);
1704                 printk(" X\n");
1705         }
1706 }
1707
1708 char * render_sigset_t(sigset_t *set, char *buffer)
1709 {
1710         int i = _NSIG, x;
1711         do {
1712                 i -= 4, x = 0;
1713                 if (sigismember(set, i+1)) x |= 1;
1714                 if (sigismember(set, i+2)) x |= 2;
1715                 if (sigismember(set, i+3)) x |= 4;
1716                 if (sigismember(set, i+4)) x |= 8;
1717                 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
1718         } while (i >= 4);
1719         *buffer = 0;
1720         return buffer;
1721 }
1722
1723 void show_state(void)
1724 {
1725         struct task_struct *p;
1726
1727 #if (BITS_PER_LONG == 32)
1728         printk("\n"
1729                "                         free                        sibling\n");
1730         printk("  task             PC    stack   pid father child younger older\n");
1731 #else
1732         printk("\n"
1733                "                                 free                        sibling\n");
1734         printk("  task                 PC        stack   pid father child younger older\n");
1735 #endif
1736         read_lock(&tasklist_lock);
1737         for_each_task(p)
1738                 show_task((p->tarray_ptr - &task[0]),p);
1739         read_unlock(&tasklist_lock);
1740 }
1741
1742 void __init sched_init(void)
1743 {
1744         /*
1745          *      We have to do a little magic to get the first
1746          *      process right in SMP mode.
1747          */
1748         int cpu=hard_smp_processor_id();
1749         int nr = NR_TASKS;
1750
1751         init_task.processor=cpu;
1752
1753         /* Init task array free list and pidhash table. */
1754         while(--nr > 0)
1755                 add_free_taskslot(&task[nr]);
1756
1757         for(nr = 0; nr < PIDHASH_SZ; nr++)
1758                 pidhash[nr] = NULL;
1759
1760         init_bh(TIMER_BH, timer_bh);
1761         init_bh(TQUEUE_BH, tqueue_bh);
1762         init_bh(IMMEDIATE_BH, immediate_bh);
1763 }