/*
 *  Kernel internal timers, kernel timekeeping, basic process system calls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 */
#include <linux/config.h>
#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>

#include <asm/uaccess.h>
/*
 * Timekeeping variables
 */

long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */

/* The current time */
volatile struct timeval xtime __attribute__ ((aligned (16)));

/* Don't completely fail for HZ > 500. */
int tickadj = 500/HZ ? : 1;		/* microsecs */

DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
DECLARE_TASK_QUEUE(tq_scheduler);
/*
 * phase-lock loop variables
 */
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK;		/* clock synchronization status	*/
int time_status = STA_UNSYNC;		/* clock status bits		*/
long time_offset;			/* time adjustment (us)		*/
long time_constant = 2;			/* pll time constant		*/
long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
long time_precision = 1;		/* clock precision (us)		*/
long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
long time_phase;			/* phase offset (scaled us)	*/
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
					/* frequency offset (scaled ppm)*/
long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
long time_reftime;			/* time at last adjustment (s)	*/

long time_adjust;			/* remaining adjtime() adjustment (us) */
long time_adjust_step;			/* per-tick slice of that adjustment   */
extern int do_setitimer(int, struct itimerval *, struct itimerval *);

unsigned long volatile jiffies;

unsigned int * prof_buffer;
unsigned long prof_len;
unsigned long prof_shift;
#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

struct timer_vec {
	int index;
	struct list_head vec[TVN_SIZE];
};

struct timer_vec_root {
	int index;
	struct list_head vec[TVR_SIZE];
};

static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;

static struct timer_vec * const tvecs[] = {
	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
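
/*
 * Rough sketch of the resulting timer-wheel geometry, assuming the
 * TVR_BITS == 8 / TVN_BITS == 6 split defined above:
 *
 *	tv1: 256 slots of 1 jiffy each,      expiries        0 .. 2^8  - 1
 *	tv2:  64 slots of 2^8  jiffies each, expiries     2^8  .. 2^14 - 1
 *	tv3:  64 slots of 2^14 jiffies each, expiries     2^14 .. 2^20 - 1
 *	tv4:  64 slots of 2^20 jiffies each, expiries     2^20 .. 2^26 - 1
 *	tv5:  64 slots of 2^26 jiffies each, expiries     2^26 .. 2^32 - 1
 *
 * Insertion is O(1): the distance to expiry selects the vector and the low
 * bits of the absolute expiry select the slot within it.
 */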
void init_timervecs (void)
{
	int i;

	for (i = 0; i < TVN_SIZE; i++) {
		INIT_LIST_HEAD(tv5.vec + i);
		INIT_LIST_HEAD(tv4.vec + i);
		INIT_LIST_HEAD(tv3.vec + i);
		INIT_LIST_HEAD(tv2.vec + i);
	}
	for (i = 0; i < TVR_SIZE; i++)
		INIT_LIST_HEAD(tv1.vec + i);
}

static unsigned long timer_jiffies;
static inline void internal_add_timer(struct timer_list *timer)
{
	/*
	 * must be cli-ed when calling this
	 */
	unsigned long expires = timer->expires;
	unsigned long idx = expires - timer_jiffies;
	struct list_head * vec;

	if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/* can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = tv1.vec + tv1.index;
	} else if (idx <= 0xffffffffUL) {
		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = tv5.vec + i;
	} else {
		/* Can only get here on architectures with 64-bit jiffies */
		INIT_LIST_HEAD(&timer->list);
		return;
	}
	/* timers are FIFO: add at the tail of the chosen slot */
	list_add(&timer->list, vec->prev);
}
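
/*
 * Illustration of the slot arithmetic above, using the TVR_BITS/TVN_BITS
 * values defined earlier: with timer_jiffies == 1000 and
 * timer->expires == 1300, idx is 300.  That is >= TVR_SIZE (256) but below
 * 1 << (TVR_BITS + TVN_BITS) (16384), so the timer goes into tv2 at slot
 * (1300 >> TVR_BITS) & TVN_MASK == 5 and is cascaded back into tv1 when
 * tv2.index reaches that slot.
 */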
/* Initialize both explicitly - let's try to have them in the same cache line */
spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;

#ifdef CONFIG_SMP
volatile struct timer_list * volatile running_timer;
#define timer_enter(t) do { running_timer = t; mb(); } while (0)
#define timer_exit() do { running_timer = NULL; } while (0)
#define timer_is_running(t) (running_timer == t)
#define timer_synchronize(t) while (timer_is_running(t)) barrier()
#else
#define timer_enter(t)		do { } while (0)
#define timer_exit()		do { } while (0)
#endif
void add_timer(struct timer_list *timer)
{
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	if (timer_pending(timer))
		goto bug;
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return;
bug:
	spin_unlock_irqrestore(&timerlist_lock, flags);
	printk("bug: kernel timer added twice at %p.\n",
	       __builtin_return_address(0));
}
static inline int detach_timer (struct timer_list *timer)
{
	if (!timer_pending(timer))
		return 0;
	list_del(&timer->list);
	return 1;
}
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	timer->expires = expires;
	ret = detach_timer(timer);
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
int del_timer(struct timer_list * timer)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	ret = detach_timer(timer);
	timer->list.next = timer->list.prev = NULL;
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
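
/*
 * Typical use of the interface above, as a minimal sketch (the handler name
 * and the one-second interval are made-up examples):
 *
 *	static void my_timeout(unsigned long data);
 *	static struct timer_list my_timer;
 *
 *	init_timer(&my_timer);
 *	my_timer.function = my_timeout;
 *	my_timer.data = 0;
 *	my_timer.expires = jiffies + HZ;	(fire in roughly one second)
 *	add_timer(&my_timer);
 *
 * mod_timer(&my_timer, jiffies + 2*HZ) re-arms a pending or expired timer;
 * del_timer(&my_timer) removes it if it is still queued.
 */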
#ifdef CONFIG_SMP
void sync_timers(void)
{
	spin_unlock_wait(&global_bh_lock);
}

/*
 * SMP-specific function to delete a periodic timer.  The caller must, by
 * some means, prevent the timer from being restarted.  On return the timer
 * is no longer queued and its handler is not running on any CPU.  The
 * return value is the number of times the timer was deleted (for reference
 * counting).
 */
int del_timer_sync(struct timer_list * timer)
{
	int ret = 0;

	for (;;) {
		unsigned long flags;
		int running;

		spin_lock_irqsave(&timerlist_lock, flags);
		ret += detach_timer(timer);
		timer->list.next = timer->list.prev = 0;
		running = timer_is_running(timer);
		spin_unlock_irqrestore(&timerlist_lock, flags);

		if (!running)
			break;

		timer_synchronize(timer);
	}

	return ret;
}
#endif
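
/*
 * Sketch of the intended calling pattern for del_timer_sync(): the caller
 * first makes sure the handler can no longer re-arm the timer (for example
 * by setting a "shutting down" flag that the handler checks before calling
 * mod_timer()), and only then calls del_timer_sync().  On return the timer
 * is neither queued nor running on any CPU, so its data may be freed safely.
 */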
static inline void cascade_timers(struct timer_vec *tv)
{
	/* cascade all the timers from tv up one level */
	struct list_head *head, *curr, *next;

	head = tv->vec + tv->index;
	curr = head->next;
	/*
	 * We are removing _all_ timers from the list, so we don't have to
	 * detach them individually, just clear the list afterwards.
	 */
	while (curr != head) {
		struct timer_list *tmp;

		tmp = list_entry(curr, struct timer_list, list);
		next = curr->next;
		list_del(curr); /* not needed */
		internal_add_timer(tmp);
		curr = next;
	}
	INIT_LIST_HEAD(head);
	tv->index = (tv->index + 1) & TVN_MASK;
}
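
/*
 * For illustration: when tv1.index wraps back to 0, run_timer_list() below
 * cascades tv2.  Every timer sitting in tv2.vec[tv2.index] is re-hashed with
 * internal_add_timer(), and since each of them now expires within the next
 * TVR_SIZE jiffies they all land in an exact tv1 slot.  When tv2 itself has
 * completed a full cycle, tv3 is cascaded in turn, and so on up to tv5.
 */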
static inline void run_timer_list(void)
{
	spin_lock_irq(&timerlist_lock);
	while ((long)(jiffies - timer_jiffies) >= 0) {
		struct list_head *head, *curr;
		if (!tv1.index) {
			int n = 1;
			do {
				cascade_timers(tvecs[n]);
			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
		}
repeat:
		head = tv1.vec + tv1.index;
		curr = head->next;
		if (curr != head) {
			struct timer_list *timer;
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_entry(curr, struct timer_list, list);
			fn = timer->function;
			data = timer->data;

			detach_timer(timer);
			timer->list.next = timer->list.prev = NULL;
			timer_enter(timer);
			spin_unlock_irq(&timerlist_lock);
			fn(data);
			spin_lock_irq(&timerlist_lock);
			timer_exit();
			goto repeat;
		}
		++timer_jiffies;
		tv1.index = (tv1.index + 1) & TVR_MASK;
	}
	spin_unlock_irq(&timerlist_lock);
}
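
/*
 * Note that the lock is dropped around fn(data) above, so a handler may
 * legitimately re-arm its own timer with add_timer()/mod_timer().  Plain
 * del_timer() on another CPU does not wait for a handler that is already
 * running; that is what del_timer_sync() above is for.
 */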
spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;

void tqueue_bh(void)
{
	run_task_queue(&tq_timer);
}

void immediate_bh(void)
{
	run_task_queue(&tq_immediate);
}
/*
 * this routine handles the overflow of the microsecond field
 *
 * The tricky bits of code to handle the accurate clock support
 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 * They were originally developed for SUN and DEC kernels.
 * All the kudos should go to Dave for this stuff.
 */
static void second_overflow(void)
{
	long ltemp;

	/* Bump the maxerror field */
	time_maxerror += time_tolerance >> SHIFT_USEC;
	if ( time_maxerror > NTP_PHASE_LIMIT ) {
		time_maxerror = NTP_PHASE_LIMIT;
		time_status |= STA_UNSYNC;
	}
	/*
	 * Leap second processing. If in leap-insert state at
	 * the end of the day, the system clock is set back one
	 * second; if in leap-delete state, the system clock is
	 * set ahead one second. The microtime() routine or
	 * external clock driver will insure that reported time
	 * is always monotonic. The ugly divides should be
	 * replaced.
	 */
	switch (time_state) {

	case TIME_OK:
		if (time_status & STA_INS)
			time_state = TIME_INS;
		else if (time_status & STA_DEL)
			time_state = TIME_DEL;
		break;

	case TIME_INS:
		if (xtime.tv_sec % 86400 == 0) {
			xtime.tv_sec--;
			time_state = TIME_OOP;
			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
		}
		break;

	case TIME_DEL:
		if ((xtime.tv_sec + 1) % 86400 == 0) {
			xtime.tv_sec++;
			time_state = TIME_WAIT;
			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
		}
		break;

	case TIME_OOP:
		time_state = TIME_WAIT;
		break;

	case TIME_WAIT:
		if (!(time_status & (STA_INS | STA_DEL)))
			time_state = TIME_OK;
	}
	/*
	 * Compute the phase adjustment for the next second. In
	 * PLL mode, the offset is reduced by a fixed factor
	 * times the time constant. In FLL mode the offset is
	 * used directly. In either mode, the maximum phase
	 * adjustment for each second is clamped so as to spread
	 * the adjustment over not more than the number of
	 * seconds between updates.
	 */
	if (time_offset < 0) {
		ltemp = -time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset += ltemp;
		time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	} else {
		ltemp = time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset -= ltemp;
		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	}
	/*
	 * Compute the frequency estimate and additional phase
	 * adjustment due to frequency error for the next
	 * second. When the PPS signal is engaged, gnaw on the
	 * watchdog counter and update the frequency computed by
	 * the pll and the PPS signal.
	 */
	pps_valid++;
	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
		pps_jitter = MAXTIME;
		pps_stabil = MAXFREQ;
		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
				 STA_PPSWANDER | STA_PPSERROR);
	}
	ltemp = time_freq + pps_freq;
	if (ltemp < 0)
		time_adj -= -ltemp >>
			(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
	else
		time_adj += ltemp >>
			(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
#if HZ == 100
	/* Compensate for (HZ==100) != (1 << SHIFT_HZ).
	 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
	 */
	if (time_adj < 0)
		time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
	else
		time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
}
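
/*
 * Worked example for the HZ == 100 compensation above: time_adj is scaled
 * as if a second contained 1 << SHIFT_HZ = 128 ticks, but only 100 ticks
 * actually occur, so each per-tick adjustment has to be multiplied by
 * roughly 128/100 = 1.28.  Adding time_adj/4 and time_adj/32 multiplies it
 * by 1 + 0.25 + 0.03125 = 1.28125, leaving an error of about 0.1%.
 */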
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
	if ( (time_adjust_step = time_adjust) != 0 ) {
		/* We are doing an adjtime thing.
		 *
		 * Prepare time_adjust_step to be within bounds.
		 * Note that a positive time_adjust means we want the clock
		 * to run faster.
		 *
		 * Limit the amount of the step to be in the range
		 * -tickadj .. +tickadj
		 */
		if (time_adjust > tickadj)
			time_adjust_step = tickadj;
		else if (time_adjust < -tickadj)
			time_adjust_step = -tickadj;

		/* Reduce by this step the amount of time left */
		time_adjust -= time_adjust_step;
	}
	xtime.tv_usec += tick + time_adjust_step;
	/*
	 * Advance the phase, once it gets to one microsecond, then
	 * advance the tick more.
	 */
	time_phase += time_adj;
	if (time_phase <= -FINEUSEC) {
		long ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		xtime.tv_usec -= ltemp;
	}
	else if (time_phase >= FINEUSEC) {
		long ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		xtime.tv_usec += ltemp;
	}
}
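
/*
 * For illustration: time_adj is the per-tick adjustment scaled by
 * 2^SHIFT_SCALE, so a value of FINEUSEC (1 << SHIFT_SCALE) stands for one
 * microsecond per tick.  The fractional remainder accumulates in time_phase,
 * and whenever it reaches a whole microsecond in either direction that
 * microsecond is moved into xtime.tv_usec and taken back out of the phase,
 * so none of the adjustment is lost to rounding.
 */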
/*
 * Using a loop looks inefficient, but "ticks" is
 * usually just one (we shouldn't be losing ticks;
 * we're doing it this way mainly for interrupt
 * latency reasons, not because we expect to have
 * lots of lost timer ticks).
 */
static void update_wall_time(unsigned long ticks)
{
	do {
		ticks--;
		update_wall_time_one_tick();
	} while (ticks);

	if (xtime.tv_usec >= 1000000) {
		xtime.tv_usec -= 1000000;
		xtime.tv_sec++;
		second_overflow();
	}
}
static inline void do_process_times(struct task_struct *p,
	unsigned long user, unsigned long system)
{
	unsigned long psecs;

	psecs = (p->times.tms_utime += user);
	psecs += (p->times.tms_stime += system);
	if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
		/* Send SIGXCPU every second.. */
		if (!(psecs % HZ))
			send_sig(SIGXCPU, p, 1);
		/* and SIGKILL when we go over max.. */
		if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
			send_sig(SIGKILL, p, 1);
	}
}
static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
{
	unsigned long it_virt = p->it_virt_value;

	if (it_virt) {
		it_virt -= ticks;
		if (!it_virt) {
			it_virt = p->it_virt_incr;
			send_sig(SIGVTALRM, p, 1);
		}
		p->it_virt_value = it_virt;
	}
}
static inline void do_it_prof(struct task_struct *p)
{
	unsigned long it_prof = p->it_prof_value;

	if (it_prof) {
		if (--it_prof == 0) {
			it_prof = p->it_prof_incr;
			send_sig(SIGPROF, p, 1);
		}
		p->it_prof_value = it_prof;
	}
}
void update_one_process(struct task_struct *p, unsigned long user,
			unsigned long system, int cpu)
{
	p->per_cpu_utime[cpu] += user;
	p->per_cpu_stime[cpu] += system;
	do_process_times(p, user, system);
	do_it_virt(p, user);
	do_it_prof(p);
}
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id(), system = user_tick ^ 1;

	update_one_process(p, user_tick, system, cpu);
	if (p->pid) {
		if (--p->counter <= 0) {
			p->counter = 0;
			p->need_resched = 1;
		}
		if (p->nice > 0)
			kstat.per_cpu_nice[cpu] += user_tick;
		else
			kstat.per_cpu_user[cpu] += user_tick;
		kstat.per_cpu_system[cpu] += system;
	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
		kstat.per_cpu_system[cpu] += system;
}
/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	struct task_struct *p;
	unsigned long nr = 0;

	read_lock(&tasklist_lock);
	for_each_task(p) {
		if ((p->state == TASK_RUNNING ||
		     (p->state & TASK_UNINTERRUPTIBLE)))
			nr += FIXED_1;
	}
	read_unlock(&tasklist_lock);
	return nr;
}
/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc
 * all seem to differ on different machines.
 */
unsigned long avenrun[3];

static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (count < 0) {
		count += LOAD_FREQ;
		active_tasks = count_active_tasks();
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}
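
/*
 * Rough sketch of what CALC_LOAD() (from <linux/sched.h>) does with the
 * usual fixed-point constants (FSHIFT == 11, FIXED_1 == 1 << FSHIFT):
 *
 *	load = (load * exp + active_tasks * (FIXED_1 - exp)) >> FSHIFT;
 *
 * i.e. an exponentially decaying average recomputed every LOAD_FREQ ticks
 * (about five seconds); EXP_1, EXP_5 and EXP_15 are e^(-5s/1min),
 * e^(-5s/5min) and e^(-5s/15min) in the same fixed-point format.
 */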
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;

/*
 * This spinlock protects us from races in SMP while playing with xtime. -arca
 */
rwlock_t xtime_lock = RW_LOCK_UNLOCKED;

static inline void update_times(void)
{
	unsigned long ticks;

	/*
	 * update_times() is run from the raw timer_bh handler so we
	 * just know that the irqs are locally enabled and so we don't
	 * need to save/restore the flags of the local CPU here. -arca
	 */
	write_lock_irq(&xtime_lock);

	ticks = jiffies - wall_jiffies;
	if (ticks) {
		wall_jiffies += ticks;
		update_wall_time(ticks);
	}
	write_unlock_irq(&xtime_lock);
	calc_load(ticks);
}
void timer_bh(void)
{
	update_times();
	run_timer_list();
}

void do_timer(struct pt_regs *regs)
{
	(*(unsigned long *)&jiffies)++;
#ifndef CONFIG_SMP
	/* SMP process accounting uses the local APIC timer */

	update_process_times(user_mode(regs));
#endif
	mark_bh(TIMER_BH);
	if (TQ_ACTIVE(tq_timer))
		mark_bh(TQUEUE_BH);
}
#if !defined(__alpha__) && !defined(__ia64__)

/*
 * For backwards compatibility?  This can be done in libc so Alpha
 * and all newer ports shouldn't need it.
 */
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
	struct itimerval it_new, it_old;
	unsigned int oldalarm;

	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	it_new.it_value.tv_sec = seconds;
	it_new.it_value.tv_usec = 0;
	do_setitimer(ITIMER_REAL, &it_new, &it_old);
	oldalarm = it_old.it_value.tv_sec;
	/* ehhh.. We can't return 0 if we have an alarm pending.. */
	/* And we'd better return too much than too little anyway */
	if (it_old.it_value.tv_usec)
		oldalarm++;
	return oldalarm;
}

#endif
/*
 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
 * should be moved into arch/i386 instead?
 */
asmlinkage long sys_getpid(void)
{
	/* This is SMP safe - current->pid doesn't change */
	return current->tgid;
}
/*
 * This is not strictly SMP safe: p_opptr could change
 * from under us. However, rather than getting any lock
 * we can use an optimistic algorithm: get the parent
 * pid, and go back and check that the parent is still
 * the same. If it has changed (which is extremely unlikely
 * indeed), we just try again..
 *
 * NOTE! This depends on the fact that even if we _do_
 * get an old value of "parent", we can happily dereference
 * the pointer: we just can't necessarily trust the result
 * until we know that the parent pointer is valid.
 *
 * The "mb()" macro is a memory barrier - a synchronizing
 * event. It also makes sure that gcc doesn't optimize
 * away the necessary memory references.. The barrier doesn't
 * have to have all that strong semantics: on x86 we don't
 * really require a synchronizing instruction, for example.
 * The barrier is more important for code generation than
 * for any real memory ordering semantics (even if there is
 * a small window for a race, using the old pointer is
 * harmless for a while).
 */
asmlinkage long sys_getppid(void)
{
	int pid;
	struct task_struct * me = current;
	struct task_struct * parent;

	parent = me->p_opptr;
	for (;;) {
		pid = parent->pid;
#ifdef CONFIG_SMP
		{
			struct task_struct *old = parent;
			mb();
			parent = me->p_opptr;
			if (old != parent)
				continue;
		}
#endif
		break;
	}
	return pid;
}
asmlinkage long sys_getuid(void)
{
	/* Only we change this so SMP safe */
	return current->uid;
}

asmlinkage long sys_geteuid(void)
{
	/* Only we change this so SMP safe */
	return current->euid;
}

asmlinkage long sys_getgid(void)
{
	/* Only we change this so SMP safe */
	return current->gid;
}

asmlinkage long sys_getegid(void)
{
	/* Only we change this so SMP safe */
	return current->egid;
}
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
	struct timespec t;
	unsigned long expire;

	if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
		return -EFAULT;

	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
		return -EINVAL;

	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
	    current->policy != SCHED_OTHER)
	{
		/*
		 * Short delay requests up to 2 ms will be handled with
		 * high precision by a busy wait for all real-time processes.
		 *
		 * It's important on SMP not to do this holding locks.
		 */
		udelay((t.tv_nsec + 999) / 1000);
		return 0;
	}

	/* the extra jiffy makes the sleep last at least the requested time */
	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

	current->state = TASK_INTERRUPTIBLE;
	expire = schedule_timeout(expire);

	if (expire) {
		if (rmtp) {
			jiffies_to_timespec(expire, &t);
			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
				return -EFAULT;
		}
		return -EINTR;
	}
	return 0;
}