 *  Kernel internal timers, kernel timekeeping, basic process system calls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 */
#include <linux/config.h>
#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>

#include <asm/uaccess.h>
/*
 * Timekeeping variables
 */
long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */

/* The current time */
volatile struct timeval xtime __attribute__ ((aligned (16)));

/* Don't completely fail for HZ > 500. */
int tickadj = 500/HZ ? : 1;		/* microsecs */

DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
/*
 * phase-lock loop variables
 */
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK;		/* clock synchronization status	*/
int time_status = STA_UNSYNC;		/* clock status bits		*/
long time_offset;			/* time adjustment (us)		*/
long time_constant = 2;			/* pll time constant		*/
long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
long time_precision = 1;		/* clock precision (us)		*/
long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
long time_phase;			/* phase offset (scaled us)	*/
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
					/* frequency offset (scaled ppm)*/
long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
long time_reftime;			/* time at last adjustment (s)	*/

long time_adjust;
long time_adjust_step;
extern int do_setitimer(int, struct itimerval *, struct itimerval *);

unsigned long volatile jiffies;

unsigned int * prof_buffer;
unsigned long prof_len;
unsigned long prof_shift;

#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
struct timer_vec {
	int index;
	struct list_head vec[TVN_SIZE];
};

struct timer_vec_root {
	int index;
	struct list_head vec[TVR_SIZE];
};

static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;

static struct timer_vec * const tvecs[] = {
	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
void init_timervecs (void)
{
	int i;

	for (i = 0; i < TVN_SIZE; i++) {
		INIT_LIST_HEAD(tv5.vec + i);
		INIT_LIST_HEAD(tv4.vec + i);
		INIT_LIST_HEAD(tv3.vec + i);
		INIT_LIST_HEAD(tv2.vec + i);
	}
	for (i = 0; i < TVR_SIZE; i++)
		INIT_LIST_HEAD(tv1.vec + i);
}
static unsigned long timer_jiffies;
static inline void internal_add_timer(struct timer_list *timer)
{
	/*
	 * must be cli-ed when calling this
	 */
	unsigned long expires = timer->expires;
	unsigned long idx = expires - timer_jiffies;
	struct list_head * vec;

	if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/* can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = tv1.vec + tv1.index;
	} else if (idx <= 0xffffffffUL) {
		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = tv5.vec + i;
	} else {
		/* Can only get here on architectures with 64-bit jiffies */
		INIT_LIST_HEAD(&timer->list);
		return;
	}
	/*
	 * Timers are FIFO!
	 */
	list_add(&timer->list, vec->prev);
}
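/*
 * Editorial sketch of the bucket math above, assuming the customary
 * TVR_BITS = 8 and TVN_BITS = 6 (an assumption, not restated in this
 * excerpt): tv1 holds timers due within the next 256 jiffies, tv2
 * within 2^14, tv3 within 2^20, tv4 within 2^26 and tv5 covers the
 * rest. A timer due in 300 jiffies therefore lands in tv2 at slot
 * (expires >> 8) & 63, and cascade_timers() later spills it into tv1
 * shortly before it expires.
 */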
/* Initialize both explicitly - let's try to have them in the same cache line */
spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;

#ifdef CONFIG_SMP
volatile struct timer_list * volatile running_timer;
#define timer_enter(t) do { running_timer = t; mb(); } while (0)
#define timer_exit() do { running_timer = NULL; } while (0)
#define timer_is_running(t) (running_timer == t)
#define timer_synchronize(t) while (timer_is_running(t)) barrier()
#else
#define timer_enter(t)		do { } while (0)
#define timer_exit()		do { } while (0)
#endif
void add_timer(struct timer_list *timer)
{
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	if (timer_pending(timer))
		goto bug;
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return;
bug:
	spin_unlock_irqrestore(&timerlist_lock, flags);
	printk("bug: kernel timer added twice at %p.\n",
			__builtin_return_address(0));
}
static inline int detach_timer (struct timer_list *timer)
{
	if (!timer_pending(timer))
		return 0;
	list_del(&timer->list);
	return 1;
}
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	timer->expires = expires;
	ret = detach_timer(timer);
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
int del_timer(struct timer_list * timer)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	ret = detach_timer(timer);
	timer->list.next = timer->list.prev = NULL;
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
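/*
 * Editorial sketch (not part of the original file): typical driver use
 * of the API above. init_timer() comes from <linux/timer.h>; my_timer,
 * my_timeout() and the cookie value are hypothetical names.
 */
#if 0
static struct timer_list my_timer;

static void my_timeout(unsigned long data)
{
	/* runs from the timer bottom half once the timer expires */
	printk("timeout, cookie %lx\n", data);
}

static void my_example(unsigned long cookie)
{
	init_timer(&my_timer);
	my_timer.function = my_timeout;
	my_timer.data = cookie;
	my_timer.expires = jiffies + HZ;	/* roughly one second from now */
	add_timer(&my_timer);

	/* push the expiry back; mod_timer() re-queues whether or not pending */
	mod_timer(&my_timer, jiffies + 2*HZ);

	/* del_timer() returns 1 if the timer was still pending */
	if (!del_timer(&my_timer))
		printk("timer had already run (or was never queued)\n");
}
#endif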
#ifdef CONFIG_SMP
void sync_timers(void)
{
	spin_unlock_wait(&global_bh_lock);
}
/*
 * SMP specific function to delete a periodic timer.
 * The caller must somehow prevent the timer from being restarted;
 * upon exit the timer is not queued and the handler is not running
 * on any CPU. It returns the number of times the timer was deleted
 * (for reference counting).
 */
int del_timer_sync(struct timer_list * timer)
{
	int ret = 0;

	for (;;) {
		unsigned long flags;
		int running;

		spin_lock_irqsave(&timerlist_lock, flags);
		ret += detach_timer(timer);
		timer->list.next = timer->list.prev = 0;
		running = timer_is_running(timer);
		spin_unlock_irqrestore(&timerlist_lock, flags);

		if (!running)
			break;

		timer_synchronize(timer);
	}

	return ret;
}
#endif
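/*
 * Editorial sketch (not part of the original file): on SMP, del_timer()
 * alone can return while the handler is still executing on another CPU,
 * so teardown paths should use del_timer_sync(); my_timer is the
 * hypothetical timer from the earlier sketch.
 */
#if 0
static void my_cleanup(void)
{
	del_timer_sync(&my_timer);	/* dequeued and handler finished */
}
#endif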
static inline void cascade_timers(struct timer_vec *tv)
{
	/* cascade all the timers from tv up one level */
	struct list_head *head, *curr, *next;

	head = tv->vec + tv->index;
	curr = head->next;
	/*
	 * We are removing _all_ timers from the list, so we don't have to
	 * detach them individually, just clear the list afterwards.
	 */
	while (curr != head) {
		struct timer_list *tmp;

		tmp = list_entry(curr, struct timer_list, list);
		next = curr->next;
		list_del(curr); // not needed
		internal_add_timer(tmp);
		curr = next;
	}
	INIT_LIST_HEAD(head);
	tv->index = (tv->index + 1) & TVN_MASK;
}
static inline void run_timer_list(void)
{
	spin_lock_irq(&timerlist_lock);
	while ((long)(jiffies - timer_jiffies) >= 0) {
		struct list_head *head, *curr;
		if (!tv1.index) {
			int n = 1;
			do {
				cascade_timers(tvecs[n]);
			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
		}
repeat:
		head = tv1.vec + tv1.index;
		curr = head->next;
		if (curr != head) {
			struct timer_list *timer;
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_entry(curr, struct timer_list, list);
			fn = timer->function;
			data = timer->data;

			detach_timer(timer);
			timer->list.next = timer->list.prev = NULL;
			timer_enter(timer);
			spin_unlock_irq(&timerlist_lock);
			fn(data);
			spin_lock_irq(&timerlist_lock);
			timer_exit();
			goto repeat;
		}
		++timer_jiffies;
		tv1.index = (tv1.index + 1) & TVR_MASK;
	}
	spin_unlock_irq(&timerlist_lock);
}
spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;

void tqueue_bh(void)
{
	run_task_queue(&tq_timer);
}

void immediate_bh(void)
{
	run_task_queue(&tq_immediate);
}
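/*
 * Editorial sketch (not part of the original file): how the two task
 * queues declared at the top of this file are typically fed.
 * struct tq_struct and queue_task() come from <linux/tqueue.h>;
 * my_task and my_routine are hypothetical names.
 */
#if 0
static void my_routine(void *data)
{
	/* runs in bottom-half context */
}

static struct tq_struct my_task = {
	routine:	my_routine,
};

static void my_defer_work(void)
{
	/* run my_routine() from the immediate bottom half... */
	queue_task(&my_task, &tq_immediate);
	mark_bh(IMMEDIATE_BH);

	/* ...or queue it on tq_timer instead to run on the next tick
	 * (see do_timer() below, which marks TQUEUE_BH when tq_timer
	 * is non-empty). */
}
#endif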
/*
 * this routine handles the overflow of the microsecond field
 *
 * The tricky bits of code to handle the accurate clock support
 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 * They were originally developed for SUN and DEC kernels.
 * All the kudos should go to Dave for this stuff.
 */
static void second_overflow(void)
{
	long ltemp;

	/* Bump the maxerror field */
	time_maxerror += time_tolerance >> SHIFT_USEC;
	if ( time_maxerror > NTP_PHASE_LIMIT ) {
		time_maxerror = NTP_PHASE_LIMIT;
		time_status |= STA_UNSYNC;
	}
	/*
	 * Leap second processing. If in leap-insert state at
	 * the end of the day, the system clock is set back one
	 * second; if in leap-delete state, the system clock is
	 * set ahead one second. The microtime() routine or
	 * external clock driver will ensure that reported time
	 * is always monotonic. The ugly divides should be
	 * replaced.
	 */
	switch (time_state) {

	case TIME_OK:
		if (time_status & STA_INS)
			time_state = TIME_INS;
		else if (time_status & STA_DEL)
			time_state = TIME_DEL;
		break;

	case TIME_INS:
		if (xtime.tv_sec % 86400 == 0) {
			xtime.tv_sec--;
			time_state = TIME_OOP;
			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
		}
		break;

	case TIME_DEL:
		if ((xtime.tv_sec + 1) % 86400 == 0) {
			xtime.tv_sec++;
			time_state = TIME_WAIT;
			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
		}
		break;

	case TIME_OOP:
		time_state = TIME_WAIT;
		break;

	case TIME_WAIT:
		if (!(time_status & (STA_INS | STA_DEL)))
			time_state = TIME_OK;
	}
	/*
	 * Compute the phase adjustment for the next second. In
	 * PLL mode, the offset is reduced by a fixed factor
	 * times the time constant. In FLL mode the offset is
	 * used directly. In either mode, the maximum phase
	 * adjustment for each second is clamped so as to spread
	 * the adjustment over not more than the number of
	 * seconds between updates.
	 */
	if (time_offset < 0) {
		ltemp = -time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset += ltemp;
		time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	} else {
		ltemp = time_offset;
		if (!(time_status & STA_FLL))
			ltemp >>= SHIFT_KG + time_constant;
		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
		time_offset -= ltemp;
		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
	}
	/*
	 * Compute the frequency estimate and additional phase
	 * adjustment due to frequency error for the next
	 * second. When the PPS signal is engaged, gnaw on the
	 * watchdog counter and update the frequency computed by
	 * the pll and the PPS signal.
	 */
	pps_valid++;
	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
		pps_jitter = MAXTIME;
		pps_stabil = MAXFREQ;
		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
				 STA_PPSWANDER | STA_PPSERROR);
	}
	ltemp = time_freq + pps_freq;
	if (ltemp < 0)
		time_adj -= -ltemp >>
		    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
	else
		time_adj += ltemp >>
		    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);

#if HZ == 100
	/* Compensate for (HZ==100) != (1 << SHIFT_HZ).
	 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
	 */
	if (time_adj < 0)
		time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
	else
		time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
}
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
	if ( (time_adjust_step = time_adjust) != 0 ) {
		/* We are doing an adjtime thing.
		 *
		 * Prepare time_adjust_step to be within bounds.
		 * Note that a positive time_adjust means we want the clock
		 * to run faster.
		 *
		 * Limit the amount of the step to be in the range
		 * -tickadj .. +tickadj
		 */
		if (time_adjust > tickadj)
			time_adjust_step = tickadj;
		else if (time_adjust < -tickadj)
			time_adjust_step = -tickadj;

		/* Reduce by this step the amount of time left */
		time_adjust -= time_adjust_step;
	}
	xtime.tv_usec += tick + time_adjust_step;
	/*
	 * Advance the phase, once it gets to one microsecond, then
	 * advance the tick more.
	 */
	time_phase += time_adj;
	if (time_phase <= -FINEUSEC) {
		long ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		xtime.tv_usec -= ltemp;
	}
	else if (time_phase >= FINEUSEC) {
		long ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		xtime.tv_usec += ltemp;
	}
}
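/*
 * Editorial note: a worked example of the adjtime() slewing above,
 * assuming HZ = 100 (so tick = 10000 us and tickadj = 5): a pending
 * time_adjust of +3000 us is applied at most 5 us per tick, so the
 * clock catches up over 600 ticks, i.e. about 6 seconds, instead of
 * being stepped all at once.
 */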
/*
 * Using a loop looks inefficient, but "ticks" is
 * usually just one (we shouldn't be losing ticks,
 * we're doing it this way mainly for interrupt
 * latency reasons, not because we think we'll
 * have lots of lost timer ticks).
 */
static void update_wall_time(unsigned long ticks)
{
	do {
		ticks--;
		update_wall_time_one_tick();
	} while (ticks);

	if (xtime.tv_usec >= 1000000) {
		xtime.tv_usec -= 1000000;
		xtime.tv_sec++;
		second_overflow();
	}
}
static inline void do_process_times(struct task_struct *p,
	unsigned long user, unsigned long system)
{
	unsigned long psecs;

	psecs = (p->times.tms_utime += user);
	psecs += (p->times.tms_stime += system);
	if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
		/* Send SIGXCPU every second.. */
		if (!(psecs % HZ))
			send_sig(SIGXCPU, p, 1);
		/* and SIGKILL when we go over max.. */
		if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
			send_sig(SIGKILL, p, 1);
	}
}
static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
{
	unsigned long it_virt = p->it_virt_value;

	if (it_virt) {
		it_virt -= ticks;
		if (!it_virt) {
			it_virt = p->it_virt_incr;
			send_sig(SIGVTALRM, p, 1);
		}
		p->it_virt_value = it_virt;
	}
}
static inline void do_it_prof(struct task_struct *p)
{
	unsigned long it_prof = p->it_prof_value;

	if (it_prof) {
		if (--it_prof == 0) {
			it_prof = p->it_prof_incr;
			send_sig(SIGPROF, p, 1);
		}
		p->it_prof_value = it_prof;
	}
}
void update_one_process(struct task_struct *p, unsigned long user,
			unsigned long system, int cpu)
{
	p->per_cpu_utime[cpu] += user;
	p->per_cpu_stime[cpu] += system;
	do_process_times(p, user, system);
	do_it_virt(p, user);
	do_it_prof(p);
}
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id(), system = user_tick ^ 1;

	update_one_process(p, user_tick, system, cpu);
	if (p->pid) {
		if (--p->counter <= 0) {
			p->counter = 0;
			p->need_resched = 1;
		}
		if (p->nice > 0)
			kstat.per_cpu_nice[cpu] += user_tick;
		else
			kstat.per_cpu_user[cpu] += user_tick;
		kstat.per_cpu_system[cpu] += system;
	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
		kstat.per_cpu_system[cpu] += system;
}
/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	struct task_struct *p;
	unsigned long nr = 0;

	read_lock(&tasklist_lock);
	for_each_task(p) {
		if ((p->state == TASK_RUNNING ||
		     (p->state & TASK_UNINTERRUPTIBLE)))
			nr += FIXED_1;
	}
	read_unlock(&tasklist_lock);
	return nr;
}
/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc
 * all seem to differ on different machines.
 */
unsigned long avenrun[3];
static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (count < 0) {
		count += LOAD_FREQ;
		active_tasks = count_active_tasks();
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}
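/*
 * Editorial note: CALC_LOAD is the fixed-point exponential-decay macro
 * from <linux/sched.h>; assuming its usual definition it expands to
 * roughly
 *
 *	load = (load * exp + active * (FIXED_1 - exp)) >> FSHIFT;
 *
 * With FSHIFT = 11 (FIXED_1 = 2048) and EXP_1 = 1884 (about
 * 2048 * e^(-5s/60s)), avenrun[0] tracks the sampled task count with a
 * one-minute time constant; EXP_5 and EXP_15 do the same for 5 and 15
 * minutes. These constants are assumptions from the usual headers, not
 * restated in this file.
 */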
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;

/*
 * This spinlock protects us from races in SMP while playing with xtime. -arca
 */
rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
static inline void update_times(void)
{
	unsigned long ticks;

	/*
	 * update_times() is run from the raw timer_bh handler so we
	 * just know that the irqs are locally enabled and so we don't
	 * need to save/restore the flags of the local CPU here. -arca
	 */
	write_lock_irq(&xtime_lock);

	ticks = jiffies - wall_jiffies;
	if (ticks) {
		wall_jiffies += ticks;
		update_wall_time(ticks);
	}
	write_unlock_irq(&xtime_lock);
	calc_load(ticks);
}
void do_timer(struct pt_regs *regs)
{
	(*(unsigned long *)&jiffies)++;
#ifndef CONFIG_SMP
	/* SMP process accounting uses the local APIC timer */

	update_process_times(user_mode(regs));
#endif
	mark_bh(TIMER_BH);
	if (TQ_ACTIVE(tq_timer))
		mark_bh(TQUEUE_BH);
}
#if !defined(__alpha__) && !defined(__ia64__)

/*
 * For backwards compatibility?  This can be done in libc so Alpha
 * and all newer ports shouldn't need it.
 */
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
	struct itimerval it_new, it_old;
	unsigned int oldalarm;

	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	it_new.it_value.tv_sec = seconds;
	it_new.it_value.tv_usec = 0;
	do_setitimer(ITIMER_REAL, &it_new, &it_old);
	oldalarm = it_old.it_value.tv_sec;
	/* ehhh.. We can't return 0 if we have an alarm pending.. */
	/* And we'd better return too much than too little anyway */
	if (it_old.it_value.tv_usec)
		oldalarm++;
	return oldalarm;
}

#endif
/*
 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
 * should be moved into arch/i386 instead?
 */
asmlinkage long sys_getpid(void)
{
	/* This is SMP safe - current->pid doesn't change */
	return current->tgid;
}
/*
 * This is not strictly SMP safe: p_opptr could change
 * from under us. However, rather than getting any lock
 * we can use an optimistic algorithm: get the parent
 * pid, and go back and check that the parent is still
 * the same. If it has changed (which is extremely unlikely
 * indeed), we just try again..
 *
 * NOTE! This depends on the fact that even if we _do_
 * get an old value of "parent", we can happily dereference
 * the pointer: we just can't necessarily trust the result
 * until we know that the parent pointer is valid.
 *
 * The "mb()" macro is a memory barrier - a synchronizing
 * event. It also makes sure that gcc doesn't optimize
 * away the necessary memory references.. The barrier doesn't
 * have to have all that strong semantics: on x86 we don't
 * really require a synchronizing instruction, for example.
 * The barrier is more important for code generation than
 * for any real memory ordering semantics (even if there is
 * a small window for a race, using the old pointer is
 * harmless for a while).
 */
asmlinkage long sys_getppid(void)
{
	int pid;
	struct task_struct * me = current;
	struct task_struct * parent;

	parent = me->p_opptr;
	for (;;) {
		pid = parent->pid;
#if CONFIG_SMP
{
		struct task_struct *old = parent;
		mb();
		parent = me->p_opptr;
		if (old != parent)
			continue;
}
#endif
		break;
	}
	return pid;
}
asmlinkage long sys_getuid(void)
{
	/* Only we change this so SMP safe */
	return current->uid;
}

asmlinkage long sys_geteuid(void)
{
	/* Only we change this so SMP safe */
	return current->euid;
}

asmlinkage long sys_getgid(void)
{
	/* Only we change this so SMP safe */
	return current->gid;
}

asmlinkage long sys_getegid(void)
{
	/* Only we change this so SMP safe */
	return current->egid;
}
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
	struct timespec t;
	unsigned long expire;

	if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
		return -EFAULT;

	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
		return -EINVAL;

	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
	    current->policy != SCHED_OTHER)
	{
		/*
		 * Short delay requests up to 2 ms will be handled with
		 * high precision by a busy wait for all real-time processes.
		 *
		 * It's important on SMP not to do this holding locks.
		 */
		udelay((t.tv_nsec + 999) / 1000);
		return 0;
	}

	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

	current->state = TASK_INTERRUPTIBLE;
	expire = schedule_timeout(expire);

	if (expire) {
		if (rmtp) {
			jiffies_to_timespec(expire, &t);
			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))