/*
 *  Kernel internal timers, kernel timekeeping, basic process system calls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 */
#include <linux/config.h>
#include <linux/timex.h>
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/slab.h>

#include <asm/uaccess.h>
/*
 * Timekeeping variables
 */

long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */

/* The current time */
volatile struct timeval xtime __attribute__ ((aligned (16)));

/* Don't completely fail for HZ > 500.  */
int tickadj = 500/HZ ? : 1;		/* microsecs */

DECLARE_TASK_QUEUE(tq_timer);
DECLARE_TASK_QUEUE(tq_immediate);
DECLARE_TASK_QUEUE(tq_scheduler);
/*
 * phase-lock loop variables
 */
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK;		/* clock synchronization status	*/
int time_status = STA_UNSYNC;		/* clock status bits		*/
long time_offset;			/* time adjustment (us)		*/
long time_constant = 2;			/* pll time constant		*/
long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
long time_precision = 1;		/* clock precision (us)		*/
long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
long time_phase;			/* phase offset (scaled us)	*/
long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
					/* frequency offset (scaled ppm)*/
long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
long time_reftime;			/* time at last adjustment (s)	*/

long time_adjust;
long time_adjust_step;
extern int do_setitimer(int, struct itimerval *, struct itimerval *);

unsigned long volatile jiffies;

unsigned int * prof_buffer;
unsigned long prof_len;
unsigned long prof_shift;
#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

struct timer_vec {
	int index;
	struct list_head vec[TVN_SIZE];
};

struct timer_vec_root {
	int index;
	struct list_head vec[TVR_SIZE];
};

static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;

static struct timer_vec * const tvecs[] = {
	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};

#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
void init_timervecs (void)
{
	int i;

	for (i = 0; i < TVN_SIZE; i++) {
		INIT_LIST_HEAD(tv5.vec + i);
		INIT_LIST_HEAD(tv4.vec + i);
		INIT_LIST_HEAD(tv3.vec + i);
		INIT_LIST_HEAD(tv2.vec + i);
	}
	for (i = 0; i < TVR_SIZE; i++)
		INIT_LIST_HEAD(tv1.vec + i);
}
static unsigned long timer_jiffies;
static inline void internal_add_timer(struct timer_list *timer)
{
	/*
	 * must be cli-ed when calling this
	 */
	unsigned long expires = timer->expires;
	unsigned long idx = expires - timer_jiffies;
	struct list_head * vec;

	if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/* can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = tv1.vec + tv1.index;
	} else if (idx <= 0xffffffffUL) {
		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = tv5.vec + i;
	} else {
		/* Can only get here on architectures with 64-bit jiffies */
		INIT_LIST_HEAD(&timer->list);
		return;
	}
	list_add(&timer->list, vec->prev);
}
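
/*
 * Illustrative note (not part of the original source): with the usual
 * TVR_BITS == 8 and TVN_BITS == 6, tv1 covers deltas of 0..255 jiffies
 * at one jiffy per slot, tv2 covers deltas up to 256*64 - 1, tv3 up to
 * 256*64*64 - 1, and so on.  For example, a timer due in 5000 jiffies
 * satisfies 256 <= 5000 < 1 << (8 + 6), so it is hashed into tv2 at
 * index (expires >> 8) & 63 and only reaches tv1 when that tv2 bucket
 * is cascaded.
 */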
/* Initialize both explicitly - let's try to have them in the same cache line */
spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;

#ifdef CONFIG_SMP
volatile struct timer_list * volatile running_timer = NULL;
#define timer_enter(t) do { running_timer = t; mb(); } while (0)
#define timer_exit() do { running_timer = NULL; } while (0)
#define timer_is_running(t) (running_timer == t)
#define timer_synchronize(t) while (timer_is_running(t)) barrier()
#else
#define timer_enter(t)		do { } while (0)
#define timer_exit()		do { } while (0)
#endif
void add_timer(struct timer_list *timer)
{
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	if (timer_pending(timer))
		goto bug;
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return;
bug:
	spin_unlock_irqrestore(&timerlist_lock, flags);
	printk("bug: kernel timer added twice at %p.\n",
			__builtin_return_address(0));
}
static inline int detach_timer (struct timer_list *timer)
{
	if (!timer_pending(timer))
		return 0;
	list_del(&timer->list);
	return 1;
}
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	timer->expires = expires;
	ret = detach_timer(timer);
	internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
int del_timer(struct timer_list * timer)
{
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&timerlist_lock, flags);
	ret = detach_timer(timer);
	timer->list.next = timer->list.prev = NULL;
	spin_unlock_irqrestore(&timerlist_lock, flags);
	return ret;
}
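
/*
 * Usage sketch (illustrative only, not part of this file): a driver
 * might arm one of these timers roughly as follows.  my_timeout and
 * my_example are hypothetical names used only for this example.
 */
#if 0
static struct timer_list my_timer;

static void my_timeout(unsigned long data)
{
	/* called from the timer bottom half, after the timer was detached */
}

static void my_example(void)
{
	init_timer(&my_timer);
	my_timer.function = my_timeout;
	my_timer.data = 0;
	my_timer.expires = jiffies + HZ;	/* about one second from now */
	add_timer(&my_timer);

	mod_timer(&my_timer, jiffies + 2 * HZ);	/* re-arm / move it */
	del_timer(&my_timer);			/* cancel if still pending */
}
#endif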
#ifdef CONFIG_SMP
void sync_timers(void)
{
	spin_unlock_wait(&global_bh_lock);
}
/*
 * SMP specific function to delete a periodic timer.
 * The caller must prevent the timer from being restarted; upon exit
 * the timer is not queued and the handler is not running on any CPU.
 * It returns the number of times the timer was deleted (for reference
 * counting).
 */
int del_timer_sync(struct timer_list * timer)
{
	int ret = 0;

	for (;;) {
		unsigned long flags;
		int running;

		spin_lock_irqsave(&timerlist_lock, flags);
		ret += detach_timer(timer);
		timer->list.next = timer->list.prev = 0;
		running = timer_is_running(timer);
		spin_unlock_irqrestore(&timerlist_lock, flags);

		if (!running)
			break;
		timer_synchronize(timer);
	}

	return ret;
}
#endif
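
/*
 * Illustrative note (not part of the original source): del_timer() only
 * guarantees that the timer is no longer queued; on SMP the handler may
 * still be running on another CPU when it returns.  Code that is about
 * to free the timer or the data it uses should prefer del_timer_sync(),
 * e.g. (my_timer and my_dev are hypothetical, as above):
 *
 *	del_timer_sync(&my_timer);	waits for a running handler too
 *	kfree(my_dev);
 */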
static inline void cascade_timers(struct timer_vec *tv)
{
	/* cascade all the timers from tv up one level */
	struct list_head *head, *curr, *next;

	head = tv->vec + tv->index;
	curr = head->next;
	/*
	 * We are removing _all_ timers from the list, so we don't have to
	 * detach them individually, just clear the list afterwards.
	 */
	while (curr != head) {
		struct timer_list *tmp;

		tmp = list_entry(curr, struct timer_list, list);
		next = curr->next;
		list_del(curr); // not needed
		internal_add_timer(tmp);
		curr = next;
	}
	INIT_LIST_HEAD(head);
	tv->index = (tv->index + 1) & TVN_MASK;
}
static inline void run_timer_list(void)
{
	spin_lock_irq(&timerlist_lock);
	while ((long)(jiffies - timer_jiffies) >= 0) {
		struct list_head *head, *curr;
		if (!tv1.index) {
			int n = 1;
			do {
				cascade_timers(tvecs[n]);
			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
		}
repeat:
		head = tv1.vec + tv1.index;
		curr = head->next;
		if (curr != head) {
			struct timer_list *timer;
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_entry(curr, struct timer_list, list);
			fn = timer->function;
			data = timer->data;

			detach_timer(timer);
			timer->list.next = timer->list.prev = NULL;
			timer_enter(timer);
			spin_unlock_irq(&timerlist_lock);
			fn(data);
			spin_lock_irq(&timerlist_lock);
			timer_exit();
			goto repeat;
		}
		++timer_jiffies;
		tv1.index = (tv1.index + 1) & TVR_MASK;
	}
	spin_unlock_irq(&timerlist_lock);
}
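
/*
 * Illustrative note (not part of the original source): cascading only
 * happens when tv1.index wraps back to 0, i.e. once every TVR_SIZE
 * (256) jiffies.  At that point one tv2 bucket is emptied and its
 * timers are re-hashed by internal_add_timer(); since their remaining
 * delta is now below 256 jiffies they land in tv1.  tv3, tv4 and tv5
 * are cascaded in turn only when the level below them wraps around.
 */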
spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;

void tqueue_bh(void)
{
	run_task_queue(&tq_timer);
}

void immediate_bh(void)
{
	run_task_queue(&tq_immediate);
}
/*
 * this routine handles the overflow of the microsecond field
 *
 * The tricky bits of code to handle the accurate clock support
 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 * They were originally developed for SUN and DEC kernels.
 * All the kudos should go to Dave for this stuff.
 */
static void second_overflow(void)
{
    long ltemp;

    /* Bump the maxerror field */
    time_maxerror += time_tolerance >> SHIFT_USEC;
    if ( time_maxerror > NTP_PHASE_LIMIT ) {
	time_maxerror = NTP_PHASE_LIMIT;
	time_status |= STA_UNSYNC;
    }

    /*
     * Leap second processing. If in leap-insert state at
     * the end of the day, the system clock is set back one
     * second; if in leap-delete state, the system clock is
     * set ahead one second. The microtime() routine or
     * external clock driver will insure that reported time
     * is always monotonic. The ugly divides should be
     * replaced.
     */
    switch (time_state) {

    case TIME_OK:
	if (time_status & STA_INS)
	    time_state = TIME_INS;
	else if (time_status & STA_DEL)
	    time_state = TIME_DEL;
	break;

    case TIME_INS:
	if (xtime.tv_sec % 86400 == 0) {
	    xtime.tv_sec--;
	    time_state = TIME_OOP;
	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
	}
	break;

    case TIME_DEL:
	if ((xtime.tv_sec + 1) % 86400 == 0) {
	    xtime.tv_sec++;
	    time_state = TIME_WAIT;
	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
	}
	break;

    case TIME_OOP:
	time_state = TIME_WAIT;
	break;

    case TIME_WAIT:
	if (!(time_status & (STA_INS | STA_DEL)))
	    time_state = TIME_OK;
    }
    /*
     * Compute the phase adjustment for the next second. In
     * PLL mode, the offset is reduced by a fixed factor
     * times the time constant. In FLL mode the offset is
     * used directly. In either mode, the maximum phase
     * adjustment for each second is clamped so as to spread
     * the adjustment over not more than the number of
     * seconds between updates.
     */
    if (time_offset < 0) {
	ltemp = -time_offset;
	if (!(time_status & STA_FLL))
	    ltemp >>= SHIFT_KG + time_constant;
	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
	time_offset += ltemp;
	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
    } else {
	ltemp = time_offset;
	if (!(time_status & STA_FLL))
	    ltemp >>= SHIFT_KG + time_constant;
	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
	time_offset -= ltemp;
	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
    }
    /*
     * Compute the frequency estimate and additional phase
     * adjustment due to frequency error for the next
     * second. When the PPS signal is engaged, gnaw on the
     * watchdog counter and update the frequency computed by
     * the pll and the PPS signal.
     */
    pps_valid++;
    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
	pps_jitter = MAXTIME;
	pps_stabil = MAXFREQ;
	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
			 STA_PPSWANDER | STA_PPSERROR);
    }
    ltemp = time_freq + pps_freq;
    if (ltemp < 0)
	time_adj -= -ltemp >>
	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
    else
	time_adj += ltemp >>
	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);

#if HZ == 100
    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
     */
    if (time_adj < 0)
	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
    else
	time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
}
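
/*
 * Worked example (illustrative, assuming the usual <linux/timex.h>
 * values MAXPHASE == 512000 and MINSEC == 16): the clamp above limits
 * the phase correction applied per second to MAXPHASE / MINSEC ==
 * 32000 microseconds (in scaled form), so even a maximal 512 ms offset
 * is slewed out over at least 16 seconds rather than stepped at once.
 */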
/* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void)
{
	if ( (time_adjust_step = time_adjust) != 0 ) {
	    /* We are doing an adjtime thing.
	     *
	     * Prepare time_adjust_step to be within bounds.
	     * Note that a positive time_adjust means we want the clock
	     * to run faster.
	     *
	     * Limit the amount of the step to be in the range
	     * -tickadj .. +tickadj
	     */
	    if (time_adjust > tickadj)
		time_adjust_step = tickadj;
	    else if (time_adjust < -tickadj)
		time_adjust_step = -tickadj;

	    /* Reduce by this step the amount of time left */
	    time_adjust -= time_adjust_step;
	}
	xtime.tv_usec += tick + time_adjust_step;
	/*
	 * Advance the phase, once it gets to one microsecond, then
	 * advance the tick more.
	 */
	time_phase += time_adj;
	if (time_phase <= -FINEUSEC) {
		long ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		xtime.tv_usec -= ltemp;
	}
	else if (time_phase >= FINEUSEC) {
		long ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		xtime.tv_usec += ltemp;
	}
}
/*
 * Using a loop looks inefficient, but "ticks" is
 * usually just one (we shouldn't be losing ticks,
 * we're doing it this way mainly for interrupt
 * latency reasons, not because we think we'll
 * have lots of lost timer ticks).
 */
static void update_wall_time(unsigned long ticks)
{
	do {
		ticks--;
		update_wall_time_one_tick();
	} while (ticks);

	if (xtime.tv_usec >= 1000000) {
	    xtime.tv_usec -= 1000000;
	    xtime.tv_sec++;
	    second_overflow();
	}
}
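
/*
 * Worked example (illustrative, not part of the original source): with
 * HZ == 100 the base tick is 10000 us and tickadj is 500/HZ == 5 us, so
 * during an adjtime() adjustment each tick advances xtime by at most
 * 10000 +/- 5 us.  A 1 ms (1000 us) adjustment therefore takes about
 * 200 ticks, i.e. roughly two seconds, to be slewed out.
 */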
static inline void do_process_times(struct task_struct *p,
	unsigned long user, unsigned long system)
{
	unsigned long psecs;

	psecs = (p->times.tms_utime += user);
	psecs += (p->times.tms_stime += system);
	if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
		/* Send SIGXCPU every second.. */
		if (!(psecs % HZ))
			send_sig(SIGXCPU, p, 1);
		/* and SIGKILL when we go over max.. */
		if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
			send_sig(SIGKILL, p, 1);
	}
}
static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
{
	unsigned long it_virt = p->it_virt_value;

	if (it_virt) {
		it_virt -= ticks;
		if (!it_virt) {
			it_virt = p->it_virt_incr;
			send_sig(SIGVTALRM, p, 1);
		}
		p->it_virt_value = it_virt;
	}
}
static inline void do_it_prof(struct task_struct *p)
{
	unsigned long it_prof = p->it_prof_value;

	if (it_prof) {
		if (--it_prof == 0) {
			it_prof = p->it_prof_incr;
			send_sig(SIGPROF, p, 1);
		}
		p->it_prof_value = it_prof;
	}
}
void update_one_process(struct task_struct *p, unsigned long user,
			unsigned long system, int cpu)
{
	p->per_cpu_utime[cpu] += user;
	p->per_cpu_stime[cpu] += system;
	do_process_times(p, user, system);
	do_it_virt(p, user);
	do_it_prof(p);
}
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id(), system = user_tick ^ 1;

	update_one_process(p, user_tick, system, cpu);
	if (p->pid) {
		if (--p->counter <= 0) {
			p->counter = 0;
			p->need_resched = 1;
		}
		if (p->nice > 0)
			kstat.per_cpu_nice[cpu] += user_tick;
		else
			kstat.per_cpu_user[cpu] += user_tick;
		kstat.per_cpu_system[cpu] += system;
	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
		kstat.per_cpu_system[cpu] += system;

	if (slab_cache_drain_mask & (1UL << cpu))
		slab_drain_local_cache();
}
/*
 * Nr of active tasks - counted in fixed-point numbers
 */
static unsigned long count_active_tasks(void)
{
	struct task_struct *p;
	unsigned long nr = 0;

	read_lock(&tasklist_lock);
	for_each_task(p) {
		if ((p->state == TASK_RUNNING ||
		     (p->state & TASK_UNINTERRUPTIBLE)))
			nr += FIXED_1;
	}
	read_unlock(&tasklist_lock);
	return nr;
}
/*
 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
 * imply that avenrun[] is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc
 * all seem to differ on different machines.
 */
unsigned long avenrun[3];
static inline void calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;
	if (count < 0) {
		count += LOAD_FREQ;
		active_tasks = count_active_tasks();
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}
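
/*
 * Worked example (illustrative, assuming the usual <linux/sched.h>
 * definitions FSHIFT == 11, FIXED_1 == 1 << 11 and LOAD_FREQ == 5*HZ):
 * CALC_LOAD(load, exp, n) computes
 *
 *	load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT
 *
 * roughly every five seconds, i.e. an exponentially decaying average of
 * the active-task count in 11-bit fixed point; the avenrun[] values are
 * scaled back down by FIXED_1 when they are presented to userspace.
 */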
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;

/*
 * This spinlock protects us from races in SMP while playing with xtime. -arca
 */
rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
static inline void update_times(void)
{
	unsigned long ticks;

	/*
	 * update_times() is run from the raw timer_bh handler so we
	 * just know that the irqs are locally enabled and so we don't
	 * need to save/restore the flags of the local CPU here. -arca
	 */
	write_lock_irq(&xtime_lock);

	ticks = jiffies - wall_jiffies;
	if (ticks) {
		wall_jiffies += ticks;
		update_wall_time(ticks);
	}
	write_unlock_irq(&xtime_lock);
	calc_load(ticks);
}
void do_timer(struct pt_regs *regs)
{
	(*(unsigned long *)&jiffies)++;
#ifndef CONFIG_SMP
	/* SMP process accounting uses the local APIC timer */

	update_process_times(user_mode(regs));
#endif
	mark_bh(TIMER_BH);
	if (TQ_ACTIVE(tq_timer))
		mark_bh(TQUEUE_BH);
}
#if !defined(__alpha__) && !defined(__ia64__)

/*
 * For backwards compatibility?  This can be done in libc so Alpha
 * and all newer ports shouldn't need it.
 */
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
	struct itimerval it_new, it_old;
	unsigned int oldalarm;

	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
	it_new.it_value.tv_sec = seconds;
	it_new.it_value.tv_usec = 0;
	do_setitimer(ITIMER_REAL, &it_new, &it_old);
	oldalarm = it_old.it_value.tv_sec;
	/* ehhh.. We can't return 0 if we have an alarm pending.. */
	/* And we'd better return too much than too little anyway */
	if (it_old.it_value.tv_usec)
		oldalarm++;
	return oldalarm;
}

#endif
/*
 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
 * should be moved into arch/i386 instead?
 */

asmlinkage long sys_getpid(void)
{
	/* This is SMP safe - current->pid doesn't change */
	return current->pid;
}
/*
 * This is not strictly SMP safe: p_opptr could change
 * from under us. However, rather than getting any lock
 * we can use an optimistic algorithm: get the parent
 * pid, and go back and check that the parent is still
 * the same. If it has changed (which is extremely unlikely
 * indeed), we just try again..
 *
 * NOTE! This depends on the fact that even if we _do_
 * get an old value of "parent", we can happily dereference
 * the pointer: we just can't necessarily trust the result
 * until we know that the parent pointer is valid.
 *
 * The "mb()" macro is a memory barrier - a synchronizing
 * event. It also makes sure that gcc doesn't optimize
 * away the necessary memory references.. The barrier doesn't
 * have to have all that strong semantics: on x86 we don't
 * really require a synchronizing instruction, for example.
 * The barrier is more important for code generation than
 * for any real memory ordering semantics (even if there is
 * a small window for a race, using the old pointer is
 * harmless for a while).
 */
asmlinkage long sys_getppid(void)
{
	int pid;
	struct task_struct * me = current;
	struct task_struct * parent;

	parent = me->p_opptr;
	for (;;) {
		pid = parent->pid;
#ifdef CONFIG_SMP
		{
			struct task_struct *old = parent;
			mb();
			parent = me->p_opptr;
			if (old != parent)
				continue;
		}
#endif
		break;
	}
	return pid;
}
asmlinkage long sys_getuid(void)
{
	/* Only we change this so SMP safe */
	return current->uid;
}

asmlinkage long sys_geteuid(void)
{
	/* Only we change this so SMP safe */
	return current->euid;
}

asmlinkage long sys_getgid(void)
{
	/* Only we change this so SMP safe */
	return current->gid;
}

asmlinkage long sys_getegid(void)
{
	/* Only we change this so SMP safe */
	return current->egid;
}
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
	struct timespec t;
	unsigned long expire;

	if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
		return -EFAULT;

	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
		return -EINVAL;

	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
	    current->policy != SCHED_OTHER)
	{
		/*
		 * Short delay requests up to 2 ms will be handled with
		 * high precision by a busy wait for all real-time processes.
		 *
		 * It's important on SMP not to do this while holding locks.
		 */
		udelay((t.tv_nsec + 999) / 1000);
		return 0;
	}

	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

	current->state = TASK_INTERRUPTIBLE;
	expire = schedule_timeout(expire);

	if (expire) {
		if (rmtp) {
			jiffies_to_timespec(expire, &t);
			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
				return -EFAULT;
		}
		return -EINTR;
	}
	return 0;
}
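
/*
 * Userspace view (illustrative sketch, not part of this file): when the
 * sleep is interrupted by a signal, the unslept time is written back
 * through rmtp so the caller can resume where it left off, e.g.
 *
 *	struct timespec req = { 1, 500000000 };		1.5 seconds
 *	struct timespec rem;
 *	while (nanosleep(&req, &rem) == -1 && errno == EINTR)
 *		req = rem;
 */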