/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
 */

#include <linux/config.h>
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
/* The idle threads do not count.. */
int nr_threads;
int nr_running;

int max_threads;
unsigned long total_forks;      /* Handle normal Linux uptimes. */
int last_pid;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ      (PIDHASH_SZ >> 2)

static struct user_struct {
        atomic_t count;
        struct user_struct *next, **pprev;
        unsigned int uid;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)  (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
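/*
 * Worked example: for uid 1000 the macro folds the high byte into the low
 * bits, (1000 >> 8) ^ 1000 = 3 ^ 1000 = 1003, and then masks the result
 * with (UIDHASH_SZ - 1) to get a bucket index, so uids that differ only
 * in their high byte still spread across the hash table.
 */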
/*
 * These routines must be called with the uidhash spinlock held!
 */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
        if((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
}

static inline void uid_hash_remove(struct user_struct *up)
{
        if(up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
}
static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
        struct user_struct *up, *next;

        next = uidhash[hashent];
        for (;;) {
                up = next;
                if (next) {
                        next = up->next;
                        if (up->uid != uid)
                                continue;
                        atomic_inc(&up->count);
                }
                break;
        }
        return up;
}
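/*
 * Note: uid_hash_find() returns NULL if no entry in the bucket matches;
 * on a match it bumps the entry's reference count before returning, so
 * the caller already owns a reference to the user_struct it gets back.
 */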
/*
 * For SMP, we need to re-test the user struct counter
 * after having acquired the spinlock. This allows us to do
 * the common case (not freeing anything) without having
 * any locking.
 */
#ifdef CONFIG_SMP
#define uid_hash_free(up)       (!atomic_read(&(up)->count))
#else
#define uid_hash_free(up)       (1)
#endif
void free_uid(struct task_struct *p)
{
        struct user_struct *up = p->user;

        if (up) {
                p->user = NULL;
                if (atomic_dec_and_test(&up->count)) {
                        spin_lock(&uidhash_lock);
                        if (uid_hash_free(up)) {
                                uid_hash_remove(up);
                                kmem_cache_free(uid_cachep, up);
                        }
                        spin_unlock(&uidhash_lock);
                }
        }
}
int alloc_uid(struct task_struct *p)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct user_struct *up;

        spin_lock(&uidhash_lock);
        up = uid_hash_find(p->uid, hashent);
        spin_unlock(&uidhash_lock);

        if (!up) {
                struct user_struct *new;

                new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!new)
                        return -EAGAIN;
                new->uid = p->uid;
                atomic_set(&new->count, 1);

                /*
                 * Before adding this, check whether we raced
                 * on adding the same user already..
                 */
                spin_lock(&uidhash_lock);
                up = uid_hash_find(p->uid, hashent);
                if (up) {
                        kmem_cache_free(uid_cachep, new);
                } else {
                        uid_hash_insert(new, hashent);
                        up = new;
                }
                spin_unlock(&uidhash_lock);
        }
        p->user = up;
        return 0;
}
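/*
 * Note: the per-user counter maintained by alloc_uid()/free_uid() is the
 * value do_fork() below compares against RLIMIT_NPROC before letting a
 * user create another process.
 */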
void __init fork_init(unsigned long mempages)
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for(i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;

        /*
         * The default maximum number of threads is set to a safe
         * value: the thread structures can take up at most half
         * of memory.
         */
        max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;

        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
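/*
 * Worked example (assuming THREAD_SIZE is two 4 KB pages, as on i386):
 * a machine with 128 MB of RAM has mempages = 32768, so max_threads
 * becomes 32768 / 2 / 2 = 8192, and the RLIMIT_NPROC defaults set above
 * allow each user at most 4096 processes.
 */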
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;         /* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if(p->pid == last_pid ||
                           p->pgrp == last_pid ||
                           p->session == last_pid) {
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}
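/*
 * Note: the 0xffff8000 test keeps pids below 0x8000 and wraps the counter
 * back to 300 so that the low, well-known pids are skipped.  next_safe
 * caches the smallest pid/pgrp/session currently in use that lies above
 * last_pid; until last_pid catches up with it, get_pid() can hand out new
 * pids without rescanning the task list.
 */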
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        /* Kill me slowly. UGLY! FIXME! */
        memcpy(&mm->start_code, &current->mm->start_code, 15*sizeof(unsigned long));

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                if(mpnt->vm_flags & VM_DONTCOPY)
                        continue;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_dentry->d_inode;
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);

                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&inode->i_mapping->i_shared_lock);
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                        spin_unlock(&inode->i_mapping->i_shared_lock);
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                tmp->vm_next = *pprev;
                *pprev = tmp;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;
        if (mm->map_count >= AVL_MIN_MAP_COUNT)
                build_mmap_avl(mm);

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}
/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                atomic_set(&mm->mm_users, 1);
                atomic_set(&mm->mm_count, 1);
                init_MUTEX(&mm->mmap_sem);
                mm->page_table_lock = SPIN_LOCK_UNLOCKED;
                mm->pgd = pgd_alloc();
                if (mm->pgd)
                        return mm;
                kmem_cache_free(mm_cachep, mm);
        }
        return NULL;
}
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
        if (mm == &init_mm) BUG();
        pgd_free(mm->pgd);
        destroy_context(mm);
        kmem_cache_free(mm_cachep, mm);
}
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_mmap(mm);
                mmdrop(mm);
        }
}
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * whether that happens on an error path or on success.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
        struct task_struct *tsk = current;

        /* notify parent sleeping on vfork() */
        if (tsk->flags & PF_VFORK) {
                tsk->flags &= ~PF_VFORK;
                up(tsk->p_opptr->vfork_sem);
        }
}
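/*
 * Note: this up() is the other half of the vfork() handshake.  do_fork()
 * below points current->vfork_sem at an on-stack semaphore and, for
 * CLONE_VFORK, sleeps in down(&sem) after starting the child; the parent
 * is released here once the child gives up its mm and mm_release() runs.
 */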
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        mm = current->mm;
        if (!mm)
                return 0;

        if (clone_flags & CLONE_VM) {
                atomic_inc(&mm->mm_users);
                goto good_mm;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->active_mm = mm;

        /*
         * child gets a private LDT (if there was an LDT in the parent)
         */
        copy_segments(tsk, mm);

        down(&current->mm->mmap_sem);
        retval = dup_mmap(mm);
        up(&current->mm->mmap_sem);
        if (retval)
                goto free_pt;

good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        init_new_context(tsk,mm);
        return 0;

free_pt:
        mmput(mm);
fail_nomem:
        return retval;
}
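/*
 * Note: a kernel thread has no mm of its own (current->mm == NULL), so
 * the early "return 0" above leaves tsk->mm NULL as well; such a child
 * runs on a borrowed active_mm rather than an address space of its own.
 */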
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmalloc(sizeof(*old), GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                atomic_set(&fs->count, 1);
                fs->lock = RW_LOCK_UNLOCKED;
                fs->umask = old->umask;
                read_lock(&old->lock);
                fs->rootmnt = mntget(old->rootmnt);
                fs->root = dget(old->root);
                fs->pwdmnt = mntget(old->pwdmnt);
                fs->pwd = dget(old->pwd);
                if (old->altroot) {
                        fs->altrootmnt = mntget(old->altrootmnt);
                        fs->altroot = dget(old->altroot);
                } else {
                        fs->altrootmnt = NULL;
                        fs->altroot = NULL;
                }
                read_unlock(&old->lock);
        }
        return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        return __copy_fs_struct(old);
}
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = __copy_fs_struct(current->fs);
        if (!tsk->fs)
                return -1;
        return 0;
}
static int count_open_files(struct files_struct *files, int size)
{
        int i;

        /* Find the last open fd */
        for (i = size/(8*sizeof(long)); i > 0; ) {
                if (files->open_fds->fds_bits[--i])
                        break;
        }
        i = (i+1) * 8 * sizeof(long);
        return i;
}
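/*
 * Worked example (assuming 32-bit longs): with size == 256 the loop above
 * starts at word index 8 and scans downwards; if fds_bits[2] is the
 * highest non-zero word, it returns (2 + 1) * 32 == 96, i.e. an upper
 * bound on the number of fds to copy, rounded up to a word boundary.
 */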
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        newf->file_lock     = RW_LOCK_UNLOCKED;
        newf->next_fd       = 0;
        newf->max_fds       = NR_OPEN_DEFAULT;
        newf->max_fdset     = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds      = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];

        /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                write_lock(&newf->file_lock);
                error = expand_fdset(newf, size);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        read_lock(&oldf->file_lock);

        open_files = count_open_files(oldf, size);

        /*
         * Check whether we need to allocate a larger fd array.
         * Note: we're not a clone task, so the open count won't
         * change.
         */
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                read_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                write_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
                nfds = newf->max_fds;
                read_lock(&oldf->file_lock);
        }

        old_fds = oldf->fd;
        new_fds = newf->fd;

        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f)
                        get_file(f);
                *new_fds++ = f;
        }
        read_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (newf->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned thus could use an optimized version */
        memset(new_fds, 0, size);

        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));

                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }

        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
        kmem_cache_free(files_cachep, newf);
        goto out;
}
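/*
 * Note: count_open_files() rounds open_files up to a multiple of
 * 8*sizeof(long), so the open_files/8 byte counts used by the bitmap
 * memcpy() calls in copy_files() above always cover whole words.
 */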
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                p->ptrace = 0;
        if (clone_flags & CLONE_VFORK)
                new_flags |= PF_VFORK;
        p->flags = new_flags;
}
/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int retval = -ENOMEM;
        struct task_struct *p;
        DECLARE_MUTEX_LOCKED(sem);

        if (clone_flags & CLONE_PID) {
                /* This is only allowed from the boot up thread */
                if (current->pid)
                        return -EPERM;
        }

        current->vfork_sem = &sem;

        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        lock_kernel();

        retval = -EAGAIN;
        if (p->user) {
                if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
                        goto bad_fork_free;
                atomic_inc(&p->user->count);
        }

        /*
         * Counter increases are protected by
         * the kernel lock so nr_threads can't
         * increase under us (but it may decrease).
         */
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);

        /*
         * This is a "shadow run" state. The process
         * is marked runnable, but isn't actually on
         * any run queue yet.. (that happens at the
         * very end).
         */
        p->state = TASK_RUNNING;
        p->run_list.next = NULL;
        p->run_list.prev = NULL;

        if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
                p->p_opptr = current;
                if (!(p->ptrace & PT_PTRACED))
                        p->p_pptr = current;
        }
        p->p_cptr = NULL;
        init_waitqueue_head(&p->wait_chldexit);
        p->vfork_sem = NULL;
        spin_lock_init(&p->alloc_lock);

        p->sigpending = 0;
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;          /* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
        {
                int i;
                p->has_cpu = 0;
                p->processor = current->processor;
                /* ?? should we just memset this ?? */
                for(i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;
        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(0, clone_flags, usp, p, regs);
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* Our parent execution domain becomes current domain
           These must match for thread signalling to apply */

        p->parent_exec_id = p->self_exec_id;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, thus the
         * total amount of dynamic priorities in the system doesn't change,
         * more scheduling fairness. This is only important in the first
         * timeslice, on the long run the scheduling behaviour is unchanged.
         */
        p->counter = (current->counter + 1) >> 1;
        current->counter >>= 1;
        if (!current->counter)
                current->need_resched = 1;
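        /*
         * Worked example: if the parent had 11 ticks left, the child gets
         * (11 + 1) >> 1 = 6 and the parent keeps 11 >> 1 = 5, so together
         * they hold no more timeslice than the parent started with.
         */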
        /*
         * Ok, add it to the run-queues and make it
         * visible to the rest of the system.
         *
         * Let it rip!
         */
        retval = p->pid;
        write_lock_irq(&tasklist_lock);
        SET_LINKS(p);
        hash_pid(p);
        nr_threads++;
        write_unlock_irq(&tasklist_lock);

        wake_up_process(p);             /* do this last */
        ++total_forks;

bad_fork:
        unlock_kernel();
fork_out:
        if ((clone_flags & CLONE_VFORK) && (retval > 0))
                down(&sem);
        return retval;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        put_exec_domain(p->exec_domain);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
        if (p->user)
                free_uid(p);
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
void __init filescache_init(void)
{
        files_cachep = kmem_cache_create("files_cache",
                         sizeof(struct files_struct),
                         0,
                         SLAB_HWCACHE_ALIGN,
                         NULL, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}