/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>
/* The idle tasks do not count.. */
int nr_tasks = 1;
int nr_running = 1;

unsigned long int total_forks = 0;	/* Handle normal Linux uptimes. */
int last_pid = 0;
/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;
/* SLAB cache for files structs */
kmem_cache_t *files_cachep;
struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
	atomic_t count;			/* How many processes does this user have? */
	struct user_struct *next, **pprev;
	unsigned int uid;
} *uidhash[UIDHASH_SZ];
spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;
#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
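/*
 * Worked example (assuming NR_TASKS is 512, so PIDHASH_SZ is 128 and
 * UIDHASH_SZ is 32): uid 1000 hashes to ((1000 >> 8) ^ 1000) & 31 =
 * (3 ^ 1000) & 31 = 1003 & 31 = bucket 11.  Folding the high byte in
 * keeps consecutive and "round" uids from piling into one bucket.
 */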
/*
 * These routines must be called with the uidhash spinlock held!
 */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
	if((up->next = uidhash[hashent]) != NULL)
		uidhash[hashent]->pprev = &up->next;
	up->pprev = &uidhash[hashent];
	uidhash[hashent] = up;
}
static inline void uid_hash_remove(struct user_struct *up)
{
	if(up->next)
		up->next->pprev = up->pprev;
	*up->pprev = up->next;
}
static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
	struct user_struct *up, *next;

	next = uidhash[hashent];
	for (;;) {
		up = next;
		if (next) {
			next = up->next;
			if (up->uid != uid)
				continue;
			atomic_inc(&up->count);
		}
		break;
	}
	return up;
}
/*
 * For SMP, we need to re-test the user struct counter
 * after having acquired the spinlock. This allows us to do
 * the common case (not freeing anything) without having
 * any locking overhead.
 */
#ifdef __SMP__
#define uid_hash_free(up)	(!atomic_read(&(up)->count))
#else
#define uid_hash_free(up)	(1)
#endif
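/*
 * On a uniprocessor kernel nothing can slip in between the
 * atomic_dec_and_test() in free_uid() and the check made under the
 * spinlock, so the count is known to still be zero and uid_hash_free()
 * can simply answer "yes".
 */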
void free_uid(struct task_struct *p)
{
	struct user_struct *up = p->user;

	if (up) {
		p->user = NULL;
		if (atomic_dec_and_test(&up->count)) {
			spin_lock(&uidhash_lock);
			if (uid_hash_free(up)) {
				uid_hash_remove(up);
				kmem_cache_free(uid_cachep, up);
			}
			spin_unlock(&uidhash_lock);
		}
	}
}
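/*
 * alloc_uid() below uses the usual optimistic pattern: look the uid up
 * under the lock, allocate outside the lock if it was missing, then
 * take the lock again and re-check before inserting, in case another
 * fork() added the same user in the meantime.
 */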
int alloc_uid(struct task_struct *p)
{
	unsigned int hashent = uidhashfn(p->uid);
	struct user_struct *up;

	spin_lock(&uidhash_lock);
	up = uid_hash_find(p->uid, hashent);
	spin_unlock(&uidhash_lock);

	if (!up) {
		struct user_struct *new;

		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
		if (!new)
			return -EAGAIN;
		new->uid = p->uid;
		atomic_set(&new->count, 1);

		/*
		 * Before adding this, check whether we raced
		 * on adding the same user already..
		 */
		spin_lock(&uidhash_lock);
		up = uid_hash_find(p->uid, hashent);
		if (up) {
			kmem_cache_free(uid_cachep, new);
		} else {
			uid_hash_insert(new, hashent);
			up = new;
		}
		spin_unlock(&uidhash_lock);
	}
	p->user = up;
	return 0;
}
void __init uidcache_init(void)
{
	int i;

	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
				       0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!uid_cachep)
		panic("Cannot create uid taskcount SLAB cache\n");

	for (i = 0; i < UIDHASH_SZ; i++)
		uidhash[i] = 0;
}
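/*
 * The trailing arguments to kmem_cache_create() are the offset (0), the
 * SLAB flags, and the constructor/destructor callbacks (none here); only
 * SLAB_HWCACHE_ALIGN is requested, so user_structs do not share
 * hardware cache lines.
 */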
static inline struct task_struct ** find_empty_process(void)
{
	struct task_struct **tslot = NULL;

	if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) || !current->uid)
		tslot = get_free_taskslot();
	return tslot;
}
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
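/*
 * get_pid() hands out pids from the 15-bit range, normally just by
 * bumping last_pid.  next_safe caches the nearest pid/pgrp/session
 * value above last_pid that is already in use, so the expensive walk
 * over the whole task list is only needed when last_pid catches up
 * with it or wraps back to 300.
 */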
static int get_pid(unsigned long flags)
{
	static int next_safe = PID_MAX;
	struct task_struct *p;

	if (flags & CLONE_PID)
		return current->pid;

	spin_lock(&lastpid_lock);
	if((++last_pid) & 0xffff8000) {
		last_pid = 300;		/* Skip daemons etc. */
		goto inside;
	}
	if(last_pid >= next_safe) {
inside:
		next_safe = PID_MAX;
		read_lock(&tasklist_lock);
	repeat:
		for_each_task(p) {
			if(p->pid == last_pid	||
			   p->pgrp == last_pid	||
			   p->session == last_pid) {
				if(++last_pid >= next_safe) {
					if(last_pid & 0xffff8000)
						last_pid = 300;
					next_safe = PID_MAX;
				}
				goto repeat;
			}
			if(p->pid > last_pid && next_safe > p->pid)
				next_safe = p->pid;
			if(p->pgrp > last_pid && next_safe > p->pgrp)
				next_safe = p->pgrp;
			if(p->session > last_pid && next_safe > p->session)
				next_safe = p->session;
		}
		read_unlock(&tasklist_lock);
	}
	spin_unlock(&lastpid_lock);

	return last_pid;
}
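/*
 * dup_mmap() walks the parent's vma list, duplicating each
 * vm_area_struct, taking extra references on file-backed mappings, and
 * copying the page table entries via copy_page_range().  The child's
 * vmas are linked in even when a copy fails, so that exit_mmap() can
 * clean up the partially built list.
 */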
static inline int dup_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;

	flush_cache_mm(current->mm);
	pprev = &mm->mmap;
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		retval = -ENOMEM;
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		mm->map_count++;
		tmp->vm_next = NULL;
		file = tmp->vm_file;
		if (file) {
			file->f_count++;
			if (tmp->vm_flags & VM_DENYWRITE)
				file->f_dentry->d_inode->i_writecount--;

			/* insert tmp into the share list, just after mpnt */
			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
				mpnt->vm_next_share->vm_pprev_share =
					&tmp->vm_next_share;
			mpnt->vm_next_share = tmp;
			tmp->vm_pprev_share = &mpnt->vm_next_share;
		}

		/* Copy the pages, but defer checking for errors */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		/*
		 * Link in the new vma even if an error occurred,
		 * so that exit_mmap() can clean up the mess.
		 */
		tmp->vm_next = *pprev;
		*pprev = tmp;
		pprev = &tmp->vm_next;

		if (retval)
			goto fail_nomem;
	}
	retval = 0;
	if (mm->map_count >= AVL_MIN_MAP_COUNT)
		build_mmap_avl(mm);

fail_nomem:
	flush_tlb_mm(current->mm);
	return retval;
}
/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
	if (mm) {
		*mm = *current->mm;
		init_new_context(mm);
		atomic_set(&mm->count, 1);
		mm->map_count = 0;
		mm->def_flags = 0;
		init_MUTEX_LOCKED(&mm->mmap_sem);
		/*
		 * Leave mm->pgd set to the parent's pgd
		 * so that pgd_offset() is always valid.
		 */
		mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;

		/* It has not run yet, so cannot be present in anyone's
		 * cache or tlb.
		 */
		mm->cpu_vm_mask = 0;
	}
	return mm;
}
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * on error or on success alike.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
	struct task_struct *tsk = current;

	/* notify parent sleeping on vfork() */
	if (tsk->flags & PF_VFORK) {
		tsk->flags &= ~PF_VFORK;
		up(tsk->p_opptr->vfork_sem);
	}
}
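/*
 * The vfork_sem released above is the on-stack semaphore that do_fork()
 * declares with DECLARE_MUTEX_LOCKED() and then sleeps on for
 * CLONE_VFORK children, so a vfork()ing parent resumes only after the
 * child has passed through mm_release().
 */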
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->count)) {
		release_segments(mm);
		exit_mmap(mm);
		free_page_tables(mm);
		kmem_cache_free(mm_cachep, mm);
	}
}
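/*
 * copy_mm() below either shares the parent's mm (CLONE_VM: just bump
 * mm->count and point the child's page directory at the same pgd) or
 * builds a private copy with mm_alloc(), new_page_tables() and
 * dup_mmap().
 */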
static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm;
	int retval;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&current->mm->count);
		/*
		 * Set up the LDT descriptor for the clone task.
		 */
		copy_segments(nr, tsk, NULL);
		SET_PAGE_DIR(tsk, current->mm->pgd);
		return 0;
	}

	retval = -ENOMEM;
	mm = mm_alloc();
	if (!mm)
		goto fail_nomem;

	tsk->mm = mm;
	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;
	copy_segments(nr, tsk, mm);
	retval = new_page_tables(tsk);
	if (retval)
		goto free_mm;
	retval = dup_mmap(mm);
	if (retval)
		goto free_mm;
	up(&mm->mmap_sem);
	return 0;

free_mm:
	tsk->mm = NULL;
	release_segments(mm);
	kmem_cache_free(mm_cachep, mm);
fail_nomem:
	return retval;
}
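/*
 * copy_fs() and the helpers that follow all use the same convention:
 * if the corresponding CLONE_* flag is set the parent's structure is
 * shared (its reference count is incremented), otherwise a private
 * copy is allocated and initialised from the parent's.
 */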
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
	if (!tsk->fs)
		return -1;
	atomic_set(&tsk->fs->count, 1);
	tsk->fs->umask = current->fs->umask;
	tsk->fs->root = dget(current->fs->root);
	tsk->fs->pwd = dget(current->fs->pwd);
	return 0;
}
/*
 * Copy a fd_set and compute the maximum fd it contains.
 */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
	int i;
	unsigned long *p = src;
	unsigned long *max = src;

	for (i = __FDSET_LONGS; i; --i) {
		if ((*d++ = *p++) != 0)
			max = p;
	}
	return (max - src)*sizeof(long)*8;
}
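/*
 * The return value is in bits, rounded up to a whole word: e.g. if the
 * highest open fd is 2, only the first long is non-zero and the result
 * is 1*sizeof(long)*8 (32 on a 32-bit box), not 3.  copy_files() uses
 * it as the number of fd slots worth copying.
 */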
static inline int copy_fdset(fd_set *dst, fd_set *src)
{
	return __copy_fdset(dst->fds_bits, src->fds_bits);
}
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int size, i, error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	/*
	 * Allocate the fd array, using get_free_page() if possible.
	 * Eventually we want to make the array size variable ...
	 */
	size = NR_OPEN * sizeof(struct file *);
	if (size == PAGE_SIZE)
		new_fds = (struct file **) __get_free_page(GFP_KERNEL);
	else
		new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
	if (!new_fds)
		goto out_release;

	atomic_set(&newf->count, 1);
	newf->max_fds = NR_OPEN;
	newf->fd = new_fds;
	newf->close_on_exec = oldf->close_on_exec;
	i = copy_fdset(&newf->open_fds, &oldf->open_fds);

	old_fds = oldf->fd;
	for (; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f)
			f->f_count++;
		*new_fds++ = f;
	}
	/* This is long word aligned, thus could use an optimized version */
	memset(new_fds, 0, (char *)newf->fd + size - (char *)new_fds);

	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	kmem_cache_free(files_cachep, newf);
	goto out;
}
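/*
 * Note the fast path in the fd array allocation above: with the common
 * configuration of NR_OPEN == 1024 and 4-byte pointers the array is
 * exactly PAGE_SIZE (4096 bytes) on 32-bit, so it comes straight from
 * __get_free_page() instead of kmalloc().
 */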
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sig->count);
		return 0;
	}
	tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
	if (!tsk->sig)
		return -1;
	spin_lock_init(&tsk->sig->siglock);
	atomic_set(&tsk->sig->count, 1);
	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
	return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		new_flags &= ~(PF_PTRACED|PF_TRACESYS);
	if (clone_flags & CLONE_VFORK)
		new_flags |= PF_VFORK;
	p->flags = new_flags;
}
/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
	int nr;
	int retval = -ENOMEM;
	struct task_struct *p;
	DECLARE_MUTEX_LOCKED(sem);

	current->vfork_sem = &sem;

	p = alloc_task_struct();
	if (!p)
		goto fork_out;

	*p = *current;

	down(&current->mm->mmap_sem);
	lock_kernel();

	retval = -EAGAIN;
	if (p->user) {
		if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
			goto bad_fork_free;
	}

	{
		struct task_struct **tslot;
		tslot = find_empty_process();
		if (!tslot)
			goto bad_fork_free;
		p->tarray_ptr = tslot;
		*tslot = p;
		nr = tslot - &task[0];
	}
	if (p->exec_domain && p->exec_domain->module)
		__MOD_INC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_INC_USE_COUNT(p->binfmt->module);

	p->did_exec = 0;
	p->swappable = 0;
	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	p->pid = get_pid(clone_flags);

	/*
	 * This is a "shadow run" state. The process
	 * is marked runnable, but isn't actually on
	 * any run queue yet.. (that happens at the
	 * very end).
	 */
	p->state = TASK_RUNNING;
	p->next_run = p;
	p->prev_run = p;

	p->p_pptr = p->p_opptr = current;
	p->p_cptr = NULL;
	init_waitqueue_head(&p->wait_chldexit);
	p->vfork_sem = NULL;

	p->sigpending = 0;
	sigemptyset(&p->signal);
	p->sigqueue = NULL;
	p->sigqueue_tail = &p->sigqueue;

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->times.tms_utime = p->times.tms_stime = 0;
	p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
	{
		int i;
		p->has_cpu = 0;
		p->processor = NO_PROC_ID;
		/* ?? should we just memset this ?? */
		for (i = 0; i < smp_num_cpus; i++)
			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
		spin_lock_init(&p->sigmask_lock);
	}
#endif
	p->lock_depth = -1;		/* -1 = no lock */
	p->start_time = jiffies;
	retval = -ENOMEM;
	/* copy all the process information */
	if (copy_files(clone_flags, p))
		goto bad_fork_cleanup;
	if (copy_fs(clone_flags, p))
		goto bad_fork_cleanup_files;
	if (copy_sighand(clone_flags, p))
		goto bad_fork_cleanup_fs;
	if (copy_mm(nr, clone_flags, p))
		goto bad_fork_cleanup_sighand;
	retval = copy_thread(nr, clone_flags, usp, p, regs);
	if (retval)
		goto bad_fork_cleanup_sighand;
	p->semundo = NULL;
	/* ok, now we should be set up.. */
	p->swappable = 1;
	p->exit_signal = clone_flags & CSIGNAL;
	p->pdeath_signal = 0;

	/*
	 * "share" dynamic priority between parent and child, thus the
	 * total amount of dynamic priorities in the system doesn't change,
	 * giving more scheduling fairness. This is only important in the
	 * first timeslice; in the long run the scheduling behaviour is
	 * unchanged.
	 */
	current->counter >>= 1;
	p->counter = current->counter;
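	/*
	 * Example: a parent with 6 ticks left keeps 3 and the child
	 * starts with 3, so the pair cannot use fork() to accumulate
	 * more dynamic priority than the parent had on its own.
	 */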
	/*
	 * Ok, add it to the run-queues and make it
	 * visible to the rest of the system.
	 *
	 * Let it rip!
	 */
	retval = p->pid;
	write_lock_irq(&tasklist_lock);
	SET_LINKS(p);
	hash_pid(p);
	write_unlock_irq(&tasklist_lock);

	nr_tasks++;
	if (p->user)
		atomic_inc(&p->user->count);

	p->next_run = NULL;
	p->prev_run = NULL;
	wake_up_process(p);		/* do this last */
	++total_forks;
bad_fork:
	unlock_kernel();
	up(&current->mm->mmap_sem);
fork_out:
	if ((clone_flags & CLONE_VFORK) && (retval > 0))
		down(&sem);
	return retval;
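	/*
	 * The labels below undo the copy_* steps in reverse order: each
	 * goto above jumps to the label matching the last step that
	 * succeeded, and the chain falls through to bad_fork_free, which
	 * returns the task slot and frees the task_struct before taking
	 * the common bad_fork exit.
	 */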
bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup:
	if (p->exec_domain && p->exec_domain->module)
		__MOD_DEC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_DEC_USE_COUNT(p->binfmt->module);

	add_free_taskslot(p->tarray_ptr);
bad_fork_free:
	free_task_struct(p);
	goto bad_fork;
}
void __init filescache_init(void)
{
	files_cachep = kmem_cache_create("files_cache",
					 sizeof(struct files_struct),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL, NULL);
	if (!files_cachep)
		panic("Cannot create files cache");
}