/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also system_call.s).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>
unsigned long int total_forks = 0;	/* Handle normal Linux uptimes. */
/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs. */
kmem_cache_t *files_cachep;
struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
/* UID task count cache, to prevent walking the entire process list on
 * every single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
	struct user_struct *next, **pprev;
	unsigned int uid;
	int task_count;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
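/*
 * Example: for uid 1234 (0x4d2) the hash folds the upper byte into the
 * lower one, 0x04 ^ 0xd2 = 0xd6, and masks the result into the table;
 * with a 32-entry table that is bucket 0x16.  Uids that differ only in
 * their upper bits therefore still spread across the buckets.
 */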
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
	spin_lock(&uidhash_lock);
	if((up->next = uidhash[hashent]) != NULL)
		uidhash[hashent]->pprev = &up->next;
	up->pprev = &uidhash[hashent];
	uidhash[hashent] = up;
	spin_unlock(&uidhash_lock);
}
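/*
 * Removal relies on the pprev back-pointer: it always addresses the
 * word that points at us (either uidhash[hashent] itself or the
 * previous entry's ->next), so an entry can unlink itself without
 * knowing whether it is first in its bucket.
 */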
static inline void uid_hash_remove(struct user_struct *up)
{
	spin_lock(&uidhash_lock);
	if(up->next)
		up->next->pprev = up->pprev;
	*up->pprev = up->next;
	spin_unlock(&uidhash_lock);
}
static inline struct user_struct *uid_find(unsigned short uid, unsigned int hashent)
{
	struct user_struct *up;

	spin_lock(&uidhash_lock);
	for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
		;
	spin_unlock(&uidhash_lock);
	return up;
}
void free_uid(struct task_struct *p)
{
	struct user_struct *up = p->user;

	if (up) {
		p->user = NULL;
		if (!--up->task_count) {
			uid_hash_remove(up);
			kmem_cache_free(uid_cachep, up);
		}
	}
}
int alloc_uid(struct task_struct *p)
{
	unsigned int hashent = uidhashfn(p->uid);
	struct user_struct *up = uid_find(p->uid, hashent);

	p->user = up;
	if (!up) {
		up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
		if (!up)
			return -EAGAIN;
		p->user = up;
		up->uid = p->uid;
		up->task_count = 0;
		uid_hash_insert(up, hashent);
	}
	up->task_count++;
	return 0;
}
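/*
 * Each uid thus owns exactly one user_struct, found by hash lookup or
 * created on first use; its task_count is the per-user process count
 * that do_fork() checks against RLIMIT_NPROC below.
 */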
__initfunc(void uidcache_init(void))
{
	int i;

	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
				       0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!uid_cachep)
		panic("Cannot create uid taskcount SLAB cache\n");

	for(i = 0; i < UIDHASH_SZ; i++)
		uidhash[i] = 0;
}
static inline struct task_struct ** find_empty_process(void)
{
	struct task_struct **tslot = NULL;

	if (!current->uid || (nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT))
		tslot = get_free_taskslot();
	return tslot;
}
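/*
 * Ordinary users stop getting task slots once fewer than
 * MIN_TASKS_LEFT_FOR_ROOT remain, so root can still fork a shell and
 * clean up even when the task array is otherwise full.
 */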
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
static int get_pid(unsigned long flags)
{
	static int next_safe = PID_MAX;
	struct task_struct *p;

	if (flags & CLONE_PID)
		return current->pid;

	spin_lock(&lastpid_lock);
	if((++last_pid) & 0xffff8000) {
		last_pid = 300;		/* Skip daemons etc. */
		goto inside;
	}
	if(last_pid >= next_safe) {
inside:
		next_safe = PID_MAX;
		read_lock(&tasklist_lock);
	repeat:
		for_each_task(p) {
			if(p->pid == last_pid	||
			   p->pgrp == last_pid	||
			   p->session == last_pid) {
				if(++last_pid >= next_safe) {
					if(last_pid & 0xffff8000)
						last_pid = 300;
					next_safe = PID_MAX;
				}
				goto repeat;
			}
			if(p->pid > last_pid && next_safe > p->pid)
				next_safe = p->pid;
			if(p->pgrp > last_pid && next_safe > p->pgrp)
				next_safe = p->pgrp;
			if(p->session > last_pid && next_safe > p->session)
				next_safe = p->session;
		}
		read_unlock(&tasklist_lock);
	}
	spin_unlock(&lastpid_lock);

	return last_pid;
}
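/*
 * PIDs stay within 15 bits: the 0xffff8000 test catches ++last_pid
 * crossing 32767, and the wrap restarts at 300 to stay clear of pids
 * that long-lived daemons typically hold.  next_safe caches the
 * nearest pid/pgrp/session id in use above last_pid, so most calls
 * just increment last_pid and never take tasklist_lock at all.
 */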
static inline int dup_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;

	flush_cache_mm(current->mm);
	pprev = &mm->mmap;
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		retval = -ENOMEM;
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		tmp->vm_next = NULL;
		file = tmp->vm_file;
		if (file) {
			file->f_count++;
			if (tmp->vm_flags & VM_DENYWRITE)
				file->f_dentry->d_inode->i_writecount--;

			/* insert tmp into the share list, just after mpnt */
			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
				mpnt->vm_next_share->vm_pprev_share =
					&tmp->vm_next_share;
			mpnt->vm_next_share = tmp;
			tmp->vm_pprev_share = &mpnt->vm_next_share;
		}

		/* Copy the pages, but defer checking for errors */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		/*
		 * Link in the new vma even if an error occurred,
		 * so that exit_mmap() can clean up the mess.
		 */
		if((tmp->vm_next = *pprev) != NULL)
			(*pprev)->vm_pprev = &tmp->vm_next;
		*pprev = tmp;
		tmp->vm_pprev = pprev;

		pprev = &tmp->vm_next;
		if (retval)
			goto fail_nomem;
	}
	retval = 0;

fail_nomem:
	flush_tlb_mm(current->mm);
	return retval;
}
/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
	if (mm) {
		*mm = *current->mm;
		init_new_context(mm);
		atomic_set(&mm->count, 1);
		mm->mmap_sem = MUTEX_LOCKED;
		/*
		 * Leave mm->pgd set to the parent's pgd
		 * so that pgd_offset() is always valid.
		 */
		mm->mmap = mm->mmap_cache = NULL;

		/* It has not run yet, so cannot be present in anyone's
		 * cache or tlb.
		 */
		mm->cpu_vm_mask = 0;
	}
	return mm;
}
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->count)) {
		release_segments(mm);
		exit_mmap(mm);
		free_page_tables(mm);
		kmem_cache_free(mm_cachep, mm);
	}
}
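/*
 * mm_alloc() hands back an mm with a use count of one and mmap_sem
 * still held; a CLONE_VM child simply bumps the parent's count in
 * copy_mm() below, and every exiting user drops one reference through
 * mmput().  Only the final mmput() pays for the page-table teardown.
 */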
static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm;
	int retval;

	if (clone_flags & CLONE_VM) {
		mmget(current->mm);
		/*
		 * Set up the LDT descriptor for the clone task.
		 */
		copy_segments(nr, tsk, NULL);
		SET_PAGE_DIR(tsk, current->mm->pgd);
		return 0;
	}

	retval = -ENOMEM;
	mm = mm_alloc();
	if (!mm)
		goto fail_nomem;

	tsk->mm = mm;
	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;
	copy_segments(nr, tsk, mm);
	retval = new_page_tables(tsk);
	if (retval)
		goto free_mm;
	retval = dup_mmap(mm);
	if (retval)
		goto free_pt;
	up(&mm->mmap_sem);
	return 0;

free_mm:
	tsk->mm = NULL;
	kmem_cache_free(mm_cachep, mm);
	goto fail_nomem;
free_pt:
	tsk->mm = NULL;
	mmput(mm);
fail_nomem:
	return retval;
}
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
	if (!tsk->fs)
		return -1;
	atomic_set(&tsk->fs->count, 1);
	tsk->fs->umask = current->fs->umask;
	tsk->fs->root = dget(current->fs->root);
	tsk->fs->pwd = dget(current->fs->pwd);
	return 0;
}
/* return value is only accurate by +/- sizeof(long)*8 fds */
/* XXX make this architecture specific */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
	int i;
	unsigned long *p = src;
	unsigned long *max = src;

	for (i = __FDSET_LONGS; i; --i) {
		if ((*d++ = *p++) != 0)
			max = p;
	}
	return (max - src)*sizeof(long)*8;
}
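/*
 * Example, assuming 32-bit longs: if the highest open fd is 3, only
 * the first word of the set is nonzero, so the copy returns 1*32 = 32.
 * The count is only accurate to a multiple of sizeof(long)*8, as noted
 * above, but copy_files() only needs an upper bound on how many fd
 * slots are worth walking.
 */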
static inline int copy_fdset(fd_set *dst, fd_set *src)
{
	return __copy_fdset(dst->fds_bits, src->fds_bits);
}
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int size, i, error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	/*
	 * Allocate the fd array, using get_free_page() if possible.
	 * Eventually we want to make the array size variable ...
	 */
	size = NR_OPEN * sizeof(struct file *);
	if (size == PAGE_SIZE)
		new_fds = (struct file **) __get_free_page(GFP_KERNEL);
	else
		new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
	if (!new_fds)
		goto out_release;
	memset((void *) new_fds, 0, size);

	atomic_set(&newf->count, 1);
	newf->max_fds = NR_OPEN;
	newf->fd = new_fds;
	newf->close_on_exec = oldf->close_on_exec;
	i = copy_fdset(&newf->open_fds, &oldf->open_fds);

	old_fds = oldf->fd;
	for (; i != 0; i--) {
		struct file * f = *old_fds;
		old_fds++;
		*new_fds = f;
		if (f)
			f->f_count++;
		new_fds++;
	}
	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	kmem_cache_free(files_cachep, newf);
	goto out;
}
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sig->count);
		return 0;
	}
	tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
	if (!tsk->sig)
		return -1;
	spin_lock_init(&tsk->sig->siglock);
	atomic_set(&tsk->sig->count, 1);
	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
	return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~PF_SUPERPRIV;
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		new_flags &= ~(PF_PTRACED|PF_TRACESYS);
	p->flags = new_flags;
}
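/*
 * The child starts with a clean slate: PF_SUPERPRIV (the task has used
 * superuser privileges) is dropped, PF_FORKNOEXEC marks it as forked
 * but not yet exec'd, and the ptrace flags are inherited only when the
 * parent explicitly asked for that with CLONE_PTRACE.
 */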
/*
 * Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
	int nr;
	int retval = -ENOMEM;
	struct task_struct *p;

	p = alloc_task_struct();
	if (!p)
		goto fork_out;

	*p = *current;

	down(&current->mm->mmap_sem);
	lock_kernel();

	retval = -EAGAIN;
	if (p->user->task_count >= p->rlim[RLIMIT_NPROC].rlim_cur)
		goto bad_fork_free;

	{
		struct task_struct **tslot;
		tslot = find_empty_process();
		if (!tslot)
			goto bad_fork_free;
		p->tarray_ptr = tslot;
		*tslot = p;
		nr = tslot - &task[0];
	}
	if (p->exec_domain && p->exec_domain->module)
		__MOD_INC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_INC_USE_COUNT(p->binfmt->module);

	p->did_exec = 0;
	p->swappable = 0;
	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	p->pid = get_pid(clone_flags);

	/*
	 * This is a "shadow run" state. The process
	 * is marked runnable, but isn't actually on
	 * any run queue yet.. (that happens at the
	 * wake_up_process() below)
	 */
	p->state = TASK_RUNNING;
	p->next_run = p;
	p->prev_run = p;
	p->p_pptr = p->p_opptr = current;
	p->p_cptr = NULL;
	init_waitqueue(&p->wait_chldexit);

	p->sigpending = 0;
	sigemptyset(&p->signal);
	p->sigqueue = NULL;
	p->sigqueue_tail = &p->sigqueue;

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->tty_old_pgrp = 0;
	p->times.tms_utime = p->times.tms_stime = 0;
	p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
	{
		int i;
		p->has_cpu = 0;
		p->processor = NO_PROC_ID;
		/* ?? should we just memset this ?? */
		for(i = 0; i < smp_num_cpus; i++)
			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
		spin_lock_init(&p->sigmask_lock);
	}
#endif
	p->lock_depth = -1;		/* -1 = no lock */
	p->start_time = jiffies;

	{
		unsigned long flags;

		/* This makes it visible to the rest of the system */
		write_lock_irqsave(&tasklist_lock, flags);
		SET_LINKS(p);
		hash_pid(p);
		write_unlock_irqrestore(&tasklist_lock, flags);
	}
	nr_tasks++;
	p->user->task_count++;

	retval = -ENOMEM;
	/* copy all the process information */
	if (copy_files(clone_flags, p))
		goto bad_fork_cleanup;
	if (copy_fs(clone_flags, p))
		goto bad_fork_cleanup_files;
	if (copy_sighand(clone_flags, p))
		goto bad_fork_cleanup_fs;
	if (copy_mm(nr, clone_flags, p))
		goto bad_fork_cleanup_sighand;
	retval = copy_thread(nr, clone_flags, usp, p, regs);
	if (retval)
		goto bad_fork_cleanup_sighand;
	p->semundo = NULL;

	/* ok, now we should be set up.. */
	p->swappable = 1;
	p->exit_signal = clone_flags & CSIGNAL;
	p->pdeath_signal = 0;

	/*
	 * "share" dynamic priority between parent and child, so that the
	 * total amount of dynamic priorities in the system doesn't change,
	 * giving more scheduling fairness. This is only important in the
	 * first timeslice; in the long run the scheduling behaviour is
	 * unchanged.
	 */
	current->counter >>= 1;
	p->counter = current->counter;
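	/*
	 * Example: a parent holding 60 ticks keeps 30 and the child
	 * starts with 30, so fork()ing in a loop earns no extra CPU.
	 */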
	/* Ok, add it to the run-queues, let it rip! */
	retval = p->pid;
	wake_up_process(p);		/* do this last */
	++total_forks;

bad_fork:
	up(&current->mm->mmap_sem);
	unlock_kernel();
fork_out:
	return retval;

bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup:
	if (p->exec_domain && p->exec_domain->module)
		__MOD_DEC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_DEC_USE_COUNT(p->binfmt->module);

	{
		unsigned long flags;

		write_lock_irqsave(&tasklist_lock, flags);
		unhash_pid(p);
		REMOVE_LINKS(p);
		write_unlock_irqrestore(&tasklist_lock, flags);
	}

	nr_tasks--;
	p->user->task_count--;
	add_free_taskslot(p->tarray_ptr);
bad_fork_free:
	free_task_struct(p);
	goto bad_fork;
}
static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
{
	struct files_struct *f = fp;

	memset(f, 0, sizeof(*f));
}
__initfunc(void filescache_init(void))
{
	files_cachep = kmem_cache_create("files_cache",
					 sizeof(struct files_struct),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 files_ctor, NULL);
	if (!files_cachep)
		panic("Cannot create files cache");
}