/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also system_call.s).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>
/* The idle threads do not count.. */
int nr_threads;

int max_threads;
unsigned long total_forks = 0;	/* Handle normal Linux uptimes. */
int last_pid;
/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];
/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
	atomic_t count;			/* How many processes got a reference */
	struct user_struct *next, **pprev;
	unsigned int uid;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
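/*
 * The hash folds the second byte of the uid back into the low bits so
 * that uids clustered into ranges still spread over the table.  For
 * illustration, assuming PIDHASH_SZ is 1024 (its usual value in this
 * era), UIDHASH_SZ is 256 and:
 *
 *	uidhashfn(1000) == ((1000 >> 8) ^ 1000) & 255
 *	                == (3 ^ 1000) & 255
 *	                == 1003 & 255 == 235
 */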
/*
 * These routines must be called with the uidhash spinlock held!
 */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
	if((up->next = uidhash[hashent]) != NULL)
		uidhash[hashent]->pprev = &up->next;
	up->pprev = &uidhash[hashent];
	uidhash[hashent] = up;
}
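/*
 * Each hash chain is a singly linked list with a back pointer: 'next'
 * points at the following entry, while 'pprev' points back at whatever
 * pointer refers to this entry (either the hash table slot or the
 * previous entry's 'next').  That is what lets uid_hash_remove() below
 * unlink an entry in O(1) without rescanning the chain for its
 * predecessor.
 */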
static inline void uid_hash_remove(struct user_struct *up)
{
	if(up->next)
		up->next->pprev = up->pprev;
	*up->pprev = up->next;
}
static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
	struct user_struct *up, *next;

	next = uidhash[hashent];
	for (;;) {
		up = next;
		if (next) {
			next = up->next;
			if (up->uid != uid)
				continue;
			atomic_inc(&up->count);
		}
		break;
	}
	return up;
}

/*
 * For SMP, we need to re-test the user struct counter
 * after having acquired the spinlock. This allows us to do
 * the common case (not freeing anything) without having
 * to take the spinlock at all.
 */
#ifdef __SMP__
  #define uid_hash_free(up)	(!atomic_read(&(up)->count))
#else
  #define uid_hash_free(up)	(1)
#endif
void free_uid(struct task_struct *p)
{
	struct user_struct *up = p->user;

	if (up) {
		p->user = NULL;
		if (atomic_dec_and_test(&up->count)) {
			spin_lock(&uidhash_lock);
			if (uid_hash_free(up)) {
				uid_hash_remove(up);
				kmem_cache_free(uid_cachep, up);
			}
			spin_unlock(&uidhash_lock);
		}
	}
}
int alloc_uid(struct task_struct *p)
{
	unsigned int hashent = uidhashfn(p->uid);
	struct user_struct *up;

	spin_lock(&uidhash_lock);
	up = uid_hash_find(p->uid, hashent);
	spin_unlock(&uidhash_lock);

	if (!up) {
		struct user_struct *new;

		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
		if (!new)
			return -EAGAIN;
		new->uid = p->uid;
		atomic_set(&new->count, 1);

		/*
		 * Before adding this, check whether we raced
		 * on adding the same user already..
		 */
		spin_lock(&uidhash_lock);
		up = uid_hash_find(p->uid, hashent);
		if (up) {
			kmem_cache_free(uid_cachep, new);
		} else {
			uid_hash_insert(new, hashent);
			up = new;
		}
		spin_unlock(&uidhash_lock);
	}
	p->user = up;
	return 0;
}
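/*
 * The lock is dropped around the allocation because kmem_cache_alloc()
 * with SLAB_KERNEL may sleep, and uidhash_lock is a spinlock.  Hence
 * the second uid_hash_find(): another task may have inserted the same
 * uid while we slept, in which case the freshly allocated entry is
 * simply thrown away again.
 */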
void __init fork_init(unsigned long memsize)
{
	int i;

	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
				       0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!uid_cachep)
		panic("Cannot create uid taskcount SLAB cache\n");

	for(i = 0; i < UIDHASH_SZ; i++)
		uidhash[i] = NULL;

	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most half
	 * of memory.
	 */
	max_threads = memsize / THREAD_SIZE / 2;

	init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
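/*
 * Rough example of the sizing above, assuming memsize is the memory
 * size in bytes and THREAD_SIZE is 8 KB (two pages on i386): with
 * 64 MB of RAM, max_threads = 64M / 8K / 2 = 4096, so task structures
 * and kernel stacks can never eat more than half of memory, and a
 * single user is limited to half of that again (2048 processes) via
 * RLIMIT_NPROC.
 */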
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
static int get_pid(unsigned long flags)
{
	static int next_safe = PID_MAX;
	struct task_struct *p;

	if (flags & CLONE_PID)
		return current->pid;

	spin_lock(&lastpid_lock);
	if((++last_pid) & 0xffff8000) {
		last_pid = 300;		/* Skip daemons etc. */
		goto inside;
	}
	if(last_pid >= next_safe) {
inside:
		next_safe = PID_MAX;
		read_lock(&tasklist_lock);
	repeat:
		for_each_task(p) {
			if(p->pid == last_pid ||
			   p->pgrp == last_pid ||
			   p->session == last_pid) {
				if(++last_pid >= next_safe) {
					if(last_pid & 0xffff8000)
						last_pid = 300;
					next_safe = PID_MAX;
				}
				goto repeat;
			}
			if(p->pid > last_pid && next_safe > p->pid)
				next_safe = p->pid;
			if(p->pgrp > last_pid && next_safe > p->pgrp)
				next_safe = p->pgrp;
			if(p->session > last_pid && next_safe > p->session)
				next_safe = p->session;
		}
		read_unlock(&tasklist_lock);
	}
	spin_unlock(&lastpid_lock);

	return last_pid;
}
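/*
 * next_safe is what makes this cheap in the common case: the tasklist
 * walk records the smallest pid/pgrp/session above last_pid that is
 * still in use, so every later get_pid() that stays below next_safe can
 * just hand out ++last_pid without scanning the task list at all.  The
 * 0xffff8000 test wraps last_pid back to 300 (skipping the low pids
 * used by boot-time daemons) once it crosses the 15-bit PID_MAX.
 */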
static inline int dup_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;

	/* Kill me slowly. UGLY! FIXME! */
	memcpy(&mm->start_code, &current->mm->start_code, 15*sizeof(unsigned long));
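	/*
	 * The memcpy above bulk-copies the block of scalar fields that
	 * starts at mm->start_code (roughly the code/data/brk/stack/arg/env
	 * boundaries and the vm counters); the "15" relies on those fields
	 * being laid out contiguously in mm_struct, which is why the
	 * comment above begs for a cleaner replacement.
	 */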
	flush_cache_mm(current->mm);
	pprev = &mm->mmap;
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		retval = -ENOMEM;
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		mm->map_count++;
		file = tmp->vm_file;
		if (file) {
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&file->f_dentry->d_inode->i_writecount);

			/* insert tmp into the share list, just after mpnt */
			spin_lock(&file->f_dentry->d_inode->i_shared_lock);
			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
				mpnt->vm_next_share->vm_pprev_share =
							&tmp->vm_next_share;
			mpnt->vm_next_share = tmp;
			tmp->vm_pprev_share = &mpnt->vm_next_share;
			spin_unlock(&file->f_dentry->d_inode->i_shared_lock);
		}

		/* Copy the pages, but defer checking for errors */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		/*
		 * Link in the new vma even if an error occurred,
		 * so that exit_mmap() can clean up the mess.
		 */
		tmp->vm_next = *pprev;
		*pprev = tmp;
		pprev = &tmp->vm_next;

		if (retval)
			goto fail_nomem;
	}
	retval = 0;
	if (mm->map_count >= AVL_MIN_MAP_COUNT)
		build_mmap_avl(mm);

fail_nomem:
	flush_tlb_mm(current->mm);
	return retval;
}
/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
	if (mm) {
		memset(mm, 0, sizeof(*mm));
		atomic_set(&mm->mm_users, 1);
		atomic_set(&mm->mm_count, 1);
		init_MUTEX(&mm->mmap_sem);
		mm->page_table_lock = SPIN_LOCK_UNLOCKED;
		mm->pgd = pgd_alloc();
		if (mm->pgd)
			return mm;
		kmem_cache_free(mm_cachep, mm);
		mm = NULL;
	}
	return mm;
}
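/*
 * Note the two reference counts set up above: mm_users counts tasks
 * that actually use this address space (it is what mmput() drops),
 * while mm_count also covers "anonymous" references such as lazy-TLB
 * kernel threads that merely borrow the mm; the structure itself is
 * only freed by __mmdrop() once mm_count reaches zero.
 */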
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
	if (mm == &init_mm) BUG();
	pgd_free(mm->pgd);
	kmem_cache_free(mm_cachep, mm);
}
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		exit_mmap(mm);
		mmdrop(mm);
	}
}
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error or success, whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one. Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
	struct task_struct *tsk = current;

	/* notify parent sleeping on vfork() */
	if (tsk->flags & PF_VFORK) {
		tsk->flags &= ~PF_VFORK;
		up(tsk->p_opptr->vfork_sem);
	}
}
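/*
 * This is the child's half of the vfork() handshake: do_fork() leaves
 * the parent sleeping in down(&sem) on the semaphore it published
 * through current->vfork_sem, and the up() above (run when the child
 * gives back the borrowed mm on exec or exit) is what lets the parent
 * continue.
 */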
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal an active VM for that..
	 */
	mm = current->mm;
	if (!mm)
		return 0;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&mm->mm_users);
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = mm_alloc();
	if (!mm)
		goto fail_nomem;

	tsk->mm = mm;
	tsk->active_mm = mm;

	/*
	 * child gets a private LDT (if there was an LDT in the parent)
	 */
	copy_segments(tsk, mm);

	down(&current->mm->mmap_sem);
	retval = dup_mmap(mm);
	up(&current->mm->mmap_sem);
	if (retval)
		goto free_pt;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	init_new_context(tsk, mm);
	return 0;

free_pt:
	mmput(mm);
fail_nomem:
	return retval;
}
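/*
 * In short: a CLONE_VM child (a thread) just bumps mm_users and shares
 * the parent's page tables, while a plain fork() pays for mm_alloc()
 * plus a full dup_mmap() walk of every vma.  Kernel threads, which have
 * no mm of their own, skip all of this and keep borrowing whatever
 * active_mm they are given.
 */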
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
	if (!tsk->fs)
		return -1;
	atomic_set(&tsk->fs->count, 1);
	tsk->fs->umask = current->fs->umask;
	tsk->fs->root = dget(current->fs->root);
	tsk->fs->pwd = dget(current->fs->pwd);
	return 0;
}
static int count_open_files(struct files_struct *files, int size)
{
	int i;

	/* Find the last open fd */
	for (i = size/(8*sizeof(long)); i > 0; ) {
		if (files->open_fds->fds_bits[--i])
			break;
	}
	i = (i+1) * 8 * sizeof(long);
	return i;
}
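/*
 * The result is rounded up to a whole word of the open-fd bitmap, not
 * the exact fd count.  For example, with 32-bit longs and the highest
 * open descriptor being fd 40, the scan stops at bitmap word 1 and
 * (1 + 1) * 8 * sizeof(long) == 64 descriptors are reported, which is
 * exactly the multiple-of-8 figure the memcpy()s in copy_files() need.
 */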
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int open_files, nfds, size, i, error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	newf->file_lock = RW_LOCK_UNLOCKED;
	newf->max_fds = NR_OPEN_DEFAULT;
	newf->max_fdset = __FD_SETSIZE;
	newf->close_on_exec = &newf->close_on_exec_init;
	newf->open_fds = &newf->open_fds_init;
	newf->fd = &newf->fd_array[0];

	/* We don't yet have the oldf readlock, but even if the old
	   fdset gets grown now, we'll only copy up to "size" fds */
	size = oldf->max_fdset;
	if (size > __FD_SETSIZE) {
		write_lock(&newf->file_lock);
		error = expand_fdset(newf, size);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
	}
	read_lock(&oldf->file_lock);

	open_files = count_open_files(oldf, size);

	/*
	 * Check whether we need to allocate a larger fd array.
	 * Note: we're not a clone task, so the open count won't
	 * change.
	 */
	nfds = NR_OPEN_DEFAULT;
	if (open_files > nfds) {
		read_unlock(&oldf->file_lock);
		write_lock(&newf->file_lock);
		error = expand_fd_array(newf, open_files);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
		nfds = newf->max_fds;
		read_lock(&oldf->file_lock);
	}

	old_fds = oldf->fd;
	new_fds = newf->fd;

	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f)
			get_file(f);
		*new_fds++ = f;
	}
	read_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (newf->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use an optimized version */
	memset(new_fds, 0, size);

	if (newf->max_fdset > open_files) {
		int left = (newf->max_fdset-open_files)/8;
		int start = open_files / (8 * sizeof(unsigned long));

		memset(&newf->open_fds->fds_bits[start], 0, left);
		memset(&newf->close_on_exec->fds_bits[start], 0, left);
	}

	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	free_fdset(newf->close_on_exec, newf->max_fdset);
	free_fdset(newf->open_fds, newf->max_fdset);
	kmem_cache_free(files_cachep, newf);
	goto out;
}
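/*
 * The copy is two-layered: the open_fds/close_on_exec bitmaps are
 * memcpy()'d wholesale (open_files is always a multiple of 8, so
 * open_files/8 is an exact byte count), while the struct file pointers
 * are copied one by one so that each shared file can have its reference
 * count bumped with get_file().  Everything past the last open fd is
 * then zeroed so the child starts with a clean tail.
 */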
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sig->count);
		return 0;
	}
	tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
	if (!tsk->sig)
		return -1;
	spin_lock_init(&tsk->sig->siglock);
	atomic_set(&tsk->sig->count, 1);
	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
	return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		new_flags &= ~(PF_PTRACED|PF_TRACESYS);
	if (clone_flags & CLONE_VFORK)
		new_flags |= PF_VFORK;
	p->flags = new_flags;
}
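/*
 * The child starts with a scrubbed copy of the parent's flags:
 * PF_SUPERPRIV (has used root privileges) and PF_USEDFPU (FPU state
 * live on this CPU) describe things the parent did, not the child;
 * PF_FORKNOEXEC marks a process that has forked but not yet exec'd;
 * and the ptrace flags are dropped unless CLONE_PTRACE explicitly asks
 * for the child to be traced as well.
 */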
/*
 * Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
	int retval = -ENOMEM;
	struct task_struct *p;
	DECLARE_MUTEX_LOCKED(sem);

	current->vfork_sem = &sem;

	p = alloc_task_struct();
	if (!p)
		goto fork_out;

	*p = *current;

	retval = -EAGAIN;
	if (p->user) {
		if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
			goto bad_fork_free;
		atomic_inc(&p->user->count);
	}

	/*
	 * Counter atomicity is protected by
	 * the tasklist_lock.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (p->exec_domain && p->exec_domain->module)
		__MOD_INC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_INC_USE_COUNT(p->binfmt->module);

	p->did_exec = 0;
	p->swappable = 0;
	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	p->pid = get_pid(clone_flags);

	/*
	 * This is a "shadow run" state. The process
	 * is marked runnable, but isn't actually on
	 * any run queue yet.. (that happens at the
	 * very end).
	 */
	p->state = TASK_RUNNING;
	p->run_list.next = NULL;
	p->run_list.prev = NULL;

	p->p_pptr = p->p_opptr = current;
	p->p_cptr = NULL;
	init_waitqueue_head(&p->wait_chldexit);
	p->vfork_sem = NULL;

	p->sigpending = 0;
	sigemptyset(&p->signal);
	p->sigqueue = NULL;
	p->sigqueue_tail = &p->sigqueue;

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->tty_old_pgrp = 0;
	p->times.tms_utime = p->times.tms_stime = 0;
	p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
	{
		int i;
		p->has_cpu = 0;
		p->processor = current->processor;
		/* ?? should we just memset this ?? */
		for(i = 0; i < smp_num_cpus; i++)
			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
		spin_lock_init(&p->sigmask_lock);
	}
#endif
	p->lock_depth = -1;		/* -1 = no lock */
	p->start_time = jiffies;

	retval = -ENOMEM;
	/* copy all the process information */
	if (copy_files(clone_flags, p))
		goto bad_fork_cleanup;
	if (copy_fs(clone_flags, p))
		goto bad_fork_cleanup_files;
	if (copy_sighand(clone_flags, p))
		goto bad_fork_cleanup_fs;
	if (copy_mm(clone_flags, p))
		goto bad_fork_cleanup_sighand;
	retval = copy_thread(0, clone_flags, usp, p, regs);
	if (retval)
		goto bad_fork_cleanup_sighand;
	p->semundo = NULL;

	/* ok, now we should be set up.. */
	p->swappable = 1;
	p->exit_signal = clone_flags & CSIGNAL;
	p->pdeath_signal = 0;

	/*
	 * "share" dynamic priority between parent and child, thus the
	 * total amount of dynamic priorities in the system doesn't change,
	 * more scheduling fairness. This is only important in the first
	 * timeslice; in the long run the scheduling behaviour is unchanged.
	 */
	current->counter >>= 1;
	p->counter = current->counter;

	/*
	 * Ok, add it to the run-queues and make it
	 * visible to the rest of the system.
	 *
	 * Let it rip!
	 */
	retval = p->pid;
	write_lock_irq(&tasklist_lock);
	SET_LINKS(p);
	hash_pid(p);
	nr_threads++;
	write_unlock_irq(&tasklist_lock);

	wake_up_process(p);		/* do this last */
	++total_forks;

fork_out:
	if ((clone_flags & CLONE_VFORK) && (retval > 0))
		down(&sem);
	return retval;

bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup:
	if (p->exec_domain && p->exec_domain->module)
		__MOD_DEC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
	free_uid(p);
bad_fork_free:
	free_task_struct(p);
	goto fork_out;
}
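/*
 * For reference, the architecture entry points are thin wrappers around
 * do_fork(); on i386 they look roughly like this sketch:
 *
 *	asmlinkage int sys_fork(struct pt_regs regs)
 *	{
 *		return do_fork(SIGCHLD, regs.esp, &regs);
 *	}
 *
 *	asmlinkage int sys_vfork(struct pt_regs regs)
 *	{
 *		return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs);
 *	}
 *
 * i.e. the low byte of clone_flags carries the child's exit signal
 * (CSIGNAL) and the child initially runs on the same user stack pointer.
 */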
void __init filescache_init(void)
{
	files_cachep = kmem_cache_create("files_cache",
					 sizeof(struct files_struct),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL, NULL);
	if (!files_cachep)
		panic("Cannot create files cache");
}