/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also entry.S and others).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
 */
#include <linux/config.h>
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
/* The idle threads do not count.. */
int nr_threads;

int max_threads;
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int last_pid;
/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];
/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
	atomic_t count;
	struct user_struct *next, **pprev;
	unsigned int uid;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
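/*
 * Example (illustration only, not part of the original source): if
 * UIDHASH_SZ were 256, uid 1000 (0x3e8) would hash to
 * ((0x3e8 >> 8) ^ 0x3e8) & 0xff = (0x3 ^ 0x3e8) & 0xff = 0xeb = 235.
 * Folding the high byte in keeps uids that share a low byte from all
 * landing in the same bucket.
 */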
/*
 * These routines must be called with the uidhash spinlock held!
 */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
	if((up->next = uidhash[hashent]) != NULL)
		uidhash[hashent]->pprev = &up->next;
	up->pprev = &uidhash[hashent];
	uidhash[hashent] = up;
}
static inline void uid_hash_remove(struct user_struct *up)
{
	if(up->next)
		up->next->pprev = up->pprev;
	*up->pprev = up->next;
}
static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
	struct user_struct *up, *next;

	next = uidhash[hashent];
	/* ... (walk of the hash chain elided in this excerpt; an entry with a
	 *	matching uid is pinned with atomic_inc(&up->count) before it
	 *	is returned) ... */
}
/*
 * For SMP, we need to re-test the user struct counter
 * after having acquired the spinlock. This allows us to do
 * the common case (not freeing anything) without having
 * any locking overhead.
 */
#ifdef CONFIG_SMP
#define uid_hash_free(up)	(!atomic_read(&(up)->count))
#else
#define uid_hash_free(up)	(1)
#endif
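/*
 * Note on the pattern above (added for illustration, not in the original
 * file): free_uid() below drops the reference with atomic_dec_and_test()
 * before taking uidhash_lock.  On SMP another CPU can re-find and re-pin
 * the entry in that window, so the count must be re-tested under the
 * lock; on UP (no kernel preemption in this era) that window cannot be
 * entered, so uid_hash_free() can simply be true.
 */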
void free_uid(struct task_struct *p)
{
	struct user_struct *up = p->user;

	if (up) {
		p->user = NULL;
		if (atomic_dec_and_test(&up->count)) {
			spin_lock(&uidhash_lock);
			if (uid_hash_free(up)) {
				uid_hash_remove(up);
				kmem_cache_free(uid_cachep, up);
			}
			spin_unlock(&uidhash_lock);
		}
	}
}
int alloc_uid(struct task_struct *p)
{
	unsigned int hashent = uidhashfn(p->uid);
	struct user_struct *up;

	spin_lock(&uidhash_lock);
	up = uid_hash_find(p->uid, hashent);
	spin_unlock(&uidhash_lock);

	if (!up) {
		struct user_struct *new;

		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
		if (!new)
			return -EAGAIN;
		new->uid = p->uid;
		atomic_set(&new->count, 1);

		/*
		 * Before adding this, check whether we raced
		 * on adding the same user already..
		 */
		spin_lock(&uidhash_lock);
		up = uid_hash_find(p->uid, hashent);
		if (up) {
			kmem_cache_free(uid_cachep, new);
		} else {
			uid_hash_insert(new, hashent);
			up = new;
		}
		spin_unlock(&uidhash_lock);
	}
	p->user = up;
	return 0;
}
void __init fork_init(unsigned long mempages)
{
	int i;

	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
				       0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	if(!uid_cachep)
		panic("Cannot create uid taskcount SLAB cache\n");

	for(i = 0; i < UIDHASH_SZ; i++)
		uidhash[i] = 0;

	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most half
	 * of memory.
	 */
	max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;

	init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
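/*
 * Worked example (illustration only): on i386 with 4 KB pages and an
 * 8 KB THREAD_SIZE, a 128 MB machine has mempages = 32768, so
 * max_threads = 32768 / 2 / 2 = 8192 and each user is limited to
 * 4096 processes by the RLIMIT_NPROC default set above.
 */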
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
static int get_pid(unsigned long flags)
{
	static int next_safe = PID_MAX;
	struct task_struct *p;

	if (flags & CLONE_PID)
		return current->pid;

	spin_lock(&lastpid_lock);
	if((++last_pid) & 0xffff8000) {
		last_pid = 300;		/* Skip daemons etc. */
		goto inside;
	}
	if(last_pid >= next_safe) {
inside:
		next_safe = PID_MAX;
		read_lock(&tasklist_lock);
	repeat:
		for_each_task(p) {
			if(p->pid == last_pid	||
			   p->pgrp == last_pid	||
			   p->session == last_pid) {
				if(++last_pid >= next_safe) {
					if(last_pid & 0xffff8000)
						last_pid = 300;
					next_safe = PID_MAX;
				}
				goto repeat;
			}
			if(p->pid > last_pid && next_safe > p->pid)
				next_safe = p->pid;
			if(p->pgrp > last_pid && next_safe > p->pgrp)
				next_safe = p->pgrp;
			if(p->session > last_pid && next_safe > p->session)
				next_safe = p->session;
		}
		read_unlock(&tasklist_lock);
	}
	spin_unlock(&lastpid_lock);

	return last_pid;
}
static inline int dup_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;

	/* Kill me slowly. UGLY! FIXME! */
	memcpy(&mm->start_code, &current->mm->start_code, 15*sizeof(unsigned long));

	flush_cache_mm(current->mm);
	pprev = &mm->mmap;
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		retval = -ENOMEM;
		if(mpnt->vm_flags & VM_DONTCOPY)
			continue;
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		mm->map_count++;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file->f_dentry->d_inode;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);

			/* insert tmp into the share list, just after mpnt */
			spin_lock(&inode->i_mapping->i_shared_lock);
			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
				mpnt->vm_next_share->vm_pprev_share =
					&tmp->vm_next_share;
			mpnt->vm_next_share = tmp;
			tmp->vm_pprev_share = &mpnt->vm_next_share;
			spin_unlock(&inode->i_mapping->i_shared_lock);
		}

		/* Copy the pages, but defer checking for errors */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		/*
		 * Link in the new vma even if an error occurred,
		 * so that exit_mmap() can clean up the mess.
		 */
		tmp->vm_next = *pprev;
		*pprev = tmp;

		pprev = &tmp->vm_next;
		if (retval)
			goto fail_nomem;
	}
	retval = 0;
	if (mm->map_count >= AVL_MIN_MAP_COUNT)
		build_mmap_avl(mm);

fail_nomem:
	flush_tlb_mm(current->mm);
	return retval;
}
/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
	if (mm) {
		memset(mm, 0, sizeof(*mm));
		atomic_set(&mm->mm_users, 1);
		atomic_set(&mm->mm_count, 1);
		init_MUTEX(&mm->mmap_sem);
		mm->page_table_lock = SPIN_LOCK_UNLOCKED;
		mm->pgd = pgd_alloc();
		if (mm->pgd)
			return mm;
		kmem_cache_free(mm_cachep, mm);
		mm = NULL;
	}
	return mm;
}
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
	if (mm == &init_mm) BUG();
	pgd_free(mm->pgd);
	destroy_context(mm);
	kmem_cache_free(mm_cachep, mm);
}
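/*
 * For reference (defined in <linux/sched.h>, not in this file): callers
 * normally go through the mmdrop() wrapper, which is roughly
 *
 *	if (atomic_dec_and_test(&mm->mm_count))
 *		__mmdrop(mm);
 *
 * so __mmdrop() only runs once the last mm_count reference is gone.
 */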
/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		exit_mmap(mm);
		mmdrop(mm);
	}
}
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one. Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
	struct task_struct *tsk = current;

	/* notify parent sleeping on vfork() */
	if (tsk->flags & PF_VFORK) {
		tsk->flags &= ~PF_VFORK;
		up(tsk->p_opptr->vfork_sem);
	}
}
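/*
 * Illustration of the distinction above (not in the original source): an
 * execve() that fails after building a new mm simply does mmput(new_mm)
 * and keeps running on the old mm; mm_release() is only called once an
 * mm is really detached from current (exec or exit), which is the point
 * at which a vfork()ing parent can safely be woken.
 */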
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal an active VM for that..
	 */
	mm = current->mm;
	if (!mm)
		return 0;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&mm->mm_users);
		goto good_mm;
	}

	/* ... (allocation of a fresh mm with mm_alloc() elided in this
	 *	excerpt; it bails out with -ENOMEM on failure) ... */

	/*
	 * child gets a private LDT (if there was an LDT in the parent)
	 */
	copy_segments(tsk, mm);

	down(&current->mm->mmap_sem);
	retval = dup_mmap(mm);
	up(&current->mm->mmap_sem);
	/* ... (on error, mmput() the new mm and propagate retval) ... */

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	init_new_context(tsk,mm);
	return 0;
}
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
	struct fs_struct *fs = kmalloc(sizeof(*old), GFP_KERNEL);

	if (fs) {
		atomic_set(&fs->count, 1);
		fs->umask = old->umask;
		fs->rootmnt = mntget(old->rootmnt);
		fs->root = dget(old->root);
		fs->pwdmnt = mntget(old->pwdmnt);
		fs->pwd = dget(old->pwd);
		if (old->altroot) {
			fs->altrootmnt = mntget(old->altrootmnt);
			fs->altroot = dget(old->altroot);
		} else {
			fs->altrootmnt = NULL;
			fs->altroot = NULL;
		}
	}
	return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	return __copy_fs_struct(old);
}
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = __copy_fs_struct(current->fs);
	if (!tsk->fs)
		return -1;
	return 0;
}
static int count_open_files(struct files_struct *files, int size)
{
	int i;

	/* Find the last open fd */
	for (i = size/(8*sizeof(long)); i > 0; ) {
		if (files->open_fds->fds_bits[--i])
			break;
	}
	i = (i+1) * 8 * sizeof(long);
	return i;
}
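/*
 * Worked example (illustration only): on a 32-bit box with size = 256,
 * the loop starts at i = 256/32 = 8 and scans the fds_bits words
 * downwards; if word 2 is the highest word with an open fd, the loop
 * breaks with i = 2 and the function returns (2+1)*32 = 96, i.e. a
 * word-granular upper bound on the fds worth copying.
 */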
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int open_files, nfds, size, i, error = 0;
	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	newf->file_lock	    = RW_LOCK_UNLOCKED;
	newf->next_fd	    = 0;
	newf->max_fds	    = NR_OPEN_DEFAULT;
	newf->max_fdset	    = __FD_SETSIZE;
	newf->close_on_exec = &newf->close_on_exec_init;
	newf->open_fds	    = &newf->open_fds_init;
	newf->fd	    = &newf->fd_array[0];
	/* We don't yet have the oldf readlock, but even if the old
	   fdset gets grown now, we'll only copy up to "size" fds */
	size = oldf->max_fdset;
	if (size > __FD_SETSIZE) {
		newf->max_fdset = 0;
		write_lock(&newf->file_lock);
		error = expand_fdset(newf, size);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
	}
	read_lock(&oldf->file_lock);
	open_files = count_open_files(oldf, size);

	/*
	 * Check whether we need to allocate a larger fd array.
	 * Note: we're not a clone task, so the open count won't
	 * change.
	 */
	nfds = NR_OPEN_DEFAULT;
	if (open_files > nfds) {
		read_unlock(&oldf->file_lock);
		newf->max_fds = 0;
		write_lock(&newf->file_lock);
		error = expand_fd_array(newf, open_files);
		write_unlock(&newf->file_lock);
		if (error)
			goto out_release;
		nfds = newf->max_fds;
		read_lock(&oldf->file_lock);
	}
	old_fds = oldf->fd;
	new_fds = newf->fd;

	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f)
			get_file(f);
		*new_fds++ = f;
	}
	read_unlock(&oldf->file_lock);
	/* compute the remainder to be cleared */
	size = (newf->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use an optimized version */
	memset(new_fds, 0, size);

	if (newf->max_fdset > open_files) {
		int left = (newf->max_fdset-open_files)/8;
		int start = open_files / (8 * sizeof(unsigned long));

		memset(&newf->open_fds->fds_bits[start], 0, left);
		memset(&newf->close_on_exec->fds_bits[start], 0, left);
	}
	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	free_fdset (newf->close_on_exec, newf->max_fdset);
	free_fdset (newf->open_fds, newf->max_fdset);
	kmem_cache_free(files_cachep, newf);
	goto out;
}
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sig->count);
		return 0;
	}
	tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
	if (!tsk->sig)
		return -1;
	spin_lock_init(&tsk->sig->siglock);
	atomic_set(&tsk->sig->count, 1);
	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
	return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		new_flags &= ~(PF_PTRACED|PF_TRACESYS);
	if (clone_flags & CLONE_VFORK)
		new_flags |= PF_VFORK;
	p->flags = new_flags;
}
/*
 * Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
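/*
 * For context (architecture code, not part of this file): on i386 the
 * fork-family system calls end up here roughly as
 *
 *	sys_fork:  do_fork(SIGCHLD, regs.esp, &regs);
 *	sys_clone: do_fork(clone_flags, newsp, &regs);
 *	sys_vfork: do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs);
 *
 * i.e. clone_flags selects the sharing semantics and usp seeds the
 * child's user stack pointer.
 */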
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
	int retval = -ENOMEM;
	struct task_struct *p;
	DECLARE_MUTEX_LOCKED(sem);

	if (clone_flags & CLONE_PID) {
		/* This is only allowed from the boot up thread */
		if (current->pid)
			return -EPERM;
	}

	current->vfork_sem = &sem;

	p = alloc_task_struct();
	if (!p)
		goto fork_out;
	*p = *current;

	retval = -EAGAIN;
	if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
		goto bad_fork_free;
	atomic_inc(&p->user->count);

	/*
	 * Counter increases are protected by
	 * the kernel lock so nr_threads can't
	 * increase under us (but it may decrease).
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (p->exec_domain && p->exec_domain->module)
		__MOD_INC_USE_COUNT(p->exec_domain->module);
	if (p->binfmt && p->binfmt->module)
		__MOD_INC_USE_COUNT(p->binfmt->module);
	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	p->pid = get_pid(clone_flags);

	/*
	 * This is a "shadow run" state. The process
	 * is marked runnable, but isn't actually on
	 * any run queue yet.. (that happens at the
	 * very end).
	 */
	p->state = TASK_RUNNING;
	p->run_list.next = NULL;
	p->run_list.prev = NULL;
	if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
		p->p_opptr = current;
		if (!(current->flags & PF_PTRACED))
			p->p_pptr = current;
	}

	init_waitqueue_head(&p->wait_chldexit);
	spin_lock_init(&p->alloc_lock);

	sigemptyset(&p->signal);
	p->sigqueue_tail = &p->sigqueue;

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->times.tms_utime = p->times.tms_stime = 0;
	p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
	{
		int i;

		p->processor = current->processor;
		/* ?? should we just memset this ?? */
		for(i = 0; i < smp_num_cpus; i++)
			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
		spin_lock_init(&p->sigmask_lock);
	}
#endif
	p->lock_depth = -1;		/* -1 = no lock */
	p->start_time = jiffies;
	/* copy all the process information */
	if (copy_files(clone_flags, p))
		goto bad_fork_cleanup;
	if (copy_fs(clone_flags, p))
		goto bad_fork_cleanup_files;
	if (copy_sighand(clone_flags, p))
		goto bad_fork_cleanup_fs;
	if (copy_mm(clone_flags, p))
		goto bad_fork_cleanup_sighand;
	retval = copy_thread(0, clone_flags, usp, p, regs);
	if (retval)
		goto bad_fork_cleanup_sighand;
	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */
	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	p->exit_signal = clone_flags & CSIGNAL;
	p->pdeath_signal = 0;
	/*
	 * "share" dynamic priority between parent and child, thus the
	 * total amount of dynamic priorities in the system doesn't change,
	 * more scheduling fairness. This is only important in the first
	 * timeslice, on the long run the scheduling behaviour is unchanged.
	 */
	p->counter = (current->counter + 1) >> 1;
	current->counter >>= 1;
	if (!current->counter)
		current->need_resched = 1;
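	/*
	 * Worked example (illustration only): if the parent has 11 ticks
	 * left, the child gets (11 + 1) >> 1 = 6 and the parent keeps
	 * 11 >> 1 = 5, so the total changes by at most one tick and the
	 * parent's remaining timeslice is effectively split with the child.
	 */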
	/*
	 * Ok, add it to the run-queues and make it
	 * visible to the rest of the system.
	 *
	 * Let it rip!
	 */
	retval = p->pid;
	write_lock_irq(&tasklist_lock);
	SET_LINKS(p);
	hash_pid(p);
	nr_threads++;
	write_unlock_irq(&tasklist_lock);

	wake_up_process(p);		/* do this last */
	++total_forks;
fork_out:
	if ((clone_flags & CLONE_VFORK) && (retval > 0))
		down(&sem);
	return retval;

bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p);	/* blocking */
bad_fork_cleanup_files:
	exit_files(p);	/* blocking */
bad_fork_cleanup:
	put_exec_domain(p->exec_domain);
	if (p->binfmt && p->binfmt->module)
		__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
	atomic_dec(&p->user->count);
bad_fork_free:
	free_task_struct(p);
	goto fork_out;
}
void __init filescache_init(void)
{
	files_cachep = kmem_cache_create("files_cache",
			 sizeof(struct files_struct),
			 0,
			 SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!files_cachep)
		panic("Cannot create files cache");
}