/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
 */
14 #include <linux/config.h>
15 #include <linux/malloc.h>
16 #include <linux/init.h>
17 #include <linux/unistd.h>
18 #include <linux/smp_lock.h>
19 #include <linux/module.h>
20 #include <linux/vmalloc.h>
22 #include <asm/pgtable.h>
23 #include <asm/pgalloc.h>
24 #include <asm/uaccess.h>
25 #include <asm/mmu_context.h>
27 /* The idle threads do not count.. */
32 unsigned long total_forks
; /* Handle normal Linux uptimes. */
35 struct task_struct
*pidhash
[PIDHASH_SZ
];
37 void add_wait_queue(wait_queue_head_t
*q
, wait_queue_t
* wait
)
41 wq_write_lock_irqsave(&q
->lock
, flags
);
43 __add_wait_queue(q
, wait
);
44 wq_write_unlock_irqrestore(&q
->lock
, flags
);
47 void add_wait_queue_exclusive(wait_queue_head_t
*q
, wait_queue_t
* wait
)
51 wq_write_lock_irqsave(&q
->lock
, flags
);
52 wait
->flags
= WQ_FLAG_EXCLUSIVE
;
53 __add_wait_queue_tail(q
, wait
);
54 wq_write_unlock_irqrestore(&q
->lock
, flags
);
57 void remove_wait_queue(wait_queue_head_t
*q
, wait_queue_t
* wait
)
61 wq_write_lock_irqsave(&q
->lock
, flags
);
62 __remove_wait_queue(q
, wait
);
63 wq_write_unlock_irqrestore(&q
->lock
, flags
);
66 void __init
fork_init(unsigned long mempages
)
69 * The default maximum number of threads is set to a safe
70 * value: the thread structures can take up at most half
73 max_threads
= mempages
/ (THREAD_SIZE
/PAGE_SIZE
) / 2;
75 init_task
.rlim
[RLIMIT_NPROC
].rlim_cur
= max_threads
/2;
76 init_task
.rlim
[RLIMIT_NPROC
].rlim_max
= max_threads
/2;
79 /* Protects next_safe and last_pid. */
80 spinlock_t lastpid_lock
= SPIN_LOCK_UNLOCKED
;
82 static int get_pid(unsigned long flags
)
84 static int next_safe
= PID_MAX
;
85 struct task_struct
*p
;
87 if (flags
& CLONE_PID
)
90 spin_lock(&lastpid_lock
);
91 if((++last_pid
) & 0xffff8000) {
92 last_pid
= 300; /* Skip daemons etc. */
95 if(last_pid
>= next_safe
) {
98 read_lock(&tasklist_lock
);
101 if(p
->pid
== last_pid
||
102 p
->pgrp
== last_pid
||
103 p
->session
== last_pid
) {
104 if(++last_pid
>= next_safe
) {
105 if(last_pid
& 0xffff8000)
111 if(p
->pid
> last_pid
&& next_safe
> p
->pid
)
113 if(p
->pgrp
> last_pid
&& next_safe
> p
->pgrp
)
115 if(p
->session
> last_pid
&& next_safe
> p
->session
)
116 next_safe
= p
->session
;
118 read_unlock(&tasklist_lock
);
120 spin_unlock(&lastpid_lock
);
125 static inline int dup_mmap(struct mm_struct
* mm
)
127 struct vm_area_struct
* mpnt
, *tmp
, **pprev
;
130 flush_cache_mm(current
->mm
);
134 mm
->mmap_cache
= NULL
;
139 mm
->swap_address
= 0;
142 for (mpnt
= current
->mm
->mmap
; mpnt
; mpnt
= mpnt
->vm_next
) {
146 if(mpnt
->vm_flags
& VM_DONTCOPY
)
148 tmp
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
152 tmp
->vm_flags
&= ~VM_LOCKED
;
158 struct inode
*inode
= file
->f_dentry
->d_inode
;
160 if (tmp
->vm_flags
& VM_DENYWRITE
)
161 atomic_dec(&inode
->i_writecount
);
163 /* insert tmp into the share list, just after mpnt */
164 spin_lock(&inode
->i_mapping
->i_shared_lock
);
165 if((tmp
->vm_next_share
= mpnt
->vm_next_share
) != NULL
)
166 mpnt
->vm_next_share
->vm_pprev_share
=
168 mpnt
->vm_next_share
= tmp
;
169 tmp
->vm_pprev_share
= &mpnt
->vm_next_share
;
170 spin_unlock(&inode
->i_mapping
->i_shared_lock
);
173 /* Copy the pages, but defer checking for errors */
174 retval
= copy_page_range(mm
, current
->mm
, tmp
);
175 if (!retval
&& tmp
->vm_ops
&& tmp
->vm_ops
->open
)
176 tmp
->vm_ops
->open(tmp
);
179 * Link in the new vma even if an error occurred,
180 * so that exit_mmap() can clean up the mess.
182 tmp
->vm_next
= *pprev
;
185 pprev
= &tmp
->vm_next
;
190 if (mm
->map_count
>= AVL_MIN_MAP_COUNT
)
194 flush_tlb_mm(current
->mm
);
/* mm_struct allocation helpers — one SLAB object per mm. */
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
201 static struct mm_struct
* mm_init(struct mm_struct
* mm
)
203 atomic_set(&mm
->mm_users
, 1);
204 atomic_set(&mm
->mm_count
, 1);
205 init_MUTEX(&mm
->mmap_sem
);
206 mm
->page_table_lock
= SPIN_LOCK_UNLOCKED
;
207 mm
->pgd
= pgd_alloc();
216 * Allocate and initialize an mm_struct.
218 struct mm_struct
* mm_alloc(void)
220 struct mm_struct
* mm
;
224 memset(mm
, 0, sizeof(*mm
));
231 * Called when the last reference to the mm
232 * is dropped: either by a lazy thread or by
233 * mmput. Free the page directory and the mm.
235 inline void __mmdrop(struct mm_struct
*mm
)
237 if (mm
== &init_mm
) BUG();
244 * Decrement the use count and release all resources for an mm.
246 void mmput(struct mm_struct
*mm
)
248 if (atomic_dec_and_test(&mm
->mm_users
)) {
254 /* Please note the differences between mmput and mm_release.
255 * mmput is called whenever we stop holding onto a mm_struct,
256 * error success whatever.
258 * mm_release is called after a mm_struct has been removed
259 * from the current process.
261 * This difference is important for error handling, when we
262 * only half set up a mm_struct for a new process and need to restore
263 * the old one. Because we mmput the new mm_struct before
264 * restoring the old one. . .
265 * Eric Biederman 10 January 1998
267 void mm_release(void)
269 struct task_struct
*tsk
= current
;
271 /* notify parent sleeping on vfork() */
272 if (tsk
->flags
& PF_VFORK
) {
273 tsk
->flags
&= ~PF_VFORK
;
274 up(tsk
->p_opptr
->vfork_sem
);
278 static inline int copy_mm(unsigned long clone_flags
, struct task_struct
* tsk
)
280 struct mm_struct
* mm
;
283 tsk
->min_flt
= tsk
->maj_flt
= 0;
284 tsk
->cmin_flt
= tsk
->cmaj_flt
= 0;
285 tsk
->nswap
= tsk
->cnswap
= 0;
288 tsk
->active_mm
= NULL
;
291 * Are we cloning a kernel thread?
293 * We need to steal a active VM for that..
299 if (clone_flags
& CLONE_VM
) {
300 atomic_inc(&mm
->mm_users
);
309 /* Copy the current MM stuff.. */
310 memcpy(mm
, current
->mm
, sizeof(*mm
));
317 down(¤t
->mm
->mmap_sem
);
318 retval
= dup_mmap(mm
);
319 up(¤t
->mm
->mmap_sem
);
324 * child gets a private LDT (if there was an LDT in the parent)
326 copy_segments(tsk
, mm
);
328 if (init_new_context(tsk
,mm
))
342 static inline struct fs_struct
*__copy_fs_struct(struct fs_struct
*old
)
344 struct fs_struct
*fs
= kmem_cache_alloc(fs_cachep
, GFP_KERNEL
);
345 /* We don't need to lock fs - think why ;-) */
347 atomic_set(&fs
->count
, 1);
348 fs
->lock
= RW_LOCK_UNLOCKED
;
349 fs
->umask
= old
->umask
;
350 read_lock(&old
->lock
);
351 fs
->rootmnt
= mntget(old
->rootmnt
);
352 fs
->root
= dget(old
->root
);
353 fs
->pwdmnt
= mntget(old
->pwdmnt
);
354 fs
->pwd
= dget(old
->pwd
);
356 fs
->altrootmnt
= mntget(old
->altrootmnt
);
357 fs
->altroot
= dget(old
->altroot
);
359 fs
->altrootmnt
= NULL
;
362 read_unlock(&old
->lock
);
/* Public wrapper around __copy_fs_struct(). */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	return __copy_fs_struct(old);
}
372 static inline int copy_fs(unsigned long clone_flags
, struct task_struct
* tsk
)
374 if (clone_flags
& CLONE_FS
) {
375 atomic_inc(¤t
->fs
->count
);
378 tsk
->fs
= __copy_fs_struct(current
->fs
);
384 static int count_open_files(struct files_struct
*files
, int size
)
388 /* Find the last open fd */
389 for (i
= size
/(8*sizeof(long)); i
> 0; ) {
390 if (files
->open_fds
->fds_bits
[--i
])
393 i
= (i
+1) * 8 * sizeof(long);
397 static int copy_files(unsigned long clone_flags
, struct task_struct
* tsk
)
399 struct files_struct
*oldf
, *newf
;
400 struct file
**old_fds
, **new_fds
;
401 int open_files
, nfds
, size
, i
, error
= 0;
404 * A background process may not have any files ...
406 oldf
= current
->files
;
410 if (clone_flags
& CLONE_FILES
) {
411 atomic_inc(&oldf
->count
);
417 newf
= kmem_cache_alloc(files_cachep
, SLAB_KERNEL
);
421 atomic_set(&newf
->count
, 1);
423 newf
->file_lock
= RW_LOCK_UNLOCKED
;
425 newf
->max_fds
= NR_OPEN_DEFAULT
;
426 newf
->max_fdset
= __FD_SETSIZE
;
427 newf
->close_on_exec
= &newf
->close_on_exec_init
;
428 newf
->open_fds
= &newf
->open_fds_init
;
429 newf
->fd
= &newf
->fd_array
[0];
431 /* We don't yet have the oldf readlock, but even if the old
432 fdset gets grown now, we'll only copy up to "size" fds */
433 size
= oldf
->max_fdset
;
434 if (size
> __FD_SETSIZE
) {
436 write_lock(&newf
->file_lock
);
437 error
= expand_fdset(newf
, size
);
438 write_unlock(&newf
->file_lock
);
442 read_lock(&oldf
->file_lock
);
444 open_files
= count_open_files(oldf
, size
);
447 * Check whether we need to allocate a larger fd array.
448 * Note: we're not a clone task, so the open count won't
451 nfds
= NR_OPEN_DEFAULT
;
452 if (open_files
> nfds
) {
453 read_unlock(&oldf
->file_lock
);
455 write_lock(&newf
->file_lock
);
456 error
= expand_fd_array(newf
, open_files
);
457 write_unlock(&newf
->file_lock
);
460 nfds
= newf
->max_fds
;
461 read_lock(&oldf
->file_lock
);
467 memcpy(newf
->open_fds
->fds_bits
, oldf
->open_fds
->fds_bits
, open_files
/8);
468 memcpy(newf
->close_on_exec
->fds_bits
, oldf
->close_on_exec
->fds_bits
, open_files
/8);
470 for (i
= open_files
; i
!= 0; i
--) {
471 struct file
*f
= *old_fds
++;
476 read_unlock(&oldf
->file_lock
);
478 /* compute the remainder to be cleared */
479 size
= (newf
->max_fds
- open_files
) * sizeof(struct file
*);
481 /* This is long word aligned thus could use a optimized version */
482 memset(new_fds
, 0, size
);
484 if (newf
->max_fdset
> open_files
) {
485 int left
= (newf
->max_fdset
-open_files
)/8;
486 int start
= open_files
/ (8 * sizeof(unsigned long));
488 memset(&newf
->open_fds
->fds_bits
[start
], 0, left
);
489 memset(&newf
->close_on_exec
->fds_bits
[start
], 0, left
);
498 free_fdset (newf
->close_on_exec
, newf
->max_fdset
);
499 free_fdset (newf
->open_fds
, newf
->max_fdset
);
500 kmem_cache_free(files_cachep
, newf
);
504 static inline int copy_sighand(unsigned long clone_flags
, struct task_struct
* tsk
)
506 struct signal_struct
*sig
;
508 if (clone_flags
& CLONE_SIGHAND
) {
509 atomic_inc(¤t
->sig
->count
);
512 sig
= kmem_cache_alloc(sigact_cachep
, GFP_KERNEL
);
516 spin_lock_init(&sig
->siglock
);
517 atomic_set(&sig
->count
, 1);
518 memcpy(tsk
->sig
->action
, current
->sig
->action
, sizeof(tsk
->sig
->action
));
522 static inline void copy_flags(unsigned long clone_flags
, struct task_struct
*p
)
524 unsigned long new_flags
= p
->flags
;
526 new_flags
&= ~(PF_SUPERPRIV
| PF_USEDFPU
| PF_VFORK
);
527 new_flags
|= PF_FORKNOEXEC
;
528 if (!(clone_flags
& CLONE_PTRACE
))
530 if (clone_flags
& CLONE_VFORK
)
531 new_flags
|= PF_VFORK
;
532 p
->flags
= new_flags
;
536 * Ok, this is the main fork-routine. It copies the system process
537 * information (task[nr]) and sets up the necessary registers. It also
538 * copies the data segment in its entirety. The "stack_start" and
539 * "stack_top" arguments are simply passed along to the platform
540 * specific copy_thread() routine. Most platforms ignore stack_top.
541 * For an example that's using stack_top, see
542 * arch/ia64/kernel/process.c.
544 int do_fork(unsigned long clone_flags
, unsigned long stack_start
,
545 struct pt_regs
*regs
, unsigned long stack_top
)
547 int retval
= -ENOMEM
;
548 struct task_struct
*p
;
549 DECLARE_MUTEX_LOCKED(sem
);
551 if (clone_flags
& CLONE_PID
) {
552 /* This is only allowed from the boot up thread */
557 current
->vfork_sem
= &sem
;
559 p
= alloc_task_struct();
566 if (atomic_read(&p
->user
->processes
) >= p
->rlim
[RLIMIT_NPROC
].rlim_cur
)
568 atomic_inc(&p
->user
->__count
);
569 atomic_inc(&p
->user
->processes
);
572 * Counter increases are protected by
573 * the kernel lock so nr_threads can't
574 * increase under us (but it may decrease).
576 if (nr_threads
>= max_threads
)
577 goto bad_fork_cleanup_count
;
579 get_exec_domain(p
->exec_domain
);
581 if (p
->binfmt
&& p
->binfmt
->module
)
582 __MOD_INC_USE_COUNT(p
->binfmt
->module
);
586 p
->state
= TASK_UNINTERRUPTIBLE
;
588 copy_flags(clone_flags
, p
);
589 p
->pid
= get_pid(clone_flags
);
591 p
->run_list
.next
= NULL
;
592 p
->run_list
.prev
= NULL
;
594 if ((clone_flags
& CLONE_VFORK
) || !(clone_flags
& CLONE_PARENT
)) {
595 p
->p_opptr
= current
;
596 if (!(p
->ptrace
& PT_PTRACED
))
600 init_waitqueue_head(&p
->wait_chldexit
);
602 spin_lock_init(&p
->alloc_lock
);
605 init_sigpending(&p
->pending
);
607 p
->it_real_value
= p
->it_virt_value
= p
->it_prof_value
= 0;
608 p
->it_real_incr
= p
->it_virt_incr
= p
->it_prof_incr
= 0;
609 init_timer(&p
->real_timer
);
610 p
->real_timer
.data
= (unsigned long) p
;
612 p
->leader
= 0; /* session leadership doesn't inherit */
614 p
->times
.tms_utime
= p
->times
.tms_stime
= 0;
615 p
->times
.tms_cutime
= p
->times
.tms_cstime
= 0;
620 p
->processor
= current
->processor
;
621 /* ?? should we just memset this ?? */
622 for(i
= 0; i
< smp_num_cpus
; i
++)
623 p
->per_cpu_utime
[i
] = p
->per_cpu_stime
[i
] = 0;
624 spin_lock_init(&p
->sigmask_lock
);
627 p
->lock_depth
= -1; /* -1 = no lock */
628 p
->start_time
= jiffies
;
631 /* copy all the process information */
632 if (copy_files(clone_flags
, p
))
633 goto bad_fork_cleanup
;
634 if (copy_fs(clone_flags
, p
))
635 goto bad_fork_cleanup_files
;
636 if (copy_sighand(clone_flags
, p
))
637 goto bad_fork_cleanup_fs
;
638 if (copy_mm(clone_flags
, p
))
639 goto bad_fork_cleanup_sighand
;
640 retval
= copy_thread(0, clone_flags
, stack_start
, stack_top
, p
, regs
);
642 goto bad_fork_cleanup_sighand
;
645 /* Our parent execution domain becomes current domain
646 These must match for thread signalling to apply */
648 p
->parent_exec_id
= p
->self_exec_id
;
650 /* ok, now we should be set up.. */
652 p
->exit_signal
= clone_flags
& CSIGNAL
;
653 p
->pdeath_signal
= 0;
656 * "share" dynamic priority between parent and child, thus the
657 * total amount of dynamic priorities in the system doesnt change,
658 * more scheduling fairness. This is only important in the first
659 * timeslice, on the long run the scheduling behaviour is unchanged.
661 p
->counter
= (current
->counter
+ 1) >> 1;
662 current
->counter
>>= 1;
663 if (!current
->counter
)
664 current
->need_resched
= 1;
667 * Ok, add it to the run-queues and make it
668 * visible to the rest of the system.
674 INIT_LIST_HEAD(&p
->thread_group
);
675 write_lock_irq(&tasklist_lock
);
676 if (clone_flags
& CLONE_THREAD
) {
677 p
->tgid
= current
->tgid
;
678 list_add(&p
->thread_group
, ¤t
->thread_group
);
683 write_unlock_irq(&tasklist_lock
);
685 if (p
->ptrace
& PT_PTRACED
)
686 send_sig(SIGSTOP
, p
, 1);
688 wake_up_process(p
); /* do this last */
692 if ((clone_flags
& CLONE_VFORK
) && (retval
> 0))
696 bad_fork_cleanup_sighand
:
699 exit_fs(p
); /* blocking */
700 bad_fork_cleanup_files
:
701 exit_files(p
); /* blocking */
703 put_exec_domain(p
->exec_domain
);
704 if (p
->binfmt
&& p
->binfmt
->module
)
705 __MOD_DEC_USE_COUNT(p
->binfmt
->module
);
706 bad_fork_cleanup_count
:
707 atomic_dec(&p
->user
->processes
);
714 /* SLAB cache for signal_struct structures (tsk->sig) */
715 kmem_cache_t
*sigact_cachep
;
717 /* SLAB cache for files_struct structures (tsk->files) */
718 kmem_cache_t
*files_cachep
;
720 /* SLAB cache for fs_struct structures (tsk->fs) */
721 kmem_cache_t
*fs_cachep
;
723 /* SLAB cache for vm_area_struct structures */
724 kmem_cache_t
*vm_area_cachep
;
726 /* SLAB cache for mm_struct structures (tsk->mm) */
727 kmem_cache_t
*mm_cachep
;
729 void __init
proc_caches_init(void)
731 sigact_cachep
= kmem_cache_create("signal_act",
732 sizeof(struct signal_struct
), 0,
733 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
735 panic("Cannot create signal action SLAB cache");
737 files_cachep
= kmem_cache_create("files_cache",
738 sizeof(struct files_struct
), 0,
739 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
741 panic("Cannot create files SLAB cache");
743 fs_cachep
= kmem_cache_create("fs_cache",
744 sizeof(struct fs_struct
), 0,
745 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
747 panic("Cannot create fs_struct SLAB cache");
749 vm_area_cachep
= kmem_cache_create("vm_area_struct",
750 sizeof(struct vm_area_struct
), 0,
751 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
753 panic("vma_init: Cannot alloc vm_area_struct SLAB cache");
755 mm_cachep
= kmem_cache_create("mm_struct",
756 sizeof(struct mm_struct
), 0,
757 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
759 panic("vma_init: Cannot alloc mm_struct SLAB cache");