[davej-history.git] / kernel/fork.c  ("Linus: more PageDirty / swapcache handling")
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
 */

#include <linux/config.h>
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
/* The idle threads do not count.. */
int nr_threads;
int nr_running;

int max_threads;
unsigned long total_forks;      /* Handle normal Linux uptimes. */
int last_pid;

struct task_struct *pidhash[PIDHASH_SZ];
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wq_write_lock_irqsave(&q->lock, flags);
        wait->flags = 0;
        __add_wait_queue(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wq_write_lock_irqsave(&q->lock, flags);
        wait->flags = WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_tail(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}

void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wq_write_lock_irqsave(&q->lock, flags);
        __remove_wait_queue(q, wait);
        wq_write_unlock_irqrestore(&q->lock, flags);
}
void __init fork_init(unsigned long mempages)
{
        /*
         * The default maximum number of threads is set to a safe
         * value: the thread structures can take up at most half
         * of memory.
         */
        max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;

        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
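/*
 * Illustrative arithmetic (added, not in the original file): on i386 with
 * PAGE_SIZE = 4096 and THREAD_SIZE = 8192, a 128 MB machine has
 * mempages = 32768, so max_threads = 32768 / 2 / 2 = 8192 and the
 * per-user RLIMIT_NPROC default becomes 4096.
 */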
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;         /* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if(p->pid == last_pid   ||
                           p->pgrp == last_pid  ||
                           p->session == last_pid) {
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}
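/*
 * Note (added for clarity): the 0xffff8000 test above keeps PIDs below
 * 0x8000 (32768, the 2.4-era PID_MAX); once the counter wraps it restarts
 * at 300 so the low range used by early boot-time daemons is skipped, and
 * next_safe caches the lowest in-use id above last_pid so the tasklist
 * scan can usually be avoided.
 */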
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_avl = NULL;
        mm->mmap_cache = NULL;
        mm->map_count = 0;
        mm->context = 0;
        mm->cpu_vm_mask = 0;
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        mm->segments = NULL;
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                if(mpnt->vm_flags & VM_DONTCOPY)
                        continue;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_dentry->d_inode;
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);

                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&inode->i_mapping->i_shared_lock);
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                        spin_unlock(&inode->i_mapping->i_shared_lock);
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                tmp->vm_next = *pprev;
                *pprev = tmp;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;
        if (mm->map_count >= AVL_MIN_MAP_COUNT)
                build_mmap_avl(mm);

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}
#define allocate_mm()   (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))

static struct mm_struct * mm_init(struct mm_struct * mm)
{
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        init_MUTEX(&mm->mmap_sem);
        mm->page_table_lock = SPIN_LOCK_UNLOCKED;
        mm->pgd = pgd_alloc();
        if (mm->pgd)
                return mm;
        free_mm(mm);
        return NULL;
}
/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = allocate_mm();
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                return mm_init(mm);
        }
        return NULL;
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
        if (mm == &init_mm) BUG();
        pgd_free(mm->pgd);
        destroy_context(mm);
        free_mm(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_mmap(mm);
                mmdrop(mm);
        }
}
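/*
 * Background sketch (added, not part of the original file): an mm carries
 * two reference counts.  "mm_users" counts tasks really using the address
 * space; when mmput() drops it to zero the mappings are torn down.
 * "mm_count" counts lazy-TLB references plus one for all real users
 * together; mmdrop() is roughly the inline below (approximately as defined
 * in include/linux/sched.h of this era) and calls __mmdrop() when the
 * count reaches zero.
 */
#if 0   /* illustrative only */
static inline void mmdrop(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_count))
                __mmdrop(mm);
}
#endif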
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * on error and success alike.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
        struct task_struct *tsk = current;

        /* notify parent sleeping on vfork() */
        if (tsk->flags & PF_VFORK) {
                tsk->flags &= ~PF_VFORK;
                up(tsk->p_opptr->vfork_sem);
        }
}
static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        mm = current->mm;
        if (!mm)
                return 0;

        if (clone_flags & CLONE_VM) {
                atomic_inc(&mm->mm_users);
                goto good_mm;
        }

        retval = -ENOMEM;
        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Copy the current MM stuff.. */
        memcpy(mm, current->mm, sizeof(*mm));
        if (!mm_init(mm))
                goto fail_nomem;

        tsk->mm = mm;
        tsk->active_mm = mm;

        down(&current->mm->mmap_sem);
        retval = dup_mmap(mm);
        up(&current->mm->mmap_sem);
        if (retval)
                goto free_pt;

        /*
         * child gets a private LDT (if there was an LDT in the parent)
         */
        copy_segments(tsk, mm);

        if (init_new_context(tsk,mm))
                goto free_pt;

good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;

free_pt:
        mmput(mm);
fail_nomem:
        return retval;
}
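/*
 * User-space illustration (added; hypothetical example, not kernel code):
 * whether copy_mm() shares or duplicates the address space is decided by
 * CLONE_VM.  With the flag, the child runs in the parent's mm and only
 * mm_users is bumped; without it, dup_mmap() copies every VMA.
 */
#if 0   /* would compile in user space against the clone(2) wrapper */
#include <sched.h>
#include <signal.h>
#include <stdlib.h>

static int worker(void *arg)
{
        return 0;       /* shares the parent's memory only if CLONE_VM was set */
}

static int spawn(int share_vm)
{
        size_t sz = 64 * 1024;
        char *stack = malloc(sz);
        int flags = SIGCHLD | (share_vm ? CLONE_VM : 0);

        /* the stack grows down on most architectures, so pass its top */
        return clone(worker, stack + sz, flags, NULL);
}
#endif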
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                atomic_set(&fs->count, 1);
                fs->lock = RW_LOCK_UNLOCKED;
                fs->umask = old->umask;
                read_lock(&old->lock);
                fs->rootmnt = mntget(old->rootmnt);
                fs->root = dget(old->root);
                fs->pwdmnt = mntget(old->pwdmnt);
                fs->pwd = dget(old->pwd);
                if (old->altroot) {
                        fs->altrootmnt = mntget(old->altrootmnt);
                        fs->altroot = dget(old->altroot);
                } else {
                        fs->altrootmnt = NULL;
                        fs->altroot = NULL;
                }
                read_unlock(&old->lock);
        }
        return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        return __copy_fs_struct(old);
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = __copy_fs_struct(current->fs);
        if (!tsk->fs)
                return -1;
        return 0;
}
static int count_open_files(struct files_struct *files, int size)
{
        int i;

        /* Find the last open fd */
        for (i = size/(8*sizeof(long)); i > 0; ) {
                if (files->open_fds->fds_bits[--i])
                        break;
        }
        i = (i+1) * 8 * sizeof(long);
        return i;
}
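/*
 * Worked example (added for clarity): on a 32-bit kernel a bitmap word
 * holds 8*sizeof(long) = 32 fds.  With size = 256 the scan starts at word
 * 8; if the highest non-zero word is index 3, the loop stops there and the
 * function returns (3 + 1) * 32 = 128, i.e. the count is rounded up to a
 * whole bitmap word.
 */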
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        newf->file_lock     = RW_LOCK_UNLOCKED;
        newf->next_fd       = 0;
        newf->max_fds       = NR_OPEN_DEFAULT;
        newf->max_fdset     = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds      = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];

        /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                write_lock(&newf->file_lock);
                error = expand_fdset(newf, size);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        read_lock(&oldf->file_lock);

        open_files = count_open_files(oldf, size);

        /*
         * Check whether we need to allocate a larger fd array.
         * Note: we're not a clone task, so the open count won't
         * change.
         */
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                read_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                write_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
                nfds = newf->max_fds;
                read_lock(&oldf->file_lock);
        }

        old_fds = oldf->fd;
        new_fds = newf->fd;

        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f)
                        get_file(f);
                *new_fds++ = f;
        }
        read_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (newf->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned thus could use an optimized version */
        memset(new_fds, 0, size);

        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));

                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }

        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
        kmem_cache_free(files_cachep, newf);
        goto out;
}
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
        tsk->sig = sig;
        if (!sig)
                return -1;
        spin_lock_init(&sig->siglock);
        atomic_set(&sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                p->ptrace = 0;
        if (clone_flags & CLONE_VFORK)
                new_flags |= PF_VFORK;
        p->flags = new_flags;
}
/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It also
 * copies the data segment in its entirety.  The "stack_start" and
 * "stack_top" arguments are simply passed along to the platform
 * specific copy_thread() routine.  Most platforms ignore stack_top.
 * For an example that's using stack_top, see
 * arch/ia64/kernel/process.c.
 */
int do_fork(unsigned long clone_flags, unsigned long stack_start,
            struct pt_regs *regs, unsigned long stack_top)
{
        int retval = -ENOMEM;
        struct task_struct *p;
        DECLARE_MUTEX_LOCKED(sem);

        if (clone_flags & CLONE_PID) {
                /* This is only allowed from the boot up thread */
                if (current->pid)
                        return -EPERM;
        }

        current->vfork_sem = &sem;

        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        retval = -EAGAIN;
        if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
                goto bad_fork_free;
        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);

        /*
         * Counter increases are protected by
         * the kernel lock so nr_threads can't
         * increase under us (but it may decrease).
         */
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;

        get_exec_domain(p->exec_domain);

        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);

        p->run_list.next = NULL;
        p->run_list.prev = NULL;

        if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
                p->p_opptr = current;
                if (!(p->ptrace & PT_PTRACED))
                        p->p_pptr = current;
        }
        p->p_cptr = NULL;
        init_waitqueue_head(&p->wait_chldexit);
        p->vfork_sem = NULL;
        spin_lock_init(&p->alloc_lock);

        p->sigpending = 0;
        init_sigpending(&p->pending);

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;          /* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
        {
                int i;
                p->has_cpu = 0;
                p->processor = current->processor;
                /* ?? should we just memset this ?? */
                for(i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;

        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(0, clone_flags, stack_start, stack_top, p, regs);
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* Our parent execution domain becomes current domain
           These must match for thread signalling to apply */

        p->parent_exec_id = p->self_exec_id;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, thus the
         * total amount of dynamic priorities in the system doesn't change,
         * more scheduling fairness. This is only important in the first
         * timeslice, on the long run the scheduling behaviour is unchanged.
         */
        p->counter = (current->counter + 1) >> 1;
        current->counter >>= 1;
        if (!current->counter)
                current->need_resched = 1;
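        /*
         * Worked example (added): if the parent had counter = 11, the child
         * gets (11 + 1) >> 1 = 6 ticks and the parent keeps 11 >> 1 = 5, so
         * the pair together still holds roughly the original timeslice.
         */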
        /*
         * Ok, add it to the run-queues and make it
         * visible to the rest of the system.
         *
         * Let it rip!
         */
        retval = p->pid;
        p->tgid = retval;
        INIT_LIST_HEAD(&p->thread_group);
        write_lock_irq(&tasklist_lock);
        if (clone_flags & CLONE_THREAD) {
                p->tgid = current->tgid;
                list_add(&p->thread_group, &current->thread_group);
        }
        SET_LINKS(p);
        hash_pid(p);
        nr_threads++;
        write_unlock_irq(&tasklist_lock);

        if (p->ptrace & PT_PTRACED)
                send_sig(SIGSTOP, p, 1);

        wake_up_process(p);             /* do this last */
        ++total_forks;

fork_out:
        if ((clone_flags & CLONE_VFORK) && (retval > 0))
                down(&sem);
        return retval;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        put_exec_domain(p->exec_domain);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
        atomic_dec(&p->user->processes);
        free_uid(p->user);
bad_fork_free:
        free_task_struct(p);
        goto fork_out;
}
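/*
 * Call-site sketch (added; modelled on the i386 arch code of this era and
 * shown only as an illustration): the fork-family system calls are thin
 * wrappers that pass the user stack pointer and register frame through to
 * do_fork().
 */
#if 0   /* roughly what arch/i386/kernel/process.c does */
asmlinkage int sys_fork(struct pt_regs regs)
{
        return do_fork(SIGCHLD, regs.esp, &regs, 0);
}

asmlinkage int sys_vfork(struct pt_regs regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
}
#endif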
/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;
void __init proc_caches_init(void)
{
        sigact_cachep = kmem_cache_create("signal_act",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!sigact_cachep)
                panic("Cannot create signal action SLAB cache");

        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!files_cachep)
                panic("Cannot create files SLAB cache");

        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!fs_cachep)
                panic("Cannot create fs_struct SLAB cache");

        vm_area_cachep = kmem_cache_create("vm_area_struct",
                        sizeof(struct vm_area_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!vm_area_cachep)
                panic("vma_init: Cannot alloc vm_area_struct SLAB cache");

        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), 0,
                        SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!mm_cachep)
                panic("vma_init: Cannot alloc mm_struct SLAB cache");
}