4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * #!-checking implemented by tytso.
11 * Demand-loading implemented 01.12.91 - no need to read anything but
12 * the header into memory. The inode of the executable is put into
13 * "current->executable", and page faults do the actual loading. Clean.
15 * Once more I can proudly say that linux stood up to being changed: it
16 * was less than 2 hours work to get demand-loading completely implemented.
18 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
19 * current->executable is only used by the procfs. This allows a dispatch
20 * table to check for several different types of binary formats. We keep
21 * trying until we recognize the file or we run out of supported binary
25 #include <linux/config.h>
26 #include <linux/slab.h>
27 #include <linux/file.h>
28 #include <linux/mman.h>
29 #include <linux/a.out.h>
30 #include <linux/stat.h>
31 #include <linux/fcntl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/init.h>
34 #include <linux/pagemap.h>
35 #include <linux/highmem.h>
36 #include <linux/spinlock.h>
37 #define __NO_VERSION__
38 #include <linux/module.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgalloc.h>
42 #include <asm/mmu_context.h>
45 #include <linux/kmod.h>
48 static struct linux_binfmt
*formats
;
49 static rwlock_t binfmt_lock
= RW_LOCK_UNLOCKED
;
/*
 * register_binfmt - operate on the global format list on behalf of 'fmt'.
 *
 * Visible in this view: a cursor 'tmp' is initialised to &formats,
 * binfmt_lock is taken for writing once and released on two separate
 * paths.
 * NOTE(review): the statements between those lock calls (list walk,
 * insertion, return values) are not present in this view; presumably
 * this appends 'fmt' to the list - confirm against the complete source.
 */
51 int register_binfmt(struct linux_binfmt
* fmt
)
53 struct linux_binfmt
** tmp
= &formats
;
59 write_lock(&binfmt_lock
);
62 write_unlock(&binfmt_lock
);
69 write_unlock(&binfmt_lock
);
/*
 * unregister_binfmt - counterpart of register_binfmt() for 'fmt'.
 *
 * Visible in this view: a cursor 'tmp' starts at &formats, binfmt_lock
 * is taken for writing and dropped again on two separate paths.
 * NOTE(review): the list walk and the unlink itself are not present in
 * this view; presumably 'fmt' is removed from the list - confirm
 * against the complete source.
 */
73 int unregister_binfmt(struct linux_binfmt
* fmt
)
75 struct linux_binfmt
** tmp
= &formats
;
77 write_lock(&binfmt_lock
);
81 write_unlock(&binfmt_lock
);
86 write_unlock(&binfmt_lock
);
/*
 * put_binfmt - drop one module use-count reference held on a format
 * handler by calling __MOD_DEC_USE_COUNT() on fmt->module.
 * NOTE(review): a line between the signature and the call is missing
 * from this view; presumably it guards against a NULL fmt->module -
 * confirm.
 */
90 static inline void put_binfmt(struct linux_binfmt
* fmt
)
93 __MOD_DEC_USE_COUNT(fmt
->module
);
97 * Note that a shared library must be both readable and executable due to
100 * Also note that we take the address to load from the file itself.
/*
 * sys_uselib - system call taking the user-space path of a library.
 *
 * Steps visible in this view:
 *   - the path is resolved with user_path_walk() into 'nd';
 *   - the inode mode is tested with S_ISREG();
 *   - permission() is asked for MAY_READ | MAY_EXEC on the inode;
 *   - the dentry is opened O_RDONLY and 'error' is primed with
 *     PTR_ERR(file);
 *   - if the file has a ->read operation, the registered formats are
 *     walked under read_lock(&binfmt_lock); formats are tested for a
 *     missing ->load_shlib and for try_inc_mod_count() failure; the lock
 *     is dropped around the ->load_shlib(file) call and re-taken
 *     afterwards; the result is compared against -ENOEXEC.
 * NOTE(review): the statements guarded by those tests (continue/break,
 * error exits, reference drops) are not present in this view.
 */
102 asmlinkage
long sys_uselib(const char * library
)
108 error
= user_path_walk(library
, &nd
);
113 if (!S_ISREG(nd
.dentry
->d_inode
->i_mode
))
116 error
= permission(nd
.dentry
->d_inode
, MAY_READ
| MAY_EXEC
);
120 file
= dentry_open(nd
.dentry
, nd
.mnt
, O_RDONLY
);
121 error
= PTR_ERR(file
);
126 if(file
->f_op
&& file
->f_op
->read
) {
127 struct linux_binfmt
* fmt
;
129 read_lock(&binfmt_lock
);
130 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
131 if (!fmt
->load_shlib
)
133 if (!try_inc_mod_count(fmt
->module
))
135 read_unlock(&binfmt_lock
);
136 error
= fmt
->load_shlib(file
);
137 read_lock(&binfmt_lock
);
139 if (error
!= -ENOEXEC
)
142 read_unlock(&binfmt_lock
);
153 * count() counts the number of arguments/envelopes
/*
 * count - takes a pointer array 'argv' and a limit 'max'.
 *
 * The only statement visible in this view fetches one pointer from
 * 'argv' with get_user(), so 'argv' is treated as a user-space address.
 * NOTE(review): the loop, the use of 'max' and the return value are not
 * present in this view.
 */
155 static int count(char ** argv
, int max
)
164 error
= get_user(p
,argv
);
178 * 'copy_strings()' copies argument/envelope strings from user
179 * memory to free pages in kernel mem. These are in a format ready
180 * to be put directly into the top of new user memory.
/*
 * copy_strings - copy 'argc' strings referenced by 'argv' into the
 * pages held in bprm->page[].
 *
 * Visible in this view:
 *   - each string pointer is read with get_user(str, argv+argc) and its
 *     length taken with strnlen_user(), bounded by bprm->p; a fetch
 *     failure, NULL pointer or zero length is tested for;
 *   - the target page is bprm->page[i]; alloc_page(GFP_HIGHUSER) is
 *     called and the result stored back into that slot;
 *   - 'offset' is the position within the page (pos % PAGE_SIZE); the
 *     first 'offset' bytes of the page are zeroed with memset();
 *   - bytes_to_copy starts as PAGE_SIZE - offset and is compared with
 *     'len'; the tail after offset+len is zeroed;
 *   - the data is brought in with copy_from_user(), after which pos and
 *     str advance and len shrinks by bytes_to_copy.
 * NOTE(review): the loop structure, page mapping (kaddr) and error
 * returns are not present in this view.
 */
182 int copy_strings(int argc
,char ** argv
, struct linux_binprm
*bprm
)
189 if (get_user(str
, argv
+argc
) || !str
|| !(len
= strnlen_user(str
, bprm
->p
)))
195 /* XXX: add architecture specific overflow check here. */
202 int offset
, bytes_to_copy
;
204 offset
= pos
% PAGE_SIZE
;
206 page
= bprm
->page
[i
];
209 page
= alloc_page(GFP_HIGHUSER
);
210 bprm
->page
[i
] = page
;
218 memset(kaddr
, 0, offset
);
219 bytes_to_copy
= PAGE_SIZE
- offset
;
220 if (bytes_to_copy
> len
) {
223 memset(kaddr
+offset
+len
, 0, PAGE_SIZE
-offset
-len
);
225 err
= copy_from_user(kaddr
+ offset
, str
, bytes_to_copy
);
231 pos
+= bytes_to_copy
;
232 str
+= bytes_to_copy
;
233 len
-= bytes_to_copy
;
240 * Like copy_strings, but get argv and its values from kernel memory.
/*
 * copy_strings_kernel - wrapper that forwards argc/argv/bprm to
 * copy_strings() and keeps its result in 'r'.
 *
 * The current address-space limit is saved in 'oldfs' via get_fs()
 * before the call.
 * NOTE(review): the set_fs() calls that would switch and restore the
 * limit, and the return statement, are not present in this view -
 * confirm against the complete source.
 */
242 int copy_strings_kernel(int argc
,char ** argv
, struct linux_binprm
*bprm
)
245 mm_segment_t oldfs
= get_fs();
247 r
= copy_strings(argc
, argv
, bprm
);
253 * This routine is used to map in a page into an address space: needed by
254 * execve() for the initial stack and environment pages.
/*
 * put_dirty_page - install 'page' at user address 'address' in the
 * page tables of tsk->mm.
 *
 * Visible in this view:
 *   - a diagnostic is printed when page_count(page) is not 1;
 *   - the page-table walk goes pgd_offset() -> pmd_alloc() ->
 *     pte_alloc();
 *   - force_sig(SIGKILL, tsk) appears on two paths (presumably the
 *     pmd/pte allocation failures - the conditions are not visible);
 *   - an already populated pte is tested for with pte_none();
 *   - caches are flushed for the page, then the pte is set to a dirty,
 *     writable PAGE_COPY mapping of 'page'.
 */
256 void put_dirty_page(struct task_struct
* tsk
, struct page
*page
, unsigned long address
)
262 if (page_count(page
) != 1)
263 printk("mem_map disagrees with %p at %08lx\n", page
, address
);
264 pgd
= pgd_offset(tsk
->mm
, address
);
265 pmd
= pmd_alloc(pgd
, address
);
268 force_sig(SIGKILL
, tsk
);
271 pte
= pte_alloc(pmd
, address
);
274 force_sig(SIGKILL
, tsk
);
277 if (!pte_none(*pte
)) {
282 flush_dcache_page(page
);
283 flush_page_to_ram(page
);
284 set_pte(pte
, pte_mkdirty(pte_mkwrite(mk_pte(page
, PAGE_COPY
))));
285 /* no need for flush_tlb */
/*
 * setup_arg_pages - place the pages collected in bprm->page[] at the
 * top of the current process' address space.
 *
 * Visible in this view:
 *   - stack_base is STACK_TOP minus MAX_ARG_PAGES pages; bprm->p,
 *     bprm->loader and bprm->exec are rebased by adding stack_base;
 *   - a vm_area_struct is allocated from vm_area_cachep and, with
 *     mmap_sem held, filled in to span from the page containing bprm->p
 *     up to STACK_TOP with PAGE_COPY protection and VM_STACK_FLAGS;
 *   - the area is inserted with insert_vm_struct() under
 *     page_table_lock and mm->total_vm is set to its size in pages;
 *   - each of the MAX_ARG_PAGES slots is read and cleared; pages are
 *     handed to put_dirty_page() at stack_base, which advances one page
 *     per slot;
 *   - mmap_sem is released at the end.
 * NOTE(review): the allocation-failure path and return value are not
 * present in this view. The '¤t' sequences look like a mis-decoded
 * '&current' - confirm against the upstream file.
 */
288 int setup_arg_pages(struct linux_binprm
*bprm
)
290 unsigned long stack_base
;
291 struct vm_area_struct
*mpnt
;
294 stack_base
= STACK_TOP
- MAX_ARG_PAGES
*PAGE_SIZE
;
296 bprm
->p
+= stack_base
;
298 bprm
->loader
+= stack_base
;
299 bprm
->exec
+= stack_base
;
301 mpnt
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
305 down(¤t
->mm
->mmap_sem
);
307 mpnt
->vm_mm
= current
->mm
;
308 mpnt
->vm_start
= PAGE_MASK
& (unsigned long) bprm
->p
;
309 mpnt
->vm_end
= STACK_TOP
;
310 mpnt
->vm_page_prot
= PAGE_COPY
;
311 mpnt
->vm_flags
= VM_STACK_FLAGS
;
314 mpnt
->vm_file
= NULL
;
315 mpnt
->vm_private_data
= (void *) 0;
316 spin_lock(¤t
->mm
->page_table_lock
);
317 insert_vm_struct(current
->mm
, mpnt
);
318 spin_unlock(¤t
->mm
->page_table_lock
);
319 current
->mm
->total_vm
= (mpnt
->vm_end
- mpnt
->vm_start
) >> PAGE_SHIFT
;
322 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
323 struct page
*page
= bprm
->page
[i
];
325 bprm
->page
[i
] = NULL
;
327 put_dirty_page(current
,page
,stack_base
);
329 stack_base
+= PAGE_SIZE
;
331 up(¤t
->mm
->mmap_sem
);
/*
 * open_exec - look up 'name' and return an open struct file for it.
 *
 * Visible in this view:
 *   - lookup uses path_init() with LOOKUP_FOLLOW | LOOKUP_POSITIVE
 *     followed by path_walk();
 *   - the result defaults to ERR_PTR(-EACCES);
 *   - only when the inode is not IS_NOEXEC() and is a regular file does
 *     the code ask permission() for MAY_EXEC, open the dentry O_RDONLY
 *     and call deny_write_access() on the new file.
 * NOTE(review): the handling of each 'err' result and the release of
 * 'nd' are not present in this view.
 */
336 struct file
*open_exec(const char *name
)
343 if (path_init(name
, LOOKUP_FOLLOW
|LOOKUP_POSITIVE
, &nd
))
344 err
= path_walk(name
, &nd
);
347 inode
= nd
.dentry
->d_inode
;
348 file
= ERR_PTR(-EACCES
);
349 if (!IS_NOEXEC(inode
) && S_ISREG(inode
->i_mode
)) {
350 int err
= permission(inode
, MAY_EXEC
);
353 file
= dentry_open(nd
.dentry
, nd
.mnt
, O_RDONLY
);
355 err
= deny_write_access(file
);
/*
 * kernel_read - read 'count' bytes from 'file' into 'addr' through the
 * file's own ->read operation.
 *
 * 'result' starts as -ENOSYS; a missing file->f_op->read is tested for
 * before the call. The ->read call is passed the address of a local
 * 'pos'.
 * NOTE(review): the initialisation of 'pos' from 'offset', any
 * address-limit switching and the return are not present in this view.
 */
370 int kernel_read(struct file
*file
, unsigned long offset
,
371 char * addr
, unsigned long count
)
375 int result
= -ENOSYS
;
377 if (!file
->f_op
->read
)
381 result
= file
->f_op
->read(file
, addr
, count
, &pos
);
/*
 * exec_mmap - give the current task the address space it will run the
 * new program in.
 *
 * Visible in this view:
 *   - when the task has an mm whose mm_users count is exactly 1, the
 *     caches and TLB are flushed for that mm;
 *   - otherwise a different 'mm' is prepared with init_new_context(),
 *     published as current->active_mm, the task lock is dropped and
 *     activate_mm() switches from the previous active_mm to it;
 *   - BUG() fires if the previous active_mm was not old_mm.
 * NOTE(review): where 'mm' comes from, what is done to old_mm on either
 * path, the matching task_lock() and the return values are not present
 * in this view.
 */
387 static int exec_mmap(void)
389 struct mm_struct
* mm
, * old_mm
;
391 old_mm
= current
->mm
;
392 if (old_mm
&& atomic_read(&old_mm
->mm_users
) == 1) {
393 flush_cache_mm(old_mm
);
396 flush_tlb_mm(old_mm
);
402 struct mm_struct
*active_mm
= current
->active_mm
;
404 if (init_new_context(current
, mm
)) {
410 current
->active_mm
= mm
;
411 task_unlock(current
);
412 activate_mm(active_mm
, mm
);
415 if (active_mm
!= old_mm
) BUG();
426 * This function makes sure the current process has its own signal table,
427 * so that flush_signal_handlers can later reset the handlers without
428 * disturbing other processes. (Other processes might share the signal
429 * table via the CLONE_SIGNAL option to clone().)
/*
 * make_private_signals - give the current task its own signal_struct.
 *
 * Visible in this view:
 *   - the share count of current->sig is compared against 1;
 *   - a new signal_struct is allocated from sigact_cachep, its lock
 *     initialised and its count set to 1;
 *   - the action table is copied from the existing structure;
 *   - current->sig is switched to the copy under sigmask_lock with
 *     interrupts disabled.
 * The old structure is not released in the visible code.
 * NOTE(review): the early return, the allocation-failure check and the
 * return values are not present in this view. The '¤t' sequences look
 * like a mis-decoded '&current' - confirm against the upstream file.
 */
432 static inline int make_private_signals(void)
434 struct signal_struct
* newsig
;
436 if (atomic_read(¤t
->sig
->count
) <= 1)
438 newsig
= kmem_cache_alloc(sigact_cachep
, GFP_KERNEL
);
441 spin_lock_init(&newsig
->siglock
);
442 atomic_set(&newsig
->count
, 1);
443 memcpy(newsig
->action
, current
->sig
->action
, sizeof(newsig
->action
));
444 spin_lock_irq(¤t
->sigmask_lock
);
445 current
->sig
= newsig
;
446 spin_unlock_irq(¤t
->sigmask_lock
);
451 * If make_private_signals() made a copy of the signal table, decrement the
452 * refcount of the original table, and free it if necessary.
453 * We don't do that in make_private_signals() so that we can back off
454 * in flush_old_exec() if an error occurs after calling make_private_signals().
/*
 * release_old_signals - drop the reference on 'oldsig'.
 *
 * current->sig is compared with 'oldsig' first; the count of 'oldsig'
 * is decremented and, when it reaches zero, the structure is returned
 * to sigact_cachep.
 * NOTE(review): the statement guarded by the first comparison is not
 * present in this view; presumably it returns early when no private
 * copy was made - confirm.
 */
457 static inline void release_old_signals(struct signal_struct
* oldsig
)
459 if (current
->sig
== oldsig
)
461 if (atomic_dec_and_test(&oldsig
->count
))
462 kmem_cache_free(sigact_cachep
, oldsig
);
466 * These functions flush out all traces of the currently running executable
467 * so that a new one can be started
/*
 * flush_old_files - process the close-on-exec bitmap of 'files'.
 *
 * Visible in this view:
 *   - files->file_lock is held for writing while one word of
 *     close_on_exec->fds_bits[] is read into 'set' and then cleared;
 *   - the index 'i' is bounded by both max_fds and max_fdset;
 *   - the lock is released before the bits of 'set' are walked one at a
 *     time (set >>= 1, i++), and re-taken afterwards;
 *   - the lock is released once more at the end.
 * NOTE(review): the outer loop, the computation of i/j and the action
 * taken for each set bit are not present in this view.
 */
470 static inline void flush_old_files(struct files_struct
* files
)
474 write_lock(&files
->file_lock
);
476 unsigned long set
, i
;
480 if (i
>= files
->max_fds
|| i
>= files
->max_fdset
)
482 set
= files
->close_on_exec
->fds_bits
[j
];
485 files
->close_on_exec
->fds_bits
[j
] = 0;
486 write_unlock(&files
->file_lock
);
487 for ( ; set
; i
++,set
>>= 1) {
492 write_lock(&files
->file_lock
);
495 write_unlock(&files
->file_lock
);
499 * An execve() will automatically "de-thread" the process.
500 * Note: we don't have to hold the tasklist_lock to test
501 * whether we might need to do this. If we're not part of
502 * a thread group, there is no way we can become one
503 * dynamically. And if we are, we only need to protect the
504 * unlink - even if we race with the last other thread exit,
505 * at worst the list_del_init() might end up being a no-op.
507 static inline void de_thread(struct task_struct
*tsk
)
509 if (!list_empty(&tsk
->thread_group
)) {
510 write_lock_irq(&tasklist_lock
);
511 list_del_init(&tsk
->thread_group
);
512 write_unlock_irq(&tasklist_lock
);
515 /* Minor oddity: this might stay the same. */
516 tsk
->tgid
= tsk
->pid
;
/*
 * flush_old_exec - discard the state of the currently running program
 * so that the image described by 'bprm' can take over.
 *
 * Order of the visible steps:
 *   1. remember current->sig and call make_private_signals();
 *   2. call exec_mmap(); a failure in either step jumps to a recovery
 *      label;
 *   3. release_old_signals() on the remembered structure;
 *   4. clear the alternate signal stack fields;
 *   5. set current->dumpable when euid == uid and egid == gid;
 *   6. fill current->comm from bprm->filename and NUL-terminate it;
 *   7. clear current->dumpable again when bprm's e_uid/e_gid differ from
 *      the current ones or permission(..., MAY_READ) on the file is
 *      non-zero;
 *   8. bump self_exec_id, flush signal handlers and close-on-exec files.
 * The last visible lines restore current->sig to the remembered value
 * under sigmask_lock, which matches the recovery path for step 2.
 * NOTE(review): labels, returns and the body of the comm-copy loop are
 * not present in this view. The '¤t' sequences look like a mis-decoded
 * '&current' - confirm against the upstream file.
 */
519 int flush_old_exec(struct linux_binprm
* bprm
)
523 struct signal_struct
* oldsig
;
526 * Make sure we have a private signal table
528 oldsig
= current
->sig
;
529 retval
= make_private_signals();
530 if (retval
) goto flush_failed
;
533 * Release all of the old mmap stuff
535 retval
= exec_mmap();
536 if (retval
) goto mmap_failed
;
538 /* This is the point of no return */
539 release_old_signals(oldsig
);
541 current
->sas_ss_sp
= current
->sas_ss_size
= 0;
543 if (current
->euid
== current
->uid
&& current
->egid
== current
->gid
)
544 current
->dumpable
= 1;
545 name
= bprm
->filename
;
546 for (i
=0; (ch
= *(name
++)) != '\0';) {
551 current
->comm
[i
++] = ch
;
553 current
->comm
[i
] = '\0';
559 if (bprm
->e_uid
!= current
->euid
|| bprm
->e_gid
!= current
->egid
||
560 permission(bprm
->file
->f_dentry
->d_inode
,MAY_READ
))
561 current
->dumpable
= 0;
563 /* An exec changes our domain. We are no longer part of the thread
566 current
->self_exec_id
++;
568 flush_signal_handlers(current
);
569 flush_old_files(current
->files
);
575 spin_lock_irq(¤t
->sigmask_lock
);
576 if (current
->sig
!= oldsig
)
578 current
->sig
= oldsig
;
579 spin_unlock_irq(¤t
->sigmask_lock
);
584 * We mustn't allow tracing of suid binaries, unless
585 * the tracer has the capability to trace anything..
587 static inline int must_not_trace_exec(struct task_struct
* p
)
589 return (p
->ptrace
& PT_PTRACED
) && !cap_raised(p
->p_pptr
->cap_effective
, CAP_SYS_PTRACE
);
593 * Fill the binprm structure from the inode.
594 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
/*
 * prepare_binprm - derive the credentials the new image would run with
 * and read the start of the file into bprm->buf.
 *
 * Visible in this view:
 *   - the inode mode is required to have at least one execute bit, and
 *     the file is required to have file operations;
 *   - e_uid/e_gid default to the caller's euid/egid;
 *   - S_ISUID switches e_uid to the file owner; S_ISGID together with
 *     S_IXGRP switches e_gid to the file group; each change is compared
 *     with the caller's current identity (euid, group membership);
 *   - the three capability sets in bprm are cleared, then, unless
 *     SECURE_NOROOT is in force, inheritable and permitted are filled
 *     when e_uid or the real uid is 0, and effective when e_uid is 0;
 *   - the prospective permitted set (file permitted masked by cap_bset,
 *     combined with the intersection of the inheritable sets) is tested
 *     for being a subset of the caller's permitted set;
 *   - when an id change or capability raise was noted, further tests
 *     involve tracing and shared fs/sig/files structures, and
 *     CAP_SETUID / CAP_SETPCAP;
 *   - finally bprm->buf is zeroed and BINPRM_BUF_SIZE bytes are read
 *     from offset 0; the kernel_read() result is returned.
 * NOTE(review): the statements guarded by most of these tests (flag
 * assignments, error returns) are not present in this view. The '¤t'
 * sequences look like a mis-decoded '&current' - confirm against the
 * upstream file.
 */
596 int prepare_binprm(struct linux_binprm
*bprm
)
599 int id_change
,cap_raised
;
600 struct inode
* inode
= bprm
->file
->f_dentry
->d_inode
;
602 mode
= inode
->i_mode
;
603 /* Huh? We had already checked for MAY_EXEC, WTF do we check this? */
604 if (!(mode
& 0111)) /* with at least _one_ execute bit set */
606 if (bprm
->file
->f_op
== NULL
)
609 bprm
->e_uid
= current
->euid
;
610 bprm
->e_gid
= current
->egid
;
611 id_change
= cap_raised
= 0;
614 if (mode
& S_ISUID
) {
615 bprm
->e_uid
= inode
->i_uid
;
616 if (bprm
->e_uid
!= current
->euid
)
622 * If setgid is set but no group execute bit then this
623 * is a candidate for mandatory locking, not a setgid
626 if ((mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
)) {
627 bprm
->e_gid
= inode
->i_gid
;
628 if (!in_group_p(bprm
->e_gid
))
632 /* We don't have VFS support for capabilities yet */
633 cap_clear(bprm
->cap_inheritable
);
634 cap_clear(bprm
->cap_permitted
);
635 cap_clear(bprm
->cap_effective
);
637 /* To support inheritance of root-permissions and suid-root
638 * executables under compatibility mode, we raise all three
639 * capability sets for the file.
641 * If only the real uid is 0, we only raise the inheritable
642 * and permitted sets of the executable file.
645 if (!issecure(SECURE_NOROOT
)) {
646 if (bprm
->e_uid
== 0 || current
->uid
== 0) {
647 cap_set_full(bprm
->cap_inheritable
);
648 cap_set_full(bprm
->cap_permitted
);
650 if (bprm
->e_uid
== 0)
651 cap_set_full(bprm
->cap_effective
);
654 /* Only if pP' is _not_ a subset of pP, do we consider there
655 * has been a capability related "change of capability". In
656 * such cases, we need to check that the elevation of
657 * privilege does not go against other system constraints.
658 * The new Permitted set is defined below -- see (***). */
660 kernel_cap_t permitted
, working
;
662 permitted
= cap_intersect(bprm
->cap_permitted
, cap_bset
);
663 working
= cap_intersect(bprm
->cap_inheritable
,
664 current
->cap_inheritable
);
665 working
= cap_combine(permitted
, working
);
666 if (!cap_issubset(working
, current
->cap_permitted
)) {
671 if (id_change
|| cap_raised
) {
672 /* We can't suid-execute if we're sharing parts of the executable */
673 /* or if we're being traced (or if suid execs are not allowed) */
674 /* (current->mm->mm_users > 1 is ok, as we'll get a new mm anyway) */
676 || must_not_trace_exec(current
)
677 || (atomic_read(¤t
->fs
->count
) > 1)
678 || (atomic_read(¤t
->sig
->count
) > 1)
679 || (atomic_read(¤t
->files
->count
) > 1)) {
680 if (id_change
&& !capable(CAP_SETUID
))
682 if (cap_raised
&& !capable(CAP_SETPCAP
))
687 memset(bprm
->buf
,0,BINPRM_BUF_SIZE
);
688 return kernel_read(bprm
->file
,0,bprm
->buf
,BINPRM_BUF_SIZE
);
692 * This function is used to produce the new IDs and capabilities
693 * from the old ones and the file's capabilities.
695 * The formula used for evolving capabilities is:
698 * (***) pP' = (fP & X) | (fI & pI)
699 * pE' = pP' & fE [NB. fE is 0 or ~0]
701 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
702 * ' indicates post-exec(), and X is the global 'cap_bset'.
705 void compute_creds(struct linux_binprm
*bprm
)
707 kernel_cap_t new_permitted
, working
;
709 new_permitted
= cap_intersect(bprm
->cap_permitted
, cap_bset
);
710 working
= cap_intersect(bprm
->cap_inheritable
,
711 current
->cap_inheritable
);
712 new_permitted
= cap_combine(new_permitted
, working
);
714 /* For init, we want to retain the capabilities set
715 * in the init_task struct. Thus we skip the usual
716 * capability rules */
717 if (current
->pid
!= 1) {
718 current
->cap_permitted
= new_permitted
;
719 current
->cap_effective
=
720 cap_intersect(new_permitted
, bprm
->cap_effective
);
723 /* AUD: Audit candidate if current->cap_effective is set */
725 current
->suid
= current
->euid
= current
->fsuid
= bprm
->e_uid
;
726 current
->sgid
= current
->egid
= current
->fsgid
= bprm
->e_gid
;
727 if (current
->euid
!= current
->uid
|| current
->egid
!= current
->gid
||
728 !cap_issubset(new_permitted
, current
->cap_permitted
))
729 current
->dumpable
= 0;
731 current
->keep_capabilities
= 0;
/*
 * remove_arg_zero - advance bprm->p past one NUL-terminated string
 * stored in the bprm->page[] pages.
 *
 * Visible in this view: 'offset' is the position of bprm->p within its
 * page; the loop increments bprm->p and 'offset' for every byte and
 * stops on a zero byte; inside the loop 'offset' is compared with
 * PAGE_SIZE and 'page' is reloaded from bprm->page[bprm->p/PAGE_SIZE].
 * NOTE(review): the page mapping that produces 'kaddr', the page
 * crossing handling and any argc adjustment are not present in this
 * view.
 */
735 void remove_arg_zero(struct linux_binprm
*bprm
)
738 unsigned long offset
;
742 offset
= bprm
->p
% PAGE_SIZE
;
745 while (bprm
->p
++, *(kaddr
+offset
++)) {
746 if (offset
!= PAGE_SIZE
)
751 page
= bprm
->page
[bprm
->p
/PAGE_SIZE
];
760 * cycle through the list of binary format handlers until one recognizes the image
/*
 * search_binary_handler - offer 'bprm' to each registered binary
 * format until one of them handles it.
 *
 * Visible in this view:
 *   - a special case keyed on the header in bprm->buf (f_magic 0x183
 *     with both 0x3000 flag bits set, and no loader chosen yet): write
 *     access to the current file is re-allowed, a scratch linux_binprm
 *     is reset, "/sbin/loader" is opened with open_exec(), bprm->loader
 *     is recorded and prepare_binprm() is run again;
 *   - up to two passes over the format list under
 *     read_lock(&binfmt_lock); for each format the ->load_binary hook is
 *     fetched, a module reference is attempted with
 *     try_inc_mod_count(), and the lock is dropped around the hook call;
 *   - after the hook, write access is re-allowed on bprm->file and
 *     current->did_exec is set on one path; the lock is re-taken and
 *     the result compared with -ENOEXEC;
 *   - when the list is exhausted and the first four bytes of the buffer
 *     are not all printable, a module named "binfmt-%04x" (built from
 *     the 16-bit value at buf[2]) is requested.
 * NOTE(review): the statements guarded by the tests, the success/failure
 * returns and the conditional compilation around the loader and module
 * request code are not present in this view.
 */
762 int search_binary_handler(struct linux_binprm
*bprm
,struct pt_regs
*regs
)
765 struct linux_binfmt
*fmt
;
767 /* handle /sbin/loader.. */
769 struct exec
* eh
= (struct exec
*) bprm
->buf
;
770 struct linux_binprm bprm_loader
;
772 if (!bprm
->loader
&& eh
->fh
.f_magic
== 0x183 &&
773 (eh
->fh
.f_flags
& 0x3000) == 0x3000)
776 char * dynloader
[] = { "/sbin/loader" };
779 allow_write_access(bprm
->file
);
783 bprm_loader
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
784 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) /* clear page-table */
785 bprm_loader
.page
[i
] = NULL
;
787 file
= open_exec(dynloader
[0]);
788 retval
= PTR_ERR(file
);
792 bprm
->loader
= bprm_loader
.p
;
793 retval
= prepare_binprm(bprm
);
796 /* should call search_binary_handler recursively here,
797 but it does not matter */
801 for (try=0; try<2; try++) {
802 read_lock(&binfmt_lock
);
803 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
804 int (*fn
)(struct linux_binprm
*, struct pt_regs
*) = fmt
->load_binary
;
807 if (!try_inc_mod_count(fmt
->module
))
809 read_unlock(&binfmt_lock
);
810 retval
= fn(bprm
, regs
);
813 allow_write_access(bprm
->file
);
817 current
->did_exec
= 1;
820 read_lock(&binfmt_lock
);
822 if (retval
!= -ENOEXEC
)
825 read_unlock(&binfmt_lock
);
829 read_unlock(&binfmt_lock
);
830 if (retval
!= -ENOEXEC
) {
834 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
836 if (printable(bprm
->buf
[0]) &&
837 printable(bprm
->buf
[1]) &&
838 printable(bprm
->buf
[2]) &&
839 printable(bprm
->buf
[3]))
840 break; /* -ENOEXEC */
841 sprintf(modname
, "binfmt-%04x", *(unsigned short *)(&bprm
->buf
[2]));
842 request_module(modname
);
851 * sys_execve() executes a new program.
/*
 * do_execve - build a linux_binprm for 'filename' and hand it to the
 * binary format handlers.
 *
 * Order of the visible steps:
 *   1. open the file with open_exec(); 'retval' is primed with
 *      PTR_ERR(file);
 *   2. start bprm.p one pointer below the top of the MAX_ARG_PAGES
 *      argument area and clear the page table bprm.page[];
 *   3. count argv and envp with count(), each limited to
 *      bprm.p / sizeof(void *) entries; a negative count re-allows
 *      write access on the file;
 *   4. prepare_binprm();
 *   5. copy the filename (kernel memory), then envp, then argv into the
 *      argument pages;
 *   6. search_binary_handler().
 * The tail re-allows write access on bprm.file and walks bprm.page[],
 * per its own comment to free the argument pages after a failure.
 * NOTE(review): the checks on each 'retval', the returns and the
 * per-page release call are not present in this view.
 */
853 int do_execve(char * filename
, char ** argv
, char ** envp
, struct pt_regs
* regs
)
855 struct linux_binprm bprm
;
860 file
= open_exec(filename
);
862 retval
= PTR_ERR(file
);
866 bprm
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
867 memset(bprm
.page
, 0, MAX_ARG_PAGES
*sizeof(bprm
.page
[0]));
870 bprm
.filename
= filename
;
874 if ((bprm
.argc
= count(argv
, bprm
.p
/ sizeof(void *))) < 0) {
875 allow_write_access(file
);
880 if ((bprm
.envc
= count(envp
, bprm
.p
/ sizeof(void *))) < 0) {
881 allow_write_access(file
);
886 retval
= prepare_binprm(&bprm
);
890 retval
= copy_strings_kernel(1, &bprm
.filename
, &bprm
);
895 retval
= copy_strings(bprm
.envc
, envp
, &bprm
);
899 retval
= copy_strings(bprm
.argc
, argv
, &bprm
);
903 retval
= search_binary_handler(&bprm
,regs
);
909 /* Something went wrong, return the inode and free the argument pages*/
910 allow_write_access(bprm
.file
);
914 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
915 struct page
* page
= bprm
.page
[i
];
923 void set_binfmt(struct linux_binfmt
*new)
925 struct linux_binfmt
*old
= current
->binfmt
;
926 if (new && new->module
)
927 __MOD_INC_USE_COUNT(new->module
);
928 current
->binfmt
= new;
929 if (old
&& old
->module
)
930 __MOD_DEC_USE_COUNT(old
->module
);
/*
 * do_coredump - write a core file for the current task through its
 * binary format's ->core_dump hook.
 *
 * Visible in this view:
 *   - the task needs a binfmt with a ->core_dump hook;
 *   - the task must be dumpable and its mm must have exactly one user;
 *     current->dumpable is then cleared;
 *   - the RLIMIT_CORE soft limit is compared with binfmt->min_coredump;
 *   - the file name is built in 'corename' from the literal "core."
 *     followed by sizeof(current->comm) bytes of current->comm;
 *   - the file is opened with O_CREAT | 2 | O_TRUNC | O_NOFOLLOW and
 *     mode 0600 (the literal 2 is presumably O_RDWR - confirm);
 *   - files with more than one link, non-regular files and files
 *     without a ->write operation are tested for;
 *   - binfmt->core_dump(signr, regs, file) does the dump, and the file
 *     is closed with filp_close() on two paths.
 * NOTE(review): the statements guarded by these tests, the labels and
 * the return values are not present in this view, and the function may
 * continue past the last visible line. The '¤t' sequence looks like a
 * mis-decoded '&current' - confirm against the upstream file.
 */
933 int do_coredump(long signr
, struct pt_regs
* regs
)
935 struct linux_binfmt
* binfmt
;
936 char corename
[6+sizeof(current
->comm
)];
938 struct inode
* inode
;
941 binfmt
= current
->binfmt
;
942 if (!binfmt
|| !binfmt
->core_dump
)
944 if (!current
->dumpable
|| atomic_read(¤t
->mm
->mm_users
) != 1)
946 current
->dumpable
= 0;
947 if (current
->rlim
[RLIMIT_CORE
].rlim_cur
< binfmt
->min_coredump
)
950 memcpy(corename
,"core.", 5);
952 memcpy(corename
+5,current
->comm
,sizeof(current
->comm
));
956 file
= filp_open(corename
, O_CREAT
| 2 | O_TRUNC
| O_NOFOLLOW
, 0600);
959 inode
= file
->f_dentry
->d_inode
;
960 if (inode
->i_nlink
> 1)
961 goto close_fail
; /* multiple links - don't dump */
963 if (!S_ISREG(inode
->i_mode
))
967 if (!file
->f_op
->write
)
969 if (!binfmt
->core_dump(signr
, regs
, file
))
972 filp_close(file
, NULL
);
976 filp_close(file
, NULL
);