/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  #!-checking implemented by tytso.
 *
 *  Demand-loading implemented 01.12.91 - no need to read anything but
 *  the header into memory. The inode of the executable is put into
 *  "current->executable", and page faults do the actual loading. Clean.
 *
 *  Once more I can proudly say that linux stood up to being changed: it
 *  was less than 2 hours work to get demand-loading completely implemented.
 *
 *  Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
 *  current->executable is only used by the procfs. This allows a dispatch
 *  table to check for several different types of binary formats. We keep
 *  trying until we recognize the file or we run out of supported binary
 *  formats.
 */
25 #include <linux/config.h>
26 #include <linux/slab.h>
27 #include <linux/file.h>
28 #include <linux/mman.h>
29 #include <linux/a.out.h>
30 #include <linux/stat.h>
31 #include <linux/fcntl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/init.h>
34 #include <linux/pagemap.h>
35 #include <linux/highmem.h>
36 #include <linux/spinlock.h>
37 #define __NO_VERSION__
38 #include <linux/module.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgalloc.h>
42 #include <asm/mmu_context.h>
45 #include <linux/kmod.h>
48 static struct linux_binfmt
*formats
;
49 static rwlock_t binfmt_lock
= RW_LOCK_UNLOCKED
;
51 int register_binfmt(struct linux_binfmt
* fmt
)
53 struct linux_binfmt
** tmp
= &formats
;
59 write_lock(&binfmt_lock
);
62 write_unlock(&binfmt_lock
);
69 write_unlock(&binfmt_lock
);
73 int unregister_binfmt(struct linux_binfmt
* fmt
)
75 struct linux_binfmt
** tmp
= &formats
;
77 write_lock(&binfmt_lock
);
81 write_unlock(&binfmt_lock
);
86 write_unlock(&binfmt_lock
);
90 static inline void put_binfmt(struct linux_binfmt
* fmt
)
93 __MOD_DEC_USE_COUNT(fmt
->module
);
97 * Note that a shared library must be both readable and executable due to
100 * Also note that we take the address to load from from the file itself.
102 asmlinkage
long sys_uselib(const char * library
)
108 error
= user_path_walk(library
, &nd
);
113 if (!S_ISREG(nd
.dentry
->d_inode
->i_mode
))
116 error
= permission(nd
.dentry
->d_inode
, MAY_READ
| MAY_EXEC
);
120 file
= dentry_open(nd
.dentry
, nd
.mnt
, O_RDONLY
);
121 error
= PTR_ERR(file
);
126 if(file
->f_op
&& file
->f_op
->read
) {
127 struct linux_binfmt
* fmt
;
129 read_lock(&binfmt_lock
);
130 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
131 if (!fmt
->load_shlib
)
133 if (!try_inc_mod_count(fmt
->module
))
135 read_unlock(&binfmt_lock
);
136 error
= fmt
->load_shlib(file
);
137 read_lock(&binfmt_lock
);
139 if (error
!= -ENOEXEC
)
142 read_unlock(&binfmt_lock
);
153 * count() counts the number of arguments/envelopes
155 static int count(char ** argv
, int max
)
164 error
= get_user(p
,argv
);
178 * 'copy_strings()' copies argument/envelope strings from user
179 * memory to free pages in kernel mem. These are in a format ready
180 * to be put directly into the top of new user memory.
182 int copy_strings(int argc
,char ** argv
, struct linux_binprm
*bprm
)
189 if (get_user(str
, argv
+argc
) || !str
|| !(len
= strnlen_user(str
, bprm
->p
)))
195 /* XXX: add architecture specific overflow check here. */
202 int offset
, bytes_to_copy
;
204 offset
= pos
% PAGE_SIZE
;
206 page
= bprm
->page
[i
];
209 page
= alloc_page(GFP_HIGHUSER
);
210 bprm
->page
[i
] = page
;
215 kaddr
= (char *)kmap(page
);
218 memset(kaddr
, 0, offset
);
219 bytes_to_copy
= PAGE_SIZE
- offset
;
220 if (bytes_to_copy
> len
) {
223 memset(kaddr
+offset
+len
, 0, PAGE_SIZE
-offset
-len
);
225 err
= copy_from_user(kaddr
+ offset
, str
, bytes_to_copy
);
226 flush_page_to_ram(page
);
232 pos
+= bytes_to_copy
;
233 str
+= bytes_to_copy
;
234 len
-= bytes_to_copy
;
241 * Like copy_strings, but get argv and its values from kernel memory.
243 int copy_strings_kernel(int argc
,char ** argv
, struct linux_binprm
*bprm
)
246 mm_segment_t oldfs
= get_fs();
248 r
= copy_strings(argc
, argv
, bprm
);
254 * This routine is used to map in a page into an address space: needed by
255 * execve() for the initial stack and environment pages.
257 void put_dirty_page(struct task_struct
* tsk
, struct page
*page
, unsigned long address
)
263 if (page_count(page
) != 1)
264 printk("mem_map disagrees with %p at %08lx\n", page
, address
);
265 pgd
= pgd_offset(tsk
->mm
, address
);
266 pmd
= pmd_alloc(pgd
, address
);
269 force_sig(SIGKILL
, tsk
);
272 pte
= pte_alloc(pmd
, address
);
275 force_sig(SIGKILL
, tsk
);
278 if (!pte_none(*pte
)) {
283 flush_page_to_ram(page
);
284 set_pte(pte
, pte_mkdirty(pte_mkwrite(mk_pte(page
, PAGE_COPY
))));
285 /* no need for flush_tlb */
288 int setup_arg_pages(struct linux_binprm
*bprm
)
290 unsigned long stack_base
;
291 struct vm_area_struct
*mpnt
;
294 stack_base
= STACK_TOP
- MAX_ARG_PAGES
*PAGE_SIZE
;
296 bprm
->p
+= stack_base
;
298 bprm
->loader
+= stack_base
;
299 bprm
->exec
+= stack_base
;
301 mpnt
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
305 down(¤t
->mm
->mmap_sem
);
307 mpnt
->vm_mm
= current
->mm
;
308 mpnt
->vm_start
= PAGE_MASK
& (unsigned long) bprm
->p
;
309 mpnt
->vm_end
= STACK_TOP
;
310 mpnt
->vm_page_prot
= PAGE_COPY
;
311 mpnt
->vm_flags
= VM_STACK_FLAGS
;
314 mpnt
->vm_file
= NULL
;
315 mpnt
->vm_private_data
= (void *) 0;
316 vmlist_modify_lock(current
->mm
);
317 insert_vm_struct(current
->mm
, mpnt
);
318 vmlist_modify_unlock(current
->mm
);
319 current
->mm
->total_vm
= (mpnt
->vm_end
- mpnt
->vm_start
) >> PAGE_SHIFT
;
322 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
323 struct page
*page
= bprm
->page
[i
];
325 bprm
->page
[i
] = NULL
;
327 put_dirty_page(current
,page
,stack_base
);
329 stack_base
+= PAGE_SIZE
;
331 up(¤t
->mm
->mmap_sem
);
336 struct file
*open_exec(const char *name
)
343 if (path_init(name
, LOOKUP_FOLLOW
|LOOKUP_POSITIVE
, &nd
))
344 err
= path_walk(name
, &nd
);
347 inode
= nd
.dentry
->d_inode
;
348 file
= ERR_PTR(-EACCES
);
349 if (!IS_NOEXEC(inode
) && S_ISREG(inode
->i_mode
)) {
350 int err
= permission(inode
, MAY_EXEC
);
353 file
= dentry_open(nd
.dentry
, nd
.mnt
, O_RDONLY
);
355 err
= deny_write_access(file
);
370 int kernel_read(struct file
*file
, unsigned long offset
,
371 char * addr
, unsigned long count
)
375 int result
= -ENOSYS
;
377 if (!file
->f_op
->read
)
381 result
= file
->f_op
->read(file
, addr
, count
, &pos
);
387 static int exec_mmap(void)
389 struct mm_struct
* mm
, * old_mm
;
391 old_mm
= current
->mm
;
392 if (old_mm
&& atomic_read(&old_mm
->mm_users
) == 1) {
393 flush_cache_mm(old_mm
);
396 flush_tlb_mm(old_mm
);
402 struct mm_struct
*active_mm
= current
->active_mm
;
404 init_new_context(current
, mm
);
407 current
->active_mm
= mm
;
408 task_unlock(current
);
409 activate_mm(active_mm
, mm
);
412 if (active_mm
!= old_mm
) BUG();
423 * This function makes sure the current process has its own signal table,
424 * so that flush_signal_handlers can later reset the handlers without
425 * disturbing other processes. (Other processes might share the signal
426 * table via the CLONE_SIGHAND option to clone().)
429 static inline int make_private_signals(void)
431 struct signal_struct
* newsig
;
433 if (atomic_read(¤t
->sig
->count
) <= 1)
435 newsig
= kmalloc(sizeof(*newsig
), GFP_KERNEL
);
438 spin_lock_init(&newsig
->siglock
);
439 atomic_set(&newsig
->count
, 1);
440 memcpy(newsig
->action
, current
->sig
->action
, sizeof(newsig
->action
));
441 spin_lock_irq(¤t
->sigmask_lock
);
442 current
->sig
= newsig
;
443 spin_unlock_irq(¤t
->sigmask_lock
);
448 * If make_private_signals() made a copy of the signal table, decrement the
449 * refcount of the original table, and free it if necessary.
450 * We don't do that in make_private_signals() so that we can back off
451 * in flush_old_exec() if an error occurs after calling make_private_signals().
454 static inline void release_old_signals(struct signal_struct
* oldsig
)
456 if (current
->sig
== oldsig
)
458 if (atomic_dec_and_test(&oldsig
->count
))
463 * These functions flushes out all traces of the currently running executable
464 * so that a new one can be started
467 static inline void flush_old_files(struct files_struct
* files
)
473 unsigned long set
, i
;
476 if (i
>= files
->max_fds
|| i
>= files
->max_fdset
)
478 set
= xchg(&files
->close_on_exec
->fds_bits
[j
], 0);
480 for ( ; set
; i
++,set
>>= 1) {
487 int flush_old_exec(struct linux_binprm
* bprm
)
491 struct signal_struct
* oldsig
;
494 * Make sure we have a private signal table
496 oldsig
= current
->sig
;
497 retval
= make_private_signals();
498 if (retval
) goto flush_failed
;
501 * Release all of the old mmap stuff
503 retval
= exec_mmap();
504 if (retval
) goto mmap_failed
;
506 /* This is the point of no return */
507 release_old_signals(oldsig
);
509 current
->sas_ss_sp
= current
->sas_ss_size
= 0;
511 if (current
->euid
== current
->uid
&& current
->egid
== current
->gid
)
512 current
->dumpable
= 1;
513 name
= bprm
->filename
;
514 for (i
=0; (ch
= *(name
++)) != '\0';) {
519 current
->comm
[i
++] = ch
;
521 current
->comm
[i
] = '\0';
525 if (bprm
->e_uid
!= current
->euid
|| bprm
->e_gid
!= current
->egid
||
526 permission(bprm
->file
->f_dentry
->d_inode
,MAY_READ
))
527 current
->dumpable
= 0;
529 /* An exec changes our domain. We are no longer part of the thread
532 current
->self_exec_id
++;
534 flush_signal_handlers(current
);
535 flush_old_files(current
->files
);
541 spin_lock_irq(¤t
->sigmask_lock
);
542 if (current
->sig
!= oldsig
)
544 current
->sig
= oldsig
;
545 spin_unlock_irq(¤t
->sigmask_lock
);
550 * We mustn't allow tracing of suid binaries, unless
551 * the tracer has the capability to trace anything..
553 static inline int must_not_trace_exec(struct task_struct
* p
)
555 return (p
->ptrace
& PT_PTRACED
) && !cap_raised(p
->p_pptr
->cap_effective
, CAP_SYS_PTRACE
);
559 * Fill the binprm structure from the inode.
560 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
562 int prepare_binprm(struct linux_binprm
*bprm
)
565 int id_change
,cap_raised
;
566 struct inode
* inode
= bprm
->file
->f_dentry
->d_inode
;
568 mode
= inode
->i_mode
;
569 /* Huh? We had already checked for MAY_EXEC, WTF do we check this? */
570 if (!(mode
& 0111)) /* with at least _one_ execute bit set */
573 bprm
->e_uid
= current
->euid
;
574 bprm
->e_gid
= current
->egid
;
575 id_change
= cap_raised
= 0;
578 if (mode
& S_ISUID
) {
579 bprm
->e_uid
= inode
->i_uid
;
580 if (bprm
->e_uid
!= current
->euid
)
586 * If setgid is set but no group execute bit then this
587 * is a candidate for mandatory locking, not a setgid
590 if ((mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
)) {
591 bprm
->e_gid
= inode
->i_gid
;
592 if (!in_group_p(bprm
->e_gid
))
596 /* We don't have VFS support for capabilities yet */
597 cap_clear(bprm
->cap_inheritable
);
598 cap_clear(bprm
->cap_permitted
);
599 cap_clear(bprm
->cap_effective
);
601 /* To support inheritance of root-permissions and suid-root
602 * executables under compatibility mode, we raise all three
603 * capability sets for the file.
605 * If only the real uid is 0, we only raise the inheritable
606 * and permitted sets of the executable file.
609 if (!issecure(SECURE_NOROOT
)) {
610 if (bprm
->e_uid
== 0 || current
->uid
== 0) {
611 cap_set_full(bprm
->cap_inheritable
);
612 cap_set_full(bprm
->cap_permitted
);
614 if (bprm
->e_uid
== 0)
615 cap_set_full(bprm
->cap_effective
);
618 /* Only if pP' is _not_ a subset of pP, do we consider there
619 * has been a capability related "change of capability". In
620 * such cases, we need to check that the elevation of
621 * privilege does not go against other system constraints.
622 * The new Permitted set is defined below -- see (***). */
624 kernel_cap_t permitted
, working
;
626 permitted
= cap_intersect(bprm
->cap_permitted
, cap_bset
);
627 working
= cap_intersect(bprm
->cap_inheritable
,
628 current
->cap_inheritable
);
629 working
= cap_combine(permitted
, working
);
630 if (!cap_issubset(working
, current
->cap_permitted
)) {
635 if (id_change
|| cap_raised
) {
636 /* We can't suid-execute if we're sharing parts of the executable */
637 /* or if we're being traced (or if suid execs are not allowed) */
638 /* (current->mm->mm_users > 1 is ok, as we'll get a new mm anyway) */
640 || must_not_trace_exec(current
)
641 || (atomic_read(¤t
->fs
->count
) > 1)
642 || (atomic_read(¤t
->sig
->count
) > 1)
643 || (atomic_read(¤t
->files
->count
) > 1)) {
644 if (id_change
&& !capable(CAP_SETUID
))
646 if (cap_raised
&& !capable(CAP_SETPCAP
))
651 memset(bprm
->buf
,0,BINPRM_BUF_SIZE
);
652 return kernel_read(bprm
->file
,0,bprm
->buf
,BINPRM_BUF_SIZE
);
656 * This function is used to produce the new IDs and capabilities
657 * from the old ones and the file's capabilities.
659 * The formula used for evolving capabilities is:
662 * (***) pP' = (fP & X) | (fI & pI)
663 * pE' = pP' & fE [NB. fE is 0 or ~0]
665 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
666 * ' indicates post-exec(), and X is the global 'cap_bset'.
669 void compute_creds(struct linux_binprm
*bprm
)
671 kernel_cap_t new_permitted
, working
;
673 new_permitted
= cap_intersect(bprm
->cap_permitted
, cap_bset
);
674 working
= cap_intersect(bprm
->cap_inheritable
,
675 current
->cap_inheritable
);
676 new_permitted
= cap_combine(new_permitted
, working
);
678 /* For init, we want to retain the capabilities set
679 * in the init_task struct. Thus we skip the usual
680 * capability rules */
681 if (current
->pid
!= 1) {
682 current
->cap_permitted
= new_permitted
;
683 current
->cap_effective
=
684 cap_intersect(new_permitted
, bprm
->cap_effective
);
687 /* AUD: Audit candidate if current->cap_effective is set */
689 current
->suid
= current
->euid
= current
->fsuid
= bprm
->e_uid
;
690 current
->sgid
= current
->egid
= current
->fsgid
= bprm
->e_gid
;
691 if (current
->euid
!= current
->uid
|| current
->egid
!= current
->gid
||
692 !cap_issubset(new_permitted
, current
->cap_permitted
))
693 current
->dumpable
= 0;
695 current
->keep_capabilities
= 0;
699 void remove_arg_zero(struct linux_binprm
*bprm
)
702 unsigned long offset
;
706 offset
= bprm
->p
% PAGE_SIZE
;
709 while (bprm
->p
++, *(kaddr
+offset
++)) {
710 if (offset
!= PAGE_SIZE
)
715 page
= bprm
->page
[bprm
->p
/PAGE_SIZE
];
716 kaddr
= (char *)kmap(page
);
724 * cycle the list of binary formats handler, until one recognizes the image
726 int search_binary_handler(struct linux_binprm
*bprm
,struct pt_regs
*regs
)
729 struct linux_binfmt
*fmt
;
731 /* handle /sbin/loader.. */
733 struct exec
* eh
= (struct exec
*) bprm
->buf
;
734 struct linux_binprm bprm_loader
;
736 if (!bprm
->loader
&& eh
->fh
.f_magic
== 0x183 &&
737 (eh
->fh
.f_flags
& 0x3000) == 0x3000)
740 char * dynloader
[] = { "/sbin/loader" };
743 allow_write_access(bprm
->file
);
747 bprm_loader
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
748 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) /* clear page-table */
749 bprm_loader
.page
[i
] = NULL
;
751 file
= open_exec(dynloader
[0]);
752 retval
= PTR_ERR(file
);
756 bprm
->loader
= bprm_loader
.p
;
757 retval
= prepare_binprm(bprm
);
760 /* should call search_binary_handler recursively here,
761 but it does not matter */
765 for (try=0; try<2; try++) {
766 read_lock(&binfmt_lock
);
767 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
768 int (*fn
)(struct linux_binprm
*, struct pt_regs
*) = fmt
->load_binary
;
771 if (!try_inc_mod_count(fmt
->module
))
773 read_unlock(&binfmt_lock
);
774 retval
= fn(bprm
, regs
);
777 allow_write_access(bprm
->file
);
781 current
->did_exec
= 1;
784 read_lock(&binfmt_lock
);
786 if (retval
!= -ENOEXEC
)
789 read_unlock(&binfmt_lock
);
793 read_unlock(&binfmt_lock
);
794 if (retval
!= -ENOEXEC
) {
798 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
800 if (printable(bprm
->buf
[0]) &&
801 printable(bprm
->buf
[1]) &&
802 printable(bprm
->buf
[2]) &&
803 printable(bprm
->buf
[3]))
804 break; /* -ENOEXEC */
805 sprintf(modname
, "binfmt-%04x", *(unsigned short *)(&bprm
->buf
[2]));
806 request_module(modname
);
815 * sys_execve() executes a new program.
817 int do_execve(char * filename
, char ** argv
, char ** envp
, struct pt_regs
* regs
)
819 struct linux_binprm bprm
;
824 file
= open_exec(filename
);
826 retval
= PTR_ERR(file
);
830 bprm
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
831 memset(bprm
.page
, 0, MAX_ARG_PAGES
*sizeof(bprm
.page
[0]));
834 bprm
.filename
= filename
;
838 if ((bprm
.argc
= count(argv
, bprm
.p
/ sizeof(void *))) < 0) {
839 allow_write_access(file
);
844 if ((bprm
.envc
= count(envp
, bprm
.p
/ sizeof(void *))) < 0) {
845 allow_write_access(file
);
850 retval
= prepare_binprm(&bprm
);
854 retval
= copy_strings_kernel(1, &bprm
.filename
, &bprm
);
859 retval
= copy_strings(bprm
.envc
, envp
, &bprm
);
863 retval
= copy_strings(bprm
.argc
, argv
, &bprm
);
867 retval
= search_binary_handler(&bprm
,regs
);
873 /* Something went wrong, return the inode and free the argument pages*/
874 allow_write_access(bprm
.file
);
878 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
879 struct page
* page
= bprm
.page
[i
];
887 void set_binfmt(struct linux_binfmt
*new)
889 struct linux_binfmt
*old
= current
->binfmt
;
890 if (new && new->module
)
891 __MOD_INC_USE_COUNT(new->module
);
892 current
->binfmt
= new;
893 if (old
&& old
->module
)
894 __MOD_DEC_USE_COUNT(old
->module
);
897 int do_coredump(long signr
, struct pt_regs
* regs
)
899 struct linux_binfmt
* binfmt
;
900 char corename
[6+sizeof(current
->comm
)];
902 struct inode
* inode
;
905 binfmt
= current
->binfmt
;
906 if (!binfmt
|| !binfmt
->core_dump
)
908 if (!current
->dumpable
|| atomic_read(¤t
->mm
->mm_users
) != 1)
910 current
->dumpable
= 0;
911 if (current
->rlim
[RLIMIT_CORE
].rlim_cur
< binfmt
->min_coredump
)
914 memcpy(corename
,"core.", 5);
916 memcpy(corename
+5,current
->comm
,sizeof(current
->comm
));
920 file
= filp_open(corename
, O_CREAT
| 2 | O_TRUNC
| O_NOFOLLOW
, 0600);
923 inode
= file
->f_dentry
->d_inode
;
924 if (inode
->i_nlink
> 1)
925 goto close_fail
; /* multiple links - don't dump */
927 if (!S_ISREG(inode
->i_mode
))
931 if (!file
->f_op
->write
)
933 if (!binfmt
->core_dump(signr
, regs
, file
))
936 filp_close(file
, NULL
);
940 filp_close(file
, NULL
);