/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 * #!-checking implemented by tytso.
 *
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.  Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */
25 #include <linux/config.h>
26 #include <linux/slab.h>
27 #include <linux/file.h>
28 #include <linux/mman.h>
29 #include <linux/a.out.h>
30 #include <linux/stat.h>
31 #include <linux/fcntl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/init.h>
34 #include <linux/pagemap.h>
35 #include <linux/highmem.h>
37 #include <asm/uaccess.h>
38 #include <asm/pgalloc.h>
39 #include <asm/mmu_context.h>
42 #include <linux/kmod.h>
45 static struct linux_binfmt
*formats
= (struct linux_binfmt
*) NULL
;
47 int register_binfmt(struct linux_binfmt
* fmt
)
49 struct linux_binfmt
** tmp
= &formats
;
65 int unregister_binfmt(struct linux_binfmt
* fmt
)
67 struct linux_binfmt
** tmp
= &formats
;
79 /* N.B. Error returns must be < 0 */
80 int open_dentry(struct dentry
* dentry
, int mode
)
82 struct inode
* inode
= dentry
->d_inode
;
84 struct list_head
* l
= NULL
;
88 l
= &inode
->i_sb
->s_files
;
100 f
->f_mode
= (mode
+1) & O_ACCMODE
;
101 f
->f_dentry
= dentry
;
104 f
->f_op
= inode
->i_fop
;
106 error
= f
->f_op
->open(inode
,f
);
127 * Note that a shared library must be both readable and executable due to
130 * Also note that we take the address to load from from the file itself.
132 asmlinkage
long sys_uselib(const char * library
)
136 struct linux_binfmt
* fmt
;
139 fd
= sys_open(library
, 0, 0);
145 if (file
&& file
->f_dentry
&& file
->f_op
&& file
->f_op
->read
) {
146 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
147 int (*fn
)(int) = fmt
->load_shlib
;
150 /* N.B. Should use file instead of fd */
152 if (retval
!= -ENOEXEC
)
164 * count() counts the number of arguments/envelopes
166 static int count(char ** argv
, int max
)
175 error
= get_user(p
,argv
);
189 * 'copy_strings()' copies argument/envelope strings from user
190 * memory to free pages in kernel mem. These are in a format ready
191 * to be put directly into the top of new user memory.
193 int copy_strings(int argc
,char ** argv
, struct linux_binprm
*bprm
)
200 if (get_user(str
, argv
+argc
) || !str
|| !(len
= strnlen_user(str
, bprm
->p
)))
206 /* XXX: add architecture specific overflow check here. */
213 int offset
, bytes_to_copy
;
215 offset
= pos
% PAGE_SIZE
;
217 page
= bprm
->page
[i
];
220 page
= alloc_page(GFP_HIGHUSER
);
221 bprm
->page
[i
] = page
;
226 kaddr
= (char *)kmap(page
);
229 memset(kaddr
, 0, offset
);
230 bytes_to_copy
= PAGE_SIZE
- offset
;
231 if (bytes_to_copy
> len
) {
234 memset(kaddr
+offset
+len
, 0, PAGE_SIZE
-offset
-len
);
236 err
= copy_from_user(kaddr
+ offset
, str
, bytes_to_copy
);
237 flush_page_to_ram(page
);
243 pos
+= bytes_to_copy
;
244 str
+= bytes_to_copy
;
245 len
-= bytes_to_copy
;
252 * Like copy_strings, but get argv and its values from kernel memory.
254 int copy_strings_kernel(int argc
,char ** argv
, struct linux_binprm
*bprm
)
257 mm_segment_t oldfs
= get_fs();
259 r
= copy_strings(argc
, argv
, bprm
);
265 * This routine is used to map in a page into an address space: needed by
266 * execve() for the initial stack and environment pages.
268 void put_dirty_page(struct task_struct
* tsk
, struct page
*page
, unsigned long address
)
274 if (page_count(page
) != 1)
275 printk("mem_map disagrees with %p at %08lx\n", page
, address
);
276 pgd
= pgd_offset(tsk
->mm
, address
);
277 pmd
= pmd_alloc(pgd
, address
);
280 force_sig(SIGKILL
, tsk
);
283 pte
= pte_alloc(pmd
, address
);
286 force_sig(SIGKILL
, tsk
);
289 if (!pte_none(*pte
)) {
294 flush_page_to_ram(page
);
295 set_pte(pte
, pte_mkdirty(pte_mkwrite(mk_pte(page
, PAGE_COPY
))));
296 /* no need for flush_tlb */
299 int setup_arg_pages(struct linux_binprm
*bprm
)
301 unsigned long stack_base
;
302 struct vm_area_struct
*mpnt
;
305 stack_base
= STACK_TOP
- MAX_ARG_PAGES
*PAGE_SIZE
;
307 bprm
->p
+= stack_base
;
309 bprm
->loader
+= stack_base
;
310 bprm
->exec
+= stack_base
;
312 mpnt
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
317 mpnt
->vm_mm
= current
->mm
;
318 mpnt
->vm_start
= PAGE_MASK
& (unsigned long) bprm
->p
;
319 mpnt
->vm_end
= STACK_TOP
;
320 mpnt
->vm_page_prot
= PAGE_COPY
;
321 mpnt
->vm_flags
= VM_STACK_FLAGS
;
324 mpnt
->vm_file
= NULL
;
325 mpnt
->vm_private_data
= (void *) 0;
326 vmlist_modify_lock(current
->mm
);
327 insert_vm_struct(current
->mm
, mpnt
);
328 vmlist_modify_unlock(current
->mm
);
329 current
->mm
->total_vm
= (mpnt
->vm_end
- mpnt
->vm_start
) >> PAGE_SHIFT
;
332 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
335 put_dirty_page(current
,bprm
->page
[i
],stack_base
);
337 stack_base
+= PAGE_SIZE
;
344 * Read in the complete executable. This is used for "-N" files
345 * that aren't on a block boundary, and for files on filesystems
346 * without get_block support.
348 int read_exec(struct dentry
*dentry
, unsigned long offset
,
349 char * addr
, unsigned long count
, int to_kmem
)
352 struct inode
* inode
= dentry
->d_inode
;
353 int result
= -ENOEXEC
;
357 if (init_private_file(&file
, dentry
, 1))
359 if (!file
.f_op
->read
)
361 if (file
.f_op
->llseek
) {
362 if (file
.f_op
->llseek(&file
,offset
,0) != offset
)
367 mm_segment_t old_fs
= get_fs();
369 result
= file
.f_op
->read(&file
, addr
, count
, &file
.f_pos
);
372 result
= verify_area(VERIFY_WRITE
, addr
, count
);
375 result
= file
.f_op
->read(&file
, addr
, count
, &file
.f_pos
);
378 if (file
.f_op
->release
)
379 file
.f_op
->release(inode
,&file
);
384 static int exec_mmap(void)
386 struct mm_struct
* mm
, * old_mm
;
388 old_mm
= current
->mm
;
389 if (old_mm
&& atomic_read(&old_mm
->mm_users
) == 1) {
390 flush_cache_mm(old_mm
);
393 flush_tlb_mm(old_mm
);
399 struct mm_struct
*active_mm
= current
->active_mm
;
401 init_new_context(current
, mm
);
403 current
->active_mm
= mm
;
404 activate_mm(active_mm
, mm
);
407 if (active_mm
!= old_mm
) BUG();
418 * This function makes sure the current process has its own signal table,
419 * so that flush_signal_handlers can later reset the handlers without
420 * disturbing other processes. (Other processes might share the signal
421 * table via the CLONE_SIGHAND option to clone().)
424 static inline int make_private_signals(void)
426 struct signal_struct
* newsig
;
428 if (atomic_read(¤t
->sig
->count
) <= 1)
430 newsig
= kmalloc(sizeof(*newsig
), GFP_KERNEL
);
433 spin_lock_init(&newsig
->siglock
);
434 atomic_set(&newsig
->count
, 1);
435 memcpy(newsig
->action
, current
->sig
->action
, sizeof(newsig
->action
));
436 current
->sig
= newsig
;
441 * If make_private_signals() made a copy of the signal table, decrement the
442 * refcount of the original table, and free it if necessary.
443 * We don't do that in make_private_signals() so that we can back off
444 * in flush_old_exec() if an error occurs after calling make_private_signals().
447 static inline void release_old_signals(struct signal_struct
* oldsig
)
449 if (current
->sig
== oldsig
)
451 if (atomic_dec_and_test(&oldsig
->count
))
456 * These functions flushes out all traces of the currently running executable
457 * so that a new one can be started
460 static inline void flush_old_files(struct files_struct
* files
)
466 unsigned long set
, i
;
469 if (i
>= files
->max_fds
|| i
>= files
->max_fdset
)
471 set
= xchg(&files
->close_on_exec
->fds_bits
[j
], 0);
473 for ( ; set
; i
++,set
>>= 1) {
480 int flush_old_exec(struct linux_binprm
* bprm
)
484 struct signal_struct
* oldsig
;
487 * Make sure we have a private signal table
490 oldsig
= current
->sig
;
491 retval
= make_private_signals();
492 if (retval
) goto flush_failed
;
495 * Release all of the old mmap stuff
497 retval
= exec_mmap();
498 if (retval
) goto mmap_failed
;
500 /* This is the point of no return */
501 release_old_signals(oldsig
);
503 if (current
->euid
== current
->uid
&& current
->egid
== current
->gid
)
504 current
->dumpable
= 1;
505 name
= bprm
->filename
;
506 for (i
=0; (ch
= *(name
++)) != '\0';) {
511 current
->comm
[i
++] = ch
;
513 current
->comm
[i
] = '\0';
517 if (bprm
->e_uid
!= current
->euid
|| bprm
->e_gid
!= current
->egid
||
518 permission(bprm
->dentry
->d_inode
,MAY_READ
))
519 current
->dumpable
= 0;
521 /* An exec changes our domain. We are no longer part of the thread
524 current
->self_exec_id
++;
526 flush_signal_handlers(current
);
527 flush_old_files(current
->files
);
528 task_unlock(current
);
533 if (current
->sig
!= oldsig
)
536 current
->sig
= oldsig
;
537 task_unlock(current
);
542 * We mustn't allow tracing of suid binaries, unless
543 * the tracer has the capability to trace anything..
545 static inline int must_not_trace_exec(struct task_struct
* p
)
547 return (p
->flags
& PF_PTRACED
) && !cap_raised(p
->p_pptr
->cap_effective
, CAP_SYS_PTRACE
);
551 * Fill the binprm structure from the inode.
552 * Check permissions, then read the first 512 bytes
554 int prepare_binprm(struct linux_binprm
*bprm
)
557 int retval
,id_change
,cap_raised
;
558 struct inode
* inode
= bprm
->dentry
->d_inode
;
560 mode
= inode
->i_mode
;
561 if (!S_ISREG(mode
)) /* must be regular file */
563 if (!(mode
& 0111)) /* with at least _one_ execute bit set */
565 if (IS_NOEXEC(inode
)) /* FS mustn't be mounted noexec */
569 if ((retval
= permission(inode
, MAY_EXEC
)) != 0)
571 /* better not execute files which are being written to */
572 if (atomic_read(&inode
->i_writecount
) > 0)
575 bprm
->e_uid
= current
->euid
;
576 bprm
->e_gid
= current
->egid
;
577 id_change
= cap_raised
= 0;
580 if (mode
& S_ISUID
) {
581 bprm
->e_uid
= inode
->i_uid
;
582 if (bprm
->e_uid
!= current
->euid
)
588 * If setgid is set but no group execute bit then this
589 * is a candidate for mandatory locking, not a setgid
592 if ((mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
)) {
593 bprm
->e_gid
= inode
->i_gid
;
594 if (!in_group_p(bprm
->e_gid
))
598 /* We don't have VFS support for capabilities yet */
599 cap_clear(bprm
->cap_inheritable
);
600 cap_clear(bprm
->cap_permitted
);
601 cap_clear(bprm
->cap_effective
);
603 /* To support inheritance of root-permissions and suid-root
604 * executables under compatibility mode, we raise the
605 * effective and inherited bitmasks of the executable file
606 * (translation: we set the executable "capability dumb" and
607 * set the allowed set to maximum). We don't set any forced
610 * If only the real uid is 0, we only raise the inheritable
611 * bitmask of the executable file (translation: we set the
612 * allowed set to maximum and the application to "capability
616 if (!issecure(SECURE_NOROOT
)) {
617 if (bprm
->e_uid
== 0 || current
->uid
== 0)
618 cap_set_full(bprm
->cap_inheritable
);
619 if (bprm
->e_uid
== 0)
620 cap_set_full(bprm
->cap_effective
);
623 /* Only if pP' is _not_ a subset of pP, do we consider there
624 * has been a capability related "change of capability". In
625 * such cases, we need to check that the elevation of
626 * privilege does not go against other system constraints.
627 * The new Permitted set is defined below -- see (***). */
629 kernel_cap_t working
=
630 cap_combine(bprm
->cap_permitted
,
631 cap_intersect(bprm
->cap_inheritable
,
632 current
->cap_inheritable
));
633 if (!cap_issubset(working
, current
->cap_permitted
)) {
638 if (id_change
|| cap_raised
) {
639 /* We can't suid-execute if we're sharing parts of the executable */
640 /* or if we're being traced (or if suid execs are not allowed) */
641 /* (current->mm->mm_users > 1 is ok, as we'll get a new mm anyway) */
643 || must_not_trace_exec(current
)
644 || (atomic_read(¤t
->fs
->count
) > 1)
645 || (atomic_read(¤t
->sig
->count
) > 1)
646 || (atomic_read(¤t
->files
->count
) > 1)) {
647 if (id_change
&& !capable(CAP_SETUID
))
649 if (cap_raised
&& !capable(CAP_SETPCAP
))
654 memset(bprm
->buf
,0,sizeof(bprm
->buf
));
655 return read_exec(bprm
->dentry
,0,bprm
->buf
,128,1);
659 * This function is used to produce the new IDs and capabilities
660 * from the old ones and the file's capabilities.
662 * The formula used for evolving capabilities is:
665 * (***) pP' = fP | (fI & pI)
666 * pE' = pP' & fE [NB. fE is 0 or ~0]
668 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
669 * ' indicates post-exec().
672 void compute_creds(struct linux_binprm
*bprm
)
674 int new_permitted
= cap_t(bprm
->cap_permitted
) |
675 (cap_t(bprm
->cap_inheritable
) &
676 cap_t(current
->cap_inheritable
));
678 /* For init, we want to retain the capabilities set
679 * in the init_task struct. Thus we skip the usual
680 * capability rules */
681 if (current
->pid
!= 1) {
682 cap_t(current
->cap_permitted
) = new_permitted
;
683 cap_t(current
->cap_effective
) = new_permitted
&
684 cap_t(bprm
->cap_effective
);
687 /* AUD: Audit candidate if current->cap_effective is set */
689 current
->suid
= current
->euid
= current
->fsuid
= bprm
->e_uid
;
690 current
->sgid
= current
->egid
= current
->fsgid
= bprm
->e_gid
;
691 if (current
->euid
!= current
->uid
|| current
->egid
!= current
->gid
||
692 !cap_issubset(new_permitted
, current
->cap_permitted
))
693 current
->dumpable
= 0;
697 void remove_arg_zero(struct linux_binprm
*bprm
)
700 unsigned long offset
;
704 offset
= bprm
->p
% PAGE_SIZE
;
707 while (bprm
->p
++, *(kaddr
+offset
++)) {
708 if (offset
!= PAGE_SIZE
)
713 page
= bprm
->page
[bprm
->p
/PAGE_SIZE
];
714 kaddr
= (char *)kmap(page
);
722 * cycle the list of binary formats handler, until one recognizes the image
724 int search_binary_handler(struct linux_binprm
*bprm
,struct pt_regs
*regs
)
727 struct linux_binfmt
*fmt
;
729 /* handle /sbin/loader.. */
731 struct exec
* eh
= (struct exec
*) bprm
->buf
;
732 struct linux_binprm bprm_loader
;
734 if (!bprm
->loader
&& eh
->fh
.f_magic
== 0x183 &&
735 (eh
->fh
.f_flags
& 0x3000) == 0x3000)
738 char * dynloader
[] = { "/sbin/loader" };
739 struct dentry
* dentry
;
746 bprm_loader
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
747 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) /* clear page-table */
748 bprm_loader
.page
[i
] = NULL
;
751 dentry
= open_namei(dynloader
[0], 0, 0);
753 retval
= PTR_ERR(dentry
);
756 bprm
->dentry
= dentry
;
757 bprm
->loader
= bprm_loader
.p
;
758 retval
= prepare_binprm(bprm
);
761 /* should call search_binary_handler recursively here,
762 but it does not matter */
766 for (try=0; try<2; try++) {
767 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
768 int (*fn
)(struct linux_binprm
*, struct pt_regs
*) = fmt
->load_binary
;
771 retval
= fn(bprm
, regs
);
779 current
->did_exec
= 1;
782 if (retval
!= -ENOEXEC
)
784 if (!bprm
->dentry
) /* We don't have the dentry anymore */
787 if (retval
!= -ENOEXEC
) {
791 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
793 if (printable(bprm
->buf
[0]) &&
794 printable(bprm
->buf
[1]) &&
795 printable(bprm
->buf
[2]) &&
796 printable(bprm
->buf
[3]))
797 break; /* -ENOEXEC */
798 sprintf(modname
, "binfmt-%04x", *(unsigned short *)(&bprm
->buf
[2]));
799 request_module(modname
);
808 * sys_execve() executes a new program.
810 int do_execve(char * filename
, char ** argv
, char ** envp
, struct pt_regs
* regs
)
812 struct linux_binprm bprm
;
813 struct dentry
* dentry
;
817 bprm
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
818 memset(bprm
.page
, 0, MAX_ARG_PAGES
*sizeof(bprm
.page
[0]));
821 dentry
= open_namei(filename
, 0, 0);
824 retval
= PTR_ERR(dentry
);
828 bprm
.dentry
= dentry
;
829 bprm
.filename
= filename
;
833 if ((bprm
.argc
= count(argv
, bprm
.p
/ sizeof(void *))) < 0) {
840 if ((bprm
.envc
= count(envp
, bprm
.p
/ sizeof(void *))) < 0) {
847 retval
= prepare_binprm(&bprm
);
851 retval
= copy_strings_kernel(1, &bprm
.filename
, &bprm
);
856 retval
= copy_strings(bprm
.envc
, envp
, &bprm
);
860 retval
= copy_strings(bprm
.argc
, argv
, &bprm
);
864 retval
= search_binary_handler(&bprm
,regs
);
870 /* Something went wrong, return the inode and free the argument pages*/
877 /* Assumes that free_page() can take a NULL argument. */
878 /* I hope this is ok for all architectures */
879 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++)
881 __free_page(bprm
.page
[i
]);
886 int do_coredump(long signr
, struct pt_regs
* regs
)
888 struct linux_binfmt
* binfmt
;
889 char corename
[6+sizeof(current
->comm
)];
891 struct dentry
* dentry
;
892 struct inode
* inode
;
895 binfmt
= current
->binfmt
;
896 if (!binfmt
|| !binfmt
->core_dump
)
898 if (!current
->dumpable
|| atomic_read(¤t
->mm
->mm_users
) != 1)
900 current
->dumpable
= 0;
901 if (current
->rlim
[RLIMIT_CORE
].rlim_cur
< binfmt
->min_coredump
)
904 memcpy(corename
,"core.", 5);
906 memcpy(corename
+5,current
->comm
,sizeof(current
->comm
));
910 file
= filp_open(corename
, O_CREAT
| 2 | O_TRUNC
| O_NOFOLLOW
, 0600);
913 dentry
= file
->f_dentry
;
914 inode
= dentry
->d_inode
;
915 if (inode
->i_nlink
> 1)
916 goto close_fail
; /* multiple links - don't dump */
918 if (!S_ISREG(inode
->i_mode
))
922 if (!file
->f_op
->write
)
924 if (!binfmt
->core_dump(signr
, regs
, file
))
926 filp_close(file
, NULL
);
931 filp_close(file
, NULL
);