4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * #!-checking implemented by tytso.
11 * Demand-loading implemented 01.12.91 - no need to read anything but
12 * the header into memory. The inode of the executable is put into
13 * "current->executable", and page faults do the actual loading. Clean.
15 * Once more I can proudly say that linux stood up to being changed: it
16 * was less than 2 hours work to get demand-loading completely implemented.
18 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
19 * current->executable is only used by the procfs. This allows a dispatch
20 * table to check for several different types of binary formats. We keep
21 * trying until we recognize the file or we run out of supported binary
26 #include <linux/sched.h>
27 #include <linux/kernel.h>
29 #include <linux/slab.h>
30 #include <linux/file.h>
31 #include <linux/mman.h>
32 #include <linux/a.out.h>
33 #include <linux/errno.h>
34 #include <linux/signal.h>
35 #include <linux/string.h>
36 #include <linux/stat.h>
37 #include <linux/fcntl.h>
38 #include <linux/ptrace.h>
39 #include <linux/user.h>
40 #include <linux/binfmts.h>
41 #include <linux/personality.h>
42 #include <linux/smp.h>
43 #include <linux/smp_lock.h>
44 #include <linux/init.h>
46 #include <asm/system.h>
47 #include <asm/uaccess.h>
48 #include <asm/pgtable.h>
49 #include <asm/mmu_context.h>
51 #include <linux/config.h>
54 #include <linux/kmod.h>
57 asmlinkage
int sys_exit(int exit_code
);
58 asmlinkage
int sys_brk(unsigned long);
61 * Here are the actual binaries that will be accepted:
62 * add more with "register_binfmt()" if using modules...
64 * These are defined again for the 'real' modules if you are using a
65 * module definition for these routines.
/*
 * Head of the list of binary format handlers.  The list starts out empty
 * and is walked through each entry's ->next pointer when a binary or a
 * shared library has to be matched against a handler.
 */
static struct linux_binfmt *formats = NULL;
70 void __init
binfmt_setup(void)
72 #ifdef CONFIG_BINFMT_MISC
76 #ifdef CONFIG_BINFMT_ELF
80 #ifdef CONFIG_BINFMT_ELF32
84 #ifdef CONFIG_BINFMT_AOUT
88 #ifdef CONFIG_BINFMT_AOUT32
92 #ifdef CONFIG_BINFMT_JAVA
96 #ifdef CONFIG_BINFMT_EM86
100 /* This cannot be configured out of the kernel */
101 init_script_binfmt();
104 int register_binfmt(struct linux_binfmt
* fmt
)
106 struct linux_binfmt
** tmp
= &formats
;
122 #ifdef CONFIG_MODULES
123 int unregister_binfmt(struct linux_binfmt
* fmt
)
125 struct linux_binfmt
** tmp
= &formats
;
136 #endif /* CONFIG_MODULES */
138 /* N.B. Error returns must be < 0 */
139 int open_dentry(struct dentry
* dentry
, int mode
)
141 struct inode
* inode
= dentry
->d_inode
;
146 if (!inode
->i_op
|| !inode
->i_op
->default_file_ops
)
148 fd
= get_unused_fd();
151 f
= get_empty_filp();
155 f
->f_mode
= (mode
+1) & O_ACCMODE
;
156 f
->f_dentry
= dentry
;
159 f
->f_op
= inode
->i_op
->default_file_ops
;
161 error
= f
->f_op
->open(inode
,f
);
181 * Note that a shared library must be both readable and executable due to
184 * Also note that we take the address to load from from the file itself.
186 asmlinkage
int sys_uselib(const char * library
)
190 struct linux_binfmt
* fmt
;
193 fd
= sys_open(library
, 0, 0);
199 if (file
&& file
->f_dentry
&& file
->f_op
&& file
->f_op
->read
) {
200 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
201 int (*fn
)(int) = fmt
->load_shlib
;
204 /* N.B. Should use file instead of fd */
206 if (retval
!= -ENOEXEC
)
218 * count() counts the number of arguments/envelopes
220 static int count(char ** argv
)
229 error
= get_user(p
,argv
);
242 * 'copy_strings()' copies argument/envelope strings from user
243 * memory to free pages in kernel mem. These are in a format ready
244 * to be put directly into the top of new user memory.
246 * Modified by TYT, 11/24/91 to add the from_kmem argument, which specifies
247 * whether the string and the string array are from user or kernel segments:
249 * from_kmem argv * argv **
250 * 0 user space user space
251 * 1 kernel space user space
252 * 2 kernel space kernel space
254 * We do this by playing games with the fs segment register. Since it
255 * is expensive to load a segment register, we try to avoid calling
256 * set_fs() unless we absolutely have to.
258 unsigned long copy_strings(int argc
,char ** argv
,unsigned long *page
,
259 unsigned long p
, int from_kmem
)
265 return 0; /* bullet-proofing */
275 get_user(str
, argv
+argc
);
277 panic("VFS: argc is wrong");
280 len
= strlen_user(str
); /* includes the '\0' */
281 if (p
< len
) { /* this shouldn't happen - 128kB */
289 int offset
, bytes_to_copy
;
291 offset
= pos
% PAGE_SIZE
;
292 if (!(pag
= (char *) page
[pos
/PAGE_SIZE
]) &&
293 !(pag
= (char *) page
[pos
/PAGE_SIZE
] =
294 (unsigned long *) get_free_page(GFP_USER
))) {
299 bytes_to_copy
= PAGE_SIZE
- offset
;
300 if (bytes_to_copy
> len
)
302 copy_from_user(pag
+ offset
, str
, bytes_to_copy
);
303 pos
+= bytes_to_copy
;
304 str
+= bytes_to_copy
;
305 len
-= bytes_to_copy
;
313 unsigned long setup_arg_pages(unsigned long p
, struct linux_binprm
* bprm
)
315 unsigned long stack_base
;
316 struct vm_area_struct
*mpnt
;
319 stack_base
= STACK_TOP
- MAX_ARG_PAGES
*PAGE_SIZE
;
323 bprm
->loader
+= stack_base
;
324 bprm
->exec
+= stack_base
;
326 mpnt
= kmem_cache_alloc(vm_area_cachep
, SLAB_KERNEL
);
328 mpnt
->vm_mm
= current
->mm
;
329 mpnt
->vm_start
= PAGE_MASK
& (unsigned long) p
;
330 mpnt
->vm_end
= STACK_TOP
;
331 mpnt
->vm_page_prot
= PAGE_COPY
;
332 mpnt
->vm_flags
= VM_STACK_FLAGS
;
335 mpnt
->vm_file
= NULL
;
337 insert_vm_struct(current
->mm
, mpnt
);
338 current
->mm
->total_vm
= (mpnt
->vm_end
- mpnt
->vm_start
) >> PAGE_SHIFT
;
341 for (i
= 0 ; i
< MAX_ARG_PAGES
; i
++) {
344 put_dirty_page(current
,bprm
->page
[i
],stack_base
);
346 stack_base
+= PAGE_SIZE
;
352 * Read in the complete executable. This is used for "-N" files
353 * that aren't on a block boundary, and for files on filesystems
354 * without bmap support.
356 int read_exec(struct dentry
*dentry
, unsigned long offset
,
357 char * addr
, unsigned long count
, int to_kmem
)
360 struct inode
* inode
= dentry
->d_inode
;
361 int result
= -ENOEXEC
;
363 if (!inode
->i_op
|| !inode
->i_op
->default_file_ops
)
365 if (init_private_file(&file
, dentry
, 1))
367 if (!file
.f_op
->read
)
369 if (file
.f_op
->llseek
) {
370 if (file
.f_op
->llseek(&file
,offset
,0) != offset
)
375 mm_segment_t old_fs
= get_fs();
377 result
= file
.f_op
->read(&file
, addr
, count
, &file
.f_pos
);
380 result
= verify_area(VERIFY_WRITE
, addr
, count
);
383 result
= file
.f_op
->read(&file
, addr
, count
, &file
.f_pos
);
386 if (file
.f_op
->release
)
387 file
.f_op
->release(inode
,&file
);
392 static int exec_mmap(void)
394 struct mm_struct
* mm
, * old_mm
;
397 if (atomic_read(¤t
->mm
->count
) == 1) {
398 flush_cache_mm(current
->mm
);
399 exit_mmap(current
->mm
);
400 clear_page_tables(current
);
401 flush_tlb_mm(current
->mm
);
406 * The clear_page_tables done later on exec does the right thing
407 * to the page directory when shared, except for graceful abort
408 * (the oom is wrong there, too, IMHO)
415 mm
->cpu_vm_mask
= (1UL << smp_processor_id());
419 * Make sure we have a private ldt if needed ...
421 nr
= current
->tarray_ptr
- &task
[0];
422 copy_segments(nr
, current
, mm
);
424 old_mm
= current
->mm
;
426 retval
= new_page_tables(current
);
429 activate_context(current
);
435 * Failure ... restore the prior mm_struct.
438 /* The pgd belongs to the parent ... don't free it! */
440 current
->mm
= old_mm
;
441 /* restore the ldt for this task */
442 copy_segments(nr
, current
, NULL
);
450 * This function makes sure the current process has its own signal table,
451 * so that flush_old_signals can later reset the signals without disturbing
452 * other processes. (Other processes might share the signal table via
453 * the CLONE_SIGHAND option to clone().)
456 static inline int make_private_signals(void)
458 struct signal_struct
* newsig
;
460 if (atomic_read(¤t
->sig
->count
) <= 1)
462 newsig
= kmalloc(sizeof(*newsig
), GFP_KERNEL
);
465 spin_lock_init(&newsig
->siglock
);
466 atomic_set(&newsig
->count
, 1);
467 memcpy(newsig
->action
, current
->sig
->action
, sizeof(newsig
->action
));
468 current
->sig
= newsig
;
473 * If make_private_signals() made a copy of the signal table, decrement the
474 * refcount of the original table, and free it if necessary.
475 * We don't do that in make_private_signals() so that we can back off
476 * in flush_old_exec() if an error occurs after calling make_private_signals().
479 static inline void release_old_signals(struct signal_struct
* oldsig
)
481 if (current
->sig
== oldsig
)
483 if (atomic_dec_and_test(&oldsig
->count
))
488 * These functions flush out all traces of the currently running executable
489 * so that a new one can be started
492 static inline void flush_old_signals(struct task_struct
*t
)
495 flush_signal_handlers(t
);
498 static inline void flush_old_files(struct files_struct
* files
)
504 unsigned long set
, i
;
507 if (i
>= files
->max_fds
)
509 set
= files
->close_on_exec
.fds_bits
[j
];
510 files
->close_on_exec
.fds_bits
[j
] = 0;
512 for ( ; set
; i
++,set
>>= 1) {
519 int flush_old_exec(struct linux_binprm
* bprm
)
523 struct signal_struct
* oldsig
;
526 * Make sure we have a private signal table
528 oldsig
= current
->sig
;
529 retval
= make_private_signals();
530 if (retval
) goto flush_failed
;
533 * Release all of the old mmap stuff
535 retval
= exec_mmap();
536 if (retval
) goto mmap_failed
;
538 /* This is the point of no return */
539 release_old_signals(oldsig
);
541 if (current
->euid
== current
->uid
&& current
->egid
== current
->gid
)
542 current
->dumpable
= 1;
543 name
= bprm
->filename
;
544 for (i
=0; (ch
= *(name
++)) != '\0';) {
549 current
->comm
[i
++] = ch
;
551 current
->comm
[i
] = '\0';
555 if (bprm
->e_uid
!= current
->euid
|| bprm
->e_gid
!= current
->egid
||
556 permission(bprm
->dentry
->d_inode
,MAY_READ
))
557 current
->dumpable
= 0;
559 flush_old_signals(current
);
560 flush_old_files(current
->files
);
565 if (current
->sig
!= oldsig
)
568 current
->sig
= oldsig
;
573 * Fill the binprm structure from the inode.
574 * Check permissions, then read the first 512 bytes
576 int prepare_binprm(struct linux_binprm
*bprm
)
579 int retval
,id_change
,cap_raised
;
580 struct inode
* inode
= bprm
->dentry
->d_inode
;
582 mode
= inode
->i_mode
;
583 if (!S_ISREG(mode
)) /* must be regular file */
585 if (!(mode
& 0111)) /* with at least _one_ execute bit set */
587 if (IS_NOEXEC(inode
)) /* FS mustn't be mounted noexec */
591 if ((retval
= permission(inode
, MAY_EXEC
)) != 0)
593 /* better not execute files which are being written to */
594 if (inode
->i_writecount
> 0)
597 bprm
->e_uid
= current
->euid
;
598 bprm
->e_gid
= current
->egid
;
599 id_change
= cap_raised
= 0;
602 if (mode
& S_ISUID
) {
603 bprm
->e_uid
= inode
->i_uid
;
604 if (bprm
->e_uid
!= current
->euid
)
610 * If setgid is set but no group execute bit then this
611 * is a candidate for mandatory locking, not a setgid
614 if ((mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
)) {
615 bprm
->e_gid
= inode
->i_gid
;
616 if (!in_group_p(bprm
->e_gid
))
620 /* We don't have VFS support for capabilities yet */
621 cap_clear(bprm
->cap_inheritable
);
622 cap_clear(bprm
->cap_permitted
);
623 cap_clear(bprm
->cap_effective
);
625 /* To support inheritance of root-permissions and suid-root
626 * executables under compatibility mode, we raise the
627 * effective and inherited bitmasks of the executable file
628 * (translation: we set the executable "capability dumb" and
629 * set the allowed set to maximum). We don't set any forced
632 * If only the real uid is 0, we only raise the inheritable
633 * bitmask of the executable file (translation: we set the
634 * allowed set to maximum and the application to "capability
638 if (!issecure(SECURE_NOROOT
)) {
639 if (bprm
->e_uid
== 0 || current
->uid
== 0)
640 cap_set_full(bprm
->cap_inheritable
);
641 if (bprm
->e_uid
== 0)
642 cap_set_full(bprm
->cap_effective
);
645 /* Only if pP' is _not_ a subset of pP, do we consider there
646 * has been a capability related "change of capability". In
647 * such cases, we need to check that the elevation of
648 * privilege does not go against other system constraints.
649 * The new Permitted set is defined below -- see (***). */
651 kernel_cap_t working
=
652 cap_combine(bprm
->cap_permitted
,
653 cap_intersect(bprm
->cap_inheritable
,
654 current
->cap_inheritable
));
655 if (!cap_issubset(working
, current
->cap_permitted
)) {
663 if (id_change
|| cap_raised
) {
664 /* We can't suid-execute if we're sharing parts of the executable */
665 /* or if we're being traced (or if suid execs are not allowed) */
666 /* (current->mm->count > 1 is ok, as we'll get a new mm anyway) */
668 || (current
->flags
& PF_PTRACED
)
669 || (atomic_read(¤t
->fs
->count
) > 1)
670 || (atomic_read(¤t
->sig
->count
) > 1)
671 || (atomic_read(¤t
->files
->count
) > 1)) {
672 if (id_change
&& !capable(CAP_SETUID
))
674 if (cap_raised
&& !capable(CAP_SETPCAP
))
679 memset(bprm
->buf
,0,sizeof(bprm
->buf
));
680 return read_exec(bprm
->dentry
,0,bprm
->buf
,128,1);
684 * This function is used to produce the new IDs and capabilities
685 * from the old ones and the file's capabilities.
687 * The formula used for evolving capabilities is:
690 * (***) pP' = fP | (fI & pI)
691 * pE' = pP' & fE [NB. fE is 0 or ~0]
693 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
694 * ' indicates post-exec().
697 void compute_creds(struct linux_binprm
*bprm
)
699 /* For init, we want to retain the capabilities set
700 * in the init_task struct. Thus we skip the usual
701 * capability rules */
702 if (current
->pid
!= 1) {
703 int new_permitted
= bprm
->cap_permitted
.cap
|
704 (bprm
->cap_inheritable
.cap
&
705 current
->cap_inheritable
.cap
);
707 current
->cap_permitted
.cap
= new_permitted
;
708 current
->cap_effective
.cap
= new_permitted
&
709 bprm
->cap_effective
.cap
;
712 /* AUD: Audit candidate if current->cap_effective is set */
714 current
->suid
= current
->euid
= current
->fsuid
= bprm
->e_uid
;
715 current
->sgid
= current
->egid
= current
->fsgid
= bprm
->e_gid
;
716 if (current
->euid
!= current
->uid
|| current
->egid
!= current
->gid
||
717 !cap_isclear(current
->cap_permitted
))
718 current
->dumpable
= 0;
722 void remove_arg_zero(struct linux_binprm
*bprm
)
725 unsigned long offset
;
727 offset
= bprm
->p
% PAGE_SIZE
;
728 page
= (char*)bprm
->page
[bprm
->p
/PAGE_SIZE
];
729 while(bprm
->p
++,*(page
+offset
++))
730 if(offset
==PAGE_SIZE
){
732 page
= (char*)bprm
->page
[bprm
->p
/PAGE_SIZE
];
739 * cycle the list of binary formats handler, until one recognizes the image
741 int search_binary_handler(struct linux_binprm
*bprm
,struct pt_regs
*regs
)
744 struct linux_binfmt
*fmt
;
746 /* handle /sbin/loader.. */
748 struct exec
* eh
= (struct exec
*) bprm
->buf
;
750 if (!bprm
->loader
&& eh
->fh
.f_magic
== 0x183 &&
751 (eh
->fh
.f_flags
& 0x3000) == 0x3000)
753 char * dynloader
[] = { "/sbin/loader" };
754 struct dentry
* dentry
;
758 remove_arg_zero(bprm
);
759 bprm
->p
= copy_strings(1, dynloader
, bprm
->page
, bprm
->p
, 2);
761 bprm
->loader
= bprm
->p
;
762 dentry
= open_namei(dynloader
[0], 0, 0);
763 retval
= PTR_ERR(dentry
);
766 bprm
->dentry
= dentry
;
767 retval
= prepare_binprm(bprm
);
770 /* should call search_binary_handler recursively here,
771 but it does not matter */
775 for (try=0; try<2; try++) {
776 for (fmt
= formats
; fmt
; fmt
= fmt
->next
) {
777 int (*fn
)(struct linux_binprm
*, struct pt_regs
*) = fmt
->load_binary
;
780 retval
= fn(bprm
, regs
);
785 current
->did_exec
= 1;
788 if (retval
!= -ENOEXEC
)
790 if (!bprm
->dentry
) /* We don't have the dentry anymore */
793 if (retval
!= -ENOEXEC
) {
797 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
799 if (printable(bprm
->buf
[0]) &&
800 printable(bprm
->buf
[1]) &&
801 printable(bprm
->buf
[2]) &&
802 printable(bprm
->buf
[3]))
803 break; /* -ENOEXEC */
804 sprintf(modname
, "binfmt-%04x", *(unsigned short *)(&bprm
->buf
[2]));
805 request_module(modname
);
814 * sys_execve() executes a new program.
816 int do_execve(char * filename
, char ** argv
, char ** envp
, struct pt_regs
* regs
)
818 struct linux_binprm bprm
;
819 struct dentry
* dentry
;
823 bprm
.p
= PAGE_SIZE
*MAX_ARG_PAGES
-sizeof(void *);
824 for (i
=0 ; i
<MAX_ARG_PAGES
; i
++) /* clear page-table */
827 dentry
= open_namei(filename
, 0, 0);
828 retval
= PTR_ERR(dentry
);
832 bprm
.dentry
= dentry
;
833 bprm
.filename
= filename
;
838 if ((bprm
.argc
= count(argv
)) < 0) {
843 if ((bprm
.envc
= count(envp
)) < 0) {
848 retval
= prepare_binprm(&bprm
);
851 bprm
.p
= copy_strings(1, &bprm
.filename
, bprm
.page
, bprm
.p
, 2);
853 bprm
.p
= copy_strings(bprm
.envc
,envp
,bprm
.page
,bprm
.p
,0);
854 bprm
.p
= copy_strings(bprm
.argc
,argv
,bprm
.page
,bprm
.p
,0);
860 retval
= search_binary_handler(&bprm
,regs
);
865 /* Something went wrong, return the inode and free the argument pages*/
869 for (i
=0 ; i
<MAX_ARG_PAGES
; i
++)
870 free_page(bprm
.page
[i
]);