/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  #!-checking implemented by tytso.
 *
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
 * current->executable is only used by the procfs. This allows a dispatch
 * table to check for several different types of binary formats. We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/a.out.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#define __NO_VERSION__
#include <linux/module.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
static struct linux_binfmt *formats;
static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;

int register_binfmt(struct linux_binfmt * fmt)
{
	struct linux_binfmt ** tmp = &formats;

	if (!fmt)
		return -EINVAL;
	if (fmt->next)
		return -EBUSY;
	write_lock(&binfmt_lock);
	while (*tmp) {
		if (fmt == *tmp) {
			write_unlock(&binfmt_lock);
			return -EBUSY;
		}
		tmp = &(*tmp)->next;
	}
	fmt->next = formats;
	formats = fmt;
	write_unlock(&binfmt_lock);
	return 0;
}
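/*
 * A minimal registration sketch (not from this file): a format driver
 * fills in a struct linux_binfmt and registers it at init time.  The
 * field order below assumes the 2.4-era definition in <linux/binfmts.h>
 * (next, module, load_binary, load_shlib, core_dump, min_coredump);
 * the "foo" names are hypothetical.
 *
 *	static struct linux_binfmt foo_format = {
 *		NULL, THIS_MODULE, load_foo_binary, load_foo_shlib,
 *		foo_core_dump, PAGE_SIZE
 *	};
 *
 *	static int __init init_foo_binfmt(void)
 *	{
 *		return register_binfmt(&foo_format);
 *	}
 */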
int unregister_binfmt(struct linux_binfmt * fmt)
{
	struct linux_binfmt ** tmp = &formats;

	write_lock(&binfmt_lock);
	while (*tmp) {
		if (fmt == *tmp) {
			*tmp = fmt->next;
			write_unlock(&binfmt_lock);
			return 0;
		}
		tmp = &(*tmp)->next;
	}
	write_unlock(&binfmt_lock);
	return -EINVAL;
}
static inline void put_binfmt(struct linux_binfmt * fmt)
{
	if (fmt->module)
		__MOD_DEC_USE_COUNT(fmt->module);
}
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that the address to load from is taken from the file itself.
 */
asmlinkage long sys_uselib(const char * library)
{
	struct file * file;
	struct nameidata nd;
	int error;

	error = user_path_walk(library, &nd);
	if (error)
		goto out;

	error = -EINVAL;
	if (!S_ISREG(nd.dentry->d_inode->i_mode))
		goto exit;

	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
	if (error)
		goto exit;

	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;

	error = -ENOEXEC;
	if (file->f_op && file->f_op->read) {
		struct linux_binfmt * fmt;

		read_lock(&binfmt_lock);
		for (fmt = formats ; fmt ; fmt = fmt->next) {
			if (!fmt->load_shlib)
				continue;
			if (!try_inc_mod_count(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			error = fmt->load_shlib(file);
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (error != -ENOEXEC)
				break;
		}
		read_unlock(&binfmt_lock);
	}
	fput(file);
out:
	return error;
exit:
	path_release(&nd);
	goto out;
}
/*
 * count() counts the number of arguments/envelopes
 */
static int count(char ** argv, int max)
{
	int i = 0;

	if (argv != NULL) {
		for (;;) {
			char * p;
			int error;

			error = get_user(p, argv);
			if (error)
				return error;
			if (!p)
				break;
			argv++;
			if (++i > max)
				return -E2BIG;
		}
	}
	return i;
}
/*
 * 'copy_strings()' copies argument/envelope strings from user
 * memory to free pages in kernel mem. These are in a format ready
 * to be put directly into the top of new user memory.
 */
int copy_strings(int argc, char ** argv, struct linux_binprm *bprm)
{
	while (argc-- > 0) {
		char *str;
		int len;
		unsigned long pos;

		if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p)))
			return -EFAULT;
		if (bprm->p < len)
			return -E2BIG;

		bprm->p -= len;
		/* XXX: add architecture specific overflow check here. */

		pos = bprm->p;
		while (len > 0) {
			char *kaddr;
			int i, new, err;
			struct page *page;
			int offset, bytes_to_copy;

			offset = pos % PAGE_SIZE;
			i = pos/PAGE_SIZE;
			page = bprm->page[i];
			new = 0;
			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				bprm->page[i] = page;
				if (!page)
					return -ENOMEM;
				new = 1;
			}
			kaddr = (char *)kmap(page);

			if (new && offset)
				memset(kaddr, 0, offset);
			bytes_to_copy = PAGE_SIZE - offset;
			if (bytes_to_copy > len) {
				bytes_to_copy = len;
				if (new)
					memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len);
			}
			err = copy_from_user(kaddr + offset, str, bytes_to_copy);
			flush_dcache_page(page);
			flush_page_to_ram(page);
			kunmap(page);

			if (err)
				return -EFAULT;

			pos += bytes_to_copy;
			str += bytes_to_copy;
			len -= bytes_to_copy;
		}
	}
	return 0;
}
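/*
 * Worked example (a sketch, assuming the common configuration of
 * MAX_ARG_PAGES == 32, PAGE_SIZE == 4096 and 32-bit pointers):
 * bprm->p starts just below 32*4096 == 128KB (see do_execve() below)
 * and is decremented by each string's length before the copy, so the
 * strings fill the argument pages from the top down.  Copying
 * "/bin/ls" (8 bytes including the NUL) moves bprm->p from 0x1fffc to
 * 0x1fff4, landing the string at offset 0xff4 of page 31.
 */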
/*
 * Like copy_strings, but get argv and its values from kernel memory.
 */
int copy_strings_kernel(int argc, char ** argv, struct linux_binprm *bprm)
{
	int r;
	mm_segment_t oldfs = get_fs();
	set_fs(KERNEL_DS);
	r = copy_strings(argc, argv, bprm);
	set_fs(oldfs);
	return r;
}
/*
 * This routine is used to map in a page into an address space: needed by
 * execve() for the initial stack and environment pages.
 */
void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
{
	pgd_t * pgd;
	pmd_t * pmd;
	pte_t * pte;

	if (page_count(page) != 1)
		printk("mem_map disagrees with %p at %08lx\n", page, address);
	pgd = pgd_offset(tsk->mm, address);
	pmd = pmd_alloc(pgd, address);
	if (!pmd) {
		__free_page(page);
		force_sig(SIGKILL, tsk);
		return;
	}
	pte = pte_alloc(pmd, address);
	if (!pte) {
		__free_page(page);
		force_sig(SIGKILL, tsk);
		return;
	}
	if (!pte_none(*pte)) {
		__free_page(page);
		return;
	}
	flush_page_to_ram(page);
	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
	/* no need for flush_tlb */
}
int setup_arg_pages(struct linux_binprm *bprm)
{
	unsigned long stack_base;
	struct vm_area_struct *mpnt;
	int i;

	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;

	bprm->p += stack_base;
	if (bprm->loader)
		bprm->loader += stack_base;
	bprm->exec += stack_base;

	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!mpnt)
		return -ENOMEM;

	down(&current->mm->mmap_sem);
	{
		mpnt->vm_mm = current->mm;
		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
		mpnt->vm_end = STACK_TOP;
		mpnt->vm_page_prot = PAGE_COPY;
		mpnt->vm_flags = VM_STACK_FLAGS;
		mpnt->vm_ops = NULL;
		mpnt->vm_pgoff = 0;
		mpnt->vm_file = NULL;
		mpnt->vm_private_data = (void *) 0;
		vmlist_modify_lock(current->mm);
		insert_vm_struct(current->mm, mpnt);
		vmlist_modify_unlock(current->mm);
		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
	}

	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
		struct page *page = bprm->page[i];
		if (page) {
			bprm->page[i] = NULL;
			current->mm->rss++;
			put_dirty_page(current, page, stack_base);
		}
		stack_base += PAGE_SIZE;
	}
	up(&current->mm->mmap_sem);

	return 0;
}
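/*
 * Worked example (a sketch, assuming i386 with STACK_TOP == 0xC0000000,
 * MAX_ARG_PAGES == 32 and PAGE_SIZE == 4096): stack_base becomes
 * 0xC0000000 - 0x20000 == 0xBFFE0000, so bprm->p, until now an offset
 * inside the 128KB argument area, is rebased to a real user address
 * just below 0xC0000000, and the VMA inserted above spans
 * [bprm->p & PAGE_MASK, 0xC0000000).
 */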
struct file *open_exec(const char *name)
{
	struct nameidata nd;
	struct inode *inode;
	struct file *file;
	int err = 0;

	if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
		err = path_walk(name, &nd);
	file = ERR_PTR(err);
	if (!err) {
		inode = nd.dentry->d_inode;
		file = ERR_PTR(-EACCES);
		if (!IS_NOEXEC(inode) && S_ISREG(inode->i_mode)) {
			int err = permission(inode, MAY_EXEC);
			file = ERR_PTR(err);
			if (!err) {
				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
				if (!IS_ERR(file)) {
					err = deny_write_access(file);
					if (err) {
						fput(file);
						file = ERR_PTR(err);
					}
				}
out:
				return file;
			}
		}
		path_release(&nd);
	}
	goto out;
}
int kernel_read(struct file *file, unsigned long offset,
	char * addr, unsigned long count)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	int result = -ENOSYS;

	if (!file->f_op->read)
		goto fail;
	old_fs = get_fs();
	set_fs(get_ds());
	result = file->f_op->read(file, addr, count, &pos);
	set_fs(old_fs);
fail:
	return result;
}
static int exec_mmap(void)
{
	struct mm_struct * mm, * old_mm;

	old_mm = current->mm;
	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
		flush_cache_mm(old_mm);
		mm_release();
		exit_mmap(old_mm);
		flush_tlb_mm(old_mm);
		return 0;
	}

	mm = mm_alloc();
	if (mm) {
		struct mm_struct *active_mm = current->active_mm;

		if (init_new_context(current, mm)) {
			mmdrop(mm);
			return -ENOMEM;
		}
		task_lock(current);
		current->mm = mm;
		current->active_mm = mm;
		task_unlock(current);
		activate_mm(active_mm, mm);
		mm_release();
		if (old_mm) {
			if (active_mm != old_mm) BUG();
			mmput(old_mm);
			return 0;
		}
		mmdrop(active_mm);
		return 0;
	}
	return -ENOMEM;
}
/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGNAL option to clone().)
 */
static inline int make_private_signals(void)
{
	struct signal_struct * newsig;

	if (atomic_read(&current->sig->count) <= 1)
		return 0;
	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
	if (newsig == NULL)
		return -ENOMEM;
	spin_lock_init(&newsig->siglock);
	atomic_set(&newsig->count, 1);
	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
	spin_lock_irq(&current->sigmask_lock);
	current->sig = newsig;
	spin_unlock_irq(&current->sigmask_lock);
	return 0;
}
/*
 * If make_private_signals() made a copy of the signal table, decrement the
 * refcount of the original table, and free it if necessary.
 * We don't do that in make_private_signals() so that we can back off
 * in flush_old_exec() if an error occurs after calling make_private_signals().
 */
static inline void release_old_signals(struct signal_struct * oldsig)
{
	if (current->sig == oldsig)
		return;
	if (atomic_dec_and_test(&oldsig->count))
		kmem_cache_free(sigact_cachep, oldsig);
}
/*
 * This function flushes out all traces of the currently running executable
 * so that a new one can be started
 */
static inline void flush_old_files(struct files_struct * files)
{
	long j = -1;

	write_lock(&files->file_lock);
	for (;;) {
		unsigned long set, i;

		j++;
		i = j * __NFDBITS;
		if (i >= files->max_fds || i >= files->max_fdset)
			break;
		set = files->close_on_exec->fds_bits[j];
		if (!set)
			continue;
		files->close_on_exec->fds_bits[j] = 0;
		write_unlock(&files->file_lock);
		for ( ; set ; i++, set >>= 1) {
			if (set & 1)
				sys_close(i);
		}
		write_lock(&files->file_lock);
	}
	write_unlock(&files->file_lock);
}
/*
 * An execve() will automatically "de-thread" the process.
 * Note: we don't have to hold the tasklist_lock to test
 * whether we might need to do this. If we're not part of
 * a thread group, there is no way we can become one
 * dynamically. And if we are, we only need to protect the
 * unlink - even if we race with the last other thread exit,
 * at worst the list_del_init() might end up being a no-op.
 */
static inline void de_thread(struct task_struct *tsk)
{
	if (!list_empty(&tsk->thread_group)) {
		write_lock_irq(&tasklist_lock);
		list_del_init(&tsk->thread_group);
		write_unlock_irq(&tasklist_lock);
	}

	/* Minor oddity: this might stay the same. */
	tsk->tgid = tsk->pid;
}
int flush_old_exec(struct linux_binprm * bprm)
{
	char * name;
	int i, ch, retval;
	struct signal_struct * oldsig;

	/*
	 * Make sure we have a private signal table
	 */
	oldsig = current->sig;
	retval = make_private_signals();
	if (retval) goto flush_failed;

	/*
	 * Release all of the old mmap stuff
	 */
	retval = exec_mmap();
	if (retval) goto mmap_failed;

	/* This is the point of no return */
	release_old_signals(oldsig);

	current->sas_ss_sp = current->sas_ss_size = 0;

	if (current->euid == current->uid && current->egid == current->gid)
		current->dumpable = 1;
	name = bprm->filename;
	for (i=0; (ch = *(name++)) != '\0';) {
		if (ch == '/')
			i = 0;
		else
			if (i < 15)
				current->comm[i++] = ch;
	}
	current->comm[i] = '\0';

	flush_thread();

	de_thread(current);

	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
	    permission(bprm->file->f_dentry->d_inode, MAY_READ))
		current->dumpable = 0;

	/* An exec changes our domain. We are no longer part of the thread
	   group */

	current->self_exec_id++;

	flush_signal_handlers(current);
	flush_old_files(current->files);

	return 0;

mmap_failed:
flush_failed:
	spin_lock_irq(&current->sigmask_lock);
	if (current->sig != oldsig)
		current->sig = oldsig;
	spin_unlock_irq(&current->sigmask_lock);
	return retval;
}
/*
 * We mustn't allow tracing of suid binaries, unless
 * the tracer has the capability to trace anything..
 */
static inline int must_not_trace_exec(struct task_struct * p)
{
	return (p->ptrace & PT_PTRACED) && !cap_raised(p->p_pptr->cap_effective, CAP_SYS_PTRACE);
}
/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
 */
int prepare_binprm(struct linux_binprm *bprm)
{
	int mode;
	int id_change, cap_raised;
	struct inode * inode = bprm->file->f_dentry->d_inode;

	mode = inode->i_mode;
	/* Huh? We had already checked for MAY_EXEC, WTF do we check this? */
	if (!(mode & 0111))	/* with at least _one_ execute bit set */
		return -EACCES;
	if (bprm->file->f_op == NULL)
		return -EACCES;

	bprm->e_uid = current->euid;
	bprm->e_gid = current->egid;
	id_change = cap_raised = 0;

	/* Set-uid? */
	if (mode & S_ISUID) {
		bprm->e_uid = inode->i_uid;
		if (bprm->e_uid != current->euid)
			id_change = 1;
	}

	/* Set-gid? */
	/*
	 * If setgid is set but no group execute bit then this
	 * is a candidate for mandatory locking, not a setgid
	 * executable.
	 */
	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
		bprm->e_gid = inode->i_gid;
		if (!in_group_p(bprm->e_gid))
			id_change = 1;
	}

	/* We don't have VFS support for capabilities yet */
	cap_clear(bprm->cap_inheritable);
	cap_clear(bprm->cap_permitted);
	cap_clear(bprm->cap_effective);

	/* To support inheritance of root-permissions and suid-root
	 * executables under compatibility mode, we raise all three
	 * capability sets for the file.
	 *
	 * If only the real uid is 0, we only raise the inheritable
	 * and permitted sets of the executable file.
	 */
	if (!issecure(SECURE_NOROOT)) {
		if (bprm->e_uid == 0 || current->uid == 0) {
			cap_set_full(bprm->cap_inheritable);
			cap_set_full(bprm->cap_permitted);
		}
		if (bprm->e_uid == 0)
			cap_set_full(bprm->cap_effective);
	}

	/* Only if pP' is _not_ a subset of pP, do we consider there
	 * has been a capability related "change of capability".  In
	 * such cases, we need to check that the elevation of
	 * privilege does not go against other system constraints.
	 * The new Permitted set is defined below -- see (***). */
	{
		kernel_cap_t permitted, working;

		permitted = cap_intersect(bprm->cap_permitted, cap_bset);
		working = cap_intersect(bprm->cap_inheritable,
					current->cap_inheritable);
		working = cap_combine(permitted, working);
		if (!cap_issubset(working, current->cap_permitted))
			cap_raised = 1;
	}

	if (id_change || cap_raised) {
		/* We can't suid-execute if we're sharing parts of the executable */
		/* or if we're being traced (or if suid execs are not allowed)    */
		/* (current->mm->mm_users > 1 is ok, as we'll get a new mm anyway) */
		if (IS_NOSUID(inode)
		    || must_not_trace_exec(current)
		    || (atomic_read(&current->fs->count) > 1)
		    || (atomic_read(&current->sig->count) > 1)
		    || (atomic_read(&current->files->count) > 1)) {
			if (id_change && !capable(CAP_SETUID))
				return -EPERM;
			if (cap_raised && !capable(CAP_SETPCAP))
				return -EPERM;
		}
	}

	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}
/*
 * This function is used to produce the new IDs and capabilities
 * from the old ones and the file's capabilities.
 *
 * The formula used for evolving capabilities is:
 *
 *        pI' = pI
 * (***)  pP' = (fP & X) | (fI & pI)
 *        pE' = pP' & fE          [NB. fE is 0 or ~0]
 *
 * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
 * ' indicates post-exec(), and X is the global 'cap_bset'.
 */
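/*
 * Worked example (a sketch): for a setuid-root file on a system
 * without SECURE_NOROOT, prepare_binprm() above sets fP = fI = full
 * and fE = ~0, so the formula reduces to pP' = X | pI and pE' = pP';
 * the process leaves exec() with every capability allowed by the
 * global bounding set.  For a plain binary, fP = fI = fE = 0, so
 * pP' = 0 and pE' = 0.
 */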
void compute_creds(struct linux_binprm *bprm)
{
	kernel_cap_t new_permitted, working;

	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
	working = cap_intersect(bprm->cap_inheritable,
				current->cap_inheritable);
	new_permitted = cap_combine(new_permitted, working);

	/* For init, we want to retain the capabilities set
	 * in the init_task struct. Thus we skip the usual
	 * capability rules */
	if (current->pid != 1) {
		current->cap_permitted = new_permitted;
		current->cap_effective =
			cap_intersect(new_permitted, bprm->cap_effective);
	}

	/* AUD: Audit candidate if current->cap_effective is set */

	current->suid = current->euid = current->fsuid = bprm->e_uid;
	current->sgid = current->egid = current->fsgid = bprm->e_gid;
	if (current->euid != current->uid || current->egid != current->gid ||
	    !cap_issubset(new_permitted, current->cap_permitted))
		current->dumpable = 0;

	current->keep_capabilities = 0;
}
void remove_arg_zero(struct linux_binprm *bprm)
{
	if (bprm->argc) {
		unsigned long offset;
		char * kaddr;
		struct page *page;

		offset = bprm->p % PAGE_SIZE;
		goto inside;

		while (bprm->p++, *(kaddr+offset++)) {
			if (offset != PAGE_SIZE)
				continue;
			offset = 0;
			kunmap(page);
inside:
			page = bprm->page[bprm->p/PAGE_SIZE];
			kaddr = (char *)kmap(page);
		}
		kunmap(page);
		bprm->argc--;
	}
}
/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
int search_binary_handler(struct linux_binprm *bprm, struct pt_regs *regs)
{
	int try, retval = 0;
	struct linux_binfmt *fmt;
#ifdef __alpha__
	/* handle /sbin/loader.. */
	{
	    struct exec * eh = (struct exec *) bprm->buf;
	    struct linux_binprm bprm_loader;

	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
		(eh->fh.f_flags & 0x3000) == 0x3000)
	    {
		int i;
		char * dynloader[] = { "/sbin/loader" };
		struct file * file;

		allow_write_access(bprm->file);
		fput(bprm->file);
		bprm->file = NULL;

		bprm_loader.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
		for (i = 0 ; i < MAX_ARG_PAGES ; i++)	/* clear page-table */
			bprm_loader.page[i] = NULL;

		file = open_exec(dynloader[0]);
		retval = PTR_ERR(file);
		if (IS_ERR(file))
			return retval;
		bprm->file = file;
		bprm->loader = bprm_loader.p;
		retval = prepare_binprm(bprm);
		if (retval < 0)
			return retval;
		/* should call search_binary_handler recursively here,
		   but it does not matter */
	    }
	}
#endif
	for (try=0; try<2; try++) {
		read_lock(&binfmt_lock);
		for (fmt = formats ; fmt ; fmt = fmt->next) {
			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
			if (!fn)
				continue;
			if (!try_inc_mod_count(fmt->module))
				continue;
			read_unlock(&binfmt_lock);
			retval = fn(bprm, regs);
			if (retval >= 0) {
				put_binfmt(fmt);
				allow_write_access(bprm->file);
				if (bprm->file)
					fput(bprm->file);
				bprm->file = NULL;
				current->did_exec = 1;
				return retval;
			}
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (retval != -ENOEXEC)
				break;
			if (!bprm->file) {
				read_unlock(&binfmt_lock);
				return retval;
			}
		}
		read_unlock(&binfmt_lock);
		if (retval != -ENOEXEC) {
			break;
#ifdef CONFIG_KMOD
		} else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
			char modname[20];
			if (printable(bprm->buf[0]) &&
			    printable(bprm->buf[1]) &&
			    printable(bprm->buf[2]) &&
			    printable(bprm->buf[3]))
				break; /* -ENOEXEC */
			sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
			request_module(modname);
#endif
		}
	}
	return retval;
}
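/*
 * Example (a sketch, assuming a little-endian machine): an ELF image
 * starts with the four bytes 0x7f 'E' 'L' 'F', which fail the
 * printable() test above, so bytes 2 and 3 ('L' == 0x4c, 'F' == 0x46)
 * are read as the unsigned short 0x464c and the retry loop asks kmod
 * for "binfmt-464c" before walking the format list again.
 */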
/*
 * sys_execve() executes a new program.
 */
int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
{
	struct linux_binprm bprm;
	struct file *file;
	int retval;
	int i;

	file = open_exec(filename);

	retval = PTR_ERR(file);
	if (IS_ERR(file))
		return retval;

	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));

	bprm.file = file;
	bprm.filename = filename;
	bprm.sh_bang = 0;
	bprm.loader = 0;
	bprm.exec = 0;
	if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
		allow_write_access(file);
		fput(file);
		return bprm.argc;
	}

	if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
		allow_write_access(file);
		fput(file);
		return bprm.envc;
	}

	retval = prepare_binprm(&bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings_kernel(1, &bprm.filename, &bprm);
	if (retval < 0)
		goto out;

	bprm.exec = bprm.p;
	retval = copy_strings(bprm.envc, envp, &bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm.argc, argv, &bprm);
	if (retval < 0)
		goto out;

	retval = search_binary_handler(&bprm, regs);
	if (retval >= 0)
		/* execve success */
		return retval;

out:
	/* Something went wrong, return the inode and free the argument pages */
	allow_write_access(bprm.file);
	if (bprm.file)
		fput(bprm.file);

	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
		struct page * page = bprm.page[i];
		if (page)
			__free_page(page);
	}

	return retval;
}
void set_binfmt(struct linux_binfmt *new)
{
	struct linux_binfmt *old = current->binfmt;

	if (new && new->module)
		__MOD_INC_USE_COUNT(new->module);
	current->binfmt = new;
	if (old && old->module)
		__MOD_DEC_USE_COUNT(old->module);
}
int do_coredump(long signr, struct pt_regs * regs)
{
	struct linux_binfmt * binfmt;
	char corename[6+sizeof(current->comm)];
	struct file * file;
	struct inode * inode;

	lock_kernel();
	binfmt = current->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!current->dumpable || atomic_read(&current->mm->mm_users) != 1)
		goto fail;
	current->dumpable = 0;
	if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
		goto fail;

	memcpy(corename, "core.", 5);
	memcpy(corename+5, current->comm, sizeof(current->comm));

	file = filp_open(corename, O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600);
	if (IS_ERR(file))
		goto fail;
	inode = file->f_dentry->d_inode;
	if (inode->i_nlink > 1)
		goto close_fail;	/* multiple links - don't dump */

	if (!S_ISREG(inode->i_mode))
		goto close_fail;
	if (!file->f_op)
		goto close_fail;
	if (!file->f_op->write)
		goto close_fail;
	if (!binfmt->core_dump(signr, regs, file))
		goto close_fail;
	unlock_kernel();
	filp_close(file, NULL);
	return 1;

close_fail:
	filp_close(file, NULL);
fail:
	unlock_kernel();
	return 0;
}