2 * hugetlbpage-backed filesystem. Based on ramfs.
6 * Copyright (C) 2002 Linus Torvalds.
9 #include <linux/module.h>
10 #include <linux/thread_info.h>
11 #include <asm/current.h>
12 #include <linux/sched.h> /* remove ASAP */
14 #include <linux/mount.h>
15 #include <linux/file.h>
16 #include <linux/writeback.h>
17 #include <linux/pagemap.h>
18 #include <linux/highmem.h>
19 #include <linux/init.h>
20 #include <linux/string.h>
21 #include <linux/backing-dev.h>
22 #include <linux/hugetlb.h>
23 #include <linux/pagevec.h>
24 #include <linux/quotaops.h>
25 #include <linux/slab.h>
26 #include <linux/dnotify.h>
27 #include <linux/statfs.h>
28 #include <linux/security.h>
30 #include <asm/uaccess.h>
32 /* some random number */
33 #define HUGETLBFS_MAGIC 0x958458f6
35 static struct super_operations hugetlbfs_ops
;
36 static struct address_space_operations hugetlbfs_aops
;
37 struct file_operations hugetlbfs_file_operations
;
38 static struct inode_operations hugetlbfs_dir_inode_operations
;
39 static struct inode_operations hugetlbfs_inode_operations
;
41 static struct backing_dev_info hugetlbfs_backing_dev_info
= {
42 .ra_pages
= 0, /* No readahead */
43 .memory_backed
= 1, /* Does not contribute to dirty memory */
46 int sysctl_hugetlb_shm_group
;
48 static int hugetlbfs_file_mmap(struct file
*file
, struct vm_area_struct
*vma
)
50 struct inode
*inode
= file
->f_dentry
->d_inode
;
51 struct address_space
*mapping
= inode
->i_mapping
;
55 if ((vma
->vm_flags
& (VM_MAYSHARE
| VM_WRITE
)) == VM_WRITE
)
58 if (vma
->vm_pgoff
& (HPAGE_SIZE
/ PAGE_SIZE
- 1))
61 if (vma
->vm_start
& ~HPAGE_MASK
)
64 if (vma
->vm_end
& ~HPAGE_MASK
)
67 if (vma
->vm_end
- vma
->vm_start
< HPAGE_SIZE
)
70 vma_len
= (loff_t
)(vma
->vm_end
- vma
->vm_start
);
74 vma
->vm_flags
|= VM_HUGETLB
| VM_RESERVED
;
75 vma
->vm_ops
= &hugetlb_vm_ops
;
78 len
= vma_len
+ ((loff_t
)vma
->vm_pgoff
<< PAGE_SHIFT
);
79 if (!(vma
->vm_flags
& VM_WRITE
) && len
> inode
->i_size
)
82 ret
= hugetlb_prefault(mapping
, vma
);
86 if (inode
->i_size
< len
)
95 * Called under down_write(mmap_sem), page_table_lock is not held
98 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
99 unsigned long hugetlb_get_unmapped_area(struct file
*file
, unsigned long addr
,
100 unsigned long len
, unsigned long pgoff
, unsigned long flags
);
103 hugetlb_get_unmapped_area(struct file
*file
, unsigned long addr
,
104 unsigned long len
, unsigned long pgoff
, unsigned long flags
)
106 struct mm_struct
*mm
= current
->mm
;
107 struct vm_area_struct
*vma
;
109 if (len
& ~HPAGE_MASK
)
115 addr
= ALIGN(addr
, HPAGE_SIZE
);
116 vma
= find_vma(mm
, addr
);
117 if (TASK_SIZE
- len
>= addr
&&
118 (!vma
|| addr
+ len
<= vma
->vm_start
))
122 addr
= ALIGN(mm
->free_area_cache
, HPAGE_SIZE
);
124 for (vma
= find_vma(mm
, addr
); ; vma
= vma
->vm_next
) {
125 /* At this point: (!vma || addr < vma->vm_end). */
126 if (TASK_SIZE
- len
< addr
)
128 if (!vma
|| addr
+ len
<= vma
->vm_start
)
130 addr
= ALIGN(vma
->vm_end
, HPAGE_SIZE
);
136 * Read a page. Again trivial. If it didn't already exist
137 * in the page cache, it is zero-filled.
139 static int hugetlbfs_readpage(struct file
*file
, struct page
* page
)
145 static int hugetlbfs_prepare_write(struct file
*file
,
146 struct page
*page
, unsigned offset
, unsigned to
)
151 static int hugetlbfs_commit_write(struct file
*file
,
152 struct page
*page
, unsigned offset
, unsigned to
)
157 void huge_pagevec_release(struct pagevec
*pvec
)
161 for (i
= 0; i
< pagevec_count(pvec
); ++i
)
162 put_page(pvec
->pages
[i
]);
164 pagevec_reinit(pvec
);
167 void truncate_huge_page(struct page
*page
)
169 clear_page_dirty(page
);
170 ClearPageUptodate(page
);
171 remove_from_page_cache(page
);
175 void truncate_hugepages(struct address_space
*mapping
, loff_t lstart
)
177 const pgoff_t start
= lstart
>> HPAGE_SHIFT
;
182 pagevec_init(&pvec
, 0);
185 if (!pagevec_lookup(&pvec
, mapping
, next
, PAGEVEC_SIZE
)) {
192 for (i
= 0; i
< pagevec_count(&pvec
); ++i
) {
193 struct page
*page
= pvec
.pages
[i
];
196 if (page
->index
> next
)
199 truncate_huge_page(page
);
201 hugetlb_put_quota(mapping
);
203 huge_pagevec_release(&pvec
);
205 BUG_ON(!lstart
&& mapping
->nrpages
);
208 static void hugetlbfs_delete_inode(struct inode
*inode
)
210 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(inode
->i_sb
);
212 hlist_del_init(&inode
->i_hash
);
213 list_del_init(&inode
->i_list
);
214 inode
->i_state
|= I_FREEING
;
215 inodes_stat
.nr_inodes
--;
216 spin_unlock(&inode_lock
);
218 if (inode
->i_data
.nrpages
)
219 truncate_hugepages(&inode
->i_data
, 0);
221 security_inode_delete(inode
);
223 if (sbinfo
->free_inodes
>= 0) {
224 spin_lock(&sbinfo
->stat_lock
);
225 sbinfo
->free_inodes
++;
226 spin_unlock(&sbinfo
->stat_lock
);
230 destroy_inode(inode
);
233 static void hugetlbfs_forget_inode(struct inode
*inode
)
235 struct super_block
*super_block
= inode
->i_sb
;
236 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(super_block
);
238 if (hlist_unhashed(&inode
->i_hash
))
241 if (!(inode
->i_state
& (I_DIRTY
|I_LOCK
))) {
242 list_del(&inode
->i_list
);
243 list_add(&inode
->i_list
, &inode_unused
);
245 inodes_stat
.nr_unused
++;
246 if (!super_block
|| (super_block
->s_flags
& MS_ACTIVE
)) {
247 spin_unlock(&inode_lock
);
251 /* write_inode_now() ? */
252 inodes_stat
.nr_unused
--;
253 hlist_del_init(&inode
->i_hash
);
255 list_del_init(&inode
->i_list
);
256 inode
->i_state
|= I_FREEING
;
257 inodes_stat
.nr_inodes
--;
258 spin_unlock(&inode_lock
);
259 if (inode
->i_data
.nrpages
)
260 truncate_hugepages(&inode
->i_data
, 0);
262 if (sbinfo
->free_inodes
>= 0) {
263 spin_lock(&sbinfo
->stat_lock
);
264 sbinfo
->free_inodes
++;
265 spin_unlock(&sbinfo
->stat_lock
);
269 destroy_inode(inode
);
272 static void hugetlbfs_drop_inode(struct inode
*inode
)
275 hugetlbfs_delete_inode(inode
);
277 hugetlbfs_forget_inode(inode
);
281 * h_pgoff is in HPAGE_SIZE units.
282 * vma->vm_pgoff is in PAGE_SIZE units.
285 hugetlb_vmtruncate_list(struct prio_tree_root
*root
, unsigned long h_pgoff
)
287 struct vm_area_struct
*vma
;
288 struct prio_tree_iter iter
;
290 vma_prio_tree_foreach(vma
, &iter
, root
, h_pgoff
, ULONG_MAX
) {
291 unsigned long h_vm_pgoff
;
292 unsigned long v_length
;
293 unsigned long v_offset
;
295 h_vm_pgoff
= vma
->vm_pgoff
>> (HPAGE_SHIFT
- PAGE_SHIFT
);
296 v_offset
= (h_pgoff
- h_vm_pgoff
) << HPAGE_SHIFT
;
298 * Is this VMA fully outside the truncation point?
300 if (h_vm_pgoff
>= h_pgoff
)
303 v_length
= vma
->vm_end
- vma
->vm_start
;
305 zap_hugepage_range(vma
,
306 vma
->vm_start
+ v_offset
,
307 v_length
- v_offset
);
312 * Expanding truncates are not allowed.
314 static int hugetlb_vmtruncate(struct inode
*inode
, loff_t offset
)
317 struct address_space
*mapping
= inode
->i_mapping
;
319 if (offset
> inode
->i_size
)
322 BUG_ON(offset
& ~HPAGE_MASK
);
323 pgoff
= offset
>> HPAGE_SHIFT
;
325 inode
->i_size
= offset
;
326 spin_lock(&mapping
->i_mmap_lock
);
327 if (!prio_tree_empty(&mapping
->i_mmap
))
328 hugetlb_vmtruncate_list(&mapping
->i_mmap
, pgoff
);
329 spin_unlock(&mapping
->i_mmap_lock
);
330 truncate_hugepages(mapping
, offset
);
334 static int hugetlbfs_setattr(struct dentry
*dentry
, struct iattr
*attr
)
336 struct inode
*inode
= dentry
->d_inode
;
338 unsigned int ia_valid
= attr
->ia_valid
;
342 error
= inode_change_ok(inode
, attr
);
346 if (ia_valid
& ATTR_SIZE
) {
348 if (!(attr
->ia_size
& ~HPAGE_MASK
))
349 error
= hugetlb_vmtruncate(inode
, attr
->ia_size
);
352 attr
->ia_valid
&= ~ATTR_SIZE
;
354 error
= inode_setattr(inode
, attr
);
359 static struct inode
*hugetlbfs_get_inode(struct super_block
*sb
, uid_t uid
,
360 gid_t gid
, int mode
, dev_t dev
)
363 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(sb
);
365 if (sbinfo
->free_inodes
>= 0) {
366 spin_lock(&sbinfo
->stat_lock
);
367 if (!sbinfo
->free_inodes
) {
368 spin_unlock(&sbinfo
->stat_lock
);
371 sbinfo
->free_inodes
--;
372 spin_unlock(&sbinfo
->stat_lock
);
375 inode
= new_inode(sb
);
377 struct hugetlbfs_inode_info
*info
;
378 inode
->i_mode
= mode
;
381 inode
->i_blksize
= HPAGE_SIZE
;
383 inode
->i_mapping
->a_ops
= &hugetlbfs_aops
;
384 inode
->i_mapping
->backing_dev_info
=&hugetlbfs_backing_dev_info
;
385 inode
->i_atime
= inode
->i_mtime
= inode
->i_ctime
= CURRENT_TIME
;
386 info
= HUGETLBFS_I(inode
);
387 mpol_shared_policy_init(&info
->policy
);
388 switch (mode
& S_IFMT
) {
390 init_special_inode(inode
, mode
, dev
);
393 inode
->i_op
= &hugetlbfs_inode_operations
;
394 inode
->i_fop
= &hugetlbfs_file_operations
;
397 inode
->i_op
= &hugetlbfs_dir_inode_operations
;
398 inode
->i_fop
= &simple_dir_operations
;
400 /* directory inodes start off with i_nlink == 2 (for "." entry) */
404 inode
->i_op
= &page_symlink_inode_operations
;
412 * File creation. Allocate an inode, and we're done..
414 static int hugetlbfs_mknod(struct inode
*dir
,
415 struct dentry
*dentry
, int mode
, dev_t dev
)
421 if (dir
->i_mode
& S_ISGID
) {
426 gid
= current
->fsgid
;
428 inode
= hugetlbfs_get_inode(dir
->i_sb
, current
->fsuid
, gid
, mode
, dev
);
430 dir
->i_ctime
= dir
->i_mtime
= CURRENT_TIME
;
431 d_instantiate(dentry
, inode
);
432 dget(dentry
); /* Extra count - pin the dentry in core */
438 static int hugetlbfs_mkdir(struct inode
*dir
, struct dentry
*dentry
, int mode
)
440 int retval
= hugetlbfs_mknod(dir
, dentry
, mode
| S_IFDIR
, 0);
446 static int hugetlbfs_create(struct inode
*dir
, struct dentry
*dentry
, int mode
, struct nameidata
*nd
)
448 return hugetlbfs_mknod(dir
, dentry
, mode
| S_IFREG
, 0);
451 static int hugetlbfs_symlink(struct inode
*dir
,
452 struct dentry
*dentry
, const char *symname
)
458 if (dir
->i_mode
& S_ISGID
)
461 gid
= current
->fsgid
;
463 inode
= hugetlbfs_get_inode(dir
->i_sb
, current
->fsuid
,
464 gid
, S_IFLNK
|S_IRWXUGO
, 0);
466 int l
= strlen(symname
)+1;
467 error
= page_symlink(inode
, symname
, l
);
469 d_instantiate(dentry
, inode
);
474 dir
->i_ctime
= dir
->i_mtime
= CURRENT_TIME
;
480 * For direct-IO reads into hugetlb pages
482 int hugetlbfs_set_page_dirty(struct page
*page
)
487 static int hugetlbfs_statfs(struct super_block
*sb
, struct kstatfs
*buf
)
489 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(sb
);
491 buf
->f_type
= HUGETLBFS_MAGIC
;
492 buf
->f_bsize
= HPAGE_SIZE
;
494 spin_lock(&sbinfo
->stat_lock
);
495 buf
->f_blocks
= sbinfo
->max_blocks
;
496 buf
->f_bavail
= buf
->f_bfree
= sbinfo
->free_blocks
;
497 buf
->f_files
= sbinfo
->max_inodes
;
498 buf
->f_ffree
= sbinfo
->free_inodes
;
499 spin_unlock(&sbinfo
->stat_lock
);
501 buf
->f_namelen
= NAME_MAX
;
505 static void hugetlbfs_put_super(struct super_block
*sb
)
507 struct hugetlbfs_sb_info
*sbi
= HUGETLBFS_SB(sb
);
510 sb
->s_fs_info
= NULL
;
515 static kmem_cache_t
*hugetlbfs_inode_cachep
;
517 static struct inode
*hugetlbfs_alloc_inode(struct super_block
*sb
)
519 struct hugetlbfs_inode_info
*p
;
521 p
= kmem_cache_alloc(hugetlbfs_inode_cachep
, SLAB_KERNEL
);
524 return &p
->vfs_inode
;
527 static void init_once(void *foo
, kmem_cache_t
*cachep
, unsigned long flags
)
529 struct hugetlbfs_inode_info
*ei
= (struct hugetlbfs_inode_info
*)foo
;
531 if ((flags
& (SLAB_CTOR_VERIFY
|SLAB_CTOR_CONSTRUCTOR
)) ==
532 SLAB_CTOR_CONSTRUCTOR
)
533 inode_init_once(&ei
->vfs_inode
);
536 static void hugetlbfs_destroy_inode(struct inode
*inode
)
538 mpol_free_shared_policy(&HUGETLBFS_I(inode
)->policy
);
539 kmem_cache_free(hugetlbfs_inode_cachep
, HUGETLBFS_I(inode
));
542 static struct address_space_operations hugetlbfs_aops
= {
543 .readpage
= hugetlbfs_readpage
,
544 .prepare_write
= hugetlbfs_prepare_write
,
545 .commit_write
= hugetlbfs_commit_write
,
546 .set_page_dirty
= hugetlbfs_set_page_dirty
,
549 struct file_operations hugetlbfs_file_operations
= {
550 .mmap
= hugetlbfs_file_mmap
,
551 .fsync
= simple_sync_file
,
552 .get_unmapped_area
= hugetlb_get_unmapped_area
,
555 static struct inode_operations hugetlbfs_dir_inode_operations
= {
556 .create
= hugetlbfs_create
,
557 .lookup
= simple_lookup
,
559 .unlink
= simple_unlink
,
560 .symlink
= hugetlbfs_symlink
,
561 .mkdir
= hugetlbfs_mkdir
,
562 .rmdir
= simple_rmdir
,
563 .mknod
= hugetlbfs_mknod
,
564 .rename
= simple_rename
,
565 .setattr
= hugetlbfs_setattr
,
568 static struct inode_operations hugetlbfs_inode_operations
= {
569 .setattr
= hugetlbfs_setattr
,
572 static struct super_operations hugetlbfs_ops
= {
573 .alloc_inode
= hugetlbfs_alloc_inode
,
574 .destroy_inode
= hugetlbfs_destroy_inode
,
575 .statfs
= hugetlbfs_statfs
,
576 .drop_inode
= hugetlbfs_drop_inode
,
577 .put_super
= hugetlbfs_put_super
,
581 hugetlbfs_parse_options(char *options
, struct hugetlbfs_config
*pconfig
)
583 char *opt
, *value
, *rest
;
587 while ((opt
= strsep(&options
, ",")) != NULL
) {
591 value
= strchr(opt
, '=');
592 if (!value
|| !*value
)
597 if (!strcmp(opt
, "uid"))
598 pconfig
->uid
= simple_strtoul(value
, &value
, 0);
599 else if (!strcmp(opt
, "gid"))
600 pconfig
->gid
= simple_strtoul(value
, &value
, 0);
601 else if (!strcmp(opt
, "mode"))
602 pconfig
->mode
= simple_strtoul(value
,&value
,0) & 0777U;
603 else if (!strcmp(opt
, "size")) {
604 unsigned long long size
= memparse(value
, &rest
);
606 size
<<= HPAGE_SHIFT
;
607 size
*= max_huge_pages
;
612 pconfig
->nr_blocks
= (size
>> HPAGE_SHIFT
);
614 } else if (!strcmp(opt
,"nr_inodes")) {
615 pconfig
->nr_inodes
= memparse(value
, &rest
);
627 hugetlbfs_fill_super(struct super_block
*sb
, void *data
, int silent
)
629 struct inode
* inode
;
630 struct dentry
* root
;
632 struct hugetlbfs_config config
;
633 struct hugetlbfs_sb_info
*sbinfo
;
635 config
.nr_blocks
= -1; /* No limit on size by default */
636 config
.nr_inodes
= -1; /* No limit on number of inodes by default */
637 config
.uid
= current
->fsuid
;
638 config
.gid
= current
->fsgid
;
640 ret
= hugetlbfs_parse_options(data
, &config
);
645 sbinfo
= kmalloc(sizeof(struct hugetlbfs_sb_info
), GFP_KERNEL
);
648 sb
->s_fs_info
= sbinfo
;
649 spin_lock_init(&sbinfo
->stat_lock
);
650 sbinfo
->max_blocks
= config
.nr_blocks
;
651 sbinfo
->free_blocks
= config
.nr_blocks
;
652 sbinfo
->max_inodes
= config
.nr_inodes
;
653 sbinfo
->free_inodes
= config
.nr_inodes
;
654 sb
->s_maxbytes
= MAX_LFS_FILESIZE
;
655 sb
->s_blocksize
= HPAGE_SIZE
;
656 sb
->s_blocksize_bits
= HPAGE_SHIFT
;
657 sb
->s_magic
= HUGETLBFS_MAGIC
;
658 sb
->s_op
= &hugetlbfs_ops
;
659 inode
= hugetlbfs_get_inode(sb
, config
.uid
, config
.gid
,
660 S_IFDIR
| config
.mode
, 0);
664 root
= d_alloc_root(inode
);
676 int hugetlb_get_quota(struct address_space
*mapping
)
679 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(mapping
->host
->i_sb
);
681 if (sbinfo
->free_blocks
> -1) {
682 spin_lock(&sbinfo
->stat_lock
);
683 if (sbinfo
->free_blocks
> 0)
684 sbinfo
->free_blocks
--;
687 spin_unlock(&sbinfo
->stat_lock
);
693 void hugetlb_put_quota(struct address_space
*mapping
)
695 struct hugetlbfs_sb_info
*sbinfo
= HUGETLBFS_SB(mapping
->host
->i_sb
);
697 if (sbinfo
->free_blocks
> -1) {
698 spin_lock(&sbinfo
->stat_lock
);
699 sbinfo
->free_blocks
++;
700 spin_unlock(&sbinfo
->stat_lock
);
704 static struct super_block
*hugetlbfs_get_sb(struct file_system_type
*fs_type
,
705 int flags
, const char *dev_name
, void *data
)
707 return get_sb_nodev(fs_type
, flags
, data
, hugetlbfs_fill_super
);
710 static struct file_system_type hugetlbfs_fs_type
= {
712 .get_sb
= hugetlbfs_get_sb
,
713 .kill_sb
= kill_litter_super
,
716 static struct vfsmount
*hugetlbfs_vfsmount
;
719 * Return the next identifier for a shm file
721 static unsigned long hugetlbfs_counter(void)
723 static spinlock_t lock
= SPIN_LOCK_UNLOCKED
;
724 static unsigned long counter
;
733 static int can_do_hugetlb_shm(void)
735 return likely(capable(CAP_IPC_LOCK
) ||
736 in_group_p(sysctl_hugetlb_shm_group
) ||
740 struct file
*hugetlb_zero_setup(size_t size
)
745 struct dentry
*dentry
, *root
;
746 struct qstr quick_string
;
749 if (!can_do_hugetlb_shm())
750 return ERR_PTR(-EPERM
);
752 if (!is_hugepage_mem_enough(size
))
753 return ERR_PTR(-ENOMEM
);
755 if (!user_shm_lock(size
, current
->user
))
756 return ERR_PTR(-ENOMEM
);
758 root
= hugetlbfs_vfsmount
->mnt_root
;
759 snprintf(buf
, 16, "%lu", hugetlbfs_counter());
760 quick_string
.name
= buf
;
761 quick_string
.len
= strlen(quick_string
.name
);
762 quick_string
.hash
= 0;
763 dentry
= d_alloc(root
, &quick_string
);
768 file
= get_empty_filp();
773 inode
= hugetlbfs_get_inode(root
->d_sb
, current
->fsuid
,
774 current
->fsgid
, S_IFREG
| S_IRWXUGO
, 0);
778 d_instantiate(dentry
, inode
);
779 inode
->i_size
= size
;
781 file
->f_vfsmnt
= mntget(hugetlbfs_vfsmount
);
782 file
->f_dentry
= dentry
;
783 file
->f_mapping
= inode
->i_mapping
;
784 file
->f_op
= &hugetlbfs_file_operations
;
785 file
->f_mode
= FMODE_WRITE
| FMODE_READ
;
793 user_shm_unlock(size
, current
->user
);
794 return ERR_PTR(error
);
797 static int __init
init_hugetlbfs_fs(void)
800 struct vfsmount
*vfsmount
;
802 hugetlbfs_inode_cachep
= kmem_cache_create("hugetlbfs_inode_cache",
803 sizeof(struct hugetlbfs_inode_info
),
804 0, 0, init_once
, NULL
);
805 if (hugetlbfs_inode_cachep
== NULL
)
808 error
= register_filesystem(&hugetlbfs_fs_type
);
812 vfsmount
= kern_mount(&hugetlbfs_fs_type
);
814 if (!IS_ERR(vfsmount
)) {
815 hugetlbfs_vfsmount
= vfsmount
;
819 error
= PTR_ERR(vfsmount
);
823 kmem_cache_destroy(hugetlbfs_inode_cachep
);
827 static void __exit
exit_hugetlbfs_fs(void)
829 kmem_cache_destroy(hugetlbfs_inode_cachep
);
830 unregister_filesystem(&hugetlbfs_fs_type
);
833 module_init(init_hugetlbfs_fs
)
834 module_exit(exit_hugetlbfs_fs
)
836 MODULE_LICENSE("GPL");