2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/nodemask.h>
13 #include <linux/pagemap.h>
15 #include <asm/pgtable.h>
17 #include <linux/hugetlb.h>
19 const unsigned long hugetlb_zero
= 0, hugetlb_infinity
= ~0UL;
20 static unsigned long nr_huge_pages
, free_huge_pages
;
21 unsigned long max_huge_pages
;
22 static struct list_head hugepage_freelists
[MAX_NUMNODES
];
23 static unsigned int nr_huge_pages_node
[MAX_NUMNODES
];
24 static unsigned int free_huge_pages_node
[MAX_NUMNODES
];
25 static DEFINE_SPINLOCK(hugetlb_lock
);
27 static void enqueue_huge_page(struct page
*page
)
29 int nid
= page_to_nid(page
);
30 list_add(&page
->lru
, &hugepage_freelists
[nid
]);
32 free_huge_pages_node
[nid
]++;
35 static struct page
*dequeue_huge_page(void)
37 int nid
= numa_node_id();
38 struct page
*page
= NULL
;
40 if (list_empty(&hugepage_freelists
[nid
])) {
41 for (nid
= 0; nid
< MAX_NUMNODES
; ++nid
)
42 if (!list_empty(&hugepage_freelists
[nid
]))
45 if (nid
>= 0 && nid
< MAX_NUMNODES
&&
46 !list_empty(&hugepage_freelists
[nid
])) {
47 page
= list_entry(hugepage_freelists
[nid
].next
,
51 free_huge_pages_node
[nid
]--;
56 static struct page
*alloc_fresh_huge_page(void)
60 page
= alloc_pages_node(nid
, GFP_HIGHUSER
|__GFP_COMP
|__GFP_NOWARN
,
62 nid
= (nid
+ 1) % num_online_nodes();
65 nr_huge_pages_node
[page_to_nid(page
)]++;
70 void free_huge_page(struct page
*page
)
72 BUG_ON(page_count(page
));
74 INIT_LIST_HEAD(&page
->lru
);
75 page
[1].mapping
= NULL
;
77 spin_lock(&hugetlb_lock
);
78 enqueue_huge_page(page
);
79 spin_unlock(&hugetlb_lock
);
82 struct page
*alloc_huge_page(void)
87 spin_lock(&hugetlb_lock
);
88 page
= dequeue_huge_page();
90 spin_unlock(&hugetlb_lock
);
93 spin_unlock(&hugetlb_lock
);
94 set_page_count(page
, 1);
95 page
[1].mapping
= (void *)free_huge_page
;
96 for (i
= 0; i
< (HPAGE_SIZE
/PAGE_SIZE
); ++i
)
97 clear_highpage(&page
[i
]);
101 static int __init
hugetlb_init(void)
106 for (i
= 0; i
< MAX_NUMNODES
; ++i
)
107 INIT_LIST_HEAD(&hugepage_freelists
[i
]);
109 for (i
= 0; i
< max_huge_pages
; ++i
) {
110 page
= alloc_fresh_huge_page();
113 spin_lock(&hugetlb_lock
);
114 enqueue_huge_page(page
);
115 spin_unlock(&hugetlb_lock
);
117 max_huge_pages
= free_huge_pages
= nr_huge_pages
= i
;
118 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages
);
121 module_init(hugetlb_init
);
123 static int __init
hugetlb_setup(char *s
)
125 if (sscanf(s
, "%lu", &max_huge_pages
) <= 0)
129 __setup("hugepages=", hugetlb_setup
);
132 static void update_and_free_page(struct page
*page
)
136 nr_huge_pages_node
[page_zone(page
)->zone_pgdat
->node_id
]--;
137 for (i
= 0; i
< (HPAGE_SIZE
/ PAGE_SIZE
); i
++) {
138 page
[i
].flags
&= ~(1 << PG_locked
| 1 << PG_error
| 1 << PG_referenced
|
139 1 << PG_dirty
| 1 << PG_active
| 1 << PG_reserved
|
140 1 << PG_private
| 1<< PG_writeback
);
141 set_page_count(&page
[i
], 0);
143 set_page_count(page
, 1);
144 __free_pages(page
, HUGETLB_PAGE_ORDER
);
147 #ifdef CONFIG_HIGHMEM
148 static void try_to_free_low(unsigned long count
)
151 for (i
= 0; i
< MAX_NUMNODES
; ++i
) {
152 struct page
*page
, *next
;
153 list_for_each_entry_safe(page
, next
, &hugepage_freelists
[i
], lru
) {
154 if (PageHighMem(page
))
156 list_del(&page
->lru
);
157 update_and_free_page(page
);
158 nid
= page_zone(page
)->zone_pgdat
->node_id
;
160 free_huge_pages_node
[nid
]--;
161 if (count
>= nr_huge_pages
)
167 static inline void try_to_free_low(unsigned long count
)
172 static unsigned long set_max_huge_pages(unsigned long count
)
174 while (count
> nr_huge_pages
) {
175 struct page
*page
= alloc_fresh_huge_page();
177 return nr_huge_pages
;
178 spin_lock(&hugetlb_lock
);
179 enqueue_huge_page(page
);
180 spin_unlock(&hugetlb_lock
);
182 if (count
>= nr_huge_pages
)
183 return nr_huge_pages
;
185 spin_lock(&hugetlb_lock
);
186 try_to_free_low(count
);
187 while (count
< nr_huge_pages
) {
188 struct page
*page
= dequeue_huge_page();
191 update_and_free_page(page
);
193 spin_unlock(&hugetlb_lock
);
194 return nr_huge_pages
;
197 int hugetlb_sysctl_handler(struct ctl_table
*table
, int write
,
198 struct file
*file
, void __user
*buffer
,
199 size_t *length
, loff_t
*ppos
)
201 proc_doulongvec_minmax(table
, write
, file
, buffer
, length
, ppos
);
202 max_huge_pages
= set_max_huge_pages(max_huge_pages
);
205 #endif /* CONFIG_SYSCTL */
207 int hugetlb_report_meminfo(char *buf
)
210 "HugePages_Total: %5lu\n"
211 "HugePages_Free: %5lu\n"
212 "Hugepagesize: %5lu kB\n",
218 int hugetlb_report_node_meminfo(int nid
, char *buf
)
221 "Node %d HugePages_Total: %5u\n"
222 "Node %d HugePages_Free: %5u\n",
223 nid
, nr_huge_pages_node
[nid
],
224 nid
, free_huge_pages_node
[nid
]);
227 int is_hugepage_mem_enough(size_t size
)
229 return (size
+ ~HPAGE_MASK
)/HPAGE_SIZE
<= free_huge_pages
;
232 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
233 unsigned long hugetlb_total_pages(void)
235 return nr_huge_pages
* (HPAGE_SIZE
/ PAGE_SIZE
);
237 EXPORT_SYMBOL(hugetlb_total_pages
);
240 * We cannot handle pagefaults against hugetlb pages at all. They cause
241 * handle_mm_fault() to try to instantiate regular-sized pages in the
242 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
245 static struct page
*hugetlb_nopage(struct vm_area_struct
*vma
,
246 unsigned long address
, int *unused
)
252 struct vm_operations_struct hugetlb_vm_ops
= {
253 .nopage
= hugetlb_nopage
,
256 static pte_t
make_huge_pte(struct vm_area_struct
*vma
, struct page
*page
)
260 if (vma
->vm_flags
& VM_WRITE
) {
262 pte_mkwrite(pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
264 entry
= pte_wrprotect(mk_pte(page
, vma
->vm_page_prot
));
266 entry
= pte_mkyoung(entry
);
267 entry
= pte_mkhuge(entry
);
272 int copy_hugetlb_page_range(struct mm_struct
*dst
, struct mm_struct
*src
,
273 struct vm_area_struct
*vma
)
275 pte_t
*src_pte
, *dst_pte
, entry
;
276 struct page
*ptepage
;
279 for (addr
= vma
->vm_start
; addr
< vma
->vm_end
; addr
+= HPAGE_SIZE
) {
280 src_pte
= huge_pte_offset(src
, addr
);
283 dst_pte
= huge_pte_alloc(dst
, addr
);
286 spin_lock(&dst
->page_table_lock
);
287 spin_lock(&src
->page_table_lock
);
288 if (!pte_none(*src_pte
)) {
290 ptepage
= pte_page(entry
);
292 add_mm_counter(dst
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
293 set_huge_pte_at(dst
, addr
, dst_pte
, entry
);
295 spin_unlock(&src
->page_table_lock
);
296 spin_unlock(&dst
->page_table_lock
);
304 void unmap_hugepage_range(struct vm_area_struct
*vma
, unsigned long start
,
307 struct mm_struct
*mm
= vma
->vm_mm
;
308 unsigned long address
;
313 WARN_ON(!is_vm_hugetlb_page(vma
));
314 BUG_ON(start
& ~HPAGE_MASK
);
315 BUG_ON(end
& ~HPAGE_MASK
);
317 spin_lock(&mm
->page_table_lock
);
319 /* Update high watermark before we lower rss */
320 update_hiwater_rss(mm
);
322 for (address
= start
; address
< end
; address
+= HPAGE_SIZE
) {
323 ptep
= huge_pte_offset(mm
, address
);
327 pte
= huge_ptep_get_and_clear(mm
, address
, ptep
);
331 page
= pte_page(pte
);
333 add_mm_counter(mm
, file_rss
, (int) -(HPAGE_SIZE
/ PAGE_SIZE
));
336 spin_unlock(&mm
->page_table_lock
);
337 flush_tlb_range(vma
, start
, end
);
340 static struct page
*find_lock_huge_page(struct address_space
*mapping
,
345 struct inode
*inode
= mapping
->host
;
349 page
= find_lock_page(mapping
, idx
);
353 /* Check to make sure the mapping hasn't been truncated */
354 size
= i_size_read(inode
) >> HPAGE_SHIFT
;
358 if (hugetlb_get_quota(mapping
))
360 page
= alloc_huge_page();
362 hugetlb_put_quota(mapping
);
366 err
= add_to_page_cache(page
, mapping
, idx
, GFP_KERNEL
);
369 hugetlb_put_quota(mapping
);
378 int hugetlb_fault(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
379 unsigned long address
, int write_access
)
381 int ret
= VM_FAULT_SIGBUS
;
386 struct address_space
*mapping
;
388 pte
= huge_pte_alloc(mm
, address
);
392 mapping
= vma
->vm_file
->f_mapping
;
393 idx
= ((address
- vma
->vm_start
) >> HPAGE_SHIFT
)
394 + (vma
->vm_pgoff
>> (HPAGE_SHIFT
- PAGE_SHIFT
));
397 * Use page lock to guard against racing truncation
398 * before we get page_table_lock.
400 page
= find_lock_huge_page(mapping
, idx
);
404 spin_lock(&mm
->page_table_lock
);
405 size
= i_size_read(mapping
->host
) >> HPAGE_SHIFT
;
409 ret
= VM_FAULT_MINOR
;
413 add_mm_counter(mm
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
414 set_huge_pte_at(mm
, address
, pte
, make_huge_pte(vma
, page
));
415 spin_unlock(&mm
->page_table_lock
);
421 spin_unlock(&mm
->page_table_lock
);
422 hugetlb_put_quota(mapping
);
428 int follow_hugetlb_page(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
429 struct page
**pages
, struct vm_area_struct
**vmas
,
430 unsigned long *position
, int *length
, int i
)
432 unsigned long vpfn
, vaddr
= *position
;
433 int remainder
= *length
;
435 vpfn
= vaddr
/PAGE_SIZE
;
436 spin_lock(&mm
->page_table_lock
);
437 while (vaddr
< vma
->vm_end
&& remainder
) {
442 * Some archs (sparc64, sh*) have multiple pte_ts to
443 * each hugepage. We have to make sure we get the
444 * first, for the page indexing below to work.
446 pte
= huge_pte_offset(mm
, vaddr
& HPAGE_MASK
);
448 if (!pte
|| pte_none(*pte
)) {
451 spin_unlock(&mm
->page_table_lock
);
452 ret
= hugetlb_fault(mm
, vma
, vaddr
, 0);
453 spin_lock(&mm
->page_table_lock
);
454 if (ret
== VM_FAULT_MINOR
)
464 page
= &pte_page(*pte
)[vpfn
% (HPAGE_SIZE
/PAGE_SIZE
)];
477 spin_unlock(&mm
->page_table_lock
);