/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/nodemask.h>
13 #include <linux/pagemap.h>
15 #include <asm/pgtable.h>
17 #include <linux/hugetlb.h>
19 const unsigned long hugetlb_zero
= 0, hugetlb_infinity
= ~0UL;
20 static unsigned long nr_huge_pages
, free_huge_pages
;
21 unsigned long max_huge_pages
;
22 static struct list_head hugepage_freelists
[MAX_NUMNODES
];
23 static unsigned int nr_huge_pages_node
[MAX_NUMNODES
];
24 static unsigned int free_huge_pages_node
[MAX_NUMNODES
];
25 static DEFINE_SPINLOCK(hugetlb_lock
);
27 static void enqueue_huge_page(struct page
*page
)
29 int nid
= page_to_nid(page
);
30 list_add(&page
->lru
, &hugepage_freelists
[nid
]);
32 free_huge_pages_node
[nid
]++;
35 static struct page
*dequeue_huge_page(void)
37 int nid
= numa_node_id();
38 struct page
*page
= NULL
;
40 if (list_empty(&hugepage_freelists
[nid
])) {
41 for (nid
= 0; nid
< MAX_NUMNODES
; ++nid
)
42 if (!list_empty(&hugepage_freelists
[nid
]))
45 if (nid
>= 0 && nid
< MAX_NUMNODES
&&
46 !list_empty(&hugepage_freelists
[nid
])) {
47 page
= list_entry(hugepage_freelists
[nid
].next
,
51 free_huge_pages_node
[nid
]--;
56 static struct page
*alloc_fresh_huge_page(void)
60 page
= alloc_pages_node(nid
, GFP_HIGHUSER
|__GFP_COMP
|__GFP_NOWARN
,
62 nid
= (nid
+ 1) % num_online_nodes();
65 nr_huge_pages_node
[page_to_nid(page
)]++;
70 void free_huge_page(struct page
*page
)
72 BUG_ON(page_count(page
));
74 INIT_LIST_HEAD(&page
->lru
);
75 page
[1].mapping
= NULL
;
77 spin_lock(&hugetlb_lock
);
78 enqueue_huge_page(page
);
79 spin_unlock(&hugetlb_lock
);
82 struct page
*alloc_huge_page(void)
87 spin_lock(&hugetlb_lock
);
88 page
= dequeue_huge_page();
90 spin_unlock(&hugetlb_lock
);
93 spin_unlock(&hugetlb_lock
);
94 set_page_count(page
, 1);
95 page
[1].mapping
= (void *)free_huge_page
;
96 for (i
= 0; i
< (HPAGE_SIZE
/PAGE_SIZE
); ++i
)
97 clear_highpage(&page
[i
]);
101 static int __init
hugetlb_init(void)
106 for (i
= 0; i
< MAX_NUMNODES
; ++i
)
107 INIT_LIST_HEAD(&hugepage_freelists
[i
]);
109 for (i
= 0; i
< max_huge_pages
; ++i
) {
110 page
= alloc_fresh_huge_page();
113 spin_lock(&hugetlb_lock
);
114 enqueue_huge_page(page
);
115 spin_unlock(&hugetlb_lock
);
117 max_huge_pages
= free_huge_pages
= nr_huge_pages
= i
;
118 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages
);
121 module_init(hugetlb_init
);
123 static int __init
hugetlb_setup(char *s
)
125 if (sscanf(s
, "%lu", &max_huge_pages
) <= 0)
129 __setup("hugepages=", hugetlb_setup
);
132 static void update_and_free_page(struct page
*page
)
136 nr_huge_pages_node
[page_zone(page
)->zone_pgdat
->node_id
]--;
137 for (i
= 0; i
< (HPAGE_SIZE
/ PAGE_SIZE
); i
++) {
138 page
[i
].flags
&= ~(1 << PG_locked
| 1 << PG_error
| 1 << PG_referenced
|
139 1 << PG_dirty
| 1 << PG_active
| 1 << PG_reserved
|
140 1 << PG_private
| 1<< PG_writeback
);
141 set_page_count(&page
[i
], 0);
143 set_page_count(page
, 1);
144 __free_pages(page
, HUGETLB_PAGE_ORDER
);
#ifdef CONFIG_HIGHMEM
/*
 * Shrink the pool towards @count by releasing free huge pages that are
 * not in highmem.  Stops as soon as nr_huge_pages has dropped to @count
 * or every free list has been walked.
 *
 * Called by set_max_huge_pages() with hugetlb_lock held.
 *
 * NOTE(review): "continue", the global free_huge_pages--, "return" and
 * the "#else"/"#endif" lines were absent from the listing under review
 * and are restored by inference.
 */
static void try_to_free_low(unsigned long count)
{
	int i, nid;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;

		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			/* Read the node id before the page goes back to the allocator. */
			nid = page_zone(page)->zone_pgdat->node_id;
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
/* Without CONFIG_HIGHMEM there is nothing to prefer; do nothing. */
static inline void try_to_free_low(unsigned long count)
{
}
#endif
172 static unsigned long set_max_huge_pages(unsigned long count
)
174 while (count
> nr_huge_pages
) {
175 struct page
*page
= alloc_fresh_huge_page();
177 return nr_huge_pages
;
178 spin_lock(&hugetlb_lock
);
179 enqueue_huge_page(page
);
180 spin_unlock(&hugetlb_lock
);
182 if (count
>= nr_huge_pages
)
183 return nr_huge_pages
;
185 spin_lock(&hugetlb_lock
);
186 try_to_free_low(count
);
187 while (count
< nr_huge_pages
) {
188 struct page
*page
= dequeue_huge_page();
191 update_and_free_page(page
);
193 spin_unlock(&hugetlb_lock
);
194 return nr_huge_pages
;
197 int hugetlb_sysctl_handler(struct ctl_table
*table
, int write
,
198 struct file
*file
, void __user
*buffer
,
199 size_t *length
, loff_t
*ppos
)
201 proc_doulongvec_minmax(table
, write
, file
, buffer
, length
, ppos
);
202 max_huge_pages
= set_max_huge_pages(max_huge_pages
);
205 #endif /* CONFIG_SYSCTL */
207 int hugetlb_report_meminfo(char *buf
)
210 "HugePages_Total: %5lu\n"
211 "HugePages_Free: %5lu\n"
212 "Hugepagesize: %5lu kB\n",
218 int hugetlb_report_node_meminfo(int nid
, char *buf
)
221 "Node %d HugePages_Total: %5u\n"
222 "Node %d HugePages_Free: %5u\n",
223 nid
, nr_huge_pages_node
[nid
],
224 nid
, free_huge_pages_node
[nid
]);
227 int is_hugepage_mem_enough(size_t size
)
229 return (size
+ ~HPAGE_MASK
)/HPAGE_SIZE
<= free_huge_pages
;
232 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
233 unsigned long hugetlb_total_pages(void)
235 return nr_huge_pages
* (HPAGE_SIZE
/ PAGE_SIZE
);
237 EXPORT_SYMBOL(hugetlb_total_pages
);
240 * We cannot handle pagefaults against hugetlb pages at all. They cause
241 * handle_mm_fault() to try to instantiate regular-sized pages in the
242 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
245 static struct page
*hugetlb_nopage(struct vm_area_struct
*vma
,
246 unsigned long address
, int *unused
)
252 struct vm_operations_struct hugetlb_vm_ops
= {
253 .nopage
= hugetlb_nopage
,
256 static pte_t
make_huge_pte(struct vm_area_struct
*vma
, struct page
*page
)
260 if (vma
->vm_flags
& VM_WRITE
) {
262 pte_mkwrite(pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
264 entry
= pte_wrprotect(mk_pte(page
, vma
->vm_page_prot
));
266 entry
= pte_mkyoung(entry
);
267 entry
= pte_mkhuge(entry
);
272 int copy_hugetlb_page_range(struct mm_struct
*dst
, struct mm_struct
*src
,
273 struct vm_area_struct
*vma
)
275 pte_t
*src_pte
, *dst_pte
, entry
;
276 struct page
*ptepage
;
277 unsigned long addr
= vma
->vm_start
;
278 unsigned long end
= vma
->vm_end
;
281 dst_pte
= huge_pte_alloc(dst
, addr
);
284 src_pte
= huge_pte_offset(src
, addr
);
285 BUG_ON(!src_pte
|| pte_none(*src_pte
)); /* prefaulted */
287 ptepage
= pte_page(entry
);
289 add_mm_counter(dst
, rss
, HPAGE_SIZE
/ PAGE_SIZE
);
290 set_huge_pte_at(dst
, addr
, dst_pte
, entry
);
299 void unmap_hugepage_range(struct vm_area_struct
*vma
, unsigned long start
,
302 struct mm_struct
*mm
= vma
->vm_mm
;
303 unsigned long address
;
308 WARN_ON(!is_vm_hugetlb_page(vma
));
309 BUG_ON(start
& ~HPAGE_MASK
);
310 BUG_ON(end
& ~HPAGE_MASK
);
312 for (address
= start
; address
< end
; address
+= HPAGE_SIZE
) {
313 ptep
= huge_pte_offset(mm
, address
);
315 /* This can happen on truncate, or if an
316 * mmap() is aborted due to an error before
320 pte
= huge_ptep_get_and_clear(mm
, address
, ptep
);
324 page
= pte_page(pte
);
327 add_mm_counter(mm
, rss
, -((end
- start
) >> PAGE_SHIFT
));
328 flush_tlb_range(vma
, start
, end
);
331 void zap_hugepage_range(struct vm_area_struct
*vma
,
332 unsigned long start
, unsigned long length
)
334 struct mm_struct
*mm
= vma
->vm_mm
;
336 spin_lock(&mm
->page_table_lock
);
337 unmap_hugepage_range(vma
, start
, start
+ length
);
338 spin_unlock(&mm
->page_table_lock
);
341 int hugetlb_prefault(struct address_space
*mapping
, struct vm_area_struct
*vma
)
343 struct mm_struct
*mm
= current
->mm
;
347 WARN_ON(!is_vm_hugetlb_page(vma
));
348 BUG_ON(vma
->vm_start
& ~HPAGE_MASK
);
349 BUG_ON(vma
->vm_end
& ~HPAGE_MASK
);
351 hugetlb_prefault_arch_hook(mm
);
353 spin_lock(&mm
->page_table_lock
);
354 for (addr
= vma
->vm_start
; addr
< vma
->vm_end
; addr
+= HPAGE_SIZE
) {
356 pte_t
*pte
= huge_pte_alloc(mm
, addr
);
363 if (! pte_none(*pte
))
364 hugetlb_clean_stale_pgtable(pte
);
366 idx
= ((addr
- vma
->vm_start
) >> HPAGE_SHIFT
)
367 + (vma
->vm_pgoff
>> (HPAGE_SHIFT
- PAGE_SHIFT
));
368 page
= find_get_page(mapping
, idx
);
370 /* charge the fs quota first */
371 if (hugetlb_get_quota(mapping
)) {
375 page
= alloc_huge_page();
377 hugetlb_put_quota(mapping
);
381 ret
= add_to_page_cache(page
, mapping
, idx
, GFP_ATOMIC
);
385 hugetlb_put_quota(mapping
);
386 free_huge_page(page
);
390 add_mm_counter(mm
, rss
, HPAGE_SIZE
/ PAGE_SIZE
);
391 set_huge_pte_at(mm
, addr
, pte
, make_huge_pte(vma
, page
));
394 spin_unlock(&mm
->page_table_lock
);
398 int follow_hugetlb_page(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
399 struct page
**pages
, struct vm_area_struct
**vmas
,
400 unsigned long *position
, int *length
, int i
)
402 unsigned long vpfn
, vaddr
= *position
;
403 int remainder
= *length
;
405 BUG_ON(!is_vm_hugetlb_page(vma
));
407 vpfn
= vaddr
/PAGE_SIZE
;
408 while (vaddr
< vma
->vm_end
&& remainder
) {
414 /* Some archs (sparc64, sh*) have multiple
415 * pte_ts to each hugepage. We have to make
416 * sure we get the first, for the page
417 * indexing below to work. */
418 pte
= huge_pte_offset(mm
, vaddr
& HPAGE_MASK
);
420 /* hugetlb should be locked, and hence, prefaulted */
421 WARN_ON(!pte
|| pte_none(*pte
));
423 page
= &pte_page(*pte
)[vpfn
% (HPAGE_SIZE
/PAGE_SIZE
)];
425 WARN_ON(!PageCompound(page
));