2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/nodemask.h>
13 #include <linux/pagemap.h>
15 #include <asm/pgtable.h>
17 #include <linux/hugetlb.h>
19 const unsigned long hugetlb_zero
= 0, hugetlb_infinity
= ~0UL;
20 static unsigned long nr_huge_pages
, free_huge_pages
;
21 unsigned long max_huge_pages
;
22 static struct list_head hugepage_freelists
[MAX_NUMNODES
];
23 static unsigned int nr_huge_pages_node
[MAX_NUMNODES
];
24 static unsigned int free_huge_pages_node
[MAX_NUMNODES
];
27 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
29 static DEFINE_SPINLOCK(hugetlb_lock
);
31 static void enqueue_huge_page(struct page
*page
)
33 int nid
= page_to_nid(page
);
34 list_add(&page
->lru
, &hugepage_freelists
[nid
]);
36 free_huge_pages_node
[nid
]++;
39 static struct page
*dequeue_huge_page(void)
41 int nid
= numa_node_id();
42 struct page
*page
= NULL
;
44 if (list_empty(&hugepage_freelists
[nid
])) {
45 for (nid
= 0; nid
< MAX_NUMNODES
; ++nid
)
46 if (!list_empty(&hugepage_freelists
[nid
]))
49 if (nid
>= 0 && nid
< MAX_NUMNODES
&&
50 !list_empty(&hugepage_freelists
[nid
])) {
51 page
= list_entry(hugepage_freelists
[nid
].next
,
55 free_huge_pages_node
[nid
]--;
60 static struct page
*alloc_fresh_huge_page(void)
64 page
= alloc_pages_node(nid
, GFP_HIGHUSER
|__GFP_COMP
|__GFP_NOWARN
,
66 nid
= (nid
+ 1) % num_online_nodes();
68 spin_lock(&hugetlb_lock
);
70 nr_huge_pages_node
[page_to_nid(page
)]++;
71 spin_unlock(&hugetlb_lock
);
76 void free_huge_page(struct page
*page
)
78 BUG_ON(page_count(page
));
80 INIT_LIST_HEAD(&page
->lru
);
81 page
[1].mapping
= NULL
;
83 spin_lock(&hugetlb_lock
);
84 enqueue_huge_page(page
);
85 spin_unlock(&hugetlb_lock
);
88 struct page
*alloc_huge_page(void)
93 spin_lock(&hugetlb_lock
);
94 page
= dequeue_huge_page();
96 spin_unlock(&hugetlb_lock
);
99 spin_unlock(&hugetlb_lock
);
100 set_page_count(page
, 1);
101 page
[1].mapping
= (void *)free_huge_page
;
102 for (i
= 0; i
< (HPAGE_SIZE
/PAGE_SIZE
); ++i
)
103 clear_highpage(&page
[i
]);
107 static int __init
hugetlb_init(void)
112 if (HPAGE_SHIFT
== 0)
115 for (i
= 0; i
< MAX_NUMNODES
; ++i
)
116 INIT_LIST_HEAD(&hugepage_freelists
[i
]);
118 for (i
= 0; i
< max_huge_pages
; ++i
) {
119 page
= alloc_fresh_huge_page();
122 spin_lock(&hugetlb_lock
);
123 enqueue_huge_page(page
);
124 spin_unlock(&hugetlb_lock
);
126 max_huge_pages
= free_huge_pages
= nr_huge_pages
= i
;
127 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages
);
130 module_init(hugetlb_init
);
132 static int __init
hugetlb_setup(char *s
)
134 if (sscanf(s
, "%lu", &max_huge_pages
) <= 0)
138 __setup("hugepages=", hugetlb_setup
);
141 static void update_and_free_page(struct page
*page
)
145 nr_huge_pages_node
[page_zone(page
)->zone_pgdat
->node_id
]--;
146 for (i
= 0; i
< (HPAGE_SIZE
/ PAGE_SIZE
); i
++) {
147 page
[i
].flags
&= ~(1 << PG_locked
| 1 << PG_error
| 1 << PG_referenced
|
148 1 << PG_dirty
| 1 << PG_active
| 1 << PG_reserved
|
149 1 << PG_private
| 1<< PG_writeback
);
150 set_page_count(&page
[i
], 0);
152 set_page_count(page
, 1);
153 __free_pages(page
, HUGETLB_PAGE_ORDER
);
156 #ifdef CONFIG_HIGHMEM
157 static void try_to_free_low(unsigned long count
)
160 for (i
= 0; i
< MAX_NUMNODES
; ++i
) {
161 struct page
*page
, *next
;
162 list_for_each_entry_safe(page
, next
, &hugepage_freelists
[i
], lru
) {
163 if (PageHighMem(page
))
165 list_del(&page
->lru
);
166 update_and_free_page(page
);
167 nid
= page_zone(page
)->zone_pgdat
->node_id
;
169 free_huge_pages_node
[nid
]--;
170 if (count
>= nr_huge_pages
)
176 static inline void try_to_free_low(unsigned long count
)
181 static unsigned long set_max_huge_pages(unsigned long count
)
183 while (count
> nr_huge_pages
) {
184 struct page
*page
= alloc_fresh_huge_page();
186 return nr_huge_pages
;
187 spin_lock(&hugetlb_lock
);
188 enqueue_huge_page(page
);
189 spin_unlock(&hugetlb_lock
);
191 if (count
>= nr_huge_pages
)
192 return nr_huge_pages
;
194 spin_lock(&hugetlb_lock
);
195 try_to_free_low(count
);
196 while (count
< nr_huge_pages
) {
197 struct page
*page
= dequeue_huge_page();
200 update_and_free_page(page
);
202 spin_unlock(&hugetlb_lock
);
203 return nr_huge_pages
;
206 int hugetlb_sysctl_handler(struct ctl_table
*table
, int write
,
207 struct file
*file
, void __user
*buffer
,
208 size_t *length
, loff_t
*ppos
)
210 proc_doulongvec_minmax(table
, write
, file
, buffer
, length
, ppos
);
211 max_huge_pages
= set_max_huge_pages(max_huge_pages
);
214 #endif /* CONFIG_SYSCTL */
216 int hugetlb_report_meminfo(char *buf
)
219 "HugePages_Total: %5lu\n"
220 "HugePages_Free: %5lu\n"
221 "Hugepagesize: %5lu kB\n",
227 int hugetlb_report_node_meminfo(int nid
, char *buf
)
230 "Node %d HugePages_Total: %5u\n"
231 "Node %d HugePages_Free: %5u\n",
232 nid
, nr_huge_pages_node
[nid
],
233 nid
, free_huge_pages_node
[nid
]);
236 int is_hugepage_mem_enough(size_t size
)
238 return (size
+ ~HPAGE_MASK
)/HPAGE_SIZE
<= free_huge_pages
;
241 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
242 unsigned long hugetlb_total_pages(void)
244 return nr_huge_pages
* (HPAGE_SIZE
/ PAGE_SIZE
);
248 * We cannot handle pagefaults against hugetlb pages at all. They cause
249 * handle_mm_fault() to try to instantiate regular-sized pages in the
250 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
253 static struct page
*hugetlb_nopage(struct vm_area_struct
*vma
,
254 unsigned long address
, int *unused
)
260 struct vm_operations_struct hugetlb_vm_ops
= {
261 .nopage
= hugetlb_nopage
,
264 static pte_t
make_huge_pte(struct vm_area_struct
*vma
, struct page
*page
)
268 if (vma
->vm_flags
& VM_WRITE
) {
270 pte_mkwrite(pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
272 entry
= pte_wrprotect(mk_pte(page
, vma
->vm_page_prot
));
274 entry
= pte_mkyoung(entry
);
275 entry
= pte_mkhuge(entry
);
280 int copy_hugetlb_page_range(struct mm_struct
*dst
, struct mm_struct
*src
,
281 struct vm_area_struct
*vma
)
283 pte_t
*src_pte
, *dst_pte
, entry
;
284 struct page
*ptepage
;
287 for (addr
= vma
->vm_start
; addr
< vma
->vm_end
; addr
+= HPAGE_SIZE
) {
288 src_pte
= huge_pte_offset(src
, addr
);
291 dst_pte
= huge_pte_alloc(dst
, addr
);
294 spin_lock(&dst
->page_table_lock
);
295 spin_lock(&src
->page_table_lock
);
296 if (!pte_none(*src_pte
)) {
298 ptepage
= pte_page(entry
);
300 add_mm_counter(dst
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
301 set_huge_pte_at(dst
, addr
, dst_pte
, entry
);
303 spin_unlock(&src
->page_table_lock
);
304 spin_unlock(&dst
->page_table_lock
);
312 void unmap_hugepage_range(struct vm_area_struct
*vma
, unsigned long start
,
315 struct mm_struct
*mm
= vma
->vm_mm
;
316 unsigned long address
;
321 WARN_ON(!is_vm_hugetlb_page(vma
));
322 BUG_ON(start
& ~HPAGE_MASK
);
323 BUG_ON(end
& ~HPAGE_MASK
);
325 spin_lock(&mm
->page_table_lock
);
327 /* Update high watermark before we lower rss */
328 update_hiwater_rss(mm
);
330 for (address
= start
; address
< end
; address
+= HPAGE_SIZE
) {
331 ptep
= huge_pte_offset(mm
, address
);
335 pte
= huge_ptep_get_and_clear(mm
, address
, ptep
);
339 page
= pte_page(pte
);
341 add_mm_counter(mm
, file_rss
, (int) -(HPAGE_SIZE
/ PAGE_SIZE
));
344 spin_unlock(&mm
->page_table_lock
);
345 flush_tlb_range(vma
, start
, end
);
348 static struct page
*find_lock_huge_page(struct address_space
*mapping
,
353 struct inode
*inode
= mapping
->host
;
357 page
= find_lock_page(mapping
, idx
);
361 /* Check to make sure the mapping hasn't been truncated */
362 size
= i_size_read(inode
) >> HPAGE_SHIFT
;
366 if (hugetlb_get_quota(mapping
))
368 page
= alloc_huge_page();
370 hugetlb_put_quota(mapping
);
374 err
= add_to_page_cache(page
, mapping
, idx
, GFP_KERNEL
);
377 hugetlb_put_quota(mapping
);
386 int hugetlb_fault(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
387 unsigned long address
, int write_access
)
389 int ret
= VM_FAULT_SIGBUS
;
394 struct address_space
*mapping
;
396 pte
= huge_pte_alloc(mm
, address
);
400 mapping
= vma
->vm_file
->f_mapping
;
401 idx
= ((address
- vma
->vm_start
) >> HPAGE_SHIFT
)
402 + (vma
->vm_pgoff
>> (HPAGE_SHIFT
- PAGE_SHIFT
));
405 * Use page lock to guard against racing truncation
406 * before we get page_table_lock.
408 page
= find_lock_huge_page(mapping
, idx
);
412 spin_lock(&mm
->page_table_lock
);
413 size
= i_size_read(mapping
->host
) >> HPAGE_SHIFT
;
417 ret
= VM_FAULT_MINOR
;
421 add_mm_counter(mm
, file_rss
, HPAGE_SIZE
/ PAGE_SIZE
);
422 set_huge_pte_at(mm
, address
, pte
, make_huge_pte(vma
, page
));
423 spin_unlock(&mm
->page_table_lock
);
429 spin_unlock(&mm
->page_table_lock
);
430 hugetlb_put_quota(mapping
);
436 int follow_hugetlb_page(struct mm_struct
*mm
, struct vm_area_struct
*vma
,
437 struct page
**pages
, struct vm_area_struct
**vmas
,
438 unsigned long *position
, int *length
, int i
)
440 unsigned long vpfn
, vaddr
= *position
;
441 int remainder
= *length
;
443 vpfn
= vaddr
/PAGE_SIZE
;
444 spin_lock(&mm
->page_table_lock
);
445 while (vaddr
< vma
->vm_end
&& remainder
) {
450 * Some archs (sparc64, sh*) have multiple pte_ts to
451 * each hugepage. We have to make sure we get the
452 * first, for the page indexing below to work.
454 pte
= huge_pte_offset(mm
, vaddr
& HPAGE_MASK
);
456 if (!pte
|| pte_none(*pte
)) {
459 spin_unlock(&mm
->page_table_lock
);
460 ret
= hugetlb_fault(mm
, vma
, vaddr
, 0);
461 spin_lock(&mm
->page_table_lock
);
462 if (ret
== VM_FAULT_MINOR
)
472 page
= &pte_page(*pte
)[vpfn
% (HPAGE_SIZE
/PAGE_SIZE
)];
485 spin_unlock(&mm
->page_table_lock
);