initial commit with v2.6.9
[linux-2.6.9-moxart.git] / mm / prio_tree.c
blob2a1d02f933201677f045230fdd619bbcf6c83578
/*
 * mm/prio_tree.c - priority search tree for mapping->i_mmap
 *
 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
 *
 * This file is released under the GPL v2.
 *
 * Based on the radix priority search tree proposed by Edward M. McCreight
 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
 *
 * 02Feb2004 Initial version
 */
14 #include <linux/init.h>
15 #include <linux/module.h>
16 #include <linux/mm.h>
17 #include <linux/prio_tree.h>
/*
 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
 * which is useful for storing intervals, e.g., we can consider a vma as a
 * closed interval of file pages [offset_begin, offset_end], and store all vmas
 * that map a file in a PST. Then, using the PST, we can answer a stabbing
 * query, i.e., selecting a set of stored intervals (vmas) that overlap with
 * (map) a given input interval X (a set of consecutive file pages), in
 * "O(log n + m)" time where 'log n' is the height of the PST, and 'm' is the
 * number of stored intervals (vmas) that overlap (map) with the input
 * interval X (the set of consecutive file pages).
 *
 * In our implementation, we store closed intervals of the form [radix_index,
 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
 * is designed for storing intervals with unique radix indices, i.e., each
 * interval has a different radix_index. However, this limitation can be easily
 * overcome by using the size, i.e., heap_index - radix_index, as part of the
 * index, so we index the tree using [(radix_index, size), heap_index].
 *
 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
 * machine, the maximum height of a PST can be 64. We can use a balanced version
 * of the priority search tree to optimize the tree height, but the balanced
 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
 */
44 * The following macros are used for implementing prio_tree for i_mmap
47 #define RADIX_INDEX(vma) ((vma)->vm_pgoff)
48 #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
49 /* avoid overflow */
50 #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
52 #define GET_INDEX_VMA(vma, radix, heap) \
53 do { \
54 radix = RADIX_INDEX(vma); \
55 heap = HEAP_INDEX(vma); \
56 } while (0)
58 #define GET_INDEX(node, radix, heap) \
59 do { \
60 struct vm_area_struct *__tmp = \
61 prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
62 GET_INDEX_VMA(__tmp, radix, heap); \
63 } while (0)
65 static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
67 void __init prio_tree_init(void)
69 unsigned int i;
71 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
72 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
73 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
77 * Maximum heap_index that can be stored in a PST with index_bits bits
79 static inline unsigned long prio_tree_maxindex(unsigned int bits)
81 return index_bits_to_maxindex[bits - 1];
84 static void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
87 * Extend a priority search tree so that it can store a node with heap_index
88 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
89 * However, this function is used rarely and the common case performance is
90 * not bad.
92 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
93 struct prio_tree_node *node, unsigned long max_heap_index)
95 struct prio_tree_node *first = NULL, *prev, *last = NULL;
97 if (max_heap_index > prio_tree_maxindex(root->index_bits))
98 root->index_bits++;
100 while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
101 root->index_bits++;
103 if (prio_tree_empty(root))
104 continue;
106 if (first == NULL) {
107 first = root->prio_tree_node;
108 prio_tree_remove(root, root->prio_tree_node);
109 INIT_PRIO_TREE_NODE(first);
110 last = first;
111 } else {
112 prev = last;
113 last = root->prio_tree_node;
114 prio_tree_remove(root, root->prio_tree_node);
115 INIT_PRIO_TREE_NODE(last);
116 prev->left = last;
117 last->parent = prev;
121 INIT_PRIO_TREE_NODE(node);
123 if (first) {
124 node->left = first;
125 first->parent = node;
126 } else
127 last = node;
129 if (!prio_tree_empty(root)) {
130 last->left = root->prio_tree_node;
131 last->left->parent = last;
134 root->prio_tree_node = node;
135 return node;
139 * Replace a prio_tree_node with a new node and return the old node
141 static struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
142 struct prio_tree_node *old, struct prio_tree_node *node)
144 INIT_PRIO_TREE_NODE(node);
146 if (prio_tree_root(old)) {
147 BUG_ON(root->prio_tree_node != old);
149 * We can reduce root->index_bits here. However, it is complex
150 * and does not help much to improve performance (IMO).
152 node->parent = node;
153 root->prio_tree_node = node;
154 } else {
155 node->parent = old->parent;
156 if (old->parent->left == old)
157 old->parent->left = node;
158 else
159 old->parent->right = node;
162 if (!prio_tree_left_empty(old)) {
163 node->left = old->left;
164 old->left->parent = node;
167 if (!prio_tree_right_empty(old)) {
168 node->right = old->right;
169 old->right->parent = node;
172 return old;
176 * Insert a prio_tree_node @node into a radix priority search tree @root. The
177 * algorithm typically takes O(log n) time where 'log n' is the number of bits
178 * required to represent the maximum heap_index. In the worst case, the algo
179 * can take O((log n)^2) - check prio_tree_expand.
181 * If a prior node with same radix_index and heap_index is already found in
182 * the tree, then returns the address of the prior node. Otherwise, inserts
183 * @node into the tree and returns @node.
185 static struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
186 struct prio_tree_node *node)
188 struct prio_tree_node *cur, *res = node;
189 unsigned long radix_index, heap_index;
190 unsigned long r_index, h_index, index, mask;
191 int size_flag = 0;
193 GET_INDEX(node, radix_index, heap_index);
195 if (prio_tree_empty(root) ||
196 heap_index > prio_tree_maxindex(root->index_bits))
197 return prio_tree_expand(root, node, heap_index);
199 cur = root->prio_tree_node;
200 mask = 1UL << (root->index_bits - 1);
202 while (mask) {
203 GET_INDEX(cur, r_index, h_index);
205 if (r_index == radix_index && h_index == heap_index)
206 return cur;
208 if (h_index < heap_index ||
209 (h_index == heap_index && r_index > radix_index)) {
210 struct prio_tree_node *tmp = node;
211 node = prio_tree_replace(root, cur, node);
212 cur = tmp;
213 /* swap indices */
214 index = r_index;
215 r_index = radix_index;
216 radix_index = index;
217 index = h_index;
218 h_index = heap_index;
219 heap_index = index;
222 if (size_flag)
223 index = heap_index - radix_index;
224 else
225 index = radix_index;
227 if (index & mask) {
228 if (prio_tree_right_empty(cur)) {
229 INIT_PRIO_TREE_NODE(node);
230 cur->right = node;
231 node->parent = cur;
232 return res;
233 } else
234 cur = cur->right;
235 } else {
236 if (prio_tree_left_empty(cur)) {
237 INIT_PRIO_TREE_NODE(node);
238 cur->left = node;
239 node->parent = cur;
240 return res;
241 } else
242 cur = cur->left;
245 mask >>= 1;
247 if (!mask) {
248 mask = 1UL << (root->index_bits - 1);
249 size_flag = 1;
252 /* Should not reach here */
253 BUG();
254 return NULL;
258 * Remove a prio_tree_node @node from a radix priority search tree @root. The
259 * algorithm takes O(log n) time where 'log n' is the number of bits required
260 * to represent the maximum heap_index.
262 static void prio_tree_remove(struct prio_tree_root *root,
263 struct prio_tree_node *node)
265 struct prio_tree_node *cur;
266 unsigned long r_index, h_index_right, h_index_left;
268 cur = node;
270 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
271 if (!prio_tree_left_empty(cur))
272 GET_INDEX(cur->left, r_index, h_index_left);
273 else {
274 cur = cur->right;
275 continue;
278 if (!prio_tree_right_empty(cur))
279 GET_INDEX(cur->right, r_index, h_index_right);
280 else {
281 cur = cur->left;
282 continue;
285 /* both h_index_left and h_index_right cannot be 0 */
286 if (h_index_left >= h_index_right)
287 cur = cur->left;
288 else
289 cur = cur->right;
292 if (prio_tree_root(cur)) {
293 BUG_ON(root->prio_tree_node != cur);
294 INIT_PRIO_TREE_ROOT(root);
295 return;
298 if (cur->parent->right == cur)
299 cur->parent->right = cur->parent;
300 else
301 cur->parent->left = cur->parent;
303 while (cur != node)
304 cur = prio_tree_replace(root, cur->parent, cur);
308 * Following functions help to enumerate all prio_tree_nodes in the tree that
309 * overlap with the input interval X [radix_index, heap_index]. The enumeration
310 * takes O(log n + m) time where 'log n' is the height of the tree (which is
311 * proportional to # of bits required to represent the maximum heap_index) and
312 * 'm' is the number of prio_tree_nodes that overlap the interval X.
315 static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
316 unsigned long *r_index, unsigned long *h_index)
318 if (prio_tree_left_empty(iter->cur))
319 return NULL;
321 GET_INDEX(iter->cur->left, *r_index, *h_index);
323 if (iter->r_index <= *h_index) {
324 iter->cur = iter->cur->left;
325 iter->mask >>= 1;
326 if (iter->mask) {
327 if (iter->size_level)
328 iter->size_level++;
329 } else {
330 if (iter->size_level) {
331 BUG_ON(!prio_tree_left_empty(iter->cur));
332 BUG_ON(!prio_tree_right_empty(iter->cur));
333 iter->size_level++;
334 iter->mask = ULONG_MAX;
335 } else {
336 iter->size_level = 1;
337 iter->mask = 1UL << (iter->root->index_bits - 1);
340 return iter->cur;
343 return NULL;
346 static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
347 unsigned long *r_index, unsigned long *h_index)
349 unsigned long value;
351 if (prio_tree_right_empty(iter->cur))
352 return NULL;
354 if (iter->size_level)
355 value = iter->value;
356 else
357 value = iter->value | iter->mask;
359 if (iter->h_index < value)
360 return NULL;
362 GET_INDEX(iter->cur->right, *r_index, *h_index);
364 if (iter->r_index <= *h_index) {
365 iter->cur = iter->cur->right;
366 iter->mask >>= 1;
367 iter->value = value;
368 if (iter->mask) {
369 if (iter->size_level)
370 iter->size_level++;
371 } else {
372 if (iter->size_level) {
373 BUG_ON(!prio_tree_left_empty(iter->cur));
374 BUG_ON(!prio_tree_right_empty(iter->cur));
375 iter->size_level++;
376 iter->mask = ULONG_MAX;
377 } else {
378 iter->size_level = 1;
379 iter->mask = 1UL << (iter->root->index_bits - 1);
382 return iter->cur;
385 return NULL;
388 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
390 iter->cur = iter->cur->parent;
391 if (iter->mask == ULONG_MAX)
392 iter->mask = 1UL;
393 else if (iter->size_level == 1)
394 iter->mask = 1UL;
395 else
396 iter->mask <<= 1;
397 if (iter->size_level)
398 iter->size_level--;
399 if (!iter->size_level && (iter->value & iter->mask))
400 iter->value ^= iter->mask;
401 return iter->cur;
404 static inline int overlap(struct prio_tree_iter *iter,
405 unsigned long r_index, unsigned long h_index)
407 return iter->h_index >= r_index && iter->r_index <= h_index;
411 * prio_tree_first:
413 * Get the first prio_tree_node that overlaps with the interval [radix_index,
414 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
415 * traversal of the tree.
417 static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
419 struct prio_tree_root *root;
420 unsigned long r_index, h_index;
422 INIT_PRIO_TREE_ITER(iter);
424 root = iter->root;
425 if (prio_tree_empty(root))
426 return NULL;
428 GET_INDEX(root->prio_tree_node, r_index, h_index);
430 if (iter->r_index > h_index)
431 return NULL;
433 iter->mask = 1UL << (root->index_bits - 1);
434 iter->cur = root->prio_tree_node;
436 while (1) {
437 if (overlap(iter, r_index, h_index))
438 return iter->cur;
440 if (prio_tree_left(iter, &r_index, &h_index))
441 continue;
443 if (prio_tree_right(iter, &r_index, &h_index))
444 continue;
446 break;
448 return NULL;
452 * prio_tree_next:
454 * Get the next prio_tree_node that overlaps with the input interval in iter
456 static struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
458 unsigned long r_index, h_index;
460 repeat:
461 while (prio_tree_left(iter, &r_index, &h_index))
462 if (overlap(iter, r_index, h_index))
463 return iter->cur;
465 while (!prio_tree_right(iter, &r_index, &h_index)) {
466 while (!prio_tree_root(iter->cur) &&
467 iter->cur->parent->right == iter->cur)
468 prio_tree_parent(iter);
470 if (prio_tree_root(iter->cur))
471 return NULL;
473 prio_tree_parent(iter);
476 if (overlap(iter, r_index, h_index))
477 return iter->cur;
479 goto repeat;
/*
 * Radix priority search tree for address_space->i_mmap
 *
 * For each vma that maps a unique set of file pages, i.e., a unique
 * [radix_index, heap_index] value, we have a corresponding priority search
 * tree node. If multiple vmas have identical [radix_index, heap_index]
 * values, then one of them is used as a tree node and others are stored in a
 * vm_set list. The tree node points to the first vma (head) of the list
 * using vm_set.head.
 *
 * prio_tree_root
 *      |
 *      A       vm_set.head
 *     / \      /
 *    L   R -> H-I-J-K-M-N-O-P-Q-S
 *    ^   ^    <-- vm_set.list -->
 *  tree nodes
 *
 * We need some way to identify whether a vma is a tree node, head of a vm_set
 * list, or just a member of a vm_set list. We cannot use vm_flags to store
 * such information. The reason is, in the above figure, it is possible that
 * vm_flags' of R and H are covered by the different mmap_sems. When R is
 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
 * That's why some trick involving shared.vm_set.parent is used for identifying
 * tree nodes and list head nodes.
 *
 * vma radix priority search tree node rules:
 *
 * vma->shared.vm_set.parent != NULL    ==> a tree node
 *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
 *      vma->shared.vm_set.head == NULL ==> no others map the same range
 *
 * vma->shared.vm_set.parent == NULL
 *      vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
 *      vma->shared.vm_set.head == NULL ==> a list node
 */
520 * Add a new vma known to map the same set of pages as the old vma:
521 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
522 * Note that it just happens to work correctly on i_mmap_nonlinear too.
524 void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
526 /* Leave these BUG_ONs till prio_tree patch stabilizes */
527 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
528 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
530 vma->shared.vm_set.head = NULL;
531 vma->shared.vm_set.parent = NULL;
533 if (!old->shared.vm_set.parent)
534 list_add(&vma->shared.vm_set.list,
535 &old->shared.vm_set.list);
536 else if (old->shared.vm_set.head)
537 list_add_tail(&vma->shared.vm_set.list,
538 &old->shared.vm_set.head->shared.vm_set.list);
539 else {
540 INIT_LIST_HEAD(&vma->shared.vm_set.list);
541 vma->shared.vm_set.head = old;
542 old->shared.vm_set.head = vma;
546 void vma_prio_tree_insert(struct vm_area_struct *vma,
547 struct prio_tree_root *root)
549 struct prio_tree_node *ptr;
550 struct vm_area_struct *old;
552 vma->shared.vm_set.head = NULL;
554 ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
555 if (ptr != &vma->shared.prio_tree_node) {
556 old = prio_tree_entry(ptr, struct vm_area_struct,
557 shared.prio_tree_node);
558 vma_prio_tree_add(vma, old);
562 void vma_prio_tree_remove(struct vm_area_struct *vma,
563 struct prio_tree_root *root)
565 struct vm_area_struct *node, *head, *new_head;
567 if (!vma->shared.vm_set.head) {
568 if (!vma->shared.vm_set.parent)
569 list_del_init(&vma->shared.vm_set.list);
570 else
571 prio_tree_remove(root, &vma->shared.prio_tree_node);
572 } else {
573 /* Leave this BUG_ON till prio_tree patch stabilizes */
574 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
575 if (vma->shared.vm_set.parent) {
576 head = vma->shared.vm_set.head;
577 if (!list_empty(&head->shared.vm_set.list)) {
578 new_head = list_entry(
579 head->shared.vm_set.list.next,
580 struct vm_area_struct,
581 shared.vm_set.list);
582 list_del_init(&head->shared.vm_set.list);
583 } else
584 new_head = NULL;
586 prio_tree_replace(root, &vma->shared.prio_tree_node,
587 &head->shared.prio_tree_node);
588 head->shared.vm_set.head = new_head;
589 if (new_head)
590 new_head->shared.vm_set.head = head;
592 } else {
593 node = vma->shared.vm_set.head;
594 if (!list_empty(&vma->shared.vm_set.list)) {
595 new_head = list_entry(
596 vma->shared.vm_set.list.next,
597 struct vm_area_struct,
598 shared.vm_set.list);
599 list_del_init(&vma->shared.vm_set.list);
600 node->shared.vm_set.head = new_head;
601 new_head->shared.vm_set.head = node;
602 } else
603 node->shared.vm_set.head = NULL;
609 * Helper function to enumerate vmas that map a given file page or a set of
610 * contiguous file pages. The function returns vmas that at least map a single
611 * page in the given range of contiguous file pages.
613 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
614 struct prio_tree_iter *iter)
616 struct prio_tree_node *ptr;
617 struct vm_area_struct *next;
619 if (!vma) {
621 * First call is with NULL vma
623 ptr = prio_tree_first(iter);
624 if (ptr) {
625 next = prio_tree_entry(ptr, struct vm_area_struct,
626 shared.prio_tree_node);
627 prefetch(next->shared.vm_set.head);
628 return next;
629 } else
630 return NULL;
633 if (vma->shared.vm_set.parent) {
634 if (vma->shared.vm_set.head) {
635 next = vma->shared.vm_set.head;
636 prefetch(next->shared.vm_set.list.next);
637 return next;
639 } else {
640 next = list_entry(vma->shared.vm_set.list.next,
641 struct vm_area_struct, shared.vm_set.list);
642 if (!next->shared.vm_set.head) {
643 prefetch(next->shared.vm_set.list.next);
644 return next;
648 ptr = prio_tree_next(iter);
649 if (ptr) {
650 next = prio_tree_entry(ptr, struct vm_area_struct,
651 shared.prio_tree_node);
652 prefetch(next->shared.vm_set.head);
653 return next;
654 } else
655 return NULL;