MySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / innodb_plugin / btr / btr0btr.c
/*****************************************************************************

Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0btr.c
The B-tree

Created 6/2/1994 Heikki Tuuri
*******************************************************/
#include "btr0btr.h"

#ifdef UNIV_NONINL
#include "btr0btr.ic"
#endif

#include "fsp0fsp.h"
#include "page0page.h"
#include "page0zip.h"

#ifndef UNIV_HOTBACKUP
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#include "trx0trx.h"

#ifdef UNIV_BLOB_DEBUG
# include "srv0srv.h"
# include "ut0rbt.h"
/** TRUE when messages about index->blobs modification are enabled. */
static ibool btr_blob_dbg_msg;

/** Issue a message about an operation on index->blobs.
@param op	operation
@param b	the entry being subjected to the operation
@param ctx	the context of the operation */
#define btr_blob_dbg_msg_issue(op, b, ctx)			\
	fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n",	\
		(b)->ref_page_no, (b)->ref_heap_no,		\
		(b)->ref_field_no, (b)->blob_page_no, ctx,	\
		(b)->owner, (b)->always_owner, (b)->del)
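
/* Illustrative example (not part of the original source): with
btr_blob_dbg_msg enabled, inserting a reference from record heap
number 5, field 3 on page 4 to a BLOB starting at page 70 during
page reorganization would print a line of the form

	insert 4:5:3->70 btr_page_reorganize(1,1,0)

where the trailing triple is (owner, always_owner, del). */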

/** Insert to index->blobs a reference to an off-page column.
@param index	the index tree
@param b	the reference
@param ctx	context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_insert(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("insert", b, ctx);
	}
	mutex_enter(&index->blobs_mutex);
	rbt_insert(index->blobs, b, b);
	mutex_exit(&index->blobs_mutex);
}

/** Remove from index->blobs a reference to an off-page column.
@param index	the index tree
@param b	the reference
@param ctx	context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_delete(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("delete", b, ctx);
	}
	mutex_enter(&index->blobs_mutex);
	ut_a(rbt_delete(index->blobs, b));
	mutex_exit(&index->blobs_mutex);
}

/**************************************************************//**
Comparator for items (btr_blob_dbg_t) in index->blobs.
The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
static
int
btr_blob_dbg_cmp(
/*=============*/
	const void*	a,	/*!< in: first btr_blob_dbg_t to compare */
	const void*	b)	/*!< in: second btr_blob_dbg_t to compare */
{
	const btr_blob_dbg_t*	aa	= a;
	const btr_blob_dbg_t*	bb	= b;

	ut_ad(aa != NULL);
	ut_ad(bb != NULL);

	if (aa->ref_page_no != bb->ref_page_no) {
		return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
	}
	if (aa->ref_heap_no != bb->ref_heap_no) {
		return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
	}
	if (aa->ref_field_no != bb->ref_field_no) {
		return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
	}
	return(0);
}
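
/* Illustrative usage (mirrors btr_create() later in this file): the
comparator is registered when the index->blobs map is created:

	index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
				  btr_blob_dbg_cmp);

Only (ref_page_no, ref_heap_no, ref_field_no) participate in the
ordering; blob_page_no and the flag fields are payload. */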

/**************************************************************//**
Add a reference to an off-page column to the index->blobs map. */
UNIV_INTERN
void
btr_blob_dbg_add_blob(
/*==================*/
	const rec_t*	rec,		/*!< in: clustered index record */
	ulint		field_no,	/*!< in: off-page column number */
	ulint		page_no,	/*!< in: start page of the column */
	dict_index_t*	index,		/*!< in/out: index tree */
	const char*	ctx)		/*!< in: context (for logging) */
{
	btr_blob_dbg_t	b;
	const page_t*	page	= page_align(rec);

	ut_a(index->blobs);

	b.blob_page_no = page_no;
	b.ref_page_no = page_get_page_no(page);
	b.ref_heap_no = page_rec_get_heap_no(rec);
	b.ref_field_no = field_no;
	ut_a(b.ref_field_no >= index->n_uniq);
	b.always_owner = b.owner = TRUE;
	b.del = FALSE;
	ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
	btr_blob_dbg_rbt_insert(index, &b, ctx);
}

/**************************************************************//**
Add to index->blobs any references to off-page columns from a record.
@return number of references added */
UNIV_INTERN
ulint
btr_blob_dbg_add_rec(
/*=================*/
	const rec_t*	rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in/out: index */
	const ulint*	offsets,/*!< in: offsets */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint		count	= 0;
	ulint		i;
	btr_blob_dbg_t	b;
	ibool		del;

	ut_ad(rec_offs_validate(rec, index, offsets));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);
	del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			if (!memcmp(field_ref, field_ref_zero,
				    BTR_EXTERN_FIELD_REF_SIZE)) {
				/* the column has not been stored yet */
				continue;
			}

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);
			ut_a(b.ref_field_no >= index->n_uniq);
			b.always_owner = b.owner
				= !(field_ref[BTR_EXTERN_LEN]
				    & BTR_EXTERN_OWNER_FLAG);
			b.del = del;

			btr_blob_dbg_rbt_insert(index, &b, ctx);
			count++;
		}
	}

	return(count);
}

/**************************************************************//**
Display the references to off-page columns.
This function is to be called from a debugger,
for example when a breakpoint on ut_dbg_assertion_failed is hit. */
UNIV_INTERN
void
btr_blob_dbg_print(
/*===============*/
	const dict_index_t*	index)	/*!< in: index tree */
{
	const ib_rbt_node_t*	node;

	if (!index->blobs) {
		return;
	}

	/* We intentionally do not acquire index->blobs_mutex here.
	This function is to be called from a debugger, and the caller
	should make sure that the index->blobs_mutex is held. */

	for (node = rbt_first(index->blobs);
	     node != NULL; node = rbt_next(index->blobs, node)) {
		const btr_blob_dbg_t*	b
			= rbt_value(btr_blob_dbg_t, node);
		fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
			b->blob_page_no,
			b->owner ? "" : "(disowned)",
			b->always_owner ? "" : "(has disowned)",
			b->del ? "(deleted)" : "");
	}
}

/**************************************************************//**
Remove from index->blobs any references to off-page columns from a record.
@return number of references removed */
UNIV_INTERN
ulint
btr_blob_dbg_remove_rec(
/*====================*/
	const rec_t*	rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in/out: index */
	const ulint*	offsets,/*!< in: offsets */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint		i;
	ulint		count	= 0;
	btr_blob_dbg_t	b;

	ut_ad(rec_offs_validate(rec, index, offsets));

	if (!rec_offs_any_extern(offsets)) {
		return(0);
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);

			switch (b.blob_page_no) {
			case 0:
				/* The column has not been stored yet.
				The BLOB pointer must be all zero.
				There cannot be a BLOB starting at
				page 0, because page 0 is reserved for
				the tablespace header. */
				ut_a(!memcmp(field_ref, field_ref_zero,
					     BTR_EXTERN_FIELD_REF_SIZE));
				/* fall through */
			case FIL_NULL:
				/* the column has been freed already */
				continue;
			}

			btr_blob_dbg_rbt_delete(index, &b, ctx);
			count++;
		}
	}

	return(count);
}

/**************************************************************//**
Check that there are no references to off-page columns from or to
the given page. Invoked when freeing or clearing a page.
@return TRUE when no orphan references exist */
UNIV_INTERN
ibool
btr_blob_dbg_is_empty(
/*==================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		page_no)	/*!< in: page number */
{
	const ib_rbt_node_t*	node;
	ibool			success	= TRUE;

	if (!index->blobs) {
		return(success);
	}

	mutex_enter(&index->blobs_mutex);

	for (node = rbt_first(index->blobs);
	     node != NULL; node = rbt_next(index->blobs, node)) {
		const btr_blob_dbg_t*	b
			= rbt_value(btr_blob_dbg_t, node);

		if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
			continue;
		}

		fprintf(stderr,
			"InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
			b->owner ? "" : "(disowned)",
			b->always_owner ? "" : "(has disowned)",
			b->del ? "(deleted)" : "",
			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
			b->blob_page_no);

		if (b->blob_page_no != page_no || b->owner || !b->del) {
			success = FALSE;
		}
	}

	mutex_exit(&index->blobs_mutex);
	return(success);
}

/**************************************************************//**
Count and process all references to off-page columns on a page.
@return number of references processed */
UNIV_INTERN
ulint
btr_blob_dbg_op(
/*============*/
	const page_t*		page,	/*!< in: B-tree leaf page */
	const rec_t*		rec,	/*!< in: record to start from
					(NULL to process the whole page) */
	dict_index_t*		index,	/*!< in/out: index */
	const char*		ctx,	/*!< in: context (for logging) */
	const btr_blob_dbg_op_f	op)	/*!< in: operation on records */
{
	ulint		count	= 0;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
	ut_a(!rec || page_align(rec) == page);

	if (!index->blobs || !page_is_leaf(page)
	    || !dict_index_is_clust(index)) {
		return(0);
	}

	if (rec == NULL) {
		rec = page_get_infimum_rec(page);
	}

	do {
		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, &heap);
		count += op(rec, index, offsets, ctx);
		rec = page_rec_get_next_const(rec);
	} while (!page_rec_is_supremum(rec));

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(count);
}

/**************************************************************//**
Count and add to index->blobs any references to off-page columns
from records on a page.
@return number of references added */
UNIV_INTERN
ulint
btr_blob_dbg_add(
/*=============*/
	const page_t*	page,	/*!< in: rewritten page */
	dict_index_t*	index,	/*!< in/out: index */
	const char*	ctx)	/*!< in: context (for logging) */
{
	btr_blob_dbg_assert_empty(index, page_get_page_no(page));

	return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
}

/**************************************************************//**
Count and remove from index->blobs any references to off-page columns
from records on a page.
Used when reorganizing a page, before copying the records.
@return number of references removed */
UNIV_INTERN
ulint
btr_blob_dbg_remove(
/*================*/
	const page_t*	page,	/*!< in: b-tree page */
	dict_index_t*	index,	/*!< in/out: index */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint	count;

	count = btr_blob_dbg_op(page, NULL, index, ctx,
				btr_blob_dbg_remove_rec);

	/* Check that no references exist. */
	btr_blob_dbg_assert_empty(index, page_get_page_no(page));

	return(count);
}

/**************************************************************//**
Restore in index->blobs any references to off-page columns.
Used when a page reorganize fails due to compressed page overflow. */
UNIV_INTERN
void
btr_blob_dbg_restore(
/*=================*/
	const page_t*	npage,	/*!< in: page that failed to compress */
	const page_t*	page,	/*!< in: copy of original page */
	dict_index_t*	index,	/*!< in/out: index */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint	removed;
	ulint	added;

	ut_a(page_get_page_no(npage) == page_get_page_no(page));
	ut_a(page_get_space_id(npage) == page_get_space_id(page));

	removed = btr_blob_dbg_remove(npage, index, ctx);
	added = btr_blob_dbg_add(page, index, ctx);
	ut_a(added == removed);
}

/**************************************************************//**
Modify the 'deleted' flag of a record. */
UNIV_INTERN
void
btr_blob_dbg_set_deleted_flag(
/*==========================*/
	const rec_t*		rec,	/*!< in: record */
	dict_index_t*		index,	/*!< in/out: index */
	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
	ibool			del)	/*!< in: TRUE=deleted, FALSE=exists */
{
	const ib_rbt_node_t*	node;
	btr_blob_dbg_t		b;
	btr_blob_dbg_t*		c;
	ulint			i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_a(dict_index_is_clust(index));
	ut_a(del == !!del);	/* must be FALSE==0 or TRUE==1 */

	if (!rec_offs_any_extern(offsets) || !index->blobs) {

		return;
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);

			switch (b.blob_page_no) {
			case 0:
				ut_a(memcmp(field_ref, field_ref_zero,
					    BTR_EXTERN_FIELD_REF_SIZE));
				/* page number 0 is for the
				page allocation bitmap */
			case FIL_NULL:
				/* the column has been freed already */
				ut_error;
			}

			mutex_enter(&index->blobs_mutex);
			node = rbt_lookup(index->blobs, &b);
			ut_a(node);

			c = rbt_value(btr_blob_dbg_t, node);
			/* The flag should be modified. */
			c->del = del;
			if (btr_blob_dbg_msg) {
				b = *c;
				mutex_exit(&index->blobs_mutex);
				btr_blob_dbg_msg_issue("del_mk", &b, "");
			} else {
				mutex_exit(&index->blobs_mutex);
			}
		}
	}
}

/**************************************************************//**
Change the ownership of an off-page column. */
UNIV_INTERN
void
btr_blob_dbg_owner(
/*===============*/
	const rec_t*		rec,	/*!< in: record */
	dict_index_t*		index,	/*!< in/out: index */
	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
	ulint			i,	/*!< in: ith field in rec */
	ibool			own)	/*!< in: TRUE=owned, FALSE=disowned */
{
	const ib_rbt_node_t*	node;
	btr_blob_dbg_t		b;
	const byte*		field_ref;
	ulint			len;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_a(rec_offs_nth_extern(offsets, i));

	field_ref = rec_get_nth_field(rec, offsets, i, &len);
	ut_a(len != UNIV_SQL_NULL);
	ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
	field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);
	b.ref_field_no = i;
	b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
	b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

	ut_a(b.owner == own);

	mutex_enter(&index->blobs_mutex);
	node = rbt_lookup(index->blobs, &b);
	/* row_ins_clust_index_entry_by_modify() invokes
	btr_cur_unmark_extern_fields() also for the newly inserted
	references, which are all zero bytes until the columns are stored.
	The node lookup must fail if and only if that is the case. */
	ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
	     == !node);

	if (node) {
		btr_blob_dbg_t*	c = rbt_value(btr_blob_dbg_t, node);
		/* Some code sets ownership from TRUE to TRUE.
		We do not allow changing ownership from FALSE to FALSE. */
		ut_a(own || c->owner);

		c->owner = own;
		if (!own) {
			c->always_owner = FALSE;
		}
	}

	mutex_exit(&index->blobs_mutex);
}
#endif /* UNIV_BLOB_DEBUG */

/*
Latching strategy of the InnoDB B-tree
--------------------------------------
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
also has a latch of its own.

A B-tree operation normally first acquires an S-latch on the tree. It
searches down the tree and releases the tree latch when it has the
leaf node latch. To save CPU time we do not acquire any latch on
non-leaf nodes of the tree during a search; those pages are only
buffer-fixed.

If an operation needs to restructure the tree, it acquires an X-latch on
the tree before searching to a leaf node. If it needs, for example, to
split a leaf,
(1) InnoDB decides the split point in the leaf,
(2) allocates a new page,
(3) inserts the appropriate node pointer to the first non-leaf level,
(4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.

Node pointers
-------------
Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. In the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.

File page allocation
--------------------
In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
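
/* Illustrative example (not part of the original source): if leaf page
no. 5 holds records with keys 10, 11 and 12, its level-1 parent holds a
node pointer whose prefix is taken from the record with key 10 and whose
last field is the child page number 5. A search for key 11 descends by
following the node pointer with the greatest prefix <= 11. */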

#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Checks a file segment header within a B-tree root page.
@return TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
	const fseg_header_t*	seg_header,	/*!< in: segment header */
	ulint			space)		/*!< in: tablespace identifier */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
	ut_a(offset >= FIL_PAGE_DATA);
	ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
	return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */

/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
static
buf_block_t*
btr_root_block_get(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		space;
	ulint		zip_size;
	ulint		root_page_no;
	buf_block_t*	block;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	root_page_no = dict_index_get_page(index);

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
			      index, mtr);
	ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
	     == dict_table_is_comp(index->table));
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root = buf_block_get_frame(block);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	return(block);
}

/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return root page, x-latched */
UNIV_INTERN
page_t*
btr_root_get(
/*=========*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */
{
	return(buf_block_get_frame(btr_root_block_get(index, mtr)));
}

/*************************************************************//**
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor.
@return previous user record, NULL if there is none */
UNIV_INTERN
rec_t*
btr_get_prev_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also on the previous page */
{
	page_t*	page;
	page_t*	prev_page;
	ulint	prev_page_no;

	if (!page_rec_is_infimum(rec)) {

		rec_t*	prev_rec = page_rec_get_prev(rec);

		if (!page_rec_is_infimum(prev_rec)) {

			return(prev_rec);
		}
	}

	page = page_align(rec);
	prev_page_no = btr_page_get_prev(page, mtr);

	if (prev_page_no != FIL_NULL) {

		ulint		space;
		ulint		zip_size;
		buf_block_t*	prev_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		prev_block = buf_page_get_with_no_latch(space, zip_size,
							prev_page_no, mtr);
		prev_page = buf_block_get_frame(prev_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, prev_block,
					MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, prev_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
	}

	return(NULL);
}

/*************************************************************//**
Gets pointer to the next user record in the tree. It is assumed that the
caller has appropriate latches on the page and its neighbor.
@return next user record, NULL if there is none */
UNIV_INTERN
rec_t*
btr_get_next_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also on the next page */
{
	page_t*	page;
	page_t*	next_page;
	ulint	next_page_no;

	if (!page_rec_is_supremum(rec)) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (!page_rec_is_supremum(next_rec)) {

			return(next_rec);
		}
	}

	page = page_align(rec);
	next_page_no = btr_page_get_next(page, mtr);

	if (next_page_no != FIL_NULL) {
		ulint		space;
		ulint		zip_size;
		buf_block_t*	next_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		next_block = buf_page_get_with_no_latch(space, zip_size,
							next_page_no, mtr);
		next_page = buf_block_get_frame(next_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, next_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		return(page_rec_get_next(page_get_infimum_rec(next_page)));
	}

	return(NULL);
}

/**************************************************************//**
Creates a new index page (not the root, and also not
used in page reorganization). @see btr_page_empty(). */
static
void
btr_page_create(
/*============*/
	buf_block_t*	block,	/*!< in/out: page to be created */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_create_zip(block, index, level, mtr);
	} else {
		page_create(block, mtr, dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, level, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	btr_page_set_index_id(page, page_zip, index->id, mtr);
}

/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@return new allocated block, x-latched */
static
buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fil_addr_t	node_addr;
	page_t*		root;
	page_t*		new_page;
	buf_block_t*	new_block;

	root = btr_root_get(index, mtr);

	node_addr = flst_get_first(root + PAGE_HEADER
				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
	ut_a(node_addr.page != FIL_NULL);

	new_block = buf_page_get(dict_index_get_space(index),
				 dict_table_zip_size(index->table),
				 node_addr.page, RW_X_LATCH, mtr);
	new_page = buf_block_get_frame(new_block);
	buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);

	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
		    mtr);
	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));

	return(new_block);
}

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
(init_mtr == mtr, or the page was not previously freed in mtr)
@retval block (not allocated or initialized) otherwise */
static __attribute__((nonnull, warn_unused_result))
buf_block_t*
btr_page_alloc_low(
/*===============*/
	dict_index_t*	index,		/*!< in: index */
	ulint		hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr,		/*!< in/out: mini-transaction
					for the allocation */
	mtr_t*		init_mtr)	/*!< in/out: mtr or another
					mini-transaction in which the
					page should be initialized.
					If init_mtr!=mtr, but the page
					is already X-latched in mtr, do
					not initialize the page. */
{
	fseg_header_t*	seg_header;
	page_t*		root;

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	/* Parameter TRUE below states that the caller has made the
	reservation for free extents, and thus we know that a page can
	be allocated: */

	return(fseg_alloc_free_page_general(
		       seg_header, hint_page_no, file_direction,
		       TRUE, mtr, init_mtr));
}

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
(init_mtr == mtr, or the page was not previously freed in mtr)
@retval block (not allocated or initialized) otherwise */
UNIV_INTERN
buf_block_t*
btr_page_alloc(
/*===========*/
	dict_index_t*	index,		/*!< in: index */
	ulint		hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr,		/*!< in/out: mini-transaction
					for the allocation */
	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
					for x-latching and initializing
					the page */
{
	buf_block_t*	new_block;

	if (dict_index_is_ibuf(index)) {

		return(btr_page_alloc_for_ibuf(index, mtr));
	}

	new_block = btr_page_alloc_low(
		index, hint_page_no, file_direction, level, mtr, init_mtr);

	if (new_block) {
		buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
	}

	return(new_block);
}
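
/* Illustrative caller sketch (an assumption, modeled on the pessimistic
B-tree operations that use this function): the extent reservation that
the NOTE above refers to is made beforehand with
fsp_reserve_free_extents() and released afterwards:

	ulint	n_reserved;

	if (!fsp_reserve_free_extents(&n_reserved, space, n_extents,
				      FSP_NORMAL, mtr)) {
		return(DB_OUT_OF_FILE_SPACE);
	}

	new_block = btr_page_alloc(index, hint_page_no, direction,
				   level, mtr, mtr);
	...
	fil_space_release_free_extents(space, n_reserved);
*/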

/**************************************************************//**
Gets the number of pages in a B-tree.
@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
UNIV_INTERN
ulint
btr_get_size(
/*=========*/
	dict_index_t*	index,	/*!< in: index */
	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
				is s-latched */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		n;
	ulint		dummy;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_S_LOCK));

	if (index->page == FIL_NULL
	    || index->to_be_dropped
	    || *index->name == TEMP_INDEX_PREFIX) {
		return(ULINT_UNDEFINED);
	}

	root = btr_root_get(index, mtr);

	if (flag == BTR_N_LEAF_PAGES) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		fseg_n_reserved_pages(seg_header, &n, mtr);

	} else if (flag == BTR_TOTAL_SIZE) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

		n = fseg_n_reserved_pages(seg_header, &dummy, mtr);

		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
	} else {
		ut_error;
	}

	return(n);
}
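
/* Illustrative usage sketch (an assumption): the s-latch that the
ut_ad() above asserts is taken by the caller, e.g.

	mtr_start(&mtr);
	mtr_s_lock(dict_index_get_lock(index), &mtr);
	n_leaf_pages = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
	mtr_commit(&mtr);
*/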

/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
void
btr_page_free_for_ibuf(
/*===================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	root = btr_root_get(index, mtr);

	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		       buf_block_get_frame(block)
		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));
}

/**************************************************************//**
Frees a file page used in an index tree. Can also be used to free (BLOB)
external storage pages, because for those the page level 0 can be given
as an argument. */
UNIV_INTERN
void
btr_page_free_low(
/*==============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	ulint		level,	/*!< in: page level */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The page gets invalid for optimistic searches: increment the frame
	modify clock */

	buf_block_modify_clock_inc(block);
	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));

	if (dict_index_is_ibuf(index)) {

		btr_page_free_for_ibuf(index, block, mtr);

		return;
	}

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	fseg_free_page(seg_header,
		       buf_block_get_space(block),
		       buf_block_get_page_no(block), mtr);

	/* The page was marked free in the allocation bitmap, but it
	should remain buffer-fixed until mtr_commit(mtr) or until it
	is explicitly freed from the mini-transaction. */
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* TODO: Discard any operations on the page from the redo log
	and remove the block from the flush list and the buffer pool.
	This would free up buffer pool earlier and reduce writes to
	both the tablespace and the redo log. */
}

/**************************************************************//**
Frees a file page used in an index tree. NOTE: this cannot be used to
free external field (BLOB) storage pages, because the page must contain
info on its level. */
UNIV_INTERN
void
btr_page_free(
/*==========*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */
{
	const page_t*	page	= buf_block_get_frame(block);
	ulint		level	= btr_page_get_level(page, mtr);

	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
	btr_page_free_low(index, block, level, mtr);
}

/**************************************************************//**
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
btr_node_ptr_set_child_page_no(
/*===========================*/
	rec_t*		rec,	/*!< in: node pointer record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
				uncompressed part will be updated, or NULL */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		page_no,/*!< in: child node address */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	field;
	ulint	len;

	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(!page_is_leaf(page_align(rec)));
	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

	/* The child address is in the last field */
	field = rec_get_nth_field(rec, offsets,
				  rec_offs_n_fields(offsets) - 1, &len);

	ut_ad(len == REC_NODE_PTR_SIZE);

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_zip_write_node_ptr(page_zip, rec,
					rec_offs_data_size(offsets),
					page_no, mtr);
	} else {
		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
	}
}

/************************************************************//**
Returns the child page of a node pointer and x-latches it.
@return child page, x-latched */
static
buf_block_t*
btr_node_ptr_get_child(
/*===================*/
	const rec_t*	node_ptr,/*!< in: node pointer */
	dict_index_t*	index,	/*!< in: index */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint	page_no;
	ulint	space;

	ut_ad(rec_offs_validate(node_ptr, index, offsets));
	space = page_get_space_id(page_align(node_ptr));
	page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

	return(btr_block_get(space, dict_table_zip_size(index->table),
			     page_no, RW_X_LATCH, index, mtr));
}

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_node_ptr_func(
/*==============================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
				out: cursor on node pointer record,
				its page x-latched */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dtuple_t*	tuple;
	rec_t*		user_rec;
	rec_t*		node_ptr;
	ulint		level;
	ulint		page_no;
	dict_index_t*	index;

	page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));

	ut_ad(dict_index_get_page(index) != page_no);

	level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
	user_rec = btr_cur_get_rec(cursor);
	ut_a(page_rec_is_user_rec(user_rec));
	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

	btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE, cursor, 0,
				    file, line, mtr);

	node_ptr = btr_cur_get_rec(cursor);
	ut_ad(!page_rec_is_comp(node_ptr)
	      || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
	offsets = rec_get_offsets(node_ptr, index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
			  != page_no)) {
		rec_t*	print_rec;
		fputs("InnoDB: Dump of the child page:\n", stderr);
		buf_page_print(page_align(user_rec), 0);
		fputs("InnoDB: Dump of the parent page:\n", stderr);
		buf_page_print(page_align(node_ptr), 0);

		fputs("InnoDB: Corruption of an index tree: table ", stderr);
		ut_print_name(stderr, NULL, TRUE, index->table_name);
		fputs(", index ", stderr);
		ut_print_name(stderr, NULL, FALSE, index->name);
		fprintf(stderr, ",\n"
			"InnoDB: father ptr page no %lu, child page no %lu\n",
			(ulong)
			btr_node_ptr_get_child_page_no(node_ptr, offsets),
			(ulong) page_no);
		print_rec = page_rec_get_next(
			page_get_infimum_rec(page_align(user_rec)));
		offsets = rec_get_offsets(print_rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		page_rec_print(print_rec, offsets);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		page_rec_print(node_ptr, offsets);

		fputs("InnoDB: You should dump + drop + reimport the table"
		      " to fix the\n"
		      "InnoDB: corruption. If the crash happens at "
		      "the database startup, see\n"
		      "InnoDB: " REFMAN "forcing-innodb-recovery.html about\n"
		      "InnoDB: forcing recovery. "
		      "Then dump + drop + reimport.\n", stderr);

		ut_error;
	}

	return(offsets);
}

#define btr_page_get_father_node_ptr(of,heap,cur,mtr)			\
	btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
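
/* The _func/macro pair above follows the convention used throughout
InnoDB for passing the caller's __FILE__ and __LINE__ down to the
latching code (here btr_cur_search_to_nth_level()) for diagnostics. */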

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_block(
/*======================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */
{
	rec_t*	rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
			block)));
	btr_cur_position(index, rec, block, cursor);
	return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
}

/************************************************************//**
Seeks to the upper level node pointer to a page.
It is assumed that mtr holds an x-latch on the tree. */
static
void
btr_page_get_father(
/*================*/
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */
{
	mem_heap_t*	heap;
	rec_t*		rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
			block)));
	btr_cur_position(index, rec, block, cursor);

	heap = mem_heap_create(100);
	btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
	mem_heap_free(heap);
}
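
/* Note: unlike btr_page_get_father_block(), this wrapper is for
callers that only need the cursor positioned on the node pointer; the
offsets are computed in a temporary heap that is freed before
returning. */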

/************************************************************//**
Creates the root node for a new index tree.
@return page number of the created root, FIL_NULL if did not succeed */
UNIV_INTERN
ulint
btr_create(
/*=======*/
	ulint		type,	/*!< in: type of the index */
	ulint		space,	/*!< in: space where created */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	dulint		index_id,/*!< in: index id */
	dict_index_t*	index,	/*!< in: index */
	mtr_t*		mtr)	/*!< in: mini-transaction handle */
{
	ulint		page_no;
	buf_block_t*	block;
	buf_frame_t*	frame;
	page_t*		page;
	page_zip_des_t*	page_zip;

	/* Create the two new segments (one, in the case of an ibuf tree) for
	the index tree; the segment headers are put on the allocated root page
	(for an ibuf tree, not in the root, but on a separate ibuf header
	page) */

	if (type & DICT_IBUF) {
		/* Allocate first the ibuf header page */
		buf_block_t*	ibuf_hdr_block = fseg_create(
			space, 0,
			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

		buf_block_dbg_add_level(
			ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);

		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
		      == IBUF_HEADER_PAGE_NO);
		/* Allocate then the next page to the segment: it will be the
		tree root page */

		block = fseg_alloc_free_page(
			buf_block_get_frame(ibuf_hdr_block)
			+ IBUF_HEADER + IBUF_TREE_SEG_HEADER,
			IBUF_TREE_ROOT_PAGE_NO,
			FSP_UP, mtr);
		ut_ad(buf_block_get_page_no(block) == IBUF_TREE_ROOT_PAGE_NO);
	} else {
#ifdef UNIV_BLOB_DEBUG
		if ((type & DICT_CLUSTERED) && !index->blobs) {
			mutex_create(&index->blobs_mutex, SYNC_ANY_LATCH);
			index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
						  btr_blob_dbg_cmp);
		}
#endif /* UNIV_BLOB_DEBUG */
		block = fseg_create(space, 0,
				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	}

	if (block == NULL) {

		return(FIL_NULL);
	}

	page_no = buf_block_get_page_no(block);
	frame = buf_block_get_frame(block);

	if (type & DICT_IBUF) {
		/* It is an insert buffer tree: initialize the free list */
		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);

		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
	} else {
		/* It is a non-ibuf tree: create a file segment for leaf
		pages */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

		if (!fseg_create(space, page_no,
				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
			/* Not enough space for new segment, free root
			segment before return. */
			btr_free_root(space, zip_size, page_no, mtr);

			return(FIL_NULL);
		}

		/* The fseg create acquires a second latch on the page,
		therefore we must declare it: */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
	}

	/* Create a new index page on the allocated segment page */
	page_zip = buf_block_get_page_zip(block);

	if (UNIV_LIKELY_NULL(page_zip)) {
		page = page_create_zip(block, index, 0, mtr);
	} else {
		page = page_create(block, mtr,
				   dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, 0, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	/* Set the index id of the page */
	btr_page_set_index_id(page, page_zip, index_id, mtr);

	/* Set the next node and previous node fields */
	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

	/* We reset the free bits for the page to allow creation of several
	trees in the same mtr, otherwise the latch on a bitmap page would
	prevent it because of the latching order */

	if (!(type & DICT_CLUSTERED)) {
		ibuf_reset_free_bits(block);
	}

	/* In the following assertion we test that two records of maximum
	allowed size fit on the root page: this fact is needed to ensure
	correctness of split algorithms */

	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return(page_no);
}

/************************************************************//**
Frees a B-tree except the root page, which MUST be freed after this
by calling btr_free_root. */
UNIV_INTERN
void
btr_free_but_not_root(
/*==================*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no)	/*!< in: root page number */
{
	ibool	finished;
	page_t*	root;
	mtr_t	mtr;

leaf_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
			    NULL, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
				    + root, space));
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	/* NOTE: page hash indexes are dropped when a page is freed inside
	fsp0fsp. */

	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
				  &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto leaf_loop;
	}
top_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
			    NULL, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	finished = fseg_free_step_not_header(
		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto top_loop;
	}
}
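
/* Illustrative call order (a sketch based on the comments above and
below): a whole tree is freed in two steps, the root page last:

	btr_free_but_not_root(space, zip_size, root_page_no);

	mtr_start(&mtr);
	btr_free_root(space, zip_size, root_page_no, &mtr);
	mtr_commit(&mtr);
*/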

/************************************************************//**
Frees the B-tree root page. The rest of the tree MUST already have been
freed. */
UNIV_INTERN
void
btr_free_root(
/*==========*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no,	/*!< in: root page number */
	mtr_t*	mtr)		/*!< in/out: mini-transaction */
{
	buf_block_t*	block;
	fseg_header_t*	header;

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
			      NULL, mtr);

	btr_search_drop_page_hash_index(block);

	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(header, space));
#endif /* UNIV_BTR_DEBUG */

	while (!fseg_free_step(header, mtr));
}
#endif /* !UNIV_HOTBACKUP */

/*************************************************************//**
Reorganizes an index page.
@return TRUE on success, FALSE on failure */
static
ibool
btr_page_reorganize_low(
/*====================*/
	ibool		recovery,/*!< in: TRUE if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	buf_block_t*	block,	/*!< in: page to be reorganized */
	dict_index_t*	index,	/*!< in: record descriptor */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page		= buf_block_get_frame(block);
	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
	buf_block_t*	temp_block;
	page_t*		temp_page;
	ulint		log_mode;
	ulint		data_size1;
	ulint		data_size2;
	ulint		max_ins_size1;
	ulint		max_ins_size2;
	ibool		success		= FALSE;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

#ifndef UNIV_HOTBACKUP
	/* Write the log record */
	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
				  ? MLOG_COMP_PAGE_REORGANIZE
				  : MLOG_PAGE_REORGANIZE, 0);
#endif /* !UNIV_HOTBACKUP */

	/* Turn logging off */
	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

#ifndef UNIV_HOTBACKUP
	temp_block = buf_block_alloc();
#else /* !UNIV_HOTBACKUP */
	ut_ad(block == back_block1);
	temp_block = back_block2;
#endif /* !UNIV_HOTBACKUP */
	temp_page = temp_block->frame;

	/* Copy the old page to temporary space */
	buf_frame_copy(temp_page, page);

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		btr_search_drop_page_hash_index(block);
	}

	block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */
	btr_blob_dbg_remove(page, index, "btr_page_reorganize");

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, dict_table_is_comp(index->table));

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
		/* Copy max trx id to recreated page */
		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
		/* In crash recovery, dict_index_is_sec_or_ibuf() always
		returns TRUE, even for clustered indexes. max_trx_id is
		unused in clustered index pages. */
		ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
	}

	if (UNIV_LIKELY_NULL(page_zip)
	    && UNIV_UNLIKELY
	    (!page_zip_compress(page_zip, page, index, NULL))) {

		/* Restore the old page and exit. */
		btr_blob_dbg_restore(page, temp_page, index,
				     "btr_page_reorganize_compress_fail");

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the bytes that we skip are identical. */
		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
			     PAGE_HEADER + PAGE_N_RECS + temp_page,
			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
		ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
			     UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
			     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
		       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		goto func_exit;
	}

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(block, temp_block);
	}
#endif /* !UNIV_HOTBACKUP */

	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (UNIV_UNLIKELY(data_size1 != data_size2)
	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
		buf_page_print(page, 0);
		buf_page_print(temp_page, 0);
		fprintf(stderr,
			"InnoDB: Error: page old data size %lu"
			" new data size %lu\n"
			"InnoDB: Error: page old max ins size %lu"
			" new max ins size %lu\n"
			"InnoDB: Submit a detailed bug report"
			" to http://bugs.mysql.com\n",
			(unsigned long) data_size1, (unsigned long) data_size2,
			(unsigned long) max_ins_size1,
			(unsigned long) max_ins_size2);
	} else {
		success = TRUE;
	}

func_exit:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
#ifndef UNIV_HOTBACKUP
	buf_block_free(temp_block);
#endif /* !UNIV_HOTBACKUP */

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);

	return(success);
}

#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Reorganizes an index page.
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
page of a non-clustered index, the caller must update the insert
buffer free bits in the same mini-transaction in such a way that the
modification will be redo-logged.
@return TRUE on success, FALSE on failure */
UNIV_INTERN
ibool
btr_page_reorganize(
/*================*/
	buf_block_t*	block,	/*!< in: page to be reorganized */
	dict_index_t*	index,	/*!< in: record descriptor */
	mtr_t*		mtr)	/*!< in: mtr */
{
	return(btr_page_reorganize_low(FALSE, block, index, mtr));
}
#endif /* !UNIV_HOTBACKUP */
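
/* Illustrative sketch (an assumption; the exact helper depends on the
caller) of honoring the IMPORTANT note above for a compressed leaf page
of a non-clustered index: after

	if (btr_page_reorganize(block, index, mtr)) { ... }

succeeds, the caller performs a redo-logged update of the insert buffer
free bits, e.g. via ibuf_update_free_bits_zip(), in the same mtr. */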

/***********************************************************//**
Parses a redo log record of reorganizing a page.
@return end of log record or NULL */
UNIV_INTERN
byte*
btr_parse_page_reorganize(
/*======================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr __attribute__((unused)),
				/*!< in: buffer end */
	dict_index_t*	index,	/*!< in: record descriptor */
	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
	mtr_t*		mtr)	/*!< in: mtr or NULL */
{
	ut_ad(ptr && end_ptr);

	/* The record is empty, except for the record initial part */

	if (UNIV_LIKELY(block != NULL)) {
		btr_page_reorganize_low(TRUE, block, index, mtr);
	}

	return(ptr);
}

#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Empties an index page. @see btr_page_create(). */
static
void
btr_page_empty(
/*===========*/
	buf_block_t*	block,	/*!< in: page to be emptied */
	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index of the page */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(page_zip == buf_block_get_page_zip(block));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	btr_search_drop_page_hash_index(block);
	btr_blob_dbg_remove(page, index, "btr_page_empty");

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_create_zip(block, index, level, mtr);
	} else {
		page_create(block, mtr, dict_table_is_comp(index->table));
		btr_page_set_level(page, NULL, level, mtr);
	}

	block->check_index_page_at_flush = TRUE;
}
1788 /*************************************************************//**
1789 Makes tree one level higher by splitting the root, and inserts
1790 the tuple. It is assumed that mtr contains an x-latch on the tree.
1791 NOTE that the operation of this function must always succeed;
1792 it cannot be reversed. Therefore enough free disk space must be
1793 guaranteed to be available before this function is called.
1794 @return inserted record */
1795 UNIV_INTERN
1796 rec_t*
1797 btr_root_raise_and_insert(
1798 /*======================*/
1799 btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
1800 on the root page; when the function returns,
1801 the cursor is positioned on the predecessor
1802 of the inserted record */
1803 const dtuple_t* tuple, /*!< in: tuple to insert */
1804 ulint n_ext, /*!< in: number of externally stored columns */
1805 mtr_t* mtr) /*!< in: mtr */
1807 dict_index_t* index;
1808 page_t* root;
1809 page_t* new_page;
1810 ulint new_page_no;
1811 rec_t* rec;
1812 mem_heap_t* heap;
1813 dtuple_t* node_ptr;
1814 ulint level;
1815 rec_t* node_ptr_rec;
1816 page_cur_t* page_cursor;
1817 page_zip_des_t* root_page_zip;
1818 page_zip_des_t* new_page_zip;
1819 buf_block_t* root_block;
1820 buf_block_t* new_block;
1822 root = btr_cur_get_page(cursor);
1823 root_block = btr_cur_get_block(cursor);
1824 root_page_zip = buf_block_get_page_zip(root_block);
1825 ut_ad(page_get_n_recs(root) > 0);
1826 index = btr_cur_get_index(cursor);
1827 #ifdef UNIV_ZIP_DEBUG
1828 ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
1829 #endif /* UNIV_ZIP_DEBUG */
1830 #ifdef UNIV_BTR_DEBUG
1831 if (!dict_index_is_ibuf(index)) {
1832 ulint space = dict_index_get_space(index);
1834 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
1835 + root, space));
1836 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1837 + root, space));
1840 ut_a(dict_index_get_page(index) == page_get_page_no(root));
1841 #endif /* UNIV_BTR_DEBUG */
1842 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1843 MTR_MEMO_X_LOCK));
1844 ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
1846 /* Allocate a new page to the tree. Root splitting is done by first
1847 moving the root records to the new page, emptying the root, putting
1848 a node pointer to the new page, and then splitting the new page. */
1850 level = btr_page_get_level(root, mtr);
1852 new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
1853 new_page = buf_block_get_frame(new_block);
1854 new_page_zip = buf_block_get_page_zip(new_block);
1855 ut_a(!new_page_zip == !root_page_zip);
1856 ut_a(!new_page_zip
1857 || page_zip_get_size(new_page_zip)
1858 == page_zip_get_size(root_page_zip));
1860 btr_page_create(new_block, new_page_zip, index, level, mtr);
1862 /* Set the next node and previous node fields of new page */
1863 btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
1864 btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
1866 /* Copy the records from root to the new page one by one. */
1868 if (0
1869 #ifdef UNIV_ZIP_COPY
1870 || new_page_zip
1871 #endif /* UNIV_ZIP_COPY */
1872 || UNIV_UNLIKELY
1873 (!page_copy_rec_list_end(new_block, root_block,
1874 page_get_infimum_rec(root),
1875 index, mtr))) {
1876 ut_a(new_page_zip);
1878 /* Copy the page byte for byte. */
1879 page_zip_copy_recs(new_page_zip, new_page,
1880 root_page_zip, root, index, mtr);
1882 /* Update the lock table and possible hash index. */
1884 lock_move_rec_list_end(new_block, root_block,
1885 page_get_infimum_rec(root));
1887 btr_search_move_or_delete_hash_entries(new_block, root_block,
1888 index);
1891 /* If this is a pessimistic insert that is actually done to
1892 perform a pessimistic update, then we have stored the lock
1893 information of the record to be inserted on the infimum of the
1894 root page: we cannot discard the lock structs on the root page */
1896 lock_update_root_raise(new_block, root_block);
1898 /* Create a memory heap where the node pointer is stored */
1899 heap = mem_heap_create(100);
1901 rec = page_rec_get_next(page_get_infimum_rec(new_page));
1902 new_page_no = buf_block_get_page_no(new_block);
1904 /* Build the node pointer (= node key and page address) for the
1905 child */
1907 node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
1908 level);
1909 /* The node pointer must be marked as the predefined minimum record,
1910 as there is no lower alphabetical limit to records in the leftmost
1911 node of a level: */
1912 dtuple_set_info_bits(node_ptr,
1913 dtuple_get_info_bits(node_ptr)
1914 | REC_INFO_MIN_REC_FLAG);
1916 /* Rebuild the root page to get free space */
1917 btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
1919 /* Set the next node and previous node fields, although
1920 they should already have been set. The previous node field
1921 must be FIL_NULL if root_page_zip != NULL, because the
1922 REC_INFO_MIN_REC_FLAG (of the first user record) will be
1923 set if and only if btr_page_get_prev() == FIL_NULL. */
1924 btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
1925 btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
1927 page_cursor = btr_cur_get_page_cur(cursor);
1929 /* Insert node pointer to the root */
1931 page_cur_set_before_first(root_block, page_cursor);
1933 node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
1934 index, 0, mtr);
1936 /* The root page should only contain the node pointer
1937 to new_page at this point. Thus, the data should fit. */
1938 ut_a(node_ptr_rec);
1940 /* Free the memory heap */
1941 mem_heap_free(heap);
1943 /* We play it safe and reset the free bits for the new page */
1945 #if 0
1946 fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
1947 #endif
1949 if (!dict_index_is_clust(index)) {
1950 ibuf_reset_free_bits(new_block);
1953 /* Reposition the cursor to the child node */
1954 page_cur_search(new_block, index, tuple,
1955 PAGE_CUR_LE, page_cursor);
1957 /* Split the child and insert tuple */
1958 return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
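/* A hypothetical caller sketch (assumed, not code from this file):
a pessimistic insert raises the root when the cursor sits on the
root page, and otherwise splits the current page. */
#if 0
	if (buf_block_get_page_no(btr_cur_get_block(cursor))
	    == dict_index_get_page(index)) {
		/* the root is full: add a level above it */
		rec = btr_root_raise_and_insert(cursor, tuple, n_ext, mtr);
	} else {
		rec = btr_page_split_and_insert(cursor, tuple, n_ext, mtr);
	}
#endif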
1961 /*************************************************************//**
1962 Decides if the page should be split at the convergence point of inserts
1963 converging to the left.
1964 @return TRUE if split recommended */
1965 UNIV_INTERN
1966 ibool
1967 btr_page_get_split_rec_to_left(
1968 /*===========================*/
1969 btr_cur_t* cursor, /*!< in: cursor at which to insert */
1970 rec_t** split_rec) /*!< out: if split recommended,
1971 the first record on upper half page,
1972 or NULL if tuple to be inserted should
1973 be first */
1975 page_t* page;
1976 rec_t* insert_point;
1977 rec_t* infimum;
1979 page = btr_cur_get_page(cursor);
1980 insert_point = btr_cur_get_rec(cursor);
1982 if (page_header_get_ptr(page, PAGE_LAST_INSERT)
1983 == page_rec_get_next(insert_point)) {
1985 infimum = page_get_infimum_rec(page);
1987 /* If the convergence is in the middle of a page, also include
1988 in the upper page the record immediately preceding the new
1989 insert. Otherwise, we could repeatedly move from page to page
1990 lots of records smaller than the convergence point. */
1992 if (infimum != insert_point
1993 && page_rec_get_next(infimum) != insert_point) {
1995 *split_rec = insert_point;
1996 } else {
1997 *split_rec = page_rec_get_next(insert_point);
2000 return(TRUE);
2003 return(FALSE);
2006 /*************************************************************//**
2007 Decides if the page should be split at the convergence point of inserts
2008 converging to the right.
2009 @return TRUE if split recommended */
2010 UNIV_INTERN
2011 ibool
2012 btr_page_get_split_rec_to_right(
2013 /*============================*/
2014 btr_cur_t* cursor, /*!< in: cursor at which to insert */
2015 rec_t** split_rec) /*!< out: if split recommended,
2016 the first record on upper half page,
2017 or NULL if tuple to be inserted should
2018 be first */
2020 page_t* page;
2021 rec_t* insert_point;
2023 page = btr_cur_get_page(cursor);
2024 insert_point = btr_cur_get_rec(cursor);
2026 /* We use an eager heuristic: if the new insert would be right
2027 after the previous insert on the same page, we assume that there
2028 is a pattern of sequential inserts here. */
2030 if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
2031 == insert_point)) {
2033 rec_t* next_rec;
2035 next_rec = page_rec_get_next(insert_point);
2037 if (page_rec_is_supremum(next_rec)) {
2038 split_at_new:
2039 /* Split at the new record to insert */
2040 *split_rec = NULL;
2041 } else {
2042 rec_t* next_next_rec = page_rec_get_next(next_rec);
2043 if (page_rec_is_supremum(next_next_rec)) {
2045 goto split_at_new;
2048 /* If there are >= 2 user records up from the insert
2049 point, split all but one off. We want to keep one,
2050 because sequential inserts can then use the adaptive
2051 hash index: the necessary checks of the correct search
2052 position can be done just by looking at the records on
2053 this page. */
2055 *split_rec = next_next_rec;
2058 return(TRUE);
2061 return(FALSE);
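/* Worked example for the heuristic above (illustrative): suppose
the page holds user records r1 < r2 < r3 and PAGE_LAST_INSERT
points to r3. Inserting r4 > r3 positions the cursor on r3, the
next record is the supremum, and *split_rec is set to NULL: the
page is split at the new record, the upper half-page starts out
empty, and a run of sequential inserts fills fresh pages without
repeatedly moving old records. */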
2064 /*************************************************************//**
2065 Calculates a split record such that the tuple will certainly fit on
2066 its half-page when the split is performed. We assume in this function
2067 only that the cursor page has at least one user record.
2068 @return split record, or NULL if tuple will be the first record on
2069 the lower or upper half-page (determined by btr_page_tuple_smaller()) */
2070 static
2071 rec_t*
2072 btr_page_get_split_rec(
2073 /*===================*/
2074 btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
2075 const dtuple_t* tuple, /*!< in: tuple to insert */
2076 ulint n_ext) /*!< in: number of externally stored columns */
2078 page_t* page;
2079 page_zip_des_t* page_zip;
2080 ulint insert_size;
2081 ulint free_space;
2082 ulint total_data;
2083 ulint total_n_recs;
2084 ulint total_space;
2085 ulint incl_data;
2086 rec_t* ins_rec;
2087 rec_t* rec;
2088 rec_t* next_rec;
2089 ulint n;
2090 mem_heap_t* heap;
2091 ulint* offsets;
2093 page = btr_cur_get_page(cursor);
2095 insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
2096 free_space = page_get_free_space_of_empty(page_is_comp(page));
2098 page_zip = btr_cur_get_page_zip(cursor);
2099 if (UNIV_LIKELY_NULL(page_zip)) {
2100 /* Estimate the free space of an empty compressed page. */
2101 ulint free_space_zip = page_zip_empty_size(
2102 cursor->index->n_fields,
2103 page_zip_get_size(page_zip));
2105 if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
2106 free_space = (ulint) free_space_zip;
2110 /* free_space is now the free space of a created new page */
2112 total_data = page_get_data_size(page) + insert_size;
2113 total_n_recs = page_get_n_recs(page) + 1;
2114 ut_ad(total_n_recs >= 2);
2115 total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
2117 n = 0;
2118 incl_data = 0;
2119 ins_rec = btr_cur_get_rec(cursor);
2120 rec = page_get_infimum_rec(page);
2122 heap = NULL;
2123 offsets = NULL;
2125 /* We start by including records in the left half until the
2126 space they reserve exceeds half of total_space. If the included
2127 records then fit on the left page and something is also left
2128 over for the right page, they are put on the left page;
2129 otherwise the last included record becomes the first record on
2130 the right half-page. */
2132 do {
2133 /* Decide the next record to include */
2134 if (rec == ins_rec) {
2135 rec = NULL; /* NULL denotes that tuple is
2136 now included */
2137 } else if (rec == NULL) {
2138 rec = page_rec_get_next(ins_rec);
2139 } else {
2140 rec = page_rec_get_next(rec);
2143 if (rec == NULL) {
2144 /* Include tuple */
2145 incl_data += insert_size;
2146 } else {
2147 offsets = rec_get_offsets(rec, cursor->index,
2148 offsets, ULINT_UNDEFINED,
2149 &heap);
2150 incl_data += rec_offs_size(offsets);
2153 n++;
2154 } while (incl_data + page_dir_calc_reserved_space(n)
2155 < total_space / 2);
2157 if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
2158 /* The next record will be the first on
2159 the right half page if it is not the
2160 supremum record of page */
2162 if (rec == ins_rec) {
2163 rec = NULL;
2165 goto func_exit;
2166 } else if (rec == NULL) {
2167 next_rec = page_rec_get_next(ins_rec);
2168 } else {
2169 next_rec = page_rec_get_next(rec);
2171 ut_ad(next_rec);
2172 if (!page_rec_is_supremum(next_rec)) {
2173 rec = next_rec;
2177 func_exit:
2178 if (UNIV_LIKELY_NULL(heap)) {
2179 mem_heap_free(heap);
2181 return(rec);
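/* Worked example with assumed sizes (illustrative): let the page
hold nine 400-byte records and let insert_size be 400 bytes, so
that total_data = 4000 bytes plus the directory reserve. The loop
includes records from the left (counting the tuple when it is
reached) until incl_data plus the directory space for n records
reaches total_space / 2, i.e. after roughly five records; the
record where it stops, possibly advanced by one by the fit check,
is returned as the first record of the upper half-page. */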
2184 /*************************************************************//**
2185 Returns TRUE if the insert fits on the appropriate half-page with the
2186 chosen split_rec.
2187 @return TRUE if fits */
2188 static
2189 ibool
2190 btr_page_insert_fits(
2191 /*=================*/
2192 btr_cur_t* cursor, /*!< in: cursor at which insert
2193 should be made */
2194 const rec_t* split_rec,/*!< in: suggestion for first record
2195 on upper half-page, or NULL if
2196 tuple to be inserted should be first */
2197 const ulint* offsets,/*!< in: rec_get_offsets(
2198 split_rec, cursor->index) */
2199 const dtuple_t* tuple, /*!< in: tuple to insert */
2200 ulint n_ext, /*!< in: number of externally stored columns */
2201 mem_heap_t* heap) /*!< in: temporary memory heap */
2203 page_t* page;
2204 ulint insert_size;
2205 ulint free_space;
2206 ulint total_data;
2207 ulint total_n_recs;
2208 const rec_t* rec;
2209 const rec_t* end_rec;
2210 ulint* offs;
2212 page = btr_cur_get_page(cursor);
2214 ut_ad(!split_rec == !offsets);
2215 ut_ad(!offsets
2216 || !page_is_comp(page) == !rec_offs_comp(offsets));
2217 ut_ad(!offsets
2218 || rec_offs_validate(split_rec, cursor->index, offsets));
2220 insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
2221 free_space = page_get_free_space_of_empty(page_is_comp(page));
2223 /* free_space is now the free space of a created new page */
2225 total_data = page_get_data_size(page) + insert_size;
2226 total_n_recs = page_get_n_recs(page) + 1;
2228 /* We determine which records (from rec to end_rec, not including
2229 end_rec) will end up on the other half page from tuple when it is
2230 inserted. */
2232 if (split_rec == NULL) {
2233 rec = page_rec_get_next(page_get_infimum_rec(page));
2234 end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
2236 } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
2238 rec = page_rec_get_next(page_get_infimum_rec(page));
2239 end_rec = split_rec;
2240 } else {
2241 rec = split_rec;
2242 end_rec = page_get_supremum_rec(page);
2245 if (total_data + page_dir_calc_reserved_space(total_n_recs)
2246 <= free_space) {
2248 /* Ok, there will be enough available space on the
2249 half page where the tuple is inserted */
2251 return(TRUE);
2254 offs = NULL;
2256 while (rec != end_rec) {
2257 /* In this loop we calculate the amount of reserved
2258 space after rec is removed from page. */
2260 offs = rec_get_offsets(rec, cursor->index, offs,
2261 ULINT_UNDEFINED, &heap);
2263 total_data -= rec_offs_size(offs);
2264 total_n_recs--;
2266 if (total_data + page_dir_calc_reserved_space(total_n_recs)
2267 <= free_space) {
2269 /* Ok, there will be enough available space on the
2270 half page where the tuple is inserted */
2272 return(TRUE);
2275 rec = page_rec_get_next_const(rec);
2278 return(FALSE);
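/* Worked example with assumed numbers (illustrative): suppose
free_space of an empty page is 16000 bytes, total_data including
the tuple is 18000 bytes, and split_rec was chosen so that 4000
bytes of records end up on the other half-page. The loop subtracts
those records, 18000 - 4000 plus the directory reserve drops below
16000, and TRUE is returned: the insert fits. */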
2281 /*******************************************************//**
2282 Inserts a data tuple to a tree on a non-leaf level. It is assumed
2283 that mtr holds an x-latch on the tree. */
2284 UNIV_INTERN
2285 void
2286 btr_insert_on_non_leaf_level_func(
2287 /*==============================*/
2288 dict_index_t* index, /*!< in: index */
2289 ulint level, /*!< in: level, must be > 0 */
2290 dtuple_t* tuple, /*!< in: the record to be inserted */
2291 const char* file, /*!< in: file name */
2292 ulint line, /*!< in: line where called */
2293 mtr_t* mtr) /*!< in: mtr */
2295 big_rec_t* dummy_big_rec;
2296 btr_cur_t cursor;
2297 ulint err;
2298 rec_t* rec;
2300 ut_ad(level > 0);
2302 btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
2303 BTR_CONT_MODIFY_TREE,
2304 &cursor, 0, file, line, mtr);
2306 ut_ad(cursor.flag == BTR_CUR_BINARY);
2308 err = btr_cur_optimistic_insert(
2309 BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
2310 | BTR_NO_UNDO_LOG_FLAG, &cursor, tuple, &rec,
2311 &dummy_big_rec, 0, NULL, mtr);
2313 if (err == DB_FAIL) {
2314 err = btr_cur_pessimistic_insert(
2315 BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
2316 | BTR_NO_UNDO_LOG_FLAG,
2317 &cursor, tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
2318 ut_a(err == DB_SUCCESS);
2322 /**************************************************************//**
2323 Attaches the halves of an index page on the appropriate level in an
2324 index tree. */
2325 static
2326 void
2327 btr_attach_half_pages(
2328 /*==================*/
2329 dict_index_t* index, /*!< in: the index tree */
2330 buf_block_t* block, /*!< in/out: page to be split */
2331 const rec_t* split_rec, /*!< in: first record on upper
2332 half page */
2333 buf_block_t* new_block, /*!< in/out: the new half page */
2334 ulint direction, /*!< in: FSP_UP or FSP_DOWN */
2335 mtr_t* mtr) /*!< in: mtr */
2337 ulint space;
2338 ulint zip_size;
2339 ulint prev_page_no;
2340 ulint next_page_no;
2341 ulint level;
2342 page_t* page = buf_block_get_frame(block);
2343 page_t* lower_page;
2344 page_t* upper_page;
2345 ulint lower_page_no;
2346 ulint upper_page_no;
2347 page_zip_des_t* lower_page_zip;
2348 page_zip_des_t* upper_page_zip;
2349 dtuple_t* node_ptr_upper;
2350 mem_heap_t* heap;
2352 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2353 ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
2355 /* Create a memory heap where the data tuple is stored */
2356 heap = mem_heap_create(1024);
2358 /* Based on split direction, decide upper and lower pages */
2359 if (direction == FSP_DOWN) {
2361 btr_cur_t cursor;
2362 ulint* offsets;
2364 lower_page = buf_block_get_frame(new_block);
2365 lower_page_no = buf_block_get_page_no(new_block);
2366 lower_page_zip = buf_block_get_page_zip(new_block);
2367 upper_page = buf_block_get_frame(block);
2368 upper_page_no = buf_block_get_page_no(block);
2369 upper_page_zip = buf_block_get_page_zip(block);
2371 /* Look up the index for the node pointer to page */
2372 offsets = btr_page_get_father_block(NULL, heap, index,
2373 block, mtr, &cursor);
2375 /* Replace the address of the old child node (= page) with the
2376 address of the new lower half */
2378 btr_node_ptr_set_child_page_no(
2379 btr_cur_get_rec(&cursor),
2380 btr_cur_get_page_zip(&cursor),
2381 offsets, lower_page_no, mtr);
2382 mem_heap_empty(heap);
2383 } else {
2384 lower_page = buf_block_get_frame(block);
2385 lower_page_no = buf_block_get_page_no(block);
2386 lower_page_zip = buf_block_get_page_zip(block);
2387 upper_page = buf_block_get_frame(new_block);
2388 upper_page_no = buf_block_get_page_no(new_block);
2389 upper_page_zip = buf_block_get_page_zip(new_block);
2392 /* Get the level of the split pages */
2393 level = btr_page_get_level(buf_block_get_frame(block), mtr);
2394 ut_ad(level
2395 == btr_page_get_level(buf_block_get_frame(new_block), mtr));
2397 /* Build the node pointer (= node key and page address) for the upper
2398 half */
2400 node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
2401 upper_page_no, heap, level);
2403 /* Insert it next to the pointer to the lower half. Note that this
2404 may generate recursion leading to a split on the higher level. */
2406 btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
2408 /* Free the memory heap */
2409 mem_heap_free(heap);
2411 /* Get the previous and next pages of page */
2413 prev_page_no = btr_page_get_prev(page, mtr);
2414 next_page_no = btr_page_get_next(page, mtr);
2415 space = buf_block_get_space(block);
2416 zip_size = buf_block_get_zip_size(block);
2418 /* Update page links of the level */
2420 if (prev_page_no != FIL_NULL) {
2421 buf_block_t* prev_block = btr_block_get(
2422 space, zip_size, prev_page_no, RW_X_LATCH, index, mtr);
2423 #ifdef UNIV_BTR_DEBUG
2424 ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
2425 ut_a(btr_page_get_next(prev_block->frame, mtr)
2426 == buf_block_get_page_no(block));
2427 #endif /* UNIV_BTR_DEBUG */
2429 btr_page_set_next(buf_block_get_frame(prev_block),
2430 buf_block_get_page_zip(prev_block),
2431 lower_page_no, mtr);
2434 if (next_page_no != FIL_NULL) {
2435 buf_block_t* next_block = btr_block_get(
2436 space, zip_size, next_page_no, RW_X_LATCH, index, mtr);
2437 #ifdef UNIV_BTR_DEBUG
2438 ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
2439 ut_a(btr_page_get_prev(next_block->frame, mtr)
2440 == page_get_page_no(page));
2441 #endif /* UNIV_BTR_DEBUG */
2443 btr_page_set_prev(buf_block_get_frame(next_block),
2444 buf_block_get_page_zip(next_block),
2445 upper_page_no, mtr);
2448 btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
2449 btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
2451 btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
2452 btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
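/* Illustrative picture of the pointer updates above: before the
split the level list reads  prev <-> page <-> next;  afterwards it
reads  prev <-> lower <-> upper <-> next,  and on the father level
the pointer to the old child now addresses the lower half while a
new node pointer (split_rec, upper_page_no) sits next to it. */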
2455 /*************************************************************//**
2456 Determine if a tuple is smaller than any record on the page.
2457 @return TRUE if smaller */
2458 static
2459 ibool
2460 btr_page_tuple_smaller(
2461 /*===================*/
2462 btr_cur_t* cursor, /*!< in: b-tree cursor */
2463 const dtuple_t* tuple, /*!< in: tuple to consider */
2464 ulint* offsets,/*!< in/out: temporary storage */
2465 ulint n_uniq, /*!< in: number of unique fields
2466 in the index page records */
2467 mem_heap_t** heap) /*!< in/out: heap for offsets */
2469 buf_block_t* block;
2470 const rec_t* first_rec;
2471 page_cur_t pcur;
2473 /* Read the first user record in the page. */
2474 block = btr_cur_get_block(cursor);
2475 page_cur_set_before_first(block, &pcur);
2476 page_cur_move_to_next(&pcur);
2477 first_rec = page_cur_get_rec(&pcur);
2479 offsets = rec_get_offsets(
2480 first_rec, cursor->index, offsets,
2481 n_uniq, heap);
2483 return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0);
2486 /*************************************************************//**
2487 Splits an index page to halves and inserts the tuple. It is assumed
2488 that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
2489 released within this function! NOTE that the operation of this
2490 function must always succeed; it cannot be reversed. Therefore enough
2491 free disk space (2 pages) must be guaranteed to be available before
2492 this function is called.
2494 @return inserted record */
2495 UNIV_INTERN
2496 rec_t*
2497 btr_page_split_and_insert(
2498 /*======================*/
2499 btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
2500 function returns, the cursor is positioned
2501 on the predecessor of the inserted record */
2502 const dtuple_t* tuple, /*!< in: tuple to insert */
2503 ulint n_ext, /*!< in: number of externally stored columns */
2504 mtr_t* mtr) /*!< in: mtr */
2506 buf_block_t* block;
2507 page_t* page;
2508 page_zip_des_t* page_zip;
2509 ulint page_no;
2510 byte direction;
2511 ulint hint_page_no;
2512 buf_block_t* new_block;
2513 page_t* new_page;
2514 page_zip_des_t* new_page_zip;
2515 rec_t* split_rec;
2516 buf_block_t* left_block;
2517 buf_block_t* right_block;
2518 buf_block_t* insert_block;
2519 page_cur_t* page_cursor;
2520 rec_t* first_rec;
2521 byte* buf = 0; /* remove warning */
2522 rec_t* move_limit;
2523 ibool insert_will_fit;
2524 ibool insert_left;
2525 ulint n_iterations = 0;
2526 rec_t* rec;
2527 mem_heap_t* heap;
2528 ulint n_uniq;
2529 ulint* offsets;
2531 heap = mem_heap_create(1024);
2532 n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
2533 func_start:
2534 mem_heap_empty(heap);
2535 offsets = NULL;
2537 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
2538 MTR_MEMO_X_LOCK));
2539 #ifdef UNIV_SYNC_DEBUG
2540 ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
2541 #endif /* UNIV_SYNC_DEBUG */
2543 block = btr_cur_get_block(cursor);
2544 page = buf_block_get_frame(block);
2545 page_zip = buf_block_get_page_zip(block);
2547 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2548 ut_ad(page_get_n_recs(page) >= 1);
2550 page_no = buf_block_get_page_no(block);
2552 /* 1. Decide the split record; split_rec == NULL means that the
2553 tuple to be inserted should be the first record on the upper
2554 half-page */
2555 insert_left = FALSE;
2557 if (n_iterations > 0) {
2558 direction = FSP_UP;
2559 hint_page_no = page_no + 1;
2560 split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
2562 if (UNIV_UNLIKELY(split_rec == NULL)) {
2563 insert_left = btr_page_tuple_smaller(
2564 cursor, tuple, offsets, n_uniq, &heap);
2566 } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
2567 direction = FSP_UP;
2568 hint_page_no = page_no + 1;
2570 } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
2571 direction = FSP_DOWN;
2572 hint_page_no = page_no - 1;
2573 ut_ad(split_rec);
2574 } else {
2575 direction = FSP_UP;
2576 hint_page_no = page_no + 1;
2578 /* If there is only one record in the index page, we
2579 can't split the node in the middle by default. We need
2580 to determine whether the new record will be inserted
2581 to the left or right. */
2583 if (page_get_n_recs(page) > 1) {
2584 split_rec = page_get_middle_rec(page);
2585 } else if (btr_page_tuple_smaller(cursor, tuple,
2586 offsets, n_uniq, &heap)) {
2587 split_rec = page_rec_get_next(
2588 page_get_infimum_rec(page));
2589 } else {
2590 split_rec = NULL;
2594 /* 2. Allocate a new page to the index */
2595 new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
2596 btr_page_get_level(page, mtr), mtr, mtr);
2597 new_page = buf_block_get_frame(new_block);
2598 new_page_zip = buf_block_get_page_zip(new_block);
2599 btr_page_create(new_block, new_page_zip, cursor->index,
2600 btr_page_get_level(page, mtr), mtr);
2602 /* 3. Calculate the first record on the upper half-page, and the
2603 first record (move_limit) on original page which ends up on the
2604 upper half */
2606 if (split_rec) {
2607 first_rec = move_limit = split_rec;
2609 offsets = rec_get_offsets(split_rec, cursor->index, offsets,
2610 n_uniq, &heap);
2612 insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
2614 if (UNIV_UNLIKELY(!insert_left && new_page_zip
2615 && n_iterations > 0)) {
2616 /* If a compressed page has already been split,
2617 avoid further splits by inserting the record
2618 to an empty page. */
2619 split_rec = NULL;
2620 goto insert_empty;
2622 } else if (UNIV_UNLIKELY(insert_left)) {
2623 ut_a(n_iterations > 0);
2624 first_rec = page_rec_get_next(page_get_infimum_rec(page));
2625 move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
2626 } else {
2627 insert_empty:
2628 ut_ad(!split_rec);
2629 ut_ad(!insert_left);
2630 buf = mem_alloc(rec_get_converted_size(cursor->index,
2631 tuple, n_ext));
2633 first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
2634 tuple, n_ext);
2635 move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
2638 /* 4. Do first the modifications in the tree structure */
2640 btr_attach_half_pages(cursor->index, block,
2641 first_rec, new_block, direction, mtr);
2643 /* If the split is made on the leaf level and the insert will fit
2644 on the appropriate half-page, we may release the tree x-latch.
2645 We can then move the records after releasing the tree latch,
2646 thus reducing the tree latch contention. */
2648 if (split_rec) {
2649 insert_will_fit = !new_page_zip
2650 && btr_page_insert_fits(cursor, split_rec,
2651 offsets, tuple, n_ext, heap);
2652 } else {
2653 if (!insert_left) {
2654 mem_free(buf);
2655 buf = NULL;
2658 insert_will_fit = !new_page_zip
2659 && btr_page_insert_fits(cursor, NULL,
2660 NULL, tuple, n_ext, heap);
2663 if (insert_will_fit && page_is_leaf(page)) {
2665 mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
2666 MTR_MEMO_X_LOCK);
2669 /* 5. Move then the records to the new page */
2670 if (direction == FSP_DOWN) {
2671 /* fputs("Split left\n", stderr); */
2673 if (0
2674 #ifdef UNIV_ZIP_COPY
2675 || page_zip
2676 #endif /* UNIV_ZIP_COPY */
2677 || UNIV_UNLIKELY
2678 (!page_move_rec_list_start(new_block, block, move_limit,
2679 cursor->index, mtr))) {
2680 /* For some reason, compressing new_page failed,
2681 even though it should contain fewer records than
2682 the original page. Copy the page byte for byte
2683 and then delete the records from both pages
2684 as appropriate. Deleting will always succeed. */
2685 ut_a(new_page_zip);
2687 page_zip_copy_recs(new_page_zip, new_page,
2688 page_zip, page, cursor->index, mtr);
2689 page_delete_rec_list_end(move_limit - page + new_page,
2690 new_block, cursor->index,
2691 ULINT_UNDEFINED,
2692 ULINT_UNDEFINED, mtr);
2694 /* Update the lock table and possible hash index. */
2696 lock_move_rec_list_start(
2697 new_block, block, move_limit,
2698 new_page + PAGE_NEW_INFIMUM);
2700 btr_search_move_or_delete_hash_entries(
2701 new_block, block, cursor->index);
2703 /* Delete the records from the source page. */
2705 page_delete_rec_list_start(move_limit, block,
2706 cursor->index, mtr);
2709 left_block = new_block;
2710 right_block = block;
2712 lock_update_split_left(right_block, left_block);
2713 } else {
2714 /* fputs("Split right\n", stderr); */
2716 if (0
2717 #ifdef UNIV_ZIP_COPY
2718 || page_zip
2719 #endif /* UNIV_ZIP_COPY */
2720 || UNIV_UNLIKELY
2721 (!page_move_rec_list_end(new_block, block, move_limit,
2722 cursor->index, mtr))) {
2723 /* For some reason, compressing new_page failed,
2724 even though it should contain fewer records than
2725 the original page. Copy the page byte for byte
2726 and then delete the records from both pages
2727 as appropriate. Deleting will always succeed. */
2728 ut_a(new_page_zip);
2730 page_zip_copy_recs(new_page_zip, new_page,
2731 page_zip, page, cursor->index, mtr);
2732 page_delete_rec_list_start(move_limit - page
2733 + new_page, new_block,
2734 cursor->index, mtr);
2736 /* Update the lock table and possible hash index. */
2738 lock_move_rec_list_end(new_block, block, move_limit);
2740 btr_search_move_or_delete_hash_entries(
2741 new_block, block, cursor->index);
2743 /* Delete the records from the source page. */
2745 page_delete_rec_list_end(move_limit, block,
2746 cursor->index,
2747 ULINT_UNDEFINED,
2748 ULINT_UNDEFINED, mtr);
2751 left_block = block;
2752 right_block = new_block;
2754 lock_update_split_right(right_block, left_block);
2757 #ifdef UNIV_ZIP_DEBUG
2758 if (UNIV_LIKELY_NULL(page_zip)) {
2759 ut_a(page_zip_validate(page_zip, page, cursor->index));
2760 ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
2762 #endif /* UNIV_ZIP_DEBUG */
2764 /* At this point, split_rec, move_limit and first_rec may point
2765 to garbage on the old page. */
2767 /* 6. The split and the tree modification is now completed. Decide the
2768 page where the tuple should be inserted */
2770 if (insert_left) {
2771 insert_block = left_block;
2772 } else {
2773 insert_block = right_block;
2776 /* 7. Reposition the cursor for insert and try insertion */
2777 page_cursor = btr_cur_get_page_cur(cursor);
2779 page_cur_search(insert_block, cursor->index, tuple,
2780 PAGE_CUR_LE, page_cursor);
2782 rec = page_cur_tuple_insert(page_cursor, tuple,
2783 cursor->index, n_ext, mtr);
2785 #ifdef UNIV_ZIP_DEBUG
2787 page_t* insert_page
2788 = buf_block_get_frame(insert_block);
2790 page_zip_des_t* insert_page_zip
2791 = buf_block_get_page_zip(insert_block);
2793 ut_a(!insert_page_zip
2794 || page_zip_validate(insert_page_zip, insert_page,
2795 cursor->index));
2797 #endif /* UNIV_ZIP_DEBUG */
2799 if (UNIV_LIKELY(rec != NULL)) {
2801 goto func_exit;
2804 /* 8. If insert did not fit, try page reorganization */
2806 if (UNIV_UNLIKELY
2807 (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
2809 goto insert_failed;
2812 page_cur_search(insert_block, cursor->index, tuple,
2813 PAGE_CUR_LE, page_cursor);
2814 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2815 n_ext, mtr);
2817 if (UNIV_UNLIKELY(rec == NULL)) {
2818 /* The insert did not fit on the page: loop back to the
2819 start of the function for a new split */
2820 insert_failed:
2821 /* We play it safe and reset the free bits for new_page */
2822 if (!dict_index_is_clust(cursor->index)) {
2823 ibuf_reset_free_bits(new_block);
2826 /* fprintf(stderr, "Split second round %lu\n",
2827 page_get_page_no(page)); */
2828 n_iterations++;
2829 ut_ad(n_iterations < 2
2830 || buf_block_get_page_zip(insert_block));
2831 ut_ad(!insert_will_fit);
2833 goto func_start;
2836 func_exit:
2837 /* Insert fit on the page: update the free bits for the
2838 left and right pages in the same mtr */
2840 if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
2841 ibuf_update_free_bits_for_two_pages_low(
2842 buf_block_get_zip_size(left_block),
2843 left_block, right_block, mtr);
2846 #if 0
2847 fprintf(stderr, "Split and insert done %lu %lu\n",
2848 buf_block_get_page_no(left_block),
2849 buf_block_get_page_no(right_block));
2850 #endif
2852 ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
2853 ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
2855 mem_heap_free(heap);
2856 return(rec);
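/* Illustrative note on termination: if the reorganize-and-retry of
steps 7-8 still fails (possible on compressed pages), control loops
back to func_start with n_iterations > 0, which forces an upward
split and, on compressed pages, an insert into an empty page; the
assertion on n_iterations above documents that uncompressed pages
never need more than two rounds. */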
2859 #ifdef UNIV_SYNC_DEBUG
2860 /*************************************************************//**
2861 Removes a page from the level list of pages.
2862 @param space in: space where removed
2863 @param zip_size in: compressed page size in bytes, or 0 for uncompressed
2864 @param page in/out: page to remove
2865 @param index in: index tree
2866 @param mtr in/out: mini-transaction */
2867 # define btr_level_list_remove(space,zip_size,page,index,mtr) \
2868 btr_level_list_remove_func(space,zip_size,page,index,mtr)
2869 #else /* UNIV_SYNC_DEBUG */
2870 /*************************************************************//**
2871 Removes a page from the level list of pages.
2872 @param space in: space where removed
2873 @param zip_size in: compressed page size in bytes, or 0 for uncompressed
2874 @param page in/out: page to remove
2875 @param index in: index tree
2876 @param mtr in/out: mini-transaction */
2877 # define btr_level_list_remove(space,zip_size,page,index,mtr) \
2878 btr_level_list_remove_func(space,zip_size,page,mtr)
2879 #endif /* UNIV_SYNC_DEBUG */
2881 /*************************************************************//**
2882 Removes a page from the level list of pages. */
2883 static __attribute__((nonnull))
2884 void
2885 btr_level_list_remove_func(
2886 /*=======================*/
2887 ulint space, /*!< in: space where removed */
2888 ulint zip_size,/*!< in: compressed page size in bytes
2889 or 0 for uncompressed pages */
2890 page_t* page, /*!< in/out: page to remove */
2891 #ifdef UNIV_SYNC_DEBUG
2892 const dict_index_t* index, /*!< in: index tree */
2893 #endif /* UNIV_SYNC_DEBUG */
2894 mtr_t* mtr) /*!< in/out: mini-transaction */
2896 ulint prev_page_no;
2897 ulint next_page_no;
2899 ut_ad(page && mtr);
2900 ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
2901 ut_ad(space == page_get_space_id(page));
2902 /* Get the previous and next page numbers of page */
2904 prev_page_no = btr_page_get_prev(page, mtr);
2905 next_page_no = btr_page_get_next(page, mtr);
2907 /* Update page links of the level */
2909 if (prev_page_no != FIL_NULL) {
2910 buf_block_t* prev_block
2911 = btr_block_get(space, zip_size, prev_page_no,
2912 RW_X_LATCH, index, mtr);
2913 page_t* prev_page
2914 = buf_block_get_frame(prev_block);
2915 #ifdef UNIV_BTR_DEBUG
2916 ut_a(page_is_comp(prev_page) == page_is_comp(page));
2917 ut_a(btr_page_get_next(prev_page, mtr)
2918 == page_get_page_no(page));
2919 #endif /* UNIV_BTR_DEBUG */
2921 btr_page_set_next(prev_page,
2922 buf_block_get_page_zip(prev_block),
2923 next_page_no, mtr);
2926 if (next_page_no != FIL_NULL) {
2927 buf_block_t* next_block
2928 = btr_block_get(space, zip_size, next_page_no,
2929 RW_X_LATCH, index, mtr);
2930 page_t* next_page
2931 = buf_block_get_frame(next_block);
2932 #ifdef UNIV_BTR_DEBUG
2933 ut_a(page_is_comp(next_page) == page_is_comp(page));
2934 ut_a(btr_page_get_prev(next_page, mtr)
2935 == page_get_page_no(page));
2936 #endif /* UNIV_BTR_DEBUG */
2938 btr_page_set_prev(next_page,
2939 buf_block_get_page_zip(next_block),
2940 prev_page_no, mtr);
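/* Illustrative effect of the unlink above: the level list
prev <-> page <-> next becomes prev <-> next; only the two
neighbour pages are modified, each under an X-latch and
redo-logged through mtr, while page itself is left for the
caller to free. */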
2944 /****************************************************************//**
2945 Writes the redo log record for setting an index record as the predefined
2946 minimum record. */
2947 UNIV_INLINE
2948 void
2949 btr_set_min_rec_mark_log(
2950 /*=====================*/
2951 rec_t* rec, /*!< in: record */
2952 byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
2953 mtr_t* mtr) /*!< in: mtr */
2955 mlog_write_initial_log_record(rec, type, mtr);
2957 /* Write rec offset as a 2-byte ulint */
2958 mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
2960 #else /* !UNIV_HOTBACKUP */
2961 # define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
2962 #endif /* !UNIV_HOTBACKUP */
2964 /****************************************************************//**
2965 Parses the redo log record for setting an index record as the predefined
2966 minimum record.
2967 @return end of log record or NULL */
2968 UNIV_INTERN
2969 byte*
2970 btr_parse_set_min_rec_mark(
2971 /*=======================*/
2972 byte* ptr, /*!< in: buffer */
2973 byte* end_ptr,/*!< in: buffer end */
2974 ulint comp, /*!< in: nonzero=compact page format */
2975 page_t* page, /*!< in: page or NULL */
2976 mtr_t* mtr) /*!< in: mtr or NULL */
2978 rec_t* rec;
2980 if (end_ptr < ptr + 2) {
2982 return(NULL);
2985 if (page) {
2986 ut_a(!page_is_comp(page) == !comp);
2988 rec = page + mach_read_from_2(ptr);
2990 btr_set_min_rec_mark(rec, mtr);
2993 return(ptr + 2);
2996 /****************************************************************//**
2997 Sets a record as the predefined minimum record. */
2998 UNIV_INTERN
2999 void
3000 btr_set_min_rec_mark(
3001 /*=================*/
3002 rec_t* rec, /*!< in: record */
3003 mtr_t* mtr) /*!< in: mtr */
3005 ulint info_bits;
3007 if (UNIV_LIKELY(page_rec_is_comp(rec))) {
3008 info_bits = rec_get_info_bits(rec, TRUE);
3010 rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3012 btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
3013 } else {
3014 info_bits = rec_get_info_bits(rec, FALSE);
3016 rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3018 btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
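/* A minimal recovery-side sketch (illustrative only): the log
record written by btr_set_min_rec_mark_log() carries just the
2-byte page offset of rec after the initial log record part, and
is replayed by btr_parse_set_min_rec_mark() above. */
#if 0
	ptr = btr_parse_set_min_rec_mark(ptr, end_ptr,
					 page_is_comp(page), page, mtr);
#endif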
3022 #ifndef UNIV_HOTBACKUP
3023 /*************************************************************//**
3024 Deletes on the upper level the node pointer to a page. */
3025 UNIV_INTERN
3026 void
3027 btr_node_ptr_delete(
3028 /*================*/
3029 dict_index_t* index, /*!< in: index tree */
3030 buf_block_t* block, /*!< in: page whose node pointer is deleted */
3031 mtr_t* mtr) /*!< in: mtr */
3033 btr_cur_t cursor;
3034 ibool compressed;
3035 ulint err;
3037 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3039 /* Delete node pointer on father page */
3040 btr_page_get_father(index, block, mtr, &cursor);
3042 compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
3043 mtr);
3044 ut_a(err == DB_SUCCESS);
3046 if (!compressed) {
3047 btr_cur_compress_if_useful(&cursor, FALSE, mtr);
3051 /*************************************************************//**
3052 If the page is the only one on its level, this function moves its
3053 records to the father page, thus reducing the tree height.
3054 @return father block */
3055 static
3056 buf_block_t*
3057 btr_lift_page_up(
3058 /*=============*/
3059 dict_index_t* index, /*!< in: index tree */
3060 buf_block_t* block, /*!< in: page which is the only one on its level;
3061 must not be empty: use
3062 btr_discard_only_page_on_level if the last
3063 record from the page should be removed */
3064 mtr_t* mtr) /*!< in: mtr */
3066 buf_block_t* father_block;
3067 page_t* father_page;
3068 ulint page_level;
3069 page_zip_des_t* father_page_zip;
3070 page_t* page = buf_block_get_frame(block);
3071 ulint root_page_no;
3072 buf_block_t* blocks[BTR_MAX_LEVELS];
3073 ulint n_blocks; /*!< last used index in blocks[] */
3074 ulint i;
3075 ibool lift_father_up = FALSE;
3076 buf_block_t* block_orig = block;
3078 ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
3079 ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
3080 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3082 page_level = btr_page_get_level(page, mtr);
3083 root_page_no = dict_index_get_page(index);
3086 btr_cur_t cursor;
3087 ulint* offsets = NULL;
3088 mem_heap_t* heap = mem_heap_create(
3089 sizeof(*offsets)
3090 * (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields));
3091 buf_block_t* b;
3093 offsets = btr_page_get_father_block(offsets, heap, index,
3094 block, mtr, &cursor);
3095 father_block = btr_cur_get_block(&cursor);
3096 father_page_zip = buf_block_get_page_zip(father_block);
3097 father_page = buf_block_get_frame(father_block);
3099 n_blocks = 0;
3101 /* Store all ancestor pages so we can reset their
3102 levels later on. We have to do all the searches on
3103 the tree now because later on, after we've replaced
3104 the first level, the tree is in an inconsistent state
3105 and can not be searched. */
3106 for (b = father_block;
3107 buf_block_get_page_no(b) != root_page_no; ) {
3108 ut_a(n_blocks < BTR_MAX_LEVELS);
3110 offsets = btr_page_get_father_block(offsets, heap,
3111 index, b,
3112 mtr, &cursor);
3114 blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
3117 if (n_blocks && page_level == 0) {
3118 /* The father page should also be the only page on its level
3119 (and it is not the root). Lift the father page up first: a
3120 leaf page may be lifted directly only into the root page.
3121 Freeing a page chooses the file segment based on page_level
3122 (== 0 or != 0); if page_level changed from != 0 to == 0, the
3123 later freeing of the page would not find the allocation to
3124 be freed. */
3126 lift_father_up = TRUE;
3127 block = father_block;
3128 page = buf_block_get_frame(block);
3129 page_level = btr_page_get_level(page, mtr);
3131 ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
3132 ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
3133 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3135 father_block = blocks[0];
3136 father_page_zip = buf_block_get_page_zip(father_block);
3137 father_page = buf_block_get_frame(father_block);
3140 mem_heap_free(heap);
3143 btr_search_drop_page_hash_index(block);
3145 /* Make the father empty */
3146 btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
3147 page_level++;
3149 /* Copy the records to the father page one by one. */
3150 if (0
3151 #ifdef UNIV_ZIP_COPY
3152 || father_page_zip
3153 #endif /* UNIV_ZIP_COPY */
3154 || UNIV_UNLIKELY
3155 (!page_copy_rec_list_end(father_block, block,
3156 page_get_infimum_rec(page),
3157 index, mtr))) {
3158 const page_zip_des_t* page_zip
3159 = buf_block_get_page_zip(block);
3160 ut_a(father_page_zip);
3161 ut_a(page_zip);
3163 /* Copy the page byte for byte. */
3164 page_zip_copy_recs(father_page_zip, father_page,
3165 page_zip, page, index, mtr);
3167 /* Update the lock table and possible hash index. */
3169 lock_move_rec_list_end(father_block, block,
3170 page_get_infimum_rec(page));
3172 btr_search_move_or_delete_hash_entries(father_block, block,
3173 index);
3176 btr_blob_dbg_remove(page, index, "btr_lift_page_up");
3177 lock_update_copy_and_discard(father_block, block);
3179 /* Go upward to root page, decrementing levels by one. */
3180 for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
3181 page_t* page = buf_block_get_frame(blocks[i]);
3182 page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]);
3184 ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
3186 btr_page_set_level(page, page_zip, page_level, mtr);
3187 #ifdef UNIV_ZIP_DEBUG
3188 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3189 #endif /* UNIV_ZIP_DEBUG */
3192 /* Free the file page */
3193 btr_page_free(index, block, mtr);
3195 /* We play it safe and reset the free bits for the father */
3196 if (!dict_index_is_clust(index)) {
3197 ibuf_reset_free_bits(father_block);
3199 ut_ad(page_validate(father_page, index));
3200 ut_ad(btr_check_node_ptr(index, father_block, mtr));
3202 return(lift_father_up ? block_orig : father_block);
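/* Illustrative shape change performed above: a level consisting of
the single page P under father F disappears; F is emptied, receives
P's records, and every surviving ancestor has its level field
decremented by one before P's file page is freed. */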
3205 /*************************************************************//**
3206 Tries to merge the page first with its immediate left brother, if such a
3207 brother exists and the node pointers to the current page and to the brother
3208 reside on the same page. If the left brother does not satisfy these
3209 conditions, looks at the right brother. If the page is the only one on that
3210 level, lifts the records of the page to the father page, thus reducing the
3211 tree height. It is assumed that mtr holds an x-latch on the tree and on the
3212 page. If cursor is on the leaf level, mtr must also hold x-latches to the
3213 brothers, if they exist.
3214 @return TRUE on success */
3215 UNIV_INTERN
3216 ibool
3217 btr_compress(
3218 /*=========*/
3219 btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
3220 or lift; the page must not be empty:
3221 when deleting records, use btr_discard_page()
3222 if the page would become empty */
3223 ibool adjust, /*!< in: TRUE if should adjust the
3224 cursor position even if compression occurs */
3225 mtr_t* mtr) /*!< in/out: mini-transaction */
3227 dict_index_t* index;
3228 ulint space;
3229 ulint zip_size;
3230 ulint left_page_no;
3231 ulint right_page_no;
3232 buf_block_t* merge_block;
3233 page_t* merge_page;
3234 page_zip_des_t* merge_page_zip;
3235 ibool is_left;
3236 buf_block_t* block;
3237 page_t* page;
3238 btr_cur_t father_cursor;
3239 mem_heap_t* heap;
3240 ulint* offsets;
3241 ulint data_size;
3242 ulint n_recs;
3243 ulint nth_rec = 0; /* remove bogus warning */
3244 ulint max_ins_size;
3245 ulint max_ins_size_reorg;
3247 block = btr_cur_get_block(cursor);
3248 page = btr_cur_get_page(cursor);
3249 index = btr_cur_get_index(cursor);
3251 ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
3253 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
3254 MTR_MEMO_X_LOCK));
3255 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3256 space = dict_index_get_space(index);
3257 zip_size = dict_table_zip_size(index->table);
3259 left_page_no = btr_page_get_prev(page, mtr);
3260 right_page_no = btr_page_get_next(page, mtr);
3262 #if 0
3263 fprintf(stderr, "Merge left page %lu right %lu \n",
3264 left_page_no, right_page_no);
3265 #endif
3267 heap = mem_heap_create(100);
3268 offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
3269 &father_cursor);
3271 if (adjust) {
3272 nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
3273 ut_ad(nth_rec > 0);
3276 /* Decide the page to which we try to merge and which will inherit
3277 the locks */
3279 is_left = left_page_no != FIL_NULL;
3281 if (is_left) {
3283 merge_block = btr_block_get(space, zip_size, left_page_no,
3284 RW_X_LATCH, index, mtr);
3285 merge_page = buf_block_get_frame(merge_block);
3286 #ifdef UNIV_BTR_DEBUG
3287 ut_a(btr_page_get_next(merge_page, mtr)
3288 == buf_block_get_page_no(block));
3289 #endif /* UNIV_BTR_DEBUG */
3290 } else if (right_page_no != FIL_NULL) {
3292 merge_block = btr_block_get(space, zip_size, right_page_no,
3293 RW_X_LATCH, index, mtr);
3294 merge_page = buf_block_get_frame(merge_block);
3295 #ifdef UNIV_BTR_DEBUG
3296 ut_a(btr_page_get_prev(merge_page, mtr)
3297 == buf_block_get_page_no(block));
3298 #endif /* UNIV_BTR_DEBUG */
3299 } else {
3300 /* The page is the only one on the level, lift the records
3301 to the father */
3303 merge_block = btr_lift_page_up(index, block, mtr);
3304 goto func_exit;
3307 n_recs = page_get_n_recs(page);
3308 data_size = page_get_data_size(page);
3309 #ifdef UNIV_BTR_DEBUG
3310 ut_a(page_is_comp(merge_page) == page_is_comp(page));
3311 #endif /* UNIV_BTR_DEBUG */
3313 max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
3314 merge_page, n_recs);
3315 if (data_size > max_ins_size_reorg) {
3317 /* No space for merge */
3318 err_exit:
3319 /* We play it safe and reset the free bits. */
3320 if (zip_size
3321 && page_is_leaf(merge_page)
3322 && !dict_index_is_clust(index)) {
3323 ibuf_reset_free_bits(merge_block);
3326 mem_heap_free(heap);
3327 return(FALSE);
3330 ut_ad(page_validate(merge_page, index));
3332 max_ins_size = page_get_max_insert_size(merge_page, n_recs);
3334 if (UNIV_UNLIKELY(data_size > max_ins_size)) {
3336 /* We have to reorganize merge_page */
3338 if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
3339 index, mtr))) {
3341 goto err_exit;
3344 max_ins_size = page_get_max_insert_size(merge_page, n_recs);
3346 ut_ad(page_validate(merge_page, index));
3347 ut_ad(max_ins_size == max_ins_size_reorg);
3349 if (UNIV_UNLIKELY(data_size > max_ins_size)) {
3351 /* Add fault tolerance, though this should
3352 never happen */
3354 goto err_exit;
3358 merge_page_zip = buf_block_get_page_zip(merge_block);
3359 #ifdef UNIV_ZIP_DEBUG
3360 if (UNIV_LIKELY_NULL(merge_page_zip)) {
3361 const page_zip_des_t* page_zip
3362 = buf_block_get_page_zip(block);
3363 ut_a(page_zip);
3364 ut_a(page_zip_validate(merge_page_zip, merge_page, index));
3365 ut_a(page_zip_validate(page_zip, page, index));
3367 #endif /* UNIV_ZIP_DEBUG */
3369 /* Move records to the merge page */
3370 if (is_left) {
3371 rec_t* orig_pred = page_copy_rec_list_start(
3372 merge_block, block, page_get_supremum_rec(page),
3373 index, mtr);
3375 if (UNIV_UNLIKELY(!orig_pred)) {
3376 goto err_exit;
3379 btr_search_drop_page_hash_index(block);
3381 /* Remove the page from the level list */
3382 btr_level_list_remove(space, zip_size, page, index, mtr);
3384 btr_node_ptr_delete(index, block, mtr);
3385 lock_update_merge_left(merge_block, orig_pred, block);
3387 if (adjust) {
3388 nth_rec += page_rec_get_n_recs_before(orig_pred);
3390 } else {
3391 rec_t* orig_succ;
3392 #ifdef UNIV_BTR_DEBUG
3393 byte fil_page_prev[4];
3394 #endif /* UNIV_BTR_DEBUG */
3396 if (UNIV_LIKELY_NULL(merge_page_zip)) {
3397 /* The function page_zip_compress(), which will be
3398 invoked by page_copy_rec_list_end() below,
3399 requires that FIL_PAGE_PREV be FIL_NULL.
3400 Clear the field, but prepare to restore it. */
3401 #ifdef UNIV_BTR_DEBUG
3402 memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
3403 #endif /* UNIV_BTR_DEBUG */
3404 #if FIL_NULL != 0xffffffff
3405 # error "FIL_NULL != 0xffffffff"
3406 #endif
3407 memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
3410 orig_succ = page_copy_rec_list_end(merge_block, block,
3411 page_get_infimum_rec(page),
3412 cursor->index, mtr);
3414 if (UNIV_UNLIKELY(!orig_succ)) {
3415 ut_a(merge_page_zip);
3416 #ifdef UNIV_BTR_DEBUG
3417 /* FIL_PAGE_PREV was restored from merge_page_zip. */
3418 ut_a(!memcmp(fil_page_prev,
3419 merge_page + FIL_PAGE_PREV, 4));
3420 #endif /* UNIV_BTR_DEBUG */
3421 goto err_exit;
3424 btr_search_drop_page_hash_index(block);
3426 #ifdef UNIV_BTR_DEBUG
3427 if (UNIV_LIKELY_NULL(merge_page_zip)) {
3428 /* Restore FIL_PAGE_PREV in order to avoid an assertion
3429 failure in btr_level_list_remove(), which will set
3430 the field again to FIL_NULL. Even though this makes
3431 merge_page and merge_page_zip inconsistent for a
3432 split second, it is harmless, because the pages
3433 are X-latched. */
3434 memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
3436 #endif /* UNIV_BTR_DEBUG */
3438 /* Remove the page from the level list */
3439 btr_level_list_remove(space, zip_size, page, index, mtr);
3441 /* Replace the address of the old child node (= page) with the
3442 address of the merge page to the right */
3444 btr_node_ptr_set_child_page_no(
3445 btr_cur_get_rec(&father_cursor),
3446 btr_cur_get_page_zip(&father_cursor),
3447 offsets, right_page_no, mtr);
3448 btr_node_ptr_delete(index, merge_block, mtr);
3450 lock_update_merge_right(merge_block, orig_succ, block);
3453 btr_blob_dbg_remove(page, index, "btr_compress");
3455 if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
3456 /* Update the free bits of the B-tree page in the
3457 insert buffer bitmap. This has to be done in a
3458 separate mini-transaction that is committed before the
3459 main mini-transaction. We cannot update the insert
3460 buffer bitmap in this mini-transaction, because
3461 btr_compress() can be invoked recursively without
3462 committing the mini-transaction in between. Since
3463 insert buffer bitmap pages have a lower rank than
3464 B-tree pages, we must not access other pages in the
3465 same mini-transaction after accessing an insert buffer
3466 bitmap page. */
3468 /* The free bits in the insert buffer bitmap must
3469 never exceed the free space on a page. It is safe to
3470 decrement or reset the bits in the bitmap in a
3471 mini-transaction that is committed before the
3472 mini-transaction that affects the free space. */
3474 /* It is unsafe to increment the bits in a separately
3475 committed mini-transaction, because in crash recovery,
3476 the free bits could momentarily be set too high. */
3478 if (zip_size) {
3479 /* Because the free bits may be incremented
3480 and we cannot update the insert buffer bitmap
3481 in the same mini-transaction, the only safe
3482 thing we can do here is the pessimistic
3483 approach: reset the free bits. */
3484 ibuf_reset_free_bits(merge_block);
3485 } else {
3486 /* On uncompressed pages, the free bits will
3487 never increase here. Thus, it is safe to
3488 write the bits accurately in a separate
3489 mini-transaction. */
3490 ibuf_update_free_bits_if_full(merge_block,
3491 UNIV_PAGE_SIZE,
3492 ULINT_UNDEFINED);
3496 ut_ad(page_validate(merge_page, index));
3497 #ifdef UNIV_ZIP_DEBUG
3498 ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
3499 index));
3500 #endif /* UNIV_ZIP_DEBUG */
3502 /* Free the file page */
3503 btr_page_free(index, block, mtr);
3505 ut_ad(btr_check_node_ptr(index, merge_block, mtr));
3506 func_exit:
3507 mem_heap_free(heap);
3509 if (adjust) {
3510 ut_ad(nth_rec > 0);
3511 btr_cur_position(
3512 index,
3513 page_rec_get_nth(merge_block->frame, nth_rec),
3514 merge_block, cursor);
3517 return(TRUE);
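/* A hypothetical caller sketch (assumed, not code from this file):
after a delete leaves the page underfilled but not empty, try to
merge it into a brother; adjust == TRUE re-positions the cursor on
the same user record inside the merge page. */
#if 0
	if (page_get_n_recs(btr_cur_get_page(cursor)) > 0) {
		(void) btr_compress(cursor, TRUE, mtr);
	}
#endif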
3520 /*************************************************************//**
3521 Discards a page that is the only page on its level. This will empty
3522 the whole B-tree, leaving just an empty root page. This function
3523 should never be reached, because btr_compress(), which is invoked in
3524 delete operations, calls btr_lift_page_up() to flatten the B-tree. */
3525 static
3526 void
3527 btr_discard_only_page_on_level(
3528 /*===========================*/
3529 dict_index_t* index, /*!< in: index tree */
3530 buf_block_t* block, /*!< in: page which is the only one on its level */
3531 mtr_t* mtr) /*!< in: mtr */
3533 ulint page_level = 0;
3534 trx_id_t max_trx_id;
3536 /* Save the PAGE_MAX_TRX_ID from the leaf page. */
3537 max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
3539 while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
3540 btr_cur_t cursor;
3541 buf_block_t* father;
3542 const page_t* page = buf_block_get_frame(block);
3544 ut_a(page_get_n_recs(page) == 1);
3545 ut_a(page_level == btr_page_get_level(page, mtr));
3546 ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
3547 ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
3549 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3550 btr_search_drop_page_hash_index(block);
3552 btr_page_get_father(index, block, mtr, &cursor);
3553 father = btr_cur_get_block(&cursor);
3555 lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
3557 /* Free the file page */
3558 btr_page_free(index, block, mtr);
3560 block = father;
3561 page_level++;
3564 /* block is the root page, which must be empty, except
3565 for the node pointer to the (now discarded) block(s). */
3567 #ifdef UNIV_BTR_DEBUG
3568 if (!dict_index_is_ibuf(index)) {
3569 const page_t* root = buf_block_get_frame(block);
3570 const ulint space = dict_index_get_space(index);
3571 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
3572 + root, space));
3573 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
3574 + root, space));
3576 #endif /* UNIV_BTR_DEBUG */
3578 btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
3580 if (!dict_index_is_clust(index)) {
3581 /* We play it safe and reset the free bits for the root */
3582 ibuf_reset_free_bits(block);
3584 if (page_is_leaf(buf_block_get_frame(block))) {
3585 ut_a(!ut_dulint_is_zero(max_trx_id));
3586 page_set_max_trx_id(block,
3587 buf_block_get_page_zip(block),
3588 max_trx_id, mtr);
3593 /*************************************************************//**
3594 Discards a page from a B-tree. This is used to remove the last record from
3595 a B-tree page: the whole page must be removed at the same time. This cannot
3596 be used for the root page, which is allowed to be empty. */
3597 UNIV_INTERN
3598 void
3599 btr_discard_page(
3600 /*=============*/
3601 btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
3602 the root page */
3603 mtr_t* mtr) /*!< in: mtr */
3604 {
3605 dict_index_t* index;
3606 ulint space;
3607 ulint zip_size;
3608 ulint left_page_no;
3609 ulint right_page_no;
3610 buf_block_t* merge_block;
3611 page_t* merge_page;
3612 buf_block_t* block;
3613 page_t* page;
3614 rec_t* node_ptr;
3616 block = btr_cur_get_block(cursor);
3617 index = btr_cur_get_index(cursor);
3619 ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
3620 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
3621 MTR_MEMO_X_LOCK));
3622 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3623 space = dict_index_get_space(index);
3624 zip_size = dict_table_zip_size(index->table);
3626 /* Decide the page which will inherit the locks */
3628 left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
3629 right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
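/* Prefer the left sibling as the page that inherits the locks;
fall back to the right sibling. If the page has no siblings, it
is the only page on its level and is discarded separately. */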
3631 if (left_page_no != FIL_NULL) {
3632 merge_block = btr_block_get(space, zip_size, left_page_no,
3633 RW_X_LATCH, index, mtr);
3634 merge_page = buf_block_get_frame(merge_block);
3635 #ifdef UNIV_BTR_DEBUG
3636 ut_a(btr_page_get_next(merge_page, mtr)
3637 == buf_block_get_page_no(block));
3638 #endif /* UNIV_BTR_DEBUG */
3639 } else if (right_page_no != FIL_NULL) {
3640 merge_block = btr_block_get(space, zip_size, right_page_no,
3641 RW_X_LATCH, index, mtr);
3642 merge_page = buf_block_get_frame(merge_block);
3643 #ifdef UNIV_BTR_DEBUG
3644 ut_a(btr_page_get_prev(merge_page, mtr)
3645 == buf_block_get_page_no(block));
3646 #endif /* UNIV_BTR_DEBUG */
3647 } else {
3648 btr_discard_only_page_on_level(index, block, mtr);
3650 return;
3651 }
3653 page = buf_block_get_frame(block);
3654 ut_a(page_is_comp(merge_page) == page_is_comp(page));
3655 btr_search_drop_page_hash_index(block);
3657 if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
3659 /* We have to mark the leftmost node pointer on the right
3660 side page as the predefined minimum record */
3661 node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
3663 ut_ad(page_rec_is_user_rec(node_ptr));
3665 /* This will make page_zip_validate() fail on merge_page
3666 until btr_level_list_remove() completes. This is harmless,
3667 because everything will take place within a single
3668 mini-transaction and because writing to the redo log
3669 is an atomic operation (performed by mtr_commit()). */
3670 btr_set_min_rec_mark(node_ptr, mtr);
3671 }
3673 btr_node_ptr_delete(index, block, mtr);
3675 /* Remove the page from the level list */
3676 btr_level_list_remove(space, zip_size, page, index, mtr);
3677 #ifdef UNIV_ZIP_DEBUG
3678 {
3679 page_zip_des_t* merge_page_zip
3680 = buf_block_get_page_zip(merge_block);
3681 ut_a(!merge_page_zip
3682 || page_zip_validate(merge_page_zip, merge_page, index));
3683 }
3684 #endif /* UNIV_ZIP_DEBUG */
3686 if (left_page_no != FIL_NULL) {
3687 lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
3688 block);
3689 } else {
3690 lock_update_discard(merge_block,
3691 lock_get_min_heap_no(merge_block),
3692 block);
3693 }
3695 btr_blob_dbg_remove(page, index, "btr_discard_page");
3697 /* Free the file page */
3698 btr_page_free(index, block, mtr);
3700 ut_ad(btr_check_node_ptr(index, merge_block, mtr));
3701 }
3703 #ifdef UNIV_BTR_PRINT
3704 /*************************************************************//**
3705 Prints size info of a B-tree. */
3706 UNIV_INTERN
3707 void
3708 btr_print_size(
3709 /*===========*/
3710 dict_index_t* index) /*!< in: index tree */
3711 {
3712 page_t* root;
3713 fseg_header_t* seg;
3714 mtr_t mtr;
3716 if (dict_index_is_ibuf(index)) {
3717 fputs("Sorry, cannot print info of an ibuf tree:"
3718 " use ibuf functions\n", stderr);
3720 return;
3721 }
3723 mtr_start(&mtr);
3725 root = btr_root_get(index, &mtr);
3727 seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
3729 fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
3730 fseg_print(seg, &mtr);
3732 if (!(index->type & DICT_UNIVERSAL)) {
3734 seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
3736 fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
3737 fseg_print(seg, &mtr);
3738 }
3740 mtr_commit(&mtr);
3741 }
3743 /************************************************************//**
3744 Prints recursively index tree pages. */
3745 static
3746 void
3747 btr_print_recursive(
3748 /*================*/
3749 dict_index_t* index, /*!< in: index tree */
3750 buf_block_t* block, /*!< in: index page */
3751 ulint width, /*!< in: print this many entries from start
3752 and end */
3753 mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
3754 ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */
3755 mtr_t* mtr) /*!< in: mtr */
3756 {
3757 const page_t* page = buf_block_get_frame(block);
3758 page_cur_t cursor;
3759 ulint n_recs;
3760 ulint i = 0;
3761 mtr_t mtr2;
3763 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3764 fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
3765 (ulong) btr_page_get_level(page, mtr),
3766 (ulong) buf_block_get_page_no(block));
3768 page_print(block, index, width, width);
3770 n_recs = page_get_n_recs(page);
3772 page_cur_set_before_first(block, &cursor);
3773 page_cur_move_to_next(&cursor);
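/* The cursor is now on the first user record, or on the
supremum if the page is empty. */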
3775 while (!page_cur_is_after_last(&cursor)) {
3777 if (page_is_leaf(page)) {
3779 /* If this is the leaf level, do nothing */
3781 } else if ((i <= width) || (i >= n_recs - width)) {
3783 const rec_t* node_ptr;
3785 mtr_start(&mtr2);
3787 node_ptr = page_cur_get_rec(&cursor);
3789 *offsets = rec_get_offsets(node_ptr, index, *offsets,
3790 ULINT_UNDEFINED, heap);
3791 btr_print_recursive(index,
3792 btr_node_ptr_get_child(node_ptr,
3793 index,
3794 *offsets,
3795 &mtr2),
3796 width, heap, offsets, &mtr2);
3797 mtr_commit(&mtr2);
3798 }
3800 page_cur_move_to_next(&cursor);
3801 i++;
3802 }
3803 }
3805 /**************************************************************//**
3806 Prints directories and other info of all nodes in the tree. */
3807 UNIV_INTERN
3808 void
3809 btr_print_index(
3810 /*============*/
3811 dict_index_t* index, /*!< in: index */
3812 ulint width) /*!< in: print this many entries from start
3813 and end */
3814 {
3815 mtr_t mtr;
3816 buf_block_t* root;
3817 mem_heap_t* heap = NULL;
3818 ulint offsets_[REC_OFFS_NORMAL_SIZE];
3819 ulint* offsets = offsets_;
3820 rec_offs_init(offsets_);
3822 fputs("--------------------------\n"
3823 "INDEX TREE PRINT\n", stderr);
3825 mtr_start(&mtr);
3827 root = btr_root_block_get(index, &mtr);
3829 btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
3830 if (UNIV_LIKELY_NULL(heap)) {
3831 mem_heap_free(heap);
3832 }
3834 mtr_commit(&mtr);
3836 btr_validate_index(index, NULL);
3837 }
3838 #endif /* UNIV_BTR_PRINT */
3840 #ifdef UNIV_DEBUG
3841 /************************************************************//**
3842 Checks that the node pointer to a page is appropriate.
3843 @return TRUE */
3844 UNIV_INTERN
3845 ibool
3846 btr_check_node_ptr(
3847 /*===============*/
3848 dict_index_t* index, /*!< in: index tree */
3849 buf_block_t* block, /*!< in: index page */
3850 mtr_t* mtr) /*!< in: mtr */
3851 {
3852 mem_heap_t* heap;
3853 dtuple_t* tuple;
3854 ulint* offsets;
3855 btr_cur_t cursor;
3856 page_t* page = buf_block_get_frame(block);
3858 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3859 if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
3861 return(TRUE);
3862 }
3864 heap = mem_heap_create(256);
3865 offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
3866 &cursor);
3868 if (page_is_leaf(page)) {
3870 goto func_exit;
3871 }
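/* Check that the node pointer in the father page matches a node
pointer built from the first user record on this page. */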
3873 tuple = dict_index_build_node_ptr(
3874 index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
3875 btr_page_get_level(page, mtr));
3877 ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
3878 func_exit:
3879 mem_heap_free(heap);
3881 return(TRUE);
3882 }
3883 #endif /* UNIV_DEBUG */
3885 /************************************************************//**
3886 Display identification information for a record. */
3887 static
3888 void
3889 btr_index_rec_validate_report(
3890 /*==========================*/
3891 const page_t* page, /*!< in: index page */
3892 const rec_t* rec, /*!< in: index record */
3893 const dict_index_t* index) /*!< in: index */
3894 {
3895 fputs("InnoDB: Record in ", stderr);
3896 dict_index_name_print(stderr, NULL, index);
3897 fprintf(stderr, ", page %lu, at offset %lu\n",
3898 page_get_page_no(page), (ulint) page_offset(rec));
3899 }
3901 /************************************************************//**
3902 Checks the size and number of fields in a record based on the definition of
3903 the index.
3904 @return TRUE if ok */
3905 UNIV_INTERN
3906 ibool
3907 btr_index_rec_validate(
3908 /*===================*/
3909 const rec_t* rec, /*!< in: index record */
3910 const dict_index_t* index, /*!< in: index */
3911 ibool dump_on_error) /*!< in: TRUE if the function
3912 should print hex dump of record
3913 and page on error */
3914 {
3915 ulint len;
3916 ulint n;
3917 ulint i;
3918 const page_t* page;
3919 mem_heap_t* heap = NULL;
3920 ulint offsets_[REC_OFFS_NORMAL_SIZE];
3921 ulint* offsets = offsets_;
3922 rec_offs_init(offsets_);
3924 page = page_align(rec);
3926 if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
3927 /* The insert buffer index tree can contain records from any
3928 other index: we cannot check the number of fields or
3929 their length */
3931 return(TRUE);
3932 }
3934 if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
3935 != dict_table_is_comp(index->table))) {
3936 btr_index_rec_validate_report(page, rec, index);
3937 fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
3938 (ulong) !!page_is_comp(page),
3939 (ulong) dict_table_is_comp(index->table));
3941 return(FALSE);
3942 }
3944 n = dict_index_get_n_fields(index);
3946 if (!page_is_comp(page)
3947 && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
3948 btr_index_rec_validate_report(page, rec, index);
3949 fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
3950 (ulong) rec_get_n_fields_old(rec), (ulong) n);
3952 if (dump_on_error) {
3953 buf_page_print(page, 0);
3955 fputs("InnoDB: corrupt record ", stderr);
3956 rec_print_old(stderr, rec);
3957 putc('\n', stderr);
3958 }
3959 return(FALSE);
3960 }
3962 offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3964 for (i = 0; i < n; i++) {
3965 ulint fixed_size = dict_col_get_fixed_size(
3966 dict_index_get_nth_col(index, i), page_is_comp(page));
3968 rec_get_nth_field_offs(offsets, i, &len);
3970 /* Note that if fixed_size != 0, it equals the
3971 length of a fixed-size column in the clustered index.
3972 A prefix index of the column has a fixed, but different,
3973 length. When fixed_size == 0, prefix_len is the maximum
3974 length of the prefix index column. */
3976 if ((dict_index_get_nth_field(index, i)->prefix_len == 0
3977 && len != UNIV_SQL_NULL && fixed_size
3978 && len != fixed_size)
3979 || (dict_index_get_nth_field(index, i)->prefix_len > 0
3980 && len != UNIV_SQL_NULL
3981 && len
3982 > dict_index_get_nth_field(index, i)->prefix_len)) {
3984 btr_index_rec_validate_report(page, rec, index);
3985 fprintf(stderr,
3986 "InnoDB: field %lu len is %lu,"
3987 " should be %lu\n",
3988 (ulong) i, (ulong) len, (ulong) fixed_size);
3990 if (dump_on_error) {
3991 buf_page_print(page, 0);
3993 fputs("InnoDB: corrupt record ", stderr);
3994 rec_print_new(stderr, rec, offsets);
3995 putc('\n', stderr);
3996 }
3997 if (UNIV_LIKELY_NULL(heap)) {
3998 mem_heap_free(heap);
3999 }
4000 return(FALSE);
4001 }
4002 }
4004 if (UNIV_LIKELY_NULL(heap)) {
4005 mem_heap_free(heap);
4006 }
4007 return(TRUE);
4008 }
4010 /************************************************************//**
4011 Checks the size and number of fields in records based on the definition of
4012 the index.
4013 @return TRUE if ok */
4014 static
4015 ibool
4016 btr_index_page_validate(
4017 /*====================*/
4018 buf_block_t* block, /*!< in: index page */
4019 dict_index_t* index) /*!< in: index */
4020 {
4021 page_cur_t cur;
4022 ibool ret = TRUE;
4023 #ifndef DBUG_OFF
4024 ulint nth = 1;
4025 #endif /* !DBUG_OFF */
4027 page_cur_set_before_first(block, &cur);
4029 /* Directory slot 0 should only contain the infimum record. */
4030 DBUG_EXECUTE_IF("check_table_rec_next",
4031 ut_a(page_rec_get_nth_const(
4032 page_cur_get_page(&cur), 0)
4033 == cur.rec);
4034 ut_a(page_dir_slot_get_n_owned(
4035 page_dir_get_nth_slot(
4036 page_cur_get_page(&cur), 0))
4037 == 1););
4039 page_cur_move_to_next(&cur);
4041 for (;;) {
4042 if (page_cur_is_after_last(&cur)) {
4044 break;
4045 }
4047 if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
4049 return(FALSE);
4050 }
4052 /* Verify that page_rec_get_nth_const() is correctly
4053 retrieving each record. */
4054 DBUG_EXECUTE_IF("check_table_rec_next",
4055 ut_a(cur.rec == page_rec_get_nth_const(
4056 page_cur_get_page(&cur),
4057 page_rec_get_n_recs_before(
4058 cur.rec)));
4059 ut_a(nth++ == page_rec_get_n_recs_before(
4060 cur.rec)););
4062 page_cur_move_to_next(&cur);
4063 }
4065 return(ret);
4066 }
4068 /************************************************************//**
4069 Report an error on one page of an index tree. */
4070 static
4071 void
4072 btr_validate_report1(
4073 /*=================*/
4074 dict_index_t* index, /*!< in: index */
4075 ulint level, /*!< in: B-tree level */
4076 const buf_block_t* block) /*!< in: index page */
4077 {
4078 fprintf(stderr, "InnoDB: Error in page %lu of ",
4079 buf_block_get_page_no(block));
4080 dict_index_name_print(stderr, NULL, index);
4081 if (level) {
4082 fprintf(stderr, ", index tree level %lu", level);
4083 }
4084 putc('\n', stderr);
4085 }
4087 /************************************************************//**
4088 Report an error on two pages of an index tree. */
4089 static
4090 void
4091 btr_validate_report2(
4092 /*=================*/
4093 const dict_index_t* index, /*!< in: index */
4094 ulint level, /*!< in: B-tree level */
4095 const buf_block_t* block1, /*!< in: first index page */
4096 const buf_block_t* block2) /*!< in: second index page */
4097 {
4098 fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
4099 buf_block_get_page_no(block1),
4100 buf_block_get_page_no(block2));
4101 dict_index_name_print(stderr, NULL, index);
4102 if (level) {
4103 fprintf(stderr, ", index tree level %lu", level);
4104 }
4105 putc('\n', stderr);
4106 }
4108 /************************************************************//**
4109 Validates index tree level.
4110 @return TRUE if ok */
4111 static
4112 ibool
4113 btr_validate_level(
4114 /*===============*/
4115 dict_index_t* index, /*!< in: index tree */
4116 trx_t* trx, /*!< in: transaction or NULL */
4117 ulint level) /*!< in: level number */
4118 {
4119 ulint space;
4120 ulint zip_size;
4121 buf_block_t* block;
4122 page_t* page;
4123 buf_block_t* right_block = 0; /* remove warning */
4124 page_t* right_page = 0; /* remove warning */
4125 page_t* father_page;
4126 btr_cur_t node_cur;
4127 btr_cur_t right_node_cur;
4128 rec_t* rec;
4129 ulint right_page_no;
4130 ulint left_page_no;
4131 page_cur_t cursor;
4132 dtuple_t* node_ptr_tuple;
4133 ibool ret = TRUE;
4134 mtr_t mtr;
4135 mem_heap_t* heap = mem_heap_create(256);
4136 ulint* offsets = NULL;
4137 ulint* offsets2= NULL;
4138 #ifdef UNIV_ZIP_DEBUG
4139 page_zip_des_t* page_zip;
4140 #endif /* UNIV_ZIP_DEBUG */
4142 mtr_start(&mtr);
4144 mtr_x_lock(dict_index_get_lock(index), &mtr);
4146 block = btr_root_block_get(index, &mtr);
4147 page = buf_block_get_frame(block);
4149 space = dict_index_get_space(index);
4150 zip_size = dict_table_zip_size(index->table);
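/* Descend from the root along the leftmost node pointers until
the requested level is reached. */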
4152 while (level != btr_page_get_level(page, &mtr)) {
4153 const rec_t* node_ptr;
4155 ut_a(space == buf_block_get_space(block));
4156 ut_a(space == page_get_space_id(page));
4157 #ifdef UNIV_ZIP_DEBUG
4158 page_zip = buf_block_get_page_zip(block);
4159 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4160 #endif /* UNIV_ZIP_DEBUG */
4161 ut_a(!page_is_leaf(page));
4163 page_cur_set_before_first(block, &cursor);
4164 page_cur_move_to_next(&cursor);
4166 node_ptr = page_cur_get_rec(&cursor);
4167 offsets = rec_get_offsets(node_ptr, index, offsets,
4168 ULINT_UNDEFINED, &heap);
4169 block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
4170 page = buf_block_get_frame(block);
4171 }
4173 /* Now we are on the desired level. Loop through the pages on that
4174 level. */
4175 loop:
4176 if (trx_is_interrupted(trx)) {
4177 mtr_commit(&mtr);
4178 mem_heap_free(heap);
4179 return(ret);
4180 }
4181 mem_heap_empty(heap);
4182 offsets = offsets2 = NULL;
4183 mtr_x_lock(dict_index_get_lock(index), &mtr);
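/* Latch the index within the current mini-transaction; after the
mtr_commit() at the bottom of the loop it must be taken again. */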
4185 #ifdef UNIV_ZIP_DEBUG
4186 page_zip = buf_block_get_page_zip(block);
4187 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4188 #endif /* UNIV_ZIP_DEBUG */
4190 /* Check ordering etc. of records */
4192 if (!page_validate(page, index)) {
4193 btr_validate_report1(index, level, block);
4195 ret = FALSE;
4196 } else if (level == 0) {
4197 /* We are on level 0. Check that the records have the right
4198 number of fields, and field lengths are right. */
4200 if (!btr_index_page_validate(block, index)) {
4202 ret = FALSE;
4203 }
4204 }
4206 ut_a(btr_page_get_level(page, &mtr) == level);
4208 right_page_no = btr_page_get_next(page, &mtr);
4209 left_page_no = btr_page_get_prev(page, &mtr);
4211 ut_a(page_get_n_recs(page) > 0 || (level == 0
4212 && page_get_page_no(page)
4213 == dict_index_get_page(index)));
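/* If the page has a right sibling, check the consistency of the
sibling links and the record ordering across the page boundary. */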
4215 if (right_page_no != FIL_NULL) {
4216 const rec_t* right_rec;
4217 right_block = btr_block_get(space, zip_size, right_page_no,
4218 RW_X_LATCH, index, &mtr);
4219 right_page = buf_block_get_frame(right_block);
4220 if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
4221 != page_get_page_no(page))) {
4222 btr_validate_report2(index, level, block, right_block);
4223 fputs("InnoDB: broken FIL_PAGE_NEXT"
4224 " or FIL_PAGE_PREV links\n", stderr);
4225 buf_page_print(page, 0);
4226 buf_page_print(right_page, 0);
4228 ret = FALSE;
4229 }
4231 if (UNIV_UNLIKELY(page_is_comp(right_page)
4232 != page_is_comp(page))) {
4233 btr_validate_report2(index, level, block, right_block);
4234 fputs("InnoDB: 'compact' flag mismatch\n", stderr);
4235 buf_page_print(page, 0);
4236 buf_page_print(right_page, 0);
4238 ret = FALSE;
4240 goto node_ptr_fails;
4241 }
4243 rec = page_rec_get_prev(page_get_supremum_rec(page));
4244 right_rec = page_rec_get_next(page_get_infimum_rec(
4245 right_page));
4246 offsets = rec_get_offsets(rec, index,
4247 offsets, ULINT_UNDEFINED, &heap);
4248 offsets2 = rec_get_offsets(right_rec, index,
4249 offsets2, ULINT_UNDEFINED, &heap);
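/* The last user record on this page must sort strictly before
the first user record on the right sibling. */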
4250 if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
4251 offsets, offsets2,
4252 index) >= 0)) {
4254 btr_validate_report2(index, level, block, right_block);
4256 fputs("InnoDB: records in wrong order"
4257 " on adjacent pages\n", stderr);
4259 buf_page_print(page, 0);
4260 buf_page_print(right_page, 0);
4262 fputs("InnoDB: record ", stderr);
4263 rec = page_rec_get_prev(page_get_supremum_rec(page));
4264 rec_print(stderr, rec, index);
4265 putc('\n', stderr);
4266 fputs("InnoDB: record ", stderr);
4267 rec = page_rec_get_next(
4268 page_get_infimum_rec(right_page));
4269 rec_print(stderr, rec, index);
4270 putc('\n', stderr);
4272 ret = FALSE;
4273 }
4274 }
4276 if (level > 0 && left_page_no == FIL_NULL) {
4277 ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
4278 page_rec_get_next(page_get_infimum_rec(page)),
4279 page_is_comp(page)));
4280 }
4282 if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
4284 /* Check father node pointers */
4286 rec_t* node_ptr;
4288 offsets = btr_page_get_father_block(offsets, heap, index,
4289 block, &mtr, &node_cur);
4290 father_page = btr_cur_get_page(&node_cur);
4291 node_ptr = btr_cur_get_rec(&node_cur);
4293 btr_cur_position(
4294 index, page_rec_get_prev(page_get_supremum_rec(page)),
4295 block, &node_cur);
4296 offsets = btr_page_get_father_node_ptr(offsets, heap,
4297 &node_cur, &mtr);
4299 if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
4300 || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
4301 offsets)
4302 != buf_block_get_page_no(block))) {
4304 btr_validate_report1(index, level, block);
4306 fputs("InnoDB: node pointer to the page is wrong\n",
4307 stderr);
4309 buf_page_print(father_page, 0);
4310 buf_page_print(page, 0);
4312 fputs("InnoDB: node ptr ", stderr);
4313 rec_print(stderr, node_ptr, index);
4315 rec = btr_cur_get_rec(&node_cur);
4316 fprintf(stderr, "\n"
4317 "InnoDB: node ptr child page n:o %lu\n",
4318 (ulong) btr_node_ptr_get_child_page_no(
4319 rec, offsets));
4321 fputs("InnoDB: record on page ", stderr);
4322 rec_print_new(stderr, rec, offsets);
4323 putc('\n', stderr);
4324 ret = FALSE;
4326 goto node_ptr_fails;
4327 }
4329 if (!page_is_leaf(page)) {
4330 node_ptr_tuple = dict_index_build_node_ptr(
4331 index,
4332 page_rec_get_next(page_get_infimum_rec(page)),
4333 0, heap, btr_page_get_level(page, &mtr));
4335 if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
4336 offsets)) {
4337 const rec_t* first_rec = page_rec_get_next(
4338 page_get_infimum_rec(page));
4340 btr_validate_report1(index, level, block);
4342 buf_page_print(father_page, 0);
4343 buf_page_print(page, 0);
4345 fputs("InnoDB: Error: node ptrs differ"
4346 " on levels > 0\n"
4347 "InnoDB: node ptr ", stderr);
4348 rec_print_new(stderr, node_ptr, offsets);
4349 fputs("InnoDB: first rec ", stderr);
4350 rec_print(stderr, first_rec, index);
4351 putc('\n', stderr);
4352 ret = FALSE;
4354 goto node_ptr_fails;
4355 }
4356 }
4358 if (left_page_no == FIL_NULL) {
4359 ut_a(node_ptr == page_rec_get_next(
4360 page_get_infimum_rec(father_page)));
4361 ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
4362 }
4364 if (right_page_no == FIL_NULL) {
4365 ut_a(node_ptr == page_rec_get_prev(
4366 page_get_supremum_rec(father_page)));
4367 ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
4368 } else {
4369 const rec_t* right_node_ptr
4370 = page_rec_get_next(node_ptr);
4372 offsets = btr_page_get_father_block(
4373 offsets, heap, index, right_block,
4374 &mtr, &right_node_cur);
4375 if (right_node_ptr
4376 != page_get_supremum_rec(father_page)) {
4378 if (btr_cur_get_rec(&right_node_cur)
4379 != right_node_ptr) {
4380 ret = FALSE;
4381 fputs("InnoDB: node pointer to"
4382 " the right page is wrong\n",
4383 stderr);
4385 btr_validate_report1(index, level,
4386 block);
4388 buf_page_print(father_page, 0);
4389 buf_page_print(page, 0);
4390 buf_page_print(right_page, 0);
4391 }
4392 } else {
4393 page_t* right_father_page
4394 = btr_cur_get_page(&right_node_cur);
4396 if (btr_cur_get_rec(&right_node_cur)
4397 != page_rec_get_next(
4398 page_get_infimum_rec(
4399 right_father_page))) {
4400 ret = FALSE;
4401 fputs("InnoDB: node pointer 2 to"
4402 " the right page is wrong\n",
4403 stderr);
4405 btr_validate_report1(index, level,
4406 block);
4408 buf_page_print(father_page, 0);
4409 buf_page_print(right_father_page, 0);
4410 buf_page_print(page, 0);
4411 buf_page_print(right_page, 0);
4412 }
4414 if (page_get_page_no(right_father_page)
4415 != btr_page_get_next(father_page, &mtr)) {
4417 ret = FALSE;
4418 fputs("InnoDB: node pointer 3 to"
4419 " the right page is wrong\n",
4420 stderr);
4422 btr_validate_report1(index, level,
4423 block);
4425 buf_page_print(father_page, 0);
4426 buf_page_print(right_father_page, 0);
4427 buf_page_print(page, 0);
4428 buf_page_print(right_page, 0);
4429 }
4430 }
4431 }
4432 }
4434 node_ptr_fails:
4435 /* Commit the mini-transaction to release the latch on 'page'.
4436 Re-acquire the latch on right_page, which will become 'page'
4437 on the next loop. The page has already been checked. */
4438 mtr_commit(&mtr);
4440 if (right_page_no != FIL_NULL) {
4441 mtr_start(&mtr);
4443 block = btr_block_get(space, zip_size, right_page_no,
4444 RW_X_LATCH, index, &mtr);
4445 page = buf_block_get_frame(block);
4447 goto loop;
4448 }
4450 mem_heap_free(heap);
4451 return(ret);
4452 }
4454 /**************************************************************//**
4455 Checks the consistency of an index tree.
4456 @return TRUE if ok */
4457 UNIV_INTERN
4458 ibool
4459 btr_validate_index(
4460 /*===============*/
4461 dict_index_t* index, /*!< in: index */
4462 trx_t* trx) /*!< in: transaction or NULL */
4463 {
4464 mtr_t mtr;
4465 page_t* root;
4466 ulint i;
4467 ulint n;
4469 mtr_start(&mtr);
4470 mtr_x_lock(dict_index_get_lock(index), &mtr);
4472 root = btr_root_get(index, &mtr);
4473 n = btr_page_get_level(root, &mtr);
4475 for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
4476 if (!btr_validate_level(index, trx, n - i)) {
4478 mtr_commit(&mtr);
4480 return(FALSE);
4481 }
4482 }
4484 mtr_commit(&mtr);
4486 return(TRUE);
4487 }
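/* Note: this function is invoked, for example, by
row_check_table_for_mysql() when the SQL layer runs CHECK TABLE. */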
4488 #endif /* !UNIV_HOTBACKUP */