mySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / innobase / buf / buf0buf.c
blob5463098a65474b69dbeb740db361ce2a99bf6155
1 /* Innobase relational database engine; Copyright (C) 2001 Innobase Oy
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License 2
5 as published by the Free Software Foundation in June 1991.
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License 2
13 along with this program (in file COPYING); if not, write to the Free
14 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
15 /******************************************************
16 The database buffer buf_pool
18 (c) 1995 Innobase Oy
20 Created 11/5/1995 Heikki Tuuri
21 *******************************************************/
23 #include "buf0buf.h"
25 #ifdef UNIV_NONINL
26 #include "buf0buf.ic"
27 #endif
29 #include "mem0mem.h"
30 #include "btr0btr.h"
31 #include "fil0fil.h"
32 #include "lock0lock.h"
33 #include "btr0sea.h"
34 #include "ibuf0ibuf.h"
35 #include "dict0dict.h"
36 #include "log0recv.h"
37 #include "log0log.h"
38 #include "trx0undo.h"
39 #include "srv0srv.h"
42 IMPLEMENTATION OF THE BUFFER POOL
43 =================================
45 Performance improvement:
46 ------------------------
47 Thread scheduling in NT may be so slow that the OS wait mechanism should
48 not be used even in waiting for disk reads to complete.
49 Rather, we should put waiting query threads to the queue of
50 waiting jobs, and let the OS thread do something useful while the i/o
51 is processed. In this way we could remove most OS thread switches in
52 an i/o-intensive benchmark like TPC-C.
54 A possibility is to put a user space thread library between the database
55 and NT. User space thread libraries might be very fast.
57 SQL Server 7.0 can be configured to use 'fibers' which are lightweight
58 threads in NT. These should be studied.
60 Buffer frames and blocks
61 ------------------------
62 Following the terminology of Gray and Reuter, we call the memory
63 blocks where file pages are loaded buffer frames. For each buffer
64 frame there is a control block, or shortly, a block, in the buffer
65 control array. The control info which does not need to be stored
66 in the file along with the file page, resides in the control block.
68 Buffer pool struct
69 ------------------
70 The buffer buf_pool contains a single mutex which protects all the
71 control data structures of the buf_pool. The content of a buffer frame is
72 protected by a separate read-write lock in its control block, though.
73 These locks can be locked and unlocked without owning the buf_pool mutex.
74 The OS events in the buf_pool struct can be waited for without owning the
75 buf_pool mutex.
77 The buf_pool mutex is a hot-spot in main memory, causing a lot of
78 memory bus traffic on multiprocessor systems when processors
79 alternately access the mutex. On our Pentium, the mutex is accessed
80 maybe every 10 microseconds. We gave up the solution to have mutexes
81 for each control block, for instance, because it seemed to be
82 complicated.
84 A solution to reduce mutex contention of the buf_pool mutex is to
85 create a separate mutex for the page hash table. On Pentium,
86 accessing the hash table takes 2 microseconds, about half
87 of the total buf_pool mutex hold time.
89 Control blocks
90 --------------
92 The control block contains, for instance, the bufferfix count
93 which is incremented when a thread wants a file page to be fixed
94 in a buffer frame. The bufferfix operation does not lock the
95 contents of the frame, however. For this purpose, the control
96 block contains a read-write lock.
98 The buffer frames have to be aligned so that the start memory
99 address of a frame is divisible by the universal page size, which
100 is a power of two.
102 We intend to make the buffer buf_pool size on-line reconfigurable,
103 that is, the buf_pool size can be changed without closing the database.
104 Then the database administrator may adjust it to be bigger
105 at night, for example. The control block array must
106 contain enough control blocks for the maximum buffer buf_pool size
107 which is used in the particular database.
108 If the buf_pool size is cut, we exploit the virtual memory mechanism of
109 the OS, and just refrain from using frames at high addresses. Then the OS
110 can swap them to disk.
112 The control blocks containing file pages are put to a hash table
113 according to the file address of the page.
114 We could speed up the access to an individual page by using
115 "pointer swizzling": we could replace the page references on
116 non-leaf index pages by direct pointers to the page, if it exists
117 in the buf_pool. We could make a separate hash table where we could
118 chain all the page references in non-leaf pages residing in the buf_pool,
119 using the page reference as the hash key,
120 and at the time of reading of a page update the pointers accordingly.
121 Drawbacks of this solution are added complexity and,
122 possibly, extra space required on non-leaf pages for memory pointers.
123 A simpler solution is just to speed up the hash table mechanism
124 in the database, using tables whose size is a power of 2.
126 Lists of blocks
127 ---------------
129 There are several lists of control blocks. The free list contains
130 blocks which are currently not used.
132 The LRU-list contains all the blocks holding a file page
133 except those for which the bufferfix count is non-zero.
134 The pages are in the LRU list roughly in the order of the last
135 access to the page, so that the oldest pages are at the end of the
136 list. We also keep a pointer to near the end of the LRU list,
137 which we can use when we want to artificially age a page in the
138 buf_pool. This is used if we know that some page is not needed
139 again for some time: we insert the block right after the pointer,
140 causing it to be replaced sooner than would normally be the case.
141 Currently this aging mechanism is used for read-ahead mechanism
142 of pages, and it can also be used when there is a scan of a full
143 table which cannot fit in the memory. Putting the pages near the
144 end of the LRU list, we make sure that most of the buf_pool stays in the
145 main memory, undisturbed.
147 The chain of modified blocks contains the blocks
148 holding file pages that have been modified in the memory
149 but not written to disk yet. The block with the oldest modification
150 which has not yet been written to disk is at the end of the chain.
152 Loading a file page
153 -------------------
155 First, a victim block for replacement has to be found in the
156 buf_pool. It is taken from the free list or searched for from the
157 end of the LRU-list. An exclusive lock is reserved for the frame,
158 the io_fix field is set in the block fixing the block in buf_pool,
159 and the io-operation for loading the page is queued. The io-handler thread
160 releases the X-lock on the frame and resets the io_fix field
161 when the io operation completes.
163 A thread may request the above operation using the function
164 buf_page_get(). It may then continue to request a lock on the frame.
165 The lock is granted when the io-handler releases the x-lock.
167 Read-ahead
168 ----------
170 The read-ahead mechanism is intended to be intelligent and
171 isolated from the semantically higher levels of the database
172 index management. From the higher level we only need the
173 information if a file page has a natural successor or
174 predecessor page. On the leaf level of a B-tree index,
175 these are the next and previous pages in the natural
176 order of the pages.
178 Let us first explain the read-ahead mechanism when the leaves
179 of a B-tree are scanned in an ascending or descending order.
180 When a page that has been read is referenced in the buf_pool for the first time,
181 the buffer manager checks if it is at the border of a so-called
182 linear read-ahead area. The tablespace is divided into these
183 areas of size 64 blocks, for example. So if the page is at the
184 border of such an area, the read-ahead mechanism checks if
185 all the other blocks in the area have been accessed in an
186 ascending or descending order. If this is the case, the system
187 looks at the natural successor or predecessor of the page,
188 checks if that is at the border of another area, and in this case
189 issues read-requests for all the pages in that area. Maybe
190 we could relax the condition that all the pages in the area
191 have to be accessed: if data is deleted from a table, there may
192 appear holes of unused pages in the area.
194 A different read-ahead mechanism is used when there appears
195 to be a random access pattern to a file.
196 If a new page is referenced in the buf_pool, and several pages
197 of its random access area (for instance, 32 consecutive pages
198 in a tablespace) have recently been referenced, we may predict
199 that the whole area may be needed in the near future, and issue
200 the read requests for the whole area.
202 AWE implementation
203 ------------------
205 By a 'block' we mean the buffer header of type buf_block_t. By a 'page'
206 we mean the physical 16 kB memory area allocated from RAM for that block.
207 By a 'frame' we mean a 16 kB area in the virtual address space of the
208 process, in the frame_mem of buf_pool.
210 We can map pages to the frames of the buffer pool.
212 1) A buffer block allocated to use as a non-data page, e.g., to the lock
213 table, is always mapped to a frame.
214 2) A bufferfixed or io-fixed data page is always mapped to a frame.
215 3) When we need to map a block to frame, we look from the list
216 awe_LRU_free_mapped and try to unmap its last block, but note that
217 bufferfixed or io-fixed pages cannot be unmapped.
218 4) For every frame in the buffer pool there is always a block whose page is
219 mapped to it. When we create the buffer pool, we map the first elements
220 in the free list to the frames.
221 5) When we have AWE enabled, we disable adaptive hash indexes.
224 /* Value in microseconds */
225 static const int WAIT_FOR_READ = 20000;
227 /* Number of attemtps made to read in a page in the buffer pool */
228 static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
230 buf_pool_t* buf_pool = NULL; /* The buffer buf_pool of the database */
232 #ifdef UNIV_DEBUG
233 ulint buf_dbg_counter = 0; /* This is used to insert validation
234 operations in excution in the
235 debug version */
236 ibool buf_debug_prints = FALSE; /* If this is set TRUE,
237 the program prints info whenever
238 read-ahead or flush occurs */
239 #endif /* UNIV_DEBUG */
240 /************************************************************************
241 Calculates a page checksum which is stored to the page when it is written
242 to a file. Note that we must be careful to calculate the same value on
243 32-bit and 64-bit architectures. */
245 ulint
246 buf_calc_page_new_checksum(
247 /*=======================*/
248 /* out: checksum */
249 byte* page) /* in: buffer page */
251 ulint checksum;
253 /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
254 ..._ARCH_LOG_NO, are written outside the buffer pool to the first
255 pages of data files, we have to skip them in the page checksum
256 calculation.
257 We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
258 checksum is stored, and also the last 8 bytes of page because
259 there we store the old formula checksum. */
261 checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
262 FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
263 + ut_fold_binary(page + FIL_PAGE_DATA,
264 UNIV_PAGE_SIZE - FIL_PAGE_DATA
265 - FIL_PAGE_END_LSN_OLD_CHKSUM);
266 checksum = checksum & 0xFFFFFFFFUL;
268 return(checksum);
271 /************************************************************************
272 In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
273 looked at the first few bytes of the page. This calculates that old
274 checksum.
275 NOTE: we must first store the new formula checksum to
276 FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
277 because this takes that field as an input! */
279 ulint
280 buf_calc_page_old_checksum(
281 /*=======================*/
282 /* out: checksum */
283 byte* page) /* in: buffer page */
285 ulint checksum;
287 checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
289 checksum = checksum & 0xFFFFFFFFUL;
291 return(checksum);
294 /************************************************************************
295 Checks if a page is corrupt. */
297 ibool
298 buf_page_is_corrupted(
299 /*==================*/
300 /* out: TRUE if corrupted */
301 byte* read_buf) /* in: a database page */
303 ulint checksum;
304 ulint old_checksum;
305 ulint checksum_field;
306 ulint old_checksum_field;
307 #ifndef UNIV_HOTBACKUP
308 dulint current_lsn;
309 #endif
310 if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
311 != mach_read_from_4(read_buf + UNIV_PAGE_SIZE
312 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
314 /* Stored log sequence numbers at the start and the end
315 of page do not match */
317 return(TRUE);
320 #ifndef UNIV_HOTBACKUP
321 if (recv_lsn_checks_on && log_peek_lsn(&current_lsn)) {
322 if (ut_dulint_cmp(current_lsn,
323 mach_read_from_8(read_buf + FIL_PAGE_LSN))
324 < 0) {
325 ut_print_timestamp(stderr);
327 fprintf(stderr,
328 " InnoDB: Error: page %lu log sequence number"
329 " %lu %lu\n"
330 "InnoDB: is in the future! Current system "
331 "log sequence number %lu %lu.\n"
332 "InnoDB: Your database may be corrupt or "
333 "you may have copied the InnoDB\n"
334 "InnoDB: tablespace but not the InnoDB "
335 "log files. See\n"
336 "InnoDB: http://dev.mysql.com/doc/refman/"
337 "5.1/en/forcing-innodb-recovery.html\n"
338 "InnoDB: for more information.\n",
339 (ulong) mach_read_from_4(read_buf
340 + FIL_PAGE_OFFSET),
341 (ulong) ut_dulint_get_high
342 (mach_read_from_8(read_buf + FIL_PAGE_LSN)),
343 (ulong) ut_dulint_get_low
344 (mach_read_from_8(read_buf + FIL_PAGE_LSN)),
345 (ulong) ut_dulint_get_high(current_lsn),
346 (ulong) ut_dulint_get_low(current_lsn));
349 #endif
351 /* If we use checksums validation, make additional check before
352 returning TRUE to ensure that the checksum is not equal to
353 BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
354 disabled. Otherwise, skip checksum calculation and return FALSE */
356 if (srv_use_checksums) {
357 old_checksum = buf_calc_page_old_checksum(read_buf);
359 old_checksum_field = mach_read_from_4(
360 read_buf + UNIV_PAGE_SIZE
361 - FIL_PAGE_END_LSN_OLD_CHKSUM);
363 /* There are 2 valid formulas for old_checksum_field:
365 1. Very old versions of InnoDB only stored 8 byte lsn to the
366 start and the end of the page.
368 2. Newer InnoDB versions store the old formula checksum
369 there. */
371 if (old_checksum_field != mach_read_from_4(read_buf
372 + FIL_PAGE_LSN)
373 && old_checksum_field != old_checksum
374 && old_checksum_field != BUF_NO_CHECKSUM_MAGIC) {
376 return(TRUE);
379 checksum = buf_calc_page_new_checksum(read_buf);
380 checksum_field = mach_read_from_4(read_buf
381 + FIL_PAGE_SPACE_OR_CHKSUM);
383 /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
384 (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */
386 if (checksum_field != 0 && checksum_field != checksum
387 && checksum_field != BUF_NO_CHECKSUM_MAGIC) {
389 return(TRUE);
393 return(FALSE);
396 /************************************************************************
397 Prints a page to stderr. */
399 void
400 buf_page_print(
401 /*===========*/
402 byte* read_buf) /* in: a database page */
404 dict_index_t* index;
405 ulint checksum;
406 ulint old_checksum;
408 ut_print_timestamp(stderr);
409 fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
410 (ulint)UNIV_PAGE_SIZE);
411 ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE);
412 fputs("InnoDB: End of page dump\n", stderr);
414 checksum = srv_use_checksums
415 ? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
416 old_checksum = srv_use_checksums
417 ? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
419 ut_print_timestamp(stderr);
420 fprintf(stderr,
421 " InnoDB: Page checksum %lu, prior-to-4.0.14-form"
422 " checksum %lu\n"
423 "InnoDB: stored checksum %lu, prior-to-4.0.14-form"
424 " stored checksum %lu\n"
425 "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
426 " at page end %lu\n"
427 "InnoDB: Page number (if stored to page already) %lu,\n"
428 "InnoDB: space id (if created with >= MySQL-4.1.1"
429 " and stored already) %lu\n",
430 (ulong) checksum, (ulong) old_checksum,
431 (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
432 (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
433 - FIL_PAGE_END_LSN_OLD_CHKSUM),
434 (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
435 (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
436 (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
437 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
438 (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
439 (ulong) mach_read_from_4(read_buf
440 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
442 if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
443 == TRX_UNDO_INSERT) {
444 fprintf(stderr,
445 "InnoDB: Page may be an insert undo log page\n");
446 } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
447 + TRX_UNDO_PAGE_TYPE)
448 == TRX_UNDO_UPDATE) {
449 fprintf(stderr,
450 "InnoDB: Page may be an update undo log page\n");
453 switch (fil_page_get_type(read_buf)) {
454 case FIL_PAGE_INDEX:
455 fprintf(stderr,
456 "InnoDB: Page may be an index page where"
457 " index id is %lu %lu\n",
458 (ulong) ut_dulint_get_high
459 (btr_page_get_index_id(read_buf)),
460 (ulong) ut_dulint_get_low
461 (btr_page_get_index_id(read_buf)));
463 /* If the code is in ibbackup, dict_sys may be uninitialized,
464 i.e., NULL */
466 if (dict_sys != NULL) {
468 index = dict_index_find_on_id_low(
469 btr_page_get_index_id(read_buf));
470 if (index) {
471 fputs("InnoDB: (", stderr);
472 dict_index_name_print(stderr, NULL, index);
473 fputs(")\n", stderr);
476 break;
477 case FIL_PAGE_INODE:
478 fputs("InnoDB: Page may be an 'inode' page\n", stderr);
479 break;
480 case FIL_PAGE_IBUF_FREE_LIST:
481 fputs("InnoDB: Page may be an insert buffer free list page\n",
482 stderr);
483 break;
484 case FIL_PAGE_TYPE_ALLOCATED:
485 fputs("InnoDB: Page may be a freshly allocated page\n",
486 stderr);
487 break;
488 case FIL_PAGE_IBUF_BITMAP:
489 fputs("InnoDB: Page may be an insert buffer bitmap page\n",
490 stderr);
491 break;
492 case FIL_PAGE_TYPE_SYS:
493 fputs("InnoDB: Page may be a system page\n",
494 stderr);
495 break;
496 case FIL_PAGE_TYPE_TRX_SYS:
497 fputs("InnoDB: Page may be a transaction system page\n",
498 stderr);
499 break;
500 case FIL_PAGE_TYPE_FSP_HDR:
501 fputs("InnoDB: Page may be a file space header page\n",
502 stderr);
503 break;
504 case FIL_PAGE_TYPE_XDES:
505 fputs("InnoDB: Page may be an extent descriptor page\n",
506 stderr);
507 break;
508 case FIL_PAGE_TYPE_BLOB:
509 fputs("InnoDB: Page may be a BLOB page\n",
510 stderr);
511 break;
515 /************************************************************************
516 Initializes a buffer control block when the buf_pool is created. */
517 static
518 void
519 buf_block_init(
520 /*===========*/
521 buf_block_t* block, /* in: pointer to control block */
522 byte* frame) /* in: pointer to buffer frame, or NULL if in
523 the case of AWE there is no frame */
525 block->magic_n = 0;
527 block->state = BUF_BLOCK_NOT_USED;
529 block->frame = frame;
531 block->awe_info = NULL;
533 block->buf_fix_count = 0;
534 block->io_fix = 0;
536 block->modify_clock = ut_dulint_zero;
538 block->file_page_was_freed = FALSE;
540 block->check_index_page_at_flush = FALSE;
541 block->index = NULL;
543 block->in_free_list = FALSE;
544 block->in_LRU_list = FALSE;
546 block->n_pointers = 0;
548 mutex_create(&block->mutex, SYNC_BUF_BLOCK);
550 rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
551 ut_ad(rw_lock_validate(&(block->lock)));
553 #ifdef UNIV_SYNC_DEBUG
554 rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
555 #endif /* UNIV_SYNC_DEBUG */
558 /************************************************************************
559 Creates the buffer pool. */
561 buf_pool_t*
562 buf_pool_init(
563 /*==========*/
564 /* out, own: buf_pool object, NULL if not
565 enough memory or error */
566 ulint max_size, /* in: maximum size of the buf_pool in
567 blocks */
568 ulint curr_size, /* in: current size to use, must be <=
569 max_size, currently must be equal to
570 max_size */
571 ulint n_frames) /* in: number of frames; if AWE is used,
572 this is the size of the address space window
573 where physical memory pages are mapped; if
574 AWE is not used then this must be the same
575 as max_size */
577 byte* frame;
578 ulint i;
579 buf_block_t* block;
581 ut_a(max_size == curr_size);
582 ut_a(srv_use_awe || n_frames == max_size);
584 if (n_frames > curr_size) {
585 fprintf(stderr,
586 "InnoDB: AWE: Error: you must specify in my.cnf"
587 " .._awe_mem_mb larger\n"
588 "InnoDB: than .._buffer_pool_size. Now the former"
589 " is %lu pages,\n"
590 "InnoDB: the latter %lu pages.\n",
591 (ulong) curr_size, (ulong) n_frames);
593 return(NULL);
596 buf_pool = mem_alloc(sizeof(buf_pool_t));
598 /* 1. Initialize general fields
599 ---------------------------- */
600 mutex_create(&buf_pool->mutex, SYNC_BUF_POOL);
602 mutex_enter(&(buf_pool->mutex));
604 if (srv_use_awe) {
605 /*----------------------------------------*/
606 /* Allocate the virtual address space window, i.e., the
607 buffer pool frames */
609 buf_pool->frame_mem = os_awe_allocate_virtual_mem_window(
610 UNIV_PAGE_SIZE * (n_frames + 1));
612 /* Allocate the physical memory for AWE and the AWE info array
613 for buf_pool */
615 if ((curr_size % ((1024 * 1024) / UNIV_PAGE_SIZE)) != 0) {
617 fprintf(stderr,
618 "InnoDB: AWE: Error: physical memory must be"
619 " allocated in full megabytes.\n"
620 "InnoDB: Trying to allocate %lu"
621 " database pages.\n",
622 (ulong) curr_size);
624 return(NULL);
627 if (!os_awe_allocate_physical_mem(&(buf_pool->awe_info),
628 curr_size
629 / ((1024 * 1024)
630 / UNIV_PAGE_SIZE))) {
632 return(NULL);
634 /*----------------------------------------*/
635 } else {
636 buf_pool->frame_mem = os_mem_alloc_large(
637 UNIV_PAGE_SIZE * (n_frames + 1), FALSE);
640 if (buf_pool->frame_mem == NULL) {
642 return(NULL);
645 buf_pool->blocks = ut_malloc(sizeof(buf_block_t) * max_size);
647 if (buf_pool->blocks == NULL) {
649 return(NULL);
652 buf_pool->max_size = max_size;
653 buf_pool->curr_size = curr_size;
655 buf_pool->n_frames = n_frames;
657 /* Align pointer to the first frame */
659 frame = ut_align(buf_pool->frame_mem, UNIV_PAGE_SIZE);
661 buf_pool->frame_zero = frame;
662 buf_pool->high_end = frame + UNIV_PAGE_SIZE * n_frames;
664 if (srv_use_awe) {
665 /*----------------------------------------*/
666 /* Map an initial part of the allocated physical memory to
667 the window */
669 os_awe_map_physical_mem_to_window(buf_pool->frame_zero,
670 n_frames
671 * (UNIV_PAGE_SIZE
672 / OS_AWE_X86_PAGE_SIZE),
673 buf_pool->awe_info);
674 /*----------------------------------------*/
677 buf_pool->blocks_of_frames = ut_malloc(sizeof(void*) * n_frames);
679 if (buf_pool->blocks_of_frames == NULL) {
681 return(NULL);
684 /* Init block structs and assign frames for them; in the case of
685 AWE there are less frames than blocks. Then we assign the frames
686 to the first blocks (we already mapped the memory above). We also
687 init the awe_info for every block. */
689 for (i = 0; i < max_size; i++) {
691 block = buf_pool_get_nth_block(buf_pool, i);
693 if (i < n_frames) {
694 frame = buf_pool->frame_zero + i * UNIV_PAGE_SIZE;
695 *(buf_pool->blocks_of_frames + i) = block;
696 } else {
697 frame = NULL;
700 buf_block_init(block, frame);
702 if (srv_use_awe) {
703 /*----------------------------------------*/
704 block->awe_info = buf_pool->awe_info
705 + i * (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE);
706 /*----------------------------------------*/
710 buf_pool->page_hash = hash_create(2 * max_size);
712 buf_pool->n_pend_reads = 0;
714 buf_pool->last_printout_time = time(NULL);
716 buf_pool->n_pages_read = 0;
717 buf_pool->n_pages_written = 0;
718 buf_pool->n_pages_created = 0;
719 buf_pool->n_pages_awe_remapped = 0;
721 buf_pool->n_page_gets = 0;
722 buf_pool->n_page_gets_old = 0;
723 buf_pool->n_pages_read_old = 0;
724 buf_pool->n_pages_written_old = 0;
725 buf_pool->n_pages_created_old = 0;
726 buf_pool->n_pages_awe_remapped_old = 0;
728 /* 2. Initialize flushing fields
729 ---------------------------- */
730 UT_LIST_INIT(buf_pool->flush_list);
732 for (i = BUF_FLUSH_LRU; i <= BUF_FLUSH_LIST; i++) {
733 buf_pool->n_flush[i] = 0;
734 buf_pool->init_flush[i] = FALSE;
735 buf_pool->no_flush[i] = os_event_create(NULL);
738 buf_pool->LRU_flush_ended = 0;
740 buf_pool->ulint_clock = 1;
741 buf_pool->freed_page_clock = 0;
743 /* 3. Initialize LRU fields
744 ---------------------------- */
745 UT_LIST_INIT(buf_pool->LRU);
747 buf_pool->LRU_old = NULL;
749 UT_LIST_INIT(buf_pool->awe_LRU_free_mapped);
751 /* Add control blocks to the free list */
752 UT_LIST_INIT(buf_pool->free);
754 for (i = 0; i < curr_size; i++) {
756 block = buf_pool_get_nth_block(buf_pool, i);
758 if (block->frame) {
759 UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
761 if (srv_use_awe) {
762 /* Add to the list of blocks mapped to
763 frames */
765 UT_LIST_ADD_LAST(awe_LRU_free_mapped,
766 buf_pool->awe_LRU_free_mapped,
767 block);
771 UT_LIST_ADD_LAST(free, buf_pool->free, block);
772 block->in_free_list = TRUE;
775 mutex_exit(&(buf_pool->mutex));
777 if (srv_use_adaptive_hash_indexes) {
778 btr_search_sys_create(curr_size * UNIV_PAGE_SIZE
779 / sizeof(void*) / 64);
780 } else {
781 /* Create only a small dummy system */
782 btr_search_sys_create(1000);
785 return(buf_pool);
788 /************************************************************************
789 Maps the page of block to a frame, if not mapped yet. Unmaps some page
790 from the end of the awe_LRU_free_mapped. */
792 void
793 buf_awe_map_page_to_frame(
794 /*======================*/
795 buf_block_t* block, /* in: block whose page should be
796 mapped to a frame */
797 ibool add_to_mapped_list) /* in: TRUE if we in the case
798 we need to map the page should also
799 add the block to the
800 awe_LRU_free_mapped list */
802 buf_block_t* bck;
804 ut_ad(mutex_own(&(buf_pool->mutex)));
805 ut_ad(block);
807 if (block->frame) {
809 return;
812 /* Scan awe_LRU_free_mapped from the end and try to find a block
813 which is not bufferfixed or io-fixed */
815 bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);
817 while (bck) {
818 ibool skip;
820 mutex_enter(&bck->mutex);
822 skip = (bck->state == BUF_BLOCK_FILE_PAGE
823 && (bck->buf_fix_count != 0 || bck->io_fix != 0));
825 if (skip) {
826 mutex_exit(&bck->mutex);
828 /* We have to skip this */
829 bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
830 } else {
831 /* We can map block to the frame of bck */
833 os_awe_map_physical_mem_to_window(
834 bck->frame,
835 UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE,
836 block->awe_info);
838 block->frame = bck->frame;
840 *(buf_pool->blocks_of_frames
841 + (((ulint)(block->frame
842 - buf_pool->frame_zero))
843 >> UNIV_PAGE_SIZE_SHIFT))
844 = block;
846 bck->frame = NULL;
847 UT_LIST_REMOVE(awe_LRU_free_mapped,
848 buf_pool->awe_LRU_free_mapped,
849 bck);
851 if (add_to_mapped_list) {
852 UT_LIST_ADD_FIRST(
853 awe_LRU_free_mapped,
854 buf_pool->awe_LRU_free_mapped,
855 block);
858 buf_pool->n_pages_awe_remapped++;
860 mutex_exit(&bck->mutex);
862 return;
866 fprintf(stderr,
867 "InnoDB: AWE: Fatal error: cannot find a page to unmap\n"
868 "InnoDB: awe_LRU_free_mapped list length %lu\n",
869 (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
871 ut_a(0);
874 /************************************************************************
875 Allocates a buffer block. */
876 UNIV_INLINE
877 buf_block_t*
878 buf_block_alloc(void)
879 /*=================*/
880 /* out, own: the allocated block; also if AWE
881 is used it is guaranteed that the page is
882 mapped to a frame */
884 buf_block_t* block;
886 block = buf_LRU_get_free_block();
888 return(block);
891 /************************************************************************
892 Moves to the block to the start of the LRU list if there is a danger
893 that the block would drift out of the buffer pool. */
894 UNIV_INLINE
895 void
896 buf_block_make_young(
897 /*=================*/
898 buf_block_t* block) /* in: block to make younger */
900 ut_ad(!mutex_own(&(buf_pool->mutex)));
902 /* Note that we read freed_page_clock's without holding any mutex:
903 this is allowed since the result is used only in heuristics */
905 if (buf_block_peek_if_too_old(block)) {
907 mutex_enter(&buf_pool->mutex);
908 /* There has been freeing activity in the LRU list:
909 best to move to the head of the LRU list */
911 buf_LRU_make_block_young(block);
912 mutex_exit(&buf_pool->mutex);
916 /************************************************************************
917 Moves a page to the start of the buffer pool LRU list. This high-level
918 function can be used to prevent an important page from from slipping out of
919 the buffer pool. */
921 void
922 buf_page_make_young(
923 /*================*/
924 buf_frame_t* frame) /* in: buffer frame of a file page */
926 buf_block_t* block;
928 mutex_enter(&(buf_pool->mutex));
930 block = buf_block_align(frame);
932 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
934 buf_LRU_make_block_young(block);
936 mutex_exit(&(buf_pool->mutex));
939 /************************************************************************
940 Frees a buffer block which does not contain a file page. */
941 UNIV_INLINE
942 void
943 buf_block_free(
944 /*===========*/
945 buf_block_t* block) /* in, own: block to be freed */
947 mutex_enter(&(buf_pool->mutex));
949 mutex_enter(&block->mutex);
951 ut_a(block->state != BUF_BLOCK_FILE_PAGE);
953 buf_LRU_block_free_non_file_page(block);
955 mutex_exit(&block->mutex);
957 mutex_exit(&(buf_pool->mutex));
960 /*************************************************************************
961 Allocates a buffer frame. */
963 buf_frame_t*
964 buf_frame_alloc(void)
965 /*=================*/
966 /* out: buffer frame */
968 return(buf_block_alloc()->frame);
971 /*************************************************************************
972 Frees a buffer frame which does not contain a file page. */
974 void
975 buf_frame_free(
976 /*===========*/
977 buf_frame_t* frame) /* in: buffer frame */
979 buf_block_free(buf_block_align(frame));
982 /************************************************************************
983 Returns the buffer control block if the page can be found in the buffer
984 pool. NOTE that it is possible that the page is not yet read
985 from disk, though. This is a very low-level function: use with care! */
987 buf_block_t*
988 buf_page_peek_block(
989 /*================*/
990 /* out: control block if found from page hash table,
991 otherwise NULL; NOTE that the page is not necessarily
992 yet read from disk! */
993 ulint space, /* in: space id */
994 ulint offset) /* in: page number */
996 buf_block_t* block;
998 mutex_enter_fast(&(buf_pool->mutex));
1000 block = buf_page_hash_get(space, offset);
1002 mutex_exit(&(buf_pool->mutex));
1004 return(block);
1007 /************************************************************************
1008 Returns the current state of is_hashed of a page. FALSE if the page is
1009 not in the pool. NOTE that this operation does not fix the page in the
1010 pool if it is found there. */
1012 ibool
1013 buf_page_peek_if_search_hashed(
1014 /*===========================*/
1015 /* out: TRUE if page hash index is built in search
1016 system */
1017 ulint space, /* in: space id */
1018 ulint offset) /* in: page number */
1020 buf_block_t* block;
1021 ibool is_hashed;
1023 mutex_enter_fast(&(buf_pool->mutex));
1025 block = buf_page_hash_get(space, offset);
1027 if (!block) {
1028 is_hashed = FALSE;
1029 } else {
1030 is_hashed = block->is_hashed;
1033 mutex_exit(&(buf_pool->mutex));
1035 return(is_hashed);
1038 /************************************************************************
1039 Returns TRUE if the page can be found in the buffer pool hash table. NOTE
1040 that it is possible that the page is not yet read from disk, though. */
1042 ibool
1043 buf_page_peek(
1044 /*==========*/
1045 /* out: TRUE if found from page hash table,
1046 NOTE that the page is not necessarily yet read
1047 from disk! */
1048 ulint space, /* in: space id */
1049 ulint offset) /* in: page number */
1051 if (buf_page_peek_block(space, offset)) {
1053 return(TRUE);
1056 return(FALSE);
1059 /************************************************************************
1060 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
1061 This function should be called when we free a file page and want the
1062 debug version to check that it is not accessed any more unless
1063 reallocated. */
1065 buf_block_t*
1066 buf_page_set_file_page_was_freed(
1067 /*=============================*/
1068 /* out: control block if found from page hash table,
1069 otherwise NULL */
1070 ulint space, /* in: space id */
1071 ulint offset) /* in: page number */
1073 buf_block_t* block;
1075 mutex_enter_fast(&(buf_pool->mutex));
1077 block = buf_page_hash_get(space, offset);
1079 if (block) {
1080 block->file_page_was_freed = TRUE;
1083 mutex_exit(&(buf_pool->mutex));
1085 return(block);
1088 /************************************************************************
1089 Sets file_page_was_freed FALSE if the page is found in the buffer pool.
1090 This function should be called when we free a file page and want the
1091 debug version to check that it is not accessed any more unless
1092 reallocated. */
1094 buf_block_t*
1095 buf_page_reset_file_page_was_freed(
1096 /*===============================*/
1097 /* out: control block if found from page hash table,
1098 otherwise NULL */
1099 ulint space, /* in: space id */
1100 ulint offset) /* in: page number */
1102 buf_block_t* block;
1104 mutex_enter_fast(&(buf_pool->mutex));
1106 block = buf_page_hash_get(space, offset);
1108 if (block) {
1109 block->file_page_was_freed = FALSE;
1112 mutex_exit(&(buf_pool->mutex));
1114 return(block);
1117 /************************************************************************
1118 This is the general function used to get access to a database page. */
1120 buf_frame_t*
1121 buf_page_get_gen(
1122 /*=============*/
1123 /* out: pointer to the frame or NULL */
1124 ulint space, /* in: space id */
1125 ulint offset, /* in: page number */
1126 ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
1127 buf_frame_t* guess, /* in: guessed frame or NULL */
1128 ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL,
1129 BUF_GET_NO_LATCH, BUF_GET_NOWAIT */
1130 const char* file, /* in: file name */
1131 ulint line, /* in: line where called */
1132 mtr_t* mtr) /* in: mini-transaction */
1134 buf_block_t* block;
1135 ibool accessed;
1136 ulint fix_type;
1137 ibool success;
1138 ibool must_read;
1139 ulint retries = 0;
1141 ut_ad(mtr);
1142 ut_ad((rw_latch == RW_S_LATCH)
1143 || (rw_latch == RW_X_LATCH)
1144 || (rw_latch == RW_NO_LATCH));
1145 ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
1146 ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
1147 || (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT));
1148 #ifndef UNIV_LOG_DEBUG
1149 ut_ad(!ibuf_inside() || ibuf_page(space, offset));
1150 #endif
1151 buf_pool->n_page_gets++;
1152 loop:
1153 block = NULL;
1154 mutex_enter_fast(&(buf_pool->mutex));
1156 if (guess) {
1157 block = buf_block_align(guess);
1159 if ((offset != block->offset) || (space != block->space)
1160 || (block->state != BUF_BLOCK_FILE_PAGE)) {
1162 block = NULL;
1166 if (block == NULL) {
1167 block = buf_page_hash_get(space, offset);
1170 if (block == NULL) {
1171 /* Page not in buf_pool: needs to be read from file */
1173 mutex_exit(&(buf_pool->mutex));
1175 if (mode == BUF_GET_IF_IN_POOL) {
1177 return(NULL);
1180 if (buf_read_page(space, offset)) {
1181 retries = 0;
1182 } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
1183 ++retries;
1184 } else {
1185 fprintf(stderr, "InnoDB: Error: Unable"
1186 " to read tablespace %lu page no"
1187 " %lu into the buffer pool after"
1188 " %lu attempts\n"
1189 "InnoDB: The most probable cause"
1190 " of this error may be that the"
1191 " table has been corrupted.\n"
1192 "InnoDB: You can try to fix this"
1193 " problem by using"
1194 " innodb_force_recovery.\n"
1195 "InnoDB: Please see reference manual"
1196 " for more details.\n"
1197 "InnoDB: Aborting...\n",
1198 space, offset,
1199 BUF_PAGE_READ_MAX_RETRIES);
1201 ut_error;
1204 #ifdef UNIV_DEBUG
1205 buf_dbg_counter++;
1207 if (buf_dbg_counter % 37 == 0) {
1208 ut_ad(buf_validate());
1210 #endif
1211 goto loop;
1214 mutex_enter(&block->mutex);
1216 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1218 must_read = FALSE;
1220 if (block->io_fix == BUF_IO_READ) {
1222 must_read = TRUE;
1224 if (mode == BUF_GET_IF_IN_POOL) {
1225 /* The page is only being read to buffer */
1226 mutex_exit(&buf_pool->mutex);
1227 mutex_exit(&block->mutex);
1229 return(NULL);
1233 /* If AWE is enabled and the page is not mapped to a frame, then
1234 map it */
1236 if (block->frame == NULL) {
1237 ut_a(srv_use_awe);
1239 /* We set second parameter TRUE because the block is in the
1240 LRU list and we must put it to awe_LRU_free_mapped list once
1241 mapped to a frame */
1243 buf_awe_map_page_to_frame(block, TRUE);
1246 #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1247 if (mode == BUF_GET_IF_IN_POOL && ibuf_debug) {
1248 /* Try to evict the block from the buffer pool, to use the
1249 insert buffer as much as possible. */
1251 if (buf_LRU_free_block(block)) {
1252 mutex_exit(&buf_pool->mutex);
1253 mutex_exit(&block->mutex);
1254 fprintf(stderr,
1255 "innodb_change_buffering_debug evict %u %u\n",
1256 (unsigned) space, (unsigned) offset);
1257 return(NULL);
1258 } else if (buf_flush_page_try(block)) {
1259 fprintf(stderr,
1260 "innodb_change_buffering_debug flush %u %u\n",
1261 (unsigned) space, (unsigned) offset);
1262 guess = block->frame;
1263 goto loop;
1266 /* Failed to evict the page; change it directly */
1268 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1270 #ifdef UNIV_SYNC_DEBUG
1271 buf_block_buf_fix_inc_debug(block, file, line);
1272 #else
1273 buf_block_buf_fix_inc(block);
1274 #endif
1275 mutex_exit(&buf_pool->mutex);
1277 /* Check if this is the first access to the page */
1279 accessed = block->accessed;
1281 block->accessed = TRUE;
1283 mutex_exit(&block->mutex);
1285 buf_block_make_young(block);
1287 #ifdef UNIV_DEBUG_FILE_ACCESSES
1288 ut_a(block->file_page_was_freed == FALSE);
1289 #endif
1291 #ifdef UNIV_DEBUG
1292 buf_dbg_counter++;
1294 if (buf_dbg_counter % 5771 == 0) {
1295 ut_ad(buf_validate());
1297 #endif
1298 ut_ad(block->buf_fix_count > 0);
1299 ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1301 if (mode == BUF_GET_NOWAIT) {
1302 if (rw_latch == RW_S_LATCH) {
1303 success = rw_lock_s_lock_func_nowait(&(block->lock),
1304 file, line);
1305 fix_type = MTR_MEMO_PAGE_S_FIX;
1306 } else {
1307 ut_ad(rw_latch == RW_X_LATCH);
1308 success = rw_lock_x_lock_func_nowait(&(block->lock),
1309 file, line);
1310 fix_type = MTR_MEMO_PAGE_X_FIX;
1313 if (!success) {
1314 mutex_enter(&block->mutex);
1316 block->buf_fix_count--;
1318 mutex_exit(&block->mutex);
1319 #ifdef UNIV_SYNC_DEBUG
1320 rw_lock_s_unlock(&(block->debug_latch));
1321 #endif
1323 return(NULL);
1325 } else if (rw_latch == RW_NO_LATCH) {
1327 if (must_read) {
1328 /* Let us wait until the read operation
1329 completes */
1331 for (;;) {
1332 mutex_enter(&block->mutex);
1334 if (block->io_fix == BUF_IO_READ) {
1336 mutex_exit(&block->mutex);
1338 os_thread_sleep(WAIT_FOR_READ);
1339 } else {
1341 mutex_exit(&block->mutex);
1343 break;
1348 fix_type = MTR_MEMO_BUF_FIX;
1349 } else if (rw_latch == RW_S_LATCH) {
1351 rw_lock_s_lock_func(&(block->lock), 0, file, line);
1353 fix_type = MTR_MEMO_PAGE_S_FIX;
1354 } else {
1355 rw_lock_x_lock_func(&(block->lock), 0, file, line);
1357 fix_type = MTR_MEMO_PAGE_X_FIX;
1360 mtr_memo_push(mtr, block, fix_type);
1362 if (!accessed) {
1363 /* In the case of a first access, try to apply linear
1364 read-ahead */
1366 buf_read_ahead_linear(space, offset);
1369 #ifdef UNIV_IBUF_DEBUG
1370 ut_a(ibuf_count_get(block->space, block->offset) == 0);
1371 #endif
1372 return(block->frame);
1375 /************************************************************************
1376 This is the general function used to get optimistic access to a database
1377 page. */
1379 ibool
1380 buf_page_optimistic_get_func(
1381 /*=========================*/
1382 /* out: TRUE if success */
1383 ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
1384 buf_block_t* block, /* in: guessed buffer block */
1385 buf_frame_t* guess, /* in: guessed frame; note that AWE may move
1386 frames */
1387 dulint modify_clock,/* in: modify clock value if mode is
1388 ..._GUESS_ON_CLOCK */
1389 const char* file, /* in: file name */
1390 ulint line, /* in: line where called */
1391 mtr_t* mtr) /* in: mini-transaction */
1393 ibool accessed;
1394 ibool success;
1395 ulint fix_type;
1397 ut_ad(mtr && block);
1398 ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
1400 /* If AWE is used, block may have a different frame now, e.g., NULL */
1402 mutex_enter(&block->mutex);
1404 if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE)
1405 || UNIV_UNLIKELY(block->frame != guess)) {
1407 mutex_exit(&block->mutex);
1409 return(FALSE);
1412 #ifdef UNIV_SYNC_DEBUG
1413 buf_block_buf_fix_inc_debug(block, file, line);
1414 #else
1415 buf_block_buf_fix_inc(block);
1416 #endif
1417 accessed = block->accessed;
1418 block->accessed = TRUE;
1420 mutex_exit(&block->mutex);
1422 buf_block_make_young(block);
1424 /* Check if this is the first access to the page */
1426 ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));
1428 if (rw_latch == RW_S_LATCH) {
1429 success = rw_lock_s_lock_func_nowait(&(block->lock),
1430 file, line);
1431 fix_type = MTR_MEMO_PAGE_S_FIX;
1432 } else {
1433 success = rw_lock_x_lock_func_nowait(&(block->lock),
1434 file, line);
1435 fix_type = MTR_MEMO_PAGE_X_FIX;
1438 if (UNIV_UNLIKELY(!success)) {
1439 mutex_enter(&block->mutex);
1441 block->buf_fix_count--;
1443 mutex_exit(&block->mutex);
1445 #ifdef UNIV_SYNC_DEBUG
1446 rw_lock_s_unlock(&(block->debug_latch));
1447 #endif
1448 return(FALSE);
1451 if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) {
1452 #ifdef UNIV_SYNC_DEBUG
1453 buf_page_dbg_add_level(block->frame, SYNC_NO_ORDER_CHECK);
1454 #endif /* UNIV_SYNC_DEBUG */
1455 if (rw_latch == RW_S_LATCH) {
1456 rw_lock_s_unlock(&(block->lock));
1457 } else {
1458 rw_lock_x_unlock(&(block->lock));
1461 mutex_enter(&block->mutex);
1463 block->buf_fix_count--;
1465 mutex_exit(&block->mutex);
1467 #ifdef UNIV_SYNC_DEBUG
1468 rw_lock_s_unlock(&(block->debug_latch));
1469 #endif
1470 return(FALSE);
1473 mtr_memo_push(mtr, block, fix_type);
1475 #ifdef UNIV_DEBUG
1476 buf_dbg_counter++;
1478 if (buf_dbg_counter % 5771 == 0) {
1479 ut_ad(buf_validate());
1481 #endif
1482 ut_ad(block->buf_fix_count > 0);
1483 ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1485 #ifdef UNIV_DEBUG_FILE_ACCESSES
1486 ut_a(block->file_page_was_freed == FALSE);
1487 #endif
1488 if (UNIV_UNLIKELY(!accessed)) {
1489 /* In the case of a first access, try to apply linear
1490 read-ahead */
1492 buf_read_ahead_linear(buf_frame_get_space_id(guess),
1493 buf_frame_get_page_no(guess));
1496 #ifdef UNIV_IBUF_DEBUG
1497 ut_a(ibuf_count_get(block->space, block->offset) == 0);
1498 #endif
1499 buf_pool->n_page_gets++;
1501 return(TRUE);
1504 /************************************************************************
1505 This is used to get access to a known database page, when no waiting can be
1506 done. For example, if a search in an adaptive hash index leads us to this
1507 frame. */
1509 ibool
1510 buf_page_get_known_nowait(
1511 /*======================*/
1512 /* out: TRUE if success */
1513 ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */
1514 buf_frame_t* guess, /* in: the known page frame */
1515 ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
1516 const char* file, /* in: file name */
1517 ulint line, /* in: line where called */
1518 mtr_t* mtr) /* in: mini-transaction */
1520 buf_block_t* block;
1521 ibool success;
1522 ulint fix_type;
1524 ut_ad(mtr);
1525 ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
1527 block = buf_block_align(guess);
1529 mutex_enter(&block->mutex);
1531 if (block->state == BUF_BLOCK_REMOVE_HASH) {
1532 /* Another thread is just freeing the block from the LRU list
1533 of the buffer pool: do not try to access this page; this
1534 attempt to access the page can only come through the hash
1535 index because when the buffer block state is ..._REMOVE_HASH,
1536 we have already removed it from the page address hash table
1537 of the buffer pool. */
1539 mutex_exit(&block->mutex);
1541 return(FALSE);
1544 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1546 #ifdef UNIV_SYNC_DEBUG
1547 buf_block_buf_fix_inc_debug(block, file, line);
1548 #else
1549 buf_block_buf_fix_inc(block);
1550 #endif
1551 mutex_exit(&block->mutex);
1553 if (mode == BUF_MAKE_YOUNG) {
1554 buf_block_make_young(block);
1557 ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
1559 if (rw_latch == RW_S_LATCH) {
1560 success = rw_lock_s_lock_func_nowait(&(block->lock),
1561 file, line);
1562 fix_type = MTR_MEMO_PAGE_S_FIX;
1563 } else {
1564 success = rw_lock_x_lock_func_nowait(&(block->lock),
1565 file, line);
1566 fix_type = MTR_MEMO_PAGE_X_FIX;
1569 if (!success) {
1570 mutex_enter(&block->mutex);
1572 block->buf_fix_count--;
1574 mutex_exit(&block->mutex);
1576 #ifdef UNIV_SYNC_DEBUG
1577 rw_lock_s_unlock(&(block->debug_latch));
1578 #endif
1580 return(FALSE);
1583 mtr_memo_push(mtr, block, fix_type);
1585 #ifdef UNIV_DEBUG
1586 buf_dbg_counter++;
1588 if (buf_dbg_counter % 5771 == 0) {
1589 ut_ad(buf_validate());
1591 #endif
1592 ut_ad(block->buf_fix_count > 0);
1593 ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
1594 #ifdef UNIV_DEBUG_FILE_ACCESSES
1595 ut_a(block->file_page_was_freed == FALSE);
1596 #endif
1598 #ifdef UNIV_IBUF_DEBUG
1599 ut_a((mode == BUF_KEEP_OLD)
1600 || (ibuf_count_get(block->space, block->offset) == 0));
1601 #endif
1602 buf_pool->n_page_gets++;
1604 return(TRUE);
1607 /************************************************************************
1608 Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
1610 void
1611 buf_page_init_for_backup_restore(
1612 /*=============================*/
1613 ulint space, /* in: space id */
1614 ulint offset, /* in: offset of the page within space
1615 in units of a page */
1616 buf_block_t* block) /* in: block to init */
1618 /* Set the state of the block */
1619 block->magic_n = BUF_BLOCK_MAGIC_N;
1621 block->state = BUF_BLOCK_FILE_PAGE;
1622 block->space = space;
1623 block->offset = offset;
1625 block->lock_hash_val = 0;
1627 block->freed_page_clock = 0;
1629 block->newest_modification = ut_dulint_zero;
1630 block->oldest_modification = ut_dulint_zero;
1632 block->accessed = FALSE;
1633 block->buf_fix_count = 0;
1634 block->io_fix = 0;
1636 block->n_hash_helps = 0;
1637 block->is_hashed = FALSE;
1638 block->n_fields = 1;
1639 block->n_bytes = 0;
1640 block->left_side = TRUE;
1642 block->file_page_was_freed = FALSE;
1645 /************************************************************************
1646 Inits a page to the buffer buf_pool. */
1647 static
1648 void
1649 buf_page_init(
1650 /*==========*/
1651 ulint space, /* in: space id */
1652 ulint offset, /* in: offset of the page within space
1653 in units of a page */
1654 buf_block_t* block) /* in: block to init */
1657 ut_ad(mutex_own(&(buf_pool->mutex)));
1658 ut_ad(mutex_own(&(block->mutex)));
1659 ut_a(block->state != BUF_BLOCK_FILE_PAGE);
1661 /* Set the state of the block */
1662 block->magic_n = BUF_BLOCK_MAGIC_N;
1664 block->state = BUF_BLOCK_FILE_PAGE;
1665 block->space = space;
1666 block->offset = offset;
1668 block->check_index_page_at_flush = FALSE;
1669 block->index = NULL;
1671 block->lock_hash_val = lock_rec_hash(space, offset);
1673 #ifdef UNIV_DEBUG_VALGRIND
1674 if (!space) {
1675 /* Silence valid Valgrind warnings about uninitialized
1676 data being written to data files. There are some unused
1677 bytes on some pages that InnoDB does not initialize. */
1678 UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
1680 #endif /* UNIV_DEBUG_VALGRIND */
1682 /* Insert into the hash table of file pages */
1684 if (buf_page_hash_get(space, offset)) {
1685 fprintf(stderr,
1686 "InnoDB: Error: page %lu %lu already found"
1687 " in the hash table\n",
1688 (ulong) space,
1689 (ulong) offset);
1690 #ifdef UNIV_DEBUG
1691 buf_print();
1692 buf_LRU_print();
1693 buf_validate();
1694 buf_LRU_validate();
1695 #endif /* UNIV_DEBUG */
1696 ut_a(0);
1699 HASH_INSERT(buf_block_t, hash, buf_pool->page_hash,
1700 buf_page_address_fold(space, offset), block);
1702 block->freed_page_clock = 0;
1704 block->newest_modification = ut_dulint_zero;
1705 block->oldest_modification = ut_dulint_zero;
1707 block->accessed = FALSE;
1708 block->buf_fix_count = 0;
1709 block->io_fix = 0;
1711 block->n_hash_helps = 0;
1712 block->is_hashed = FALSE;
1713 block->n_fields = 1;
1714 block->n_bytes = 0;
1715 block->left_side = TRUE;
1717 block->file_page_was_freed = FALSE;
1720 /************************************************************************
1721 Function which inits a page for read to the buffer buf_pool. If the page is
1722 (1) already in buf_pool, or
1723 (2) if we specify to read only ibuf pages and the page is not an ibuf page, or
1724 (3) if the space is deleted or being deleted,
1725 then this function does nothing.
1726 Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
1727 on the buffer frame. The io-handler must take care that the flag is cleared
1728 and the lock released later. This is one of the functions which perform the
1729 state transition NOT_USED => FILE_PAGE to a block (the other is
1730 buf_page_create). */
1732 buf_block_t*
1733 buf_page_init_for_read(
1734 /*===================*/
1735 /* out: pointer to the block or NULL */
1736 ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
1737 ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */
1738 ulint space, /* in: space id */
1739 ib_longlong tablespace_version,/* in: prevents reading from a wrong
1740 version of the tablespace in case we have done
1741 DISCARD + IMPORT */
1742 ulint offset) /* in: page number */
1744 buf_block_t* block;
1745 mtr_t mtr;
1747 ut_ad(buf_pool);
1749 *err = DB_SUCCESS;
1751 if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1752 /* It is a read-ahead within an ibuf routine */
1754 ut_ad(!ibuf_bitmap_page(offset));
1755 ut_ad(ibuf_inside());
1757 mtr_start(&mtr);
1759 if (!ibuf_page_low(space, offset, &mtr)) {
1761 mtr_commit(&mtr);
1763 return(NULL);
1765 } else {
1766 ut_ad(mode == BUF_READ_ANY_PAGE);
1769 block = buf_block_alloc();
1771 ut_a(block);
1773 mutex_enter(&(buf_pool->mutex));
1774 mutex_enter(&block->mutex);
1776 if (fil_tablespace_deleted_or_being_deleted_in_mem(
1777 space, tablespace_version)) {
1778 *err = DB_TABLESPACE_DELETED;
1781 if (*err == DB_TABLESPACE_DELETED
1782 || NULL != buf_page_hash_get(space, offset)) {
1784 /* The page belongs to a space which has been
1785 deleted or is being deleted, or the page is
1786 already in buf_pool, return */
1788 mutex_exit(&block->mutex);
1789 mutex_exit(&(buf_pool->mutex));
1791 buf_block_free(block);
1793 if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1795 mtr_commit(&mtr);
1798 return(NULL);
1801 ut_ad(block);
1803 buf_page_init(space, offset, block);
1805 /* The block must be put to the LRU list, to the old blocks */
1807 buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */
1809 block->io_fix = BUF_IO_READ;
1811 buf_pool->n_pend_reads++;
1813 /* We set a pass-type x-lock on the frame because then the same
1814 thread which called for the read operation (and is running now at
1815 this point of code) can wait for the read to complete by waiting
1816 for the x-lock on the frame; if the x-lock were recursive, the
1817 same thread would illegally get the x-lock before the page read
1818 is completed. The x-lock is cleared by the io-handler thread. */
1820 rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);
1822 mutex_exit(&block->mutex);
1823 mutex_exit(&(buf_pool->mutex));
1825 if (mode == BUF_READ_IBUF_PAGES_ONLY) {
1827 mtr_commit(&mtr);
1830 return(block);
1833 /************************************************************************
1834 Initializes a page to the buffer buf_pool. The page is usually not read
1835 from a file even if it cannot be found in the buffer buf_pool. This is one
1836 of the functions which perform to a block a state transition NOT_USED =>
1837 FILE_PAGE (the other is buf_page_init_for_read above). */
1839 buf_frame_t*
1840 buf_page_create(
1841 /*============*/
1842 /* out: pointer to the frame, page bufferfixed */
1843 ulint space, /* in: space id */
1844 ulint offset, /* in: offset of the page within space in units of
1845 a page */
1846 mtr_t* mtr) /* in: mini-transaction handle */
1848 buf_frame_t* frame;
1849 buf_block_t* block;
1850 buf_block_t* free_block = NULL;
1852 ut_ad(mtr);
1854 free_block = buf_LRU_get_free_block();
1856 mutex_enter(&(buf_pool->mutex));
1858 block = buf_page_hash_get(space, offset);
1860 if (block != NULL) {
1861 #ifdef UNIV_IBUF_DEBUG
1862 ut_a(ibuf_count_get(block->space, block->offset) == 0);
1863 #endif
1864 block->file_page_was_freed = FALSE;
1866 /* Page can be found in buf_pool */
1867 mutex_exit(&(buf_pool->mutex));
1869 buf_block_free(free_block);
1871 frame = buf_page_get_with_no_latch(space, offset, mtr);
1873 return(frame);
1876 /* If we get here, the page was not in buf_pool: init it there */
1878 #ifdef UNIV_DEBUG
1879 if (buf_debug_prints) {
1880 fprintf(stderr, "Creating space %lu page %lu to buffer\n",
1881 (ulong) space, (ulong) offset);
1883 #endif /* UNIV_DEBUG */
1885 block = free_block;
1887 mutex_enter(&block->mutex);
1889 buf_page_init(space, offset, block);
1891 /* The block must be put to the LRU list */
1892 buf_LRU_add_block(block, FALSE);
1894 #ifdef UNIV_SYNC_DEBUG
1895 buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
1896 #else
1897 buf_block_buf_fix_inc(block);
1898 #endif
1899 buf_pool->n_pages_created++;
1901 mutex_exit(&(buf_pool->mutex));
1903 mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
1905 block->accessed = TRUE;
1907 mutex_exit(&block->mutex);
1909 /* Delete possible entries for the page from the insert buffer:
1910 such can exist if the page belonged to an index which was dropped */
1912 ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);
1914 /* Flush pages from the end of the LRU list if necessary */
1915 buf_flush_free_margin();
1917 frame = block->frame;
1919 memset(frame + FIL_PAGE_PREV, 0xff, 4);
1920 memset(frame + FIL_PAGE_NEXT, 0xff, 4);
1921 mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
1923 /* Reset to zero the file flush lsn field in the page; if the first
1924 page of an ibdata file is 'created' in this function into the buffer
1925 pool then we lose the original contents of the file flush lsn stamp.
1926 Then InnoDB could in a crash recovery print a big, false, corruption
1927 warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
1929 memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
1931 #ifdef UNIV_DEBUG
1932 buf_dbg_counter++;
1934 if (buf_dbg_counter % 357 == 0) {
1935 ut_ad(buf_validate());
1937 #endif
1938 #ifdef UNIV_IBUF_DEBUG
1939 ut_a(ibuf_count_get(block->space, block->offset) == 0);
1940 #endif
1941 return(frame);
1944 /************************************************************************
1945 Completes an asynchronous read or write request of a file page to or from
1946 the buffer pool. */
1948 void
1949 buf_page_io_complete(
1950 /*=================*/
1951 buf_block_t* block) /* in: pointer to the block in question */
1953 ulint io_type;
1955 ut_ad(block);
1957 ut_a(block->state == BUF_BLOCK_FILE_PAGE);
1959 /* We do not need protect block->io_fix here by block->mutex to read
1960 it because this is the only function where we can change the value
1961 from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
1962 ensures that this is the only thread that handles the i/o for this
1963 block. */
1965 io_type = block->io_fix;
1967 if (io_type == BUF_IO_READ) {
1968 /* If this page is not uninitialized and not in the
1969 doublewrite buffer, then the page number and space id
1970 should be the same as in block. */
1971 ulint read_page_no = mach_read_from_4(
1972 block->frame + FIL_PAGE_OFFSET);
1973 ulint read_space_id = mach_read_from_4(
1974 block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1976 if (!block->space
1977 && trx_doublewrite_page_inside(block->offset)) {
1979 ut_print_timestamp(stderr);
1980 fprintf(stderr,
1981 " InnoDB: Error: reading page %lu\n"
1982 "InnoDB: which is in the"
1983 " doublewrite buffer!\n",
1984 (ulong) block->offset);
1985 } else if (!read_space_id && !read_page_no) {
1986 /* This is likely an uninitialized page. */
1987 } else if ((block->space && block->space != read_space_id)
1988 || block->offset != read_page_no) {
1989 /* We did not compare space_id to read_space_id
1990 if block->space == 0, because the field on the
1991 page may contain garbage in MySQL < 4.1.1,
1992 which only supported block->space == 0. */
1994 ut_print_timestamp(stderr);
1995 fprintf(stderr,
1996 " InnoDB: Error: space id and page n:o"
1997 " stored in the page\n"
1998 "InnoDB: read in are %lu:%lu,"
1999 " should be %lu:%lu!\n",
2000 (ulong) read_space_id, (ulong) read_page_no,
2001 (ulong) block->space, (ulong) block->offset);
2003 /* From version 3.23.38 up we store the page checksum
2004 to the 4 first bytes of the page end lsn field */
2006 if (buf_page_is_corrupted(block->frame)) {
2007 fprintf(stderr,
2008 "InnoDB: Database page corruption on disk"
2009 " or a failed\n"
2010 "InnoDB: file read of page %lu.\n",
2011 (ulong) block->offset);
2013 fputs("InnoDB: You may have to recover"
2014 " from a backup.\n", stderr);
2016 buf_page_print(block->frame);
2018 fprintf(stderr,
2019 "InnoDB: Database page corruption on disk"
2020 " or a failed\n"
2021 "InnoDB: file read of page %lu.\n",
2022 (ulong) block->offset);
2023 fputs("InnoDB: You may have to recover"
2024 " from a backup.\n", stderr);
2025 fputs("InnoDB: It is also possible that"
2026 " your operating\n"
2027 "InnoDB: system has corrupted its"
2028 " own file cache\n"
2029 "InnoDB: and rebooting your computer"
2030 " removes the\n"
2031 "InnoDB: error.\n"
2032 "InnoDB: If the corrupt page is an index page\n"
2033 "InnoDB: you can also try to"
2034 " fix the corruption\n"
2035 "InnoDB: by dumping, dropping,"
2036 " and reimporting\n"
2037 "InnoDB: the corrupt table."
2038 " You can use CHECK\n"
2039 "InnoDB: TABLE to scan your"
2040 " table for corruption.\n"
2041 "InnoDB: See also"
2042 " http://dev.mysql.com/doc/refman/5.1/en/"
2043 "forcing-innodb-recovery.html\n"
2044 "InnoDB: about forcing recovery.\n", stderr);
2046 if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
2047 fputs("InnoDB: Ending processing because of"
2048 " a corrupt database page.\n",
2049 stderr);
2050 exit(1);
2054 if (recv_recovery_is_on()) {
2055 recv_recover_page(FALSE, TRUE, block->frame,
2056 block->space, block->offset);
2059 if (!recv_no_ibuf_operations) {
2060 ibuf_merge_or_delete_for_page(
2061 block->frame, block->space, block->offset,
2062 TRUE);
2066 mutex_enter(&(buf_pool->mutex));
2067 mutex_enter(&block->mutex);
2069 #ifdef UNIV_IBUF_DEBUG
2070 ut_a(ibuf_count_get(block->space, block->offset) == 0);
2071 #endif
2072 /* Because this thread which does the unlocking is not the same that
2073 did the locking, we use a pass value != 0 in unlock, which simply
2074 removes the newest lock debug record, without checking the thread
2075 id. */
2077 block->io_fix = 0;
2079 if (io_type == BUF_IO_READ) {
2080 /* NOTE that the call to ibuf may have moved the ownership of
2081 the x-latch to this OS thread: do not let this confuse you in
2082 debugging! */
2084 ut_ad(buf_pool->n_pend_reads > 0);
2085 buf_pool->n_pend_reads--;
2086 buf_pool->n_pages_read++;
2088 rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ);
2090 #ifdef UNIV_DEBUG
2091 if (buf_debug_prints) {
2092 fputs("Has read ", stderr);
2094 #endif /* UNIV_DEBUG */
2095 } else {
2096 ut_ad(io_type == BUF_IO_WRITE);
2098 /* Write means a flush operation: call the completion
2099 routine in the flush system */
2101 buf_flush_write_complete(block);
2103 rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);
2105 buf_pool->n_pages_written++;
2107 #ifdef UNIV_DEBUG
2108 if (buf_debug_prints) {
2109 fputs("Has written ", stderr);
2111 #endif /* UNIV_DEBUG */
2114 mutex_exit(&block->mutex);
2115 mutex_exit(&(buf_pool->mutex));
2117 #ifdef UNIV_DEBUG
2118 if (buf_debug_prints) {
2119 fprintf(stderr, "page space %lu page no %lu\n",
2120 (ulong) block->space, (ulong) block->offset);
2122 #endif /* UNIV_DEBUG */
2125 /*************************************************************************
2126 Invalidates the file pages in the buffer pool when an archive recovery is
2127 completed. All the file pages buffered must be in a replaceable state when
2128 this function is called: not latched and not modified. */
2130 void
2131 buf_pool_invalidate(void)
2132 /*=====================*/
2134 ibool freed;
2136 ut_ad(buf_all_freed());
2138 freed = TRUE;
2140 while (freed) {
2141 freed = buf_LRU_search_and_free_block(100);
2144 mutex_enter(&(buf_pool->mutex));
2146 ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
2148 mutex_exit(&(buf_pool->mutex));
2151 #ifdef UNIV_DEBUG
2152 /*************************************************************************
2153 Validates the buffer buf_pool data structure. */
2155 ibool
2156 buf_validate(void)
2157 /*==============*/
2159 buf_block_t* block;
2160 ulint i;
2161 ulint n_single_flush = 0;
2162 ulint n_lru_flush = 0;
2163 ulint n_list_flush = 0;
2164 ulint n_lru = 0;
2165 ulint n_flush = 0;
2166 ulint n_free = 0;
2167 ulint n_page = 0;
2169 ut_ad(buf_pool);
2171 mutex_enter(&(buf_pool->mutex));
2173 for (i = 0; i < buf_pool->curr_size; i++) {
2175 block = buf_pool_get_nth_block(buf_pool, i);
2177 mutex_enter(&block->mutex);
2179 if (block->state == BUF_BLOCK_FILE_PAGE) {
2181 ut_a(buf_page_hash_get(block->space,
2182 block->offset) == block);
2183 n_page++;
2185 #ifdef UNIV_IBUF_DEBUG
2186 ut_a((block->io_fix == BUF_IO_READ)
2187 || ibuf_count_get(block->space, block->offset)
2188 == 0);
2189 #endif
2190 if (block->io_fix == BUF_IO_WRITE) {
2192 if (block->flush_type == BUF_FLUSH_LRU) {
2193 n_lru_flush++;
2194 ut_a(rw_lock_is_locked(
2195 &block->lock,
2196 RW_LOCK_SHARED));
2197 } else if (block->flush_type
2198 == BUF_FLUSH_LIST) {
2199 n_list_flush++;
2200 } else if (block->flush_type
2201 == BUF_FLUSH_SINGLE_PAGE) {
2202 n_single_flush++;
2203 } else {
2204 ut_error;
2207 } else if (block->io_fix == BUF_IO_READ) {
2209 ut_a(rw_lock_is_locked(&(block->lock),
2210 RW_LOCK_EX));
2213 n_lru++;
2215 if (ut_dulint_cmp(block->oldest_modification,
2216 ut_dulint_zero) > 0) {
2217 n_flush++;
2220 } else if (block->state == BUF_BLOCK_NOT_USED) {
2221 n_free++;
2224 mutex_exit(&block->mutex);
2227 if (n_lru + n_free > buf_pool->curr_size) {
2228 fprintf(stderr, "n LRU %lu, n free %lu\n",
2229 (ulong) n_lru, (ulong) n_free);
2230 ut_error;
2233 ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
2234 if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
2235 fprintf(stderr, "Free list len %lu, free blocks %lu\n",
2236 (ulong) UT_LIST_GET_LEN(buf_pool->free),
2237 (ulong) n_free);
2238 ut_error;
2240 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
2242 ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
2243 ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
2244 ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
2246 mutex_exit(&(buf_pool->mutex));
2248 ut_a(buf_LRU_validate());
2249 ut_a(buf_flush_validate());
2251 return(TRUE);
2254 /*************************************************************************
2255 Prints info of the buffer buf_pool data structure. */
2257 void
2258 buf_print(void)
2259 /*===========*/
2261 dulint* index_ids;
2262 ulint* counts;
2263 ulint size;
2264 ulint i;
2265 ulint j;
2266 dulint id;
2267 ulint n_found;
2268 buf_frame_t* frame;
2269 dict_index_t* index;
2271 ut_ad(buf_pool);
2273 size = buf_pool->curr_size;
2275 index_ids = mem_alloc(sizeof(dulint) * size);
2276 counts = mem_alloc(sizeof(ulint) * size);
2278 mutex_enter(&(buf_pool->mutex));
2280 fprintf(stderr,
2281 "buf_pool size %lu\n"
2282 "database pages %lu\n"
2283 "free pages %lu\n"
2284 "modified database pages %lu\n"
2285 "n pending reads %lu\n"
2286 "n pending flush LRU %lu list %lu single page %lu\n"
2287 "pages read %lu, created %lu, written %lu\n",
2288 (ulong) size,
2289 (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
2290 (ulong) UT_LIST_GET_LEN(buf_pool->free),
2291 (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
2292 (ulong) buf_pool->n_pend_reads,
2293 (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
2294 (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
2295 (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
2296 (ulong) buf_pool->n_pages_read, buf_pool->n_pages_created,
2297 (ulong) buf_pool->n_pages_written);
2299 /* Count the number of blocks belonging to each index in the buffer */
2301 n_found = 0;
2303 for (i = 0; i < size; i++) {
2304 frame = buf_pool_get_nth_block(buf_pool, i)->frame;
2306 if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
2308 id = btr_page_get_index_id(frame);
2310 /* Look for the id in the index_ids array */
2311 j = 0;
2313 while (j < n_found) {
2315 if (ut_dulint_cmp(index_ids[j], id) == 0) {
2316 (counts[j])++;
2318 break;
2320 j++;
2323 if (j == n_found) {
2324 n_found++;
2325 index_ids[j] = id;
2326 counts[j] = 1;
2331 mutex_exit(&(buf_pool->mutex));
2333 for (i = 0; i < n_found; i++) {
2334 index = dict_index_get_if_in_cache(index_ids[i]);
2336 fprintf(stderr,
2337 "Block count for index %lu in buffer is about %lu",
2338 (ulong) ut_dulint_get_low(index_ids[i]),
2339 (ulong) counts[i]);
2341 if (index) {
2342 putc(' ', stderr);
2343 dict_index_name_print(stderr, NULL, index);
2346 putc('\n', stderr);
2349 mem_free(index_ids);
2350 mem_free(counts);
2352 ut_a(buf_validate());
2355 /*************************************************************************
2356 Returns the number of latched pages in the buffer pool. */
2358 ulint
2359 buf_get_latched_pages_number(void)
2361 buf_block_t* block;
2362 ulint i;
2363 ulint fixed_pages_number = 0;
2365 mutex_enter(&(buf_pool->mutex));
2367 for (i = 0; i < buf_pool->curr_size; i++) {
2369 block = buf_pool_get_nth_block(buf_pool, i);
2371 if (block->magic_n == BUF_BLOCK_MAGIC_N) {
2372 mutex_enter(&block->mutex);
2374 if (block->buf_fix_count != 0 || block->io_fix != 0) {
2375 fixed_pages_number++;
2378 mutex_exit(&block->mutex);
2382 mutex_exit(&(buf_pool->mutex));
2384 return(fixed_pages_number);
2386 #endif /* UNIV_DEBUG */
2388 /*************************************************************************
2389 Returns the number of pending buf pool ios. */
2391 ulint
2392 buf_get_n_pending_ios(void)
2393 /*=======================*/
2395 return(buf_pool->n_pend_reads
2396 + buf_pool->n_flush[BUF_FLUSH_LRU]
2397 + buf_pool->n_flush[BUF_FLUSH_LIST]
2398 + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
2401 /*************************************************************************
2402 Returns the ratio in percents of modified pages in the buffer pool /
2403 database pages in the buffer pool. */
2405 ulint
2406 buf_get_modified_ratio_pct(void)
2407 /*============================*/
2409 ulint ratio;
2411 mutex_enter(&(buf_pool->mutex));
2413 ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
2414 / (1 + UT_LIST_GET_LEN(buf_pool->LRU)
2415 + UT_LIST_GET_LEN(buf_pool->free));
2417 /* 1 + is there to avoid division by zero */
2419 mutex_exit(&(buf_pool->mutex));
2421 return(ratio);
2424 /*************************************************************************
2425 Prints info of the buffer i/o. */
2427 void
2428 buf_print_io(
2429 /*=========*/
2430 FILE* file) /* in/out: buffer where to print */
2432 time_t current_time;
2433 double time_elapsed;
2434 ulint size;
2436 ut_ad(buf_pool);
2437 size = buf_pool->curr_size;
2439 mutex_enter(&(buf_pool->mutex));
2441 if (srv_use_awe) {
2442 fprintf(stderr,
2443 "AWE: Buffer pool memory frames %lu\n",
2444 (ulong) buf_pool->n_frames);
2446 fprintf(stderr,
2447 "AWE: Database pages and free buffers"
2448 " mapped in frames %lu\n",
2449 (ulong)
2450 UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
2452 fprintf(file,
2453 "Buffer pool size %lu\n"
2454 "Free buffers %lu\n"
2455 "Database pages %lu\n"
2456 "Modified db pages %lu\n"
2457 "Pending reads %lu\n"
2458 "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
2459 (ulong) size,
2460 (ulong) UT_LIST_GET_LEN(buf_pool->free),
2461 (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
2462 (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
2463 (ulong) buf_pool->n_pend_reads,
2464 (ulong) buf_pool->n_flush[BUF_FLUSH_LRU]
2465 + buf_pool->init_flush[BUF_FLUSH_LRU],
2466 (ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
2467 + buf_pool->init_flush[BUF_FLUSH_LIST],
2468 (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
2470 current_time = time(NULL);
2471 time_elapsed = 0.001 + difftime(current_time,
2472 buf_pool->last_printout_time);
2473 buf_pool->last_printout_time = current_time;
2475 fprintf(file,
2476 "Pages read %lu, created %lu, written %lu\n"
2477 "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
2478 (ulong) buf_pool->n_pages_read,
2479 (ulong) buf_pool->n_pages_created,
2480 (ulong) buf_pool->n_pages_written,
2481 (buf_pool->n_pages_read - buf_pool->n_pages_read_old)
2482 / time_elapsed,
2483 (buf_pool->n_pages_created - buf_pool->n_pages_created_old)
2484 / time_elapsed,
2485 (buf_pool->n_pages_written - buf_pool->n_pages_written_old)
2486 / time_elapsed);
2488 if (srv_use_awe) {
2489 fprintf(file, "AWE: %.2f page remaps/s\n",
2490 (buf_pool->n_pages_awe_remapped
2491 - buf_pool->n_pages_awe_remapped_old)
2492 / time_elapsed);
2495 if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) {
2496 fprintf(file, "Buffer pool hit rate %lu / 1000\n",
2497 (ulong)
2498 (1000 - ((1000 * (buf_pool->n_pages_read
2499 - buf_pool->n_pages_read_old))
2500 / (buf_pool->n_page_gets
2501 - buf_pool->n_page_gets_old))));
2502 } else {
2503 fputs("No buffer pool page gets since the last printout\n",
2504 file);
2507 buf_pool->n_page_gets_old = buf_pool->n_page_gets;
2508 buf_pool->n_pages_read_old = buf_pool->n_pages_read;
2509 buf_pool->n_pages_created_old = buf_pool->n_pages_created;
2510 buf_pool->n_pages_written_old = buf_pool->n_pages_written;
2511 buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
2513 mutex_exit(&(buf_pool->mutex));
2516 /**************************************************************************
2517 Refreshes the statistics used to print per-second averages. */
2519 void
2520 buf_refresh_io_stats(void)
2521 /*======================*/
2523 buf_pool->last_printout_time = time(NULL);
2524 buf_pool->n_page_gets_old = buf_pool->n_page_gets;
2525 buf_pool->n_pages_read_old = buf_pool->n_pages_read;
2526 buf_pool->n_pages_created_old = buf_pool->n_pages_created;
2527 buf_pool->n_pages_written_old = buf_pool->n_pages_written;
2528 buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
2531 /*************************************************************************
2532 Checks that all file pages in the buffer are in a replaceable state. */
2534 ibool
2535 buf_all_freed(void)
2536 /*===============*/
2538 buf_block_t* block;
2539 ulint i;
2541 ut_ad(buf_pool);
2543 mutex_enter(&(buf_pool->mutex));
2545 for (i = 0; i < buf_pool->curr_size; i++) {
2547 block = buf_pool_get_nth_block(buf_pool, i);
2549 mutex_enter(&block->mutex);
2551 if (block->state == BUF_BLOCK_FILE_PAGE) {
2553 if (!buf_flush_ready_for_replace(block)) {
2555 fprintf(stderr,
2556 "Page %lu %lu still fixed or dirty\n",
2557 (ulong) block->space,
2558 (ulong) block->offset);
2559 ut_error;
2563 mutex_exit(&block->mutex);
2566 mutex_exit(&(buf_pool->mutex));
2568 return(TRUE);
2571 /*************************************************************************
2572 Checks that there currently are no pending i/o-operations for the buffer
2573 pool. */
2575 ibool
2576 buf_pool_check_no_pending_io(void)
2577 /*==============================*/
2578 /* out: TRUE if there is no pending i/o */
2580 ibool ret;
2582 mutex_enter(&(buf_pool->mutex));
2584 if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU]
2585 + buf_pool->n_flush[BUF_FLUSH_LIST]
2586 + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) {
2587 ret = FALSE;
2588 } else {
2589 ret = TRUE;
2592 mutex_exit(&(buf_pool->mutex));
2594 return(ret);
2597 /*************************************************************************
2598 Gets the current length of the free list of buffer blocks. */
2600 ulint
2601 buf_get_free_list_len(void)
2602 /*=======================*/
2604 ulint len;
2606 mutex_enter(&(buf_pool->mutex));
2608 len = UT_LIST_GET_LEN(buf_pool->free);
2610 mutex_exit(&(buf_pool->mutex));
2612 return(len);