MySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / innodb_plugin / row / row0sel.c
blob76370a38a52251efa1cd989536d32efd75b0d7c8
1 /*****************************************************************************
3 Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
12 This program is free software; you can redistribute it and/or modify it under
13 the terms of the GNU General Public License as published by the Free Software
14 Foundation; version 2 of the License.
16 This program is distributed in the hope that it will be useful, but WITHOUT
17 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc.,
22 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 *****************************************************************************/
26 /***************************************************//**
27 @file row/row0sel.c
28 Select
30 Created 12/19/1997 Heikki Tuuri
31 *******************************************************/
33 #include "row0sel.h"
35 #ifdef UNIV_NONINL
36 #include "row0sel.ic"
37 #endif
39 #include "dict0dict.h"
40 #include "dict0boot.h"
41 #include "trx0undo.h"
42 #include "trx0trx.h"
43 #include "btr0btr.h"
44 #include "btr0cur.h"
45 #include "btr0sea.h"
46 #include "mach0data.h"
47 #include "que0que.h"
48 #include "row0upd.h"
49 #include "row0row.h"
50 #include "row0vers.h"
51 #include "rem0cmp.h"
52 #include "lock0lock.h"
53 #include "eval0eval.h"
54 #include "pars0sym.h"
55 #include "pars0pars.h"
56 #include "row0mysql.h"
57 #include "read0read.h"
58 #include "buf0lru.h"
59 #include "ha_prototypes.h"
60 #ifdef __WIN__
61 /* error LNK2001: unresolved external symbol _debug_sync_C_callback_ptr */
62 # define DEBUG_SYNC_C(dummy) ((void) 0)
63 #else
64 #include "m_string.h" /* for my_sys.h */
65 #include "my_sys.h" /* DEBUG_SYNC_C */
66 #endif
68 /* Maximum number of rows to prefetch; MySQL interface has another parameter */
69 #define SEL_MAX_N_PREFETCH 16
71 /* Number of rows fetched, after which to start prefetching; MySQL interface
72 has another parameter */
73 #define SEL_PREFETCH_LIMIT 1
75 /* When a select has accessed about this many pages, it returns control back
76 to que_run_threads: this is to allow canceling runaway queries */
78 #define SEL_COST_LIMIT 100
80 /* Flags for search shortcut */
81 #define SEL_FOUND 0
82 #define SEL_EXHAUSTED 1
83 #define SEL_RETRY 2
85 /********************************************************************//**
86 Returns TRUE if the user-defined column in a secondary index record
87 is alphabetically the same as the corresponding BLOB column in the clustered
88 index record.
89 NOTE: the comparison is NOT done as a binary comparison, but character
90 fields are compared with collation!
91 @return TRUE if the columns are equal */
92 static
93 ibool
94 row_sel_sec_rec_is_for_blob(
95 /*========================*/
96 ulint mtype, /*!< in: main type */
97 ulint prtype, /*!< in: precise type */
98 ulint mbminlen, /*!< in: minimum length of a
99 multi-byte character */
100 ulint mbmaxlen, /*!< in: maximum length of a
101 multi-byte character */
102 const byte* clust_field, /*!< in: the locally stored part of
103 the clustered index column, including
104 the BLOB pointer; the clustered
105 index record must be covered by
106 a lock or a page latch to protect it
107 against deletion (rollback or purge) */
108 ulint clust_len, /*!< in: length of clust_field */
109 const byte* sec_field, /*!< in: column in secondary index */
110 ulint sec_len, /*!< in: length of sec_field */
111 ulint prefix_len, /*!< in: index column prefix length
112 in bytes */
113 ulint zip_size) /*!< in: compressed page size, or 0 */
115 ulint len;
116 byte buf[DICT_MAX_INDEX_COL_LEN];
118 ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
119 ut_ad(prefix_len >= sec_len);
120 ut_ad(prefix_len > 0);
121 ut_a(prefix_len <= sizeof buf);
123 if (UNIV_UNLIKELY
124 (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
125 field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
126 /* The externally stored field was not written yet.
127 This record should only be seen by
128 recv_recovery_rollback_active() or any
129 TRX_ISO_READ_UNCOMMITTED transactions. */
130 return(FALSE);
133 len = btr_copy_externally_stored_field_prefix(buf, prefix_len,
134 zip_size,
135 clust_field, clust_len);
137 if (UNIV_UNLIKELY(len == 0)) {
138 /* The BLOB was being deleted as the server crashed.
139 There should not be any secondary index records
140 referring to this clustered index record, because
141 btr_free_externally_stored_field() is called after all
142 secondary index entries of the row have been purged. */
143 return(FALSE);
146 len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
147 prefix_len, len, (const char*) buf);
149 return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
/********************************************************************//**
Returns TRUE if the user-defined column values in a secondary index record
are alphabetically the same as the corresponding columns in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the secondary record is equal to the corresponding
fields in the clustered record, when compared with collation;
FALSE if not equal or if the clustered record has been marked for deletion */
static
ibool
row_sel_sec_rec_is_for_clust_rec(
/*=============================*/
	const rec_t*	sec_rec,	/*!< in: secondary index record */
	dict_index_t*	sec_index,	/*!< in: secondary index */
	const rec_t*	clust_rec,	/*!< in: clustered index record;
					must be protected by a lock or
					a page latch against deletion
					in rollback or purge */
	dict_index_t*	clust_index)	/*!< in: clustered index */
{
	const byte*	sec_field;
	ulint		sec_len;
	const byte*	clust_field;
	ulint		n;
	ulint		i;
	mem_heap_t*	heap		= NULL;
	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
	ulint*		clust_offs	= clust_offsets_;
	ulint*		sec_offs	= sec_offsets_;
	ibool		is_equal	= TRUE;

	rec_offs_init(clust_offsets_);
	rec_offs_init(sec_offsets_);

	if (rec_get_deleted_flag(clust_rec,
				 dict_table_is_comp(clust_index->table))) {

		/* The clustered index record is delete-marked;
		it is not visible in the read view.  Besides,
		if there are any externally stored columns,
		some of them may have already been purged. */
		return(FALSE);
	}

	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
				     ULINT_UNDEFINED, &heap);
	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
				   ULINT_UNDEFINED, &heap);

	/* Compare only the columns the user defined in the secondary
	index, not the internally appended clustered key columns. */
	n = dict_index_get_n_ordering_defined_by_user(sec_index);

	for (i = 0; i < n; i++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			clust_pos;
		ulint			clust_len;
		ulint			len;

		ifield = dict_index_get_nth_field(sec_index, i);
		col = dict_field_get_col(ifield);
		clust_pos = dict_col_get_clust_pos(col, clust_index);

		clust_field = rec_get_nth_field(
			clust_rec, clust_offs, clust_pos, &clust_len);
		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);

		len = clust_len;

		if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
		    && sec_len != UNIV_SQL_NULL) {

			/* The secondary index stores only a column
			prefix: truncate the clustered value to the
			same prefix before comparing. */

			if (rec_offs_nth_extern(clust_offs, clust_pos)) {
				/* Exclude the trailing BLOB pointer
				from the locally stored part. */
				len -= BTR_EXTERN_FIELD_REF_SIZE;
			}

			len = dtype_get_at_most_n_mbchars(
				col->prtype, col->mbminlen, col->mbmaxlen,
				ifield->prefix_len, len, (char*) clust_field);

			if (rec_offs_nth_extern(clust_offs, clust_pos)
			    && len < sec_len) {
				/* The locally stored prefix is shorter
				than the secondary index value: the rest
				must be fetched from the external BLOB.
				This function should never be
				invoked on an Antelope format table,
				because they should always contain
				enough prefix in the clustered index
				record. */
				ut_ad(dict_table_get_format(clust_index->table)
				      >= DICT_TF_FORMAT_ZIP);

				if (!row_sel_sec_rec_is_for_blob(
					    col->mtype, col->prtype,
					    col->mbminlen, col->mbmaxlen,
					    clust_field, clust_len,
					    sec_field, sec_len,
					    ifield->prefix_len,
					    dict_table_zip_size(
						    clust_index->table))) {
					goto inequal;
				}

				continue;
			}
		}

		if (0 != cmp_data_data(col->mtype, col->prtype,
				       clust_field, len,
				       sec_field, sec_len)) {
inequal:
			is_equal = FALSE;
			goto func_exit;
		}
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(is_equal);
}
274 /*********************************************************************//**
275 Creates a select node struct.
276 @return own: select node struct */
277 UNIV_INTERN
278 sel_node_t*
279 sel_node_create(
280 /*============*/
281 mem_heap_t* heap) /*!< in: memory heap where created */
283 sel_node_t* node;
285 node = mem_heap_alloc(heap, sizeof(sel_node_t));
286 node->common.type = QUE_NODE_SELECT;
287 node->state = SEL_NODE_OPEN;
289 node->plans = NULL;
291 return(node);
294 /*********************************************************************//**
295 Frees the memory private to a select node when a query graph is freed,
296 does not free the heap where the node was originally created. */
297 UNIV_INTERN
298 void
299 sel_node_free_private(
300 /*==================*/
301 sel_node_t* node) /*!< in: select node struct */
303 ulint i;
304 plan_t* plan;
306 if (node->plans != NULL) {
307 for (i = 0; i < node->n_tables; i++) {
308 plan = sel_node_get_nth_plan(node, i);
310 btr_pcur_close(&(plan->pcur));
311 btr_pcur_close(&(plan->clust_pcur));
313 if (plan->old_vers_heap) {
314 mem_heap_free(plan->old_vers_heap);
320 /*********************************************************************//**
321 Evaluates the values in a select list. If there are aggregate functions,
322 their argument value is added to the aggregate total. */
323 UNIV_INLINE
324 void
325 sel_eval_select_list(
326 /*=================*/
327 sel_node_t* node) /*!< in: select node */
329 que_node_t* exp;
331 exp = node->select_list;
333 while (exp) {
334 eval_exp(exp);
336 exp = que_node_get_next(exp);
340 /*********************************************************************//**
341 Assigns the values in the select list to the possible into-variables in
342 SELECT ... INTO ... */
343 UNIV_INLINE
344 void
345 sel_assign_into_var_values(
346 /*=======================*/
347 sym_node_t* var, /*!< in: first variable in a list of variables */
348 sel_node_t* node) /*!< in: select node */
350 que_node_t* exp;
352 if (var == NULL) {
354 return;
357 exp = node->select_list;
359 while (var) {
360 ut_ad(exp);
362 eval_node_copy_val(var->alias, exp);
364 exp = que_node_get_next(exp);
365 var = que_node_get_next(var);
369 /*********************************************************************//**
370 Resets the aggregate value totals in the select list of an aggregate type
371 query. */
372 UNIV_INLINE
373 void
374 sel_reset_aggregate_vals(
375 /*=====================*/
376 sel_node_t* node) /*!< in: select node */
378 func_node_t* func_node;
380 ut_ad(node->is_aggregate);
382 func_node = node->select_list;
384 while (func_node) {
385 eval_node_set_int_val(func_node, 0);
387 func_node = que_node_get_next(func_node);
390 node->aggregate_already_fetched = FALSE;
393 /*********************************************************************//**
394 Copies the input variable values when an explicit cursor is opened. */
395 UNIV_INLINE
396 void
397 row_sel_copy_input_variable_vals(
398 /*=============================*/
399 sel_node_t* node) /*!< in: select node */
401 sym_node_t* var;
403 var = UT_LIST_GET_FIRST(node->copy_variables);
405 while (var) {
406 eval_node_copy_val(var, var->alias);
408 var->indirection = NULL;
410 var = UT_LIST_GET_NEXT(col_var_list, var);
/*********************************************************************//**
Fetches the column values from a record. */
static
void
row_sel_fetch_columns(
/*==================*/
	dict_index_t*	index,	/*!< in: record index */
	const rec_t*	rec,	/*!< in: record in a clustered or non-clustered
				index; must be protected by a page latch */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	sym_node_t*	column)	/*!< in: first column in a column list, or
				NULL */
{
	dfield_t*	val;
	ulint		index_type;
	ulint		field_no;
	const byte*	data;
	ulint		len;

	ut_ad(rec_offs_validate(rec, index, offsets));

	/* A symbol node stores its position in the clustered index and
	in secondary indexes separately: pick the right lookup table. */
	if (dict_index_is_clust(index)) {
		index_type = SYM_CLUST_FIELD_NO;
	} else {
		index_type = SYM_SEC_FIELD_NO;
	}

	while (column) {
		mem_heap_t*	heap = NULL;
		ibool		needs_copy;

		field_no = column->field_nos[index_type];

		/* ULINT_UNDEFINED means this column does not occur in
		this index; skip it. */
		if (field_no != ULINT_UNDEFINED) {

			if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
							      field_no))) {

				/* Copy an externally stored field to the
				temporary heap, if possible. */

				heap = mem_heap_create(1);

				data = btr_rec_copy_externally_stored_field(
					rec, offsets,
					dict_table_zip_size(index->table),
					field_no, &len, heap);

				/* data == NULL means that the
				externally stored field was not
				written yet. This record
				should only be seen by
				recv_recovery_rollback_active() or any
				TRX_ISO_READ_UNCOMMITTED
				transactions. The InnoDB SQL parser
				(the sole caller of this function)
				does not implement READ UNCOMMITTED,
				and it is not involved during rollback. */
				ut_a(data);
				ut_a(len != UNIV_SQL_NULL);

				/* The heap-allocated copy must be
				duplicated into the node before the
				heap is freed below. */
				needs_copy = TRUE;
			} else {
				data = rec_get_nth_field(rec, offsets,
							 field_no, &len);

				needs_copy = column->copy_val;
			}

			if (needs_copy) {
				eval_node_copy_and_alloc_val(column, data,
							     len);
			} else {
				/* Point directly into the page; valid
				only while the page latch is held. */
				val = que_node_get_val(column);
				dfield_set_data(val, data, len);
			}

			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
		}

		column = UT_LIST_GET_NEXT(col_var_list, column);
	}
}
500 /*********************************************************************//**
501 Allocates a prefetch buffer for a column when prefetch is first time done. */
502 static
503 void
504 sel_col_prefetch_buf_alloc(
505 /*=======================*/
506 sym_node_t* column) /*!< in: symbol table node for a column */
508 sel_buf_t* sel_buf;
509 ulint i;
511 ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
513 column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
514 * sizeof(sel_buf_t));
515 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
516 sel_buf = column->prefetch_buf + i;
518 sel_buf->data = NULL;
519 sel_buf->len = 0;
520 sel_buf->val_buf_size = 0;
524 /*********************************************************************//**
525 Frees a prefetch buffer for a column, including the dynamically allocated
526 memory for data stored there. */
527 UNIV_INTERN
528 void
529 sel_col_prefetch_buf_free(
530 /*======================*/
531 sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
533 sel_buf_t* sel_buf;
534 ulint i;
536 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
537 sel_buf = prefetch_buf + i;
539 if (sel_buf->val_buf_size > 0) {
541 mem_free(sel_buf->data);
545 mem_free(prefetch_buf);
/*********************************************************************//**
Pops the column values for a prefetched, cached row from the column prefetch
buffers and places them to the val fields in the column nodes. */
static
void
sel_pop_prefetched_row(
/*===================*/
	plan_t*	plan)	/*!< in: plan node for a table */
{
	sym_node_t*	column;
	sel_buf_t*	sel_buf;
	dfield_t*	val;
	byte*		data;
	ulint		len;
	ulint		val_buf_size;

	ut_ad(plan->n_rows_prefetched > 0);

	column = UT_LIST_GET_FIRST(plan->columns);

	while (column) {
		val = que_node_get_val(column);

		if (!column->copy_val) {
			/* We did not really push any value for the
			column */

			ut_ad(!column->prefetch_buf);
			ut_ad(que_node_get_val_buf_size(column) == 0);
			ut_d(dfield_set_null(val));

			goto next_col;
		}

		ut_ad(column->prefetch_buf);
		ut_ad(!dfield_is_ext(val));

		/* first_prefetched indexes the oldest unread row in
		the ring of prefetched values. */
		sel_buf = column->prefetch_buf + plan->first_prefetched;

		data = sel_buf->data;
		len = sel_buf->len;
		val_buf_size = sel_buf->val_buf_size;

		/* We must keep track of the allocated memory for
		column values to be able to free it later: therefore
		we swap the values for sel_buf and val */

		sel_buf->data = dfield_get_data(val);
		sel_buf->len = dfield_get_len(val);
		sel_buf->val_buf_size = que_node_get_val_buf_size(column);

		dfield_set_data(val, data, len);
		que_node_set_val_buf_size(column, val_buf_size);
next_col:
		column = UT_LIST_GET_NEXT(col_var_list, column);
	}

	/* One fewer row remains cached; advance to the next slot. */
	plan->n_rows_prefetched--;

	plan->first_prefetched++;
}
/*********************************************************************//**
Pushes the column values for a prefetched, cached row to the column prefetch
buffers from the val fields in the column nodes. */
UNIV_INLINE
void
sel_push_prefetched_row(
/*====================*/
	plan_t*	plan)	/*!< in: plan node for a table */
{
	sym_node_t*	column;
	sel_buf_t*	sel_buf;
	dfield_t*	val;
	byte*		data;
	ulint		len;
	ulint		pos;
	ulint		val_buf_size;

	if (plan->n_rows_prefetched == 0) {
		pos = 0;
		plan->first_prefetched = 0;
	} else {
		pos = plan->n_rows_prefetched;

		/* We have the convention that pushing new rows starts only
		after the prefetch stack has been emptied: */

		ut_ad(plan->first_prefetched == 0);
	}

	plan->n_rows_prefetched++;

	ut_ad(pos < SEL_MAX_N_PREFETCH);

	column = UT_LIST_GET_FIRST(plan->columns);

	while (column) {
		if (!column->copy_val) {
			/* There is no sense to push pointers to database
			page fields when we do not keep latch on the page! */

			goto next_col;
		}

		if (!column->prefetch_buf) {
			/* Allocate a new prefetch buffer */

			sel_col_prefetch_buf_alloc(column);
		}

		sel_buf = column->prefetch_buf + pos;

		val = que_node_get_val(column);

		data = dfield_get_data(val);
		len = dfield_get_len(val);
		val_buf_size = que_node_get_val_buf_size(column);

		/* We must keep track of the allocated memory for
		column values to be able to free it later: therefore
		we swap the values for sel_buf and val */

		dfield_set_data(val, sel_buf->data, sel_buf->len);
		que_node_set_val_buf_size(column, sel_buf->val_buf_size);

		sel_buf->data = data;
		sel_buf->len = len;
		sel_buf->val_buf_size = val_buf_size;
next_col:
		column = UT_LIST_GET_NEXT(col_var_list, column);
	}
}
682 /*********************************************************************//**
683 Builds a previous version of a clustered index record for a consistent read
684 @return DB_SUCCESS or error code */
685 static
686 ulint
687 row_sel_build_prev_vers(
688 /*====================*/
689 read_view_t* read_view, /*!< in: read view */
690 dict_index_t* index, /*!< in: plan node for table */
691 rec_t* rec, /*!< in: record in a clustered index */
692 ulint** offsets, /*!< in/out: offsets returned by
693 rec_get_offsets(rec, plan->index) */
694 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
695 the offsets are allocated */
696 mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
697 rec_t** old_vers, /*!< out: old version, or NULL if the
698 record does not exist in the view:
699 i.e., it was freshly inserted
700 afterwards */
701 mtr_t* mtr) /*!< in: mtr */
703 ulint err;
705 if (*old_vers_heap) {
706 mem_heap_empty(*old_vers_heap);
707 } else {
708 *old_vers_heap = mem_heap_create(512);
711 err = row_vers_build_for_consistent_read(
712 rec, mtr, index, offsets, read_view, offset_heap,
713 *old_vers_heap, old_vers);
714 return(err);
717 /*********************************************************************//**
718 Builds the last committed version of a clustered index record for a
719 semi-consistent read.
720 @return DB_SUCCESS or error code */
721 static
722 ulint
723 row_sel_build_committed_vers_for_mysql(
724 /*===================================*/
725 dict_index_t* clust_index, /*!< in: clustered index */
726 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
727 const rec_t* rec, /*!< in: record in a clustered index */
728 ulint** offsets, /*!< in/out: offsets returned by
729 rec_get_offsets(rec, clust_index) */
730 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
731 the offsets are allocated */
732 const rec_t** old_vers, /*!< out: old version, or NULL if the
733 record does not exist in the view:
734 i.e., it was freshly inserted
735 afterwards */
736 mtr_t* mtr) /*!< in: mtr */
738 ulint err;
740 if (prebuilt->old_vers_heap) {
741 mem_heap_empty(prebuilt->old_vers_heap);
742 } else {
743 prebuilt->old_vers_heap = mem_heap_create(200);
746 err = row_vers_build_for_semi_consistent_read(
747 rec, mtr, clust_index, offsets, offset_heap,
748 prebuilt->old_vers_heap, old_vers);
749 return(err);
752 /*********************************************************************//**
753 Tests the conditions which determine when the index segment we are searching
754 through has been exhausted.
755 @return TRUE if row passed the tests */
756 UNIV_INLINE
757 ibool
758 row_sel_test_end_conds(
759 /*===================*/
760 plan_t* plan) /*!< in: plan for the table; the column values must
761 already have been retrieved and the right sides of
762 comparisons evaluated */
764 func_node_t* cond;
766 /* All conditions in end_conds are comparisons of a column to an
767 expression */
769 cond = UT_LIST_GET_FIRST(plan->end_conds);
771 while (cond) {
772 /* Evaluate the left side of the comparison, i.e., get the
773 column value if there is an indirection */
775 eval_sym(cond->args);
777 /* Do the comparison */
779 if (!eval_cmp(cond)) {
781 return(FALSE);
784 cond = UT_LIST_GET_NEXT(cond_list, cond);
787 return(TRUE);
790 /*********************************************************************//**
791 Tests the other conditions.
792 @return TRUE if row passed the tests */
793 UNIV_INLINE
794 ibool
795 row_sel_test_other_conds(
796 /*=====================*/
797 plan_t* plan) /*!< in: plan for the table; the column values must
798 already have been retrieved */
800 func_node_t* cond;
802 cond = UT_LIST_GET_FIRST(plan->other_conds);
804 while (cond) {
805 eval_exp(cond);
807 if (!eval_node_get_ibool_val(cond)) {
809 return(FALSE);
812 cond = UT_LIST_GET_NEXT(cond_list, cond);
815 return(TRUE);
/*********************************************************************//**
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking.
@return DB_SUCCESS or error code */
static
ulint
row_sel_get_clust_rec(
/*==================*/
	sel_node_t*	node,	/*!< in: select_node */
	plan_t*		plan,	/*!< in: plan node for table */
	rec_t*		rec,	/*!< in: record in a non-clustered index */
	que_thr_t*	thr,	/*!< in: query thread */
	rec_t**		out_rec,/*!< out: clustered record or an old version of
				it, NULL if the old version did not exist
				in the read view, i.e., it was a fresh
				inserted version */
	mtr_t*		mtr)	/*!< in: mtr used to get access to the
				non-clustered record; the same mtr is used to
				access the clustered index */
{
	dict_index_t*	index;
	rec_t*		clust_rec;
	rec_t*		old_vers;
	ulint		err;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	*out_rec = NULL;

	offsets = rec_get_offsets(rec,
				  btr_pcur_get_btr_cur(&plan->pcur)->index,
				  offsets, ULINT_UNDEFINED, &heap);

	/* Build the clustered-key search tuple from the secondary
	index record. */
	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);

	index = dict_table_get_first_index(plan->table);

	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
				   BTR_SEARCH_LEAF, &plan->clust_pcur,
				   0, mtr);

	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));

	/* Note: only if the search ends up on a non-infimum record is the
	low_match value the real match to the search tuple */

	if (!page_rec_is_user_rec(clust_rec)
	    || btr_pcur_get_low_match(&(plan->clust_pcur))
	    < dict_index_get_n_unique(index)) {

		ut_a(rec_get_deleted_flag(rec,
					  dict_table_is_comp(plan->table)));
		ut_a(node->read_view);

		/* In a rare case it is possible that no clust rec is found
		for a delete-marked secondary index record: if in row0umod.c
		in row_undo_mod_remove_clust_low() we have already removed
		the clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case we know that the
		clustered index record did not exist in the read view of
		trx. */

		goto func_exit;
	}

	offsets = rec_get_offsets(clust_rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (!node->read_view) {
		/* Try to place a lock on the index record */

		/* If innodb_locks_unsafe_for_binlog option is used
		or this session is using READ COMMITTED isolation level
		we lock only the record, i.e., next-key locking is
		not used. */
		ulint	lock_type;
		trx_t*	trx;

		trx = thr_get_trx(thr);

		if (srv_locks_unsafe_for_binlog
		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
			lock_type = LOCK_REC_NOT_GAP;
		} else {
			lock_type = LOCK_ORDINARY;
		}

		err = lock_clust_rec_read_check_and_lock(
			0, btr_pcur_get_block(&plan->clust_pcur),
			clust_rec, index, offsets,
			node->row_lock_mode, lock_type, thr);

		switch (err) {
		case DB_SUCCESS:
		case DB_SUCCESS_LOCKED_REC:
			/* Declare the variable uninitialized in Valgrind.
			It should be set to DB_SUCCESS at func_exit. */
			UNIV_MEM_INVALID(&err, sizeof err);
			break;
		default:
			goto err_exit;
		}
	} else {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		old_vers = NULL;

		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
						   node->read_view)) {

			err = row_sel_build_prev_vers(
				node->read_view, index, clust_rec,
				&offsets, &heap, &plan->old_vers_heap,
				&old_vers, mtr);

			if (err != DB_SUCCESS) {

				goto err_exit;
			}

			clust_rec = old_vers;

			if (clust_rec == NULL) {
				goto func_exit;
			}
		}

		/* If we had to go to an earlier version of row or the
		secondary index record is delete marked, then it may be that
		the secondary index record corresponding to clust_rec
		(or old_vers) is not rec; in that case we must ignore
		such row because in our snapshot rec would not have existed.
		Remember that from rec we cannot see directly which transaction
		id corresponds to it: we have to go to the clustered index
		record. A query where we want to fetch all rows where
		the secondary index value is in some interval would return
		a wrong result if we would not drop rows which we come to
		visit through secondary index records that would not really
		exist in our snapshot. */

		if ((old_vers
		     || rec_get_deleted_flag(rec, dict_table_is_comp(
						     plan->table)))
		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
							 clust_rec, index)) {
			goto func_exit;
		}
	}

	/* Fetch the columns needed in test conditions. The clustered
	index record is protected by a page latch that was acquired
	when plan->clust_pcur was positioned. The latch will not be
	released until mtr_commit(mtr). */

	ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
	row_sel_fetch_columns(index, clust_rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));
	*out_rec = clust_rec;
func_exit:
	err = DB_SUCCESS;
err_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
989 /*********************************************************************//**
990 Sets a lock on a record.
991 @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
992 UNIV_INLINE
993 enum db_err
994 sel_set_rec_lock(
995 /*=============*/
996 const buf_block_t* block, /*!< in: buffer block of rec */
997 const rec_t* rec, /*!< in: record */
998 dict_index_t* index, /*!< in: index */
999 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
1000 ulint mode, /*!< in: lock mode */
1001 ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
1002 LOC_REC_NOT_GAP */
1003 que_thr_t* thr) /*!< in: query thread */
1005 trx_t* trx;
1006 enum db_err err;
1008 trx = thr_get_trx(thr);
1010 if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
1011 if (buf_LRU_buf_pool_running_out()) {
1013 return(DB_LOCK_TABLE_FULL);
1017 if (dict_index_is_clust(index)) {
1018 err = lock_clust_rec_read_check_and_lock(
1019 0, block, rec, index, offsets, mode, type, thr);
1020 } else {
1021 err = lock_sec_rec_read_check_and_lock(
1022 0, block, rec, index, offsets, mode, type, thr);
1025 return(err);
/*********************************************************************//**
Opens a pcur to a table index. */
static
void
row_sel_open_pcur(
/*==============*/
	plan_t*		plan,		/*!< in: table plan */
	ibool		search_latch_locked,
					/*!< in: TRUE if the thread currently
					has the search latch locked in
					s-mode */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_index_t*	index;
	func_node_t*	cond;
	que_node_t*	exp;
	ulint		n_fields;
	ulint		has_search_latch = 0;	/* RW_S_LATCH or 0 */
	ulint		i;

	if (search_latch_locked) {
		has_search_latch = RW_S_LATCH;
	}

	index = plan->index;

	/* Calculate the value of the search tuple: the exact match columns
	get their expressions evaluated when we evaluate the right sides of
	end_conds */

	cond = UT_LIST_GET_FIRST(plan->end_conds);

	while (cond) {
		eval_exp(que_node_get_next(cond->args));

		cond = UT_LIST_GET_NEXT(cond_list, cond);
	}

	if (plan->tuple) {
		/* A search tuple exists: do an index search. */

		n_fields = dtuple_get_n_fields(plan->tuple);

		if (plan->n_exact_match < n_fields) {
			/* There is a non-exact match field which must be
			evaluated separately */

			eval_exp(plan->tuple_exps[n_fields - 1]);
		}

		/* Copy the evaluated expression values into the
		search tuple fields. */

		for (i = 0; i < n_fields; i++) {
			exp = plan->tuple_exps[i];

			dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
					 que_node_get_val(exp));
		}

		/* Open pcur to the index */

		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
					   BTR_SEARCH_LEAF, &plan->pcur,
					   has_search_latch, mtr);
	} else {
		/* Open the cursor to the start or the end of the index
		(FALSE: no init) */

		btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
					    &(plan->pcur), FALSE, mtr);
	}

	ut_ad(plan->n_rows_prefetched == 0);
	ut_ad(plan->n_rows_fetched == 0);
	ut_ad(plan->cursor_at_end == FALSE);

	plan->pcur_is_open = TRUE;
}
1103 /*********************************************************************//**
1104 Restores a stored pcur position to a table index.
1105 @return TRUE if the cursor should be moved to the next record after we
1106 return from this function (moved to the previous, in the case of a
1107 descending cursor) without processing again the current cursor
1108 record */
1109 static
1110 ibool
1111 row_sel_restore_pcur_pos(
1112 /*=====================*/
1113 plan_t* plan, /*!< in: table plan */
1114 mtr_t* mtr) /*!< in: mtr */
1116 ibool equal_position;
1117 ulint relative_position;
1119 ut_ad(!plan->cursor_at_end);
1121 relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1123 equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1124 &(plan->pcur), mtr);
1126 /* If the cursor is traveling upwards, and relative_position is
1128 (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1129 yet on the successor of the page infimum;
1130 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1131 first record GREATER than the predecessor of a page supremum; we have
1132 not yet processed the cursor record: no need to move the cursor to the
1133 next record;
1134 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1135 last record LESS or EQUAL to the old stored user record; (a) if
1136 equal_position is FALSE, this means that the cursor is now on a record
1137 less than the old user record, and we must move to the next record;
1138 (b) if equal_position is TRUE, then if
1139 plan->stored_cursor_rec_processed is TRUE, we must move to the next
1140 record, else there is no need to move the cursor. */
1142 if (plan->asc) {
1143 if (relative_position == BTR_PCUR_ON) {
1145 if (equal_position) {
1147 return(plan->stored_cursor_rec_processed);
1150 return(TRUE);
1153 ut_ad(relative_position == BTR_PCUR_AFTER
1154 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1156 return(FALSE);
1159 /* If the cursor is traveling downwards, and relative_position is
1161 (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1162 the last record LESS than the successor of a page infimum; we have not
1163 processed the cursor record: no need to move the cursor;
1164 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1165 first record GREATER than the predecessor of a page supremum; we have
1166 processed the cursor record: we should move the cursor to the previous
1167 record;
1168 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1169 last record LESS or EQUAL to the old stored user record; (a) if
1170 equal_position is FALSE, this means that the cursor is now on a record
1171 less than the old user record, and we need not move to the previous
1172 record; (b) if equal_position is TRUE, then if
1173 plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1174 record, else there is no need to move the cursor. */
1176 if (relative_position == BTR_PCUR_BEFORE
1177 || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1179 return(FALSE);
1182 if (relative_position == BTR_PCUR_ON) {
1184 if (equal_position) {
1186 return(plan->stored_cursor_rec_processed);
1189 return(FALSE);
1192 ut_ad(relative_position == BTR_PCUR_AFTER
1193 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1195 return(TRUE);
1198 /*********************************************************************//**
1199 Resets a plan cursor to a closed state. */
1200 UNIV_INLINE
1201 void
1202 plan_reset_cursor(
1203 /*==============*/
1204 plan_t* plan) /*!< in: plan */
1206 plan->pcur_is_open = FALSE;
1207 plan->cursor_at_end = FALSE;
1208 plan->n_rows_fetched = 0;
1209 plan->n_rows_prefetched = 0;
1212 /*********************************************************************//**
1213 Tries to do a shortcut to fetch a clustered index record with a unique key,
1214 using the hash index if possible (not always).
1215 @return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1216 static
1217 ulint
1218 row_sel_try_search_shortcut(
1219 /*========================*/
1220 sel_node_t* node, /*!< in: select node for a consistent read */
1221 plan_t* plan, /*!< in: plan for a unique search in clustered
1222 index */
1223 mtr_t* mtr) /*!< in: mtr */
1225 dict_index_t* index;
1226 rec_t* rec;
1227 mem_heap_t* heap = NULL;
1228 ulint offsets_[REC_OFFS_NORMAL_SIZE];
1229 ulint* offsets = offsets_;
1230 ulint ret;
1231 rec_offs_init(offsets_);
1233 index = plan->index;
1235 ut_ad(node->read_view);
1236 ut_ad(plan->unique_search);
1237 ut_ad(!plan->must_get_clust);
1238 #ifdef UNIV_SYNC_DEBUG
1239 ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1240 #endif /* UNIV_SYNC_DEBUG */
/* NOTE(review): the caller must already hold btr_search_latch in S mode
(asserted above); TRUE below tells row_sel_open_pcur that the latch is held. */
1242 row_sel_open_pcur(plan, TRUE, mtr);
1244 rec = btr_pcur_get_rec(&(plan->pcur));
/* Not positioned on a user record (infimum/supremum): the shortcut
failed; let the caller retry via the normal search path. */
1246 if (!page_rec_is_user_rec(rec)) {
1248 return(SEL_RETRY);
1251 ut_ad(plan->mode == PAGE_CUR_GE);
1253 /* As the cursor is now placed on a user record after a search with
1254 the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1255 fields in the user record matched to the search tuple */
1257 if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1259 return(SEL_EXHAUSTED);
1262 /* This is a non-locking consistent read: if necessary, fetch
1263 a previous version of the record */
1265 offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
/* If the record version is not visible in our read view, we cannot use
the shortcut (building an old version is the slow path's job). */
1267 if (dict_index_is_clust(index)) {
1268 if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1269 node->read_view)) {
1270 ret = SEL_RETRY;
1271 goto func_exit;
1273 } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1275 ret = SEL_RETRY;
1276 goto func_exit;
1279 /* Test the deleted flag. */
1281 if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1283 ret = SEL_EXHAUSTED;
1284 goto func_exit;
1287 /* Fetch the columns needed in test conditions. The index
1288 record is protected by a page latch that was acquired when
1289 plan->pcur was positioned. The latch will not be released
1290 until mtr_commit(mtr). */
1292 row_sel_fetch_columns(index, rec, offsets,
1293 UT_LIST_GET_FIRST(plan->columns));
1295 /* Test the rest of search conditions */
1297 if (!row_sel_test_other_conds(plan)) {
1299 ret = SEL_EXHAUSTED;
1300 goto func_exit;
1303 ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1305 plan->n_rows_fetched++;
1306 ret = SEL_FOUND;
1307 func_exit:
/* Free the heap only if rec_get_offsets() had to allocate one. */
1308 if (UNIV_LIKELY_NULL(heap)) {
1309 mem_heap_free(heap);
1311 return(ret);
1314 /*********************************************************************//**
1315 Performs a select step.
1316 @return DB_SUCCESS or error code */
static
ulint
row_sel(
/*====*/
1321 sel_node_t* node, /*!< in: select node */
1322 que_thr_t* thr) /*!< in: query thread */
1324 dict_index_t* index;
1325 plan_t* plan;
1326 mtr_t mtr;
1327 ibool moved; /* result of btr_pcur_move_to_next/prev below */
1328 rec_t* rec;
1329 rec_t* old_vers;
1330 rec_t* clust_rec;
1331 ibool search_latch_locked;
1332 ibool consistent_read;
1334 /* The following flag becomes TRUE when we are doing a
1335 consistent read from a non-clustered index and we must look
1336 at the clustered index to find out the previous delete mark
1337 state of the non-clustered record: */
1339 ibool cons_read_requires_clust_rec = FALSE;
1340 ulint cost_counter = 0; /* work done so far; compared to SEL_COST_LIMIT to yield periodically */
1341 ibool cursor_just_opened;
1342 ibool must_go_to_next;
1343 ibool mtr_has_extra_clust_latch = FALSE;
1344 /* TRUE if the search was made using
1345 a non-clustered index, and we had to
1346 access the clustered record: now &mtr
1347 contains a clustered index latch, and
1348 &mtr must be committed before we move
1349 to the next non-clustered record */
1350 ulint found_flag; /* SEL_FOUND / SEL_EXHAUSTED / SEL_RETRY from the shortcut */
1351 ulint err;
1352 mem_heap_t* heap = NULL;
1353 ulint offsets_[REC_OFFS_NORMAL_SIZE];
1354 ulint* offsets = offsets_;
1355 rec_offs_init(offsets_);
1357 ut_ad(thr->run_node == node);
1359 search_latch_locked = FALSE;
1361 if (node->read_view) {
1362 /* In consistent reads, we try to do with the hash index and
1363 not to use the buffer page get. This is to reduce memory bus
1364 load resulting from semaphore operations. The search latch
1365 will be s-locked when we access an index with a unique search
1366 condition, but not locked when we access an index with a
1367 less selective search condition. */
1369 consistent_read = TRUE;
1370 } else {
1371 consistent_read = FALSE;
1374 table_loop:
1375 /* TABLE LOOP
1376 ----------
1377 This is the outer major loop in calculating a join. We come here when
1378 node->fetch_table changes, and after adding a row to aggregate totals
1379 and, of course, when this function is called. */
1381 ut_ad(mtr_has_extra_clust_latch == FALSE);
1383 plan = sel_node_get_nth_plan(node, node->fetch_table);
1384 index = plan->index;
/* If a prefetched row is still buffered for this table, consume it
without opening any cursor or starting an mtr. */
1386 if (plan->n_rows_prefetched > 0) {
1387 sel_pop_prefetched_row(plan);
1389 goto next_table_no_mtr;
1392 if (plan->cursor_at_end) {
1393 /* The cursor has already reached the result set end: no more
1394 rows to process for this table cursor, as also the prefetch
1395 stack was empty */
1397 ut_ad(plan->pcur_is_open);
1399 goto table_exhausted_no_mtr;
1402 /* Open a cursor to index, or restore an open cursor position */
1404 mtr_start(&mtr);
/* The adaptive-hash shortcut is tried only for a unique, consistent
read whose cursor is not yet open, that does not need the clustered
record, on a table without big rows (see the condition below). */
1406 if (consistent_read && plan->unique_search && !plan->pcur_is_open
1407 && !plan->must_get_clust
1408 && !plan->table->big_rows) {
1409 if (!search_latch_locked) {
1410 rw_lock_s_lock(&btr_search_latch);
1412 search_latch_locked = TRUE;
1413 } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
1415 /* There is an x-latch request waiting: release the
1416 s-latch for a moment; as an s-latch here is often
1417 kept for some 10 searches before being released,
1418 a waiting x-latch request would block other threads
1419 from acquiring an s-latch for a long time, lowering
1420 performance significantly in multiprocessors. */
1422 rw_lock_s_unlock(&btr_search_latch);
1423 rw_lock_s_lock(&btr_search_latch);
1426 found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
1428 if (found_flag == SEL_FOUND) {
1430 goto next_table;
1432 } else if (found_flag == SEL_EXHAUSTED) {
1434 goto table_exhausted;
1437 ut_ad(found_flag == SEL_RETRY);
/* Shortcut failed: reset the cursor and fall through to the
normal (non-hash) search path with a fresh mtr. */
1439 plan_reset_cursor(plan);
1441 mtr_commit(&mtr);
1442 mtr_start(&mtr);
1445 if (search_latch_locked) {
1446 rw_lock_s_unlock(&btr_search_latch);
1448 search_latch_locked = FALSE;
1451 if (!plan->pcur_is_open) {
1452 /* Evaluate the expressions to build the search tuple and
1453 open the cursor */
1455 row_sel_open_pcur(plan, search_latch_locked, &mtr);
1457 cursor_just_opened = TRUE;
1459 /* A new search was made: increment the cost counter */
1460 cost_counter++;
1461 } else {
1462 /* Restore pcur position to the index */
1464 must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
1466 cursor_just_opened = FALSE;
1468 if (must_go_to_next) {
1469 /* We have already processed the cursor record: move
1470 to the next */
1472 goto next_rec;
1476 rec_loop:
1477 /* RECORD LOOP
1478 -----------
1479 In this loop we use pcur and try to fetch a qualifying row, and
1480 also fill the prefetch buffer for this table if n_rows_fetched has
1481 exceeded a threshold. While we are inside this loop, the following
1482 holds:
1483 (1) &mtr is started,
1484 (2) pcur is positioned and open.
1486 NOTE that if cursor_just_opened is TRUE here, it means that we came
1487 to this point right after row_sel_open_pcur. */
1489 ut_ad(mtr_has_extra_clust_latch == FALSE);
1491 rec = btr_pcur_get_rec(&(plan->pcur));
1493 /* PHASE 1: Set a lock if specified */
1495 if (!node->asc && cursor_just_opened
1496 && !page_rec_is_supremum(rec)) {
1498 /* When we open a cursor for a descending search, we must set
1499 a next-key lock on the successor record: otherwise it would
1500 be possible to insert new records next to the cursor position,
1501 and it might be that these new records should appear in the
1502 search result set, resulting in the phantom problem. */
1504 if (!consistent_read) {
1506 /* If innodb_locks_unsafe_for_binlog option is used
1507 or this session is using READ COMMITTED isolation
1508 level, we lock only the record, i.e., next-key
1509 locking is not used. */
1511 rec_t* next_rec = page_rec_get_next(rec);
1512 ulint lock_type;
1513 trx_t* trx;
1515 trx = thr_get_trx(thr);
1517 offsets = rec_get_offsets(next_rec, index, offsets,
1518 ULINT_UNDEFINED, &heap);
1520 if (srv_locks_unsafe_for_binlog
1521 || trx->isolation_level
1522 <= TRX_ISO_READ_COMMITTED) {
1524 if (page_rec_is_supremum(next_rec)) {
1526 goto skip_lock;
1529 lock_type = LOCK_REC_NOT_GAP;
1530 } else {
1531 lock_type = LOCK_ORDINARY;
1534 err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1535 next_rec, index, offsets,
1536 node->row_lock_mode,
1537 lock_type, thr);
1539 switch (err) {
1540 case DB_SUCCESS_LOCKED_REC:
1541 err = DB_SUCCESS;
/* fall through */
1542 case DB_SUCCESS:
1543 break;
1544 default:
1545 /* Note that in this case we will store in pcur
1546 the PREDECESSOR of the record we are waiting
1547 the lock for */
1548 goto lock_wait_or_error;
1553 skip_lock:
1554 if (page_rec_is_infimum(rec)) {
1556 /* The infimum record on a page cannot be in the result set,
1557 and neither can a record lock be placed on it: we skip such
1558 a record. We also increment the cost counter as we may have
1559 processed yet another page of index. */
1561 cost_counter++;
1563 goto next_rec;
1566 if (!consistent_read) {
1567 /* Try to place a lock on the index record */
1569 /* If innodb_locks_unsafe_for_binlog option is used
1570 or this session is using READ COMMITTED isolation level,
1571 we lock only the record, i.e., next-key locking is
1572 not used. */
1574 ulint lock_type;
1575 trx_t* trx;
1577 offsets = rec_get_offsets(rec, index, offsets,
1578 ULINT_UNDEFINED, &heap);
1580 trx = thr_get_trx(thr);
1582 if (srv_locks_unsafe_for_binlog
1583 || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
1585 if (page_rec_is_supremum(rec)) {
1587 goto next_rec;
1590 lock_type = LOCK_REC_NOT_GAP;
1591 } else {
1592 lock_type = LOCK_ORDINARY;
1595 err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1596 rec, index, offsets,
1597 node->row_lock_mode, lock_type, thr);
1599 switch (err) {
1600 case DB_SUCCESS_LOCKED_REC:
1601 err = DB_SUCCESS;
/* fall through */
1602 case DB_SUCCESS:
1603 break;
1604 default:
1605 goto lock_wait_or_error;
1609 if (page_rec_is_supremum(rec)) {
1611 /* A page supremum record cannot be in the result set: skip
1612 it now when we have placed a possible lock on it */
1614 goto next_rec;
1617 ut_ad(page_rec_is_user_rec(rec));
1619 if (cost_counter > SEL_COST_LIMIT) {
1621 /* Now that we have placed the necessary locks, we can stop
1622 for a while and store the cursor position; NOTE that if we
1623 would store the cursor position BEFORE placing a record lock,
1624 it might happen that the cursor would jump over some records
1625 that another transaction could meanwhile insert adjacent to
1626 the cursor: this would result in the phantom problem. */
1628 goto stop_for_a_while;
1631 /* PHASE 2: Check a mixed index mix id if needed */
1633 if (plan->unique_search && cursor_just_opened) {
1635 ut_ad(plan->mode == PAGE_CUR_GE);
1637 /* As the cursor is now placed on a user record after a search
1638 with the mode PAGE_CUR_GE, the up_match field in the cursor
1639 tells how many fields in the user record matched to the search
1640 tuple */
1642 if (btr_pcur_get_up_match(&(plan->pcur))
1643 < plan->n_exact_match) {
1644 goto table_exhausted;
1647 /* Ok, no need to test end_conds or mix id */
1651 /* We are ready to look at a possible new index entry in the result
1652 set: the cursor is now placed on a user record */
1654 /* PHASE 3: Get previous version in a consistent read */
1656 cons_read_requires_clust_rec = FALSE;
1657 offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1659 if (consistent_read) {
1660 /* This is a non-locking consistent read: if necessary, fetch
1661 a previous version of the record */
1663 if (dict_index_is_clust(index)) {
1665 if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1666 node->read_view)) {
1668 err = row_sel_build_prev_vers(
1669 node->read_view, index, rec,
1670 &offsets, &heap, &plan->old_vers_heap,
1671 &old_vers, &mtr);
1673 if (err != DB_SUCCESS) {
1675 goto lock_wait_or_error;
1678 if (old_vers == NULL) {
1679 /* The record does not exist
1680 in our read view. Skip it, but
1681 first attempt to determine
1682 whether the index segment we
1683 are searching through has been
1684 exhausted. */
1686 offsets = rec_get_offsets(
1687 rec, index, offsets,
1688 ULINT_UNDEFINED, &heap);
1690 /* Fetch the columns needed in
1691 test conditions. The clustered
1692 index record is protected by a
1693 page latch that was acquired
1694 by row_sel_open_pcur() or
1695 row_sel_restore_pcur_pos().
1696 The latch will not be released
1697 until mtr_commit(mtr). */
1699 row_sel_fetch_columns(
1700 index, rec, offsets,
1701 UT_LIST_GET_FIRST(
1702 plan->columns));
1704 if (!row_sel_test_end_conds(plan)) {
1706 goto table_exhausted;
1709 goto next_rec;
1712 rec = old_vers;
1714 } else if (!lock_sec_rec_cons_read_sees(rec,
1715 node->read_view)) {
1716 cons_read_requires_clust_rec = TRUE;
1720 /* PHASE 4: Test search end conditions and deleted flag */
1722 /* Fetch the columns needed in test conditions. The record is
1723 protected by a page latch that was acquired by
1724 row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
1725 will not be released until mtr_commit(mtr). */
1727 row_sel_fetch_columns(index, rec, offsets,
1728 UT_LIST_GET_FIRST(plan->columns));
1730 /* Test the selection end conditions: these can only contain columns
1731 which already are found in the index, even though the index might be
1732 non-clustered */
1734 if (plan->unique_search && cursor_just_opened) {
1736 /* No test necessary: the test was already made above */
1738 } else if (!row_sel_test_end_conds(plan)) {
1740 goto table_exhausted;
1743 if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1744 && !cons_read_requires_clust_rec) {
1746 /* The record is delete marked: we can skip it if this is
1747 not a consistent read which might see an earlier version
1748 of a non-clustered index record */
1750 if (plan->unique_search) {
1752 goto table_exhausted;
1755 goto next_rec;
1758 /* PHASE 5: Get the clustered index record, if needed and if we did
1759 not do the search using the clustered index */
1761 if (plan->must_get_clust || cons_read_requires_clust_rec) {
1763 /* It was a non-clustered index and we must fetch also the
1764 clustered index record */
1766 err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1767 &mtr);
1768 mtr_has_extra_clust_latch = TRUE;
1770 if (err != DB_SUCCESS) {
1772 goto lock_wait_or_error;
1775 /* Retrieving the clustered record required a search:
1776 increment the cost counter */
1778 cost_counter++;
1780 if (clust_rec == NULL) {
1781 /* The record did not exist in the read view */
1782 ut_ad(consistent_read);
1784 goto next_rec;
1787 if (rec_get_deleted_flag(clust_rec,
1788 dict_table_is_comp(plan->table))) {
1790 /* The record is delete marked: we can skip it */
1792 goto next_rec;
1795 if (node->can_get_updated) {
1797 btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1801 /* PHASE 6: Test the rest of search conditions */
1803 if (!row_sel_test_other_conds(plan)) {
1805 if (plan->unique_search) {
1807 goto table_exhausted;
1810 goto next_rec;
1813 /* PHASE 7: We found a new qualifying row for the current table; push
1814 the row if prefetch is on, or move to the next table in the join */
1816 plan->n_rows_fetched++;
1818 ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1820 if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1821 || plan->unique_search || plan->no_prefetch
1822 || plan->table->big_rows) {
1824 /* No prefetch in operation: go to the next table */
1826 goto next_table;
1829 sel_push_prefetched_row(plan);
1831 if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1833 /* The prefetch buffer is now full */
1835 sel_pop_prefetched_row(plan);
1837 goto next_table;
1840 next_rec:
/* Advance the cursor within the current index (committing &mtr
first if it still holds an extra clustered-index latch). */
1841 ut_ad(!search_latch_locked);
1843 if (mtr_has_extra_clust_latch) {
1845 /* We must commit &mtr if we are moving to the next
1846 non-clustered index record, because we could break the
1847 latching order if we would access a different clustered
1848 index page right away without releasing the previous. */
1850 goto commit_mtr_for_a_while;
1853 if (node->asc) {
1854 moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1855 } else {
1856 moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1859 if (!moved) {
1861 goto table_exhausted;
1864 cursor_just_opened = FALSE;
1866 /* END OF RECORD LOOP
1867 ------------------ */
1868 goto rec_loop;
1870 next_table:
1871 /* We found a record which satisfies the conditions: we can move to
1872 the next table or return a row in the result set */
1874 ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
1876 if (plan->unique_search && !node->can_get_updated) {
1878 plan->cursor_at_end = TRUE;
1879 } else {
1880 ut_ad(!search_latch_locked);
1882 plan->stored_cursor_rec_processed = TRUE;
1884 btr_pcur_store_position(&(plan->pcur), &mtr);
1887 mtr_commit(&mtr);
1889 mtr_has_extra_clust_latch = FALSE;
1891 next_table_no_mtr:
1892 /* If we use 'goto' to this label, it means that the row was popped
1893 from the prefetched rows stack, and &mtr is already committed */
1895 if (node->fetch_table + 1 == node->n_tables) {
1897 sel_eval_select_list(node);
1899 if (node->is_aggregate) {
1901 goto table_loop;
1904 sel_assign_into_var_values(node->into_list, node);
1906 thr->run_node = que_node_get_parent(node);
1908 err = DB_SUCCESS;
1909 goto func_exit;
1912 node->fetch_table++;
1914 /* When we move to the next table, we first reset the plan cursor:
1915 we do not care about resetting it when we backtrack from a table */
1917 plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1919 goto table_loop;
1921 table_exhausted:
1922 /* The table cursor pcur reached the result set end: backtrack to the
1923 previous table in the join if we do not have cached prefetched rows */
1925 plan->cursor_at_end = TRUE;
1927 mtr_commit(&mtr);
1929 mtr_has_extra_clust_latch = FALSE;
1931 if (plan->n_rows_prefetched > 0) {
1932 /* The table became exhausted during a prefetch */
1934 sel_pop_prefetched_row(plan);
1936 goto next_table_no_mtr;
1939 table_exhausted_no_mtr:
1940 if (node->fetch_table == 0) {
1941 err = DB_SUCCESS;
1943 if (node->is_aggregate && !node->aggregate_already_fetched) {
1945 node->aggregate_already_fetched = TRUE;
1947 sel_assign_into_var_values(node->into_list, node);
1949 thr->run_node = que_node_get_parent(node);
1950 } else {
1951 node->state = SEL_NODE_NO_MORE_ROWS;
1953 thr->run_node = que_node_get_parent(node);
1956 goto func_exit;
1959 node->fetch_table--;
1961 goto table_loop;
1963 stop_for_a_while:
1964 /* Return control for a while to que_run_threads, so that runaway
1965 queries can be canceled. NOTE that when we come here, we must, in a
1966 locking read, have placed the necessary (possibly waiting request)
1967 record lock on the cursor record or its successor: when we reposition
1968 the cursor, this record lock guarantees that nobody can meanwhile have
1969 inserted new records which should have appeared in the result set,
1970 which would result in the phantom problem. */
1972 ut_ad(!search_latch_locked);
1974 plan->stored_cursor_rec_processed = FALSE;
1975 btr_pcur_store_position(&(plan->pcur), &mtr);
1977 mtr_commit(&mtr);
1979 #ifdef UNIV_SYNC_DEBUG
1980 ut_ad(sync_thread_levels_empty_gen(TRUE));
1981 #endif /* UNIV_SYNC_DEBUG */
1982 err = DB_SUCCESS;
1983 goto func_exit;
1985 commit_mtr_for_a_while:
1986 /* Stores the cursor position and commits &mtr; this is used if
1987 &mtr may contain latches which would break the latching order if
1988 &mtr would not be committed and the latches released. */
1990 plan->stored_cursor_rec_processed = TRUE;
1992 ut_ad(!search_latch_locked);
1993 btr_pcur_store_position(&(plan->pcur), &mtr);
1995 mtr_commit(&mtr);
1997 mtr_has_extra_clust_latch = FALSE;
1999 #ifdef UNIV_SYNC_DEBUG
2000 ut_ad(sync_thread_levels_empty_gen(TRUE));
2001 #endif /* UNIV_SYNC_DEBUG */
2003 goto table_loop;
2005 lock_wait_or_error:
2006 /* See the note at stop_for_a_while: the same holds for this case */
2008 ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
2009 ut_ad(!search_latch_locked);
2011 plan->stored_cursor_rec_processed = FALSE;
2012 btr_pcur_store_position(&(plan->pcur), &mtr);
2014 mtr_commit(&mtr);
2016 #ifdef UNIV_SYNC_DEBUG
2017 ut_ad(sync_thread_levels_empty_gen(TRUE));
2018 #endif /* UNIV_SYNC_DEBUG */
2020 func_exit:
/* Common exit: release the AHI search latch if still held, and free
the offsets heap if rec_get_offsets() allocated one. */
2021 if (search_latch_locked) {
2022 rw_lock_s_unlock(&btr_search_latch);
2024 if (UNIV_LIKELY_NULL(heap)) {
2025 mem_heap_free(heap);
2027 return(err);
2030 /**********************************************************************//**
2031 Performs a select step. This is a high-level function used in SQL execution
2032 graphs.
2033 @return query thread to run next or NULL */
2034 UNIV_INTERN
2035 que_thr_t*
2036 row_sel_step(
2037 /*=========*/
2038 que_thr_t* thr) /*!< in: query thread */
2040 ulint i_lock_mode;
2041 sym_node_t* table_node;
2042 sel_node_t* node;
2043 ulint err;
2045 ut_ad(thr);
2047 node = thr->run_node;
2049 ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2051 /* If this is a new time this node is executed (or when execution
2052 resumes after wait for a table intention lock), set intention locks
2053 on the tables, or assign a read view */
2055 if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2057 node->state = SEL_NODE_OPEN;
2060 if (node->state == SEL_NODE_OPEN) {
2062 /* It may be that the current session has not yet started
2063 its transaction, or it has been committed: */
2065 trx_start_if_not_started(thr_get_trx(thr));
2067 plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2069 if (node->consistent_read) {
2070 /* Assign a read view for the query */
2071 node->read_view = trx_assign_read_view(
2072 thr_get_trx(thr));
2073 } else {
/* Locking read: take IX if the cursor may set exclusive
row locks, otherwise IS, on every table in the list. */
2074 if (node->set_x_locks) {
2075 i_lock_mode = LOCK_IX;
2076 } else {
2077 i_lock_mode = LOCK_IS;
2080 table_node = node->table_list;
2082 while (table_node) {
2083 err = lock_table(0, table_node->table,
2084 i_lock_mode, thr);
2085 if (err != DB_SUCCESS) {
2086 thr_get_trx(thr)->error_state = err;
2088 return(NULL);
2091 table_node = que_node_get_next(table_node);
2095 /* If this is an explicit cursor, copy stored procedure
2096 variable values, so that the values cannot change between
2097 fetches (currently, we copy them also for non-explicit
2098 cursors) */
2100 if (node->explicit_cursor
2101 && UT_LIST_GET_FIRST(node->copy_variables)) {
2103 row_sel_copy_input_variable_vals(node);
2106 node->state = SEL_NODE_FETCH;
2107 node->fetch_table = 0;
2109 if (node->is_aggregate) {
2110 /* Reset the aggregate total values */
2111 sel_reset_aggregate_vals(node);
2115 err = row_sel(node, thr);
2117 /* NOTE! if queries are parallelized, the following assignment may
2118 have problems; the assignment should be made only if thr is the
2119 only top-level thr in the graph: */
2121 thr->graph->last_sel_node = node;
2123 if (err != DB_SUCCESS) {
2124 thr_get_trx(thr)->error_state = err;
2126 return(NULL);
2129 return(thr);
2132 /**********************************************************************//**
2133 Performs a fetch for a cursor.
2134 @return query thread to run next or NULL */
2135 UNIV_INTERN
2136 que_thr_t*
2137 fetch_step(
2138 /*=======*/
2139 que_thr_t* thr) /*!< in: query thread */
2141 sel_node_t* sel_node;
2142 fetch_node_t* node;
2144 ut_ad(thr);
2146 node = thr->run_node;
2147 sel_node = node->cursor_def;
2149 ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
/* If execution is returning here from the cursor definition (not
arriving from the parent), a row or end-of-rows is now available:
deliver it and hand control back to the parent node. */
2151 if (thr->prev_node != que_node_get_parent(node)) {
2153 if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2155 if (node->into_list) {
2156 sel_assign_into_var_values(node->into_list,
2157 sel_node);
2158 } else {
/* User-supplied callback; a NULL return stops
further fetching. */
2159 void* ret = (*node->func->func)(
2160 sel_node, node->func->arg);
2162 if (!ret) {
2163 sel_node->state
2164 = SEL_NODE_NO_MORE_ROWS;
2169 thr->run_node = que_node_get_parent(node);
2171 return(thr);
2174 /* Make the fetch node the parent of the cursor definition for
2175 the time of the fetch, so that execution knows to return to this
2176 fetch node after a row has been selected or we know that there is
2177 no row left */
2179 sel_node->common.parent = node;
/* Fetching from a closed cursor is a caller error. */
2181 if (sel_node->state == SEL_NODE_CLOSED) {
2182 fprintf(stderr,
2183 "InnoDB: Error: fetch called on a closed cursor\n");
2185 thr_get_trx(thr)->error_state = DB_ERROR;
2187 return(NULL);
2190 thr->run_node = sel_node;
2192 return(thr);
2195 /****************************************************************//**
2196 Sample callback function for fetch that prints each row.
2197 @return always returns non-NULL */
2198 UNIV_INTERN
2199 void*
2200 row_fetch_print(
2201 /*============*/
2202 void* row, /*!< in: sel_node_t* */
2203 void* user_arg) /*!< in: not used */
2205 sel_node_t* node = row;
2206 que_node_t* exp;
2207 ulint i = 0;
2209 UT_NOT_USED(user_arg);
2211 fprintf(stderr, "row_fetch_print: row %p\n", row);
2213 exp = node->select_list;
2215 while (exp) {
2216 dfield_t* dfield = que_node_get_val(exp);
2217 const dtype_t* type = dfield_get_type(dfield);
2219 fprintf(stderr, " column %lu:\n", (ulong)i);
2221 dtype_print(type);
2222 putc('\n', stderr);
2224 if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2225 ut_print_buf(stderr, dfield_get_data(dfield),
2226 dfield_get_len(dfield));
2227 putc('\n', stderr);
2228 } else {
2229 fputs(" <NULL>;\n", stderr);
2232 exp = que_node_get_next(exp);
2233 i++;
2236 return((void*)42);
2239 /***********************************************************//**
2240 Prints a row in a select result.
2241 @return query thread to run next or NULL */
2242 UNIV_INTERN
2243 que_thr_t*
2244 row_printf_step(
2245 /*============*/
2246 que_thr_t* thr) /*!< in: query thread */
2248 row_printf_node_t* node;
2249 sel_node_t* sel_node;
2250 que_node_t* arg;
2252 ut_ad(thr);
2254 node = thr->run_node;
2256 sel_node = node->sel_node;
2258 ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
/* First arrival from the parent node: (re)open the cursor and send
execution to the select node to fetch the first row. */
2260 if (thr->prev_node == que_node_get_parent(node)) {
2262 /* Reset the cursor */
2263 sel_node->state = SEL_NODE_OPEN;
2265 /* Fetch next row to print */
2267 thr->run_node = sel_node;
2269 return(thr);
2272 if (sel_node->state != SEL_NODE_FETCH) {
2274 ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2276 /* No more rows to print */
2278 thr->run_node = que_node_get_parent(node);
2280 return(thr);
/* A fetched row is available: print every select-list value,
separated by " ::: ". */
2283 arg = sel_node->select_list;
2285 while (arg) {
2286 dfield_print_also_hex(que_node_get_val(arg));
2288 fputs(" ::: ", stderr);
2290 arg = que_node_get_next(arg);
2293 putc('\n', stderr);
2295 /* Fetch next row to print */
2297 thr->run_node = sel_node;
2299 return(thr);
2302 /****************************************************************//**
2303 Converts a key value stored in MySQL format to an Innobase dtuple. The last
2304 field of the key value may be just a prefix of a fixed length field: hence
2305 the parameter key_len. But currently we do not allow search keys where the
2306 last field is only a prefix of the full key field len and print a warning if
2307 such appears. A counterpart of this function is
2308 ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2309 UNIV_INTERN
2310 void
2311 row_sel_convert_mysql_key_to_innobase(
2312 /*==================================*/
2313 dtuple_t* tuple, /*!< in/out: tuple where to build;
2314 NOTE: we assume that the type info
2315 in the tuple is already according
2316 to index! */
2317 byte* buf, /*!< in: buffer to use in field
2318 conversions */
2319 ulint buf_len, /*!< in: buffer length */
2320 dict_index_t* index, /*!< in: index of the key value */
2321 const byte* key_ptr, /*!< in: MySQL key value */
2322 ulint key_len, /*!< in: MySQL key value length */
2323 trx_t* trx) /*!< in: transaction */
2325 byte* original_buf = buf;
2326 const byte* original_key_ptr = key_ptr;
2327 dict_field_t* field;
2328 dfield_t* dfield;
2329 ulint data_offset;
2330 ulint data_len;
2331 ulint data_field_len;
2332 ibool is_null;
2333 const byte* key_end;
2334 ulint n_fields = 0;
2336 /* For documentation of the key value storage format in MySQL, see
2337 ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2339 key_end = key_ptr + key_len;
2341 /* Permit us to access any field in the tuple (ULINT_MAX): */
2343 dtuple_set_n_fields(tuple, ULINT_MAX);
2345 dfield = dtuple_get_nth_field(tuple, 0);
2346 field = dict_index_get_nth_field(index, 0);
2348 if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2349 /* A special case: we are looking for a position in the
2350 generated clustered index which InnoDB automatically added
2351 to a table with no primary key: the first and the only
2352 ordering column is ROW_ID which InnoDB stored to the key_ptr
2353 buffer. */
2355 ut_a(key_len == DATA_ROW_ID_LEN);
2357 dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2359 dtuple_set_n_fields(tuple, 1);
2361 return;
2364 while (key_ptr < key_end) {
2366 ulint type = dfield_get_type(dfield)->mtype;
2367 ut_a(field->col->mtype == type);
2369 data_offset = 0;
2370 is_null = FALSE;
2372 if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2373 /* The first byte in the field tells if this is
2374 an SQL NULL value */
2376 data_offset = 1;
2378 if (*key_ptr != 0) {
2379 dfield_set_null(dfield);
2381 is_null = TRUE;
2385 /* Calculate data length and data field total length */
2387 if (type == DATA_BLOB) {
2388 /* The key field is a column prefix of a BLOB or
2389 TEXT */
2391 ut_a(field->prefix_len > 0);
2393 /* MySQL stores the actual data length to the first 2
2394 bytes after the optional SQL NULL marker byte. The
2395 storage format is little-endian, that is, the most
2396 significant byte at a higher address. In UTF-8, MySQL
2397 seems to reserve field->prefix_len bytes for
2398 storing this field in the key value buffer, even
2399 though the actual value only takes data_len bytes
2400 from the start. */
2402 data_len = key_ptr[data_offset]
2403 + 256 * key_ptr[data_offset + 1];
2404 data_field_len = data_offset + 2 + field->prefix_len;
2406 data_offset += 2;
2408 /* Now that we know the length, we store the column
2409 value like it would be a fixed char field */
2411 } else if (field->prefix_len > 0) {
2412 /* Looks like MySQL pads unused end bytes in the
2413 prefix with space. Therefore, also in UTF-8, it is ok
2414 to compare with a prefix containing full prefix_len
2415 bytes, and no need to take at most prefix_len / 3
2416 UTF-8 characters from the start.
2417 If the prefix is used as the upper end of a LIKE
2418 'abc%' query, then MySQL pads the end with chars
2419 0xff. TODO: in that case does it any harm to compare
2420 with the full prefix_len bytes. How do characters
2421 0xff in UTF-8 behave? */
2423 data_len = field->prefix_len;
2424 data_field_len = data_offset + data_len;
2425 } else {
2426 data_len = dfield_get_type(dfield)->len;
2427 data_field_len = data_offset + data_len;
2430 if (UNIV_UNLIKELY
2431 (dtype_get_mysql_type(dfield_get_type(dfield))
2432 == DATA_MYSQL_TRUE_VARCHAR)
2433 && UNIV_LIKELY(type != DATA_INT)) {
2434 /* In a MySQL key value format, a true VARCHAR is
2435 always preceded by 2 bytes of a length field.
2436 dfield_get_type(dfield)->len returns the maximum
2437 'payload' len in bytes. That does not include the
2438 2 bytes that tell the actual data length.
2440 We added the check != DATA_INT to make sure we do
2441 not treat MySQL ENUM or SET as a true VARCHAR! */
2443 data_len += 2;
2444 data_field_len += 2;
2447 /* Storing may use at most data_len bytes of buf */
2449 if (UNIV_LIKELY(!is_null)) {
2450 row_mysql_store_col_in_innobase_format(
2451 dfield, buf,
2452 FALSE, /* MySQL key value format col */
2453 key_ptr + data_offset, data_len,
2454 dict_table_is_comp(index->table));
2455 buf += data_len;
2458 key_ptr += data_field_len;
2460 if (UNIV_UNLIKELY(key_ptr > key_end)) {
2461 /* The last field in key was not a complete key field
2462 but a prefix of it.
2464 Print a warning about this! HA_READ_PREFIX_LAST does
2465 not currently work in InnoDB with partial-field key
2466 value prefixes. Since MySQL currently uses a padding
2467 trick to calculate LIKE 'abc%' type queries there
2468 should never be partial-field prefixes in searches. */
2470 ut_print_timestamp(stderr);
2472 fputs(" InnoDB: Warning: using a partial-field"
2473 " key prefix in search.\n"
2474 "InnoDB: ", stderr);
2475 dict_index_name_print(stderr, trx, index);
2476 fprintf(stderr, ". Last data field length %lu bytes,\n"
2477 "InnoDB: key ptr now exceeds"
2478 " key end by %lu bytes.\n"
2479 "InnoDB: Key value in the MySQL format:\n",
2480 (ulong) data_field_len,
2481 (ulong) (key_ptr - key_end));
2482 fflush(stderr);
2483 ut_print_buf(stderr, original_key_ptr, key_len);
2484 putc('\n', stderr);
2486 if (!is_null) {
2487 ulint len = dfield_get_len(dfield);
2488 dfield_set_len(dfield, len
2489 - (ulint) (key_ptr - key_end));
2493 n_fields++;
2494 field++;
2495 dfield++;
2498 ut_a(buf <= original_buf + buf_len);
2500 /* We set the length of tuple to n_fields: we assume that the memory
2501 area allocated for it is big enough (usually bigger than n_fields). */
2503 dtuple_set_n_fields(tuple, n_fields);
2506 /**************************************************************//**
2507 Stores the row id to the prebuilt struct. */
2508 static
2509 void
2510 row_sel_store_row_id_to_prebuilt(
2511 /*=============================*/
2512 row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2513 const rec_t* index_rec, /*!< in: record */
2514 const dict_index_t* index, /*!< in: index of the record */
2515 const ulint* offsets) /*!< in: rec_get_offsets
2516 (index_rec, index) */
2518 const byte* data;
2519 ulint len;
2521 ut_ad(rec_offs_validate(index_rec, index, offsets));
2523 data = rec_get_nth_field(
2524 index_rec, offsets,
2525 dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2527 if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2528 fprintf(stderr,
2529 "InnoDB: Error: Row id field is"
2530 " wrong length %lu in ", (ulong) len);
2531 dict_index_name_print(stderr, prebuilt->trx, index);
2532 fprintf(stderr, "\n"
2533 "InnoDB: Field number %lu, record:\n",
2534 (ulong) dict_index_get_sys_col_pos(index,
2535 DATA_ROW_ID));
2536 rec_print_new(stderr, index_rec, offsets);
2537 putc('\n', stderr);
2538 ut_error;
2541 ut_memcpy(prebuilt->row_id, data, len);
2544 /**************************************************************//**
2545 Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2546 function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
2547 static
2548 void
2549 row_sel_field_store_in_mysql_format(
2550 /*================================*/
2551 byte* dest, /*!< in/out: buffer where to store; NOTE
2552 that BLOBs are not in themselves
2553 stored here: the caller must allocate
2554 and copy the BLOB into buffer before,
2555 and pass the pointer to the BLOB in
2556 'data' */
2557 const mysql_row_templ_t* templ,
2558 /*!< in: MySQL column template.
2559 Its following fields are referenced:
2560 type, is_unsigned, mysql_col_len,
2561 mbminlen, mbmaxlen */
2562 const byte* data, /*!< in: data to store */
2563 ulint len) /*!< in: length of the data */
2565 byte* ptr;
2566 byte* field_end;
2567 byte* pad_ptr;
2569 ut_ad(len != UNIV_SQL_NULL);
2570 UNIV_MEM_ASSERT_RW(data, len);
2571 UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
2572 UNIV_MEM_INVALID(dest, templ->mysql_col_len);
2574 switch (templ->type) {
2575 case DATA_INT:
2576 /* Convert integer data from Innobase to a little-endian
2577 format, sign bit restored to normal */
2579 ptr = dest + len;
2581 for (;;) {
2582 ptr--;
2583 *ptr = *data;
2584 if (ptr == dest) {
2585 break;
2587 data++;
2590 if (!templ->is_unsigned) {
2591 dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2594 ut_ad(templ->mysql_col_len == len);
2595 break;
2597 case DATA_VARCHAR:
2598 case DATA_VARMYSQL:
2599 case DATA_BINARY:
2600 field_end = dest + templ->mysql_col_len;
2602 if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2603 /* This is a >= 5.0.3 type true VARCHAR. Store the
2604 length of the data to the first byte or the first
2605 two bytes of dest. */
2607 dest = row_mysql_store_true_var_len(
2608 dest, len, templ->mysql_length_bytes);
2609 /* Copy the actual data. Leave the rest of the
2610 buffer uninitialized. */
2611 memcpy(dest, data, len);
2612 break;
2615 /* Copy the actual data */
2616 ut_memcpy(dest, data, len);
2618 /* Pad with trailing spaces. */
2620 pad_ptr = dest + len;
2622 ut_ad(templ->mbminlen <= templ->mbmaxlen);
2624 /* We handle UCS2 charset strings differently. */
2625 if (templ->mbminlen == 2) {
2626 /* A space char is two bytes, 0x0020 in UCS2 */
2628 if (len & 1) {
2629 /* A 0x20 has been stripped from the column.
2630 Pad it back. */
2632 if (pad_ptr < field_end) {
2633 *pad_ptr = 0x20;
2634 pad_ptr++;
2638 /* Pad the rest of the string with 0x0020 */
2640 while (pad_ptr < field_end) {
2641 *pad_ptr = 0x00;
2642 pad_ptr++;
2643 *pad_ptr = 0x20;
2644 pad_ptr++;
2646 } else {
2647 ut_ad(templ->mbminlen == 1);
2648 /* space=0x20 */
2650 memset(pad_ptr, 0x20, field_end - pad_ptr);
2652 break;
2654 case DATA_BLOB:
2655 /* Store a pointer to the BLOB buffer to dest: the BLOB was
2656 already copied to the buffer in row_sel_store_mysql_rec */
2658 row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2659 len);
2660 break;
2662 case DATA_MYSQL:
2663 memcpy(dest, data, len);
2665 ut_ad(templ->mysql_col_len >= len);
2666 ut_ad(templ->mbmaxlen >= templ->mbminlen);
2668 ut_ad(templ->mbmaxlen > templ->mbminlen
2669 || templ->mysql_col_len == len);
2670 /* The following assertion would fail for old tables
2671 containing UTF-8 ENUM columns due to Bug #9526. */
2672 ut_ad(!templ->mbmaxlen
2673 || !(templ->mysql_col_len % templ->mbmaxlen));
2674 ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
2676 if (templ->mbminlen != templ->mbmaxlen) {
2677 /* Pad with spaces. This undoes the stripping
2678 done in row0mysql.ic, function
2679 row_mysql_store_col_in_innobase_format(). */
2681 memset(dest + len, 0x20, templ->mysql_col_len - len);
2683 break;
2685 default:
2686 #ifdef UNIV_DEBUG
2687 case DATA_SYS_CHILD:
2688 case DATA_SYS:
2689 /* These column types should never be shipped to MySQL. */
2690 ut_ad(0);
2692 case DATA_CHAR:
2693 case DATA_FIXBINARY:
2694 case DATA_FLOAT:
2695 case DATA_DOUBLE:
2696 case DATA_DECIMAL:
2697 /* Above are the valid column types for MySQL data. */
2698 #endif /* UNIV_DEBUG */
2699 ut_ad(templ->mysql_col_len == len);
2700 memcpy(dest, data, len);
2704 /**************************************************************//**
2705 Convert a row in the Innobase format to a row in the MySQL format.
2706 Note that the template in prebuilt may advise us to copy only a few
2707 columns to mysql_rec, other columns are left blank. All columns may not
2708 be needed in the query.
2709 @return TRUE on success, FALSE if not all columns could be retrieved */
2710 static __attribute__((warn_unused_result))
2711 ibool
2712 row_sel_store_mysql_rec(
2713 /*====================*/
2714 byte* mysql_rec, /*!< out: row in the MySQL format */
2715 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2716 const rec_t* rec, /*!< in: Innobase record in the index
2717 which was described in prebuilt's
2718 template, or in the clustered index;
2719 must be protected by a page latch */
2720 ibool rec_clust, /*!< in: TRUE if rec is in the
2721 clustered index instead of
2722 prebuilt->index */
2723 const ulint* offsets) /*!< in: array returned by
2724 rec_get_offsets(rec) */
2726 mem_heap_t* extern_field_heap = NULL;
2727 mem_heap_t* heap;
2728 ulint i;
2730 ut_ad(prebuilt->mysql_template);
2731 ut_ad(prebuilt->default_rec);
2732 ut_ad(rec_offs_validate(rec, NULL, offsets));
2733 ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
2735 if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2736 mem_heap_free(prebuilt->blob_heap);
2737 prebuilt->blob_heap = NULL;
2740 for (i = 0; i < prebuilt->n_template; i++) {
2742 const mysql_row_templ_t*templ = prebuilt->mysql_template + i;
2743 const byte* data;
2744 ulint len;
2745 ulint field_no;
2747 field_no = rec_clust
2748 ? templ->clust_rec_field_no : templ->rec_field_no;
2750 if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
2752 /* Copy an externally stored field to the temporary
2753 heap */
2755 ut_a(!prebuilt->trx->has_search_latch);
2757 if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2758 if (prebuilt->blob_heap == NULL) {
2759 prebuilt->blob_heap = mem_heap_create(
2760 UNIV_PAGE_SIZE);
2763 heap = prebuilt->blob_heap;
2764 } else {
2765 extern_field_heap
2766 = mem_heap_create(UNIV_PAGE_SIZE);
2768 heap = extern_field_heap;
2771 /* NOTE: if we are retrieving a big BLOB, we may
2772 already run out of memory in the next call, which
2773 causes an assert */
2775 data = btr_rec_copy_externally_stored_field(
2776 rec, offsets,
2777 dict_table_zip_size(prebuilt->table),
2778 field_no, &len, heap);
2780 if (UNIV_UNLIKELY(!data)) {
2781 /* The externally stored field
2782 was not written yet. This
2783 record should only be seen by
2784 recv_recovery_rollback_active()
2785 or any TRX_ISO_READ_UNCOMMITTED
2786 transactions. */
2788 if (extern_field_heap) {
2789 mem_heap_free(extern_field_heap);
2792 return(FALSE);
2795 ut_a(len != UNIV_SQL_NULL);
2796 } else {
2797 /* Field is stored in the row. */
2799 data = rec_get_nth_field(rec, offsets, field_no, &len);
2801 if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
2802 && len != UNIV_SQL_NULL) {
2804 /* It is a BLOB field locally stored in the
2805 InnoDB record: we MUST copy its contents to
2806 prebuilt->blob_heap here because later code
2807 assumes all BLOB values have been copied to a
2808 safe place. */
2810 if (prebuilt->blob_heap == NULL) {
2811 prebuilt->blob_heap = mem_heap_create(
2812 UNIV_PAGE_SIZE);
2815 data = memcpy(mem_heap_alloc(
2816 prebuilt->blob_heap, len),
2817 data, len);
2821 if (len != UNIV_SQL_NULL) {
2822 row_sel_field_store_in_mysql_format(
2823 mysql_rec + templ->mysql_col_offset,
2824 templ, data, len);
2826 /* Cleanup */
2827 if (extern_field_heap) {
2828 mem_heap_free(extern_field_heap);
2829 extern_field_heap = NULL;
2832 if (templ->mysql_null_bit_mask) {
2833 /* It is a nullable column with a non-NULL
2834 value */
2835 mysql_rec[templ->mysql_null_byte_offset]
2836 &= ~(byte) templ->mysql_null_bit_mask;
2838 } else {
2839 /* MySQL assumes that the field for an SQL
2840 NULL value is set to the default value. */
2842 UNIV_MEM_ASSERT_RW(prebuilt->default_rec
2843 + templ->mysql_col_offset,
2844 templ->mysql_col_len);
2845 mysql_rec[templ->mysql_null_byte_offset]
2846 |= (byte) templ->mysql_null_bit_mask;
2847 memcpy(mysql_rec + templ->mysql_col_offset,
2848 (const byte*) prebuilt->default_rec
2849 + templ->mysql_col_offset,
2850 templ->mysql_col_len);
2854 return(TRUE);
2857 /*********************************************************************//**
2858 Builds a previous version of a clustered index record for a consistent read
2859 @return DB_SUCCESS or error code */
2860 static
2861 ulint
2862 row_sel_build_prev_vers_for_mysql(
2863 /*==============================*/
2864 read_view_t* read_view, /*!< in: read view */
2865 dict_index_t* clust_index, /*!< in: clustered index */
2866 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2867 const rec_t* rec, /*!< in: record in a clustered index */
2868 ulint** offsets, /*!< in/out: offsets returned by
2869 rec_get_offsets(rec, clust_index) */
2870 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
2871 the offsets are allocated */
2872 rec_t** old_vers, /*!< out: old version, or NULL if the
2873 record does not exist in the view:
2874 i.e., it was freshly inserted
2875 afterwards */
2876 mtr_t* mtr) /*!< in: mtr */
2878 ulint err;
2880 if (prebuilt->old_vers_heap) {
2881 mem_heap_empty(prebuilt->old_vers_heap);
2882 } else {
2883 prebuilt->old_vers_heap = mem_heap_create(200);
2886 err = row_vers_build_for_consistent_read(
2887 rec, mtr, clust_index, offsets, read_view, offset_heap,
2888 prebuilt->old_vers_heap, old_vers);
2889 return(err);
2892 /*********************************************************************//**
2893 Retrieves the clustered index record corresponding to a record in a
2894 non-clustered index. Does the necessary locking. Used in the MySQL
2895 interface.
2896 @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
2897 static
2898 enum db_err
2899 row_sel_get_clust_rec_for_mysql(
2900 /*============================*/
2901 row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
2902 dict_index_t* sec_index,/*!< in: secondary index where rec resides */
2903 const rec_t* rec, /*!< in: record in a non-clustered index; if
2904 this is a locking read, then rec is not
2905 allowed to be delete-marked, and that would
2906 not make sense either */
2907 que_thr_t* thr, /*!< in: query thread */
2908 const rec_t** out_rec,/*!< out: clustered record or an old version of
2909 it, NULL if the old version did not exist
2910 in the read view, i.e., it was a fresh
2911 inserted version */
2912 ulint** offsets,/*!< in: offsets returned by
2913 rec_get_offsets(rec, sec_index);
2914 out: offsets returned by
2915 rec_get_offsets(out_rec, clust_index) */
2916 mem_heap_t** offset_heap,/*!< in/out: memory heap from which
2917 the offsets are allocated */
2918 mtr_t* mtr) /*!< in: mtr used to get access to the
2919 non-clustered record; the same mtr is used to
2920 access the clustered index */
2922 dict_index_t* clust_index;
2923 const rec_t* clust_rec;
2924 rec_t* old_vers;
2925 enum db_err err;
2926 trx_t* trx;
2928 *out_rec = NULL;
2929 trx = thr_get_trx(thr);
2931 row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
2932 sec_index, *offsets, trx);
2934 clust_index = dict_table_get_first_index(sec_index->table);
2936 btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
2937 PAGE_CUR_LE, BTR_SEARCH_LEAF,
2938 prebuilt->clust_pcur, 0, mtr);
2940 clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
2942 prebuilt->clust_pcur->trx_if_known = trx;
2944 /* Note: only if the search ends up on a non-infimum record is the
2945 low_match value the real match to the search tuple */
2947 if (!page_rec_is_user_rec(clust_rec)
2948 || btr_pcur_get_low_match(prebuilt->clust_pcur)
2949 < dict_index_get_n_unique(clust_index)) {
2951 /* In a rare case it is possible that no clust rec is found
2952 for a delete-marked secondary index record: if in row0umod.c
2953 in row_undo_mod_remove_clust_low() we have already removed
2954 the clust rec, while purge is still cleaning and removing
2955 secondary index records associated with earlier versions of
2956 the clustered index record. In that case we know that the
2957 clustered index record did not exist in the read view of
2958 trx. */
2960 if (!rec_get_deleted_flag(rec,
2961 dict_table_is_comp(sec_index->table))
2962 || prebuilt->select_lock_type != LOCK_NONE) {
2963 ut_print_timestamp(stderr);
2964 fputs(" InnoDB: error clustered record"
2965 " for sec rec not found\n"
2966 "InnoDB: ", stderr);
2967 dict_index_name_print(stderr, trx, sec_index);
2968 fputs("\n"
2969 "InnoDB: sec index record ", stderr);
2970 rec_print(stderr, rec, sec_index);
2971 fputs("\n"
2972 "InnoDB: clust index record ", stderr);
2973 rec_print(stderr, clust_rec, clust_index);
2974 putc('\n', stderr);
2975 trx_print(stderr, trx, 600);
2977 fputs("\n"
2978 "InnoDB: Submit a detailed bug report"
2979 " to http://bugs.mysql.com\n", stderr);
2982 clust_rec = NULL;
2984 err = DB_SUCCESS;
2985 goto func_exit;
2988 *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
2989 ULINT_UNDEFINED, offset_heap);
2991 if (prebuilt->select_lock_type != LOCK_NONE) {
2992 /* Try to place a lock on the index record; we are searching
2993 the clust rec with a unique condition, hence
2994 we set a LOCK_REC_NOT_GAP type lock */
2996 err = lock_clust_rec_read_check_and_lock(
2997 0, btr_pcur_get_block(prebuilt->clust_pcur),
2998 clust_rec, clust_index, *offsets,
2999 prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
3000 switch (err) {
3001 case DB_SUCCESS:
3002 case DB_SUCCESS_LOCKED_REC:
3003 break;
3004 default:
3005 goto err_exit;
3007 } else {
3008 /* This is a non-locking consistent read: if necessary, fetch
3009 a previous version of the record */
3011 old_vers = NULL;
3013 /* If the isolation level allows reading of uncommitted data,
3014 then we never look for an earlier version */
3016 if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3017 && !lock_clust_rec_cons_read_sees(
3018 clust_rec, clust_index, *offsets,
3019 trx->read_view)) {
3021 /* The following call returns 'offsets' associated with
3022 'old_vers' */
3023 err = row_sel_build_prev_vers_for_mysql(
3024 trx->read_view, clust_index, prebuilt,
3025 clust_rec, offsets, offset_heap, &old_vers,
3026 mtr);
3028 if (err != DB_SUCCESS || old_vers == NULL) {
3030 goto err_exit;
3033 clust_rec = old_vers;
3036 /* If we had to go to an earlier version of row or the
3037 secondary index record is delete marked, then it may be that
3038 the secondary index record corresponding to clust_rec
3039 (or old_vers) is not rec; in that case we must ignore
3040 such row because in our snapshot rec would not have existed.
3041 Remember that from rec we cannot see directly which transaction
3042 id corresponds to it: we have to go to the clustered index
3043 record. A query where we want to fetch all rows where
3044 the secondary index value is in some interval would return
3045 a wrong result if we would not drop rows which we come to
3046 visit through secondary index records that would not really
3047 exist in our snapshot. */
3049 if (clust_rec
3050 && (old_vers
3051 || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
3052 || rec_get_deleted_flag(rec, dict_table_is_comp(
3053 sec_index->table)))
3054 && !row_sel_sec_rec_is_for_clust_rec(
3055 rec, sec_index, clust_rec, clust_index)) {
3056 clust_rec = NULL;
3057 #ifdef UNIV_SEARCH_DEBUG
3058 } else {
3059 ut_a(clust_rec == NULL
3060 || row_sel_sec_rec_is_for_clust_rec(
3061 rec, sec_index, clust_rec, clust_index));
3062 #endif
3065 err = DB_SUCCESS;
3068 func_exit:
3069 *out_rec = clust_rec;
3071 if (prebuilt->select_lock_type != LOCK_NONE) {
3072 /* We may use the cursor in update or in unlock_row():
3073 store its position */
3075 btr_pcur_store_position(prebuilt->clust_pcur, mtr);
3078 err_exit:
3079 return(err);
3082 /********************************************************************//**
3083 Restores cursor position after it has been stored. We have to take into
3084 account that the record cursor was positioned on may have been deleted.
3085 Then we may have to move the cursor one step up or down.
3086 @return TRUE if we may need to process the record the cursor is now
3087 positioned on (i.e. we should not go to the next record yet) */
3088 static
3089 ibool
3090 sel_restore_position_for_mysql(
3091 /*===========================*/
3092 ibool* same_user_rec, /*!< out: TRUE if we were able to restore
3093 the cursor on a user record with the
3094 same ordering prefix in in the
3095 B-tree index */
3096 ulint latch_mode, /*!< in: latch mode wished in
3097 restoration */
3098 btr_pcur_t* pcur, /*!< in: cursor whose position
3099 has been stored */
3100 ibool moves_up, /*!< in: TRUE if the cursor moves up
3101 in the index */
3102 mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
3103 mtr temporarily! */
3105 ibool success;
3106 ulint relative_position;
3108 relative_position = pcur->rel_pos;
3110 success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3112 *same_user_rec = success;
3114 if (relative_position == BTR_PCUR_ON) {
3115 if (success) {
3116 return(FALSE);
3119 if (moves_up) {
3120 btr_pcur_move_to_next(pcur, mtr);
3123 return(TRUE);
3126 if (relative_position == BTR_PCUR_AFTER
3127 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3129 if (moves_up) {
3130 return(TRUE);
3133 if (btr_pcur_is_on_user_rec(pcur)) {
3134 btr_pcur_move_to_prev(pcur, mtr);
3137 return(TRUE);
3140 ut_ad(relative_position == BTR_PCUR_BEFORE
3141 || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3143 if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3144 btr_pcur_move_to_next(pcur, mtr);
3147 return(TRUE);
3150 /********************************************************************//**
3151 Copies a cached field for MySQL from the fetch cache. */
3152 static
3153 void
3154 row_sel_copy_cached_field_for_mysql(
3155 /*================================*/
3156 byte* buf, /*!< in/out: row buffer */
3157 const byte* cache, /*!< in: cached row */
3158 const mysql_row_templ_t*templ) /*!< in: column template */
3160 ulint len;
3162 buf += templ->mysql_col_offset;
3163 cache += templ->mysql_col_offset;
3165 UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
3167 if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
3168 && templ->type != DATA_INT) {
3169 /* Check for != DATA_INT to make sure we do
3170 not treat MySQL ENUM or SET as a true VARCHAR!
3171 Find the actual length of the true VARCHAR field. */
3172 row_mysql_read_true_varchar(
3173 &len, cache, templ->mysql_length_bytes);
3174 len += templ->mysql_length_bytes;
3175 UNIV_MEM_INVALID(buf, templ->mysql_col_len);
3176 } else {
3177 len = templ->mysql_col_len;
3180 ut_memcpy(buf, cache, len);
3183 /********************************************************************//**
3184 Pops a cached row for MySQL from the fetch cache. */
3185 UNIV_INLINE
3186 void
3187 row_sel_pop_cached_row_for_mysql(
3188 /*=============================*/
3189 byte* buf, /*!< in/out: buffer where to copy the
3190 row */
3191 row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
3193 ulint i;
3194 const mysql_row_templ_t*templ;
3195 const byte* cached_rec;
3196 ut_ad(prebuilt->n_fetch_cached > 0);
3197 ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3199 UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
3201 cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
3203 if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3204 /* Copy cache record field by field, don't touch fields that
3205 are not covered by current key */
3207 for (i = 0; i < prebuilt->n_template; i++) {
3208 templ = prebuilt->mysql_template + i;
3209 row_sel_copy_cached_field_for_mysql(
3210 buf, cached_rec, templ);
3211 /* Copy NULL bit of the current field from cached_rec
3212 to buf */
3213 if (templ->mysql_null_bit_mask) {
3214 buf[templ->mysql_null_byte_offset]
3215 ^= (buf[templ->mysql_null_byte_offset]
3216 ^ cached_rec[templ->mysql_null_byte_offset])
3217 & (byte)templ->mysql_null_bit_mask;
3220 } else if (prebuilt->mysql_prefix_len > 63) {
3221 /* The record is long. Copy it field by field, in case
3222 there are some long VARCHAR column of which only a
3223 small length is being used. */
3224 UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
3226 /* First copy the NULL bits. */
3227 ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
3228 /* Then copy the requested fields. */
3230 for (i = 0; i < prebuilt->n_template; i++) {
3231 row_sel_copy_cached_field_for_mysql(
3232 buf, cached_rec, prebuilt->mysql_template + i);
3234 } else {
3235 ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
3238 prebuilt->n_fetch_cached--;
3239 prebuilt->fetch_cache_first++;
3241 if (prebuilt->n_fetch_cached == 0) {
3242 prebuilt->fetch_cache_first = 0;
3246 /********************************************************************//**
3247 Pushes a row for MySQL to the fetch cache.
3248 @return TRUE on success, FALSE if the record contains incomplete BLOBs */
3249 UNIV_INLINE __attribute__((warn_unused_result))
3250 ibool
3251 row_sel_push_cache_row_for_mysql(
3252 /*=============================*/
3253 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
3254 const rec_t* rec, /*!< in: record to push, in the index
3255 which was described in prebuilt's
3256 template, or in the clustered index;
3257 must be protected by a page latch */
3258 ibool rec_clust, /*!< in: TRUE if rec is in the
3259 clustered index instead of
3260 prebuilt->index */
3261 const ulint* offsets) /*!< in: rec_get_offsets(rec) */
3263 byte* buf;
3264 ulint i;
3266 ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3267 ut_ad(rec_offs_validate(rec, NULL, offsets));
3268 ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
3269 ut_a(!prebuilt->templ_contains_blob);
3271 if (prebuilt->fetch_cache[0] == NULL) {
3272 /* Allocate memory for the fetch cache */
3274 for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
3276 /* A user has reported memory corruption in these
3277 buffers in Linux. Put magic numbers there to help
3278 to track a possible bug. */
3280 buf = mem_alloc(prebuilt->mysql_row_len + 8);
3282 prebuilt->fetch_cache[i] = buf + 4;
3284 mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
3285 mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
3286 ROW_PREBUILT_FETCH_MAGIC_N);
3290 ut_ad(prebuilt->fetch_cache_first == 0);
3291 UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3292 prebuilt->mysql_row_len);
3294 if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
3295 prebuilt->fetch_cache[
3296 prebuilt->n_fetch_cached],
3297 prebuilt, rec, rec_clust, offsets))) {
3298 return(FALSE);
3301 prebuilt->n_fetch_cached++;
3302 return(TRUE);
3305 /*********************************************************************//**
3306 Tries to do a shortcut to fetch a clustered index record with a unique key,
3307 using the hash index if possible (not always). We assume that the search
3308 mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
3309 btr search latch has been locked in S-mode.
3310 @return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3311 static
3312 ulint
3313 row_sel_try_search_shortcut_for_mysql(
3314 /*==================================*/
3315 const rec_t** out_rec,/*!< out: record if found */
3316 row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
3317 ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
3318 mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
3319 mtr_t* mtr) /*!< in: started mtr */
3321 dict_index_t* index = prebuilt->index;
3322 const dtuple_t* search_tuple = prebuilt->search_tuple;
3323 btr_pcur_t* pcur = prebuilt->pcur;
3324 trx_t* trx = prebuilt->trx;
3325 const rec_t* rec;
3327 ut_ad(dict_index_is_clust(index));
3328 ut_ad(!prebuilt->templ_contains_blob);
3330 #ifndef UNIV_SEARCH_DEBUG
3331 btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3332 BTR_SEARCH_LEAF, pcur,
3333 RW_S_LATCH,
3334 mtr);
3335 #else /* UNIV_SEARCH_DEBUG */
3336 btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3337 BTR_SEARCH_LEAF, pcur,
3339 mtr);
3340 #endif /* UNIV_SEARCH_DEBUG */
3341 rec = btr_pcur_get_rec(pcur);
3343 if (!page_rec_is_user_rec(rec)) {
3345 return(SEL_RETRY);
3348 /* As the cursor is now placed on a user record after a search with
3349 the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3350 fields in the user record matched to the search tuple */
3352 if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3354 return(SEL_EXHAUSTED);
3357 /* This is a non-locking consistent read: if necessary, fetch
3358 a previous version of the record */
3360 *offsets = rec_get_offsets(rec, index, *offsets,
3361 ULINT_UNDEFINED, heap);
3363 if (!lock_clust_rec_cons_read_sees(rec, index,
3364 *offsets, trx->read_view)) {
3366 return(SEL_RETRY);
3369 if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3371 return(SEL_EXHAUSTED);
3374 *out_rec = rec;
3376 return(SEL_FOUND);
3379 /********************************************************************//**
3380 Searches for rows in the database. This is used in the interface to
3381 MySQL. This function opens a cursor, and also implements fetch next
3382 and fetch prev. NOTE that if we do a search with a full key value
3383 from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3384 position and fetch next or fetch prev must not be tried to the cursor!
3385 @return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3386 DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3387 UNIV_INTERN
3388 ulint
3389 row_search_for_mysql(
3390 /*=================*/
3391 byte* buf, /*!< in/out: buffer for the fetched
3392 row in the MySQL format */
3393 ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
3394 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
3395 table handle; this contains the info
3396 of search_tuple, index; if search
3397 tuple contains 0 fields then we
3398 position the cursor at the start or
3399 the end of the index, depending on
3400 'mode' */
3401 ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
3402 ROW_SEL_EXACT_PREFIX */
3403 ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
3404 ROW_SEL_PREV; NOTE: if this is != 0,
3405 then prebuilt must have a pcur
3406 with stored position! In opening of a
3407 cursor 'direction' should be 0. */
3409 dict_index_t* index = prebuilt->index;
3410 ibool comp = dict_table_is_comp(index->table);
3411 const dtuple_t* search_tuple = prebuilt->search_tuple;
3412 btr_pcur_t* pcur = prebuilt->pcur;
3413 trx_t* trx = prebuilt->trx;
3414 dict_index_t* clust_index;
3415 que_thr_t* thr;
3416 const rec_t* rec;
3417 const rec_t* result_rec;
3418 const rec_t* clust_rec;
3419 ulint err = DB_SUCCESS;
3420 ibool unique_search = FALSE;
3421 ibool unique_search_from_clust_index = FALSE;
3422 ibool mtr_has_extra_clust_latch = FALSE;
3423 ibool moves_up = FALSE;
3424 ibool set_also_gap_locks = TRUE;
3425 /* if the query is a plain locking SELECT, and the isolation level
3426 is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3427 ibool did_semi_consistent_read = FALSE;
3428 /* if the returned record was locked and we did a semi-consistent
3429 read (fetch the newest committed version), then this is set to
3430 TRUE */
3431 #ifdef UNIV_SEARCH_DEBUG
3432 ulint cnt = 0;
3433 #endif /* UNIV_SEARCH_DEBUG */
3434 ulint next_offs;
3435 ibool same_user_rec;
3436 mtr_t mtr;
3437 mem_heap_t* heap = NULL;
3438 ulint offsets_[REC_OFFS_NORMAL_SIZE];
3439 ulint* offsets = offsets_;
3440 ibool table_lock_waited = FALSE;
3442 rec_offs_init(offsets_);
3444 ut_ad(index && pcur && search_tuple);
3445 ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3447 if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3448 ut_print_timestamp(stderr);
3449 fprintf(stderr, " InnoDB: Error:\n"
3450 "InnoDB: MySQL is trying to use a table handle"
3451 " but the .ibd file for\n"
3452 "InnoDB: table %s does not exist.\n"
3453 "InnoDB: Have you deleted the .ibd file"
3454 " from the database directory under\n"
3455 "InnoDB: the MySQL datadir, or have you used"
3456 " DISCARD TABLESPACE?\n"
3457 "InnoDB: Look from\n"
3458 "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
3459 "InnoDB: how you can resolve the problem.\n",
3460 prebuilt->table->name);
3462 return(DB_ERROR);
3465 if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
3467 return(DB_MISSING_HISTORY);
3470 if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3471 fprintf(stderr,
3472 "InnoDB: Error: trying to free a corrupt\n"
3473 "InnoDB: table handle. Magic n %lu, table name ",
3474 (ulong) prebuilt->magic_n);
3475 ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3476 putc('\n', stderr);
3478 mem_analyze_corruption(prebuilt);
3480 ut_error;
3483 #if 0
3484 /* August 19, 2005 by Heikki: temporarily disable this error
3485 print until the cursor lock count is done correctly.
3486 See bugs #12263 and #12456!*/
3488 if (trx->n_mysql_tables_in_use == 0
3489 && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3490 /* Note that if MySQL uses an InnoDB temp table that it
3491 created inside LOCK TABLES, then n_mysql_tables_in_use can
3492 be zero; in that case select_lock_type is set to LOCK_X in
3493 ::start_stmt. */
3495 fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3496 "InnoDB: but it has not locked"
3497 " any tables in ::external_lock()!\n",
3498 stderr);
3499 trx_print(stderr, trx, 600);
3500 fputc('\n', stderr);
3502 #endif
3504 #if 0
3505 fprintf(stderr, "Match mode %lu\n search tuple ",
3506 (ulong) match_mode);
3507 dtuple_print(search_tuple);
3508 fprintf(stderr, "N tables locked %lu\n",
3509 (ulong) trx->mysql_n_tables_locked);
3510 #endif
3511 /*-------------------------------------------------------------*/
3512 /* PHASE 0: Release a possible s-latch we are holding on the
3513 adaptive hash index latch if there is someone waiting behind */
3515 if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3516 && trx->has_search_latch) {
3518 /* There is an x-latch request on the adaptive hash index:
3519 release the s-latch to reduce starvation and wait for
3520 BTR_SEA_TIMEOUT rounds before trying to keep it again over
3521 calls from MySQL */
3523 rw_lock_s_unlock(&btr_search_latch);
3524 trx->has_search_latch = FALSE;
3526 trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3529 /* Reset the new record lock info if srv_locks_unsafe_for_binlog
3530 is set or session is using a READ COMMITTED isolation level. Then
3531 we are able to remove the record locks set here on an individual
3532 row. */
3533 prebuilt->new_rec_locks = 0;
3535 /*-------------------------------------------------------------*/
3536 /* PHASE 1: Try to pop the row from the prefetch cache */
3538 if (UNIV_UNLIKELY(direction == 0)) {
3539 trx->op_info = "starting index read";
3541 prebuilt->n_rows_fetched = 0;
3542 prebuilt->n_fetch_cached = 0;
3543 prebuilt->fetch_cache_first = 0;
3545 if (prebuilt->sel_graph == NULL) {
3546 /* Build a dummy select query graph */
3547 row_prebuild_sel_graph(prebuilt);
3549 } else {
3550 trx->op_info = "fetching rows";
3552 if (prebuilt->n_rows_fetched == 0) {
3553 prebuilt->fetch_direction = direction;
3556 if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3557 if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3558 ut_error;
3559 /* TODO: scrollable cursor: restore cursor to
3560 the place of the latest returned row,
3561 or better: prevent caching for a scroll
3562 cursor! */
3565 prebuilt->n_rows_fetched = 0;
3566 prebuilt->n_fetch_cached = 0;
3567 prebuilt->fetch_cache_first = 0;
3569 } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3570 row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3572 prebuilt->n_rows_fetched++;
3574 srv_n_rows_read++;
3575 err = DB_SUCCESS;
3576 goto func_exit;
3579 if (prebuilt->fetch_cache_first > 0
3580 && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3582 /* The previous returned row was popped from the fetch
3583 cache, but the cache was not full at the time of the
3584 popping: no more rows can exist in the result set */
3586 err = DB_RECORD_NOT_FOUND;
3587 goto func_exit;
3590 prebuilt->n_rows_fetched++;
3592 if (prebuilt->n_rows_fetched > 1000000000) {
3593 /* Prevent wrap-over */
3594 prebuilt->n_rows_fetched = 500000000;
3597 mode = pcur->search_mode;
3600 /* In a search where at most one record in the index may match, we
3601 can use a LOCK_REC_NOT_GAP type record lock when locking a
3602 non-delete-marked matching record.
3604 Note that in a unique secondary index there may be different
3605 delete-marked versions of a record where only the primary key
3606 values differ: thus in a secondary index we must use next-key
3607 locks when locking delete-marked records. */
3609 if (match_mode == ROW_SEL_EXACT
3610 && dict_index_is_unique(index)
3611 && dtuple_get_n_fields(search_tuple)
3612 == dict_index_get_n_unique(index)
3613 && (dict_index_is_clust(index)
3614 || !dtuple_contains_null(search_tuple))) {
3616 /* Note above that a UNIQUE secondary index can contain many
3617 rows with the same key value if one of the columns is the SQL
3618 null. A clustered index under MySQL can never contain null
3619 columns because we demand that all the columns in primary key
3620 are non-null. */
3622 unique_search = TRUE;
3624 /* Even if the condition is unique, MySQL seems to try to
3625 retrieve also a second row if a primary key contains more than
3626 1 column. Return immediately if this is not a HANDLER
3627 command. */
3629 if (UNIV_UNLIKELY(direction != 0
3630 && !prebuilt->used_in_HANDLER)) {
3632 err = DB_RECORD_NOT_FOUND;
3633 goto func_exit;
3637 mtr_start(&mtr);
3639 /*-------------------------------------------------------------*/
3640 /* PHASE 2: Try fast adaptive hash index search if possible */
3642 /* Next test if this is the special case where we can use the fast
3643 adaptive hash index to try the search. Since we must release the
3644 search system latch when we retrieve an externally stored field, we
3645 cannot use the adaptive hash index in a search in the case the row
3646 may be long and there may be externally stored fields */
3648 if (UNIV_UNLIKELY(direction == 0)
3649 && unique_search
3650 && dict_index_is_clust(index)
3651 && !prebuilt->templ_contains_blob
3652 && !prebuilt->used_in_HANDLER
3653 && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3655 mode = PAGE_CUR_GE;
3657 unique_search_from_clust_index = TRUE;
3659 if (trx->mysql_n_tables_locked == 0
3660 && prebuilt->select_lock_type == LOCK_NONE
3661 && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3662 && trx->read_view) {
3664 /* This is a SELECT query done as a consistent read,
3665 and the read view has already been allocated:
3666 let us try a search shortcut through the hash
3667 index.
3668 NOTE that we must also test that
3669 mysql_n_tables_locked == 0, because this might
3670 also be INSERT INTO ... SELECT ... or
3671 CREATE TABLE ... SELECT ... . Our algorithm is
3672 NOT prepared for inserts interleaved with the SELECT,
3673 and if we try that, we can deadlock on the adaptive
3674 hash index semaphore! */
3676 #ifndef UNIV_SEARCH_DEBUG
3677 if (!trx->has_search_latch) {
3678 rw_lock_s_lock(&btr_search_latch);
3679 trx->has_search_latch = TRUE;
3681 #endif
3682 switch (row_sel_try_search_shortcut_for_mysql(
3683 &rec, prebuilt, &offsets, &heap,
3684 &mtr)) {
3685 case SEL_FOUND:
3686 #ifdef UNIV_SEARCH_DEBUG
3687 ut_a(0 == cmp_dtuple_rec(search_tuple,
3688 rec, offsets));
3689 #endif
3690 /* At this point, rec is protected by
3691 a page latch that was acquired by
3692 row_sel_try_search_shortcut_for_mysql().
3693 The latch will not be released until
3694 mtr_commit(&mtr). */
3695 ut_ad(!rec_get_deleted_flag(rec, comp));
3697 if (!row_sel_store_mysql_rec(buf, prebuilt,
3698 rec, FALSE,
3699 offsets)) {
3700 /* Only fresh inserts may contain
3701 incomplete externally stored
3702 columns. Pretend that such
3703 records do not exist. Such
3704 records may only be accessed
3705 at the READ UNCOMMITTED
3706 isolation level or when
3707 rolling back a recovered
3708 transaction. Rollback happens
3709 at a lower level, not here. */
3710 ut_a(trx->isolation_level
3711 == TRX_ISO_READ_UNCOMMITTED);
3713 /* Proceed as in case SEL_RETRY. */
3714 break;
3717 mtr_commit(&mtr);
3719 /* ut_print_name(stderr, index->name);
3720 fputs(" shortcut\n", stderr); */
3722 srv_n_rows_read++;
3724 err = DB_SUCCESS;
3725 goto release_search_latch_if_needed;
3727 case SEL_EXHAUSTED:
3728 mtr_commit(&mtr);
3730 /* ut_print_name(stderr, index->name);
3731 fputs(" record not found 2\n", stderr); */
3733 err = DB_RECORD_NOT_FOUND;
3734 release_search_latch_if_needed:
3735 if (trx->search_latch_timeout > 0
3736 && trx->has_search_latch) {
3738 trx->search_latch_timeout--;
3740 rw_lock_s_unlock(&btr_search_latch);
3741 trx->has_search_latch = FALSE;
3744 /* NOTE that we do NOT store the cursor
3745 position */
3746 goto func_exit;
3748 case SEL_RETRY:
3749 break;
3751 default:
3752 ut_ad(0);
3755 mtr_commit(&mtr);
3756 mtr_start(&mtr);
3760 /*-------------------------------------------------------------*/
3761 /* PHASE 3: Open or restore index cursor position */
3763 if (trx->has_search_latch) {
3764 rw_lock_s_unlock(&btr_search_latch);
3765 trx->has_search_latch = FALSE;
3768 ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
3769 ut_ad(trx->conc_state == TRX_NOT_STARTED
3770 || trx->conc_state == TRX_ACTIVE);
3771 ut_ad(prebuilt->sql_stat_start
3772 || prebuilt->select_lock_type != LOCK_NONE
3773 || trx->read_view);
3775 trx_start_if_not_started(trx);
3777 if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3778 && prebuilt->select_lock_type != LOCK_NONE
3779 && trx->mysql_thd != NULL
3780 && thd_is_select(trx->mysql_thd)) {
3781 /* It is a plain locking SELECT and the isolation
3782 level is low: do not lock gaps */
3784 set_also_gap_locks = FALSE;
3787 /* Note that if the search mode was GE or G, then the cursor
3788 naturally moves upward (in fetch next) in alphabetical order,
3789 otherwise downward */
3791 if (UNIV_UNLIKELY(direction == 0)) {
3792 if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3793 moves_up = TRUE;
3795 } else if (direction == ROW_SEL_NEXT) {
3796 moves_up = TRUE;
3799 thr = que_fork_get_first_thr(prebuilt->sel_graph);
3801 que_thr_move_to_run_state_for_mysql(thr, trx);
3803 clust_index = dict_table_get_first_index(index->table);
3805 /* Do some start-of-statement preparations */
3807 if (!prebuilt->sql_stat_start) {
3808 /* No need to set an intention lock or assign a read view */
3810 if (trx->read_view == NULL
3811 && prebuilt->select_lock_type == LOCK_NONE) {
3813 fputs("InnoDB: Error: MySQL is trying to"
3814 " perform a consistent read\n"
3815 "InnoDB: but the read view is not assigned!\n",
3816 stderr);
3817 trx_print(stderr, trx, 600);
3818 fputc('\n', stderr);
3819 ut_error;
3821 } else if (prebuilt->select_lock_type == LOCK_NONE) {
3822 /* This is a consistent read */
3823 /* Assign a read view for the query */
3825 trx_assign_read_view(trx);
3826 prebuilt->sql_stat_start = FALSE;
3827 } else {
3828 wait_table_again:
3829 err = lock_table(0, index->table,
3830 prebuilt->select_lock_type == LOCK_S
3831 ? LOCK_IS : LOCK_IX, thr);
3833 if (err != DB_SUCCESS) {
3835 table_lock_waited = TRUE;
3836 goto lock_table_wait;
3838 prebuilt->sql_stat_start = FALSE;
3841 /* Open or restore index cursor position */
3843 if (UNIV_LIKELY(direction != 0)) {
3844 ibool need_to_process = sel_restore_position_for_mysql(
3845 &same_user_rec, BTR_SEARCH_LEAF,
3846 pcur, moves_up, &mtr);
3848 if (UNIV_UNLIKELY(need_to_process)) {
3849 if (UNIV_UNLIKELY(prebuilt->row_read_type
3850 == ROW_READ_DID_SEMI_CONSISTENT)) {
3851 /* We did a semi-consistent read,
3852 but the record was removed in
3853 the meantime. */
3854 prebuilt->row_read_type
3855 = ROW_READ_TRY_SEMI_CONSISTENT;
3857 } else if (UNIV_LIKELY(prebuilt->row_read_type
3858 != ROW_READ_DID_SEMI_CONSISTENT)) {
3860 /* The cursor was positioned on the record
3861 that we returned previously. If we need
3862 to repeat a semi-consistent read as a
3863 pessimistic locking read, the record
3864 cannot be skipped. */
3866 goto next_rec;
3869 } else if (dtuple_get_n_fields(search_tuple) > 0) {
3871 btr_pcur_open_with_no_init(index, search_tuple, mode,
3872 BTR_SEARCH_LEAF,
3873 pcur, 0, &mtr);
3875 pcur->trx_if_known = trx;
3877 rec = btr_pcur_get_rec(pcur);
3879 if (!moves_up
3880 && !page_rec_is_supremum(rec)
3881 && set_also_gap_locks
3882 && !(srv_locks_unsafe_for_binlog
3883 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3884 && prebuilt->select_lock_type != LOCK_NONE) {
3886 /* Try to place a gap lock on the next index record
3887 to prevent phantoms in ORDER BY ... DESC queries */
3888 const rec_t* next = page_rec_get_next_const(rec);
3890 offsets = rec_get_offsets(next, index, offsets,
3891 ULINT_UNDEFINED, &heap);
3892 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3893 next, index, offsets,
3894 prebuilt->select_lock_type,
3895 LOCK_GAP, thr);
3897 switch (err) {
3898 case DB_SUCCESS_LOCKED_REC:
3899 err = DB_SUCCESS;
3900 case DB_SUCCESS:
3901 break;
3902 default:
3903 goto lock_wait_or_error;
3906 } else {
3907 if (mode == PAGE_CUR_G) {
3908 btr_pcur_open_at_index_side(
3909 TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3910 &mtr);
3911 } else if (mode == PAGE_CUR_L) {
3912 btr_pcur_open_at_index_side(
3913 FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3914 &mtr);
3918 rec_loop:
3919 DEBUG_SYNC_C("row_search_rec_loop");
3920 if (trx_is_interrupted(trx)) {
3921 btr_pcur_store_position(pcur, &mtr);
3922 err = DB_INTERRUPTED;
3923 goto normal_return;
3926 /*-------------------------------------------------------------*/
3927 /* PHASE 4: Look for matching records in a loop */
3929 rec = btr_pcur_get_rec(pcur);
3930 ut_ad(!!page_rec_is_comp(rec) == comp);
3931 #ifdef UNIV_SEARCH_DEBUG
3933 fputs("Using ", stderr);
3934 dict_index_name_print(stderr, index);
3935 fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3936 page_get_page_no(page_align(rec)));
3937 rec_print(rec);
3939 #endif /* UNIV_SEARCH_DEBUG */
3941 if (page_rec_is_infimum(rec)) {
3943 /* The infimum record on a page cannot be in the result set,
3944 and neither can a record lock be placed on it: we skip such
3945 a record. */
3947 goto next_rec;
3950 if (page_rec_is_supremum(rec)) {
3952 if (set_also_gap_locks
3953 && !(srv_locks_unsafe_for_binlog
3954 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3955 && prebuilt->select_lock_type != LOCK_NONE) {
3957 /* Try to place a lock on the index record */
3959 /* If innodb_locks_unsafe_for_binlog option is used
3960 or this session is using a READ COMMITTED isolation
3961 level we do not lock gaps. Supremum record is really
3962 a gap and therefore we do not set locks there. */
3964 offsets = rec_get_offsets(rec, index, offsets,
3965 ULINT_UNDEFINED, &heap);
3966 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3967 rec, index, offsets,
3968 prebuilt->select_lock_type,
3969 LOCK_ORDINARY, thr);
3971 switch (err) {
3972 case DB_SUCCESS_LOCKED_REC:
3973 err = DB_SUCCESS;
3974 case DB_SUCCESS:
3975 break;
3976 default:
3977 goto lock_wait_or_error;
3980 /* A page supremum record cannot be in the result set: skip
3981 it now that we have placed a possible lock on it */
3983 goto next_rec;
3986 /*-------------------------------------------------------------*/
3987 /* Do sanity checks in case our cursor has bumped into page
3988 corruption */
3990 if (comp) {
3991 next_offs = rec_get_next_offs(rec, TRUE);
3992 if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3994 goto wrong_offs;
3996 } else {
3997 next_offs = rec_get_next_offs(rec, FALSE);
3998 if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
4000 goto wrong_offs;
4004 if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
4006 wrong_offs:
4007 if (srv_force_recovery == 0 || moves_up == FALSE) {
4008 ut_print_timestamp(stderr);
4009 buf_page_print(page_align(rec), 0);
4010 fprintf(stderr,
4011 "\nInnoDB: rec address %p,"
4012 " buf block fix count %lu\n",
4013 (void*) rec, (ulong)
4014 btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
4015 ->page.buf_fix_count);
4016 fprintf(stderr,
4017 "InnoDB: Index corruption: rec offs %lu"
4018 " next offs %lu, page no %lu,\n"
4019 "InnoDB: ",
4020 (ulong) page_offset(rec),
4021 (ulong) next_offs,
4022 (ulong) page_get_page_no(page_align(rec)));
4023 dict_index_name_print(stderr, trx, index);
4024 fputs(". Run CHECK TABLE. You may need to\n"
4025 "InnoDB: restore from a backup, or"
4026 " dump + drop + reimport the table.\n",
4027 stderr);
4029 err = DB_CORRUPTION;
4031 goto lock_wait_or_error;
4032 } else {
4033 /* The user may be dumping a corrupt table. Jump
4034 over the corruption to recover as much as possible. */
4036 fprintf(stderr,
4037 "InnoDB: Index corruption: rec offs %lu"
4038 " next offs %lu, page no %lu,\n"
4039 "InnoDB: ",
4040 (ulong) page_offset(rec),
4041 (ulong) next_offs,
4042 (ulong) page_get_page_no(page_align(rec)));
4043 dict_index_name_print(stderr, trx, index);
4044 fputs(". We try to skip the rest of the page.\n",
4045 stderr);
4047 btr_pcur_move_to_last_on_page(pcur, &mtr);
4049 goto next_rec;
4052 /*-------------------------------------------------------------*/
4054 /* Calculate the 'offsets' associated with 'rec' */
4056 offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4058 if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4059 if (!rec_validate(rec, offsets)
4060 || !btr_index_rec_validate(rec, index, FALSE)) {
4061 fprintf(stderr,
4062 "InnoDB: Index corruption: rec offs %lu"
4063 " next offs %lu, page no %lu,\n"
4064 "InnoDB: ",
4065 (ulong) page_offset(rec),
4066 (ulong) next_offs,
4067 (ulong) page_get_page_no(page_align(rec)));
4068 dict_index_name_print(stderr, trx, index);
4069 fputs(". We try to skip the record.\n",
4070 stderr);
4072 goto next_rec;
4076 /* Note that we cannot trust the up_match value in the cursor at this
4077 place because we can arrive here after moving the cursor! Thus
4078 we have to recompare rec and search_tuple to determine if they
4079 match enough. */
4081 if (match_mode == ROW_SEL_EXACT) {
4082 /* Test if the index record matches completely to search_tuple
4083 in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4085 /* fputs("Comparing rec and search tuple\n", stderr); */
4087 if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4089 if (set_also_gap_locks
4090 && !(srv_locks_unsafe_for_binlog
4091 || trx->isolation_level
4092 <= TRX_ISO_READ_COMMITTED)
4093 && prebuilt->select_lock_type != LOCK_NONE) {
4095 /* Try to place a gap lock on the index
4096 record only if innodb_locks_unsafe_for_binlog
4097 option is not set or this session is not
4098 using a READ COMMITTED isolation level. */
4100 err = sel_set_rec_lock(
4101 btr_pcur_get_block(pcur),
4102 rec, index, offsets,
4103 prebuilt->select_lock_type, LOCK_GAP,
4104 thr);
4106 switch (err) {
4107 case DB_SUCCESS_LOCKED_REC:
4108 case DB_SUCCESS:
4109 break;
4110 default:
4111 goto lock_wait_or_error;
4115 btr_pcur_store_position(pcur, &mtr);
4117 err = DB_RECORD_NOT_FOUND;
4118 /* ut_print_name(stderr, index->name);
4119 fputs(" record not found 3\n", stderr); */
4121 goto normal_return;
4124 } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4126 if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4128 if (set_also_gap_locks
4129 && !(srv_locks_unsafe_for_binlog
4130 || trx->isolation_level
4131 <= TRX_ISO_READ_COMMITTED)
4132 && prebuilt->select_lock_type != LOCK_NONE) {
4134 /* Try to place a gap lock on the index
4135 record only if innodb_locks_unsafe_for_binlog
4136 option is not set or this session is not
4137 using a READ COMMITTED isolation level. */
4139 err = sel_set_rec_lock(
4140 btr_pcur_get_block(pcur),
4141 rec, index, offsets,
4142 prebuilt->select_lock_type, LOCK_GAP,
4143 thr);
4145 switch (err) {
4146 case DB_SUCCESS_LOCKED_REC:
4147 case DB_SUCCESS:
4148 break;
4149 default:
4150 goto lock_wait_or_error;
4154 btr_pcur_store_position(pcur, &mtr);
4156 err = DB_RECORD_NOT_FOUND;
4157 /* ut_print_name(stderr, index->name);
4158 fputs(" record not found 4\n", stderr); */
4160 goto normal_return;
4164 /* We are ready to look at a possible new index entry in the result
4165 set: the cursor is now placed on a user record */
4167 if (prebuilt->select_lock_type != LOCK_NONE) {
4168 /* Try to place a lock on the index record; note that delete
4169 marked records are a special case in a unique search. If there
4170 is a non-delete marked record, then it is enough to lock its
4171 existence with LOCK_REC_NOT_GAP. */
4173 /* If innodb_locks_unsafe_for_binlog option is used
4174 or this session is using a READ COMMITTED isolation
4175 level we lock only the record, i.e., next-key locking is
4176 not used. */
4178 ulint lock_type;
4180 if (!set_also_gap_locks
4181 || srv_locks_unsafe_for_binlog
4182 || trx->isolation_level <= TRX_ISO_READ_COMMITTED
4183 || (unique_search
4184 && !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
4186 goto no_gap_lock;
4187 } else {
4188 lock_type = LOCK_ORDINARY;
4191 /* If we are doing a 'greater or equal than a primary key
4192 value' search from a clustered index, and we find a record
4193 that has that exact primary key value, then there is no need
4194 to lock the gap before the record, because no insert in the
4195 gap can be in our search range. That is, no phantom row can
4196 appear that way.
4198 An example: if col1 is the primary key, the search is WHERE
4199 col1 >= 100, and we find a record where col1 = 100, then no
4200 need to lock the gap before that record. */
4202 if (index == clust_index
4203 && mode == PAGE_CUR_GE
4204 && direction == 0
4205 && dtuple_get_n_fields_cmp(search_tuple)
4206 == dict_index_get_n_unique(index)
4207 && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4208 no_gap_lock:
4209 lock_type = LOCK_REC_NOT_GAP;
4212 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4213 rec, index, offsets,
4214 prebuilt->select_lock_type,
4215 lock_type, thr);
4217 switch (err) {
4218 const rec_t* old_vers;
4219 case DB_SUCCESS_LOCKED_REC:
4220 if (srv_locks_unsafe_for_binlog
4221 || trx->isolation_level
4222 <= TRX_ISO_READ_COMMITTED) {
4223 /* Note that a record of
4224 prebuilt->index was locked. */
4225 prebuilt->new_rec_locks = 1;
4227 err = DB_SUCCESS;
4228 case DB_SUCCESS:
4229 break;
4230 case DB_LOCK_WAIT:
4231 /* Never unlock rows that were part of a conflict. */
4232 prebuilt->new_rec_locks = 0;
4234 if (UNIV_LIKELY(prebuilt->row_read_type
4235 != ROW_READ_TRY_SEMI_CONSISTENT)
4236 || unique_search
4237 || index != clust_index) {
4239 goto lock_wait_or_error;
4242 /* The following call returns 'offsets'
4243 associated with 'old_vers' */
4244 err = row_sel_build_committed_vers_for_mysql(
4245 clust_index, prebuilt, rec,
4246 &offsets, &heap, &old_vers, &mtr);
4248 if (err != DB_SUCCESS) {
4250 goto lock_wait_or_error;
4253 mutex_enter(&kernel_mutex);
4254 if (trx->was_chosen_as_deadlock_victim) {
4255 mutex_exit(&kernel_mutex);
4256 err = DB_DEADLOCK;
4258 goto lock_wait_or_error;
4260 if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4261 lock_cancel_waiting_and_release(
4262 trx->wait_lock);
4263 } else {
4264 mutex_exit(&kernel_mutex);
4266 /* The lock was granted while we were
4267 searching for the last committed version.
4268 Do a normal locking read. */
4270 offsets = rec_get_offsets(rec, index, offsets,
4271 ULINT_UNDEFINED,
4272 &heap);
4273 err = DB_SUCCESS;
4274 break;
4276 mutex_exit(&kernel_mutex);
4278 if (old_vers == NULL) {
4279 /* The row was not yet committed */
4281 goto next_rec;
4284 did_semi_consistent_read = TRUE;
4285 rec = old_vers;
4286 break;
4287 default:
4289 goto lock_wait_or_error;
4291 } else {
4292 /* This is a non-locking consistent read: if necessary, fetch
4293 a previous version of the record */
4295 if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4297 /* Do nothing: we let a non-locking SELECT read the
4298 latest version of the record */
4300 } else if (index == clust_index) {
4302 /* Fetch a previous version of the row if the current
4303 one is not visible in the snapshot; if we have a very
4304 high force recovery level set, we try to avoid crashes
4305 by skipping this lookup */
4307 if (UNIV_LIKELY(srv_force_recovery < 5)
4308 && !lock_clust_rec_cons_read_sees(
4309 rec, index, offsets, trx->read_view)) {
4311 rec_t* old_vers;
4312 /* The following call returns 'offsets'
4313 associated with 'old_vers' */
4314 err = row_sel_build_prev_vers_for_mysql(
4315 trx->read_view, clust_index,
4316 prebuilt, rec, &offsets, &heap,
4317 &old_vers, &mtr);
4319 if (err != DB_SUCCESS) {
4321 goto lock_wait_or_error;
4324 if (old_vers == NULL) {
4325 /* The row did not exist yet in
4326 the read view */
4328 goto next_rec;
4331 rec = old_vers;
4333 } else {
4334 /* We are looking into a non-clustered index,
4335 and to get the right version of the record we
4336 have to look also into the clustered index: this
4337 is necessary, because we can only get the undo
4338 information via the clustered index record. */
4340 ut_ad(!dict_index_is_clust(index));
4342 if (!lock_sec_rec_cons_read_sees(
4343 rec, trx->read_view)) {
4344 goto requires_clust_rec;
4349 /* NOTE that at this point rec can be an old version of a clustered
4350 index record built for a consistent read. We cannot assume after this
4351 point that rec is on a buffer pool page. Functions like
4352 page_rec_is_comp() cannot be used! */
4354 if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4356 /* The record is delete-marked: we can skip it */
4358 if ((srv_locks_unsafe_for_binlog
4359 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4360 && prebuilt->select_lock_type != LOCK_NONE
4361 && !did_semi_consistent_read) {
4363 /* No need to keep a lock on a delete-marked record
4364 if we do not want to use next-key locking. */
4366 row_unlock_for_mysql(prebuilt, TRUE);
4369 /* This is an optimization to skip setting the next key lock
4370 on the record that follows this delete-marked record. This
4371 optimization works because of the unique search criteria
4372 which precludes the presence of a range lock between this
4373 delete marked record and the record following it.
4375 For now this is applicable only to clustered indexes while
4376 doing a unique search. There is scope for further optimization
4377 applicable to unique secondary indexes. Current behaviour is
4378 to widen the scope of a lock on an already delete marked record
4379 if the same record is deleted twice by the same transaction */
4380 if (index == clust_index && unique_search
4381 && !prebuilt->used_in_HANDLER) {
4383 err = DB_RECORD_NOT_FOUND;
4385 goto normal_return;
4388 goto next_rec;
4391 /* Get the clustered index record if needed, if we did not do the
4392 search using the clustered index. */
4394 if (index != clust_index && prebuilt->need_to_access_clustered) {
4396 requires_clust_rec:
4397 /* We use a 'goto' to the preceding label if a consistent
4398 read of a secondary index record requires us to look up old
4399 versions of the associated clustered index record. */
4401 ut_ad(rec_offs_validate(rec, index, offsets));
4403 /* It was a non-clustered index and we must fetch also the
4404 clustered index record */
4406 mtr_has_extra_clust_latch = TRUE;
4408 /* The following call returns 'offsets' associated with
4409 'clust_rec'. Note that 'clust_rec' can be an old version
4410 built for a consistent read. */
4412 err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4413 thr, &clust_rec,
4414 &offsets, &heap, &mtr);
4415 switch (err) {
4416 case DB_SUCCESS:
4417 if (clust_rec == NULL) {
4418 /* The record did not exist in the read view */
4419 ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4421 goto next_rec;
4423 break;
4424 case DB_SUCCESS_LOCKED_REC:
4425 ut_a(clust_rec != NULL);
4426 if (srv_locks_unsafe_for_binlog
4427 || trx->isolation_level
4428 <= TRX_ISO_READ_COMMITTED) {
4429 /* Note that the clustered index record
4430 was locked. */
4431 prebuilt->new_rec_locks = 2;
4433 err = DB_SUCCESS;
4434 break;
4435 default:
4436 goto lock_wait_or_error;
4439 if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4441 /* The record is delete marked: we can skip it */
4443 if ((srv_locks_unsafe_for_binlog
4444 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4445 && prebuilt->select_lock_type != LOCK_NONE) {
4447 /* No need to keep a lock on a delete-marked
4448 record if we do not want to use next-key
4449 locking. */
4451 row_unlock_for_mysql(prebuilt, TRUE);
4454 goto next_rec;
4457 result_rec = clust_rec;
4458 ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
4459 } else {
4460 result_rec = rec;
4463 /* We found a qualifying record 'result_rec'. At this point,
4464 'offsets' are associated with 'result_rec'. */
4466 ut_ad(rec_offs_validate(result_rec,
4467 result_rec != rec ? clust_index : index,
4468 offsets));
4469 ut_ad(!rec_get_deleted_flag(result_rec, comp));
4471 /* At this point, the clustered index record is protected
4472 by a page latch that was acquired when pcur was positioned.
4473 The latch will not be released until mtr_commit(&mtr). */
4475 if ((match_mode == ROW_SEL_EXACT
4476 || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4477 && prebuilt->select_lock_type == LOCK_NONE
4478 && !prebuilt->templ_contains_blob
4479 && !prebuilt->clust_index_was_generated
4480 && !prebuilt->used_in_HANDLER
4481 && prebuilt->template_type
4482 != ROW_MYSQL_DUMMY_TEMPLATE) {
4484 /* Inside an update, for example, we do not cache rows,
4485 since we may use the cursor position to do the actual
4486 update, that is why we require ...lock_type == LOCK_NONE.
4487 Since we keep space in prebuilt only for the BLOBs of
4488 a single row, we cannot cache rows in the case there
4489 are BLOBs in the fields to be fetched. In HANDLER we do
4490 not cache rows because there the cursor is a scrollable
4491 cursor. */
4493 if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4494 result_rec != rec,
4495 offsets)) {
4496 /* Only fresh inserts may contain incomplete
4497 externally stored columns. Pretend that such
4498 records do not exist. Such records may only be
4499 accessed at the READ UNCOMMITTED isolation
4500 level or when rolling back a recovered
4501 transaction. Rollback happens at a lower
4502 level, not here. */
4503 ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
4504 } else if (prebuilt->n_fetch_cached
4505 == MYSQL_FETCH_CACHE_SIZE) {
4507 goto got_row;
4510 goto next_rec;
4511 } else {
4512 if (UNIV_UNLIKELY
4513 (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
4514 /* CHECK TABLE: fetch the row */
4516 if (result_rec != rec
4517 && !prebuilt->need_to_access_clustered) {
4518 /* We used 'offsets' for the clust
4519 rec, recalculate them for 'rec' */
4520 offsets = rec_get_offsets(rec, index, offsets,
4521 ULINT_UNDEFINED,
4522 &heap);
4523 result_rec = rec;
4526 memcpy(buf + 4, result_rec
4527 - rec_offs_extra_size(offsets),
4528 rec_offs_size(offsets));
4529 mach_write_to_4(buf,
4530 rec_offs_extra_size(offsets) + 4);
4531 } else {
4532 /* Returning a row to MySQL */
4534 if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec,
4535 result_rec != rec,
4536 offsets)) {
4537 /* Only fresh inserts may contain
4538 incomplete externally stored
4539 columns. Pretend that such records do
4540 not exist. Such records may only be
4541 accessed at the READ UNCOMMITTED
4542 isolation level or when rolling back a
4543 recovered transaction. Rollback
4544 happens at a lower level, not here. */
4545 ut_a(trx->isolation_level
4546 == TRX_ISO_READ_UNCOMMITTED);
4547 goto next_rec;
4551 if (prebuilt->clust_index_was_generated) {
4552 if (result_rec != rec) {
4553 offsets = rec_get_offsets(
4554 rec, index, offsets, ULINT_UNDEFINED,
4555 &heap);
4557 row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4558 index, offsets);
4562 /* From this point on, 'offsets' are invalid. */
4564 got_row:
4565 /* We have an optimization to save CPU time: if this is a consistent
4566 read on a unique condition on the clustered index, then we do not
4567 store the pcur position, because any fetch next or prev will anyway
4568 return 'end of file'. Exceptions are locking reads and the MySQL
4569 HANDLER command where the user can move the cursor with PREV or NEXT
4570 even after a unique search. */
4572 if (!unique_search_from_clust_index
4573 || prebuilt->select_lock_type != LOCK_NONE
4574 || prebuilt->used_in_HANDLER) {
4576 /* Inside an update always store the cursor position */
4578 btr_pcur_store_position(pcur, &mtr);
4581 err = DB_SUCCESS;
4583 goto normal_return;
4585 next_rec:
4586 /* Reset the old and new "did semi-consistent read" flags. */
4587 if (UNIV_UNLIKELY(prebuilt->row_read_type
4588 == ROW_READ_DID_SEMI_CONSISTENT)) {
4589 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4591 did_semi_consistent_read = FALSE;
4592 prebuilt->new_rec_locks = 0;
4594 /*-------------------------------------------------------------*/
4595 /* PHASE 5: Move the cursor to the next index record */
4597 if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4598 /* We must commit mtr if we are moving to the next
4599 non-clustered index record, because we could break the
4600 latching order if we would access a different clustered
4601 index page right away without releasing the previous. */
4603 btr_pcur_store_position(pcur, &mtr);
4605 mtr_commit(&mtr);
4606 mtr_has_extra_clust_latch = FALSE;
4608 mtr_start(&mtr);
4609 if (sel_restore_position_for_mysql(&same_user_rec,
4610 BTR_SEARCH_LEAF,
4611 pcur, moves_up, &mtr)) {
4612 #ifdef UNIV_SEARCH_DEBUG
4613 cnt++;
4614 #endif /* UNIV_SEARCH_DEBUG */
4616 goto rec_loop;
4620 if (moves_up) {
4621 if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4622 not_moved:
4623 btr_pcur_store_position(pcur, &mtr);
4625 if (match_mode != 0) {
4626 err = DB_RECORD_NOT_FOUND;
4627 } else {
4628 err = DB_END_OF_INDEX;
4631 goto normal_return;
4633 } else {
4634 if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4635 goto not_moved;
4639 #ifdef UNIV_SEARCH_DEBUG
4640 cnt++;
4641 #endif /* UNIV_SEARCH_DEBUG */
4643 goto rec_loop;
4645 lock_wait_or_error:
4646 /* Reset the old and new "did semi-consistent read" flags. */
4647 if (UNIV_UNLIKELY(prebuilt->row_read_type
4648 == ROW_READ_DID_SEMI_CONSISTENT)) {
4649 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4651 did_semi_consistent_read = FALSE;
4653 /*-------------------------------------------------------------*/
4655 btr_pcur_store_position(pcur, &mtr);
4657 lock_table_wait:
4658 mtr_commit(&mtr);
4659 mtr_has_extra_clust_latch = FALSE;
4661 trx->error_state = err;
4663 /* The following is a patch for MySQL */
4665 que_thr_stop_for_mysql(thr);
4667 thr->lock_state = QUE_THR_LOCK_ROW;
4669 if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
4670 /* It was a lock wait, and it ended */
4672 thr->lock_state = QUE_THR_LOCK_NOLOCK;
4673 mtr_start(&mtr);
4675 /* Table lock waited, go try to obtain table lock
4676 again */
4677 if (table_lock_waited) {
4678 table_lock_waited = FALSE;
4680 goto wait_table_again;
4683 sel_restore_position_for_mysql(&same_user_rec,
4684 BTR_SEARCH_LEAF, pcur,
4685 moves_up, &mtr);
4687 if ((srv_locks_unsafe_for_binlog
4688 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4689 && !same_user_rec) {
4691 /* Since we were not able to restore the cursor
4692 on the same user record, we cannot use
4693 row_unlock_for_mysql() to unlock any records, and
4694 we must thus reset the new rec lock info. Since
4695 in lock0lock.c we have blocked the inheriting of gap
4696 X-locks, we actually do not have any new record locks
4697 set in this case.
4699 Note that if we were able to restore on the 'same'
4700 user record, it is still possible that we were actually
4701 waiting on a delete-marked record, and meanwhile
4702 it was removed by purge and inserted again by some
4703 other user. But that is no problem, because in
4704 rec_loop we will again try to set a lock, and
4705 new_rec_lock_info in trx will be right at the end. */
4707 prebuilt->new_rec_locks = 0;
4710 mode = pcur->search_mode;
4712 goto rec_loop;
4715 thr->lock_state = QUE_THR_LOCK_NOLOCK;
4717 #ifdef UNIV_SEARCH_DEBUG
4718 /* fputs("Using ", stderr);
4719 dict_index_name_print(stderr, index);
4720 fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4721 #endif /* UNIV_SEARCH_DEBUG */
4722 goto func_exit;
4724 normal_return:
4725 /*-------------------------------------------------------------*/
4726 que_thr_stop_for_mysql_no_error(thr, trx);
4728 mtr_commit(&mtr);
4730 if (prebuilt->n_fetch_cached > 0) {
4731 row_sel_pop_cached_row_for_mysql(buf, prebuilt);
4733 err = DB_SUCCESS;
4736 #ifdef UNIV_SEARCH_DEBUG
4737 /* fputs("Using ", stderr);
4738 dict_index_name_print(stderr, index);
4739 fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4740 #endif /* UNIV_SEARCH_DEBUG */
4741 if (err == DB_SUCCESS) {
4742 srv_n_rows_read++;
4745 func_exit:
4746 trx->op_info = "";
4747 if (UNIV_LIKELY_NULL(heap)) {
4748 mem_heap_free(heap);
4751 /* Set or reset the "did semi-consistent read" flag on return.
4752 The flag did_semi_consistent_read is set if and only if
4753 the record being returned was fetched with a semi-consistent read. */
4754 ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
4755 || !did_semi_consistent_read);
4757 if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
4758 if (UNIV_UNLIKELY(did_semi_consistent_read)) {
4759 prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
4760 } else {
4761 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4764 return(err);
4767 /*******************************************************************//**
4768 Checks if MySQL at the moment is allowed for this table to retrieve a
4769 consistent read result, or store it to the query cache.
4770 @return TRUE if storing or retrieving from the query cache is permitted */
4771 UNIV_INTERN
4772 ibool
4773 row_search_check_if_query_cache_permitted(
4774 /*======================================*/
4775 trx_t* trx, /*!< in: transaction object */
4776 const char* norm_name) /*!< in: concatenation of database name,
4777 '/' char, table name */
4779 dict_table_t* table;
4780 ibool ret = FALSE;
4782 table = dict_table_get(norm_name, FALSE);
4784 if (table == NULL) {
4786 return(FALSE);
4789 mutex_enter(&kernel_mutex);
4791 /* Start the transaction if it is not started yet */
4793 trx_start_if_not_started_low(trx);
4795 /* If there are locks on the table or some trx has invalidated the
4796 cache up to our trx id, then ret = FALSE.
4797 We do not check what type locks there are on the table, though only
4798 IX type locks actually would require ret = FALSE. */
4800 if (UT_LIST_GET_LEN(table->locks) == 0
4801 && ut_dulint_cmp(trx->id,
4802 table->query_cache_inv_trx_id) >= 0) {
4804 ret = TRUE;
4806 /* If the isolation level is high, assign a read view for the
4807 transaction if it does not yet have one */
4809 if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
4810 && !trx->read_view) {
4812 trx->read_view = read_view_open_now(
4813 trx->id, trx->global_read_view_heap);
4814 trx->global_read_view = trx->read_view;
4818 mutex_exit(&kernel_mutex);
4820 return(ret);
4823 /*******************************************************************//**
4824 Read the AUTOINC column from the current row. If the value is less than
4825 0 and the type is not unsigned then we reset the value to 0.
4826 @return value read from the column */
4827 static
4828 ib_uint64_t
4829 row_search_autoinc_read_column(
4830 /*===========================*/
4831 dict_index_t* index, /*!< in: index to read from */
4832 const rec_t* rec, /*!< in: current rec */
4833 ulint col_no, /*!< in: column number */
4834 ulint mtype, /*!< in: column main type */
4835 ibool unsigned_type) /*!< in: signed or unsigned flag */
4837 ulint len;
4838 const byte* data;
4839 ib_uint64_t value;
4840 mem_heap_t* heap = NULL;
4841 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4842 ulint* offsets = offsets_;
4844 rec_offs_init(offsets_);
4846 offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap);
4848 if (rec_offs_nth_sql_null(offsets, col_no)) {
4849 /* There is no non-NULL value in the auto-increment column. */
4850 value = 0;
4851 goto func_exit;
4854 data = rec_get_nth_field(rec, offsets, col_no, &len);
4856 switch (mtype) {
4857 case DATA_INT:
4858 ut_a(len <= sizeof value);
4859 value = mach_read_int_type(data, len, unsigned_type);
4860 break;
4862 case DATA_FLOAT:
4863 ut_a(len == sizeof(float));
4864 value = (ib_uint64_t) mach_float_read(data);
4865 break;
4867 case DATA_DOUBLE:
4868 ut_a(len == sizeof(double));
4869 value = (ib_uint64_t) mach_double_read(data);
4870 break;
4872 default:
4873 ut_error;
4876 if (!unsigned_type && (ib_int64_t) value < 0) {
4877 value = 0;
4880 func_exit:
4881 if (UNIV_LIKELY_NULL(heap)) {
4882 mem_heap_free(heap);
4885 return(value);
4888 /*******************************************************************//**
4889 Get the last row.
4890 @return current rec or NULL */
4891 static
4892 const rec_t*
4893 row_search_autoinc_get_rec(
4894 /*=======================*/
4895 btr_pcur_t* pcur, /*!< in: the current cursor */
4896 mtr_t* mtr) /*!< in: mini transaction */
4898 do {
4899 const rec_t* rec = btr_pcur_get_rec(pcur);
4901 if (page_rec_is_user_rec(rec)) {
4902 return(rec);
4904 } while (btr_pcur_move_to_prev(pcur, mtr));
4906 return(NULL);
4909 /*******************************************************************//**
4910 Read the max AUTOINC value from an index.
4911 @return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
4912 column name can't be found in index */
4913 UNIV_INTERN
4914 ulint
4915 row_search_max_autoinc(
4916 /*===================*/
4917 dict_index_t* index, /*!< in: index to search */
4918 const char* col_name, /*!< in: name of autoinc column */
4919 ib_uint64_t* value) /*!< out: AUTOINC value read */
4921 ulint i;
4922 ulint n_cols;
4923 dict_field_t* dfield = NULL;
4924 ulint error = DB_SUCCESS;
4926 n_cols = dict_index_get_n_ordering_defined_by_user(index);
4928 /* Search the index for the AUTOINC column name */
4929 for (i = 0; i < n_cols; ++i) {
4930 dfield = dict_index_get_nth_field(index, i);
4932 if (strcmp(col_name, dfield->name) == 0) {
4933 break;
4937 *value = 0;
4939 /* Must find the AUTOINC column name */
4940 if (i < n_cols && dfield) {
4941 mtr_t mtr;
4942 btr_pcur_t pcur;
4944 mtr_start(&mtr);
4946 /* Open at the high/right end (FALSE), and INIT
4947 cursor (TRUE) */
4948 btr_pcur_open_at_index_side(
4949 FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
4951 if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
4952 const rec_t* rec;
4954 rec = row_search_autoinc_get_rec(&pcur, &mtr);
4956 if (rec != NULL) {
4957 ibool unsigned_type = (
4958 dfield->col->prtype & DATA_UNSIGNED);
4960 *value = row_search_autoinc_read_column(
4961 index, rec, i,
4962 dfield->col->mtype, unsigned_type);
4966 btr_pcur_close(&pcur);
4968 mtr_commit(&mtr);
4969 } else {
4970 error = DB_RECORD_NOT_FOUND;
4973 return(error);