mySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / innobase / trx / trx0trx.c
blobd174f1e1b37539d984a76c5ebbe69450a67edfe6
1 /******************************************************
2 The transaction
4 (c) 1996 Innobase Oy
6 Created 3/26/1996 Heikki Tuuri
7 *******************************************************/
9 #include "trx0trx.h"
11 #ifdef UNIV_NONINL
12 #include "trx0trx.ic"
13 #endif
15 #include "trx0undo.h"
16 #include "trx0rseg.h"
17 #include "log0log.h"
18 #include "que0que.h"
19 #include "lock0lock.h"
20 #include "trx0roll.h"
21 #include "usr0sess.h"
22 #include "read0read.h"
23 #include "srv0srv.h"
24 #include "thr0loc.h"
25 #include "btr0sea.h"
26 #include "os0proc.h"
27 #include "trx0xa.h"
28 #include "ha_prototypes.h"
30 /* Copy of the prototype for innobase_mysql_print_thd: this
31 copy MUST be equal to the one in mysql/sql/ha_innodb.cc ! */
33 void innobase_mysql_print_thd(
34 FILE* f,
35 void* thd,
36 ulint max_query_len);
38 /* Dummy session used currently in MySQL interface */
39 sess_t* trx_dummy_sess = NULL;
41 /* Number of transactions currently allocated for MySQL: protected by
42 the kernel mutex */
43 ulint trx_n_mysql_transactions = 0;
44 /* Number of transactions currently in the XA PREPARED state: protected by
45 the kernel mutex */
46 ulint trx_n_prepared = 0;
48 /*****************************************************************
49 Starts the transaction if it is not yet started. */
51 void
52 trx_start_if_not_started_noninline(
53 /*===============================*/
54 trx_t* trx) /* in: transaction */
56 trx_start_if_not_started(trx);
59 /*****************************************************************
60 Set detailed error message for the transaction. */
62 void
63 trx_set_detailed_error(
64 /*===================*/
65 trx_t* trx, /* in: transaction struct */
66 const char* msg) /* in: detailed error message */
68 ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
71 /*****************************************************************
72 Set detailed error message for the transaction from a file. Note that the
73 file is rewinded before reading from it. */
75 void
76 trx_set_detailed_error_from_file(
77 /*=============================*/
78 trx_t* trx, /* in: transaction struct */
79 FILE* file) /* in: file to read message from */
81 os_file_read_string(file, trx->detailed_error,
82 sizeof(trx->detailed_error));
85 /********************************************************************
86 Retrieves the error_info field from a trx. */
88 void*
89 trx_get_error_info(
90 /*===============*/
91 /* out: the error info */
92 trx_t* trx) /* in: trx object */
94 return(trx->error_info);
97 /********************************************************************
98 Creates and initializes a transaction object. */
100 trx_t*
101 trx_create(
102 /*=======*/
103 /* out, own: the transaction */
104 sess_t* sess) /* in: session or NULL */
106 trx_t* trx;
108 ut_ad(mutex_own(&kernel_mutex));
110 trx = mem_alloc(sizeof(trx_t));
112 trx->magic_n = TRX_MAGIC_N;
114 trx->op_info = "";
116 trx->is_purge = 0;
117 trx->conc_state = TRX_NOT_STARTED;
118 trx->start_time = time(NULL);
120 trx->isolation_level = TRX_ISO_REPEATABLE_READ;
122 trx->id = ut_dulint_zero;
123 trx->no = ut_dulint_max;
125 trx->support_xa = TRUE;
127 trx->check_foreigns = TRUE;
128 trx->check_unique_secondary = TRUE;
130 trx->flush_log_later = FALSE;
131 trx->must_flush_log_later = FALSE;
133 trx->dict_operation = FALSE;
135 trx->mysql_thd = NULL;
136 trx->mysql_query_str = NULL;
137 trx->mysql_query_len = NULL;
139 trx->active_trans = 0;
140 trx->duplicates = 0;
142 trx->n_mysql_tables_in_use = 0;
143 trx->mysql_n_tables_locked = 0;
145 trx->mysql_log_file_name = NULL;
146 trx->mysql_log_offset = 0;
148 mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
150 trx->rseg = NULL;
152 trx->undo_no = ut_dulint_zero;
153 trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
154 trx->insert_undo = NULL;
155 trx->update_undo = NULL;
156 trx->undo_no_arr = NULL;
158 trx->error_state = DB_SUCCESS;
159 trx->detailed_error[0] = '\0';
161 trx->sess = sess;
162 trx->que_state = TRX_QUE_RUNNING;
163 trx->n_active_thrs = 0;
165 trx->handling_signals = FALSE;
167 UT_LIST_INIT(trx->signals);
168 UT_LIST_INIT(trx->reply_signals);
170 trx->graph = NULL;
172 trx->wait_lock = NULL;
173 trx->was_chosen_as_deadlock_victim = FALSE;
174 UT_LIST_INIT(trx->wait_thrs);
176 trx->lock_heap = mem_heap_create_in_buffer(256);
177 UT_LIST_INIT(trx->trx_locks);
179 UT_LIST_INIT(trx->trx_savepoints);
181 trx->dict_operation_lock_mode = 0;
182 trx->has_search_latch = FALSE;
183 trx->search_latch_timeout = BTR_SEA_TIMEOUT;
185 trx->declared_to_be_inside_innodb = FALSE;
186 trx->n_tickets_to_enter_innodb = 0;
188 trx->auto_inc_lock = NULL;
190 trx->global_read_view_heap = mem_heap_create(256);
191 trx->global_read_view = NULL;
192 trx->read_view = NULL;
194 /* Set X/Open XA transaction identification to NULL */
195 memset(&trx->xid, 0, sizeof(trx->xid));
196 trx->xid.formatID = -1;
198 trx->n_autoinc_rows = 0;
200 return(trx);
203 /************************************************************************
204 Creates a transaction object for MySQL. */
206 trx_t*
207 trx_allocate_for_mysql(void)
208 /*========================*/
209 /* out, own: transaction object */
211 trx_t* trx;
213 mutex_enter(&kernel_mutex);
215 /* Open a dummy session */
217 if (!trx_dummy_sess) {
218 trx_dummy_sess = sess_open();
221 trx = trx_create(trx_dummy_sess);
223 trx_n_mysql_transactions++;
225 UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
227 mutex_exit(&kernel_mutex);
229 trx->mysql_thread_id = os_thread_get_curr_id();
231 trx->mysql_process_no = os_proc_get_number();
233 return(trx);
236 /************************************************************************
237 Creates a transaction object for background operations by the master thread. */
239 trx_t*
240 trx_allocate_for_background(void)
241 /*=============================*/
242 /* out, own: transaction object */
244 trx_t* trx;
246 mutex_enter(&kernel_mutex);
248 /* Open a dummy session */
250 if (!trx_dummy_sess) {
251 trx_dummy_sess = sess_open();
254 trx = trx_create(trx_dummy_sess);
256 mutex_exit(&kernel_mutex);
258 return(trx);
261 /************************************************************************
262 Releases the search latch if trx has reserved it. */
264 void
265 trx_search_latch_release_if_reserved(
266 /*=================================*/
267 trx_t* trx) /* in: transaction */
269 if (trx->has_search_latch) {
270 rw_lock_s_unlock(&btr_search_latch);
272 trx->has_search_latch = FALSE;
276 /************************************************************************
277 Frees a transaction object. */
279 void
280 trx_free(
281 /*=====*/
282 trx_t* trx) /* in, own: trx object */
284 ut_ad(mutex_own(&kernel_mutex));
286 if (trx->declared_to_be_inside_innodb) {
287 ut_print_timestamp(stderr);
288 fputs(" InnoDB: Error: Freeing a trx which is declared"
289 " to be processing\n"
290 "InnoDB: inside InnoDB.\n", stderr);
291 trx_print(stderr, trx, 600);
292 putc('\n', stderr);
294 /* This is an error but not a fatal error. We must keep
295 the counters like srv_conc_n_threads accurate. */
296 srv_conc_force_exit_innodb(trx);
299 if (trx->n_mysql_tables_in_use != 0
300 || trx->mysql_n_tables_locked != 0) {
302 ut_print_timestamp(stderr);
303 fprintf(stderr,
304 " InnoDB: Error: MySQL is freeing a thd\n"
305 "InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
306 "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
307 (ulong)trx->n_mysql_tables_in_use,
308 (ulong)trx->mysql_n_tables_locked);
310 trx_print(stderr, trx, 600);
312 ut_print_buf(stderr, trx, sizeof(trx_t));
315 ut_a(trx->magic_n == TRX_MAGIC_N);
317 trx->magic_n = 11112222;
319 ut_a(trx->conc_state == TRX_NOT_STARTED);
321 mutex_free(&(trx->undo_mutex));
323 ut_a(trx->insert_undo == NULL);
324 ut_a(trx->update_undo == NULL);
326 if (trx->undo_no_arr) {
327 trx_undo_arr_free(trx->undo_no_arr);
330 ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
331 ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
333 ut_a(trx->wait_lock == NULL);
334 ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
336 ut_a(!trx->has_search_latch);
337 ut_a(!trx->auto_inc_lock);
339 ut_a(trx->dict_operation_lock_mode == 0);
341 if (trx->lock_heap) {
342 mem_heap_free(trx->lock_heap);
345 ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
347 if (trx->global_read_view_heap) {
348 mem_heap_free(trx->global_read_view_heap);
351 trx->global_read_view = NULL;
353 ut_a(trx->read_view == NULL);
355 mem_free(trx);
358 /************************************************************************
359 Frees a transaction object for MySQL. */
361 void
362 trx_free_for_mysql(
363 /*===============*/
364 trx_t* trx) /* in, own: trx object */
366 mutex_enter(&kernel_mutex);
368 UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
370 trx_free(trx);
372 ut_a(trx_n_mysql_transactions > 0);
374 trx_n_mysql_transactions--;
376 mutex_exit(&kernel_mutex);
379 /************************************************************************
380 Frees a transaction object of a background operation of the master thread. */
382 void
383 trx_free_for_background(
384 /*====================*/
385 trx_t* trx) /* in, own: trx object */
387 mutex_enter(&kernel_mutex);
389 trx_free(trx);
391 mutex_exit(&kernel_mutex);
394 /********************************************************************
395 Inserts the trx handle in the trx system trx list in the right position.
396 The list is sorted on the trx id so that the biggest id is at the list
397 start. This function is used at the database startup to insert incomplete
398 transactions to the list. */
399 static
400 void
401 trx_list_insert_ordered(
402 /*====================*/
403 trx_t* trx) /* in: trx handle */
405 trx_t* trx2;
407 ut_ad(mutex_own(&kernel_mutex));
409 trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
411 while (trx2 != NULL) {
412 if (ut_dulint_cmp(trx->id, trx2->id) >= 0) {
414 ut_ad(ut_dulint_cmp(trx->id, trx2->id) == 1);
415 break;
417 trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
420 if (trx2 != NULL) {
421 trx2 = UT_LIST_GET_PREV(trx_list, trx2);
423 if (trx2 == NULL) {
424 UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
425 } else {
426 UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
427 trx2, trx);
429 } else {
430 UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
434 /********************************************************************
435 Creates trx objects for transactions and initializes the trx list of
436 trx_sys at database start. Rollback segment and undo log lists must
437 already exist when this function is called, because the lists of
438 transactions to be rolled back or cleaned up are built based on the
439 undo log lists. */
441 void
442 trx_lists_init_at_db_start(void)
443 /*============================*/
445 trx_rseg_t* rseg;
446 trx_undo_t* undo;
447 trx_t* trx;
449 UT_LIST_INIT(trx_sys->trx_list);
451 /* Look from the rollback segments if there exist undo logs for
452 transactions */
454 rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
456 while (rseg != NULL) {
457 undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
459 while (undo != NULL) {
461 trx = trx_create(NULL);
463 trx->id = undo->trx_id;
464 trx->xid = undo->xid;
465 trx->insert_undo = undo;
466 trx->rseg = rseg;
468 if (undo->state != TRX_UNDO_ACTIVE) {
470 /* Prepared transactions are left in
471 the prepared state waiting for a
472 commit or abort decision from MySQL */
474 if (undo->state == TRX_UNDO_PREPARED) {
476 fprintf(stderr,
477 "InnoDB: Transaction %lu %lu"
478 " was in the"
479 " XA prepared state.\n",
480 ut_dulint_get_high(trx->id),
481 ut_dulint_get_low(trx->id));
483 if (srv_force_recovery == 0) {
485 trx->conc_state = TRX_PREPARED;
486 trx_n_prepared++;
487 } else {
488 fprintf(stderr,
489 "InnoDB: Since"
490 " innodb_force_recovery"
491 " > 0, we will"
492 " rollback it"
493 " anyway.\n");
495 trx->conc_state = TRX_ACTIVE;
497 } else {
498 trx->conc_state
499 = TRX_COMMITTED_IN_MEMORY;
502 /* We give a dummy value for the trx no;
503 this should have no relevance since purge
504 is not interested in committed transaction
505 numbers, unless they are in the history
506 list, in which case it looks the number
507 from the disk based undo log structure */
509 trx->no = trx->id;
510 } else {
511 trx->conc_state = TRX_ACTIVE;
513 /* A running transaction always has the number
514 field inited to ut_dulint_max */
516 trx->no = ut_dulint_max;
519 if (undo->dict_operation) {
520 trx->dict_operation = undo->dict_operation;
521 trx->table_id = undo->table_id;
524 if (!undo->empty) {
525 trx->undo_no = ut_dulint_add(undo->top_undo_no,
529 trx_list_insert_ordered(trx);
531 undo = UT_LIST_GET_NEXT(undo_list, undo);
534 undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
536 while (undo != NULL) {
537 trx = trx_get_on_id(undo->trx_id);
539 if (NULL == trx) {
540 trx = trx_create(NULL);
542 trx->id = undo->trx_id;
543 trx->xid = undo->xid;
545 if (undo->state != TRX_UNDO_ACTIVE) {
547 /* Prepared transactions are left in
548 the prepared state waiting for a
549 commit or abort decision from MySQL */
551 if (undo->state == TRX_UNDO_PREPARED) {
552 fprintf(stderr,
553 "InnoDB: Transaction"
554 " %lu %lu was in the"
555 " XA prepared state.\n",
556 ut_dulint_get_high(
557 trx->id),
558 ut_dulint_get_low(
559 trx->id));
561 if (srv_force_recovery == 0) {
563 trx->conc_state
564 = TRX_PREPARED;
565 trx_n_prepared++;
566 } else {
567 fprintf(stderr,
568 "InnoDB: Since"
569 " innodb_force_recovery"
570 " > 0, we will"
571 " rollback it"
572 " anyway.\n");
574 trx->conc_state
575 = TRX_ACTIVE;
577 } else {
578 trx->conc_state
579 = TRX_COMMITTED_IN_MEMORY;
582 /* We give a dummy value for the trx
583 number */
585 trx->no = trx->id;
586 } else {
587 trx->conc_state = TRX_ACTIVE;
589 /* A running transaction always has
590 the number field inited to
591 ut_dulint_max */
593 trx->no = ut_dulint_max;
596 trx->rseg = rseg;
597 trx_list_insert_ordered(trx);
599 if (undo->dict_operation) {
600 trx->dict_operation
601 = undo->dict_operation;
602 trx->table_id = undo->table_id;
606 trx->update_undo = undo;
608 if ((!undo->empty)
609 && (ut_dulint_cmp(undo->top_undo_no,
610 trx->undo_no) >= 0)) {
612 trx->undo_no = ut_dulint_add(undo->top_undo_no,
616 undo = UT_LIST_GET_NEXT(undo_list, undo);
619 rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
623 /**********************************************************************
624 Assigns a rollback segment to a transaction in a round-robin fashion.
625 Skips the SYSTEM rollback segment if another is available. */
626 UNIV_INLINE
627 ulint
628 trx_assign_rseg(void)
629 /*=================*/
630 /* out: assigned rollback segment id */
632 trx_rseg_t* rseg = trx_sys->latest_rseg;
634 ut_ad(mutex_own(&kernel_mutex));
635 loop:
636 /* Get next rseg in a round-robin fashion */
638 rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
640 if (rseg == NULL) {
641 rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
644 /* If it is the SYSTEM rollback segment, and there exist others, skip
645 it */
647 if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
648 && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
649 goto loop;
652 trx_sys->latest_rseg = rseg;
654 return(rseg->id);
657 /********************************************************************
658 Starts a new transaction. */
660 ibool
661 trx_start_low(
662 /*==========*/
663 /* out: TRUE */
664 trx_t* trx, /* in: transaction */
665 ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
666 is passed, the system chooses the rollback segment
667 automatically in a round-robin fashion */
669 trx_rseg_t* rseg;
671 ut_ad(mutex_own(&kernel_mutex));
672 ut_ad(trx->rseg == NULL);
674 if (trx->is_purge) {
675 trx->id = ut_dulint_zero;
676 trx->conc_state = TRX_ACTIVE;
677 trx->start_time = time(NULL);
679 return(TRUE);
682 ut_ad(trx->conc_state != TRX_ACTIVE);
684 if (rseg_id == ULINT_UNDEFINED) {
686 rseg_id = trx_assign_rseg();
689 rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
691 trx->id = trx_sys_get_new_trx_id();
693 /* The initial value for trx->no: ut_dulint_max is used in
694 read_view_open_now: */
696 trx->no = ut_dulint_max;
698 trx->rseg = rseg;
700 trx->conc_state = TRX_ACTIVE;
701 trx->start_time = time(NULL);
703 UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
705 return(TRUE);
708 /********************************************************************
709 Starts a new transaction. */
711 ibool
712 trx_start(
713 /*======*/
714 /* out: TRUE */
715 trx_t* trx, /* in: transaction */
716 ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED
717 is passed, the system chooses the rollback segment
718 automatically in a round-robin fashion */
720 ibool ret;
722 mutex_enter(&kernel_mutex);
724 ret = trx_start_low(trx, rseg_id);
726 mutex_exit(&kernel_mutex);
728 return(ret);
731 /********************************************************************
732 Commits a transaction. The kernel mutex must be owned on entry (see the
ut_ad below); it is released and re-acquired around the mini-transaction
which marks the undo logs, and again around the log write. In order, the
function: (1) if the trx has an insert or update undo log, changes the undo
log states in a mini-transaction and takes the commit lsn from it; (2) sets
conc_state to TRX_COMMITTED_IN_MEMORY, releases the locks and closes the
global read view; (3) writes/flushes the log according to
srv_flush_log_at_trx_commit, unless trx->flush_log_later is set; (4) frees
the savepoints, resets the trx to TRX_NOT_STARTED and removes it from
trx_sys->trx_list. */
734 void
735 trx_commit_off_kernel(
736 /*==================*/
737 trx_t* trx) /* in: transaction */
739 page_t* update_hdr_page;
/* lsn is assigned only on the path which also sets must_flush_log, and
is read only when must_flush_log is TRUE */
740 dulint lsn;
741 trx_rseg_t* rseg;
742 trx_undo_t* undo;
743 ibool must_flush_log = FALSE;
744 mtr_t mtr;
746 ut_ad(mutex_own(&kernel_mutex));
748 trx->must_flush_log_later = FALSE;
750 rseg = trx->rseg;
/* Only a trx which has an undo log has anything to commit in the
file-based structures */
752 if (trx->insert_undo != NULL || trx->update_undo != NULL) {
754 mutex_exit(&kernel_mutex);
756 mtr_start(&mtr);
758 must_flush_log = TRUE;
760 /* Change the undo log segment states from TRX_UNDO_ACTIVE
761 to some other state: these modifications to the file data
762 structure define the transaction as committed in the file
763 based world, at the serialization point of the log sequence
764 number lsn obtained below. */
766 mutex_enter(&(rseg->mutex));
768 if (trx->insert_undo != NULL) {
769 trx_undo_set_state_at_finish(
770 rseg, trx, trx->insert_undo, &mtr);
773 undo = trx->update_undo;
775 if (undo) {
/* The transaction number is obtained under the kernel mutex, while
the rseg mutex is still held */
776 mutex_enter(&kernel_mutex);
777 trx->no = trx_sys_get_new_trx_no();
779 mutex_exit(&kernel_mutex);
781 /* It is not necessary to obtain trx->undo_mutex here
782 because only a single OS thread is allowed to do the
783 transaction commit for this transaction. */
785 update_hdr_page = trx_undo_set_state_at_finish(
786 rseg, trx, undo, &mtr);
788 /* We have to do the cleanup for the update log while
789 holding the rseg mutex because update log headers
790 have to be put to the history list in the order of
791 the trx number. */
793 trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
796 mutex_exit(&(rseg->mutex));
798 /* Update the latest MySQL binlog name and offset info
799 in trx sys header if MySQL binlogging is on or the database
800 server is a MySQL replication slave */
802 if (trx->mysql_log_file_name
803 && trx->mysql_log_file_name[0] != '\0') {
804 trx_sys_update_mysql_binlog_offset(
805 trx->mysql_log_file_name,
806 trx->mysql_log_offset,
807 TRX_SYS_MYSQL_LOG_INFO, &mtr);
/* The name is cleared once it has been handed to the mini-transaction */
808 trx->mysql_log_file_name = NULL;
811 /* The following call commits the mini-transaction, making the
812 whole transaction committed in the file-based world, at this
813 log sequence number. The transaction becomes 'durable' when
814 we write the log to disk, but in the logical sense the commit
815 in the file-based data structures (undo logs etc.) happens
816 here.
818 NOTE that transaction numbers, which are assigned only to
819 transactions with an update undo log, do not necessarily come
820 in exactly the same order as commit lsn's, if the transactions
821 have different rollback segments. To get exactly the same
822 order we should hold the kernel mutex up to this point,
823 adding to the contention of the kernel mutex. However, if
824 a transaction T2 is able to see modifications made by
825 a transaction T1, T2 will always get a bigger transaction
826 number and a bigger commit lsn than T1. */
828 /*--------------*/
829 mtr_commit(&mtr);
830 /*--------------*/
831 lsn = mtr.end_lsn;
833 mutex_enter(&kernel_mutex);
836 ut_ad(trx->conc_state == TRX_ACTIVE
837 || trx->conc_state == TRX_PREPARED);
838 ut_ad(mutex_own(&kernel_mutex));
/* A prepared trx leaves the prepared state here, so the counter of
prepared transactions is decremented */
840 if (UNIV_UNLIKELY(trx->conc_state == TRX_PREPARED)) {
841 ut_a(trx_n_prepared > 0);
842 trx_n_prepared--;
845 /* The following assignment makes the transaction committed in memory
846 and makes its changes to data visible to other transactions.
847 NOTE that there is a small discrepancy from the strict formal
848 visibility rules here: a human user of the database can see
849 modifications made by another transaction T even before the necessary
850 log segment has been flushed to the disk. If the database happens to
851 crash before the flush, the user has seen modifications from T which
852 will never be a committed transaction. However, any transaction T2
853 which sees the modifications of the committing transaction T, and
854 which also itself makes modifications to the database, will get an lsn
855 larger than the committing transaction T. In the case where the log
856 flush fails, and T never gets committed, also T2 will never get
857 committed. */
859 /*--------------------------------------*/
860 trx->conc_state = TRX_COMMITTED_IN_MEMORY;
861 /*--------------------------------------*/
863 lock_release_off_kernel(trx);
865 if (trx->global_read_view) {
866 read_view_close(trx->global_read_view);
867 mem_heap_empty(trx->global_read_view_heap);
868 trx->global_read_view = NULL;
871 trx->read_view = NULL;
873 if (must_flush_log) {
/* The kernel mutex is not held during the insert undo cleanup and the
log write; it is taken again at the end of this branch */
875 mutex_exit(&kernel_mutex);
877 if (trx->insert_undo != NULL) {
879 trx_undo_insert_cleanup(trx);
882 /* NOTE that we could possibly make a group commit more
883 efficient here: call os_thread_yield here to allow also other
884 trxs to come to commit! */
886 /*-------------------------------------*/
888 /* Depending on the my.cnf options, we may now write the log
889 buffer to the log files, making the transaction durable if
890 the OS does not crash. We may also flush the log files to
891 disk, making the transaction durable also at an OS crash or a
892 power outage.
894 The idea in InnoDB's group commit is that a group of
895 transactions gather behind a trx doing a physical disk write
896 to log files, and when that physical write has been completed,
897 one of those transactions does a write which commits the whole
898 group. Note that this group commit will only bring benefit if
899 there are > 2 users in the database. Then at least 2 users can
900 gather behind one doing the physical log write to disk.
902 If we are calling trx_commit() under MySQL's binlog mutex, we
903 will delay possible log write and flush to a separate function
904 trx_commit_complete_for_mysql(), which is only called when the
905 thread has released the binlog mutex. This is to make the
906 group commit algorithm to work. Otherwise, the MySQL binlog
907 mutex would serialize all commits and prevent a group of
908 transactions from gathering. */
910 if (trx->flush_log_later) {
911 /* Do nothing yet */
912 trx->must_flush_log_later = TRUE;
913 } else if (srv_flush_log_at_trx_commit == 0) {
914 /* Do nothing */
915 } else if (srv_flush_log_at_trx_commit == 1) {
916 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
917 /* Write the log but do not flush it to disk */
919 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
920 FALSE);
921 } else {
922 /* Write the log to the log files AND flush
923 them to disk */
925 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
927 } else if (srv_flush_log_at_trx_commit == 2) {
929 /* Write the log but do not flush it to disk */
931 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
932 } else {
/* Any other value of srv_flush_log_at_trx_commit is treated as a fatal
error */
933 ut_error;
/* The commit lsn is remembered in the trx whichever branch above was
taken */
936 trx->commit_lsn = lsn;
938 /*-------------------------------------*/
940 mutex_enter(&kernel_mutex);
943 /* Free all savepoints */
944 trx_roll_free_all_savepoints(trx);
/* Return the trx object to the not-started state so that it can be
started again */
946 trx->conc_state = TRX_NOT_STARTED;
947 trx->rseg = NULL;
948 trx->undo_no = ut_dulint_zero;
949 trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
950 trx->mysql_query_str = NULL;
951 trx->mysql_query_len = NULL;
953 ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
954 ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
956 UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
959 /********************************************************************
960 Cleans up a transaction at database startup. The cleanup is needed if
961 the transaction already got to the middle of a commit when the database
962 crashed, andf we cannot roll it back. */
964 void
965 trx_cleanup_at_db_startup(
966 /*======================*/
967 trx_t* trx) /* in: transaction */
969 if (trx->insert_undo != NULL) {
971 trx_undo_insert_cleanup(trx);
974 trx->conc_state = TRX_NOT_STARTED;
975 trx->rseg = NULL;
976 trx->undo_no = ut_dulint_zero;
977 trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
979 UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
982 /************************************************************************
983 Assigns a read view for a consistent read query. All the consistent reads
984 within the same transaction will get the same read view, which is created
985 when this function is first called for a new started transaction. */
987 read_view_t*
988 trx_assign_read_view(
989 /*=================*/
990 /* out: consistent read view */
991 trx_t* trx) /* in: active transaction */
993 ut_ad(trx->conc_state == TRX_ACTIVE);
995 if (trx->read_view) {
996 return(trx->read_view);
999 mutex_enter(&kernel_mutex);
1001 if (!trx->read_view) {
1002 trx->read_view = read_view_open_now(
1003 trx->id, trx->global_read_view_heap);
1004 trx->global_read_view = trx->read_view;
1007 mutex_exit(&kernel_mutex);
1009 return(trx->read_view);
1012 /********************************************************************
1013 Commits a transaction. NOTE that the kernel mutex is temporarily released. */
1014 static
1015 void
1016 trx_handle_commit_sig_off_kernel(
1017 /*=============================*/
1018 trx_t* trx, /* in: transaction */
1019 que_thr_t** next_thr) /* in/out: next query thread to run;
1020 if the value which is passed in is
1021 a pointer to a NULL pointer, then the
1022 calling function can start running
1023 a new query thread */
1025 trx_sig_t* sig;
1026 trx_sig_t* next_sig;
1028 ut_ad(mutex_own(&kernel_mutex));
1030 trx->que_state = TRX_QUE_COMMITTING;
1032 trx_commit_off_kernel(trx);
1034 ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
1036 /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
1037 reply messages to them */
1039 sig = UT_LIST_GET_FIRST(trx->signals);
1041 while (sig != NULL) {
1042 next_sig = UT_LIST_GET_NEXT(signals, sig);
1044 if (sig->type == TRX_SIG_COMMIT) {
1046 trx_sig_reply(sig, next_thr);
1047 trx_sig_remove(trx, sig);
1050 sig = next_sig;
1053 trx->que_state = TRX_QUE_RUNNING;
1056 /***************************************************************
1057 The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
1058 the TRX_QUE_RUNNING state and releases query threads which were
1059 waiting for a lock in the wait_thrs list. */
1061 void
1062 trx_end_lock_wait(
1063 /*==============*/
1064 trx_t* trx) /* in: transaction */
1066 que_thr_t* thr;
1068 ut_ad(mutex_own(&kernel_mutex));
1069 ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1071 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1073 while (thr != NULL) {
1074 que_thr_end_wait_no_next_thr(thr);
1076 UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1078 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1081 trx->que_state = TRX_QUE_RUNNING;
1084 /***************************************************************
1085 Moves the query threads in the lock wait list to the SUSPENDED state and puts
1086 the transaction to the TRX_QUE_RUNNING state. */
1087 static
1088 void
1089 trx_lock_wait_to_suspended(
1090 /*=======================*/
1091 trx_t* trx) /* in: transaction in the TRX_QUE_LOCK_WAIT state */
1093 que_thr_t* thr;
1095 ut_ad(mutex_own(&kernel_mutex));
1096 ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
1098 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1100 while (thr != NULL) {
1101 thr->state = QUE_THR_SUSPENDED;
1103 UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
1105 thr = UT_LIST_GET_FIRST(trx->wait_thrs);
1108 trx->que_state = TRX_QUE_RUNNING;
1111 /***************************************************************
1112 Moves the query threads in the sig reply wait list of trx to the SUSPENDED
1113 state. */
1114 static
1115 void
1116 trx_sig_reply_wait_to_suspended(
1117 /*============================*/
1118 trx_t* trx) /* in: transaction */
1120 trx_sig_t* sig;
1121 que_thr_t* thr;
1123 ut_ad(mutex_own(&kernel_mutex));
1125 sig = UT_LIST_GET_FIRST(trx->reply_signals);
1127 while (sig != NULL) {
1128 thr = sig->receiver;
1130 ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
1132 thr->state = QUE_THR_SUSPENDED;
1134 sig->receiver = NULL;
1136 UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
1138 sig = UT_LIST_GET_FIRST(trx->reply_signals);
1142 /*********************************************************************
1143 Checks the compatibility of a new signal with the other signals in the
1144 queue. */
1145 static
1146 ibool
1147 trx_sig_is_compatible(
1148 /*==================*/
1149 /* out: TRUE if the signal can be queued */
1150 trx_t* trx, /* in: trx handle */
1151 ulint type, /* in: signal type */
1152 ulint sender) /* in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
1154 trx_sig_t* sig;
1156 ut_ad(mutex_own(&kernel_mutex));
1158 if (UT_LIST_GET_LEN(trx->signals) == 0) {
1160 return(TRUE);
1163 if (sender == TRX_SIG_SELF) {
1164 if (type == TRX_SIG_ERROR_OCCURRED) {
1166 return(TRUE);
1168 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1170 return(TRUE);
1171 } else {
1172 return(FALSE);
1176 ut_ad(sender == TRX_SIG_OTHER_SESS);
1178 sig = UT_LIST_GET_FIRST(trx->signals);
1180 if (type == TRX_SIG_COMMIT) {
1181 while (sig != NULL) {
1183 if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
1185 return(FALSE);
1188 sig = UT_LIST_GET_NEXT(signals, sig);
1191 return(TRUE);
1193 } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
1194 while (sig != NULL) {
1196 if (sig->type == TRX_SIG_COMMIT) {
1198 return(FALSE);
1201 sig = UT_LIST_GET_NEXT(signals, sig);
1204 return(TRUE);
1206 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1208 return(TRUE);
1209 } else {
1210 ut_error;
1212 return(FALSE);
1216 /********************************************************************
1217 Sends a signal to a trx object. */
1219 void
1220 trx_sig_send(
1221 /*=========*/
1222 trx_t* trx, /* in: trx handle */
1223 ulint type, /* in: signal type */
1224 ulint sender, /* in: TRX_SIG_SELF or
1225 TRX_SIG_OTHER_SESS */
1226 que_thr_t* receiver_thr, /* in: query thread which wants the
1227 reply, or NULL; if type is
1228 TRX_SIG_END_WAIT, this must be NULL */
1229 trx_savept_t* savept, /* in: possible rollback savepoint, or
1230 NULL */
1231 que_thr_t** next_thr) /* in/out: next query thread to run;
1232 if the value which is passed in is
1233 a pointer to a NULL pointer, then the
1234 calling function can start running
1235 a new query thread; if the parameter
1236 is NULL, it is ignored */
1238 trx_sig_t* sig;
1239 trx_t* receiver_trx;
1241 ut_ad(trx);
1242 ut_ad(mutex_own(&kernel_mutex));
1244 if (!trx_sig_is_compatible(trx, type, sender)) {
1245 /* The signal is not compatible with the other signals in
1246 the queue: die */
1248 ut_error;
1251 /* Queue the signal object */
1253 if (UT_LIST_GET_LEN(trx->signals) == 0) {
1255 /* The signal list is empty: the 'sig' slot must be unused
1256 (we improve performance a bit by avoiding mem_alloc) */
1257 sig = &(trx->sig);
1258 } else {
1259 /* It might be that the 'sig' slot is unused also in this
1260 case, but we choose the easy way of using mem_alloc */
1262 sig = mem_alloc(sizeof(trx_sig_t));
1265 UT_LIST_ADD_LAST(signals, trx->signals, sig);
1267 sig->type = type;
1268 sig->sender = sender;
1269 sig->receiver = receiver_thr;
1271 if (savept) {
1272 sig->savept = *savept;
1275 if (receiver_thr) {
1276 receiver_trx = thr_get_trx(receiver_thr);
1278 UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
1279 sig);
1282 if (trx->sess->state == SESS_ERROR) {
1284 trx_sig_reply_wait_to_suspended(trx);
1287 if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
1288 ut_error;
1291 /* If there were no other signals ahead in the queue, try to start
1292 handling of the signal */
1294 if (UT_LIST_GET_FIRST(trx->signals) == sig) {
1296 trx_sig_start_handle(trx, next_thr);
1300 /********************************************************************
1301 Ends signal handling. If the session is in the error state, and
1302 trx->graph_before_signal_handling != NULL, then returns control to the error
1303 handling routine of the graph (currently just returns the control to the
1304 graph root which then will send an error message to the client). */
1306 void
1307 trx_end_signal_handling(
1308 /*====================*/
1309 trx_t* trx) /* in: trx */
1311 ut_ad(mutex_own(&kernel_mutex));
1312 ut_ad(trx->handling_signals == TRUE);
1314 trx->handling_signals = FALSE;
1316 trx->graph = trx->graph_before_signal_handling;
1318 if (trx->graph && (trx->sess->state == SESS_ERROR)) {
1320 que_fork_error_handle(trx, trx->graph);
1324 /********************************************************************
1325 Starts handling of a trx signal. */
1327 void
1328 trx_sig_start_handle(
1329 /*=================*/
1330 trx_t* trx, /* in: trx handle */
1331 que_thr_t** next_thr) /* in/out: next query thread to run;
1332 if the value which is passed in is
1333 a pointer to a NULL pointer, then the
1334 calling function can start running
1335 a new query thread; if the parameter
1336 is NULL, it is ignored */
1338 trx_sig_t* sig;
1339 ulint type;
1340 loop:
1341 /* We loop in this function body as long as there are queued signals
1342 we can process immediately */
1344 ut_ad(trx);
1345 ut_ad(mutex_own(&kernel_mutex));
1347 if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
1349 trx_end_signal_handling(trx);
1351 return;
1354 if (trx->conc_state == TRX_NOT_STARTED) {
1356 trx_start_low(trx, ULINT_UNDEFINED);
1359 /* If the trx is in a lock wait state, moves the waiting query threads
1360 to the suspended state */
1362 if (trx->que_state == TRX_QUE_LOCK_WAIT) {
1364 trx_lock_wait_to_suspended(trx);
1367 /* If the session is in the error state and this trx has threads
1368 waiting for reply from signals, moves these threads to the suspended
1369 state, canceling wait reservations; note that if the transaction has
1370 sent a commit or rollback signal to itself, and its session is not in
1371 the error state, then nothing is done here. */
1373 if (trx->sess->state == SESS_ERROR) {
1374 trx_sig_reply_wait_to_suspended(trx);
1377 /* If there are no running query threads, we can start processing of a
1378 signal, otherwise we have to wait until all query threads of this
1379 transaction are aware of the arrival of the signal. */
1381 if (trx->n_active_thrs > 0) {
1383 return;
1386 if (trx->handling_signals == FALSE) {
1387 trx->graph_before_signal_handling = trx->graph;
1389 trx->handling_signals = TRUE;
1392 sig = UT_LIST_GET_FIRST(trx->signals);
1393 type = sig->type;
1395 if (type == TRX_SIG_COMMIT) {
1397 trx_handle_commit_sig_off_kernel(trx, next_thr);
1399 } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
1400 || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
1402 trx_rollback(trx, sig, next_thr);
1404 /* No further signals can be handled until the rollback
1405 completes, therefore we return */
1407 return;
1409 } else if (type == TRX_SIG_ERROR_OCCURRED) {
1411 trx_rollback(trx, sig, next_thr);
1413 /* No further signals can be handled until the rollback
1414 completes, therefore we return */
1416 return;
1418 } else if (type == TRX_SIG_BREAK_EXECUTION) {
1420 trx_sig_reply(sig, next_thr);
1421 trx_sig_remove(trx, sig);
1422 } else {
1423 ut_error;
1426 goto loop;
1429 /********************************************************************
1430 Send the reply message when a signal in the queue of the trx has been
1431 handled. */
1433 void
1434 trx_sig_reply(
1435 /*==========*/
1436 trx_sig_t* sig, /* in: signal */
1437 que_thr_t** next_thr) /* in/out: next query thread to run;
1438 if the value which is passed in is
1439 a pointer to a NULL pointer, then the
1440 calling function can start running
1441 a new query thread */
1443 trx_t* receiver_trx;
1445 ut_ad(sig);
1446 ut_ad(mutex_own(&kernel_mutex));
1448 if (sig->receiver != NULL) {
1449 ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
1451 receiver_trx = thr_get_trx(sig->receiver);
1453 UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
1454 sig);
1455 ut_ad(receiver_trx->sess->state != SESS_ERROR);
1457 que_thr_end_wait(sig->receiver, next_thr);
1459 sig->receiver = NULL;
1464 /********************************************************************
1465 Removes a signal object from the trx signal queue. */
1467 void
1468 trx_sig_remove(
1469 /*===========*/
1470 trx_t* trx, /* in: trx handle */
1471 trx_sig_t* sig) /* in, own: signal */
1473 ut_ad(trx && sig);
1474 ut_ad(mutex_own(&kernel_mutex));
1476 ut_ad(sig->receiver == NULL);
1478 UT_LIST_REMOVE(signals, trx->signals, sig);
1479 sig->type = 0; /* reset the field to catch possible bugs */
1481 if (sig != &(trx->sig)) {
1482 mem_free(sig);
1486 /*************************************************************************
1487 Creates a commit command node struct. */
1489 commit_node_t*
1490 commit_node_create(
1491 /*===============*/
1492 /* out, own: commit node struct */
1493 mem_heap_t* heap) /* in: mem heap where created */
1495 commit_node_t* node;
1497 node = mem_heap_alloc(heap, sizeof(commit_node_t));
1498 node->common.type = QUE_NODE_COMMIT;
1499 node->state = COMMIT_NODE_SEND;
1501 return(node);
1504 /***************************************************************
1505 Performs an execution step for a commit type node in a query graph. */
1507 que_thr_t*
1508 trx_commit_step(
1509 /*============*/
1510 /* out: query thread to run next, or NULL */
1511 que_thr_t* thr) /* in: query thread */
1513 commit_node_t* node;
1514 que_thr_t* next_thr;
1516 node = thr->run_node;
1518 ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
1520 if (thr->prev_node == que_node_get_parent(node)) {
1521 node->state = COMMIT_NODE_SEND;
1524 if (node->state == COMMIT_NODE_SEND) {
1525 mutex_enter(&kernel_mutex);
1527 node->state = COMMIT_NODE_WAIT;
1529 next_thr = NULL;
1531 thr->state = QUE_THR_SIG_REPLY_WAIT;
1533 /* Send the commit signal to the transaction */
1535 trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
1536 thr, NULL, &next_thr);
1538 mutex_exit(&kernel_mutex);
1540 return(next_thr);
1543 ut_ad(node->state == COMMIT_NODE_WAIT);
1545 node->state = COMMIT_NODE_SEND;
1547 thr->run_node = que_node_get_parent(node);
1549 return(thr);
1552 /**************************************************************************
1553 Does the transaction commit for MySQL. */
1555 ulint
1556 trx_commit_for_mysql(
1557 /*=================*/
1558 /* out: 0 or error number */
1559 trx_t* trx) /* in: trx handle */
1561 /* Because we do not do the commit by sending an Innobase
1562 sig to the transaction, we must here make sure that trx has been
1563 started. */
1565 ut_a(trx);
1567 trx->op_info = "committing";
1569 /* If we are doing the XA recovery of prepared transactions, then
1570 the transaction object does not have an InnoDB session object, and we
1571 set the dummy session that we use for all MySQL transactions. */
1573 if (trx->sess == NULL) {
1574 /* Open a dummy session */
1576 if (!trx_dummy_sess) {
1577 mutex_enter(&kernel_mutex);
1579 if (!trx_dummy_sess) {
1580 trx_dummy_sess = sess_open();
1583 mutex_exit(&kernel_mutex);
1586 trx->sess = trx_dummy_sess;
1589 trx_start_if_not_started(trx);
1591 mutex_enter(&kernel_mutex);
1593 trx_commit_off_kernel(trx);
1595 mutex_exit(&kernel_mutex);
1597 trx->op_info = "";
1599 return(0);
1602 /**************************************************************************
1603 If required, flushes the log to disk if we called trx_commit_for_mysql()
1604 with trx->flush_log_later == TRUE. */
1606 ulint
1607 trx_commit_complete_for_mysql(
1608 /*==========================*/
1609 /* out: 0 or error number */
1610 trx_t* trx) /* in: trx handle */
1612 dulint lsn = trx->commit_lsn;
1614 ut_a(trx);
1616 trx->op_info = "flushing log";
1618 if (!trx->must_flush_log_later) {
1619 /* Do nothing */
1620 } else if (srv_flush_log_at_trx_commit == 0) {
1621 /* Do nothing */
1622 } else if (srv_flush_log_at_trx_commit == 1) {
1623 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1624 /* Write the log but do not flush it to disk */
1626 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1627 } else {
1628 /* Write the log to the log files AND flush them to
1629 disk */
1631 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1633 } else if (srv_flush_log_at_trx_commit == 2) {
1635 /* Write the log but do not flush it to disk */
1637 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1638 } else {
1639 ut_error;
1642 trx->must_flush_log_later = FALSE;
1644 trx->op_info = "";
1646 return(0);
1649 /**************************************************************************
1650 Marks the latest SQL statement ended. */
1652 void
1653 trx_mark_sql_stat_end(
1654 /*==================*/
1655 trx_t* trx) /* in: trx handle */
1657 ut_a(trx);
1659 if (trx->conc_state == TRX_NOT_STARTED) {
1660 trx->undo_no = ut_dulint_zero;
1663 trx->last_sql_stat_start.least_undo_no = trx->undo_no;
1666 /**************************************************************************
1667 Prints info about a transaction to the given file. The caller must own the
1668 kernel mutex. */
1670 void
1671 trx_print(
1672 /*======*/
1673 FILE* f, /* in: output stream */
1674 trx_t* trx, /* in: transaction */
1675 ulint max_query_len) /* in: max query length to print, or 0 to
1676 use the default max length */
1678 ibool newline;
1680 fprintf(f, "TRANSACTION %lu %lu",
1681 (ulong) ut_dulint_get_high(trx->id),
1682 (ulong) ut_dulint_get_low(trx->id));
1684 switch (trx->conc_state) {
1685 case TRX_NOT_STARTED:
1686 fputs(", not started", f);
1687 break;
1688 case TRX_ACTIVE:
1689 fprintf(f, ", ACTIVE %lu sec",
1690 (ulong)difftime(time(NULL), trx->start_time));
1691 break;
1692 case TRX_PREPARED:
1693 fprintf(f, ", ACTIVE (PREPARED) %lu sec",
1694 (ulong)difftime(time(NULL), trx->start_time));
1695 break;
1696 case TRX_COMMITTED_IN_MEMORY:
1697 fputs(", COMMITTED IN MEMORY", f);
1698 break;
1699 default:
1700 fprintf(f, " state %lu", (ulong) trx->conc_state);
1703 #ifdef UNIV_LINUX
1704 fprintf(f, ", process no %lu", trx->mysql_process_no);
1705 #endif
1706 fprintf(f, ", OS thread id %lu",
1707 (ulong) os_thread_pf(trx->mysql_thread_id));
1709 if (*trx->op_info) {
1710 putc(' ', f);
1711 fputs(trx->op_info, f);
1714 if (trx->is_purge) {
1715 fputs(" purge trx", f);
1718 if (trx->declared_to_be_inside_innodb) {
1719 fprintf(f, ", thread declared inside InnoDB %lu",
1720 (ulong) trx->n_tickets_to_enter_innodb);
1723 putc('\n', f);
1725 if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
1726 fprintf(f, "mysql tables in use %lu, locked %lu\n",
1727 (ulong) trx->n_mysql_tables_in_use,
1728 (ulong) trx->mysql_n_tables_locked);
1731 newline = TRUE;
1733 switch (trx->que_state) {
1734 case TRX_QUE_RUNNING:
1735 newline = FALSE; break;
1736 case TRX_QUE_LOCK_WAIT:
1737 fputs("LOCK WAIT ", f); break;
1738 case TRX_QUE_ROLLING_BACK:
1739 fputs("ROLLING BACK ", f); break;
1740 case TRX_QUE_COMMITTING:
1741 fputs("COMMITTING ", f); break;
1742 default:
1743 fprintf(f, "que state %lu ", (ulong) trx->que_state);
1746 if (0 < UT_LIST_GET_LEN(trx->trx_locks)
1747 || mem_heap_get_size(trx->lock_heap) > 400) {
1748 newline = TRUE;
1750 fprintf(f, "%lu lock struct(s), heap size %lu,"
1751 " %lu row lock(s)",
1752 (ulong) UT_LIST_GET_LEN(trx->trx_locks),
1753 (ulong) mem_heap_get_size(trx->lock_heap),
1754 (ulong) lock_number_of_rows_locked(trx));
1757 if (trx->has_search_latch) {
1758 newline = TRUE;
1759 fputs(", holds adaptive hash latch", f);
1762 if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) {
1763 newline = TRUE;
1764 fprintf(f, ", undo log entries %lu",
1765 (ulong) ut_dulint_get_low(trx->undo_no));
1768 if (newline) {
1769 putc('\n', f);
1772 if (trx->mysql_thd != NULL) {
1773 innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
1777 /***********************************************************************
1778 Compares the "weight" (or size) of two transactions. The weight of one
1779 transaction is estimated as the number of altered rows + the number of
1780 locked rows. Transactions that have edited non-transactional tables are
1781 considered heavier than ones that have not. */
1784 trx_weight_cmp(
1785 /*===========*/
1786 /* out: <0, 0 or >0; similar to strcmp(3) */
1787 trx_t* a, /* in: the first transaction to be compared */
1788 trx_t* b) /* in: the second transaction to be compared */
1790 ibool a_notrans_edit;
1791 ibool b_notrans_edit;
1793 /* If mysql_thd is NULL for a transaction we assume that it has
1794 not edited non-transactional tables. */
1796 a_notrans_edit = a->mysql_thd != NULL
1797 && thd_has_edited_nontrans_tables(a->mysql_thd);
1799 b_notrans_edit = b->mysql_thd != NULL
1800 && thd_has_edited_nontrans_tables(b->mysql_thd);
1802 if (a_notrans_edit && !b_notrans_edit) {
1804 return(1);
1807 if (!a_notrans_edit && b_notrans_edit) {
1809 return(-1);
1812 /* Either both had edited non-transactional tables or both had
1813 not, we fall back to comparing the number of altered/locked
1814 rows. */
1816 #if 0
1817 fprintf(stderr,
1818 "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
1819 __func__,
1820 ut_conv_dulint_to_longlong(a->undo_no),
1821 UT_LIST_GET_LEN(a->trx_locks),
1822 ut_conv_dulint_to_longlong(b->undo_no),
1823 UT_LIST_GET_LEN(b->trx_locks));
1824 #endif
1826 #define TRX_WEIGHT(t) \
1827 ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))
1829 return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b)));
1832 /********************************************************************
1833 Prepares a transaction. */
1835 void
1836 trx_prepare_off_kernel(
1837 /*===================*/
1838 trx_t* trx) /* in: transaction */
1840 trx_rseg_t* rseg;
1841 ibool must_flush_log = FALSE;
1842 dulint lsn;
1843 mtr_t mtr;
1845 ut_ad(mutex_own(&kernel_mutex));
1847 rseg = trx->rseg;
1849 if (trx->insert_undo != NULL || trx->update_undo != NULL) {
1851 mutex_exit(&kernel_mutex);
1853 mtr_start(&mtr);
1855 must_flush_log = TRUE;
1857 /* Change the undo log segment states from TRX_UNDO_ACTIVE
1858 to TRX_UNDO_PREPARED: these modifications to the file data
1859 structure define the transaction as prepared in the
1860 file-based world, at the serialization point of lsn. */
1862 mutex_enter(&(rseg->mutex));
1864 if (trx->insert_undo != NULL) {
1866 /* It is not necessary to obtain trx->undo_mutex here
1867 because only a single OS thread is allowed to do the
1868 transaction prepare for this transaction. */
1870 trx_undo_set_state_at_prepare(trx, trx->insert_undo,
1871 &mtr);
1874 if (trx->update_undo) {
1875 trx_undo_set_state_at_prepare(
1876 trx, trx->update_undo, &mtr);
1879 mutex_exit(&(rseg->mutex));
1881 /*--------------*/
1882 mtr_commit(&mtr); /* This mtr commit makes the
1883 transaction prepared in the file-based
1884 world */
1885 /*--------------*/
1886 lsn = mtr.end_lsn;
1888 mutex_enter(&kernel_mutex);
1891 ut_ad(mutex_own(&kernel_mutex));
1893 /*--------------------------------------*/
1894 trx->conc_state = TRX_PREPARED;
1895 trx_n_prepared++;
1896 /*--------------------------------------*/
1898 if (must_flush_log) {
1899 /* Depending on the my.cnf options, we may now write the log
1900 buffer to the log files, making the prepared state of the
1901 transaction durable if the OS does not crash. We may also
1902 flush the log files to disk, making the prepared state of the
1903 transaction durable also at an OS crash or a power outage.
1905 The idea in InnoDB's group prepare is that a group of
1906 transactions gather behind a trx doing a physical disk write
1907 to log files, and when that physical write has been completed,
1908 one of those transactions does a write which prepares the whole
1909 group. Note that this group prepare will only bring benefit if
1910 there are > 2 users in the database. Then at least 2 users can
1911 gather behind one doing the physical log write to disk.
1913 TODO: find out if MySQL holds some mutex when calling this.
1914 That would spoil our group prepare algorithm. */
1916 mutex_exit(&kernel_mutex);
1918 if (srv_flush_log_at_trx_commit == 0) {
1919 /* Do nothing */
1920 } else if (srv_flush_log_at_trx_commit == 1) {
1921 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
1922 /* Write the log but do not flush it to disk */
1924 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
1925 FALSE);
1926 } else {
1927 /* Write the log to the log files AND flush
1928 them to disk */
1930 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
1932 } else if (srv_flush_log_at_trx_commit == 2) {
1934 /* Write the log but do not flush it to disk */
1936 log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
1937 } else {
1938 ut_error;
1941 mutex_enter(&kernel_mutex);
1945 /**************************************************************************
1946 Does the transaction prepare for MySQL. */
1948 ulint
1949 trx_prepare_for_mysql(
1950 /*==================*/
1951 /* out: 0 or error number */
1952 trx_t* trx) /* in: trx handle */
1954 /* Because we do not do the prepare by sending an Innobase
1955 sig to the transaction, we must here make sure that trx has been
1956 started. */
1958 ut_a(trx);
1960 trx->op_info = "preparing";
1962 trx_start_if_not_started(trx);
1964 mutex_enter(&kernel_mutex);
1966 trx_prepare_off_kernel(trx);
1968 mutex_exit(&kernel_mutex);
1970 trx->op_info = "";
1972 return(0);
1975 /**************************************************************************
1976 This function is used to find number of prepared transactions and
1977 their transaction objects for a recovery. */
1980 trx_recover_for_mysql(
1981 /*==================*/
1982 /* out: number of prepared transactions
1983 stored in xid_list */
1984 XID* xid_list, /* in/out: prepared transactions */
1985 ulint len) /* in: number of slots in xid_list */
1987 trx_t* trx;
1988 ulint count = 0;
1990 ut_ad(xid_list);
1991 ut_ad(len);
1993 /* We should set those transactions which are in the prepared state
1994 to the xid_list */
1996 mutex_enter(&kernel_mutex);
1998 trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2000 while (trx) {
2001 if (trx->conc_state == TRX_PREPARED) {
2002 xid_list[count] = trx->xid;
2004 if (count == 0) {
2005 ut_print_timestamp(stderr);
2006 fprintf(stderr,
2007 " InnoDB: Starting recovery for"
2008 " XA transactions...\n");
2011 ut_print_timestamp(stderr);
2012 fprintf(stderr,
2013 " InnoDB: Transaction %lu %lu in"
2014 " prepared state after recovery\n",
2015 (ulong) ut_dulint_get_high(trx->id),
2016 (ulong) ut_dulint_get_low(trx->id));
2018 ut_print_timestamp(stderr);
2019 fprintf(stderr,
2020 " InnoDB: Transaction contains changes"
2021 " to %lu rows\n",
2022 (ulong) ut_conv_dulint_to_longlong(
2023 trx->undo_no));
2025 count++;
2027 if (count == len) {
2028 break;
2032 trx = UT_LIST_GET_NEXT(trx_list, trx);
2035 mutex_exit(&kernel_mutex);
2037 if (count > 0){
2038 ut_print_timestamp(stderr);
2039 fprintf(stderr,
2040 " InnoDB: %lu transactions in prepared state"
2041 " after recovery\n",
2042 (ulong) count);
2045 return ((int) count);
2048 /***********************************************************************
2049 This function is used to find one X/Open XA distributed transaction
2050 which is in the prepared state */
2052 trx_t*
2053 trx_get_trx_by_xid(
2054 /*===============*/
2055 /* out: trx or NULL;
2056 on match, the trx->xid will be invalidated */
2057 const XID* xid) /* in: X/Open XA transaction identifier */
2059 trx_t* trx;
2061 if (xid == NULL) {
2063 return(NULL);
2066 mutex_enter(&kernel_mutex);
2068 trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
2070 while (trx) {
2071 /* Compare two X/Open XA transaction id's: their
2072 length should be the same and binary comparison
2073 of gtrid_lenght+bqual_length bytes should be
2074 the same */
2076 if (trx->conc_state == TRX_PREPARED
2077 && xid->gtrid_length == trx->xid.gtrid_length
2078 && xid->bqual_length == trx->xid.bqual_length
2079 && memcmp(xid->data, trx->xid.data,
2080 xid->gtrid_length + xid->bqual_length) == 0) {
2082 /* Invalidate the XID, so that subsequent calls
2083 will not find it. */
2084 memset(&trx->xid, 0, sizeof(trx->xid));
2085 trx->xid.formatID = -1;
2086 break;
2089 trx = UT_LIST_GET_NEXT(trx_list, trx);
2092 mutex_exit(&kernel_mutex);
2094 return(trx);