1 /******************************************************
2 The interface to the operating system file i/o primitives
6 Created 10/21/1995 Heikki Tuuri
7 *******************************************************/
11 #include "os0thread.h"
14 #include "srv0start.h"
18 #if defined(UNIV_HOTBACKUP) && defined(__WIN__)
19 /* Add includes for the _stat() call to compile on Windows */
20 #include <sys/types.h>
23 #endif /* UNIV_HOTBACKUP */
26 /* We assume in this case that the OS has standard Posix aio (at least SunOS
27 2.6, HP-UX 11i and AIX 4.3 have it) */
31 /* This specifies the file permissions InnoDB uses when it creates files in
32 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
36 ulint os_innodb_umask
= S_IRUSR
| S_IWUSR
| S_IRGRP
| S_IWGRP
;
38 ulint os_innodb_umask
= 0;
42 /* If the following is set to TRUE, we do not call os_file_flush in every
43 os_file_write. We can set this TRUE when the doublewrite buffer is used. */
44 ibool os_do_not_call_flush_at_each_write
= FALSE
;
46 /* We do not call os_file_flush in every os_file_write. */
47 #endif /* UNIV_DO_FLUSH */
49 /* We use these mutexes to protect lseek + file i/o operation, if the
50 OS does not provide an atomic pread or pwrite, or similar */
51 #define OS_FILE_N_SEEK_MUTEXES 16
52 os_mutex_t os_file_seek_mutexes
[OS_FILE_N_SEEK_MUTEXES
];
54 /* In simulated aio, merge at most this many consecutive i/os */
55 #define OS_AIO_MERGE_N_CONSECUTIVE 64
57 /* If this flag is TRUE, then we will use the native aio of the
58 OS (provided we compiled Innobase with it in), otherwise we will
59 use simulated aio we build below with threads */
61 ibool os_aio_use_native_aio
= FALSE
;
63 ibool os_aio_print_debug
= FALSE
;
65 /* The aio array slot structure */
66 typedef struct os_aio_slot_struct os_aio_slot_t
;
68 struct os_aio_slot_struct
{
69 ibool is_read
; /* TRUE if a read operation */
70 ulint pos
; /* index of the slot in the aio
72 ibool reserved
; /* TRUE if this slot is reserved */
73 time_t reservation_time
;/* time when reserved */
74 ulint len
; /* length of the block to read or
76 byte
* buf
; /* buffer used in i/o */
77 ulint type
; /* OS_FILE_READ or OS_FILE_WRITE */
78 ulint offset
; /* 32 low bits of file offset in
80 ulint offset_high
; /* 32 high bits of file offset */
81 os_file_t file
; /* file where to read or write */
82 const char* name
; /* file name or path */
83 ibool io_already_done
;/* used only in simulated aio:
84 TRUE if the physical i/o already
85 made and only the slot message
86 needs to be passed to the caller
87 of os_aio_simulated_handle */
88 fil_node_t
* message1
; /* message which is given by the */
89 void* message2
; /* the requester of an aio operation
90 and which can be used to identify
91 which pending aio operation was
94 os_event_t event
; /* event object we need in the
96 OVERLAPPED control
; /* Windows control block for the
98 #elif defined(POSIX_ASYNC_IO)
99 struct aiocb control
; /* Posix control block for aio
104 /* The aio array structure */
105 typedef struct os_aio_array_struct os_aio_array_t
;
107 struct os_aio_array_struct
{
108 os_mutex_t mutex
; /* the mutex protecting the aio array */
109 os_event_t not_full
; /* The event which is set to the signaled
110 state when there is space in the aio
111 outside the ibuf segment */
112 os_event_t is_empty
; /* The event which is set to the signaled
113 state when there are no pending i/os
115 ulint n_slots
; /* Total number of slots in the aio array.
116 This must be divisible by n_threads. */
117 ulint n_segments
;/* Number of segments in the aio array of
118 pending aio requests. A thread can wait
119 separately for any one of the segments. */
120 ulint n_reserved
;/* Number of reserved slots in the
121 aio array outside the ibuf segment */
122 os_aio_slot_t
* slots
; /* Pointer to the slots in the array */
124 os_native_event_t
* native_events
;
125 /* Pointer to an array of OS native event
126 handles where we copied the handles from
127 slots, in the same order. This can be used
128 in WaitForMultipleObjects; used only in
133 /* Array of events used in simulated aio */
134 os_event_t
* os_aio_segment_wait_events
= NULL
;
136 /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
137 are NULL when the module has not yet been initialized. */
138 static os_aio_array_t
* os_aio_read_array
= NULL
;
139 static os_aio_array_t
* os_aio_write_array
= NULL
;
140 static os_aio_array_t
* os_aio_ibuf_array
= NULL
;
141 static os_aio_array_t
* os_aio_log_array
= NULL
;
142 static os_aio_array_t
* os_aio_sync_array
= NULL
;
144 static ulint os_aio_n_segments
= ULINT_UNDEFINED
;
146 /* If the following is TRUE, read i/o handler threads try to
147 wait until a batch of new read requests have been posted */
148 static ibool os_aio_recommend_sleep_for_read_threads
= FALSE
;
150 ulint os_n_file_reads
= 0;
151 ulint os_bytes_read_since_printout
= 0;
152 ulint os_n_file_writes
= 0;
153 ulint os_n_fsyncs
= 0;
154 ulint os_n_file_reads_old
= 0;
155 ulint os_n_file_writes_old
= 0;
156 ulint os_n_fsyncs_old
= 0;
157 time_t os_last_printout
;
159 ibool os_has_said_disk_full
= FALSE
;
161 /* The mutex protecting the following counts of pending I/O operations */
162 static os_mutex_t os_file_count_mutex
;
163 ulint os_file_n_pending_preads
= 0;
164 ulint os_file_n_pending_pwrites
= 0;
165 ulint os_n_pending_writes
= 0;
166 ulint os_n_pending_reads
= 0;
168 /***************************************************************************
169 Gets the operating system version. Currently works only on Windows. */
172 os_get_os_version(void)
173 /*===================*/
174 /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
177 OSVERSIONINFO os_info
;
179 os_info
.dwOSVersionInfoSize
= sizeof(OSVERSIONINFO
);
181 ut_a(GetVersionEx(&os_info
));
183 if (os_info
.dwPlatformId
== VER_PLATFORM_WIN32s
) {
185 } else if (os_info
.dwPlatformId
== VER_PLATFORM_WIN32_WINDOWS
) {
187 } else if (os_info
.dwPlatformId
== VER_PLATFORM_WIN32_NT
) {
188 if (os_info
.dwMajorVersion
<= 4) {
204 /***************************************************************************
205 Retrieves the last error number if an error occurs in a file io function.
206 The number should be retrieved before any other OS calls (because they may
207 overwrite the error number). If the number is not known to this program,
208 the OS error number + 100 is returned. */
211 os_file_get_last_error(
212 /*===================*/
213 /* out: error number, or OS error
215 ibool report_all_errors
) /* in: TRUE if we want an error message
216 printed of all errors */
222 err
= (ulint
) GetLastError();
224 if (report_all_errors
225 || (err
!= ERROR_DISK_FULL
&& err
!= ERROR_FILE_EXISTS
)) {
227 ut_print_timestamp(stderr
);
229 " InnoDB: Operating system error number %lu"
230 " in a file operation.\n", (ulong
) err
);
232 if (err
== ERROR_PATH_NOT_FOUND
) {
234 "InnoDB: The error means the system"
235 " cannot find the path specified.\n");
237 if (srv_is_being_started
) {
239 "InnoDB: If you are installing InnoDB,"
240 " remember that you must create\n"
241 "InnoDB: directories yourself, InnoDB"
242 " does not create them.\n");
244 } else if (err
== ERROR_ACCESS_DENIED
) {
246 "InnoDB: The error means mysqld does not have"
247 " the access rights to\n"
248 "InnoDB: the directory. It may also be"
249 " you have created a subdirectory\n"
250 "InnoDB: of the same name as a data file.\n");
251 } else if (err
== ERROR_SHARING_VIOLATION
252 || err
== ERROR_LOCK_VIOLATION
) {
254 "InnoDB: The error means that another program"
255 " is using InnoDB's files.\n"
256 "InnoDB: This might be a backup or antivirus"
257 " software or another instance\n"
259 " Please close it to get rid of this error.\n");
260 } else if (err
== ERROR_OPERATION_ABORTED
) {
262 "InnoDB: The error means that the I/O"
263 " operation has been aborted\n"
264 "InnoDB: because of either a thread exit"
265 " or an application request.\n"
266 "InnoDB: Retry attempt is made.\n");
269 "InnoDB: Some operating system error numbers"
270 " are described at\n"
272 "http://dev.mysql.com/doc/refman/5.1/en/"
273 "operating-system-error-codes.html\n");
279 if (err
== ERROR_FILE_NOT_FOUND
) {
280 return(OS_FILE_NOT_FOUND
);
281 } else if (err
== ERROR_DISK_FULL
) {
282 return(OS_FILE_DISK_FULL
);
283 } else if (err
== ERROR_FILE_EXISTS
) {
284 return(OS_FILE_ALREADY_EXISTS
);
285 } else if (err
== ERROR_SHARING_VIOLATION
286 || err
== ERROR_LOCK_VIOLATION
) {
287 return(OS_FILE_SHARING_VIOLATION
);
288 } else if (err
== ERROR_OPERATION_ABORTED
) {
289 return(OS_FILE_OPERATION_ABORTED
);
296 if (report_all_errors
297 || (err
!= ENOSPC
&& err
!= EEXIST
)) {
299 ut_print_timestamp(stderr
);
301 " InnoDB: Operating system error number %lu"
302 " in a file operation.\n", (ulong
) err
);
306 "InnoDB: The error means the system"
307 " cannot find the path specified.\n");
309 if (srv_is_being_started
) {
311 "InnoDB: If you are installing InnoDB,"
312 " remember that you must create\n"
313 "InnoDB: directories yourself, InnoDB"
314 " does not create them.\n");
316 } else if (err
== EACCES
) {
318 "InnoDB: The error means mysqld does not have"
319 " the access rights to\n"
320 "InnoDB: the directory.\n");
322 if (strerror((int)err
) != NULL
) {
324 "InnoDB: Error number %lu"
326 err
, strerror((int)err
));
330 "InnoDB: Some operating system"
331 " error numbers are described at\n"
333 "http://dev.mysql.com/doc/refman/5.1/en/"
334 "operating-system-error-codes.html\n");
341 return(OS_FILE_DISK_FULL
);
342 #ifdef POSIX_ASYNC_IO
343 } else if (err
== EAGAIN
) {
344 return(OS_FILE_AIO_RESOURCES_RESERVED
);
346 } else if (err
== ENOENT
) {
347 return(OS_FILE_NOT_FOUND
);
348 } else if (err
== EEXIST
) {
349 return(OS_FILE_ALREADY_EXISTS
);
350 } else if (err
== EXDEV
|| err
== ENOTDIR
|| err
== EISDIR
) {
351 return(OS_FILE_PATH_ERROR
);
358 /********************************************************************
359 Does error handling when a file operation fails.
360 Conditionally exits (calling exit(3)) based on should_exit value and the
365 os_file_handle_error_cond_exit(
366 /*===========================*/
367 /* out: TRUE if we should retry the
369 const char* name
, /* in: name of a file or NULL */
370 const char* operation
, /* in: operation */
371 ibool should_exit
) /* in: call exit(3) if unknown error
372 and this parameter is TRUE */
376 err
= os_file_get_last_error(FALSE
);
378 if (err
== OS_FILE_DISK_FULL
) {
379 /* We only print a warning about disk full once */
381 if (os_has_said_disk_full
) {
387 ut_print_timestamp(stderr
);
389 " InnoDB: Encountered a problem with"
393 ut_print_timestamp(stderr
);
395 " InnoDB: Disk is full. Try to clean the disk"
396 " to free space.\n");
398 os_has_said_disk_full
= TRUE
;
403 } else if (err
== OS_FILE_AIO_RESOURCES_RESERVED
) {
406 } else if (err
== OS_FILE_ALREADY_EXISTS
407 || err
== OS_FILE_PATH_ERROR
) {
410 } else if (err
== OS_FILE_SHARING_VIOLATION
) {
412 os_thread_sleep(10000000); /* 10 sec */
414 } else if (err
== OS_FILE_OPERATION_ABORTED
) {
416 os_thread_sleep(100000); /* 100 ms */
420 fprintf(stderr
, "InnoDB: File name %s\n", name
);
423 fprintf(stderr
, "InnoDB: File operation call: '%s'.\n",
427 fprintf(stderr
, "InnoDB: Cannot continue operation.\n");
438 /********************************************************************
439 Does error handling when a file operation fails; exits in case of an unknown error. */
442 os_file_handle_error(
443 /*=================*/
444 /* out: TRUE if we should retry the
446 const char* name
, /* in: name of a file or NULL */
447 const char* operation
)/* in: operation */
449 /* exit in case of unknown error */
450 return(os_file_handle_error_cond_exit(name
, operation
, TRUE
));
453 /********************************************************************
454 Does error handling when a file operation fails, without exiting in case of an unknown error. */
457 os_file_handle_error_no_exit(
458 /*=========================*/
459 /* out: TRUE if we should retry the
461 const char* name
, /* in: name of a file or NULL */
462 const char* operation
)/* in: operation */
464 /* don't exit in case of unknown error */
465 return(os_file_handle_error_cond_exit(name
, operation
, FALSE
));
469 #define USE_FILE_LOCK
470 #if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
471 /* InnoDB Hot Backup does not lock the data files.
472 * On Windows, mandatory locking is used.
474 # undef USE_FILE_LOCK
477 /********************************************************************
478 Obtain an exclusive lock on a file. */
483 /* out: 0 on success */
484 int fd
, /* in: file descriptor */
485 const char* name
) /* in: file name */
489 lk
.l_whence
= SEEK_SET
;
490 lk
.l_start
= lk
.l_len
= 0;
491 if (fcntl(fd
, F_SETLK
, &lk
) == -1) {
493 "InnoDB: Unable to lock %s, error: %d\n", name
, errno
);
495 if (errno
== EAGAIN
|| errno
== EACCES
) {
497 "InnoDB: Check that you do not already have"
498 " another mysqld process\n"
499 "InnoDB: using the same InnoDB data"
508 #endif /* USE_FILE_LOCK */
510 /********************************************************************
511 Creates the file count mutex and the seek mutexes used in positioned reads and writes. */
514 os_io_init_simple(void)
515 /*===================*/
519 os_file_count_mutex
= os_mutex_create(NULL
);
521 for (i
= 0; i
< OS_FILE_N_SEEK_MUTEXES
; i
++) {
522 os_file_seek_mutexes
[i
] = os_mutex_create(NULL
);
526 #if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
527 /*************************************************************************
528 Creates a temporary file that will be deleted on close.
529 This function is defined in ha_innodb.cc. */
532 innobase_mysql_tmpfile(void);
533 /*========================*/
534 /* out: temporary file descriptor, or < 0 on error */
535 #endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
537 /***************************************************************************
538 Creates a temporary file. This function is like tmpfile(3), but
539 the temporary file is created in the MySQL temporary directory.
540 On Netware, this function is like tmpfile(3), because the C run-time
541 library of Netware does not expose the delete-on-close flag. */
544 os_file_create_tmpfile(void)
545 /*========================*/
546 /* out: temporary file handle, or NULL on error */
548 #ifdef UNIV_HOTBACKUP
554 FILE* file
= tmpfile();
555 # else /* __NETWARE__ */
557 int fd
= innobase_mysql_tmpfile();
560 file
= fdopen(fd
, "w+b");
562 # endif /* __NETWARE__ */
565 ut_print_timestamp(stderr
);
567 " InnoDB: Error: unable to create temporary file;"
568 " errno: %d\n", errno
);
573 # endif /* !__NETWARE__ */
577 #endif /* UNIV_HOTBACKUP */
580 /***************************************************************************
581 The os_file_opendir() function opens a directory stream corresponding to the
582 directory named by the dirname argument. The directory stream is positioned
583 at the first entry. In both Unix and Windows we automatically skip the '.'
584 and '..' items at the start of the directory listing. */
589 /* out: directory stream, NULL if
591 const char* dirname
, /* in: directory name; it must not
592 contain a trailing '\' or '/' */
593 ibool error_is_fatal
) /* in: TRUE if we should treat an
594 error as a fatal error; if we try to
595 open symlinks then we do not wish a
596 fatal error if it happens not to be
601 LPWIN32_FIND_DATA lpFindFileData
;
602 char path
[OS_FILE_MAX_PATH
+ 3];
604 ut_a(strlen(dirname
) < OS_FILE_MAX_PATH
);
606 strcpy(path
, dirname
);
607 strcpy(path
+ strlen(path
), "\\*");
609 /* Note that in Windows opening the 'directory stream' also retrieves
610 the first entry in the directory. Since it is '.', that is no problem,
611 as we will skip over the '.' and '..' entries anyway. */
613 lpFindFileData
= ut_malloc(sizeof(WIN32_FIND_DATA
));
615 dir
= FindFirstFile((LPCTSTR
) path
, lpFindFileData
);
617 ut_free(lpFindFileData
);
619 if (dir
== INVALID_HANDLE_VALUE
) {
621 if (error_is_fatal
) {
622 os_file_handle_error(dirname
, "opendir");
630 dir
= opendir(dirname
);
632 if (dir
== NULL
&& error_is_fatal
) {
633 os_file_handle_error(dirname
, "opendir");
640 /***************************************************************************
641 Closes a directory stream. */
646 /* out: 0 if success, -1 if failure */
647 os_file_dir_t dir
) /* in: directory stream */
652 ret
= FindClose(dir
);
655 os_file_handle_error_no_exit(NULL
, "closedir");
667 os_file_handle_error_no_exit(NULL
, "closedir");
674 /***************************************************************************
675 This function returns information about the next file in the directory. We jump
676 over the '.' and '..' entries in the directory. */
679 os_file_readdir_next_file(
680 /*======================*/
681 /* out: 0 if ok, -1 if error, 1 if at the end
683 const char* dirname
,/* in: directory name or path */
684 os_file_dir_t dir
, /* in: directory stream */
685 os_file_stat_t
* info
) /* in/out: buffer where the info is returned */
688 LPWIN32_FIND_DATA lpFindFileData
;
691 lpFindFileData
= ut_malloc(sizeof(WIN32_FIND_DATA
));
693 ret
= FindNextFile(dir
, lpFindFileData
);
696 ut_a(strlen((char *) lpFindFileData
->cFileName
)
699 if (strcmp((char *) lpFindFileData
->cFileName
, ".") == 0
700 || strcmp((char *) lpFindFileData
->cFileName
, "..") == 0) {
705 strcpy(info
->name
, (char *) lpFindFileData
->cFileName
);
707 info
->size
= (ib_longlong
)(lpFindFileData
->nFileSizeLow
)
708 + (((ib_longlong
)(lpFindFileData
->nFileSizeHigh
))
711 if (lpFindFileData
->dwFileAttributes
712 & FILE_ATTRIBUTE_REPARSE_POINT
) {
713 /* TODO: test Windows symlinks */
714 /* TODO: MySQL has apparently its own symlink
715 implementation in Windows, dbname.sym can
716 redirect a database directory:
717 http://dev.mysql.com/doc/refman/5.1/en/
718 windows-symbolic-links.html */
719 info
->type
= OS_FILE_TYPE_LINK
;
720 } else if (lpFindFileData
->dwFileAttributes
721 & FILE_ATTRIBUTE_DIRECTORY
) {
722 info
->type
= OS_FILE_TYPE_DIR
;
724 /* It is probably safest to assume that all other
725 file types are normal. Better to check them rather
726 than blindly skip them. */
728 info
->type
= OS_FILE_TYPE_FILE
;
732 ut_free(lpFindFileData
);
736 } else if (GetLastError() == ERROR_NO_MORE_FILES
) {
740 os_file_handle_error_no_exit(dirname
,
741 "readdir_next_file");
748 struct stat statinfo
;
749 #ifdef HAVE_READDIR_R
750 char dirent_buf
[sizeof(struct dirent
)
751 + _POSIX_PATH_MAX
+ 100];
752 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
753 the max file name len; but in most standards, the
754 length is NAME_MAX; we add 100 to be even safer */
759 #ifdef HAVE_READDIR_R
760 ret
= readdir_r(dir
, (struct dirent
*)dirent_buf
, &ent
);
764 /* On AIX, only if we got non-NULL 'ent' (result) value and
765 a non-zero 'ret' (return) value, it indicates a failed
766 readdir_r() call. A NULL 'ent' with a non-zero 'ret'
767 would indicate the "end of the directory" is reached. */
772 "InnoDB: cannot read directory %s, error %lu\n",
773 dirname
, (ulong
)ret
);
779 /* End of directory */
784 ut_a(strlen(ent
->d_name
) < _POSIX_PATH_MAX
+ 100 - 1);
793 ut_a(strlen(ent
->d_name
) < OS_FILE_MAX_PATH
);
795 if (strcmp(ent
->d_name
, ".") == 0 || strcmp(ent
->d_name
, "..") == 0) {
800 strcpy(info
->name
, ent
->d_name
);
802 full_path
= ut_malloc(strlen(dirname
) + strlen(ent
->d_name
) + 10);
804 sprintf(full_path
, "%s/%s", dirname
, ent
->d_name
);
806 ret
= stat(full_path
, &statinfo
);
809 os_file_handle_error_no_exit(full_path
, "stat");
816 info
->size
= (ib_longlong
)statinfo
.st_size
;
818 if (S_ISDIR(statinfo
.st_mode
)) {
819 info
->type
= OS_FILE_TYPE_DIR
;
820 } else if (S_ISLNK(statinfo
.st_mode
)) {
821 info
->type
= OS_FILE_TYPE_LINK
;
822 } else if (S_ISREG(statinfo
.st_mode
)) {
823 info
->type
= OS_FILE_TYPE_FILE
;
825 info
->type
= OS_FILE_TYPE_UNKNOWN
;
834 /*********************************************************************
835 This function attempts to create a directory named pathname. The new directory
836 gets default permissions. On Unix the permissions are (0770 & ~umask). If the
837 directory exists already, nothing is done and the call succeeds, unless the
838 fail_if_exists argument is true. */
841 os_file_create_directory(
842 /*=====================*/
843 /* out: TRUE if call succeeds,
845 const char* pathname
, /* in: directory name as
846 null-terminated string */
847 ibool fail_if_exists
) /* in: if TRUE, pre-existing directory
848 is treated as an error. */
853 rcode
= CreateDirectory((LPCTSTR
) pathname
, NULL
);
855 || (GetLastError() == ERROR_ALREADY_EXISTS
856 && !fail_if_exists
))) {
858 os_file_handle_error(pathname
, "CreateDirectory");
867 rcode
= mkdir(pathname
, 0770);
869 if (!(rcode
== 0 || (errno
== EEXIST
&& !fail_if_exists
))) {
871 os_file_handle_error(pathname
, "mkdir");
880 /********************************************************************
881 A simple function to open or create a file. */
884 os_file_create_simple(
885 /*==================*/
886 /* out, own: handle to the file, not defined
887 if error, error number can be retrieved with
888 os_file_get_last_error */
889 const char* name
, /* in: name of the file or path as a
890 null-terminated string */
891 ulint create_mode
,/* in: OS_FILE_OPEN if an existing file is
892 opened (if does not exist, error), or
893 OS_FILE_CREATE if a new file is created
894 (if exists, error), or
895 OS_FILE_CREATE_PATH if new file
896 (if exists, error) and subdirectories along
897 its path are created (if needed)*/
898 ulint access_type
,/* in: OS_FILE_READ_ONLY or
899 OS_FILE_READ_WRITE */
900 ibool
* success
)/* out: TRUE if succeed, FALSE if error */
906 DWORD attributes
= 0;
912 if (create_mode
== OS_FILE_OPEN
) {
913 create_flag
= OPEN_EXISTING
;
914 } else if (create_mode
== OS_FILE_CREATE
) {
915 create_flag
= CREATE_NEW
;
916 } else if (create_mode
== OS_FILE_CREATE_PATH
) {
917 /* create subdirs along the path if needed */
918 *success
= os_file_create_subdirs_if_needed(name
);
922 create_flag
= CREATE_NEW
;
923 create_mode
= OS_FILE_CREATE
;
929 if (access_type
== OS_FILE_READ_ONLY
) {
930 access
= GENERIC_READ
;
931 } else if (access_type
== OS_FILE_READ_WRITE
) {
932 access
= GENERIC_READ
| GENERIC_WRITE
;
938 file
= CreateFile((LPCTSTR
) name
,
940 FILE_SHARE_READ
| FILE_SHARE_WRITE
,
941 /* file can be read and written also
942 by other processes */
943 NULL
, /* default security attributes */
946 NULL
); /* no template file */
948 if (file
== INVALID_HANDLE_VALUE
) {
951 retry
= os_file_handle_error(name
,
952 create_mode
== OS_FILE_OPEN
?
970 if (create_mode
== OS_FILE_OPEN
) {
971 if (access_type
== OS_FILE_READ_ONLY
) {
972 create_flag
= O_RDONLY
;
974 create_flag
= O_RDWR
;
976 } else if (create_mode
== OS_FILE_CREATE
) {
977 create_flag
= O_RDWR
| O_CREAT
| O_EXCL
;
978 } else if (create_mode
== OS_FILE_CREATE_PATH
) {
979 /* create subdirs along the path if needed */
980 *success
= os_file_create_subdirs_if_needed(name
);
984 create_flag
= O_RDWR
| O_CREAT
| O_EXCL
;
985 create_mode
= OS_FILE_CREATE
;
991 if (create_mode
== OS_FILE_CREATE
) {
992 file
= open(name
, create_flag
, S_IRUSR
| S_IWUSR
993 | S_IRGRP
| S_IWGRP
);
995 file
= open(name
, create_flag
);
1001 retry
= os_file_handle_error(name
,
1002 create_mode
== OS_FILE_OPEN
?
1007 #ifdef USE_FILE_LOCK
1008 } else if (access_type
== OS_FILE_READ_WRITE
1009 && os_file_lock(file
, name
)) {
1019 #endif /* __WIN__ */
1022 /********************************************************************
1023 A simple function to open or create a file. */
1026 os_file_create_simple_no_error_handling(
1027 /*====================================*/
1028 /* out, own: handle to the file, not defined
1029 if error, error number can be retrieved with
1030 os_file_get_last_error */
1031 const char* name
, /* in: name of the file or path as a
1032 null-terminated string */
1033 ulint create_mode
,/* in: OS_FILE_OPEN if an existing file
1034 is opened (if does not exist, error), or
1035 OS_FILE_CREATE if a new file is created
1036 (if exists, error) */
1037 ulint access_type
,/* in: OS_FILE_READ_ONLY,
1038 OS_FILE_READ_WRITE, or
1039 OS_FILE_READ_ALLOW_DELETE; the last option is
1040 used by a backup program reading the file */
1041 ibool
* success
)/* out: TRUE if succeed, FALSE if error */
1047 DWORD attributes
= 0;
1048 DWORD share_mode
= FILE_SHARE_READ
| FILE_SHARE_WRITE
;
1052 if (create_mode
== OS_FILE_OPEN
) {
1053 create_flag
= OPEN_EXISTING
;
1054 } else if (create_mode
== OS_FILE_CREATE
) {
1055 create_flag
= CREATE_NEW
;
1061 if (access_type
== OS_FILE_READ_ONLY
) {
1062 access
= GENERIC_READ
;
1063 } else if (access_type
== OS_FILE_READ_WRITE
) {
1064 access
= GENERIC_READ
| GENERIC_WRITE
;
1065 } else if (access_type
== OS_FILE_READ_ALLOW_DELETE
) {
1066 access
= GENERIC_READ
;
1067 share_mode
= FILE_SHARE_DELETE
| FILE_SHARE_READ
1068 | FILE_SHARE_WRITE
; /* A backup program has to give
1069 mysqld the maximum freedom to
1070 do what it likes with the
1077 file
= CreateFile((LPCTSTR
) name
,
1080 NULL
, /* default security attributes */
1083 NULL
); /* no template file */
1085 if (file
== INVALID_HANDLE_VALUE
) {
1098 if (create_mode
== OS_FILE_OPEN
) {
1099 if (access_type
== OS_FILE_READ_ONLY
) {
1100 create_flag
= O_RDONLY
;
1102 create_flag
= O_RDWR
;
1104 } else if (create_mode
== OS_FILE_CREATE
) {
1105 create_flag
= O_RDWR
| O_CREAT
| O_EXCL
;
1111 if (create_mode
== OS_FILE_CREATE
) {
1112 file
= open(name
, create_flag
, S_IRUSR
| S_IWUSR
1113 | S_IRGRP
| S_IWGRP
);
1115 file
= open(name
, create_flag
);
1120 #ifdef USE_FILE_LOCK
1121 } else if (access_type
== OS_FILE_READ_WRITE
1122 && os_file_lock(file
, name
)) {
1132 #endif /* __WIN__ */
1135 /********************************************************************
1136 Tries to disable OS caching on an opened file descriptor. */
1139 os_file_set_nocache(
1140 /*================*/
1141 int fd
/* in: file descriptor to alter */
1142 __attribute__((unused
)),
1143 const char* file_name
/* in: used in the diagnostic message */
1144 __attribute__((unused
)),
1145 const char* operation_name
__attribute__((unused
)))
1146 /* in: used in the diagnostic message,
1147 we call os_file_set_nocache()
1148 immediately after opening or creating
1149 a file, so this is either "open" or
1152 /* some versions of Solaris may not have DIRECTIO_ON */
1153 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1154 if (directio(fd
, DIRECTIO_ON
) == -1) {
1156 errno_save
= (int)errno
;
1157 ut_print_timestamp(stderr
);
1159 " InnoDB: Failed to set DIRECTIO_ON "
1160 "on file %s: %s: %s, continuing anyway\n",
1161 file_name
, operation_name
, strerror(errno_save
));
1163 #elif defined(O_DIRECT)
1164 if (fcntl(fd
, F_SETFL
, O_DIRECT
) == -1) {
1166 errno_save
= (int)errno
;
1167 ut_print_timestamp(stderr
);
1169 " InnoDB: Failed to set O_DIRECT "
1170 "on file %s: %s: %s, continuing anyway\n",
1171 file_name
, operation_name
, strerror(errno_save
));
1172 if (errno_save
== EINVAL
) {
1173 ut_print_timestamp(stderr
);
1175 " InnoDB: O_DIRECT is known to result in "
1176 "'Invalid argument' on Linux on tmpfs, "
1177 "see MySQL Bug#26662\n");
1183 /********************************************************************
1184 Opens an existing file or creates a new one. */
1189 /* out, own: handle to the file, not defined
1190 if error, error number can be retrieved with
1191 os_file_get_last_error */
1192 const char* name
, /* in: name of the file or path as a
1193 null-terminated string */
1194 ulint create_mode
,/* in: OS_FILE_OPEN if an existing file
1195 is opened (if does not exist, error), or
1196 OS_FILE_CREATE if a new file is created
1198 OS_FILE_OVERWRITE if a new file is created
1199 or an old overwritten;
1200 OS_FILE_OPEN_RAW, if a raw device or disk
1201 partition should be opened */
1202 ulint purpose
,/* in: OS_FILE_AIO, if asynchronous,
1203 non-buffered i/o is desired,
1204 OS_FILE_NORMAL, if any normal file;
1205 NOTE that it also depends on type, os_aio_..
1206 and srv_.. variables whether we really use
1207 async i/o or unbuffered i/o: look in the
1208 function source code for the exact rules */
1209 ulint type
, /* in: OS_DATA_FILE or OS_LOG_FILE */
1210 ibool
* success
)/* out: TRUE if succeed, FALSE if error */
1214 DWORD share_mode
= FILE_SHARE_READ
;
1220 "ib_create_table_fail_disk_full",
1222 SetLastError(ERROR_DISK_FULL
);
1223 return((os_file_t
) -1);
1229 if (create_mode
== OS_FILE_OPEN_RAW
) {
1230 create_flag
= OPEN_EXISTING
;
1231 share_mode
= FILE_SHARE_WRITE
;
1232 } else if (create_mode
== OS_FILE_OPEN
1233 || create_mode
== OS_FILE_OPEN_RETRY
) {
1234 create_flag
= OPEN_EXISTING
;
1235 } else if (create_mode
== OS_FILE_CREATE
) {
1236 create_flag
= CREATE_NEW
;
1237 } else if (create_mode
== OS_FILE_OVERWRITE
) {
1238 create_flag
= CREATE_ALWAYS
;
1244 if (purpose
== OS_FILE_AIO
) {
1245 /* If specified, use asynchronous (overlapped) io and no
1246 buffering of writes in the OS */
1249 if (os_aio_use_native_aio
) {
1250 attributes
= attributes
| FILE_FLAG_OVERLAPPED
;
1253 #ifdef UNIV_NON_BUFFERED_IO
1254 if (type
== OS_LOG_FILE
&& srv_flush_log_at_trx_commit
== 2) {
1255 /* Do not use unbuffered i/o to log files because
1256 value 2 denotes that we do not flush the log at every
1257 commit, but only once per second */
1258 } else if (srv_win_file_flush_method
1259 == SRV_WIN_IO_UNBUFFERED
) {
1260 attributes
= attributes
| FILE_FLAG_NO_BUFFERING
;
1263 } else if (purpose
== OS_FILE_NORMAL
) {
1265 #ifdef UNIV_NON_BUFFERED_IO
1266 if (type
== OS_LOG_FILE
&& srv_flush_log_at_trx_commit
== 2) {
1267 /* Do not use unbuffered i/o to log files because
1268 value 2 denotes that we do not flush the log at every
1269 commit, but only once per second */
1270 } else if (srv_win_file_flush_method
1271 == SRV_WIN_IO_UNBUFFERED
) {
1272 attributes
= attributes
| FILE_FLAG_NO_BUFFERING
;
1280 file
= CreateFile((LPCTSTR
) name
,
1281 GENERIC_READ
| GENERIC_WRITE
, /* read and write
1283 share_mode
, /* File can be read also by other
1284 processes; we must give the read
1285 permission because of ibbackup. We do
1286 not give the write permission to
1287 others because if one would succeed to
1288 start 2 instances of mysqld on the
1289 SAME files, that could cause severe
1290 database corruption! When opening
1291 raw disk partitions, Microsoft manuals
1292 say that we must give also the write
1294 NULL
, /* default security attributes */
1297 NULL
); /* no template file */
1299 if (file
== INVALID_HANDLE_VALUE
) {
1302 /* When srv_file_per_table is on, file creation failure may not
1303 be critical to the whole instance. Do not crash the server in
1304 case of unknown errors. */
1305 if (srv_file_per_table
) {
1306 retry
= os_file_handle_error_no_exit(name
,
1307 create_mode
== OS_FILE_CREATE
?
1310 retry
= os_file_handle_error(name
,
1311 create_mode
== OS_FILE_CREATE
?
1327 const char* mode_str
= NULL
;
1330 "ib_create_table_fail_disk_full",
1333 return((os_file_t
) -1);
1339 if (create_mode
== OS_FILE_OPEN
|| create_mode
== OS_FILE_OPEN_RAW
1340 || create_mode
== OS_FILE_OPEN_RETRY
) {
1342 create_flag
= O_RDWR
;
1343 } else if (create_mode
== OS_FILE_CREATE
) {
1344 mode_str
= "CREATE";
1345 create_flag
= O_RDWR
| O_CREAT
| O_EXCL
;
1346 } else if (create_mode
== OS_FILE_OVERWRITE
) {
1347 mode_str
= "OVERWRITE";
1348 create_flag
= O_RDWR
| O_CREAT
| O_TRUNC
;
1354 ut_a(type
== OS_LOG_FILE
|| type
== OS_DATA_FILE
);
1355 ut_a(purpose
== OS_FILE_AIO
|| purpose
== OS_FILE_NORMAL
);
1358 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1359 O_SYNC because the datasync options seemed to corrupt files in 2001
1360 in both Linux and Solaris */
1361 if (type
== OS_LOG_FILE
1362 && srv_unix_file_flush_method
== SRV_UNIX_O_DSYNC
) {
1365 fprintf(stderr
, "Using O_SYNC for file %s\n", name
);
1368 create_flag
= create_flag
| O_SYNC
;
1372 file
= open(name
, create_flag
, os_innodb_umask
);
1377 /* When srv_file_per_table is on, file creation failure may not
1378 be critical to the whole instance. Do not crash the server in
1379 case of unknown errors. */
1380 if (srv_file_per_table
) {
1381 retry
= os_file_handle_error_no_exit(name
,
1382 create_mode
== OS_FILE_CREATE
?
1385 retry
= os_file_handle_error(name
,
1386 create_mode
== OS_FILE_CREATE
?
1393 return(file
/* -1 */);
1400 /* We disable OS caching (O_DIRECT) only on data files */
1401 if (type
!= OS_LOG_FILE
1402 && srv_unix_file_flush_method
== SRV_UNIX_O_DIRECT
) {
1404 os_file_set_nocache(file
, name
, mode_str
);
1407 #ifdef USE_FILE_LOCK
1408 if (create_mode
!= OS_FILE_OPEN_RAW
&& os_file_lock(file
, name
)) {
1410 if (create_mode
== OS_FILE_OPEN_RETRY
) {
1412 ut_print_timestamp(stderr
);
1413 fputs(" InnoDB: Retrying to lock"
1414 " the first data file\n",
1416 for (i
= 0; i
< 100; i
++) {
1417 os_thread_sleep(1000000);
1418 if (!os_file_lock(file
, name
)) {
1423 ut_print_timestamp(stderr
);
1424 fputs(" InnoDB: Unable to open the first data file\n",
1432 #endif /* USE_FILE_LOCK */
1435 #endif /* __WIN__ */
1438 /***************************************************************************
1439 Deletes a file if it exists. The file has to be closed before calling this. */
1442 os_file_delete_if_exists(
1443 /*=====================*/
1444 /* out: TRUE if success */
1445 const char* name
) /* in: file path as a null-terminated string */
1451 /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1454 ret
= DeleteFile((LPCTSTR
)name
);
1460 if (GetLastError() == ERROR_FILE_NOT_FOUND
) {
1461 /* the file does not exist, this is not an error */
1468 if (count
> 100 && 0 == (count
% 10)) {
1470 "InnoDB: Warning: cannot delete file %s\n"
1471 "InnoDB: Are you running ibbackup"
1472 " to back up the file?\n", name
);
1474 os_file_get_last_error(TRUE
); /* print error information */
1477 os_thread_sleep(1000000); /* sleep for a second */
1488 ret
= unlink((const char*)name
);
1490 if (ret
!= 0 && errno
!= ENOENT
) {
1491 os_file_handle_error_no_exit(name
, "delete");
1500 /***************************************************************************
1501 Deletes a file. The file has to be closed before calling this. */
1506 /* out: TRUE if success */
1507 const char* name
) /* in: file path as a null-terminated string */
1513 /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1516 ret
= DeleteFile((LPCTSTR
)name
);
1522 if (GetLastError() == ERROR_FILE_NOT_FOUND
) {
1523 /* If the file does not exist, we classify this as a 'mild'
1531 if (count
> 100 && 0 == (count
% 10)) {
1533 "InnoDB: Warning: cannot delete file %s\n"
1534 "InnoDB: Are you running ibbackup"
1535 " to back up the file?\n", name
);
1537 os_file_get_last_error(TRUE
); /* print error information */
1540 os_thread_sleep(1000000); /* sleep for a second */
1551 ret
= unlink((const char*)name
);
1554 os_file_handle_error_no_exit(name
, "delete");
1563 /***************************************************************************
1564 Renames a file (can also move it to another directory). It is safest that the
1565 file is closed before calling this function. */
1570 /* out: TRUE if success */
1571 const char* oldpath
,/* in: old file path as a null-terminated
1573 const char* newpath
)/* in: new file path */
1578 ret
= MoveFile((LPCTSTR
)oldpath
, (LPCTSTR
)newpath
);
1584 os_file_handle_error_no_exit(oldpath
, "rename");
1590 ret
= rename((const char*)oldpath
, (const char*)newpath
);
1593 os_file_handle_error_no_exit(oldpath
, "rename");
1602 /***************************************************************************
1603 Closes a file handle. In case of error, error number can be retrieved with
1604 os_file_get_last_error. */
1609 /* out: TRUE if success */
1610 os_file_t file
) /* in, own: handle to a file */
1617 ret
= CloseHandle(file
);
1623 os_file_handle_error(NULL
, "close");
1632 os_file_handle_error(NULL
, "close");
1641 /***************************************************************************
1642 Closes a file handle. */
1645 os_file_close_no_error_handling(
1646 /*============================*/
1647 /* out: TRUE if success */
1648 os_file_t file
) /* in, own: handle to a file */
1655 ret
= CloseHandle(file
);
1676 /***************************************************************************
1677 Gets a file size. */
1682 /* out: TRUE if success */
1683 os_file_t file
, /* in: handle to a file */
1684 ulint
* size
, /* out: least significant 32 bits of file
1686 ulint
* size_high
)/* out: most significant 32 bits of size */
1692 low
= GetFileSize(file
, &high
);
1694 if ((low
== 0xFFFFFFFF) && (GetLastError() != NO_ERROR
)) {
1705 offs
= lseek(file
, 0, SEEK_END
);
1707 if (offs
== ((off_t
)-1)) {
1712 if (sizeof(off_t
) > 4) {
1713 *size
= (ulint
)(offs
& 0xFFFFFFFFUL
);
1714 *size_high
= (ulint
)(offs
>> 32);
1716 *size
= (ulint
) offs
;
1724 /***************************************************************************
1725 Gets file size as a 64-bit integer ib_longlong. */
1728 os_file_get_size_as_iblonglong(
1729 /*===========================*/
1730 /* out: size in bytes, -1 if error */
1731 os_file_t file
) /* in: handle to a file */
1737 success
= os_file_get_size(file
, &size
, &size_high
);
1744 return((((ib_longlong
)size_high
) << 32) + (ib_longlong
)size
);
1747 /***************************************************************************
1748 Write the specified number of zeros to a newly created file. */
1753 /* out: TRUE if success */
1754 const char* name
, /* in: name of the file or path as a
1755 null-terminated string */
1756 os_file_t file
, /* in: handle to a file */
1757 ulint size
, /* in: least significant 32 bits of file
1759 ulint size_high
)/* in: most significant 32 bits of size */
1761 ib_longlong current_size
;
1762 ib_longlong desired_size
;
1768 ut_a(size
== (size
& 0xFFFFFFFF));
1771 desired_size
= (ib_longlong
)size
+ (((ib_longlong
)size_high
) << 32);
1773 /* Write up to 1 megabyte at a time. */
1774 buf_size
= ut_min(64, (ulint
) (desired_size
/ UNIV_PAGE_SIZE
))
1776 buf2
= ut_malloc(buf_size
+ UNIV_PAGE_SIZE
);
1778 /* Align the buffer for possible raw i/o */
1779 buf
= ut_align(buf2
, UNIV_PAGE_SIZE
);
1781 /* Write buffer full of zeros */
1782 memset(buf
, 0, buf_size
);
1784 if (desired_size
>= (ib_longlong
)(100 * 1024 * 1024)) {
1786 fprintf(stderr
, "InnoDB: Progress in MB:");
1789 while (current_size
< desired_size
) {
1792 if (desired_size
- current_size
< (ib_longlong
) buf_size
) {
1793 n_bytes
= (ulint
) (desired_size
- current_size
);
1798 ret
= os_file_write(name
, file
, buf
,
1799 (ulint
)(current_size
& 0xFFFFFFFF),
1800 (ulint
)(current_size
>> 32),
1804 goto error_handling
;
1807 /* Print about progress for each 100 MB written */
1808 if ((ib_longlong
) (current_size
+ n_bytes
) / (ib_longlong
)(100 * 1024 * 1024)
1809 != current_size
/ (ib_longlong
)(100 * 1024 * 1024)) {
1811 fprintf(stderr
, " %lu00",
1812 (ulong
) ((current_size
+ n_bytes
)
1813 / (ib_longlong
)(100 * 1024 * 1024)));
1816 current_size
+= n_bytes
;
1819 if (desired_size
>= (ib_longlong
)(100 * 1024 * 1024)) {
1821 fprintf(stderr
, "\n");
1826 ret
= os_file_flush(file
);
1836 /***************************************************************************
1837 Truncates a file at its current position. */
1842 /* out: TRUE if success */
1843 FILE* file
) /* in: file to be truncated */
1846 HANDLE h
= (HANDLE
) _get_osfhandle(fileno(file
));
1847 return(SetEndOfFile(h
));
1849 return(!ftruncate(fileno(file
), ftell(file
)));
1850 #endif /* __WIN__ */
1854 /***************************************************************************
1855 Wrapper to fsync(2) that retries the call on some errors.
1856 Returns the value 0 if successful; otherwise the value -1 is returned and
1857 the global variable errno is set to indicate the error. */
1863 /* out: 0 if success, -1 otherwise */
1864 os_file_t file
) /* in: handle to a file */
1877 if (ret
== -1 && errno
== ENOLCK
) {
1879 if (failures
% 100 == 0) {
1881 ut_print_timestamp(stderr
);
1883 " InnoDB: fsync(): "
1884 "No locks available; retrying\n");
1887 os_thread_sleep(200000 /* 0.2 sec */);
1900 #endif /* !__WIN__ */
1902 /***************************************************************************
1903 Flushes the write buffers of a given file to the disk. */
1908 /* out: TRUE if success */
1909 os_file_t file
) /* in, own: handle to a file */
1918 ret
= FlushFileBuffers(file
);
1924 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1925 actually a raw device, we choose to ignore that error if we are using
1928 if (srv_start_raw_disk_in_use
&& GetLastError()
1929 == ERROR_INVALID_FUNCTION
) {
1933 os_file_handle_error(NULL
, "flush");
1935 /* It is a fatal error if a file flush does not succeed, because then
1936 the database can get corrupt on disk */
1943 #if defined(HAVE_DARWIN_THREADS)
1944 # ifndef F_FULLFSYNC
1945 /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
1946 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
1947 # elif F_FULLFSYNC != 51
1948 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
1950 /* Apple has disabled fsync() for internal disk drives in OS X. That
1951 caused corruption for a user when he tested a power outage. Let us in
1952 OS X use a nonstandard flush method recommended by an Apple
1955 if (!srv_have_fullfsync
) {
1956 /* If we are not on an operating system that supports this,
1957 then fall back to a plain fsync. */
1959 ret
= os_file_fsync(file
);
1961 ret
= fcntl(file
, F_FULLFSYNC
, NULL
);
1964 /* If we are not on a file system that supports this,
1965 then fall back to a plain fsync. */
1966 ret
= os_file_fsync(file
);
1970 ret
= os_file_fsync(file
);
1977 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
1978 we choose to ignore that error if we are using raw disks */
1980 if (srv_start_raw_disk_in_use
&& errno
== EINVAL
) {
1985 ut_print_timestamp(stderr
);
1988 " InnoDB: Error: the OS said file flush did not succeed\n");
1990 os_file_handle_error(NULL
, "flush");
1992 /* It is a fatal error if a file flush does not succeed, because then
1993 the database can get corrupt on disk */
2001 /***********************************************************************
2002 Does a synchronous read operation in Posix. */
2007 /* out: number of bytes read, -1 if error */
2008 os_file_t file
, /* in: handle to a file */
2009 void* buf
, /* in: buffer where to read */
2010 ulint n
, /* in: number of bytes to read */
2011 ulint offset
, /* in: least significant 32 bits of file
2012 offset from where to read */
2013 ulint offset_high
) /* in: most significant 32 bits of
2019 ut_a((offset
& 0xFFFFFFFFUL
) == offset
);
2021 /* If off_t is > 4 bytes in size, then we assume we can pass a
2024 if (sizeof(off_t
) > 4) {
2025 offs
= (off_t
)offset
+ (((off_t
)offset_high
) << 32);
2028 offs
= (off_t
)offset
;
2030 if (offset_high
> 0) {
2032 "InnoDB: Error: file read at offset > 4 GB\n");
2038 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2039 os_mutex_enter(os_file_count_mutex
);
2040 os_file_n_pending_preads
++;
2041 os_n_pending_reads
++;
2042 os_mutex_exit(os_file_count_mutex
);
2044 n_bytes
= pread(file
, buf
, (ssize_t
)n
, offs
);
2046 os_mutex_enter(os_file_count_mutex
);
2047 os_file_n_pending_preads
--;
2048 os_n_pending_reads
--;
2049 os_mutex_exit(os_file_count_mutex
);
2058 os_mutex_enter(os_file_count_mutex
);
2059 os_n_pending_reads
++;
2060 os_mutex_exit(os_file_count_mutex
);
2062 /* Protect the seek / read operation with a mutex */
2063 i
= ((ulint
) file
) % OS_FILE_N_SEEK_MUTEXES
;
2065 os_mutex_enter(os_file_seek_mutexes
[i
]);
2067 ret_offset
= lseek(file
, offs
, SEEK_SET
);
2069 if (ret_offset
< 0) {
2072 ret
= read(file
, buf
, (ssize_t
)n
);
2075 os_mutex_exit(os_file_seek_mutexes
[i
]);
2077 os_mutex_enter(os_file_count_mutex
);
2078 os_n_pending_reads
--;
2079 os_mutex_exit(os_file_count_mutex
);
2086 /***********************************************************************
2087 Does a synchronous write operation in Posix. */
2092 /* out: number of bytes written, -1 if error */
2093 os_file_t file
, /* in: handle to a file */
2094 const void* buf
, /* in: buffer from where to write */
2095 ulint n
, /* in: number of bytes to write */
2096 ulint offset
, /* in: least significant 32 bits of file
2097 offset where to write */
2098 ulint offset_high
) /* in: most significant 32 bits of
2104 ut_a((offset
& 0xFFFFFFFFUL
) == offset
);
2106 /* If off_t is > 4 bytes in size, then we assume we can pass a
2109 if (sizeof(off_t
) > 4) {
2110 offs
= (off_t
)offset
+ (((off_t
)offset_high
) << 32);
2112 offs
= (off_t
)offset
;
2114 if (offset_high
> 0) {
2116 "InnoDB: Error: file write"
2117 " at offset > 4 GB\n");
2123 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2124 os_mutex_enter(os_file_count_mutex
);
2125 os_file_n_pending_pwrites
++;
2126 os_n_pending_writes
++;
2127 os_mutex_exit(os_file_count_mutex
);
2129 ret
= pwrite(file
, buf
, (ssize_t
)n
, offs
);
2131 os_mutex_enter(os_file_count_mutex
);
2132 os_file_n_pending_pwrites
--;
2133 os_n_pending_writes
--;
2134 os_mutex_exit(os_file_count_mutex
);
2136 # ifdef UNIV_DO_FLUSH
2137 if (srv_unix_file_flush_method
!= SRV_UNIX_LITTLESYNC
2138 && srv_unix_file_flush_method
!= SRV_UNIX_NOSYNC
2139 && !os_do_not_call_flush_at_each_write
) {
2141 /* Always do fsync to reduce the probability that when
2142 the OS crashes, a database page is only partially
2143 physically written to disk. */
2145 ut_a(TRUE
== os_file_flush(file
));
2147 # endif /* UNIV_DO_FLUSH */
2155 os_mutex_enter(os_file_count_mutex
);
2156 os_n_pending_writes
++;
2157 os_mutex_exit(os_file_count_mutex
);
2159 /* Protect the seek / write operation with a mutex */
2160 i
= ((ulint
) file
) % OS_FILE_N_SEEK_MUTEXES
;
2162 os_mutex_enter(os_file_seek_mutexes
[i
]);
2164 ret_offset
= lseek(file
, offs
, SEEK_SET
);
2166 if (ret_offset
< 0) {
2172 ret
= write(file
, buf
, (ssize_t
)n
);
2174 # ifdef UNIV_DO_FLUSH
2175 if (srv_unix_file_flush_method
!= SRV_UNIX_LITTLESYNC
2176 && srv_unix_file_flush_method
!= SRV_UNIX_NOSYNC
2177 && !os_do_not_call_flush_at_each_write
) {
2179 /* Always do fsync to reduce the probability that when
2180 the OS crashes, a database page is only partially
2181 physically written to disk. */
2183 ut_a(TRUE
== os_file_flush(file
));
2185 # endif /* UNIV_DO_FLUSH */
2188 os_mutex_exit(os_file_seek_mutexes
[i
]);
2190 os_mutex_enter(os_file_count_mutex
);
2191 os_n_pending_writes
--;
2192 os_mutex_exit(os_file_count_mutex
);
2200 /***********************************************************************
2201 Requests a synchronous positioned read operation. */
2206 /* out: TRUE if request was
2207 successful, FALSE if fail */
2208 os_file_t file
, /* in: handle to a file */
2209 void* buf
, /* in: buffer where to read */
2210 ulint offset
, /* in: least significant 32 bits of file
2211 offset where to read */
2212 ulint offset_high
, /* in: most significant 32 bits of
2214 ulint n
) /* in: number of bytes to read */
2225 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2226 no more than 32 bits. */
2227 ut_a((offset
& 0xFFFFFFFFUL
) == offset
);
2228 ut_a((n
& 0xFFFFFFFFUL
) == n
);
2231 os_bytes_read_since_printout
+= n
;
2238 low
= (DWORD
) offset
;
2239 high
= (DWORD
) offset_high
;
2241 os_mutex_enter(os_file_count_mutex
);
2242 os_n_pending_reads
++;
2243 os_mutex_exit(os_file_count_mutex
);
2245 /* Protect the seek / read operation with a mutex */
2246 i
= ((ulint
) file
) % OS_FILE_N_SEEK_MUTEXES
;
2248 os_mutex_enter(os_file_seek_mutexes
[i
]);
2250 ret2
= SetFilePointer(file
, low
, &high
, FILE_BEGIN
);
2252 if (ret2
== 0xFFFFFFFF && GetLastError() != NO_ERROR
) {
2254 os_mutex_exit(os_file_seek_mutexes
[i
]);
2256 os_mutex_enter(os_file_count_mutex
);
2257 os_n_pending_reads
--;
2258 os_mutex_exit(os_file_count_mutex
);
2260 goto error_handling
;
2263 ret
= ReadFile(file
, buf
, (DWORD
) n
, &len
, NULL
);
2265 os_mutex_exit(os_file_seek_mutexes
[i
]);
2267 os_mutex_enter(os_file_count_mutex
);
2268 os_n_pending_reads
--;
2269 os_mutex_exit(os_file_count_mutex
);
2271 if (ret
&& len
== n
) {
2278 os_bytes_read_since_printout
+= n
;
2281 ret
= os_file_pread(file
, buf
, n
, offset
, offset_high
);
2283 if ((ulint
)ret
== n
) {
2289 "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2290 "InnoDB: Was only able to read %ld.\n",
2291 (ulong
)n
, (ulong
)offset_high
,
2292 (ulong
)offset
, (long)ret
);
2297 retry
= os_file_handle_error(NULL
, "read");
2304 "InnoDB: Fatal error: cannot read from file."
2305 " OS error number %lu.\n",
2307 (ulong
) GetLastError()
2319 /***********************************************************************
2320 Requests a synchronous positioned read operation. This function does not do
2321 any error handling. In case of error it returns FALSE. */
2324 os_file_read_no_error_handling(
2325 /*===========================*/
2326 /* out: TRUE if request was
2327 successful, FALSE if fail */
2328 os_file_t file
, /* in: handle to a file */
2329 void* buf
, /* in: buffer where to read */
2330 ulint offset
, /* in: least significant 32 bits of file
2331 offset where to read */
2332 ulint offset_high
, /* in: most significant 32 bits of
2334 ulint n
) /* in: number of bytes to read */
2345 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2346 no more than 32 bits. */
2347 ut_a((offset
& 0xFFFFFFFFUL
) == offset
);
2348 ut_a((n
& 0xFFFFFFFFUL
) == n
);
2351 os_bytes_read_since_printout
+= n
;
2358 low
= (DWORD
) offset
;
2359 high
= (DWORD
) offset_high
;
2361 os_mutex_enter(os_file_count_mutex
);
2362 os_n_pending_reads
++;
2363 os_mutex_exit(os_file_count_mutex
);
2365 /* Protect the seek / read operation with a mutex */
2366 i
= ((ulint
) file
) % OS_FILE_N_SEEK_MUTEXES
;
2368 os_mutex_enter(os_file_seek_mutexes
[i
]);
2370 ret2
= SetFilePointer(file
, low
, &high
, FILE_BEGIN
);
2372 if (ret2
== 0xFFFFFFFF && GetLastError() != NO_ERROR
) {
2374 os_mutex_exit(os_file_seek_mutexes
[i
]);
2376 os_mutex_enter(os_file_count_mutex
);
2377 os_n_pending_reads
--;
2378 os_mutex_exit(os_file_count_mutex
);
2380 goto error_handling
;
2383 ret
= ReadFile(file
, buf
, (DWORD
) n
, &len
, NULL
);
2385 os_mutex_exit(os_file_seek_mutexes
[i
]);
2387 os_mutex_enter(os_file_count_mutex
);
2388 os_n_pending_reads
--;
2389 os_mutex_exit(os_file_count_mutex
);
2391 if (ret
&& len
== n
) {
2398 os_bytes_read_since_printout
+= n
;
2401 ret
= os_file_pread(file
, buf
, n
, offset
, offset_high
);
2403 if ((ulint
)ret
== n
) {
2411 retry
= os_file_handle_error_no_exit(NULL
, "read");
2420 /***********************************************************************
2421 Rewind file to its start, read at most size - 1 bytes from it to str, and
2422 NUL-terminate str. All errors are silently ignored. This function is
2423 mostly meant to be used with temporary files. */
2426 os_file_read_string(
2427 /*================*/
2428 FILE* file
, /* in: file to read from */
2429 char* str
, /* in: buffer where to read */
2430 ulint size
) /* in: size of buffer */
2439 flen
= fread(str
, 1, size
- 1, file
);
2443 /***********************************************************************
2444 Requests a synchronous write operation. */
2449 /* out: TRUE if request was
2450 successful, FALSE if fail */
2451 const char* name
, /* in: name of the file or path as a
2452 null-terminated string */
2453 os_file_t file
, /* in: handle to a file */
2454 const void* buf
, /* in: buffer from which to write */
2455 ulint offset
, /* in: least significant 32 bits of file
2456 offset where to write */
2457 ulint offset_high
, /* in: most significant 32 bits of
2459 ulint n
) /* in: number of bytes to write */
2468 ulint n_retries
= 0;
2471 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2472 no more than 32 bits. */
2473 ut_a((offset
& 0xFFFFFFFFUL
) == offset
);
2474 ut_a((n
& 0xFFFFFFFFUL
) == n
);
2482 low
= (DWORD
) offset
;
2483 high
= (DWORD
) offset_high
;
2485 os_mutex_enter(os_file_count_mutex
);
2486 os_n_pending_writes
++;
2487 os_mutex_exit(os_file_count_mutex
);
2489 /* Protect the seek / write operation with a mutex */
2490 i
= ((ulint
) file
) % OS_FILE_N_SEEK_MUTEXES
;
2492 os_mutex_enter(os_file_seek_mutexes
[i
]);
2494 ret2
= SetFilePointer(file
, low
, &high
, FILE_BEGIN
);
2496 if (ret2
== 0xFFFFFFFF && GetLastError() != NO_ERROR
) {
2498 os_mutex_exit(os_file_seek_mutexes
[i
]);
2500 os_mutex_enter(os_file_count_mutex
);
2501 os_n_pending_writes
--;
2502 os_mutex_exit(os_file_count_mutex
);
2504 ut_print_timestamp(stderr
);
2507 " InnoDB: Error: File pointer positioning to"
2508 " file %s failed at\n"
2509 "InnoDB: offset %lu %lu. Operating system"
2510 " error number %lu.\n"
2511 "InnoDB: Some operating system error numbers"
2512 " are described at\n"
2514 "http://dev.mysql.com/doc/refman/5.1/en/"
2515 "operating-system-error-codes.html\n",
2516 name
, (ulong
) offset_high
, (ulong
) offset
,
2517 (ulong
) GetLastError());
2522 ret
= WriteFile(file
, buf
, (DWORD
) n
, &len
, NULL
);
2524 /* Always do fsync to reduce the probability that when the OS crashes,
2525 a database page is only partially physically written to disk. */
2527 # ifdef UNIV_DO_FLUSH
2528 if (!os_do_not_call_flush_at_each_write
) {
2529 ut_a(TRUE
== os_file_flush(file
));
2531 # endif /* UNIV_DO_FLUSH */
2533 os_mutex_exit(os_file_seek_mutexes
[i
]);
2535 os_mutex_enter(os_file_count_mutex
);
2536 os_n_pending_writes
--;
2537 os_mutex_exit(os_file_count_mutex
);
2539 if (ret
&& len
== n
) {
2544 /* If some background file system backup tool is running, then, at
2545 least in Windows 2000, we may get here a specific error. Let us
2546 retry the operation 100 times, with 1 second waits. */
2548 if (GetLastError() == ERROR_LOCK_VIOLATION
&& n_retries
< 100) {
2550 os_thread_sleep(1000000);
2557 if (!os_has_said_disk_full
) {
2559 err
= (ulint
)GetLastError();
2561 ut_print_timestamp(stderr
);
2564 " InnoDB: Error: Write to file %s failed"
2565 " at offset %lu %lu.\n"
2566 "InnoDB: %lu bytes should have been written,"
2567 " only %lu were written.\n"
2568 "InnoDB: Operating system error number %lu.\n"
2569 "InnoDB: Check that your OS and file system"
2570 " support files of this size.\n"
2571 "InnoDB: Check also that the disk is not full"
2572 " or a disk quota exceeded.\n",
2573 name
, (ulong
) offset_high
, (ulong
) offset
,
2574 (ulong
) n
, (ulong
) len
, (ulong
) err
);
2576 if (strerror((int)err
) != NULL
) {
2578 "InnoDB: Error number %lu means '%s'.\n",
2579 (ulong
) err
, strerror((int)err
));
2583 "InnoDB: Some operating system error numbers"
2584 " are described at\n"
2586 "http://dev.mysql.com/doc/refman/5.1/en/"
2587 "operating-system-error-codes.html\n");
2589 os_has_said_disk_full
= TRUE
;
2596 ret
= os_file_pwrite(file
, buf
, n
, offset
, offset_high
);
2598 if ((ulint
)ret
== n
) {
2603 if (!os_has_said_disk_full
) {
2605 ut_print_timestamp(stderr
);
2608 " InnoDB: Error: Write to file %s failed"
2609 " at offset %lu %lu.\n"
2610 "InnoDB: %lu bytes should have been written,"
2611 " only %ld were written.\n"
2612 "InnoDB: Operating system error number %lu.\n"
2613 "InnoDB: Check that your OS and file system"
2614 " support files of this size.\n"
2615 "InnoDB: Check also that the disk is not full"
2616 " or a disk quota exceeded.\n",
2617 name
, offset_high
, offset
, n
, (long int)ret
,
2619 if (strerror(errno
) != NULL
) {
2621 "InnoDB: Error number %lu means '%s'.\n",
2622 (ulint
)errno
, strerror(errno
));
2626 "InnoDB: Some operating system error numbers"
2627 " are described at\n"
2629 "http://dev.mysql.com/doc/refman/5.1/en/"
2630 "operating-system-error-codes.html\n");
2632 os_has_said_disk_full
= TRUE
;
2639 /***********************************************************************
2640 Check the existence and type of the given file. */
2645 /* out: TRUE if call succeeded */
2646 const char* path
, /* in: pathname of the file */
2647 ibool
* exists
, /* out: TRUE if file exists */
2648 os_file_type_t
* type
) /* out: type of the file (if it exists) */
2652 struct _stat statinfo
;
2654 ret
= _stat(path
, &statinfo
);
2655 if (ret
&& (errno
== ENOENT
|| errno
== ENOTDIR
)) {
2656 /* file does not exist */
2660 /* file exists, but stat call failed */
2662 os_file_handle_error_no_exit(path
, "stat");
2667 if (_S_IFDIR
& statinfo
.st_mode
) {
2668 *type
= OS_FILE_TYPE_DIR
;
2669 } else if (_S_IFREG
& statinfo
.st_mode
) {
2670 *type
= OS_FILE_TYPE_FILE
;
2672 *type
= OS_FILE_TYPE_UNKNOWN
;
2680 struct stat statinfo
;
2682 ret
= stat(path
, &statinfo
);
2683 if (ret
&& (errno
== ENOENT
|| errno
== ENOTDIR
)) {
2684 /* file does not exist */
2688 /* file exists, but stat call failed */
2690 os_file_handle_error_no_exit(path
, "stat");
2695 if (S_ISDIR(statinfo
.st_mode
)) {
2696 *type
= OS_FILE_TYPE_DIR
;
2697 } else if (S_ISLNK(statinfo
.st_mode
)) {
2698 *type
= OS_FILE_TYPE_LINK
;
2699 } else if (S_ISREG(statinfo
.st_mode
)) {
2700 *type
= OS_FILE_TYPE_FILE
;
2702 *type
= OS_FILE_TYPE_UNKNOWN
;
2711 /***********************************************************************
2712 This function returns information about the specified file */
2717 /* out: TRUE if stat
2718 information found */
2719 const char* path
, /* in: pathname of the file */
2720 os_file_stat_t
* stat_info
) /* information of a file in a
2725 struct _stat statinfo
;
2727 ret
= _stat(path
, &statinfo
);
2728 if (ret
&& (errno
== ENOENT
|| errno
== ENOTDIR
)) {
2729 /* file does not exist */
2733 /* file exists, but stat call failed */
2735 os_file_handle_error_no_exit(path
, "stat");
2739 if (_S_IFDIR
& statinfo
.st_mode
) {
2740 stat_info
->type
= OS_FILE_TYPE_DIR
;
2741 } else if (_S_IFREG
& statinfo
.st_mode
) {
2742 stat_info
->type
= OS_FILE_TYPE_FILE
;
2744 stat_info
->type
= OS_FILE_TYPE_UNKNOWN
;
2747 stat_info
->ctime
= statinfo
.st_ctime
;
2748 stat_info
->atime
= statinfo
.st_atime
;
2749 stat_info
->mtime
= statinfo
.st_mtime
;
2750 stat_info
->size
= statinfo
.st_size
;
2755 struct stat statinfo
;
2757 ret
= stat(path
, &statinfo
);
2759 if (ret
&& (errno
== ENOENT
|| errno
== ENOTDIR
)) {
2760 /* file does not exist */
2764 /* file exists, but stat call failed */
2766 os_file_handle_error_no_exit(path
, "stat");
2771 if (S_ISDIR(statinfo
.st_mode
)) {
2772 stat_info
->type
= OS_FILE_TYPE_DIR
;
2773 } else if (S_ISLNK(statinfo
.st_mode
)) {
2774 stat_info
->type
= OS_FILE_TYPE_LINK
;
2775 } else if (S_ISREG(statinfo
.st_mode
)) {
2776 stat_info
->type
= OS_FILE_TYPE_FILE
;
2778 stat_info
->type
= OS_FILE_TYPE_UNKNOWN
;
2781 stat_info
->ctime
= statinfo
.st_ctime
;
2782 stat_info
->atime
= statinfo
.st_atime
;
2783 stat_info
->mtime
= statinfo
.st_mtime
;
2784 stat_info
->size
= statinfo
.st_size
;
2790 /* path name separator character */
2792 # define OS_FILE_PATH_SEPARATOR '\\'
2794 # define OS_FILE_PATH_SEPARATOR '/'
2797 /********************************************************************
2798 The function os_file_dirname returns a directory component of a
2799 null-terminated pathname string. In the usual case, dirname returns
2800 the string up to, but not including, the final '/', and basename
2801 is the component following the final '/'. Trailing '/' characters
2802 are not counted as part of the pathname.
2804 If path does not contain a slash, dirname returns the string ".".
2806 Concatenating the string returned by dirname, a "/", and the basename
2807 yields a complete pathname.
2809 The return value is a copy of the directory component of the pathname.
2810 The copy is allocated from heap. It is the caller's responsibility
2811 to free it after it is no longer needed.
2813 The following list of examples (taken from SUSv2) shows the strings
2814 returned by dirname and basename for different paths:
2816 path dirname basename
2817 "/usr/lib" "/usr" "lib"
2828 /* out, own: directory component of the
2830 const char* path
) /* in: pathname */
2832 /* Find the offset of the last slash */
2833 const char* last_slash
= strrchr(path
, OS_FILE_PATH_SEPARATOR
);
2835 /* No slash in the path, return "." */
2837 return(mem_strdup("."));
2840 /* Ok, there is a slash */
2842 if (last_slash
== path
) {
2843 /* last slash is the first char of the path */
2845 return(mem_strdup("/"));
2848 /* Non-trivial directory component */
2850 return(mem_strdupl(path
, last_slash
- path
));
2853 /********************************************************************
2854 Creates all missing subdirectories along the given path. */
2857 os_file_create_subdirs_if_needed(
2858 /*=============================*/
2859 /* out: TRUE if call succeeded
2861 const char* path
) /* in: path name */
2864 ibool success
, subdir_exists
;
2865 os_file_type_t type
;
2867 subdir
= os_file_dirname(path
);
2868 if (strlen(subdir
) == 1
2869 && (*subdir
== OS_FILE_PATH_SEPARATOR
|| *subdir
== '.')) {
2870 /* subdir is root or cwd, nothing to do */
2876 /* Test if subdir exists */
2877 success
= os_file_status(subdir
, &subdir_exists
, &type
);
2878 if (success
&& !subdir_exists
) {
2879 /* subdir does not exist, create it */
2880 success
= os_file_create_subdirs_if_needed(subdir
);
2886 success
= os_file_create_directory(subdir
, FALSE
);
2894 /********************************************************************
2895 Returns a pointer to the nth slot in the aio array. */
2898 os_aio_array_get_nth_slot(
2899 /*======================*/
2900 /* out: pointer to slot */
2901 os_aio_array_t
* array
, /* in: aio array */
2902 ulint index
) /* in: index of the slot */
2904 ut_a(index
< array
->n_slots
);
2906 return((array
->slots
) + index
);
2909 /****************************************************************************
2910 Creates an aio wait array. */
2913 os_aio_array_create(
2914 /*================*/
2915 /* out, own: aio array */
2916 ulint n
, /* in: maximum number of pending aio operations
2917 allowed; n must be divisible by n_segments */
2918 ulint n_segments
) /* in: number of segments in the aio array */
2920 os_aio_array_t
* array
;
2922 os_aio_slot_t
* slot
;
2927 ut_a(n_segments
> 0);
2929 array
= ut_malloc(sizeof(os_aio_array_t
));
2931 array
->mutex
= os_mutex_create(NULL
);
2932 array
->not_full
= os_event_create(NULL
);
2933 array
->is_empty
= os_event_create(NULL
);
2935 os_event_set(array
->is_empty
);
2938 array
->n_segments
= n_segments
;
2939 array
->n_reserved
= 0;
2940 array
->slots
= ut_malloc(n
* sizeof(os_aio_slot_t
));
2942 array
->native_events
= ut_malloc(n
* sizeof(os_native_event_t
));
2944 for (i
= 0; i
< n
; i
++) {
2945 slot
= os_aio_array_get_nth_slot(array
, i
);
2948 slot
->reserved
= FALSE
;
2950 slot
->event
= os_event_create(NULL
);
2952 over
= &(slot
->control
);
2954 over
->hEvent
= slot
->event
->handle
;
2956 *((array
->native_events
) + i
) = over
->hEvent
;
2963 /****************************************************************************
2964 Initializes the asynchronous io system. Calls also os_io_init_simple.
2965 Creates a separate aio array for
2966 non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2967 segment, two aio arrays for log reads and writes with one segment, and a
2968 synchronous aio array of the specified size. The combined number of segments
2969 in the four first aio arrays is the parameter n_segments given to the
2970 function. The caller must create an i/o handler thread for each segment in
2971 the four first arrays, but not for the sync aio array. */
2976 ulint n
, /* in: maximum number of pending aio operations
2977 allowed; n must be divisible by n_segments */
2978 ulint n_segments
, /* in: combined number of segments in the four
2979 first aio arrays; must be >= 4 */
2980 ulint n_slots_sync
) /* in: number of slots in the sync aio array */
2986 #ifdef POSIX_ASYNC_IO
2989 ut_ad(n
% n_segments
== 0);
2990 ut_ad(n_segments
>= 4);
2992 os_io_init_simple();
2994 for (i
= 0; i
< n_segments
; i
++) {
2995 srv_set_io_thread_op_info(i
, "not started yet");
2998 n_per_seg
= n
/ n_segments
;
2999 n_write_segs
= (n_segments
- 2) / 2;
3000 n_read_segs
= n_segments
- 2 - n_write_segs
;
3002 /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3004 os_aio_ibuf_array
= os_aio_array_create(n_per_seg
, 1);
3006 srv_io_thread_function
[0] = "insert buffer thread";
3008 os_aio_log_array
= os_aio_array_create(n_per_seg
, 1);
3010 srv_io_thread_function
[1] = "log thread";
3012 os_aio_read_array
= os_aio_array_create(n_read_segs
* n_per_seg
,
3014 for (i
= 2; i
< 2 + n_read_segs
; i
++) {
3015 ut_a(i
< SRV_MAX_N_IO_THREADS
);
3016 srv_io_thread_function
[i
] = "read thread";
3019 os_aio_write_array
= os_aio_array_create(n_write_segs
* n_per_seg
,
3021 for (i
= 2 + n_read_segs
; i
< n_segments
; i
++) {
3022 ut_a(i
< SRV_MAX_N_IO_THREADS
);
3023 srv_io_thread_function
[i
] = "write thread";
3026 os_aio_sync_array
= os_aio_array_create(n_slots_sync
, 1);
3028 os_aio_n_segments
= n_segments
;
3032 os_aio_segment_wait_events
= ut_malloc(n_segments
* sizeof(void*));
3034 for (i
= 0; i
< n_segments
; i
++) {
3035 os_aio_segment_wait_events
[i
] = os_event_create(NULL
);
3038 os_last_printout
= time(NULL
);
3040 #ifdef POSIX_ASYNC_IO
3041 /* Block aio signals from the current thread and its children:
3042 for this to work, the current thread must be the first created
3043 in the database, so that all its children will inherit its
3046 /* TODO: to work, MySQL needs the SIGALRM signal; the following
3047 will not work yet! */
3048 sigemptyset(&sigset
);
3049 sigaddset(&sigset
, SIGRTMIN
+ 1 + 0);
3050 sigaddset(&sigset
, SIGRTMIN
+ 1 + 1);
3051 sigaddset(&sigset
, SIGRTMIN
+ 1 + 2);
3052 sigaddset(&sigset
, SIGRTMIN
+ 1 + 3);
3054 pthread_sigmask(SIG_BLOCK
, &sigset
, NULL
); */
3059 /****************************************************************************
3060 Wakes up all async i/o threads in the array in Windows async i/o at
3064 os_aio_array_wake_win_aio_at_shutdown(
3065 /*==================================*/
3066 os_aio_array_t
* array
) /* in: aio array */
3070 for (i
= 0; i
< array
->n_slots
; i
++) {
3072 os_event_set((array
->slots
+ i
)->event
);
3077 /****************************************************************************
3078 Wakes up all async i/o threads so that they know to exit themselves in
3082 os_aio_wake_all_threads_at_shutdown(void)
3083 /*=====================================*/
3088 /* This code wakes up all async i/o threads in Windows native aio */
3089 os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array
);
3090 os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array
);
3091 os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array
);
3092 os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array
);
3094 /* This loop wakes up all simulated aio i/o-handler threads */
3096 for (i
= 0; i
< os_aio_n_segments
; i
++) {
3098 os_event_set(os_aio_segment_wait_events
[i
]);
3102 /****************************************************************************
3103 Waits until there are no pending writes in os_aio_write_array. There can
3104 be other, synchronous, pending writes. */
3107 os_aio_wait_until_no_pending_writes(void)
3108 /*=====================================*/
3110 os_event_wait(os_aio_write_array
->is_empty
);
3113 /**************************************************************************
3114 Calculates segment number for a slot. */
3117 os_aio_get_segment_no_from_slot(
3118 /*============================*/
3119 /* out: segment number (which is the number
3120 used by, for example, i/o-handler threads) */
3121 os_aio_array_t
* array
, /* in: aio wait array */
3122 os_aio_slot_t
* slot
) /* in: slot in this array */
3127 if (array
== os_aio_ibuf_array
) {
3130 } else if (array
== os_aio_log_array
) {
3133 } else if (array
== os_aio_read_array
) {
3134 seg_len
= os_aio_read_array
->n_slots
3135 / os_aio_read_array
->n_segments
;
3137 segment
= 2 + slot
->pos
/ seg_len
;
3139 ut_a(array
== os_aio_write_array
);
3140 seg_len
= os_aio_write_array
->n_slots
3141 / os_aio_write_array
->n_segments
;
3143 segment
= os_aio_read_array
->n_segments
+ 2
3144 + slot
->pos
/ seg_len
;
3150 /**************************************************************************
3151 Calculates local segment number and aio array from global segment number. */
3154 os_aio_get_array_and_local_segment(
3155 /*===============================*/
3156 /* out: local segment number within
3158 os_aio_array_t
** array
, /* out: aio wait array */
3159 ulint global_segment
)/* in: global segment number */
3163 ut_a(global_segment
< os_aio_n_segments
);
3165 if (global_segment
== 0) {
3166 *array
= os_aio_ibuf_array
;
3169 } else if (global_segment
== 1) {
3170 *array
= os_aio_log_array
;
3173 } else if (global_segment
< os_aio_read_array
->n_segments
+ 2) {
3174 *array
= os_aio_read_array
;
3176 segment
= global_segment
- 2;
3178 *array
= os_aio_write_array
;
3180 segment
= global_segment
- (os_aio_read_array
->n_segments
+ 2);
3186 /***********************************************************************
3187 Gets an integer value designating a specified aio array. This is used
3188 to give numbers to signals in Posix aio. */
3190 #if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
3193 os_aio_get_array_no(
3194 /*================*/
3195 os_aio_array_t
* array
) /* in: aio array */
3197 if (array
== os_aio_ibuf_array
) {
3201 } else if (array
== os_aio_log_array
) {
3205 } else if (array
== os_aio_read_array
) {
3208 } else if (array
== os_aio_write_array
) {
3218 /***********************************************************************
3219 Gets the aio array for its number. */
3222 os_aio_get_array_from_no(
3223 /*=====================*/
3224 /* out: aio array */
3225 ulint n
) /* in: array number */
3228 return(os_aio_ibuf_array
);
3229 } else if (n
== 1) {
3231 return(os_aio_log_array
);
3232 } else if (n
== 2) {
3234 return(os_aio_read_array
);
3235 } else if (n
== 3) {
3237 return(os_aio_write_array
);
3244 #endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
3246 /***********************************************************************
3247 Requests a slot in the aio array. If no slot is available, waits until
3248 the not_full event becomes signaled. */
3251 os_aio_array_reserve_slot(
3252 /*======================*/
3253 /* out: pointer to slot */
3254 ulint type
, /* in: OS_FILE_READ or OS_FILE_WRITE */
3255 os_aio_array_t
* array
, /* in: aio array */
3256 fil_node_t
* message1
,/* in: message to be passed along with
3257 the aio operation */
3258 void* message2
,/* in: message to be passed along with
3259 the aio operation */
3260 os_file_t file
, /* in: file handle */
3261 const char* name
, /* in: name of the file or path as a
3262 null-terminated string */
3263 void* buf
, /* in: buffer where to read or from which
3265 ulint offset
, /* in: least significant 32 bits of file
3267 ulint offset_high
, /* in: most significant 32 bits of
3269 ulint len
) /* in: length of the block to read or write */
3271 os_aio_slot_t
* slot
;
3274 OVERLAPPED
* control
;
3276 ut_a((len
& 0xFFFFFFFFUL
) == len
);
3277 #elif defined(POSIX_ASYNC_IO)
3279 struct aiocb
* control
;
3283 os_mutex_enter(array
->mutex
);
3285 if (array
->n_reserved
== array
->n_slots
) {
3286 os_mutex_exit(array
->mutex
);
3288 if (!os_aio_use_native_aio
) {
3289 /* If the handler threads are suspended, wake them
3290 so that we get more slots */
3292 os_aio_simulated_wake_handler_threads();
3295 os_event_wait(array
->not_full
);
3301 slot
= os_aio_array_get_nth_slot(array
, i
);
3303 if (slot
->reserved
== FALSE
) {
3308 array
->n_reserved
++;
3310 if (array
->n_reserved
== 1) {
3311 os_event_reset(array
->is_empty
);
3314 if (array
->n_reserved
== array
->n_slots
) {
3315 os_event_reset(array
->not_full
);
3318 slot
->reserved
= TRUE
;
3319 slot
->reservation_time
= time(NULL
);
3320 slot
->message1
= message1
;
3321 slot
->message2
= message2
;
3327 slot
->offset
= offset
;
3328 slot
->offset_high
= offset_high
;
3329 slot
->io_already_done
= FALSE
;
3332 control
= &(slot
->control
);
3333 control
->Offset
= (DWORD
)offset
;
3334 control
->OffsetHigh
= (DWORD
)offset_high
;
3335 os_event_reset(slot
->event
);
3337 #elif defined(POSIX_ASYNC_IO)
3339 #if (UNIV_WORD_SIZE == 8)
3340 offset
= offset
+ (offset_high
<< 32);
3342 ut_a(offset_high
== 0);
3344 control
= &(slot
->control
);
3345 control
->aio_fildes
= file
;
3346 control
->aio_buf
= buf
;
3347 control
->aio_nbytes
= len
;
3348 control
->aio_offset
= offset
;
3349 control
->aio_reqprio
= 0;
3350 control
->aio_sigevent
.sigev_notify
= SIGEV_SIGNAL
;
3351 control
->aio_sigevent
.sigev_signo
3352 = SIGRTMIN
+ 1 + os_aio_get_array_no(array
);
3353 /* TODO: How to choose the signal numbers? */
3355 fprintf(stderr, "AIO signal number %lu\n",
3356 (ulint) control->aio_sigevent.sigev_signo);
3358 control
->aio_sigevent
.sigev_value
.sival_ptr
= slot
;
3360 os_mutex_exit(array
->mutex
);
3365 /***********************************************************************
3366 Frees a slot in the aio array. */
3369 os_aio_array_free_slot(
3370 /*===================*/
3371 os_aio_array_t
* array
, /* in: aio array */
3372 os_aio_slot_t
* slot
) /* in: pointer to slot */
3377 os_mutex_enter(array
->mutex
);
3379 ut_ad(slot
->reserved
);
3381 slot
->reserved
= FALSE
;
3383 array
->n_reserved
--;
3385 if (array
->n_reserved
== array
->n_slots
- 1) {
3386 os_event_set(array
->not_full
);
3389 if (array
->n_reserved
== 0) {
3390 os_event_set(array
->is_empty
);
3394 os_event_reset(slot
->event
);
3396 os_mutex_exit(array
->mutex
);
3399 /**************************************************************************
3400 Wakes up a simulated aio i/o-handler thread if it has something to do. */
3403 os_aio_simulated_wake_handler_thread(
3404 /*=================================*/
3405 ulint global_segment
) /* in: the number of the segment in the aio
3408 os_aio_array_t
* array
;
3409 os_aio_slot_t
* slot
;
3414 ut_ad(!os_aio_use_native_aio
);
3416 segment
= os_aio_get_array_and_local_segment(&array
, global_segment
);
3418 n
= array
->n_slots
/ array
->n_segments
;
3420 /* Look through n slots after the segment * n'th slot */
3422 os_mutex_enter(array
->mutex
);
3424 for (i
= 0; i
< n
; i
++) {
3425 slot
= os_aio_array_get_nth_slot(array
, i
+ segment
* n
);
3427 if (slot
->reserved
) {
3428 /* Found an i/o request */
3434 os_mutex_exit(array
->mutex
);
3437 os_event_set(os_aio_segment_wait_events
[global_segment
]);
3441 /**************************************************************************
3442 Wakes up simulated aio i/o-handler threads if they have something to do. */
3445 os_aio_simulated_wake_handler_threads(void)
3446 /*=======================================*/
3450 if (os_aio_use_native_aio
) {
3451 /* We do not use simulated aio: do nothing */
3456 os_aio_recommend_sleep_for_read_threads
= FALSE
;
3458 for (i
= 0; i
< os_aio_n_segments
; i
++) {
3459 os_aio_simulated_wake_handler_thread(i
);
3463 /**************************************************************************
3464 This function can be called if one wants to post a batch of reads and
3465 prefers an i/o-handler thread to handle them all at once later. You must
3466 call os_aio_simulated_wake_handler_threads later to ensure the threads
3467 are not left sleeping! */
3470 os_aio_simulated_put_read_threads_to_sleep(void)
3471 /*============================================*/
3473 os_aio_array_t
* array
;
3476 os_aio_recommend_sleep_for_read_threads
= TRUE
;
3478 for (g
= 0; g
< os_aio_n_segments
; g
++) {
3479 os_aio_get_array_and_local_segment(&array
, g
);
3481 if (array
== os_aio_read_array
) {
3483 os_event_reset(os_aio_segment_wait_events
[g
]);
3488 /***********************************************************************
3489 Requests an asynchronous i/o operation. */
3494 /* out: TRUE if request was queued
3495 successfully, FALSE on failure */
3496 ulint type
, /* in: OS_FILE_READ or OS_FILE_WRITE */
3497 ulint mode
, /* in: OS_AIO_NORMAL, ..., possibly ORed
3498 to OS_AIO_SIMULATED_WAKE_LATER: the
3499 last flag advises this function not to wake
3500 i/o-handler threads, but the caller will
3501 do the waking explicitly later, in this
3502 way the caller can post several requests in
3503 a batch; NOTE that the batch must not be
3504 so big that it exhausts the slots in aio
3505 arrays! NOTE that a simulated batch
3506 may introduce hidden chances of deadlocks,
3507 because i/os are not actually handled until
3508 all have been posted: use with great
3510 const char* name
, /* in: name of the file or path as a
3511 null-terminated string */
3512 os_file_t file
, /* in: handle to a file */
3513 void* buf
, /* in: buffer where to read or from which
3515 ulint offset
, /* in: least significant 32 bits of file
3516 offset where to read or write */
3517 ulint offset_high
, /* in: most significant 32 bits of
3519 ulint n
, /* in: number of bytes to read or write */
3520 fil_node_t
* message1
,/* in: messages for the aio handler (these
3521 can be used to identify a completed aio
3522 operation); if mode is OS_AIO_SYNC, these
3526 os_aio_array_t
* array
;
3527 os_aio_slot_t
* slot
;
3531 DWORD len
= (DWORD
) n
;
3532 struct fil_node_struct
* dummy_mess1
;
3543 ut_ad(n
% OS_FILE_LOG_BLOCK_SIZE
== 0);
3544 ut_ad(offset
% OS_FILE_LOG_BLOCK_SIZE
== 0);
3545 ut_ad(os_aio_validate());
3547 ut_ad((n
& 0xFFFFFFFFUL
) == n
);
3550 wake_later
= mode
& OS_AIO_SIMULATED_WAKE_LATER
;
3551 mode
= mode
& (~OS_AIO_SIMULATED_WAKE_LATER
);
3553 if (mode
== OS_AIO_SYNC
3555 && !os_aio_use_native_aio
3558 /* This is actually an ordinary synchronous read or write:
3559 no need to use an i/o-handler thread. NOTE that if we use
3560 Windows async i/o, Windows does not allow us to use
3561 ordinary synchronous os_file_read etc. on the same file,
3562 therefore we have built a special mechanism for synchronous
3563 wait in the Windows case. */
3565 if (type
== OS_FILE_READ
) {
3566 return(os_file_read(file
, buf
, offset
,
3570 ut_a(type
== OS_FILE_WRITE
);
3572 return(os_file_write(name
, file
, buf
, offset
, offset_high
, n
));
3576 if (mode
== OS_AIO_NORMAL
) {
3577 if (type
== OS_FILE_READ
) {
3578 array
= os_aio_read_array
;
3580 array
= os_aio_write_array
;
3582 } else if (mode
== OS_AIO_IBUF
) {
3583 ut_ad(type
== OS_FILE_READ
);
3584 /* Reduce probability of deadlock bugs in connection with ibuf:
3585 do not let the ibuf i/o handler sleep */
3589 array
= os_aio_ibuf_array
;
3590 } else if (mode
== OS_AIO_LOG
) {
3592 array
= os_aio_log_array
;
3593 } else if (mode
== OS_AIO_SYNC
) {
3594 array
= os_aio_sync_array
;
3596 array
= NULL
; /* Eliminate compiler warning */
3600 slot
= os_aio_array_reserve_slot(type
, array
, message1
, message2
, file
,
3601 name
, buf
, offset
, offset_high
, n
);
3602 if (type
== OS_FILE_READ
) {
3603 if (os_aio_use_native_aio
) {
3606 os_bytes_read_since_printout
+= len
;
3608 ret
= ReadFile(file
, buf
, (DWORD
)n
, &len
,
3610 #elif defined(POSIX_ASYNC_IO)
3611 slot
->control
.aio_lio_opcode
= LIO_READ
;
3612 err
= (ulint
) aio_read(&(slot
->control
));
3613 fprintf(stderr
, "Starting POSIX aio read %lu\n", err
);
3617 os_aio_simulated_wake_handler_thread(
3618 os_aio_get_segment_no_from_slot(
3622 } else if (type
== OS_FILE_WRITE
) {
3623 if (os_aio_use_native_aio
) {
3626 ret
= WriteFile(file
, buf
, (DWORD
)n
, &len
,
3628 #elif defined(POSIX_ASYNC_IO)
3629 slot
->control
.aio_lio_opcode
= LIO_WRITE
;
3630 err
= (ulint
) aio_write(&(slot
->control
));
3631 fprintf(stderr
, "Starting POSIX aio write %lu\n", err
);
3635 os_aio_simulated_wake_handler_thread(
3636 os_aio_get_segment_no_from_slot(
3645 if (os_aio_use_native_aio
) {
3646 if ((ret
&& len
== n
)
3647 || (!ret
&& GetLastError() == ERROR_IO_PENDING
)) {
3648 /* aio was queued successfully! */
3650 if (mode
== OS_AIO_SYNC
) {
3651 /* We want a synchronous i/o operation on a
3652 file where we also use async i/o: in Windows
3653 we must use the same wait mechanism as for
3656 retval
= os_aio_windows_handle(ULINT_UNDEFINED
,
3668 err
= 1; /* Fall through the next if */
3672 /* aio was queued successfully! */
3677 os_aio_array_free_slot(array
, slot
);
3679 retry
= os_file_handle_error(name
,
3680 type
== OS_FILE_READ
3681 ? "aio read" : "aio write");
3691 /**************************************************************************
3692 This function is only used in Windows asynchronous i/o.
3693 Waits for an aio operation to complete. This function is used to wait
3694 for completed requests. The aio array of pending requests is divided
3695 into segments. The thread specifies which segment or slot it wants to wait
3696 for. NOTE: this function will also take care of freeing the aio slot,
3697 therefore no other thread is allowed to do the freeing! */
3700 os_aio_windows_handle(
3701 /*==================*/
3702 /* out: TRUE if the aio operation succeeded */
3703 ulint segment
, /* in: the number of the segment in the aio
3704 arrays to wait for; segment 0 is the ibuf
3705 i/o thread, segment 1 the log i/o thread,
3706 then follow the non-ibuf read threads, and as
3707 the last are the non-ibuf write threads; if
3708 this is ULINT_UNDEFINED, then it means that
3709 sync aio is used, and this parameter is
3711 ulint pos
, /* in: this parameter is used only in sync aio:
3712 wait for the aio slot at this position */
3713 fil_node_t
**message1
, /* out: the messages passed with the aio
3714 request; note that also in the case where
3715 the aio operation failed, these output
3716 parameters are valid and can be used to
3717 restart the operation, for example */
3719 ulint
* type
) /* out: OS_FILE_WRITE or ..._READ */
3721 ulint orig_seg
= segment
;
3722 os_aio_array_t
* array
;
3723 os_aio_slot_t
* slot
;
3731 if (segment
== ULINT_UNDEFINED
) {
3732 array
= os_aio_sync_array
;
3735 segment
= os_aio_get_array_and_local_segment(&array
, segment
);
3738 /* NOTE! We only access constant fields in os_aio_array. Therefore
3739 we do not have to acquire the protecting mutex yet */
3741 ut_ad(os_aio_validate());
3742 ut_ad(segment
< array
->n_segments
);
3744 n
= array
->n_slots
/ array
->n_segments
;
3746 if (array
== os_aio_sync_array
) {
3747 os_event_wait(os_aio_array_get_nth_slot(array
, pos
)->event
);
3750 srv_set_io_thread_op_info(orig_seg
, "wait Windows aio");
3751 i
= os_event_wait_multiple(n
,
3752 (array
->native_events
)
3756 os_mutex_enter(array
->mutex
);
3758 slot
= os_aio_array_get_nth_slot(array
, i
+ segment
* n
);
3760 ut_a(slot
->reserved
);
3762 if (orig_seg
!= ULINT_UNDEFINED
) {
3763 srv_set_io_thread_op_info(orig_seg
,
3764 "get windows aio return value");
3767 ret
= GetOverlappedResult(slot
->file
, &(slot
->control
), &len
, TRUE
);
3769 *message1
= slot
->message1
;
3770 *message2
= slot
->message2
;
3774 if (ret
&& len
== slot
->len
) {
3777 # ifdef UNIV_DO_FLUSH
3778 if (slot
->type
== OS_FILE_WRITE
3779 && !os_do_not_call_flush_at_each_write
) {
3780 ut_a(TRUE
== os_file_flush(slot
->file
));
3782 # endif /* UNIV_DO_FLUSH */
3783 } else if (os_file_handle_error(slot
->name
, "Windows aio")) {
3791 os_mutex_exit(array
->mutex
);
3794 /* retry failed read/write operation synchronously.
3795 No need to hold array->mutex. */
3797 ut_a((slot
->len
& 0xFFFFFFFFUL
) == slot
->len
);
3799 switch (slot
->type
) {
3801 ret
= WriteFile(slot
->file
, slot
->buf
,
3802 (DWORD
) slot
->len
, &len
,
3807 ret
= ReadFile(slot
->file
, slot
->buf
,
3808 (DWORD
) slot
->len
, &len
,
3816 if (!ret
&& GetLastError() == ERROR_IO_PENDING
) {
3817 /* aio was queued successfully!
3818 We want a synchronous i/o operation on a
3819 file where we also use async i/o: in Windows
3820 we must use the same wait mechanism as for
3823 ret
= GetOverlappedResult(slot
->file
,
3828 ret_val
= ret
&& len
== slot
->len
;
3831 os_aio_array_free_slot(array
, slot
);
3837 #ifdef POSIX_ASYNC_IO
3839 /**************************************************************************
3840 This function is only used in Posix asynchronous i/o. Waits for an aio
3841 operation to complete. */
3844 os_aio_posix_handle(
3845 /*================*/
3846 /* out: TRUE if the aio operation succeeded */
3847 ulint array_no
, /* in: array number 0 - 3 */
3848 fil_node_t
**message1
, /* out: the messages passed with the aio
3849 request; note that also in the case where
3850 the aio operation failed, these output
3851 parameters are valid and can be used to
3852 restart the operation, for example */
3855 os_aio_array_t
* array
;
3856 os_aio_slot_t
* slot
;
3859 sigset_t proc_sigset
;
3860 sigset_t thr_sigset
;
3865 sigemptyset(&sigset
);
3866 sigaddset(&sigset
, SIGRTMIN
+ 1 + array_no
);
3868 pthread_sigmask(SIG_UNBLOCK
, &sigset
, NULL
);
3871 sigprocmask(0, NULL
, &proc_sigset
);
3872 pthread_sigmask(0, NULL
, &thr_sigset
);
3874 for (i
= 32 ; i
< 40; i
++) {
3875 fprintf(stderr
, "%lu : %lu %lu\n", (ulint
)i
,
3876 (ulint
) sigismember(&proc_sigset
, i
),
3877 (ulint
) sigismember(&thr_sigset
, i
));
3881 ret
= sigwaitinfo(&sigset
, &info
);
3883 if (sig
!= SIGRTMIN
+ 1 + array_no
) {
3890 fputs("Handling POSIX aio\n", stderr
);
3892 array
= os_aio_get_array_from_no(array_no
);
3894 os_mutex_enter(array
->mutex
);
3896 slot
= info
.si_value
.sival_ptr
;
3898 ut_a(slot
->reserved
);
3900 *message1
= slot
->message1
;
3901 *message2
= slot
->message2
;
3903 # ifdef UNIV_DO_FLUSH
3904 if (slot
->type
== OS_FILE_WRITE
3905 && !os_do_not_call_flush_at_each_write
) {
3906 ut_a(TRUE
== os_file_flush(slot
->file
));
3908 # endif /* UNIV_DO_FLUSH */
3910 os_mutex_exit(array
->mutex
);
3912 os_aio_array_free_slot(array
, slot
);
3918 /**************************************************************************
3919 Do a 'last millisecond' check that the page end is sensible;
3920 reported page checksum errors from Linux seem to wipe over the page end. */
3923 os_file_check_page_trailers(
3924 /*========================*/
3925 byte
* combined_buf
, /* in: combined write buffer */
3926 ulint total_len
) /* in: size of combined_buf, in bytes
3927 (a multiple of UNIV_PAGE_SIZE) */
3931 for (len
= 0; len
+ UNIV_PAGE_SIZE
<= total_len
;
3932 len
+= UNIV_PAGE_SIZE
) {
3933 byte
* buf
= combined_buf
+ len
;
3936 (memcmp(buf
+ (FIL_PAGE_LSN
+ 4),
3937 buf
+ (UNIV_PAGE_SIZE
3938 - FIL_PAGE_END_LSN_OLD_CHKSUM
+ 4), 4))) {
3939 ut_print_timestamp(stderr
);
3941 " InnoDB: ERROR: The page to be written"
3943 "InnoDB: Writing a block of %lu bytes,"
3944 " currently at offset %lu\n",
3945 (ulong
)total_len
, (ulong
)len
);
3946 buf_page_print(buf
);
3948 "InnoDB: ERROR: The page to be written"
3949 " seems corrupt!\n");
3954 /**************************************************************************
3955 Does simulated aio. This function should be called by an i/o-handler
3959 os_aio_simulated_handle(
3960 /*====================*/
3961 /* out: TRUE if the aio operation succeeded */
3962 ulint global_segment
, /* in: the number of the segment in the aio
3963 arrays to wait for; segment 0 is the ibuf
3964 i/o thread, segment 1 the log i/o thread,
3965 then follow the non-ibuf read threads, and as
3966 the last are the non-ibuf write threads */
3967 fil_node_t
**message1
, /* out: the messages passed with the aio
3968 request; note that also in the case where
3969 the aio operation failed, these output
3970 parameters are valid and can be used to
3971 restart the operation, for example */
3973 ulint
* type
) /* out: OS_FILE_WRITE or ..._READ */
3975 os_aio_array_t
* array
;
3977 os_aio_slot_t
* slot
;
3978 os_aio_slot_t
* slot2
;
3979 os_aio_slot_t
* consecutive_ios
[OS_AIO_MERGE_N_CONSECUTIVE
];
3980 ulint n_consecutive
;
3983 ulint lowest_offset
;
3987 byte
* combined_buf2
;
3992 /* Fix compiler warning */
3993 *consecutive_ios
= NULL
;
3995 segment
= os_aio_get_array_and_local_segment(&array
, global_segment
);
3998 /* NOTE! We only access constant fields in os_aio_array. Therefore
3999 we do not have to acquire the protecting mutex yet */
4001 srv_set_io_thread_op_info(global_segment
,
4002 "looking for i/o requests (a)");
4003 ut_ad(os_aio_validate());
4004 ut_ad(segment
< array
->n_segments
);
4006 n
= array
->n_slots
/ array
->n_segments
;
4008 /* Look through n slots after the segment * n'th slot */
4010 if (array
== os_aio_read_array
4011 && os_aio_recommend_sleep_for_read_threads
) {
4013 /* Give other threads a chance to add several i/os to the array
4016 goto recommended_sleep
;
4019 os_mutex_enter(array
->mutex
);
4021 srv_set_io_thread_op_info(global_segment
,
4022 "looking for i/o requests (b)");
4024 /* Check if there is a slot for which the i/o has already been
4027 for (i
= 0; i
< n
; i
++) {
4028 slot
= os_aio_array_get_nth_slot(array
, i
+ segment
* n
);
4030 if (slot
->reserved
&& slot
->io_already_done
) {
4032 if (os_aio_print_debug
) {
4034 "InnoDB: i/o for slot %lu"
4035 " already done, returning\n",
4047 /* If there are requests that are at least 2 seconds old, then pick the
4048 oldest one to prevent starvation. If several requests have the same age,
4049 then pick the one at the lowest offset. */
4052 lowest_offset
= ULINT_MAX
;
4054 for (i
= 0; i
< n
; i
++) {
4055 slot
= os_aio_array_get_nth_slot(array
, i
+ segment
* n
);
4057 if (slot
->reserved
) {
4058 age
= (ulint
)difftime(time(NULL
),
4059 slot
->reservation_time
);
4061 if ((age
>= 2 && age
> biggest_age
)
4062 || (age
>= 2 && age
== biggest_age
4063 && slot
->offset
< lowest_offset
)) {
4065 /* Found an i/o request */
4066 consecutive_ios
[0] = slot
;
4071 lowest_offset
= slot
->offset
;
4076 if (n_consecutive
== 0) {
4077 /* There were no old requests. Look for an i/o request at the
4078 lowest offset in the array (we ignore the high 32 bits of the
4079 offset in these heuristics) */
4081 lowest_offset
= ULINT_MAX
;
4083 for (i
= 0; i
< n
; i
++) {
4084 slot
= os_aio_array_get_nth_slot(array
,
4087 if (slot
->reserved
&& slot
->offset
< lowest_offset
) {
4089 /* Found an i/o request */
4090 consecutive_ios
[0] = slot
;
4094 lowest_offset
= slot
->offset
;
4099 if (n_consecutive
== 0) {
4101 /* No i/o requested at the moment */
4106 slot
= consecutive_ios
[0];
4108 /* Check if there are several consecutive blocks to read or write */
4111 for (i
= 0; i
< n
; i
++) {
4112 slot2
= os_aio_array_get_nth_slot(array
, i
+ segment
* n
);
4114 if (slot2
->reserved
&& slot2
!= slot
4115 && slot2
->offset
== slot
->offset
+ slot
->len
4116 /* check that sum does not wrap over */
4117 && slot
->offset
+ slot
->len
> slot
->offset
4118 && slot2
->offset_high
== slot
->offset_high
4119 && slot2
->type
== slot
->type
4120 && slot2
->file
== slot
->file
) {
4122 /* Found a consecutive i/o request */
4124 consecutive_ios
[n_consecutive
] = slot2
;
4129 if (n_consecutive
< OS_AIO_MERGE_N_CONSECUTIVE
) {
4131 goto consecutive_loop
;
4138 srv_set_io_thread_op_info(global_segment
, "consecutive i/o requests");
4140 /* We have now collected n_consecutive i/o requests in the array;
4141 allocate a single buffer which can hold all data, and perform the
4145 slot
= consecutive_ios
[0];
4147 for (i
= 0; i
< n_consecutive
; i
++) {
4148 total_len
+= consecutive_ios
[i
]->len
;
4151 if (n_consecutive
== 1) {
4152 /* We can use the buffer of the i/o request */
4153 combined_buf
= slot
->buf
;
4154 combined_buf2
= NULL
;
4156 combined_buf2
= ut_malloc(total_len
+ UNIV_PAGE_SIZE
);
4158 ut_a(combined_buf2
);
4160 combined_buf
= ut_align(combined_buf2
, UNIV_PAGE_SIZE
);
4163 /* We release the array mutex for the time of the i/o: NOTE that
4164 this assumes that there is just one i/o-handler thread serving
4165 a single segment of slots! */
4167 os_mutex_exit(array
->mutex
);
4169 if (slot
->type
== OS_FILE_WRITE
&& n_consecutive
> 1) {
4170 /* Copy the buffers to the combined buffer */
4173 for (i
= 0; i
< n_consecutive
; i
++) {
4175 ut_memcpy(combined_buf
+ offs
, consecutive_ios
[i
]->buf
,
4176 consecutive_ios
[i
]->len
);
4177 offs
+= consecutive_ios
[i
]->len
;
4181 srv_set_io_thread_op_info(global_segment
, "doing file i/o");
4183 if (os_aio_print_debug
) {
4185 "InnoDB: doing i/o of type %lu at offset %lu %lu,"
4187 (ulong
) slot
->type
, (ulong
) slot
->offset_high
,
4188 (ulong
) slot
->offset
, (ulong
) total_len
);
4191 /* Do the i/o with ordinary, synchronous i/o functions: */
4192 if (slot
->type
== OS_FILE_WRITE
) {
4193 if (array
== os_aio_write_array
) {
4194 if ((total_len
% UNIV_PAGE_SIZE
!= 0)
4195 || (slot
->offset
% UNIV_PAGE_SIZE
!= 0)) {
4197 "InnoDB: Error: trying a displaced"
4198 " write to %s %lu %lu, len %lu\n",
4199 slot
->name
, (ulong
) slot
->offset_high
,
4200 (ulong
) slot
->offset
,
4205 os_file_check_page_trailers(combined_buf
, total_len
);
4208 ret
= os_file_write(slot
->name
, slot
->file
, combined_buf
,
4209 slot
->offset
, slot
->offset_high
,
4212 if (array
== os_aio_write_array
) {
4213 os_file_check_page_trailers(combined_buf
, total_len
);
4216 ret
= os_file_read(slot
->file
, combined_buf
,
4217 slot
->offset
, slot
->offset_high
, total_len
);
4221 srv_set_io_thread_op_info(global_segment
, "file i/o done");
4225 "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4226 n_consecutive
, global_segment
, slot
->offset
/ UNIV_PAGE_SIZE
);
4229 if (slot
->type
== OS_FILE_READ
&& n_consecutive
> 1) {
4230 /* Copy the combined buffer to individual buffers */
4233 for (i
= 0; i
< n_consecutive
; i
++) {
4235 ut_memcpy(consecutive_ios
[i
]->buf
, combined_buf
+ offs
,
4236 consecutive_ios
[i
]->len
);
4237 offs
+= consecutive_ios
[i
]->len
;
4241 if (combined_buf2
) {
4242 ut_free(combined_buf2
);
4245 os_mutex_enter(array
->mutex
);
4247 /* Mark the i/os done in slots */
4249 for (i
= 0; i
< n_consecutive
; i
++) {
4250 consecutive_ios
[i
]->io_already_done
= TRUE
;
4253 /* We return the messages for the first slot now, and if there were
4254 several slots, the messages will be returned with subsequent calls
4259 ut_a(slot
->reserved
);
4261 *message1
= slot
->message1
;
4262 *message2
= slot
->message2
;
4266 os_mutex_exit(array
->mutex
);
4268 os_aio_array_free_slot(array
, slot
);
4273 srv_set_io_thread_op_info(global_segment
, "resetting wait event");
4275 /* We wait here until there again can be i/os in the segment
4278 os_event_reset(os_aio_segment_wait_events
[global_segment
]);
4280 os_mutex_exit(array
->mutex
);
4283 srv_set_io_thread_op_info(global_segment
, "waiting for i/o request");
4285 os_event_wait(os_aio_segment_wait_events
[global_segment
]);
4287 if (os_aio_print_debug
) {
4289 "InnoDB: i/o handler thread for i/o"
4290 " segment %lu wakes up\n",
4291 (ulong
) global_segment
);
4297 /**************************************************************************
4298 Validates the consistency of an aio array. */
4301 os_aio_array_validate(
4302 /*==================*/
4303 /* out: TRUE if ok */
4304 os_aio_array_t
* array
) /* in: aio wait array */
4306 os_aio_slot_t
* slot
;
4307 ulint n_reserved
= 0;
4312 os_mutex_enter(array
->mutex
);
4314 ut_a(array
->n_slots
> 0);
4315 ut_a(array
->n_segments
> 0);
4317 for (i
= 0; i
< array
->n_slots
; i
++) {
4318 slot
= os_aio_array_get_nth_slot(array
, i
);
4320 if (slot
->reserved
) {
4322 ut_a(slot
->len
> 0);
4326 ut_a(array
->n_reserved
== n_reserved
);
4328 os_mutex_exit(array
->mutex
);
4333 /**************************************************************************
4334 Validates the consistency of the aio system. */
4337 os_aio_validate(void)
4338 /*=================*/
4339 /* out: TRUE if ok */
4341 os_aio_array_validate(os_aio_read_array
);
4342 os_aio_array_validate(os_aio_write_array
);
4343 os_aio_array_validate(os_aio_ibuf_array
);
4344 os_aio_array_validate(os_aio_log_array
);
4345 os_aio_array_validate(os_aio_sync_array
);
4350 /**************************************************************************
4351 Prints info of the aio arrays to the given file: a state entry for each
i/o thread, the number of reserved slots in the aio arrays, pending flush
counts and file i/o counters, and figures computed from the change in those
counters since the previous printout. At the end the current counters and
the current time are remembered for the next printout.
NOTE(review): this extract omits some source lines. Among the parts that are
not visible are the function name line, the declarations of i and
n_reserved, the statement that updates n_reserved, the calls that the bare
format strings below belong to, and whatever transfers control back to the
slot count after `array` is reassigned. Confirm them against the full
file. */
4356 FILE* file
) /* in: file where to print */
4358 os_aio_array_t
* array
;
4359 os_aio_slot_t
* slot
;
4361 time_t current_time
;
4362 double time_elapsed
;
4363 double avg_bytes_read
;
/* Per-thread part: index, operation info string and function string of each
of the srv_n_file_io_threads i/o threads. */
4366 for (i
= 0; i
< srv_n_file_io_threads
; i
++) {
4367 fprintf(file
, "I/O thread %lu state: %s (%s)", (ulong
) i
,
4368 srv_io_thread_op_info
[i
],
4369 srv_io_thread_function
[i
]);
/* Mark threads whose segment wait event has its is_set field non-zero. */
4372 if (os_aio_segment_wait_events
[i
]->is_set
) {
4373 fprintf(file
, " ev set");
4377 fprintf(file
, "\n");
/* Per-array part: the label is written first, then the reserved-slot count
of the array that `array` currently points to. */
4380 fputs("Pending normal aio reads:", file
);
4382 array
= os_aio_read_array
;
/* The slots are inspected between os_mutex_enter and os_mutex_exit on the
array's own mutex. */
4386 os_mutex_enter(array
->mutex
);
/* ut_a is presumably the assertion macro of this code base - confirm. */
4388 ut_a(array
->n_slots
> 0);
4389 ut_a(array
->n_segments
> 0);
4393 for (i
= 0; i
< array
->n_slots
; i
++) {
4394 slot
= os_aio_array_get_nth_slot(array
, i
);
4396 if (slot
->reserved
) {
/* NOTE(review): this diagnostic goes to stderr, not to `file`; the extract
does not show whether it is compiled conditionally - confirm. */
4399 fprintf(stderr
, "Reserved slot, messages %p %p\n",
4400 (void*) slot
->message1
,
4401 (void*) slot
->message2
);
4403 ut_a(slot
->len
> 0);
/* The bookkeeping field of the array must agree with the local counter
before the counter is printed. */
4407 ut_a(array
->n_reserved
== n_reserved
);
4409 fprintf(file
, " %lu", (ulong
) n_reserved
);
4411 os_mutex_exit(array
->mutex
);
/* Chain through the arrays in the order read, write, ibuf, log, sync: each
test below writes the label of the next array and repoints `array` at it.
NOTE(review): the statement that returns to the counting code is not
visible in this extract. */
4413 if (array
== os_aio_read_array
) {
4414 fputs(", aio writes:", file
);
4416 array
= os_aio_write_array
;
4421 if (array
== os_aio_write_array
) {
4422 fputs(",\n ibuf aio reads:", file
);
4423 array
= os_aio_ibuf_array
;
4428 if (array
== os_aio_ibuf_array
) {
4429 fputs(", log i/o's:", file
);
4430 array
= os_aio_log_array
;
4435 if (array
== os_aio_log_array
) {
4436 fputs(", sync i/o's:", file
);
4437 array
= os_aio_sync_array
;
/* Time since the previous printout as reported by difftime, plus 0.001.
NOTE(review): the offset presumably keeps the value non-zero so it can be
used as a divisor; the divisions themselves are not visible here. */
4443 current_time
= time(NULL
);
4444 time_elapsed
= 0.001 + difftime(current_time
, os_last_printout
);
/* Pending flush counts and the file read, write and fsync counters. */
4447 "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4448 "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4449 (ulong
) fil_n_pending_log_flushes
,
4450 (ulong
) fil_n_pending_tablespace_flushes
,
4451 (ulong
) os_n_file_reads
, (ulong
) os_n_file_writes
,
4452 (ulong
) os_n_fsyncs
);
/* The pread/pwrite line is guarded by at least one of the two pending
counters being non-zero. */
4454 if (os_file_n_pending_preads
!= 0 || os_file_n_pending_pwrites
!= 0) {
4456 "%lu pending preads, %lu pending pwrites\n",
4457 (ulong
) os_file_n_pending_preads
,
4458 (ulong
) os_file_n_pending_pwrites
);
/* Average read size: 0.0 when the read counter has not changed since the
previous printout, which would make the divisor below zero; the quotient is
bytes read since the printout divided by the number of reads since the
printout. NOTE(review): the `else` joining the two assignments is not
visible in this extract. */
4461 if (os_n_file_reads
== os_n_file_reads_old
) {
4462 avg_bytes_read
= 0.0;
4464 avg_bytes_read
= (double) os_bytes_read_since_printout
4465 / (os_n_file_reads
- os_n_file_reads_old
);
/* The format expects three %.2f rates and the average read size.
NOTE(review): only the counter differences are visible here; the divisor
that turns them into per-second values is not - presumably time_elapsed. */
4469 "%.2f reads/s, %lu avg bytes/read,"
4470 " %.2f writes/s, %.2f fsyncs/s\n",
4471 (os_n_file_reads
- os_n_file_reads_old
)
4473 (ulong
)avg_bytes_read
,
4474 (os_n_file_writes
- os_n_file_writes_old
)
4476 (os_n_fsyncs
- os_n_fsyncs_old
)
/* Remember the current counters and time as the baseline for the next
printout, and restart the byte count. */
4479 os_n_file_reads_old
= os_n_file_reads
;
4480 os_n_file_writes_old
= os_n_file_writes
;
4481 os_n_fsyncs_old
= os_n_fsyncs
;
4482 os_bytes_read_since_printout
= 0;
4484 os_last_printout
= current_time
;
4487 /**************************************************************************
4488 Refreshes the statistics used to print per-second averages: the current
read, write and fsync counters are copied into their *_old counterparts, the
bytes-read-since-printout counter is set to zero and os_last_printout is set
to the current time. Nothing is printed.
NOTE(review): the return type and the braces are not visible in this
extract; confirm them against the full file. */
4491 os_aio_refresh_stats(void)
4492 /*======================*/
/* Copy the current counters into the *_old baselines. */
4494 os_n_file_reads_old
= os_n_file_reads
;
4495 os_n_file_writes_old
= os_n_file_writes
;
4496 os_n_fsyncs_old
= os_n_fsyncs
;
4497 os_bytes_read_since_printout
= 0;
/* Restart the elapsed-time measurement from now. */
4499 os_last_printout
= time(NULL
);
4503 /**************************************************************************
4504 Checks that all slots in the system have been freed, that is, there are
4505 no pending io operations. The n_reserved fields of the read, write, ibuf,
log and sync arrays are added into n_res, each one read while that array's
mutex is held. The mutexes are taken one after another, never together.
NOTE(review): the return type, the declaration and initial value of n_res,
and the final test that produces the return value are not visible in this
extract; confirm them against the full file. */
4508 os_aio_all_slots_free(void)
4509 /*=======================*/
4510 /* out: TRUE if all free */
4512 os_aio_array_t
* array
;
/* Read array. */
4515 array
= os_aio_read_array
;
4517 os_mutex_enter(array
->mutex
);
4519 n_res
+= array
->n_reserved
;
4521 os_mutex_exit(array
->mutex
);
/* Write array. */
4523 array
= os_aio_write_array
;
4525 os_mutex_enter(array
->mutex
);
4527 n_res
+= array
->n_reserved
;
4529 os_mutex_exit(array
->mutex
);
/* Ibuf array. */
4531 array
= os_aio_ibuf_array
;
4533 os_mutex_enter(array
->mutex
);
4535 n_res
+= array
->n_reserved
;
4537 os_mutex_exit(array
->mutex
);
/* Log array. */
4539 array
= os_aio_log_array
;
4541 os_mutex_enter(array
->mutex
);
4543 n_res
+= array
->n_reserved
;
4545 os_mutex_exit(array
->mutex
);
/* Sync array. */
4547 array
= os_aio_sync_array
;
4549 os_mutex_enter(array
->mutex
);
4551 n_res
+= array
->n_reserved
;
4553 os_mutex_exit(array
->mutex
);
4562 #endif /* UNIV_DEBUG */