mySQL 5.0.11 sources for tomato
[tomato.git] / release / src / router / mysql / storage / innobase / os / os0file.c
blobd088ff54ebc567fc36613c615f161c4f63874db2
1 /******************************************************
2 The interface to the operating system file i/o primitives
4 (c) 1995 Innobase Oy
6 Created 10/21/1995 Heikki Tuuri
7 *******************************************************/
9 #include "os0file.h"
10 #include "os0sync.h"
11 #include "os0thread.h"
12 #include "ut0mem.h"
13 #include "srv0srv.h"
14 #include "srv0start.h"
15 #include "fil0fil.h"
16 #include "buf0buf.h"
18 #if defined(UNIV_HOTBACKUP) && defined(__WIN__)
19 /* Add includes for the _stat() call to compile on Windows */
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <errno.h>
23 #endif /* UNIV_HOTBACKUP */
25 #ifdef POSIX_ASYNC_IO
26 /* We assume in this case that the OS has standard Posix aio (at least SunOS
27 2.6, HP-UX 11i and AIX 4.3 have) */
29 #endif
31 /* This specifies the file permissions InnoDB uses when it creates files in
32 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
33 my_umask */
35 #ifndef __WIN__
36 ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
37 #else
38 ulint os_innodb_umask = 0;
39 #endif
41 #ifdef UNIV_DO_FLUSH
42 /* If the following is set to TRUE, we do not call os_file_flush in every
43 os_file_write. We can set this TRUE when the doublewrite buffer is used. */
44 ibool os_do_not_call_flush_at_each_write = FALSE;
45 #else
46 /* We do not call os_file_flush in every os_file_write. */
47 #endif /* UNIV_DO_FLUSH */
49 /* We use these mutexes to protect lseek + file i/o operation, if the
50 OS does not provide an atomic pread or pwrite, or similar */
51 #define OS_FILE_N_SEEK_MUTEXES 16
52 os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
54 /* In simulated aio, merge at most this many consecutive i/os */
55 #define OS_AIO_MERGE_N_CONSECUTIVE 64
57 /* If this flag is TRUE, then we will use the native aio of the
58 OS (provided we compiled Innobase with it in), otherwise we will
59 use simulated aio we build below with threads */
61 ibool os_aio_use_native_aio = FALSE;
63 ibool os_aio_print_debug = FALSE;
65 /* The aio array slot structure */
66 typedef struct os_aio_slot_struct os_aio_slot_t;
68 struct os_aio_slot_struct{
69 ibool is_read; /* TRUE if a read operation */
70 ulint pos; /* index of the slot in the aio
71 array */
72 ibool reserved; /* TRUE if this slot is reserved */
73 time_t reservation_time;/* time when reserved */
74 ulint len; /* length of the block to read or
75 write */
76 byte* buf; /* buffer used in i/o */
77 ulint type; /* OS_FILE_READ or OS_FILE_WRITE */
78 ulint offset; /* 32 low bits of file offset in
79 bytes */
80 ulint offset_high; /* 32 high bits of file offset */
81 os_file_t file; /* file where to read or write */
82 const char* name; /* file name or path */
83 ibool io_already_done;/* used only in simulated aio:
84 TRUE if the physical i/o already
85 made and only the slot message
86 needs to be passed to the caller
87 of os_aio_simulated_handle */
88 fil_node_t* message1; /* message which is given by the */
89 void* message2; /* the requester of an aio operation
90 and which can be used to identify
91 which pending aio operation was
92 completed */
93 #ifdef WIN_ASYNC_IO
94 os_event_t event; /* event object we need in the
95 OVERLAPPED struct */
96 OVERLAPPED control; /* Windows control block for the
97 aio request */
98 #elif defined(POSIX_ASYNC_IO)
99 struct aiocb control; /* Posix control block for aio
100 request */
101 #endif
104 /* The aio array structure */
105 typedef struct os_aio_array_struct os_aio_array_t;
107 struct os_aio_array_struct{
108 os_mutex_t mutex; /* the mutex protecting the aio array */
109 os_event_t not_full; /* The event which is set to the signaled
110 state when there is space in the aio
111 outside the ibuf segment */
112 os_event_t is_empty; /* The event which is set to the signaled
113 state when there are no pending i/os
114 in this array */
115 ulint n_slots; /* Total number of slots in the aio array.
116 This must be divisible by n_threads. */
117 ulint n_segments;/* Number of segments in the aio array of
118 pending aio requests. A thread can wait
119 separately for any one of the segments. */
120 ulint n_reserved;/* Number of reserved slots in the
121 aio array outside the ibuf segment */
122 os_aio_slot_t* slots; /* Pointer to the slots in the array */
123 #ifdef __WIN__
124 os_native_event_t* native_events;
125 /* Pointer to an array of OS native event
126 handles where we copied the handles from
127 slots, in the same order. This can be used
128 in WaitForMultipleObjects; used only in
129 Windows */
130 #endif
133 /* Array of events used in simulated aio */
134 os_event_t* os_aio_segment_wait_events = NULL;
136 /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
137 are NULL when the module has not yet been initialized. */
138 static os_aio_array_t* os_aio_read_array = NULL;
139 static os_aio_array_t* os_aio_write_array = NULL;
140 static os_aio_array_t* os_aio_ibuf_array = NULL;
141 static os_aio_array_t* os_aio_log_array = NULL;
142 static os_aio_array_t* os_aio_sync_array = NULL;
144 static ulint os_aio_n_segments = ULINT_UNDEFINED;
146 /* If the following is TRUE, read i/o handler threads try to
147 wait until a batch of new read requests have been posted */
148 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
150 ulint os_n_file_reads = 0;
151 ulint os_bytes_read_since_printout = 0;
152 ulint os_n_file_writes = 0;
153 ulint os_n_fsyncs = 0;
154 ulint os_n_file_reads_old = 0;
155 ulint os_n_file_writes_old = 0;
156 ulint os_n_fsyncs_old = 0;
157 time_t os_last_printout;
159 ibool os_has_said_disk_full = FALSE;
161 /* The mutex protecting the following counts of pending I/O operations */
162 static os_mutex_t os_file_count_mutex;
163 ulint os_file_n_pending_preads = 0;
164 ulint os_file_n_pending_pwrites = 0;
165 ulint os_n_pending_writes = 0;
166 ulint os_n_pending_reads = 0;
168 /***************************************************************************
169 Gets the operating system version. Currently works only on Windows. */
171 ulint
172 os_get_os_version(void)
173 /*===================*/
174 /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
176 #ifdef __WIN__
177 OSVERSIONINFO os_info;
179 os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
181 ut_a(GetVersionEx(&os_info));
183 if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
184 return(OS_WIN31);
185 } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
186 return(OS_WIN95);
187 } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
188 if (os_info.dwMajorVersion <= 4) {
189 return(OS_WINNT);
190 } else {
191 return(OS_WIN2000);
193 } else {
194 ut_error;
195 return(0);
197 #else
198 ut_error;
200 return(0);
201 #endif
204 /***************************************************************************
205 Retrieves the last error number if an error occurs in a file io function.
206 The number should be retrieved before any other OS calls (because they may
207 overwrite the error number). If the number is not known to this program,
208 the OS error number + 100 is returned. */
210 ulint
211 os_file_get_last_error(
212 /*===================*/
213 /* out: error number, or OS error
214 number + 100 */
215 ibool report_all_errors) /* in: TRUE if we want an error message
216 printed of all errors */
218 ulint err;
220 #ifdef __WIN__
222 err = (ulint) GetLastError();
224 if (report_all_errors
225 || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
227 ut_print_timestamp(stderr);
228 fprintf(stderr,
229 " InnoDB: Operating system error number %lu"
230 " in a file operation.\n", (ulong) err);
232 if (err == ERROR_PATH_NOT_FOUND) {
233 fprintf(stderr,
234 "InnoDB: The error means the system"
235 " cannot find the path specified.\n");
237 if (srv_is_being_started) {
238 fprintf(stderr,
239 "InnoDB: If you are installing InnoDB,"
240 " remember that you must create\n"
241 "InnoDB: directories yourself, InnoDB"
242 " does not create them.\n");
244 } else if (err == ERROR_ACCESS_DENIED) {
245 fprintf(stderr,
246 "InnoDB: The error means mysqld does not have"
247 " the access rights to\n"
248 "InnoDB: the directory. It may also be"
249 " you have created a subdirectory\n"
250 "InnoDB: of the same name as a data file.\n");
251 } else if (err == ERROR_SHARING_VIOLATION
252 || err == ERROR_LOCK_VIOLATION) {
253 fprintf(stderr,
254 "InnoDB: The error means that another program"
255 " is using InnoDB's files.\n"
256 "InnoDB: This might be a backup or antivirus"
257 " software or another instance\n"
258 "InnoDB: of MySQL."
259 " Please close it to get rid of this error.\n");
260 } else if (err == ERROR_OPERATION_ABORTED) {
261 fprintf(stderr,
262 "InnoDB: The error means that the I/O"
263 " operation has been aborted\n"
264 "InnoDB: because of either a thread exit"
265 " or an application request.\n"
266 "InnoDB: Retry attempt is made.\n");
267 } else {
268 fprintf(stderr,
269 "InnoDB: Some operating system error numbers"
270 " are described at\n"
271 "InnoDB: "
272 "http://dev.mysql.com/doc/refman/5.1/en/"
273 "operating-system-error-codes.html\n");
277 fflush(stderr);
279 if (err == ERROR_FILE_NOT_FOUND) {
280 return(OS_FILE_NOT_FOUND);
281 } else if (err == ERROR_DISK_FULL) {
282 return(OS_FILE_DISK_FULL);
283 } else if (err == ERROR_FILE_EXISTS) {
284 return(OS_FILE_ALREADY_EXISTS);
285 } else if (err == ERROR_SHARING_VIOLATION
286 || err == ERROR_LOCK_VIOLATION) {
287 return(OS_FILE_SHARING_VIOLATION);
288 } else if (err == ERROR_OPERATION_ABORTED) {
289 return(OS_FILE_OPERATION_ABORTED);
290 } else {
291 return(100 + err);
293 #else
294 err = (ulint) errno;
296 if (report_all_errors
297 || (err != ENOSPC && err != EEXIST)) {
299 ut_print_timestamp(stderr);
300 fprintf(stderr,
301 " InnoDB: Operating system error number %lu"
302 " in a file operation.\n", (ulong) err);
304 if (err == ENOENT) {
305 fprintf(stderr,
306 "InnoDB: The error means the system"
307 " cannot find the path specified.\n");
309 if (srv_is_being_started) {
310 fprintf(stderr,
311 "InnoDB: If you are installing InnoDB,"
312 " remember that you must create\n"
313 "InnoDB: directories yourself, InnoDB"
314 " does not create them.\n");
316 } else if (err == EACCES) {
317 fprintf(stderr,
318 "InnoDB: The error means mysqld does not have"
319 " the access rights to\n"
320 "InnoDB: the directory.\n");
321 } else {
322 if (strerror((int)err) != NULL) {
323 fprintf(stderr,
324 "InnoDB: Error number %lu"
325 " means '%s'.\n",
326 err, strerror((int)err));
329 fprintf(stderr,
330 "InnoDB: Some operating system"
331 " error numbers are described at\n"
332 "InnoDB: "
333 "http://dev.mysql.com/doc/refman/5.1/en/"
334 "operating-system-error-codes.html\n");
338 fflush(stderr);
340 if (err == ENOSPC) {
341 return(OS_FILE_DISK_FULL);
342 #ifdef POSIX_ASYNC_IO
343 } else if (err == EAGAIN) {
344 return(OS_FILE_AIO_RESOURCES_RESERVED);
345 #endif
346 } else if (err == ENOENT) {
347 return(OS_FILE_NOT_FOUND);
348 } else if (err == EEXIST) {
349 return(OS_FILE_ALREADY_EXISTS);
350 } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
351 return(OS_FILE_PATH_ERROR);
352 } else {
353 return(100 + err);
355 #endif
358 /********************************************************************
359 Does error handling when a file operation fails.
360 Conditionally exits (calling exit(3)) based on should_exit value and the
361 error type */
363 static
364 ibool
365 os_file_handle_error_cond_exit(
366 /*===========================*/
367 /* out: TRUE if we should retry the
368 operation */
369 const char* name, /* in: name of a file or NULL */
370 const char* operation, /* in: operation */
371 ibool should_exit) /* in: call exit(3) if unknown error
372 and this parameter is TRUE */
374 ulint err;
376 err = os_file_get_last_error(FALSE);
378 if (err == OS_FILE_DISK_FULL) {
379 /* We only print a warning about disk full once */
381 if (os_has_said_disk_full) {
383 return(FALSE);
386 if (name) {
387 ut_print_timestamp(stderr);
388 fprintf(stderr,
389 " InnoDB: Encountered a problem with"
390 " file %s\n", name);
393 ut_print_timestamp(stderr);
394 fprintf(stderr,
395 " InnoDB: Disk is full. Try to clean the disk"
396 " to free space.\n");
398 os_has_said_disk_full = TRUE;
400 fflush(stderr);
402 return(FALSE);
403 } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
405 return(TRUE);
406 } else if (err == OS_FILE_ALREADY_EXISTS
407 || err == OS_FILE_PATH_ERROR) {
409 return(FALSE);
410 } else if (err == OS_FILE_SHARING_VIOLATION) {
412 os_thread_sleep(10000000); /* 10 sec */
413 return(TRUE);
414 } else if (err == OS_FILE_OPERATION_ABORTED) {
416 os_thread_sleep(100000); /* 100 ms */
417 return(TRUE);
418 } else {
419 if (name) {
420 fprintf(stderr, "InnoDB: File name %s\n", name);
423 fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
424 operation);
426 if (should_exit) {
427 fprintf(stderr, "InnoDB: Cannot continue operation.\n");
429 fflush(stderr);
431 exit(1);
435 return(FALSE);
438 /********************************************************************
439 Does error handling when a file operation fails. */
440 static
441 ibool
442 os_file_handle_error(
443 /*=================*/
444 /* out: TRUE if we should retry the
445 operation */
446 const char* name, /* in: name of a file or NULL */
447 const char* operation)/* in: operation */
449 /* exit in case of unknown error */
450 return(os_file_handle_error_cond_exit(name, operation, TRUE));
453 /********************************************************************
454 Does error handling when a file operation fails. */
455 static
456 ibool
457 os_file_handle_error_no_exit(
458 /*=========================*/
459 /* out: TRUE if we should retry the
460 operation */
461 const char* name, /* in: name of a file or NULL */
462 const char* operation)/* in: operation */
464 /* don't exit in case of unknown error */
465 return(os_file_handle_error_cond_exit(name, operation, FALSE));
#undef USE_FILE_LOCK
#define USE_FILE_LOCK
#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
/* InnoDB Hot Backup does not lock the data files.
 * On Windows, mandatory locking is used.
 */
# undef USE_FILE_LOCK
#endif
#ifdef USE_FILE_LOCK
/********************************************************************
Obtain an exclusive lock on a file. The lock is requested with a
non-blocking fcntl(F_SETLK) write lock over the whole file. On failure a
diagnostic message is printed to stderr. */
static
int
os_file_lock(
/*=========*/
				/* out: 0 on success, -1 if the lock
				could not be obtained */
	int		fd,	/* in: file descriptor */
	const char*	name)	/* in: file name */
{
	struct flock	lk;
	int		lock_errno;

	lk.l_type = F_WRLCK;
	lk.l_whence = SEEK_SET;
	/* start 0 and length 0 together mean "the whole file" */
	lk.l_start = lk.l_len = 0;

	if (fcntl(fd, F_SETLK, &lk) != -1) {

		return(0);
	}

	/* Save errno at once: the fprintf() calls below are library
	calls that are allowed to overwrite it, which would make both the
	printed number and the EAGAIN/EACCES test unreliable. */
	lock_errno = errno;

	fprintf(stderr,
		"InnoDB: Unable to lock %s, error: %d\n", name, lock_errno);

	if (lock_errno == EAGAIN || lock_errno == EACCES) {
		fprintf(stderr,
			"InnoDB: Check that you do not already have"
			" another mysqld process\n"
			"InnoDB: using the same InnoDB data"
			" or log files.\n");
	}

	return(-1);
}
#endif /* USE_FILE_LOCK */
510 /********************************************************************
511 Creates the seek mutexes used in positioned reads and writes. */
513 void
514 os_io_init_simple(void)
515 /*===================*/
517 ulint i;
519 os_file_count_mutex = os_mutex_create(NULL);
521 for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
522 os_file_seek_mutexes[i] = os_mutex_create(NULL);
#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
/*************************************************************************
Creates a temporary file that will be deleted on close.
This function is defined in ha_innodb.cc. */

int
innobase_mysql_tmpfile(void);
/*========================*/
			/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */

/***************************************************************************
Creates a temporary file. This function is like tmpfile(3), but
the temporary file is created in the MySQL temporary directory.
On Netware, this function is like tmpfile(3), because the C run-time
library of Netware does not expose the delete-on-close flag.
On failure a message with the errno value is printed to stderr and, if a
file descriptor was already obtained, the descriptor is closed. */

FILE*
os_file_create_tmpfile(void)
/*========================*/
			/* out: temporary file handle, or NULL on error */
{
#ifdef UNIV_HOTBACKUP
	ut_error;

	return(NULL);
#else
# ifdef __NETWARE__
	FILE*	file	= tmpfile();
# else /* __NETWARE__ */
	FILE*	file	= NULL;
	int	fd	= innobase_mysql_tmpfile();

	if (fd >= 0) {
		file = fdopen(fd, "w+b");
	}
# endif /* __NETWARE__ */

	if (!file) {
		/* Capture errno before calling anything else: any
		intervening library call (the timestamp printing below
		included) is allowed to overwrite it, and we want to report
		the reason the file creation itself failed. */
		int	create_errno = errno;

		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Error: unable to create temporary file;"
			" errno: %d\n", create_errno);
# ifndef __NETWARE__
		/* fdopen() failed on a valid descriptor: do not leak it */
		if (fd >= 0) {
			close(fd);
		}
# endif /* !__NETWARE__ */
	}

	return(file);
#endif /* UNIV_HOTBACKUP */
}
580 /***************************************************************************
581 The os_file_opendir() function opens a directory stream corresponding to the
582 directory named by the dirname argument. The directory stream is positioned
583 at the first entry. In both Unix and Windows we automatically skip the '.'
584 and '..' items at the start of the directory listing. */
586 os_file_dir_t
587 os_file_opendir(
588 /*============*/
589 /* out: directory stream, NULL if
590 error */
591 const char* dirname, /* in: directory name; it must not
592 contain a trailing '\' or '/' */
593 ibool error_is_fatal) /* in: TRUE if we should treat an
594 error as a fatal error; if we try to
595 open symlinks then we do not wish a
596 fatal error if it happens not to be
597 a directory */
599 os_file_dir_t dir;
600 #ifdef __WIN__
601 LPWIN32_FIND_DATA lpFindFileData;
602 char path[OS_FILE_MAX_PATH + 3];
604 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
606 strcpy(path, dirname);
607 strcpy(path + strlen(path), "\\*");
609 /* Note that in Windows opening the 'directory stream' also retrieves
610 the first entry in the directory. Since it is '.', that is no problem,
611 as we will skip over the '.' and '..' entries anyway. */
613 lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
615 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
617 ut_free(lpFindFileData);
619 if (dir == INVALID_HANDLE_VALUE) {
621 if (error_is_fatal) {
622 os_file_handle_error(dirname, "opendir");
625 return(NULL);
628 return(dir);
629 #else
630 dir = opendir(dirname);
632 if (dir == NULL && error_is_fatal) {
633 os_file_handle_error(dirname, "opendir");
636 return(dir);
637 #endif
640 /***************************************************************************
641 Closes a directory stream. */
644 os_file_closedir(
645 /*=============*/
646 /* out: 0 if success, -1 if failure */
647 os_file_dir_t dir) /* in: directory stream */
649 #ifdef __WIN__
650 BOOL ret;
652 ret = FindClose(dir);
654 if (!ret) {
655 os_file_handle_error_no_exit(NULL, "closedir");
657 return(-1);
660 return(0);
661 #else
662 int ret;
664 ret = closedir(dir);
666 if (ret) {
667 os_file_handle_error_no_exit(NULL, "closedir");
670 return(ret);
671 #endif
674 /***************************************************************************
675 This function returns information of the next file in the directory. We jump
676 over the '.' and '..' entries in the directory. */
679 os_file_readdir_next_file(
680 /*======================*/
681 /* out: 0 if ok, -1 if error, 1 if at the end
682 of the directory */
683 const char* dirname,/* in: directory name or path */
684 os_file_dir_t dir, /* in: directory stream */
685 os_file_stat_t* info) /* in/out: buffer where the info is returned */
687 #ifdef __WIN__
688 LPWIN32_FIND_DATA lpFindFileData;
689 BOOL ret;
691 lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
692 next_file:
693 ret = FindNextFile(dir, lpFindFileData);
695 if (ret) {
696 ut_a(strlen((char *) lpFindFileData->cFileName)
697 < OS_FILE_MAX_PATH);
699 if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
700 || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
702 goto next_file;
705 strcpy(info->name, (char *) lpFindFileData->cFileName);
707 info->size = (ib_longlong)(lpFindFileData->nFileSizeLow)
708 + (((ib_longlong)(lpFindFileData->nFileSizeHigh))
709 << 32);
711 if (lpFindFileData->dwFileAttributes
712 & FILE_ATTRIBUTE_REPARSE_POINT) {
713 /* TODO: test Windows symlinks */
714 /* TODO: MySQL has apparently its own symlink
715 implementation in Windows, dbname.sym can
716 redirect a database directory:
717 http://dev.mysql.com/doc/refman/5.1/en/
718 windows-symbolic-links.html */
719 info->type = OS_FILE_TYPE_LINK;
720 } else if (lpFindFileData->dwFileAttributes
721 & FILE_ATTRIBUTE_DIRECTORY) {
722 info->type = OS_FILE_TYPE_DIR;
723 } else {
724 /* It is probably safest to assume that all other
725 file types are normal. Better to check them rather
726 than blindly skip them. */
728 info->type = OS_FILE_TYPE_FILE;
732 ut_free(lpFindFileData);
734 if (ret) {
735 return(0);
736 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
738 return(1);
739 } else {
740 os_file_handle_error_no_exit(dirname,
741 "readdir_next_file");
742 return(-1);
744 #else
745 struct dirent* ent;
746 char* full_path;
747 int ret;
748 struct stat statinfo;
749 #ifdef HAVE_READDIR_R
750 char dirent_buf[sizeof(struct dirent)
751 + _POSIX_PATH_MAX + 100];
752 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
753 the max file name len; but in most standards, the
754 length is NAME_MAX; we add 100 to be even safer */
755 #endif
757 next_file:
759 #ifdef HAVE_READDIR_R
760 ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
762 if (ret != 0
763 #ifdef UNIV_AIX
764 /* On AIX, only if we got non-NULL 'ent' (result) value and
765 a non-zero 'ret' (return) value, it indicates a failed
766 readdir_r() call. An NULL 'ent' with an non-zero 'ret'
767 would indicate the "end of the directory" is reached. */
768 && ent != NULL
769 #endif
771 fprintf(stderr,
772 "InnoDB: cannot read directory %s, error %lu\n",
773 dirname, (ulong)ret);
775 return(-1);
778 if (ent == NULL) {
779 /* End of directory */
781 return(1);
784 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
785 #else
786 ent = readdir(dir);
788 if (ent == NULL) {
790 return(1);
792 #endif
793 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
795 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
797 goto next_file;
800 strcpy(info->name, ent->d_name);
802 full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
804 sprintf(full_path, "%s/%s", dirname, ent->d_name);
806 ret = stat(full_path, &statinfo);
808 if (ret) {
809 os_file_handle_error_no_exit(full_path, "stat");
811 ut_free(full_path);
813 return(-1);
816 info->size = (ib_longlong)statinfo.st_size;
818 if (S_ISDIR(statinfo.st_mode)) {
819 info->type = OS_FILE_TYPE_DIR;
820 } else if (S_ISLNK(statinfo.st_mode)) {
821 info->type = OS_FILE_TYPE_LINK;
822 } else if (S_ISREG(statinfo.st_mode)) {
823 info->type = OS_FILE_TYPE_FILE;
824 } else {
825 info->type = OS_FILE_TYPE_UNKNOWN;
828 ut_free(full_path);
830 return(0);
831 #endif
834 /*********************************************************************
835 This function attempts to create a directory named pathname. The new directory
836 gets default permissions. On Unix the permissions are (0770 & ~umask). If the
837 directory exists already, nothing is done and the call succeeds, unless the
838 fail_if_exists arguments is true. */
840 ibool
841 os_file_create_directory(
842 /*=====================*/
843 /* out: TRUE if call succeeds,
844 FALSE on error */
845 const char* pathname, /* in: directory name as
846 null-terminated string */
847 ibool fail_if_exists) /* in: if TRUE, pre-existing directory
848 is treated as an error. */
850 #ifdef __WIN__
851 BOOL rcode;
853 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
854 if (!(rcode != 0
855 || (GetLastError() == ERROR_ALREADY_EXISTS
856 && !fail_if_exists))) {
857 /* failure */
858 os_file_handle_error(pathname, "CreateDirectory");
860 return(FALSE);
863 return (TRUE);
864 #else
865 int rcode;
867 rcode = mkdir(pathname, 0770);
869 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
870 /* failure */
871 os_file_handle_error(pathname, "mkdir");
873 return(FALSE);
876 return (TRUE);
877 #endif
880 /********************************************************************
881 A simple function to open or create a file. */
883 os_file_t
884 os_file_create_simple(
885 /*==================*/
886 /* out, own: handle to the file, not defined
887 if error, error number can be retrieved with
888 os_file_get_last_error */
889 const char* name, /* in: name of the file or path as a
890 null-terminated string */
891 ulint create_mode,/* in: OS_FILE_OPEN if an existing file is
892 opened (if does not exist, error), or
893 OS_FILE_CREATE if a new file is created
894 (if exists, error), or
895 OS_FILE_CREATE_PATH if new file
896 (if exists, error) and subdirectories along
897 its path are created (if needed)*/
898 ulint access_type,/* in: OS_FILE_READ_ONLY or
899 OS_FILE_READ_WRITE */
900 ibool* success)/* out: TRUE if succeed, FALSE if error */
902 #ifdef __WIN__
903 os_file_t file;
904 DWORD create_flag;
905 DWORD access;
906 DWORD attributes = 0;
907 ibool retry;
909 try_again:
910 ut_a(name);
912 if (create_mode == OS_FILE_OPEN) {
913 create_flag = OPEN_EXISTING;
914 } else if (create_mode == OS_FILE_CREATE) {
915 create_flag = CREATE_NEW;
916 } else if (create_mode == OS_FILE_CREATE_PATH) {
917 /* create subdirs along the path if needed */
918 *success = os_file_create_subdirs_if_needed(name);
919 if (!*success) {
920 ut_error;
922 create_flag = CREATE_NEW;
923 create_mode = OS_FILE_CREATE;
924 } else {
925 create_flag = 0;
926 ut_error;
929 if (access_type == OS_FILE_READ_ONLY) {
930 access = GENERIC_READ;
931 } else if (access_type == OS_FILE_READ_WRITE) {
932 access = GENERIC_READ | GENERIC_WRITE;
933 } else {
934 access = 0;
935 ut_error;
938 file = CreateFile((LPCTSTR) name,
939 access,
940 FILE_SHARE_READ | FILE_SHARE_WRITE,
941 /* file can be read and written also
942 by other processes */
943 NULL, /* default security attributes */
944 create_flag,
945 attributes,
946 NULL); /* no template file */
948 if (file == INVALID_HANDLE_VALUE) {
949 *success = FALSE;
951 retry = os_file_handle_error(name,
952 create_mode == OS_FILE_OPEN ?
953 "open" : "create");
954 if (retry) {
955 goto try_again;
957 } else {
958 *success = TRUE;
961 return(file);
962 #else /* __WIN__ */
963 os_file_t file;
964 int create_flag;
965 ibool retry;
967 try_again:
968 ut_a(name);
970 if (create_mode == OS_FILE_OPEN) {
971 if (access_type == OS_FILE_READ_ONLY) {
972 create_flag = O_RDONLY;
973 } else {
974 create_flag = O_RDWR;
976 } else if (create_mode == OS_FILE_CREATE) {
977 create_flag = O_RDWR | O_CREAT | O_EXCL;
978 } else if (create_mode == OS_FILE_CREATE_PATH) {
979 /* create subdirs along the path if needed */
980 *success = os_file_create_subdirs_if_needed(name);
981 if (!*success) {
982 return (-1);
984 create_flag = O_RDWR | O_CREAT | O_EXCL;
985 create_mode = OS_FILE_CREATE;
986 } else {
987 create_flag = 0;
988 ut_error;
991 if (create_mode == OS_FILE_CREATE) {
992 file = open(name, create_flag, S_IRUSR | S_IWUSR
993 | S_IRGRP | S_IWGRP);
994 } else {
995 file = open(name, create_flag);
998 if (file == -1) {
999 *success = FALSE;
1001 retry = os_file_handle_error(name,
1002 create_mode == OS_FILE_OPEN ?
1003 "open" : "create");
1004 if (retry) {
1005 goto try_again;
1007 #ifdef USE_FILE_LOCK
1008 } else if (access_type == OS_FILE_READ_WRITE
1009 && os_file_lock(file, name)) {
1010 *success = FALSE;
1011 close(file);
1012 file = -1;
1013 #endif
1014 } else {
1015 *success = TRUE;
1018 return(file);
1019 #endif /* __WIN__ */
1022 /********************************************************************
1023 A simple function to open or create a file. */
1025 os_file_t
1026 os_file_create_simple_no_error_handling(
1027 /*====================================*/
1028 /* out, own: handle to the file, not defined
1029 if error, error number can be retrieved with
1030 os_file_get_last_error */
1031 const char* name, /* in: name of the file or path as a
1032 null-terminated string */
1033 ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1034 is opened (if does not exist, error), or
1035 OS_FILE_CREATE if a new file is created
1036 (if exists, error) */
1037 ulint access_type,/* in: OS_FILE_READ_ONLY,
1038 OS_FILE_READ_WRITE, or
1039 OS_FILE_READ_ALLOW_DELETE; the last option is
1040 used by a backup program reading the file */
1041 ibool* success)/* out: TRUE if succeed, FALSE if error */
1043 #ifdef __WIN__
1044 os_file_t file;
1045 DWORD create_flag;
1046 DWORD access;
1047 DWORD attributes = 0;
1048 DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1050 ut_a(name);
1052 if (create_mode == OS_FILE_OPEN) {
1053 create_flag = OPEN_EXISTING;
1054 } else if (create_mode == OS_FILE_CREATE) {
1055 create_flag = CREATE_NEW;
1056 } else {
1057 create_flag = 0;
1058 ut_error;
1061 if (access_type == OS_FILE_READ_ONLY) {
1062 access = GENERIC_READ;
1063 } else if (access_type == OS_FILE_READ_WRITE) {
1064 access = GENERIC_READ | GENERIC_WRITE;
1065 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1066 access = GENERIC_READ;
1067 share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1068 | FILE_SHARE_WRITE; /* A backup program has to give
1069 mysqld the maximum freedom to
1070 do what it likes with the
1071 file */
1072 } else {
1073 access = 0;
1074 ut_error;
1077 file = CreateFile((LPCTSTR) name,
1078 access,
1079 share_mode,
1080 NULL, /* default security attributes */
1081 create_flag,
1082 attributes,
1083 NULL); /* no template file */
1085 if (file == INVALID_HANDLE_VALUE) {
1086 *success = FALSE;
1087 } else {
1088 *success = TRUE;
1091 return(file);
1092 #else /* __WIN__ */
1093 os_file_t file;
1094 int create_flag;
1096 ut_a(name);
1098 if (create_mode == OS_FILE_OPEN) {
1099 if (access_type == OS_FILE_READ_ONLY) {
1100 create_flag = O_RDONLY;
1101 } else {
1102 create_flag = O_RDWR;
1104 } else if (create_mode == OS_FILE_CREATE) {
1105 create_flag = O_RDWR | O_CREAT | O_EXCL;
1106 } else {
1107 create_flag = 0;
1108 ut_error;
1111 if (create_mode == OS_FILE_CREATE) {
1112 file = open(name, create_flag, S_IRUSR | S_IWUSR
1113 | S_IRGRP | S_IWGRP);
1114 } else {
1115 file = open(name, create_flag);
1118 if (file == -1) {
1119 *success = FALSE;
1120 #ifdef USE_FILE_LOCK
1121 } else if (access_type == OS_FILE_READ_WRITE
1122 && os_file_lock(file, name)) {
1123 *success = FALSE;
1124 close(file);
1125 file = -1;
1126 #endif
1127 } else {
1128 *success = TRUE;
1131 return(file);
1132 #endif /* __WIN__ */
/********************************************************************
Tries to disable OS caching on an opened file descriptor. A failure is only
reported on stderr; the caller continues with a cached file. Where neither
directio() nor O_DIRECT is available this function does nothing. */

void
os_file_set_nocache(
/*================*/
	int		fd		/* in: file descriptor to alter */
	__attribute__((unused)),
	const char*	file_name	/* in: used in the diagnostic message */
	__attribute__((unused)),
	const char*	operation_name __attribute__((unused)))
					/* in: used in the diagnostic message,
					we call os_file_set_nocache()
					immediately after opening or creating
					a file, so this is either "open" or
					"create" */
{
	/* some versions of Solaris may not have DIRECTIO_ON */
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
	if (directio(fd, DIRECTIO_ON) == -1) {
		int	errno_save;
		errno_save = (int)errno;
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Failed to set DIRECTIO_ON "
			"on file %s: %s: %s, continuing anyway\n",
			file_name, operation_name, strerror(errno_save));
	}
#elif defined(O_DIRECT)
	/* F_SETFL replaces the whole set of file status flags, so read the
	current flags first and add O_DIRECT to them; passing O_DIRECT
	alone would clear any status flag that is already set on the
	descriptor. */
	int	cur_flags = fcntl(fd, F_GETFL);

	if (cur_flags == -1
	    || fcntl(fd, F_SETFL, cur_flags | O_DIRECT) == -1) {
		int	errno_save;
		errno_save = (int)errno;
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Failed to set O_DIRECT "
			"on file %s: %s: %s, continuing anyway\n",
			file_name, operation_name, strerror(errno_save));
		if (errno_save == EINVAL) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: O_DIRECT is known to result in "
				"'Invalid argument' on Linux on tmpfs, "
				"see MySQL Bug#26662\n");
		}
	}
#endif
}
1183 /********************************************************************
1184 Opens an existing file or creates a new. */
1186 os_file_t
1187 os_file_create(
1188 /*===========*/
1189 /* out, own: handle to the file, not defined
1190 if error, error number can be retrieved with
1191 os_file_get_last_error */
1192 const char* name, /* in: name of the file or path as a
1193 null-terminated string */
1194 ulint create_mode,/* in: OS_FILE_OPEN if an existing file
1195 is opened (if does not exist, error), or
1196 OS_FILE_CREATE if a new file is created
1197 (if exists, error),
1198 OS_FILE_OVERWRITE if a new file is created
1199 or an old overwritten;
1200 OS_FILE_OPEN_RAW, if a raw device or disk
1201 partition should be opened */
1202 ulint purpose,/* in: OS_FILE_AIO, if asynchronous,
1203 non-buffered i/o is desired,
1204 OS_FILE_NORMAL, if any normal file;
1205 NOTE that it also depends on type, os_aio_..
1206 and srv_.. variables whether we really use
1207 async i/o or unbuffered i/o: look in the
1208 function source code for the exact rules */
1209 ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */
1210 ibool* success)/* out: TRUE if succeed, FALSE if error */
1212 #ifdef __WIN__
1213 os_file_t file;
1214 DWORD share_mode = FILE_SHARE_READ;
1215 DWORD create_flag;
1216 DWORD attributes;
1217 ibool retry;
1219 DBUG_EXECUTE_IF(
1220 "ib_create_table_fail_disk_full",
1221 *success = FALSE;
1222 SetLastError(ERROR_DISK_FULL);
1223 return((os_file_t) -1);
1226 try_again:
1227 ut_a(name);
1229 if (create_mode == OS_FILE_OPEN_RAW) {
1230 create_flag = OPEN_EXISTING;
1231 share_mode = FILE_SHARE_WRITE;
1232 } else if (create_mode == OS_FILE_OPEN
1233 || create_mode == OS_FILE_OPEN_RETRY) {
1234 create_flag = OPEN_EXISTING;
1235 } else if (create_mode == OS_FILE_CREATE) {
1236 create_flag = CREATE_NEW;
1237 } else if (create_mode == OS_FILE_OVERWRITE) {
1238 create_flag = CREATE_ALWAYS;
1239 } else {
1240 create_flag = 0;
1241 ut_error;
1244 if (purpose == OS_FILE_AIO) {
1245 /* If specified, use asynchronous (overlapped) io and no
1246 buffering of writes in the OS */
1247 attributes = 0;
1248 #ifdef WIN_ASYNC_IO
1249 if (os_aio_use_native_aio) {
1250 attributes = attributes | FILE_FLAG_OVERLAPPED;
1252 #endif
1253 #ifdef UNIV_NON_BUFFERED_IO
1254 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1255 /* Do not use unbuffered i/o to log files because
1256 value 2 denotes that we do not flush the log at every
1257 commit, but only once per second */
1258 } else if (srv_win_file_flush_method
1259 == SRV_WIN_IO_UNBUFFERED) {
1260 attributes = attributes | FILE_FLAG_NO_BUFFERING;
1262 #endif
1263 } else if (purpose == OS_FILE_NORMAL) {
1264 attributes = 0;
1265 #ifdef UNIV_NON_BUFFERED_IO
1266 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1267 /* Do not use unbuffered i/o to log files because
1268 value 2 denotes that we do not flush the log at every
1269 commit, but only once per second */
1270 } else if (srv_win_file_flush_method
1271 == SRV_WIN_IO_UNBUFFERED) {
1272 attributes = attributes | FILE_FLAG_NO_BUFFERING;
1274 #endif
1275 } else {
1276 attributes = 0;
1277 ut_error;
1280 file = CreateFile((LPCTSTR) name,
1281 GENERIC_READ | GENERIC_WRITE, /* read and write
1282 access */
1283 share_mode, /* File can be read also by other
1284 processes; we must give the read
1285 permission because of ibbackup. We do
1286 not give the write permission to
1287 others because if one would succeed to
1288 start 2 instances of mysqld on the
1289 SAME files, that could cause severe
1290 database corruption! When opening
1291 raw disk partitions, Microsoft manuals
1292 say that we must give also the write
1293 permission. */
1294 NULL, /* default security attributes */
1295 create_flag,
1296 attributes,
1297 NULL); /* no template file */
1299 if (file == INVALID_HANDLE_VALUE) {
1300 *success = FALSE;
1302 /* When srv_file_per_table is on, file creation failure may not
1303 be critical to the whole instance. Do not crash the server in
1304 case of unknown errors. */
1305 if (srv_file_per_table) {
1306 retry = os_file_handle_error_no_exit(name,
1307 create_mode == OS_FILE_CREATE ?
1308 "create" : "open");
1309 } else {
1310 retry = os_file_handle_error(name,
1311 create_mode == OS_FILE_CREATE ?
1312 "create" : "open");
1315 if (retry) {
1316 goto try_again;
1318 } else {
1319 *success = TRUE;
1322 return(file);
1323 #else /* __WIN__ */
1324 os_file_t file;
1325 int create_flag;
1326 ibool retry;
1327 const char* mode_str = NULL;
1329 DBUG_EXECUTE_IF(
1330 "ib_create_table_fail_disk_full",
1331 *success = FALSE;
1332 errno = ENOSPC;
1333 return((os_file_t) -1);
1336 try_again:
1337 ut_a(name);
1339 if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1340 || create_mode == OS_FILE_OPEN_RETRY) {
1341 mode_str = "OPEN";
1342 create_flag = O_RDWR;
1343 } else if (create_mode == OS_FILE_CREATE) {
1344 mode_str = "CREATE";
1345 create_flag = O_RDWR | O_CREAT | O_EXCL;
1346 } else if (create_mode == OS_FILE_OVERWRITE) {
1347 mode_str = "OVERWRITE";
1348 create_flag = O_RDWR | O_CREAT | O_TRUNC;
1349 } else {
1350 create_flag = 0;
1351 ut_error;
1354 ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1355 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1357 #ifdef O_SYNC
1358 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1359 O_SYNC because the datasync options seemed to corrupt files in 2001
1360 in both Linux and Solaris */
1361 if (type == OS_LOG_FILE
1362 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1364 # if 0
1365 fprintf(stderr, "Using O_SYNC for file %s\n", name);
1366 # endif
1368 create_flag = create_flag | O_SYNC;
1370 #endif /* O_SYNC */
1372 file = open(name, create_flag, os_innodb_umask);
1374 if (file == -1) {
1375 *success = FALSE;
1377 /* When srv_file_per_table is on, file creation failure may not
1378 be critical to the whole instance. Do not crash the server in
1379 case of unknown errors. */
1380 if (srv_file_per_table) {
1381 retry = os_file_handle_error_no_exit(name,
1382 create_mode == OS_FILE_CREATE ?
1383 "create" : "open");
1384 } else {
1385 retry = os_file_handle_error(name,
1386 create_mode == OS_FILE_CREATE ?
1387 "create" : "open");
1390 if (retry) {
1391 goto try_again;
1392 } else {
1393 return(file /* -1 */);
1396 /* else */
1398 *success = TRUE;
1400 /* We disable OS caching (O_DIRECT) only on data files */
1401 if (type != OS_LOG_FILE
1402 && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1404 os_file_set_nocache(file, name, mode_str);
1407 #ifdef USE_FILE_LOCK
1408 if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1410 if (create_mode == OS_FILE_OPEN_RETRY) {
1411 int i;
1412 ut_print_timestamp(stderr);
1413 fputs(" InnoDB: Retrying to lock"
1414 " the first data file\n",
1415 stderr);
1416 for (i = 0; i < 100; i++) {
1417 os_thread_sleep(1000000);
1418 if (!os_file_lock(file, name)) {
1419 *success = TRUE;
1420 return(file);
1423 ut_print_timestamp(stderr);
1424 fputs(" InnoDB: Unable to open the first data file\n",
1425 stderr);
1428 *success = FALSE;
1429 close(file);
1430 file = -1;
1432 #endif /* USE_FILE_LOCK */
1434 return(file);
1435 #endif /* __WIN__ */
1438 /***************************************************************************
1439 Deletes a file if it exists. The file has to be closed before calling this. */
1441 ibool
1442 os_file_delete_if_exists(
1443 /*=====================*/
1444 /* out: TRUE if success */
1445 const char* name) /* in: file path as a null-terminated string */
1447 #ifdef __WIN__
1448 BOOL ret;
1449 ulint count = 0;
1450 loop:
1451 /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1452 it */
1454 ret = DeleteFile((LPCTSTR)name);
1456 if (ret) {
1457 return(TRUE);
1460 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1461 /* the file does not exist, this not an error */
1463 return(TRUE);
1466 count++;
1468 if (count > 100 && 0 == (count % 10)) {
1469 fprintf(stderr,
1470 "InnoDB: Warning: cannot delete file %s\n"
1471 "InnoDB: Are you running ibbackup"
1472 " to back up the file?\n", name);
1474 os_file_get_last_error(TRUE); /* print error information */
1477 os_thread_sleep(1000000); /* sleep for a second */
1479 if (count > 2000) {
1481 return(FALSE);
1484 goto loop;
1485 #else
1486 int ret;
1488 ret = unlink((const char*)name);
1490 if (ret != 0 && errno != ENOENT) {
1491 os_file_handle_error_no_exit(name, "delete");
1493 return(FALSE);
1496 return(TRUE);
1497 #endif
1500 /***************************************************************************
1501 Deletes a file. The file has to be closed before calling this. */
1503 ibool
1504 os_file_delete(
1505 /*===========*/
1506 /* out: TRUE if success */
1507 const char* name) /* in: file path as a null-terminated string */
1509 #ifdef __WIN__
1510 BOOL ret;
1511 ulint count = 0;
1512 loop:
1513 /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1514 it */
1516 ret = DeleteFile((LPCTSTR)name);
1518 if (ret) {
1519 return(TRUE);
1522 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1523 /* If the file does not exist, we classify this as a 'mild'
1524 error and return */
1526 return(FALSE);
1529 count++;
1531 if (count > 100 && 0 == (count % 10)) {
1532 fprintf(stderr,
1533 "InnoDB: Warning: cannot delete file %s\n"
1534 "InnoDB: Are you running ibbackup"
1535 " to back up the file?\n", name);
1537 os_file_get_last_error(TRUE); /* print error information */
1540 os_thread_sleep(1000000); /* sleep for a second */
1542 if (count > 2000) {
1544 return(FALSE);
1547 goto loop;
1548 #else
1549 int ret;
1551 ret = unlink((const char*)name);
1553 if (ret != 0) {
1554 os_file_handle_error_no_exit(name, "delete");
1556 return(FALSE);
1559 return(TRUE);
1560 #endif
1563 /***************************************************************************
1564 Renames a file (can also move it to another directory). It is safest that the
1565 file is closed before calling this function. */
1567 ibool
1568 os_file_rename(
1569 /*===========*/
1570 /* out: TRUE if success */
1571 const char* oldpath,/* in: old file path as a null-terminated
1572 string */
1573 const char* newpath)/* in: new file path */
1575 #ifdef __WIN__
1576 BOOL ret;
1578 ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1580 if (ret) {
1581 return(TRUE);
1584 os_file_handle_error_no_exit(oldpath, "rename");
1586 return(FALSE);
1587 #else
1588 int ret;
1590 ret = rename((const char*)oldpath, (const char*)newpath);
1592 if (ret != 0) {
1593 os_file_handle_error_no_exit(oldpath, "rename");
1595 return(FALSE);
1598 return(TRUE);
1599 #endif
1602 /***************************************************************************
1603 Closes a file handle. In case of error, error number can be retrieved with
1604 os_file_get_last_error. */
1606 ibool
1607 os_file_close(
1608 /*==========*/
1609 /* out: TRUE if success */
1610 os_file_t file) /* in, own: handle to a file */
1612 #ifdef __WIN__
1613 BOOL ret;
1615 ut_a(file);
1617 ret = CloseHandle(file);
1619 if (ret) {
1620 return(TRUE);
1623 os_file_handle_error(NULL, "close");
1625 return(FALSE);
1626 #else
1627 int ret;
1629 ret = close(file);
1631 if (ret == -1) {
1632 os_file_handle_error(NULL, "close");
1634 return(FALSE);
1637 return(TRUE);
1638 #endif
1641 /***************************************************************************
1642 Closes a file handle. */
1644 ibool
1645 os_file_close_no_error_handling(
1646 /*============================*/
1647 /* out: TRUE if success */
1648 os_file_t file) /* in, own: handle to a file */
1650 #ifdef __WIN__
1651 BOOL ret;
1653 ut_a(file);
1655 ret = CloseHandle(file);
1657 if (ret) {
1658 return(TRUE);
1661 return(FALSE);
1662 #else
1663 int ret;
1665 ret = close(file);
1667 if (ret == -1) {
1669 return(FALSE);
1672 return(TRUE);
1673 #endif
1676 /***************************************************************************
1677 Gets a file size. */
1679 ibool
1680 os_file_get_size(
1681 /*=============*/
1682 /* out: TRUE if success */
1683 os_file_t file, /* in: handle to a file */
1684 ulint* size, /* out: least significant 32 bits of file
1685 size */
1686 ulint* size_high)/* out: most significant 32 bits of size */
1688 #ifdef __WIN__
1689 DWORD high;
1690 DWORD low;
1692 low = GetFileSize(file, &high);
1694 if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1695 return(FALSE);
1698 *size = low;
1699 *size_high = high;
1701 return(TRUE);
1702 #else
1703 off_t offs;
1705 offs = lseek(file, 0, SEEK_END);
1707 if (offs == ((off_t)-1)) {
1709 return(FALSE);
1712 if (sizeof(off_t) > 4) {
1713 *size = (ulint)(offs & 0xFFFFFFFFUL);
1714 *size_high = (ulint)(offs >> 32);
1715 } else {
1716 *size = (ulint) offs;
1717 *size_high = 0;
1720 return(TRUE);
1721 #endif
1724 /***************************************************************************
1725 Gets file size as a 64-bit integer ib_longlong. */
1727 ib_longlong
1728 os_file_get_size_as_iblonglong(
1729 /*===========================*/
1730 /* out: size in bytes, -1 if error */
1731 os_file_t file) /* in: handle to a file */
1733 ulint size;
1734 ulint size_high;
1735 ibool success;
1737 success = os_file_get_size(file, &size, &size_high);
1739 if (!success) {
1741 return(-1);
1744 return((((ib_longlong)size_high) << 32) + (ib_longlong)size);
1747 /***************************************************************************
1748 Write the specified number of zeros to a newly created file. */
1750 ibool
1751 os_file_set_size(
1752 /*=============*/
1753 /* out: TRUE if success */
1754 const char* name, /* in: name of the file or path as a
1755 null-terminated string */
1756 os_file_t file, /* in: handle to a file */
1757 ulint size, /* in: least significant 32 bits of file
1758 size */
1759 ulint size_high)/* in: most significant 32 bits of size */
1761 ib_longlong current_size;
1762 ib_longlong desired_size;
1763 ibool ret;
1764 byte* buf;
1765 byte* buf2;
1766 ulint buf_size;
1768 ut_a(size == (size & 0xFFFFFFFF));
1770 current_size = 0;
1771 desired_size = (ib_longlong)size + (((ib_longlong)size_high) << 32);
1773 /* Write up to 1 megabyte at a time. */
1774 buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1775 * UNIV_PAGE_SIZE;
1776 buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
1778 /* Align the buffer for possible raw i/o */
1779 buf = ut_align(buf2, UNIV_PAGE_SIZE);
1781 /* Write buffer full of zeros */
1782 memset(buf, 0, buf_size);
1784 if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1786 fprintf(stderr, "InnoDB: Progress in MB:");
1789 while (current_size < desired_size) {
1790 ulint n_bytes;
1792 if (desired_size - current_size < (ib_longlong) buf_size) {
1793 n_bytes = (ulint) (desired_size - current_size);
1794 } else {
1795 n_bytes = buf_size;
1798 ret = os_file_write(name, file, buf,
1799 (ulint)(current_size & 0xFFFFFFFF),
1800 (ulint)(current_size >> 32),
1801 n_bytes);
1802 if (!ret) {
1803 ut_free(buf2);
1804 goto error_handling;
1807 /* Print about progress for each 100 MB written */
1808 if ((ib_longlong) (current_size + n_bytes) / (ib_longlong)(100 * 1024 * 1024)
1809 != current_size / (ib_longlong)(100 * 1024 * 1024)) {
1811 fprintf(stderr, " %lu00",
1812 (ulong) ((current_size + n_bytes)
1813 / (ib_longlong)(100 * 1024 * 1024)));
1816 current_size += n_bytes;
1819 if (desired_size >= (ib_longlong)(100 * 1024 * 1024)) {
1821 fprintf(stderr, "\n");
1824 ut_free(buf2);
1826 ret = os_file_flush(file);
1828 if (ret) {
1829 return(TRUE);
1832 error_handling:
1833 return(FALSE);
1836 /***************************************************************************
1837 Truncates a file at its current position. */
1839 ibool
1840 os_file_set_eof(
1841 /*============*/
1842 /* out: TRUE if success */
1843 FILE* file) /* in: file to be truncated */
1845 #ifdef __WIN__
1846 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
1847 return(SetEndOfFile(h));
1848 #else /* __WIN__ */
1849 return(!ftruncate(fileno(file), ftell(file)));
1850 #endif /* __WIN__ */
1853 #ifndef __WIN__
1854 /***************************************************************************
1855 Wrapper to fsync(2) that retries the call on some errors.
1856 Returns the value 0 if successful; otherwise the value -1 is returned and
1857 the global variable errno is set to indicate the error. */
1859 static
1861 os_file_fsync(
1862 /*==========*/
1863 /* out: 0 if success, -1 otherwise */
1864 os_file_t file) /* in: handle to a file */
1866 int ret;
1867 int failures;
1868 ibool retry;
1870 failures = 0;
1872 do {
1873 ret = fsync(file);
1875 os_n_fsyncs++;
1877 if (ret == -1 && errno == ENOLCK) {
1879 if (failures % 100 == 0) {
1881 ut_print_timestamp(stderr);
1882 fprintf(stderr,
1883 " InnoDB: fsync(): "
1884 "No locks available; retrying\n");
1887 os_thread_sleep(200000 /* 0.2 sec */);
1889 failures++;
1891 retry = TRUE;
1892 } else {
1894 retry = FALSE;
1896 } while (retry);
1898 return(ret);
1900 #endif /* !__WIN__ */
1902 /***************************************************************************
1903 Flushes the write buffers of a given file to the disk. */
1905 ibool
1906 os_file_flush(
1907 /*==========*/
1908 /* out: TRUE if success */
1909 os_file_t file) /* in, own: handle to a file */
1911 #ifdef __WIN__
1912 BOOL ret;
1914 ut_a(file);
1916 os_n_fsyncs++;
1918 ret = FlushFileBuffers(file);
1920 if (ret) {
1921 return(TRUE);
1924 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
1925 actually a raw device, we choose to ignore that error if we are using
1926 raw disks */
1928 if (srv_start_raw_disk_in_use && GetLastError()
1929 == ERROR_INVALID_FUNCTION) {
1930 return(TRUE);
1933 os_file_handle_error(NULL, "flush");
1935 /* It is a fatal error if a file flush does not succeed, because then
1936 the database can get corrupt on disk */
1937 ut_error;
1939 return(FALSE);
1940 #else
1941 int ret;
1943 #if defined(HAVE_DARWIN_THREADS)
1944 # ifndef F_FULLFSYNC
1945 /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
1946 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
1947 # elif F_FULLFSYNC != 51
1948 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
1949 # endif
1950 /* Apple has disabled fsync() for internal disk drives in OS X. That
1951 caused corruption for a user when he tested a power outage. Let us in
1952 OS X use a nonstandard flush method recommended by an Apple
1953 engineer. */
1955 if (!srv_have_fullfsync) {
1956 /* If we are not on an operating system that supports this,
1957 then fall back to a plain fsync. */
1959 ret = os_file_fsync(file);
1960 } else {
1961 ret = fcntl(file, F_FULLFSYNC, NULL);
1963 if (ret) {
1964 /* If we are not on a file system that supports this,
1965 then fall back to a plain fsync. */
1966 ret = os_file_fsync(file);
1969 #else
1970 ret = os_file_fsync(file);
1971 #endif
1973 if (ret == 0) {
1974 return(TRUE);
1977 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
1978 we choose to ignore that error if we are using raw disks */
1980 if (srv_start_raw_disk_in_use && errno == EINVAL) {
1982 return(TRUE);
1985 ut_print_timestamp(stderr);
1987 fprintf(stderr,
1988 " InnoDB: Error: the OS said file flush did not succeed\n");
1990 os_file_handle_error(NULL, "flush");
1992 /* It is a fatal error if a file flush does not succeed, because then
1993 the database can get corrupt on disk */
1994 ut_error;
1996 return(FALSE);
1997 #endif
2000 #ifndef __WIN__
2001 /***********************************************************************
2002 Does a synchronous read operation in Posix. */
2003 static
2004 ssize_t
2005 os_file_pread(
2006 /*==========*/
2007 /* out: number of bytes read, -1 if error */
2008 os_file_t file, /* in: handle to a file */
2009 void* buf, /* in: buffer where to read */
2010 ulint n, /* in: number of bytes to read */
2011 ulint offset, /* in: least significant 32 bits of file
2012 offset from where to read */
2013 ulint offset_high) /* in: most significant 32 bits of
2014 offset */
2016 off_t offs;
2017 ssize_t n_bytes;
2019 ut_a((offset & 0xFFFFFFFFUL) == offset);
2021 /* If off_t is > 4 bytes in size, then we assume we can pass a
2022 64-bit address */
2024 if (sizeof(off_t) > 4) {
2025 offs = (off_t)offset + (((off_t)offset_high) << 32);
2027 } else {
2028 offs = (off_t)offset;
2030 if (offset_high > 0) {
2031 fprintf(stderr,
2032 "InnoDB: Error: file read at offset > 4 GB\n");
2036 os_n_file_reads++;
2038 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2039 os_mutex_enter(os_file_count_mutex);
2040 os_file_n_pending_preads++;
2041 os_n_pending_reads++;
2042 os_mutex_exit(os_file_count_mutex);
2044 n_bytes = pread(file, buf, (ssize_t)n, offs);
2046 os_mutex_enter(os_file_count_mutex);
2047 os_file_n_pending_preads--;
2048 os_n_pending_reads--;
2049 os_mutex_exit(os_file_count_mutex);
2051 return(n_bytes);
2052 #else
2054 off_t ret_offset;
2055 ssize_t ret;
2056 ulint i;
2058 os_mutex_enter(os_file_count_mutex);
2059 os_n_pending_reads++;
2060 os_mutex_exit(os_file_count_mutex);
2062 /* Protect the seek / read operation with a mutex */
2063 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2065 os_mutex_enter(os_file_seek_mutexes[i]);
2067 ret_offset = lseek(file, offs, SEEK_SET);
2069 if (ret_offset < 0) {
2070 ret = -1;
2071 } else {
2072 ret = read(file, buf, (ssize_t)n);
2075 os_mutex_exit(os_file_seek_mutexes[i]);
2077 os_mutex_enter(os_file_count_mutex);
2078 os_n_pending_reads--;
2079 os_mutex_exit(os_file_count_mutex);
2081 return(ret);
2083 #endif
2086 /***********************************************************************
2087 Does a synchronous write operation in Posix. */
2088 static
2089 ssize_t
2090 os_file_pwrite(
2091 /*===========*/
2092 /* out: number of bytes written, -1 if error */
2093 os_file_t file, /* in: handle to a file */
2094 const void* buf, /* in: buffer from where to write */
2095 ulint n, /* in: number of bytes to write */
2096 ulint offset, /* in: least significant 32 bits of file
2097 offset where to write */
2098 ulint offset_high) /* in: most significant 32 bits of
2099 offset */
2101 ssize_t ret;
2102 off_t offs;
2104 ut_a((offset & 0xFFFFFFFFUL) == offset);
2106 /* If off_t is > 4 bytes in size, then we assume we can pass a
2107 64-bit address */
2109 if (sizeof(off_t) > 4) {
2110 offs = (off_t)offset + (((off_t)offset_high) << 32);
2111 } else {
2112 offs = (off_t)offset;
2114 if (offset_high > 0) {
2115 fprintf(stderr,
2116 "InnoDB: Error: file write"
2117 " at offset > 4 GB\n");
2121 os_n_file_writes++;
2123 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2124 os_mutex_enter(os_file_count_mutex);
2125 os_file_n_pending_pwrites++;
2126 os_n_pending_writes++;
2127 os_mutex_exit(os_file_count_mutex);
2129 ret = pwrite(file, buf, (ssize_t)n, offs);
2131 os_mutex_enter(os_file_count_mutex);
2132 os_file_n_pending_pwrites--;
2133 os_n_pending_writes--;
2134 os_mutex_exit(os_file_count_mutex);
2136 # ifdef UNIV_DO_FLUSH
2137 if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2138 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2139 && !os_do_not_call_flush_at_each_write) {
2141 /* Always do fsync to reduce the probability that when
2142 the OS crashes, a database page is only partially
2143 physically written to disk. */
2145 ut_a(TRUE == os_file_flush(file));
2147 # endif /* UNIV_DO_FLUSH */
2149 return(ret);
2150 #else
2152 off_t ret_offset;
2153 ulint i;
2155 os_mutex_enter(os_file_count_mutex);
2156 os_n_pending_writes++;
2157 os_mutex_exit(os_file_count_mutex);
2159 /* Protect the seek / write operation with a mutex */
2160 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2162 os_mutex_enter(os_file_seek_mutexes[i]);
2164 ret_offset = lseek(file, offs, SEEK_SET);
2166 if (ret_offset < 0) {
2167 ret = -1;
2169 goto func_exit;
2172 ret = write(file, buf, (ssize_t)n);
2174 # ifdef UNIV_DO_FLUSH
2175 if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2176 && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2177 && !os_do_not_call_flush_at_each_write) {
2179 /* Always do fsync to reduce the probability that when
2180 the OS crashes, a database page is only partially
2181 physically written to disk. */
2183 ut_a(TRUE == os_file_flush(file));
2185 # endif /* UNIV_DO_FLUSH */
2187 func_exit:
2188 os_mutex_exit(os_file_seek_mutexes[i]);
2190 os_mutex_enter(os_file_count_mutex);
2191 os_n_pending_writes--;
2192 os_mutex_exit(os_file_count_mutex);
2194 return(ret);
2196 #endif
2198 #endif
2200 /***********************************************************************
2201 Requests a synchronous positioned read operation. */
2203 ibool
2204 os_file_read(
2205 /*=========*/
2206 /* out: TRUE if request was
2207 successful, FALSE if fail */
2208 os_file_t file, /* in: handle to a file */
2209 void* buf, /* in: buffer where to read */
2210 ulint offset, /* in: least significant 32 bits of file
2211 offset where to read */
2212 ulint offset_high, /* in: most significant 32 bits of
2213 offset */
2214 ulint n) /* in: number of bytes to read */
2216 #ifdef __WIN__
2217 BOOL ret;
2218 DWORD len;
2219 DWORD ret2;
2220 DWORD low;
2221 DWORD high;
2222 ibool retry;
2223 ulint i;
2225 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2226 no more than 32 bits. */
2227 ut_a((offset & 0xFFFFFFFFUL) == offset);
2228 ut_a((n & 0xFFFFFFFFUL) == n);
2230 os_n_file_reads++;
2231 os_bytes_read_since_printout += n;
2233 try_again:
2234 ut_ad(file);
2235 ut_ad(buf);
2236 ut_ad(n > 0);
2238 low = (DWORD) offset;
2239 high = (DWORD) offset_high;
2241 os_mutex_enter(os_file_count_mutex);
2242 os_n_pending_reads++;
2243 os_mutex_exit(os_file_count_mutex);
2245 /* Protect the seek / read operation with a mutex */
2246 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2248 os_mutex_enter(os_file_seek_mutexes[i]);
2250 ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2252 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2254 os_mutex_exit(os_file_seek_mutexes[i]);
2256 os_mutex_enter(os_file_count_mutex);
2257 os_n_pending_reads--;
2258 os_mutex_exit(os_file_count_mutex);
2260 goto error_handling;
2263 ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2265 os_mutex_exit(os_file_seek_mutexes[i]);
2267 os_mutex_enter(os_file_count_mutex);
2268 os_n_pending_reads--;
2269 os_mutex_exit(os_file_count_mutex);
2271 if (ret && len == n) {
2272 return(TRUE);
2274 #else
2275 ibool retry;
2276 ssize_t ret;
2278 os_bytes_read_since_printout += n;
2280 try_again:
2281 ret = os_file_pread(file, buf, n, offset, offset_high);
2283 if ((ulint)ret == n) {
2285 return(TRUE);
2288 fprintf(stderr,
2289 "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2290 "InnoDB: Was only able to read %ld.\n",
2291 (ulong)n, (ulong)offset_high,
2292 (ulong)offset, (long)ret);
2293 #endif
2294 #ifdef __WIN__
2295 error_handling:
2296 #endif
2297 retry = os_file_handle_error(NULL, "read");
2299 if (retry) {
2300 goto try_again;
2303 fprintf(stderr,
2304 "InnoDB: Fatal error: cannot read from file."
2305 " OS error number %lu.\n",
2306 #ifdef __WIN__
2307 (ulong) GetLastError()
2308 #else
2309 (ulong) errno
2310 #endif
2312 fflush(stderr);
2314 ut_error;
2316 return(FALSE);
2319 /***********************************************************************
2320 Requests a synchronous positioned read operation. This function does not do
2321 any error handling. In case of error it returns FALSE. */
2323 ibool
2324 os_file_read_no_error_handling(
2325 /*===========================*/
2326 /* out: TRUE if request was
2327 successful, FALSE if fail */
2328 os_file_t file, /* in: handle to a file */
2329 void* buf, /* in: buffer where to read */
2330 ulint offset, /* in: least significant 32 bits of file
2331 offset where to read */
2332 ulint offset_high, /* in: most significant 32 bits of
2333 offset */
2334 ulint n) /* in: number of bytes to read */
2336 #ifdef __WIN__
2337 BOOL ret;
2338 DWORD len;
2339 DWORD ret2;
2340 DWORD low;
2341 DWORD high;
2342 ibool retry;
2343 ulint i;
2345 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2346 no more than 32 bits. */
2347 ut_a((offset & 0xFFFFFFFFUL) == offset);
2348 ut_a((n & 0xFFFFFFFFUL) == n);
2350 os_n_file_reads++;
2351 os_bytes_read_since_printout += n;
2353 try_again:
2354 ut_ad(file);
2355 ut_ad(buf);
2356 ut_ad(n > 0);
2358 low = (DWORD) offset;
2359 high = (DWORD) offset_high;
2361 os_mutex_enter(os_file_count_mutex);
2362 os_n_pending_reads++;
2363 os_mutex_exit(os_file_count_mutex);
2365 /* Protect the seek / read operation with a mutex */
2366 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2368 os_mutex_enter(os_file_seek_mutexes[i]);
2370 ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2372 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2374 os_mutex_exit(os_file_seek_mutexes[i]);
2376 os_mutex_enter(os_file_count_mutex);
2377 os_n_pending_reads--;
2378 os_mutex_exit(os_file_count_mutex);
2380 goto error_handling;
2383 ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2385 os_mutex_exit(os_file_seek_mutexes[i]);
2387 os_mutex_enter(os_file_count_mutex);
2388 os_n_pending_reads--;
2389 os_mutex_exit(os_file_count_mutex);
2391 if (ret && len == n) {
2392 return(TRUE);
2394 #else
2395 ibool retry;
2396 ssize_t ret;
2398 os_bytes_read_since_printout += n;
2400 try_again:
2401 ret = os_file_pread(file, buf, n, offset, offset_high);
2403 if ((ulint)ret == n) {
2405 return(TRUE);
2407 #endif
2408 #ifdef __WIN__
2409 error_handling:
2410 #endif
2411 retry = os_file_handle_error_no_exit(NULL, "read");
2413 if (retry) {
2414 goto try_again;
2417 return(FALSE);
2420 /***********************************************************************
2421 Rewind file to its start, read at most size - 1 bytes from it to str, and
2422 NUL-terminate str. All errors are silently ignored. This function is
2423 mostly meant to be used with temporary files. */
2425 void
2426 os_file_read_string(
2427 /*================*/
2428 FILE* file, /* in: file to read from */
2429 char* str, /* in: buffer where to read */
2430 ulint size) /* in: size of buffer */
2432 size_t flen;
2434 if (size == 0) {
2435 return;
2438 rewind(file);
2439 flen = fread(str, 1, size - 1, file);
2440 str[flen] = '\0';
2443 /***********************************************************************
2444 Requests a synchronous write operation. */
2446 ibool
2447 os_file_write(
2448 /*==========*/
2449 /* out: TRUE if request was
2450 successful, FALSE if fail */
2451 const char* name, /* in: name of the file or path as a
2452 null-terminated string */
2453 os_file_t file, /* in: handle to a file */
2454 const void* buf, /* in: buffer from which to write */
2455 ulint offset, /* in: least significant 32 bits of file
2456 offset where to write */
2457 ulint offset_high, /* in: most significant 32 bits of
2458 offset */
2459 ulint n) /* in: number of bytes to write */
2461 #ifdef __WIN__
2462 BOOL ret;
2463 DWORD len;
2464 DWORD ret2;
2465 DWORD low;
2466 DWORD high;
2467 ulint i;
2468 ulint n_retries = 0;
2469 ulint err;
2471 /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2472 no more than 32 bits. */
2473 ut_a((offset & 0xFFFFFFFFUL) == offset);
2474 ut_a((n & 0xFFFFFFFFUL) == n);
2476 os_n_file_writes++;
2478 ut_ad(file);
2479 ut_ad(buf);
2480 ut_ad(n > 0);
2481 retry:
2482 low = (DWORD) offset;
2483 high = (DWORD) offset_high;
2485 os_mutex_enter(os_file_count_mutex);
2486 os_n_pending_writes++;
2487 os_mutex_exit(os_file_count_mutex);
2489 /* Protect the seek / write operation with a mutex */
2490 i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2492 os_mutex_enter(os_file_seek_mutexes[i]);
2494 ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2496 if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2498 os_mutex_exit(os_file_seek_mutexes[i]);
2500 os_mutex_enter(os_file_count_mutex);
2501 os_n_pending_writes--;
2502 os_mutex_exit(os_file_count_mutex);
2504 ut_print_timestamp(stderr);
2506 fprintf(stderr,
2507 " InnoDB: Error: File pointer positioning to"
2508 " file %s failed at\n"
2509 "InnoDB: offset %lu %lu. Operating system"
2510 " error number %lu.\n"
2511 "InnoDB: Some operating system error numbers"
2512 " are described at\n"
2513 "InnoDB: "
2514 "http://dev.mysql.com/doc/refman/5.1/en/"
2515 "operating-system-error-codes.html\n",
2516 name, (ulong) offset_high, (ulong) offset,
2517 (ulong) GetLastError());
2519 return(FALSE);
2522 ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2524 /* Always do fsync to reduce the probability that when the OS crashes,
2525 a database page is only partially physically written to disk. */
2527 # ifdef UNIV_DO_FLUSH
2528 if (!os_do_not_call_flush_at_each_write) {
2529 ut_a(TRUE == os_file_flush(file));
2531 # endif /* UNIV_DO_FLUSH */
2533 os_mutex_exit(os_file_seek_mutexes[i]);
2535 os_mutex_enter(os_file_count_mutex);
2536 os_n_pending_writes--;
2537 os_mutex_exit(os_file_count_mutex);
2539 if (ret && len == n) {
2541 return(TRUE);
2544 /* If some background file system backup tool is running, then, at
2545 least in Windows 2000, we may get here a specific error. Let us
2546 retry the operation 100 times, with 1 second waits. */
2548 if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2550 os_thread_sleep(1000000);
2552 n_retries++;
2554 goto retry;
2557 if (!os_has_said_disk_full) {
2559 err = (ulint)GetLastError();
2561 ut_print_timestamp(stderr);
2563 fprintf(stderr,
2564 " InnoDB: Error: Write to file %s failed"
2565 " at offset %lu %lu.\n"
2566 "InnoDB: %lu bytes should have been written,"
2567 " only %lu were written.\n"
2568 "InnoDB: Operating system error number %lu.\n"
2569 "InnoDB: Check that your OS and file system"
2570 " support files of this size.\n"
2571 "InnoDB: Check also that the disk is not full"
2572 " or a disk quota exceeded.\n",
2573 name, (ulong) offset_high, (ulong) offset,
2574 (ulong) n, (ulong) len, (ulong) err);
2576 if (strerror((int)err) != NULL) {
2577 fprintf(stderr,
2578 "InnoDB: Error number %lu means '%s'.\n",
2579 (ulong) err, strerror((int)err));
2582 fprintf(stderr,
2583 "InnoDB: Some operating system error numbers"
2584 " are described at\n"
2585 "InnoDB: "
2586 "http://dev.mysql.com/doc/refman/5.1/en/"
2587 "operating-system-error-codes.html\n");
2589 os_has_said_disk_full = TRUE;
2592 return(FALSE);
2593 #else
2594 ssize_t ret;
2596 ret = os_file_pwrite(file, buf, n, offset, offset_high);
2598 if ((ulint)ret == n) {
2600 return(TRUE);
2603 if (!os_has_said_disk_full) {
2605 ut_print_timestamp(stderr);
2607 fprintf(stderr,
2608 " InnoDB: Error: Write to file %s failed"
2609 " at offset %lu %lu.\n"
2610 "InnoDB: %lu bytes should have been written,"
2611 " only %ld were written.\n"
2612 "InnoDB: Operating system error number %lu.\n"
2613 "InnoDB: Check that your OS and file system"
2614 " support files of this size.\n"
2615 "InnoDB: Check also that the disk is not full"
2616 " or a disk quota exceeded.\n",
2617 name, offset_high, offset, n, (long int)ret,
2618 (ulint)errno);
2619 if (strerror(errno) != NULL) {
2620 fprintf(stderr,
2621 "InnoDB: Error number %lu means '%s'.\n",
2622 (ulint)errno, strerror(errno));
2625 fprintf(stderr,
2626 "InnoDB: Some operating system error numbers"
2627 " are described at\n"
2628 "InnoDB: "
2629 "http://dev.mysql.com/doc/refman/5.1/en/"
2630 "operating-system-error-codes.html\n");
2632 os_has_said_disk_full = TRUE;
2635 return(FALSE);
2636 #endif
2639 /***********************************************************************
2640 Check the existence and type of the given file. */
2642 ibool
2643 os_file_status(
2644 /*===========*/
2645 /* out: TRUE if call succeeded */
2646 const char* path, /* in: pathname of the file */
2647 ibool* exists, /* out: TRUE if file exists */
2648 os_file_type_t* type) /* out: type of the file (if it exists) */
2650 #ifdef __WIN__
2651 int ret;
2652 struct _stat statinfo;
2654 ret = _stat(path, &statinfo);
2655 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2656 /* file does not exist */
2657 *exists = FALSE;
2658 return(TRUE);
2659 } else if (ret) {
2660 /* file exists, but stat call failed */
2662 os_file_handle_error_no_exit(path, "stat");
2664 return(FALSE);
2667 if (_S_IFDIR & statinfo.st_mode) {
2668 *type = OS_FILE_TYPE_DIR;
2669 } else if (_S_IFREG & statinfo.st_mode) {
2670 *type = OS_FILE_TYPE_FILE;
2671 } else {
2672 *type = OS_FILE_TYPE_UNKNOWN;
2675 *exists = TRUE;
2677 return(TRUE);
2678 #else
2679 int ret;
2680 struct stat statinfo;
2682 ret = stat(path, &statinfo);
2683 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2684 /* file does not exist */
2685 *exists = FALSE;
2686 return(TRUE);
2687 } else if (ret) {
2688 /* file exists, but stat call failed */
2690 os_file_handle_error_no_exit(path, "stat");
2692 return(FALSE);
2695 if (S_ISDIR(statinfo.st_mode)) {
2696 *type = OS_FILE_TYPE_DIR;
2697 } else if (S_ISLNK(statinfo.st_mode)) {
2698 *type = OS_FILE_TYPE_LINK;
2699 } else if (S_ISREG(statinfo.st_mode)) {
2700 *type = OS_FILE_TYPE_FILE;
2701 } else {
2702 *type = OS_FILE_TYPE_UNKNOWN;
2705 *exists = TRUE;
2707 return(TRUE);
2708 #endif
2711 /***********************************************************************
2712 This function returns information about the specified file */
2714 ibool
2715 os_file_get_status(
2716 /*===============*/
2717 /* out: TRUE if stat
2718 information found */
2719 const char* path, /* in: pathname of the file */
2720 os_file_stat_t* stat_info) /* information of a file in a
2721 directory */
2723 #ifdef __WIN__
2724 int ret;
2725 struct _stat statinfo;
2727 ret = _stat(path, &statinfo);
2728 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2729 /* file does not exist */
2731 return(FALSE);
2732 } else if (ret) {
2733 /* file exists, but stat call failed */
2735 os_file_handle_error_no_exit(path, "stat");
2737 return(FALSE);
2739 if (_S_IFDIR & statinfo.st_mode) {
2740 stat_info->type = OS_FILE_TYPE_DIR;
2741 } else if (_S_IFREG & statinfo.st_mode) {
2742 stat_info->type = OS_FILE_TYPE_FILE;
2743 } else {
2744 stat_info->type = OS_FILE_TYPE_UNKNOWN;
2747 stat_info->ctime = statinfo.st_ctime;
2748 stat_info->atime = statinfo.st_atime;
2749 stat_info->mtime = statinfo.st_mtime;
2750 stat_info->size = statinfo.st_size;
2752 return(TRUE);
2753 #else
2754 int ret;
2755 struct stat statinfo;
2757 ret = stat(path, &statinfo);
2759 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2760 /* file does not exist */
2762 return(FALSE);
2763 } else if (ret) {
2764 /* file exists, but stat call failed */
2766 os_file_handle_error_no_exit(path, "stat");
2768 return(FALSE);
2771 if (S_ISDIR(statinfo.st_mode)) {
2772 stat_info->type = OS_FILE_TYPE_DIR;
2773 } else if (S_ISLNK(statinfo.st_mode)) {
2774 stat_info->type = OS_FILE_TYPE_LINK;
2775 } else if (S_ISREG(statinfo.st_mode)) {
2776 stat_info->type = OS_FILE_TYPE_FILE;
2777 } else {
2778 stat_info->type = OS_FILE_TYPE_UNKNOWN;
2781 stat_info->ctime = statinfo.st_ctime;
2782 stat_info->atime = statinfo.st_atime;
2783 stat_info->mtime = statinfo.st_mtime;
2784 stat_info->size = statinfo.st_size;
2786 return(TRUE);
2787 #endif
/* Character that separates the components of a path name: a backslash
when __WIN__ is defined, a forward slash otherwise */
#ifndef __WIN__
# define OS_FILE_PATH_SEPARATOR	'/'
#else
# define OS_FILE_PATH_SEPARATOR	'\\'
#endif
2797 /********************************************************************
2798 The function os_file_dirname returns a directory component of a
2799 null-terminated pathname string. In the usual case, dirname returns
2800 the string up to, but not including, the final '/', and basename
2801 is the component following the final '/'. Trailing '/' charac­
2802 ters are not counted as part of the pathname.
2804 If path does not contain a slash, dirname returns the string ".".
2806 Concatenating the string returned by dirname, a "/", and the basename
2807 yields a complete pathname.
2809 The return value is a copy of the directory component of the pathname.
2810 The copy is allocated from heap. It is the caller responsibility
2811 to free it after it is no longer needed.
2813 The following list of examples (taken from SUSv2) shows the strings
2814 returned by dirname and basename for different paths:
2816 path dirname basename
2817 "/usr/lib" "/usr" "lib"
2818 "/usr/" "/" "usr"
2819 "usr" "." "usr"
2820 "/" "/" "/"
2821 "." "." "."
2822 ".." "." ".."
2825 char*
2826 os_file_dirname(
2827 /*============*/
2828 /* out, own: directory component of the
2829 pathname */
2830 const char* path) /* in: pathname */
2832 /* Find the offset of the last slash */
2833 const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
2834 if (!last_slash) {
2835 /* No slash in the path, return "." */
2837 return(mem_strdup("."));
2840 /* Ok, there is a slash */
2842 if (last_slash == path) {
2843 /* last slash is the first char of the path */
2845 return(mem_strdup("/"));
2848 /* Non-trivial directory component */
2850 return(mem_strdupl(path, last_slash - path));
2853 /********************************************************************
2854 Creates all missing subdirectories along the given path. */
2856 ibool
2857 os_file_create_subdirs_if_needed(
2858 /*=============================*/
2859 /* out: TRUE if call succeeded
2860 FALSE otherwise */
2861 const char* path) /* in: path name */
2863 char* subdir;
2864 ibool success, subdir_exists;
2865 os_file_type_t type;
2867 subdir = os_file_dirname(path);
2868 if (strlen(subdir) == 1
2869 && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
2870 /* subdir is root or cwd, nothing to do */
2871 mem_free(subdir);
2873 return(TRUE);
2876 /* Test if subdir exists */
2877 success = os_file_status(subdir, &subdir_exists, &type);
2878 if (success && !subdir_exists) {
2879 /* subdir does not exist, create it */
2880 success = os_file_create_subdirs_if_needed(subdir);
2881 if (!success) {
2882 mem_free(subdir);
2884 return(FALSE);
2886 success = os_file_create_directory(subdir, FALSE);
2889 mem_free(subdir);
2891 return(success);
2894 /********************************************************************
2895 Returns a pointer to the nth slot in the aio array. */
2896 static
2897 os_aio_slot_t*
2898 os_aio_array_get_nth_slot(
2899 /*======================*/
2900 /* out: pointer to slot */
2901 os_aio_array_t* array, /* in: aio array */
2902 ulint index) /* in: index of the slot */
2904 ut_a(index < array->n_slots);
2906 return((array->slots) + index);
2909 /****************************************************************************
2910 Creates an aio wait array. */
2911 static
2912 os_aio_array_t*
2913 os_aio_array_create(
2914 /*================*/
2915 /* out, own: aio array */
2916 ulint n, /* in: maximum number of pending aio operations
2917 allowed; n must be divisible by n_segments */
2918 ulint n_segments) /* in: number of segments in the aio array */
2920 os_aio_array_t* array;
2921 ulint i;
2922 os_aio_slot_t* slot;
2923 #ifdef WIN_ASYNC_IO
2924 OVERLAPPED* over;
2925 #endif
2926 ut_a(n > 0);
2927 ut_a(n_segments > 0);
2929 array = ut_malloc(sizeof(os_aio_array_t));
2931 array->mutex = os_mutex_create(NULL);
2932 array->not_full = os_event_create(NULL);
2933 array->is_empty = os_event_create(NULL);
2935 os_event_set(array->is_empty);
2937 array->n_slots = n;
2938 array->n_segments = n_segments;
2939 array->n_reserved = 0;
2940 array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
2941 #ifdef __WIN__
2942 array->native_events = ut_malloc(n * sizeof(os_native_event_t));
2943 #endif
2944 for (i = 0; i < n; i++) {
2945 slot = os_aio_array_get_nth_slot(array, i);
2947 slot->pos = i;
2948 slot->reserved = FALSE;
2949 #ifdef WIN_ASYNC_IO
2950 slot->event = os_event_create(NULL);
2952 over = &(slot->control);
2954 over->hEvent = slot->event->handle;
2956 *((array->native_events) + i) = over->hEvent;
2957 #endif
2960 return(array);
2963 /****************************************************************************
2964 Initializes the asynchronous io system. Calls also os_io_init_simple.
2965 Creates a separate aio array for
2966 non-ibuf read and write, a third aio array for the ibuf i/o, with just one
2967 segment, two aio arrays for log reads and writes with one segment, and a
2968 synchronous aio array of the specified size. The combined number of segments
2969 in the three first aio arrays is the parameter n_segments given to the
2970 function. The caller must create an i/o handler thread for each segment in
2971 the four first arrays, but not for the sync aio array. */
2973 void
2974 os_aio_init(
2975 /*========*/
2976 ulint n, /* in: maximum number of pending aio operations
2977 allowed; n must be divisible by n_segments */
2978 ulint n_segments, /* in: combined number of segments in the four
2979 first aio arrays; must be >= 4 */
2980 ulint n_slots_sync) /* in: number of slots in the sync aio array */
2982 ulint n_read_segs;
2983 ulint n_write_segs;
2984 ulint n_per_seg;
2985 ulint i;
2986 #ifdef POSIX_ASYNC_IO
2987 sigset_t sigset;
2988 #endif
2989 ut_ad(n % n_segments == 0);
2990 ut_ad(n_segments >= 4);
2992 os_io_init_simple();
2994 for (i = 0; i < n_segments; i++) {
2995 srv_set_io_thread_op_info(i, "not started yet");
2998 n_per_seg = n / n_segments;
2999 n_write_segs = (n_segments - 2) / 2;
3000 n_read_segs = n_segments - 2 - n_write_segs;
3002 /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3004 os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3006 srv_io_thread_function[0] = "insert buffer thread";
3008 os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3010 srv_io_thread_function[1] = "log thread";
3012 os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3013 n_read_segs);
3014 for (i = 2; i < 2 + n_read_segs; i++) {
3015 ut_a(i < SRV_MAX_N_IO_THREADS);
3016 srv_io_thread_function[i] = "read thread";
3019 os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3020 n_write_segs);
3021 for (i = 2 + n_read_segs; i < n_segments; i++) {
3022 ut_a(i < SRV_MAX_N_IO_THREADS);
3023 srv_io_thread_function[i] = "write thread";
3026 os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3028 os_aio_n_segments = n_segments;
3030 os_aio_validate();
3032 os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
3034 for (i = 0; i < n_segments; i++) {
3035 os_aio_segment_wait_events[i] = os_event_create(NULL);
3038 os_last_printout = time(NULL);
3040 #ifdef POSIX_ASYNC_IO
3041 /* Block aio signals from the current thread and its children:
3042 for this to work, the current thread must be the first created
3043 in the database, so that all its children will inherit its
3044 signal mask */
3046 /* TODO: to work MySQL needs the SIGALARM signal; the following
3047 will not work yet! */
3048 sigemptyset(&sigset);
3049 sigaddset(&sigset, SIGRTMIN + 1 + 0);
3050 sigaddset(&sigset, SIGRTMIN + 1 + 1);
3051 sigaddset(&sigset, SIGRTMIN + 1 + 2);
3052 sigaddset(&sigset, SIGRTMIN + 1 + 3);
3054 pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
3055 #endif
#ifdef WIN_ASYNC_IO
/****************************************************************************
Sets the event of every slot in the array. Used at shutdown to wake up the
async i/o threads that use Windows async i/o. */
static
void
os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
	os_aio_array_t*	array)	/* in: aio array */
{
	ulint	pos = 0;

	while (pos < array->n_slots) {
		os_event_set(array->slots[pos].event);

		pos++;
	}
}
#endif
3077 /****************************************************************************
3078 Wakes up all async i/o threads so that they know to exit themselves in
3079 shutdown. */
3081 void
3082 os_aio_wake_all_threads_at_shutdown(void)
3083 /*=====================================*/
3085 ulint i;
3087 #ifdef WIN_ASYNC_IO
3088 /* This code wakes up all ai/o threads in Windows native aio */
3089 os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3090 os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3091 os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3092 os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3093 #endif
3094 /* This loop wakes up all simulated ai/o threads */
3096 for (i = 0; i < os_aio_n_segments; i++) {
3098 os_event_set(os_aio_segment_wait_events[i]);
3102 /****************************************************************************
3103 Waits until there are no pending writes in os_aio_write_array. There can
3104 be other, synchronous, pending writes. */
3106 void
3107 os_aio_wait_until_no_pending_writes(void)
3108 /*=====================================*/
3110 os_event_wait(os_aio_write_array->is_empty);
3113 /**************************************************************************
3114 Calculates segment number for a slot. */
3115 static
3116 ulint
3117 os_aio_get_segment_no_from_slot(
3118 /*============================*/
3119 /* out: segment number (which is the number
3120 used by, for example, i/o-handler threads) */
3121 os_aio_array_t* array, /* in: aio wait array */
3122 os_aio_slot_t* slot) /* in: slot in this array */
3124 ulint segment;
3125 ulint seg_len;
3127 if (array == os_aio_ibuf_array) {
3128 segment = 0;
3130 } else if (array == os_aio_log_array) {
3131 segment = 1;
3133 } else if (array == os_aio_read_array) {
3134 seg_len = os_aio_read_array->n_slots
3135 / os_aio_read_array->n_segments;
3137 segment = 2 + slot->pos / seg_len;
3138 } else {
3139 ut_a(array == os_aio_write_array);
3140 seg_len = os_aio_write_array->n_slots
3141 / os_aio_write_array->n_segments;
3143 segment = os_aio_read_array->n_segments + 2
3144 + slot->pos / seg_len;
3147 return(segment);
3150 /**************************************************************************
3151 Calculates local segment number and aio array from global segment number. */
3152 static
3153 ulint
3154 os_aio_get_array_and_local_segment(
3155 /*===============================*/
3156 /* out: local segment number within
3157 the aio array */
3158 os_aio_array_t** array, /* out: aio wait array */
3159 ulint global_segment)/* in: global segment number */
3161 ulint segment;
3163 ut_a(global_segment < os_aio_n_segments);
3165 if (global_segment == 0) {
3166 *array = os_aio_ibuf_array;
3167 segment = 0;
3169 } else if (global_segment == 1) {
3170 *array = os_aio_log_array;
3171 segment = 0;
3173 } else if (global_segment < os_aio_read_array->n_segments + 2) {
3174 *array = os_aio_read_array;
3176 segment = global_segment - 2;
3177 } else {
3178 *array = os_aio_write_array;
3180 segment = global_segment - (os_aio_read_array->n_segments + 2);
3183 return(segment);
3186 /***********************************************************************
3187 Gets an integer value designating a specified aio array. This is used
3188 to give numbers to signals in Posix aio. */
3190 #if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO)
3191 static
3192 ulint
3193 os_aio_get_array_no(
3194 /*================*/
3195 os_aio_array_t* array) /* in: aio array */
3197 if (array == os_aio_ibuf_array) {
3199 return(0);
3201 } else if (array == os_aio_log_array) {
3203 return(1);
3205 } else if (array == os_aio_read_array) {
3207 return(2);
3208 } else if (array == os_aio_write_array) {
3210 return(3);
3211 } else {
3212 ut_error;
3214 return(0);
3218 /***********************************************************************
3219 Gets the aio array for its number. */
3220 static
3221 os_aio_array_t*
3222 os_aio_get_array_from_no(
3223 /*=====================*/
3224 /* out: aio array */
3225 ulint n) /* in: array number */
3227 if (n == 0) {
3228 return(os_aio_ibuf_array);
3229 } else if (n == 1) {
3231 return(os_aio_log_array);
3232 } else if (n == 2) {
3234 return(os_aio_read_array);
3235 } else if (n == 3) {
3237 return(os_aio_write_array);
3238 } else {
3239 ut_error;
3241 return(NULL);
3244 #endif /* if !defined(WIN_ASYNC_IO) && defined(POSIX_ASYNC_IO) */
3246 /***********************************************************************
3247 Requests for a slot in the aio array. If no slot is available, waits until
3248 not_full-event becomes signaled. */
3249 static
3250 os_aio_slot_t*
3251 os_aio_array_reserve_slot(
3252 /*======================*/
3253 /* out: pointer to slot */
3254 ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3255 os_aio_array_t* array, /* in: aio array */
3256 fil_node_t* message1,/* in: message to be passed along with
3257 the aio operation */
3258 void* message2,/* in: message to be passed along with
3259 the aio operation */
3260 os_file_t file, /* in: file handle */
3261 const char* name, /* in: name of the file or path as a
3262 null-terminated string */
3263 void* buf, /* in: buffer where to read or from which
3264 to write */
3265 ulint offset, /* in: least significant 32 bits of file
3266 offset */
3267 ulint offset_high, /* in: most significant 32 bits of
3268 offset */
3269 ulint len) /* in: length of the block to read or write */
3271 os_aio_slot_t* slot;
3272 ulint i;
3273 #ifdef WIN_ASYNC_IO
3274 OVERLAPPED* control;
3276 ut_a((len & 0xFFFFFFFFUL) == len);
3277 #elif defined(POSIX_ASYNC_IO)
3279 struct aiocb* control;
3280 #endif
3282 loop:
3283 os_mutex_enter(array->mutex);
3285 if (array->n_reserved == array->n_slots) {
3286 os_mutex_exit(array->mutex);
3288 if (!os_aio_use_native_aio) {
3289 /* If the handler threads are suspended, wake them
3290 so that we get more slots */
3292 os_aio_simulated_wake_handler_threads();
3295 os_event_wait(array->not_full);
3297 goto loop;
3300 for (i = 0;; i++) {
3301 slot = os_aio_array_get_nth_slot(array, i);
3303 if (slot->reserved == FALSE) {
3304 break;
3308 array->n_reserved++;
3310 if (array->n_reserved == 1) {
3311 os_event_reset(array->is_empty);
3314 if (array->n_reserved == array->n_slots) {
3315 os_event_reset(array->not_full);
3318 slot->reserved = TRUE;
3319 slot->reservation_time = time(NULL);
3320 slot->message1 = message1;
3321 slot->message2 = message2;
3322 slot->file = file;
3323 slot->name = name;
3324 slot->len = len;
3325 slot->type = type;
3326 slot->buf = buf;
3327 slot->offset = offset;
3328 slot->offset_high = offset_high;
3329 slot->io_already_done = FALSE;
3331 #ifdef WIN_ASYNC_IO
3332 control = &(slot->control);
3333 control->Offset = (DWORD)offset;
3334 control->OffsetHigh = (DWORD)offset_high;
3335 os_event_reset(slot->event);
3337 #elif defined(POSIX_ASYNC_IO)
3339 #if (UNIV_WORD_SIZE == 8)
3340 offset = offset + (offset_high << 32);
3341 #else
3342 ut_a(offset_high == 0);
3343 #endif
3344 control = &(slot->control);
3345 control->aio_fildes = file;
3346 control->aio_buf = buf;
3347 control->aio_nbytes = len;
3348 control->aio_offset = offset;
3349 control->aio_reqprio = 0;
3350 control->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
3351 control->aio_sigevent.sigev_signo
3352 = SIGRTMIN + 1 + os_aio_get_array_no(array);
3353 /* TODO: How to choose the signal numbers? */
3355 fprintf(stderr, "AIO signal number %lu\n",
3356 (ulint) control->aio_sigevent.sigev_signo);
3358 control->aio_sigevent.sigev_value.sival_ptr = slot;
3359 #endif
3360 os_mutex_exit(array->mutex);
3362 return(slot);
3365 /***********************************************************************
3366 Frees a slot in the aio array. */
3367 static
3368 void
3369 os_aio_array_free_slot(
3370 /*===================*/
3371 os_aio_array_t* array, /* in: aio array */
3372 os_aio_slot_t* slot) /* in: pointer to slot */
3374 ut_ad(array);
3375 ut_ad(slot);
3377 os_mutex_enter(array->mutex);
3379 ut_ad(slot->reserved);
3381 slot->reserved = FALSE;
3383 array->n_reserved--;
3385 if (array->n_reserved == array->n_slots - 1) {
3386 os_event_set(array->not_full);
3389 if (array->n_reserved == 0) {
3390 os_event_set(array->is_empty);
3393 #ifdef WIN_ASYNC_IO
3394 os_event_reset(slot->event);
3395 #endif
3396 os_mutex_exit(array->mutex);
3399 /**************************************************************************
3400 Wakes up a simulated aio i/o-handler thread if it has something to do. */
3401 static
3402 void
3403 os_aio_simulated_wake_handler_thread(
3404 /*=================================*/
3405 ulint global_segment) /* in: the number of the segment in the aio
3406 arrays */
3408 os_aio_array_t* array;
3409 os_aio_slot_t* slot;
3410 ulint segment;
3411 ulint n;
3412 ulint i;
3414 ut_ad(!os_aio_use_native_aio);
3416 segment = os_aio_get_array_and_local_segment(&array, global_segment);
3418 n = array->n_slots / array->n_segments;
3420 /* Look through n slots after the segment * n'th slot */
3422 os_mutex_enter(array->mutex);
3424 for (i = 0; i < n; i++) {
3425 slot = os_aio_array_get_nth_slot(array, i + segment * n);
3427 if (slot->reserved) {
3428 /* Found an i/o request */
3430 break;
3434 os_mutex_exit(array->mutex);
3436 if (i < n) {
3437 os_event_set(os_aio_segment_wait_events[global_segment]);
3441 /**************************************************************************
3442 Wakes up simulated aio i/o-handler threads if they have something to do. */
3444 void
3445 os_aio_simulated_wake_handler_threads(void)
3446 /*=======================================*/
3448 ulint i;
3450 if (os_aio_use_native_aio) {
3451 /* We do not use simulated aio: do nothing */
3453 return;
3456 os_aio_recommend_sleep_for_read_threads = FALSE;
3458 for (i = 0; i < os_aio_n_segments; i++) {
3459 os_aio_simulated_wake_handler_thread(i);
3463 /**************************************************************************
3464 This function can be called if one wants to post a batch of reads and
3465 prefers an i/o-handler thread to handle them all at once later. You must
3466 call os_aio_simulated_wake_handler_threads later to ensure the threads
3467 are not left sleeping! */
3469 void
3470 os_aio_simulated_put_read_threads_to_sleep(void)
3471 /*============================================*/
3473 os_aio_array_t* array;
3474 ulint g;
3476 os_aio_recommend_sleep_for_read_threads = TRUE;
3478 for (g = 0; g < os_aio_n_segments; g++) {
3479 os_aio_get_array_and_local_segment(&array, g);
3481 if (array == os_aio_read_array) {
3483 os_event_reset(os_aio_segment_wait_events[g]);
3488 /***********************************************************************
3489 Requests an asynchronous i/o operation. The request is recorded in a slot
reserved from the aio array selected by mode; it is then either submitted
through the native aio interface or left in the slot for a simulated aio
i/o-handler thread. An OS_AIO_SYNC request is served directly with
os_file_read() or os_file_write(), except when WIN_ASYNC_IO is compiled in
and os_aio_use_native_aio is set. */
3491 ibool
3492 os_aio(
3493 /*===*/
3494 /* out: TRUE if request was queued
3495 successfully, FALSE if fail */
3496 ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */
3497 ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed
3498 to OS_AIO_SIMULATED_WAKE_LATER: the
3499 last flag advises this function not to wake
3500 i/o-handler threads, but the caller will
3501 do the waking explicitly later, in this
3502 way the caller can post several requests in
3503 a batch; NOTE that the batch must not be
3504 so big that it exhausts the slots in aio
3505 arrays! NOTE that a simulated batch
3506 may introduce hidden chances of deadlocks,
3507 because i/os are not actually handled until
3508 all have been posted: use with great
3509 caution! */
3510 const char* name, /* in: name of the file or path as a
3511 null-terminated string */
3512 os_file_t file, /* in: handle to a file */
3513 void* buf, /* in: buffer where to read or from which
3514 to write */
3515 ulint offset, /* in: least significant 32 bits of file
3516 offset where to read or write */
3517 ulint offset_high, /* in: most significant 32 bits of
3518 offset */
3519 ulint n, /* in: number of bytes to read or write */
3520 fil_node_t* message1,/* in: messages for the aio handler (these
3521 can be used to identify a completed aio
3522 operation); if mode is OS_AIO_SYNC, these
3523 are ignored */
3524 void* message2) /* in: second message for the aio handler; stored
in the slot together with message1 */
3526 os_aio_array_t* array;
3527 os_aio_slot_t* slot;
3528 #ifdef WIN_ASYNC_IO
3529 ibool retval;
3530 BOOL ret = TRUE;
3531 DWORD len = (DWORD) n;
3532 struct fil_node_struct * dummy_mess1;
3533 void* dummy_mess2;
3534 ulint dummy_type;
3535 #endif
3536 ulint err = 0;
3537 ibool retry;
3538 ulint wake_later;
3540 ut_ad(file);
3541 ut_ad(buf);
3542 ut_ad(n > 0);
3543 ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
3544 ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
3545 ut_ad(os_aio_validate());
3546 #ifdef WIN_ASYNC_IO
3547 ut_ad((n & 0xFFFFFFFFUL) == n);
3548 #endif
/* Separate the wake-later flag from the mode proper, so that mode can be
compared against the OS_AIO_... values below */
3550 wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
3551 mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
3553 if (mode == OS_AIO_SYNC
3554 #ifdef WIN_ASYNC_IO
3555 && !os_aio_use_native_aio
3556 #endif
3558 /* This is actually an ordinary synchronous read or write:
3559 no need to use an i/o-handler thread. NOTE that if we use
3560 Windows async i/o, Windows does not allow us to use
3561 ordinary synchronous os_file_read etc. on the same file,
3562 therefore we have built a special mechanism for synchronous
3563 wait in the Windows case. */
3565 if (type == OS_FILE_READ) {
3566 return(os_file_read(file, buf, offset,
3567 offset_high, n));
3570 ut_a(type == OS_FILE_WRITE);
3572 return(os_file_write(name, file, buf, offset, offset_high, n));
/* Choose the aio array that matches the mode and, for OS_AIO_NORMAL, the
direction of the transfer. Execution returns to this label when
os_file_handle_error() asks for a retry. */
3575 try_again:
3576 if (mode == OS_AIO_NORMAL) {
3577 if (type == OS_FILE_READ) {
3578 array = os_aio_read_array;
3579 } else {
3580 array = os_aio_write_array;
3582 } else if (mode == OS_AIO_IBUF) {
3583 ut_ad(type == OS_FILE_READ);
3584 /* Reduce probability of deadlock bugs in connection with ibuf:
3585 do not let the ibuf i/o handler sleep */
3587 wake_later = FALSE;
3589 array = os_aio_ibuf_array;
3590 } else if (mode == OS_AIO_LOG) {
3592 array = os_aio_log_array;
3593 } else if (mode == OS_AIO_SYNC) {
3594 array = os_aio_sync_array;
3595 } else {
3596 array = NULL; /* Eliminate compiler warning */
3597 ut_error;
/* Record the request in a slot of the chosen array; according to the header
comment of os_aio_array_reserve_slot() this waits if no slot is available */
3600 slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
3601 name, buf, offset, offset_high, n);
3602 if (type == OS_FILE_READ) {
3603 if (os_aio_use_native_aio) {
3604 #ifdef WIN_ASYNC_IO
3605 os_n_file_reads++;
3606 os_bytes_read_since_printout += len;
3608 ret = ReadFile(file, buf, (DWORD)n, &len,
3609 &(slot->control));
3610 #elif defined(POSIX_ASYNC_IO)
3611 slot->control.aio_lio_opcode = LIO_READ;
3612 err = (ulint) aio_read(&(slot->control));
3613 fprintf(stderr, "Starting POSIX aio read %lu\n", err);
3614 #endif
3615 } else {
/* Simulated aio: the request stays in the slot; wake the handler thread of
the slot's segment unless the caller asked to do the waking later */
3616 if (!wake_later) {
3617 os_aio_simulated_wake_handler_thread(
3618 os_aio_get_segment_no_from_slot(
3619 array, slot));
3622 } else if (type == OS_FILE_WRITE) {
3623 if (os_aio_use_native_aio) {
3624 #ifdef WIN_ASYNC_IO
3625 os_n_file_writes++;
3626 ret = WriteFile(file, buf, (DWORD)n, &len,
3627 &(slot->control));
3628 #elif defined(POSIX_ASYNC_IO)
3629 slot->control.aio_lio_opcode = LIO_WRITE;
3630 err = (ulint) aio_write(&(slot->control));
3631 fprintf(stderr, "Starting POSIX aio write %lu\n", err);
3632 #endif
3633 } else {
3634 if (!wake_later) {
3635 os_aio_simulated_wake_handler_thread(
3636 os_aio_get_segment_no_from_slot(
3637 array, slot));
3640 } else {
3641 ut_error;
3644 #ifdef WIN_ASYNC_IO
3645 if (os_aio_use_native_aio) {
/* The request counts as queued if it completed at once with the full
length, or if the call reported ERROR_IO_PENDING */
3646 if ((ret && len == n)
3647 || (!ret && GetLastError() == ERROR_IO_PENDING)) {
3648 /* aio was queued successfully! */
3650 if (mode == OS_AIO_SYNC) {
3651 /* We want a synchronous i/o operation on a
3652 file where we also use async i/o: in Windows
3653 we must use the same wait mechanism as for
3654 async i/o */
3656 retval = os_aio_windows_handle(ULINT_UNDEFINED,
3657 slot->pos,
3658 &dummy_mess1,
3659 &dummy_mess2,
3660 &dummy_type);
3662 return(retval);
3665 return(TRUE);
3668 err = 1; /* Fall through the next if */
3670 #endif
/* err is initialized to 0 and is assigned only by the native aio paths
above, so a simulated aio request always returns TRUE here */
3671 if (err == 0) {
3672 /* aio was queued successfully! */
3674 return(TRUE);
/* Queueing failed: give the slot back, then let os_file_handle_error()
decide whether the request is to be tried again */
3677 os_aio_array_free_slot(array, slot);
3679 retry = os_file_handle_error(name,
3680 type == OS_FILE_READ
3681 ? "aio read" : "aio write");
3682 if (retry) {
3684 goto try_again;
3687 return(FALSE);
#ifdef WIN_ASYNC_IO
/**************************************************************************
Used only with Windows native asynchronous i/o. Blocks until an aio request
has completed, either any request in the given segment or, for sync aio,
the request in the slot at position pos. The outcome is fetched with
GetOverlappedResult(); if it indicates a failure and os_file_handle_error()
asks for a retry, the operation is reissued and waited for synchronously.
NOTE: this function also frees the aio slot, therefore no other thread is
allowed to do the freeing! */

ibool
os_aio_windows_handle(
/*==================*/
				/* out: TRUE if the aio operation succeeded */
	ulint	segment,	/* in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync aio is used, and this parameter is
				ignored */
	ulint	pos,		/* this parameter is used only in sync aio:
				wait for the aio slot at this position */
	fil_node_t**message1,	/* out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
{
	const ulint	caller_segment	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		slots_per_segment;
	ulint		slot_in_segment;
	ulint		io_type;
	ibool		success;
	BOOL		win_ok;
	DWORD		n_bytes;
	BOOL		must_retry	= FALSE;

	if (caller_segment != ULINT_UNDEFINED) {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	} else {
		/* Sync aio: the sync array is treated as one segment */
		array = os_aio_sync_array;
		segment = 0;
	}

	/* Only constant fields of the array are read here, so the
	protecting mutex is not needed yet */

	ut_ad(os_aio_validate());
	ut_ad(segment < array->n_segments);

	slots_per_segment = array->n_slots / array->n_segments;

	if (array != os_aio_sync_array) {
		srv_set_io_thread_op_info(caller_segment, "wait Windows aio");

		slot_in_segment = os_event_wait_multiple(
			slots_per_segment,
			array->native_events + segment * slots_per_segment);
	} else {
		/* Wait for exactly the slot the caller asked for */
		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);

		slot_in_segment = pos;
	}

	os_mutex_enter(array->mutex);

	slot = os_aio_array_get_nth_slot(
		array, slot_in_segment + segment * slots_per_segment);

	ut_a(slot->reserved);

	if (caller_segment != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(caller_segment,
					  "get windows aio return value");
	}

	win_ok = GetOverlappedResult(slot->file, &slot->control,
				     &n_bytes, TRUE);

	/* The messages and the type are handed back also on failure */
	*message1 = slot->message1;
	*message2 = slot->message2;
	*type = slot->type;

	if (!win_ok || n_bytes != slot->len) {
		if (os_file_handle_error(slot->name, "Windows aio")) {
			must_retry = TRUE;
		} else {
			success = FALSE;
		}
	} else {
		success = TRUE;

# ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			ut_a(TRUE == os_file_flush(slot->file));
		}
# endif /* UNIV_DO_FLUSH */
	}

	os_mutex_exit(array->mutex);

	if (must_retry) {
		/* Reissue the failed read/write and wait for it right here;
		array->mutex does not need to be held for this. */

		/* The length must fit in the DWORD passed to the Win32 call */
		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		io_type = slot->type;

		if (io_type == OS_FILE_WRITE) {
			win_ok = WriteFile(slot->file, slot->buf,
					   (DWORD) slot->len, &n_bytes,
					   &slot->control);
		} else if (io_type == OS_FILE_READ) {
			win_ok = ReadFile(slot->file, slot->buf,
					  (DWORD) slot->len, &n_bytes,
					  &slot->control);
		} else {
			ut_error;
		}

		if (!win_ok && GetLastError() == ERROR_IO_PENDING) {
			/* The request was queued: the file is also used for
			async i/o, so the completion has to be collected
			with the same wait mechanism as for async i/o */

			win_ok = GetOverlappedResult(slot->file,
						     &slot->control,
						     &n_bytes, TRUE);
		}

		success = win_ok && n_bytes == slot->len;
	}

	os_aio_array_free_slot(array, slot);

	return(success);
}
#endif /* WIN_ASYNC_IO */
#ifdef POSIX_ASYNC_IO

/**************************************************************************
This function is only used in Posix asynchronous i/o. Waits for the signal
SIGRTMIN + 1 + array_no, takes the aio slot pointer delivered with the
signal, returns the messages stored in that slot to the caller and frees
the slot. */

ibool
os_aio_posix_handle(
/*================*/
				/* out: TRUE if a completed request was
				handled; if sigwaitinfo() fails or reports
				another signal, ut_error is raised */
	ulint	array_no,	/* in: array number 0 - 3 */
	fil_node_t**message1,	/* out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2)
{
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	siginfo_t	info;
	sigset_t	sigset;
	int		sig;

	sigemptyset(&sigset);
	sigaddset(&sigset, SIGRTMIN + 1 + array_no);

	/* NOTE(review): POSIX leaves sigwaitinfo() undefined unless the
	signals in the set are blocked when it is called; confirm that
	unblocking the signal here is really intended. */
	pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);

	/* sigwaitinfo() returns the number of the accepted signal, or -1
	on error. The result used to be stored in an unused variable while
	an uninitialized one was compared below. */
	sig = sigwaitinfo(&sigset, &info);

	if (sig < 0 || (ulint) sig != SIGRTMIN + 1 + array_no) {

		ut_error;

		return(FALSE);
	}

	fputs("Handling POSIX aio\n", stderr);

	array = os_aio_get_array_from_no(array_no);

	os_mutex_enter(array->mutex);

	/* The slot of the completed request travels in the signal value */
	slot = info.si_value.sival_ptr;

	ut_a(slot->reserved);

	*message1 = slot->message1;
	*message2 = slot->message2;

# ifdef UNIV_DO_FLUSH
	if (slot->type == OS_FILE_WRITE
	    && !os_do_not_call_flush_at_each_write) {
		ut_a(TRUE == os_file_flush(slot->file));
	}
# endif /* UNIV_DO_FLUSH */

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(TRUE);
}
#endif /* POSIX_ASYNC_IO */
3918 /**************************************************************************
3919 Do a 'last millisecond' check that the page end is sensible;
3920 reported page checksum errors from Linux seem to wipe over the page end. */
3921 static
3922 void
3923 os_file_check_page_trailers(
3924 /*========================*/
3925 byte* combined_buf, /* in: combined write buffer */
3926 ulint total_len) /* in: size of combined_buf, in bytes
3927 (a multiple of UNIV_PAGE_SIZE) */
3929 ulint len;
3931 for (len = 0; len + UNIV_PAGE_SIZE <= total_len;
3932 len += UNIV_PAGE_SIZE) {
3933 byte* buf = combined_buf + len;
3935 if (UNIV_UNLIKELY
3936 (memcmp(buf + (FIL_PAGE_LSN + 4),
3937 buf + (UNIV_PAGE_SIZE
3938 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
3939 ut_print_timestamp(stderr);
3940 fprintf(stderr,
3941 " InnoDB: ERROR: The page to be written"
3942 " seems corrupt!\n"
3943 "InnoDB: Writing a block of %lu bytes,"
3944 " currently at offset %lu\n",
3945 (ulong)total_len, (ulong)len);
3946 buf_page_print(buf);
3947 fprintf(stderr,
3948 "InnoDB: ERROR: The page to be written"
3949 " seems corrupt!\n");
3954 /**************************************************************************
3955 Does simulated aio. This function should be called by an i/o-handler
3956 thread. */
/* Overview of the steps implemented below:
(1) if a reserved slot of this segment already has io_already_done set,
    return its messages at once and free it;
(2) otherwise choose a request: the oldest one if some request is at least
    2 seconds old (ties broken by the lowest offset), else the one at the
    lowest offset;
(3) chain to it up to OS_AIO_MERGE_N_CONSECUTIVE requests in total which
    follow each other directly in the same file and have the same type;
(4) perform the chain as one synchronous os_file_read()/os_file_write(),
    through a temporary combined buffer if more than one request was chained;
(5) mark all chained slots io_already_done and return the first of them;
    the others are returned by later calls through step (1).
If the segment has no request, the thread waits on the wait event of the
segment and starts over. */
3958 ibool
3959 os_aio_simulated_handle(
3960 /*====================*/
3961 /* out: TRUE if the aio operation succeeded */
3962 ulint global_segment, /* in: the number of the segment in the aio
3963 arrays to wait for; segment 0 is the ibuf
3964 i/o thread, segment 1 the log i/o thread,
3965 then follow the non-ibuf read threads, and as
3966 the last are the non-ibuf write threads */
3967 fil_node_t**message1, /* out: the messages passed with the aio
3968 request; note that also in the case where
3969 the aio operation failed, these output
3970 parameters are valid and can be used to
3971 restart the operation, for example */
3972 void** message2,
3973 ulint* type) /* out: OS_FILE_WRITE or ..._READ */
3975 os_aio_array_t* array;
3976 ulint segment;
3977 os_aio_slot_t* slot;
3978 os_aio_slot_t* slot2;
3979 os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
3980 ulint n_consecutive;
3981 ulint total_len;
3982 ulint offs;
3983 ulint lowest_offset;
3984 ulint biggest_age;
3985 ulint age;
3986 byte* combined_buf;
3987 byte* combined_buf2;
3988 ibool ret;
3989 ulint n;
3990 ulint i;
3992 /* Fix compiler warning */
3993 *consecutive_ios = NULL;
3995 segment = os_aio_get_array_and_local_segment(&array, global_segment);
3997 restart:
3998 /* NOTE! We only access constant fields in os_aio_array. Therefore
3999 we do not have to acquire the protecting mutex yet */
4001 srv_set_io_thread_op_info(global_segment,
4002 "looking for i/o requests (a)");
4003 ut_ad(os_aio_validate());
4004 ut_ad(segment < array->n_segments);
/* n = number of slots that belong to one segment of this array */
4006 n = array->n_slots / array->n_segments;
4008 /* Look through n slots after the segment * n'th slot */
4010 if (array == os_aio_read_array
4011 && os_aio_recommend_sleep_for_read_threads) {
4013 /* Give other threads chance to add several i/os to the array
4014 at once. */
/* The jump happens before array->mutex is acquired, so the label
recommended_sleep is reached without holding the mutex */
4016 goto recommended_sleep;
4019 os_mutex_enter(array->mutex);
4021 srv_set_io_thread_op_info(global_segment,
4022 "looking for i/o requests (b)");
4024 /* Check if there is a slot for which the i/o has already been
4025 done */
4027 for (i = 0; i < n; i++) {
4028 slot = os_aio_array_get_nth_slot(array, i + segment * n);
4030 if (slot->reserved && slot->io_already_done) {
4032 if (os_aio_print_debug) {
4033 fprintf(stderr,
4034 "InnoDB: i/o for slot %lu"
4035 " already done, returning\n",
4036 (ulong) i);
/* No i/o is performed in this call for such a slot, so the return
value is set to TRUE here before jumping to the common exit */
4039 ret = TRUE;
4041 goto slot_io_done;
4045 n_consecutive = 0;
4047 /* If there are at least 2 seconds old requests, then pick the oldest
4048 one to prevent starvation. If several requests have the same age,
4049 then pick the one at the lowest offset. */
4051 biggest_age = 0;
4052 lowest_offset = ULINT_MAX;
4054 for (i = 0; i < n; i++) {
4055 slot = os_aio_array_get_nth_slot(array, i + segment * n);
4057 if (slot->reserved) {
/* Age of the request in seconds; the double returned by difftime()
is converted to ulint by the cast */
4058 age = (ulint)difftime(time(NULL),
4059 slot->reservation_time);
4061 if ((age >= 2 && age > biggest_age)
4062 || (age >= 2 && age == biggest_age
4063 && slot->offset < lowest_offset)) {
4065 /* Found an i/o request */
4066 consecutive_ios[0] = slot;
4068 n_consecutive = 1;
4070 biggest_age = age;
4071 lowest_offset = slot->offset;
4076 if (n_consecutive == 0) {
4077 /* There were no old requests. Look for an i/o request at the
4078 lowest offset in the array (we ignore the high 32 bits of the
4079 offset in these heuristics) */
4081 lowest_offset = ULINT_MAX;
4083 for (i = 0; i < n; i++) {
4084 slot = os_aio_array_get_nth_slot(array,
4085 i + segment * n);
4087 if (slot->reserved && slot->offset < lowest_offset) {
4089 /* Found an i/o request */
4090 consecutive_ios[0] = slot;
4092 n_consecutive = 1;
4094 lowest_offset = slot->offset;
4099 if (n_consecutive == 0) {
4101 /* No i/o requested at the moment */
4103 goto wait_for_io;
4106 slot = consecutive_ios[0];
4108 /* Check if there are several consecutive blocks to read or write */
/* slot is the current tail of the chain. Whenever a request is found that
starts exactly where the tail ends (same file, same type, same
offset_high), it is appended, becomes the new tail, and the scan of the
segment restarts from its first slot. The search ends when a full scan
finds no follower or when OS_AIO_MERGE_N_CONSECUTIVE requests have been
collected. */
4110 consecutive_loop:
4111 for (i = 0; i < n; i++) {
4112 slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4114 if (slot2->reserved && slot2 != slot
4115 && slot2->offset == slot->offset + slot->len
4116 /* check that sum does not wrap over */
4117 && slot->offset + slot->len > slot->offset
4118 && slot2->offset_high == slot->offset_high
4119 && slot2->type == slot->type
4120 && slot2->file == slot->file) {
4122 /* Found a consecutive i/o request */
4124 consecutive_ios[n_consecutive] = slot2;
4125 n_consecutive++;
4127 slot = slot2;
4129 if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4131 goto consecutive_loop;
4132 } else {
4133 break;
4138 srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4140 /* We have now collected n_consecutive i/o requests in the array;
4141 allocate a single buffer which can hold all data, and perform the
4142 i/o */
4144 total_len = 0;
/* From here on slot again denotes the first request of the chain; its
file, name and offset are used for the combined i/o below */
4145 slot = consecutive_ios[0];
4147 for (i = 0; i < n_consecutive; i++) {
4148 total_len += consecutive_ios[i]->len;
4151 if (n_consecutive == 1) {
4152 /* We can use the buffer of the i/o request */
4153 combined_buf = slot->buf;
4154 combined_buf2 = NULL;
4155 } else {
/* One extra page is allocated and the pointer is passed through
ut_align(); presumably this yields a UNIV_PAGE_SIZE-aligned buffer that
still has room for total_len bytes (confirm against ut_align()).
combined_buf2 keeps the pointer that is passed to ut_free() later. */
4156 combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
4158 ut_a(combined_buf2);
4160 combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
4163 /* We release the array mutex for the time of the i/o: NOTE that
4164 this assumes that there is just one i/o-handler thread serving
4165 a single segment of slots! */
4167 os_mutex_exit(array->mutex);
4169 if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4170 /* Copy the buffers to the combined buffer */
4171 offs = 0;
4173 for (i = 0; i < n_consecutive; i++) {
4175 ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4176 consecutive_ios[i]->len);
4177 offs += consecutive_ios[i]->len;
4181 srv_set_io_thread_op_info(global_segment, "doing file i/o");
4183 if (os_aio_print_debug) {
4184 fprintf(stderr,
4185 "InnoDB: doing i/o of type %lu at offset %lu %lu,"
4186 " length %lu\n",
4187 (ulong) slot->type, (ulong) slot->offset_high,
4188 (ulong) slot->offset, (ulong) total_len);
4191 /* Do the i/o with ordinary, synchronous i/o functions: */
4192 if (slot->type == OS_FILE_WRITE) {
4193 if (array == os_aio_write_array) {
/* For requests of the write array both the total length and the start
offset must be multiples of UNIV_PAGE_SIZE; otherwise ut_error is raised */
4194 if ((total_len % UNIV_PAGE_SIZE != 0)
4195 || (slot->offset % UNIV_PAGE_SIZE != 0)) {
4196 fprintf(stderr,
4197 "InnoDB: Error: trying a displaced"
4198 " write to %s %lu %lu, len %lu\n",
4199 slot->name, (ulong) slot->offset_high,
4200 (ulong) slot->offset,
4201 (ulong) total_len);
4202 ut_error;
/* The page trailers are checked both before and after the write */
4205 os_file_check_page_trailers(combined_buf, total_len);
4208 ret = os_file_write(slot->name, slot->file, combined_buf,
4209 slot->offset, slot->offset_high,
4210 total_len);
4212 if (array == os_aio_write_array) {
4213 os_file_check_page_trailers(combined_buf, total_len);
4215 } else {
4216 ret = os_file_read(slot->file, combined_buf,
4217 slot->offset, slot->offset_high, total_len);
/* NOTE(review): the result of the i/o is asserted here; assuming ut_a()
stops the server on failure, FALSE is never returned after an i/o */
4220 ut_a(ret);
4221 srv_set_io_thread_op_info(global_segment, "file i/o done");
4223 #if 0
4224 fprintf(stderr,
4225 "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4226 n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4227 #endif
4229 if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4230 /* Copy the combined buffer to individual buffers */
4231 offs = 0;
4233 for (i = 0; i < n_consecutive; i++) {
4235 ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4236 consecutive_ios[i]->len);
4237 offs += consecutive_ios[i]->len;
4241 if (combined_buf2) {
4242 ut_free(combined_buf2);
/* The mutex is taken again before the slots are modified */
4245 os_mutex_enter(array->mutex);
4247 /* Mark the i/os done in slots */
4249 for (i = 0; i < n_consecutive; i++) {
4250 consecutive_ios[i]->io_already_done = TRUE;
4253 /* We return the messages for the first slot now, and if there were
4254 several slots, the messages will be returned with subsequent calls
4255 of this function */
/* Common exit, reached with array->mutex held: either from the scan for
already completed requests (slot is the completed slot) or after the i/o
above (slot is consecutive_ios[0]) */
4257 slot_io_done:
4259 ut_a(slot->reserved);
4261 *message1 = slot->message1;
4262 *message2 = slot->message2;
4264 *type = slot->type;
4266 os_mutex_exit(array->mutex);
4268 os_aio_array_free_slot(array, slot);
4270 return(ret);
4272 wait_for_io:
4273 srv_set_io_thread_op_info(global_segment, "resetting wait event");
4275 /* We wait here until there again can be i/os in the segment
4276 of this thread */
/* The event is reset while array->mutex is still held; the mutex is
released only after the reset */
4278 os_event_reset(os_aio_segment_wait_events[global_segment]);
4280 os_mutex_exit(array->mutex);
4282 recommended_sleep:
4283 srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4285 os_event_wait(os_aio_segment_wait_events[global_segment]);
4287 if (os_aio_print_debug) {
4288 fprintf(stderr,
4289 "InnoDB: i/o handler thread for i/o"
4290 " segment %lu wakes up\n",
4291 (ulong) global_segment);
/* After waking up, the whole search is repeated from the beginning */
4294 goto restart;
4297 /**************************************************************************
4298 Validates the consistency of an aio array. */
4299 static
4300 ibool
4301 os_aio_array_validate(
4302 /*==================*/
4303 /* out: TRUE if ok */
4304 os_aio_array_t* array) /* in: aio wait array */
4306 os_aio_slot_t* slot;
4307 ulint n_reserved = 0;
4308 ulint i;
4310 ut_a(array);
4312 os_mutex_enter(array->mutex);
4314 ut_a(array->n_slots > 0);
4315 ut_a(array->n_segments > 0);
4317 for (i = 0; i < array->n_slots; i++) {
4318 slot = os_aio_array_get_nth_slot(array, i);
4320 if (slot->reserved) {
4321 n_reserved++;
4322 ut_a(slot->len > 0);
4326 ut_a(array->n_reserved == n_reserved);
4328 os_mutex_exit(array->mutex);
4330 return(TRUE);
4333 /**************************************************************************
4334 Validates the consistency the aio system. */
4336 ibool
4337 os_aio_validate(void)
4338 /*=================*/
4339 /* out: TRUE if ok */
4341 os_aio_array_validate(os_aio_read_array);
4342 os_aio_array_validate(os_aio_write_array);
4343 os_aio_array_validate(os_aio_ibuf_array);
4344 os_aio_array_validate(os_aio_log_array);
4345 os_aio_array_validate(os_aio_sync_array);
4347 return(TRUE);
4350 /**************************************************************************
4351 Prints info of the aio arrays. */
4353 void
4354 os_aio_print(
4355 /*=========*/
4356 FILE* file) /* in: file where to print */
4358 os_aio_array_t* array;
4359 os_aio_slot_t* slot;
4360 ulint n_reserved;
4361 time_t current_time;
4362 double time_elapsed;
4363 double avg_bytes_read;
4364 ulint i;
4366 for (i = 0; i < srv_n_file_io_threads; i++) {
4367 fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
4368 srv_io_thread_op_info[i],
4369 srv_io_thread_function[i]);
4371 #ifndef __WIN__
4372 if (os_aio_segment_wait_events[i]->is_set) {
4373 fprintf(file, " ev set");
4375 #endif
4377 fprintf(file, "\n");
4380 fputs("Pending normal aio reads:", file);
4382 array = os_aio_read_array;
4383 loop:
4384 ut_a(array);
4386 os_mutex_enter(array->mutex);
4388 ut_a(array->n_slots > 0);
4389 ut_a(array->n_segments > 0);
4391 n_reserved = 0;
4393 for (i = 0; i < array->n_slots; i++) {
4394 slot = os_aio_array_get_nth_slot(array, i);
4396 if (slot->reserved) {
4397 n_reserved++;
4398 #if 0
4399 fprintf(stderr, "Reserved slot, messages %p %p\n",
4400 (void*) slot->message1,
4401 (void*) slot->message2);
4402 #endif
4403 ut_a(slot->len > 0);
4407 ut_a(array->n_reserved == n_reserved);
4409 fprintf(file, " %lu", (ulong) n_reserved);
4411 os_mutex_exit(array->mutex);
4413 if (array == os_aio_read_array) {
4414 fputs(", aio writes:", file);
4416 array = os_aio_write_array;
4418 goto loop;
4421 if (array == os_aio_write_array) {
4422 fputs(",\n ibuf aio reads:", file);
4423 array = os_aio_ibuf_array;
4425 goto loop;
4428 if (array == os_aio_ibuf_array) {
4429 fputs(", log i/o's:", file);
4430 array = os_aio_log_array;
4432 goto loop;
4435 if (array == os_aio_log_array) {
4436 fputs(", sync i/o's:", file);
4437 array = os_aio_sync_array;
4439 goto loop;
4442 putc('\n', file);
4443 current_time = time(NULL);
4444 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
4446 fprintf(file,
4447 "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
4448 "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
4449 (ulong) fil_n_pending_log_flushes,
4450 (ulong) fil_n_pending_tablespace_flushes,
4451 (ulong) os_n_file_reads, (ulong) os_n_file_writes,
4452 (ulong) os_n_fsyncs);
4454 if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
4455 fprintf(file,
4456 "%lu pending preads, %lu pending pwrites\n",
4457 (ulong) os_file_n_pending_preads,
4458 (ulong) os_file_n_pending_pwrites);
4461 if (os_n_file_reads == os_n_file_reads_old) {
4462 avg_bytes_read = 0.0;
4463 } else {
4464 avg_bytes_read = (double) os_bytes_read_since_printout
4465 / (os_n_file_reads - os_n_file_reads_old);
4468 fprintf(file,
4469 "%.2f reads/s, %lu avg bytes/read,"
4470 " %.2f writes/s, %.2f fsyncs/s\n",
4471 (os_n_file_reads - os_n_file_reads_old)
4472 / time_elapsed,
4473 (ulong)avg_bytes_read,
4474 (os_n_file_writes - os_n_file_writes_old)
4475 / time_elapsed,
4476 (os_n_fsyncs - os_n_fsyncs_old)
4477 / time_elapsed);
4479 os_n_file_reads_old = os_n_file_reads;
4480 os_n_file_writes_old = os_n_file_writes;
4481 os_n_fsyncs_old = os_n_fsyncs;
4482 os_bytes_read_since_printout = 0;
4484 os_last_printout = current_time;
4487 /**************************************************************************
4488 Refreshes the statistics used to print per-second averages. */
4490 void
4491 os_aio_refresh_stats(void)
4492 /*======================*/
4494 os_n_file_reads_old = os_n_file_reads;
4495 os_n_file_writes_old = os_n_file_writes;
4496 os_n_fsyncs_old = os_n_fsyncs;
4497 os_bytes_read_since_printout = 0;
4499 os_last_printout = time(NULL);
#ifdef UNIV_DEBUG
/**************************************************************************
Checks that all slots in the system have been freed, that is, there are
no pending io operations. The n_reserved counters of the read, write, ibuf,
log and sync arrays are summed, each one read under the mutex of its own
array. */

ibool
os_aio_all_slots_free(void)
/*=======================*/
				/* out: TRUE if all free */
{
	os_aio_array_t*	arrays[5];
	ulint		total_reserved	= 0;
	ulint		k;

	arrays[0] = os_aio_read_array;
	arrays[1] = os_aio_write_array;
	arrays[2] = os_aio_ibuf_array;
	arrays[3] = os_aio_log_array;
	arrays[4] = os_aio_sync_array;

	for (k = 0; k < sizeof arrays / sizeof arrays[0]; k++) {
		os_mutex_enter(arrays[k]->mutex);

		total_reserved += arrays[k]->n_reserved;

		os_mutex_exit(arrays[k]->mutex);
	}

	return(total_reserved == 0 ? TRUE : FALSE);
}
#endif /* UNIV_DEBUG */