2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
11 static const char sccsid
[] = "@(#)db_region.c 10.46 (Sleepycat) 5/26/98";
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
23 #include "common_ext.h"
25 static int __db_growregion
__P((REGINFO
*, size_t));
29 * Optionally create and attach to a shared memory region.
31 * PUBLIC: int __db_rattach __P((REGINFO *));
38 size_t grow_region
, size
;
40 u_int32_t flags
, mbytes
, bytes
;
42 int malloc_possible
, ret
, retry_cnt
;
48 /* Round off the requested size to the next page boundary. */
49 DB_ROUNDOFF(infop
->size
);
51 /* Some architectures have hard limits on the maximum region size. */
52 #ifdef DB_REGIONSIZE_MAX
53 if (infop
->size
> DB_REGIONSIZE_MAX
) {
54 __db_err(infop
->dbenv
, "__db_rattach: cache size too large");
59 /* Initialize the return information in the REGINFO structure. */
60 loop
: infop
->addr
= NULL
;
62 infop
->segid
= INVALID_SEGID
;
63 if (infop
->name
!= NULL
) {
67 F_CLR(infop
, REGION_CANGROW
| REGION_CREATED
);
69 #ifndef HAVE_SPINLOCKS
72 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73 * locking, which implies using mmap(2) to map in a regular file.
74 * (Theoretically, we could probably get a file descriptor to lock
75 * other types of shared regions, but I don't see any reason to
84 * HP-UX won't permit mutexes to live in anything but shared memory.
85 * Instantiate a shared region file on that architecture, regardless.
90 * If a region is truly private, malloc the memory. That's faster
91 * than either anonymous memory or a shared file.
93 if (malloc_possible
&& F_ISSET(infop
, REGION_PRIVATE
)) {
94 if ((infop
->addr
= __db_malloc(infop
->size
)) == NULL
)
98 * It's sometimes significantly faster to page-fault in all
99 * of the region's pages before we run the application, as
100 * we can see fairly nasty side-effects when we page-fault
101 * while holding various locks, i.e., the lock takes a long
102 * time, and other threads convoy behind the lock holder.
104 if (DB_GLOBAL(db_region_init
))
105 for (p
= infop
->addr
;
106 p
< (u_int8_t
*)infop
->addr
+ infop
->size
;
110 F_SET(infop
, REGION_CREATED
| REGION_MALLOC
);
115 * Get the name of the region (creating the file if a temporary file
116 * is being used). The dbenv contains the current DB environment,
117 * including naming information. The path argument may be a file or
118 * a directory. If path is a directory, it must exist and file is the
119 * file name to be created inside the directory. If path is a file,
120 * then file must be NULL.
122 if ((ret
= __db_appname(infop
->dbenv
, infop
->appname
, infop
->path
,
123 infop
->file
, infop
->dbflags
, &infop
->fd
, &infop
->name
)) != 0)
126 F_SET(infop
, REGION_CREATED
);
129 * Try to create the file, if we have authority. We have to make sure
130 * that multiple threads/processes attempting to simultaneously create
131 * the region are properly ordered, so we open it using DB_CREATE and
132 * DB_EXCL, so two attempts to create the region will return failure in
135 if (infop
->fd
== -1 && infop
->dbflags
& DB_CREATE
) {
136 flags
= infop
->dbflags
;
138 if ((ret
= __db_open(infop
->name
,
139 flags
, flags
, infop
->mode
, &infop
->fd
)) == 0)
140 F_SET(infop
, REGION_CREATED
);
146 /* If we couldn't create the file, try and open it. */
147 if (infop
->fd
== -1) {
148 flags
= infop
->dbflags
;
149 LF_CLR(DB_CREATE
| DB_EXCL
);
150 if ((ret
= __db_open(infop
->name
,
151 flags
, flags
, infop
->mode
, &infop
->fd
)) != 0)
156 * There are three cases we support:
157 * 1. Named anonymous memory (shmget(2)).
158 * 2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
159 * 3. Memory backed by a regular file (mmap(2)).
161 * We instantiate a backing file in all cases, which contains at least
162 * the RLAYOUT structure, and in case #3, contains the actual region.
163 * This is necessary for a couple of reasons:
165 * First, the mpool region uses temporary files to name regions, and
166 * since you may have multiple regions in the same directory, we need
167 * a filesystem name to ensure that they don't collide.
169 * Second, applications are allowed to forcibly remove regions, even
170 * if they don't know anything about them other than the name. If a
171 * region is backed by anonymous memory, there has to be some way for
172 * the application to find out that information, and, in some cases,
173 * determine ID information for the anonymous memory.
175 if (F_ISSET(infop
, REGION_CREATED
)) {
177 * If we're using anonymous memory to back this region, set
180 if (DB_GLOBAL(db_region_anon
))
181 F_SET(infop
, REGION_ANONYMOUS
);
184 * If we're using a regular file to back a region we created,
185 * grow it to the specified size.
187 if (!DB_GLOBAL(db_region_anon
) &&
188 (ret
= __db_growregion(infop
, infop
->size
)) != 0)
192 * If we're joining a region, figure out what it looks like.
195 * We have to figure out if the file is a regular file backing
196 * a region that we want to map into our address space, or a
197 * file with the information we need to find a shared anonymous
198 * region that we want to map into our address space.
200 * All this noise is because some systems don't have a coherent
201 * VM and buffer cache, and worse, if you mix operations on the
202 * VM and buffer cache, half the time you hang the system.
204 * There are two possibilities. If the file is the size of an
205 * RLAYOUT structure, then we know that the real region is in
206 * shared memory, because otherwise it would be bigger. (As
207 * the RLAYOUT structure size is smaller than a disk sector,
208 * the only way it can be this size is if deliberately written
209 * that way.) In which case, retrieve the information we need
210 * from the RLAYOUT structure and use it to acquire the shared
213 * If the structure is larger than an RLAYOUT structure, then
214 * the file is backing the shared memory region, and we use
215 * the current size of the file without reading any information
216 * from the file itself so that we don't confuse the VM.
218 * And yes, this makes me want to take somebody and kill them,
219 * but I can't think of any other solution.
221 if ((ret
= __db_ioinfo(infop
->name
,
222 infop
->fd
, &mbytes
, &bytes
, NULL
)) != 0)
224 size
= mbytes
* MEGABYTE
+ bytes
;
226 if (size
<= sizeof(RLAYOUT
)) {
228 * If the size is too small, the read fails or the
229 * valid flag is incorrect, assume it's because the
230 * RLAYOUT information hasn't been written out yet,
233 if (size
< sizeof(RLAYOUT
))
236 __db_read(infop
->fd
, &rl
, sizeof(rl
), &nr
)) != 0)
238 if (rl
.valid
!= DB_REGIONMAGIC
)
241 /* Copy the size, memory id and characteristics. */
243 infop
->segid
= rl
.segid
;
244 if (F_ISSET(&rl
, REGION_ANONYMOUS
))
245 F_SET(infop
, REGION_ANONYMOUS
);
249 * If the region is larger than we think, that's okay, use the
250 * current size. If it's smaller than we think, and we were
251 * just using the default size, that's okay, use the current
252 * size. If it's smaller than we think and we really care,
253 * save the size and we'll catch that further down -- we can't
254 * correct it here because we have to have a lock to grow the
257 if (infop
->size
> size
&& !F_ISSET(infop
, REGION_SIZEDEF
))
258 grow_region
= infop
->size
;
263 * Map the region into our address space. If we're creating it, the
264 * underlying routines will make it the right size.
266 * There are at least two cases where we can "reasonably" fail when
267 * we attempt to map in the region. On Windows/95, closing the last
268 * reference to a region causes it to be zeroed out. On UNIX, when
269 * using the shmget(2) interfaces, the region will no longer exist
270 * if the system was rebooted. In these cases, the underlying map call
271 * returns EAGAIN, and we *remove* our file and try again. There are
272 * obvious races in doing this, but it should eventually settle down
273 * to a winner and then things should proceed normally.
275 if ((ret
= __db_mapregion(infop
->name
, infop
)) != 0) {
278 * Pretend we created the region even if we didn't so
279 * that our error processing unlinks it.
281 F_SET(infop
, REGION_CREATED
);
289 * Initialize the common region information.
292 * We have to order the region creates so that two processes don't try
293 * to simultaneously create the region. This is handled by using the
294 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
296 * We also have to order region joins so that processes joining regions
297 * never see inconsistent data. We'd like to play permissions games
298 * with the backing file, but we can't because WNT filesystems won't
299 * open a file mode 0.
301 rlp
= (RLAYOUT
*)infop
->addr
;
302 if (F_ISSET(infop
, REGION_CREATED
)) {
304 * The process creating the region acquires a lock before it
305 * sets the valid flag. Any processes joining the region will
306 * check the valid flag before acquiring the lock.
308 * Check the return of __db_mutex_init() and __db_mutex_lock(),
309 * even though we don't usually check elsewhere. This is the
310 * first lock we initialize and acquire, and we have to know if
311 * it fails. (It CAN fail, e.g., SunOS, when using fcntl(2)
312 * for locking, with an in-memory filesystem specified as the
315 if ((ret
= __db_mutex_init(&rlp
->lock
,
316 MUTEX_LOCK_OFFSET(rlp
, &rlp
->lock
))) != 0 ||
317 (ret
= __db_mutex_lock(&rlp
->lock
, infop
->fd
)) != 0)
320 /* Initialize the remaining region information. */
322 rlp
->size
= infop
->size
;
323 db_version(&rlp
->majver
, &rlp
->minver
, &rlp
->patch
);
324 rlp
->segid
= infop
->segid
;
326 if (F_ISSET(infop
, REGION_ANONYMOUS
))
327 F_SET(rlp
, REGION_ANONYMOUS
);
330 * Fill in the valid field last -- use a magic number, memory
331 * may not be zero-filled, and we want to minimize the chance
334 rlp
->valid
= DB_REGIONMAGIC
;
337 * If the region is anonymous, write the RLAYOUT information
338 * into the backing file so that future region join and unlink
342 * We MUST do the seek before we do the write. On Win95, while
343 * closing the last reference to an anonymous shared region
344 * doesn't discard the region, it does zero it out. So, the
345 * REGION_CREATED may be set, but the file may have already
346 * been written and the file descriptor may be at the end of
349 if (F_ISSET(infop
, REGION_ANONYMOUS
)) {
350 if ((ret
= __db_seek(infop
->fd
, 0, 0, 0, 0, 0)) != 0)
353 __db_write(infop
->fd
, rlp
, sizeof(*rlp
), &nw
)) != 0)
358 * Check the valid flag to ensure the region is initialized.
359 * If the valid flag has not been set, the mutex may not have
360 * been initialized, and an attempt to get it could lead to
363 if (rlp
->valid
!= DB_REGIONMAGIC
)
366 /* Get the region lock. */
367 (void)__db_mutex_lock(&rlp
->lock
, infop
->fd
);
370 * We now own the region. There are a couple of things that
371 * may have gone wrong, however.
373 * Problem #1: while we were waiting for the lock, the region
374 * was deleted. Detected by re-checking the valid flag, since
375 * it's cleared by the delete region routines.
377 if (rlp
->valid
!= DB_REGIONMAGIC
) {
378 (void)__db_mutex_unlock(&rlp
->lock
, infop
->fd
);
383 * Problem #2: We want a bigger region than has previously been
384 * created. Detected by checking if the region is smaller than
385 * our caller requested. If it is, we grow the region, (which
386 * does the detach and re-attach for us).
388 if (grow_region
!= 0 &&
389 (ret
= __db_rgrow(infop
, grow_region
)) != 0) {
390 (void)__db_mutex_unlock(&rlp
->lock
, infop
->fd
);
395 * Problem #3: when we checked the size of the file, it was
396 * still growing as part of creation. Detected by the fact
397 * that infop->size isn't the same size as the region.
399 if (infop
->size
!= rlp
->size
) {
400 (void)__db_mutex_unlock(&rlp
->lock
, infop
->fd
);
404 /* Increment the reference count. */
408 /* Return the region in a locked condition. */
411 errmsg
: __db_err(infop
->dbenv
, "%s: %s", infop
->name
, strerror(ret
));
414 retry
: /* Discard the region. */
415 if (infop
->addr
!= NULL
) {
416 (void)__db_unmapregion(infop
);
420 /* Discard the backing file. */
421 if (infop
->fd
!= -1) {
422 (void)__db_close(infop
->fd
);
425 if (F_ISSET(infop
, REGION_CREATED
))
426 (void)__db_unlink(infop
->name
);
429 /* Discard the name. */
430 if (infop
->name
!= NULL
) {
436 * If we had a temporary error, wait a few seconds and
440 if (++retry_cnt
<= 3) {
441 __db_sleep(retry_cnt
* 2, 0);
450 * HP-UX won't permit mutexes to live in anything but shared memory.
451 * Instantiate a shared region file on that architecture, regardless.
454 * There's a problem in cleaning this up on application exit, or on
455 * application failure. If an application opens a database without
456 * an environment, we create a temporary backing mpool region for it.
457 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
458 * mutexes to live in anything but shared memory, we instantiate a
459 * real file plus a memory region of some form. If the application
460 * crashes, the necessary information to delete the backing file and
461 * any system region (e.g., the shmget(2) segment ID) is no longer
462 * available. We can't completely fix the problem, but we try.
464 * The underlying UNIX __db_mapregion() code preferentially uses the
465 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
466 * that are marked REGION_PRIVATE. This means that we normally aren't
467 * holding any system resources when we get here, in which case we can
468 * delete the backing file. This results in a short race, from the
469 * __db_open() call above to here.
471 * If, for some reason, we are holding system resources when we get
472 * here, we don't have any choice -- we can't delete the backing file
473 * because we may need it to detach from the resources. Set the
474 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
475 * the application closes the region.
477 if (F_ISSET(infop
, REGION_PRIVATE
) && !F_ISSET(infop
, REGION_MALLOC
)) {
478 if (F_ISSET(infop
, REGION_HOLDINGSYS
))
479 F_SET(infop
, REGION_LASTDETACH
);
481 F_SET(infop
, REGION_REMOVED
);
482 F_CLR(infop
, REGION_CANGROW
);
484 (void)__db_close(infop
->fd
);
485 (void)__db_unlink(infop
->name
);
493 * De-attach from a shared memory region.
495 * PUBLIC: int __db_rdetach __P((REGINFO *));
502 int detach
, ret
, t_ret
;
507 * If the region was removed when it was created, no further action
510 if (F_ISSET(infop
, REGION_REMOVED
))
513 * If the region was created in memory returned by malloc, the only
514 * action required is freeing the memory.
516 if (F_ISSET(infop
, REGION_MALLOC
)) {
517 __db_free(infop
->addr
);
521 /* Otherwise, attach to the region and optionally delete it. */
525 (void)__db_mutex_lock(&rlp
->lock
, infop
->fd
);
527 /* Decrement the reference count. */
528 if (rlp
->refcnt
== 0)
529 __db_err(infop
->dbenv
,
530 "region rdetach: reference count went to zero!");
535 * If we're going to remove the region, clear the valid flag so
536 * that any region join that's blocked waiting for us will know
540 if (F_ISSET(infop
, REGION_LASTDETACH
)) {
541 if (rlp
->refcnt
== 0) {
548 /* Release the lock. */
549 (void)__db_mutex_unlock(&rlp
->lock
, infop
->fd
);
551 /* Close the backing file descriptor. */
552 (void)__db_close(infop
->fd
);
555 /* Discard our mapping of the region. */
556 if ((t_ret
= __db_unmapregion(infop
)) != 0 && ret
== 0)
559 /* Discard the region itself. */
562 __db_unlinkregion(infop
->name
, infop
) != 0) && ret
== 0)
564 if ((t_ret
= __db_unlink(infop
->name
) != 0) && ret
== 0)
568 done
: /* Discard the name. */
569 if (infop
->name
!= NULL
) {
581 * PUBLIC: int __db_runlink __P((REGINFO *, int));
584 __db_runlink(infop
, force
)
591 u_int32_t mbytes
, bytes
;
597 * We assume that we've created a new REGINFO structure for this
598 * call, not used one that was already initialized. Regardless,
599 * if anyone is planning to use it after we're done, they're going
600 * to be sorely disappointed.
602 * If force isn't set, we attach to the region, set a flag to delete
603 * the region on last close, and let the region delete code do the
607 if ((ret
= __db_rattach(infop
)) != 0)
610 rlp
= (RLAYOUT
*)infop
->addr
;
611 (void)__db_mutex_unlock(&rlp
->lock
, infop
->fd
);
613 F_SET(infop
, REGION_LASTDETACH
);
615 return (__db_rdetach(infop
));
619 * Otherwise, we don't want to attach to the region. We may have been
620 * called to clean up if a process died leaving a region locked and/or
621 * corrupted, which could cause the attach to hang.
623 if ((ret
= __db_appname(infop
->dbenv
, infop
->appname
,
624 infop
->path
, infop
->file
, infop
->dbflags
, NULL
, &name
)) != 0)
628 * An underlying file is created for all regions other than private
629 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630 * back the region. If that file doesn't exist, we're done.
632 if (__db_exists(name
, NULL
) != 0) {
638 * See the comments in __db_rattach -- figure out if this is a regular
639 * file backing a region or if it's a regular file with information
642 if ((ret
= __db_open(name
, DB_RDONLY
, DB_RDONLY
, 0, &fd
)) != 0)
644 if ((ret
= __db_ioinfo(name
, fd
, &mbytes
, &bytes
, NULL
)) != 0)
646 size
= mbytes
* MEGABYTE
+ bytes
;
648 if (size
<= sizeof(RLAYOUT
)) {
649 if ((ret
= __db_read(fd
, &rl
, sizeof(rl
), &nr
)) != 0)
651 if (rl
.valid
!= DB_REGIONMAGIC
) {
652 __db_err(infop
->dbenv
,
653 "%s: illegal region magic number", name
);
658 /* Set the size, memory id and characteristics. */
659 infop
->size
= rl
.size
;
660 infop
->segid
= rl
.segid
;
661 if (F_ISSET(&rl
, REGION_ANONYMOUS
))
662 F_SET(infop
, REGION_ANONYMOUS
);
665 infop
->segid
= INVALID_SEGID
;
668 /* Remove the underlying region. */
669 ret
= __db_unlinkregion(name
, infop
);
672 * Unlink the backing file. Close the open file descriptor first,
673 * because some architectures (e.g., Win32) won't unlink a file if
674 * open file descriptors remain.
676 (void)__db_close(fd
);
677 if ((t_ret
= __db_unlink(name
)) != 0 && ret
== 0)
681 errmsg
: __db_err(infop
->dbenv
, "%s: %s", name
, strerror(ret
));
682 err
: (void)__db_close(fd
);
693 * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
696 __db_rgrow(infop
, new_size
)
706 * This routine MUST be called with the region already locked.
709 /* The underlying routines have flagged if this region can grow. */
710 if (!F_ISSET(infop
, REGION_CANGROW
))
714 * Round off the requested size to the next page boundary, and
715 * determine the additional space required.
717 rlp
= (RLAYOUT
*)infop
->addr
;
718 DB_ROUNDOFF(new_size
);
719 increment
= new_size
- rlp
->size
;
721 if ((ret
= __db_growregion(infop
, increment
)) != 0)
724 /* Update the on-disk region size. */
725 rlp
->size
= new_size
;
727 /* Detach from and reattach to the region. */
728 return (__db_rreattach(infop
, new_size
));
733 * Grow a shared memory region.
736 __db_growregion(infop
, increment
)
745 char buf
[DB_VMPAGESIZE
];
747 /* Seek to the end of the region. */
748 if ((ret
= __db_seek(infop
->fd
, 0, 0, 0, 0, SEEK_END
)) != 0)
751 /* Write nuls to the new bytes. */
752 memset(buf
, 0, sizeof(buf
));
755 * Some systems require that all of the bytes of the region be
756 * written before it can be mapped and accessed randomly, and
757 * other systems don't zero out the pages.
760 /* Extend the region by writing each new page. */
761 for (i
= 0; i
< increment
; i
+= DB_VMPAGESIZE
) {
763 __db_write(infop
->fd
, buf
, sizeof(buf
), &nw
)) != 0)
765 if (nw
!= sizeof(buf
))
770 * Extend the region by writing the last page. If the region
771 * is >4Gb, increment may be larger than the maximum possible
772 * seek "relative" argument, as it's an unsigned 32-bit value.
773 * Break the offset into pages of 1MB each so that we don't
774 * overflow (2^20 + 2^32 is bigger than any memory I expect
775 * to see for awhile).
777 pages
= (increment
- DB_VMPAGESIZE
) / MEGABYTE
;
778 relative
= (increment
- DB_VMPAGESIZE
) % MEGABYTE
;
779 if ((ret
= __db_seek(infop
->fd
,
780 MEGABYTE
, pages
, relative
, 0, SEEK_CUR
)) != 0)
782 if ((ret
= __db_write(infop
->fd
, buf
, sizeof(buf
), &nw
)) != 0)
784 if (nw
!= sizeof(buf
))
788 * It's sometimes significantly faster to page-fault in all
789 * of the region's pages before we run the application, as
790 * we can see fairly nasty side-effects when we page-fault
791 * while holding various locks, i.e., the lock takes a long
792 * time, and other threads convoy behind the lock holder.
794 if (DB_GLOBAL(db_region_init
)) {
795 pages
= increment
/ MEGABYTE
;
796 relative
= increment
% MEGABYTE
;
797 if ((ret
= __db_seek(infop
->fd
,
798 MEGABYTE
, pages
, relative
, 1, SEEK_END
)) != 0)
801 /* Read a byte from each page. */
802 for (i
= 0; i
< increment
; i
+= DB_VMPAGESIZE
) {
804 __db_read(infop
->fd
, buf
, 1, &nr
)) != 0)
808 if ((ret
= __db_seek(infop
->fd
,
809 0, 0, DB_VMPAGESIZE
- 1, 0, SEEK_CUR
)) != 0)
817 err
: __db_err(infop
->dbenv
, "region grow: %s", strerror(ret
));
823 * Detach from and reattach to a region.
825 * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
828 __db_rreattach(infop
, new_size
)
835 if (infop
->name
== NULL
) {
836 __db_err(infop
->dbenv
, "__db_rreattach: name was NULL");
841 * If we're growing an already mapped region, we have to unmap it
842 * and get it back. We have it locked, so nobody else can get in,
843 * which makes it fairly straight-forward to do, as everybody else
844 * is going to block while we do the unmap/remap. NB: if we fail
845 * to get it back, the pooch is genuinely screwed, because we can
846 * never release the lock we're holding.
848 * Detach from the region. We have to do this first so architectures
849 * that don't permit a file to be mapped into different places in the
850 * address space simultaneously, e.g., HP's PaRisc, will work.
852 if ((ret
= __db_unmapregion(infop
)) != 0)
855 /* Update the caller's REGINFO size to the new map size. */
856 infop
->size
= new_size
;
858 /* Attach to the region. */
859 ret
= __db_mapregion(infop
->name
, infop
);