2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include "afs/param.h"
14 #include "afs/sysincludes.h"
15 #include "afsincludes.h"
17 #if !defined(AFS_LINUX_ENV)
22 #if defined(AFS_AIX31_ENV)
25 #if !defined(AFS_AIX_ENV) && !defined(AFS_SUN5_ENV) && !defined(AFS_SGI_ENV) && !defined(AFS_LINUX_ENV)
26 #include "h/kernel.h" /* Not needed, so it should go */
28 #endif /* !defined(UKERNEL) */
30 #include "afs/afs_osi.h"
34 #if !defined(UKERNEL) && !defined(AFS_LINUX_ENV)
36 #endif /* !defined(UKERNEL) */
39 #include "afs/volerrors.h"
40 #include "afs/exporter.h"
41 #include "afs/prs_fs.h"
42 #include "afs/afs_chunkops.h"
45 #include "afs/afs_stats.h"
49 #define BUF_TIME_MAX 0x7fffffff
51 #define NPB 8 /* must be a power of 2 */
52 static int afs_max_buffers
; /* should be an integral multiple of NPB */
55 #define AFS_BUFFER_PAGESIZE 2048
58 /* If you change any of this PH stuff, make sure you don't break DZap() */
59 /* use last two bits for page */
61 /* use next five bits for fid */
63 /* page hash table size - this is pretty intertwined with pHash */
64 #define PHSIZE (PHPAGEMASK + PHFIDMASK + 1)
/*
 * Map a (fid, page) pair to an index into phTable.
 *
 * The fid contributes the bits selected by PHFIDMASK and the page the bits
 * selected by PHPAGEMASK; the two are OR'ed together.  DZap() walks every
 * page value from 0 to PHPAGEMASK for a given fid, so it depends on this
 * exact layout.
 *
 * Both arguments are fully parenthesized so that an expression argument
 * (for example `a | b`) is masked as a whole rather than being re-parsed
 * under the precedence of `&`.
 */
#define pHash(fid,page) ((((afs_int32)(fid)) & PHFIDMASK) \
			 | ((page) & PHPAGEMASK))
70 #undef dirty /* XXX */
73 static struct buffer
*Buffers
= 0;
74 static char *BufferData
;
77 extern struct buf
*geteblk();
80 #define timecounter afs_timecounter
83 /* A note on locking in 'struct buffer'
85 * afs_bufferLock protects the hash chain, and the 'lockers' field where that
86 * has a zero value. It must be held whenever lockers is incremented from zero.
88 * The individual buffer lock protects the contents of the structure, including
91 * For safety: afs_bufferLock and the individual buffer lock must be held
92 * when obtaining a reference on a structure. Only the individual buffer lock
93 * need be held when releasing a reference.
95 * The locking hierarchy is afs_bufferLock-> buffer.lock
99 static afs_lock_t afs_bufferLock
;
100 static struct buffer
*phTable
[PHSIZE
]; /* page hash table */
102 static afs_int32 timecounter
;
104 /* Prototypes for static routines */
105 static struct buffer
*afs_newslot(struct dcache
*adc
, afs_int32 apage
,
108 static int dinit_flag
= 0;
112 /* Initialize the venus buffer system. */
120 /* round up to next multiple of NPB, since we allocate multiple pages per chunk */
121 abuffers
= ((abuffers
- 1) | (NPB
- 1)) + 1;
122 afs_max_buffers
= abuffers
<< 2; /* possibly grow up to 4 times as big */
123 LOCK_INIT(&afs_bufferLock
, "afs_bufferLock");
124 Buffers
= afs_osi_Alloc(afs_max_buffers
* sizeof(struct buffer
));
125 osi_Assert(Buffers
!= NULL
);
127 afs_stats_cmperf
.bufAlloced
= nbuffers
= abuffers
;
128 for (i
= 0; i
< PHSIZE
; i
++)
130 for (i
= 0; i
< abuffers
; i
++) {
131 if ((i
& (NPB
- 1)) == 0) {
132 /* time to allocate a fresh buffer */
133 BufferData
= afs_osi_Alloc(AFS_BUFFER_PAGESIZE
* NPB
);
134 osi_Assert(BufferData
!= NULL
);
136 /* Fill in each buffer with an empty indication. */
139 afs_reset_inode(&tb
->inode
);
142 tb
->data
= &BufferData
[AFS_BUFFER_PAGESIZE
* (i
& (NPB
- 1))];
145 AFS_RWLOCK_INIT(&tb
->lock
, "buffer lock");
151 * Read and return the requested directory page.
153 * \param[in] adc pointer to directory dcache
154 * \param[in] page number of the desired directory page
155 * \param[out] entry buffer to return requested page
156 * \param[out] physerr (optional) pointer to return errno, if any
159 * \retval non-zero invalid directory or internal IO error;
160 * if physerr is supplied by caller, it will be set:
162 * errno physical error
165 DReadWithErrno(struct dcache
*adc
, int page
, struct DirBuffer
*entry
, int *physerr
)
167 /* Read a page from the disk. */
168 struct buffer
*tb
, *tb2
;
169 struct osi_file
*tfile
;
177 memset(entry
, 0, sizeof(struct DirBuffer
));
179 if (adc
->f
.chunk
== 0 && adc
->f
.chunkBytes
== 0) {
180 /* The directory blob is empty, apparently. This is not a valid dir
181 * blob, so throw an error. */
184 if (page
* AFS_BUFFER_PAGESIZE
>= adc
->f
.chunkBytes
) {
185 return ENOENT
; /* past the end */
188 ObtainWriteLock(&afs_bufferLock
, 256);
/*
 * True when buffer `tb` holds the page being looked up.
 *
 * NOTE: `page` and `adc` are not macro parameters; they are picked up from
 * the scope in which the macro is expanded.  The `tb` argument is
 * parenthesized so that any pointer expression may be passed.
 */
#define bufmatch(tb) ((tb)->page == page && (tb)->fid == adc->index)
/*
 * Move buffer `p` to the front of a hash chain.
 *
 *   head    address of the chain's head pointer
 *   parent  the element whose hashNext currently points at `p`
 *   p       the element to move
 *
 * `p` is unlinked from behind `parent`, then linked in ahead of the old
 * head, and *head is updated to point at it.  The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement and can be
 * used safely in an unbraced if/else.
 */
#define buf_Front(head,parent,p) \
    do { \
	(parent)->hashNext = (p)->hashNext; \
	(p)->hashNext = *(head); \
	*(head) = (p); \
    } while (0)
193 /* this apparently-complicated-looking code is simply an example of
194 * a little bit of loop unrolling, and is a standard linked-list
195 * traversal trick. It saves a few assignments at the expense
196 * of larger code size. This could be simplified by better use of
199 if ((tb
= phTable
[pHash(adc
->index
, page
)])) {
201 ObtainWriteLock(&tb
->lock
, 257);
203 ReleaseWriteLock(&afs_bufferLock
);
204 tb
->accesstime
= timecounter
++;
205 AFS_STATS(afs_stats_cmperf
.bufHits
++);
206 ReleaseWriteLock(&tb
->lock
);
208 entry
->data
= tb
->data
;
211 struct buffer
**bufhead
;
212 bufhead
= &(phTable
[pHash(adc
->index
, page
)]);
213 while ((tb2
= tb
->hashNext
)) {
215 buf_Front(bufhead
, tb
, tb2
);
216 ObtainWriteLock(&tb2
->lock
, 258);
218 ReleaseWriteLock(&afs_bufferLock
);
219 tb2
->accesstime
= timecounter
++;
220 AFS_STATS(afs_stats_cmperf
.bufHits
++);
221 ReleaseWriteLock(&tb2
->lock
);
223 entry
->data
= tb2
->data
;
226 if ((tb
= tb2
->hashNext
)) {
228 buf_Front(bufhead
, tb2
, tb
);
229 ObtainWriteLock(&tb
->lock
, 259);
231 ReleaseWriteLock(&afs_bufferLock
);
232 tb
->accesstime
= timecounter
++;
233 AFS_STATS(afs_stats_cmperf
.bufHits
++);
234 ReleaseWriteLock(&tb
->lock
);
236 entry
->data
= tb
->data
;
246 AFS_STATS(afs_stats_cmperf
.bufMisses
++);
248 /* The last thing we looked at was either tb or tb2 (or nothing). That
249 * is at least the oldest buffer on one particular hash chain, so it's
250 * a pretty good place to start looking for the truly oldest buffer.
252 tb
= afs_newslot(adc
, page
, (tb
? tb
: tb2
));
254 ReleaseWriteLock(&afs_bufferLock
);
257 ObtainWriteLock(&tb
->lock
, 260);
259 ReleaseWriteLock(&afs_bufferLock
);
260 tfile
= afs_CFileOpen(&adc
->f
.inode
);
266 afs_CFileRead(tfile
, tb
->page
* AFS_BUFFER_PAGESIZE
, tb
->data
,
267 AFS_BUFFER_PAGESIZE
);
268 afs_CFileClose(tfile
);
269 if (code
< AFS_BUFFER_PAGESIZE
) {
270 if (code
< 0 && physerr
!= NULL
)
275 /* Note that findslot sets the page field in the buffer equal to
276 * what it is searching for. */
277 ReleaseWriteLock(&tb
->lock
);
279 entry
->data
= tb
->data
;
284 afs_reset_inode(&tb
->inode
);
286 ReleaseWriteLock(&tb
->lock
);
291 * Read and return the requested directory page.
293 * \param[in] adc pointer to directory dcache
294 * \param[in] page number of the desired directory page
295 * \param[out] entry buffer to return requested page
298 * \retval non-zero invalid directory or internal IO error;
301 DRead(struct dcache
*adc
, int page
, struct DirBuffer
*entry
)
303 return DReadWithErrno(adc
, page
, entry
, NULL
);
307 FixupBucket(struct buffer
*ap
)
309 struct buffer
**lp
, *tp
;
311 /* first try to get it out of its current hash bucket, in which it
313 AFS_STATCNT(FixupBucket
);
316 for (tp
= *lp
; tp
; tp
= tp
->hashNext
) {
323 /* now figure the new hash bucket */
324 i
= pHash(ap
->fid
, ap
->page
);
325 ap
->hashIndex
= i
; /* remember where we are for deletion */
326 ap
->hashNext
= phTable
[i
]; /* add us to the list */
327 phTable
[i
] = ap
; /* at the front, since it's LRU */
330 /* lp is pointer to a fairly-old buffer */
331 static struct buffer
*
332 afs_newslot(struct dcache
*adc
, afs_int32 apage
, struct buffer
*lp
)
334 /* Find a usable buffer slot */
338 struct osi_file
*tfile
;
340 AFS_STATCNT(afs_newslot
);
341 /* we take a pointer here to a buffer which was at the end of an
342 * LRU hash chain. Odds are, it's one of the older buffers, not
343 * one of the newer. Having an older buffer to start with may
344 * permit us to avoid a few of the assignments in the "typical
345 * case" for loop below.
347 if (lp
&& (lp
->lockers
== 0)) {
353 /* timecounter might have wrapped, if machine is very very busy
354 * and stays up for a long time. Timecounter mustn't wrap twice
355 * (positive->negative->positive) before calling newslot, but that
356 * would require 2 billion consecutive cache hits... Anyway, the
357 * penalty is only that the cache replacement policy will be
358 * almost MRU for the next ~2 billion DReads... newslot doesn't
359 * get called nearly as often as DRead, so in order to avoid the
360 * performance penalty of using the hypers, it's worth doing the
361 * extra check here every time. It's probably cheaper than doing
362 * hcmp, anyway. There is a little performance hit resulting from
363 * resetting all the access times to 0, but it only happens once
364 * every month or so, and the access times will rapidly sort
365 * themselves back out after just a few more DReads.
367 if (timecounter
< 0) {
370 for (i
= 0; i
< nbuffers
; i
++, tp
++) {
372 if (!lp
&& !tp
->lockers
) /* one is as good as the rest, I guess */
376 /* this is the typical case */
378 for (i
= 0; i
< nbuffers
; i
++, tp
++) {
379 if (tp
->lockers
== 0) {
380 if (!lp
|| tp
->accesstime
< lt
) {
389 /* No unlocked buffers. If still possible, allocate a new increment */
390 if (nbuffers
+ NPB
> afs_max_buffers
) {
391 /* There are no unlocked buffers -- this used to panic, but that
392 * seems extreme. To the best of my knowledge, all the callers
393 * of DRead are prepared to handle a zero return. Some of them
394 * just panic directly, but not all of them. */
395 afs_warn("afs: all buffers locked\n");
399 BufferData
= afs_osi_Alloc(AFS_BUFFER_PAGESIZE
* NPB
);
400 osi_Assert(BufferData
!= NULL
);
401 for (i
= 0; i
< NPB
; i
++) {
402 /* Fill in each buffer with an empty indication. */
403 tp
= &Buffers
[i
+ nbuffers
];
405 afs_reset_inode(&tp
->inode
);
408 tp
->data
= &BufferData
[AFS_BUFFER_PAGESIZE
* i
];
411 AFS_RWLOCK_INIT(&tp
->lock
, "buffer lock");
413 lp
= &Buffers
[nbuffers
];
418 /* see DFlush for rationale for not getting and locking the dcache */
419 tfile
= afs_CFileOpen(&lp
->inode
);
421 return NULL
; /* Callers will flag as EIO */
423 afs_CFileWrite(tfile
, lp
->page
* AFS_BUFFER_PAGESIZE
, lp
->data
,
424 AFS_BUFFER_PAGESIZE
);
426 afs_CFileClose(tfile
);
427 AFS_STATS(afs_stats_cmperf
.bufFlushDirty
++);
430 /* Zero out the data so we don't leak something we shouldn't. */
431 memset(lp
->data
, 0, AFS_BUFFER_PAGESIZE
);
432 /* Now fill in the header. */
433 lp
->fid
= adc
->index
;
434 afs_copy_inode(&lp
->inode
, &adc
->f
.inode
);
436 lp
->accesstime
= timecounter
++;
437 FixupBucket(lp
); /* move to the right hash bucket */
443 DRelease(struct DirBuffer
*entry
, int flag
)
447 AFS_STATCNT(DRelease
);
453 ObtainWriteLock(&tp
->lock
, 261);
457 ReleaseWriteLock(&tp
->lock
);
461 DVOffset(struct DirBuffer
*entry
)
465 AFS_STATCNT(DVOffset
);
468 return AFS_BUFFER_PAGESIZE
* bp
->page
469 + (char *)entry
->data
- (char *)bp
->data
;
473 * Zap one dcache entry: destroy one FID's buffers.
475 * 1/1/91 - I've modified the hash function to take the page as well
476 * as the *fid, so that lookup will be a bit faster. That presents some
477 * difficulties for Zap, which now has to have some knowledge of the nature
478 * of the hash function. Oh well. This should use the list traversal
481 * \param adc The dcache entry to be zapped.
484 DZap(struct dcache
*adc
)
487 /* Destroy all buffers pertaining to a particular fid. */
491 ObtainReadLock(&afs_bufferLock
);
493 for (i
= 0; i
<= PHPAGEMASK
; i
++)
494 for (tb
= phTable
[pHash(adc
->index
, i
)]; tb
; tb
= tb
->hashNext
)
495 if (tb
->fid
== adc
->index
) {
496 ObtainWriteLock(&tb
->lock
, 262);
498 afs_reset_inode(&tb
->inode
);
500 ReleaseWriteLock(&tb
->lock
);
502 ReleaseReadLock(&afs_bufferLock
);
506 DFlushBuffer(struct buffer
*ab
)
508 struct osi_file
*tfile
;
510 tfile
= afs_CFileOpen(&ab
->inode
);
512 afs_CFileWrite(tfile
, ab
->page
* AFS_BUFFER_PAGESIZE
,
513 ab
->data
, AFS_BUFFER_PAGESIZE
);
514 ab
->dirty
= 0; /* Clear the dirty flag */
515 afs_CFileClose(tfile
);
519 DFlushDCache(struct dcache
*adc
)
524 ObtainReadLock(&afs_bufferLock
);
526 for (i
= 0; i
<= PHPAGEMASK
; i
++)
527 for (tb
= phTable
[pHash(adc
->index
, i
)]; tb
!= NULL
; tb
= tb
->hashNext
)
528 if (tb
->fid
== adc
->index
) {
529 ObtainWriteLock(&tb
->lock
, 701);
531 ReleaseReadLock(&afs_bufferLock
);
536 ReleaseWriteLock(&tb
->lock
);
537 ObtainReadLock(&afs_bufferLock
);
540 ReleaseReadLock(&afs_bufferLock
);
546 /* Flush all the modified buffers. */
552 ObtainReadLock(&afs_bufferLock
);
553 for (i
= 0; i
< nbuffers
; i
++, tb
++) {
555 ObtainWriteLock(&tb
->lock
, 263);
557 ReleaseReadLock(&afs_bufferLock
);
559 /* it seems safe to do this I/O without having the dcache
560 * locked, since the only things that will update the data in
561 * a directory are the buffer package, which holds the relevant
562 * tb->lock while doing the write, or afs_GetDCache, which
563 * DZap's the directory while holding the dcache lock.
564 * It is not possible to lock the dcache or even call
565 * afs_GetDSlot to map the index to the dcache since the dir
566 * package's caller has some dcache object locked already (so
567 * we cannot lock afs_xdcache). In addition, we cannot obtain
568 * a dcache lock while holding the tb->lock of the same file
569 * since that can deadlock with DRead/DNew */
573 ReleaseWriteLock(&tb
->lock
);
574 ObtainReadLock(&afs_bufferLock
);
577 ReleaseReadLock(&afs_bufferLock
);
583 * Prepare a new directory page buffer
585 * \param adc pointer to the directory object dcache
586 * \param page page we want
587 * \param entry buffer to return requested page
589 * \retval 0 success; entry is updated
590 * \retval non-zero internal error or IO error writing to disk
593 DNew(struct dcache
*adc
, int page
, struct DirBuffer
*entry
)
595 /* Same as read, only do *not* even try to read the page, since it
596 * probably doesn't exist. */
602 ObtainWriteLock(&afs_bufferLock
, 264);
603 if ((tb
= afs_newslot(adc
, page
, NULL
)) == 0) {
604 ReleaseWriteLock(&afs_bufferLock
);
607 /* extend the chunk, if needed */
608 /* Do it now, not in DFlush or afs_newslot when the data is written out,
609 * since now our caller has adc->lock writelocked, and we can't acquire
610 * that lock (or even map from a fid to a dcache) in afs_newslot or
611 * DFlush due to lock hierarchy issues */
612 if ((page
+ 1) * AFS_BUFFER_PAGESIZE
> adc
->f
.chunkBytes
) {
613 afs_AdjustSize(adc
, (page
+ 1) * AFS_BUFFER_PAGESIZE
);
614 code
= afs_WriteDCache(adc
, 1);
616 ReleaseWriteLock(&afs_bufferLock
);
620 ObtainWriteLock(&tb
->lock
, 265);
622 ReleaseWriteLock(&afs_bufferLock
);
623 ReleaseWriteLock(&tb
->lock
);
625 entry
->data
= tb
->data
;
631 shutdown_bufferpackage(void)
636 AFS_STATCNT(shutdown_bufferpackage
);
637 /* Free all allocated Buffers and associated buffer pages */
642 for (i
= 0; i
< nbuffers
; i
+= NPB
, tp
+= NPB
) {
643 afs_osi_Free(tp
->data
, NPB
* AFS_BUFFER_PAGESIZE
);
645 afs_osi_Free(Buffers
, afs_max_buffers
* sizeof(struct buffer
));
649 for (i
= 0; i
< PHSIZE
; i
++)
652 if (afs_cold_shutdown
) {
653 memset(&afs_bufferLock
, 0, sizeof(afs_lock_t
));