/*	$NetBSD: rf_paritylogging.c,v 1.27 2006/11/16 01:33:23 christos Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * parity logging configuration, dag selection, and mapping are implemented
 * here
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.27 2006/11/16 01:33:23 christos Exp $");

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_dagffrd.h"
#include "rf_dagffwr.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_paritylog.h"
#include "rf_paritylogDiskMgr.h"
#include "rf_paritylogging.h"
#include "rf_parityloggingdags.h"
#include "rf_general.h"
#include "rf_map.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

typedef struct RF_ParityLoggingConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
					 * IdentifyStripe */
} RF_ParityLoggingConfigInfo_t;

static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);

int
rf_ConfigureParityLogging(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	int     i, j, startdisk, rc;
	RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
	RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ParityLoggingConfigInfo_t *info;
	RF_ParityLog_t *l = NULL, *next;
	caddr_t lHeapPtr;

	if (rf_numParityRegions <= 0)
		return (EINVAL);

	/*
	 * We create multiple entries on the shutdown list here, since
	 * this configuration routine is fairly complicated in and of
	 * itself, and this makes backing out of a failed configuration
	 * much simpler.
	 */

	raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;

	/* create a parity logging configuration structure */
	RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
			(RF_ParityLoggingConfigInfo_t *),
			raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;

	/* the stripe identifier must identify the disks in each stripe, IN
	 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
	info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
						  (raidPtr->numCol),
						  raidPtr->cleanupList);
	if (info->stripeIdentifier == NULL)
		return (ENOMEM);

	startdisk = 0;
	for (i = 0; i < (raidPtr->numCol); i++) {
		for (j = 0; j < (raidPtr->numCol); j++) {
			info->stripeIdentifier[i][j] = (startdisk + j) %
				(raidPtr->numCol - 1);
		}
		if ((--startdisk) < 0)
			startdisk = raidPtr->numCol - 1 - 1;
	}

	/* fill in the remaining layout parameters */
	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
	layoutPtr->numParityCol = 1;
	layoutPtr->numParityLogCol = 1;
	layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
		layoutPtr->numParityLogCol;
	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
		layoutPtr->sectorsPerStripeUnit;
	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->sectorsPerStripeUnit;

	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;

	/* configure parity log parameters
	 *
	 * parameter                  comment/constraints
	 * -------------------------------------------
	 * numParityRegions*          all regions (except possibly last)
	 * totalInCoreLogCapacity*    amount of memory in bytes available
	 *                            for in-core logs (default 1 MB)
	 * numSectorsPerLog#          capacity of an in-core log in sectors
	 * numParityLogs              total number of in-core logs,
	 *                            should be at least numParityRegions
	 * regionLogCapacity          size of a region log (except possibly
	 *                            last one) in sectors
	 * totalLogCapacity           total amount of log space in sectors
	 *
	 * where '*' denotes a user settable parameter.
	 * Note that logs are fixed to be the size of a disk track,
	 * value #defined in rf_paritylog.h
	 */
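
	/*
	 * Illustrative example with assumed numbers (not taken from any
	 * particular configuration): 64 stripe units per disk, 128 sectors
	 * per stripe unit, one parity log column, 32-sector in-core logs and
	 * rf_numParityRegions = 32 give totalLogCapacity = 64 * 128 * 1 =
	 * 8192 sectors and regionLogCapacity = 8192 / 32 = 256 sectors,
	 * i.e. exactly 8 in-core logs per region log with no fragmentation.
	 */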
	totalLogCapacity = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
	raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
	if (rf_parityLogDebug)
		printf("bytes per sector %d\n", raidPtr->bytesPerSector);

	/* reduce fragmentation within a disk region by adjusting the number
	 * of regions in an attempt to allow an integral number of logs to fit
	 * into a disk region */
	fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
	if (fragmentation > 0)
		for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
			if (((totalLogCapacity / (rf_numParityRegions + i)) %
			     raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions++;
				raidPtr->regionLogCapacity = totalLogCapacity /
					rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
					raidPtr->numSectorsPerLog;
			}
			if (((totalLogCapacity / (rf_numParityRegions - i)) %
			     raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions--;
				raidPtr->regionLogCapacity = totalLogCapacity /
					rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
					raidPtr->numSectorsPerLog;
			}
		}

	/* ensure an integral number of logs per region */
	raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
				      raidPtr->numSectorsPerLog) *
		raidPtr->numSectorsPerLog;

	raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
		(raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
	/* to avoid deadlock, must ensure that enough logs exist for each
	 * region to have one simultaneously */
	if (raidPtr->numParityLogs < rf_numParityRegions)
		raidPtr->numParityLogs = rf_numParityRegions;
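
	/*
	 * Illustrative example with assumed numbers: with 512-byte sectors,
	 * 32-sector logs and the default 1 MB of in-core log space,
	 * numParityLogs starts out as 1048576 / (512 * 32) = 64; had the
	 * array been configured with more than 64 parity regions,
	 * numParityLogs would be raised to match, so that every region can
	 * hold an in-core log simultaneously.
	 */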

	/* create region information structs */
	printf("Allocating %d bytes for in-core parity region info\n",
	       (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
	RF_Malloc(raidPtr->regionInfo,
		  (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
		  (RF_RegionInfo_t *));
	if (raidPtr->regionInfo == NULL)
		return (ENOMEM);

	/* last region may not be full capacity */
	lastRegionCapacity = raidPtr->regionLogCapacity;
	while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
	       lastRegionCapacity > totalLogCapacity)
		lastRegionCapacity = lastRegionCapacity -
			raidPtr->numSectorsPerLog;

	raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
		rf_numParityRegions;
	maxRegionParityRange = raidPtr->regionParityRange;

	/* i can't remember why this line is in the code -wvcii 6/30/95 */
	/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
		regionParityRange++; */

	/* build pool of unused parity logs */
	printf("Allocating %d bytes for %d parity logs\n",
	       raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
	       raidPtr->bytesPerSector,
	       raidPtr->numParityLogs);
	RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
		  raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
		  (caddr_t));
	if (raidPtr->parityLogBufferHeap == NULL)
		return (ENOMEM);

	lHeapPtr = raidPtr->parityLogBufferHeap;
	rf_mutex_init(&raidPtr->parityLogPool.mutex);
	for (i = 0; i < raidPtr->numParityLogs; i++) {
		if (i == 0) {
			RF_Malloc(raidPtr->parityLogPool.parityLogs,
				  sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
			if (raidPtr->parityLogPool.parityLogs == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
					raidPtr->numParityLogs *
					raidPtr->numSectorsPerLog *
					raidPtr->bytesPerSector);
				return (ENOMEM);
			}
			l = raidPtr->parityLogPool.parityLogs;
		} else {
			RF_Malloc(l->next, sizeof(RF_ParityLog_t),
				  (RF_ParityLog_t *));
			if (l->next == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
					raidPtr->numParityLogs *
					raidPtr->numSectorsPerLog *
					raidPtr->bytesPerSector);
				for (l = raidPtr->parityLogPool.parityLogs;
				     l;
				     l = next) {
					next = l->next;
					if (l->records)
						RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
					RF_Free(l, sizeof(RF_ParityLog_t));
				}
				return (ENOMEM);
			}
			l = l->next;
		}
		l->bufPtr = lHeapPtr;
		lHeapPtr = (char *)lHeapPtr + raidPtr->numSectorsPerLog *
			raidPtr->bytesPerSector;
		RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
				       sizeof(RF_ParityLogRecord_t)),
			  (RF_ParityLogRecord_t *));
		if (l->records == NULL) {
			RF_Free(raidPtr->parityLogBufferHeap,
				raidPtr->numParityLogs *
				raidPtr->numSectorsPerLog *
				raidPtr->bytesPerSector);
			for (l = raidPtr->parityLogPool.parityLogs;
			     l;
			     l = next) {
				next = l->next;
				if (l->records)
					RF_Free(l->records,
						(raidPtr->numSectorsPerLog *
						 sizeof(RF_ParityLogRecord_t)));
				RF_Free(l, sizeof(RF_ParityLog_t));
			}
			return (ENOMEM);
		}
	}
	rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);

	/* build pool of region buffers */
	rf_mutex_init(&raidPtr->regionBufferPool.mutex);
	raidPtr->regionBufferPool.cond = 0;
	raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
		raidPtr->bytesPerSector;
	printf("regionBufferPool.bufferSize %d\n",
	       raidPtr->regionBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->regionBufferPool.totalBuffers = 1;

	raidPtr->regionBufferPool.availableBuffers =
		raidPtr->regionBufferPool.totalBuffers;
	raidPtr->regionBufferPool.availBuffersIndex = 0;
	raidPtr->regionBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for regionBufferPool\n",
	       (int) (raidPtr->regionBufferPool.totalBuffers *
		      sizeof(void *)));
	RF_Malloc(raidPtr->regionBufferPool.buffers,
		  raidPtr->regionBufferPool.totalBuffers * sizeof(void *),
		  (void **));
	if (raidPtr->regionBufferPool.buffers == NULL) {
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for regionBufferPool#%d\n",
		       (int) (raidPtr->regionBufferPool.bufferSize *
			      sizeof(char)), i);
		RF_Malloc(raidPtr->regionBufferPool.buffers[i],
			  raidPtr->regionBufferPool.bufferSize * sizeof(char),
			  (caddr_t));
		if (raidPtr->regionBufferPool.buffers[i] == NULL) {
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->regionBufferPool.buffers[j],
					raidPtr->regionBufferPool.bufferSize *
					sizeof(char));
			}
			RF_Free(raidPtr->regionBufferPool.buffers,
				raidPtr->regionBufferPool.totalBuffers *
				sizeof(void *));
			return (ENOMEM);
		}
		printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
		       (long) raidPtr->regionBufferPool.buffers[i]);
	}
	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingRegionBufferPool,
			  raidPtr);

	/* build pool of parity buffers */
	parityBufferCapacity = maxRegionParityRange;
	rf_mutex_init(&raidPtr->parityBufferPool.mutex);
	raidPtr->parityBufferPool.cond = 0;
	raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
		raidPtr->bytesPerSector;
	printf("parityBufferPool.bufferSize %d\n",
	       raidPtr->parityBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->parityBufferPool.totalBuffers = 1;

	raidPtr->parityBufferPool.availableBuffers =
		raidPtr->parityBufferPool.totalBuffers;
	raidPtr->parityBufferPool.availBuffersIndex = 0;
	raidPtr->parityBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for parityBufferPool of %d units\n",
	       (int) (raidPtr->parityBufferPool.totalBuffers *
		      sizeof(void *)),
	       raidPtr->parityBufferPool.totalBuffers);
	RF_Malloc(raidPtr->parityBufferPool.buffers,
		  raidPtr->parityBufferPool.totalBuffers * sizeof(void *),
		  (void **));
	if (raidPtr->parityBufferPool.buffers == NULL) {
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for parityBufferPool#%d\n",
		       (int) (raidPtr->parityBufferPool.bufferSize *
			      sizeof(char)), i);
		RF_Malloc(raidPtr->parityBufferPool.buffers[i],
			  raidPtr->parityBufferPool.bufferSize * sizeof(char),
			  (caddr_t));
		if (raidPtr->parityBufferPool.buffers[i] == NULL) {
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->parityBufferPool.buffers[j],
					raidPtr->parityBufferPool.bufferSize *
					sizeof(char));
			}
			RF_Free(raidPtr->parityBufferPool.buffers,
				raidPtr->parityBufferPool.totalBuffers *
				sizeof(void *));
			return (ENOMEM);
		}
		printf("parityBufferPool.buffers[%d] = %lx\n", i,
		       (long) raidPtr->parityBufferPool.buffers[i]);
	}
	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingParityBufferPool,
			  raidPtr);

	/* initialize parityLogDiskQueue */
	rf_mutex_init(&raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.cond = 0;
	raidPtr->parityLogDiskQueue.flushQueue = NULL;
	raidPtr->parityLogDiskQueue.reintQueue = NULL;
	raidPtr->parityLogDiskQueue.bufHead = NULL;
	raidPtr->parityLogDiskQueue.bufTail = NULL;
	raidPtr->parityLogDiskQueue.reintHead = NULL;
	raidPtr->parityLogDiskQueue.reintTail = NULL;
	raidPtr->parityLogDiskQueue.logBlockHead = NULL;
	raidPtr->parityLogDiskQueue.logBlockTail = NULL;
	raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
	raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
	raidPtr->parityLogDiskQueue.freeDataList = NULL;
	raidPtr->parityLogDiskQueue.freeCommonList = NULL;

	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingDiskQueue,
			  raidPtr);

	for (i = 0; i < rf_numParityRegions; i++) {
		rf_mutex_init(&raidPtr->regionInfo[i].mutex);
		rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
		raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
		raidPtr->regionInfo[i].regionStartAddr =
			raidPtr->regionLogCapacity * i;
		raidPtr->regionInfo[i].parityStartAddr =
			raidPtr->regionParityRange * i;
		if (i < rf_numParityRegions - 1) {
			raidPtr->regionInfo[i].capacity =
				raidPtr->regionLogCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
				raidPtr->regionParityRange;
		} else {
			raidPtr->regionInfo[i].capacity =
				lastRegionCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
				raidPtr->sectorsPerDisk -
				raidPtr->regionParityRange * i;
			if (raidPtr->regionInfo[i].numSectorsParity >
			    maxRegionParityRange)
				maxRegionParityRange =
					raidPtr->regionInfo[i].numSectorsParity;
		}
		raidPtr->regionInfo[i].diskCount = 0;
		RF_ASSERT(raidPtr->regionInfo[i].capacity +
			  raidPtr->regionInfo[i].regionStartAddr <=
			  totalLogCapacity);
		RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
			  raidPtr->regionInfo[i].numSectorsParity <=
			  raidPtr->sectorsPerDisk);
		printf("Allocating %d bytes for region %d\n",
		       (int) (raidPtr->regionInfo[i].capacity *
			      sizeof(RF_DiskMap_t)), i);
		RF_Malloc(raidPtr->regionInfo[i].diskMap,
			  (raidPtr->regionInfo[i].capacity *
			   sizeof(RF_DiskMap_t)),
			  (RF_DiskMap_t *));
		if (raidPtr->regionInfo[i].diskMap == NULL) {
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
				(rf_numParityRegions *
				 sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
		raidPtr->regionInfo[i].coreLog = NULL;
	}

	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingRegionInfo,
			  raidPtr);

	RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
	raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
	rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
			      rf_ParityLoggingDiskManager, raidPtr, "rf_log");
	if (rc) {
		raidPtr->parityLogDiskQueue.threadState = 0;
		RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
			     __FILE__, __LINE__, rc);
		return (ENOMEM);
	}

	/* wait for thread to start */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
			     raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);

	rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);

	if (rf_parityLogDebug) {
		printf(" size of disk log in sectors: %d\n",
		       (int) totalLogCapacity);
		printf(" total number of parity regions is %d\n",
		       (int) rf_numParityRegions);
		printf(" nominal sectors of log per parity region is %d\n",
		       (int) raidPtr->regionLogCapacity);
		printf(" nominal region fragmentation is %d sectors\n",
		       (int) fragmentation);
		printf(" total number of parity logs is %d\n",
		       raidPtr->numParityLogs);
		printf(" parity log size is %d sectors\n",
		       raidPtr->numSectorsPerLog);
		printf(" total in-core log space is %d bytes\n",
		       (int) rf_totalInCoreLogCapacity);
	}
	rf_EnableParityLogging(raidPtr);

	return (0);
}

static void
FreeRegionInfo(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID)
{
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	RF_Free(raidPtr->regionInfo[regionID].diskMap,
		(raidPtr->regionInfo[regionID].capacity *
		 sizeof(RF_DiskMap_t)));
	if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
		rf_ReleaseParityLogs(raidPtr,
				     raidPtr->regionInfo[regionID].coreLog);
		raidPtr->regionInfo[regionID].coreLog = NULL;
	} else {
		RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
}

static void
FreeParityLogQueue(
    RF_Raid_t * raidPtr,
    RF_ParityLogQueue_t * queue)
{
	RF_ParityLog_t *l1, *l2;

	RF_LOCK_MUTEX(queue->mutex);
	l1 = queue->parityLogs;
	while (l1) {
		l2 = l1;
		l1 = l2->next;
		RF_Free(l2->records, (raidPtr->numSectorsPerLog *
				      sizeof(RF_ParityLogRecord_t)));
		RF_Free(l2, sizeof(RF_ParityLog_t));
	}
	RF_UNLOCK_MUTEX(queue->mutex);
}

static void
FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
{
	int     i;

	RF_LOCK_MUTEX(queue->mutex);
	if (queue->availableBuffers != queue->totalBuffers) {
		printf("Attempt to free region queue which is still in use!\n");
		RF_ASSERT(0);
	}
	for (i = 0; i < queue->totalBuffers; i++)
		RF_Free(queue->buffers[i], queue->bufferSize);
	RF_Free(queue->buffers, queue->totalBuffers * sizeof(void *));
	RF_UNLOCK_MUTEX(queue->mutex);
}

static void
rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;
	RF_RegionId_t i;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionInfo\n",
		       raidPtr->raidid);
	}
	/* free region information structs */
	for (i = 0; i < rf_numParityRegions; i++)
		FreeRegionInfo(raidPtr, i);
	RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
				      sizeof(RF_RegionInfo_t)));
	raidPtr->regionInfo = NULL;
}

static void
rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
	}
	/* free contents of parityLogPool */
	FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
	RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
		raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
}

static void
rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
		       raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->regionBufferPool);
}

static void
rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
		       raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->parityBufferPool);
}

static void
rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
{
	RF_ParityLogData_t *d;
	RF_CommonLogData_t *c;
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingDiskQueue\n",
		       raidPtr->raidid);
	}
	/* free disk manager stuff */
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
	while (raidPtr->parityLogDiskQueue.freeDataList) {
		d = raidPtr->parityLogDiskQueue.freeDataList;
		raidPtr->parityLogDiskQueue.freeDataList =
			raidPtr->parityLogDiskQueue.freeDataList->next;
		RF_Free(d, sizeof(RF_ParityLogData_t));
	}
	while (raidPtr->parityLogDiskQueue.freeCommonList) {
		c = raidPtr->parityLogDiskQueue.freeCommonList;
		raidPtr->parityLogDiskQueue.freeCommonList =
			raidPtr->parityLogDiskQueue.freeCommonList->next;
		RF_Free(c, sizeof(RF_CommonLogData_t));
	}
}

static void
rf_ShutdownParityLogging(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
	}
	/* shutdown disk thread */
	/* This has the desirable side-effect of forcing all regions to be
	 * reintegrated.  This is necessary since all parity log maps are
	 * currently held in volatile memory. */

	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
	/*
	 * pLogDiskThread will now terminate when queues are cleared;
	 * now wait for it to be done
	 */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
			     raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging done (thread completed)\n",
		       raidPtr->raidid);
	}
}

int
rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
{
	return (20);
}

int
rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
{
	return (10);
}

/* return the region ID for a given RAID address */
RF_RegionId_t
rf_MapRegionIDParityLogging(
    RF_Raid_t * raidPtr,
    RF_SectorNum_t address)
{
	RF_RegionId_t regionID;

	/* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
	regionID = address / raidPtr->regionParityRange;
	if (regionID == rf_numParityRegions) {
		/* last region may be larger than other regions */
		regionID--;
	}
	RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
	RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
		  raidPtr->regionInfo[regionID].numSectorsParity);
	RF_ASSERT(regionID < rf_numParityRegions);
	return (regionID);
}
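
/*
 * Illustrative example with assumed numbers: if regionParityRange is 1024
 * sectors, parity address 5000 falls in region 5000 / 1024 = 4.  An address
 * in the tail of the disk can make the division land exactly on
 * rf_numParityRegions; the decrement above folds such addresses into the
 * last region, which may be larger than the others.
 */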

/* given a logical RAID sector, determine physical disk address of data */
void
rf_MapSectorParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_StripeNum_t SUID = raidSector /
		raidPtr->Layout.sectorsPerStripeUnit;

	/* *col = (SUID % (raidPtr->numCol -
	 * raidPtr->Layout.numParityLogCol)); */
	*col = SUID % raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
		raidPtr->Layout.sectorsPerStripeUnit +
		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}
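
/*
 * Illustrative example with assumed numbers: with 128 sectors per stripe
 * unit and numDataCol = 4, raidSector 1000 gives SUID = 1000 / 128 = 7, so
 * the data maps to column 7 % 4 = 3 at disk sector
 * (7 / 4) * 128 + (1000 % 128) = 128 + 104 = 232.
 */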

/* given a logical RAID sector, determine physical disk address of parity */
void
rf_MapParityParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_StripeNum_t SUID = raidSector /
		raidPtr->Layout.sectorsPerStripeUnit;

	/* *col =
	 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
	 * r->numCol - raidPtr->Layout.numParityLogCol); */
	*col = raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
		raidPtr->Layout.sectorsPerStripeUnit +
		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}

/* given a regionID and sector offset, determine the physical disk address of
 * the parity log */
void
rf_MapLogParityLogging(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_SectorNum_t regionOffset,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector)
{
	*col = raidPtr->numCol - 1;
	*startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
}

/* given a regionID, determine the physical disk address of the logged
 * parity for that region */
void
rf_MapRegionParity(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector,
    RF_SectorCount_t * numSector)
{
	*col = raidPtr->numCol - 2;
	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
}
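
/*
 * Note on the column layout implied by the mapping routines above: data
 * occupies columns 0 .. numDataCol - 1, dedicated parity sits on column
 * numCol - 2 (which equals numDataCol, since one parity and one log column
 * are reserved), and the parity log occupies the last column, numCol - 1.
 */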

/* given a logical RAID address, determine the participating disks in
 * the stripe */
void
rf_IdentifyStripeParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids)
{
	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
							   addr);
	RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
		raidPtr->Layout.layoutSpecificInfo;

	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
}

void
rf_MapSIDToPSIDParityLogging(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID,
    RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru)
{
	*psID = stripeID;
	*which_ru = 0;
}

/* select an algorithm for performing an access.  Returns two pointers,
 * one to a function that will return information about the DAG, and
 * another to a function that will create the dag.
 */
void
rf_ParityLoggingDagSelect(
    RF_Raid_t * raidPtr,
    RF_IoType_t type,
    RF_AccessStripeMap_t * asmp,
    RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t fcol;
	RF_RowStatus_t rstat;
	int     prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
		*createFunc = NULL;
		return;
	}

	if (asmp->numDataFailed + asmp->numParityFailed == 1) {

		/* if under recon & already reconstructed, redirect
		 * the access to the spare drive and eliminate the
		 * failure indication */
		failedPDA = asmp->failedPDAs[0];
		fcol = failedPDA->col;
		rstat = raidPtr->status;
		prior_recon = (rstat == rf_rs_reconfigured) || (
		    (rstat == rf_rs_reconstructing) ?
		    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
		    );

		if (prior_recon) {
			RF_RowCol_t oc = failedPDA->col;
			RF_SectorNum_t oo = failedPDA->startSector;

			if (layoutPtr->map->flags &
			    RF_DISTRIBUTE_SPARE) {
				/* redirect to dist spare space */

				if (failedPDA == asmp->parityInfo) {

					/* parity has failed */
					(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
					    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

					if (asmp->parityInfo->next) {	/* redir 2nd component,
									 * if any */
						RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
						RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
						p->col = failedPDA->col;
						p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
						    SUoffs;	/* cheating:
								 * startSector is not
								 * really a RAID address */
					}
				} else
					if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
						RF_ASSERT(0);	/* should not ever
								 * occur */
					} else {

						/* data has failed */
						(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);
					}
			} else {
				/* redirect to dedicated spare space */

				failedPDA->col = raidPtr->Disks[fcol].spareCol;

				/* the parity may have two distinct
				 * components, both of which may need
				 * to be redirected */
				if (asmp->parityInfo->next) {
					if (failedPDA == asmp->parityInfo) {
						failedPDA->next->col = failedPDA->col;
					} else
						if (failedPDA == asmp->parityInfo->next) {	/* paranoid: should never occur */
							asmp->parityInfo->col = failedPDA->col;
						}
				}
			}

			RF_ASSERT(failedPDA->col != -1);

			if (rf_dagDebug || rf_mapDebug) {
				printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
				       raidPtr->raidid, type, oc, (long) oo,
				       failedPDA->col, (long) failedPDA->startSector);
			}
			asmp->numDataFailed = asmp->numParityFailed = 0;
		}
	}

	if (type == RF_IO_TYPE_READ) {
		if (asmp->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
	} else {
		/* if mirroring, always use large writes.  If the access
		 * requires two distinct parity updates, always do a small
		 * write.  If the stripe contains a failure but the access
		 * does not, do a small write.  The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
			if (((asmp->numStripeUnitsAccessed <=
			      (layoutPtr->numDataCol / 2)) &&
			     (layoutPtr->numDataCol != 1)) ||
			    (asmp->parityInfo->next != NULL) ||
			    rf_CheckStripeForFailures(raidPtr, asmp)) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
		} else
			if (asmp->numParityFailed == 1)
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				if (asmp->numStripeUnitsAccessed != 1 &&
				    failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
	}
}
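
/*
 * Illustrative example of the write selection above, using assumed numbers:
 * in a fault-free array with numDataCol = 6, a write touching 2 stripe units
 * (2 <= 6 / 2) selects rf_CreateParityLoggingSmallWriteDAG, while a write
 * touching 5 of the 6 stripe units (with a single parity extent and no other
 * failures in the stripe) selects rf_CreateParityLoggingLargeWriteDAG.
 */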

#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */