 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
/*
 * DESCRIPTION: Main RAID driver source file containing open, close and I/O
 *
 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *	raid_open()		- open the RAID metadevice for access.
 *	raid_internal_open()	- internal open routine of RAID metadevice.
 *	md_raid_strategy()	- perform normal I/O operations,
 *				  such as read and write.
 *	raid_close()		- close the RAID metadevice.
 *	raid_internal_close()	- internal close routine of RAID metadevice.
 *	raid_snarf()		- initialize and clean up MDD records.
 *	raid_halt()		- reset the RAID metadevice
 *	raid_line()		- return the line # of this segment
 *	raid_dcolumn()		- return the data column # of this segment
 *	raid_pcolumn()		- return the parity column # of this segment
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/t_lock.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_convert.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
md_ops_t		*md_interface_ops = &raid_md_ops;

extern unit_t		md_nunits;
extern unit_t		md_nsets;
extern md_set_t		md_set[];

extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern int		md_sleep_for_test;

extern md_event_queue_t	*md_event_queue;
int raid_total_io = 0;

int raid_no_bpmaps = 0;

int raid_1024_8192 = 0;
int raid_8192_bigger = 0;
int raid_line_lock_wait = 0;

int data_buffer_waits = 0;
int parity_buffer_waits = 0;

/* writer line locks */
int raid_writer_locks = 0;		/* total writer locks */
int raid_write_waits = 0;		/* total writer locks that waited */
int raid_full_line_writes = 0;		/* total full line writes */
int raid_write_queue_length = 0;	/* wait queue length */
int raid_max_write_q_length = 0;	/* maximum queue length */
int raid_write_locks_active = 0;	/* writer locks at any time */
int raid_max_write_locks = 0;		/* maximum writer locks active */

/* read line locks */
int raid_reader_locks = 0;		/* total reader locks held */
int raid_reader_locks_active = 0;	/* reader locks held */
int raid_max_reader_locks = 0;		/* maximum reader locks held in run */
int raid_read_overlaps = 0;		/* number of times 2 reads hit same line */
int raid_read_waits = 0;		/* times a reader waited on writer */

int raid_prewrite_waits = 0;		/* number of waits for a pw slot */
int raid_pw = 0;			/* number of pw slots in use */
int raid_prewrite_max = 0;		/* maximum number of pw slots in use */
int raid_pw_invalidates = 0;

static clock_t md_wr_wait = 0;

int nv_available = 0;	/* presence of nv-ram support in device */
int nv_prewrite = 1;	/* mark prewrites with nv_available */
int nv_parity = 1;	/* mark parity with nv_available */
kmem_cache_t	*raid_parent_cache = NULL;
kmem_cache_t	*raid_child_cache = NULL;
kmem_cache_t	*raid_cbuf_cache = NULL;

int raid_internal_open(minor_t mnum, int flag, int otyp,
static void	freebuffers(md_raidcs_t *cs);
static int	raid_read(mr_unit_t *un, md_raidcs_t *cs);
static void	raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
static int	raid_write(mr_unit_t *un, md_raidcs_t *cs);
static void	raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
static void	raid_stage(md_raidcs_t *cs);
static void	raid_enqueue(md_raidcs_t *cs);
static diskaddr_t	raid_line(diskaddr_t segment, mr_unit_t *un);
uint_t	raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
static void	getpbuffer(md_raidcs_t *cs);
static void	getdbuffer(md_raidcs_t *cs);
static void	raid_done(buf_t *bp);
static void	raid_io_startup(mr_unit_t *un);
raid_col2unit(rcs_state_t state, rus_state_t unitstate)
{
	if (unitstate & RUS_LAST_ERRED)
		return (RUS_LAST_ERRED);

	panic("raid_col2unit");
}
raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
{
	rus_state_t	unitstate, origstate;
	rcs_state_t	colstate;
	rcs_state_t	orig_colstate;
	int		errcnt = 0, okaycnt = 0, resynccnt = 0;

	ASSERT(col < un->un_totalcolumncnt);
	ASSERT(newstate &
	    (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
	    RCS_LAST_ERRED | RCS_REGEN));
	ASSERT((newstate &
	    ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
	    RCS_LAST_ERRED | RCS_REGEN)) == 0);

	ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);

	unitstate = un->un_state;
	origstate = unitstate;

	un->un_column[col].un_devstate = newstate;
	un->un_state = raid_col2unit(newstate, unitstate);
	uniqtime32(&un->un_column[col].un_devtimestamp);
	uniqtime32(&un->un_timestamp);

	ASSERT(un->un_state &
	    (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
	ASSERT((un->un_state & ~(RUS_INIT |
	    RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);

	if (un->un_column[col].un_devstate == newstate)
		return;

	if (newstate == RCS_REGEN) {
		if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
			return;
		un->un_state = RUS_REGEN;
		return;
	}

	orig_colstate = un->un_column[col].un_devstate;

	/*
	 * if there is another column in the error state then this
	 * column should go to the last errored state
	 */
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		colstate = un->un_column[i].un_devstate;
		if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
			errcnt++;
		if (colstate & RCS_OKAY)
			okaycnt++;
		if (colstate & RCS_RESYNC)
			resynccnt++;
	}
	ASSERT(resynccnt < 2);

	if (okaycnt == un->un_totalcolumncnt)
		unitstate = RUS_OKAY;
	else if (errcnt > 1) {
		unitstate = RUS_LAST_ERRED;
		if (newstate & RCS_ERRED)
			newstate = RCS_LAST_ERRED;
	} else if (errcnt == 1)
		if (!(unitstate & RUS_LAST_ERRED))
			unitstate = RUS_ERRED;

	if (un->un_state == RUS_DOI)

	un->un_column[col].un_devstate = newstate;
	uniqtime32(&un->un_column[col].un_devtimestamp);
	/*
	 * if there is a last errored column being brought back online
	 * by open or snarf, then be sure to clear the RUS_LAST_ERRED
	 * bit to allow writes.  If there is a real error then the
	 * column will go back into last erred.
	 */
	if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
	    (raid_state_cnt(un, RCS_ERRED) == 1))
		unitstate = RUS_ERRED;
	un->un_state = unitstate;
	uniqtime32(&un->un_timestamp);

	if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
	    (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
		devname = md_devname(MD_UN2SET(un),
		    un->un_column[col].un_dev, NULL, 0);

		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
		    md_shortname(MD_SID(un)), devname);

		if (unitstate & RUS_LAST_ERRED) {
			cmn_err(CE_WARN, "md: %s: %s last erred",
			    md_shortname(MD_SID(un)), devname);

		} else if (un->un_column[col].un_devflags &
		    MD_RAID_DEV_ISOPEN) {
			/*
			 * Close the broken device and clear the open flag on
			 * it.  We have to check that the device is open,
			 * otherwise the first open on it has resulted in the
			 * error that is being processed and the actual un_dev
			 */
			md_layered_close(un->un_column[col].un_dev,
			    MD_OFLG_NULL);
			un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
		}
	} else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
	    un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
		/*
		 * Similar to logic above except no log messages since we
		 * are just transitioning from Last Erred to Erred.
		 */
		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
	}

	/*
	 * If a resync has completed, see if there is a Last Erred
	 * component that we can change to the Erred state.
	 */
	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			if ((un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
				raid_set_state(un, i, RCS_ERRED, 0);
			}
		}
	}
}
/*
 * NAME:	erred_check_line
 *
 * DESCRIPTION: Return the type of write to perform on an erred column based
 *		upon any resync activity.
 *
 *		If a column is being resynced and the write is above the
 *		resync point, the write may also have to go to the target
 *		being resynced.
 *
 *		Column state may make it impossible to do the write
 *		in which case RCL_EIO or RCL_ENXIO is returned.
 *
 *		If a column cannot be written directly, RCL_ERRED is
 *		returned and processing should proceed accordingly.
 *
 * PARAMETERS:	minor_t		mnum - minor number identity of metadevice
 *		md_raidcs_t	*cs - child save structure
 *		mr_column_t	*dcolumn - pointer to data column structure
 *		mr_column_t	*pcolumn - pointer to parity column structure
 *
 * RETURNS:	RCL_OKAY, RCL_ERRED
 *
 * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
 */
erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
{
	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);

	if (column->un_devstate & RCS_OKAY)
		return (RCL_OKAY);

	if (column->un_devstate & RCS_ERRED)
		return (RCL_ERRED); /* do not read from errored disk */
	/*
	 * for the last errored case there are two considerations.
	 * When the last errored column is the only errored column then
	 * do treat it like a maintenance column, not doing I/O from
	 * it.  When there are other failures then just attempt
	 */
	if (column->un_devstate & RCS_LAST_ERRED)
		return (RCL_ERRED);

	ASSERT(column->un_devstate & RCS_RESYNC);

	/*
	 * When a resync from a hotspare is being done (copy resync)
	 * then always treat it as an OKAY column, since no regen
	 */
	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
		return (RCL_OKAY);
	}

	mutex_enter(&un->un_mx);
	if (cs->cs_line < un->un_resync_line_index) {
		mutex_exit(&un->un_mx);
		return (RCL_OKAY);
	}
	mutex_exit(&un->un_mx);
}
/*
 * NAMES:	raid_state_cnt
 *
 * DESCRIPTION: counts the number of columns in a specific state
 *
 * PARAMETERS:	md_raid_t *un
 *		rcs_state_t state
 */
raid_state_cnt(mr_unit_t *un, rcs_state_t state)
{
	for (i = 0; i < un->un_totalcolumncnt; i++)
		if (un->un_column[i].un_devstate & state)
			cnt++;
	return (cnt);
}
/*
 * NAMES:	raid_io_overlaps
 *
 * DESCRIPTION: checks for overlap of 2 child save structures
 *
 * PARAMETERS:	md_raidcs_t cs1
 *		md_raidcs_t cs2
 *
 * RETURNS:	0 - no overlap
 *		1 - overlap
 */
raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
{
	if (cs1->cs_blkno > cs2->cs_lastblk)
		return (0);
	if (cs1->cs_lastblk < cs2->cs_blkno)
		return (0);
	return (1);
}
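
/*
 * Illustrative note (not part of the original driver): raid_io_overlaps()
 * above is the standard closed-interval disjointness test.  A minimal
 * stand-alone sketch, with hypothetical names, behaves the same way:
 *
 *	int
 *	ranges_overlap(uint64_t b1, uint64_t e1, uint64_t b2, uint64_t e2)
 *	{
 *		if (b1 > e2 || e1 < b2)
 *			return (0);	-- disjoint block ranges
 *		return (1);		-- ranges share at least one block
 *	}
 *
 * For example, blocks [0,9] and [5,14] overlap; [0,9] and [10,19] do not.
 */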
/*
 * NAMES:	raid_parent_constructor
 * DESCRIPTION: parent structure constructor routine
 */
raid_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_raidps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
	    NULL, MUTEX_DEFAULT, NULL);
}

raid_parent_init(md_raidps_t *ps)
{
	bzero(ps, offsetof(md_raidps_t, ps_mx));
	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
}

raid_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
}
/*
 * NAMES:	raid_child_constructor
 * DESCRIPTION: child structure constructor routine
 */
raid_child_constructor(void *p, void *d1, int d2)
{
	md_raidcs_t	*cs = (md_raidcs_t *)p;

	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
	bioinit(&cs->cs_dbuf);
	bioinit(&cs->cs_pbuf);
	bioinit(&cs->cs_hbuf);
}

raid_child_init(md_raidcs_t *cs)
{
	bzero(cs, offsetof(md_raidcs_t, cs_mx));

	md_bioreset(&cs->cs_dbuf);
	md_bioreset(&cs->cs_pbuf);
	md_bioreset(&cs->cs_hbuf);

	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain = (struct buf *)cs;

	cs->cs_magic = RAID_CSMAGIC;
	cs->cs_line = MD_DISKADDR_ERROR;
}

raid_child_destructor(void *p, void *d)
{
	biofini(&((md_raidcs_t *)p)->cs_dbuf);
	biofini(&((md_raidcs_t *)p)->cs_hbuf);
	biofini(&((md_raidcs_t *)p)->cs_pbuf);
	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
}

raid_cbuf_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
}

raid_cbuf_init(md_raidcbuf_t *cb)
{
	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
	md_bioreset(&cb->cbuf_bp);
	cb->cbuf_magic = RAID_BUFMAGIC;
	cb->cbuf_pwslot = -1;
	cb->cbuf_flags = CBUF_WRITE;
}

raid_cbuf_destructor(void *p, void *d)
{
	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
}

/*
 * NAMES:	raid_run_queue
 * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
 */
raid_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * NAME:	raid_build_pwslot
 * DESCRIPTION: builds mr_pw_reserve for the column
 * PARAMETERS:	un is the pointer to the unit structure
 *		colindex is the column to create the structure for
 */
raid_build_pw_reservation(mr_unit_t *un, int colindex)
{
	pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
	    (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
	pw->pw_magic = RAID_PWMAGIC;
	pw->pw_column = colindex;
	pw->pw_free = un->un_pwcnt;

	for (i = 0; i < un->un_pwcnt; i++) {
		sb[i].sb_column = colindex;
		sb[i].sb_flags = SB_UNUSED;
		sb[i].sb_start_blk = 0;
		sb[i].sb_last_blk = 0;
	}
	un->un_column_ic[colindex].un_pw_reserve = pw;
}
/*
 * NAME:	raid_free_pw_reservation
 * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		int colindex - index of the column whose pre-write slot struct
 *			is to be destroyed.
 */
raid_free_pw_reservation(mr_unit_t *un, int colindex)
{
	mr_pw_reserve_t	*pw = un->un_column_ic[colindex].un_pw_reserve;

	kmem_free(pw, sizeof (mr_pw_reserve_t) +
	    (sizeof (mr_scoreboard_t) * un->un_pwcnt));
}
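
/*
 * Illustrative note (not part of the original driver): the pair above uses
 * the common C "trailing array" allocation idiom -- a single kmem_zalloc()
 * sized as header plus (slot count x slot size).  A sketch of the sizing,
 * for clarity only:
 *
 *	size_t sz = sizeof (mr_pw_reserve_t) +
 *	    sizeof (mr_scoreboard_t) * pwcnt;	-- header + pwcnt slots
 *
 * The free must recompute the identical size, as raid_free_pw_reservation()
 * does, because kmem_free() requires the original allocation length.
 */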
/*
 * NAME:	raid_cancel_pwslot
 * DESCRIPTION: RAID metadevice write routine
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
raid_cancel_pwslot(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	if (cs->cs_ps->ps_flags & MD_RPS_READ)
		return;
	if (cs->cs_dpwslot != -1) {
		col = &un->un_column_ic[cs->cs_dcolumn];
		pw = col->un_pw_reserve;
		sb = &pw->pw_sb[cs->cs_dpwslot];
		sb->sb_flags = SB_AVAIL;
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))

	}
	if (cs->cs_ppwslot != -1) {
		col = &un->un_column_ic[cs->cs_pcolumn];
		pw = col->un_pw_reserve;
		sb = &pw->pw_sb[cs->cs_ppwslot];
		sb->sb_flags = SB_AVAIL;
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))

	}
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
		if (cbuf->cbuf_pwslot == -1)
			continue;
		col = &un->un_column_ic[cbuf->cbuf_column];
		pw = col->un_pw_reserve;
		sb = &pw->pw_sb[cbuf->cbuf_pwslot];
		sb->sb_flags = SB_AVAIL;
		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))

	}
	cv_broadcast(&un->un_cv);

	mutex_enter(&un->un_mx);
	if (un->un_rflags & MD_RFLAG_NEEDPW)
		cv_broadcast(&un->un_cv);
	mutex_exit(&un->un_mx);
}
raid_free_pwinvalidate(md_raidcs_t *cs)
{
	md_raidcbuf_t	*cbuf_to_free;
	mr_unit_t	*un = cs->cs_un;
	mdi_unit_t	*ui = MDI_UNIT(MD_SID(un));

	cbuf = cs->cs_pw_inval_list;

	mutex_enter(&un->un_linlck_mx);
	pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
	ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
	sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
	sb[cbuf->cbuf_pwslot].sb_cs = NULL;
	if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))

	cbuf = cbuf->cbuf_next;
	kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
	kmem_cache_free(raid_cbuf_cache, cbuf_to_free);

	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
	/*
	 * now that there is a free prewrite slot, check to see if there
	 * are any io operations waiting.  First wake up raid_io_startup,
	 * then signal the processes waiting in raid_write.
	 */
	if (ui->ui_io_lock->io_list_front)
		raid_io_startup(un);
	mutex_exit(&un->un_linlck_mx);

	cv_broadcast(&un->un_cv);

	mutex_enter(&un->un_mx);
	if (un->un_rflags & MD_RFLAG_NEEDPW)
		cv_broadcast(&un->un_cv);
	mutex_exit(&un->un_mx);
}
raid_get_pwslot(md_raidcs_t *cs, int column)
{
	mr_unit_t	*un = cs->cs_un;
	diskaddr_t	start_blk = cs->cs_blkno;
	diskaddr_t	last_blk = cs->cs_lastblk;
	int		pwcnt = un->un_pwcnt;

	/* start with the data column */
	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;

	ASSERT(pw->pw_free > 0);
	for (i = 0; i < pwcnt; i++) {
		flags = sb[i].sb_flags;
		if (flags & SB_INVAL_PEND)
			continue;

		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
			avail = i;

		if ((start_blk > sb[i].sb_last_blk) ||
		    (last_blk < sb[i].sb_start_blk))
			continue;

		ASSERT(! (sb[i].sb_flags & SB_INUSE));
	}
	/*
	 * raid_invalidate_pwslot attempts to zero out a prewrite entry
	 * in parallel with other disk reads/writes related to the current
	 * transaction.  However, cs_frags accounting for this case is
	 * broken because raid_write_io resets cs_frags, i.e. ignoring
	 * that it could have been set to a value > 0 by
	 * raid_invalidate_pwslot.  While this can be fixed, an
	 * additional problem is that we don't seem to handle
	 * correctly the case of getting a disk error for prewrite
	 * entry invalidation.
	 * It does not look like we really need
	 * to invalidate prewrite slots, because raid_replay sorts
	 * prewrite id's in ascending order and during recovery the
	 * latest prewrite entry for the same block will be replayed
	 * last.  That's why I ifdef'd out the call to
	 * raid_invalidate_pwslot.  --aguzovsk@east
	 */
	ASSERT(! (sb[use].sb_flags & SB_INUSE));
	sb[use].sb_flags = SB_INUSE;

	sb[use].sb_start_blk = start_blk;
	sb[use].sb_last_blk = last_blk;
	ASSERT((use >= 0) && (use < un->un_pwcnt));
	return (use);
}

raid_check_pw(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
	/*
	 * check to be sure there is a prewrite slot available
	 * if not just return.
	 */
	if (cs->cs_flags & MD_RCS_LINE) {
		for (i = 0; i < un->un_totalcolumncnt; i++)
			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
				return (1);
		return (0);
	}

	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
		return (1);
	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
		return (1);
	return (0);
}

raid_alloc_pwslot(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
	if (raid_check_pw(cs))
		return (1);

	mutex_enter(&un->un_mx);
	cs->cs_pwid = un->un_pwid;
	mutex_exit(&un->un_mx);

	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
	}
	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);

	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;

	return (0);
}
/*
 * NAMES:	raid_build_incore
 * DESCRIPTION: RAID metadevice incore structure building routine
 * PARAMETERS:	void *p - pointer to a unit structure
 *		int snarfing - a flag to indicate snarfing is required
 */
raid_build_incore(void *p, int snarfing)
{
	mr_unit_t	*un = (mr_unit_t *)p;
	minor_t		mnum = MD_SID(un);
	mddb_recid_t	hs_recid = 0;
	int		resync_cnt = 0, error_cnt = 0;
	/* clear out bogus pointer in case we return(1) prior to alloc */
	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	if (MD_UNIT(mnum) != NULL)
		return (0);

	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
	    KM_SLEEP);

	un->un_column_ic = (mr_column_ic_t *)
	    kmem_zalloc(sizeof (mr_column_ic_t) *
	    un->un_totalcolumncnt, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {

		column = &un->un_column[i];
		preserve_flags = column->un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
		column->un_devflags &=
		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |

		if (raid_build_pw_reservation(un, i) != 0) {
			/* could not build pwslot */
			return (1);
		}

		set_t	setno = MD_MIN2SET(mnum);
		dev = md_getdevnum(setno, mddb_getsidenum(setno),
		    column->un_orig_key, MD_NOTRUST_DEVT);
		/*
		 * Comment out instead of remove so we have history.
		 * In the pre-SVM releases the stored devt is used, so
		 * as long as there is one, snarf is always happy,
		 * even if the component is powered off.  This is not
		 * the case in the current SVM implementation.  NODEV64
		 * can be returned and in this case, since we resolve
		 * the devt at 'open' time (first use of the metadevice),
		 * we will allow snarf to continue.
		 *
		 * if (dev == NODEV64)
		 */

		/*
		 * Setup un_orig_dev from device id info if the device
		 * is valid (not NODEV64).
		 */
		column->un_orig_dev = dev;

		if (column->un_devstate & RCS_RESYNC)
			resync_cnt++;
		if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
			error_cnt++;

		if (HOTSPARED(un, i)) {
			(void) md_hot_spare_ifc(HS_MKDEV,
			    0, 0, 0, &column->un_hs_id, NULL,

		if (HOTSPARED(un, i)) {
			if (column->un_devstate &
			    (RCS_OKAY | RCS_LAST_ERRED)) {

				    column->un_hs_pwstart;
				column->un_devstart =
				    column->un_hs_devstart;

				    ~(MD_RAID_COPY_RESYNC |
				    MD_RAID_REGEN_RESYNC);
			} else if (column->un_devstate & RCS_RESYNC) {
				/*
				 * if previous system was 4.0 set
				 * the direction flags
				 */
				if ((preserve_flags &
				    (MD_RAID_COPY_RESYNC |
				    MD_RAID_REGEN_RESYNC)) == 0) {
					if (column->un_alt_dev !=

					    MD_RAID_REGEN_RESYNC;
				}
			}
		} else { /* no hot spares */
			column->un_dev = dev;
			column->un_pwstart = column->un_orig_pwstart;
			column->un_devstart = column->un_orig_devstart;
			if (column->un_devstate & RCS_RESYNC) {
				preserve_flags |= MD_RAID_REGEN_RESYNC;
				preserve_flags &= ~MD_RAID_COPY_RESYNC;
			}
		}
		if (! (column->un_devstate & RCS_RESYNC)) {

			    ~(MD_RAID_REGEN_RESYNC |
			    MD_RAID_COPY_RESYNC);
		}

		column->un_devflags = preserve_flags;
		column->un_alt_dev = NODEV64;
		column->un_alt_pwstart = 0;
		column->un_alt_devstart = 0;
		un->un_resync_line_index = 0;
		un->un_resync_index = 0;
		un->un_percent_done = 0;
	}

	if (resync_cnt && error_cnt) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			column = &un->un_column[i];
			if (HOTSPARED(un, i) &&
			    (column->un_devstate & RCS_RESYNC) &&
			    (column->un_devflags & MD_RAID_COPY_RESYNC))
				/* hotspare has data */
				continue;

			if (HOTSPARED(un, i) &&
			    (column->un_devstate & RCS_RESYNC)) {
				/* hotspare does not have data */
				raid_hs_release(HS_FREE, un, &hs_recid, i);
				column->un_dev = column->un_orig_dev;
				column->un_pwstart = column->un_orig_pwstart;
				column->un_devstart = column->un_orig_devstart;
				mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
			}

			if (column->un_devstate & RCS_ERRED)
				column->un_devstate = RCS_LAST_ERRED;

			if (column->un_devstate & RCS_RESYNC)
				column->un_devstate = RCS_ERRED;
		}
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
	}

	un->un_pwid = 1; /* or some other possible value */
	un->un_magic = RAID_UNMAGIC;
	iosize = un->un_iosize;
	un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
	un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
	mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
	un->un_linlck_chn = NULL;

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
/*
 * DESCRIPTION: RAID metadevice reset routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		minor_t mnum - RAID metadevice minor number
 *		int removing - a flag to imply removing device name from
 */
reset_raid(mr_unit_t *un, minor_t mnum, int removing)
{
	mr_column_t	*column;
	int		column_cnt = un->un_totalcolumncnt;
	mddb_recid_t	*recids, vtoc_id;

	ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
	    (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));

	md_destroy_unit_incore(mnum, &raid_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	if (un->un_pbuffer) {
		kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
		un->un_pbuffer = NULL;
	}
	if (un->un_dbuffer) {
		kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
		un->un_dbuffer = NULL;
	}

	/* free all pre-write slots created during build incore */
	for (i = 0; i < un->un_totalcolumncnt; i++)
		raid_free_pw_reservation(un, i);

	kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
	    un->un_totalcolumncnt);

	kmem_free(un->mr_ic, sizeof (*un->mr_ic));

	/*
	 * Attempt release of its minor node
	 */
	md_remove_minor_node(mnum);

	sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
	    KM_SLEEP);

	recids = (mddb_recid_t *)
	    kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);

	for (i = 0; i < column_cnt; i++) {

		md_dev64_t	comp_dev;

		column = &un->un_column[i];
		sv[i].setno = MD_MIN2SET(mnum);
		sv[i].key = column->un_orig_key;
		if (HOTSPARED(un, i)) {
			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))

			raid_hs_release(hserr, un, &recids[n++], i);
		}
		/*
		 * deparent any metadevices.
		 * NOTE: currently soft partitions are the only metadevices
		 * allowed in RAID metadevices.
		 */
		comp_dev = column->un_dev;
		if (md_getmajor(comp_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(comp_dev));
			recids[n++] = MD_RECID(comp_un);
			md_reset_parent(comp_dev);
		}
	}

	/* decrement the reference count of the old hsp */
	if (un->un_hsp_id != -1)
		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
		    &recids[n++], NULL, NULL, NULL);

	MD_STATUS(un) |= MD_UN_BEING_RESET;
	vtoc_id = un->c.un_vtoc_id;

	raid_commit(un, recids);

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* Remove the vtoc, if present */
	mddb_deleterec_wrapper(vtoc_id);
	md_rem_names(sv, column_cnt);
	kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
	kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
	    MD_MIN2SET(mnum), mnum);
}
/*
 * NAMES:	raid_error_parent
 * DESCRIPTION: mark a parent structure in error
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *		int error - error value to set
 * NOTE:	(TBR) - this routine currently is not in use.
 */
raid_error_parent(md_raidps_t *ps, int error)
{
	mutex_enter(&ps->ps_mx);
	ps->ps_flags |= MD_RPS_ERROR;
	ps->ps_error = error;
	mutex_exit(&ps->ps_mx);
}
/*
 * The following defines tell raid_free_parent
 *	RFP_RLS_LOCK		release the unit reader lock when done.
 *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
 *	RFP_DECR_FRAGS		decrement ps_frags
 *	RFP_DECR_READFRAGS	read keeps FRAGS and PWFRAGS in lockstep
 */
#define	RFP_RLS_LOCK		0x00001
#define	RFP_DECR_PWFRAGS	0x00002
#define	RFP_DECR_FRAGS		0x00004
#define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
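
/*
 * Usage sketch (illustrative): a read-side completion combines the frag
 * decrements with the lock release in a single call, as the read paths
 * later in this file do, e.g.:
 *
 *	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
 *
 * RFP_DECR_READFRAGS expands to both decrement flags, which is what keeps
 * ps_frags and ps_pwfrags in lockstep for reads.
 */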
/*
 * NAMES:	raid_free_parent
 * DESCRIPTION: free a parent structure
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *		int todo - indicates what needs to be done
 */
raid_free_parent(md_raidps_t *ps, int todo)
{
	mdi_unit_t	*ui = ps->ps_ui;

	ASSERT(ps->ps_magic == RAID_PSMAGIC);
	ASSERT(ps->ps_flags & MD_RPS_INUSE);
	mutex_enter(&ps->ps_mx);
	if (todo & RFP_DECR_PWFRAGS) {
		ASSERT(ps->ps_pwfrags);

		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
			if (ps->ps_flags & MD_RPS_ERROR) {
				ps->ps_bp->b_flags |= B_ERROR;
				ps->ps_bp->b_error = ps->ps_error;
			}
			md_kstat_done(ui, ps->ps_bp, 0);

			ps->ps_flags |= MD_RPS_IODONE;
		}
	}

	if (todo & RFP_DECR_FRAGS) {
		ASSERT(ps->ps_frags);
	}

	if (ps->ps_frags != 0) {
		mutex_exit(&ps->ps_mx);
		return;
	}

	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
	mutex_exit(&ps->ps_mx);

	if (todo & RFP_RLS_LOCK)
		md_io_readerexit(ui);

	ps->ps_flags |= MD_RPS_DONE;

	if (ps->ps_flags & MD_RPS_HSREQ)
		(void) raid_hotspares();

	ASSERT(todo & RFP_RLS_LOCK);
	ps->ps_flags &= ~MD_RPS_INUSE;

	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));

	kmem_cache_free(raid_parent_cache, ps);
}
/*
 * NAMES:	raid_free_child
 * DESCRIPTION: free a child structure
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *		int drop_locks - 0 for no locks held
 * NOTE:	(TBR) - this routine currently is not in use.
 */
raid_free_child(md_raidcs_t *cs, int drop_locks)
{
	mr_unit_t	*un = cs->cs_un;
	md_raidcbuf_t	*cbuf, *cbuf1;

	if (cs->cs_pw_inval_list)
		raid_free_pwinvalidate(cs);

	ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
	    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
	md_unit_readerexit(MDI_UNIT(MD_SID(un)));

	ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));

	cbuf = cs->cs_buflist;

	cbuf1 = cbuf->cbuf_next;
	kmem_cache_free(raid_cbuf_cache, cbuf);

	if (cs->cs_dbuf.b_flags & B_REMAPPED)
		bp_mapout(&cs->cs_dbuf);
	kmem_cache_free(raid_child_cache, cs);
}
/*
 * NAME:	raid_regen_parity
 *
 * DESCRIPTION: This routine is used to regenerate the parity blocks
 *		for the entire raid device.  It is called from
 *		both the regen thread and the IO path.
 *
 *		On error the entire device is marked as in error by
 *		placing the erroring device in error and all other
 *		devices in last_errored.
 *
 * PARAMETERS:	md_raidcs_t	*cs
 */
raid_regen_parity(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;
	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
	caddr_t		parity_buffer;
	uint_t		*dbuf, *pbuf;
	uint_t		colcnt = un->un_totalcolumncnt;
	int		parity_column = cs->cs_pcolumn;

	/*
	 * This routine uses the data and parity buffers allocated to a
	 * write.  In the case of a read the buffers are allocated and
	 */
	ASSERT(IO_READER_HELD(un));
	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
	ASSERT(UNIT_READER_HELD(un));

	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
		return;

	if (cs->cs_flags & MD_RCS_READER) {

	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
	bcount = cs->cs_bcount;
	buffer = cs->cs_dbuffer;
	parity_buffer = cs->cs_pbuffer;
	bzero(parity_buffer, bcount);

	for (column = 0; column < colcnt; column++) {
		if (column == parity_column)
			continue;
		reset_buf(bp, B_READ | B_BUSY, bcount);
		bp->b_un.b_addr = buffer;
		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
		bp->b_bcount = bcount;
		bp->b_bufsize = bcount;
		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);

		pbuf = (uint_t *)(void *)parity_buffer;
		dbuf = (uint_t *)(void *)buffer;
		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
			*pbuf = *pbuf ^ *dbuf;
			pbuf++;
			dbuf++;
		}
	}

	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
	bp->b_un.b_addr = parity_buffer;
	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
	bp->b_bcount = bcount;
	bp->b_bufsize = bcount;
	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);

	if (cs->cs_flags & MD_RCS_READER) {
		cs->cs_pbuffer = NULL;
		cs->cs_dbuffer = NULL;
	}
	bp->b_chain = (struct buf *)cs;

	if (cs->cs_flags & MD_RCS_READER) {
		cs->cs_pbuffer = NULL;
		cs->cs_dbuffer = NULL;
	}
	md_unit_readerexit(ui);
	un = md_unit_writerlock(ui);
	raid_set_state(un, column, RCS_ERRED, 0);
	for (column = 0; column < colcnt; column++)
		raid_set_state(un, column, RCS_ERRED, 0);
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	un = md_unit_readerlock(ui);
	bp->b_chain = (struct buf *)cs;
}
/*
 * NAMES:	raid_error_state
 * DESCRIPTION: check unit and column states' impact on I/O error
 *		NOTE: the state now may not be the state when the
 *		I/O completed due to race conditions.
 * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 *		buf_t	  *bp - pointer to buffer structure
 */
raid_error_state(mr_unit_t *un, buf_t *bp)
{
	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_WRITER_HELD(un));

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {

		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
	}

	/* in case a replace snuck in while waiting on unit writer lock */

	(void) raid_set_state(un, column, RCS_ERRED, 0);
	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));

	raid_commit(un, NULL);
	if (un->un_state & RUS_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	} else if (un->un_state & RUS_LAST_ERRED) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}
}
/*
 * NAME:	raid_mapin_buf
 * DESCRIPTION: wait for the input buffer header to be mapped in
 * PARAMETERS:	md_raidps_t *ps
 */
raid_mapin_buf(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	/*
	 * check to see if the buffer is mapped.  If all is ok return the
	 * offset of the data and return.  Since it is expensive to grab
	 * a mutex this is only done if the mapin is not complete.
	 * Once the mutex is acquired it is possible that the mapin was
	 * not done so recheck and if necessary do the mapin.
	 */
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		return;
	}

	mutex_enter(&ps->ps_mapin_mx);
	if (ps->ps_mapin > 0) {
		cs->cs_addr = ps->ps_addr + cs->cs_offset;
		mutex_exit(&ps->ps_mapin_mx);
		return;
	}

	bp_mapin(ps->ps_bp);
	/*
	 * get the new b_addr out of the parent since bp_mapin just changed it
	 */
	ps->ps_addr = ps->ps_bp->b_un.b_addr;
	cs->cs_addr = ps->ps_addr + cs->cs_offset;

	mutex_exit(&ps->ps_mapin_mx);
}
/*
 * NAMES:	raid_read_no_retry
 * DESCRIPTION: I/O retry routine for a RAID metadevice read
 *		read failed attempting to regenerate the data,
 *		no retry possible, error occurred in raid_raidregenloop().
 * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
 *		md_raidcs_t *cs - pointer to child structure
 */
raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;

	raid_error_parent(ps, EIO);
	raid_free_child(cs, 1);

	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
}
/*
 * NAMES:	raid_read_retry
 * DESCRIPTION: I/O retry routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	/* re-initialize the buf_t structure for raid_read() */
	cs->cs_dbuf.b_chain = (struct buf *)cs;
	cs->cs_dbuf.b_back = &cs->cs_dbuf;
	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_dbuf.b_error = 0;	/* initialize error */
	cs->cs_dbuf.b_offset = -1;
	/* Initialize semaphores */
	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_pbuf.b_chain = (struct buf *)cs;
	cs->cs_pbuf.b_back = &cs->cs_pbuf;
	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_pbuf.b_error = 0;	/* initialize error */
	cs->cs_pbuf.b_offset = -1;
	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
	/*
	 * re-scheduling I/O with raid_read_io() is simpler.  basically,
	 * raid_read_io() is invoked again with the same child structure.
	 * (NOTE: we aren't supposed to do any error recovery when an I/O
	 * error occurred in raid_raidregenloop().)
	 */
	raid_read_io(un, cs);
}
/*
 * DESCRIPTION: I/O error handling routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 * LOCKS:	must obtain unit writer lock while calling raid_error_state
 *		since a unit or column state transition may take place.
 *		must obtain unit reader lock to retry I/O.
 */
raid_rderr(md_raidcs_t *cs)
{
	un = (mr_unit_t *)md_unit_writerlock(ui);

	if (cs->cs_dbuf.b_flags & B_ERROR)
		error = raid_error_state(un, &cs->cs_dbuf);
	if (cs->cs_pbuf.b_flags & B_ERROR)
		error |= raid_error_state(un, &cs->cs_pbuf);

	md_unit_writerexit(ui);

	ps->ps_flags |= MD_RPS_HSREQ;

	un = (mr_unit_t *)md_unit_readerlock(ui);

	/* now attempt the appropriate retry routine */
	(*(cs->cs_retry_call))(un, cs);
}
/*
 * NAMES:	raid_read_error
 * DESCRIPTION: I/O error handling routine for a RAID metadevice read
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
raid_read_error(md_raidcs_t *cs)
{
	setno = MD_UN2SET(un);

	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: read error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));

	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: read error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));

	md_unit_readerexit(ui);

	ASSERT(cs->cs_frags == 0);

	/* now schedule processing for possible state change */
	daemon_request(&md_mstr_daemon, raid_rderr,
	    (daemon_queue_t *)cs, REQ_OLD);
}
/*
 * DESCRIPTION: data buffer allocation for a child structure
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *
 * NOTE: always get dbuffer before pbuffer
 *	 and get both buffers before pwslot
 *	 otherwise a deadlock could be introduced.
 */
getdbuffer(md_raidcs_t *cs)
{
	cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
	if (cs->cs_dbuffer != NULL)
		return;
	un = cs->cs_ps->ps_un;
	mutex_enter(&un->un_mx);
	while (un->un_dbuffer == NULL) {
		STAT_INC(data_buffer_waits);
		un->un_rflags |= MD_RFLAG_NEEDBUF;
		cv_wait(&un->un_cv, &un->un_mx);
	}
	cs->cs_dbuffer = un->un_dbuffer;
	cs->cs_flags |= MD_RCS_UNDBUF;
	un->un_dbuffer = NULL;
	mutex_exit(&un->un_mx);
}
/*
 * DESCRIPTION: parity buffer allocation for a child structure
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 *
 * NOTE: always get dbuffer before pbuffer
 *	 and get both buffers before pwslot
 *	 otherwise a deadlock could be introduced.
 */
getpbuffer(md_raidcs_t *cs)
{
	cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
	if (cs->cs_pbuffer != NULL)
		return;
	un = cs->cs_ps->ps_un;
	mutex_enter(&un->un_mx);
	while (un->un_pbuffer == NULL) {
		STAT_INC(parity_buffer_waits);
		un->un_rflags |= MD_RFLAG_NEEDBUF;
		cv_wait(&un->un_cv, &un->un_mx);
	}
	cs->cs_pbuffer = un->un_pbuffer;
	cs->cs_flags |= MD_RCS_UNPBUF;
	un->un_pbuffer = NULL;
	mutex_exit(&un->un_mx);
}

getresources(md_raidcs_t *cs)
{
	md_raidcbuf_t	*cbuf;
	/*
	 * NOTE: always get dbuffer before pbuffer
	 *	 and get both buffers before pwslot
	 *	 otherwise a deadlock could be introduced.
	 */
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
		cbuf->cbuf_buffer =
		    kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
}
/*
 * NAMES:	freebuffers
 * DESCRIPTION: child structure buffer freeing routine
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
freebuffers(md_raidcs_t *cs)
{
	md_raidcbuf_t	*cbuf;

	/* free buffers used for full line write */
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
		if (cbuf->cbuf_buffer == NULL)
			continue;
		kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
		cbuf->cbuf_buffer = NULL;
		cbuf->cbuf_bcount = 0;
	}

	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
		mutex_enter(&un->un_mx);
	}
	if (cs->cs_dbuffer) {
		if (cs->cs_flags & MD_RCS_UNDBUF)
			un->un_dbuffer = cs->cs_dbuffer;
		else
			kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
	}
	if (cs->cs_pbuffer) {
		if (cs->cs_flags & MD_RCS_UNPBUF)
			un->un_pbuffer = cs->cs_pbuffer;
		else
			kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
	}
	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
		un->un_rflags &= ~MD_RFLAG_NEEDBUF;
		cv_broadcast(&un->un_cv);
		mutex_exit(&un->un_mx);
	}
}
/*
 * NAMES:	raid_line_reader_lock, raid_line_writer_lock
 * DESCRIPTION: RAID metadevice line reader and writer lock routines
 *		data column # and parity column #.
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
{
	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);

	cs->cs_flags |= MD_RCS_READER;
	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));

	mutex_enter(&un->un_linlck_mx);
	cs1 = un->un_linlck_chn;
	while (cs1 != NULL) {
		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
			if (raid_io_overlaps(cs, cs1) == 1)
				if (cs1->cs_flags & MD_RCS_WRITER)
					break;

		panic("md; raid line write lock held");
		un->un_linlck_flg = 1;
		cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
		STAT_INC(raid_read_waits);
	}
	STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
	STAT_INC(raid_reader_locks);
	cs1 = un->un_linlck_chn;

	cs1->cs_linlck_prev = cs;
	cs->cs_linlck_next = cs1;
	cs->cs_linlck_prev = NULL;
	un->un_linlck_chn = cs;
	cs->cs_flags |= MD_RCS_LLOCKD;
	if (resync_thread) {
		diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
		diskaddr_t line = (lastblk + 1) / un->un_segsize;

		ASSERT(raid_state_cnt(un, RCS_RESYNC));
		mutex_enter(&un->un_mx);
		un->un_resync_line_index = line;
		mutex_exit(&un->un_mx);
	}
	mutex_exit(&un->un_linlck_mx);
}
raid_line_writer_lock(md_raidcs_t *cs, int lock)
{
	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
	cs->cs_flags |= MD_RCS_WRITER;
	un = cs->cs_ps->ps_un;

	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	if (lock && !panicstr)
		mutex_enter(&un->un_linlck_mx);
	ASSERT(MUTEX_HELD(&un->un_linlck_mx));

	cs1 = un->un_linlck_chn;
	for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
		if (raid_io_overlaps(cs, cs1))
			break;

	panic("md: line writer lock inaccessible");

	if (raid_alloc_pwslot(cs)) {

		panic("md: no prewrite slots");
		STAT_INC(raid_prewrite_waits);
	}

	cs1 = un->un_linlck_chn;

	cs1->cs_linlck_prev = cs;
	cs->cs_linlck_next = cs1;
	cs->cs_linlck_prev = NULL;
	un->un_linlck_chn = cs;
	cs->cs_flags |= MD_RCS_LLOCKD;
	cs->cs_flags &= ~MD_RCS_WAITING;
	STAT_INC(raid_writer_locks);
	STAT_MAX(raid_max_write_locks, raid_write_locks_active);
	if (lock && !panicstr)
		mutex_exit(&un->un_linlck_mx);

	/* if this is already queued then do not requeue it */
	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
	if (!lock || (cs->cs_flags & MD_RCS_WAITING))

	cs->cs_flags |= MD_RCS_WAITING;

	if (lock && !panicstr)
		mutex_exit(&un->un_linlck_mx);
}
*cs
)
1852 mdi_unit_t
*ui
= cs
->cs_ps
->ps_ui
;
1853 mr_unit_t
*un
= cs
->cs_un
;
1855 un
= md_unit_readerlock(ui
);
1856 raid_write_io(un
, cs
);
1860 raid_io_startup(mr_unit_t
*un
)
1862 md_raidcs_t
*waiting_list
, *cs1
;
1863 md_raidcs_t
*previous
= NULL
, *next
= NULL
;
1864 mdi_unit_t
*ui
= MDI_UNIT(un
->c
.un_self_id
);
1865 kmutex_t
*io_list_mutex
= &ui
->ui_io_lock
->io_list_mutex
;
1867 ASSERT(MUTEX_HELD(&un
->un_linlck_mx
));
1868 mutex_enter(io_list_mutex
);
1871 * check to be sure there are no reader locks outstanding. If
1872 * there are not then pass on the writer lock.
1874 waiting_list
= ui
->ui_io_lock
->io_list_front
;
1875 while (waiting_list
) {
1876 ASSERT(waiting_list
->cs_flags
& MD_RCS_WAITING
);
1877 ASSERT(! (waiting_list
->cs_flags
& MD_RCS_LLOCKD
));
1878 for (cs1
= un
->un_linlck_chn
; cs1
; cs1
= cs1
->cs_linlck_next
)
1879 if (raid_io_overlaps(waiting_list
, cs1
) == 1)
		/*
		 * there was an IO that overlaps this IO, so go on to
		 * the next IO in the waiting list
		 */
		if (cs1) {
			previous = waiting_list;
			waiting_list = waiting_list->cs_linlck_next;
			continue;
		}

		/*
		 * There are no IOs that overlap this, so remove it from
		 * the waiting queue, and start it
		 */

		if (raid_check_pw(waiting_list)) {
			ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
			previous = waiting_list;
			waiting_list = waiting_list->cs_linlck_next;
			continue;
		}
		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);

		next = waiting_list->cs_linlck_next;
		if (previous)
			previous->cs_linlck_next = next;
		else
			ui->ui_io_lock->io_list_front = next;

		if (ui->ui_io_lock->io_list_front == NULL)
			ui->ui_io_lock->io_list_back = NULL;

		if (ui->ui_io_lock->io_list_back == waiting_list)
			ui->ui_io_lock->io_list_back = previous;

		waiting_list->cs_linlck_next = NULL;
		waiting_list->cs_flags &= ~MD_RCS_WAITING;
		STAT_DEC(raid_write_queue_length);
		if (raid_line_writer_lock(waiting_list, 0))
			panic("region locking corrupted");

		ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);

		daemon_request(&md_mstr_daemon, raid_startio,
		    (daemon_queue_t *)waiting_list, REQ_OLD);
		waiting_list = next;
	}
	mutex_exit(io_list_mutex);
}
raid_line_exit(md_raidcs_t *cs)
{
	un = cs->cs_ps->ps_un;
	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
	mutex_enter(&un->un_linlck_mx);
	if (cs->cs_flags & MD_RCS_READER)
		STAT_DEC(raid_reader_locks_active);
	else
		STAT_DEC(raid_write_locks_active);

	if (cs->cs_linlck_prev)
		cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
	else
		un->un_linlck_chn = cs->cs_linlck_next;
	if (cs->cs_linlck_next)
		cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;

	cs->cs_flags &= ~MD_RCS_LLOCKD;

	if (un->un_linlck_flg)
		cv_broadcast(&un->un_linlck_cv);

	un->un_linlck_flg = 0;
	cs->cs_line = MD_DISKADDR_ERROR;

	raid_cancel_pwslot(cs);
	/*
	 * now that the lock is dropped go ahead and see if there are any
	 * other writes that can be started up
	 */
	raid_io_startup(un);

	mutex_exit(&un->un_linlck_mx);
}
/*
 * NAMES:	raid_line, raid_pcolumn, raid_dcolumn
 * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
 *		data column # and parity column #.
 * PARAMETERS:	int segment - segment number
 *		mr_unit_t *un - pointer to an unit structure
 * RETURNS:	raid_line returns line #
 *		raid_dcolumn returns data column #
 *		raid_pcolumn returns parity column #
 */
raid_line(diskaddr_t segment, mr_unit_t *un)
{
	diskaddr_t	max_orig_segment;

	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
	if (segment >= max_orig_segment) {
		adj_seg = segment - max_orig_segment;
		line = adj_seg % un->un_segsincolumn;
	} else {
		line = segment / (un->un_origcolumncnt - 1);
	}
	return (line);
}

raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
{
	diskaddr_t	max_orig_segment;

	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
	if (segment >= max_orig_segment) {
		adj_seg = segment - max_orig_segment;
		column = un->un_origcolumncnt +
		    (uint_t)(adj_seg / un->un_segsincolumn);
	} else {
		line = segment / (un->un_origcolumncnt - 1);
		column = (uint_t)((segment %
		    (un->un_origcolumncnt - 1) + line) % un->un_origcolumncnt);
	}
	return (column);
}

raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
{
	diskaddr_t	max_orig_segment;

	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
	if (segment >= max_orig_segment) {
		adj_seg = segment - max_orig_segment;
		line = adj_seg % un->un_segsincolumn;
	} else {
		line = segment / (un->un_origcolumncnt - 1);
	}
	column = (uint_t)((line + (un->un_origcolumncnt - 1)) %
	    un->un_origcolumncnt);
	return (column);
}
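
/*
 * Worked example (illustrative, using the sample layout in the raid_iosetup
 * comment below: un_origcolumncnt = 4, un_segsincolumn = 10): for
 * segment #6, line = 6 / (4 - 1) = 2, data column = (6 % 3 + 2) % 4 = 2,
 * and parity column = (2 + 3) % 4 = 1 -- matching line#2 of that table,
 * where Col#1 holds parity and Col#2 holds Seg#6.
 */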
/*
 * Is called in raid_iosetup to probe each column to ensure
 * that all the columns are in 'okay' state and meet the
 * 'full line' requirement.  If any column is in error,
 * we don't want to enable the 'full line' flag.  Previously,
 * we would do so and disable it only when an error was
 * detected after the first 'full line' I/O, which is too late
 * and leads to potential data corruption.
 */
raid_check_cols(mr_unit_t *un)
{
	mr_column_t	*colptr;
	minor_t		mnum = MD_SID(un);

	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {

		colptr = &un->un_column[i];

		tmpdev = colptr->un_dev;
		/*
		 * If this device is hotspared
		 * use the hotspare key
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
		    colptr->un_hs_key : colptr->un_orig_key);

		if (tmpdev == NODEV64) {

		colptr->un_dev = tmpdev;

		bzero((caddr_t)&bp, sizeof (buf_t));

		bp.b_flags = (B_READ | B_BUSY);
		sema_init(&bp.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bp.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
		bp.b_lblkno = colptr->un_pwstart;
		bp.b_bcount = DEV_BSIZE;
		bp.b_bufsize = DEV_BSIZE;
		bp.b_un.b_addr = (caddr_t)buf;
		(void) md_call_strategy(&bp, 0, NULL);
	}

	kmem_free(buf, DEV_BSIZE);
/*
 * NAME:	raid_iosetup
 * DESCRIPTION: RAID metadevice specific I/O set up routine which does
 *		all the necessary calculations to determine the location
 *		of the segment for the I/O.
 * PARAMETERS:	mr_unit_t *un - unit number of RAID metadevice
 *		diskaddr_t blkno - block number of the I/O attempt
 *		size_t blkcnt - block count for this I/O
 *		md_raidcs_t *cs - child structure for each segmented I/O
 *
 * NOTE:	The following is an example of a raid disk layout:
 *
 *		Original Column = 4
 *		Segment Per Column = 10
 *
 *	Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
 *	-------------------------------------------------------------
 *	line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
 *	line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
 *	line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
 *	line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
 *	line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
 *	line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
 *	line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
 *	line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
 *	line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
 *	line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
 */
	diskaddr_t	segstart;

	/* calculate the segment# and offset for the block */
	segment = blkno / un->un_segsize;
	segstart = segment * un->un_segsize;
	segoff = blkno - segstart;
	iosize = un->un_iosize - 1;
	colcnt = un->un_totalcolumncnt - 1;
	line = raid_line(segment, un);
	cs->cs_dcolumn = raid_dcolumn(segment, un);
	cs->cs_pcolumn = raid_pcolumn(segment, un);
	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;

	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
	    (UNIT_STATE(un) & RCS_OKAY) &&
	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
	    (un->un_segsize < un->un_iosize) &&
	    (un->un_iosize <= un->un_maxio) &&
	    (blkno == line * un->un_segsize * colcnt) &&
	    (blkcnt >= ((un->un_totalcolumncnt - 1) * un->un_segsize)) &&
	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
	    (raid_check_cols(un) == 0)) {

		md_raidcbuf_t	**cbufp;
		md_raidcbuf_t	*cbuf;

		STAT_INC(raid_full_line_writes);
		leftover = blkcnt - (un->un_segsize * colcnt);
		ASSERT(blkcnt >= (un->un_segsize * colcnt));
		cs->cs_blkno = line * un->un_segsize;
		cs->cs_blkcnt = un->un_segsize;
		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
		cs->cs_bcount = dbtob(cs->cs_blkcnt);
		cs->cs_flags |= MD_RCS_LINE;

		cbufp = &cs->cs_buflist;
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			j = cs->cs_dcolumn + i;
			j = j % un->un_totalcolumncnt;

			if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
				continue;
			cbuf = kmem_cache_alloc(raid_cbuf_cache,

			raid_cbuf_init(cbuf);
			cbuf->cbuf_un = cs->cs_un;
			cbuf->cbuf_ps = cs->cs_ps;
			cbuf->cbuf_column = j;
			cbuf->cbuf_bcount = dbtob(un->un_segsize);

			cbufp = &cbuf->cbuf_next;
		}
	}

	leftover = blkcnt - (un->un_segsize - segoff);
	if (blkcnt > (un->un_segsize - segoff))

	if (blkcnt > (size_t)iosize) {
		leftover += (blkcnt - iosize);
	}

	/* calculate the line# and column# for the segment */
	cs->cs_flags &= ~MD_RCS_LINE;
	cs->cs_blkno = line * un->un_segsize + segoff;
	cs->cs_blkcnt = (uint_t)blkcnt;
	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
	cs->cs_bcount = dbtob((uint_t)blkcnt);
/*
 * DESCRIPTION: RAID metadevice I/O done interrupt routine
 * PARAMETERS:	struct buf *bp - pointer to a buffer structure
 */
raid_done(struct buf *bp)
{
	cs = (md_raidcs_t *)bp->b_chain;

	mutex_enter(&cs->cs_mx);
	if (bp->b_flags & B_ERROR) {
		cs->cs_flags |= MD_RCS_ERROR;
		cs->cs_flags &= ~(MD_RCS_ISCALL);
	}

	flags = cs->cs_flags;
	frags = --cs->cs_frags;
	mutex_exit(&cs->cs_mx);

	if (flags & MD_RCS_ERROR) {
		if (cs->cs_error_call) {
			daemon_request(&md_done_daemon, cs->cs_error_call,
			    (daemon_queue_t *)cs, REQ_OLD);
		}
		return;
	}

	if (flags & MD_RCS_ISCALL) {
		cs->cs_flags &= ~(MD_RCS_ISCALL);
		(*(cs->cs_call))(cs);
	} else
		daemon_request(&md_done_daemon, cs->cs_call,
		    (daemon_queue_t *)cs, REQ_OLD);
}
/*
 * the flag RIO_EXTRA is used when dealing with a column in the process
 * of being resynced.  During the resync, writes may have to take place
 * on both the original component and a hotspare component.
 */
#define	RIO_DATA	0x00100	/* use data buffer & data column */
#define	RIO_PARITY	0x00200	/* use parity buffer & parity column */
#define	RIO_WRITE	0x00400	/* issue a write */
#define	RIO_READ	0x00800	/* issue a read */
#define	RIO_PWIO	0x01000	/* do the I/O to the prewrite entry */
#define	RIO_ALT		0x02000	/* do write to alternate device */
#define	RIO_EXTRA	0x04000	/* use extra buffer */

#define	RIO_COLMASK	0x000ff	/* low byte carries (column + 1) */

#define	RIO_PREWRITE	(RIO_WRITE | RIO_PWIO)
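
/*
 * Illustrative sketch (not driver code; the name is hypothetical):
 * how raidio() callers pack a column override into the low byte of
 * the flags.  Zero in RIO_COLMASK means "use the child's own data or
 * parity column"; otherwise the low byte carries (column + 1), as in
 * the resync and regeneration loops below.
 */
static int
sketch_rio_column(int flags, int default_column)
{
	if (flags & RIO_COLMASK)	/* explicit column given */
		return ((flags & RIO_COLMASK) - 1);
	return (default_column);
}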
/*
 * NAMES:	raidio
 * DESCRIPTION: RAID metadevice I/O routine; issues a single read or
 *		write to one column (or its alternate) of a child I/O
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raidio(md_raidcs_t *cs, int flags)
{
	buf_t		*bp;
	int		column;
	int		flag;
	void		*private;
	mr_unit_t	*un;
	int		iosize;
	diskaddr_t	pwstart;
	diskaddr_t	devstart;
	md_dev64_t	dev;

	un = cs->cs_un;

	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_READER_HELD(un));

	if (flags & RIO_DATA) {
		if (flags & RIO_EXTRA)
			bp = &cs->cs_hbuf;
		else
			bp = &cs->cs_dbuf;
		bp->b_un.b_addr = cs->cs_dbuffer;
		column = cs->cs_dcolumn;
	} else {
		if (flags & RIO_EXTRA)
			bp = &cs->cs_hbuf;
		else
			bp = &cs->cs_pbuf;
		bp->b_un.b_addr = cs->cs_pbuffer;
		column = cs->cs_pcolumn;
	}

	/* an explicit column in the flags overrides the default */
	if (flags & RIO_COLMASK)
		column = (flags & RIO_COLMASK) - 1;

	bp->b_bcount = cs->cs_bcount;
	bp->b_bufsize = cs->cs_bcount;
	iosize = un->un_iosize;

	/* check if the hotspared device will be used */
	if (flags & RIO_ALT && (flags & RIO_WRITE)) {
		pwstart = un->un_column[column].un_alt_pwstart;
		devstart = un->un_column[column].un_alt_devstart;
		dev = un->un_column[column].un_alt_dev;
	} else {
		pwstart = un->un_column[column].un_pwstart;
		devstart = un->un_column[column].un_devstart;
		dev = un->un_column[column].un_dev;
	}

	/* if not writing to log skip log header */
	if ((flags & RIO_PWIO) == 0) {
		bp->b_lblkno = devstart + cs->cs_blkno;
		bp->b_un.b_addr += DEV_BSIZE;
	} else {
		bp->b_bcount += DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;
		if (flags & RIO_DATA) {
			bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
		} else { /* not DATA -> PARITY */
			bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
		}
	}

	bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
	bp->b_flags |= B_BUSY;
	if (flags & RIO_READ) {
		bp->b_flags |= B_READ;
	} else {
		bp->b_flags |= B_WRITE;
		if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
		    (nv_available && nv_prewrite && (flags & RIO_PWIO)))
			bp->b_flags |= nv_available;
	}
	bp->b_iodone = (int (*)())raid_done;
	bp->b_edev = md_dev64_to_dev(dev);

	ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));

	private = cs->cs_strategy_private;
	flag = cs->cs_strategy_flag;

	md_call_strategy(bp, flag, private);
}
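
/*
 * Illustrative sketch (not driver code; the name is hypothetical):
 * the two addressing modes raidio() uses.  Regular I/O lands at
 * devstart + blkno and skips the one-block pre-write header in the
 * buffer; pre-write I/O is directed at a slot in the pre-write log
 * and carries the header with it.
 */
static diskaddr_t
sketch_rio_lblkno(int pwio, diskaddr_t devstart, diskaddr_t blkno,
    diskaddr_t pwstart, diskaddr_t pwslot, diskaddr_t iosize)
{
	if (!pwio)
		return (devstart + blkno);	/* normal device I/O */
	return (pwslot * iosize + pwstart);	/* pre-write log slot */
}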
/*
 * NAMES:	genstandardparity
 * DESCRIPTION: This routine XORs the new data into the old data and
 *		old parity buffers to produce the new parity, and
 *		computes the checksums used in the pre-write headers.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
genstandardparity(md_raidcs_t *cs)
{
	uint_t		*dbuf, *pbuf;
	size_t		wordcnt;
	uint_t		dsum = 0;
	uint_t		psum = 0;

	ASSERT((cs->cs_bcount & 0x3) == 0);

	wordcnt = cs->cs_bcount / sizeof (uint_t);

	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);

	/* Word aligned */
	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
		uint_t	*uwbuf = (uint_t *)(void *)(cs->cs_addr);
		uint_t	uval;

		while (wordcnt--) {
			uval = *uwbuf++;
			/* new parity = old parity ^ old data ^ new data */
			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
			++pbuf;
			*dbuf = uval;
			dsum ^= uval;
			++dbuf;
		}
	} else {
		uchar_t	*ubbuf = (uchar_t *)(cs->cs_addr);
		union {
			uint_t	wb;
			uchar_t	bb[4];
		} cb;

		while (wordcnt--) {
			cb.bb[0] = *ubbuf++;
			cb.bb[1] = *ubbuf++;
			cb.bb[2] = *ubbuf++;
			cb.bb[3] = *ubbuf++;
			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
			++pbuf;
			*dbuf = cb.wb;
			dsum ^= cb.wb;
			++dbuf;
		}
	}

	RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
	    2, cs->cs_dcolumn, RAID_PWMAGIC);

	RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
	    2, cs->cs_pcolumn, RAID_PWMAGIC);
}
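
/*
 * Illustrative sketch (not driver code; the name is hypothetical):
 * the read-modify-write parity update genstandardparity() performs
 * word by word.  Given old data and old parity already read from
 * disk plus the new data, the new parity is
 * old_parity ^ old_data ^ new_data; the running XOR of each stream
 * doubles as the pre-write checksum.
 */
static uint_t
sketch_rmw_parity(uint_t *parity, uint_t *olddata, const uint_t *newdata,
    size_t words)
{
	uint_t	psum = 0;
	size_t	i;

	for (i = 0; i < words; i++) {
		parity[i] = parity[i] ^ olddata[i] ^ newdata[i];
		olddata[i] = newdata[i];	/* buffer now holds new data */
		psum ^= parity[i];		/* checksum of the new parity */
	}
	return (psum);
}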
/*
 * NAMES:	genlineparity
 * DESCRIPTION: generate parity for a full-line write and issue the
 *		pre-writes for all of the data columns
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
genlineparity(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;
	md_raidcbuf_t	*cbuf;
	uint_t		*pbuf, *dbuf;
	uint_t		*uwbuf;
	uchar_t		*ubbuf;
	size_t		wordcnt;
	uint_t		psum = 0, dsum = 0;
	size_t		count = un->un_segsize * DEV_BSIZE;
	uint_t		col;
	buf_t		*bp;

	ASSERT((cs->cs_bcount & 0x3) == 0);

	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	uwbuf = (uint_t *)(void *)(cs->cs_addr);
	ubbuf = (uchar_t *)(void *)(cs->cs_addr);

	wordcnt = count / sizeof (uint_t);

	/* Word aligned */
	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
		uint_t	uval;

		while (wordcnt--) {
			uval = *uwbuf++;
			*dbuf = uval;
			*pbuf = uval;
			dsum ^= uval;
			++pbuf;
			++dbuf;
		}
	} else {
		union {
			uint_t	wb;
			uchar_t	bb[4];
		} cb;

		while (wordcnt--) {
			cb.bb[0] = *ubbuf++;
			cb.bb[1] = *ubbuf++;
			cb.bb[2] = *ubbuf++;
			cb.bb[3] = *ubbuf++;
			*dbuf = cb.wb;
			*pbuf = cb.wb;
			dsum ^= cb.wb;
			++pbuf;
			++dbuf;
		}
	}

	RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
	    un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);

	raidio(cs, RIO_PREWRITE | RIO_DATA);

	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {

		dsum = 0;
		pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
		dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);

		wordcnt = count / sizeof (uint_t);

		col = cbuf->cbuf_column;

		/* Word aligned */
		if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
			uint_t	uval;

			/*
			 * Only calculate psum when working on the last
			 * data buffer.
			 */
			if (cbuf->cbuf_next == NULL) {
				psum = 0;
				while (wordcnt--) {
					uval = *uwbuf++;
					*dbuf = uval;
					psum ^= (*pbuf ^= uval);
					dsum ^= uval;
					++dbuf;
					++pbuf;
				}
			} else {
				while (wordcnt--) {
					uval = *uwbuf++;
					*dbuf = uval;
					*pbuf ^= uval;
					dsum ^= uval;
					++dbuf;
					++pbuf;
				}
			}
		} else {
			union {
				uint_t	wb;
				uchar_t	bb[4];
			} cb;

			/*
			 * Only calculate psum when working on the last
			 * data buffer.
			 */
			if (cbuf->cbuf_next == NULL) {
				psum = 0;
				while (wordcnt--) {
					cb.bb[0] = *ubbuf++;
					cb.bb[1] = *ubbuf++;
					cb.bb[2] = *ubbuf++;
					cb.bb[3] = *ubbuf++;
					*dbuf = cb.wb;
					psum ^= (*pbuf ^= cb.wb);
					dsum ^= cb.wb;
					++dbuf;
					++pbuf;
				}
			} else {
				while (wordcnt--) {
					cb.bb[0] = *ubbuf++;
					cb.bb[1] = *ubbuf++;
					cb.bb[2] = *ubbuf++;
					cb.bb[3] = *ubbuf++;
					*dbuf = cb.wb;
					*pbuf ^= cb.wb;
					dsum ^= cb.wb;
					++dbuf;
					++pbuf;
				}
			}
		}

		RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
		    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
		    un->un_totalcolumncnt, col, RAID_PWMAGIC);

		/*
		 * fill in buffer for write to prewrite area
		 */
		bp = &cbuf->cbuf_bp;
		bp->b_un.b_addr = cbuf->cbuf_buffer;
		bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;
		bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
		    un->un_column[col].un_pwstart;
		bp->b_flags = B_WRITE | B_BUSY;
		if (nv_available && nv_prewrite)
			bp->b_flags |= nv_available;
		bp->b_iodone = (int (*)())raid_done;
		bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
		bp->b_chain = (struct buf *)cs;
		md_call_strategy(bp,
		    cs->cs_strategy_flag, cs->cs_strategy_private);
	}

	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
	    un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);

	raidio(cs, RIO_PREWRITE | RIO_PARITY);
}
/*
 * NAMES:	raid_readregenloop
 * DESCRIPTION: RAID metadevice read regeneration loop; rebuilds the
 *		errored column by XORing in the remaining columns
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_readregenloop(md_raidcs_t *cs)
{
	mr_unit_t	*un;
	md_raidps_t	*ps;
	uint_t		*dbuf;
	uint_t		*pbuf;
	size_t		wordcnt;

	un = cs->cs_un;

	/*
	 * XOR the parity with data bytes, must skip the
	 * pre-write entry header in all data/parity buffers
	 */
	wordcnt = cs->cs_bcount / sizeof (uint_t);
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--)
		*dbuf++ ^= *pbuf++;

	/* bump up the loop count */
	cs->cs_loop++;

	/* skip the errored component */
	if (cs->cs_loop == cs->cs_dcolumn)
		cs->cs_loop++;

	if (cs->cs_loop != un->un_totalcolumncnt) {
		cs->cs_frags = 1;
		raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
		return;
	}

	/* reaching the end of the loop */
	ps = cs->cs_ps;
	bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
	raid_free_child(cs, 1);

	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
}
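
/*
 * Illustrative sketch (not driver code; the name is hypothetical):
 * reconstructing a failed column the way raid_readregenloop() does.
 * The missing data is the XOR of the parity column and every
 * surviving data column, folded in one column per loop iteration.
 */
static void
sketch_fold_column(uint_t *sink, const uint_t *column, size_t words)
{
	size_t	i;

	/* sink starts zeroed; after the last column it holds the data */
	for (i = 0; i < words; i++)
		sink[i] ^= column[i];
}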
/*
 * NAMES:	raid_read_io
 * DESCRIPTION: RAID metadevice read I/O routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
{
	int		flag;
	void		*private;
	buf_t		*bp;
	buf_t		*pb = cs->cs_ps->ps_bp;
	mr_column_t	*column;

	flag = cs->cs_strategy_flag;
	private = cs->cs_strategy_private;
	column = &un->un_column[cs->cs_dcolumn];

	/*
	 * The component to be read is good, simply set up bp structure
	 * and call low level md routine doing the read.
	 */
	if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
	    (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
	    (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
		dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
		ddi_dev = md_dev64_to_dev(column->un_dev);

		bp = &cs->cs_dbuf;
		bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
		    column->un_devstart + cs->cs_blkno,
		    (int (*)())raid_done, bp, KM_NOSLEEP);

		bp->b_chain = (buf_t *)cs;

		cs->cs_frags = 1;
		cs->cs_error_call = raid_read_error;
		cs->cs_retry_call = raid_read_retry;
		cs->cs_flags |= MD_RCS_ISCALL;
		cs->cs_stage = RAID_READ_DONE;
		cs->cs_call = raid_stage;

		ASSERT(bp->b_edev != 0);

		md_call_strategy(bp, flag, private);
		return;
	}

	/*
	 * The component to be read is bad, have to go through
	 * raid specific method to read data from other members.
	 */
	cs->cs_loop = 0;
	/*
	 * NOTE: always get dbuffer before pbuffer
	 *	 and get both buffers before pwslot
	 *	 otherwise a deadlock could be introduced.
	 */
	raid_mapin_buf(cs);
	getdbuffer(cs);
	getpbuffer(cs);
	if (cs->cs_loop == cs->cs_dcolumn)
		cs->cs_loop++;

	/* zero out data buffer for use as a data sink */
	bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
	cs->cs_stage = RAID_NONE;
	cs->cs_call = raid_readregenloop;
	cs->cs_error_call = raid_read_error;
	cs->cs_retry_call = raid_read_no_retry;
	cs->cs_frags = 1;

	/* use parity buffer to read other columns */
	raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
}
/*
 * NAMES:	raid_read
 * DESCRIPTION: RAID metadevice read routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static int
raid_read(mr_unit_t *un, md_raidcs_t *cs)
{
	int		error = 0;
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	minor_t		mnum;

	ASSERT(IO_READER_HELD(un));
	ps = cs->cs_ps;
	ui = ps->ps_ui;
	raid_line_reader_lock(cs, 0);
	un = (mr_unit_t *)md_unit_readerlock(ui);
	ASSERT(UNIT_STATE(un) != RUS_INIT);
	mnum = MD_SID(un);
	cs->cs_un = un;

	/* make sure the read doesn't go beyond the end of the column */
	if (cs->cs_blkno + cs->cs_blkcnt >
	    un->un_segsize * un->un_segsincolumn)
		error = ENXIO;

	if (error)
		goto rerror;

	if (un->un_state & RUS_REGEN) {
		raid_regen_parity(cs);
		un = MD_UNIT(mnum);
		cs->cs_un = un;
	}

	raid_read_io(un, cs);
	return (0);

rerror:
	raid_error_parent(ps, error);
	raid_free_child(cs, 1);
	/* decrement readfrags */
	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
	return (0);
}
/*
 * NAMES:	raid_write_err_retry
 * DESCRIPTION: RAID metadevice write retry routine
 *		write was for parity or data only;
 *		complete write with error, no recovery possible
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;

	/* decrement pwfrags if needed, and frags */
	if (!(cs->cs_flags & MD_RCS_PWDONE))
		flags |= RFP_DECR_PWFRAGS;
	raid_error_parent(ps, EIO);
	raid_free_child(cs, 1);
	raid_free_parent(ps, flags);
}
/*
 * NAMES:	raid_write_no_retry
 * DESCRIPTION: RAID metadevice write retry routine
 *		write is too far along to retry and parent
 *		has already been signaled with iodone.
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;

	/* decrement pwfrags if needed, and frags */
	if (!(cs->cs_flags & MD_RCS_PWDONE))
		flags |= RFP_DECR_PWFRAGS;
	raid_free_child(cs, 1);
	raid_free_parent(ps, flags);
}
/*
 * NAMES:	raid_write_retry
 * DESCRIPTION: RAID metadevice write retry routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps;

	ps = cs->cs_ps;

	/* re-initialize the buf_t structure for raid_write() */
	cs->cs_dbuf.b_chain = (struct buf *)cs;
	cs->cs_dbuf.b_back = &cs->cs_dbuf;
	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_dbuf.b_error = 0;	/* initialize error */
	cs->cs_dbuf.b_offset = -1;
	/* Initialize semaphores */
	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_pbuf.b_chain = (struct buf *)cs;
	cs->cs_pbuf.b_back = &cs->cs_pbuf;
	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_pbuf.b_error = 0;	/* initialize error */
	cs->cs_pbuf.b_offset = -1;
	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_hbuf.b_chain = (struct buf *)cs;
	cs->cs_hbuf.b_back = &cs->cs_hbuf;
	cs->cs_hbuf.b_forw = &cs->cs_hbuf;
	cs->cs_hbuf.b_flags = B_BUSY;	/* initialize flags */
	cs->cs_hbuf.b_error = 0;	/* initialize error */
	cs->cs_hbuf.b_offset = -1;
	sema_init(&cs->cs_hbuf.b_io, 0, NULL,
	    SEMA_DEFAULT, NULL);
	sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
	    SEMA_DEFAULT, NULL);

	cs->cs_flags &= ~(MD_RCS_ERROR);
	/*
	 * If the prewrite completed on this child (PWDONE is set),
	 * clear the flag and bump pwfrags before restarting the i/o.
	 * If pwfrags is zero, we have already 'iodone'd the i/o, so
	 * leave things alone; we don't want to 'done' it twice.
	 */
	mutex_enter(&ps->ps_mx);
	if (cs->cs_flags & MD_RCS_PWDONE) {
		cs->cs_flags &= ~MD_RCS_PWDONE;
		ps->ps_pwfrags++;
	}
	mutex_exit(&ps->ps_mx);
	raid_write_io(un, cs);
}
/*
 * NAMES:	raid_wrerr
 * DESCRIPTION: RAID metadevice write error recovery routine
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 * LOCKS:	must obtain unit writer lock while calling raid_error_state
 *		since a unit or column state transition may take place.
 *		must obtain unit reader lock to retry I/O.
 */
static void
raid_wrerr(md_raidcs_t *cs)
{
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	md_raidcbuf_t	*cbuf;

	ps = cs->cs_ps;
	ui = ps->ps_ui;

	un = (mr_unit_t *)md_unit_writerlock(ui);

	if (cs->cs_dbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_dbuf);
	if (cs->cs_pbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_pbuf);
	if (cs->cs_hbuf.b_flags & B_ERROR)
		(void) raid_error_state(un, &cs->cs_hbuf);
	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
		if (cbuf->cbuf_bp.b_flags & B_ERROR)
			(void) raid_error_state(un, &cbuf->cbuf_bp);

	md_unit_writerexit(ui);

	ps->ps_flags |= MD_RPS_HSREQ;

	un = (mr_unit_t *)md_unit_readerlock(ui);

	/* now attempt the appropriate retry routine */
	(*(cs->cs_retry_call))(un, cs);
}
/*
 * NAMES:	raid_write_error
 * DESCRIPTION: I/O error handling routine for a RAID metadevice write
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
static void
raid_write_error(md_raidcs_t *cs)
{
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	md_raidcbuf_t	*cbuf;
	set_t		setno;

	ps = cs->cs_ps;
	ui = ps->ps_ui;
	un = cs->cs_un;

	setno = MD_UN2SET(un);

	/*
	 * locate each buf that is in error on this io and then
	 * output an error message
	 */
	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: write error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev),
		    NULL, 0));

	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
		cmn_err(CE_WARN, "md %s: write error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev),
		    NULL, 0));

	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
		if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
			cmn_err(CE_WARN, "md %s: write error on %s",
			    md_shortname(MD_SID(un)),
			    md_devname(setno,
			    md_expldev(cbuf->cbuf_bp.b_edev), NULL, 0));

	md_unit_readerexit(ui);

	ASSERT(cs->cs_frags == 0);

	/* now schedule processing for possible state change */
	daemon_request(&md_mstr_daemon, raid_wrerr,
	    (daemon_queue_t *)cs, REQ_OLD);
}
/*
 * NAMES:	raid_write_ponly
 * DESCRIPTION: RAID metadevice write routine
 *		in the case where only the parity column can be written
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_ponly(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	mr_unit_t	*un = cs->cs_un;

	/* decrement pwfrags if needed, but not frags */
	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
	raid_free_parent(ps, RFP_DECR_PWFRAGS);
	cs->cs_flags |= MD_RCS_PWDONE;
	cs->cs_frags = 1;
	cs->cs_stage = RAID_WRITE_PONLY_DONE;
	cs->cs_call = raid_stage;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_no_retry;
	if (WRITE_ALT(un, cs->cs_pcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
	}
	raidio(cs, RIO_PARITY | RIO_WRITE);
}
/*
 * NAMES:	raid_write_ploop
 * DESCRIPTION: RAID metadevice write routine, constructs parity from
 *		data in other columns.
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_ploop(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;
	uint_t		*dbuf;
	uint_t		*pbuf;
	size_t		wordcnt;
	uint_t		psum = 0;

	wordcnt = cs->cs_bcount / sizeof (uint_t);
	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--)
		*pbuf++ ^= *dbuf++;
	cs->cs_loop++;

	/*
	 * build parity from scratch using new data,
	 * skip reading the data and parity columns.
	 */
	while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
		cs->cs_loop++;

	if (cs->cs_loop != un->un_totalcolumncnt) {
		cs->cs_frags = 1;
		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
		return;
	}

	/* construct checksum for parity buffer */
	wordcnt = cs->cs_bcount / sizeof (uint_t);
	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
	while (wordcnt--) {
		psum ^= *pbuf;
		pbuf++;
	}

	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
	    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
	    1, cs->cs_pcolumn, RAID_PWMAGIC);

	cs->cs_stage = RAID_NONE;
	cs->cs_call = raid_write_ponly;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_err_retry;
	cs->cs_frags = 1;
	if (WRITE_ALT(un, cs->cs_pcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
	}
	raidio(cs, RIO_PARITY | RIO_PREWRITE);
}
/*
 * NAMES:	raid_write_donly
 * DESCRIPTION: RAID metadevice write routine
 *		Completed writing data to prewrite entry
 *		in the case where only the data column can be written
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_donly(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	mr_unit_t	*un = cs->cs_un;

	/* WARNING: don't release unit reader lock here... */
	/* decrement pwfrags if needed, but not frags */
	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
	raid_free_parent(ps, RFP_DECR_PWFRAGS);
	cs->cs_flags |= MD_RCS_PWDONE;
	cs->cs_frags = 1;
	cs->cs_stage = RAID_WRITE_DONLY_DONE;
	cs->cs_call = raid_stage;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_err_retry;
	if (WRITE_ALT(un, cs->cs_dcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
	}
	raidio(cs, RIO_DATA | RIO_WRITE);
}
/*
 * NAMES:	raid_write_got_old
 * DESCRIPTION: RAID metadevice write routine
 *		completed read of old data and old parity
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_got_old(md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	ASSERT(IO_READER_HELD(cs->cs_un));
	ASSERT(UNIT_READER_HELD(cs->cs_un));

	raid_mapin_buf(cs);
	genstandardparity(cs);
	cs->cs_frags = 2;
	cs->cs_call = raid_stage;
	cs->cs_stage = RAID_PREWRITE_DONE;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_retry;

	if (WRITE_ALT(un, cs->cs_dcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
	}
	if (WRITE_ALT(un, cs->cs_pcolumn)) {
		cs->cs_frags++;
		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
	}
	ASSERT(cs->cs_frags < 4);
	raidio(cs, RIO_DATA | RIO_PREWRITE);
	raidio(cs, RIO_PARITY | RIO_PREWRITE);
}
/*
 * NAMES:	raid_write_io
 * DESCRIPTION: RAID metadevice write I/O routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	uint_t		*dbuf;
	uint_t		*ubuf;
	size_t		wordcnt;
	uint_t		dsum = 0;
	int		pcheck;
	int		dcheck;

	ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
	    RCS_INIT) == 0);
	ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
	    RCS_INIT) == 0);
	ASSERT(IO_READER_HELD(un));
	ASSERT(UNIT_READER_HELD(un));
	ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);

	if (cs->cs_flags & MD_RCS_LINE) {

		mr_unit_t	*un = cs->cs_un;

		ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
		raid_mapin_buf(cs);
		cs->cs_frags = un->un_origcolumncnt;
		cs->cs_call = raid_stage;
		cs->cs_error_call = raid_write_error;
		cs->cs_retry_call = raid_write_no_retry;
		cs->cs_stage = RAID_LINE_PWDONE;
		genlineparity(cs);
		return;
	}

	pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
	dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
	cs->cs_resync_check = (pcheck << RCL_PARITY_OFFSET) | dcheck;

	if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
		int err = EIO;

		if ((un->un_column[cs->cs_pcolumn].un_devstate ==
		    RCS_LAST_ERRED) ||
		    (un->un_column[cs->cs_dcolumn].un_devstate ==
		    RCS_LAST_ERRED))
			err = ENXIO;
		raid_error_parent(ps, err);
		ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
		raid_free_child(cs, 1);
		raid_free_parent(ps, RFP_DECR_FRAGS
		    | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
		return;
	}

	if (pcheck & RCL_ERRED) {
		/*
		 * handle case of only having data drive
		 */
		raid_mapin_buf(cs);
		wordcnt = cs->cs_bcount / sizeof (uint_t);

		dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
		ubuf = (uint_t *)(void *)(cs->cs_addr);

		while (wordcnt--) {
			*dbuf = *ubuf;
			dsum ^= *ubuf;
			dbuf++;
			ubuf++;
		}

		RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
		    cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
		    1, cs->cs_dcolumn, RAID_PWMAGIC);

		cs->cs_frags = 1;
		cs->cs_stage = RAID_NONE;
		cs->cs_call = raid_write_donly;
		cs->cs_error_call = raid_write_error;
		cs->cs_retry_call = raid_write_err_retry;
		if (WRITE_ALT(un, cs->cs_dcolumn)) {
			cs->cs_frags++;
			raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
			    RIO_PREWRITE);
		}
		raidio(cs, RIO_DATA | RIO_PREWRITE);
		return;
	}

	if (dcheck & RCL_ERRED) {
		/*
		 * handle case of only having parity drive
		 * build parity from scratch using new data,
		 * skip reading the data and parity columns.
		 */
		raid_mapin_buf(cs);
		cs->cs_loop = 0;
		while (cs->cs_loop == cs->cs_dcolumn ||
		    cs->cs_loop == cs->cs_pcolumn)
			cs->cs_loop++;

		/* copy new data in to begin building parity */
		bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
		cs->cs_stage = RAID_NONE;
		cs->cs_call = raid_write_ploop;
		cs->cs_error_call = raid_write_error;
		cs->cs_retry_call = raid_write_err_retry;
		cs->cs_frags = 1;
		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
		return;
	}

	/*
	 * handle normal cases
	 * read old data and old parity
	 */
	cs->cs_frags = 2;
	cs->cs_stage = RAID_NONE;
	cs->cs_call = raid_write_got_old;
	cs->cs_error_call = raid_write_error;
	cs->cs_retry_call = raid_write_retry;
	ASSERT(ps->ps_magic == RAID_PSMAGIC);
	raidio(cs, RIO_DATA | RIO_READ);
	raidio(cs, RIO_PARITY | RIO_READ);
}
/*
 * NAMES:	raid_enqueue
 * DESCRIPTION: append a child structure to the unit's I/O wait list
 * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
 */
static void
raid_enqueue(md_raidcs_t *cs)
{
	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
	md_raidcs_t	*cs1;

	mutex_enter(io_list_mutex);
	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
	if (ui->ui_io_lock->io_list_front == NULL) {
		ui->ui_io_lock->io_list_front = cs;
		ui->ui_io_lock->io_list_back = cs;
	} else {
		cs1 = ui->ui_io_lock->io_list_back;
		cs1->cs_linlck_next = cs;
		ui->ui_io_lock->io_list_back = cs;
	}
	STAT_INC(raid_write_waits);
	STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
	cs->cs_linlck_next = NULL;
	mutex_exit(io_list_mutex);
}
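
/*
 * Illustrative sketch (not driver code; names are hypothetical): the
 * singly-linked FIFO with head and tail pointers that raid_enqueue()
 * maintains under io_list_mutex.  Tracking the tail makes the append
 * O(1) regardless of queue length.
 */
struct sketch_node { struct sketch_node *next; };
struct sketch_fifo { struct sketch_node *front, *back; };

static void
sketch_fifo_append(struct sketch_fifo *q, struct sketch_node *n)
{
	n->next = NULL;
	if (q->front == NULL) {		/* empty list: node is both ends */
		q->front = n;
		q->back = n;
	} else {			/* link after the current tail */
		q->back->next = n;
		q->back = n;
	}
}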
/*
 * NAMES:	raid_write
 * DESCRIPTION: RAID metadevice write routine
 * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
 *		md_raidcs_t *cs - pointer to a child structure
 */
static int
raid_write(mr_unit_t *un, md_raidcs_t *cs)
{
	int		error = 0;
	md_raidps_t	*ps;
	mdi_unit_t	*ui;
	minor_t		mnum;

	ASSERT(IO_READER_HELD(un));
	ps = cs->cs_ps;
	ui = ps->ps_ui;

	ASSERT(UNIT_STATE(un) != RUS_INIT);
	if (UNIT_STATE(un) == RUS_LAST_ERRED)
		error = EIO;

	/* make sure the write doesn't go beyond the column */
	if (cs->cs_blkno + cs->cs_blkcnt >
	    un->un_segsize * un->un_segsincolumn)
		error = ENXIO;
	if (error)
		goto werror;

	/*
	 * this is an advisory loop that keeps the waiting lists short
	 * to reduce cpu time.  Since there is a race introduced by not
	 * acquiring all the correct mutexes, use a cv_timedwait to be
	 * sure the write always will wake up and start.
	 */
	while (raid_check_pw(cs)) {
		mutex_enter(&un->un_mx);
		un->un_rflags |= MD_RFLAG_NEEDPW;
		STAT_INC(raid_prewrite_waits);
		(void) cv_reltimedwait(&un->un_cv, &un->un_mx, md_wr_wait,
		    TR_CLOCK_TICK);
		un->un_rflags &= ~MD_RFLAG_NEEDPW;
		mutex_exit(&un->un_mx);
	}

	if (raid_line_writer_lock(cs, 1))
		return (0);

	un = (mr_unit_t *)md_unit_readerlock(ui);
	cs->cs_un = un;
	mnum = MD_SID(un);

	if (un->un_state & RUS_REGEN) {
		raid_regen_parity(cs);
		un = MD_UNIT(mnum);
		cs->cs_un = un;
	}

	raid_write_io(un, cs);
	return (0);

werror:
	/* acquire unit reader lock since raid_free_child always drops it */
	raid_error_parent(ps, error);
	raid_free_child(cs, 0);
	/* decrement both pwfrags and frags */
	raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
	return (0);
}
/*
 * NAMES:	raid_stage
 * DESCRIPTION: post-processing routine for a RAID metadevice
 * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
 */
static void
raid_stage(md_raidcs_t *cs)
{
	md_raidps_t	*ps = cs->cs_ps;
	mr_unit_t	*un = cs->cs_un;
	md_raidcbuf_t	*cbuf;
	buf_t		*bp;
	void		*private;
	int		flag;

	switch (cs->cs_stage) {
	case RAID_READ_DONE:
		raid_free_child(cs, 1);
		/* decrement readfrags */
		raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
		return;

	case RAID_WRITE_DONE:
	case RAID_WRITE_PONLY_DONE:
	case RAID_WRITE_DONLY_DONE:
		/*
		 * Completed writing real parity and/or data.
		 */
		ASSERT(cs->cs_flags & MD_RCS_PWDONE);
		raid_free_child(cs, 1);
		/* decrement frags but not pwfrags */
		raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
		return;

	case RAID_PREWRITE_DONE:
		/*
		 * completed writing data and parity to prewrite entries
		 *
		 * WARNING: don't release unit reader lock here..
		 * decrement pwfrags but not frags
		 */
		raid_free_parent(ps, RFP_DECR_PWFRAGS);
		cs->cs_flags |= MD_RCS_PWDONE;
		cs->cs_frags = 2;
		cs->cs_stage = RAID_WRITE_DONE;
		cs->cs_call = raid_stage;
		cs->cs_error_call = raid_write_error;
		cs->cs_retry_call = raid_write_no_retry;
		if (WRITE_ALT(un, cs->cs_pcolumn)) {
			cs->cs_frags++;
			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
			    RIO_WRITE);
		}
		if (WRITE_ALT(un, cs->cs_dcolumn)) {
			cs->cs_frags++;
			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
		}
		ASSERT(cs->cs_frags < 4);
		raidio(cs, RIO_DATA | RIO_WRITE);
		raidio(cs, RIO_PARITY | RIO_WRITE);
		if (cs->cs_pw_inval_list) {
			raid_free_pwinvalidate(cs);
		}
		return;

	case RAID_LINE_PWDONE:
		ASSERT(cs->cs_frags == 0);
		raid_free_parent(ps, RFP_DECR_PWFRAGS);
		cs->cs_flags |= MD_RCS_PWDONE;
		cs->cs_frags = un->un_origcolumncnt;
		cs->cs_call = raid_stage;
		cs->cs_error_call = raid_write_error;
		cs->cs_retry_call = raid_write_no_retry;
		cs->cs_stage = RAID_WRITE_DONE;
		for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
			/*
			 * fill in buffer for the real write to the device
			 */
			bp = &cbuf->cbuf_bp;
			bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
			bp->b_bcount = cbuf->cbuf_bcount;
			bp->b_bufsize = cbuf->cbuf_bcount;
			bp->b_lblkno =
			    un->un_column[cbuf->cbuf_column].un_devstart +
			    cs->cs_blkno;
			bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
			bp->b_flags &= ~nv_available;
			bp->b_flags |= B_WRITE | B_BUSY;
			bp->b_iodone = (int (*)())raid_done;
			bp->b_edev = md_dev64_to_dev(
			    un->un_column[cbuf->cbuf_column].un_dev);
			bp->b_chain = (struct buf *)cs;
			private = cs->cs_strategy_private;
			flag = cs->cs_strategy_flag;
			md_call_strategy(bp, flag, private);
		}
		raidio(cs, RIO_DATA | RIO_WRITE);
		raidio(cs, RIO_PARITY | RIO_WRITE);
		if (cs->cs_pw_inval_list) {
			raid_free_pwinvalidate(cs);
		}
		return;

	default:
		break;
	}
}
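
/*
 * Illustrative sketch (not driver code; all names are hypothetical):
 * the staged-completion pattern raid_stage() drives.  A child records
 * the stage to enter when its current batch of buffers completes, so
 * a multi-step write (pre-write log, then real write) advances on I/O
 * completions instead of a blocked thread.
 */
enum sketch_stage { SK_PREWRITE_DONE, SK_WRITE_DONE };

struct sketch_child {
	enum sketch_stage	sk_stage;
	void			(*sk_call)(struct sketch_child *);
};

static void
sketch_stage(struct sketch_child *c)
{
	switch (c->sk_stage) {
	case SK_PREWRITE_DONE:
		/* log writes are stable: arm next stage, issue real I/O */
		c->sk_stage = SK_WRITE_DONE;
		c->sk_call = sketch_stage;
		break;
	case SK_WRITE_DONE:
		/* all I/O complete: resources would be released here */
		c->sk_call = NULL;
		break;
	}
}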
/*
 * NAMES:	md_raid_strategy
 * DESCRIPTION: RAID metadevice I/O operations entry point.
 * PARAMETERS:	buf_t	  *pb - pointer to a user I/O buffer
 *		int	 flag - metadevice specific flag
 *		void *private - carry over flag ??
 */
void
md_raid_strategy(buf_t *pb, int flag, void *private)
{
	md_raidps_t	*ps;
	md_raidcs_t	*cs;
	int		doing_writes;
	int		err;
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		count;
	diskaddr_t	blkno;
	off_t		offset;
	int		colcnt;
	minor_t		mnum;
	set_t		setno;

	ui = MDI_UNIT(getminor(pb->b_edev));
	md_kstat_waitq_enter(ui);
	un = (mr_unit_t *)md_io_readerlock(ui);
	setno = MD_MIN2SET(getminor(pb->b_edev));

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			pb->b_flags |= B_ERROR;
			pb->b_error = ENXIO;
			pb->b_resid = pb->b_bcount;
			md_kstat_waitq_exit(ui);
			md_io_readerexit(ui);
			biodone(pb);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	mnum = MD_SID(un);
	colcnt = un->un_totalcolumncnt - 1;
	count = pb->b_bcount;

	STAT_CHECK(raid_512, count == 512);
	STAT_CHECK(raid_1024, count == 1024);
	STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
	STAT_CHECK(raid_8192, count == 8192);
	STAT_CHECK(raid_8192_bigger, count > 8192);

	(void *) md_unit_readerlock(ui);
	if (!(flag & MD_STR_NOTTOP)) {
		err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
		if (err != 0) {
			md_kstat_waitq_exit(ui);
			md_io_readerexit(ui);
			return;
		}
	}
	md_unit_readerexit(ui);

	STAT_INC(raid_total_io);

	/* allocate a parent structure for the user I/O */
	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
	raid_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the md_save structure.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = pb;
	ps->ps_addr = pb->b_un.b_addr;

	if ((pb->b_flags & B_READ) == 0) {
		ps->ps_flags |= MD_RPS_WRITE;
		doing_writes = 1;
		STAT_INC(raid_writes);
	} else {
		ps->ps_flags |= MD_RPS_READ;
		doing_writes = 0;
		STAT_INC(raid_reads);
	}

	count = lbtodb(pb->b_bcount);	/* transfer count (in blocks) */
	blkno = pb->b_lblkno;		/* block number on device */
	offset = 0;
	ps->ps_frags = 1;
	md_kstat_waitq_to_runq(ui);

	do {
		cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
		raid_child_init(cs);
		cs->cs_ps = ps;
		cs->cs_un = un;
		cs->cs_mdunit = mnum;
		cs->cs_strategy_flag = flag;
		cs->cs_strategy_private = private;
		cs->cs_offset = offset;
		count = raid_iosetup(un, blkno, count, cs);
		if (cs->cs_flags & MD_RCS_LINE) {
			blkno += (cs->cs_blkcnt * colcnt);
			offset += (cs->cs_bcount * colcnt);
		} else {
			blkno += cs->cs_blkcnt;
			offset += cs->cs_bcount;
		}
		/* for each cs bump up the ps_pwfrags and ps_frags fields */
		if (count) {
			mutex_enter(&ps->ps_mx);
			ps->ps_pwfrags++;
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
			if (doing_writes)
				(void) raid_write(un, cs);
			else
				(void) raid_read(un, cs);
		}
	} while (count);

	if (doing_writes) {
		(void) raid_write(un, cs);
	} else {
		(void) raid_read(un, cs);
	}

	if (! (flag & MD_STR_NOTTOP) && panicstr) {
		while (! (ps->ps_flags & MD_RPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(raid_parent_cache, ps);
	}
}
/*
 * NAMES:	raid_snarf
 * DESCRIPTION: RAID metadevice SNARF entry point
 * PARAMETERS:	md_snarfcmd_t cmd,
 *		set_t setno
 * RETURNS:
 */
static int
raid_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mr_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_raid_gotten;
	mddb_type_t	typ1;
	uint_t		ncol;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	size_t		newreqsize;
	mr_unit_t	*big_un;
	mr_unit32_od_t	*small_un;

	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_raid_gotten = 1;
	gotsomething = 0;
	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_RAID;
		rbp = dep->de_rb;
		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This means, we have an old and small record
				 * and this record hasn't already been
				 * converted.  Before we create an incore
				 * metadevice from this we have to convert it
				 * to a big record.
				 */
				small_un =
				    (mr_unit32_od_t *)mddb_getrecaddr(recid);
				ncol = small_un->un_totalcolumncnt;
				newreqsize = sizeof (mr_unit_t) +
				    ((ncol - 1) * sizeof (mr_column_t));
				big_un = (mr_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				raid_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				un = big_un;
				rbp->rb_private |= MD_PRV_CONVD;
			} else {
				/*
				 * Record has already been converted.  Just
				 * get its address.
				 */
				un = (mr_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un = (mr_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor device node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_raid_gotten = 0;
		if (raid_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &raid_md_ops, 1);
			gotsomething = 1;
		} else if (un->mr_ic) {
			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
			    un->un_totalcolumncnt);
			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
		}
	}

	if (!all_raid_gotten) {
		return (gotsomething);
	}

	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}
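
/*
 * Illustrative sketch (not driver code; the name is hypothetical):
 * the flexible-array sizing used above when converting a small
 * on-disk record to the incore layout.  A unit with N columns is one
 * mr_unit_t (which embeds the first column) plus N - 1 additional
 * mr_column_t entries.
 */
static size_t
sketch_unit_size(size_t base, size_t colsize, uint_t ncol)
{
	/* mirrors: sizeof (mr_unit_t) + (ncol - 1) * sizeof (mr_column_t) */
	return (base + (size_t)(ncol - 1) * colsize);
}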
/*
 * NAMES:	raid_halt
 * DESCRIPTION: RAID metadevice HALT entry point
 * PARAMETERS:	md_haltcmd_t cmd -
 *		set_t setno -
 * RETURNS:
 */
static int
raid_halt(md_haltcmd_t cmd, set_t setno)
{
	int		i;
	mdi_unit_t	*ui;
	minor_t		mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != raid_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != raid_md_ops.md_selfindex)
			continue;
		reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}
/*
 * NAMES:	raid_close_all_devs
 * DESCRIPTION: Close all the devices of the unit.
 * PARAMETERS:	mr_unit_t *un - pointer to unit structure
 * RETURNS:
 */
void
raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
{
	int		i;
	mr_column_t	*device;

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		device = &un->un_column[i];
		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
			ASSERT((device->un_dev != (md_dev64_t)0) &&
			    (device->un_dev != NODEV64));
			if ((device->un_devstate & RCS_OKAY) && init_pw)
				(void) init_pw_area(un, device->un_dev,
				    device->un_pwstart, i);
			md_layered_close(device->un_dev, md_cflags);
			device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
		}
	}
}
/*
 * NAMES:	raid_open_all_devs
 * DESCRIPTION: Open all the components (columns) of the device unit.
 * PARAMETERS:	mr_unit_t *un - pointer to unit structure
 * RETURNS:
 */
static int
raid_open_all_devs(mr_unit_t *un, int md_oflags)
{
	minor_t		mnum = MD_SID(un);
	int		i;
	int		not_opened = 0;
	int		commit = 0;
	int		col = -1;
	mr_column_t	*device;
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);
	mdkey_t		key;
	mdi_unit_t	*ui = MDI_UNIT(mnum);

	ui->ui_tstate &= ~MD_INACCESSIBLE;

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		device = &un->un_column[i];

		if (COLUMN_STATE(un, i) & RCS_ERRED) {
			not_opened++;
			continue;
		}

		if (device->un_devflags & MD_RAID_DEV_ISOPEN)
			continue;

		tmpdev = device->un_dev;
		/*
		 * Open by device id
		 */
		key = HOTSPARED(un, i) ?
		    device->un_hs_key : device->un_orig_key;
		if ((md_getmajor(tmpdev) != md_major) &&
		    md_devid_found(setno, side, key) == 1) {
			tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
		}
		if (md_layered_open(mnum, &tmpdev, md_oflags)) {
			device->un_dev = tmpdev;
			not_opened++;
			continue;
		}
		device->un_dev = tmpdev;
		device->un_devflags |= MD_RAID_DEV_ISOPEN;
	}

	/* if only one column failed to open, the device can still run */
	if (not_opened > 1) {
		cmn_err(CE_WARN,
		    "md: %s failed to open. open error on %s\n",
		    md_shortname(MD_SID(un)),
		    md_devname(MD_UN2SET(un), device->un_orig_dev,
		    NULL, 0));

		ui->ui_tstate |= MD_INACCESSIBLE;

		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));

		return (not_opened > 1);
	}

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		device = &un->un_column[i];
		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
			if (device->un_devstate & RCS_LAST_ERRED) {
				/*
				 * At this point in time there is a possibility
				 * that errors were the result of a controller
				 * failure with more than a single column on
				 * it, so clear out last errored columns and
				 * let errors re-occur if necessary.
				 */
				raid_set_state(un, i, RCS_OKAY, 0);
				commit++;
			}
			continue;
		}
		col = i;
	}

	if (col != -1) {
		raid_set_state(un, col, RCS_ERRED, 0);
		commit++;
	}

	if (commit)
		raid_commit(un, NULL);

	if (col != -1) {
		if (COLUMN_STATE(un, col) & RCS_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		}
	}

	return (0);
}
/*
 * NAMES:	raid_internal_open
 * DESCRIPTION: Do the actual RAID open
 * PARAMETERS:	minor_t mnum - minor number of the RAID device
 *		int flag -
 *		int otyp -
 *		int md_oflags - RAID open flags
 * RETURNS:	0 if successful, nonzero otherwise
 */
int
raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	int		err = 0;
	int		replay_error = 0;

	ui = MDI_UNIT(mnum);

	un = (mr_unit_t *)md_unit_openclose_enter(ui);
	/*
	 * this MUST be checked before md_unit_isopen is checked.
	 * raid_init_columns sets md_unit_isopen to block reset, halt.
	 */
	if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
	    !(md_oflags & MD_OFLG_ISINIT)) {
		md_unit_openclose_exit(ui);
		return (EAGAIN);
	}

	if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
		err = md_unit_incopen(mnum, flag, otyp);
		goto out;
	}

	md_unit_readerexit(ui);

	un = (mr_unit_t *)md_unit_writerlock(ui);
	if (raid_open_all_devs(un, md_oflags) == 0) {
		if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
			md_unit_writerexit(ui);
			un = (mr_unit_t *)md_unit_readerlock(ui);
			raid_close_all_devs(un, 0, md_oflags);
			goto out;
		}
	} else {
		/*
		 * if this unit contains more than two errored components,
		 * return an error and close all opened devices
		 */
		md_unit_writerexit(ui);
		un = (mr_unit_t *)md_unit_readerlock(ui);
		raid_close_all_devs(un, 0, md_oflags);
		md_unit_openclose_exit(ui);
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
		return (ENXIO);
	}

	if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
		replay_error = raid_replay(un);
		MD_STATUS(un) |= MD_UN_REPLAYED;
	}

	md_unit_writerexit(ui);
	un = (mr_unit_t *)md_unit_readerlock(ui);

	if ((replay_error == RAID_RPLY_READONLY) &&
	    ((flag & (FREAD | FWRITE)) == FREAD)) {
		md_unit_openclose_exit(ui);
		return (0);
	}

	/* allocate hotspare if possible */
	(void) raid_hotspares();

out:
	md_unit_openclose_exit(ui);
	return (err);
}
/*
 * NAMES:	raid_open
 * DESCRIPTION: RAID metadevice OPEN entry point
 * PARAMETERS:	dev_t *dev -
 *		int flag -
 *		int otyp -
 *		cred_t *cred_p -
 *		int md_oflags -
 * RETURNS:
 */
/*ARGSUSED1*/
static int
raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
{
	int		error = 0;

	if (error = raid_internal_open(getminor(*dev), flag, otyp,
	    md_oflags)) {
		return (error);
	}
	return (0);
}
/*
 * NAMES:	raid_internal_close
 * DESCRIPTION: RAID metadevice CLOSE actual implementation
 * PARAMETERS:	minor_t mnum - minor number of the RAID device
 *		int otyp -
 *		int init_pw -
 *		int md_cflags - RAID close flags
 * RETURNS:	0 if successful, nonzero otherwise
 */
int
raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mr_unit_t	*un;
	int		err = 0;

	/* single thread */
	un = (mr_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		raid_close_all_devs(un, init_pw, md_cflags);
	}

	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}
/*
 * NAMES:	raid_close
 * DESCRIPTION: RAID metadevice close entry point
 * PARAMETERS:	dev_t dev -
 *		int flag -
 *		int otyp -
 *		cred_t *cred_p -
 *		int md_cflags -
 * RETURNS:
 */
/*ARGSUSED1*/
static int
raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
{
	int		retval;

	(void) md_io_writerlock(MDI_UNIT(getminor(dev)));
	retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
	(void) md_io_writerexit(MDI_UNIT(getminor(dev)));
	return (retval);
}
/*
 * raid_probe_close_all_devs
 */
void
raid_probe_close_all_devs(mr_unit_t *un)
{
	int		i;
	mr_column_t	*device;

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		device = &un->un_column[i];

		if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
			md_layered_close(device->un_dev,
			    MD_OFLG_PROBEDEV);
			device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
		}
	}
}
/*
 * raid_probe_dev:
 *
 * On entry the unit writerlock is held
 */
static int
raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
{
	mr_unit_t	*un;
	int		i;
	int		not_opened = 0;
	int		commit = 0;
	int		col = -1;
	mr_column_t	*device;
	int		md_devopen = 0;

	if (md_unit_isopen(ui))
		md_devopen++;

	un = MD_UNIT(mnum);
	/*
	 * If the state has been set to LAST_ERRED because
	 * of an error when the raid device was open at some
	 * point in the past, don't probe. We really don't want
	 * to reset the state in this case.
	 */
	if (UNIT_STATE(un) == RUS_LAST_ERRED)
		return (0);

	ui->ui_tstate &= ~MD_INACCESSIBLE;

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		device = &un->un_column[i];
		if (COLUMN_STATE(un, i) & RCS_ERRED) {
			not_opened++;
			continue;
		}

		tmpdev = device->un_dev;
		/*
		 * Currently the flags passed are not needed since
		 * there cannot be an underlying metadevice. However
		 * they are kept here for consistency.
		 *
		 * Open by device id
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
		    device->un_hs_key : device->un_orig_key);
		if (md_layered_open(mnum, &tmpdev,
		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
			device->un_dev = tmpdev;
			not_opened++;
			continue;
		}
		device->un_dev = tmpdev;

		device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
	}

	/*
	 * The code below is careful on setting the LAST_ERRED state.
	 *
	 * If open errors and exactly one device has failed we can run.
	 * If more than one device fails we have to figure out when to set
	 * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
	 * since they are painful and time consuming.
	 *
	 * When more than one component/column fails there are 2 scenarios.
	 *
	 * 1. Metadevice has NOT been opened: In this case, the behavior
	 *    mimics the open semantics. ie. Only the first failed device
	 *    is ERRED and LAST_ERRED is not set.
	 *
	 * 2. Metadevice has been opened: Here the read/write semantics are
	 *    followed. The first failed device is ERRED and on the next
	 *    failed device LAST_ERRED is set.
	 */

	if (not_opened > 1 && !md_devopen) {
		cmn_err(CE_WARN,
		    "md: %s failed to open. open error on %s\n",
		    md_shortname(MD_SID(un)),
		    md_devname(MD_UN2SET(un), device->un_orig_dev,
		    NULL, 0));
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
		raid_probe_close_all_devs(un);
		ui->ui_tstate |= MD_INACCESSIBLE;
		return (not_opened > 1);
	}

	if (!md_devopen) {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			device = &un->un_column[i];
			if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
				if (device->un_devstate & RCS_LAST_ERRED) {
					/*
					 * At this point in time there is a
					 * possibility that errors were the
					 * result of a controller failure with
					 * more than a single column on it so
					 * clear out last errored columns and
					 * let errors re-occur if necessary.
					 */
					raid_set_state(un, i, RCS_OKAY, 0);
					commit++;
				}
				continue;
			}
			/*
			 * note if multiple devices are failing then only
			 * the last one is marked as error
			 */
			col = i;
		}

		if (col != -1) {
			raid_set_state(un, col, RCS_ERRED, 0);
			commit++;
		}
	} else {
		for (i = 0; i < un->un_totalcolumncnt; i++) {
			device = &un->un_column[i];

			/* if we have LAST_ERRED go ahead and commit. */
			if (un->un_state & RUS_LAST_ERRED)
				break;
			/*
			 * could not open the component
			 */
			if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
				col = i;
				raid_set_state(un, col, RCS_ERRED, 0);
				commit++;
			}
		}
	}

	if (commit)
		raid_commit(un, NULL);

	if (col != -1) {
		if (COLUMN_STATE(un, col) & RCS_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		}
	}

	raid_probe_close_all_devs(un);
	return (0);
}
static int
raid_imp_set(set_t setno)
{
	mddb_recid_t	recid;
	int		i, gotsomething;
	mddb_type_t	typ1;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mr_unit_t	*un64;
	mr_unit32_od_t	*un32;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */
	hsp_t		*hsp_id;

	gotsomething = 0;

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);
			hsp_id = &(un32->un_hsp_id);

			for (i = 0; i < un32->un_totalcolumncnt; i++) {
				mr_column32_od_t *device;

				device = &un32->un_column[i];
				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), device->un_orig_key))
					goto out;

				if (device->un_hs_id != 0)
					device->un_hs_id =
					    MAKERECID(setno,
					    device->un_hs_id);
			}
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mr_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);
			hsp_id = &(un64->un_hsp_id);

			for (i = 0; i < un64->un_totalcolumncnt; i++) {
				mr_column_t	*device;

				device = &un64->un_column[i];
				if (!md_update_minor(setno, mddb_getsidenum
				    (setno), device->un_orig_key))
					goto out;

				if (device->un_hs_id != 0)
					device->un_hs_id =
					    MAKERECID(setno,
					    device->un_hs_id);
			}
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));

		*hsp_id = MAKERECID(setno, DBID(*hsp_id));

		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}
static md_named_services_t raid_named_services[] = {
	{raid_hotspares,			"poke hotspares"	},
	{raid_rename_check,			MDRNM_CHECK		},
	{raid_rename_lock,			MDRNM_LOCK		},
	{(intptr_t (*)()) raid_rename_unlock,	MDRNM_UNLOCK		},
	{(intptr_t (*)()) raid_probe_dev,	"probe open test"	},
	{NULL,					0			}
};
md_ops_t raid_md_ops = {
	raid_open,		/* open */
	raid_close,		/* close */
	md_raid_strategy,	/* strategy */
	NULL,			/* print */
	NULL,			/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_raid_ioctl,		/* ioctl, */
	raid_snarf,		/* raid_snarf */
	raid_halt,		/* raid_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	raid_imp_set,		/* import set */
	raid_named_services	/* named service entry points */
};
/* module specific initialization */
static void
init_init()
{
	/* default to a second */
	if (md_wr_wait == 0)
		md_wr_wait = md_hz >> 1;

	raid_parent_cache = kmem_cache_create("md_raid_parent",
	    sizeof (md_raidps_t), 0, raid_parent_constructor,
	    raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
	raid_child_cache = kmem_cache_create("md_raid_child",
	    sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
	    raid_child_constructor, raid_child_destructor,
	    raid_run_queue, NULL, NULL, 0);
	raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
	    sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
	    raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
}

/* module specific uninitialization (undo init_init()) */
static void
fini_uninit()
{
	kmem_cache_destroy(raid_parent_cache);
	kmem_cache_destroy(raid_child_cache);
	kmem_cache_destroy(raid_cbuf_cache);
	raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
}
/* define the module linkage */
MD_PLUGIN_MISC_MODULE("raid module", init_init(), fini_uninit())