/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 Andrey Sokolov
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 */
/*
 * lofi (loopback file) driver - allows you to attach a file to a device,
 * which can then be accessed through that device. The simple model is that
 * you tell lofi to open a file, and then use the block device you get as
 * you would any block device. lofi translates access to the block device
 * into I/O on the underlying file. This is mostly useful for
 * mounting images of filesystems.
 *
 * lofi is controlled through /dev/lofictl - this is the only device exported
 * during attach, and is instance number 0. lofiadm communicates with lofi
 * through ioctls on this device. When a file is attached to lofi, block and
 * character devices are exported in /dev/lofi and /dev/rlofi. These devices
 * are identified by lofi instance number, and the instance number is also used
 * as the name in /dev/lofi.
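 *
 * A typical session, as an illustrative sketch (lofiadm(1M) drives this via
 * ioctls such as LOFI_MAP_FILE on /dev/lofictl; the names below are examples
 * only):
 *
 *	lofiadm -a /export/disk.img	(attach; prints e.g. /dev/lofi/1)
 *	mount /dev/lofi/1 /mnt		(use it like any block device)
 *	lofiadm -d /dev/lofi/1		(detach)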
 *
 * Virtual disks, or labeled lofi, implement virtual disk support to
 * support partition tables and related tools. Such mappings will cause
 * block and character devices to be exported in /dev/dsk and /dev/rdsk.
 *
 * To support virtual disks, the instance number space is divided into two
 * parts, the upper part for the instance number and the lower part for the
 * minor number space used to identify partitions and slices. The virtual disk
 * support is implemented by stacking the cmlb module. For virtual disks, the
 * partition-related ioctl calls are routed to the cmlb module. Compression
 * and encryption are not supported for virtual disks.
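 *
 * As a sketch of that split (using the LOFI_* macros from sys/lofi.h that
 * this file relies on throughout):
 *
 *	id   = LOFI_MINOR2ID(getminor(dev));	upper part: instance id
 *	part = LOFI_PART(getminor(dev));	lower part: partition/slice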
 *
 * Mapped devices are tracked with state structures handled with
 * ddi_soft_state(9F) for simplicity.
 *
 * A file attached to lofi is opened when attached and not closed until
 * explicitly detached from lofi. This seems more sensible than deferring
 * the open until the /dev/lofi device is opened, for a number of reasons.
 * One is that any failure is likely to be noticed by the person (or script)
 * running lofiadm. Another is that it would be a security problem if the
 * file was replaced by another one after being added but before being opened.
 *
 * The only hard part about lofi is the ioctls. In order to support things
 * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
 * So it has to fake disk geometry and partition information. More may need
 * to be faked if your favorite utility doesn't work and you think it should
 * (fdformat doesn't work because it really wants to know the type of floppy
 * controller to talk to, and that didn't seem easy to fake. Or possibly even
 * necessary, since we have mkfs_pcfs now).
 *
 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To
 * support simulation of hotplug events, an optional force flag is provided.
 * If a lofi device is open when a force detach is requested, then the
 * underlying file is closed and any subsequent operations return EIO. When the
 * device is closed for the last time, it will be cleaned up at that time. In
 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
 * detached but not removed.
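 *
 * A watcher would observe that transition roughly like this (illustrative
 * userland sketch; DKIOCSTATE and the dkio_state values come from dkio(7I)):
 *
 *	enum dkio_state state = DKIO_NONE;
 *	while (ioctl(fd, DKIOCSTATE, &state) != -1 &&
 *	    state != DKIO_DEV_GONE)
 *		;	(each call blocks until the media state changes)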
 *
 * Known problems:
 *
 *	UFS logging. Mounting a UFS filesystem image "logging"
 *	works for basic copy testing but wedges during a build of ON through
 *	that image. Some deadlock in lufs holding the log mutex and then
 *	getting stuck on a buf. So for now, don't do that.
 *
 *	Direct I/O. Since the filesystem data is being cached in the buffer
 *	cache, _and_ again in the underlying filesystem, it's tempting to
 *	enable direct I/O on the underlying file. Don't, because that deadlocks.
 *	I think to fix the cache-twice problem we might need filesystem support.
 *
 * Interesting things to do:
 *
 *	Allow multiple files for each device. A poor-man's metadisk, basically.
 *
 *	Pass-through ioctls on block devices. You can (though it's not
 *	documented) give lofi a block device as a file name. Then we shouldn't
 *	need to fake a geometry. However, it may be relevant if you're replacing
 *	metadisk, or using lofi to get crypto.
 *	It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 *	and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 *	In fact this even makes sense if you have lofi "above" metadisk.
 *
 * Encryption:
 *	Each lofi device can have its own symmetric key and cipher.
 *	They are passed to us by lofiadm(1m) in the correct format for use
 *	with the misc/kcf crypto_* routines.
 *
 *	Each block has its own IV, that is calculated in lofi_blk_mech(), based
 *	on the "master" key held in the lsp and the block number of the buffer.
 */
#include <sys/types.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/aio_req.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/lofi.h>
#include <sys/lofi_impl.h>	/* for cache structure */
#include <sys/fcntl.h>
#include <sys/pathname.h>
#include <sys/filio.h>
#include <sys/fdio.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <vm/seg_map.h>
#include <sys/sunddi.h>
#include <sys/zmod.h>
#include <sys/id_space.h>
#include <sys/mkdev.h>
#include <sys/crypto/common.h>
#include <sys/crypto/api.h>
#include <sys/rctl.h>
#include <sys/vtoc.h>
#include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

#define	SETUP_C_DATA(cd, buf, len)		\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;			\
	(cd).cd_miscdata = NULL;		\
	(cd).cd_length = (len);			\
	(cd).cd_raw.iov_base = (buf);		\
	(cd).cd_raw.iov_len = (len);
#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

#define	LOFI_TIMEOUT	30
static void *lofi_statep;
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_id;		/* lofi ID values */
static list_t lofi_list;
static zone_key_t lofi_zone_key;

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;
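/*
 * Sanity sketch of the arithmetic above, not part of the driver proper:
 * 100 MB/s divided by the minimum I/O size gives the queue depth
 * (<sys/debug.h> is already included above for CTASSERT).
 */
#if 0
CTASSERT(104857600 / DEV_BSIZE == 204800);
#endif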
/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
 */
uint32_t lofi_max_comp_cache = 1;
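/*
 * Example of adjusting the tunable on a live system (assumed standard
 * /etc/system syntax for setting a driver variable):
 *
 *	set lofi:lofi_max_comp_cache = 0	(disable caching)
 *	set lofi:lofi_max_comp_cache = 4	(cache up to 4 segments)
 */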
static int gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level);

lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};
static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

struct cmlb_tg_ops lofi_tg_ops = {
	TG_DK_OPS_VERSION_1,
	lofi_tg_rdwr,
	lofi_tg_getinfo
};
/*ARGSUSED*/
static void *
SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

static ISzAlloc g_Alloc = { SzAlloc, SzFree };
/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}
static int
is_opened(struct lofi_state *lsp)
{
	int i;
	boolean_t last = B_TRUE;

	ASSERT(MUTEX_HELD(&lofi_lock));
	for (i = 0; i < LOFI_PART_MAX; i++) {
		if (lsp->ls_open_lyr[i]) {
			last = B_FALSE;
			break;
		}
	}

	for (i = 0; last && (i < OTYP_LYR); i++) {
		if (lsp->ls_open_reg[i]) {
			last = B_FALSE;
		}
	}

	return (!last);
}
static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	struct lofi_state *lsp;
	buf_t *bp;
	int instance;
	int rv = 0;

	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control node does not have disk */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	if (cmd != TG_READ && cmd != TG_WRITE)
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	bp = getrbuf(KM_SLEEP);

	if (cmd == TG_READ) {
		bp->b_flags = B_READ;
	} else {
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(bp);
			return (EROFS);
		}
		bp->b_flags = B_WRITE;
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	bp->b_lblkno = start;
	bp->b_private = NULL;
	bp->b_edev = lsp->ls_dev;

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	(void) biowait(bp);

	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}
/*
 * Get device geometry info for cmlb.
 *
 * We have mapped disk image as virtual block device and have to report
 * physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels,
 *    for this case we fabricate the data based on mapped image.
 * 2. Image with existing label information.
 *    Since we have no information how the image was created (it may be
 *    dump from some physical device), we need to rely on label information
 *    from image, or we get "corrupted label" errors.
 *    NOTE: label can be MBR, MBR+SMI, GPT
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	_NOTE(ARGUNUSED(tg_cookie));
	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When mapping is created, new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events will happen in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get
	 * capacity, we return error on that call if cookie is set,
	 * otherwise lofi_attach will be stuck as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note, such error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (EINVAL);
	}

	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		geomp->g_capacity	=
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect		= lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead		= lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl		= lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl		= lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize	= (1U << ashift);
		geomp->g_intrlv		= lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm		= lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}
static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	if (lsp->ls_vp != NULL) {
		(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL);
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
	}
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);
	if (lsp->ls_vp != NULL)
		VN_RELE(lsp->ls_vp);

	if (lsp->ls_taskq != NULL)
		taskq_destroy(lsp->ls_taskq);

	if (lsp->ls_kstat != NULL)
		kstat_delete(lsp->ls_kstat);

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	lsp->ls_vp_ready = B_FALSE;

	ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
	(void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
	id_free(lofi_id, id);
}
static void
lofi_free_dev(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_invalidate(lsp->ls_cmlbhandle, 0);
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
		lsp->ls_cmlbhandle = NULL;
	}
	(void) ddi_prop_remove_all(lsp->ls_dip);
	ddi_remove_minor_node(lsp->ls_dip, NULL);
}
/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount. In either case we set ls_cleanup so the last
		 * user destroys the device.
		 */
		if (is_opened(lsp)) {
			lsp->ls_cleanup = 1;
		} else {
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}
/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	int id;
	minor_t part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;

	struct lofi_state *lsp;

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (1U << part);

	/* master control device */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	if ((lsp->ls_open_excl) & (mask)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	if (flag & FEXCL) {
		if (lsp->ls_open_lyr[part]) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				mutex_exit(&lofi_lock);
				return (EBUSY);
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
		    NULL, NULL, 0) == 0) {
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (!ndelay) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
	}

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

	mutex_exit(&lofi_lock);
	return (0);
}
/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t part;
	int id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (1U << part);

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	if (lsp->ls_open_excl & mask)
		lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) && (lsp->ls_cleanup || lsp->ls_vp == NULL)) {
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}
/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number.  lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int ret;
	crypto_data_t cdata;
	char *iv;
	size_t iv_len;
	size_t min;
	void *data;
	size_t datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}
/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
 * DEV_BSIZE.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}
static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t resid;
	int isread;
	int error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t xfersize;
	int isread;
	int smflags;
	caddr_t mapaddr;
	size_t len;
	enum seg_rw srw;
	int save_error;

	/*
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |       len
	 *    v    v        v
	 * ===|====X========|====...======|========X====|====
	 *           /-------------...---------------/
	 *    ^ bp->b_bcount/bp->b_resid at start
	 *   /----/--------/----...------/--------/
	 *  v    v        v              v        v
	 *  |    |        |              |        nth xfersize (<= MAXBSIZE)
	 *  |    |        2nd thru n-1st xfersize (= MAXBSIZE)
	 *  |    1st xfersize (<= MAXBSIZE)
	 *  mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}
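/*
 * Worked example of the windowing above (a sketch): with MAXBSIZE = 0x2000,
 * offset = 0x2200 and b_bcount = 0x4000:
 *
 *	mapoffset     = 0x2200 & MAXBOFFSET = 0x0200
 *	alignedoffset = 0x2000
 *	1st xfersize  = MAXBSIZE - 0x0200   = 0x1e00
 *	2nd xfersize  = MAXBSIZE            = 0x2000
 *	3rd xfersize  = remaining             0x0200
 */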
/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}
/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}
/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}
#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)

/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}
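/*
 * Illustrative layout of one lzma-compressed segment as consumed above
 * (inferred from LZMA_HEADER_SIZE; the 8-byte size field is not used here,
 * LzmaDecode() is handed the props block and then the raw stream):
 *
 *	bytes 0 .. LZMA_PROPS_SIZE-1	encoder properties
 *	next 8 bytes			uncompressed size (ignored)
 *	remainder			compressed payload
 */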
/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error = 0;
	int syncflag = 0;
	struct lofi_state *lsp;
	offset_t offset;
	caddr_t bufaddr;
	size_t len;
	size_t xfersize;
	boolean_t bufinited = B_FALSE;

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return;
	}

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
	    << lsp->ls_lbshift;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	len = bp->b_bcount;
	bufinited = B_TRUE;

	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
		goto errout;
	}

	/*
	 * If we're writing and the buffer was not B_ASYNC
	 * we'll follow up with a VOP_FSYNC() to force any
	 * asynchronous I/O to stable storage.
	 */
	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
		syncflag = FSYNC;

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mapable or it's a character device or it's an encrypted lofi.
	 */
	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
	    lsp->ls_crypto_enabled) {
		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
		    NULL);
	} else if (lsp->ls_uncomp_seg_sz == 0) {
		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
	} else {
		uchar_t *compressed_seg = NULL, *cmpbuf;
		uchar_t *uncompressed_seg = NULL;
		lofi_compress_info_t *li;
		size_t oblkcount;
		size_t seglen;
		uint64_t sblkno, eblkno, cmpbytes;
		uint64_t uncompressed_seg_index;
		struct lofi_comp_cache *lc;
		offset_t sblkoff, eblkoff;
		u_offset_t salign, ealign;
		u_offset_t sdiff;
		uint32_t comp_data_sz;
		uchar_t *useg;
		uint64_t i;
		int j;

		/*
		 * From here on we're dealing primarily with compressed files
		 */
		ASSERT(!lsp->ls_crypto_enabled);

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

		/*
		 * Check the decompressed segment cache.
		 *
		 * The cache is used only when the requested data
		 * is within a segment. Requests that cross
		 * segment boundaries bypass the cache.
		 */
		if (sblkno == eblkno ||
		    (sblkno + 1 == eblkno && eblkoff == 0)) {
			/*
			 * Request doesn't cross a segment boundary,
			 * now check the cache.
			 */
			mutex_enter(&lsp->ls_comp_cache_lock);
			lc = lofi_find_comp_data(lsp, sblkno);
			if (lc != NULL) {
				/*
				 * We've found the decompressed segment
				 * data in the cache; reuse it.
				 */
				bcopy(lc->lc_data + sblkoff, bufaddr,
				    bp->b_bcount);
				mutex_exit(&lsp->ls_comp_cache_lock);
				bp->b_resid = 0;
				error = 0;
				goto done;
			}
			mutex_exit(&lsp->ls_comp_cache_lock);
		}

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request paramaters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Buffers to hold compressed segments are pre-allocated
		 * on a per-thread basis. Find a pre-allocated buffer
		 * that is not currently in use and mark it for use.
		 */
		mutex_enter(&lsp->ls_comp_bufs_lock);
		for (j = 0; j < lofi_taskq_nthreads; j++) {
			if (lsp->ls_comp_bufs[j].inuse == 0) {
				lsp->ls_comp_bufs[j].inuse = 1;
				break;
			}
		}

		mutex_exit(&lsp->ls_comp_bufs_lock);
		ASSERT(j < lofi_taskq_nthreads);

		/*
		 * If the pre-allocated buffer size does not match
		 * the size of the I/O request, re-allocate it with
		 * the appropriate size
		 */
		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
			if (lsp->ls_comp_bufs[j].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[j].buf,
				    lsp->ls_comp_bufs[j].bufsize);
			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
			    KM_SLEEP);
			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
		}
		compressed_seg = lsp->ls_comp_bufs[j].buf;

		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * decompress compressed blocks start
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i <= eblkno; i++) {
			ASSERT(i < lsp->ls_comp_index_sz - 1);

			/*
			 * The last segment is special in that it is
			 * most likely not going to be the same
			 * (uncompressed) size as the other segments.
			 */
			if (i == (lsp->ls_comp_index_sz - 2)) {
				seglen = lsp->ls_uncomp_last_seg_sz;
			} else {
				seglen = lsp->ls_uncomp_seg_sz;
			}

			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
			    lsp->ls_comp_seg_index[i];

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all.
			 *
			 * The variable 'useg' is used (instead of
			 * uncompressed_seg) in this loop to keep a
			 * reference to the uncompressed segment.
			 *
			 * N.B. If 'useg' is replaced with uncompressed_seg,
			 * it leads to memory leaks and heap corruption in
			 * corner cases where compressed segments lie
			 * adjacent to uncompressed segments.
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				useg = cmpbuf + SEGHDR;
			} else {
				if (uncompressed_seg == NULL)
					uncompressed_seg =
					    kmem_alloc(lsp->ls_uncomp_seg_sz,
					    KM_SLEEP);
				useg = uncompressed_seg;
				uncompressed_seg_index = i;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno)
				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);

			bcopy((useg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			sblkoff = 0;

			if (bp->b_resid == 0)
				break;
		} /* decompress compressed blocks ends */

		/*
		 * Skip to done if there is no uncompressed data to cache
		 */
		if (uncompressed_seg == NULL)
			goto done;

		/*
		 * Add the data for the last decompressed segment to
		 * the cache.
		 *
		 * In case the uncompressed segment data was added to (and
		 * is referenced by) the cache, make sure we don't free it
		 * here.
		 */
		mutex_enter(&lsp->ls_comp_cache_lock);
		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
		    uncompressed_seg)) != NULL) {
			uncompressed_seg = NULL;
		}
		mutex_exit(&lsp->ls_comp_cache_lock);

done:
		if (compressed_seg != NULL) {
			mutex_enter(&lsp->ls_comp_bufs_lock);
			lsp->ls_comp_bufs[j].inuse = 0;
			mutex_exit(&lsp->ls_comp_bufs_lock);
		}
		if (uncompressed_seg != NULL)
			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
	} /* end of handling compressed files */

	if ((error == 0) && (syncflag != 0))
		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);

errout:
	if (bufinited && lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}
static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t offset;
	minor_t part;
	diskaddr_t p_lba;
	diskaddr_t p_nblks;
	int shift;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));
	part = LOFI_PART(getminor(bp->b_edev));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	shift = lsp->ls_lbshift;

	p_lba = 0;
	p_nblks = lsp->ls_vp_size >> shift;

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
		    NULL, NULL, 0)) {
			bioerror(bp, ENXIO);
			biodone(bp);
			return (0);
		}
	}

	/* start block past partition end? */
	if (bp->b_lblkno > p_nblks) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	offset = (bp->b_lblkno+p_lba) << shift;	/* offset within file */

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		bioerror(bp, EIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}

	/* make sure we will not pass the file or partition size */
	if (offset == lsp->ls_vp_size ||
	    offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if ((offset > lsp->ls_vp_size) ||
	    (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
	    ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp->b_private = (void *)(uintptr_t)p_lba;	/* partition start */
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}
/*ARGSUSED2*/
static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}
/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	struct lofi_state *lsp;
	dev_t dev = (dev_t)arg;
	int instance;

	instance = LOFI_MINOR2ID(getminor(dev));
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp == NULL)
			return (DDI_FAILURE);
		*result = lsp->ls_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *) (intptr_t)instance;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}
static int
lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
{
	int error = 0;
	int instance = ddi_get_instance(lsp->ls_dip);

	if (labeled == B_TRUE) {
		cmlb_alloc_handle(&lsp->ls_cmlbhandle);
		error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
		    B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
		    CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);

		if (error != DDI_SUCCESS) {
			cmlb_free_handle(&lsp->ls_cmlbhandle);
			lsp->ls_cmlbhandle = NULL;
			error = ENXIO;
		}
	} else {
		/* create minor nodes */
		error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
		    S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
		if (error == DDI_SUCCESS) {
			error = ddi_create_minor_node(lsp->ls_dip,
			    LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
			    DDI_PSEUDO, 0);
			if (error != DDI_SUCCESS) {
				ddi_remove_minor_node(lsp->ls_dip,
				    LOFI_BLOCK_NODE);
				error = ENXIO;
			}
		} else
			error = ENXIO;
	}
	return (error);
}
static int
lofi_zone_bind(struct lofi_state *lsp)
{
	int error = 0;

	mutex_enter(&curproc->p_lock);
	if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (error);
	}
	mutex_exit(&curproc->p_lock);

	if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
	    (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
		rctl_decr_lofi(curproc->p_zone, 1);
		error = EINVAL;
	} else {
		zone_init_ref(&lsp->ls_zone);
		zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
	}
	return (error);
}

static void
lofi_zone_unbind(struct lofi_state *lsp)
{
	(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
	rctl_decr_lofi(curproc->p_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}
static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int error;
	int instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;

	/* lsp alloc+init, soft state is freed in lofi_detach */
	error = ddi_soft_state_zalloc(lofi_statep, instance);
	if (error == DDI_FAILURE) {
		return (ENOMEM);
	}

	lsp = ddi_get_soft_state(lofi_statep, instance);
	lsp->ls_dip = dip;

	if ((error = lofi_zone_bind(lsp)) != 0)
		goto err;

	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
		lofi_zone_unbind(lsp);
		goto lerr;
	}

	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		error = DDI_FAILURE;
		goto merr;
	}

	lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
	if (lsp->ls_kstat == NULL) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
		    DDI_KERNEL_IOCTL);
		error = ENOMEM;
		goto merr;
	}

	lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
	kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
	kstat_install(lsp->ls_kstat);
	return (DDI_SUCCESS);
merr:
	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
	}
	ddi_remove_minor_node(dip, NULL);
	lofi_zone_unbind(lsp);
lerr:
	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
err:
	ddi_soft_state_free(lofi_statep, instance);
	return (error);
}
static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rv;
	int instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/*
	 * Instance 0 is control instance, attaching control instance
	 * will set the lofi up and ready.
	 */
	if (instance == 0) {
		rv = ddi_soft_state_zalloc(lofi_statep, 0);
		if (rv == DDI_FAILURE) {
			return (DDI_FAILURE);
		}
		lsp = ddi_get_soft_state(lofi_statep, instance);
		rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
		    DDI_PSEUDO, 0);
		if (rv == DDI_FAILURE) {
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}
		/* driver handles kernel-issued IOCTLs */
		if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
			ddi_remove_minor_node(dip, NULL);
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}

		zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown,
		    NULL);

		lsp->ls_dip = dip;
	} else {
		if (lofi_online_dev(dip) == DDI_FAILURE)
			return (DDI_FAILURE);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}
static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release state.
	 * The instance 0 is control device, we can not detach it
	 * before other instances are detached.
	 */
	if (instance != 0) {
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
			ddi_soft_state_free(lofi_statep, instance);
			return (DDI_SUCCESS);
		} else
			return (DDI_FAILURE);
	}
	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	return (DDI_SUCCESS);
}
/*
 * With the addition of encryption, we must be careful that encryption key is
 * wiped before kernel's data structures are freed so it cannot accidentally
 * slip out to userland through uninitialized data elsewhere.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}
/*
 * These two functions simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 */
static int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error) {
		error = EFAULT;
		goto err;
	}

	/* ensure NULL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_devpath[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	if (klip->li_id > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	free_lofi_ioctl(klip);
	return (error);
}
static int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way we don't actually have the raw key stored in
	 * a form that we can get it anyway, since we just used it
	 * to create a ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}
*lsp
)
2060 ASSERT(MUTEX_HELD(&lofi_lock
));
2061 if (INGLOBALZONE(curproc
) || lsp
->ls_zone
.zref_zone
== curzone
)
/*
 * Find the lofi state for the given filename. We compare by vnode to
 * allow the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t *vp = NULL;
	int err = 0;
	int rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		return (err);

	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}
/*
 * Find the minor for the given filename, checking the zone can access
 * it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}
/*
 * Fakes up a disk geometry based on the size of the file. This is needed
 * to support newfs on traditional lofi device, but also will provide
 * geometry hint for cmlb.
 */
static void
fake_disk_geometry(struct lofi_state *lsp)
{
	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;

	/* dk_geom - see dkio(7I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
	if (dsize < (2 * 1024 * 1024)) /* floppy? */
		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
	else
		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
	/* in case file is < 100k */
	if (lsp->ls_dkg.dkg_ncyl == 0)
		lsp->ls_dkg.dkg_ncyl = 1;

	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
	lsp->ls_dkg.dkg_nhead = 1;
	lsp->ls_dkg.dkg_rpm = 7200;

	lsp->ls_dkg.dkg_nsect = dsize /
	    (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
}
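/*
 * Worked example of the fabrication above (a sketch, assuming
 * ls_pbshift == 9, i.e. 512-byte blocks): a 100 MB image yields
 * dkg_ncyl = 104857600 / (300 * 1024) = 341 and
 * dkg_nsect = 104857600 / (341 << 9) = 600, i.e. roughly
 * 341 cyl x 1 head x 600 sect x 512 bytes of addressable space.
 */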
/*
 * build vtoc - see dkio(7I)
 *
 * Fakes one big partition based on the size of the file. This is needed
 * because we allow newfs'ing the traditional lofi device and newfs will
 * do several disk ioctls to figure out the geometry and partition information.
 * It uses that information to determine the parameters to pass to mkfs.
 */
static void
fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
{
	bzero(vt, sizeof (struct vtoc));
	vt->v_sanity = VTOC_SANE;
	vt->v_version = V_VERSION;
	(void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
	    sizeof (vt->v_volume));
	vt->v_sectorsz = 1 << lsp->ls_pbshift;
	vt->v_nparts = 1;
	vt->v_part[0].p_tag = V_UNASSIGNED;

	/*
	 * A compressed file is read-only, other files can
	 * be read-write
	 */
	if (lsp->ls_uncomp_seg_sz > 0) {
		vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
	} else {
		vt->v_part[0].p_flag = V_UNMNT;
	}
	vt->v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
}
/*
 * build dk_cinfo - see dkio(7I)
 */
static void
fake_disk_info(dev_t dev, struct dk_cinfo *ci)
{
	bzero(ci, sizeof (struct dk_cinfo));
	(void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
	ci->dki_ctype = DKC_SCSI_CCS;
	(void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
	ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
	ci->dki_partition = LOFI_PART(getminor(dev));
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	ci->dki_maxtransfer = 16;
}
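/*
 * Worked example of the constraint above: with dki_maxtransfer = 16,
 * newfs computes maxcontig = 16 * DEV_BSIZE / bsize, which for an
 * 8192-byte filesystem block is 16 * 512 / 8192 = 1, still non-zero.
 */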
2250 * map in a compressed file
2252 * Read in the header and the index that follows.
2254 * The header is as follows -
2256 * Signature (name of the compression algorithm)
2257 * Compression segment size (a multiple of 512)
2258 * Number of index entries
2259 * Size of the last block
2260 * The array containing the index entries
2262 * The header information is always stored in
2263 * network byte order on disk.
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t	resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}
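/*
 * Illustrative lookup using the state initialized above (not driver code):
 * for an uncompressed byte offset off, the segment number is
 * off >> ls_comp_seg_shift, and its compressed bytes occupy the file range
 * [ls_comp_seg_index[n], ls_comp_seg_index[n + 1]) once the offsets have
 * been rebased by ls_comp_offbase in the loop above. (The ls_vp_size
 * computation uses ls_comp_index_sz - 2 full segments plus the last one,
 * which suggests the index carries one entry past the final segment.)
 *
 *	uint64_t n = off >> lsp->ls_comp_seg_shift;
 *	u_offset_t start = lsp->ls_comp_seg_index[n];
 *	size_t clen = lsp->ls_comp_seg_index[n + 1] - start;
 */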
static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t	resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *	6 bytes:	hex "CFLOFI"
		 *	2 bytes:	version = 0 ... for now
		 *	96 bytes:	reserved1 (not implemented yet)
		 *	4 bytes:	data_sector = 2 ... for now
		 *	more...		not implemented yet
		 */

		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image. Make sure the header area is blank before writing
	 * our own header.
	 */
	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}
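/*
 * Typical userland trigger for this path (illustrative): lofiadm(1M)
 * passes li_crypto_enabled plus the cipher and key material in the ioctl,
 * e.g.
 *
 *	# lofiadm -c aes-256-cbc -a /export/images/secret.img
 *
 * which lands here to validate the key and then read or initialize the
 * CFLOFI header at CRYOFF.
 */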
/*
 * Check to see if the passed in signature is a valid one. If it is
 * valid, return the index into lofi_compress_table.
 *
 * Return -1 if it is invalid
 */
static int
lofi_compress_select(const char *signature)
{
	int i;

	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
			return (i);
	}

	return (-1);
}
static int
lofi_init_compress(struct lofi_state *lsp)
{
	char buf[DEV_BSIZE];
	int compress_index;
	ssize_t	resid;
	int error;

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	if ((compress_index = lofi_compress_select(buf)) == -1)
		return (0);

	/* compression and encryption are mutually exclusive */
	if (lsp->ls_crypto_enabled)
		return (ENOTSUP);

	/* initialize compression info for compressed lofi */
	lsp->ls_comp_algorithm_index = compress_index;
	(void) strlcpy(lsp->ls_comp_algorithm,
	    lofi_compress_table[compress_index].l_name,
	    sizeof (lsp->ls_comp_algorithm));

	/* Finally setup per-thread pre-allocated buffers */
	lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
	    sizeof (struct compbuf), KM_SLEEP);

	return (lofi_map_compressed_file(lsp, buf));
}
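/*
 * Illustrative path to this code (not driver logic): compressed images
 * are produced offline and then attached, e.g.
 *
 *	# lofiadm -C gzip /export/images/fs.img
 *	# lofiadm -a /export/images/fs.img
 *
 * On attach, the first DEV_BSIZE bytes read above begin with the
 * algorithm name, which lofi_compress_select() matches against
 * lofi_compress_table.
 */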
/*
 * Allocate new or proposed id from lofi_id.
 *
 * Special cases for proposed id:
 * 0: not allowed, 0 is id for control device.
 * -1: allocate first usable id from lofi_id.
 * any other value is proposed value from userland
 *
 * returns DDI_SUCCESS or errno.
 */
static int
lofi_alloc_id(int *idp)
{
	int id, error = DDI_SUCCESS;

	if (*idp == -1) {
		id = id_allocff_nosleep(lofi_id);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	} else if (*idp == 0) {
		error = EINVAL;
		goto err;
	} else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
		error = ERANGE;
		goto err;
	} else {
		if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
			error = EEXIST;
			goto err;
		}

		id = id_alloc_specific_nosleep(lofi_id, *idp);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	}
	*idp = id;
err:
	return (error);
}
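/*
 * Illustrative id/minor packing (the shift value is an assumption, see
 * lofi.h for the real LOFI_CMLB_SHIFT): each id owns a block of minors so
 * cmlb can address partitions, minor = (id << LOFI_CMLB_SHIFT) | part.
 * Assuming LOFI_CMLB_SHIFT == 6, id 5 maps to minor 320 and minors
 * 320..383 are its slices; LOFI_MINOR2ID() and LOFI_PART() invert the
 * packing.
 */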
static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		id_free(lofi_id, klip->li_id);
		return (EEXIST);
	}

	return (DDI_SUCCESS);

err:
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
	return (error);
}
static void
lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
{
	char *p = NULL;

	(void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp != NULL)
		p = strrchr(lsp->ls_vp->v_path, '/');
	if (p != NULL)
		(void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
	mutex_exit(&lsp->ls_vp_lock);
	(void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
}
/*
 * copy devlink name from event cache
 */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int error;
	char namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);

	mutex_enter(&lofi_devlink_cache.ln_lock);
	error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl);
	while (error != 0) {
		error = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (error == -1)
			break;
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);
	}

	if (error == 0) {
		if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
			(void) strlcpy(klip->li_devpath, str,
			    sizeof (klip->li_devpath));
		}
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);
}
/*
 * map a file to a minor number. Return the minor number.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	int	id = -1;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int	error;
	struct vnode *vp = NULL;
	vattr_t	vattr;
	int	flag;
	char	namebuf[MAXNAMELEN];

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	mutex_enter(&lofi_lock);

	if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
	    NULL) == 0) {
		error = EBUSY;
		goto err;
	}

	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error)
			goto err;
	}

	if (!V_ISLOFIABLE(vp->v_type)) {
		error = EINVAL;
		goto err;
	}

	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error)
		goto err;

	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto err;
	}

	if (pickminor)
		klip->li_id = (uint32_t)-1;

	if ((error = lofi_create_dev(klip)) != 0)
		goto err;

	id = klip->li_id;
	lsp = ddi_get_soft_state(lofi_statep, id);

	/*
	 * from this point lofi_destroy() is used to clean up on error
	 * make sure the basic data is set
	 */
	lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));

	list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
	    offsetof(struct lofi_comp_cache, lc_list));

	/*
	 * save open mode so file can be closed properly and vnode counts
	 * updated correctly.
	 */
	lsp->ls_openflag = flag;

	lsp->ls_vp = vp;
	lsp->ls_stacked_vp = vp;

	lsp->ls_vp_size = vattr.va_size;
	lsp->ls_vp_comp_size = lsp->ls_vp_size;

	/*
	 * Try to handle stacked lofs vnodes.
	 */
	if (vp->v_type == VREG) {
		vnode_t *realvp;

		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			/*
			 * We need to use the realvp for uniqueness
			 * checking, but keep the stacked vp for
			 * LOFI_GET_FILENAME display.
			 */
			VN_HOLD(realvp);
			lsp->ls_vp = realvp;
		}
	}

	lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
	lsp->ls_pbshift = lsp->ls_lbshift;

	lsp->ls_readonly = klip->li_readonly;
	lsp->ls_uncomp_seg_sz = 0;
	lsp->ls_comp_algorithm[0] = '\0';
	lsp->ls_crypto_offset = 0;

	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
	    LOFI_DRIVER_NAME, id);
	lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
	    minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);

	if ((error = lofi_init_crypto(lsp, klip)) != 0)
		goto err;

	if ((error = lofi_init_compress(lsp)) != 0)
		goto err;

	fake_disk_geometry(lsp);

	/* For unlabeled lofi add Nblocks and Size */
	if (klip->li_labeled == B_FALSE) {
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}

		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    NBLOCKS_PROP_NAME,
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
	}

	list_insert_tail(&lofi_list, lsp);
	/*
	 * Notify we are ready to rock.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_ready = B_TRUE;
	cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);
	mutex_exit(&lofi_lock);

	lofi_copy_devpath(klip);

	if (rvalp)
		*rvalp = id;
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);

err:
	if (lsp != NULL) {
		lofi_destroy(lsp, credp);
	} else if (vp != NULL) {
		(void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL);
		(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
		VN_RELE(vp);
	}

	mutex_exit(&lofi_lock);
	free_lofi_ioctl(klip);
	return (error);
}
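/*
 * Illustrative userland view of this path (not driver logic):
 *
 *	# lofiadm -a /export/images/fs.img
 *	/dev/lofi/1
 *
 * lofiadm issues LOFI_MAP_FILE on /dev/lofictl, which lands here; the
 * returned id becomes the /dev/lofi/<id> devlink name reported back via
 * lofi_copy_devpath().
 */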
/*
 * unmap a file attached to lofi
 */
static int
lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_state *lsp;
	struct lofi_ioctl *klip;
	nvlist_t *nvl = NULL;
	clock_t ticks;
	char name[MAXNAMELEN];
	int err;

	err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (err != 0)
		return (err);

	mutex_enter(&lofi_lock);
	if (byfilename) {
		if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
		    &lsp)) != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (err);
		}
	} else if (klip->li_id == 0) {
		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (ENXIO);
	} else {
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
	}

	if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (ENXIO);
	}

	klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

	/*
	 * If it's still held open, we'll do one of three things:
	 *
	 * If no flag is set, just return EBUSY.
	 *
	 * If the 'cleanup' flag is set, unmap and remove the device when
	 * the last user finishes.
	 *
	 * If the 'force' flag is set, then we forcibly close the underlying
	 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl
	 * will return DKIO_DEV_GONE. When the device is last closed, the
	 * device will be cleaned up appropriately.
	 *
	 * This is complicated by the fact that we may have outstanding
	 * dispatched I/Os. Rather than having a single mutex to serialize all
	 * I/O, we keep a count of the number of outstanding I/O requests
	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
	 * should be dispatched (ls_vp_closereq).
	 *
	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
	 * and then close the underlying vnode.
	 */
	if (is_opened(lsp)) {
		if (klip->li_force) {
			mutex_enter(&lsp->ls_vp_lock);
			lsp->ls_vp_closereq = B_TRUE;
			/* wake up any threads waiting on dkiocstate */
			cv_broadcast(&lsp->ls_vp_cv);
			while (lsp->ls_vp_iocount > 0)
				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
			mutex_exit(&lsp->ls_vp_lock);

			goto out;
		} else if (klip->li_cleanup) {
			lsp->ls_cleanup = 1;
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (0);
		}

		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (EBUSY);
	}

out:
	lofi_free_dev(lsp);
	lofi_destroy(lsp, credp);

	/*
	 * Check the lofi_devlink_cache to see if the device is really gone.
	 * Note: we just wait for the timeout here and don't return an error
	 * if the timer expires. This check is to try to ensure the unmap is
	 * really done when lofiadm -d completes.
	 * Since lofi_lock is held, hopefully the lofiadm -a calls
	 * won't interfere with the unmap.
	 */
	(void) snprintf(name, sizeof (name), "%d", klip->li_id);
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);
	mutex_enter(&lofi_devlink_cache.ln_lock);
	err = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, name, &nvl);
	while (err == 0) {
		err = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (err == -1)
			break;
		err = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    name, &nvl);
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);

	mutex_exit(&lofi_lock);
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);
}
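/*
 * Illustrative userland view (not driver logic): the three unmap policies
 * described above arrive as li_force/li_cleanup in the ioctl issued by
 * lofiadm -d, e.g.
 *
 *	# lofiadm -d /dev/lofi/1	(EBUSY if still open)
 *
 * with the force and cleanup variants selected by the corresponding
 * lofiadm options where the installed lofiadm exposes them.
 */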
/*
 * get the filename given the minor number, or the minor number given
 * the name.
 */
static int
lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_ioctl *klip;
	struct lofi_state *lsp;
	int	error;

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	switch (which) {
	case LOFI_GET_FILENAME:
		if (klip->li_id == 0) {
			free_lofi_ioctl(klip);
			return (EINVAL);
		}

		mutex_enter(&lofi_lock);
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
		if (lsp == NULL || lofi_access(lsp) != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (ENXIO);
		}

		/*
		 * This may fail if, for example, we're trying to look
		 * up a zoned NFS path from the global zone.
		 */
		if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename,
		    sizeof (klip->li_filename), CRED()) != 0) {
			(void) strlcpy(klip->li_filename, "?",
			    sizeof (klip->li_filename));
		}

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;

		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));
		klip->li_crypto_enabled = lsp->ls_crypto_enabled;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	case LOFI_GET_MINOR:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}
		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);

		free_lofi_ioctl(klip);
		return (error);
	case LOFI_CHECK_COMPRESSED:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}

		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));

		mutex_exit(&lofi_lock);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	default:
		free_lofi_ioctl(klip);
		return (EINVAL);
	}
}
static int
uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb,
    struct uscsi_cmd *uscmd)
{
	int rval = EAGAIN;

#ifdef	_MULTI_DATAMODEL
	switch (ddi_model_convert_from(flag & FMODELS)) {
	case DDI_MODEL_ILP32: {
		struct uscsi_cmd32 ucmd32;

		if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) {
			rval = EFAULT;
			goto err;
		}
		uscsi_cmd32touscsi_cmd((&ucmd32), uscmd);
		break;
	}
	case DDI_MODEL_NONE:
		if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
			rval = EFAULT;
			goto err;
		}
		break;
	default:
		rval = EFAULT;
		goto err;
	}
#else
	if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
		rval = EFAULT;
		goto err;
	}
#endif	/* _MULTI_DATAMODEL */
	if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) {
		rval = EFAULT;
		goto err;
	}

	if (cdb->scc_cmd == SCMD_INQUIRY)
		return (0);
err:
	return (rval);
}
static int
lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
    int *rvalp)
{
	int error;
	enum dkio_state dkstate;
	struct lofi_state *lsp;
	int id;

	id = LOFI_MINOR2ID(getminor(dev));

	/* lofi ioctls only apply to the master device */
	if (id == 0) {
		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;

		/*
		 * the query commands only need read-access - i.e., normal
		 * users are allowed to do those on the ctl device as
		 * long as they can open it read-only.
		 */
		switch (cmd) {
		case LOFI_MAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
		case LOFI_MAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
		case LOFI_UNMAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 1, credp, flag));
		case LOFI_UNMAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 0, credp, flag));
		case LOFI_GET_FILENAME:
			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
			    credp, flag));
		case LOFI_GET_MINOR:
			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
			    credp, flag));

		/*
		 * This API made limited sense when this value was fixed
		 * at LOFI_MAX_FILES. However, its use to iterate
		 * across all possible devices in lofiadm means we don't
		 * want to return L_MAXMIN, but the highest
		 * *allocated* id.
		 */
		case LOFI_GET_MAXMINOR:
			id = 0;

			mutex_enter(&lofi_lock);

			for (lsp = list_head(&lofi_list); lsp != NULL;
			    lsp = list_next(&lofi_list, lsp)) {
				int i;

				if (lofi_access(lsp) != 0)
					continue;

				i = ddi_get_instance(lsp->ls_dip);
				if (i > id)
					id = i;
			}

			mutex_exit(&lofi_lock);

			error = ddi_copyout(&id, &lip->li_id,
			    sizeof (lip->li_id), flag);
			if (error)
				return (EFAULT);
			return (0);

		case LOFI_CHECK_COMPRESSED:
			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
			    credp, flag));
		default:
			return (EINVAL);
		}
	}

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL || lsp->ls_vp_closereq) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}
	mutex_exit(&lofi_lock);

	if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
	    "labeled") == 1) {
		error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
		    credp, rvalp, 0);
		if (error != ENOTTY)
			return (error);
	}

	/*
	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
	 * EIO as if the device was no longer present.
	 */
	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
		return (EIO);

	/* these are for faking out utilities like newfs */
	switch (cmd) {
	case DKIOCGMEDIAINFO:
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext media_info;
		int shift = lsp->ls_lbshift;
		int size;

		if (cmd == DKIOCGMEDIAINFOEXT) {
			media_info.dki_pbsize = 1U << lsp->ls_pbshift;
			size = sizeof (struct dk_minfo_ext);
		} else {
			size = sizeof (struct dk_minfo);
		}

		media_info.dki_media_type = DK_FIXED_DISK;
		media_info.dki_lbsize = 1U << shift;
		media_info.dki_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;

		if (ddi_copyout(&media_info, (void *)arg, size, flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i = 0;

		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCGVTOC: {
		struct vtoc vt;

		fake_disk_vtoc(lsp, &vt);

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct vtoc32 vtoc32;

			vtoctovtoc32(vt, vtoc32);
			if (ddi_copyout(&vtoc32, (void *)arg,
			    sizeof (struct vtoc32), flag))
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&vt, (void *)arg,
			    sizeof (struct vtoc), flag))
				return (EFAULT);
			break;
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo ci;

		fake_disk_info(dev, &ci);
		if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCG_VIRTGEOM:
	case DKIOCG_PHYGEOM:
	case DKIOCGGEOM:
		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
		    sizeof (struct dk_geom), flag);
		if (error)
			return (EFAULT);
		return (0);
	case DKIOCSTATE:
		/*
		 * Normally, lofi devices are always in the INSERTED state. If
		 * a device is forcefully unmapped, then the device transitions
		 * to the DKIO_DEV_GONE state.
		 */
		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
		    flag) != 0)
			return (EFAULT);

		mutex_enter(&lsp->ls_vp_lock);
		lsp->ls_vp_iocount++;
		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
		    !lsp->ls_vp_closereq) {
			/*
			 * By virtue of having the device open, we know that
			 * 'lsp' will remain valid when we return.
			 */
			if (!cv_wait_sig(&lsp->ls_vp_cv,
			    &lsp->ls_vp_lock)) {
				lsp->ls_vp_iocount--;
				cv_broadcast(&lsp->ls_vp_cv);
				mutex_exit(&lsp->ls_vp_lock);
				return (EINTR);
			}
		}

		dkstate = (!lsp->ls_vp_closereq && lsp->ls_vp != NULL ?
		    DKIO_INSERTED : DKIO_DEV_GONE);
		lsp->ls_vp_iocount--;
		cv_broadcast(&lsp->ls_vp_cv);
		mutex_exit(&lsp->ls_vp_lock);

		if (ddi_copyout(&dkstate, (void *)arg,
		    sizeof (dkstate), flag) != 0)
			return (EFAULT);
		return (0);
	case USCSICMD: {
		struct uscsi_cmd uscmd;
		union scsi_cdb cdb;

		if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
			struct scsi_inquiry inq = {0};

			lofi_create_inquiry(lsp, &inq);
			if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		} else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
			struct scsi_capacity capacity;

			capacity.capacity =
			    BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
			    lsp->ls_lbshift);
			capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
			if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		}

		uscmd.uscsi_rqstatus = 0xff;
#ifdef	_MULTI_DATAMODEL
		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct uscsi_cmd32 ucmd32;

			uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
			if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
			    flag) != 0)
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
			    flag) != 0)
				return (EFAULT);
			break;
		default:
			return (EFAULT);
		}
#else
		if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
			return (EFAULT);
#endif	/* _MULTI_DATAMODEL */
		return (EIO);
	}
	default:
		cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
		return (ENOTTY);
	}
}
static int
lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	struct lofi_state *lsp;
	int rc;

	lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
	if (lsp == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
	if (rc == DDI_PROP_SUCCESS)
		return (rc);

	return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
	    name, valuep, lengthp));
}
static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	lofi_prop_op,		/* prop_op */
	0,			/* streamtab */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	nodev,			/* async I/O read entry point */
	nodev			/* async I/O write entry point */
};

static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"loopback file driver",
	&lofi_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
int
_init(void)
{
	int error;

	list_create(&lofi_list, sizeof (struct lofi_state),
	    offsetof(struct lofi_state, ls_list));

	error = ddi_soft_state_init((void **)&lofi_statep,
	    sizeof (struct lofi_state), 0);
	if (error) {
		list_destroy(&lofi_list);
		return (error);
	}

	/*
	 * The minor number is stored as id << LOFI_CMLB_SHIFT as
	 * we need to reserve space for cmlb minor numbers.
	 * This will leave out 4096 id values on 32bit kernel, which should
	 * still leave plenty of ids available.
	 */
	lofi_id = id_space_create("lofi_id", 1,
	    (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));

	if (lofi_id == NULL) {
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
		return (DDI_FAILURE);
	}

	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);

	error = mod_install(&modlinkage);
	if (error) {
		id_space_destroy(lofi_id);
		mutex_destroy(&lofi_lock);
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
	}

	return (error);
}
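/*
 * Illustrative bound for the id space above (constants assumed, see the
 * respective headers): with L_BITSMINOR == 18 and LOFI_CMLB_SHIFT == 6,
 * id_space_create() covers ids 1 .. (1 << 12) - 1, i.e. 4095 usable
 * mappings, since every id reserves (1 << LOFI_CMLB_SHIFT) minors for
 * its partitions.
 */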
int
_fini(void)
{
	int error;

	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	mutex_exit(&lofi_lock);

	error = mod_remove(&modlinkage);
	if (error)
		return (error);

	mutex_destroy(&lofi_lock);
	id_space_destroy(lofi_id);
	ddi_soft_state_fini((void **)&lofi_statep);
	list_destroy(&lofi_list);

	return (error);
}
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}