/*
 *  linux/fs/nfs/blocklayout/blocklayoutdev.c
 *
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */
#include <linux/module.h>
#include <linux/buffer_head.h>	/* __bread */

#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hash.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY	NFSDBG_PNFS_LD
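
/* Decode an 8-byte XDR value given in bytes and convert it to a 512-byte
 * sector count; values that are not sector aligned are rejected.
 */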
static int decode_sector_number(__be32 **rp, sector_t *sp)
{
	uint64_t s;

	*rp = xdr_decode_hyper(*rp, &s);
	if (s & 0x1ff) {
		printk(KERN_WARNING "%s: sector not aligned\n", __func__);
		return -1;
	}
	*sp = s >> SECTOR_SHIFT;
	return 0;
}

/* Open a block_device by device number. */
struct block_device *nfs4_blkdev_get(dev_t dev)
{
	struct block_device *bd;

	dprintk("%s enter\n", __func__);
	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
	if (IS_ERR(bd))
		goto fail;
	return bd;
fail:
	dprintk("%s failed to open device : %ld\n",
			__func__, PTR_ERR(bd));
	return NULL;
}

/*
 * Release the block device
 */
int nfs4_blkdev_put(struct block_device *bdev)
{
	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
			MINOR(bdev->bd_dev));
	return blkdev_put(bdev, FMODE_READ);
}

static struct bl_dev_msg bl_mount_reply;
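
/* Downcall from the userspace device-mapping daemon: stash its reply in
 * bl_mount_reply and wake the task waiting in nfs4_blk_decode_device().
 */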
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
			 size_t mlen)
{
	if (mlen != sizeof (struct bl_dev_msg))
		return -EINVAL;

	if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
		return -EFAULT;

	wake_up(&bl_wq);

	return mlen;
}
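
/* Upcall message destructor: if the upcall failed, wake the waiter so it
 * is not left sleeping for a reply that will never arrive.
 */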
void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
	if (msg->errno >= 0)
		return;
	wake_up(&bl_wq);
}

/*
 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
 */
struct pnfs_block_dev *
nfs4_blk_decode_device(struct nfs_server *server,
		       struct pnfs_device *dev)
{
	struct pnfs_block_dev *rv;
	struct block_device *bd = NULL;
	struct rpc_pipe_msg msg;
	struct bl_msg_hdr bl_msg = {
		.type = BL_DEVICE_MOUNT,
		.totallen = dev->mincount,
	};
	uint8_t *dataptr;
	DECLARE_WAITQUEUE(wq, current);
	struct bl_dev_msg *reply = &bl_mount_reply;
	int offset, len, i, rc;

	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
		dev->mincount);

	memset(&msg, 0, sizeof(msg));
	msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
	if (!msg.data) {
		rv = ERR_PTR(-ENOMEM);
		goto out;
	}

	memcpy(msg.data, &bl_msg, sizeof(bl_msg));
	dataptr = (uint8_t *) msg.data;
	len = dev->mincount;
	offset = sizeof(bl_msg);
	for (i = 0; len > 0; i++) {
		memcpy(&dataptr[offset], page_address(dev->pages[i]),
				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
		len -= PAGE_CACHE_SIZE;
		offset += PAGE_CACHE_SIZE;
	}
	msg.len = sizeof(bl_msg) + dev->mincount;
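
	/* Hand the raw device description to the userspace daemon over the
	 * rpc_pipefs pipe, then sleep on bl_wq until bl_pipe_downcall()
	 * stores the daemon's reply in bl_mount_reply and wakes us.
	 */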
	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
	add_wait_queue(&bl_wq, &wq);
	rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
	if (rc < 0) {
		remove_wait_queue(&bl_wq, &wq);
		rv = ERR_PTR(rc);
		goto out;
	}

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&bl_wq, &wq);

	if (reply->status != BL_DEVICE_REQUEST_PROC) {
		dprintk("%s failed to open device: %d\n",
			__func__, reply->status);
		rv = ERR_PTR(-EINVAL);
		goto out;
	}

	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
	if (IS_ERR(bd)) {
		rc = PTR_ERR(bd);
		dprintk("%s failed to open device : %d\n", __func__, rc);
		rv = ERR_PTR(rc);
		goto out;
	}

	rv = kzalloc(sizeof(*rv), GFP_NOFS);
	if (!rv) {
		rv = ERR_PTR(-ENOMEM);
		goto out;
	}

	rv->bm_mdev = bd;
	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
	dprintk("%s Created device %s with bd_block_size %u\n",
		__func__,
		bd->bd_disk->disk_name,
		bd->bd_block_size);

out:
	kfree(msg.data);
	return rv;
}

/* Map deviceid returned by the server to constructed block_device */
static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
					    struct nfs4_deviceid *id)
{
	struct block_device *rv = NULL;
	struct block_mount_id *mid;
	struct pnfs_block_dev *dev;

	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
	mid = BLK_ID(lo);
	spin_lock(&mid->bm_lock);
	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
		if (memcmp(id->data, dev->bm_mdevid.data,
			   NFS4_DEVICEID4_SIZE) == 0) {
			rv = dev->bm_mdev;
			goto out;
		}
	}
out:
	spin_unlock(&mid->bm_lock);
	dprintk("%s returning %p\n", __func__, rv);
	return rv;
}

/* Tracks info needed to ensure extents in layout obey constraints of spec */
struct layout_verification {
	u32 mode;	/* R or RW */
	u64 start;	/* Expected start of next non-COW extent */
	u64 inval;	/* Start of INVAL coverage */
	u64 cowread;	/* End of COW read coverage */
};
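
/* As extents are verified in file-offset order, start tracks the next
 * expected offset of ordinary (non-COW) data, while inval and cowread track
 * how far INVALID_DATA initialization coverage and copy-on-write read
 * coverage extend.
 */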

/* Verify the extent meets the layout requirements of the pnfs-block draft.
 */
static int verify_extent(struct pnfs_block_extent *be,
			 struct layout_verification *lv)
{
	if (lv->mode == IOMODE_READ) {
		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		    be->be_state == PNFS_BLOCK_INVALID_DATA)
			return -EIO;
		if (be->be_f_offset != lv->start)
			return -EIO;
		lv->start += be->be_length;
		return 0;
	}
	/* lv->mode == IOMODE_RW */
	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
		if (be->be_f_offset != lv->start)
			return -EIO;
		if (lv->cowread > lv->start)
			return -EIO;
		lv->start += be->be_length;
		lv->inval = lv->start;
		return 0;
	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		if (be->be_f_offset != lv->start)
			return -EIO;
		lv->start += be->be_length;
		return 0;
	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
		if (be->be_f_offset > lv->start)
			return -EIO;
		if (be->be_f_offset < lv->inval)
			return -EIO;
		if (be->be_f_offset < lv->cowread)
			return -EIO;
		/* It looks like you might want to min this with lv->start,
		 * but you really don't.
		 */
		lv->inval = lv->inval + be->be_length;
		lv->cowread = be->be_f_offset + be->be_length;
		return 0;
	} else
		return -EIO;
}

/* XDR decode pnfs_block_layout4 structure */
int
nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
	int i, status = -EIO;
	uint32_t count;
	struct pnfs_block_extent *be = NULL, *save;
	struct xdr_stream stream;
	struct xdr_buf buf;
	struct page *scratch;
	__be32 *p;
	struct layout_verification lv = {
		.mode = lgr->range.iomode,
		.start = lgr->range.offset >> SECTOR_SHIFT,
		.inval = lgr->range.offset >> SECTOR_SHIFT,
		.cowread = lgr->range.offset >> SECTOR_SHIFT,
	};
	LIST_HEAD(extents);

	dprintk("---> %s\n", __func__);
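
	/* The scratch page lets xdr_inline_decode() hand back a contiguous
	 * pointer even when an object straddles a page boundary in the
	 * layout's page list.
	 */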
	scratch = alloc_page(gfp_flags);
	if (!scratch)
		return -ENOMEM;

	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err;

	count = be32_to_cpup(p++);

	dprintk("%s enter, number of extents %i\n", __func__, count);
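	/* Each extent on the wire is a deviceid plus three 8-byte values
	 * (file offset, length, volume offset) and a 4-byte state word,
	 * i.e. 28 bytes of fixed fields, so the whole array can be pulled
	 * in at once.
	 */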
	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
	if (unlikely(!p))
		goto out_err;

	/* Decode individual extents, putting them in temporary
	 * staging area until whole layout is decoded to make error
	 * recovery easier.
	 */
	for (i = 0; i < count; i++) {
		be = bl_alloc_extent();
		if (!be) {
			status = -ENOMEM;
			goto out_err;
		}
		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
		be->be_mdev = translate_devid(lo, &be->be_devid);
		if (!be->be_mdev)
			goto out_err;

		/* The next three values are read in as bytes,
		 * but stored as 512-byte sector lengths
		 */
		if (decode_sector_number(&p, &be->be_f_offset) < 0)
			goto out_err;
		if (decode_sector_number(&p, &be->be_length) < 0)
			goto out_err;
		if (decode_sector_number(&p, &be->be_v_offset) < 0)
			goto out_err;
		be->be_state = be32_to_cpup(p++);
		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
			be->be_inval = &bl->bl_inval;
		if (verify_extent(be, &lv)) {
			dprintk("%s verify failed\n", __func__);
			goto out_err;
		}
		list_add_tail(&be->be_node, &extents);
	}
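	/* The decoded extents must exactly cover the byte range granted by
	 * the server, and in RW layouts every COW read region must end up
	 * covered by a writable extent.
	 */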
	if (lgr->range.offset + lgr->range.length !=
			lv.start << SECTOR_SHIFT) {
		dprintk("%s Final length mismatch\n", __func__);
		be = NULL;
		goto out_err;
	}
	if (lv.start < lv.cowread) {
		dprintk("%s Final uncovered COW extent\n", __func__);
		be = NULL;
		goto out_err;
	}
	/* Extents decoded properly, now try to merge them into
	 * existing layout extents.
	 */
	spin_lock(&bl->bl_ext_lock);
	list_for_each_entry_safe(be, save, &extents, be_node) {
		list_del(&be->be_node);
		status = bl_add_merge_extent(bl, be);
		if (status) {
			spin_unlock(&bl->bl_ext_lock);
			/* This is a fairly catastrophic error, as the
			 * entire layout extent lists are now corrupted.
			 * We should have some way to distinguish this.
			 */
			be = NULL;
			goto out_err;
		}
	}
	spin_unlock(&bl->bl_ext_lock);
	status = 0;
out:
	__free_page(scratch);
	dprintk("%s returns %i\n", __func__, status);
	return status;

out_err:
	bl_put_extent(be);
	while (!list_empty(&extents)) {
		be = list_first_entry(&extents, struct pnfs_block_extent,
				      be_node);
		list_del(&be->be_node);
		bl_put_extent(be);
	}
	goto out;
}