/*
 *  linux/fs/nfs/blocklayout/blocklayoutdev.c
 *
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *  All rights reserved.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */
#include <linux/module.h>
#include <linux/buffer_head.h> /* __bread */

#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hash.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

static int decode_sector_number(__be32 **rp, sector_t *sp)
{
        uint64_t s;

        *rp = xdr_decode_hyper(*rp, &s);
        if (s & 0x1ff) {
                printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
                return -1;
        }
        *sp = s >> SECTOR_SHIFT;
        return 0;
}
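
/*
 * Illustrative note (added, not in the original source): the on-the-wire
 * values are byte offsets and lengths, so with 512-byte sectors
 * (SECTOR_SHIFT == 9) a decoded value of 0x40000 becomes sector 0x200,
 * while 0x40001 fails the low-nine-bit alignment check above and is
 * rejected with the "sector not aligned" warning.
 */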

/*
 * Release the block device
 */
int nfs4_blkdev_put(struct block_device *bdev)
{
        dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
                        MINOR(bdev->bd_dev));
        return blkdev_put(bdev, FMODE_READ);
}

ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
                         size_t mlen)
{
        struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
                                         nfs_net_id);

        if (mlen != sizeof (struct bl_dev_msg))
                return -EINVAL;

        if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
                return -EFAULT;

        wake_up(&nn->bl_wq);

        return mlen;
}
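
/*
 * Note (added commentary, not in the original file): this downcall is the
 * second half of the handshake driven from nfs4_blk_decode_device() below.
 * The kernel queues a BL_DEVICE_MOUNT request on the per-net rpc_pipefs
 * pipe, the userspace block layout daemon resolves it to a dev_t, and its
 * reply arrives here as a struct bl_dev_msg that is stored in
 * nn->bl_mount_reply before the waiter on nn->bl_wq is woken.
 */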

void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
        struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);

        if (msg->errno >= 0)
                return;
        wake_up(bl_pipe_msg->bl_wq);
}

/*
 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
 */
struct pnfs_block_dev *
nfs4_blk_decode_device(struct nfs_server *server,
                       struct pnfs_device *dev)
{
        struct pnfs_block_dev *rv;
        struct block_device *bd = NULL;
        struct bl_pipe_msg bl_pipe_msg;
        struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
        struct bl_msg_hdr bl_msg = {
                .type = BL_DEVICE_MOUNT,
                .totallen = dev->mincount,
        };
        uint8_t *dataptr;
        DECLARE_WAITQUEUE(wq, current);
        int offset, len, i, rc;
        struct net *net = server->nfs_client->cl_net;
        struct nfs_net *nn = net_generic(net, nfs_net_id);
        struct bl_dev_msg *reply = &nn->bl_mount_reply;

        dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
        dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
                dev->mincount);

        bl_pipe_msg.bl_wq = &nn->bl_wq;
        memset(msg, 0, sizeof(*msg));
        msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
        if (!msg->data) {
                rv = ERR_PTR(-ENOMEM);
                goto out;
        }

        memcpy(msg->data, &bl_msg, sizeof(bl_msg));
        dataptr = (uint8_t *) msg->data;
        len = dev->mincount;
        offset = sizeof(bl_msg);
        for (i = 0; len > 0; i++) {
                memcpy(&dataptr[offset], page_address(dev->pages[i]),
                                len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
                len -= PAGE_CACHE_SIZE;
                offset += PAGE_CACHE_SIZE;
        }
        msg->len = sizeof(bl_msg) + dev->mincount;

        dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
        add_wait_queue(&nn->bl_wq, &wq);
        rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
        if (rc < 0) {
                remove_wait_queue(&nn->bl_wq, &wq);
                rv = ERR_PTR(rc);
                goto out;
        }

        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&nn->bl_wq, &wq);

        if (reply->status != BL_DEVICE_REQUEST_PROC) {
                dprintk("%s failed to open device: %d\n",
                        __func__, reply->status);
                rv = ERR_PTR(-EINVAL);
                goto out;
        }

        bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
                               FMODE_READ, NULL);
        if (IS_ERR(bd)) {
                dprintk("%s failed to open device : %ld\n", __func__,
                        PTR_ERR(bd));
                rv = ERR_CAST(bd);
                goto out;
        }

        rv = kzalloc(sizeof(*rv), GFP_NOFS);
        if (!rv) {
                rv = ERR_PTR(-ENOMEM);
                goto out;
        }

        rv->bm_mdev = bd;
        memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
        rv->net = net;
        dprintk("%s Created device %s with bd_block_size %u\n",
                __func__,
                bd->bd_disk->disk_name,
                bd->bd_block_size);

out:
        kfree(msg->data);
        return rv;
}
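
/*
 * Added remark (assumption about the caller, not part of this file): the
 * pnfs_block_dev returned above is expected to be linked onto the per-mount
 * bm_devlist by its caller, which is what allows translate_devid() below to
 * find the opened block_device again when extents referencing this deviceid
 * are decoded.
 */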

/* Map deviceid returned by the server to constructed block_device */
static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
                                            struct nfs4_deviceid *id)
{
        struct block_device *rv = NULL;
        struct block_mount_id *mid;
        struct pnfs_block_dev *dev;

        dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
        mid = BLK_ID(lo);
        spin_lock(&mid->bm_lock);
        list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
                if (memcmp(id->data, dev->bm_mdevid.data,
                           NFS4_DEVICEID4_SIZE) == 0) {
                        rv = dev->bm_mdev;
                        goto out;
                }
        }
out:
        spin_unlock(&mid->bm_lock);
        dprintk("%s returning %p\n", __func__, rv);
        return rv;
}

/* Tracks info needed to ensure extents in layout obey constraints of spec */
struct layout_verification {
        u32 mode;       /* R or RW */
        u64 start;      /* Expected start of next non-COW extent */
        u64 inval;      /* Start of INVAL coverage */
        u64 cowread;    /* End of COW read coverage */
};
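
/*
 * Worked example (illustrative only, not in the original): for a read/write
 * layout starting at sector 0, an INVALID_DATA extent covering [0, 100)
 * advances lv->start to 100 while lv->inval stays at 0; a following
 * READ_DATA extent over [0, 100) then supplies the copy-on-write source,
 * moving lv->inval and lv->cowread to 100.  verify_extent() below enforces
 * exactly these transitions.
 */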

/* Verify the extent meets the layout requirements of the pnfs-block draft,
 * section 2.3.1.
 */
static int verify_extent(struct pnfs_block_extent *be,
                         struct layout_verification *lv)
{
        if (lv->mode == IOMODE_READ) {
                if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
                    be->be_state == PNFS_BLOCK_INVALID_DATA)
                        return -EIO;
                if (be->be_f_offset != lv->start)
                        return -EIO;
                lv->start += be->be_length;
                return 0;
        }
        /* lv->mode == IOMODE_RW */
        if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
                if (be->be_f_offset != lv->start)
                        return -EIO;
                if (lv->cowread > lv->start)
                        return -EIO;
                lv->start += be->be_length;
                lv->inval = lv->start;
                return 0;
        } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
                if (be->be_f_offset != lv->start)
                        return -EIO;
                lv->start += be->be_length;
                return 0;
        } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
                if (be->be_f_offset > lv->start)
                        return -EIO;
                if (be->be_f_offset < lv->inval)
                        return -EIO;
                if (be->be_f_offset < lv->cowread)
                        return -EIO;
                /* It looks like you might want to min this with lv->start,
                 * but you really don't.
                 */
                lv->inval = lv->inval + be->be_length;
                lv->cowread = be->be_f_offset + be->be_length;
                return 0;
        } else
                return -EIO;
}
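
/*
 * Summary (added commentary, not in the original): read-only layouts may
 * contain only READ_DATA or NONE_DATA extents, and they must tile the range
 * contiguously.  Read/write layouts must likewise be contiguous in their
 * READWRITE_DATA and INVALID_DATA extents, with READ_DATA extents acting as
 * copy-on-write sources whose coverage is tracked via lv->inval and
 * lv->cowread.
 */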

/* XDR decode pnfs_block_layout4 structure */
int
nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
                           struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
        int i, status = -EIO;
        uint32_t count;
        struct pnfs_block_extent *be = NULL, *save;
        struct xdr_stream stream;
        struct xdr_buf buf;
        struct page *scratch;
        __be32 *p;
        struct layout_verification lv = {
                .mode = lgr->range.iomode,
                .start = lgr->range.offset >> SECTOR_SHIFT,
                .inval = lgr->range.offset >> SECTOR_SHIFT,
                .cowread = lgr->range.offset >> SECTOR_SHIFT,
        };
        LIST_HEAD(extents);

        dprintk("---> %s\n", __func__);

        scratch = alloc_page(gfp_flags);
        if (!scratch)
                return -ENOMEM;

        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

        p = xdr_inline_decode(&stream, 4);
        if (unlikely(!p))
                goto out_err;

        count = be32_to_cpup(p++);

        dprintk("%s enter, number of extents %i\n", __func__, count);
        p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
        if (unlikely(!p))
                goto out_err;

        /* Decode individual extents, putting them in temporary
         * staging area until whole layout is decoded to make error
         * recovery easier.
         */
        for (i = 0; i < count; i++) {
                be = bl_alloc_extent();
                if (!be) {
                        status = -ENOMEM;
                        goto out_err;
                }
                memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
                p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
                be->be_mdev = translate_devid(lo, &be->be_devid);
                if (!be->be_mdev)
                        goto out_err;

                /* The next three values are read in as bytes,
                 * but stored as 512-byte sector lengths
                 */
                if (decode_sector_number(&p, &be->be_f_offset) < 0)
                        goto out_err;
                if (decode_sector_number(&p, &be->be_length) < 0)
                        goto out_err;
                if (decode_sector_number(&p, &be->be_v_offset) < 0)
                        goto out_err;
                be->be_state = be32_to_cpup(p++);
                if (be->be_state == PNFS_BLOCK_INVALID_DATA)
                        be->be_inval = &bl->bl_inval;
                if (verify_extent(be, &lv)) {
                        dprintk("%s verify failed\n", __func__);
                        goto out_err;
                }
                list_add_tail(&be->be_node, &extents);
        }
        if (lgr->range.offset + lgr->range.length !=
                        lv.start << SECTOR_SHIFT) {
                dprintk("%s Final length mismatch\n", __func__);
                be = NULL;
                goto out_err;
        }
        if (lv.start < lv.cowread) {
                dprintk("%s Final uncovered COW extent\n", __func__);
                be = NULL;
                goto out_err;
        }
        /* Extents decoded properly, now try to merge them in to
         * existing layout extents.
         */
        spin_lock(&bl->bl_ext_lock);
        list_for_each_entry_safe(be, save, &extents, be_node) {
                list_del(&be->be_node);
                status = bl_add_merge_extent(bl, be);
                if (status) {
                        spin_unlock(&bl->bl_ext_lock);
                        /* This is a fairly catastrophic error, as the
                         * entire layout extent lists are now corrupted.
                         * We should have some way to distinguish this.
                         */
                        be = NULL;
                        goto out_err;
                }
        }
        spin_unlock(&bl->bl_ext_lock);
        status = 0;
out:
        __free_page(scratch);
        dprintk("%s returns %i\n", __func__, status);
        return status;

out_err:
        bl_put_extent(be);
        while (!list_empty(&extents)) {
                be = list_first_entry(&extents, struct pnfs_block_extent,
                                      be_node);
                list_del(&be->be_node);
                bl_put_extent(be);
        }
        goto out;
}