/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * (part of code stolen from loop.c)
 *
 * 97-3-25 compiled 0-th version, not yet tested it
 *   (it did not work, BTW) (later that day) HEY! it works!
 *   (bit later) hmm, not that much... 2:00am next day:
 *   yes, it works, but it gives something like 50kB/sec
 * 97-4-01 complete rewrite to make it possible for many requests at
 *   once to be processed
 * 97-4-11 Making protocol independent of endianness etc.
 * 97-9-13 Cosmetic changes
 * 98-5-13 Attempt to make 64-bit-clean on 64-bit machines
 * 99-1-11 Attempt to make 64-bit-clean on 32-bit machines <ankry@mif.pg.gda.pl>
 * 01-2-27 Fix to store proper blockcount for kernel (calculated using
 *   BLOCK_SIZE_BITS, not device blocksize) <aga@permonline.ru>
 * 01-3-11 Make nbd work with new Linux block layer code. It now supports
 *   plugging like all the other block devices. Also added in MSG_MORE to
 *   reduce number of partial TCP segments sent. <steve@chygwyn.com>
 * 01-12-6 Fix deadlock condition by making queue locks independent of
 *   the transmit lock. <steve@chygwyn.com>
 * 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes.
 *   <Paul.Clements@SteelEye.com> <James.Bottomley@SteelEye.com>
 * 03-06-22 Make nbd work with new linux 2.5 block layer design. This fixes
 *   memory corruption from module removal and possible memory corruption
 *   from sending/receiving disk data. <ldl@aros.net>
 *
 * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
 * why not: would need verify_area and friends, would share yet another
 *          structure with userland
 */
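
/*
 * For reference: userland drives this driver through the ioctls handled
 * in nbd_ioctl() below.  A minimal sketch of a client follows; error
 * handling is omitted, nr_blocks is the desired device size in blocks,
 * and connect_to_server() is a hypothetical helper that returns a
 * connected TCP socket fd:
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nbd.h>
 *
 *	int nbd  = open("/dev/nbd0", O_RDWR);
 *	int sock = connect_to_server();
 *	ioctl(nbd, NBD_SET_BLKSIZE, 1024);
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd, NBD_SET_SOCK, sock);
 *	ioctl(nbd, NBD_DO_IT);	 (blocks in nbd_do_it() until disconnect)
 */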

#define PARANOIA
#include <linux/major.h>

#include <linux/blk.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <net/sock.h>

#include <linux/devfs_fs_kernel.h>

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

#define LO_MAGIC 0x68797548

static struct nbd_device nbd_dev[MAX_NBD];

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static spinlock_t nbd_lock = SPIN_LOCK_UNLOCKED;

#define DEBUG( s )
/* #define DEBUG( s ) printk( s ) */

#ifdef PARANOIA
static int requests_in;
static int requests_out;
#endif

static void nbd_end_request(struct request *req)
{
	int uptodate = (req->errors == 0) ? 1 : 0;
	request_queue_t *q = req->q;
	unsigned long flags;

#ifdef PARANOIA
	requests_out++;
#endif
	spin_lock_irqsave(q->queue_lock, flags);
	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
		end_that_request_last(req);
	}
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static int nbd_open(struct inode *inode, struct file *file)
{
	struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
	lo->refcnt++;
	return 0;
}

/*
 *  Send or receive packet.  Loops until the whole buffer has been
 *  transferred (or an error occurs, or SIGKILL interrupts it).
 */
static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags)
{
	mm_segment_t oldfs;
	int result;
	struct msghdr msg;
	struct iovec iov;
	unsigned long flags;
	sigset_t oldset;

	oldfs = get_fs();
	set_fs(get_ds());
	/* Allow interception of SIGKILL only
	 * Don't allow other signals to interrupt the transmission */
	spin_lock_irqsave(&current->sighand->siglock, flags);
	oldset = current->blocked;
	sigfillset(&current->blocked);
	sigdelsetmask(&current->blocked, sigmask(SIGKILL));
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);

	do {
		sock->sk->sk_allocation = GFP_NOIO;
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = sock_sendmsg(sock, &msg, size);
		else
			result = sock_recvmsg(sock, &msg, size, 0);

		if (signal_pending(current)) {
			siginfo_t info;
			spin_lock_irqsave(&current->sighand->siglock, flags);
			printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n",
				current->pid, current->comm,
				dequeue_signal(current, &current->blocked, &info));
			spin_unlock_irqrestore(&current->sighand->siglock, flags);
			result = -EINTR;
			break;
		}

		if (result <= 0) {
#ifdef PARANOIA
			printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n",
				send ? "send" : "receive", (long) sock, (long) buf, size, result);
#endif
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

	spin_lock_irqsave(&current->sighand->siglock, flags);
	current->blocked = oldset;
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);

	set_fs(oldfs);
	return result;
}

static inline int sock_send_bvec(struct socket *sock, struct bio_vec *bvec,
				 int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
	result = nbd_xmit(1, sock, kaddr + bvec->bv_offset, bvec->bv_len,
			  flags);
	kunmap(bvec->bv_page);
	return result;
}

#define FAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; }

void nbd_send_req(struct nbd_device *lo, struct request *req)
{
	int result, i, flags;
	struct nbd_request request;
	unsigned long size = req->nr_sectors << 9;
	struct socket *sock = lo->sock;

	DEBUG("NBD: sending control, ");
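
	/*
	 * The control packet sent ahead of any data is a struct nbd_request
	 * as declared in <linux/nbd.h>: magic (be32), type (be32), handle
	 * (8 opaque bytes - here the struct request pointer, echoed back
	 * verbatim by the server), from (be64 byte offset) and len (be32
	 * byte count).  Packed, that is the 28 bytes nbd_init() insists on.
	 */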
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(nbd_cmd(req));
	request.from = cpu_to_be64((u64) req->sector << 9);
	request.len = htonl(size);
	memcpy(request.handle, &req, sizeof(req));

	down(&lo->tx_lock);

	if (!sock || !lo->sock) {
		printk(KERN_ERR "NBD: Attempted sendmsg to closed socket\n");
		goto error_out;
	}

	result = nbd_xmit(1, sock, (char *) &request, sizeof(request), nbd_cmd(req) == NBD_CMD_WRITE ? MSG_MORE : 0);
	if (result <= 0)
		FAIL("Sendmsg failed for control.");

	if (nbd_cmd(req) == NBD_CMD_WRITE) {
		struct bio *bio;
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_bio(bio, req) {
			struct bio_vec *bvec;
			bio_for_each_segment(bvec, bio, i) {
				flags = 0;
				if ((i < (bio->bi_vcnt - 1)) || bio->bi_next)
					flags = MSG_MORE;
				DEBUG("data, ");
				result = sock_send_bvec(sock, bvec, flags);
				if (result <= 0)
					FAIL("Send data failed.");
			}
		}
	}
	up(&lo->tx_lock);
	return;

 error_out:
	up(&lo->tx_lock);
	req->errors++;
}

static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
{
	struct request *req;
	struct list_head *tmp;
	struct request *xreq;

	memcpy(&xreq, handle, sizeof(xreq));

	spin_lock(&lo->queue_lock);
	list_for_each(tmp, &lo->queue_head) {
		req = list_entry(tmp, struct request, queuelist);
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
		spin_unlock(&lo->queue_lock);
		return req;
	}
	spin_unlock(&lo->queue_lock);
	return NULL;
}

static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);
	result = nbd_xmit(0, sock, kaddr + bvec->bv_offset, bvec->bv_len,
			  MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

#define HARDFAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; }
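
/*
 * A reply on the wire is a struct nbd_reply as declared in <linux/nbd.h>:
 * magic (be32), error (be32) and the 8-byte handle copied verbatim from
 * the request.  That handle is what nbd_find_request() above uses to match
 * the reply to one of the requests parked on lo->queue_head.
 */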
struct request *nbd_read_stat(struct nbd_device *lo)
		/* NULL returned = something went wrong, inform userspace */
{
	int result;
	struct nbd_reply reply;
	struct request *req;
	struct socket *sock = lo->sock;

	DEBUG("reading control, ");
	reply.magic = 0;
	result = nbd_xmit(0, sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0)
		HARDFAIL("Recv control failed.");
	req = nbd_find_request(lo, reply.handle);
	if (req == NULL)
		HARDFAIL("Unexpected reply");

	DEBUG("ok, ");
	if (ntohl(reply.magic) != NBD_REPLY_MAGIC)
		HARDFAIL("Not enough magic.");
	if (ntohl(reply.error))
		FAIL("Other side returned error.");

	if (nbd_cmd(req) == NBD_CMD_READ) {
		int i;
		struct bio *bio;
		DEBUG("data, ");
		rq_for_each_bio(bio, req) {
			struct bio_vec *bvec;
			bio_for_each_segment(bvec, bio, i) {
				result = sock_recv_bvec(sock, bvec);
				if (result <= 0)
					HARDFAIL("Recv data failed.");
			}
		}
	}
	DEBUG("done.\n");
	return req;

	/* Can we get here? Yes, if other side returns error */
 error_out:
	req->errors++;
	return req;
}

void nbd_do_it(struct nbd_device *lo)
{
	struct request *req;

	while (1) {
		req = nbd_read_stat(lo);

		if (!req) {
			printk(KERN_ALERT "req should never be null\n");
			goto out;
		}
		BUG_ON(lo->magic != LO_MAGIC);
		nbd_end_request(req);
	}
 out:
	return;
}

void nbd_clear_que(struct nbd_device *lo)
{
	struct request *req;

	BUG_ON(lo->magic != LO_MAGIC);

	do {
		req = NULL;
		spin_lock(&lo->queue_lock);
		if (!list_empty(&lo->queue_head)) {
			req = list_entry(lo->queue_head.next, struct request, queuelist);
			list_del_init(&req->queuelist);
		}
		spin_unlock(&lo->queue_lock);
		if (req) {
			req->errors++;
			nbd_end_request(req);
		}
	} while (req);
}

/*
 * We always wait for the result of a write, for now.  It would be nice to
 * make that optional in the future, e.g.:
 * if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
 *	{ printk("Warning: Ignoring result!\n"); nbd_end_request(req); }
 */

#undef FAIL
#define FAIL( s ) { printk( KERN_ERR "%s: " s "\n", req->rq_disk->disk_name ); goto error_out; }
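
/*
 * Locking here: the block layer calls do_nbd_request() with q->queue_lock
 * held and interrupts off.  Each request is dequeued and q->queue_lock
 * dropped before the potentially slow network send; lo->queue_lock then
 * guards only the per-device list of in-flight requests, and q->queue_lock
 * is re-taken before fetching the next request.
 */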
static void do_nbd_request(request_queue_t * q)
{
	struct request *req;

	while ((req = elv_next_request(q)) != NULL) {
		struct nbd_device *lo;

		if (!(req->flags & REQ_CMD))
			goto error_out;

		lo = req->rq_disk->private_data;
		if (!lo->file)
			FAIL("Request when not-ready.");
		nbd_cmd(req) = NBD_CMD_READ;
		if (rq_data_dir(req) == WRITE) {
			nbd_cmd(req) = NBD_CMD_WRITE;
			if (lo->flags & NBD_READ_ONLY)
				FAIL("Write on read-only");
		}
		BUG_ON(lo->magic != LO_MAGIC);
#ifdef PARANOIA
		requests_in++;
#endif

		req->errors = 0;
		blkdev_dequeue_request(req);
		spin_unlock_irq(q->queue_lock);

		spin_lock(&lo->queue_lock);

		if (!lo->file) {
			spin_unlock(&lo->queue_lock);
			printk(KERN_ERR "nbd: failed between accept and semaphore, file lost\n");
			req->errors++;
			nbd_end_request(req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

		list_add(&req->queuelist, &lo->queue_head);
		spin_unlock(&lo->queue_lock);

		nbd_send_req(lo, req);

		if (req->errors) {
			printk(KERN_ERR "nbd: nbd_send_req failed\n");
			spin_lock(&lo->queue_lock);
			list_del_init(&req->queuelist);
			spin_unlock(&lo->queue_lock);
			nbd_end_request(req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

		spin_lock_irq(q->queue_lock);
		continue;

 error_out:
		req->errors++;
		blkdev_dequeue_request(req);
		spin_unlock(q->queue_lock);
		nbd_end_request(req);
		spin_lock(q->queue_lock);
	}
	return;
}

static int nbd_ioctl(struct inode *inode, struct file *file,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
	int error, temp;
	struct request sreq;

	/* Anyone capable of this syscall can do *real bad* things */

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case NBD_DISCONNECT:
		printk(KERN_INFO "NBD_DISCONNECT\n");
		sreq.flags = REQ_SPECIAL;
		nbd_cmd(&sreq) = NBD_CMD_DISC;
		if (!lo->sock)
			return -EINVAL;
		nbd_send_req(lo, &sreq);
		return 0;

	case NBD_CLEAR_SOCK:
		nbd_clear_que(lo);
		spin_lock(&lo->queue_lock);
		if (!list_empty(&lo->queue_head)) {
			spin_unlock(&lo->queue_lock);
			printk(KERN_ERR "nbd: Some requests are in progress -> can not turn off.\n");
			return -EBUSY;
		}
		file = lo->file;
		if (!file) {
			spin_unlock(&lo->queue_lock);
			return -EINVAL;
		}
		lo->file = NULL;
		lo->sock = NULL;
		spin_unlock(&lo->queue_lock);
		fput(file);
		return 0;
	case NBD_SET_SOCK:
		if (lo->file)
			return -EBUSY;
		error = -EINVAL;
		file = fget(arg);
		if (file) {
			inode = file->f_dentry->d_inode;
			if (inode->i_sock) {
				lo->file = file;
				lo->sock = SOCKET_I(inode);
				error = 0;
			} else {
				fput(file);
			}
		}
		return error;
	case NBD_SET_BLKSIZE:
		if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE))
			return -EINVAL;
		lo->blksize = arg;
		temp = arg >> 9;
		lo->blksize_bits = 9;
		while (temp > 1) {
			lo->blksize_bits++;
			temp >>= 1;
		}
		lo->bytesize &= ~(lo->blksize-1);
		set_capacity(lo->disk, lo->bytesize >> 9);
		return 0;
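		/*
		 * Example: NBD_SET_BLKSIZE with arg = 4096 yields
		 * blksize_bits = 12, so a later NBD_SET_SIZE_BLOCKS with
		 * arg = n sets bytesize = n << 12, i.e. n 4K blocks.
		 */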
	case NBD_SET_SIZE:
		lo->bytesize = arg & ~(lo->blksize-1);
		set_capacity(lo->disk, lo->bytesize >> 9);
		return 0;
	case NBD_SET_SIZE_BLOCKS:
		lo->bytesize = ((u64) arg) << lo->blksize_bits;
		set_capacity(lo->disk, lo->bytesize >> 9);
		return 0;
	case NBD_DO_IT:
		if (!lo->file)
			return -EINVAL;
		nbd_do_it(lo);
		/* on return tidy up in case we have a signal */
		/* Forcibly shutdown the socket causing all listeners
		 * to error
		 *
		 * FIXME: This code is duplicated from sys_shutdown, but
		 * there should be a more generic interface rather than
		 * calling socket ops directly here */
		down(&lo->tx_lock);
		printk(KERN_WARNING "nbd: shutting down socket\n");
		lo->sock->ops->shutdown(lo->sock, SEND_SHUTDOWN|RCV_SHUTDOWN);
		lo->sock = NULL;
		up(&lo->tx_lock);
		spin_lock(&lo->queue_lock);
		file = lo->file;
		lo->file = NULL;
		spin_unlock(&lo->queue_lock);
		nbd_clear_que(lo);
		printk(KERN_WARNING "nbd: queue cleared\n");
		if (file)
			fput(file);
		return lo->harderror;
	case NBD_CLEAR_QUE:
		nbd_clear_que(lo);
		return 0;
#ifdef PARANOIA
	case NBD_PRINT_DEBUG:
		printk(KERN_INFO "%s: next = %p, prev = %p. Global: in %d, out %d\n",
		       inode->i_bdev->bd_disk->disk_name, lo->queue_head.next,
		       lo->queue_head.prev, requests_in, requests_out);
		return 0;
#endif
	}
	return -EINVAL;
}

static int nbd_release(struct inode *inode, struct file *file)
{
	struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
	if (lo->refcnt <= 0)
		printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
	lo->refcnt--;
	/* N.B. Doesn't lo->file need an fput?? */
	return 0;
}

static struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.open =		nbd_open,
	.release =	nbd_release,
	.ioctl =	nbd_ioctl,
};

/*
 * And here should be modules and kernel interface
 * (Just smiley confuses emacs :-)
 */

static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;

	if (sizeof(struct nbd_request) != 28) {
		printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n");
		return -EIO;
	}

	for (i = 0; i < MAX_NBD; i++) {
		struct gendisk *disk = alloc_disk(1);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = kmalloc(sizeof(struct request_queue), GFP_KERNEL);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		blk_init_queue(disk->queue, do_nbd_request, &nbd_lock);
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}
#ifdef MODULE
	printk("nbd: registered device at major %d\n", NBD_MAJOR);
#endif
	devfs_mk_dir("nbd");
	for (i = 0; i < MAX_NBD; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].refcnt = 0;
		nbd_dev[i].file = NULL;
		nbd_dev[i].magic = LO_MAGIC;
		nbd_dev[i].flags = 0;
		spin_lock_init(&nbd_dev[i].queue_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		init_MUTEX(&nbd_dev[i].tx_lock);
		nbd_dev[i].blksize = 1024;
		nbd_dev[i].blksize_bits = 10;
		nbd_dev[i].bytesize = ((u64)0x7ffffc00) << 10;	/* 2TB */
		disk->major = NBD_MAJOR;
		disk->first_minor = i;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		sprintf(disk->devfs_name, "nbd/%d", i);
		set_capacity(disk, 0x3ffffe);
		add_disk(disk);
	}

	return 0;
out:
	while (i--) {
		kfree(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	return err;
}

static void __exit nbd_cleanup(void)
{
	int i;
	for (i = 0; i < MAX_NBD; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		if (disk) {
			if (disk->queue) {
				blk_cleanup_queue(disk->queue);
				kfree(disk->queue);
				disk->queue = NULL;
			}
			del_gendisk(disk);
			put_disk(disk);
		}
	}
	devfs_remove("nbd");
	unregister_blkdev(NBD_MAJOR, "nbd");
#ifdef MODULE
	printk("nbd: unregistered device at major %d\n", NBD_MAJOR);
#endif
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");