2 * 2007+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
16 #include <linux/module.h>
17 #include <linux/kernel.h>
18 #include <linux/blkdev.h>
19 #include <linux/bio.h>
20 #include <linux/buffer_head.h>
21 #include <linux/connector.h>
22 #include <linux/dst.h>
23 #include <linux/device.h>
24 #include <linux/jhash.h>
25 #include <linux/idr.h>
26 #include <linux/init.h>
27 #include <linux/namei.h>
28 #include <linux/slab.h>
29 #include <linux/socket.h>
32 #include <linux/in6.h>
38 static DEFINE_MUTEX(dst_hash_lock
);
39 static struct list_head
*dst_hashtable
;
40 static unsigned int dst_hashtable_size
= 128;
41 module_param(dst_hashtable_size
, uint
, 0644);
43 static char dst_name
[] = "Dementianting goldfish";
45 static DEFINE_IDR(dst_index_idr
);
46 static struct cb_id cn_dst_id
= { CN_DST_IDX
, CN_DST_VAL
};
49 * DST sysfs tree for device called 'storage':
51 * /sys/bus/dst/devices/storage/
52 * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025
53 * /sys/bus/dst/devices/storage/size : 800
54 * /sys/bus/dst/devices/storage/name : storage
57 static int dst_dev_match(struct device
*dev
, struct device_driver
*drv
)
62 static struct bus_type dst_dev_bus_type
= {
64 .match
= &dst_dev_match
,
67 static void dst_node_release(struct device
*dev
)
69 struct dst_info
*info
= container_of(dev
, struct dst_info
, device
);
74 static struct device dst_node_dev
= {
75 .bus
= &dst_dev_bus_type
,
76 .release
= &dst_node_release
80 * Setting size of the node after it was changed.
82 static void dst_node_set_size(struct dst_node
*n
)
84 struct block_device
*bdev
;
86 set_capacity(n
->disk
, n
->size
>> 9);
88 bdev
= bdget_disk(n
->disk
, 0);
90 mutex_lock(&bdev
->bd_inode
->i_mutex
);
91 i_size_write(bdev
->bd_inode
, n
->size
);
92 mutex_unlock(&bdev
->bd_inode
->i_mutex
);
98 * Distributed storage request processing function.
100 static int dst_request(struct request_queue
*q
, struct bio
*bio
)
102 struct dst_node
*n
= q
->queuedata
;
105 if (bio_empty_barrier(bio
) && !q
->prepare_discard_fn
) {
107 * This is a dirty^Wnice hack, but if we complete this
108 * operation with -EOPNOTSUPP like intended, XFS
109 * will get stuck and freeze the machine. This may not be
110 * a particularly XFS-specific problem though, but it is the
111 * only FS which sends empty barrier at umount time
114 * Empty barriers are not allowed anyway, see 51fd77bd9f512
115 * for example, although later it was changed to
116 * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not
126 return dst_process_bio(n
, bio
);
134 * Open/close callbacks for appropriate block device.
136 static int dst_bdev_open(struct block_device
*bdev
, fmode_t mode
)
138 struct dst_node
*n
= bdev
->bd_disk
->private_data
;
144 static int dst_bdev_release(struct gendisk
*disk
, fmode_t mode
)
146 struct dst_node
*n
= disk
->private_data
;
152 static struct block_device_operations dst_blk_ops
= {
153 .open
= dst_bdev_open
,
154 .release
= dst_bdev_release
,
155 .owner
= THIS_MODULE
,
159 * Block layer binding - disk is created when array is fully configured
160 * by userspace request.
162 static int dst_node_create_disk(struct dst_node
*n
)
167 n
->queue
= blk_init_queue(NULL
, NULL
);
171 n
->queue
->queuedata
= n
;
172 blk_queue_make_request(n
->queue
, dst_request
);
173 blk_queue_max_phys_segments(n
->queue
, n
->max_pages
);
174 blk_queue_max_hw_segments(n
->queue
, n
->max_pages
);
177 n
->disk
= alloc_disk(1);
179 goto err_out_free_queue
;
181 if (!(n
->state
->permissions
& DST_PERM_WRITE
)) {
182 printk(KERN_INFO
"DST node %s attached read-only.\n", n
->name
);
183 set_disk_ro(n
->disk
, 1);
186 if (!idr_pre_get(&dst_index_idr
, GFP_KERNEL
))
189 mutex_lock(&dst_hash_lock
);
190 err
= idr_get_new(&dst_index_idr
, NULL
, &index
);
191 mutex_unlock(&dst_hash_lock
);
195 n
->disk
->major
= dst_major
;
196 n
->disk
->first_minor
= index
;
197 n
->disk
->fops
= &dst_blk_ops
;
198 n
->disk
->queue
= n
->queue
;
199 n
->disk
->private_data
= n
;
200 snprintf(n
->disk
->disk_name
, sizeof(n
->disk
->disk_name
), "dst-%s", n
->name
);
207 blk_cleanup_queue(n
->queue
);
213 * Sysfs machinery: show device's size.
215 static ssize_t
dst_show_size(struct device
*dev
,
216 struct device_attribute
*attr
, char *buf
)
218 struct dst_info
*info
= container_of(dev
, struct dst_info
, device
);
220 return sprintf(buf
, "%llu\n", info
->size
);
224 * Show local exported device.
226 static ssize_t
dst_show_local(struct device
*dev
,
227 struct device_attribute
*attr
, char *buf
)
229 struct dst_info
*info
= container_of(dev
, struct dst_info
, device
);
231 return sprintf(buf
, "%s\n", info
->local
);
235 * Shows type of the remote node - device major/minor number
236 * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes.
238 static ssize_t
dst_show_type(struct device
*dev
,
239 struct device_attribute
*attr
, char *buf
)
241 struct dst_info
*info
= container_of(dev
, struct dst_info
, device
);
242 int family
= info
->net
.addr
.sa_family
;
244 if (family
== AF_INET
) {
245 struct sockaddr_in
*sin
= (struct sockaddr_in
*)&info
->net
.addr
;
246 return sprintf(buf
, "%u.%u.%u.%u:%d\n",
247 NIPQUAD(sin
->sin_addr
.s_addr
), ntohs(sin
->sin_port
));
248 } else if (family
== AF_INET6
) {
249 struct sockaddr_in6
*sin
= (struct sockaddr_in6
*)&info
->net
.addr
;
252 &sin
->sin6_addr
, ntohs(sin
->sin6_port
));
254 int i
, sz
= PAGE_SIZE
- 2; /* 0 symbol and '\n' below */
255 int size
, addrlen
= info
->net
.addr
.sa_data_len
;
256 unsigned char *a
= (unsigned char *)&info
->net
.addr
.sa_data
;
257 char *buf_orig
= buf
;
259 size
= snprintf(buf
, sz
, "family: %d, addrlen: %u, addr: ",
264 for (i
=0; i
<addrlen
; ++i
) {
268 size
= snprintf(buf
, sz
, "%02x ", a
[i
]);
272 buf
+= sprintf(buf
, "\n");
274 return buf
- buf_orig
;
279 static struct device_attribute dst_node_attrs
[] = {
280 __ATTR(size
, 0444, dst_show_size
, NULL
),
281 __ATTR(type
, 0444, dst_show_type
, NULL
),
282 __ATTR(local
, 0444, dst_show_local
, NULL
),
285 static int dst_create_node_attributes(struct dst_node
*n
)
289 for (i
=0; i
<ARRAY_SIZE(dst_node_attrs
); ++i
) {
290 err
= device_create_file(&n
->info
->device
,
293 goto err_out_remove_all
;
299 device_remove_file(&n
->info
->device
,
305 static void dst_remove_node_attributes(struct dst_node
*n
)
309 for (i
=0; i
<ARRAY_SIZE(dst_node_attrs
); ++i
)
310 device_remove_file(&n
->info
->device
,
315 * Sysfs cleanup and initialization.
316 * Shows a number of useful parameters.
318 static void dst_node_sysfs_exit(struct dst_node
*n
)
321 dst_remove_node_attributes(n
);
322 device_unregister(&n
->info
->device
);
327 static int dst_node_sysfs_init(struct dst_node
*n
)
331 n
->info
= kzalloc(sizeof(struct dst_info
), GFP_KERNEL
);
335 memcpy(&n
->info
->device
, &dst_node_dev
, sizeof(struct device
));
336 n
->info
->size
= n
->size
;
338 dev_set_name(&n
->info
->device
, "dst-%s", n
->name
);
339 err
= device_register(&n
->info
->device
);
341 dprintk(KERN_ERR
"Failed to register node '%s', err: %d.\n",
346 dst_create_node_attributes(n
);
357 * DST node hash tables machinery.
359 static inline unsigned int dst_hash(char *str
, unsigned int size
)
361 return (jhash(str
, size
, 0) % dst_hashtable_size
);
364 static void dst_node_remove(struct dst_node
*n
)
366 mutex_lock(&dst_hash_lock
);
367 list_del_init(&n
->node_entry
);
368 mutex_unlock(&dst_hash_lock
);
371 static void dst_node_add(struct dst_node
*n
)
373 unsigned hash
= dst_hash(n
->name
, sizeof(n
->name
));
375 mutex_lock(&dst_hash_lock
);
376 list_add_tail(&n
->node_entry
, &dst_hashtable
[hash
]);
377 mutex_unlock(&dst_hash_lock
);
381 * Cleaning node when it is about to be freed.
382 * There are still users of the socket though,
383 * so connection cleanup should be protected.
385 static void dst_node_cleanup(struct dst_node
*n
)
387 struct dst_state
*st
= n
->state
;
393 blk_cleanup_queue(n
->queue
);
395 mutex_lock(&dst_hash_lock
);
396 idr_remove(&dst_index_idr
, n
->disk
->first_minor
);
397 mutex_unlock(&dst_hash_lock
);
403 sync_blockdev(n
->bdev
);
404 blkdev_put(n
->bdev
, FMODE_READ
|FMODE_WRITE
);
409 dst_state_exit_connected(st
);
410 dst_state_unlock(st
);
412 wake_up(&st
->thread_wait
);
419 * Free security attributes attached to given node.
421 static void dst_security_exit(struct dst_node
*n
)
423 struct dst_secure
*s
, *tmp
;
425 list_for_each_entry_safe(s
, tmp
, &n
->security_list
, sec_entry
) {
426 list_del(&s
->sec_entry
);
432 * Free node when there are no more users.
433 * Actually the node has to be freed on behalf of a userspace process,
434 * since there are a number of threads which are embedded in the
435 * node, so they cannot exit and free the node from there; that is
436 * why there is a wakeup if the reference counter is not equal to zero.
438 void dst_node_put(struct dst_node
*n
)
443 dprintk("%s: n: %p, refcnt: %d.\n",
444 __func__
, n
, atomic_read(&n
->refcnt
));
446 if (atomic_dec_and_test(&n
->refcnt
)) {
448 n
->trans_scan_timeout
= 0;
450 thread_pool_destroy(n
->pool
);
451 dst_node_sysfs_exit(n
);
452 dst_node_crypto_exit(n
);
453 dst_security_exit(n
);
454 dst_node_trans_exit(n
);
458 dprintk("%s: freed n: %p.\n", __func__
, n
);
465 * This function finds devices major/minor numbers for given pathname.
467 static int dst_lookup_device(const char *path
, dev_t
*dev
)
473 err
= path_lookup(path
, LOOKUP_FOLLOW
, &nd
);
477 inode
= nd
.path
.dentry
->d_inode
;
483 if (!S_ISBLK(inode
->i_mode
)) {
488 *dev
= inode
->i_rdev
;
496 * Setting up export device: lookup by the name, get its size
497 * and setup listening socket, which will accept clients, which
498 * will submit IO for given storage.
500 static int dst_setup_export(struct dst_node
*n
, struct dst_ctl
*ctl
,
501 struct dst_export_ctl
*le
)
504 dev_t dev
= 0; /* gcc likes to scream here */
506 snprintf(n
->info
->local
, sizeof(n
->info
->local
), "%s", le
->device
);
508 err
= dst_lookup_device(le
->device
, &dev
);
512 n
->bdev
= open_by_devnum(dev
, FMODE_READ
|FMODE_WRITE
);
517 n
->size
= min_t(loff_t
, n
->bdev
->bd_inode
->i_size
, n
->size
);
519 n
->size
= n
->bdev
->bd_inode
->i_size
;
521 n
->info
->size
= n
->size
;
522 err
= dst_node_init_listened(n
, le
);
524 goto err_out_cleanup
;
529 blkdev_put(n
->bdev
, FMODE_READ
|FMODE_WRITE
);
535 /* Empty thread pool callbacks for the network processing threads. */
536 static inline void *dst_thread_network_init(void *data
)
538 dprintk("%s: data: %p.\n", __func__
, data
);
542 static inline void dst_thread_network_cleanup(void *data
)
544 dprintk("%s: data: %p.\n", __func__
, data
);
548 * Allocate DST node and initialize some of its parameters.
550 static struct dst_node
*dst_alloc_node(struct dst_ctl
*ctl
,
551 int (*start
)(struct dst_node
*),
557 n
= kzalloc(sizeof(struct dst_node
), GFP_KERNEL
);
561 INIT_LIST_HEAD(&n
->node_entry
);
563 INIT_LIST_HEAD(&n
->security_list
);
564 mutex_init(&n
->security_lock
);
566 init_waitqueue_head(&n
->wait
);
568 n
->trans_scan_timeout
= msecs_to_jiffies(ctl
->trans_scan_timeout
);
569 if (!n
->trans_scan_timeout
)
570 n
->trans_scan_timeout
= HZ
;
572 n
->trans_max_retries
= ctl
->trans_max_retries
;
573 if (!n
->trans_max_retries
)
574 n
->trans_max_retries
= 10;
577 * Pretty much arbitrary default numbers.
578 * 32 matches maximum number of pages in bio originated from ext3 (31).
580 n
->max_pages
= ctl
->max_pages
;
584 if (n
->max_pages
> 1024)
590 atomic_set(&n
->refcnt
, 1);
591 atomic_long_set(&n
->gen
, 0);
592 snprintf(n
->name
, sizeof(n
->name
), "%s", ctl
->name
);
594 err
= dst_node_sysfs_init(n
);
598 n
->pool
= thread_pool_create(num
, n
->name
, dst_thread_network_init
,
599 dst_thread_network_cleanup
, n
);
600 if (IS_ERR(n
->pool
)) {
601 err
= PTR_ERR(n
->pool
);
602 goto err_out_sysfs_exit
;
605 dprintk("%s: n: %p, name: %s.\n", __func__
, n
, n
->name
);
610 dst_node_sysfs_exit(n
);
617 * Starting a node, connected to the remote server:
618 * register block device and initialize transaction mechanism.
619 * In reverse order though.
621 * It will autonegotiate some parameters with the remote node
622 * and update local if needed.
624 * Transaction initialization should be the last thing before
625 * starting the node, since transaction should include not only
626 * block IO, but also crypto related data (if any), which are
627 * initialized separately.
629 static int dst_start_remote(struct dst_node
*n
)
633 err
= dst_node_trans_init(n
, sizeof(struct dst_trans
));
637 err
= dst_node_create_disk(n
);
641 dst_node_set_size(n
);
644 dprintk("DST: started remote node '%s', minor: %d.\n", n
->name
, n
->disk
->first_minor
);
650 * Adding remote node and initialize connection.
652 static int dst_add_remote(struct dst_node
*n
, struct dst_ctl
*ctl
,
653 void *data
, unsigned int size
)
656 struct dst_network_ctl
*rctl
= data
;
661 if (size
!= sizeof(struct dst_network_ctl
))
664 n
= dst_alloc_node(ctl
, dst_start_remote
, 1);
668 memcpy(&n
->info
->net
, rctl
, sizeof(struct dst_network_ctl
));
669 err
= dst_node_init_connected(n
, rctl
);
683 * Adding export node: initializing block device and listening socket.
685 static int dst_add_export(struct dst_node
*n
, struct dst_ctl
*ctl
,
686 void *data
, unsigned int size
)
689 struct dst_export_ctl
*le
= data
;
694 if (size
!= sizeof(struct dst_export_ctl
))
697 n
= dst_alloc_node(ctl
, dst_start_export
, 2);
701 err
= dst_setup_export(n
, ctl
, le
);
714 static int dst_node_remove_unload(struct dst_node
*n
)
716 printk(KERN_INFO
"STOPPED name: '%s', size: %llu.\n",
720 del_gendisk(n
->disk
);
723 dst_node_sysfs_exit(n
);
726 * This is not a hack. Really.
727 * Node's reference counter allows to implement fine grained
728 * node freeing, but since all transactions (which hold node's
729 * reference counter) are processed in the dedicated thread,
730 * it is possible that reference will hit zero in that thread,
731 * so we will not be able to exit thread and cleanup the node.
733 * So, we remove disk, so no new activity is possible, and
734 * wait until all pending transactions are completed (either
735 * in receiving thread or by timeout in workqueue), in this
736 * case reference counter will be less or equal to 2 (once set in
737 * dst_alloc_node() and then in connector message parser;
738 * or when we force module unloading, and connector message
739 * parser does not hold a reference, in this case reference
740 * counter will be equal to 1),
741 * and subsequent dst_node_put() calls will free the node.
743 dprintk("%s: going to sleep with %d refcnt.\n", __func__
, atomic_read(&n
->refcnt
));
744 wait_event(n
->wait
, atomic_read(&n
->refcnt
) <= 2);
751 * Remove node from the hash table.
753 static int dst_del_node(struct dst_node
*n
, struct dst_ctl
*ctl
,
754 void *data
, unsigned int size
)
759 return dst_node_remove_unload(n
);
763 * Initialize crypto processing for given node.
765 static int dst_crypto_init(struct dst_node
*n
, struct dst_ctl
*ctl
,
766 void *data
, unsigned int size
)
768 struct dst_crypto_ctl
*crypto
= data
;
773 if (size
!= sizeof(struct dst_crypto_ctl
) + crypto
->hash_keysize
+
774 crypto
->cipher_keysize
)
780 return dst_node_crypto_init(n
, crypto
);
784 * Security attributes for given node.
786 static int dst_security_init(struct dst_node
*n
, struct dst_ctl
*ctl
,
787 void *data
, unsigned int size
)
789 struct dst_secure
*s
;
794 if (size
!= sizeof(struct dst_secure_user
))
797 s
= kmalloc(sizeof(struct dst_secure
), GFP_KERNEL
);
801 memcpy(&s
->sec
, data
, size
);
803 mutex_lock(&n
->security_lock
);
804 list_add_tail(&s
->sec_entry
, &n
->security_list
);
805 mutex_unlock(&n
->security_lock
);
813 static int dst_start_node(struct dst_node
*n
, struct dst_ctl
*ctl
,
814 void *data
, unsigned int size
)
828 printk(KERN_INFO
"STARTED name: '%s', size: %llu.\n", n
->name
, n
->size
);
832 typedef int (*dst_command_func
)(struct dst_node
*n
, struct dst_ctl
*ctl
,
833 void *data
, unsigned int size
);
836 * List of userspace commands.
838 static dst_command_func dst_commands
[] = {
839 [DST_ADD_REMOTE
] = &dst_add_remote
,
840 [DST_ADD_EXPORT
] = &dst_add_export
,
841 [DST_DEL_NODE
] = &dst_del_node
,
842 [DST_CRYPTO
] = &dst_crypto_init
,
843 [DST_SECURITY
] = &dst_security_init
,
844 [DST_START
] = &dst_start_node
,
848 * Configuration parser.
850 static void cn_dst_callback(struct cn_msg
*msg
)
854 struct dst_ctl_ack ack
;
855 struct dst_node
*n
= NULL
, *tmp
;
858 if (msg
->len
< sizeof(struct dst_ctl
)) {
863 ctl
= (struct dst_ctl
*)msg
->data
;
865 if (ctl
->cmd
>= DST_CMD_MAX
) {
869 hash
= dst_hash(ctl
->name
, sizeof(ctl
->name
));
871 mutex_lock(&dst_hash_lock
);
872 list_for_each_entry(tmp
, &dst_hashtable
[hash
], node_entry
) {
873 if (!memcmp(tmp
->name
, ctl
->name
, sizeof(tmp
->name
))) {
879 mutex_unlock(&dst_hash_lock
);
881 err
= dst_commands
[ctl
->cmd
](n
, ctl
, msg
->data
+ sizeof(struct dst_ctl
),
882 msg
->len
- sizeof(struct dst_ctl
));
886 memcpy(&ack
.msg
, msg
, sizeof(struct cn_msg
));
888 ack
.msg
.ack
= msg
->ack
+ 1;
889 ack
.msg
.len
= sizeof(struct dst_ctl_ack
) - sizeof(struct cn_msg
);
893 cn_netlink_send(&ack
.msg
, 0, GFP_KERNEL
);
897 * Global initialization: sysfs, hash table, block device registration,
898 * connector and various caches.
900 static int __init
dst_sysfs_init(void)
902 return bus_register(&dst_dev_bus_type
);
905 static void dst_sysfs_exit(void)
907 bus_unregister(&dst_dev_bus_type
);
910 static int __init
dst_hashtable_init(void)
914 dst_hashtable
= kcalloc(dst_hashtable_size
, sizeof(struct list_head
),
919 for (i
=0; i
<dst_hashtable_size
; ++i
)
920 INIT_LIST_HEAD(&dst_hashtable
[i
]);
925 static void dst_hashtable_exit(void)
928 struct dst_node
*n
, *tmp
;
930 for (i
=0; i
<dst_hashtable_size
; ++i
) {
931 list_for_each_entry_safe(n
, tmp
, &dst_hashtable
[i
], node_entry
) {
932 dst_node_remove_unload(n
);
936 kfree(dst_hashtable
);
939 static int __init
dst_sys_init(void)
943 err
= dst_hashtable_init();
947 err
= dst_export_init();
949 goto err_out_hashtable_exit
;
951 err
= register_blkdev(dst_major
, DST_NAME
);
953 goto err_out_export_exit
;
957 err
= dst_sysfs_init();
959 goto err_out_unregister
;
961 err
= cn_add_callback(&cn_dst_id
, "DST", cn_dst_callback
);
963 goto err_out_sysfs_exit
;
965 printk(KERN_INFO
"Distributed storage, '%s' release.\n", dst_name
);
972 unregister_blkdev(dst_major
, DST_NAME
);
975 err_out_hashtable_exit
:
976 dst_hashtable_exit();
981 static void __exit
dst_sys_exit(void)
983 cn_del_callback(&cn_dst_id
);
984 unregister_blkdev(dst_major
, DST_NAME
);
985 dst_hashtable_exit();
990 module_init(dst_sys_init
);
991 module_exit(dst_sys_exit
);
993 MODULE_DESCRIPTION("Distributed storage");
994 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
995 MODULE_LICENSE("GPL");