/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/poll.h>
#include <net/sock.h>

#include "rds.h"
#include "rdma.h"
#include "rdma_transport.h"
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
/*
 * This is called as the final descriptor referencing this socket is closed.
 * We have to unbind the socket so that another socket can be bound to the
 * address it was using.
 *
 * We have to be careful about racing with the incoming path.  sock_orphan()
 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
 * messages shouldn't be queued.
 */
static int rds_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs;
	unsigned long flags;

	if (sk == NULL)
		goto out;

	rs = rds_sk_to_rs(sk);

	sock_orphan(sk);
	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
	 * that ensures the recv path has completed messing
	 * with the socket. */
	rds_clear_recv_queue(rs);
	rds_cong_remove_socket(rs);
	rds_remove_bound(rs);
	rds_send_drop_to(rs, NULL);
	rds_rdma_drop_keys(rs);
	rds_notify_queue_get(rs, NULL);

	spin_lock_irqsave(&rds_sock_lock, flags);
	list_del_init(&rs->rs_item);
	rds_sock_count--;
	spin_unlock_irqrestore(&rds_sock_lock, flags);

	sock->sk = NULL;
	sock_put(sk);
out:
	return 0;
}
/*
 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 * this seems more conservative.
 * NB - normally, one would use sk_callback_lock for this, but we can
 * get here from interrupts, whereas the network code grabs sk_callback_lock
 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 */
void rds_wake_sk_sleep(struct rds_sock *rs)
{
	unsigned long flags;

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
		       int *uaddr_len, int peer)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);

	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));

	/* racey, don't care */
	if (peer) {
		if (!rs->rs_conn_addr)
			return -ENOTCONN;

		sin->sin_port = rs->rs_conn_port;
		sin->sin_addr.s_addr = rs->rs_conn_addr;
	} else {
		sin->sin_port = rs->rs_bound_port;
		sin->sin_addr.s_addr = rs->rs_bound_addr;
	}

	sin->sin_family = AF_INET;

	*uaddr_len = sizeof(*sin);
	return 0;
}
/*
 * RDS' poll is without a doubt the least intuitive part of the interface,
 * as POLLIN and POLLOUT do not behave entirely as you would expect from
 * a network protocol.
 *
 * POLLIN is asserted if
 *  -	there is data on the receive queue.
 *  -	to signal that a previously congested destination may have become
 *	uncongested.
 *  -	A notification has been queued to the socket (this can be a congestion
 *	update, or a RDMA completion).
 *
 * POLLOUT is asserted if there is room on the send queue. This does not mean
 * however, that the next sendmsg() call will succeed. If the application tries
 * to send to a congested destination, the system call may still fail (and
 * return ENOBUFS).
 */
static unsigned int rds_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	unsigned int mask = 0;
	unsigned long flags;

	poll_wait(file, sk->sk_sleep, wait);
	poll_wait(file, &rds_poll_waitq, wait);

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!rs->rs_cong_monitor) {
		/* When a congestion map was updated, we signal POLLIN for
		 * "historical" reasons. Applications can also poll for
		 * WRBAND instead. */
		if (rds_cong_updated_since(&rs->rs_cong_track))
			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
	} else {
		spin_lock(&rs->rs_lock);
		if (rs->rs_cong_notify)
			mask |= (POLLIN | POLLRDNORM);
		spin_unlock(&rs->rs_lock);
	}
	if (!list_empty(&rs->rs_recv_queue)
	 || !list_empty(&rs->rs_notify_queue))
		mask |= (POLLIN | POLLRDNORM);
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
		mask |= (POLLOUT | POLLWRNORM);
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);

	return mask;
}
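/*
 * A minimal userspace sketch of the poll semantics described above
 * (illustrative only, not part of this kernel file; it assumes an
 * already bound and connected RDS socket descriptor "fd"):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN) {
 *			// data, a notification, or a congestion update is
 *			// available; recvmsg() will tell which.
 *		}
 *		if (pfd.revents & POLLOUT) {
 *			// room on the send queue; sendmsg() may still
 *			// return ENOBUFS for a congested destination.
 *		}
 *	}
 */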
static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -ENOIOCTLCMD;
}
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
			      int len)
{
	struct sockaddr_in sin;
	int ret = 0;

	/* racing with another thread binding seems ok here */
	if (rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (len < sizeof(struct sockaddr_in)) {
		ret = -EINVAL;
		goto out;
	}

	if (copy_from_user(&sin, optval, sizeof(sin))) {
		ret = -EFAULT;
		goto out;
	}

	rds_send_drop_to(rs, &sin);
out:
	return ret;
}
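/*
 * Illustrative userspace sketch (not part of this kernel file): cancelling
 * pending sends to one destination through this socket option. Assumes
 * <linux/rds.h> for SOL_RDS and RDS_CANCEL_SENT_TO, a bound RDS socket
 * "fd", and hypothetical dest_addr/dest_port values:
 *
 *	struct sockaddr_in dest = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(dest_port),
 *		.sin_addr   = { .s_addr = dest_addr },
 *	};
 *
 *	if (setsockopt(fd, SOL_RDS, RDS_CANCEL_SENT_TO,
 *		       &dest, sizeof(dest)) < 0)
 *		perror("RDS_CANCEL_SENT_TO");
 */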
static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
			       int optlen)
{
	int value;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(value, (int __user *) optval))
		return -EFAULT;
	*optvar = !!value;
	return 0;
}
static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
			    int optlen)
{
	int ret;

	ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
	if (ret == 0) {
		if (rs->rs_cong_monitor) {
			rds_cong_add_socket(rs);
		} else {
			rds_cong_remove_socket(rs);
			rs->rs_cong_mask = 0;
			rs->rs_cong_notify = 0;
		}
	}
	return ret;
}
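/*
 * Illustrative userspace sketch (not part of this kernel file): turning on
 * congestion monitoring so congestion updates arrive as notifications
 * rather than through the "historical" POLLIN signalling in rds_poll().
 * Assumes <linux/rds.h> and an RDS socket "fd":
 *
 *	int on = 1;
 *
 *	if (setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &on, sizeof(on)) < 0)
 *		perror("RDS_CONG_MONITOR");
 */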
static int rds_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret;

	if (level != SOL_RDS) {
		ret = -ENOPROTOOPT;
		goto out;
	}

	switch (optname) {
	case RDS_CANCEL_SENT_TO:
		ret = rds_cancel_sent_to(rs, optval, optlen);
		break;
	case RDS_GET_MR:
		ret = rds_get_mr(rs, optval, optlen);
		break;
	case RDS_FREE_MR:
		ret = rds_free_mr(rs, optval, optlen);
		break;
	case RDS_RECVERR:
		ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
		break;
	case RDS_CONG_MONITOR:
		ret = rds_cong_monitor(rs, optval, optlen);
		break;
	default:
		ret = -ENOPROTOOPT;
	}
out:
	return ret;
}
static int rds_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret = -ENOPROTOOPT, len;

	if (level != SOL_RDS)
		goto out;

	if (get_user(len, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	switch (optname) {
	case RDS_INFO_FIRST ... RDS_INFO_LAST:
		ret = rds_info_getsockopt(sock, optname, optval,
					  optlen);
		break;

	case RDS_RECVERR:
		if (len < sizeof(int))
			ret = -EINVAL;
		else if (put_user(rs->rs_recverr, (int __user *) optval)
		      || put_user(sizeof(int), optlen))
			ret = -EFAULT;
		else
			ret = 0;
		break;
	default:
		break;
	}

out:
	return ret;
}
static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
		       int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	int ret = 0;

	lock_sock(sk);

	if (addr_len != sizeof(struct sockaddr_in)) {
		ret = -EINVAL;
		goto out;
	}

	if (sin->sin_family != AF_INET) {
		ret = -EAFNOSUPPORT;
		goto out;
	}

	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
		ret = -EDESTADDRREQ;
		goto out;
	}

	rs->rs_conn_addr = sin->sin_addr.s_addr;
	rs->rs_conn_port = sin->sin_port;

out:
	release_sock(sk);
	return ret;
}
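/*
 * Illustrative userspace sketch (not part of this kernel file): connect()
 * on an RDS socket only records the default destination, as the function
 * above shows; transport connections are set up lazily when sending.
 * Assumes a bound RDS socket "fd" and hypothetical dest_addr/dest_port:
 *
 *	struct sockaddr_in dest = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(dest_port),
 *		.sin_addr   = { .s_addr = dest_addr },
 *	};
 *
 *	if (connect(fd, (struct sockaddr *)&dest, sizeof(dest)) < 0)
 *		perror("connect");
 */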
static struct proto rds_proto = {
	.name	  = "RDS",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct rds_sock),
};

static struct proto_ops rds_proto_ops = {
	.family =	AF_RDS,
	.owner =	THIS_MODULE,
	.release =	rds_release,
	.bind =		rds_bind,
	.connect =	rds_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	rds_getname,
	.poll =		rds_poll,
	.ioctl =	rds_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	rds_setsockopt,
	.getsockopt =	rds_getsockopt,
	.sendmsg =	rds_sendmsg,
	.recvmsg =	rds_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
	unsigned long flags;
	struct rds_sock *rs;

	sock_init_data(sock, sk);
	sock->ops		= &rds_proto_ops;
	sk->sk_protocol		= protocol;

	rs = rds_sk_to_rs(sk);
	spin_lock_init(&rs->rs_lock);
	rwlock_init(&rs->rs_recv_lock);
	INIT_LIST_HEAD(&rs->rs_send_queue);
	INIT_LIST_HEAD(&rs->rs_recv_queue);
	INIT_LIST_HEAD(&rs->rs_notify_queue);
	INIT_LIST_HEAD(&rs->rs_cong_list);
	spin_lock_init(&rs->rs_rdma_lock);
	rs->rs_rdma_keys = RB_ROOT;

	spin_lock_irqsave(&rds_sock_lock, flags);
	list_add_tail(&rs->rs_item, &rds_sock_list);
	rds_sock_count++;
	spin_unlock_irqrestore(&rds_sock_lock, flags);

	return 0;
}
static int rds_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;

	if (sock->type != SOCK_SEQPACKET || protocol)
		return -ESOCKTNOSUPPORT;

	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
	if (sk == NULL)
		return -ENOMEM;

	return __rds_create(sock, sk, protocol);
}
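/*
 * Illustrative userspace sketch (not part of this kernel file): RDS
 * sockets are created as PF_RDS / SOCK_SEQPACKET with protocol 0 and then
 * bound to a local IP address and port before use. Assumes hypothetical
 * local_addr/local_port values:
 *
 *	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
 *	struct sockaddr_in local = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(local_port),
 *		.sin_addr   = { .s_addr = local_addr },
 *	};
 *
 *	if (fd < 0 || bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
 *		perror("rds socket/bind");
 */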
void rds_sock_addref(struct rds_sock *rs)
{
	sock_hold(rds_rs_to_sk(rs));
}

void rds_sock_put(struct rds_sock *rs)
{
	sock_put(rds_rs_to_sk(rs));
}

static struct net_proto_family rds_family_ops = {
	.family =	AF_RDS,
	.create =	rds_create,
	.owner	=	THIS_MODULE,
};
static void rds_sock_inc_info(struct socket *sock, unsigned int len,
			      struct rds_info_iterator *iter,
			      struct rds_info_lengths *lens)
{
	struct rds_sock *rs;
	struct sock *sk;
	struct rds_incoming *inc;
	unsigned long flags;
	unsigned int total = 0;

	len /= sizeof(struct rds_info_message);

	spin_lock_irqsave(&rds_sock_lock, flags);

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		sk = rds_rs_to_sk(rs);
		read_lock(&rs->rs_recv_lock);

		/* XXX too lazy to maintain counts.. */
		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
			total++;
			if (total <= len)
				rds_inc_info_copy(inc, iter, inc->i_saddr,
						  rs->rs_bound_addr, 1);
		}

		read_unlock(&rs->rs_recv_lock);
	}

	spin_unlock_irqrestore(&rds_sock_lock, flags);

	lens->nr = total;
	lens->each = sizeof(struct rds_info_message);
}
static void rds_sock_info(struct socket *sock, unsigned int len,
			  struct rds_info_iterator *iter,
			  struct rds_info_lengths *lens)
{
	struct rds_info_socket sinfo;
	struct rds_sock *rs;
	unsigned long flags;

	len /= sizeof(struct rds_info_socket);

	spin_lock_irqsave(&rds_sock_lock, flags);

	if (len < rds_sock_count)
		goto out;

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		sinfo.sndbuf = rds_sk_sndbuf(rs);
		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
		sinfo.bound_addr = rs->rs_bound_addr;
		sinfo.connected_addr = rs->rs_conn_addr;
		sinfo.bound_port = rs->rs_bound_port;
		sinfo.connected_port = rs->rs_conn_port;
		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));

		rds_info_copy(iter, &sinfo, sizeof(sinfo));
	}

out:
	lens->nr = rds_sock_count;
	lens->each = sizeof(struct rds_info_socket);

	spin_unlock_irqrestore(&rds_sock_lock, flags);
}
static void __exit rds_exit(void)
{
	rds_rdma_exit();
	sock_unregister(rds_family_ops.family);
	proto_unregister(&rds_proto);
	rds_conn_exit();
	rds_sysctl_exit();
	rds_threads_exit();
	rds_stats_exit();
	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
}
module_exit(rds_exit);
static int __init rds_init(void)
{
	int ret;

	ret = rds_conn_init();
	if (ret)
		goto out;
	ret = rds_threads_init();
	if (ret)
		goto out_conn;
	ret = rds_sysctl_init();
	if (ret)
		goto out_threads;
	ret = rds_stats_init();
	if (ret)
		goto out_sysctl;
	ret = proto_register(&rds_proto, 1);
	if (ret)
		goto out_stats;
	ret = sock_register(&rds_family_ops);
	if (ret)
		goto out_proto;

	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);

	/* ib/iwarp transports currently compiled-in */
	ret = rds_rdma_init();
	if (ret)
		goto out_sock;
	goto out;

out_sock:
	sock_unregister(rds_family_ops.family);
out_proto:
	proto_unregister(&rds_proto);
out_stats:
	rds_stats_exit();
out_sysctl:
	rds_sysctl_exit();
out_threads:
	rds_threads_exit();
out_conn:
	rds_conn_exit();
out:
	return ret;
}
module_init(rds_init);
#define DRV_VERSION     "4.0"
#define DRV_RELDATE     "Feb 12, 2009"

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
		   " v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);