/*
 * Copyright (c) 2006-2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_syslink.c,v 1.16 2008/10/26 04:29:19 sephe Exp $
 */
/*
 * This module implements the core syslink() system call and provides
 * glue for kernel syslink frontends and backends, creating an intra-host
 * communications infrastructure and DMA transport abstraction.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/endian.h>
#include <sys/malloc.h>
#include <sys/alist.h>

#include <sys/objcache.h>
#include <sys/queue.h>
#include <sys/thread.h>

#include <sys/sysctl.h>
#include <sys/sysproto.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/sysref.h>
#include <sys/syslink.h>
#include <sys/syslink_msg.h>
#include <netinet/in.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <sys/mplock2.h>

#include "opt_syslink.h"
/*
 * Syslink Connection abstraction
 */
	struct slmsg_rb_tree reply_rb_root;	/* replies to requests */
	struct sldesc	*peer;			/* peer syslink, if any */
	struct file	*xfp;			/* external file pointer */
	struct slcommon	*common;
	int	rwaiters;			/* number of threads waiting */
	int	wblocked;			/* blocked waiting for us to drain */
	size_t	cmdbytes;			/* unreplied commands pending */
	size_t	repbytes;			/* undrained replies pending */
	int	(*backend_wblocked)(struct sldesc *, int, sl_proto_t);
	int	(*backend_write)(struct sldesc *, struct slmsg *);
	void	(*backend_reply)(struct sldesc *, struct slmsg *, struct slmsg *);
	void	(*backend_dispose)(struct sldesc *, struct slmsg *);

#define SLF_RSHUTDOWN	0x0001
#define SLF_WSHUTDOWN	0x0002
static int syslink_cmd_new(struct syslink_info_new *info, int *result);
static struct sldesc *allocsldesc(struct slcommon *common);
static void setsldescfp(struct sldesc *sl, struct file *fp);
static void shutdownsldesc(struct sldesc *sl, int how);
static void shutdownsldesc2(struct sldesc *sl, int how);
static void sldrop(struct sldesc *sl);
static int syslink_validate_msg(struct syslink_msg *msg, int bytes);
static int syslink_validate_elm(struct syslink_elm *elm, sl_reclen_t bytes,
				int swapit, int depth);

static int sl_local_mmap(struct slmsg *slmsg, char *base, size_t len);
static void sl_local_munmap(struct slmsg *slmsg);

static int backend_wblocked_user(struct sldesc *sl, int nbio, sl_proto_t proto);
static int backend_write_user(struct sldesc *sl, struct slmsg *slmsg);
static void backend_reply_user(struct sldesc *sl, struct slmsg *slcmd,
				struct slmsg *slrep);
static void backend_dispose_user(struct sldesc *sl, struct slmsg *slmsg);

static int backend_wblocked_kern(struct sldesc *sl, int nbio, sl_proto_t proto);
static int backend_write_kern(struct sldesc *sl, struct slmsg *slmsg);
static void backend_reply_kern(struct sldesc *sl, struct slmsg *slcmd,
				struct slmsg *slrep);
static void backend_dispose_kern(struct sldesc *sl, struct slmsg *slmsg);
static void slmsg_put(struct slmsg *slmsg);
/*
 * Objcache memory backend
 *
 * All three object caches return slmsg structures but each is optimized
 * for syslink message buffers of varying sizes.  We use the slightly
 * more complex ctor/dtor API in order to provide ready-to-go slmsg's.
 */
static struct objcache *sl_objcache_big;
static struct objcache *sl_objcache_small;
static struct objcache *sl_objcache_none;

MALLOC_DEFINE(M_SYSLINK, "syslink", "syslink manager");
static boolean_t slmsg_ctor(void *data, void *private, int ocflags);
static void slmsg_dtor(void *data, void *private);

syslinkinit(void *dummy __unused)

	size_t n = sizeof(struct slmsg);

	sl_objcache_none = objcache_create_mbacked(M_SYSLINK, n, NULL, 64,
						   slmsg_ctor, slmsg_dtor,
	sl_objcache_small = objcache_create_mbacked(M_SYSLINK, n, NULL, 64,
						   slmsg_ctor, slmsg_dtor,
	sl_objcache_big = objcache_create_mbacked(M_SYSLINK, n, NULL, 16,
						   slmsg_ctor, slmsg_dtor,
slmsg_ctor(void *data, void *private, int ocflags)

	struct slmsg *slmsg = data;

	bzero(slmsg, sizeof(*slmsg));

	slmsg->oc = *(struct objcache **)private;
	if (slmsg->oc == sl_objcache_none) {
	} else if (slmsg->oc == sl_objcache_small) {
		slmsg->maxsize = SLMSG_SMALL;
	} else if (slmsg->oc == sl_objcache_big) {
		slmsg->maxsize = SLMSG_BIG;
		panic("slmsg_ctor: bad objcache?\n");
	if (slmsg->maxsize) {
		slmsg->msg = kmalloc(slmsg->maxsize,
				     M_SYSLINK, M_WAITOK|M_ZERO);
	xio_init(&slmsg->xio);

slmsg_dtor(void *data, void *private)

	struct slmsg *slmsg = data;

	if (slmsg->maxsize && slmsg->msg) {
		kfree(slmsg->msg, M_SYSLINK);

SYSINIT(syslink, SI_BOOT2_MACHDEP, SI_ORDER_ANY, syslinkinit, NULL)
static int rb_slmsg_compare(struct slmsg *msg1, struct slmsg *msg2);
RB_GENERATE2(slmsg_rb_tree, slmsg, rbnode, rb_slmsg_compare,
	     sysid_t, msg->sm_msgid);
static int syslink_enabled;
SYSCTL_NODE(_kern, OID_AUTO, syslink, CTLFLAG_RW, 0, "Syslink operation");
SYSCTL_INT(_kern_syslink, OID_AUTO, enabled,
	    CTLFLAG_RW, &syslink_enabled, 0, "Enable SYSLINK");
static size_t syslink_bufsize = 65536;
SYSCTL_UINT(_kern_syslink, OID_AUTO, bufsize,
	    CTLFLAG_RW, &syslink_bufsize, 0, "Maximum buffer size");
/*
 * Fileops API - typically used to glue a userland frontend with a
 * kernel backend.
 */
static int slfileop_read(struct file *fp, struct uio *uio,
			struct ucred *cred, int flags);
static int slfileop_write(struct file *fp, struct uio *uio,
			struct ucred *cred, int flags);
static int slfileop_close(struct file *fp);
static int slfileop_stat(struct file *fp, struct stat *sb, struct ucred *cred);
static int slfileop_shutdown(struct file *fp, int how);
static int slfileop_ioctl(struct file *fp, u_long cmd, caddr_t data,
			struct ucred *cred, struct sysmsg *msg);
static int slfileop_poll(struct file *fp, int events, struct ucred *cred);
static int slfileop_kqfilter(struct file *fp, struct knote *kn);

static struct fileops syslinkops = {
	.fo_read =	slfileop_read,
	.fo_write =	slfileop_write,
	.fo_ioctl =	slfileop_ioctl,
	.fo_poll =	slfileop_poll,
	.fo_kqfilter =	slfileop_kqfilter,
	.fo_stat =	slfileop_stat,
	.fo_close =	slfileop_close,
	.fo_shutdown =	slfileop_shutdown
/************************************************************************
 *			PRIMARY SYSTEM CALL INTERFACE			*
 ************************************************************************
 *
 * syslink(int cmd, struct syslink_info *info, size_t bytes)
 */
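/*
 * Hedged usage sketch (not part of the original source): a userland
 * frontend would typically create a linked descriptor pair with
 * SYSLINK_CMD_NEW, roughly as shown below.  How the two new descriptors
 * are reported back (syscall return value vs. fields written back into
 * the info structure when head.wbflag is set) is defined by
 * <sys/syslink.h>; this snippet is an assumption that only illustrates
 * the calling convention visible in sys_syslink() below.
 *
 *	union syslink_info_all info;
 *	int error;
 *
 *	bzero(&info, sizeof(info));
 *	error = syslink(SYSLINK_CMD_NEW, (struct syslink_info *)&info,
 *			sizeof(info.cmd_new));
 *	if (error < 0)
 *		err(1, "syslink");
 */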
sys_syslink(struct syslink_args *uap)

	union syslink_info_all info;

	/*
	 * System call is under construction and disabled by default.
	 * Superuser access is also required for now, but eventually
	 * will not be needed.
	 */
	if (syslink_enabled == 0)
	error = priv_check(curthread, PRIV_ROOT);
	/*
	 * Load and validate the info structure.  Unloaded bytes are zeroed
	 * out.  The label field must always be 0-filled, even if not used
	 */
	bzero(&info, sizeof(info));
	if ((unsigned)uap->bytes <= sizeof(info)) {
		error = copyin(uap->info, &info, uap->bytes);

	/*
	 * Process the command
	 */
	case SYSLINK_CMD_NEW:
		error = syslink_cmd_new(&info.cmd_new, &uap->sysmsg_result);

	if (error == 0 && info.head.wbflag)
		copyout(&info, uap->info, uap->bytes);
/*
 * Create a linked pair of descriptors, like a pipe.
 */
syslink_cmd_new(struct syslink_info_new *info, int *result)

	struct thread *td = curthread;
	struct filedesc *fdp = td->td_proc->p_fd;
	struct sldesc *slpeer;

	error = falloc(td->td_lwp, &fp1, &fd1);
	error = falloc(td->td_lwp, &fp2, &fd2);
		fsetfd(fdp, NULL, fd1);
	slpeer = allocsldesc(NULL);
	slpeer->backend_wblocked = backend_wblocked_user;
	slpeer->backend_write = backend_write_user;
	slpeer->backend_reply = backend_reply_user;
	slpeer->backend_dispose = backend_dispose_user;
	sl = allocsldesc(slpeer->common);
	sl->backend_wblocked = backend_wblocked_user;
	sl->backend_write = backend_write_user;
	sl->backend_reply = backend_reply_user;
	sl->backend_dispose = backend_dispose_user;
	setsldescfp(sl, fp1);
	setsldescfp(slpeer, fp2);
	fsetfd(fdp, fp1, fd1);
	fsetfd(fdp, fp2, fd2);
	info->head.wbflag = 1;	/* write back */
/************************************************************************
 *			LOW LEVEL SLDESC SUPPORT			*
 ************************************************************************
 */
allocsldesc(struct slcommon *common)

	sl = kmalloc(sizeof(struct sldesc), M_SYSLINK, M_WAITOK|M_ZERO);
		common = kmalloc(sizeof(*common), M_SYSLINK, M_WAITOK|M_ZERO);
	TAILQ_INIT(&sl->inq);		/* incoming requests */
	RB_INIT(&sl->reply_rb_root);	/* match incoming replies */
	spin_init(&sl->spin);

setsldescfp(struct sldesc *sl, struct file *fp)

	fp->f_type = DTYPE_SYSLINK;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &syslinkops;
/*
 * Red-black tree compare function
 */
rb_slmsg_compare(struct slmsg *msg1, struct slmsg *msg2)

	if (msg1->msg->sm_msgid < msg2->msg->sm_msgid)
	if (msg1->msg->sm_msgid == msg2->msg->sm_msgid)
shutdownsldesc(struct sldesc *sl, int how)

	shutdownsldesc2(sl, how);

	/*
	 * Return unread and unreplied messages
	 */
	spin_lock_wr(&sl->spin);
	while ((slmsg = TAILQ_FIRST(&sl->inq)) != NULL) {
		TAILQ_REMOVE(&sl->inq, slmsg, tqnode);
		spin_unlock_wr(&sl->spin);
		if (slmsg->msg->sm_proto & SM_PROTO_REPLY) {
			sl->repbytes -= slmsg->maxsize;
			slmsg->flags &= ~SLMSGF_ONINQ;
			sl->peer->backend_dispose(sl->peer, slmsg);
		/* leave ONINQ set for commands, it will be cleared below */
		spin_lock_wr(&sl->spin);
	while ((slmsg = RB_ROOT(&sl->reply_rb_root)) != NULL) {
		RB_REMOVE(slmsg_rb_tree, &sl->reply_rb_root, slmsg);
		sl->cmdbytes -= slmsg->maxsize;
		spin_unlock_wr(&sl->spin);
		slmsg->flags &= ~SLMSGF_ONINQ;
		sl->peer->backend_reply(sl->peer, slmsg, NULL);
		spin_lock_wr(&sl->spin);
	spin_unlock_wr(&sl->spin);

	/*
	 * Call shutdown on the peer with the opposite flags
	 */
	shutdownsldesc2(sl->peer, rhow);

shutdownsldesc2(struct sldesc *sl, int how)

	spin_lock_wr(&sl->spin);
		sl->flags |= SLF_RSHUTDOWN;
		sl->flags |= SLF_WSHUTDOWN;
		sl->flags |= SLF_RSHUTDOWN | SLF_WSHUTDOWN;
	spin_unlock_wr(&sl->spin);

	/*
	 * Handle signaling on the user side
	 */
		wakeup(&sl->rwaiters);
		sl->wblocked = 0;	/* race ok */
		wakeup(&sl->wblocked);
sldrop(struct sldesc *sl)

	struct sldesc *slpeer;

	spin_lock_wr(&sl->common->spin);
	if (--sl->common->refs == 0) {
		spin_unlock_wr(&sl->common->spin);
		if ((slpeer = sl->peer) != NULL) {
			slpeer->common = NULL;
			KKASSERT(slpeer->xfp == NULL);
			KKASSERT(TAILQ_EMPTY(&slpeer->inq));
			KKASSERT(RB_EMPTY(&slpeer->reply_rb_root));
			kfree(slpeer, M_SYSLINK);
		KKASSERT(sl->xfp == NULL);
		KKASSERT(TAILQ_EMPTY(&sl->inq));
		KKASSERT(RB_EMPTY(&sl->reply_rb_root));
		kfree(sl->common, M_SYSLINK);
		kfree(sl, M_SYSLINK);
		spin_unlock_wr(&sl->common->spin);

slmsg_put(struct slmsg *slmsg)

	if (slmsg->flags & SLMSGF_HASXIO) {
		slmsg->flags &= ~SLMSGF_HASXIO;
		xio_release(&slmsg->xio);
	slmsg->flags &= ~SLMSGF_LINMAP;
	objcache_put(slmsg->oc, slmsg);
/************************************************************************
 ************************************************************************
 * Implement userland fileops.
 */
slfileop_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)

	struct sldesc *sl = fp->f_data;		/* fp refed on call */
	struct syslink_msg *wmsg;

	/*
	 * Kinda messy.  Figure out the non-blocking state
	 */
	if (flags & O_FBLOCKING)
	else if (flags & O_FNONBLOCKING)
	else if (fp->f_flag & O_NONBLOCK)

	/*
	 * iov0 - message buffer
	 * iov1 - DMA buffer or backup buffer
	 */
	if (uio->uio_iovcnt < 1) {
	iov0 = &uio->uio_iov[0];
	if (uio->uio_iovcnt > 2) {

	/*
	 * Get a message, blocking if necessary.
	 */
	spin_lock_wr(&sl->spin);
	while ((slmsg = TAILQ_FIRST(&sl->inq)) == NULL) {
		if (sl->flags & SLF_RSHUTDOWN) {
		error = ssleep(&sl->rwaiters, &sl->spin, PCATCH, "slrmsg", 0);

	/*
	 * We have a message and still hold the spinlock.  Make sure the
	 * uio has enough room to hold the message.
	 *
	 * Note that replies do not have XIOs.
	 */
	if (slmsg->msgsize > iov0->iov_len) {
	if (slmsg->xio.xio_bytes) {
		if (uio->uio_iovcnt != 2) {
		iov1 = &uio->uio_iov[1];
		if (slmsg->xio.xio_bytes > iov1->iov_len) {

	/*
	 * Dequeue the message.  Adjust repbytes immediately.  cmdbytes
	 * are adjusted when the command is replied to, not here.
	 */
	TAILQ_REMOVE(&sl->inq, slmsg, tqnode);
	if (slmsg->msg->sm_proto & SM_PROTO_REPLY)
		sl->repbytes -= slmsg->maxsize;
	spin_unlock_wr(&sl->spin);

	/*
	 * Load the message data into the user buffer.
	 *
	 * If receiving a command an XIO may exist specifying a DMA buffer.
	 * For commands, if DMAW is set we have to copy or map the buffer
	 * so the caller can access the data being written.  If DMAR is set
	 * we do not have to copy but we still must map the buffer so the
	 * caller can directly fill in the data being requested.
	 */
	error = uiomove((void *)slmsg->msg, slmsg->msgsize, uio);
	if (error == 0 && slmsg->xio.xio_bytes &&
	    (wmsg->sm_head.se_cmd & SE_CMDF_REPLY) == 0) {
		if (wmsg->sm_head.se_cmd & SE_CMDF_DMAW) {
			/*
			 * Data being passed to caller or being passed in both
			 * directions, copy or map.
			 */
			if ((flags & O_MAPONREAD) &&
			    (slmsg->xio.xio_flags & XIOF_VMLINEAR)) {
				error = sl_local_mmap(slmsg,
				error = xio_copy_xtou(&slmsg->xio, 0,
						      slmsg->xio.xio_bytes);
				error = xio_copy_xtou(&slmsg->xio, 0,
						      slmsg->xio.xio_bytes);
		} else if (wmsg->sm_head.se_cmd & SE_CMDF_DMAR) {
			/*
			 * Data will be passed back to originator, map
			 * the buffer if we can, else use the backup
			 * buffer at the same VA supplied by the caller.
			 */
			if ((flags & O_MAPONREAD) &&
			    (slmsg->xio.xio_flags & XIOF_VMLINEAR)) {
				error = sl_local_mmap(slmsg,
				error = 0; /* ignore errors */

	/*
	 * Requeue the message if we could not read it successfully
	 */
		spin_lock_wr(&sl->spin);
		TAILQ_INSERT_HEAD(&sl->inq, slmsg, tqnode);
		slmsg->flags |= SLMSGF_ONINQ;
		spin_unlock_wr(&sl->spin);
	} else if (slmsg->msg->sm_proto & SM_PROTO_REPLY) {
		/*
		 * Dispose of any received reply after we've copied it
		 * to userland.  We don't need the slmsg any more.
		 */
		slmsg->flags &= ~SLMSGF_ONINQ;
		sl->peer->backend_dispose(sl->peer, slmsg);
		if (sl->wblocked && sl->repbytes < syslink_bufsize) {
			sl->wblocked = 0;	/* MP race ok here */
			wakeup(&sl->wblocked);
		/*
		 * Leave the command in the RB tree but clear ONINQ now
		 * that we have returned it to userland so userland can
		 */
		slmsg->flags &= ~SLMSGF_ONINQ;
	spin_unlock_wr(&sl->spin);
/*
 * Userland writes syslink message (optionally with DMA buffer in iov[1]).
 */
slfileop_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)

	struct sldesc *sl = fp->f_data;
	struct syslink_msg sltmp;
	struct syslink_msg *wmsg;	/* wire message */

	/*
	 * Kinda messy.  Figure out the non-blocking state
	 */
	if (flags & O_FBLOCKING)
	else if (flags & O_FNONBLOCKING)
	else if (fp->f_flag & O_NONBLOCK)

	if (uio->uio_iovcnt < 1) {
	iov0 = &uio->uio_iov[0];
	if (iov0->iov_len > SLMSG_BIG) {
	if (uio->uio_iovcnt > 2) {
	if (uio->uio_iovcnt > 1) {
		iov1 = &uio->uio_iov[1];
		if (iov1->iov_len > XIO_INTERNAL_SIZE) {
		if ((intptr_t)iov1->iov_base & PAGE_MASK) {
	/*
	 * Handle the buffer-full case.  slpeer cmdbytes is managed
	 * by the backend function, not us, so if the callback just
	 * directly implements the message and never adjusts cmdbytes,
	 * we will never sleep here.
	 */
	if (sl->flags & SLF_WSHUTDOWN) {

	/*
	 * Only commands can block the pipe, not replies.  Otherwise a
	 * deadlock is possible.
	 */
	error = copyin(iov0->iov_base, &sltmp, sizeof(sltmp));
	if ((proto = sltmp.sm_proto) & SM_PROTO_ENDIAN_REV)
		proto = bswap16(proto);
	error = sl->peer->backend_wblocked(sl->peer, nbio, proto);
	/*
	 * Allocate a slmsg and load the message.  Note that the bytes
	 * returned to userland reflect only the primary syslink message
	 * and do not include any DMA buffers.
	 */
	if (iov0->iov_len <= SLMSG_SMALL)
		slmsg = objcache_get(sl_objcache_small, M_WAITOK);
		slmsg = objcache_get(sl_objcache_big, M_WAITOK);
	slmsg->msgsize = iov0->iov_len;
	error = uiomove((void *)wmsg, iov0->iov_len, uio);
	error = syslink_validate_msg(wmsg, slmsg->msgsize);

	if ((wmsg->sm_head.se_cmd & SE_CMDF_REPLY) == 0) {
		/*
		 * Install the XIO for commands if any DMA flags are set.
		 *
		 * XIOF_VMLINEAR requires that the XIO represent a
		 * contiguous set of pages associated with a single VM
		 * object (so the reader side can mmap it easily).
		 *
		 * XIOF_VMLINEAR might not be set when the kernel sends
		 * commands to userland so the reader side backs off to
		 * a backup buffer if it isn't set, but we require it
		 * for userland writes.
		 */
		xflags = XIOF_VMLINEAR;
		if (wmsg->sm_head.se_cmd & SE_CMDF_DMAR)
			xflags |= XIOF_READ | XIOF_WRITE;
		else if (wmsg->sm_head.se_cmd & SE_CMDF_DMAW)
		if (xflags && iov1) {
			error = xio_init_ubuf(&slmsg->xio, iov1->iov_base,
					      iov1->iov_len, xflags);
			slmsg->flags |= SLMSGF_HASXIO;
		error = sl->peer->backend_write(sl->peer, slmsg);
		/*
		 * Replies have to be matched up against received commands.
		 */
		spin_lock_wr(&sl->spin);
		slcmd = slmsg_rb_tree_RB_LOOKUP(&sl->reply_rb_root,
						slmsg->msg->sm_msgid);
		if (slcmd == NULL || (slcmd->flags & SLMSGF_ONINQ)) {
			spin_unlock_wr(&sl->spin);
		RB_REMOVE(slmsg_rb_tree, &sl->reply_rb_root, slcmd);
		sl->cmdbytes -= slcmd->maxsize;
		spin_unlock_wr(&sl->spin);

		/*
		 * If the original command specified DMAR, has an xio, and
		 * our write specifies a DMA buffer, then we can do a
		 * copyback.  But if we are linearly mapped and the caller
		 * is using the map base address, then the caller filled in
		 * the data via the direct memory map and no copyback is
		 */
		if ((slcmd->msg->sm_head.se_cmd & SE_CMDF_DMAR) && iov1 &&
		    (slcmd->flags & SLMSGF_HASXIO) &&
		    ((slcmd->flags & SLMSGF_LINMAP) == 0 ||
		     iov1->iov_base != slcmd->vmbase)
			if (iov1->iov_len > slcmd->xio.xio_bytes)
				count = slcmd->xio.xio_bytes;
				count = iov1->iov_len;
			error = xio_copy_utox(&slcmd->xio, 0, iov1->iov_base,

		/*
		 * If we had mapped a DMA buffer, remove it
		 */
		if (slcmd->flags & SLMSGF_LINMAP) {
			sl_local_munmap(slcmd);

		/*
		 * Reply and handle unblocking
		 */
		sl->peer->backend_reply(sl->peer, slcmd, slmsg);
		if (sl->wblocked && sl->cmdbytes < syslink_bufsize) {
			sl->wblocked = 0;	/* MP race ok here */
			wakeup(&sl->wblocked);

		/*
		 * slmsg has already been dealt with, make sure error is
		 * 0 so we do not double-free it.
		 */
/*
 * Close a syslink descriptor.
 *
 * Disassociate the syslink from the file descriptor and disconnect from
 */
slfileop_close(struct file *fp)

	/*
	 * Disassociate the file pointer.  Take ownership of the ref on the
	 */
	fp->f_ops = &badfileops;

	/*
	 * Shutdown both directions.  The other side will not issue API
	 * calls to us after we've shutdown both directions.
	 */
	shutdownsldesc(sl, SHUT_RDWR);

	KKASSERT(sl->cmdbytes == 0);
	KKASSERT(sl->repbytes == 0);

slfileop_stat(struct file *fp, struct stat *sb, struct ucred *cred)

slfileop_shutdown(struct file *fp, int how)

	shutdownsldesc((struct sldesc *)fp->f_data, how);

slfileop_ioctl(struct file *fp, u_long cmd, caddr_t data,
	       struct ucred *cred, struct sysmsg *msg)

slfileop_poll(struct file *fp, int events, struct ucred *cred)

slfileop_kqfilter(struct file *fp, struct knote *kn)
/************************************************************************
 *			LOCAL MEMORY MAPPING				*
 ************************************************************************
 *
 * This feature is currently not implemented
 */
sl_local_mmap(struct slmsg *slmsg, char *base, size_t len)

	return (EOPNOTSUPP);

sl_local_munmap(struct slmsg *slmsg)

sl_local_mmap(struct slmsg *slmsg, char *base, size_t len)

	struct vmspace *vms = curproc->p_vmspace;
	vm_offset_t addr = (vm_offset_t)base;

	/* XXX check user address range */
	error = vm_map_replace(
			(vm_offset_t)base, (vm_offset_t)base + len,
			slmsg->xio.xio_pages[0]->object,
			slmsg->xio.xio_pages[0]->pindex << PAGE_SHIFT,
			VM_PROT_READ|VM_PROT_WRITE,
			VM_PROT_READ|VM_PROT_WRITE,
			MAP_DISABLE_SYNCER);
	slmsg->flags |= SLMSGF_LINMAP;
	slmsg->vmbase = base;
	slmsg->vmsize = len;

sl_local_munmap(struct slmsg *slmsg)

	if (slmsg->flags & SLMSGF_LINMAP) {
		vm_map_remove(&curproc->p_vmspace->vm_map,
			      slmsg->vmbase + slmsg->vmsize);
		slmsg->flags &= ~SLMSGF_LINMAP;
/************************************************************************
 *			MESSAGE VALIDATION				*
 ************************************************************************
 *
 * Validate the syslink message.  Check that all headers and elements
 * conform.  Correct the endian if necessary.
 *
 * NOTE: If reverse endian needs to be corrected, SE_CMDF_UNTRANSLATED
 * is recursively flipped on all syslink_elm's in the message.  As the
 * message traverses the mesh, multiple flips may occur.  It is
 * up to the RPC protocol layer to correct opaque data payloads and
 * SE_CMDF_UNTRANSLATED prevents the protocol layer from misinterpreting
 * a command or reply element which has not been endian-corrected.
 */
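/*
 * Illustrative sketch (not part of the original source): each time a
 * message crosses an endian boundary the validator byte-swaps the
 * element header and toggles SE_CMDF_UNTRANSLATED, roughly:
 *
 *	elm->se_cmd = bswap16(elm->se_cmd) ^ SE_CMDF_UNTRANSLATED;
 *
 * so after an even number of swaps the flag is clear and after an odd
 * number it is set.  A consumer that sees SE_CMDF_UNTRANSLATED knows the
 * element's opaque payload is still in the sender's byte order and must
 * swap any multi-byte payload fields itself before interpreting them.
 */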
syslink_validate_msg(struct syslink_msg *msg, int bytes)

		/*
		 * The raw message must be properly-aligned.
		 */
		if (bytes & SL_ALIGNMASK)

		/*
		 * The message must at least contain the msgid, bytes, and
		 */
		if (bytes < SL_MIN_PAD_SIZE)

		/*
		 * Fix the endian if it is reversed.
		 */
		if (msg->sm_proto & SM_PROTO_ENDIAN_REV) {
			msg->sm_msgid = bswap64(msg->sm_msgid);
			msg->sm_sessid = bswap64(msg->sm_sessid);
			msg->sm_bytes = bswap16(msg->sm_bytes);
			msg->sm_proto = bswap16(msg->sm_proto);
			msg->sm_rlabel = bswap32(msg->sm_rlabel);
			if (msg->sm_proto & SM_PROTO_ENDIAN_REV)

		/*
		 * Validate the contents.  For PADs, the entire payload is
		 * ignored and the minimum message size can be as small as
		 */
		if (msg->sm_proto == SMPROTO_PAD) {
			if (msg->sm_bytes < SL_MIN_PAD_SIZE ||
			    msg->sm_bytes > bytes) {
			/* ignore the entire payload, it can be garbage */
			if (msg->sm_bytes < SL_MIN_MSG_SIZE ||
			    msg->sm_bytes > bytes) {
			error = syslink_validate_elm(
					offsetof(struct syslink_msg,
					swapit, SL_MAXDEPTH);

		/*
		 * The aligned payload size must be used to locate the
		 * next syslink_msg in the buffer.
		 */
		aligned_reclen = SL_MSG_ALIGN(msg->sm_bytes);
		bytes -= aligned_reclen;
		msg = (void *)((char *)msg + aligned_reclen);
syslink_validate_elm(struct syslink_elm *elm, sl_reclen_t bytes,
		     int swapit, int depth)

	/*
	 * If the buffer isn't big enough to fit the header, stop now!
	 */
	if (bytes < SL_MIN_ELM_SIZE)

	/*
	 * All syslink_elm headers are recursively endian-adjusted.  Opaque
	 * data payloads are not.
	 */
		elm->se_cmd = bswap16(elm->se_cmd) ^ SE_CMDF_UNTRANSLATED;
		elm->se_bytes = bswap16(elm->se_bytes);
		elm->se_aux = bswap32(elm->se_aux);

	/*
	 * Check element size requirements.
	 */
	if (elm->se_bytes < SL_MIN_ELM_SIZE || elm->se_bytes > bytes)

	/*
	 * Recursively check structured payloads.  A structured payload may
	 * contain as few as 0 recursive elements.
	 */
	if (elm->se_cmd & SE_CMDF_STRUCTURED) {
		bytes -= SL_MIN_ELM_SIZE;
			if (syslink_validate_elm(elm, bytes, swapit, depth - 1))
			aligned_reclen = SL_MSG_ALIGN(elm->se_bytes);
			elm = (void *)((char *)elm + aligned_reclen);
			bytes -= aligned_reclen;
/************************************************************************
 *		BACKEND FUNCTIONS - USER DESCRIPTOR			*
 ************************************************************************
 *
 * Peer backend links are primarily used when userland creates a pair
 * of linked descriptors.
 */

/*
 * Do any required blocking / nbio handling for attempts to write to
 * a sldesc associated with a user descriptor.
 */
backend_wblocked_user(struct sldesc *sl, int nbio, sl_proto_t proto)
	size_t *bytesp = (proto & SM_PROTO_REPLY) ? &sl->repbytes : &sl->cmdbytes;
	/*
	 * Block until sufficient data is drained by the target.  It is
	 * ok to have a MP race against cmdbytes.
	 */
	if (*bytesp >= syslink_bufsize) {
		spin_lock_wr(&sl->spin);
		while (*bytesp >= syslink_bufsize) {
			if (sl->flags & SLF_WSHUTDOWN) {
			error = ssleep(&sl->wblocked, &sl->spin,
				       PCATCH, "slwmsg", 0);
		spin_unlock_wr(&sl->spin);
/*
 * Unconditionally write a syslink message to the sldesc associated with
 * a user descriptor.  Command messages are also placed in a red-black
 * tree so their DMA tag (if any) can be accessed and so they can be
 * linked to any reply message.
 */
backend_write_user(struct sldesc *sl, struct slmsg *slmsg)

	spin_lock_wr(&sl->spin);
	if (sl->flags & SLF_RSHUTDOWN) {
		/*
		 * Not accepting new messages
		 */
	} else if (slmsg->msg->sm_proto & SM_PROTO_REPLY) {
		TAILQ_INSERT_TAIL(&sl->inq, slmsg, tqnode);
		sl->repbytes += slmsg->maxsize;
		slmsg->flags |= SLMSGF_ONINQ;
	} else if (RB_INSERT(slmsg_rb_tree, &sl->reply_rb_root, slmsg)) {
		/*
		 * Write a command, but there was a msgid collision when
		 * we tried to insert it into the RB tree.
		 */
		/*
		 * Write a command, successful insertion into the RB tree.
		 */
		TAILQ_INSERT_TAIL(&sl->inq, slmsg, tqnode);
		sl->cmdbytes += slmsg->maxsize;
		slmsg->flags |= SLMSGF_ONINQ;
	spin_unlock_wr(&sl->spin);
		wakeup(&sl->rwaiters);
/*
 * Our peer is returning a command we previously sent it, along
 * with the reply message (if not NULL).  We just queue the reply to
 * userland and free the command.
 */
backend_reply_user(struct sldesc *sl, struct slmsg *slcmd, struct slmsg *slrep)

	spin_lock_wr(&sl->spin);
	if ((sl->flags & SLF_RSHUTDOWN) == 0) {
		TAILQ_INSERT_TAIL(&sl->inq, slrep, tqnode);
		sl->repbytes += slrep->maxsize;
	spin_unlock_wr(&sl->spin);
		sl->peer->backend_dispose(sl->peer, slrep);
	else if (sl->rwaiters)
		wakeup(&sl->rwaiters);

backend_dispose_user(struct sldesc *sl, struct slmsg *slmsg)
/************************************************************************
 *			KERNEL DRIVER OR FILESYSTEM API			*
 ************************************************************************
 */
/*
 * Create a user<->kernel link, returning the user descriptor in *pfd
 * and the kernel descriptor in *kslp.  0 is returned on success, and an
 * error code is returned on failure.
 */
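/*
 * Hedged usage sketch (not part of the original source): a kernel
 * filesystem or driver frontend could drive the link with the functions
 * exported below.  Error handling and the construction of the syslink
 * message itself are elided; only calls visible in this file are used.
 *
 *	struct sldesc *ksl;
 *	struct slmsg *slmsg;
 *	int fd, error;
 *
 *	error = syslink_ukbackend(&fd, &ksl);	// fd is handed to userland
 *	slmsg = syslink_kallocmsg();
 *	// ... fill in slmsg->msg (syslink_msg header + elements) ...
 *	error = syslink_kdomsg(ksl, slmsg);	// blocks until userland replies
 *	syslink_kfreemsg(ksl, slmsg);
 *	syslink_kclose(ksl);
 */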
syslink_ukbackend(int *pfd, struct sldesc **kslp)

	struct thread *td = curthread;
	struct filedesc *fdp = td->td_proc->p_fd;

	error = falloc(td->td_lwp, &fp, &fd);
	usl = allocsldesc(NULL);
	usl->backend_wblocked = backend_wblocked_user;
	usl->backend_write = backend_write_user;
	usl->backend_reply = backend_reply_user;
	usl->backend_dispose = backend_dispose_user;

	ksl = allocsldesc(usl->common);
	ksl->backend_wblocked = backend_wblocked_kern;
	ksl->backend_write = backend_write_kern;
	ksl->backend_reply = backend_reply_kern;
	ksl->backend_dispose = backend_dispose_kern;

	setsldescfp(usl, fp);
	fsetfd(fdp, fp, fd);
/*
 * Assign a unique message id, issue a syslink message to userland,
 * and wait for a reply.
 */
syslink_kdomsg(struct sldesc *ksl, struct slmsg *slmsg)

	struct syslink_msg *msg;

	/*
	 * Finish initializing slmsg and post it to the red-black tree for
	 * reply matching.  If the message id is already in use we return
	 * EEXIST, giving the originator the chance to roll a new msgid.
	 */
	slmsg->msgsize = msg->sm_bytes;
	if ((error = syslink_validate_msg(msg, msg->sm_bytes)) != 0)
	msg->sm_msgid = allocsysid();

	/*
	 * Issue the request and wait for a matching reply or failure,
	 * then remove the message from the matching tree and return.
	 */
	error = ksl->peer->backend_write(ksl->peer, slmsg);
	spin_lock_wr(&ksl->spin);
		while (slmsg->rep == NULL) {
			error = ssleep(slmsg, &ksl->spin, 0, "kwtmsg", 0);
			/* XXX ignore error for now */
		if (slmsg->rep == (struct slmsg *)-1) {
			error = slmsg->rep->msg->sm_head.se_aux;
	spin_unlock_wr(&ksl->spin);
/*
 * Similar to syslink_kdomsg but return immediately instead of
 * waiting for a reply.  The kernel must supply a callback function
 * which will be made in the context of the user process replying
 */
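/*
 * Hedged usage sketch (not part of the original source): an asynchronous
 * caller supplies a completion function matching the func parameter
 * below; my_done and my_softc are hypothetical names used only for
 * illustration.
 *
 *	static void
 *	my_done(struct slmsg *slmsg, void *arg, int error)
 *	{
 *		struct my_softc *sc = arg;	// hypothetical driver state
 *
 *		// record the error, then release the message, e.g. via
 *		// syslink_kfreemsg() on the descriptor it was sent on.
 *	}
 *
 *	error = syslink_ksendmsg(ksl, slmsg, my_done, sc);
 */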
syslink_ksendmsg(struct sldesc *ksl, struct slmsg *slmsg,
		 void (*func)(struct slmsg *, void *, int), void *arg)

	struct syslink_msg *msg;

	/*
	 * Finish initializing slmsg and post it to the red-black tree for
	 * reply matching.  If the message id is already in use we return
	 * EEXIST, giving the originator the chance to roll a new msgid.
	 */
	slmsg->msgsize = msg->sm_bytes;
	slmsg->callback_func = func;
	slmsg->callback_data = arg;
	if ((error = syslink_validate_msg(msg, msg->sm_bytes)) != 0)
	msg->sm_msgid = allocsysid();
	/*
	 * Issue the request.  If no error occurred the operation will be
	 * in progress, otherwise the operation is considered to have failed
	 * and the caller can deallocate the slmsg.
	 */
	error = ksl->peer->backend_write(ksl->peer, slmsg);

syslink_kwaitmsg(struct sldesc *ksl, struct slmsg *slmsg)

	spin_lock_wr(&ksl->spin);
	while (slmsg->rep == NULL) {
		error = ssleep(slmsg, &ksl->spin, 0, "kwtmsg", 0);
		/* XXX ignore error for now */
	if (slmsg->rep == (struct slmsg *)-1) {
		error = slmsg->rep->msg->sm_head.se_aux;
	spin_unlock_wr(&ksl->spin);

syslink_kallocmsg(void)

	return(objcache_get(sl_objcache_small, M_WAITOK));

syslink_kfreemsg(struct sldesc *ksl, struct slmsg *slmsg)

	if ((rep = slmsg->rep) != NULL) {
		ksl->peer->backend_dispose(ksl->peer, rep);
	slmsg->callback_func = NULL;

syslink_kshutdown(struct sldesc *ksl, int how)

	shutdownsldesc(ksl, how);

syslink_kclose(struct sldesc *ksl)

	shutdownsldesc(ksl, SHUT_RDWR);
/*
 * Associate a DMA buffer with a kernel syslink message prior to it
 * being sent to userland.  The DMA buffer is set up from the point
 * of view of the target.
 */
syslink_kdmabuf_pages(struct slmsg *slmsg, struct vm_page **mbase, int npages)

	xflags = XIOF_VMLINEAR;
	if (slmsg->msg->sm_head.se_cmd & SE_CMDF_DMAR)
		xflags |= XIOF_READ | XIOF_WRITE;
	else if (slmsg->msg->sm_head.se_cmd & SE_CMDF_DMAW)
		xflags |= XIOF_READ;
	error = xio_init_pages(&slmsg->xio, mbase, npages, xflags);
	slmsg->flags |= SLMSGF_HASXIO;

/*
 * Associate a DMA buffer with a kernel syslink message prior to it
 * being sent to userland.  The DMA buffer is set up from the point
 * of view of the target.
 */
syslink_kdmabuf_data(struct slmsg *slmsg, char *base, int bytes)

	xflags = XIOF_VMLINEAR;
	if (slmsg->msg->sm_head.se_cmd & SE_CMDF_DMAR)
		xflags |= XIOF_READ | XIOF_WRITE;
	else if (slmsg->msg->sm_head.se_cmd & SE_CMDF_DMAW)
		xflags |= XIOF_READ;
	xio_init_kbuf(&slmsg->xio, base, bytes);
	slmsg->xio.xio_flags |= xflags;
	slmsg->flags |= SLMSGF_HASXIO;
/************************************************************************
 *			BACKEND FUNCTIONS FOR KERNEL API		*
 ************************************************************************
 *
 * These are the backend functions for a sldesc associated with a kernel
 */

/*
 * Our peer wants to write a syslink message to us and is asking us to
 * block if our input queue is full.  We don't implement command reception
 * so don't block right now.
 */
backend_wblocked_kern(struct sldesc *ksl, int nbio, sl_proto_t proto)

/*
 * Our peer is writing a request to the kernel.  At the moment we do not
 */
backend_write_kern(struct sldesc *ksl, struct slmsg *slmsg)
/*
 * Our peer wants to reply to a syslink message we sent it earlier.  Both
 * the original command (that we passed to our peer) and the peer's reply
 * are specified.  If the peer has failed slrep will be NULL.
 */
backend_reply_kern(struct sldesc *ksl, struct slmsg *slcmd, struct slmsg *slrep)

	spin_lock_wr(&ksl->spin);
	if (slrep == NULL) {
		slcmd->rep = (struct slmsg *)-1;
		error = slrep->msg->sm_head.se_aux;
	spin_unlock_wr(&ksl->spin);

	/*
	 * Issue callback or wakeup a synchronous waiter.
	 */
	if (slcmd->callback_func) {
		slcmd->callback_func(slcmd, slcmd->callback_data, error);
/*
 * Any reply messages we sent to our peer are returned to us for disposal.
 * Since we do not currently accept commands from our peer, there will not
 * be any replies returned to us to dispose of.
 */
backend_dispose_kern(struct sldesc *ksl, struct slmsg *slmsg)

	panic("backend_dispose_kern: kernel can't accept commands so it "
	      "certainly did not reply to one!");